From 0128b9505c90be6acc22750c4df310667d879d00 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 24 Jul 2020 13:02:33 +0100 Subject: [PATCH 0001/1035] Revert rG5dd566b7c7b78bd- "PassManager.h - remove unnecessary Function.h/Module.h includes. NFCI." This reverts commit 5dd566b7c7b78bd385418c72d63c79895be9ae97. Causing some buildbot failures that I'm not seeing on MSVC builds. --- llvm/include/llvm/Analysis/DemandedBits.h | 2 -- llvm/include/llvm/Analysis/DominanceFrontier.h | 1 - llvm/include/llvm/Analysis/LazyValueInfo.h | 1 - llvm/include/llvm/Analysis/OptimizationRemarkEmitter.h | 2 +- llvm/include/llvm/Analysis/ScalarEvolution.h | 1 - llvm/include/llvm/IR/PassManager.h | 7 ++----- llvm/include/llvm/Transforms/IPO/HotColdSplitting.h | 1 - .../llvm/Transforms/Scalar/AlignmentFromAssumptions.h | 2 -- llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h | 3 --- llvm/include/llvm/Transforms/Scalar/Reassociate.h | 2 +- llvm/include/llvm/Transforms/Utils/Debugify.h | 1 - llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h | 1 - llvm/lib/Analysis/CallPrinter.cpp | 8 +++----- llvm/lib/Analysis/LoopPass.cpp | 2 -- llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 1 - llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp | 3 +-- llvm/lib/CodeGen/CFGuardLongjmp.cpp | 1 - llvm/lib/CodeGen/MachineSizeOpts.cpp | 3 +-- llvm/lib/CodeGen/RegAllocBase.cpp | 1 - llvm/lib/CodeGen/WinEHPrepare.cpp | 1 - llvm/lib/IR/PassManager.cpp | 2 -- llvm/lib/Linker/IRMover.cpp | 2 -- llvm/lib/Target/BPF/BTFDebug.cpp | 1 - llvm/lib/Target/TargetMachine.cpp | 2 -- llvm/lib/Transforms/Instrumentation/CGProfile.cpp | 3 ++- llvm/lib/Transforms/Scalar/ConstantHoisting.cpp | 1 - llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 1 - llvm/lib/Transforms/Utils/Debugify.cpp | 1 + llvm/lib/Transforms/Utils/DemoteRegToStack.cpp | 4 +--- llvm/lib/Transforms/Utils/LowerSwitch.cpp | 1 - llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp | 7 +++---- llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp | 1 - 32 files changed, 16 insertions(+), 54 deletions(-) diff --git a/llvm/include/llvm/Analysis/DemandedBits.h b/llvm/include/llvm/Analysis/DemandedBits.h index ebea1d3554cb4..04db3eb57c18e 100644 --- a/llvm/include/llvm/Analysis/DemandedBits.h +++ b/llvm/include/llvm/Analysis/DemandedBits.h @@ -36,8 +36,6 @@ class Function; class Instruction; struct KnownBits; class raw_ostream; -class Use; -class Value; class DemandedBits { public: diff --git a/llvm/include/llvm/Analysis/DominanceFrontier.h b/llvm/include/llvm/Analysis/DominanceFrontier.h index e1c6c947d420c..f67929c997f93 100644 --- a/llvm/include/llvm/Analysis/DominanceFrontier.h +++ b/llvm/include/llvm/Analysis/DominanceFrontier.h @@ -30,7 +30,6 @@ namespace llvm { -class BasicBlock; class Function; class raw_ostream; diff --git a/llvm/include/llvm/Analysis/LazyValueInfo.h b/llvm/include/llvm/Analysis/LazyValueInfo.h index b02a034a62b10..1bc88235273ee 100644 --- a/llvm/include/llvm/Analysis/LazyValueInfo.h +++ b/llvm/include/llvm/Analysis/LazyValueInfo.h @@ -19,7 +19,6 @@ namespace llvm { class AssumptionCache; - class BasicBlock; class Constant; class ConstantRange; class DataLayout; diff --git a/llvm/include/llvm/Analysis/OptimizationRemarkEmitter.h b/llvm/include/llvm/Analysis/OptimizationRemarkEmitter.h index b89ea496cd8a3..ab97d5b8504e1 100644 --- a/llvm/include/llvm/Analysis/OptimizationRemarkEmitter.h +++ b/llvm/include/llvm/Analysis/OptimizationRemarkEmitter.h @@ -17,11 +17,11 @@ #include "llvm/ADT/Optional.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include 
"llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" namespace llvm { +class Function; class Value; /// The optimization diagnostic interface. diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 6b96eaa01f059..81c5fc9325884 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -35,7 +35,6 @@ #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h index 88d4059ebd4eb..c2f535037a56d 100644 --- a/llvm/include/llvm/IR/PassManager.h +++ b/llvm/include/llvm/IR/PassManager.h @@ -39,16 +39,16 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/TinyPtrVector.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" #include "llvm/IR/PassInstrumentation.h" #include "llvm/IR/PassManagerInternal.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/TimeProfiler.h" #include "llvm/Support/TypeName.h" -#include "llvm/Support/raw_ostream.h" #include #include #include @@ -62,9 +62,6 @@ namespace llvm { -class Function; -class Module; - /// A special type used by analysis passes to provide an address that /// identifies that particular analysis pass type. /// diff --git a/llvm/include/llvm/Transforms/IPO/HotColdSplitting.h b/llvm/include/llvm/Transforms/IPO/HotColdSplitting.h index 93d304f78e15e..8c3049fbaac46 100644 --- a/llvm/include/llvm/Transforms/IPO/HotColdSplitting.h +++ b/llvm/include/llvm/Transforms/IPO/HotColdSplitting.h @@ -16,7 +16,6 @@ namespace llvm { -class BasicBlock; class Module; class ProfileSummaryInfo; class BlockFrequencyInfo; diff --git a/llvm/include/llvm/Transforms/Scalar/AlignmentFromAssumptions.h b/llvm/include/llvm/Transforms/Scalar/AlignmentFromAssumptions.h index 41d29064d0490..be119b8ab8552 100644 --- a/llvm/include/llvm/Transforms/Scalar/AlignmentFromAssumptions.h +++ b/llvm/include/llvm/Transforms/Scalar/AlignmentFromAssumptions.h @@ -22,11 +22,9 @@ namespace llvm { class AssumptionCache; -class CallInst; class DominatorTree; class ScalarEvolution; class SCEV; -class Value; struct AlignmentFromAssumptionsPass : public PassInfoMixin { diff --git a/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h b/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h index 91ace385c8df4..26d4a2476a86f 100644 --- a/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h +++ b/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h @@ -53,15 +53,12 @@ class BlockFrequencyInfo; class Constant; class ConstantInt; class ConstantExpr; -class DataLayout; class DominatorTree; class Function; class GlobalVariable; class Instruction; -class LLVMContext; class ProfileSummaryInfo; class TargetTransformInfo; -class Type; /// A private "module" namespace for types and utilities used by /// ConstantHoisting. 
These are implementation details and should not be used by diff --git a/llvm/include/llvm/Transforms/Scalar/Reassociate.h b/llvm/include/llvm/Transforms/Scalar/Reassociate.h index a20b6fd2866a3..28794d27325ad 100644 --- a/llvm/include/llvm/Transforms/Scalar/Reassociate.h +++ b/llvm/include/llvm/Transforms/Scalar/Reassociate.h @@ -25,7 +25,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" -#include "llvm/IR/Instruction.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" #include @@ -36,6 +35,7 @@ class APInt; class BasicBlock; class BinaryOperator; class Function; +class Instruction; class IRBuilderBase; class Value; diff --git a/llvm/include/llvm/Transforms/Utils/Debugify.h b/llvm/include/llvm/Transforms/Utils/Debugify.h index a7e311a6d74cd..1b9d43b775e75 100644 --- a/llvm/include/llvm/Transforms/Utils/Debugify.h +++ b/llvm/include/llvm/Transforms/Utils/Debugify.h @@ -18,7 +18,6 @@ #include "llvm/Bitcode/BitcodeWriterPass.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" namespace llvm { diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h index f739708179dbf..ecb44a7b1518d 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h @@ -68,7 +68,6 @@ class BlockFrequencyInfo; class DemandedBits; class DominatorTree; class Function; -class Instruction; class Loop; class LoopAccessInfo; class LoopInfo; diff --git a/llvm/lib/Analysis/CallPrinter.cpp b/llvm/lib/Analysis/CallPrinter.cpp index 7c04b11179df8..bb447411ec472 100644 --- a/llvm/lib/Analysis/CallPrinter.cpp +++ b/llvm/lib/Analysis/CallPrinter.cpp @@ -14,17 +14,15 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/CallPrinter.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/DOTGraphTraitsPass.h" #include "llvm/Analysis/HeatUtils.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" -#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" +#include "llvm/InitializePasses.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallSet.h" using namespace llvm; diff --git a/llvm/lib/Analysis/LoopPass.cpp b/llvm/lib/Analysis/LoopPass.cpp index 0e253a23b0da9..520f06003dd22 100644 --- a/llvm/lib/Analysis/LoopPass.cpp +++ b/llvm/lib/Analysis/LoopPass.cpp @@ -17,7 +17,6 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" #include "llvm/IR/OptBisect.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PassTimingInfo.h" @@ -26,7 +25,6 @@ #include "llvm/Support/TimeProfiler.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" - using namespace llvm; #define DEBUG_TYPE "loop-pass-manager" diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp index 638acb5e80451..a46de83e555cc 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp @@ -20,7 +20,6 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugInfo.h" 
-#include "llvm/IR/Function.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Support/CommandLine.h" diff --git a/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp b/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp index d5f85f8c01e7e..914308d9147e2 100644 --- a/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp @@ -17,9 +17,8 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/Metadata.h" -#include "llvm/IR/Module.h" +#include "llvm/IR/Instructions.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCStreamer.h" diff --git a/llvm/lib/CodeGen/CFGuardLongjmp.cpp b/llvm/lib/CodeGen/CFGuardLongjmp.cpp index b5d88a7432b17..c3bf938551110 100644 --- a/llvm/lib/CodeGen/CFGuardLongjmp.cpp +++ b/llvm/lib/CodeGen/CFGuardLongjmp.cpp @@ -21,7 +21,6 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/MachineSizeOpts.cpp b/llvm/lib/CodeGen/MachineSizeOpts.cpp index e19862eae5752..584d43b420044 100644 --- a/llvm/lib/CodeGen/MachineSizeOpts.cpp +++ b/llvm/lib/CodeGen/MachineSizeOpts.cpp @@ -12,10 +12,9 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineSizeOpts.h" -#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/MBFIWrapper.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" -#include "llvm/IR/Function.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/RegAllocBase.cpp b/llvm/lib/CodeGen/RegAllocBase.cpp index 71647f0abeeb8..d228268536724 100644 --- a/llvm/lib/CodeGen/RegAllocBase.cpp +++ b/llvm/lib/CodeGen/RegAllocBase.cpp @@ -23,7 +23,6 @@ #include "llvm/CodeGen/Spiller.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/VirtRegMap.h" -#include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" diff --git a/llvm/lib/CodeGen/WinEHPrepare.cpp b/llvm/lib/CodeGen/WinEHPrepare.cpp index bbad3beb0b8b8..5a25234ba850b 100644 --- a/llvm/lib/CodeGen/WinEHPrepare.cpp +++ b/llvm/lib/CodeGen/WinEHPrepare.cpp @@ -24,7 +24,6 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/WinEHFuncInfo.h" -#include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" #include "llvm/MC/MCSymbol.h" diff --git a/llvm/lib/IR/PassManager.cpp b/llvm/lib/IR/PassManager.cpp index 104d3a5234388..624827ff8cd94 100644 --- a/llvm/lib/IR/PassManager.cpp +++ b/llvm/lib/IR/PassManager.cpp @@ -8,9 +8,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" #include "llvm/IR/PassManagerImpl.h" using namespace llvm; diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index 203ad85a528b2..055689b16e8f4 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -16,12 +16,10 @@ #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/GVMaterializer.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" #include "llvm/IR/TypeFinder.h" #include "llvm/Support/Error.h" #include "llvm/Transforms/Utils/Cloning.h" #include - using namespace llvm; 
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp index a8b5ea55c8653..13999d800a800 100644 --- a/llvm/lib/Target/BPF/BTFDebug.cpp +++ b/llvm/lib/Target/BPF/BTFDebug.cpp @@ -17,7 +17,6 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/IR/Module.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCSectionELF.h" diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index 730bf49cf46f2..074e9fde79e6b 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -18,7 +18,6 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Mangler.h" -#include "llvm/IR/Module.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstrInfo.h" @@ -26,7 +25,6 @@ #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/SectionKind.h" #include "llvm/Target/TargetLoweringObjectFile.h" - using namespace llvm; //--------------------------------------------------------------------------- diff --git a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp index b143217ae0dc3..0cc0d9b07387b 100644 --- a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp +++ b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Instrumentation/CGProfile.h" + #include "llvm/ADT/MapVector.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/LazyBlockFrequencyInfo.h" @@ -14,11 +15,11 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/MDBuilder.h" -#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Transforms/Instrumentation.h" + #include using namespace llvm; diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 9cace63fd0b86..7c14b69d658dc 100644 --- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -52,7 +52,6 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 0e53b27cc4a10..c20e57b02c1a5 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -28,7 +28,6 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Metadata.h" -#include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index ef156514fa0cd..8f98d81a3d797 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -19,6 +19,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" diff --git 
a/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp b/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp index 865a22a3369dd..5f53d794fe8a1 100644 --- a/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp +++ b/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp @@ -8,13 +8,11 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" - using namespace llvm; /// DemoteRegToStack - This function takes a virtual register computed by an diff --git a/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/llvm/lib/Transforms/Utils/LowerSwitch.cpp index eb5c670625651..34e836d9660f3 100644 --- a/llvm/lib/Transforms/Utils/LowerSwitch.cpp +++ b/llvm/lib/Transforms/Utils/LowerSwitch.cpp @@ -26,7 +26,6 @@ #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" diff --git a/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp b/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp index f885e56bf4eaf..d35a77fa379be 100644 --- a/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp +++ b/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp @@ -6,15 +6,14 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/AssumeBundleQueries.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/AssumeBundleQueries.h" #include "llvm/AsmParser/Parser.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/CommandLine.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Regex.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" #include "gtest/gtest.h" #include diff --git a/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp b/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp index ac8661bafaa13..e810b66b3b7c2 100644 --- a/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp +++ b/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp @@ -15,7 +15,6 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" #include "llvm/Support/SourceMgr.h" #include "gtest/gtest.h" From 0ed660f9e5740b46bc10773167cb1e08b6a86389 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 24 Jul 2020 08:29:36 -0400 Subject: [PATCH 0002/1035] [gn build] (manually) port 228f8d89 --- llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index 81da62eff1743..8fe8a46bf5832 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -124,7 +124,6 @@ write_cmake_config("config") { "RETSIGTYPE=void", "LLVM_GISEL_COV_ENABLED=", "LLVM_GISEL_COV_PREFIX=", - "LLVM_HAVE_TF_AOT=", "LLVM_WITH_Z3=", # FIXME: Set to 1 on mac once the 10.14 SDK is in common use. 
@@ -326,6 +325,7 @@ write_cmake_config("llvm-config") { "LLVM_ENABLE_DUMP=", "LLVM_DEFAULT_TARGET_TRIPLE=$llvm_target_triple", "LLVM_HAS_ATOMICS=1", + "LLVM_HAVE_TF_AOT=", "LLVM_HAVE_TF_API=", "LLVM_HOST_TRIPLE=$llvm_current_triple", "LLVM_NATIVE_ARCH=$native_target", From 6371a0a00edb2c56363f7d494a2fac9b6bcaee3c Mon Sep 17 00:00:00 2001 From: Djordje Todorovic Date: Fri, 24 Jul 2020 14:32:25 +0200 Subject: [PATCH 0003/1035] [DWARF][EntryValues] Emit GNU extensions in the case of DWARF 4 + SCE Emit DWARF 5 call-site symbols even though DWARF 4 is set, only in the case of LLDB tuning. This patch addresses PR46643. Differential Revision: https://reviews.llvm.org/D83463 --- .../CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 2 +- .../MIR/X86/call-site-gnu-vs-dwarf5-attrs.mir | 73 ++++++++++--------- 2 files changed, 40 insertions(+), 35 deletions(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index ece6665e99f6c..2de6569767f69 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -990,7 +990,7 @@ void DwarfCompileUnit::constructAbstractSubprogramScopeDIE( } bool DwarfCompileUnit::useGNUAnalogForDwarf5Feature() const { - return DD->getDwarfVersion() == 4 && DD->tuneForGDB(); + return DD->getDwarfVersion() == 4 && !DD->tuneForLLDB(); } dwarf::Tag DwarfCompileUnit::getDwarf5OrGNUTag(dwarf::Tag Tag) const { diff --git a/llvm/test/DebugInfo/MIR/X86/call-site-gnu-vs-dwarf5-attrs.mir b/llvm/test/DebugInfo/MIR/X86/call-site-gnu-vs-dwarf5-attrs.mir index fdbebd306f37c..bde717e3c9da5 100644 --- a/llvm/test/DebugInfo/MIR/X86/call-site-gnu-vs-dwarf5-attrs.mir +++ b/llvm/test/DebugInfo/MIR/X86/call-site-gnu-vs-dwarf5-attrs.mir @@ -1,48 +1,54 @@ -# Test the call site encoding in DWARF5 vs GNU extensions. -# -# === DWARF4, tune for gdb === +## Test the call site encoding in DWARF5 vs GNU extensions. 
+ +## === DWARF4, tune for gdb === # RUN: llc -emit-call-site-info -dwarf-version 4 -debugger-tune=gdb -filetype=obj \ # RUN: -mtriple=x86_64-unknown-unknown -start-after=machineverifier -o - %s \ # RUN: | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-GNU -implicit-check-not=DW_AT_call -# + # === DWARF5, tune for gdb === # RUN: llc -dwarf-version 5 -debugger-tune=gdb -emit-call-site-info -filetype=obj \ # RUN: -mtriple=x86_64-unknown-unknown -start-after=machineverifier -o - %s \ # RUN: | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-DWARF5 -implicit-check-not=DW_AT_call -# -# === DWARF4, tune for lldb === + +## === DWARF4, tune for lldb === # RUN: llc -dwarf-version 4 -debugger-tune=lldb -emit-call-site-info -filetype=obj \ # RUN: -mtriple=x86_64-unknown-unknown -start-after=machineverifier -o - %s \ # RUN: | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-DWARF5 -implicit-check-not=DW_AT_call -# -# === DWARF5, tune for lldb === + +## === DWARF5, tune for lldb === # RUN: llc -dwarf-version 5 -debugger-tune=lldb -emit-call-site-info -filetype=obj \ # RUN: -mtriple=x86_64-unknown-unknown -start-after=machineverifier -o - %s \ # RUN: | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-DWARF5 -implicit-check-not=DW_AT_call -# + +## === DWARF4, tune for sce === +# RUN: llc -emit-call-site-info -dwarf-version 4 -filetype=obj -debugger-tune=sce \ +# RUN: -debug-entry-values -mtriple=x86_64-unknown-unknown \ +# RUN: -start-after=machineverifier -o - %s | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-GNU + +## === DWARF5, tune for sce === # RUN: llc -emit-call-site-info -dwarf-version 5 -filetype=obj -debugger-tune=sce \ # RUN: -debug-entry-values -mtriple=x86_64-unknown-unknown \ # RUN: -start-after=machineverifier -o - %s | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-DWARF5 -# -# This is based on the following reproducer: -# -# extern void fn(); -# extern void fn2(int x); -# extern int fn3(); -# -# int fn1(int (*fn4) ()) { -# fn(); -# fn2(5); -# -# int x = (*fn4)(); -# if (!x) -# return fn3(); -# else -# return -1; -# } -# -# Check GNU extensions: -# + +## This is based on the following reproducer: +## +## extern void fn(); +## extern void fn2(int x); +## extern int fn3(); +## +## int fn1(int (*fn4) ()) { +## fn(); +## fn2(5); +## +## int x = (*fn4)(); +## if (!x) +## return fn3(); +## else +## return -1; +## } + +## Check GNU extensions: + # CHECK-GNU: DW_TAG_subprogram # CHECK-GNU: DW_AT_GNU_all_call_sites (true) # CHECK-GNU: DW_TAG_GNU_call_site @@ -58,10 +64,9 @@ # CHECK-GNU-NEXT: DW_AT_abstract_origin # CHECK-GNU-NEXT: DW_AT_GNU_tail_call # CHECK-GNU-NEXT: DW_AT_low_pc -# -# -# Check DWARF 5: -# + +## Check DWARF 5: + # CHECK-DWARF5: DW_TAG_subprogram # CHECK-DWARF5: DW_AT_call_all_calls (true) # CHECK-DWARF5: DW_TAG_call_site @@ -80,7 +85,7 @@ # CHECK-DWARF5-NEXT: DW_AT_call_origin # CHECK-DWARF5-NEXT: DW_AT_call_tail_call # CHECK-DWARF5-NEXT: DW_AT_call_pc -# + --- | ; ModuleID = 'call-site-attrs.c' source_filename = "call-site-attrs.c" From 2ad56119f5dc6c6af2b8ddfd9fc8c6460a7507c8 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 24 Jul 2020 08:37:47 -0400 Subject: [PATCH 0004/1035] [gn build] (manually) port 10b1b4a23 --- llvm/utils/gn/secondary/clang/test/BUILD.gn | 4 ++-- llvm/utils/gn/secondary/lld/test/BUILD.gn | 4 ++-- .../gn/secondary/llvm/include/llvm/Config/BUILD.gn | 14 ++------------ llvm/utils/gn/secondary/llvm/test/BUILD.gn | 4 ++-- 4 files changed, 8 insertions(+), 18 deletions(-) diff --git 
a/llvm/utils/gn/secondary/clang/test/BUILD.gn b/llvm/utils/gn/secondary/clang/test/BUILD.gn index 56feaae9a149d..a0680d9848235 100644 --- a/llvm/utils/gn/secondary/clang/test/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/test/BUILD.gn @@ -79,9 +79,9 @@ write_lit_config("lit_site_cfg") { } if (llvm_enable_zlib) { - extra_values += [ "HAVE_LIBZ=1" ] + extra_values += [ "LLVM_ENABLE_ZLIB=1" ] } else { - extra_values += [ "HAVE_LIBZ=0" ] # Must be 0. + extra_values += [ "LLVM_ENABLE_ZLIB=0" ] # Must be 0. } if (host_cpu == "x64") { diff --git a/llvm/utils/gn/secondary/lld/test/BUILD.gn b/llvm/utils/gn/secondary/lld/test/BUILD.gn index dac50890d4ac9..a6fb457cff6ac 100644 --- a/llvm/utils/gn/secondary/lld/test/BUILD.gn +++ b/llvm/utils/gn/secondary/lld/test/BUILD.gn @@ -49,9 +49,9 @@ write_lit_cfg("lit_site_cfg") { } if (llvm_enable_zlib) { - extra_values += [ "HAVE_LIBZ=1" ] + extra_values += [ "LLVM_ENABLE_ZLIB=1" ] } else { - extra_values += [ "HAVE_LIBZ=0" ] # Must be 0. + extra_values += [ "LLVM_ENABLE_ZLIB=0" ] # Must be 0. } if (current_cpu == "x64" || current_cpu == "arm64" || diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index 8fe8a46bf5832..72b5796cd9aaa 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -295,20 +295,10 @@ write_cmake_config("config") { values += [ "LLVM_ENABLE_DIA_SDK=" ] } - # FIXME: Once https://reviews.llvm.org/D79219 is in, remove the two - # redundant HAVE_ variables. if (llvm_enable_zlib) { - values += [ - "HAVE_LIBZ=1", - "HAVE_ZLIB_H=1", - "LLVM_ENABLE_ZLIB=1", - ] + values += [ "LLVM_ENABLE_ZLIB=1" ] } else { - values += [ - "HAVE_LIBZ=", - "HAVE_ZLIB_H=", - "LLVM_ENABLE_ZLIB=", - ] + values += [ "LLVM_ENABLE_ZLIB=" ] } if (llvm_enable_libxml2) { diff --git a/llvm/utils/gn/secondary/llvm/test/BUILD.gn b/llvm/utils/gn/secondary/llvm/test/BUILD.gn index 3e72ef3c3a446..3ad1d0ba4f443 100644 --- a/llvm/utils/gn/secondary/llvm/test/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/test/BUILD.gn @@ -174,9 +174,9 @@ write_lit_config("lit_site_cfg") { } if (llvm_enable_zlib) { - extra_values += [ "HAVE_LIBZ=1" ] + extra_values += [ "LLVM_ENABLE_ZLIB=1" ] } else { - extra_values += [ "HAVE_LIBZ=0" ] # Must be 0. + extra_values += [ "LLVM_ENABLE_ZLIB=0" ] # Must be 0. } } From 0bbaacc8cae0373d4500c4e3f6f128d21f9033b7 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 24 Jul 2020 08:48:14 -0400 Subject: [PATCH 0005/1035] [gn build] (manually) port 10b1b4a23 more --- llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn index 97007eecd1eb9..5f8058699d729 100644 --- a/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn @@ -86,8 +86,8 @@ write_cmake_config("lit_common_configured") { } if (llvm_enable_zlib) { - values += [ "HAVE_LIBZ=1" ] + values += [ "LLVM_ENABLE_ZLIB=1" ] } else { - values += [ "HAVE_LIBZ=0" ] + values += [ "LLVM_ENABLE_ZLIB=0" ] } } From 5c98631391fa1c7315a687da853a776955d1531e Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Fri, 24 Jul 2020 15:01:08 +0300 Subject: [PATCH 0006/1035] [MLIR][SPIRVToLLVM] Conversion of SPIR-V variable op The patch introduces the conversion pattern for function-level `spv.Variable`. It is modelled as `llvm.alloca` op. 
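For illustration, a representative input/output pair (mirroring the tests
added below in this patch; the SSA value names are only for readability):

  %0 = spv.Variable : !spv.ptr<f32, Function>

becomes

  %size = llvm.mlir.constant(1 : i32) : !llvm.i32
  %0 = llvm.alloca %size x !llvm.float : (!llvm.i32) -> !llvm<"float*">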
If initialized, then an additional store instruction is used. Note that there
is no initialization for arrays and structs since constants of these types are
not supported in the LLVM dialect yet. Also, at the moment initialization is
only possible via `spv.constant` (since `spv.GlobalVariable` conversion is not
implemented yet).

The input code has some scoping that is not taken into account and will be
addressed in a different patch.

Reviewed By: ftynse

Differential Revision: https://reviews.llvm.org/D84224
---
 .../SPIRVToLLVM/ConvertSPIRVToLLVM.cpp        | 42 ++++++++++++++++
 .../SPIRVToLLVM/memory-ops-to-llvm.mlir       | 49 +++++++++++++++++++
 2 files changed, 91 insertions(+)
 create mode 100644 mlir/test/Conversion/SPIRVToLLVM/memory-ops-to-llvm.mlir

diff --git a/mlir/lib/Conversion/SPIRVToLLVM/ConvertSPIRVToLLVM.cpp b/mlir/lib/Conversion/SPIRVToLLVM/ConvertSPIRVToLLVM.cpp
index 15aaa7947a497..cb4898c346a65 100644
--- a/mlir/lib/Conversion/SPIRVToLLVM/ConvertSPIRVToLLVM.cpp
+++ b/mlir/lib/Conversion/SPIRVToLLVM/ConvertSPIRVToLLVM.cpp
@@ -177,6 +177,14 @@ static Type convertStructTypePacked(spirv::StructType type,
                            /*isPacked=*/true);
 }
 
+/// Creates an LLVM dialect constant with the given value.
+static Value createI32ConstantOf(Location loc, PatternRewriter &rewriter,
+                                 LLVMTypeConverter &converter, unsigned value) {
+  return rewriter.create<LLVM::ConstantOp>(
+      loc, LLVM::LLVMType::getInt32Ty(converter.getDialect()),
+      rewriter.getIntegerAttr(rewriter.getI32Type(), value));
+}
+
 //===----------------------------------------------------------------------===//
 // Type conversion
 //===----------------------------------------------------------------------===//
@@ -733,6 +741,37 @@ class ShiftPattern : public SPIRVToLLVMConversion<SPIRVOp> {
   }
 };
 
+class VariablePattern : public SPIRVToLLVMConversion<spirv::VariableOp> {
+public:
+  using SPIRVToLLVMConversion<spirv::VariableOp>::SPIRVToLLVMConversion;
+
+  LogicalResult
+  matchAndRewrite(spirv::VariableOp varOp, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto srcType = varOp.getType();
+    // Initialization is supported for scalars and vectors only.
+    auto pointerTo = srcType.cast<spirv::PointerType>().getPointeeType();
+    auto init = varOp.initializer();
+    if (init && !pointerTo.isIntOrFloat() && !pointerTo.isa<VectorType>())
+      return failure();
+
+    auto dstType = typeConverter.convertType(srcType);
+    if (!dstType)
+      return failure();
+
+    Location loc = varOp.getLoc();
+    Value size = createI32ConstantOf(loc, rewriter, typeConverter, 1);
+    if (!init) {
+      rewriter.replaceOpWithNewOp<LLVM::AllocaOp>(varOp, dstType, size);
+      return success();
+    }
+    Value allocated = rewriter.create<LLVM::AllocaOp>(loc, dstType, size);
+    rewriter.create<LLVM::StoreOp>(loc, init, allocated);
+    rewriter.replaceOp(varOp, allocated);
+    return success();
+  }
+};
+
 //===----------------------------------------------------------------------===//
 // FuncOp conversion
 //===----------------------------------------------------------------------===//
@@ -933,6 +972,9 @@ void mlir::populateSPIRVToLLVMConversionPatterns(
       IComparePattern,
       NotPattern,
 
+      // Memory ops
+      VariablePattern,
+
       // Miscellaneous ops
       DirectConversionPattern,
       DirectConversionPattern,
diff --git a/mlir/test/Conversion/SPIRVToLLVM/memory-ops-to-llvm.mlir b/mlir/test/Conversion/SPIRVToLLVM/memory-ops-to-llvm.mlir
new file mode 100644
index 0000000000000..4c549ab8d619c
--- /dev/null
+++ b/mlir/test/Conversion/SPIRVToLLVM/memory-ops-to-llvm.mlir
@@ -0,0 +1,49 @@
+// RUN: mlir-opt -convert-spirv-to-llvm %s | FileCheck %s
+
+//===----------------------------------------------------------------------===//
+// spv.Variable
+//===----------------------------------------------------------------------===//
+
+func @variable_scalar() {
+  // CHECK: %[[SIZE1:.*]] = llvm.mlir.constant(1 : i32) : !llvm.i32
+  // CHECK: %{{.*}} = llvm.alloca %[[SIZE1]] x !llvm.float : (!llvm.i32) -> !llvm<"float*">
+  %0 = spv.Variable : !spv.ptr<f32, Function>
+  // CHECK: %[[SIZE2:.*]] = llvm.mlir.constant(1 : i32) : !llvm.i32
+  // CHECK: %{{.*}} = llvm.alloca %[[SIZE2]] x !llvm.i8 : (!llvm.i32) -> !llvm<"i8*">
+  %1 = spv.Variable : !spv.ptr<i8, Function>
+  return
+}
+
+func @variable_scalar_with_initialization() {
+  // CHECK: %[[VALUE:.*]] = llvm.mlir.constant(0 : i64) : !llvm.i64
+  // CHECK: %[[SIZE:.*]] = llvm.mlir.constant(1 : i32) : !llvm.i32
+  // CHECK: %[[ALLOCATED:.*]] = llvm.alloca %[[SIZE]] x !llvm.i64 : (!llvm.i32) -> !llvm<"i64*">
+  // CHECK: llvm.store %[[VALUE]], %[[ALLOCATED]] : !llvm<"i64*">
+  %c = spv.constant 0 : i64
+  %0 = spv.Variable init(%c) : !spv.ptr<i64, Function>
+  return
+}
+
+func @variable_vector() {
+  // CHECK: %[[SIZE:.*]] = llvm.mlir.constant(1 : i32) : !llvm.i32
+  // CHECK: %{{.*}} = llvm.alloca %[[SIZE]] x !llvm<"<3 x float>"> : (!llvm.i32) -> !llvm<"<3 x float>*">
+  %0 = spv.Variable : !spv.ptr<vector<3xf32>, Function>
+  return
+}
+
+func @variable_vector_with_initialization() {
+  // CHECK: %[[VALUE:.*]] = llvm.mlir.constant(dense<false> : vector<3xi1>) : !llvm<"<3 x i1>">
+  // CHECK: %[[SIZE:.*]] = llvm.mlir.constant(1 : i32) : !llvm.i32
+  // CHECK: %[[ALLOCATED:.*]] = llvm.alloca %[[SIZE]] x !llvm<"<3 x i1>"> : (!llvm.i32) -> !llvm<"<3 x i1>*">
+  // CHECK: llvm.store %[[VALUE]], %[[ALLOCATED]] : !llvm<"<3 x i1>*">
+  %c = spv.constant dense<false> : vector<3xi1>
+  %0 = spv.Variable init(%c) : !spv.ptr<vector<3xi1>, Function>
+  return
+}
+
+func @variable_array() {
+  // CHECK: %[[SIZE:.*]] = llvm.mlir.constant(1 : i32) : !llvm.i32
+  // CHECK: %{{.*}} = llvm.alloca %[[SIZE]] x !llvm<"[10 x i32]"> : (!llvm.i32) -> !llvm<"[10 x i32]*">
+  %0 = spv.Variable : !spv.ptr<!spv.array<10 x i32>, Function>
+  return
+}

From cf428778128fed5eacee884964af53bf4a9f74f2 Mon Sep 17 00:00:00 2001
From: Yitzhak Mandelbaum
Date: Tue, 21 Jul 2020 20:09:49 +0000
Subject: [PATCH 0007/1035] [libTooling] Add assorted `EditGenerator`
 combinators.

Summary:
This patch adds various combinators that help in constructing
`EditGenerator`s:
* `noEdits`
* `ifBound`, specialized to `ASTEdit`
* `flatten` and `flattenVector`, which allow for easy construction from a
  set of sub edits
* `shrinkTo`, which generates edits to shrink a given range to another that
  it encloses

Reviewers: asoffer, gribozavr2

Subscribers: cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D84310
---
 .../clang/Tooling/Transformer/MatchConsumer.h |   6 -
 .../clang/Tooling/Transformer/RewriteRule.h   |  43 ++++++-
 clang/lib/Tooling/Transformer/RewriteRule.cpp |  18 +++
 clang/unittests/Tooling/TransformerTest.cpp   | 120 ++++++++++++++++++
 4 files changed, 180 insertions(+), 7 deletions(-)

diff --git a/clang/include/clang/Tooling/Transformer/MatchConsumer.h b/clang/include/clang/Tooling/Transformer/MatchConsumer.h
index f407ffce3d252..cb0a5f684b7d6 100644
--- a/clang/include/clang/Tooling/Transformer/MatchConsumer.h
+++ b/clang/include/clang/Tooling/Transformer/MatchConsumer.h
@@ -99,11 +99,5 @@ llvm::Expected<T> MatchComputation<T>::eval(
   return Output;
 }
 } // namespace transformer
-
-namespace tooling {
-// DEPRECATED: Temporary alias supporting client migration to the `transformer`
-// namespace.
-using transformer::ifBound;
-} // namespace tooling
 } // namespace clang
 #endif // LLVM_CLANG_TOOLING_TRANSFORMER_MATCH_CONSUMER_H_
diff --git a/clang/include/clang/Tooling/Transformer/RewriteRule.h b/clang/include/clang/Tooling/Transformer/RewriteRule.h
index 1be5727364608..c22a2da81fe60 100644
--- a/clang/include/clang/Tooling/Transformer/RewriteRule.h
+++ b/clang/include/clang/Tooling/Transformer/RewriteRule.h
@@ -107,9 +107,42 @@ struct ASTEdit {
 /// clients. We recommend use of the \c AtomicChange or \c Replacements classes
 /// for assistance in detecting such conflicts.
 EditGenerator editList(llvm::SmallVector<ASTEdit, 1> Edits);
-// Convenience form of `editList` for a single edit.
+/// Convenience form of `editList` for a single edit.
 EditGenerator edit(ASTEdit);
 
+/// Convenience generator for a no-op edit generator.
+inline EditGenerator noEdits() { return editList({}); }
+
+/// Convenience version of `ifBound` specialized to `ASTEdit`.
+inline EditGenerator ifBound(std::string ID, ASTEdit TrueEdit,
+                             ASTEdit FalseEdit) {
+  return ifBound(std::move(ID), edit(std::move(TrueEdit)),
+                 edit(std::move(FalseEdit)));
+}
+
+/// Convenience version of `ifBound` that has no "False" branch. If the node is
+/// not bound, then no edits are produced.
+inline EditGenerator ifBound(std::string ID, ASTEdit TrueEdit) {
+  return ifBound(std::move(ID), edit(std::move(TrueEdit)), noEdits());
+}
+
+/// Flattens a list of generators into a single generator whose elements are the
+/// concatenation of the results of the argument generators.
+EditGenerator flattenVector(SmallVector<EditGenerator, 2> Generators);
+
+namespace detail {
+/// Convenience function to construct an \c EditGenerator. Overloaded for common
+/// cases so that the user doesn't need to specify which factory function to
+/// use. This pattern gives benefits similar to implicit constructors, while
+/// maintaining a higher degree of explicitness.
+inline EditGenerator injectEdits(ASTEdit E) { return edit(std::move(E)); }
+inline EditGenerator injectEdits(EditGenerator G) { return G; }
+} // namespace detail
+
+template <typename... Ts> EditGenerator flatten(Ts &&...Edits) {
+  return flattenVector({detail::injectEdits(std::forward<Ts>(Edits))...});
+}
+
 /// Format of the path in an include directive -- angle brackets or quotes.
 enum class IncludeFormat {
   Quoted,
@@ -291,6 +324,14 @@ inline ASTEdit withMetadata(ASTEdit Edit, Callable Metadata) {
   return Edit;
 }
 
+/// Assuming that the inner range is enclosed by the outer range, creates
+/// precision edits to remove the parts of the outer range that are not included
+/// in the inner range.
+inline EditGenerator shrinkTo(RangeSelector outer, RangeSelector inner) {
+  return editList({remove(enclose(before(outer), before(inner))),
+                   remove(enclose(after(inner), after(outer)))});
+}
+
 /// The following three functions are a low-level part of the RewriteRule
 /// API. We expose them for use in implementing the fixtures that interpret
 /// RewriteRule, like Transformer and TransfomerTidy, or for more advanced
diff --git a/clang/lib/Tooling/Transformer/RewriteRule.cpp b/clang/lib/Tooling/Transformer/RewriteRule.cpp
index a212a868c81d8..c145895af7ab6 100644
--- a/clang/lib/Tooling/Transformer/RewriteRule.cpp
+++ b/clang/lib/Tooling/Transformer/RewriteRule.cpp
@@ -68,6 +68,24 @@ EditGenerator transformer::edit(ASTEdit Edit) {
   };
 }
 
+EditGenerator
+transformer::flattenVector(SmallVector<EditGenerator, 2> Generators) {
+  if (Generators.size() == 1)
+    return std::move(Generators[0]);
+  return
+      [Gs = std::move(Generators)](
+          const MatchResult &Result) -> llvm::Expected<SmallVector<Edit, 1>> {
+        SmallVector<Edit, 1> AllEdits;
+        for (const auto &G : Gs) {
+          llvm::Expected<SmallVector<Edit, 1>> Edits = G(Result);
+          if (!Edits)
+            return Edits.takeError();
+          AllEdits.append(Edits->begin(), Edits->end());
+        }
+        return AllEdits;
+      };
+}
+
 ASTEdit transformer::changeTo(RangeSelector Target, TextGenerator Replacement) {
   ASTEdit E;
   E.TargetRange = std::move(Target);
diff --git a/clang/unittests/Tooling/TransformerTest.cpp b/clang/unittests/Tooling/TransformerTest.cpp
index 59b334b0ea5a4..1a68eb1d172a3 100644
--- a/clang/unittests/Tooling/TransformerTest.cpp
+++ b/clang/unittests/Tooling/TransformerTest.cpp
@@ -10,6 +10,7 @@
 #include "clang/ASTMatchers/ASTMatchers.h"
 #include "clang/Tooling/Tooling.h"
 #include "clang/Tooling/Transformer/RangeSelector.h"
+#include "clang/Tooling/Transformer/RewriteRule.h"
 #include "clang/Tooling/Transformer/Stencil.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
@@ -378,6 +379,41 @@ TEST_F(TransformerTest, NodePartMemberMultiToken) {
            Input, Expected);
 }
 
+TEST_F(TransformerTest, NoEdits) {
+  using transformer::noEdits;
+  std::string Input = "int f(int x) { return x; }";
+  testRule(makeRule(returnStmt().bind("return"), noEdits()), Input, Input);
+}
+
+TEST_F(TransformerTest, IfBound2Args) {
+  using transformer::ifBound;
+  std::string Input = "int f(int x) { return x; }";
+  std::string Expected = "int f(int x) { CHANGE; }";
+  testRule(makeRule(returnStmt().bind("return"),
+                    ifBound("return", changeTo(cat("CHANGE;")))),
+           Input, Expected);
+}
+
+TEST_F(TransformerTest, IfBound3Args) {
+  using transformer::ifBound;
+  std::string Input = "int f(int x) { return x; }";
+  std::string Expected = "int f(int x) { CHANGE; }";
+  testRule(makeRule(returnStmt().bind("return"),
+                    ifBound("nothing", changeTo(cat("ERROR")),
+                            changeTo(cat("CHANGE;")))),
+           Input, Expected);
+}
+
+TEST_F(TransformerTest, ShrinkTo) {
+  using transformer::shrinkTo;
+  std::string Input = "int f(int x) { return x; }";
+  std::string Expected = "return x;";
+  testRule(makeRule(functionDecl(hasDescendant(returnStmt().bind("return")))
+                        .bind("function"),
+                    shrinkTo(node("function"), node("return"))),
+           Input, Expected);
+}
+
 TEST_F(TransformerTest, InsertBeforeEdit) {
   std::string Input = R"cc(
     int f() {
@@ -497,6 +533,90 @@ TEST_F(TransformerTest,
MultiChange) { Input, Expected); } +TEST_F(TransformerTest, EditList) { + using clang::transformer::editList; + std::string Input = R"cc( + void foo() { + if (10 > 1.0) + log(1) << "oh no!"; + else + log(0) << "ok"; + } + )cc"; + std::string Expected = R"( + void foo() { + if (true) { /* then */ } + else { /* else */ } + } + )"; + + StringRef C = "C", T = "T", E = "E"; + testRule(makeRule(ifStmt(hasCondition(expr().bind(C)), + hasThen(stmt().bind(T)), hasElse(stmt().bind(E))), + editList({changeTo(node(std::string(C)), cat("true")), + changeTo(statement(std::string(T)), + cat("{ /* then */ }")), + changeTo(statement(std::string(E)), + cat("{ /* else */ }"))})), + Input, Expected); +} + +TEST_F(TransformerTest, Flatten) { + using clang::transformer::editList; + std::string Input = R"cc( + void foo() { + if (10 > 1.0) + log(1) << "oh no!"; + else + log(0) << "ok"; + } + )cc"; + std::string Expected = R"( + void foo() { + if (true) { /* then */ } + else { /* else */ } + } + )"; + + StringRef C = "C", T = "T", E = "E"; + testRule( + makeRule( + ifStmt(hasCondition(expr().bind(C)), hasThen(stmt().bind(T)), + hasElse(stmt().bind(E))), + flatten(changeTo(node(std::string(C)), cat("true")), + changeTo(statement(std::string(T)), cat("{ /* then */ }")), + changeTo(statement(std::string(E)), cat("{ /* else */ }")))), + Input, Expected); +} + +TEST_F(TransformerTest, FlattenWithMixedArgs) { + using clang::transformer::editList; + std::string Input = R"cc( + void foo() { + if (10 > 1.0) + log(1) << "oh no!"; + else + log(0) << "ok"; + } + )cc"; + std::string Expected = R"( + void foo() { + if (true) { /* then */ } + else { /* else */ } + } + )"; + + StringRef C = "C", T = "T", E = "E"; + testRule(makeRule(ifStmt(hasCondition(expr().bind(C)), + hasThen(stmt().bind(T)), hasElse(stmt().bind(E))), + flatten(changeTo(node(std::string(C)), cat("true")), + edit(changeTo(statement(std::string(T)), + cat("{ /* then */ }"))), + editList({changeTo(statement(std::string(E)), + cat("{ /* else */ }"))}))), + Input, Expected); +} + TEST_F(TransformerTest, OrderedRuleUnrelated) { StringRef Flag = "flag"; RewriteRule FlagRule = makeRule( From 783a351785c14b7c2eb9f65bd40d37be11cbf38b Mon Sep 17 00:00:00 2001 From: Frederik Gossen Date: Fri, 24 Jul 2020 13:24:23 +0000 Subject: [PATCH 0008/1035] [MLIR][Shape] Allow `shape.mul` to operate in indices Differential Revision: https://reviews.llvm.org/D84437 --- .../include/mlir/Dialect/Shape/IR/ShapeOps.td | 23 +++++++---- mlir/lib/Dialect/Shape/IR/Shape.cpp | 39 ++++++++++++++----- .../Shape/Transforms/ShapeToShapeLowering.cpp | 4 +- .../ShapeToStandard/shape-to-standard.mlir | 15 +++++-- mlir/test/Dialect/Shape/invalid.mlir | 26 +++++++++++++ mlir/test/Dialect/Shape/ops.mlir | 13 +++++-- 6 files changed, 94 insertions(+), 26 deletions(-) diff --git a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td index 32d6ebafff321..425cf917283be 100644 --- a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td +++ b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td @@ -307,18 +307,25 @@ def Shape_JoinOp : Shape_Op<"join", [Commutative]> { let results = (outs Shape_ShapeOrSizeType:$result); } -def Shape_MulOp : Shape_Op<"mul", [Commutative, SameOperandsAndResultType]> { - let summary = "Multiplication of sizes"; +def Shape_MulOp : Shape_Op<"mul", [Commutative, NoSideEffect]> { + let summary = "Multiplication of sizes and indices"; let description = [{ - Multiplies two valid sizes as follows: - - lhs * rhs = unknown if either lhs or rhs unknown; - - 
   - lhs * rhs = (int)lhs * (int)rhs if both known;
+    Multiplies two sizes or indices. If either operand is an error it will be
+    propagated to the result. The operands can be of type `size` or `index`. If
+    at least one of the operands can hold an error, i.e. if it is of type `size`,
+    then also the result must be of type `size`. If error propagation is not
+    possible because both operands are of type `index` then the result must also
+    be of type `index`.
   }];
 
-  let arguments = (ins Shape_SizeType:$lhs, Shape_SizeType:$rhs);
-  let results = (outs Shape_SizeType:$result);
+  let arguments = (ins Shape_SizeOrIndexType:$lhs, Shape_SizeOrIndexType:$rhs);
+  let results = (outs Shape_SizeOrIndexType:$result);
 
-  let assemblyFormat = "$lhs `,` $rhs attr-dict";
+  let assemblyFormat = [{
+    $lhs `,` $rhs `:` type($lhs) `,` type($rhs) `->` type($result) attr-dict
+  }];
+
+  let verifier = [{ return ::verify(*this); }];
 }
 
 def Shape_NumElementsOp : Shape_Op<"num_elements", [NoSideEffect]> {
diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp
index 3bdc5cc39a7bd..2f641300c4917 100644
--- a/mlir/lib/Dialect/Shape/IR/Shape.cpp
+++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp
@@ -28,6 +28,13 @@ static RankedTensorType getExtentTensorType(MLIRContext *ctx) {
   return RankedTensorType::get({ShapedType::kDynamicSize}, IndexType::get(ctx));
 }
 
+static bool isErrorPropagationPossible(ArrayRef<Type> operandTypes) {
+  for (Type ty : operandTypes)
+    if (ty.isa<SizeType>() || ty.isa<ShapeType>() || ty.isa<ValueShapeType>())
+      return true;
+  return false;
+}
+
 ShapeDialect::ShapeDialect(MLIRContext *context)
     : Dialect(getDialectNamespace(), context) {
   addOperations<
@@ -539,9 +546,7 @@ static LogicalResult verify(GetExtentOp op) {
   Type shapeTy = op.shape().getType();
   Type dimTy = op.dim().getType();
   Type extentTy = op.extent().getType();
-  bool errorPropagationPossible =
-      shapeTy.isa<ShapeType>() || dimTy.isa<SizeType>();
-  if (errorPropagationPossible) {
+  if (isErrorPropagationPossible({shapeTy, dimTy})) {
     if (!extentTy.isa<SizeType>())
       op.emitError()
           << "if at least one of the operands can hold error values then the "
@@ -593,9 +598,8 @@ void GetExtentOp::build(OpBuilder &builder, OperationState &result, Value shape,
 //===----------------------------------------------------------------------===//
 
 static LogicalResult verify(shape::RankOp op) {
-  Type argTy = op.shape().getType();
-  Type resultTy = op.rank().getType();
-  if (argTy.isa<ShapeType>() && !resultTy.isa<SizeType>())
+  if (op.shape().getType().isa<ShapeType>() &&
+      !op.rank().getType().isa<SizeType>())
     return op.emitOpError()
            << "if operand is of type `shape` then the result must be of type "
               "`size` to propagate potential errors";
@@ -672,6 +676,25 @@ OpFoldResult NumElementsOp::fold(ArrayRef<Attribute> operands) {
   return builder.getIndexAttr(product.getLimitedValue());
 }
 
+//===----------------------------------------------------------------------===//
+// MulOp
+//===----------------------------------------------------------------------===//
+
+static LogicalResult verify(MulOp op) {
+  Type resultTy = op.result().getType();
+  if (isErrorPropagationPossible({op.lhs().getType(), op.rhs().getType()})) {
+    if (!resultTy.isa<SizeType>())
+      return op.emitOpError()
+             << "if at least one of the operands can hold error values then "
+                "the result must be of type `size` to propagate them";
+  } else {
+    if (resultTy.isa<SizeType>())
+      return op.emitError() << "if none of the operands can hold error values "
+                               "then the result must be of type `index`";
+  }
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // ShapeOfOp
 //===----------------------------------------------------------------------===//
@@ -685,15 +708,13 @@ OpFoldResult ShapeOfOp::fold(ArrayRef<Attribute>) {
 }
 
 static LogicalResult verify(ShapeOfOp op) {
-  Type argTy = op.arg().getType();
   Type resultTy = op.result().getType();
-  if (argTy.isa<ValueShapeType>()) {
+  if (isErrorPropagationPossible(op.arg().getType())) {
     if (!resultTy.isa<ShapeType>())
       return op.emitOpError()
             << "if operand is of type `value_shape` then the result must be "
                "of type `shape` to propagate potential error shapes";
   } else {
-    assert(argTy.isa<ShapedType>());
    if (resultTy != getExtentTensorType(op.getContext()))
      return op.emitOpError() << "if operand is a shaped type then the result "
                                 "must be an extent tensor";
diff --git a/mlir/lib/Dialect/Shape/Transforms/ShapeToShapeLowering.cpp b/mlir/lib/Dialect/Shape/Transforms/ShapeToShapeLowering.cpp
index 467f3d33ce231..bb2b03b8ec081 100644
--- a/mlir/lib/Dialect/Shape/Transforms/ShapeToShapeLowering.cpp
+++ b/mlir/lib/Dialect/Shape/Transforms/ShapeToShapeLowering.cpp
@@ -38,8 +38,8 @@ NumElementsOpConverter::matchAndRewrite(NumElementsOp op,
   // Generate reduce operator.
   Block *body = reduce.getBody();
   OpBuilder b = OpBuilder::atBlockEnd(body);
-  Value product =
-      b.create<MulOp>(loc, body->getArgument(1), body->getArgument(2));
+  Value product = b.create<MulOp>(loc, b.getType<SizeType>(),
+                                  body->getArgument(1), body->getArgument(2));
   b.create<shape::YieldOp>(loc, product);
 
   rewriter.replaceOp(op, reduce.result());
diff --git a/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir b/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir
index 908acabe5345d..8236c6f279755 100644
--- a/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir
+++ b/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir
@@ -24,10 +24,19 @@ func @shape_id(%shape : !shape.shape) -> !shape.shape {
 // CHECK-LABEL: @binary_ops
 // CHECK-SAME: (%[[LHS:.*]]: index, %[[RHS:.*]]: index)
 func @binary_ops(%lhs : !shape.size, %rhs : !shape.size) {
+  // CHECK: addi %[[LHS]], %[[RHS]] : index
   %sum = "shape.add"(%lhs, %rhs) : (!shape.size, !shape.size) -> !shape.size
-  // CHECK-NEXT: addi %[[LHS]], %[[RHS]] : index
-  %product = shape.mul %lhs, %rhs
-  // CHECK-NEXT: muli %[[LHS]], %[[RHS]] : index
+  return
+}
+
+// -----
+
+// Lower binary ops.
+// CHECK-LABEL: @binary_ops
+// CHECK-SAME: (%[[LHS:.*]]: index, %[[RHS:.*]]: index)
+func @binary_ops(%lhs : index, %rhs : index) {
+  // CHECK: muli %[[LHS]], %[[RHS]] : index
+  %product = shape.mul %lhs, %rhs : index, index -> index
+  return
 }
diff --git a/mlir/test/Dialect/Shape/invalid.mlir b/mlir/test/Dialect/Shape/invalid.mlir
index d7e9e40ed3f2e..b4900e491fb82 100644
--- a/mlir/test/Dialect/Shape/invalid.mlir
+++ b/mlir/test/Dialect/Shape/invalid.mlir
@@ -6,6 +6,7 @@ func @reduce_op_args_num_mismatch(%shape : !shape.shape, %init : !shape.size) {
   ^bb0(%index: index, %dim: !shape.size):
     shape.yield %dim : !shape.size
   }
+  return
 }
 
 // -----
@@ -18,6 +19,7 @@ func @reduce_op_arg0_wrong_type(%shape : !shape.shape, %init : !shape.size) {
         : (!shape.size, !shape.size) -> !shape.size
     shape.yield %new_acc : !shape.size
   }
+  return
 }
 
 // -----
@@ -28,6 +30,7 @@ func @reduce_op_arg1_wrong_type(%shape : !shape.shape, %init : !shape.size) {
   ^bb0(%index: index, %dim: f32, %lci: !shape.size):
     shape.yield
   }
+  return
 }
 
 // -----
@@ -38,6 +41,7 @@ func @reduce_op_arg1_wrong_type(%shape : tensor<?xindex>, %init : index) {
   ^bb0(%index: index, %dim: f32, %lci: index):
     shape.yield
   }
+  return
 }
 
 // -----
@@ -48,6 +52,7 @@ func @reduce_op_init_type_mismatch(%shape : !shape.shape, %init : f32) {
   ^bb0(%index: index, %dim: !shape.size, %lci: !shape.size):
     shape.yield
   }
+  return
 }
 
 // -----
@@ -58,6 +63,7 @@ func @yield_op_args_num_mismatch(%shape : !shape.shape, %init : !shape.size) {
   ^bb0(%index: index, %dim: !shape.size, %lci: !shape.size):
     shape.yield %dim, %dim : !shape.size, !shape.size
   }
+  return
 }
 
 // -----
@@ -69,6 +75,7 @@ func @yield_op_type_mismatch(%shape : !shape.shape, %init : !shape.size) {
     %c0 = constant 1 : index
     shape.yield %c0 : index
   }
+  return
 }
 
 // -----
@@ -85,6 +92,7 @@ func @shape_of(%value_arg : !shape.value_shape,
                %shaped_arg : tensor) {
   // expected-error@+1 {{if operand is of type `value_shape` then the result must be of type `shape` to propagate potential error shapes}}
   %0 = shape.shape_of %value_arg : !shape.value_shape -> tensor<?xindex>
+  return
 }
 
 // -----
@@ -93,6 +101,7 @@ func @shape_of(%value_arg : !shape.value_shape,
                %shaped_arg : tensor) {
   // expected-error@+1 {{if operand is a shaped type then the result must be an extent tensor}}
   %1 = shape.shape_of %shaped_arg : tensor -> !shape.shape
+  return
 }
 
 // -----
@@ -100,6 +109,7 @@ func @rank(%arg : !shape.shape) {
   // expected-error@+1 {{if operand is of type `shape` then the result must be of type `size` to propagate potential errors}}
   %0 = shape.rank %arg : !shape.shape -> index
+  return
 }
 
 // -----
@@ -120,3 +130,19 @@ func @get_extent_error_possible(%arg : tensor<?xindex>) -> index {
   return %result : index
 }
 
+// -----
+
+func @mul_error_free(%arg : index) -> !shape.size {
+  // expected-error@+1 {{if none of the operands can hold error values then the result must be of type `index`}}
+  %result = shape.mul %arg, %arg : index, index -> !shape.size
+  return %result : !shape.size
+}
+
+// -----
+
+func @mul_error_possible(%lhs : !shape.size, %rhs : index) -> index {
+  // expected-error@+1 {{if at least one of the operands can hold error values then the result must be of type `size` to propagate them}}
+  %result = shape.mul %lhs, %rhs : !shape.size, index -> index
+  return %result : index
+}
+
diff --git a/mlir/test/Dialect/Shape/ops.mlir b/mlir/test/Dialect/Shape/ops.mlir
index b6b839251a884..3a0cb7781ec72 100644
--- a/mlir/test/Dialect/Shape/ops.mlir
+++ b/mlir/test/Dialect/Shape/ops.mlir
@@ -9,6 +9,7 @@ func @shape_num_elements(%shape : !shape.shape) -> !shape.size {
   %num_elements = shape.reduce(%shape, %init) : !shape.shape -> !shape.size {
     ^bb0(%index : index, %extent : !shape.size, %acc : !shape.size):
       %acc_next = shape.mul %acc, %extent
+          : !shape.size, !shape.size -> !shape.size
       shape.yield %acc_next : !shape.size
   }
   return %num_elements : !shape.size
@@ -19,7 +20,7 @@ func @extent_tensor_num_elements(%shape : tensor<?xindex>) -> index {
   %init = constant 1 : index
   %num_elements = shape.reduce(%shape, %init) : tensor<?xindex> -> index {
     ^bb0(%index : index, %extent : index, %acc : index):
-      %acc_next = muli %acc, %extent : index
+      %acc_next = shape.mul %acc, %extent : index, index -> index
       shape.yield %acc_next : index
   }
   return %num_elements : index
@@ -110,9 +111,13 @@ func @broadcastable_on_extent_tensors(%lhs : tensor<?xindex>,
   return
 }
 
-func @test_mul(%lhs: !shape.size, %rhs: !shape.size) -> !shape.size {
-  %product = shape.mul %lhs, %rhs
-  return %product: !shape.size
+func @mul(%size_arg : !shape.size, %index_arg : index) {
+  %size_prod = shape.mul %size_arg, %size_arg
+      : !shape.size, !shape.size -> !shape.size
+  %index_prod = shape.mul %index_arg, %index_arg : index, index -> index
+  %mixed_prod = shape.mul %size_arg, %index_arg
+      : !shape.size, index -> !shape.size
+  return
 }
 
 func @const_size() {

From 670ae4b6da874270aa0cd8ab32120c17b2eadb95 Mon Sep 17 00:00:00 2001
From: Frederik Gossen
Date: Fri, 24 Jul 2020 13:29:51 +0000
Subject: [PATCH 0009/1035] [MLIR][Shape] Fold `shape.mul`

Implement constant folding for `shape.mul`.

Differential Revision: https://reviews.llvm.org/D84438
---
 .../include/mlir/Dialect/Shape/IR/ShapeOps.td |  1 +
 mlir/lib/Dialect/Shape/IR/Shape.cpp           | 12 ++++++
 mlir/test/Dialect/Shape/canonicalize.mlir     | 40 +++++++++++++++++++
 3 files changed, 53 insertions(+)

diff --git a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
index 425cf917283be..797dc0bc0cb6a 100644
--- a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
+++ b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
@@ -326,6 +326,7 @@ def Shape_MulOp : Shape_Op<"mul", [Commutative, NoSideEffect]> {
   }];
 
   let verifier = [{ return ::verify(*this); }];
+  let hasFolder = 1;
 }
 
 def Shape_NumElementsOp : Shape_Op<"num_elements", [NoSideEffect]> {
diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp
index 2f641300c4917..d2b0dbdedb052 100644
--- a/mlir/lib/Dialect/Shape/IR/Shape.cpp
+++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp
@@ -695,6 +695,18 @@ static LogicalResult verify(MulOp op) {
   return success();
 }
 
+OpFoldResult MulOp::fold(ArrayRef<Attribute> operands) {
+  auto lhs = operands[0].dyn_cast_or_null<IntegerAttr>();
+  if (!lhs)
+    return nullptr;
+  auto rhs = operands[1].dyn_cast_or_null<IntegerAttr>();
+  if (!rhs)
+    return nullptr;
+  APInt folded = lhs.getValue() * rhs.getValue();
+  Type indexTy = IndexType::get(getContext());
+  return IntegerAttr::get(indexTy, folded);
+}
+
 //===----------------------------------------------------------------------===//
 // ShapeOfOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/Shape/canonicalize.mlir b/mlir/test/Dialect/Shape/canonicalize.mlir
index b4dca5e3c2bf5..577656a0b362a 100644
--- a/mlir/test/Dialect/Shape/canonicalize.mlir
+++ b/mlir/test/Dialect/Shape/canonicalize.mlir
@@ -734,3 +734,43 @@ func @shape_eq_do_not_fold(%a : !shape.shape) -> i1 {
   %result = shape.shape_eq %a, %b : !shape.shape, !shape.shape
   return %result : i1
 }
+
+// -----
+
+// Fold `mul` for constant sizes.
+// CHECK-LABEL: @fold_mul_size
+func @fold_mul_size() -> !shape.size {
+  // CHECK: %[[RESULT:.*]] = shape.const_size 6
+  // CHECK: return %[[RESULT]] : !shape.size
+  %c2 = shape.const_size 2
+  %c3 = shape.const_size 3
+  %result = shape.mul %c2, %c3 : !shape.size, !shape.size -> !shape.size
+  return %result : !shape.size
+}
+
+// -----
+
+// Fold `mul` for constant indices.
+// CHECK-LABEL: @fold_mul_index
+func @fold_mul_index() -> index {
+  // CHECK: %[[RESULT:.*]] = constant 6 : index
+  // CHECK: return %[[RESULT]] : index
+  %c2 = constant 2 : index
+  %c3 = constant 3 : index
+  %result = shape.mul %c2, %c3 : index, index -> index
+  return %result : index
+}
+
+// -----
+
+// Fold `mul` for mixed constants.
+// CHECK-LABEL: @fold_mul_mixed
+func @fold_mul_mixed() -> !shape.size {
+  // CHECK: %[[RESULT:.*]] = shape.const_size 6
+  // CHECK: return %[[RESULT]] : !shape.size
+  %c2 = shape.const_size 2
+  %c3 = constant 3 : index
+  %result = shape.mul %c2, %c3 : !shape.size, index -> !shape.size
+  return %result : !shape.size
+}

From 8be0371eb79fb25ba22846410e1a31c88b4c89c2 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Fri, 24 Jul 2020 15:55:07 +0300
Subject: [PATCH 0010/1035] [MLIR][SPIRVToLLVM] Conversion of load and store
 SPIR-V ops

This patch introduces conversion patterns for `spv.Store` and `spv.Load`.
Only ops with the `Function` storage class are supported at the moment,
because `spv.GlobalVariable` has not been introduced yet.

If the op carries a memory access attribute, there are the following cases:
if the access is `Aligned`, the alignment is added to the op builder;
otherwise the conversion fails, since the remaining cases are not supported
yet and would require additional attributes for the `llvm.store`/`llvm.load`
ops, e.g. `volatile` and `nontemporal`.

Reviewed By: antiagainst

Differential Revision: https://reviews.llvm.org/D84236
---
 .../SPIRVToLLVM/ConvertSPIRVToLLVM.cpp        | 45 +++++++++++++++++
 .../SPIRVToLLVM/memory-ops-to-llvm.mlir       | 50 +++++++++++++++++++
 2 files changed, 95 insertions(+)

diff --git a/mlir/lib/Conversion/SPIRVToLLVM/ConvertSPIRVToLLVM.cpp b/mlir/lib/Conversion/SPIRVToLLVM/ConvertSPIRVToLLVM.cpp
index cb4898c346a65..12aa6573ef379 100644
--- a/mlir/lib/Conversion/SPIRVToLLVM/ConvertSPIRVToLLVM.cpp
+++ b/mlir/lib/Conversion/SPIRVToLLVM/ConvertSPIRVToLLVM.cpp
@@ -185,6 +185,25 @@ static Value createI32ConstantOf(Location loc, PatternRewriter &rewriter,
       rewriter.getIntegerAttr(rewriter.getI32Type(), value));
 }

+/// Utility for `spv.Load` and `spv.Store` conversion.
+static LogicalResult replaceWithLoadOrStore(Operation *op,
+                                            ConversionPatternRewriter &rewriter,
+                                            LLVMTypeConverter &typeConverter,
+                                            unsigned alignment) {
+  if (auto loadOp = dyn_cast<spirv::LoadOp>(op)) {
+    auto dstType = typeConverter.convertType(loadOp.getType());
+    if (!dstType)
+      return failure();
+    rewriter.replaceOpWithNewOp<LLVM::LoadOp>(loadOp, dstType, loadOp.ptr(),
+                                              alignment);
+    return success();
+  }
+  auto storeOp = cast<spirv::StoreOp>(op);
+  rewriter.replaceOpWithNewOp<LLVM::StoreOp>(storeOp, storeOp.value(),
+                                             storeOp.ptr(), alignment);
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // Type conversion
 //===----------------------------------------------------------------------===//
@@ -566,6 +585,31 @@ class IComparePattern : public SPIRVToLLVMConversion<SPIRVop> {
   }
 };

+/// Converts `spv.Load` and `spv.Store` to LLVM dialect.
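+/// For example (mirroring the tests added below in this patch), a load with
+/// the `Aligned` memory access attribute
+///   %1 = spv.Load "Function" %0 ["Aligned", 4] : f32
+/// is rewritten to
+///   %1 = llvm.load %0 {alignment = 4 : i64} : !llvm<"float*">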
+template <typename SPIRVop>
+class LoadStorePattern : public SPIRVToLLVMConversion<SPIRVop> {
+public:
+  using SPIRVToLLVMConversion<SPIRVop>::SPIRVToLLVMConversion;
+
+  LogicalResult
+  matchAndRewrite(SPIRVop op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    if (op.memory_access().hasValue() &&
+        op.memory_access().getValue() != spirv::MemoryAccess::None) {
+      auto memoryAccess = op.memory_access().getValue();
+      if (memoryAccess == spirv::MemoryAccess::Aligned) {
+        unsigned alignment = op.alignment().getValue().getZExtValue();
+        replaceWithLoadOrStore(op, rewriter, this->typeConverter, alignment);
+        return success();
+      }
+      // Other memory access attributes are not supported yet.
+      return failure();
+    }
+    replaceWithLoadOrStore(op, rewriter, this->typeConverter, 0);
+    return success();
+  }
+};
+
 /// Converts `spv.Not` and `spv.LogicalNot` into LLVM dialect.
 template <typename SPIRVop>
 class NotPattern : public SPIRVToLLVMConversion<SPIRVop> {
@@ -973,6 +1017,7 @@ void mlir::populateSPIRVToLLVMConversionPatterns(
       NotPattern<spirv::NotOp>,

       // Memory ops
+      LoadStorePattern<spirv::LoadOp>, LoadStorePattern<spirv::StoreOp>,
       VariablePattern,

       // Miscellaneous ops
diff --git a/mlir/test/Conversion/SPIRVToLLVM/memory-ops-to-llvm.mlir b/mlir/test/Conversion/SPIRVToLLVM/memory-ops-to-llvm.mlir
index 4c549ab8d619c..362c702e8d3c4 100644
--- a/mlir/test/Conversion/SPIRVToLLVM/memory-ops-to-llvm.mlir
+++ b/mlir/test/Conversion/SPIRVToLLVM/memory-ops-to-llvm.mlir
@@ -1,5 +1,55 @@
 // RUN: mlir-opt -convert-spirv-to-llvm %s | FileCheck %s

+//===----------------------------------------------------------------------===//
+// spv.Load
+//===----------------------------------------------------------------------===//
+
+func @load() {
+  %0 = spv.Variable : !spv.ptr<f32, Function>
+  // CHECK: %{{.*}} = llvm.load %{{.*}} : !llvm<"float*">
+  %1 = spv.Load "Function" %0 : f32
+  return
+}
+
+func @load_none() {
+  %0 = spv.Variable : !spv.ptr<f32, Function>
+  // CHECK: %{{.*}} = llvm.load %{{.*}} : !llvm<"float*">
+  %1 = spv.Load "Function" %0 ["None"] : f32
+  return
+}
+
+func @load_with_alignment() {
+  %0 = spv.Variable : !spv.ptr<f32, Function>
+  // CHECK: %{{.*}} = llvm.load %{{.*}} {alignment = 4 : i64} : !llvm<"float*">
+  %1 = spv.Load "Function" %0 ["Aligned", 4] : f32
+  return
+}
+
+//===----------------------------------------------------------------------===//
+// spv.Store
+//===----------------------------------------------------------------------===//
+
+func @store(%arg0 : f32) -> () {
+  %0 = spv.Variable : !spv.ptr<f32, Function>
+  // CHECK: llvm.store %{{.*}}, %{{.*}} : !llvm<"float*">
+  spv.Store "Function" %0, %arg0 : f32
+  return
+}
+
+func @store_composite(%arg0 : !spv.struct<f64>) -> () {
+  %0 = spv.Variable : !spv.ptr<!spv.struct<f64>, Function>
+  // CHECK: llvm.store %{{.*}}, %{{.*}} : !llvm<"<{ double }>*">
+  spv.Store "Function" %0, %arg0 : !spv.struct<f64>
+  return
+}
+
+func @store_with_alignment(%arg0 : f32) -> () {
+  %0 = spv.Variable : !spv.ptr<f32, Function>
+  // CHECK: llvm.store %{{.*}}, %{{.*}} {alignment = 4 : i64} : !llvm<"float*">
+  spv.Store "Function" %0, %arg0 ["Aligned", 4] : f32
+  return
+}
+
 //===----------------------------------------------------------------------===//
 // spv.Variable
 //===----------------------------------------------------------------------===//

From bb099c87abbfb78b2bdda395ea3f04d2a77c3082 Mon Sep 17 00:00:00 2001
From: David Truby
Date: Mon, 20 Jul 2020 12:11:26 +0100
Subject: [PATCH 0011/1035] [openmp] Don't copy exports into the source folder
 by default.

Additionally fix the copy if enabled on multi-config targets.
Summary:
This changes the copy command for libomp.so to use the output of the target
as the source of the copy, rather than trying to find it based on
${LIBOMP_LIBRARY_DIR}, which appears to be incorrect in multi-config
generator builds.

Reviewers: jdoerfert

Subscribers: mgorny, yaxunl, guansong, sstefan1, openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D84148
---
 openmp/runtime/CMakeLists.txt            | 2 +-
 openmp/runtime/cmake/LibompExports.cmake | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/openmp/runtime/CMakeLists.txt b/openmp/runtime/CMakeLists.txt
index 40a0fb9afee1d..e24528e9842fa 100644
--- a/openmp/runtime/CMakeLists.txt
+++ b/openmp/runtime/CMakeLists.txt
@@ -114,7 +114,7 @@ set(LIBOMP_FFLAGS "" CACHE STRING
 # Turning this to FALSE aids parallel builds to not interfere with each other.
 # Currently, the testsuite module expects the just built OpenMP library to be located inside the exports/
 # directory. TODO: have testsuite run under llvm-lit directly. We can then get rid of copying to exports/
-set(LIBOMP_COPY_EXPORTS TRUE CACHE STRING
+set(LIBOMP_COPY_EXPORTS FALSE CACHE STRING
     "Should exports be copied into source exports/ directory?")

 # HWLOC-support
diff --git a/openmp/runtime/cmake/LibompExports.cmake b/openmp/runtime/cmake/LibompExports.cmake
index f98de2631b831..96dab9f4a4657 100644
--- a/openmp/runtime/cmake/LibompExports.cmake
+++ b/openmp/runtime/cmake/LibompExports.cmake
@@ -78,7 +78,7 @@ if(NOT LIBOMP_OUTPUT_DIRECTORY)
 endif()
 add_custom_command(TARGET omp POST_BUILD
   COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBOMP_EXPORTS_LIB_DIR}
-  COMMAND ${CMAKE_COMMAND} -E copy ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE} ${LIBOMP_EXPORTS_LIB_DIR}
+  COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:omp> ${LIBOMP_EXPORTS_LIB_DIR}
 )

 # Copy Windows import library into exports/ directory post build

From 6b8948922c59c9f786996387b41d8e29ed3e7a8a Mon Sep 17 00:00:00 2001
From: Dmitry Preobrazhensky
Date: Fri, 24 Jul 2020 16:39:42 +0300
Subject: [PATCH 0012/1035] [AMDGPU][MC] Added support of SP3 syntax for MTBUF
 format modifier

Currently supported LLVM MTBUF syntax is shown below. It is not compatible
with SP3.

    op dst, addr, rsrc, FORMAT, soffset

This change adds support for SP3 syntax:

    op dst, addr, rsrc, soffset SP3FORMAT

In addition to being compatible with SP3, this syntax allows using symbolic
names for data, numeric and unified formats. Below is a list of added syntax
variants.

    format:<expression>
    format:[<data format>,<numeric format>]
    format:[<numeric format>,<data format>]
    format:[<data format>]
    format:[<numeric format>]
    format:[<unified format>]

The last syntax variant is supported for GFX10 only.
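As an example (taken from the test updates in this patch), on pre-GFX10
targets the pair dfmt:12, nfmt:2 is now spelled symbolically:

    tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16_16_16,BUF_NUM_FORMAT_USCALED]

and on GFX10 a unified format name such as format:[BUF_FMT_10_10_10_2_UNORM]
may be written instead of the numeric format:44.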
See llvm bug 37738 Reviewers: arsenm, rampitec, vpykhtin Differential Revision: https://reviews.llvm.org/D84026 --- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 189 ++++++- llvm/lib/Target/AMDGPU/BUFInstructions.td | 1 + .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 47 +- .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + llvm/lib/Target/AMDGPU/SIDefines.h | 146 ++++- .../Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp | 245 +++++++++ llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h | 14 + .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 81 +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 24 + .../llvm.amdgcn.raw.tbuffer.load.d16.ll | 18 +- .../AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll | 48 +- .../llvm.amdgcn.raw.tbuffer.store.d16.ll | 16 +- .../AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll | 36 +- .../llvm.amdgcn.struct.tbuffer.load.d16.ll | 16 +- .../AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll | 56 +- .../llvm.amdgcn.struct.tbuffer.store.d16.ll | 16 +- .../llvm.amdgcn.struct.tbuffer.store.ll | 52 +- .../AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll | 10 +- .../llvm.amdgcn.tbuffer.load.dwordx3.ll | 12 +- .../AMDGPU/llvm.amdgcn.tbuffer.load.ll | 28 +- .../AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll | 10 +- .../llvm.amdgcn.tbuffer.store.dwordx3.ll | 6 +- .../AMDGPU/llvm.amdgcn.tbuffer.store.ll | 26 +- llvm/test/MC/AMDGPU/buf-fmt-d16-packed.s | 16 +- llvm/test/MC/AMDGPU/buf-fmt-d16-unpacked.s | 16 +- llvm/test/MC/AMDGPU/mtbuf-gfx10.s | 501 ++++++++++++++++-- llvm/test/MC/AMDGPU/mtbuf.s | 326 ++++++++++-- .../AMDGPU/buf_fmt_packed_d16.txt | 16 +- .../AMDGPU/buf_fmt_unpacked_d16.txt | 16 +- .../MC/Disassembler/AMDGPU/mtbuf_gfx10.txt | 106 ++-- llvm/test/MC/Disassembler/AMDGPU/mtbuf_vi.txt | 81 ++- 31 files changed, 1823 insertions(+), 354 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 9f3a6ffc35e6f..2833875e438cd 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -689,6 +689,11 @@ class AMDGPUOperand : public MCParsedAsmOperand { return Imm.Val; } + void setImm(int64_t Val) { + assert(isImm()); + Imm.Val = Val; + } + ImmTy getImmTy() const { assert(isImm()); return Imm.Type; @@ -1297,8 +1302,13 @@ class AMDGPUAsmParser : public MCTargetAsmParser { OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands); OperandMatchResultTy parseDfmtNfmt(int64_t &Format); OperandMatchResultTy parseUfmt(int64_t &Format); + OperandMatchResultTy parseSymbolicSplitFormat(StringRef FormatStr, SMLoc Loc, int64_t &Format); + OperandMatchResultTy parseSymbolicUnifiedFormat(StringRef FormatStr, SMLoc Loc, int64_t &Format); OperandMatchResultTy parseFORMAT(OperandVector &Operands); + OperandMatchResultTy parseSymbolicOrNumericFormat(int64_t &Format); + OperandMatchResultTy parseNumericFormat(int64_t &Format); bool tryParseFmt(const char *Pref, int64_t MaxVal, int64_t &Val); + bool matchDfmtNfmt(int64_t &Dfmt, int64_t &Nfmt, StringRef FormatStr, SMLoc Loc); void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); void cvtDS(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, false); } @@ -1367,6 +1377,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool trySkipToken(const AsmToken::TokenKind Kind); bool skipToken(const AsmToken::TokenKind Kind, const StringRef ErrMsg); bool parseString(StringRef &Val, const StringRef ErrMsg = "expected a string"); + bool parseId(StringRef &Val, const StringRef ErrMsg); + void peekTokens(MutableArrayRef Tokens); 
AsmToken::TokenKind getTokenKind() const; bool parseExpr(int64_t &Imm); @@ -4926,8 +4938,8 @@ AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) { if (Dfmt == DFMT_UNDEF && Nfmt == NFMT_UNDEF) return MatchOperand_NoMatch; - Dfmt = (Dfmt == DFMT_UNDEF)? DFMT_DEFAULT : Dfmt; - Nfmt = (Nfmt == NFMT_UNDEF)? NFMT_DEFAULT : Nfmt; + Dfmt = (Dfmt == DFMT_UNDEF) ? DFMT_DEFAULT : Dfmt; + Nfmt = (Nfmt == NFMT_UNDEF) ? NFMT_DEFAULT : Nfmt; Format = encodeDfmtNfmt(Dfmt, Nfmt); return MatchOperand_Success; @@ -4949,20 +4961,177 @@ AMDGPUAsmParser::parseUfmt(int64_t &Format) { return MatchOperand_Success; } +bool AMDGPUAsmParser::matchDfmtNfmt(int64_t &Dfmt, + int64_t &Nfmt, + StringRef FormatStr, + SMLoc Loc) { + using namespace llvm::AMDGPU::MTBUFFormat; + int64_t Format; + + Format = getDfmt(FormatStr); + if (Format != DFMT_UNDEF) { + Dfmt = Format; + return true; + } + + Format = getNfmt(FormatStr, getSTI()); + if (Format != NFMT_UNDEF) { + Nfmt = Format; + return true; + } + + Error(Loc, "unsupported format"); + return false; +} + +OperandMatchResultTy +AMDGPUAsmParser::parseSymbolicSplitFormat(StringRef FormatStr, + SMLoc FormatLoc, + int64_t &Format) { + using namespace llvm::AMDGPU::MTBUFFormat; + + int64_t Dfmt = DFMT_UNDEF; + int64_t Nfmt = NFMT_UNDEF; + if (!matchDfmtNfmt(Dfmt, Nfmt, FormatStr, FormatLoc)) + return MatchOperand_ParseFail; + + if (trySkipToken(AsmToken::Comma)) { + StringRef Str; + SMLoc Loc = getLoc(); + if (!parseId(Str, "expected a format string") || + !matchDfmtNfmt(Dfmt, Nfmt, Str, Loc)) { + return MatchOperand_ParseFail; + } + if (Dfmt == DFMT_UNDEF) { + Error(Loc, "duplicate numeric format"); + } else if (Nfmt == NFMT_UNDEF){ + Error(Loc, "duplicate data format"); + } + } + + Dfmt = (Dfmt == DFMT_UNDEF) ? DFMT_DEFAULT : Dfmt; + Nfmt = (Nfmt == NFMT_UNDEF) ? 
NFMT_DEFAULT : Nfmt; + + if (isGFX10()) { + auto Ufmt = convertDfmtNfmt2Ufmt(Dfmt, Nfmt); + if (Ufmt == UFMT_UNDEF) + Error(FormatLoc, "unsupported format"); + Format = Ufmt; + } else { + Format = encodeDfmtNfmt(Dfmt, Nfmt); + } + + return MatchOperand_Success; +} + +OperandMatchResultTy +AMDGPUAsmParser::parseSymbolicUnifiedFormat(StringRef FormatStr, + SMLoc Loc, + int64_t &Format) { + using namespace llvm::AMDGPU::MTBUFFormat; + + auto Id = getUnifiedFormat(FormatStr); + if (Id == UFMT_UNDEF) + return MatchOperand_NoMatch; + + if (!isGFX10()) { + Error(Loc, "unified format is not supported on this GPU"); + return MatchOperand_ParseFail; + } + + Format = Id; + return MatchOperand_Success; +} + +OperandMatchResultTy +AMDGPUAsmParser::parseNumericFormat(int64_t &Format) { + using namespace llvm::AMDGPU::MTBUFFormat; + SMLoc Loc = getLoc(); + + if (!parseExpr(Format)) + return MatchOperand_ParseFail; + if (!isValidFormatEncoding(Format, getSTI())) { + Error(Loc, "out of range format"); + return MatchOperand_ParseFail; + } + + return MatchOperand_Success; +} + +OperandMatchResultTy +AMDGPUAsmParser::parseSymbolicOrNumericFormat(int64_t &Format) { + using namespace llvm::AMDGPU::MTBUFFormat; + + if (!trySkipId("format", AsmToken::Colon)) + return MatchOperand_NoMatch; + + if (trySkipToken(AsmToken::LBrac)) { + StringRef FormatStr; + SMLoc Loc = getLoc(); + if (!parseId(FormatStr, "expected a format string")) + return MatchOperand_ParseFail; + + auto Res = parseSymbolicUnifiedFormat(FormatStr, Loc, Format); + if (Res == MatchOperand_NoMatch) + Res = parseSymbolicSplitFormat(FormatStr, Loc, Format); + if (Res != MatchOperand_Success) + return Res; + + skipToken(AsmToken::RBrac, "expected a closing square bracket"); + return MatchOperand_Success; + } + + return parseNumericFormat(Format); +} + OperandMatchResultTy AMDGPUAsmParser::parseFORMAT(OperandVector &Operands) { using namespace llvm::AMDGPU::MTBUFFormat; - int64_t Format = isGFX10() ? UFMT_DEFAULT : DFMT_NFMT_DEFAULT; + int64_t Format = getDefaultFormatEncoding(getSTI()); OperandMatchResultTy Res; SMLoc Loc = getLoc(); + // Parse legacy format syntax. Res = isGFX10() ? parseUfmt(Format) : parseDfmtNfmt(Format); if (Res == MatchOperand_ParseFail) return Res; + bool FormatFound = (Res == MatchOperand_Success); + Operands.push_back( AMDGPUOperand::CreateImm(this, Format, Loc, AMDGPUOperand::ImmTyFORMAT)); + + if (FormatFound) + trySkipToken(AsmToken::Comma); + + if (isToken(AsmToken::EndOfStatement)) { + // We are expecting an soffset operand, + // but let matcher handle the error. + return MatchOperand_Success; + } + + // Parse soffset. 
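+  // In SP3 syntax the format modifier follows soffset, so soffset is
+  // consumed first and the format operand created above is patched
+  // afterwards (see the !FormatFound path below).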
+ Res = parseRegOrImm(Operands); + if (Res != MatchOperand_Success) + return Res; + + trySkipToken(AsmToken::Comma); + + if (!FormatFound) { + if (parseSymbolicOrNumericFormat(Format) == MatchOperand_Success) { + auto Size = Operands.size(); + AMDGPUOperand &Op = static_cast(*Operands[Size - 2]); + assert(Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyFORMAT); + Op.setImm(Format); + } + return MatchOperand_Success; + } + + if (isId("format") && peekToken().is(AsmToken::Colon)) { + Error(getLoc(), "duplicate format"); + return MatchOperand_ParseFail; + } return MatchOperand_Success; } @@ -5616,6 +5785,18 @@ AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) { } } +bool +AMDGPUAsmParser::parseId(StringRef &Val, const StringRef ErrMsg) { + if (isToken(AsmToken::Identifier)) { + Val = getTokenStr(); + lex(); + return true; + } else { + Error(getLoc(), ErrMsg); + return false; + } +} + AsmToken AMDGPUAsmParser::getToken() const { return Parser.getTok(); @@ -5623,7 +5804,7 @@ AMDGPUAsmParser::getToken() const { AsmToken AMDGPUAsmParser::peekToken() { - return getLexer().peekTok(); + return isToken(AsmToken::EndOfStatement) ? getToken() : getLexer().peekTok(); } void diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 370e9db9e83e9..d175edd93c642 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -114,6 +114,7 @@ class MTBUF_Real : let isCodeGenOnly = 0; // copy relevant pseudo op flags + let UseNamedOperandTable = ps.UseNamedOperandTable; let SubtargetPredicate = ps.SubtargetPredicate; let AsmMatchConverter = ps.AsmMatchConverter; let Constraints = ps.Constraints; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index e44c0194e811b..7a54f49c18b08 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -299,23 +299,48 @@ void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printFORMAT(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { +} + +void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI, + const MCSubtargetInfo &STI, + raw_ostream &O) { using namespace llvm::AMDGPU::MTBUFFormat; + int OpNo = + AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::format); + assert(OpNo != -1); + unsigned Val = MI->getOperand(OpNo).getImm(); if (AMDGPU::isGFX10(STI)) { if (Val == UFMT_DEFAULT) return; - O << " format:" << Val; + if (isValidUnifiedFormat(Val)) { + O << " format:[" << getUnifiedFormatName(Val) << ']'; + } else { + O << " format:" << Val; + } } else { if (Val == DFMT_NFMT_DEFAULT) return; - unsigned Dfmt; - unsigned Nfmt; - decodeDfmtNfmt(Val, Dfmt, Nfmt); - O << " dfmt:" << Dfmt; - O << ", nfmt:" << Nfmt; + if (isValidDfmtNfmt(Val, STI)) { + unsigned Dfmt; + unsigned Nfmt; + decodeDfmtNfmt(Val, Dfmt, Nfmt); + O << " format:["; + if (Dfmt != DFMT_DEFAULT) { + O << getDfmtName(Dfmt); + if (Nfmt != NFMT_DEFAULT) { + O << ','; + } + } + if (Nfmt != NFMT_DEFAULT) { + O << getNfmtName(Nfmt, STI); + } + O << ']'; + } else { + O << " format:" << Val; + } } - O << ','; } void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, @@ -682,6 +707,14 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, printDefaultVccOperand(OpNo, STI, O); break; } + + if (Desc.TSFlags & SIInstrFlags::MTBUF) { + int 
SOffsetIdx = + AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::soffset); + assert(SOffsetIdx != -1); + if ((int)OpNo == SOffsetIdx) + printSymbolicFormat(MI, STI, O); + } } void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 78a66a78ec341..ed45c5309ea25 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -99,6 +99,8 @@ class AMDGPUInstPrinter : public MCInstPrinter { const MCSubtargetInfo &STI, raw_ostream &O); void printFORMAT(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printSymbolicFormat(const MCInst *MI, + const MCSubtargetInfo &STI, raw_ostream &O); void printRegOperand(unsigned RegNo, raw_ostream &O); void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 9c9dd66a4a79c..58d77f3b224b5 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -394,27 +394,56 @@ enum ModeRegisterMasks : uint32_t { namespace MTBUFFormat { -enum DataFormat { - DFMT_MAX = 15, +enum DataFormat : int64_t { + DFMT_INVALID = 0, + DFMT_8, + DFMT_16, + DFMT_8_8, + DFMT_32, + DFMT_16_16, + DFMT_10_11_11, + DFMT_11_11_10, + DFMT_10_10_10_2, + DFMT_2_10_10_10, + DFMT_8_8_8_8, + DFMT_32_32, + DFMT_16_16_16_16, + DFMT_32_32_32, + DFMT_32_32_32_32, + DFMT_RESERVED_15, + + DFMT_MIN = DFMT_INVALID, + DFMT_MAX = DFMT_RESERVED_15, DFMT_UNDEF = -1, - DFMT_DEFAULT = 1, + DFMT_DEFAULT = DFMT_8, DFMT_SHIFT = 0, - DFMT_MASK = DFMT_MAX + DFMT_MASK = 0xF }; -enum NumFormat { - NFMT_MAX = 7, +enum NumFormat : int64_t { + NFMT_UNORM = 0, + NFMT_SNORM, + NFMT_USCALED, + NFMT_SSCALED, + NFMT_UINT, + NFMT_SINT, + NFMT_RESERVED_6, // VI and GFX9 + NFMT_SNORM_OGL = NFMT_RESERVED_6, // SI and CI only + NFMT_FLOAT, + + NFMT_MIN = NFMT_UNORM, + NFMT_MAX = NFMT_FLOAT, NFMT_UNDEF = -1, - NFMT_DEFAULT = 0, + NFMT_DEFAULT = NFMT_UNORM, NFMT_SHIFT = 4, - NFMT_MASK = NFMT_MAX + NFMT_MASK = 7 }; -enum MergedFormat { +enum MergedFormat : int64_t { DFMT_NFMT_UNDEF = -1, DFMT_NFMT_DEFAULT = ((DFMT_DEFAULT & DFMT_MASK) << DFMT_SHIFT) | ((NFMT_DEFAULT & NFMT_MASK) << NFMT_SHIFT), @@ -425,11 +454,106 @@ enum MergedFormat { DFMT_NFMT_MAX = DFMT_NFMT_MASK }; -enum UnifiedFormat { +enum UnifiedFormat : int64_t { + UFMT_INVALID = 0, + + UFMT_8_UNORM, + UFMT_8_SNORM, + UFMT_8_USCALED, + UFMT_8_SSCALED, + UFMT_8_UINT, + UFMT_8_SINT, + + UFMT_16_UNORM, + UFMT_16_SNORM, + UFMT_16_USCALED, + UFMT_16_SSCALED, + UFMT_16_UINT, + UFMT_16_SINT, + UFMT_16_FLOAT, + + UFMT_8_8_UNORM, + UFMT_8_8_SNORM, + UFMT_8_8_USCALED, + UFMT_8_8_SSCALED, + UFMT_8_8_UINT, + UFMT_8_8_SINT, + + UFMT_32_UINT, + UFMT_32_SINT, + UFMT_32_FLOAT, + + UFMT_16_16_UNORM, + UFMT_16_16_SNORM, + UFMT_16_16_USCALED, + UFMT_16_16_SSCALED, + UFMT_16_16_UINT, + UFMT_16_16_SINT, + UFMT_16_16_FLOAT, + + UFMT_10_11_11_UNORM, + UFMT_10_11_11_SNORM, + UFMT_10_11_11_USCALED, + UFMT_10_11_11_SSCALED, + UFMT_10_11_11_UINT, + UFMT_10_11_11_SINT, + UFMT_10_11_11_FLOAT, + + UFMT_11_11_10_UNORM, + UFMT_11_11_10_SNORM, + UFMT_11_11_10_USCALED, + UFMT_11_11_10_SSCALED, + UFMT_11_11_10_UINT, + UFMT_11_11_10_SINT, + UFMT_11_11_10_FLOAT, + + UFMT_10_10_10_2_UNORM, + UFMT_10_10_10_2_SNORM, + UFMT_10_10_10_2_USCALED, + UFMT_10_10_10_2_SSCALED, + UFMT_10_10_10_2_UINT, + UFMT_10_10_10_2_SINT, + + 
UFMT_2_10_10_10_UNORM, + UFMT_2_10_10_10_SNORM, + UFMT_2_10_10_10_USCALED, + UFMT_2_10_10_10_SSCALED, + UFMT_2_10_10_10_UINT, + UFMT_2_10_10_10_SINT, + + UFMT_8_8_8_8_UNORM, + UFMT_8_8_8_8_SNORM, + UFMT_8_8_8_8_USCALED, + UFMT_8_8_8_8_SSCALED, + UFMT_8_8_8_8_UINT, + UFMT_8_8_8_8_SINT, + + UFMT_32_32_UINT, + UFMT_32_32_SINT, + UFMT_32_32_FLOAT, + + UFMT_16_16_16_16_UNORM, + UFMT_16_16_16_16_SNORM, + UFMT_16_16_16_16_USCALED, + UFMT_16_16_16_16_SSCALED, + UFMT_16_16_16_16_UINT, + UFMT_16_16_16_16_SINT, + UFMT_16_16_16_16_FLOAT, + + UFMT_32_32_32_UINT, + UFMT_32_32_32_SINT, + UFMT_32_32_32_FLOAT, + UFMT_32_32_32_32_UINT, + UFMT_32_32_32_32_SINT, + UFMT_32_32_32_32_FLOAT, + + UFMT_FIRST = UFMT_INVALID, + UFMT_LAST = UFMT_32_32_32_32_FLOAT, + UFMT_MAX = 127, UFMT_UNDEF = -1, - UFMT_DEFAULT = 1 + UFMT_DEFAULT = UFMT_8_UNORM }; } // namespace MTBUFFormat diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 5819a621f55d6..1d645bda4b3fa 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// #include "AMDGPUAsmUtils.h" +#include "SIDefines.h" namespace llvm { namespace AMDGPU { @@ -87,6 +88,250 @@ const char* const IdSymbolic[] = { } // namespace Hwreg +namespace MTBUFFormat { + +StringLiteral const DfmtSymbolic[] = { + "BUF_DATA_FORMAT_INVALID", + "BUF_DATA_FORMAT_8", + "BUF_DATA_FORMAT_16", + "BUF_DATA_FORMAT_8_8", + "BUF_DATA_FORMAT_32", + "BUF_DATA_FORMAT_16_16", + "BUF_DATA_FORMAT_10_11_11", + "BUF_DATA_FORMAT_11_11_10", + "BUF_DATA_FORMAT_10_10_10_2", + "BUF_DATA_FORMAT_2_10_10_10", + "BUF_DATA_FORMAT_8_8_8_8", + "BUF_DATA_FORMAT_32_32", + "BUF_DATA_FORMAT_16_16_16_16", + "BUF_DATA_FORMAT_32_32_32", + "BUF_DATA_FORMAT_32_32_32_32", + "BUF_DATA_FORMAT_RESERVED_15" +}; + +StringLiteral const NfmtSymbolicGFX10[] = { + "BUF_NUM_FORMAT_UNORM", + "BUF_NUM_FORMAT_SNORM", + "BUF_NUM_FORMAT_USCALED", + "BUF_NUM_FORMAT_SSCALED", + "BUF_NUM_FORMAT_UINT", + "BUF_NUM_FORMAT_SINT", + "", + "BUF_NUM_FORMAT_FLOAT" +}; + +StringLiteral const NfmtSymbolicSICI[] = { + "BUF_NUM_FORMAT_UNORM", + "BUF_NUM_FORMAT_SNORM", + "BUF_NUM_FORMAT_USCALED", + "BUF_NUM_FORMAT_SSCALED", + "BUF_NUM_FORMAT_UINT", + "BUF_NUM_FORMAT_SINT", + "BUF_NUM_FORMAT_SNORM_OGL", + "BUF_NUM_FORMAT_FLOAT" +}; + +StringLiteral const NfmtSymbolicVI[] = { // VI and GFX9 + "BUF_NUM_FORMAT_UNORM", + "BUF_NUM_FORMAT_SNORM", + "BUF_NUM_FORMAT_USCALED", + "BUF_NUM_FORMAT_SSCALED", + "BUF_NUM_FORMAT_UINT", + "BUF_NUM_FORMAT_SINT", + "BUF_NUM_FORMAT_RESERVED_6", + "BUF_NUM_FORMAT_FLOAT" +}; + +StringLiteral const UfmtSymbolic[] = { + "BUF_FMT_INVALID", + + "BUF_FMT_8_UNORM", + "BUF_FMT_8_SNORM", + "BUF_FMT_8_USCALED", + "BUF_FMT_8_SSCALED", + "BUF_FMT_8_UINT", + "BUF_FMT_8_SINT", + + "BUF_FMT_16_UNORM", + "BUF_FMT_16_SNORM", + "BUF_FMT_16_USCALED", + "BUF_FMT_16_SSCALED", + "BUF_FMT_16_UINT", + "BUF_FMT_16_SINT", + "BUF_FMT_16_FLOAT", + + "BUF_FMT_8_8_UNORM", + "BUF_FMT_8_8_SNORM", + "BUF_FMT_8_8_USCALED", + "BUF_FMT_8_8_SSCALED", + "BUF_FMT_8_8_UINT", + "BUF_FMT_8_8_SINT", + + "BUF_FMT_32_UINT", + "BUF_FMT_32_SINT", + "BUF_FMT_32_FLOAT", + + "BUF_FMT_16_16_UNORM", + "BUF_FMT_16_16_SNORM", + "BUF_FMT_16_16_USCALED", + "BUF_FMT_16_16_SSCALED", + "BUF_FMT_16_16_UINT", + "BUF_FMT_16_16_SINT", + "BUF_FMT_16_16_FLOAT", + + "BUF_FMT_10_11_11_UNORM", + "BUF_FMT_10_11_11_SNORM", + "BUF_FMT_10_11_11_USCALED", + "BUF_FMT_10_11_11_SSCALED", + 
"BUF_FMT_10_11_11_UINT", + "BUF_FMT_10_11_11_SINT", + "BUF_FMT_10_11_11_FLOAT", + + "BUF_FMT_11_11_10_UNORM", + "BUF_FMT_11_11_10_SNORM", + "BUF_FMT_11_11_10_USCALED", + "BUF_FMT_11_11_10_SSCALED", + "BUF_FMT_11_11_10_UINT", + "BUF_FMT_11_11_10_SINT", + "BUF_FMT_11_11_10_FLOAT", + + "BUF_FMT_10_10_10_2_UNORM", + "BUF_FMT_10_10_10_2_SNORM", + "BUF_FMT_10_10_10_2_USCALED", + "BUF_FMT_10_10_10_2_SSCALED", + "BUF_FMT_10_10_10_2_UINT", + "BUF_FMT_10_10_10_2_SINT", + + "BUF_FMT_2_10_10_10_UNORM", + "BUF_FMT_2_10_10_10_SNORM", + "BUF_FMT_2_10_10_10_USCALED", + "BUF_FMT_2_10_10_10_SSCALED", + "BUF_FMT_2_10_10_10_UINT", + "BUF_FMT_2_10_10_10_SINT", + + "BUF_FMT_8_8_8_8_UNORM", + "BUF_FMT_8_8_8_8_SNORM", + "BUF_FMT_8_8_8_8_USCALED", + "BUF_FMT_8_8_8_8_SSCALED", + "BUF_FMT_8_8_8_8_UINT", + "BUF_FMT_8_8_8_8_SINT", + + "BUF_FMT_32_32_UINT", + "BUF_FMT_32_32_SINT", + "BUF_FMT_32_32_FLOAT", + + "BUF_FMT_16_16_16_16_UNORM", + "BUF_FMT_16_16_16_16_SNORM", + "BUF_FMT_16_16_16_16_USCALED", + "BUF_FMT_16_16_16_16_SSCALED", + "BUF_FMT_16_16_16_16_UINT", + "BUF_FMT_16_16_16_16_SINT", + "BUF_FMT_16_16_16_16_FLOAT", + + "BUF_FMT_32_32_32_UINT", + "BUF_FMT_32_32_32_SINT", + "BUF_FMT_32_32_32_FLOAT", + "BUF_FMT_32_32_32_32_UINT", + "BUF_FMT_32_32_32_32_SINT", + "BUF_FMT_32_32_32_32_FLOAT" +}; + +unsigned const DfmtNfmt2UFmt[] = { + DFMT_INVALID | (NFMT_UNORM << NFMT_SHIFT), + + DFMT_8 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_8 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_8 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_8 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_8 | (NFMT_UINT << NFMT_SHIFT), + DFMT_8 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_16 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_16 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_16 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_16 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_16 | (NFMT_UINT << NFMT_SHIFT), + DFMT_16 | (NFMT_SINT << NFMT_SHIFT), + DFMT_16 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_8_8 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_8_8 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_8_8 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_8_8 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_8_8 | (NFMT_UINT << NFMT_SHIFT), + DFMT_8_8 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_32 | (NFMT_UINT << NFMT_SHIFT), + DFMT_32 | (NFMT_SINT << NFMT_SHIFT), + DFMT_32 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_16_16 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_16_16 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_16_16 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_16_16 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_16_16 | (NFMT_UINT << NFMT_SHIFT), + DFMT_16_16 | (NFMT_SINT << NFMT_SHIFT), + DFMT_16_16 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_10_11_11 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_10_11_11 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_10_11_11 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_10_11_11 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_10_11_11 | (NFMT_UINT << NFMT_SHIFT), + DFMT_10_11_11 | (NFMT_SINT << NFMT_SHIFT), + DFMT_10_11_11 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_11_11_10 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_11_11_10 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_11_11_10 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_11_11_10 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_11_11_10 | (NFMT_UINT << NFMT_SHIFT), + DFMT_11_11_10 | (NFMT_SINT << NFMT_SHIFT), + DFMT_11_11_10 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_10_10_10_2 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_10_10_10_2 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_10_10_10_2 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_10_10_10_2 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_10_10_10_2 | (NFMT_UINT << NFMT_SHIFT), + DFMT_10_10_10_2 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_2_10_10_10 | (NFMT_UNORM << 
NFMT_SHIFT), + DFMT_2_10_10_10 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_2_10_10_10 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_2_10_10_10 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_2_10_10_10 | (NFMT_UINT << NFMT_SHIFT), + DFMT_2_10_10_10 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_8_8_8_8 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_UINT << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_32_32 | (NFMT_UINT << NFMT_SHIFT), + DFMT_32_32 | (NFMT_SINT << NFMT_SHIFT), + DFMT_32_32 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_16_16_16_16 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_UINT << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_SINT << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_32_32_32 | (NFMT_UINT << NFMT_SHIFT), + DFMT_32_32_32 | (NFMT_SINT << NFMT_SHIFT), + DFMT_32_32_32 | (NFMT_FLOAT << NFMT_SHIFT), + DFMT_32_32_32_32 | (NFMT_UINT << NFMT_SHIFT), + DFMT_32_32_32_32 | (NFMT_SINT << NFMT_SHIFT), + DFMT_32_32_32_32 | (NFMT_FLOAT << NFMT_SHIFT) +}; + +} // namespace MTBUFFormat + namespace Swizzle { // This must be in sync with llvm::AMDGPU::Swizzle::Id enum members, see SIDefines.h. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h index cd91c5f6edd56..a6053735d7956 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h @@ -9,8 +9,11 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H +#include "llvm/ADT/StringRef.h" + namespace llvm { namespace AMDGPU { + namespace SendMsg { // Symbolic names for the sendmsg(...) syntax. extern const char* const IdSymbolic[]; @@ -25,6 +28,17 @@ extern const char* const IdSymbolic[]; } // namespace Hwreg +namespace MTBUFFormat { + +extern StringLiteral const DfmtSymbolic[]; +extern StringLiteral const NfmtSymbolicGFX10[]; +extern StringLiteral const NfmtSymbolicSICI[]; +extern StringLiteral const NfmtSymbolicVI[]; +extern StringLiteral const UfmtSymbolic[]; +extern unsigned const DfmtNfmt2UFmt[]; + +} // namespace MTBUFFormat + namespace Swizzle { // Symbolic names for the swizzle(...) syntax. 
extern const char* const IdSymbolic[]; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b89e34e4c99c1..497a04e25e628 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -789,6 +789,52 @@ void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width) namespace MTBUFFormat { +int64_t getDfmt(const StringRef Name) { + for (int Id = DFMT_MIN; Id <= DFMT_MAX; ++Id) { + if (Name == DfmtSymbolic[Id]) + return Id; + } + return DFMT_UNDEF; +} + +StringRef getDfmtName(unsigned Id) { + assert(Id <= DFMT_MAX); + return DfmtSymbolic[Id]; +} + +static StringLiteral const *getNfmtLookupTable(const MCSubtargetInfo &STI) { + if (isSI(STI) || isCI(STI)) + return NfmtSymbolicSICI; + if (isVI(STI) || isGFX9(STI)) + return NfmtSymbolicVI; + return NfmtSymbolicGFX10; +} + +int64_t getNfmt(const StringRef Name, const MCSubtargetInfo &STI) { + auto lookupTable = getNfmtLookupTable(STI); + for (int Id = NFMT_MIN; Id <= NFMT_MAX; ++Id) { + if (Name == lookupTable[Id]) + return Id; + } + return NFMT_UNDEF; +} + +StringRef getNfmtName(unsigned Id, const MCSubtargetInfo &STI) { + assert(Id <= NFMT_MAX); + return getNfmtLookupTable(STI)[Id]; +} + +bool isValidDfmtNfmt(unsigned Id, const MCSubtargetInfo &STI) { + unsigned Dfmt; + unsigned Nfmt; + decodeDfmtNfmt(Id, Dfmt, Nfmt); + return isValidNfmt(Nfmt, STI); +} + +bool isValidNfmt(unsigned Id, const MCSubtargetInfo &STI) { + return !getNfmtName(Id, STI).empty(); +} + int64_t encodeDfmtNfmt(unsigned Dfmt, unsigned Nfmt) { return (Dfmt << DFMT_SHIFT) | (Nfmt << NFMT_SHIFT); } @@ -798,6 +844,41 @@ void decodeDfmtNfmt(unsigned Format, unsigned &Dfmt, unsigned &Nfmt) { Nfmt = (Format >> NFMT_SHIFT) & NFMT_MASK; } +int64_t getUnifiedFormat(const StringRef Name) { + for (int Id = UFMT_FIRST; Id <= UFMT_LAST; ++Id) { + if (Name == UfmtSymbolic[Id]) + return Id; + } + return UFMT_UNDEF; +} + +StringRef getUnifiedFormatName(unsigned Id) { + return isValidUnifiedFormat(Id) ? UfmtSymbolic[Id] : ""; +} + +bool isValidUnifiedFormat(unsigned Id) { + return Id <= UFMT_LAST; +} + +int64_t convertDfmtNfmt2Ufmt(unsigned Dfmt, unsigned Nfmt) { + int64_t Fmt = encodeDfmtNfmt(Dfmt, Nfmt); + for (int Id = UFMT_FIRST; Id <= UFMT_LAST; ++Id) { + if (Fmt == DfmtNfmt2UFmt[Id]) + return Id; + } + return UFMT_UNDEF; +} + +bool isValidFormatEncoding(unsigned Val, const MCSubtargetInfo &STI) { + return isGFX10(STI) ? 
(Val <= UFMT_MAX) : (Val <= DFMT_NFMT_MAX); +} + +unsigned getDefaultFormatEncoding(const MCSubtargetInfo &STI) { + if (isGFX10(STI)) + return UFMT_DEFAULT; + return DFMT_NFMT_DEFAULT; +} + } // namespace MTBUFFormat //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index a9ea05755a676..d250cc2ec03db 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -489,6 +489,30 @@ int64_t encodeDfmtNfmt(unsigned Dfmt, unsigned Nfmt); void decodeDfmtNfmt(unsigned Format, unsigned &Dfmt, unsigned &Nfmt); +int64_t getDfmt(const StringRef Name); + +StringRef getDfmtName(unsigned Id); + +int64_t getNfmt(const StringRef Name, const MCSubtargetInfo &STI); + +StringRef getNfmtName(unsigned Id, const MCSubtargetInfo &STI); + +bool isValidDfmtNfmt(unsigned Val, const MCSubtargetInfo &STI); + +bool isValidNfmt(unsigned Val, const MCSubtargetInfo &STI); + +int64_t getUnifiedFormat(const StringRef Name); + +StringRef getUnifiedFormatName(unsigned Id); + +bool isValidUnifiedFormat(unsigned Val); + +int64_t convertDfmtNfmt2Ufmt(unsigned Dfmt, unsigned Nfmt); + +bool isValidFormatEncoding(unsigned Val, const MCSubtargetInfo &STI); + +unsigned getDefaultFormatEncoding(const MCSubtargetInfo &STI); + } // namespace MTBUFFormat namespace SendMsg { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll index 0d3e3b7efa258..db7949f540964 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll @@ -4,8 +4,8 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX10,GFX10-PACKED %s ; GCN-LABEL: {{^}}tbuffer_load_d16_x: -; PREGFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 -; GFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], format:22, 0 +; PREGFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; GFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) { main_body: %data = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0) @@ -13,11 +13,11 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_d16_xy: -; PREGFX10-UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; PREGFX10-UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -; PREGFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 -; GFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], format:22, 0 +; PREGFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; GFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] define amdgpu_ps half @tbuffer_load_d16_xy(<4 x 
i32> inreg %rsrc) { main_body: @@ -27,12 +27,12 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw: -; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 -; GFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], format:22, 0 +; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; GFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -; PREGFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 -; GFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], format:22, 0 +; PREGFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; GFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll index 1aff4b9bc7e2f..462c9dcf1a239 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll @@ -3,14 +3,14 @@ ;RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=GFX10 %s ; GCN-LABEL: {{^}}tbuffer_load: -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:3, 0 glc -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 slc -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 glc -; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:78, 0 -; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:63, 0 glc -; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:22, 0 slc -; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:22, 0 glc dlc +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] glc +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] slc +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] glc +; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 +; GFX10-DAG: tbuffer_load_format_xyzw 
{{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_SINT] glc +; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] slc +; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] glc dlc ; GCN: s_waitcnt define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { main_body: @@ -29,8 +29,8 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_immoffs: -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42 -; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:78, 0 offset:42 +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42 +; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 offset:42 define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) { main_body: %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 42, i32 0, i32 78, i32 0) @@ -39,12 +39,12 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_immoffs_large -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:2, 61 offset:4095 -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:3, {{s[0-9]+}} offset:73 -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, {{s[0-9]+}} offset:1 -; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:47, 61 offset:4095 -; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:62, {{s[0-9]+}} offset:73 -; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:77, {{s[0-9]+}} offset:1 +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] offset:4095 +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_SSCALED] offset:73 +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] offset:1 +; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_FMT_10_10_10_2_SSCALED] offset:4095 +; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_UINT] offset:73 +; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_32_32_FLOAT] offset:1 ; GCN: s_waitcnt define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) { %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 4095, i32 61, i32 47, i32 0) @@ -60,8 +60,8 @@ define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_l } ; GCN-LABEL: {{^}}tbuffer_load_ofs: -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen -; GFX10: tbuffer_load_format_xyzw 
{{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, format:78, 0 offen +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offen +; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 offen define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) { main_body: %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 %voffs, i32 0, i32 78, i32 0) @@ -70,8 +70,8 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_ofs_imm: -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:52 -; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, format:78, 0 offen offset:52 +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offen offset:52 +; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 offen offset:52 define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) { main_body: %ofs = add i32 %voffs, 52 @@ -81,8 +81,8 @@ main_body: } ; GCN-LABEL: {{^}}buffer_load_xy: -; PREGFX10: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 -; GFX10: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:77, 0 +; PREGFX10: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] +; GFX10: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_32_32_FLOAT] define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { %vdata = call <2 x i32> @llvm.amdgcn.raw.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 77, i32 0) %vdata.f = bitcast <2 x i32> %vdata to <2 x float> @@ -90,8 +90,8 @@ define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { } ; GCN-LABEL: {{^}}buffer_load_x: -; PREGFX10: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 -; GFX10: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:77, 0 +; PREGFX10: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] +; GFX10: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_32_32_FLOAT] define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) { %vdata = call i32 @llvm.amdgcn.raw.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 77, i32 0) %vdata.f = bitcast i32 %vdata to float diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll index b337714f27b1c..5041cf3197342 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll @@ -8,8 +8,8 @@ ; GCN-DAG: s_load_dwordx4 ; GCN-DAG: s_load_dword s[[S_LO:[0-9]+]] ; GCN-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]] -; PREGFX10: tbuffer_store_format_d16_x v[[V_LO]], off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 -; GFX10: tbuffer_store_format_d16_x v[[V_LO]], off, s[{{[0-9]+:[0-9]+}}], format:33, 0 +; PREGFX10: tbuffer_store_format_d16_x v[[V_LO]], off, s[{{[0-9]+:[0-9]+}}], 0 
format:[BUF_NUM_FORMAT_USCALED] +; GFX10: tbuffer_store_format_d16_x v[[V_LO]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { main_body: call void @llvm.amdgcn.raw.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) @@ -22,10 +22,10 @@ main_body: ; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] -; PREGFX10-UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 +; PREGFX10-UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] -; PREGFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 -; GFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], format:33, 0 +; PREGFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] +; GFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data) { main_body: call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) @@ -43,13 +43,13 @@ main_body: ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] -; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 +; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] -; PREGFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 -; GFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], format:33, 0 +; PREGFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] +; GFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data) { main_body: call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll index 9c88de9c4f88e..6bc79150b8af5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll @@ -3,14 +3,14 @@ ;RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s ; GCN-LABEL: {{^}}tbuffer_store: -; PREGFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:12, nfmt:2, 0 -; PREGFX10: tbuffer_store_format_xyzw v[4:7], off, s[0:3], dfmt:13, nfmt:3, 0 glc -; PREGFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 slc -; PREGFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 glc -; GFX10: tbuffer_store_format_xyzw 
v[0:3], off, s[0:3], format:44, 0 -; GFX10: tbuffer_store_format_xyzw v[4:7], off, s[0:3], format:61, 0 glc -; GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], format:78, 0 slc -; GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], format:78, 0 glc dlc +; PREGFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16_16_16,BUF_NUM_FORMAT_USCALED] +; PREGFX10: tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_SSCALED] glc +; PREGFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] slc +; PREGFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] glc +; GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_UNORM] +; GFX10: tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_FMT_8_8_8_8_SINT] glc +; GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 slc +; GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 glc dlc define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { main_body: %in1 = bitcast <4 x float> %1 to <4 x i32> @@ -24,8 +24,8 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_store_immoffs: -; PREGFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42 -; GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], format:117, 0 offset:42 +; PREGFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42 +; GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:117 offset:42 define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) { main_body: %in1 = bitcast <4 x float> %1 to <4 x i32> @@ -34,8 +34,8 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs: -; PREGFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, {{s[0-9]+}} offset:42 -; GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], format:117, {{s[0-9]+}} offset:42 +; PREGFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], {{s[0-9]+}} format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42 +; GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], {{s[0-9]+}} format:117 offset:42 define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -44,8 +44,8 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_ofs: -; PREGFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:3, nfmt:7, 0 offen -; GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], format:115, 0 offen +; PREGFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_8_8,BUF_NUM_FORMAT_FLOAT] offen +; GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:115 offen define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -54,8 +54,8 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_x1: -; PREGFX10: tbuffer_store_format_x v0, off, s[0:3], dfmt:13, nfmt:7, 0 -; GFX10: tbuffer_store_format_x v0, off, s[0:3], format:125, 0 +; PREGFX10: tbuffer_store_format_x v0, off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_FLOAT] +; GFX10: tbuffer_store_format_x v0, off, s[0:3], 0 format:125 define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data) { main_body: %data.i = bitcast float %data to 
i32 @@ -64,8 +64,8 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_x2: -; PREGFX10: tbuffer_store_format_xy v[0:1], off, s[0:3], dfmt:1, nfmt:2, 0 -; GFX10: tbuffer_store_format_xy v[0:1], off, s[0:3], format:33, 0 +; PREGFX10: tbuffer_store_format_xy v[0:1], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] +; GFX10: tbuffer_store_format_xy v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data) { main_body: %data.i = bitcast <2 x float> %data to <2 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll index 36aa5012a17c9..2fd21a10564d4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll @@ -5,8 +5,8 @@ ; GCN-LABEL: {{^}}tbuffer_load_d16_x: ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 -; PREGFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen -; GFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], format:22, 0 idxen +; PREGFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen +; GFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] idxen define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) { main_body: %data = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0) @@ -15,11 +15,11 @@ main_body: ; GCN-LABEL: {{^}}tbuffer_load_d16_xy: ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 -; PREGFX10-UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen +; PREGFX10-UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen ; PREGFX10-UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -; PREGFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen -; GFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], format:22, 0 idxen +; PREGFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen +; GFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] idxen ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) { main_body: @@ -30,11 +30,11 @@ main_body: ; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw: ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 -; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen +; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen ; PREGFX10-UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -; PREGFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen -; GFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, 
[[ZEROREG]], s[{{[0-9]+:[0-9]+}}], format:22, 0 idxen +; PREGFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen +; GFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] idxen ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll index 1150347650c8d..ac18761534ea2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll @@ -4,14 +4,14 @@ ; GCN-LABEL: {{^}}tbuffer_load: ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:3, 0 idxen glc -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 idxen slc -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 idxen glc -; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, format:78, 0 idxen -; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, format:63, 0 idxen glc -; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, format:22, 0 idxen slc -; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, format:22, 0 idxen glc dlc +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] idxen glc +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen slc +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen glc +; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen +; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_SINT] idxen glc +; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] idxen slc +; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] idxen glc dlc ; GCN: s_waitcnt define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { main_body: @@ -31,8 +31,8 @@ main_body: ; GCN-LABEL: {{^}}tbuffer_load_immoffs: ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:42 -; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], 
{{s\[[0-9]+:[0-9]+\]}}, format:78, 0 idxen offset:42 +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:42 +; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen offset:42 define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) { main_body: %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 78, i32 0) @@ -42,12 +42,12 @@ main_body: ; GCN-LABEL: {{^}}tbuffer_load_immoffs_large ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:2, 61 idxen offset:4095 -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:3, {{s[0-9]+}} idxen offset:73 -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, {{s[0-9]+}} idxen offset:1 -; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, format:47, 61 idxen offset:4095 -; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, format:62, {{s[0-9]+}} idxen offset:73 -; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, format:77, {{s[0-9]+}} idxen offset:1 +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] idxen offset:4095 +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_SSCALED] idxen offset:73 +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:1 +; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_FMT_10_10_10_2_SSCALED] idxen offset:4095 +; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_UINT] idxen offset:73 +; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:1 ; GCN: s_waitcnt define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) { %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 4095, i32 61, i32 47, i32 0) @@ -63,8 +63,8 @@ define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_l } ; GCN-LABEL: {{^}}tbuffer_load_idx: -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen -; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, format:78, 0 idxen +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen +; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) { main_body: %vdata = call <4 x i32> 
@llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 78, i32 0) @@ -73,8 +73,8 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_ofs: -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen -; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, format:78, 0 idxen offen +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offen +; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen offen define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) { main_body: %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 78, i32 0) @@ -83,8 +83,8 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_ofs_imm: -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen offset:52 -; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, format:78, 0 idxen offen offset:52 +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offen offset:52 +; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen offen offset:52 define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) { main_body: %ofs = add i32 %voffs, 52 @@ -94,8 +94,8 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_both: -; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen -; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, format:78, 0 idxen offen +; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offen +; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen offen define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i32 %voffs) { main_body: %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 78, i32 0) @@ -105,8 +105,8 @@ main_body: ; GCN-LABEL: {{^}}buffer_load_xy: -; PREGFX10: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 idxen -; GFX10: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, format:77, 0 idxen +; PREGFX10: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen +; GFX10: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { %vdata = call <2 x i32> @llvm.amdgcn.struct.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0) %vdata.f = bitcast <2 x i32> %vdata to <2 x float> @@ -114,8 +114,8 @@ define amdgpu_vs <2 x float> @buffer_load_xy(<4 x 
i32> inreg %rsrc) { } ; GCN-LABEL: {{^}}buffer_load_x: -; PREGFX10: tbuffer_load_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 idxen -; GFX10: tbuffer_load_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, format:77, 0 idxen +; PREGFX10: tbuffer_load_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen +; GFX10: tbuffer_load_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) { %vdata = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0) %vdata.f = bitcast i32 %vdata to float diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll index 519eef4cb5b7c..ca78b29cc8f53 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -8,8 +8,8 @@ ; GCN-DAG: s_load_dwordx4 ; GCN-DAG: s_load_dword{{[x0-2]*}} s{{\[}}[[S_LO:[0-9]+]] ; GCN-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]] -; PREGFX10: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen -; GFX10: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], format:33, 0 idxen +; PREGFX10: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen +; GFX10: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) { main_body: call void @llvm.amdgcn.struct.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) @@ -22,10 +22,10 @@ main_body: ; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] -; PREGFX10-UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +; PREGFX10-UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen -; PREGFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen -; GFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], format:33, 0 idxen +; PREGFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen +; GFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) { main_body: call void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) @@ -43,12 +43,12 @@ main_body: ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] -; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, 
s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] -; PREGFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen -; GFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], format:33, 0 idxen +; PREGFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen +; GFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { main_body: call void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll index 254ad80c2bd9b..24247d320c88a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll @@ -4,14 +4,14 @@ ; GCN-LABEL: {{^}}tbuffer_store: ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 -; PREGFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], dfmt:12, nfmt:2, 0 idxen -; PREGFX10: tbuffer_store_format_xyzw v[4:7], [[ZEROREG]], s[0:3], dfmt:13, nfmt:3, 0 idxen glc -; PREGFX10: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], dfmt:14, nfmt:4, 0 idxen slc -; PREGFX10: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], dfmt:14, nfmt:4, 0 idxen glc -; GFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], format:44, 0 idxen -; GFX10: tbuffer_store_format_xyzw v[4:7], [[ZEROREG]], s[0:3], format:61, 0 idxen glc -; GFX10: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], format:78, 0 idxen slc -; GFX10: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], format:78, 0 idxen glc dlc +; PREGFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], 0 format:[BUF_DATA_FORMAT_16_16_16_16,BUF_NUM_FORMAT_USCALED] idxen +; PREGFX10: tbuffer_store_format_xyzw v[4:7], [[ZEROREG]], s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_SSCALED] idxen glc +; PREGFX10: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen slc +; PREGFX10: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen glc +; GFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], 0 format:[BUF_FMT_10_10_10_2_UNORM] idxen +; GFX10: tbuffer_store_format_xyzw v[4:7], [[ZEROREG]], s[0:3], 0 format:[BUF_FMT_8_8_8_8_SINT] idxen glc +; GFX10: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], 0 format:78 idxen slc +; GFX10: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], 0 format:78 idxen glc dlc define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { main_body: %in1 = bitcast <4 x float> %1 to <4 x i32> @@ -26,8 +26,8 @@ main_body: ; GCN-LABEL: {{^}}tbuffer_store_immoffs: ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 -; PREGFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], dfmt:5, nfmt:7, 0 idxen offset:42 -; GFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], format:117, 0 idxen offset:42 +; PREGFX10: 
tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] idxen offset:42 +; GFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], 0 format:117 idxen offset:42 define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) { main_body: %in1 = bitcast <4 x float> %1 to <4 x i32> @@ -37,8 +37,8 @@ main_body: ; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs: ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 -; PREGFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], dfmt:5, nfmt:7, {{s[0-9]+}} idxen offset:42 -; GFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], format:117, {{s[0-9]+}} idxen offset:42 +; PREGFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], {{s[0-9]+}} format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] idxen offset:42 +; GFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], {{s[0-9]+}} format:117 idxen offset:42 define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -47,8 +47,8 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_idx: -; PREGFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:2, 0 idxen -; GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], format:47, 0 idxen +; PREGFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] idxen +; GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SSCALED] idxen define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -57,8 +57,8 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_ofs: -; PREGFX10: tbuffer_store_format_xyzw v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], dfmt:3, nfmt:7, 0 idxen offen -; GFX10: tbuffer_store_format_xyzw v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], format:115, 0 idxen offen +; PREGFX10: tbuffer_store_format_xyzw v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 format:[BUF_DATA_FORMAT_8_8,BUF_NUM_FORMAT_FLOAT] idxen offen +; GFX10: tbuffer_store_format_xyzw v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 format:115 idxen offen define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -67,8 +67,8 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_both: -; PREGFX10: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], dfmt:6, nfmt:4, 0 idxen offen -; GFX10: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], format:70, 0 idxen offen +; PREGFX10: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_UINT] idxen offen +; GFX10: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_16_16_16_16_SINT] idxen offen define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -79,13 +79,13 @@ main_body: ; Ideally, the register allocator would avoid the wait here ; ; GCN-LABEL: {{^}}buffer_store_wait: -; PREGFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:3, 0 idxen -; GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], format:63, 0 idxen +; PREGFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] idxen +; GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_SINT] idxen ; 
VERDE: s_waitcnt expcnt(0) ; GCN: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen ; GCN: s_waitcnt vmcnt(0) -; PREGFX10: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], dfmt:14, nfmt:2, 0 idxen -; GFX10: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], format:46, 0 idxen +; PREGFX10: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_USCALED] idxen +; GFX10: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_FMT_10_10_10_2_USCALED] idxen define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -97,8 +97,8 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_x1: -; PREGFX10: tbuffer_store_format_x v0, v1, s[0:3], dfmt:13, nfmt:7, 0 idxen -; GFX10: tbuffer_store_format_x v0, v1, s[0:3], format:125, 0 idxen +; PREGFX10: tbuffer_store_format_x v0, v1, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen +; GFX10: tbuffer_store_format_x v0, v1, s[0:3], 0 format:125 idxen define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { main_body: %data.i = bitcast float %data to i32 @@ -107,8 +107,8 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_x2: -; PREGFX10: tbuffer_store_format_xy v[0:1], v2, s[0:3], dfmt:1, nfmt:2, 0 idxen -; GFX10: tbuffer_store_format_xy v[0:1], v2, s[0:3], format:33, 0 idxen +; PREGFX10: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen +; GFX10: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %vindex) { main_body: %data.i = bitcast <2 x float> %data to <2 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll index 8850acae1b9cc..205cc5f78d335 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s ; GCN-LABEL: {{^}}tbuffer_load_d16_x: -; GCN: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; GCN: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) { main_body: %data = call half @llvm.amdgcn.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) @@ -11,10 +11,10 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_d16_xy: -; UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -; PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) { main_body: @@ -24,10 +24,10 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw: -; 
UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -; PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll index 0193d97398910..b04f3c09729a0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll @@ -2,8 +2,8 @@ ;RUN: llc < %s -march=amdgcn -mcpu=gfx700 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GCNX3 ; GCN-LABEL: {{^}}tbuffer_raw_load_immoffs_x3: -; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42 -; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42 +; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42 +; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42 define amdgpu_vs <3 x float> @tbuffer_raw_load_immoffs_x3(<4 x i32> inreg) { main_body: %vdata = call <3 x i32> @llvm.amdgcn.raw.tbuffer.load.v3i32(<4 x i32> %0, i32 42, i32 0, i32 78, i32 0) @@ -14,8 +14,8 @@ main_body: ; GCN-LABEL: {{^}}tbuffer_struct_load_immoffs_x3: ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 -; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:42 -; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:42 +; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:42 +; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:42 define amdgpu_vs <3 x float> @tbuffer_struct_load_immoffs_x3(<4 x i32> inreg) { main_body: %vdata = call <3 x i32> @llvm.amdgcn.struct.tbuffer.load.v3i32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 78, i32 0) @@ -25,8 +25,8 @@ main_body: ; GCN-LABEL: {{^}}tbuffer_load_format_immoffs_x3: -; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42 -; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42 +; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42 +; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42 define amdgpu_vs <3 x float> 
@tbuffer_load_format_immoffs_x3(<4 x i32> inreg) { main_body: %vdata = call <3 x i32> @llvm.amdgcn.tbuffer.load.v3i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll index 712ee7ad1e5cb..aca9e0aaf9a26 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll @@ -2,10 +2,10 @@ ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}tbuffer_load: -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:3, 0 glc -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 slc -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] glc +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] slc +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; GCN: s_waitcnt define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { main_body: @@ -24,7 +24,7 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_immoffs: -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42 define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) { main_body: %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0) @@ -33,9 +33,9 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_immoffs_large -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:2, 61 offset:4095 -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:3, {{s[0-9]+}} offset:73 -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, {{s[0-9]+}} offset:1 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] offset:4095 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_SSCALED] offset:73 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] offset:1 ; GCN: s_waitcnt define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) { %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 61, i32 4095, i32 15, i32 2, i1 0, i1 0) @@ -51,7 +51,7 @@ define amdgpu_vs {<4 x 
float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_l } ; GCN-LABEL: {{^}}tbuffer_load_idx: -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) { main_body: %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) @@ -60,7 +60,7 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_ofs: -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offen define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) { main_body: %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) @@ -69,7 +69,7 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_ofs_imm: -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:52 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offen offset:52 define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) { main_body: %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 52, i32 14, i32 4, i1 0, i1 0) @@ -78,7 +78,7 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_both: -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offen define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i32 %voffs) { main_body: %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) @@ -88,7 +88,7 @@ main_body: ; GCN-LABEL: {{^}}buffer_load_xy: -; GCN: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 +; GCN: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { %vdata = call <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0) %vdata.f = bitcast <2 x i32> %vdata to <2 x float> @@ -96,7 +96,7 @@ define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { } ; GCN-LABEL: {{^}}buffer_load_x: -; GCN: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 +; GCN: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) { %vdata = call i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0) %vdata.f = bitcast i32 %vdata to float diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll index 85aaee3dd2e38..4dd76a3a632dc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}tbuffer_store_d16_x: ; GCN: s_load_dword s[[S_LO:[0-9]+]] ; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]] -; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %vindex) { main_body: call void @llvm.amdgcn.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) @@ -19,9 +19,9 @@ main_body: ; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] -; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen -; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) { main_body: call void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) @@ -39,11 +39,11 @@ main_body: ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] -; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] -; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { main_body: call void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll index a39614c1cf052..dc5abf418131c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll @@ -1,7 +1,7 @@ ;RUN: llc < %s -march=amdgcn -mcpu=gfx700 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN ; GCN-LABEL: {{^}}tbuffer_raw_store_immoffs_x3: -; GCN: tbuffer_store_format_xyz v[0:2], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42 +; GCN: tbuffer_store_format_xyz v[0:2], off, 
s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42 define amdgpu_ps void @tbuffer_raw_store_immoffs_x3(<4 x i32> inreg, <3 x float>) { main_body: %in1 = bitcast <3 x float> %1 to <3 x i32> @@ -12,7 +12,7 @@ main_body: ; GCN-LABEL: {{^}}tbuffer_struct_store_immoffs_x3: ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 -; GCN: tbuffer_store_format_xyz v[0:2], [[ZEROREG]], s[0:3], dfmt:5, nfmt:7, 0 idxen offset:42 +; GCN: tbuffer_store_format_xyz v[0:2], [[ZEROREG]], s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] idxen offset:42 define amdgpu_ps void @tbuffer_struct_store_immoffs_x3(<4 x i32> inreg, <3 x float>) { main_body: %in1 = bitcast <3 x float> %1 to <3 x i32> @@ -21,7 +21,7 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_store_immoffs_x3: -; GCN: tbuffer_store_format_xyz v[0:2], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42 +; GCN: tbuffer_store_format_xyz v[0:2], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42 define amdgpu_ps void @tbuffer_store_immoffs_x3(<4 x i32> inreg, <3 x float>) { main_body: %in1 = bitcast <3 x float> %1 to <3 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll index 4caa8081530ba..ed31ea3c40fea 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll @@ -2,10 +2,10 @@ ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}tbuffer_store: -; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:12, nfmt:2, 0 -; GCN: tbuffer_store_format_xyzw v[4:7], off, s[0:3], dfmt:13, nfmt:3, 0 glc -; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 slc -; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 +; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16_16_16,BUF_NUM_FORMAT_USCALED] +; GCN: tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_SSCALED] glc +; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] slc +; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { main_body: %in1 = bitcast <4 x float> %1 to <4 x i32> @@ -19,7 +19,7 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_store_immoffs: -; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42 +; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42 define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) { main_body: %in1 = bitcast <4 x float> %1 to <4 x i32> @@ -28,7 +28,7 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs: -; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, {{s[0-9]+}} offset:42 +; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], {{s[0-9]+}} format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42 define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -37,7 +37,7 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_idx: -; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:2, 0 idxen +; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 
format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] idxen define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -46,7 +46,7 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_ofs: -; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:3, nfmt:7, 0 offen +; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_8_8,BUF_NUM_FORMAT_FLOAT] offen define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -55,7 +55,7 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_both: -; GCN: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], dfmt:6, nfmt:4, 0 idxen offen +; GCN: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_UINT] idxen offen define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -66,11 +66,11 @@ main_body: ; Ideally, the register allocator would avoid the wait here ; ; GCN-LABEL: {{^}}buffer_store_wait: -; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:3, 0 idxen +; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] idxen ; VERDE: s_waitcnt expcnt(0) ; GCN: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen ; GCN: s_waitcnt vmcnt(0) -; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], dfmt:14, nfmt:2, 0 idxen +; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_USCALED] idxen define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -82,7 +82,7 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_x1: -; GCN: tbuffer_store_format_x v0, v1, s[0:3], dfmt:13, nfmt:7, 0 idxen +; GCN: tbuffer_store_format_x v0, v1, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { main_body: %data.i = bitcast float %data to i32 @@ -91,7 +91,7 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_x2: -; GCN: tbuffer_store_format_xy v[0:1], v2, s[0:3], dfmt:1, nfmt:2, 0 idxen +; GCN: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %vindex) { main_body: %data.i = bitcast <2 x float> %data to <2 x i32> diff --git a/llvm/test/MC/AMDGPU/buf-fmt-d16-packed.s b/llvm/test/MC/AMDGPU/buf-fmt-d16-packed.s index 108e709e29e0b..196dcada2ebea 100644 --- a/llvm/test/MC/AMDGPU/buf-fmt-d16-packed.s +++ b/llvm/test/MC/AMDGPU/buf-fmt-d16-packed.s @@ -44,31 +44,31 @@ buffer_store_format_d16_xyzw v[1:2], off, s[4:7], s1 //===----------------------------------------------------------------------===// tbuffer_load_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 -// PACKED: tbuffer_load_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7c,0xe9,0x00,0x01,0x01,0x01] +// PACKED: tbuffer_load_format_d16_x v1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7c,0xe9,0x00,0x01,0x01,0x01] tbuffer_load_format_d16_xy v1, off, s[4:7], dfmt:15, nfmt:2, s1 -// PACKED: tbuffer_load_format_d16_xy v1, off, s[4:7], 
dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7c,0xe9,0x00,0x01,0x01,0x01] +// PACKED: tbuffer_load_format_d16_xy v1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7c,0xe9,0x00,0x01,0x01,0x01] // UNPACKED-ERR: error: instruction not supported on this GPU tbuffer_load_format_d16_xyz v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 -// PACKED: tbuffer_load_format_d16_xyz v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7d,0xe9,0x00,0x01,0x01,0x01] +// PACKED: tbuffer_load_format_d16_xyz v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7d,0xe9,0x00,0x01,0x01,0x01] // UNPACKED-ERR: error: instruction not supported on this GPU tbuffer_load_format_d16_xyzw v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 -// PACKED: tbuffer_load_format_d16_xyzw v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7d,0xe9,0x00,0x01,0x01,0x01] +// PACKED: tbuffer_load_format_d16_xyzw v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7d,0xe9,0x00,0x01,0x01,0x01] // UNPACKED-ERR: error: instruction not supported on this GPU tbuffer_store_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 -// PACKED: tbuffer_store_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7e,0xe9,0x00,0x01,0x01,0x01] +// PACKED: tbuffer_store_format_d16_x v1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7e,0xe9,0x00,0x01,0x01,0x01] tbuffer_store_format_d16_xy v1, off, s[4:7], dfmt:15, nfmt:2, s1 -// PACKED: tbuffer_store_format_d16_xy v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7e,0xe9,0x00,0x01,0x01,0x01] +// PACKED: tbuffer_store_format_d16_xy v1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7e,0xe9,0x00,0x01,0x01,0x01] // UNPACKED-ERR: error: instruction not supported on this GPU tbuffer_store_format_d16_xyz v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 -// PACKED: tbuffer_store_format_d16_xyz v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x01,0x01] +// PACKED: tbuffer_store_format_d16_xyz v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x01,0x01] // UNPACKED-ERR: error: instruction not supported on this GPU tbuffer_store_format_d16_xyzw v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 -// PACKED: tbuffer_store_format_d16_xyzw v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7f,0xe9,0x00,0x01,0x01,0x01] +// PACKED: tbuffer_store_format_d16_xyzw v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7f,0xe9,0x00,0x01,0x01,0x01] // UNPACKED-ERR: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/buf-fmt-d16-unpacked.s b/llvm/test/MC/AMDGPU/buf-fmt-d16-unpacked.s index 8db9f491e035a..15cfb225b8b55 100644 --- a/llvm/test/MC/AMDGPU/buf-fmt-d16-unpacked.s +++ b/llvm/test/MC/AMDGPU/buf-fmt-d16-unpacked.s @@ -43,31 +43,31 @@ buffer_store_format_d16_xyzw v[1:4], off, s[4:7], s1 //===----------------------------------------------------------------------===// tbuffer_load_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 -// UNPACKED: tbuffer_load_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7c,0xe9,0x00,0x01,0x01,0x01] +// UNPACKED: tbuffer_load_format_d16_x v1, off, s[4:7], s1 
format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7c,0xe9,0x00,0x01,0x01,0x01] tbuffer_load_format_d16_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 -// UNPACKED: tbuffer_load_format_d16_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7c,0xe9,0x00,0x01,0x01,0x01] +// UNPACKED: tbuffer_load_format_d16_xy v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7c,0xe9,0x00,0x01,0x01,0x01] // PACKED-ERR: error: instruction not supported on this GPU tbuffer_load_format_d16_xyz v[1:3], off, s[4:7], dfmt:15, nfmt:2, s1 -// UNPACKED: tbuffer_load_format_d16_xyz v[1:3], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7d,0xe9,0x00,0x01,0x01,0x01] +// UNPACKED: tbuffer_load_format_d16_xyz v[1:3], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7d,0xe9,0x00,0x01,0x01,0x01] // PACKED-ERR: error: instruction not supported on this GPU tbuffer_load_format_d16_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 -// UNPACKED: tbuffer_load_format_d16_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7d,0xe9,0x00,0x01,0x01,0x01] +// UNPACKED: tbuffer_load_format_d16_xyzw v[1:4], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7d,0xe9,0x00,0x01,0x01,0x01] // PACKED-ERR: error: instruction not supported on this GPU tbuffer_store_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 -// UNPACKED: tbuffer_store_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7e,0xe9,0x00,0x01,0x01,0x01] +// UNPACKED: tbuffer_store_format_d16_x v1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7e,0xe9,0x00,0x01,0x01,0x01] tbuffer_store_format_d16_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 -// UNPACKED: tbuffer_store_format_d16_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7e,0xe9,0x00,0x01,0x01,0x01] +// UNPACKED: tbuffer_store_format_d16_xy v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7e,0xe9,0x00,0x01,0x01,0x01] // PACKED-ERR: error: instruction not supported on this GPU tbuffer_store_format_d16_xyz v[1:3], off, s[4:7], dfmt:15, nfmt:2, s1 -// UNPACKED: tbuffer_store_format_d16_xyz v[1:3], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x01,0x01] +// UNPACKED: tbuffer_store_format_d16_xyz v[1:3], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x01,0x01] // PACKED-ERR: error: instruction not supported on this GPU tbuffer_store_format_d16_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 -// UNPACKED: tbuffer_store_format_d16_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7f,0xe9,0x00,0x01,0x01,0x01] +// UNPACKED: tbuffer_store_format_d16_xyzw v[1:4], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7f,0xe9,0x00,0x01,0x01,0x01] // PACKED-ERR: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/mtbuf-gfx10.s b/llvm/test/MC/AMDGPU/mtbuf-gfx10.s index 40f082d02ebce..8ea86e7de9657 100644 --- a/llvm/test/MC/AMDGPU/mtbuf-gfx10.s +++ b/llvm/test/MC/AMDGPU/mtbuf-gfx10.s @@ -1,80 +1,116 @@ // RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=GFX10 %s // RUN: not llvm-mc -arch=amdgcn 
-mcpu=gfx1010 -show-encoding %s 2>&1 | FileCheck -check-prefix=GFX10-ERR %s -// GFX10: tbuffer_load_format_d16_x v0, off, s[0:3], format:22, 0 ; encoding: [0x00,0x00,0xb0,0xe8,0x00,0x00,0x20,0x80] +//===----------------------------------------------------------------------===// +// Positive tests for legacy format syntax. +//===----------------------------------------------------------------------===// + +// GFX10: tbuffer_load_format_d16_x v0, off, s[0:3], 0 format:[BUF_FMT_32_FLOAT] ; encoding: [0x00,0x00,0xb0,0xe8,0x00,0x00,0x20,0x80] tbuffer_load_format_d16_x v0, off, s[0:3], format:22, 0 -// GFX10: tbuffer_load_format_d16_xy v0, off, s[0:3], format:22, 0 ; encoding: [0x00,0x00,0xb1,0xe8,0x00,0x00,0x20,0x80] + +// GFX10: tbuffer_load_format_d16_xy v0, off, s[0:3], 0 format:[BUF_FMT_32_FLOAT] ; encoding: [0x00,0x00,0xb1,0xe8,0x00,0x00,0x20,0x80] tbuffer_load_format_d16_xy v0, off, s[0:3], format:22, 0 -// GFX10: tbuffer_load_format_d16_xyzw v[0:1], off, s[0:3], format:22, 0 ; encoding: [0x00,0x00,0xb3,0xe8,0x00,0x00,0x20,0x80] + +// GFX10: tbuffer_load_format_d16_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_32_FLOAT] ; encoding: [0x00,0x00,0xb3,0xe8,0x00,0x00,0x20,0x80] tbuffer_load_format_d16_xyzw v[0:1], off, s[0:3], format:22, 0 -// GFX10: tbuffer_load_format_xyzw v[0:3], off, s[0:3], format:78, 0 ; encoding: [0x00,0x00,0x73,0xea,0x00,0x00,0x00,0x80] + +// GFX10: tbuffer_load_format_xyzw v[0:3], off, s[0:3], 0 format:78 ; encoding: [0x00,0x00,0x73,0xea,0x00,0x00,0x00,0x80] tbuffer_load_format_xyzw v[0:3], off, s[0:3], format:78, 0 -// GFX10: tbuffer_load_format_xyzw v[8:11], off, s[0:3], format:22, 0 slc ; encoding: [0x00,0x00,0xb3,0xe8,0x00,0x08,0x40,0x80] + +// GFX10: tbuffer_load_format_xyzw v[8:11], off, s[0:3], 0 format:[BUF_FMT_32_FLOAT] slc ; encoding: [0x00,0x00,0xb3,0xe8,0x00,0x08,0x40,0x80] tbuffer_load_format_xyzw v[8:11], off, s[0:3], format:22, 0 slc -// GFX10: tbuffer_load_format_xyzw v[4:7], off, s[0:3], format:63, 0 glc ; encoding: [0x00,0x40,0xfb,0xe9,0x00,0x04,0x00,0x80] + +// GFX10: tbuffer_load_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_FMT_32_32_SINT] glc ; encoding: [0x00,0x40,0xfb,0xe9,0x00,0x04,0x00,0x80] tbuffer_load_format_xyzw v[4:7], off, s[0:3], format:63, 0 glc -// GFX10: tbuffer_load_format_xyzw v[12:15], off, s[0:3], format:23, 0 glc dlc ; encoding: [0x00,0xc0,0xbb,0xe8,0x00,0x0c,0x00,0x80] + +// GFX10: tbuffer_load_format_xyzw v[12:15], off, s[0:3], 0 format:[BUF_FMT_16_16_UNORM] glc dlc ; encoding: [0x00,0xc0,0xbb,0xe8,0x00,0x0c,0x00,0x80] tbuffer_load_format_xyzw v[12:15], off, s[0:3], format:23, 0 glc dlc -// GFX10: tbuffer_load_format_xyzw v[0:3], off, s[0:3], format:78, 0 offset:42 ; encoding: [0x2a,0x00,0x73,0xea,0x00,0x00,0x00,0x80] + +// GFX10: tbuffer_load_format_xyzw v[0:3], off, s[0:3], 0 format:78 offset:42 ; encoding: [0x2a,0x00,0x73,0xea,0x00,0x00,0x00,0x80] tbuffer_load_format_xyzw v[0:3], off, s[0:3], format:78, 0 offset:42 -// GFX10: tbuffer_load_format_xyzw v[4:7], off, s[0:3], format:62, s4 offset:73 ; encoding: [0x49,0x00,0xf3,0xe9,0x00,0x04,0x00,0x04] + +// GFX10: tbuffer_load_format_xyzw v[4:7], off, s[0:3], s4 format:[BUF_FMT_32_32_UINT] offset:73 ; encoding: [0x49,0x00,0xf3,0xe9,0x00,0x04,0x00,0x04] tbuffer_load_format_xyzw v[4:7], off, s[0:3], format:62, s4 offset:73 -// GFX10: tbuffer_load_format_xyzw v[0:3], off, s[0:3], format:47, 61 offset:4095 ; encoding: [0xff,0x0f,0x7b,0xe9,0x00,0x00,0x00,0xbd] + +// GFX10: tbuffer_load_format_xyzw v[0:3], off, s[0:3], 61 format:[BUF_FMT_10_10_10_2_SSCALED] offset:4095 ; encoding: 
[0xff,0x0f,0x7b,0xe9,0x00,0x00,0x00,0xbd] tbuffer_load_format_xyzw v[0:3], off, s[0:3], format:47, 61 offset:4095 -// GFX10: tbuffer_load_format_xyzw v[8:11], off, s[0:3], format:77, s4 offset:1 ; encoding: [0x01,0x00,0x6b,0xea,0x00,0x08,0x00,0x04] + +// GFX10: tbuffer_load_format_xyzw v[8:11], off, s[0:3], s4 format:[BUF_FMT_32_32_32_32_FLOAT] offset:1 ; encoding: [0x01,0x00,0x6b,0xea,0x00,0x08,0x00,0x04] tbuffer_load_format_xyzw v[8:11], off, s[0:3], format:77, s4 offset:1 -// GFX10: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], format:78, 0 idxen ; encoding: [0x00,0x20,0x73,0xea,0x00,0x00,0x00,0x80] + +// GFX10: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:78 idxen ; encoding: [0x00,0x20,0x73,0xea,0x00,0x00,0x00,0x80] tbuffer_load_format_xyzw v[0:3], v0, s[0:3], format:78, 0 idxen -// GFX10: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], format:78, 0 offen ; encoding: [0x00,0x10,0x73,0xea,0x00,0x00,0x00,0x80] + +// GFX10: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:78 offen ; encoding: [0x00,0x10,0x73,0xea,0x00,0x00,0x00,0x80] tbuffer_load_format_xyzw v[0:3], v0, s[0:3], format:78, 0 offen -// GFX10: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], format:78, 0 offen offset:52 ; encoding: [0x34,0x10,0x73,0xea,0x00,0x00,0x00,0x80] + +// GFX10: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:78 offen offset:52 ; encoding: [0x34,0x10,0x73,0xea,0x00,0x00,0x00,0x80] tbuffer_load_format_xyzw v[0:3], v0, s[0:3], format:78, 0 offen offset:52 -// GFX10: tbuffer_load_format_xyzw v[0:3], v[0:1], s[0:3], format:78, 0 idxen offen ; encoding: [0x00,0x30,0x73,0xea,0x00,0x00,0x00,0x80] + +// GFX10: tbuffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 format:78 idxen offen ; encoding: [0x00,0x30,0x73,0xea,0x00,0x00,0x00,0x80] tbuffer_load_format_xyzw v[0:3], v[0:1], s[0:3], format:78, 0 idxen offen -// GFX10: tbuffer_load_format_xy v[0:1], off, s[0:3], format:77, 0 ; encoding: [0x00,0x00,0x69,0xea,0x00,0x00,0x00,0x80] + +// GFX10: tbuffer_load_format_xy v[0:1], off, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] ; encoding: [0x00,0x00,0x69,0xea,0x00,0x00,0x00,0x80] tbuffer_load_format_xy v[0:1], off, s[0:3], format:77, 0 -// GFX10: tbuffer_load_format_x v0, off, s[0:3], format:77, 0 ; encoding: [0x00,0x00,0x68,0xea,0x00,0x00,0x00,0x80] + +// GFX10: tbuffer_load_format_x v0, off, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] ; encoding: [0x00,0x00,0x68,0xea,0x00,0x00,0x00,0x80] tbuffer_load_format_x v0, off, s[0:3], format:77, 0 -// GFX10: tbuffer_store_format_d16_x v0, v1, s[4:7], format:33, 0 idxen ; encoding: [0x00,0x20,0x0c,0xe9,0x01,0x00,0x21,0x80] + +// GFX10: tbuffer_store_format_d16_x v0, v1, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; encoding: [0x00,0x20,0x0c,0xe9,0x01,0x00,0x21,0x80] tbuffer_store_format_d16_x v0, v1, s[4:7], format:33, 0 idxen -// GFX10: tbuffer_store_format_d16_xy v0, v1, s[4:7], format:33, 0 idxen ; encoding: [0x00,0x20,0x0d,0xe9,0x01,0x00,0x21,0x80] + +// GFX10: tbuffer_store_format_d16_xy v0, v1, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; encoding: [0x00,0x20,0x0d,0xe9,0x01,0x00,0x21,0x80] tbuffer_store_format_d16_xy v0, v1, s[4:7], format:33, 0 idxen -// GFX10: tbuffer_store_format_d16_xyzw v[0:1], v2, s[4:7], format:33, 0 idxen ; encoding: [0x00,0x20,0x0f,0xe9,0x02,0x00,0x21,0x80] + +// GFX10: tbuffer_store_format_d16_xyzw v[0:1], v2, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; encoding: [0x00,0x20,0x0f,0xe9,0x02,0x00,0x21,0x80] tbuffer_store_format_d16_xyzw v[0:1], v2, s[4:7], format:33, 0 idxen -// GFX10: tbuffer_store_format_xyzw 
v[0:3], off, s[0:3], format:44, 0 ; encoding: [0x00,0x00,0x67,0xe9,0x00,0x00,0x00,0x80] + +// GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_UNORM] ; encoding: [0x00,0x00,0x67,0xe9,0x00,0x00,0x00,0x80] tbuffer_store_format_xyzw v[0:3], off, s[0:3], format:44, 0 -// GFX10: tbuffer_store_format_xyzw v[4:7], off, s[0:3], format:61, 0 glc ; encoding: [0x00,0x40,0xef,0xe9,0x00,0x04,0x00,0x80] + +// GFX10: tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_FMT_8_8_8_8_SINT] glc ; encoding: [0x00,0x40,0xef,0xe9,0x00,0x04,0x00,0x80] tbuffer_store_format_xyzw v[4:7], off, s[0:3], format:61, 0 glc -// GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], format:78, 0 slc ; encoding: [0x00,0x00,0x77,0xea,0x00,0x08,0x40,0x80] + +// GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 slc ; encoding: [0x00,0x00,0x77,0xea,0x00,0x08,0x40,0x80] tbuffer_store_format_xyzw v[8:11], off, s[0:3], format:78, 0 slc -// GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], format:78, 0 ; encoding: [0x00,0x00,0x77,0xea,0x00,0x08,0x00,0x80] + +// GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 ; encoding: [0x00,0x00,0x77,0xea,0x00,0x08,0x00,0x80] tbuffer_store_format_xyzw v[8:11], off, s[0:3], format:78, 0 -// GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], format:117, 0 offset:42 ; encoding: [0x2a,0x00,0xaf,0xeb,0x00,0x00,0x00,0x80] + +// GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:117 offset:42 ; encoding: [0x2a,0x00,0xaf,0xeb,0x00,0x00,0x00,0x80] tbuffer_store_format_xyzw v[0:3], off, s[0:3], format:117, 0 offset:42 -// GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], format:117, s4 offset:42 ; encoding: [0x2a,0x00,0xaf,0xeb,0x00,0x00,0x00,0x04] + +// GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], s4 format:117 offset:42 ; encoding: [0x2a,0x00,0xaf,0xeb,0x00,0x00,0x00,0x04] tbuffer_store_format_xyzw v[0:3], off, s[0:3], format:117, s4 offset:42 -// GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], format:47, 0 idxen ; encoding: [0x00,0x20,0x7f,0xe9,0x04,0x00,0x00,0x80] + +// GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SSCALED] idxen ; encoding: [0x00,0x20,0x7f,0xe9,0x04,0x00,0x00,0x80] tbuffer_store_format_xyzw v[0:3], v4, s[0:3], format:47, 0 idxen -// GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], format:115, 0 offen ; encoding: [0x00,0x10,0x9f,0xeb,0x04,0x00,0x00,0x80] + +// GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:115 offen ; encoding: [0x00,0x10,0x9f,0xeb,0x04,0x00,0x00,0x80] tbuffer_store_format_xyzw v[0:3], v4, s[0:3], format:115, 0 offen -// GFX10: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], format:70, 0 idxen offen ; encoding: [0x00,0x30,0x37,0xea,0x04,0x00,0x00,0x80] + +// GFX10: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_16_16_16_16_SINT] idxen offen ; encoding: [0x00,0x30,0x37,0xea,0x04,0x00,0x00,0x80] tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], format:70, 0 idxen offen -// GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], format:63, 0 idxen ; encoding: [0x00,0x20,0xff,0xe9,0x04,0x00,0x00,0x80] + +// GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_SINT] idxen ; encoding: [0x00,0x20,0xff,0xe9,0x04,0x00,0x00,0x80] tbuffer_store_format_xyzw v[0:3], v4, s[0:3], format:63, 0 idxen -// GFX10: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], format:46, 0 idxen ; encoding: [0x00,0x20,0x77,0xe9,0x06,0x00,0x00,0x80] + +// GFX10: tbuffer_store_format_xyzw v[0:3], 
v6, s[0:3], 0 format:[BUF_FMT_10_10_10_2_USCALED] idxen ; encoding: [0x00,0x20,0x77,0xe9,0x06,0x00,0x00,0x80]
 tbuffer_store_format_xyzw v[0:3], v6, s[0:3], format:46, 0 idxen
-// GFX10: tbuffer_store_format_x v0, v1, s[0:3], format:125, 0 idxen ; encoding: [0x00,0x20,0xec,0xeb,0x01,0x00,0x00,0x80]
+
+// GFX10: tbuffer_store_format_x v0, v1, s[0:3], 0 format:125 idxen ; encoding: [0x00,0x20,0xec,0xeb,0x01,0x00,0x00,0x80]
 tbuffer_store_format_x v0, v1, s[0:3], format:125, 0 idxen
-// GFX10: tbuffer_store_format_xy v[0:1], v2, s[0:3], format:33, 0 idxen ; encoding: [0x00,0x20,0x0d,0xe9,0x02,0x00,0x00,0x80]
+
+// GFX10: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; encoding: [0x00,0x20,0x0d,0xe9,0x02,0x00,0x00,0x80]
 tbuffer_store_format_xy v[0:1], v2, s[0:3], format:33, 0 idxen
-// GFX10: tbuffer_store_format_x v0, v1, s[0:3], format:127, 0 idxen ; encoding: [0x00,0x20,0xfc,0xeb,0x01,0x00,0x00,0x80]
+// GFX10: tbuffer_store_format_x v0, v1, s[0:3], 0 format:127 idxen ; encoding: [0x00,0x20,0xfc,0xeb,0x01,0x00,0x00,0x80]
 tbuffer_store_format_x v0, v1, s[0:3], format:127, 0 idxen
-// GFX10: tbuffer_store_format_x v0, v1, s[0:3], format:127, 0 idxen ; encoding: [0x00,0x20,0xfc,0xeb,0x01,0x00,0x00,0x80]
+// GFX10: tbuffer_store_format_x v0, v1, s[0:3], 0 format:127 idxen ; encoding: [0x00,0x20,0xfc,0xeb,0x01,0x00,0x00,0x80]
 tbuffer_store_format_x v0, v1, s[0:3] format:127 0 idxen
-// GFX10: tbuffer_store_format_x v0, v1, s[0:3], format:0, s0 idxen ; encoding: [0x00,0x20,0x04,0xe8,0x01,0x00,0x00,0x00]
+// GFX10: tbuffer_store_format_x v0, v1, s[0:3], s0 format:[BUF_FMT_INVALID] idxen ; encoding: [0x00,0x20,0x04,0xe8,0x01,0x00,0x00,0x00]
 tbuffer_store_format_x v0, v1, s[0:3] format:0 s0 idxen
 
 // GFX10: tbuffer_store_format_x v0, v1, s[0:3], s0 idxen ; encoding: [0x00,0x20,0x0c,0xe8,0x01,0x00,0x00,0x00]
@@ -86,8 +122,8 @@ tbuffer_store_format_x v0, v1, s[0:3], 0 idxen
 
 // GFX10: tbuffer_load_format_d16_x v0, off, s[0:3], s0 ; encoding: [0x00,0x00,0x08,0xe8,0x00,0x00,0x20,0x00]
 tbuffer_load_format_d16_x v0, off, s[0:3] s0
 //===----------------------------------------------------------------------===//
-// Errors handling.
+// Negative tests for legacy format syntax.
 //===----------------------------------------------------------------------===//
 
 // GFX10-ERR: error: out of range format
@@ -111,5 +147,387 @@ tbuffer_load_format_d16_x v0, off, s[0:3], format:1,, s0
 // GFX10-ERR: error: unknown token in expression
 tbuffer_load_format_d16_x v0, off, s[0:3], format:1:, s0
 
-// GFX10-ERR: error: not a valid operand
+// GFX10-ERR: error: unknown token in expression
 tbuffer_load_format_d16_x v0, off, s[0:3],, format:1, s0
+
+//===----------------------------------------------------------------------===//
+// Positive tests for symbolic MTBUF format.
+//===----------------------------------------------------------------------===//
+
+// Format may be specified in numeric form (min value).
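+// Note: the unified GFX10 format is a single field accepting numeric values
+// in the range 0..127 (i.e. 7 bits); out-of-range values are rejected, as the
+// "out of range format" negative tests above demonstrate.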
+tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:0 idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_INVALID] idxen ; encoding: [0x00,0x20,0x07,0xe8,0x01,0x01,0x01,0x00] + +// Format may be specified in numeric form (max value). +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:127 idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:127 idxen ; encoding: [0x00,0x20,0xff,0xeb,0x01,0x01,0x01,0x00] + +// Format may be specified in numeric form (first unsupported value). +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:78 idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:78 idxen ; encoding: [0x00,0x20,0x77,0xea,0x01,0x01,0x01,0x00] + +// Format may be specified as an expression. +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:(2 + 3 * 16) idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_2_10_10_10_UNORM] idxen ; encoding: [0x00,0x20,0x97,0xe9,0x01,0x01,0x01,0x00] + +// format may be specified as a list of dfmt, nfmt: +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_DATA_FORMAT_8,BUF_NUM_FORMAT_UNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 idxen ; encoding: [0x00,0x20,0x0f,0xe8,0x01,0x01,0x01,0x00] + +// nfmt and dfmt can be in either order: +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_NUM_FORMAT_SNORM, BUF_DATA_FORMAT_16] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_SNORM] idxen ; encoding: [0x00,0x20,0x47,0xe8,0x01,0x01,0x01,0x00] + +// nfmt may be omitted: +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[ BUF_DATA_FORMAT_8_8 ] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_UNORM] idxen ; encoding: [0x00,0x20,0x77,0xe8,0x01,0x01,0x01,0x00] + +// dfmt may be omitted: +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_NUM_FORMAT_USCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_USCALED] idxen ; encoding: [0x00,0x20,0x1f,0xe8,0x01,0x01,0x01,0x00] + +// Check dfmt formats +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_DATA_FORMAT_16_16] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_UNORM] idxen ; encoding: [0x00,0x20,0xbf,0xe8,0x01,0x01,0x01,0x00] + +// Check dfmt formats +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_DATA_FORMAT_10_11_11] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_11_11_UNORM] idxen ; encoding: [0x00,0x20,0xf7,0xe8,0x01,0x01,0x01,0x00] + +// Check dfmt formats +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_DATA_FORMAT_11_11_10] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_11_11_10_UNORM] idxen ; encoding: [0x00,0x20,0x2f,0xe9,0x01,0x01,0x01,0x00] + +// Check dfmt formats +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_DATA_FORMAT_10_10_10_2] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_10_10_2_UNORM] idxen ; encoding: [0x00,0x20,0x67,0xe9,0x01,0x01,0x01,0x00] + +// Check dfmt formats +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_DATA_FORMAT_2_10_10_10] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_2_10_10_10_UNORM] idxen ; encoding: [0x00,0x20,0x97,0xe9,0x01,0x01,0x01,0x00] + +// Check dfmt formats +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 
format:[BUF_DATA_FORMAT_8_8_8_8] idxen
+// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_8_8_UNORM] idxen ; encoding: [0x00,0x20,0xc7,0xe9,0x01,0x01,0x01,0x00]
+
+// Check dfmt formats
+tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_DATA_FORMAT_16_16_16_16] idxen
+// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_16_16_UNORM] idxen ; encoding: [0x00,0x20,0x0f,0xea,0x01,0x01,0x01,0x00]
+
+// Check dfmt formats
+tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_DATA_FORMAT_INVALID] idxen
+// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_INVALID] idxen ; encoding: [0x00,0x20,0x07,0xe8,0x01,0x01,0x01,0x00]
+
+// Check nfmt formats
+tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_NUM_FORMAT_SSCALED] idxen
+// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_SSCALED] idxen ; encoding: [0x00,0x20,0x27,0xe8,0x01,0x01,0x01,0x00]
+
+// Check nfmt formats
+tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_NUM_FORMAT_UINT] idxen
+// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_UINT] idxen ; encoding: [0x00,0x20,0x2f,0xe8,0x01,0x01,0x01,0x00]
+
+// Check nfmt formats
+tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_NUM_FORMAT_SINT] idxen
+// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_SINT] idxen ; encoding: [0x00,0x20,0x37,0xe8,0x01,0x01,0x01,0x00]
+
+//===----------------------------------------------------------------------===//
+// Negative tests for symbolic format error handling.
+//===----------------------------------------------------------------------===//
+
+// Unknown format specifier
+tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_DATA_FORMAT] idxen
+// GFX10-ERR: error: unsupported format
+
+// Valid but unsupported format specifier (SNORM_OGL is supported for SI/CI only)
+tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_NUM_FORMAT_SNORM_OGL] idxen
+// GFX10-ERR: error: unsupported format
+
+// Valid but unsupported format specifier (RESERVED_6 is supported for VI/GFX9 only)
+tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_NUM_FORMAT_RESERVED_6] idxen
+// GFX10-ERR: error: unsupported format
+
+// Unsupported format
+tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_DATA_FORMAT_32] idxen
+// GFX10-ERR: error: unsupported format
+
+// Unsupported format
+tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_DATA_FORMAT_32_32] idxen
+// GFX10-ERR: error: unsupported format
+
+// Unsupported format
+tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_DATA_FORMAT_32_32_32] idxen
+// GFX10-ERR: error: unsupported format
+
+// Unsupported format
+tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_DATA_FORMAT_32_32_32_32] idxen
+// GFX10-ERR: error: unsupported format
+
+// Unsupported format
+tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_DATA_FORMAT_32_32_32_32, BUF_NUM_FORMAT_UNORM] idxen
+// GFX10-ERR: error: unsupported format
+
+// Unsupported format
+tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_DATA_FORMAT_RESERVED_15] idxen
+// GFX10-ERR: error: unsupported format
+
+// Unsupported format
+tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_NUM_FORMAT_FLOAT] idxen
+// GFX10-ERR: error: unsupported format
+
+// Unsupported format
+tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_FLOAT] idxen
+//
GFX10-ERR: error: unsupported format + +//===----------------------------------------------------------------------===// +// Positive tests for unified MTBUF format (GFX10+). +//===----------------------------------------------------------------------===// + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_INVALID] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_INVALID] idxen ; encoding: [0x00,0x20,0x07,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_UNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 idxen ; encoding: [0x00,0x20,0x0f,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_SNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_SNORM] idxen ; encoding: [0x00,0x20,0x17,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_USCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_USCALED] idxen ; encoding: [0x00,0x20,0x1f,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_SSCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_SSCALED] idxen ; encoding: [0x00,0x20,0x27,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_UINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_UINT] idxen ; encoding: [0x00,0x20,0x2f,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_SINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_SINT] idxen ; encoding: [0x00,0x20,0x37,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_UNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_UNORM] idxen ; encoding: [0x00,0x20,0x3f,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_SNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_SNORM] idxen ; encoding: [0x00,0x20,0x47,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_USCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_USCALED] idxen ; encoding: [0x00,0x20,0x4f,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_SSCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_SSCALED] idxen ; encoding: [0x00,0x20,0x57,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_UINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_UINT] idxen ; encoding: [0x00,0x20,0x5f,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_SINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_SINT] idxen ; encoding: [0x00,0x20,0x67,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_FLOAT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_FLOAT] idxen ; encoding: [0x00,0x20,0x6f,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_UNORM] idxen +// GFX10: 
tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_UNORM] idxen ; encoding: [0x00,0x20,0x77,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_SNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_SNORM] idxen ; encoding: [0x00,0x20,0x7f,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_USCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_USCALED] idxen ; encoding: [0x00,0x20,0x87,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_SSCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_SSCALED] idxen ; encoding: [0x00,0x20,0x8f,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_UINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_UINT] idxen ; encoding: [0x00,0x20,0x97,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_SINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_SINT] idxen ; encoding: [0x00,0x20,0x9f,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_UINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_UINT] idxen ; encoding: [0x00,0x20,0xa7,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_SINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_SINT] idxen ; encoding: [0x00,0x20,0xaf,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_FLOAT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_FLOAT] idxen ; encoding: [0x00,0x20,0xb7,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_UNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_UNORM] idxen ; encoding: [0x00,0x20,0xbf,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_SNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_SNORM] idxen ; encoding: [0x00,0x20,0xc7,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_USCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_USCALED] idxen ; encoding: [0x00,0x20,0xcf,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_SSCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_SSCALED] idxen ; encoding: [0x00,0x20,0xd7,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_UINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_UINT] idxen ; encoding: [0x00,0x20,0xdf,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_SINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_SINT] idxen ; encoding: [0x00,0x20,0xe7,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_FLOAT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 
format:[BUF_FMT_16_16_FLOAT] idxen ; encoding: [0x00,0x20,0xef,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_11_11_UNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_11_11_UNORM] idxen ; encoding: [0x00,0x20,0xf7,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_11_11_SNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_11_11_SNORM] idxen ; encoding: [0x00,0x20,0xff,0xe8,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_11_11_USCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_11_11_USCALED] idxen ; encoding: [0x00,0x20,0x07,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_11_11_SSCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; encoding: [0x00,0x20,0x0f,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_11_11_UINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_11_11_UINT] idxen ; encoding: [0x00,0x20,0x17,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_11_11_SINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_11_11_SINT] idxen ; encoding: [0x00,0x20,0x1f,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_11_11_FLOAT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_11_11_FLOAT] idxen ; encoding: [0x00,0x20,0x27,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_11_11_10_UNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_11_11_10_UNORM] idxen ; encoding: [0x00,0x20,0x2f,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_11_11_10_SNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_11_11_10_SNORM] idxen ; encoding: [0x00,0x20,0x37,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_11_11_10_USCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_11_11_10_USCALED] idxen ; encoding: [0x00,0x20,0x3f,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_11_11_10_SSCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_11_11_10_SSCALED] idxen ; encoding: [0x00,0x20,0x47,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_11_11_10_UINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_11_11_10_UINT] idxen ; encoding: [0x00,0x20,0x4f,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_11_11_10_SINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_11_11_10_SINT] idxen ; encoding: [0x00,0x20,0x57,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_11_11_10_FLOAT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_11_11_10_FLOAT] idxen ; encoding: [0x00,0x20,0x5f,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 
format:[BUF_FMT_10_10_10_2_UNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_10_10_2_UNORM] idxen ; encoding: [0x00,0x20,0x67,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_10_10_2_SNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_10_10_2_SNORM] idxen ; encoding: [0x00,0x20,0x6f,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_10_10_2_USCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_10_10_2_USCALED] idxen ; encoding: [0x00,0x20,0x77,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_10_10_2_SSCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_10_10_2_SSCALED] idxen ; encoding: [0x00,0x20,0x7f,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_10_10_2_UINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_10_10_2_UINT] idxen ; encoding: [0x00,0x20,0x87,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_10_10_2_SINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_10_10_10_2_SINT] idxen ; encoding: [0x00,0x20,0x8f,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_2_10_10_10_UNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_2_10_10_10_UNORM] idxen ; encoding: [0x00,0x20,0x97,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_2_10_10_10_SNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_2_10_10_10_SNORM] idxen ; encoding: [0x00,0x20,0x9f,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_2_10_10_10_USCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_2_10_10_10_USCALED] idxen ; encoding: [0x00,0x20,0xa7,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_2_10_10_10_SSCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_2_10_10_10_SSCALED] idxen ; encoding: [0x00,0x20,0xaf,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_2_10_10_10_UINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_2_10_10_10_UINT] idxen ; encoding: [0x00,0x20,0xb7,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_2_10_10_10_SINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_2_10_10_10_SINT] idxen ; encoding: [0x00,0x20,0xbf,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_8_8_UNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_8_8_UNORM] idxen ; encoding: [0x00,0x20,0xc7,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_8_8_SNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_8_8_SNORM] idxen ; encoding: [0x00,0x20,0xcf,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_8_8_USCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 
format:[BUF_FMT_8_8_8_8_USCALED] idxen ; encoding: [0x00,0x20,0xd7,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_8_8_SSCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_8_8_SSCALED] idxen ; encoding: [0x00,0x20,0xdf,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_8_8_UINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_8_8_UINT] idxen ; encoding: [0x00,0x20,0xe7,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_8_8_SINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_8_8_8_SINT] idxen ; encoding: [0x00,0x20,0xef,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_32_UINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_32_UINT] idxen ; encoding: [0x00,0x20,0xf7,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_32_SINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_32_SINT] idxen ; encoding: [0x00,0x20,0xff,0xe9,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_32_FLOAT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_32_FLOAT] idxen ; encoding: [0x00,0x20,0x07,0xea,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_16_16_UNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_16_16_UNORM] idxen ; encoding: [0x00,0x20,0x0f,0xea,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_16_16_SNORM] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_16_16_SNORM] idxen ; encoding: [0x00,0x20,0x17,0xea,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_16_16_USCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_16_16_USCALED] idxen ; encoding: [0x00,0x20,0x1f,0xea,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_16_16_SSCALED] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_16_16_SSCALED] idxen ; encoding: [0x00,0x20,0x27,0xea,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_16_16_UINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_16_16_UINT] idxen ; encoding: [0x00,0x20,0x2f,0xea,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_16_16_SINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_16_16_SINT] idxen ; encoding: [0x00,0x20,0x37,0xea,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_16_16_FLOAT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_16_16_16_16_FLOAT] idxen ; encoding: [0x00,0x20,0x3f,0xea,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_32_32_UINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_32_32_UINT] idxen ; encoding: [0x00,0x20,0x47,0xea,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 
format:[BUF_FMT_32_32_32_SINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_32_32_SINT] idxen ; encoding: [0x00,0x20,0x4f,0xea,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_32_32_FLOAT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_32_32_FLOAT] idxen ; encoding: [0x00,0x20,0x57,0xea,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_32_32_32_UINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_32_32_32_UINT] idxen ; encoding: [0x00,0x20,0x5f,0xea,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_32_32_32_SINT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_32_32_32_SINT] idxen ; encoding: [0x00,0x20,0x67,0xea,0x01,0x01,0x01,0x00] + +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen +// GFX10: tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen ; encoding: [0x00,0x20,0x6f,0xea,0x01,0x01,0x01,0x00] + +//===----------------------------------------------------------------------===// +// Negative tests for unified MTBUF format (GFX10+). +//===----------------------------------------------------------------------===// + +// Excessive commas +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_SNORM,] idxen +// GFX10-ERR: error: expected a closing square bracket + +// Duplicate format +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_SNORM,BUF_FMT_8_SNORM] idxen +// GFX10-ERR: error: expected a closing square bracket + +// Duplicate format +tbuffer_store_format_xyzw v[1:4], v1, s[4:7], s0 format:[BUF_FMT_8_SNORM,BUF_DATA_FORMAT_8] idxen +// GFX10-ERR: error: expected a closing square bracket diff --git a/llvm/test/MC/AMDGPU/mtbuf.s b/llvm/test/MC/AMDGPU/mtbuf.s index 9d207ff326060..f7fdd29bb83b8 100644 --- a/llvm/test/MC/AMDGPU/mtbuf.s +++ b/llvm/test/MC/AMDGPU/mtbuf.s @@ -2,60 +2,60 @@ // RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICI %s // RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=tahiti -show-encoding %s 2>&1 | FileCheck -check-prefix=GCN-ERR %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s 2>&1 | FileCheck -check-prefix=GCN-ERR %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck -check-prefix=GCN-ERR %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=tahiti -show-encoding %s 2>&1 | FileCheck -check-prefixes=GCN-ERR,SICI-ERR %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s 2>&1 | FileCheck -check-prefixes=GCN-ERR,SICI-ERR %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck -check-prefixes=GCN-ERR,VI-ERR %s //===----------------------------------------------------------------------===// -// Test for dfmt and nfmt (tbuffer only) +// Positive tests for legacy dfmt/nfmt syntax. 
//===----------------------------------------------------------------------===// tbuffer_load_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 -// SICI: tbuffer_load_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x78,0xe9,0x00,0x01,0x01,0x01] -// VI: tbuffer_load_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x78,0xe9,0x00,0x01,0x01,0x01] +// SICI: tbuffer_load_format_x v1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x78,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_load_format_x v1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x78,0xe9,0x00,0x01,0x01,0x01] tbuffer_load_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 -// SICI: tbuffer_load_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x79,0xe9,0x00,0x01,0x01,0x01] -// VI: tbuffer_load_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x78,0xe9,0x00,0x01,0x01,0x01] +// SICI: tbuffer_load_format_xy v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x79,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_load_format_xy v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x78,0xe9,0x00,0x01,0x01,0x01] tbuffer_load_format_xyz v[1:3], off, s[4:7], dfmt:15, nfmt:2, s1 -// SICI: tbuffer_load_format_xyz v[1:3], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7a,0xe9,0x00,0x01,0x01,0x01] -// VI: tbuffer_load_format_xyz v[1:3], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x79,0xe9,0x00,0x01,0x01,0x01] +// SICI: tbuffer_load_format_xyz v[1:3], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7a,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_load_format_xyz v[1:3], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x79,0xe9,0x00,0x01,0x01,0x01] tbuffer_load_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 -// SICI: tbuffer_load_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7b,0xe9,0x00,0x01,0x01,0x01] -// VI: tbuffer_load_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x79,0xe9,0x00,0x01,0x01,0x01] +// SICI: tbuffer_load_format_xyzw v[1:4], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7b,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_load_format_xyzw v[1:4], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x79,0xe9,0x00,0x01,0x01,0x01] tbuffer_store_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 -// SICI: tbuffer_store_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7c,0xe9,0x00,0x01,0x01,0x01] -// VI: tbuffer_store_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7a,0xe9,0x00,0x01,0x01,0x01] +// SICI: tbuffer_store_format_x v1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7c,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_store_format_x v1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7a,0xe9,0x00,0x01,0x01,0x01] tbuffer_store_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 -// SICI: tbuffer_store_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7d,0xe9,0x00,0x01,0x01,0x01] -// VI: 
tbuffer_store_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7a,0xe9,0x00,0x01,0x01,0x01] +// SICI: tbuffer_store_format_xy v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7d,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_store_format_xy v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7a,0xe9,0x00,0x01,0x01,0x01] tbuffer_store_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 -// SICI: tbuffer_store_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x01,0x01] -// VI: tbuffer_store_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x01,0x01] +// SICI: tbuffer_store_format_xyzw v[1:4], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_store_format_xyzw v[1:4], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x01,0x01] tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 -// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x1d,0x71] -// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x1d,0x71] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x1d,0x71] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x1d,0x71] // nfmt is optional: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, ttmp1 -// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:0, ttmp1 ; encoding: [0x00,0x00,0x7f,0xe8,0x00,0x01,0x1d,0x71] -// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:0, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe8,0x00,0x01,0x1d,0x71] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15] ; encoding: [0x00,0x00,0x7f,0xe8,0x00,0x01,0x1d,0x71] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15] ; encoding: [0x00,0x80,0x7b,0xe8,0x00,0x01,0x1d,0x71] // dfmt is optional: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], nfmt:2, ttmp1 -// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:1, nfmt:2, ttmp1 ; encoding: [0x00,0x00,0x0f,0xe9,0x00,0x01,0x1d,0x71] -// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:1, nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x0b,0xe9,0x00,0x01,0x1d,0x71] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x0f,0xe9,0x00,0x01,0x1d,0x71] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x0b,0xe9,0x00,0x01,0x1d,0x71] // nfmt and dfmt can be in either order: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], nfmt:2, dfmt:15, ttmp1 -// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x1d,0x71] -// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x1d,0x71] +// 
SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x1d,0x71] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x1d,0x71] // nfmt and dfmt may be omitted: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 @@ -64,13 +64,13 @@ tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 // Check dfmt/nfmt min values tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:0, nfmt:0, ttmp1 -// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:0, nfmt:0, ttmp1 ; encoding: [0x00,0x00,0x07,0xe8,0x00,0x01,0x1d,0x71] -// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:0, nfmt:0, ttmp1 ; encoding: [0x00,0x80,0x03,0xe8,0x00,0x01,0x1d,0x71] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_INVALID] ; encoding: [0x00,0x00,0x07,0xe8,0x00,0x01,0x1d,0x71] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_INVALID] ; encoding: [0x00,0x80,0x03,0xe8,0x00,0x01,0x1d,0x71] // Check dfmt/nfmt max values tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:7, ttmp1 -// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:7, ttmp1 ; encoding: [0x00,0x00,0xff,0xeb,0x00,0x01,0x1d,0x71] -// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:7, ttmp1 ; encoding: [0x00,0x80,0xfb,0xeb,0x00,0x01,0x1d,0x71] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_FLOAT] ; encoding: [0x00,0x00,0xff,0xeb,0x00,0x01,0x1d,0x71] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_FLOAT] ; encoding: [0x00,0x80,0xfb,0xeb,0x00,0x01,0x1d,0x71] // Check default dfmt/nfmt values tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:1, nfmt:0, ttmp1 @@ -79,11 +79,18 @@ tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:1, nfmt:0, ttmp1 // Check that comma separators are optional tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7] dfmt:15 nfmt:7 ttmp1 -// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:7, ttmp1 ; encoding: [0x00,0x00,0xff,0xeb,0x00,0x01,0x1d,0x71] -// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:7, ttmp1 ; encoding: [0x00,0x80,0xfb,0xeb,0x00,0x01,0x1d,0x71] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_FLOAT] ; encoding: [0x00,0x00,0xff,0xeb,0x00,0x01,0x1d,0x71] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_FLOAT] ; encoding: [0x00,0x80,0xfb,0xeb,0x00,0x01,0x1d,0x71] + +dfmt=15 +nfmt=7 +// Check expressions with dfmt +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7] dfmt:-1+dfmt+1 nfmt:nfmt ttmp1 +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_FLOAT] ; encoding: [0x00,0x00,0xff,0xeb,0x00,0x01,0x1d,0x71] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_FLOAT] ; encoding: [0x00,0x80,0xfb,0xeb,0x00,0x01,0x1d,0x71] //===----------------------------------------------------------------------===// -// Errors handling. +// Negative tests for legacy dfmt/nfmt syntax. 
//===----------------------------------------------------------------------===// tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7] dfmt:-1 nfmt:1 s0 @@ -102,7 +109,7 @@ tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7] // GCN-ERR: error: too few operands for instruction tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7],, dfmt:1 nfmt:1 s0 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: unknown token in expression tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7] dfmt:1,, nfmt:1 s0 // GCN-ERR: error: unknown token in expression @@ -111,19 +118,262 @@ tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7] dfmt:1 nfmt:1,, s0 // GCN-ERR: error: unknown token in expression tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7] dfmt:1 dfmt:1 s0 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: invalid operand for instruction tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7] nfmt:1 nfmt:1 s0 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: invalid operand for instruction tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7] dfmt:1 nfmt:1 dfmt:1 s0 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: invalid operand for instruction tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7] nfmt:1 dfmt:1 nfmt:1 s0 -// GCN-ERR: error: not a valid operand +// GCN-ERR: error: invalid operand for instruction tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7] dfmt:1: nfmt:1 s0 // GCN-ERR: error: unknown token in expression tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7] dfmt:1 nfmt:1: s0 // GCN-ERR: error: unknown token in expression + +//===----------------------------------------------------------------------===// +// Tests for symbolic MTBUF format +//===----------------------------------------------------------------------===// + +// Format may be specified in numeric form (min value). +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:0 +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_INVALID] ; encoding: [0x00,0x00,0x07,0xe8,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_INVALID] ; encoding: [0x00,0x80,0x03,0xe8,0x00,0x01,0x1d,0x00] + +// Format may be specified in numeric form (max value). +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:127 +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_FLOAT] ; encoding: [0x00,0x00,0xff,0xeb,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_FLOAT] ; encoding: [0x00,0x80,0xfb,0xeb,0x00,0x01,0x1d,0x00] + +// Format may be specified as an expression. 
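+// Note: with separate dfmt/nfmt fields the packed value is dfmt | (nfmt << 4),
+// so 2 + 3 * 16 = 50 selects dfmt 2 (BUF_DATA_FORMAT_16) and nfmt 3
+// (BUF_NUM_FORMAT_SSCALED), consistent with the checks below.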
+tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:(2 + 3 * 16) +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_16,BUF_NUM_FORMAT_SSCALED] ; encoding: [0x00,0x00,0x97,0xe9,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_16,BUF_NUM_FORMAT_SSCALED] ; encoding: [0x00,0x80,0x93,0xe9,0x00,0x01,0x1d,0x00] + +// format may be specified as a list of dfmt, nfmt: +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_8,BUF_NUM_FORMAT_UNORM] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 ; encoding: [0x00,0x00,0x0f,0xe8,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 ; encoding: [0x00,0x80,0x0b,0xe8,0x00,0x01,0x1d,0x00] + +// nfmt and dfmt can be in either order: +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_SNORM, BUF_DATA_FORMAT_16] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_16,BUF_NUM_FORMAT_SNORM] ; encoding: [0x00,0x00,0x97,0xe8,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_16,BUF_NUM_FORMAT_SNORM] ; encoding: [0x00,0x80,0x93,0xe8,0x00,0x01,0x1d,0x00] + +// nfmt may be omitted: +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[ BUF_DATA_FORMAT_8_8 ] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_8_8] ; encoding: [0x00,0x00,0x1f,0xe8,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_8_8] ; encoding: [0x00,0x80,0x1b,0xe8,0x00,0x01,0x1d,0x00] + +// dfmt may be omitted: +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_USCALED] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x0f,0xe9,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x0b,0xe9,0x00,0x01,0x1d,0x00] + +// Check dfmt formats +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32] ; encoding: [0x00,0x00,0x27,0xe8,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32] ; encoding: [0x00,0x80,0x23,0xe8,0x00,0x01,0x1d,0x00] + +// Check dfmt formats +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_16_16] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_16_16] ; encoding: [0x00,0x00,0x2f,0xe8,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_16_16] ; encoding: [0x00,0x80,0x2b,0xe8,0x00,0x01,0x1d,0x00] + +// Check dfmt formats +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_10_11_11] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_10_11_11] ; encoding: [0x00,0x00,0x37,0xe8,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_10_11_11] ; encoding: [0x00,0x80,0x33,0xe8,0x00,0x01,0x1d,0x00] + +// Check dfmt formats +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_11_11_10] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_11_11_10] ; encoding: 
[0x00,0x00,0x3f,0xe8,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_11_11_10] ; encoding: [0x00,0x80,0x3b,0xe8,0x00,0x01,0x1d,0x00] + +// Check dfmt formats +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_10_10_10_2] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_10_10_10_2] ; encoding: [0x00,0x00,0x47,0xe8,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_10_10_10_2] ; encoding: [0x00,0x80,0x43,0xe8,0x00,0x01,0x1d,0x00] + +// Check dfmt formats +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_2_10_10_10] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_2_10_10_10] ; encoding: [0x00,0x00,0x4f,0xe8,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_2_10_10_10] ; encoding: [0x00,0x80,0x4b,0xe8,0x00,0x01,0x1d,0x00] + +// Check dfmt formats +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_8_8_8_8] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_8_8_8_8] ; encoding: [0x00,0x00,0x57,0xe8,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_8_8_8_8] ; encoding: [0x00,0x80,0x53,0xe8,0x00,0x01,0x1d,0x00] + +// Check dfmt formats +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32_32] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32_32] ; encoding: [0x00,0x00,0x5f,0xe8,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32_32] ; encoding: [0x00,0x80,0x5b,0xe8,0x00,0x01,0x1d,0x00] + +// Check dfmt formats +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_16_16_16_16] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_16_16_16_16] ; encoding: [0x00,0x00,0x67,0xe8,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_16_16_16_16] ; encoding: [0x00,0x80,0x63,0xe8,0x00,0x01,0x1d,0x00] + +// Check dfmt formats +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32_32_32] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32_32_32] ; encoding: [0x00,0x00,0x6f,0xe8,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32_32_32] ; encoding: [0x00,0x80,0x6b,0xe8,0x00,0x01,0x1d,0x00] + +// Check dfmt formats +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32_32_32_32] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32_32_32_32] ; encoding: [0x00,0x00,0x77,0xe8,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32_32_32_32] ; encoding: [0x00,0x80,0x73,0xe8,0x00,0x01,0x1d,0x00] + +// Check dfmt formats +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_RESERVED_15] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_RESERVED_15] ; encoding: [0x00,0x00,0x7f,0xe8,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_RESERVED_15] ; encoding: [0x00,0x80,0x7b,0xe8,0x00,0x01,0x1d,0x00] + +// Check dfmt formats 
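+// Note: BUF_DATA_FORMAT_INVALID is dfmt 0; with nfmt defaulting to 0 the
+// packed format is 0, so the encodings below match the earlier format:0 test.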
+tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_INVALID] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_INVALID] ; encoding: [0x00,0x00,0x07,0xe8,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_INVALID] ; encoding: [0x00,0x80,0x03,0xe8,0x00,0x01,0x1d,0x00] + +// Check nfmt formats +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_SSCALED] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_SSCALED] ; encoding: [0x00,0x00,0x8f,0xe9,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_SSCALED] ; encoding: [0x00,0x80,0x8b,0xe9,0x00,0x01,0x1d,0x00] + +// Check nfmt formats +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_UINT] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_UINT] ; encoding: [0x00,0x00,0x0f,0xea,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_UINT] ; encoding: [0x00,0x80,0x0b,0xea,0x00,0x01,0x1d,0x00] + +// Check nfmt formats +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_SINT] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_SINT] ; encoding: [0x00,0x00,0x8f,0xea,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_SINT] ; encoding: [0x00,0x80,0x8b,0xea,0x00,0x01,0x1d,0x00] + +// Check nfmt formats +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_FLOAT] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_FLOAT] ; encoding: [0x00,0x00,0x8f,0xeb,0x00,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_FLOAT] ; encoding: [0x00,0x80,0x8b,0xeb,0x00,0x01,0x1d,0x00] + +// Check optional comma separators +tbuffer_store_format_xyzw v[1:4], v1, ttmp[4:7], s0, format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT], idxen +// SICI: tbuffer_store_format_xyzw v[1:4], v1, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] idxen ; encoding: [0x00,0x20,0xa7,0xeb,0x01,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], v1, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] idxen ; encoding: [0x00,0xa0,0xa3,0xeb,0x01,0x01,0x1d,0x00] + +// Check offen and offset +tbuffer_store_format_xyzw v[1:4], v1, ttmp[4:7], s0, format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] offen offset:52 +// SICI: tbuffer_store_format_xyzw v[1:4], v1, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] offen offset:52 ; encoding: [0x34,0x10,0xa7,0xeb,0x01,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], v1, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] offen offset:52 ; encoding: [0x34,0x90,0xa3,0xeb,0x01,0x01,0x1d,0x00] + +// Check idxen and offen +tbuffer_store_format_xyzw v[1:4], v[1:2], ttmp[4:7], s0, format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] idxen offen offset:52 +// SICI: tbuffer_store_format_xyzw v[1:4], v[1:2], ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] idxen offen offset:52 ; encoding: [0x34,0x30,0xa7,0xeb,0x01,0x01,0x1d,0x00] +// VI: tbuffer_store_format_xyzw v[1:4], v[1:2], ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] idxen offen offset:52 ; encoding: [0x34,0xb0,0xa3,0xeb,0x01,0x01,0x1d,0x00] + +// Check addr64 
+tbuffer_store_format_xyzw v[1:4], v[1:2], ttmp[4:7], s0, format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] addr64 +// SICI: tbuffer_store_format_xyzw v[1:4], v[1:2], ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] addr64 ; encoding: [0x00,0x80,0xa7,0xeb,0x01,0x01,0x1d,0x00] +// VI-ERR: error: instruction not supported on this GPU + +//===----------------------------------------------------------------------===// +// Tests for symbolic format errors handling +//===----------------------------------------------------------------------===// + +// Missing soffset +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], format:[BUF_DATA_FORMAT_32] +// GCN-ERR: error: not a valid operand. + +// Invalid soffset +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s[255] format:[BUF_NUM_FORMAT_FLOAT] +// GCN-ERR: error: not a valid operand. + +// Both legacy and symbolic formats are specified +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:1 s0 format:[BUF_NUM_FORMAT_FLOAT] +// GCN-ERR: error: duplicate format + +// Missing format number +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format: offset:52 +// GCN-ERR: error: expected absolute expression + +// Invalid number +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:-1 +// GCN-ERR: error: out of range format + +// Invalid number +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:128 +// GCN-ERR: error: out of range format + +MAXVAL=127 +// Invalid expression +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:MAXVAL+1 +// GCN-ERR: error: out of range format + +// Empty list +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[] +// GCN-ERR: error: expected a format string + +// More than 2 format specifiers +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT,BUF_DATA_FORMAT_8] +// GCN-ERR: error: expected a closing square bracket + +// More than 2 format specifiers +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_UINT,BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] +// GCN-ERR: error: expected a closing square bracket + +// Missing brackets +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:BUF_NUM_FORMAT_UINT +// GCN-ERR: error: expected absolute expression + +// Unpaired brackets +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_UINT +// GCN-ERR: error: expected a closing square bracket + +// Unpaired brackets +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:BUF_NUM_FORMAT_UINT] +// GCN-ERR: error: expected absolute expression + +// Missing comma +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_UINT BUF_DATA_FORMAT_32] +// GCN-ERR: error: expected a closing square bracket + +// Duplicate dfmt +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32,BUF_DATA_FORMAT_32] +// GCN-ERR: error: duplicate data format + +// Duplicate dfmt +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32,BUF_DATA_FORMAT_8] +// GCN-ERR: error: duplicate data format + +// Duplicate nfmt +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_UINT,BUF_NUM_FORMAT_FLOAT] +// GCN-ERR: error: duplicate numeric format + +// Unknown format specifier +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT] +// GCN-ERR: error: unsupported format + +// Valid but unsupported format specifier (SNORM_OGL is supported for SI/CI only) +tbuffer_store_format_xyzw v[1:4], off, 
ttmp[4:7], s0 format:[BUF_NUM_FORMAT_SNORM_OGL] +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_SNORM_OGL] ; encoding: [0x00,0x00,0x0f,0xeb,0x00,0x01,0x1d,0x00] +// VI-ERR: error: unsupported format + +// Valid but unsupported format specifier (RESERVED_6 is supported for VI/GFX9 only) +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_RESERVED_6] +// SICI-ERR: error: unsupported format +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_RESERVED_6] ; encoding: [0x00,0x80,0x0b,0xeb,0x00,0x01,0x1d,0x00] + +// Excessive commas +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7],, s0 format:[BUF_DATA_FORMAT_8] +// GCN-ERR: error: unknown token in expression + +// Excessive commas +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0,, format:[BUF_DATA_FORMAT_8] +// GCN-ERR: error: not a valid operand. + +// Excessive commas +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_8],, offset:52 +// GCN-ERR: error: unknown token in expression diff --git a/llvm/test/MC/Disassembler/AMDGPU/buf_fmt_packed_d16.txt b/llvm/test/MC/Disassembler/AMDGPU/buf_fmt_packed_d16.txt index 495e78f691e83..a7599f29fa0a6 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/buf_fmt_packed_d16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/buf_fmt_packed_d16.txt @@ -25,26 +25,26 @@ # PACKED: buffer_store_format_d16_xyzw v[1:2], off, s[4:7], s1 ; encoding: [0x00,0x00,0x3c,0xe0,0x00,0x01,0x01,0x01] 0x00,0x00,0x3c,0xe0,0x00,0x01,0x01,0x01 -# PACKED: tbuffer_load_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7c,0xe9,0x00,0x01,0x01,0x01] +# PACKED: tbuffer_load_format_d16_x v1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7c,0xe9,0x00,0x01,0x01,0x01] 0x00,0x00,0x7c,0xe9,0x00,0x01,0x01,0x01 -# PACKED: tbuffer_load_format_d16_xy v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7c,0xe9,0x00,0x01,0x01,0x01] +# PACKED: tbuffer_load_format_d16_xy v1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7c,0xe9,0x00,0x01,0x01,0x01] 0x00,0x80,0x7c,0xe9,0x00,0x01,0x01,0x01 -# PACKED: tbuffer_load_format_d16_xyz v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7d,0xe9,0x00,0x01,0x01,0x01] +# PACKED: tbuffer_load_format_d16_xyz v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7d,0xe9,0x00,0x01,0x01,0x01] 0x00,0x00,0x7d,0xe9,0x00,0x01,0x01,0x01 -# PACKED: tbuffer_load_format_d16_xyzw v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7d,0xe9,0x00,0x01,0x01,0x01] +# PACKED: tbuffer_load_format_d16_xyzw v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7d,0xe9,0x00,0x01,0x01,0x01] 0x00,0x80,0x7d,0xe9,0x00,0x01,0x01,0x01 -# PACKED: tbuffer_store_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7e,0xe9,0x00,0x01,0x01,0x01] +# PACKED: tbuffer_store_format_d16_x v1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7e,0xe9,0x00,0x01,0x01,0x01] 0x00,0x00,0x7e,0xe9,0x00,0x01,0x01,0x01 -# PACKED: tbuffer_store_format_d16_xy v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7e,0xe9,0x00,0x01,0x01,0x01] +# PACKED: tbuffer_store_format_d16_xy v1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: 
[0x00,0x80,0x7e,0xe9,0x00,0x01,0x01,0x01] 0x00,0x80,0x7e,0xe9,0x00,0x01,0x01,0x01 -# PACKED: tbuffer_store_format_d16_xyz v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x01,0x01] +# PACKED: tbuffer_store_format_d16_xyz v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x01,0x01] 0x00,0x00,0x7f,0xe9,0x00,0x01,0x01,0x01 -# PACKED: tbuffer_store_format_d16_xyzw v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7f,0xe9,0x00,0x01,0x01,0x01] +# PACKED: tbuffer_store_format_d16_xyzw v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7f,0xe9,0x00,0x01,0x01,0x01] 0x00,0x80,0x7f,0xe9,0x00,0x01,0x01,0x01 diff --git a/llvm/test/MC/Disassembler/AMDGPU/buf_fmt_unpacked_d16.txt b/llvm/test/MC/Disassembler/AMDGPU/buf_fmt_unpacked_d16.txt index 2df262bdceb05..9c78e97c8e3ca 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/buf_fmt_unpacked_d16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/buf_fmt_unpacked_d16.txt @@ -25,26 +25,26 @@ # UNPACKED: buffer_store_format_d16_xyzw v[1:4], off, s[4:7], s1 ; encoding: [0x00,0x00,0x3c,0xe0,0x00,0x01,0x01,0x01] 0x00,0x00,0x3c,0xe0,0x00,0x01,0x01,0x01 -# UNPACKED: tbuffer_load_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7c,0xe9,0x00,0x01,0x01,0x01] +# UNPACKED: tbuffer_load_format_d16_x v1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7c,0xe9,0x00,0x01,0x01,0x01] 0x00,0x00,0x7c,0xe9,0x00,0x01,0x01,0x01 -# UNPACKED: tbuffer_load_format_d16_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7c,0xe9,0x00,0x01,0x01,0x01] +# UNPACKED: tbuffer_load_format_d16_xy v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7c,0xe9,0x00,0x01,0x01,0x01] 0x00,0x80,0x7c,0xe9,0x00,0x01,0x01,0x01 -# UNPACKED: tbuffer_load_format_d16_xyz v[1:3], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7d,0xe9,0x00,0x01,0x01,0x01] +# UNPACKED: tbuffer_load_format_d16_xyz v[1:3], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7d,0xe9,0x00,0x01,0x01,0x01] 0x00,0x00,0x7d,0xe9,0x00,0x01,0x01,0x01 -# UNPACKED: tbuffer_load_format_d16_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7d,0xe9,0x00,0x01,0x01,0x01] +# UNPACKED: tbuffer_load_format_d16_xyzw v[1:4], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7d,0xe9,0x00,0x01,0x01,0x01] 0x00,0x80,0x7d,0xe9,0x00,0x01,0x01,0x01 -# UNPACKED: tbuffer_store_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7e,0xe9,0x00,0x01,0x01,0x01] +# UNPACKED: tbuffer_store_format_d16_x v1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7e,0xe9,0x00,0x01,0x01,0x01] 0x00,0x00,0x7e,0xe9,0x00,0x01,0x01,0x01 -# UNPACKED: tbuffer_store_format_d16_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7e,0xe9,0x00,0x01,0x01,0x01] +# UNPACKED: tbuffer_store_format_d16_xy v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7e,0xe9,0x00,0x01,0x01,0x01] 0x00,0x80,0x7e,0xe9,0x00,0x01,0x01,0x01 -# UNPACKED: tbuffer_store_format_d16_xyz v[1:3], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x01,0x01] +# UNPACKED: 
tbuffer_store_format_d16_xyz v[1:3], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x01,0x01] 0x00,0x00,0x7f,0xe9,0x00,0x01,0x01,0x01 -# UNPACKED: tbuffer_store_format_d16_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7f,0xe9,0x00,0x01,0x01,0x01] +# UNPACKED: tbuffer_store_format_d16_xyzw v[1:4], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7f,0xe9,0x00,0x01,0x01,0x01] 0x00,0x80,0x7f,0xe9,0x00,0x01,0x01,0x01 diff --git a/llvm/test/MC/Disassembler/AMDGPU/mtbuf_gfx10.txt b/llvm/test/MC/Disassembler/AMDGPU/mtbuf_gfx10.txt index 10d92d9990dc0..4007c5f0dea70 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/mtbuf_gfx10.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/mtbuf_gfx10.txt @@ -1,79 +1,111 @@ # RUN: llvm-mc -arch=amdgcn -mcpu=gfx1010 -disassemble -show-encoding < %s | FileCheck %s -check-prefix=GFX10 -# GFX10: tbuffer_load_format_d16_x v0, off, s[0:3], format:22, 0 +# GFX10: tbuffer_load_format_d16_x v0, off, s[0:3], 0 format:[BUF_FMT_32_FLOAT] 0x00,0x00,0xb0,0xe8,0x00,0x00,0x20,0x80 -# GFX10: tbuffer_load_format_d16_xy v0, off, s[0:3], format:22, 0 + +# GFX10: tbuffer_load_format_d16_xy v0, off, s[0:3], 0 format:[BUF_FMT_32_FLOAT] 0x00,0x00,0xb1,0xe8,0x00,0x00,0x20,0x80 -# GFX10: tbuffer_load_format_d16_xyzw v[0:1], off, s[0:3], format:22, 0 + +# GFX10: tbuffer_load_format_d16_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_32_FLOAT] 0x00,0x00,0xb3,0xe8,0x00,0x00,0x20,0x80 -# GFX10: tbuffer_load_format_xyzw v[0:3], off, s[0:3], format:78, 0 + +# GFX10: tbuffer_load_format_xyzw v[0:3], off, s[0:3], 0 format:78 0x00,0x00,0x73,0xea,0x00,0x00,0x00,0x80 -# GFX10: tbuffer_load_format_xyzw v[8:11], off, s[0:3], format:22, 0 slc + +# GFX10: tbuffer_load_format_xyzw v[8:11], off, s[0:3], 0 format:[BUF_FMT_32_FLOAT] slc 0x00,0x00,0xb3,0xe8,0x00,0x08,0x40,0x80 -# GFX10: tbuffer_load_format_xyzw v[4:7], off, s[0:3], format:63, 0 glc + +# GFX10: tbuffer_load_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_FMT_32_32_SINT] glc 0x00,0x40,0xfb,0xe9,0x00,0x04,0x00,0x80 -# GFX10: tbuffer_load_format_xyzw v[12:15], off, s[0:3], format:23, 0 glc dlc + +# GFX10: tbuffer_load_format_xyzw v[12:15], off, s[0:3], 0 format:[BUF_FMT_16_16_UNORM] glc dlc 0x00,0xc0,0xbb,0xe8,0x00,0x0c,0x00,0x80 -# GFX10: tbuffer_load_format_xyzw v[0:3], off, s[0:3], format:78, 0 offset:42 + +# GFX10: tbuffer_load_format_xyzw v[0:3], off, s[0:3], 0 format:78 offset:42 0x2a,0x00,0x73,0xea,0x00,0x00,0x00,0x80 -# GFX10: tbuffer_load_format_xyzw v[4:7], off, s[0:3], format:62, s4 offset:73 + +# GFX10: tbuffer_load_format_xyzw v[4:7], off, s[0:3], s4 format:[BUF_FMT_32_32_UINT] offset:73 0x49,0x00,0xf3,0xe9,0x00,0x04,0x00,0x04 -# GFX10: tbuffer_load_format_xyzw v[0:3], off, s[0:3], format:47, 61 offset:4095 + +# GFX10: tbuffer_load_format_xyzw v[0:3], off, s[0:3], 61 format:[BUF_FMT_10_10_10_2_SSCALED] offset:4095 0xff,0x0f,0x7b,0xe9,0x00,0x00,0x00,0xbd -# GFX10: tbuffer_load_format_xyzw v[8:11], off, s[0:3], format:77, s4 offset:1 + +# GFX10: tbuffer_load_format_xyzw v[8:11], off, s[0:3], s4 format:[BUF_FMT_32_32_32_32_FLOAT] offset:1 0x01,0x00,0x6b,0xea,0x00,0x08,0x00,0x04 -# GFX10: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], format:78, 0 idxen + +# GFX10: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:78 idxen 0x00,0x20,0x73,0xea,0x00,0x00,0x00,0x80 -# GFX10: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], format:78, 0 offen + +# GFX10: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:78 
offen 0x00,0x10,0x73,0xea,0x00,0x00,0x00,0x80 -# GFX10: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], format:78, 0 offen offset:52 + +# GFX10: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:78 offen offset:52 0x34,0x10,0x73,0xea,0x00,0x00,0x00,0x80 -# GFX10: tbuffer_load_format_xyzw v[0:3], v[0:1], s[0:3], format:78, 0 idxen offen + +# GFX10: tbuffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 format:78 idxen offen 0x00,0x30,0x73,0xea,0x00,0x00,0x00,0x80 -# GFX10: tbuffer_load_format_xy v[0:1], off, s[0:3], format:77, 0 + +# GFX10: tbuffer_load_format_xy v[0:1], off, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] 0x00,0x00,0x69,0xea,0x00,0x00,0x00,0x80 -# GFX10: tbuffer_load_format_x v0, off, s[0:3], format:77, 0 + +# GFX10: tbuffer_load_format_x v0, off, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] 0x00,0x00,0x68,0xea,0x00,0x00,0x00,0x80 -# GFX10: tbuffer_store_format_d16_x v0, v1, s[4:7], format:33, 0 idxen + +# GFX10: tbuffer_store_format_d16_x v0, v1, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen 0x00,0x20,0x0c,0xe9,0x01,0x00,0x21,0x80 -# GFX10: tbuffer_store_format_d16_xy v0, v1, s[4:7], format:33, 0 idxen + +# GFX10: tbuffer_store_format_d16_xy v0, v1, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen 0x00,0x20,0x0d,0xe9,0x01,0x00,0x21,0x80 -# GFX10: tbuffer_store_format_d16_xyzw v[0:1], v2, s[4:7], format:33, 0 idxen + +# GFX10: tbuffer_store_format_d16_xyzw v[0:1], v2, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen 0x00,0x20,0x0f,0xe9,0x02,0x00,0x21,0x80 -# GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], format:44, 0 + +# GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_UNORM] 0x00,0x00,0x67,0xe9,0x00,0x00,0x00,0x80 -# GFX10: tbuffer_store_format_xyzw v[4:7], off, s[0:3], format:61, 0 glc + +# GFX10: tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_FMT_8_8_8_8_SINT] glc 0x00,0x40,0xef,0xe9,0x00,0x04,0x00,0x80 -# GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], format:78, 0 slc + +# GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 slc 0x00,0x00,0x77,0xea,0x00,0x08,0x40,0x80 -# GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], format:78, 0 + +# GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 0x00,0x00,0x77,0xea,0x00,0x08,0x00,0x80 -# GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], format:117, 0 offset:42 + +# GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:117 offset:42 0x2a,0x00,0xaf,0xeb,0x00,0x00,0x00,0x80 -# GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], format:117, s4 offset:42 + +# GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], s4 format:117 offset:42 0x2a,0x00,0xaf,0xeb,0x00,0x00,0x00,0x04 -# GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], format:47, 0 idxen + +# GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SSCALED] idxen 0x00,0x20,0x7f,0xe9,0x04,0x00,0x00,0x80 -# GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], format:115, 0 offen + +# GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:115 offen 0x00,0x10,0x9f,0xeb,0x04,0x00,0x00,0x80 -# GFX10: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], format:70, 0 idxen offen + +# GFX10: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_16_16_16_16_SINT] idxen offen 0x00,0x30,0x37,0xea,0x04,0x00,0x00,0x80 -# GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], format:63, 0 idxen + +# GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_SINT] idxen 0x00,0x20,0xff,0xe9,0x04,0x00,0x00,0x80 -# 
GFX10: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], format:46, 0 idxen + +# GFX10: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_FMT_10_10_10_2_USCALED] idxen 0x00,0x20,0x77,0xe9,0x06,0x00,0x00,0x80 -# GFX10: tbuffer_store_format_x v0, v1, s[0:3], format:125, 0 idxen + +# GFX10: tbuffer_store_format_x v0, v1, s[0:3], 0 format:125 idxen 0x00,0x20,0xec,0xeb,0x01,0x00,0x00,0x80 -# GFX10: tbuffer_store_format_xy v[0:1], v2, s[0:3], format:33, 0 idxen + +# GFX10: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen 0x00,0x20,0x0d,0xe9,0x02,0x00,0x00,0x80 -# GFX10: tbuffer_store_format_x v0, v1, s[0:3], format:127, 0 idxen ; encoding: [0x00,0x20,0xfc,0xeb,0x01,0x00,0x00,0x80] +# GFX10: tbuffer_store_format_x v0, v1, s[0:3], 0 format:127 idxen ; encoding: [0x00,0x20,0xfc,0xeb,0x01,0x00,0x00,0x80] 0x00,0x20,0xfc,0xeb,0x01,0x00,0x00,0x80 -# GFX10: tbuffer_store_format_x v0, v1, s[0:3], format:0, 0 idxen ; encoding: [0x00,0x20,0x04,0xe8,0x01,0x00,0x00,0x80] +# GFX10: tbuffer_store_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_INVALID] idxen ; encoding: [0x00,0x20,0x04,0xe8,0x01,0x00,0x00,0x80] 0x00,0x20,0x04,0xe8,0x01,0x00,0x00,0x80 -# GFX10: tbuffer_load_format_d16_x v0, off, s[0:3], format:0, s0 ; encoding: [0x00,0x00,0x00,0xe8,0x00,0x00,0x20,0x00] +# GFX10: tbuffer_load_format_d16_x v0, off, s[0:3], s0 format:[BUF_FMT_INVALID] ; encoding: [0x00,0x00,0x00,0xe8,0x00,0x00,0x20,0x00] 0x00,0x00,0x00,0xe8,0x00,0x00,0x20,0x00 # GFX10: tbuffer_store_format_x v0, v1, s[0:3], 0 idxen ; encoding: [0x00,0x20,0x0c,0xe8,0x01,0x00,0x00,0x80] @@ -82,5 +114,5 @@ # GFX10: tbuffer_load_format_d16_x v0, off, s[0:3], s0 ; encoding: [0x00,0x00,0x08,0xe8,0x00,0x00,0x20,0x00] 0x00,0x00,0x08,0xe8,0x00,0x00,0x20,0x00 -# GFX10: tbuffer_store_format_x v0, v1, s[0:3], format:2, s0 idxen ; encoding: [0x00,0x20,0x14,0xe8,0x01,0x00,0x00,0x00] +# GFX10: tbuffer_store_format_x v0, v1, s[0:3], s0 format:[BUF_FMT_8_SNORM] idxen ; encoding: [0x00,0x20,0x14,0xe8,0x01,0x00,0x00,0x00] 0x00,0x20,0x14,0xe8,0x01,0x00,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/mtbuf_vi.txt b/llvm/test/MC/Disassembler/AMDGPU/mtbuf_vi.txt index 35f9d3bfd18f4..d32545f7f48c9 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/mtbuf_vi.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/mtbuf_vi.txt @@ -1,40 +1,97 @@ # RUN: llvm-mc -arch=amdgcn -mcpu=tonga -disassemble -show-encoding < %s | FileCheck %s -check-prefix=VI -# VI: tbuffer_load_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x78,0xe9,0x00,0x01,0x01,0x01] +# VI: tbuffer_load_format_x v1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x78,0xe9,0x00,0x01,0x01,0x01] 0x00 0x00 0x78 0xe9 0x00 0x01 0x01 0x01 -# VI: tbuffer_load_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x78,0xe9,0x00,0x01,0x01,0x01] +# VI: tbuffer_load_format_xy v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x78,0xe9,0x00,0x01,0x01,0x01] 0x00 0x80 0x78 0xe9 0x00 0x01 0x01 0x01 -# VI: tbuffer_load_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x79,0xe9,0x00,0x01,0x01,0x01] +# VI: tbuffer_load_format_xyzw v[1:4], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x79,0xe9,0x00,0x01,0x01,0x01] 0x00 0x80 0x79 0xe9 0x00 0x01 0x01 0x01 -# VI: tbuffer_store_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7a,0xe9,0x00,0x01,0x01,0x01] +# VI: 
tbuffer_store_format_x v1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7a,0xe9,0x00,0x01,0x01,0x01] 0x00 0x00 0x7a 0xe9 0x00 0x01 0x01 0x01 -# VI: tbuffer_store_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7a,0xe9,0x00,0x01,0x01,0x01] +# VI: tbuffer_store_format_xy v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7a,0xe9,0x00,0x01,0x01,0x01] 0x00 0x80 0x7a 0xe9 0x00 0x01 0x01 0x01 -# VI: tbuffer_store_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x01,0x01] +# VI: tbuffer_store_format_xyzw v[1:4], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x01,0x01] 0x00 0x80 0x7b 0xe9 0x00 0x01 0x01 0x01 -# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x1d,0x71] +# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x1d,0x71] 0x00 0x80 0x7b 0xe9 0x00 0x01 0x1d 0x71 -# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:0, nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x03,0xe9,0x00,0x01,0x1d,0x71] +# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x03,0xe9,0x00,0x01,0x1d,0x71] 0x00,0x80,0x03,0xe9,0x00,0x01,0x1d,0x71 -# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:0, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe8,0x00,0x01,0x1d,0x71] +# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15] ; encoding: [0x00,0x80,0x7b,0xe8,0x00,0x01,0x1d,0x71] 0x00,0x80,0x7b,0xe8,0x00,0x01,0x1d,0x71 # VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 ; encoding: [0x00,0x80,0x0b,0xe8,0x00,0x01,0x1d,0x71] 0x00,0x80,0x0b,0xe8,0x00,0x01,0x1d,0x71 -# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:0, nfmt:0, ttmp1 ; encoding: [0x00,0x80,0x03,0xe8,0x00,0x01,0x1d,0x71] +# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_INVALID] ; encoding: [0x00,0x80,0x03,0xe8,0x00,0x01,0x1d,0x71] 0x00,0x80,0x03,0xe8,0x00,0x01,0x1d,0x71 -# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:1, nfmt:1, ttmp1 ; encoding: [0x00,0x80,0x8b,0xe8,0x00,0x01,0x1d,0x71] +# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_NUM_FORMAT_SNORM] ; encoding: [0x00,0x80,0x8b,0xe8,0x00,0x01,0x1d,0x71] 0x00,0x80,0x8b,0xe8,0x00,0x01,0x1d,0x71 -# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:7, ttmp1 ; encoding: [0x00,0x80,0xfb,0xeb,0x00,0x01,0x1d,0x71] +# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_FLOAT] ; encoding: [0x00,0x80,0xfb,0xeb,0x00,0x01,0x1d,0x71] 0x00,0x80,0xfb,0xeb,0x00,0x01,0x1d,0x71 + +# VI: tbuffer_load_format_xy v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID] ; encoding: [0x00,0x80,0x00,0xe8,0x00,0x01,0x01,0x01] +0x00 0x80 0x00 0xe8 0x00 0x01 0x01 0x01 + +# VI: tbuffer_load_format_xyzw v[1:4], off, s[4:7], s1 format:[BUF_NUM_FORMAT_SNORM] ; encoding: [0x00,0x80,0x89,0xe8,0x00,0x01,0x01,0x01] +0x00 0x80 0x89 0xe8 0x00 0x01 0x01 0x01 + +# VI: tbuffer_store_format_x v1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_16,BUF_NUM_FORMAT_FLOAT] ; encoding: 
[0x00,0x00,0x92,0xeb,0x00,0x01,0x01,0x01] +0x00 0x00 0x92 0xeb 0x00 0x01 0x01 0x01 + +# VI: tbuffer_store_format_xy v[1:2], off, s[4:7], s1 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_RESERVED_6] ; encoding: [0x00,0x80,0x22,0xeb,0x00,0x01,0x01,0x01] +0x00 0x80 0x22 0xeb 0x00 0x01 0x01 0x01 + +# VI: tbuffer_store_format_xyzw v[1:4], off, s[4:7], s1 format:[BUF_DATA_FORMAT_8_8,BUF_NUM_FORMAT_SSCALED] ; encoding: [0x00,0x80,0x9b,0xe9,0x00,0x01,0x01,0x01] +0x00 0x80 0x9b 0xe9 0x00 0x01 0x01 0x01 + +# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_UINT] ; encoding: [0x00,0x80,0x2b,0xea,0x00,0x01,0x1d,0x71] +0x00 0x80 0x2b 0xea 0x00 0x01 0x1d 0x71 + +# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_32_32,BUF_NUM_FORMAT_SINT] ; encoding: [0x00,0x80,0xdb,0xea,0x00,0x01,0x1d,0x71] +0x00,0x80,0xdb,0xea,0x00,0x01,0x1d,0x71 + +# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_10_11_11] ; encoding: [0x00,0x80,0x33,0xe8,0x00,0x01,0x1d,0x71] +0x00,0x80,0x33,0xe8,0x00,0x01,0x1d,0x71 + +# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 ; encoding: [0x00,0x80,0x0b,0xe8,0x00,0x01,0x1d,0x71] +0x00,0x80,0x0b,0xe8,0x00,0x01,0x1d,0x71 + +# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_16_16_16_16] ; encoding: [0x00,0x80,0x63,0xe8,0x00,0x01,0x1d,0x71] +0x00,0x80,0x63,0xe8,0x00,0x01,0x1d,0x71 + +# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_NUM_FORMAT_SNORM] ; encoding: [0x00,0x80,0x8b,0xe8,0x00,0x01,0x1d,0x71] +0x00,0x80,0x8b,0xe8,0x00,0x01,0x1d,0x71 + +# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_11_11_10,BUF_NUM_FORMAT_FLOAT] ; encoding: [0x00,0x80,0xbb,0xeb,0x00,0x01,0x1d,0x71] +0x00,0x80,0xbb,0xeb,0x00,0x01,0x1d,0x71 + +# VI: tbuffer_store_format_xyzw v[1:4], v1, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_10_10_10_2,BUF_NUM_FORMAT_FLOAT] idxen +0x00,0xa0,0xc3,0xeb,0x01,0x01,0x1d,0x00 + +# VI: tbuffer_store_format_xyzw v[1:4], v1, ttmp[4:7], s0 format:[BUF_DATA_FORMAT_2_10_10_10,BUF_NUM_FORMAT_FLOAT] offen offset:52 +0x34,0x90,0xcb,0xeb,0x01,0x01,0x1d,0x00 + +# VI: tbuffer_store_format_xyzw v[1:4], v[1:2], ttmp[4:7], s0 format:[BUF_DATA_FORMAT_8_8_8_8,BUF_NUM_FORMAT_FLOAT] idxen offen offset:52 +0x34,0xb0,0xd3,0xeb,0x01,0x01,0x1d,0x00 + +# VI: tbuffer_store_format_xyzw v[1:4], v[1:2], ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen offen offset:52 +0x34,0xb0,0xeb,0xeb,0x01,0x01,0x1d,0x00 + +# VI: tbuffer_store_format_xyzw v[1:4], v[1:2], ttmp[4:7], s0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen offen offset:52 +0x34,0xb0,0xf3,0xeb,0x01,0x01,0x1d,0x00 + +# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_RESERVED_6] +0x00,0x80,0x0b,0xeb,0x00,0x01,0x1d,0x00 + +# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], s0 format:[BUF_NUM_FORMAT_FLOAT] +0x00,0x80,0x8b,0xeb,0x00,0x01,0x1d,0x00 From 4ef2e594d5be2e0e6d4446c8082b15466bc7ffcb Mon Sep 17 00:00:00 2001 From: David Truby Date: Thu, 16 Jul 2020 14:15:07 +0100 Subject: [PATCH 0013/1035] [flang] Run non-gtest unit tests with lit. Summary: As a corollary, these tests are now run as part of the check-flang target.
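For context, the non-gtest unit tests that this format discovers are plain executables whose exit code signals success. A minimal sketch of such a test, with a hypothetical file name and contents (not part of this patch):

// sanity.cpp -- hypothetical non-gtest unit test. lit's new
// ExecutableTest format runs the binary and reports PASS iff the
// process exits with code 0.
#include <cstdlib>

int main() {
  int sum = 2 + 2;
  if (sum != 4)
    return EXIT_FAILURE; // non-zero exit: lit reports FAIL
  return EXIT_SUCCESS;   // zero exit: lit reports PASS
}

Built through the add_flang_nongtest_unittest helper introduced below, the binary is named with a .test suffix, which is what the new lit config's config.suffixes rule discovers.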
Reviewers: sscalpone Subscribers: mgorny, delcypher, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D83946 --- flang/CMakeLists.txt | 1 - flang/test/CMakeLists.txt | 7 ++ flang/test/NonGtestUnit/lit.cfg.py | 16 +++++ flang/test/NonGtestUnit/lit.site.cfg.py.in | 27 +++++++ flang/unittests/CMakeLists.txt | 8 +++ flang/unittests/Decimal/CMakeLists.txt | 9 +-- flang/unittests/Evaluate/CMakeLists.txt | 82 ++++------------------ flang/unittests/Runtime/CMakeLists.txt | 41 ++--------- llvm/utils/lit/lit/formats/__init__.py | 3 +- llvm/utils/lit/lit/formats/base.py | 17 +++++ 10 files changed, 97 insertions(+), 114 deletions(-) create mode 100644 flang/test/NonGtestUnit/lit.cfg.py create mode 100644 flang/test/NonGtestUnit/lit.site.cfg.py.in diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt index 9dd6281d410bf..f1aaa5c6473fe 100644 --- a/flang/CMakeLists.txt +++ b/flang/CMakeLists.txt @@ -380,7 +380,6 @@ endif() add_subdirectory(runtime) if (FLANG_INCLUDE_TESTS) - enable_testing() add_subdirectory(test) if (FLANG_GTEST_AVAIL) add_subdirectory(unittests) diff --git a/flang/test/CMakeLists.txt b/flang/test/CMakeLists.txt index ad47fff8c4250..7da1d94d84c4e 100644 --- a/flang/test/CMakeLists.txt +++ b/flang/test/CMakeLists.txt @@ -19,6 +19,13 @@ configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.cfg.py ) +configure_lit_site_cfg( + ${CMAKE_CURRENT_SOURCE_DIR}/NonGtestUnit/lit.site.cfg.py.in + ${CMAKE_CURRENT_BINARY_DIR}/NonGtestUnit/lit.site.cfg.py + MAIN_CONFIG + ${CMAKE_CURRENT_SOURCE_DIR}/NonGtestUnit/lit.cfg.py +) + set(FLANG_TEST_PARAMS flang_site_config=${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py) diff --git a/flang/test/NonGtestUnit/lit.cfg.py b/flang/test/NonGtestUnit/lit.cfg.py new file mode 100644 index 0000000000000..7f53f861bc65c --- /dev/null +++ b/flang/test/NonGtestUnit/lit.cfg.py @@ -0,0 +1,16 @@ +import os + +import lit.Test + +config.name = 'flang-OldUnit' + +config.suffixes = [".test"] + +config.test_source_root = os.path.join(config.flang_obj_root, 'unittests') +config.test_exec_root = config.test_source_root + +config.test_format = lit.formats.ExecutableTest() + +path = os.path.pathsep.join((config.flang_libs_dir, config.llvm_libs_dir, + config.environment.get('LD_LIBRARY_PATH',''))) +config.environment['LD_LIBRARY_PATH'] = path diff --git a/flang/test/NonGtestUnit/lit.site.cfg.py.in b/flang/test/NonGtestUnit/lit.site.cfg.py.in new file mode 100644 index 0000000000000..3218fe0b5ce3d --- /dev/null +++ b/flang/test/NonGtestUnit/lit.site.cfg.py.in @@ -0,0 +1,27 @@ +@LIT_SITE_CFG_IN_HEADER@ + +config.llvm_src_root = "@LLVM_SOURCE_DIR@" +config.llvm_obj_root = "@LLVM_BINARY_DIR@" +config.llvm_tools_dir = "@LLVM_TOOLS_DIR@" +config.llvm_libs_dir = "@LLVM_LIBS_DIR@" +config.llvm_build_mode = "@LLVM_BUILD_MODE@" +config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@" +config.flang_obj_root = "@FLANG_BINARY_DIR@" +config.flang_src_root = "@FLANG_SOURCE_DIR@" +config.flang_libs_dir = "@LLVM_LIBRARY_OUTPUT_INTDIR@" +config.flang_tools_dir = "@LLVM_RUNTIME_OUTPUT_INTDIR@" +config.target_triple = "@TARGET_TRIPLE@" +config.python_executable = "@Python3_EXECUTABLE@" + +# Support substitution of the tools and libs dirs with user parameters. This is +# used when we can't determine the tool dir at configuration time. 
+try: + config.llvm_tools_dir = config.llvm_tools_dir % lit_config.params + config.llvm_libs_dir = config.llvm_libs_dir % lit_config.params + config.llvm_build_mode = config.llvm_build_mode % lit_config.params +except KeyError as e: + key, = e.args + lit_config.fatal("unable to find %r parameter, use '--param=%s=VALUE'" % (key,key)) + +# Let the main config do the real work. +lit_config.load_config(config, "@FLANG_SOURCE_DIR@/test/NonGtestUnit/lit.cfg.py") diff --git a/flang/unittests/CMakeLists.txt b/flang/unittests/CMakeLists.txt index 21da59f3afcbf..a30f0edaec615 100644 --- a/flang/unittests/CMakeLists.txt +++ b/flang/unittests/CMakeLists.txt @@ -9,6 +9,14 @@ if (CXX_SUPPORTS_SUGGEST_OVERRIDE_FLAG) add_compile_options("-Wno-suggest-override") endif() +function(add_flang_nongtest_unittest test_name) + add_executable(${test_name}.test ${test_name}.cpp) + + target_link_libraries(${test_name}.test ${ARGN}) + + add_dependencies(FlangUnitTests ${test_name}.test) +endfunction() + add_subdirectory(Optimizer) add_subdirectory(Decimal) add_subdirectory(Evaluate) diff --git a/flang/unittests/Decimal/CMakeLists.txt b/flang/unittests/Decimal/CMakeLists.txt index f26aca5d0e9b6..112b02f9029f9 100644 --- a/flang/unittests/Decimal/CMakeLists.txt +++ b/flang/unittests/Decimal/CMakeLists.txt @@ -1,13 +1,10 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) -add_executable(quick-sanity-test - quick-sanity-test.cpp -) - -target_link_libraries(quick-sanity-test +add_flang_nongtest_unittest(quick-sanity-test FortranDecimal LLVMSupport ) +# This test is not run by default as it takes a long time to execute add_executable(thorough-test thorough-test.cpp ) @@ -16,5 +13,3 @@ target_link_libraries(thorough-test FortranDecimal LLVMSupport ) - -add_test(NAME Sanity COMMAND quick-sanity-test) diff --git a/flang/unittests/Evaluate/CMakeLists.txt b/flang/unittests/Evaluate/CMakeLists.txt index 54676b0b5a0ad..c56789b598ce7 100644 --- a/flang/unittests/Evaluate/CMakeLists.txt +++ b/flang/unittests/Evaluate/CMakeLists.txt @@ -8,43 +8,22 @@ target_link_libraries(FortranEvaluateTesting LLVMSupport ) -add_executable(leading-zero-bit-count-test - leading-zero-bit-count.cpp -) - -target_link_libraries(leading-zero-bit-count-test +add_flang_nongtest_unittest(leading-zero-bit-count FortranEvaluateTesting LLVMSupport ) -add_executable(bit-population-count-test - bit-population-count.cpp -) - -target_link_libraries(bit-population-count-test +add_flang_nongtest_unittest(bit-population-count FortranEvaluateTesting LLVMSupport ) -add_executable(uint128-test - uint128.cpp -) - -target_link_libraries(uint128-test +add_flang_nongtest_unittest(uint128 FortranEvaluateTesting LLVMSupport ) -# These routines live in lib/Common but we test them here. 
-add_test(UINT128 uint128-test) -add_test(Leadz leading-zero-bit-count-test) -add_test(PopPar bit-population-count-test) - -add_executable(expression-test - expression.cpp -) - -target_link_libraries(expression-test +add_flang_nongtest_unittest(expression FortranCommon FortranEvaluateTesting FortranEvaluate @@ -53,22 +32,14 @@ target_link_libraries(expression-test LLVMSupport ) -add_executable(integer-test - integer.cpp -) - -target_link_libraries(integer-test +add_flang_nongtest_unittest(integer FortranEvaluateTesting FortranEvaluate FortranSemantics LLVMSupport ) -add_executable(intrinsics-test - intrinsics.cpp -) - -target_link_libraries(intrinsics-test +add_flang_nongtest_unittest(intrinsics FortranCommon FortranEvaluateTesting FortranEvaluate @@ -79,11 +50,7 @@ target_link_libraries(intrinsics-test LLVMSupport ) -add_executable(logical-test - logical.cpp -) - -target_link_libraries(logical-test +add_flang_nongtest_unittest(logical FortranEvaluateTesting FortranEvaluate FortranSemantics @@ -96,24 +63,16 @@ target_link_libraries(logical-test # C++ exceptions are enabled for this test. set(LLVM_REQUIRES_EH ON) set(LLVM_REQUIRES_RTTI ON) -add_executable(real-test - real.cpp -) -llvm_update_compile_flags(real-test) - -target_link_libraries(real-test +add_flang_nongtest_unittest(real FortranEvaluateTesting FortranEvaluate FortranDecimal FortranSemantics LLVMSupport ) +llvm_update_compile_flags(real.test) -add_executable(reshape-test - reshape.cpp -) - -target_link_libraries(reshape-test +add_flang_nongtest_unittest(reshape FortranEvaluateTesting FortranSemantics FortranEvaluate @@ -121,11 +80,7 @@ target_link_libraries(reshape-test LLVMSupport ) -add_executable(ISO-Fortran-binding-test - ISO-Fortran-binding.cpp -) - -target_link_libraries(ISO-Fortran-binding-test +add_flang_nongtest_unittest(ISO-Fortran-binding FortranEvaluateTesting FortranEvaluate FortranSemantics @@ -133,23 +88,10 @@ target_link_libraries(ISO-Fortran-binding-test LLVMSupport ) -add_executable(folding-test - folding.cpp -) - -target_link_libraries(folding-test +add_flang_nongtest_unittest(folding FortranCommon FortranEvaluateTesting FortranEvaluate FortranSemantics LLVMSupport ) - -add_test(Expression expression-test) -add_test(Integer integer-test) -add_test(Intrinsics intrinsics-test) -add_test(Logical logical-test) -add_test(Real real-test) -add_test(RESHAPE reshape-test) -add_test(ISO-binding ISO-Fortran-binding-test) -add_test(folding folding-test) diff --git a/flang/unittests/Runtime/CMakeLists.txt b/flang/unittests/Runtime/CMakeLists.txt index b13c84e88a028..041b631f59d6b 100644 --- a/flang/unittests/Runtime/CMakeLists.txt +++ b/flang/unittests/Runtime/CMakeLists.txt @@ -13,30 +13,19 @@ target_link_libraries(RuntimeTesting LLVMSupport ) -add_executable(format-test - format.cpp -) - -target_link_libraries(format-test +add_flang_nongtest_unittest(format RuntimeTesting FortranRuntime LLVMSupport ) -add_test(NAME Format COMMAND format-test) - -add_executable(hello-world - hello.cpp -) - -target_link_libraries(hello-world +add_flang_nongtest_unittest(hello RuntimeTesting FortranRuntime LLVMSupport ) -add_test(NAME HelloWorld COMMAND hello-world) - +# This test is not run by default as it requires input. 
add_executable(external-hello-world external-hello.cpp ) @@ -46,38 +35,20 @@ target_link_libraries(external-hello-world LLVMSupport ) -add_executable(external-io - external-io.cpp -) - -target_link_libraries(external-io +add_flang_nongtest_unittest(external-io RuntimeTesting FortranRuntime LLVMSupport ) -add_test(NAME ExternalIO COMMAND external-io) - -add_executable(list-input-test - list-input.cpp -) - -target_link_libraries(list-input-test +add_flang_nongtest_unittest(list-input RuntimeTesting FortranRuntime LLVMSupport ) -add_test(NAME ListInput COMMAND list-input-test) - -add_executable(character-test - character.cpp -) - -target_link_libraries(character-test +add_flang_nongtest_unittest(character RuntimeTesting FortranRuntime LLVMSupport ) - -add_test(NAME CharacterTest COMMAND character-test) diff --git a/llvm/utils/lit/lit/formats/__init__.py b/llvm/utils/lit/lit/formats/__init__.py index 3ff46e93ead2e..7a357657670f3 100644 --- a/llvm/utils/lit/lit/formats/__init__.py +++ b/llvm/utils/lit/lit/formats/__init__.py @@ -1,7 +1,8 @@ from lit.formats.base import ( # noqa: F401 TestFormat, FileBasedTest, - OneCommandPerFileTest + OneCommandPerFileTest, + ExecutableTest ) from lit.formats.googletest import GoogleTest # noqa: F401 diff --git a/llvm/utils/lit/lit/formats/base.py b/llvm/utils/lit/lit/formats/base.py index 6721d17e334e6..b44a606e76a82 100644 --- a/llvm/utils/lit/lit/formats/base.py +++ b/llvm/utils/lit/lit/formats/base.py @@ -115,3 +115,20 @@ def execute(self, test, litConfig): report += """Output:\n--\n%s--""" % diags return lit.Test.FAIL, report + + +### + +# Check exit code of a simple executable with no input +class ExecutableTest(FileBasedTest): + def execute(self, test, litConfig): + if test.config.unsupported: + return lit.Test.UNSUPPORTED + + out, err, exitCode = lit.util.executeCommand(test.getSourcePath()) + + if not exitCode: + return lit.Test.PASS, '' + + return lit.Test.FAIL, out+err + From c332a984aefc6f8a6b44fd3687a5bbce3f8f035c Mon Sep 17 00:00:00 2001 From: Yitzhak Mandelbaum Date: Fri, 24 Jul 2020 13:27:51 +0000 Subject: [PATCH 0014/1035] [libTooling] Add an `EditGenerator` that applies a rule throughout a bound node. The new combinator, `rewriteDescendants`, applies a rewrite rule to all descendants of a specified bound node. That rewrite rule can refer to nodes bound by the parent, both in the matcher and in the edits. Reviewed By: gribozavr2 Differential Revision: https://reviews.llvm.org/D84409 --- .../clang/Tooling/Transformer/RewriteRule.h | 17 +++ clang/lib/Tooling/Transformer/RewriteRule.cpp | 137 +++++++++++++++++- clang/lib/Tooling/Transformer/Transformer.cpp | 7 +- clang/unittests/Tooling/TransformerTest.cpp | 118 ++++++++++++++- 4 files changed, 269 insertions(+), 10 deletions(-) diff --git a/clang/include/clang/Tooling/Transformer/RewriteRule.h b/clang/include/clang/Tooling/Transformer/RewriteRule.h index c22a2da81fe60..2a26d32817dde 100644 --- a/clang/include/clang/Tooling/Transformer/RewriteRule.h +++ b/clang/include/clang/Tooling/Transformer/RewriteRule.h @@ -332,6 +332,23 @@ inline EditGenerator shrinkTo(RangeSelector outer, RangeSelector inner) { remove(enclose(after(inner), after(outer)))}); } +/// Applies `Rule` to all descendants of the node bound to `NodeId`. `Rule` can +/// refer to nodes bound by the calling rule. `Rule` is not applied to the node +/// itself. 
+/// +/// For example, +/// ``` +/// auto InlineX = +/// makeRule(declRefExpr(to(varDecl(hasName("x")))), changeTo(cat("3"))); +/// makeRule(functionDecl(hasName("f"), hasBody(stmt().bind("body"))).bind("f"), +/// flatten( +/// changeTo(name("f"), cat("newName")), +/// rewriteDescendants("body", InlineX))); +/// ``` +/// Here, we find the function `f`, change its name to `newName` and change all +/// appearances of `x` in its body to `3`. +EditGenerator rewriteDescendants(std::string NodeId, RewriteRule Rule); + /// The following three functions are a low-level part of the RewriteRule /// API. We expose them for use in implementing the fixtures that interpret /// RewriteRule, like Transformer and TransfomerTidy, or for more advanced diff --git a/clang/lib/Tooling/Transformer/RewriteRule.cpp b/clang/lib/Tooling/Transformer/RewriteRule.cpp index c145895af7ab6..ce773b59a7e7e 100644 --- a/clang/lib/Tooling/Transformer/RewriteRule.cpp +++ b/clang/lib/Tooling/Transformer/RewriteRule.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "clang/Tooling/Transformer/RewriteRule.h" +#include "clang/AST/ASTTypeTraits.h" +#include "clang/AST/Stmt.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" #include "clang/Basic/SourceLocation.h" @@ -115,15 +117,144 @@ ASTEdit transformer::remove(RangeSelector S) { return change(std::move(S), std::make_shared("")); } -RewriteRule transformer::makeRule(ast_matchers::internal::DynTypedMatcher M, - EditGenerator Edits, +RewriteRule transformer::makeRule(DynTypedMatcher M, EditGenerator Edits, TextGenerator Explanation) { return RewriteRule{{RewriteRule::Case{ std::move(M), std::move(Edits), std::move(Explanation), {}}}}; } +namespace { + +/// Unconditionally binds the given node set before trying `InnerMatcher` and +/// keeps the bound nodes on a successful match. +template +class BindingsMatcher : public ast_matchers::internal::MatcherInterface { + ast_matchers::BoundNodes Nodes; + const ast_matchers::internal::Matcher InnerMatcher; + +public: + explicit BindingsMatcher(ast_matchers::BoundNodes Nodes, + ast_matchers::internal::Matcher InnerMatcher) + : Nodes(std::move(Nodes)), InnerMatcher(std::move(InnerMatcher)) {} + + bool matches( + const T &Node, ast_matchers::internal::ASTMatchFinder *Finder, + ast_matchers::internal::BoundNodesTreeBuilder *Builder) const override { + ast_matchers::internal::BoundNodesTreeBuilder Result(*Builder); + for (const auto &N : Nodes.getMap()) + Result.setBinding(N.first, N.second); + if (InnerMatcher.matches(Node, Finder, &Result)) { + *Builder = std::move(Result); + return true; + } + return false; + } +}; + +/// Matches nodes of type T that have at least one descendant node for which the +/// given inner matcher matches. Will match for each descendant node that +/// matches. Based on ForEachDescendantMatcher, but takes a dynamic matcher, +/// instead of a static one, because it is used by RewriteRule, which carries +/// (only top-level) dynamic matchers. 
+template +class DynamicForEachDescendantMatcher + : public ast_matchers::internal::MatcherInterface { + const DynTypedMatcher DescendantMatcher; + +public: + explicit DynamicForEachDescendantMatcher(DynTypedMatcher DescendantMatcher) + : DescendantMatcher(std::move(DescendantMatcher)) {} + + bool matches( + const T &Node, ast_matchers::internal::ASTMatchFinder *Finder, + ast_matchers::internal::BoundNodesTreeBuilder *Builder) const override { + return Finder->matchesDescendantOf( + Node, this->DescendantMatcher, Builder, + ast_matchers::internal::ASTMatchFinder::BK_All); + } +}; + +template +ast_matchers::internal::Matcher +forEachDescendantDynamically(ast_matchers::BoundNodes Nodes, + DynTypedMatcher M) { + return ast_matchers::internal::makeMatcher(new BindingsMatcher( + std::move(Nodes), + ast_matchers::internal::makeMatcher( + new DynamicForEachDescendantMatcher(std::move(M))))); +} + +class ApplyRuleCallback : public MatchFinder::MatchCallback { +public: + ApplyRuleCallback(RewriteRule Rule) : Rule(std::move(Rule)) {} + + template + void registerMatchers(const ast_matchers::BoundNodes &Nodes, + MatchFinder *MF) { + for (auto &Matcher : transformer::detail::buildMatchers(Rule)) + MF->addMatcher(forEachDescendantDynamically(Nodes, Matcher), this); + } + + void run(const MatchFinder::MatchResult &Result) override { + if (!Edits) + return; + transformer::RewriteRule::Case Case = + transformer::detail::findSelectedCase(Result, Rule); + auto Transformations = Case.Edits(Result); + if (!Transformations) { + Edits = Transformations.takeError(); + return; + } + Edits->append(Transformations->begin(), Transformations->end()); + } + + RewriteRule Rule; + + // Initialize to a non-error state. + Expected> Edits = SmallVector(); +}; +} // namespace + +template +llvm::Expected> +rewriteDescendantsImpl(const T &Node, RewriteRule Rule, + const MatchResult &Result) { + ApplyRuleCallback Callback(std::move(Rule)); + MatchFinder Finder; + Callback.registerMatchers(Result.Nodes, &Finder); + Finder.match(Node, *Result.Context); + return std::move(Callback.Edits); +} + +EditGenerator transformer::rewriteDescendants(std::string NodeId, + RewriteRule Rule) { + // FIXME: warn or return error if `Rule` contains any `AddedIncludes`, since + // these will be dropped. 
+ return [NodeId = std::move(NodeId), + Rule = std::move(Rule)](const MatchResult &Result) + -> llvm::Expected> { + const ast_matchers::BoundNodes::IDToNodeMap &NodesMap = + Result.Nodes.getMap(); + auto It = NodesMap.find(NodeId); + if (It == NodesMap.end()) + return llvm::make_error(llvm::errc::invalid_argument, + "ID not bound: " + NodeId); + if (auto *Node = It->second.get()) + return rewriteDescendantsImpl(*Node, std::move(Rule), Result); + if (auto *Node = It->second.get()) + return rewriteDescendantsImpl(*Node, std::move(Rule), Result); + if (auto *Node = It->second.get()) + return rewriteDescendantsImpl(*Node, std::move(Rule), Result); + + return llvm::make_error( + llvm::errc::invalid_argument, + "type unsupported for recursive rewriting, ID=\"" + NodeId + + "\", Kind=" + It->second.getNodeKind().asStringRef()); + }; +} + void transformer::addInclude(RewriteRule &Rule, StringRef Header, - IncludeFormat Format) { + IncludeFormat Format) { for (auto &Case : Rule.Cases) Case.AddedIncludes.emplace_back(Header.str(), Format); } diff --git a/clang/lib/Tooling/Transformer/Transformer.cpp b/clang/lib/Tooling/Transformer/Transformer.cpp index e8fc00c4e953f..5b5be7a396db4 100644 --- a/clang/lib/Tooling/Transformer/Transformer.cpp +++ b/clang/lib/Tooling/Transformer/Transformer.cpp @@ -38,13 +38,8 @@ void Transformer::run(const MatchFinder::MatchResult &Result) { return; } - if (Transformations->empty()) { - // No rewrite applied (but no error encountered either). - transformer::detail::getRuleMatchLoc(Result).print( - llvm::errs() << "note: skipping match at loc ", *Result.SourceManager); - llvm::errs() << "\n"; + if (Transformations->empty()) return; - } // Group the transformations, by file, into AtomicChanges, each anchored by // the location of the first change in that file. diff --git a/clang/unittests/Tooling/TransformerTest.cpp b/clang/unittests/Tooling/TransformerTest.cpp index 1a68eb1d172a3..77fd380410b27 100644 --- a/clang/unittests/Tooling/TransformerTest.cpp +++ b/clang/unittests/Tooling/TransformerTest.cpp @@ -114,7 +114,9 @@ class ClangRefactoringTestBase : public testing::Test { if (C) { Changes.push_back(std::move(*C)); } else { - consumeError(C.takeError()); + // FIXME: stash this error rather than printing. + llvm::errs() << "Error generating changes: " + << llvm::toString(C.takeError()) << "\n"; ++ErrorCount; } }; @@ -414,6 +416,120 @@ TEST_F(TransformerTest, ShrinkTo) { Input, Expected); } + +// Rewrite various Stmts inside a Decl. +TEST_F(TransformerTest, RewriteDescendantsDeclChangeStmt) { + std::string Input = + "int f(int x) { int y = x; { int z = x * x; } return x; }"; + std::string Expected = + "int f(int x) { int y = 3; { int z = 3 * 3; } return 3; }"; + auto InlineX = + makeRule(declRefExpr(to(varDecl(hasName("x")))), changeTo(cat("3"))); + testRule(makeRule(functionDecl(hasName("f")).bind("fun"), + rewriteDescendants("fun", InlineX)), + Input, Expected); +} + +// Rewrite various TypeLocs inside a Decl.
+TEST_F(TransformerTest, RewriteDescendantsDeclChangeTypeLoc) { + std::string Input = "int f(int *x) { return *x; }"; + std::string Expected = "char f(char *x) { return *x; }"; + auto IntToChar = makeRule(typeLoc(loc(qualType(isInteger(), builtinType()))), + changeTo(cat("char"))); + testRule(makeRule(functionDecl(hasName("f")).bind("fun"), + rewriteDescendants("fun", IntToChar)), + Input, Expected); +} + +TEST_F(TransformerTest, RewriteDescendantsStmt) { + // Add an unrelated definition to the header that also has a variable named + // "x", to test that the rewrite is limited to the scope we intend. + appendToHeader(R"cc(int g(int x) { return x; })cc"); + std::string Input = + "int f(int x) { int y = x; { int z = x * x; } return x; }"; + std::string Expected = + "int f(int x) { int y = 3; { int z = 3 * 3; } return 3; }"; + auto InlineX = + makeRule(declRefExpr(to(varDecl(hasName("x")))), changeTo(cat("3"))); + testRule(makeRule(functionDecl(hasName("f"), hasBody(stmt().bind("body"))), + rewriteDescendants("body", InlineX)), + Input, Expected); +} + +TEST_F(TransformerTest, RewriteDescendantsStmtWithAdditionalChange) { + std::string Input = + "int f(int x) { int y = x; { int z = x * x; } return x; }"; + std::string Expected = + "int newName(int x) { int y = 3; { int z = 3 * 3; } return 3; }"; + auto InlineX = + makeRule(declRefExpr(to(varDecl(hasName("x")))), changeTo(cat("3"))); + testRule( + makeRule( + functionDecl(hasName("f"), hasBody(stmt().bind("body"))).bind("f"), + flatten(changeTo(name("f"), cat("newName")), + rewriteDescendants("body", InlineX))), + Input, Expected); +} + +TEST_F(TransformerTest, RewriteDescendantsTypeLoc) { + std::string Input = "int f(int *x) { return *x; }"; + std::string Expected = "int f(char *x) { return *x; }"; + auto IntToChar = + makeRule(typeLoc(loc(qualType(isInteger(), builtinType()))).bind("loc"), + changeTo(cat("char"))); + testRule( + makeRule(functionDecl(hasName("f"), + hasParameter(0, varDecl(hasTypeLoc( + typeLoc().bind("parmType"))))), + rewriteDescendants("parmType", IntToChar)), + Input, Expected); +} + +TEST_F(TransformerTest, RewriteDescendantsReferToParentBinding) { + std::string Input = + "int f(int p) { int y = p; { int z = p * p; } return p; }"; + std::string Expected = + "int f(int p) { int y = 3; { int z = 3 * 3; } return 3; }"; + std::string VarId = "var"; + auto InlineVar = makeRule(declRefExpr(to(varDecl(equalsBoundNode(VarId)))), + changeTo(cat("3"))); + testRule(makeRule(functionDecl(hasName("f"), + hasParameter(0, varDecl().bind(VarId))) + .bind("fun"), + rewriteDescendants("fun", InlineVar)), + Input, Expected); +} + +TEST_F(TransformerTest, RewriteDescendantsUnboundNode) { + std::string Input = + "int f(int x) { int y = x; { int z = x * x; } return x; }"; + auto InlineX = + makeRule(declRefExpr(to(varDecl(hasName("x")))), changeTo(cat("3"))); + Transformer T(makeRule(functionDecl(hasName("f")), + rewriteDescendants("UNBOUND", InlineX)), + consumer()); + T.registerMatchers(&MatchFinder); + EXPECT_FALSE(rewrite(Input)); + EXPECT_THAT(Changes, IsEmpty()); + EXPECT_EQ(ErrorCount, 1); +} + +TEST_F(TransformerTest, RewriteDescendantsInvalidNodeType) { + std::string Input = + "int f(int x) { int y = x; { int z = x * x; } return x; }"; + auto IntToChar = + makeRule(qualType(isInteger(), builtinType()), changeTo(cat("char"))); + Transformer T( + makeRule(functionDecl( + hasName("f"), + hasParameter(0, varDecl(hasType(qualType().bind("type"))))), + rewriteDescendants("type", IntToChar)), + consumer()); + 
T.registerMatchers(&MatchFinder); + EXPECT_FALSE(rewrite(Input)); + EXPECT_THAT(Changes, IsEmpty()); + EXPECT_EQ(ErrorCount, 1); +} + TEST_F(TransformerTest, InsertBeforeEdit) { std::string Input = R"cc( int f() { From 9840208db6980f690d09b209e6ad6d57133ec5e5 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 15 Jul 2020 17:32:02 -0400 Subject: [PATCH 0015/1035] [OPENMP] Fix PR46730: compiler crash on taskloop over constructible loop counters. Summary: If the variable is constructible, its copy is created by calling a constructor. Such variables are duplicated and thus must be captured. Reviewers: jdoerfert Subscribers: yaxunl, guansong, cfe-commits, sstefan1, caomhin Tags: #clang Differential Revision: https://reviews.llvm.org/D83909 --- clang/lib/Sema/SemaOpenMP.cpp | 6 +++++- clang/test/OpenMP/taskloop_codegen.cpp | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 51609e37e20ca..0192df3bd170d 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -2244,7 +2244,11 @@ OpenMPClauseKind Sema::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level, [](OpenMPDirectiveKind K) { return isOpenMPTaskingDirective(K); }, Level)) { bool IsTriviallyCopyable = - D->getType().getNonReferenceType().isTriviallyCopyableType(Context); + D->getType().getNonReferenceType().isTriviallyCopyableType(Context) && + !D->getType() + .getNonReferenceType() + .getCanonicalType() + ->getAsCXXRecordDecl(); OpenMPDirectiveKind DKind = DSAStack->getDirective(Level); SmallVector CaptureRegions; getOpenMPCaptureRegions(CaptureRegions, DKind); diff --git a/clang/test/OpenMP/taskloop_codegen.cpp b/clang/test/OpenMP/taskloop_codegen.cpp index 55e43ff3a1152..7402c2ad65eba 100644 --- a/clang/test/OpenMP/taskloop_codegen.cpp +++ b/clang/test/OpenMP/taskloop_codegen.cpp @@ -229,4 +229,20 @@ struct S { // CHECK: br label % // CHECK: ret i32 0 +class St { +public: + operator int(); + St &operator+=(int); +}; + +// CHECK-LABEL: taskloop_with_class +void taskloop_with_class() { + St s1; + // CHECK: [[TD:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @{{.+}}, i32 [[GTID:%.+]], i32 1, i64 88, i64 8, i32 (i32, i8*)* bitcast (i32 (i32, [[TD_TYPE:%.+]]*)* @{{.+}} to i32 (i32, i8*)*)) + // CHECK: call void @__kmpc_taskloop(%struct.ident_t* @{{.+}}, i32 [[GTID]], i8* [[TD]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* bitcast (void ([[TD_TYPE]]*, [[TD_TYPE]]*, i32)* @{{.+}} to i8*)) +#pragma omp taskloop + for (St s = St(); s < s1; s += 1) { + } +} + #endif From 77b61177d7d4c4fe8714f8828123f626f4549be1 Mon Sep 17 00:00:00 2001 From: diggerlin Date: Fri, 24 Jul 2020 11:28:17 -0400 Subject: [PATCH 0016/1035] [AIX] Remove -u from clang when invoking the AIX assembler SUMMARY: Since we now emit the .extern directive for external symbols, the -u option for the AIX assembler is no longer needed.
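To make the rationale concrete, consider a hedged sketch (hypothetical file and symbol names, not from this patch) of a translation unit with an undefined external reference:

// caller.cpp -- hypothetical example; 'callee' is defined in some
// other translation unit, so the symbol is undefined here.
extern int callee(int);

int caller(int x) {
  // The compiler now writes an explicit .extern directive for 'callee'
  // into the generated AIX assembly, so the system assembler accepts
  // the undefined reference without being passed -u.
  return callee(x) + 1;
}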
Reviewers: Jason liu Differential Revision: https://reviews.llvm.org/D84356 --- clang/lib/Driver/ToolChains/AIX.cpp | 6 ------ clang/test/Driver/aix-as.c | 7 ------- 2 files changed, 13 deletions(-) diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp index ac5544eedb00b..f9d8e18d6fd01 100644 --- a/clang/lib/Driver/ToolChains/AIX.cpp +++ b/clang/lib/Driver/ToolChains/AIX.cpp @@ -44,12 +44,6 @@ void aix::Assembler::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-a64"); } - // Accept an undefined symbol as an extern so that an error message is not - // displayed. Otherwise, undefined symbols are flagged with error messages. - // FIXME: This should be removed when the assembly generation from the - // compiler is able to write externs properly. - CmdArgs.push_back("-u"); - // Accept any mixture of instructions. // On Power for AIX and Linux, this behaviour matches that of GCC for both the // user-provided assembler source case and the compiler-produced assembler diff --git a/clang/test/Driver/aix-as.c b/clang/test/Driver/aix-as.c index cb3053f5acd31..aa8c610359037 100644 --- a/clang/test/Driver/aix-as.c +++ b/clang/test/Driver/aix-as.c @@ -9,7 +9,6 @@ // CHECK-AS32: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0" // CHECK-AS32: "{{.*}}as{{(.exe)?}}" // CHECK-AS32: "-a32" -// CHECK-AS32: "-u" // CHECK-AS32: "-many" // Check powerpc64-ibm-aix7.1.0.0, 64-bit. @@ -20,7 +19,6 @@ // CHECK-AS64: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc64-ibm-aix7.1.0.0" // CHECK-AS64: "{{.*}}as{{(.exe)?}}" // CHECK-AS64: "-a64" -// CHECK-AS64: "-u" // CHECK-AS64: "-many" // Check powerpc-ibm-aix7.1.0.0, 32-bit. -Xassembler option. @@ -32,7 +30,6 @@ // CHECK-AS32-Xassembler: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0" // CHECK-AS32-Xassembler: "{{.*}}as{{(.exe)?}}" // CHECK-AS32-Xassembler: "-a32" -// CHECK-AS32-Xassembler: "-u" // CHECK-AS32-Xassembler: "-many" // CHECK-AS32-Xassembler: "-w" @@ -45,7 +42,6 @@ // CHECK-AS64-Wa: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc64-ibm-aix7.1.0.0" // CHECK-AS64-Wa: "{{.*}}as{{(.exe)?}}" // CHECK-AS64-Wa: "-a64" -// CHECK-AS64-Wa: "-u" // CHECK-AS64-Wa: "-many" // CHECK-AS64-Wa: "-v" // CHECK-AS64-Wa: "-w" @@ -60,13 +56,10 @@ // CHECK-AS32-MultiInput-NOT: warning: // CHECK-AS32-MultiInput: "{{.*}}as{{(.exe)?}}" // CHECK-AS32-MultiInput: "-a32" -// CHECK-AS32-MultiInput: "-u" // CHECK-AS32-MultiInput: "-many" // CHECK-AS32-MultiInput: "{{.*}}as{{(.exe)?}}" // CHECK-AS32-MultiInput: "-a32" -// CHECK-AS32-MultiInput: "-u" // CHECK-AS32-MultiInput: "-many" // CHECK-AS32-MultiInput: "{{.*}}as{{(.exe)?}}" // CHECK-AS32-MultiInput: "-a32" -// CHECK-AS32-MultiInput: "-u" // CHECK-AS32-MultiInput: "-many" From 5934df0c9abe94fc450fbcf0ceca21cf838840e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Fri, 24 Jul 2020 18:18:09 +0200 Subject: [PATCH 0017/1035] MachineBasicBlock: add printName method Common up some existing MBB name printing logic into a single place. Note that basic block dumping now prints the same set of attributes as the MIRPrinter. 
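As an illustrative sketch (not part of this patch's diff), a client that
wants the full MIR-style label can now go through the common hook instead of
hand-rolling the attribute printing. The emitLabel wrapper below is
hypothetical; printName and the two flag names are the API added here:

  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/IR/ModuleSlotTracker.h"
  #include "llvm/Support/raw_ostream.h"

  // Hypothetical caller: prints e.g. "bb.3.for.body (address-taken, align 16):".
  static void emitLabel(llvm::raw_ostream &OS,
                        const llvm::MachineBasicBlock &MBB,
                        llvm::ModuleSlotTracker &MST) {
    MBB.printName(OS, llvm::MachineBasicBlock::PrintNameIr |
                          llvm::MachineBasicBlock::PrintNameAttributes,
                  &MST);
    OS << ":\n";
  }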
Change-Id: I8f022bbd922e831bc96d63143d7472c03282530b Differential Revision: https://reviews.llvm.org/D83253 --- llvm/include/llvm/CodeGen/MachineBasicBlock.h | 8 ++ llvm/lib/CodeGen/MIRPrinter.cpp | 56 +------- llvm/lib/CodeGen/MachineBasicBlock.cpp | 126 +++++++++++++----- 3 files changed, 104 insertions(+), 86 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h index d6cb7211cf70e..0360e706cbc4a 100644 --- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -872,6 +872,14 @@ class MachineBasicBlock void print(raw_ostream &OS, ModuleSlotTracker &MST, const SlotIndexes * = nullptr, bool IsStandalone = true) const; + enum PrintNameFlag { + PrintNameIr = (1 << 0), ///< Add IR name where available + PrintNameAttributes = (1 << 1), ///< Print attributes + }; + + void printName(raw_ostream &os, unsigned printNameFlags = PrintNameIr, + ModuleSlotTracker *moduleSlotTracker = nullptr) const; + // Printing method used by LoopInfo. void printAsOperand(raw_ostream &OS, bool PrintType = true) const; diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index fa23df6288e99..dde0dc456c05f 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -608,58 +608,10 @@ bool MIPrinter::canPredictSuccessors(const MachineBasicBlock &MBB) const { void MIPrinter::print(const MachineBasicBlock &MBB) { assert(MBB.getNumber() >= 0 && "Invalid MBB number"); - OS << "bb." << MBB.getNumber(); - bool HasAttributes = false; - if (const auto *BB = MBB.getBasicBlock()) { - if (BB->hasName()) { - OS << "." << BB->getName(); - } else { - HasAttributes = true; - OS << " ("; - int Slot = MST.getLocalSlot(BB); - if (Slot == -1) - OS << ""; - else - OS << (Twine("%ir-block.") + Twine(Slot)).str(); - } - } - if (MBB.hasAddressTaken()) { - OS << (HasAttributes ? ", " : " ("); - OS << "address-taken"; - HasAttributes = true; - } - if (MBB.isEHPad()) { - OS << (HasAttributes ? ", " : " ("); - OS << "landing-pad"; - HasAttributes = true; - } - if (MBB.isEHFuncletEntry()) { - OS << (HasAttributes ? ", " : " ("); - OS << "ehfunclet-entry"; - HasAttributes = true; - } - if (MBB.getAlignment() != Align(1)) { - OS << (HasAttributes ? ", " : " ("); - OS << "align " << MBB.getAlignment().value(); - HasAttributes = true; - } - if (MBB.getSectionID() != MBBSectionID(0)) { - OS << (HasAttributes ? ", " : " ("); - OS << "bbsections "; - switch (MBB.getSectionID().Type) { - case MBBSectionID::SectionType::Exception: - OS << "Exception"; - break; - case MBBSectionID::SectionType::Cold: - OS << "Cold"; - break; - default: - OS << MBB.getSectionID().Number; - } - HasAttributes = true; - } - if (HasAttributes) - OS << ")"; + MBB.printName(OS, + MachineBasicBlock::PrintNameIr | + MachineBasicBlock::PrintNameAttributes, + &MST); OS << ":\n"; bool HasLineAttributes = false; diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp index 2d4b60435d962..626c04074a617 100644 --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -338,39 +338,7 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, if (Indexes && PrintSlotIndexes) OS << Indexes->getMBBStartIdx(this) << '\t'; - OS << "bb." << getNumber(); - bool HasAttributes = false; - if (const auto *BB = getBasicBlock()) { - if (BB->hasName()) { - OS << "." 
<< BB->getName(); - } else { - HasAttributes = true; - OS << " ("; - int Slot = MST.getLocalSlot(BB); - if (Slot == -1) - OS << ""; - else - OS << (Twine("%ir-block.") + Twine(Slot)).str(); - } - } - - if (hasAddressTaken()) { - OS << (HasAttributes ? ", " : " ("); - OS << "address-taken"; - HasAttributes = true; - } - if (isEHPad()) { - OS << (HasAttributes ? ", " : " ("); - OS << "landing-pad"; - HasAttributes = true; - } - if (getAlignment() != Align(1)) { - OS << (HasAttributes ? ", " : " ("); - OS << "align " << Log2(getAlignment()); - HasAttributes = true; - } - if (HasAttributes) - OS << ")"; + printName(OS, PrintNameIr | PrintNameAttributes, &MST); OS << ":\n"; const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); @@ -478,9 +446,99 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, } } +/// Print the basic block's name as: +/// +/// bb.{number}[.{ir-name}] [(attributes...)] +/// +/// The {ir-name} is only printed when the \ref PrintNameIr flag is passed +/// (which is the default). If the IR block has no name, it is identified +/// numerically using the attribute syntax as "(%ir-block.{ir-slot})". +/// +/// When the \ref PrintNameAttributes flag is passed, additional attributes +/// of the block are printed when set. +/// +/// \param printNameFlags Combination of \ref PrintNameFlag flags indicating +/// the parts to print. +/// \param moduleSlotTracker Optional ModuleSlotTracker. This method will +/// incorporate its own tracker when necessary to +/// determine the block's IR name. +void MachineBasicBlock::printName(raw_ostream &os, unsigned printNameFlags, + ModuleSlotTracker *moduleSlotTracker) const { + os << "bb." << getNumber(); + bool hasAttributes = false; + + if (printNameFlags & PrintNameIr) { + if (const auto *bb = getBasicBlock()) { + if (bb->hasName()) { + os << '.' << bb->getName(); + } else { + hasAttributes = true; + os << " ("; + + int slot = -1; + + if (moduleSlotTracker) { + slot = moduleSlotTracker->getLocalSlot(bb); + } else if (bb->getParent()) { + ModuleSlotTracker tmpTracker(bb->getModule(), false); + tmpTracker.incorporateFunction(*bb->getParent()); + slot = tmpTracker.getLocalSlot(bb); + } + + if (slot == -1) + os << ""; + else + os << (Twine("%ir-block.") + Twine(slot)).str(); + } + } + } + + if (printNameFlags & PrintNameAttributes) { + if (hasAddressTaken()) { + os << (hasAttributes ? ", " : " ("); + os << "address-taken"; + hasAttributes = true; + } + if (isEHPad()) { + os << (hasAttributes ? ", " : " ("); + os << "landing-pad"; + hasAttributes = true; + } + if (isEHFuncletEntry()) { + os << (hasAttributes ? ", " : " ("); + os << "ehfunclet-entry"; + hasAttributes = true; + } + if (getAlignment() != Align(1)) { + os << (hasAttributes ? ", " : " ("); + os << "align " << getAlignment().value(); + hasAttributes = true; + } + if (getSectionID() != MBBSectionID(0)) { + os << (hasAttributes ? ", " : " ("); + os << "bbsections "; + switch (getSectionID().Type) { + case MBBSectionID::SectionType::Exception: + os << "Exception"; + break; + case MBBSectionID::SectionType::Cold: + os << "Cold"; + break; + default: + os << getSectionID().Number; + } + hasAttributes = true; + } + } + + if (hasAttributes) + os << ')'; +} + void MachineBasicBlock::printAsOperand(raw_ostream &OS, bool /*PrintType*/) const { - OS << "%bb." 
<< getNumber();
+  OS << '%';
+  printName(OS, 0);
 }
 
 void MachineBasicBlock::removeLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) {

From 22c16360dd00230987fee5f6f3c57f8071144cc1 Mon Sep 17 00:00:00 2001
From: Fred Riss
Date: Fri, 17 Jul 2020 17:59:15 -0700
Subject: [PATCH 0018/1035] [lldb/ObjectFileMachO] Correctly account for
 resolver symbols

Summary:
The resolver addresses stored in the dyld trie are relative to the base
of the __TEXT segment. This is usually 0 in a dylib, so this was never
noticed, but it is not 0 for most dylibs integrated in the shared cache.
As we started using the shared cache images recently as symbol source,
this causes LLDB to fail to resolve symbols which go through a runtime
resolver.

Reviewers: jasonmolenda, jingham

Subscribers: lldb-commits

Tags: #lldb

Differential Revision: https://reviews.llvm.org/D84083
---
 lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp | 2 ++
 lldb/test/API/macosx/indirect_symbol/Makefile             | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
index ab1a6a8bb5f3e..338c798e6cef6 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
@@ -1990,6 +1990,8 @@ static bool ParseTrieEntries(DataExtractor &data, lldb::offset_t offset,
     if (e.entry.flags & EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER) {
       e.entry.other = data.GetULEB128(&offset);
       uint64_t resolver_addr = e.entry.other;
+      if (text_seg_base_addr != LLDB_INVALID_ADDRESS)
+        resolver_addr += text_seg_base_addr;
       if (is_arm)
         resolver_addr &= THUMB_ADDRESS_BIT_MASK;
       resolver_addresses.insert(resolver_addr);
diff --git a/lldb/test/API/macosx/indirect_symbol/Makefile b/lldb/test/API/macosx/indirect_symbol/Makefile
index 929ed58f75757..9069302b39c4f 100644
--- a/lldb/test/API/macosx/indirect_symbol/Makefile
+++ b/lldb/test/API/macosx/indirect_symbol/Makefile
@@ -8,7 +8,8 @@ include Makefile.rules
 build-libindirect: indirect.c
 	$(MAKE) -f $(MAKEFILE_RULES) \
-		DYLIB_C_SOURCES=indirect.c DYLIB_NAME=indirect DYLIB_ONLY=YES
+		DYLIB_C_SOURCES=indirect.c DYLIB_NAME=indirect DYLIB_ONLY=YES \
+		LD_EXTRAS="-Wl,-image_base,0x200000000"
 
 build-libreepxoprt: reexport.c
 	$(MAKE) -f $(MAKEFILE_RULES) \

From 1c7c69c795b22d73d038dd9b49de921cfd9d3468 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Fri, 24 Jul 2020 17:28:01 +0100
Subject: [PATCH 0019/1035] [ValueTracking] Check for ConstantExpr before
 using recursive helpers.

Make sure we do not call containsConstantExpression/containsUndefElement
on a ConstantExpr, which is not supported. In particular,
containsUndefElement/containsConstantExpression are only supported on
constants which are supported by getAggregateElement.

Unfortunately there's no convenient way to check if a constant supports
getAggregateElement, so just check for non-ConstantExpr constants with
vector type. Other users of those functions do so too.
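Reduced to a pattern, the fix just orders the checks so the recursive
helpers can never be reached for a ConstantExpr. A simplified sketch of the
guard, mirroring the ValueTracking.cpp hunk below (with C a const Constant *):

  // A ConstantExpr of vector type has no aggregate elements, so keep the
  // isa<> test ahead of the recursive element walks.
  if (C->getType()->isVectorTy() && !isa<ConstantExpr>(C))
    return !C->containsConstantExpression() && !C->containsUndefElement();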
Reviewers: spatel, nikic, craig.topper, lebedev.ri, jdoerfert, aqjune Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D84512 --- llvm/lib/Analysis/ValueTracking.cpp | 4 ++-- .../constexpr-vector-constainsundef-crash.ll | 23 +++++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/GVN/constexpr-vector-constainsundef-crash.ll diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 0ab2a1350af31..116916a9be2d2 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -4794,8 +4794,8 @@ bool llvm::isGuaranteedNotToBeUndefOrPoison(const Value *V, isa(C) || isa(C)) return true; - if (C->getType()->isVectorTy()) - return !C->containsUndefElement() && !C->containsConstantExpression(); + if (C->getType()->isVectorTy() && !isa(C)) + return !C->containsConstantExpression() && !C->containsUndefElement(); } // Strip cast operations from a pointer value. diff --git a/llvm/test/Transforms/GVN/constexpr-vector-constainsundef-crash.ll b/llvm/test/Transforms/GVN/constexpr-vector-constainsundef-crash.ll new file mode 100644 index 0000000000000..b99cade66daaa --- /dev/null +++ b/llvm/test/Transforms/GVN/constexpr-vector-constainsundef-crash.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -gvn -S %s | FileCheck %s + +; Reduced test case from +; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=24278 + +; Make sure we do not crash when dealing with a vector constant expression. +define <4 x i64*> @test(i64* %ptr) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <4 x i64*> getelementptr (i64, i64* null, <4 x i64> ) +; +entry: + %B9 = sdiv i16 -32768, 256 + %L3 = load i64, i64* %ptr, align 4 + %B3 = sub i16 0, %B9 + %0 = insertelement <4 x i16> undef, i16 %B3, i32 3 + %1 = sub <4 x i16> zeroinitializer, %0 + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = getelementptr inbounds i64, i64* null, <4 x i32> %2 + %I6 = insertelement <4 x i64*> %3, i64* undef, i64 %L3 + ret <4 x i64*> %I6 +} From 8a4878cc116c7ba2e1031d63e108610920c5ef5c Mon Sep 17 00:00:00 2001 From: Dokyung Song Date: Fri, 24 Jul 2020 16:14:42 +0000 Subject: [PATCH 0020/1035] [libFuzzer] Disable noasan-memcmp64.test and bcmp.test on Windows. Summary: This patch disables (i) noasan-memcmp64.test on Windows as libFuzzer's interceptors are only supported on Linux for now, and (ii) bcmp.test as on Windows bcmp is not available in strings.h. 
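For background, the portability gap being worked around, as a hedged C++
illustration rather than anything from the patch itself: bcmp comes from the
POSIX strings.h header, which MSVC does not provide, while memcmp is
available everywhere.

  #include <cstring>  // memcmp: portable
  // <strings.h> (bcmp) is POSIX-only and missing on Windows, hence the
  // UNSUPPORTED line added to bcmp.test below.
  static bool BytesEqual(const void *A, const void *B, std::size_t N) {
    return std::memcmp(A, B, N) == 0;  // equivalent to bcmp(A, B, N) == 0
  }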
Reviewers: morehouse, hctim, kcc

Subscribers: #sanitizers

Tags: #sanitizers

Differential Revision: https://reviews.llvm.org/D84536
---
 compiler-rt/test/fuzzer/bcmp.test            | 2 +-
 compiler-rt/test/fuzzer/memcmp64.test        | 4 ----
 compiler-rt/test/fuzzer/noasan-memcmp64.test | 6 ++++++
 3 files changed, 7 insertions(+), 5 deletions(-)
 create mode 100644 compiler-rt/test/fuzzer/noasan-memcmp64.test

diff --git a/compiler-rt/test/fuzzer/bcmp.test b/compiler-rt/test/fuzzer/bcmp.test
index 37ee6bedd4ee1..5bbbe9845beb2 100644
--- a/compiler-rt/test/fuzzer/bcmp.test
+++ b/compiler-rt/test/fuzzer/bcmp.test
@@ -1,4 +1,4 @@
-UNSUPPORTED: freebsd
+UNSUPPORTED: freebsd, windows
 RUN: %cpp_compiler -DMEMCMP=bcmp %S/MemcmpTest.cpp -o %t
 RUN: not %run %t -seed=1 -runs=10000000 2>&1 | FileCheck %s
 CHECK: BINGO
diff --git a/compiler-rt/test/fuzzer/memcmp64.test b/compiler-rt/test/fuzzer/memcmp64.test
index feb81d1e4fb28..24d14bf73bbf4 100644
--- a/compiler-rt/test/fuzzer/memcmp64.test
+++ b/compiler-rt/test/fuzzer/memcmp64.test
@@ -1,8 +1,4 @@
 UNSUPPORTED: freebsd
 RUN: %cpp_compiler %S/Memcmp64BytesTest.cpp -o %t-Memcmp64BytesTest
 RUN: not %run %t-Memcmp64BytesTest -seed=1 -runs=1000000 2>&1 | FileCheck %s
-
-RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-memcmp %S/Memcmp64BytesTest.cpp -o %t-NoAsanMemcmp64BytesTest
-RUN: not %run %t-NoAsanMemcmp64BytesTest -seed=1 -runs=1000000 2>&1 | FileCheck %s
-
 CHECK: BINGO
diff --git a/compiler-rt/test/fuzzer/noasan-memcmp64.test b/compiler-rt/test/fuzzer/noasan-memcmp64.test
new file mode 100644
index 0000000000000..a6b8f88594d03
--- /dev/null
+++ b/compiler-rt/test/fuzzer/noasan-memcmp64.test
@@ -0,0 +1,6 @@
+UNSUPPORTED: darwin, freebsd, windows
+
+RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-memcmp %S/Memcmp64BytesTest.cpp -o %t-NoAsanMemcmp64BytesTest
+RUN: not %run %t-NoAsanMemcmp64BytesTest -seed=1 -runs=1000000 2>&1 | FileCheck %s
+
+CHECK: BINGO

From 58d84eb534252747115b358c890a1b79c65d4ad4 Mon Sep 17 00:00:00 2001
From: Adrian Prantl
Date: Fri, 24 Jul 2020 09:43:15 -0700
Subject: [PATCH 0021/1035] debugserver: Support ios simulator load command
 disambiguation in qProcessInfo

This patch moves the disambiguation code from a place where it was
complicated to implement straight to where the load command is parsed,
which has the neat side effect of actually supporting all call sites!

rdar://problem/66011909

Differential Revision: https://reviews.llvm.org/D84480
---
 .../macosx/simulator/TestSimulatorPlatform.py | 20 ++++++--
 lldb/tools/debugserver/source/DNB.cpp         |  5 +-
 .../debugserver/source/MacOSX/MachProcess.h   |  3 --
 .../debugserver/source/MacOSX/MachProcess.mm  | 51 ++++++++-----------
 4 files changed, 41 insertions(+), 38 deletions(-)

diff --git a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
index 6e67fdc879bef..824cb9eee295c 100644
--- a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
+++ b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
@@ -27,18 +27,28 @@ def check_debugserver(self, log, expected_platform, expected_version):
         """scan the debugserver packet log"""
         logfile = open(log, "r")
         dylib_info = None
-        response = False
+        process_info_ostype = None
+        expect_dylib_info_response = False
+        expect_process_info_response = False
         for line in logfile:
-            if response:
+            if expect_dylib_info_response:
                 while line[0] != '$':
                     line = line[1:]
                 line = line[1:]
                 # Unescape '}'.
dylib_info = json.loads(line.replace('}]','}')[:-4]) - response = False + expect_dylib_info_response = False if 'send packet: $jGetLoadedDynamicLibrariesInfos:{' in line: - response = True - + expect_dylib_info_response = True + if expect_process_info_response: + for pair in line.split(';'): + keyval = pair.split(':') + if len(keyval) == 2 and keyval[0] == 'ostype': + process_info_ostype = keyval[1] + if 'send packet: $qProcessInfo#' in line: + expect_process_info_response = True + + self.assertEquals(process_info_ostype, expected_platform) self.assertTrue(dylib_info) aout_info = None for image in dylib_info['images']: diff --git a/lldb/tools/debugserver/source/DNB.cpp b/lldb/tools/debugserver/source/DNB.cpp index af13a8f8208ba..0830ea36a91ae 100644 --- a/lldb/tools/debugserver/source/DNB.cpp +++ b/lldb/tools/debugserver/source/DNB.cpp @@ -1393,7 +1393,10 @@ const char *DNBGetDeploymentInfo(nub_process_t pid, uint32_t& patch_version) { MachProcessSP procSP; if (GetProcessSP(pid, procSP)) { - // FIXME: This doesn't correct for older ios simulator and macCatalyst. + // FIXME: This doesn't return the correct result when xctest (a + // macOS binary) is loaded with the macCatalyst dyld platform + // override. The image info corrects for this, but qProcessInfo + // will return what is in the binary. auto info = procSP->GetDeploymentInfo(lc, load_command_address); major_version = info.major_version; minor_version = info.minor_version; diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.h b/lldb/tools/debugserver/source/MacOSX/MachProcess.h index c749dd8426c5d..9d712390ac2ac 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachProcess.h +++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.h @@ -236,9 +236,6 @@ class MachProcess { operator bool() { return platform > 0; } /// The Mach-O platform type; unsigned char platform = 0; - /// Pre-LC_BUILD_VERSION files don't disambiguate between ios and ios - /// simulator. - bool maybe_simulator = false; uint32_t major_version = 0; uint32_t minor_version = 0; uint32_t patch_version = 0; diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm index 8a35f605daa38..10eaf38ea4357 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm +++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm @@ -617,7 +617,28 @@ static bool FBSAddEventDataToOptions(NSMutableDictionary *options, info.major_version = vers_cmd.version >> 16; info.minor_version = (vers_cmd.version >> 8) & 0xffu; info.patch_version = vers_cmd.version & 0xffu; - info.maybe_simulator = true; + + // Disambiguate legacy simulator platforms. +#if (defined(__x86_64__) || defined(__i386__)) + // If we are running on Intel macOS, it is safe to assume this is + // really a back-deploying simulator binary. + switch (info.platform) { + case PLATFORM_IOS: + info.platform = PLATFORM_IOSSIMULATOR; + break; + case PLATFORM_TVOS: + info.platform = PLATFORM_TVOSSIMULATOR; + break; + case PLATFORM_WATCHOS: + info.platform = PLATFORM_WATCHOSSIMULATOR; + break; + } +#else + // On an Apple Silicon macOS host, there is no ambiguity. The only + // binaries that use legacy load commands are back-deploying + // native iOS binaries. All simulator binaries use the newer, + // unambiguous LC_BUILD_VERSION load commands. 
+#endif }; switch (cmd) { case LC_VERSION_MIN_IPHONEOS: @@ -778,34 +799,6 @@ static bool FBSAddEventDataToOptions(NSMutableDictionary *options, uuid_copy(inf.uuid, uuidcmd.uuid); } if (DeploymentInfo deployment_info = GetDeploymentInfo(lc, load_cmds_p)) { - // Simulator support. If the platform is ambiguous, use the dyld info. - if (deployment_info.maybe_simulator) { - if (deployment_info.maybe_simulator) { -#if (defined(__x86_64__) || defined(__i386__)) - // If dyld doesn't return a platform, use a heuristic. - // If we are running on Intel macOS, it is safe to assume - // this is really a back-deploying simulator binary. - switch (deployment_info.platform) { - case PLATFORM_IOS: - deployment_info.platform = PLATFORM_IOSSIMULATOR; - break; - case PLATFORM_TVOS: - deployment_info.platform = PLATFORM_TVOSSIMULATOR; - break; - case PLATFORM_WATCHOS: - deployment_info.platform = PLATFORM_WATCHOSSIMULATOR; - break; - } -#else - // On an Apple Silicon macOS host, there is no - // ambiguity. The only binaries that use legacy load - // commands are back-deploying native iOS binaries. All - // simulator binaries use the newer, unambiguous - // LC_BUILD_VERSION load commands. - deployment_info.maybe_simulator = false; -#endif - } - } const char *lc_platform = GetPlatformString(deployment_info.platform); // macCatalyst support. // From f26aeab1b9f8fc7261ab8d31693570d6a56afa38 Mon Sep 17 00:00:00 2001 From: biplmish Date: Fri, 24 Jul 2020 04:30:04 -0500 Subject: [PATCH 0022/1035] [test commit] Add my name to the CREDITS.TXT Add my name to the CREDITS.TXT This is my test commit. (NFC) Differential Revision: https://reviews.llvm.org/D84488 --- llvm/CREDITS.TXT | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/CREDITS.TXT b/llvm/CREDITS.TXT index e998fa6bb257d..461b95bdcd8b5 100644 --- a/llvm/CREDITS.TXT +++ b/llvm/CREDITS.TXT @@ -537,3 +537,7 @@ D: PowerPC Backend Developer N: Djordje Todorovic E: djordje.todorovic@rt-rk.com D: Debug Information + +N: Biplob Mishra +E: biplmish@in.ibm.com +D: PowerPC Analysis From 805e6bcf22f30e37b47cc40b7eb14ae891a277a2 Mon Sep 17 00:00:00 2001 From: Meera Nakrani Date: Fri, 24 Jul 2020 17:22:56 +0000 Subject: [PATCH 0023/1035] Test Commit Test commit - added whitespace in ARMInstrMVE.td --- llvm/lib/Target/ARM/ARMInstrMVE.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index dc701412b5950..fa04a82b0ee26 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -3773,7 +3773,7 @@ multiclass MVE_VABD_fp_m : MVE_VABDT_fp_m; defm MVE_VABDf32 : MVE_VABD_fp_m; -defm MVE_VABDf16 : MVE_VABD_fp_m; +defm MVE_VABDf16 : MVE_VABD_fp_m; class MVE_VCVT_fix From 3319d05630cdac66412a122efd75dfab073f5093 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Fri, 24 Jul 2020 20:31:35 +0300 Subject: [PATCH 0024/1035] [NFC][GVN] Improve loadpre-missed-opportunity.ll test again thanks to @fhahn --- .../GVN/loadpre-missed-opportunity.ll | 77 +++++++++++-------- 1 file changed, 47 insertions(+), 30 deletions(-) diff --git a/llvm/test/Transforms/GVN/loadpre-missed-opportunity.ll b/llvm/test/Transforms/GVN/loadpre-missed-opportunity.ll index dae22d18334b8..1c967f48d546c 100644 --- a/llvm/test/Transforms/GVN/loadpre-missed-opportunity.ll +++ b/llvm/test/Transforms/GVN/loadpre-missed-opportunity.ll @@ -1,25 +1,33 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -gvn -S | FileCheck %s -define void 
@loadpre_opportunity(i32** %arg, i1 %arg1, i1 %arg2, i1 %arg3) { +define i32 @loadpre_opportunity(i32** %arg, i1 %arg1, i1 %arg2, i1 %arg3) { ; CHECK-LABEL: @loadpre_opportunity( ; CHECK-NEXT: bb: -; CHECK-NEXT: br label [[BB9:%.*]] -; CHECK: bb6: -; CHECK-NEXT: br label [[BB9]] -; CHECK: bb9: -; CHECK-NEXT: br i1 [[ARG1:%.*]], label [[BB6:%.*]], label [[BB10:%.*]] -; CHECK: bb10: -; CHECK-NEXT: call void @somecall() -; CHECK-NEXT: br i1 [[ARG2:%.*]], label [[BB12:%.*]], label [[BB15:%.*]] -; CHECK: bb12: -; CHECK-NEXT: br label [[BB13:%.*]] +; CHECK-NEXT: [[I:%.*]] = load i32*, i32** [[ARG:%.*]], align 8 +; CHECK-NEXT: [[I6:%.*]] = call i32 @use(i32* [[I]]) +; CHECK-NEXT: br label [[BB11:%.*]] +; CHECK: bb7: +; CHECK-NEXT: [[I8:%.*]] = load i32*, i32** [[ARG]], align 8 +; CHECK-NEXT: [[I10:%.*]] = call i32 @use(i32* [[I8]]) +; CHECK-NEXT: br label [[BB11]] +; CHECK: bb11: +; CHECK-NEXT: [[I12:%.*]] = phi i32 [ [[I6]], [[BB:%.*]] ], [ [[I10]], [[BB7:%.*]] ] +; CHECK-NEXT: br i1 [[ARG1:%.*]], label [[BB7]], label [[BB13:%.*]] ; CHECK: bb13: -; CHECK-NEXT: br i1 [[ARG3:%.*]], label [[BB14:%.*]], label [[BB13]] +; CHECK-NEXT: call void @somecall() +; CHECK-NEXT: br i1 [[ARG2:%.*]], label [[BB14:%.*]], label [[BB17:%.*]] ; CHECK: bb14: -; CHECK-NEXT: br label [[BB15]] +; CHECK-NEXT: br label [[BB15:%.*]] ; CHECK: bb15: -; CHECK-NEXT: br label [[BB6]] +; CHECK-NEXT: br i1 [[ARG3:%.*]], label [[BB16:%.*]], label [[BB15]] +; CHECK: bb16: +; CHECK-NEXT: br label [[BB17]] +; CHECK: bb17: +; CHECK-NEXT: [[I18:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[I18]], label [[BB7]], label [[BB19:%.*]] +; CHECK: bb19: +; CHECK-NEXT: ret i32 [[I12]] ; bb: %i = load i32*, i32** %arg, align 8 @@ -27,31 +35,40 @@ bb: br label %bb5 bb5: - br label %bb9 - -bb6: - %i7 = load i32*, i32** %arg, align 8 - %i8 = getelementptr inbounds i32, i32* %i7, i64 0 - br label %bb9 + %i6 = call i32 @use(i32* %i4) + br label %bb11 -bb9: - br i1 %arg1, label %bb6, label %bb10 - -bb10: - call void @somecall() - br i1 %arg2, label %bb12, label %bb15 +bb7: + %i8 = load i32*, i32** %arg, align 8 + %i9 = getelementptr inbounds i32, i32* %i8, i64 0 + %i10 = call i32 @use(i32* %i9) + br label %bb11 -bb12: - br label %bb13 +bb11: + %i12 = phi i32 [ %i6, %bb5 ], [ %i10, %bb7 ] + br i1 %arg1, label %bb7, label %bb13 bb13: - br i1 %arg3, label %bb14, label %bb13 + call void @somecall() + br i1 %arg2, label %bb14, label %bb17 bb14: br label %bb15 bb15: - br label %bb6 + br i1 %arg3, label %bb16, label %bb15 + +bb16: + br label %bb17 + +bb17: + %i18 = call i1 @cond() + br i1 %i18, label %bb7, label %bb19 + +bb19: + ret i32 %i12 } declare void @somecall() +declare i32 @use(i32*) readnone +declare i1 @cond() readnone From 998334da2b1536e7c8f11c560770c8d4cfacb354 Mon Sep 17 00:00:00 2001 From: Kostya Kortchinsky Date: Thu, 16 Jul 2020 16:13:04 -0700 Subject: [PATCH 0025/1035] [scudo][standalone] Change the release loop for efficiency purposes Summary: On 32-b, the release algo loops multiple times over the freelist for a size class, which lead to a decrease in performance when there were a lot of free blocks. This changes the release functions to loop only once over the freelist, at the cost of using a little bit more memory for the release process: instead of working on one region at a time, we pass the whole memory area covered by all the regions for a given size class, and work on sub-areas of `RegionSize` in this large area. For 64-b, we just have 1 sub-area encompassing the whole region. 
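To make the new indexing concrete, here is a small worked example (all
numbers invented for illustration; the names match the release.h changes
below):

  // Suppose RegionSize = 256 KiB, NumberOfRegions = 3, PageSize = 4 KiB.
  //   PagesCount  = RegionSize / PageSize = 64 counters per region
  //   RoundedSize = 3 * (PagesCount << PageSizeLog) = 768 KiB covered
  // A freed block at offset P = 300 KiB from Base is counted as:
  //   RegionIndex = P / RegionSize                  // = 1
  //   PInRegion   = P - RegionIndex * RegionSize    // = 44 KiB
  //   Counters.inc(RegionIndex, PInRegion >> PageSizeLog);  // page 11
  // so a single pass over the freelist updates every region's counters.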
Of course, not all the sub-areas within that large memory area will belong to the class id we are working on, but those will just be left untouched (which will not add to the RSS during the release process). Reviewers: pcc, cferris, hctim, eugenis Subscribers: llvm-commits, #sanitizers Tags: #sanitizers Differential Revision: https://reviews.llvm.org/D83993 --- compiler-rt/lib/scudo/standalone/primary32.h | 45 ++++--- compiler-rt/lib/scudo/standalone/primary64.h | 2 +- compiler-rt/lib/scudo/standalone/release.h | 119 +++++++++++------- .../scudo/standalone/tests/release_test.cpp | 27 ++-- 4 files changed, 114 insertions(+), 79 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h index 2ee0f6c600ab2..321cf92fae30e 100644 --- a/compiler-rt/lib/scudo/standalone/primary32.h +++ b/compiler-rt/lib/scudo/standalone/primary32.h @@ -467,28 +467,33 @@ class SizeClassAllocator32 { } } - // TODO(kostyak): currently not ideal as we loop over all regions and - // iterate multiple times over the same freelist if a ClassId spans multiple - // regions. But it will have to do for now. - uptr TotalReleasedBytes = 0; - const uptr MaxSize = (RegionSize / BlockSize) * BlockSize; + DCHECK_GT(MinRegionIndex, 0U); + uptr First = 0; for (uptr I = MinRegionIndex; I <= MaxRegionIndex; I++) { if (PossibleRegions[I] - 1U == ClassId) { - const uptr Region = I * RegionSize; - // If the region is the one currently associated to the size-class, we - // only need to release up to CurrentRegionAllocated, MaxSize otherwise. - const uptr Size = (Region == Sci->CurrentRegion) - ? Sci->CurrentRegionAllocated - : MaxSize; - ReleaseRecorder Recorder(Region); - releaseFreeMemoryToOS(Sci->FreeList, Region, Size, BlockSize, - &Recorder); - if (Recorder.getReleasedRangesCount() > 0) { - Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks; - Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount(); - Sci->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes(); - TotalReleasedBytes += Sci->ReleaseInfo.LastReleasedBytes; - } + First = I; + break; + } + } + uptr Last = 0; + for (uptr I = MaxRegionIndex; I >= MinRegionIndex; I--) { + if (PossibleRegions[I] - 1U == ClassId) { + Last = I; + break; + } + } + uptr TotalReleasedBytes = 0; + if (First && Last) { + const uptr Base = First * RegionSize; + const uptr NumberOfRegions = Last - First + 1U; + ReleaseRecorder Recorder(Base); + releaseFreeMemoryToOS(Sci->FreeList, Base, RegionSize, NumberOfRegions, + BlockSize, &Recorder); + if (Recorder.getReleasedRangesCount() > 0) { + Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks; + Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount(); + Sci->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes(); + TotalReleasedBytes += Sci->ReleaseInfo.LastReleasedBytes; } } Sci->ReleaseInfo.LastReleaseAtNs = getMonotonicTime(); diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index 01e674bf3fba5..e37dc4951f238 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -481,7 +481,7 @@ class SizeClassAllocator64 { ReleaseRecorder Recorder(Region->RegionBeg, &Region->Data); releaseFreeMemoryToOS(Region->FreeList, Region->RegionBeg, - Region->AllocatedUser, BlockSize, &Recorder); + Region->AllocatedUser, 1U, BlockSize, &Recorder); if (Recorder.getReleasedRangesCount() > 0) { Region->ReleaseInfo.PushedBlocksAtLastRelease 
= diff --git a/compiler-rt/lib/scudo/standalone/release.h b/compiler-rt/lib/scudo/standalone/release.h index fd55ea24132e6..748e1c0011530 100644 --- a/compiler-rt/lib/scudo/standalone/release.h +++ b/compiler-rt/lib/scudo/standalone/release.h @@ -49,7 +49,10 @@ class ReleaseRecorder { // incremented past MaxValue. class PackedCounterArray { public: - PackedCounterArray(uptr NumCounters, uptr MaxValue) : N(NumCounters) { + PackedCounterArray(uptr NumberOfRegions, uptr CountersPerRegion, + uptr MaxValue) + : Regions(NumberOfRegions), NumCounters(CountersPerRegion) { + CHECK_GT(Regions, 0); CHECK_GT(NumCounters, 0); CHECK_GT(MaxValue, 0); constexpr uptr MaxCounterBits = sizeof(*Buffer) * 8UL; @@ -66,9 +69,10 @@ class PackedCounterArray { PackingRatioLog = getLog2(PackingRatio); BitOffsetMask = PackingRatio - 1; - BufferSize = (roundUpTo(N, static_cast(1U) << PackingRatioLog) >> - PackingRatioLog) * - sizeof(*Buffer); + SizePerRegion = + roundUpTo(NumCounters, static_cast(1U) << PackingRatioLog) >> + PackingRatioLog; + BufferSize = SizePerRegion * sizeof(*Buffer) * Regions; if (BufferSize <= (StaticBufferCount * sizeof(Buffer[0])) && Mutex.tryLock()) { Buffer = &StaticBuffer[0]; @@ -89,41 +93,45 @@ class PackedCounterArray { bool isAllocated() const { return !!Buffer; } - uptr getCount() const { return N; } + uptr getCount() const { return NumCounters; } - uptr get(uptr I) const { - DCHECK_LT(I, N); + uptr get(uptr Region, uptr I) const { + DCHECK_LT(Region, Regions); + DCHECK_LT(I, NumCounters); const uptr Index = I >> PackingRatioLog; const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog; - return (Buffer[Index] >> BitOffset) & CounterMask; + return (Buffer[Region * SizePerRegion + Index] >> BitOffset) & CounterMask; } - void inc(uptr I) const { - DCHECK_LT(get(I), CounterMask); + void inc(uptr Region, uptr I) const { + DCHECK_LT(get(Region, I), CounterMask); const uptr Index = I >> PackingRatioLog; const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog; DCHECK_LT(BitOffset, SCUDO_WORDSIZE); - Buffer[Index] += static_cast(1U) << BitOffset; + Buffer[Region * SizePerRegion + Index] += static_cast(1U) + << BitOffset; } - void incRange(uptr From, uptr To) const { + void incRange(uptr Region, uptr From, uptr To) const { DCHECK_LE(From, To); - const uptr Top = Min(To + 1, N); + const uptr Top = Min(To + 1, NumCounters); for (uptr I = From; I < Top; I++) - inc(I); + inc(Region, I); } uptr getBufferSize() const { return BufferSize; } - static const uptr StaticBufferCount = 1024U; + static const uptr StaticBufferCount = 2048U; private: - const uptr N; + const uptr Regions; + const uptr NumCounters; uptr CounterSizeBitsLog; uptr CounterMask; uptr PackingRatioLog; uptr BitOffsetMask; + uptr SizePerRegion; uptr BufferSize; uptr *Buffer; @@ -169,7 +177,8 @@ template class FreePagesRangeTracker { template NOINLINE void releaseFreeMemoryToOS(const IntrusiveList &FreeList, uptr Base, - uptr Size, uptr BlockSize, ReleaseRecorderT *Recorder) { + uptr RegionSize, uptr NumberOfRegions, uptr BlockSize, + ReleaseRecorderT *Recorder) { const uptr PageSize = getPageSizeCached(); // Figure out the number of chunks per page and whether we can take a fast @@ -206,13 +215,15 @@ releaseFreeMemoryToOS(const IntrusiveList &FreeList, uptr Base, } } - const uptr PagesCount = roundUpTo(Size, PageSize) / PageSize; - PackedCounterArray Counters(PagesCount, FullPagesBlockCountMax); + const uptr PagesCount = roundUpTo(RegionSize, PageSize) / PageSize; + PackedCounterArray Counters(NumberOfRegions, 
PagesCount, + FullPagesBlockCountMax); if (!Counters.isAllocated()) return; const uptr PageSizeLog = getLog2(PageSize); - const uptr RoundedSize = PagesCount << PageSizeLog; + const uptr RoundedRegionSize = PagesCount << PageSizeLog; + const uptr RoundedSize = NumberOfRegions * RoundedRegionSize; // Iterate over free chunks and count how many free chunks affect each // allocated page. @@ -228,14 +239,17 @@ releaseFreeMemoryToOS(const IntrusiveList &FreeList, uptr Base, for (u32 I = IsTransferBatch ? 1 : 0; I < It.getCount(); I++) { const uptr P = reinterpret_cast(It.get(I)) - Base; // This takes care of P < Base and P >= Base + RoundedSize. - if (P < RoundedSize) - Counters.inc(P >> PageSizeLog); + if (P < RoundedSize) { + const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize; + const uptr PInRegion = P - RegionIndex * RegionSize; + Counters.inc(RegionIndex, PInRegion >> PageSizeLog); + } } } - for (uptr P = Size; P < RoundedSize; P += BlockSize) - Counters.inc(P >> PageSizeLog); } else { // In all other cases chunks might affect more than one page. + DCHECK_GE(RegionSize, BlockSize); + const uptr LastBlockInRegion = ((RegionSize / BlockSize) - 1U) * BlockSize; for (const auto &It : FreeList) { // See TransferBatch comment above. const bool IsTransferBatch = @@ -244,13 +258,24 @@ releaseFreeMemoryToOS(const IntrusiveList &FreeList, uptr Base, for (u32 I = IsTransferBatch ? 1 : 0; I < It.getCount(); I++) { const uptr P = reinterpret_cast(It.get(I)) - Base; // This takes care of P < Base and P >= Base + RoundedSize. - if (P < RoundedSize) - Counters.incRange(P >> PageSizeLog, - (P + BlockSize - 1) >> PageSizeLog); + if (P < RoundedSize) { + const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize; + uptr PInRegion = P - RegionIndex * RegionSize; + Counters.incRange(RegionIndex, PInRegion >> PageSizeLog, + (PInRegion + BlockSize - 1) >> PageSizeLog); + // The last block in a region might straddle a page, so if it's + // free, we mark the following "pretend" memory block(s) as free. + if (PInRegion == LastBlockInRegion) { + PInRegion += BlockSize; + while (PInRegion < RoundedRegionSize) { + Counters.incRange(RegionIndex, PInRegion >> PageSizeLog, + (PInRegion + BlockSize - 1) >> PageSizeLog); + PInRegion += BlockSize; + } + } + } } } - for (uptr P = Size; P < RoundedSize; P += BlockSize) - Counters.incRange(P >> PageSizeLog, (P + BlockSize - 1) >> PageSizeLog); } // Iterate over pages detecting ranges of pages with chunk Counters equal @@ -258,8 +283,10 @@ releaseFreeMemoryToOS(const IntrusiveList &FreeList, uptr Base, FreePagesRangeTracker RangeTracker(Recorder); if (SameBlockCountPerPage) { // Fast path, every page has the same number of chunks affecting it. - for (uptr I = 0; I < Counters.getCount(); I++) - RangeTracker.processNextPage(Counters.get(I) == FullPagesBlockCountMax); + for (uptr I = 0; I < NumberOfRegions; I++) + for (uptr J = 0; J < PagesCount; J++) + RangeTracker.processNextPage(Counters.get(I, J) == + FullPagesBlockCountMax); } else { // Slow path, go through the pages keeping count how many chunks affect // each page. @@ -270,23 +297,25 @@ releaseFreeMemoryToOS(const IntrusiveList &FreeList, uptr Base, // except the first and the last one) and then the last chunk size, adding // up the number of chunks on the current page and checking on every step // whether the page boundary was crossed. 
- uptr PrevPageBoundary = 0; - uptr CurrentBoundary = 0; - for (uptr I = 0; I < Counters.getCount(); I++) { - const uptr PageBoundary = PrevPageBoundary + PageSize; - uptr BlocksPerPage = Pn; - if (CurrentBoundary < PageBoundary) { - if (CurrentBoundary > PrevPageBoundary) - BlocksPerPage++; - CurrentBoundary += Pnc; + for (uptr I = 0; I < NumberOfRegions; I++) { + uptr PrevPageBoundary = 0; + uptr CurrentBoundary = 0; + for (uptr J = 0; J < PagesCount; J++) { + const uptr PageBoundary = PrevPageBoundary + PageSize; + uptr BlocksPerPage = Pn; if (CurrentBoundary < PageBoundary) { - BlocksPerPage++; - CurrentBoundary += BlockSize; + if (CurrentBoundary > PrevPageBoundary) + BlocksPerPage++; + CurrentBoundary += Pnc; + if (CurrentBoundary < PageBoundary) { + BlocksPerPage++; + CurrentBoundary += BlockSize; + } } - } - PrevPageBoundary = PageBoundary; + PrevPageBoundary = PageBoundary; - RangeTracker.processNextPage(Counters.get(I) == BlocksPerPage); + RangeTracker.processNextPage(Counters.get(I, J) == BlocksPerPage); + } } } RangeTracker.finish(); diff --git a/compiler-rt/lib/scudo/standalone/tests/release_test.cpp b/compiler-rt/lib/scudo/standalone/tests/release_test.cpp index a7478f47479d5..a693b97f80da6 100644 --- a/compiler-rt/lib/scudo/standalone/tests/release_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/release_test.cpp @@ -21,14 +21,14 @@ TEST(ScudoReleaseTest, PackedCounterArray) { for (scudo::uptr I = 0; I < SCUDO_WORDSIZE; I++) { // Various valid counter's max values packed into one word. - scudo::PackedCounterArray Counters2N(1, 1UL << I); + scudo::PackedCounterArray Counters2N(1U, 1U, 1UL << I); EXPECT_EQ(sizeof(scudo::uptr), Counters2N.getBufferSize()); // Check the "all bit set" values too. - scudo::PackedCounterArray Counters2N1_1(1, ~0UL >> I); + scudo::PackedCounterArray Counters2N1_1(1U, 1U, ~0UL >> I); EXPECT_EQ(sizeof(scudo::uptr), Counters2N1_1.getBufferSize()); // Verify the packing ratio, the counter is Expected to be packed into the // closest power of 2 bits. - scudo::PackedCounterArray Counters(SCUDO_WORDSIZE, 1UL << I); + scudo::PackedCounterArray Counters(1U, SCUDO_WORDSIZE, 1UL << I); EXPECT_EQ(sizeof(scudo::uptr) * scudo::roundUpToPowerOfTwo(I + 1), Counters.getBufferSize()); } @@ -38,19 +38,20 @@ TEST(ScudoReleaseTest, PackedCounterArray) { // Make sure counters request one memory page for the buffer. const scudo::uptr NumCounters = (scudo::getPageSizeCached() / 8) * (SCUDO_WORDSIZE >> I); - scudo::PackedCounterArray Counters(NumCounters, 1UL << ((1UL << I) - 1)); - Counters.inc(0); + scudo::PackedCounterArray Counters(1U, NumCounters, + 1UL << ((1UL << I) - 1)); + Counters.inc(0U, 0U); for (scudo::uptr C = 1; C < NumCounters - 1; C++) { - EXPECT_EQ(0UL, Counters.get(C)); - Counters.inc(C); - EXPECT_EQ(1UL, Counters.get(C - 1)); + EXPECT_EQ(0UL, Counters.get(0U, C)); + Counters.inc(0U, C); + EXPECT_EQ(1UL, Counters.get(0U, C - 1)); } - EXPECT_EQ(0UL, Counters.get(NumCounters - 1)); - Counters.inc(NumCounters - 1); + EXPECT_EQ(0UL, Counters.get(0U, NumCounters - 1)); + Counters.inc(0U, NumCounters - 1); if (I > 0) { - Counters.incRange(0, NumCounters - 1); + Counters.incRange(0u, 0U, NumCounters - 1); for (scudo::uptr C = 0; C < NumCounters; C++) - EXPECT_EQ(2UL, Counters.get(C)); + EXPECT_EQ(2UL, Counters.get(0U, C)); } } } @@ -190,7 +191,7 @@ template void testReleaseFreeMemoryToOS() { // Release the memory. 
ReleasedPagesRecorder Recorder; - releaseFreeMemoryToOS(FreeList, 0, MaxBlocks * BlockSize, BlockSize, + releaseFreeMemoryToOS(FreeList, 0, MaxBlocks * BlockSize, 1U, BlockSize, &Recorder); // Verify that there are no released pages touched by used chunks and all From db37937a4738f3a697804d8b2244baec076867a2 Mon Sep 17 00:00:00 2001 From: Meera Nakrani Date: Fri, 24 Jul 2020 17:46:25 +0000 Subject: [PATCH 0026/1035] [ARM] Added additional patterns to VABD instruction Added extra patterns to VABD instruction so it is selected in place of VSUB and VABS. Added corresponding regression test too. Differential Revision: https://reviews.llvm.org/D84500 --- llvm/lib/Target/ARM/ARMInstrMVE.td | 7 ++++ llvm/test/CodeGen/Thumb2/mve-vabd.ll | 63 ++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 llvm/test/CodeGen/Thumb2/mve-vabd.ll diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index fa04a82b0ee26..b082ca4d1c903 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -3775,6 +3775,13 @@ multiclass MVE_VABD_fp_m defm MVE_VABDf32 : MVE_VABD_fp_m; defm MVE_VABDf16 : MVE_VABD_fp_m; +let Predicates = [HasMVEFloat] in { + def : Pat<(v8f16 (fabs (fsub (v8f16 MQPR:$Qm), (v8f16 MQPR:$Qn)))), + (MVE_VABDf16 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v4f32 (fabs (fsub (v4f32 MQPR:$Qm), (v4f32 MQPR:$Qn)))), + (MVE_VABDf32 MQPR:$Qm, MQPR:$Qn)>; +} + class MVE_VCVT_fix : MVE_float<"vcvt", suffix, diff --git a/llvm/test/CodeGen/Thumb2/mve-vabd.ll b/llvm/test/CodeGen/Thumb2/mve-vabd.ll new file mode 100644 index 0000000000000..3bbf2fc23a1cf --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-vabd.ll @@ -0,0 +1,63 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVEFP + +define arm_aapcs_vfpcc void @vabd_v4f32(<4 x float> %x, <4 x float> %y, <4 x float>* %z) { +; CHECK-MVE-LABEL: vabd_v4f32 +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-MVE-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-MVE-NEXT: .pad #4 +; CHECK-MVE-NEXT: sub sp, #4 +; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vmov q4, q1 +; CHECK-MVE-NEXT: vmov q5, q0 +; CHECK-MVE-NEXT: mov r4, r0 +; CHECK-MVE-NEXT: vmov r0, s20 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: bl __aeabi_fsub +; CHECK-MVE-NEXT: mov r5, r0 +; CHECK-MVE-NEXT: vmov r0, s21 + +; CHECK-MVEFP-LABEL: vabd_v4f32 +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vabd.f32 q0, q0, q1 +; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0] +; CHECK-MVEFP-NEXT: bx lr +entry: + %0 = fsub <4 x float> %x, %y + %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %0) + store <4 x float> %1, <4 x float>* %z, align 4 + ret void +} + +define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %z) { +; CHECK-MVE-LABEL: vabd_v8f16 +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .save {r4, r5, r6, lr} +; CHECK-MVE-NEXT: push {r4, r5, r6, lr} +; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-MVE-NEXT: mov r4, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q1[1] +; CHECK-MVE-NEXT: vmov q5, q1 +; CHECK-MVE-NEXT: vmov q4, q0 +; CHECK-MVE-NEXT: bl __aeabi_h2f +; 
CHECK-MVE-NEXT: mov r5, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q4[1] + +; CHECK-MVEFP-LABEL: vabd_v8f16 +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vabd.f16 q0, q0, q1 +; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0] +; CHECK-MVEFP-NEXT: bx lr +entry: + %0 = fsub <8 x half> %x, %y + %1 = call <8 x half> @llvm.fabs.v8f16(<8 x half> %0) + store <8 x half> %1, <8 x half>* %z + ret void +} + +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) +declare <8 x half> @llvm.fabs.v8f16(<8 x half>) From 8158f0cefe72ea7efbf41fdf4a7bfa392f6a8a2e Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 24 Jul 2020 10:47:46 -0700 Subject: [PATCH 0027/1035] [X86] Use X86_MC::ParseX86Triple to add mode features to feature string in X86Subtarget::initSubtargetFeatures. Remove mode flags from constructor and remove calls to ToggleFeature for the mode bits. By adding them to the feature string we handle initializing the mode member variables in X86Subtarget and the feature bits in MCSubtargetInfo in one shot. --- llvm/lib/Target/X86/X86Subtarget.cpp | 35 ++++++++-------------------- llvm/lib/Target/X86/X86Subtarget.h | 6 ++--- 2 files changed, 13 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index ff2a94bcac8e2..51665255ec06b 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -231,15 +231,16 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (CPU.empty()) CPU = "generic"; - std::string FullFS = std::string(FS); - if (In64BitMode) { - // SSE2 should default to enabled in 64-bit mode, but can be turned off - // explicitly. - if (!FullFS.empty()) - FullFS = "+sse2," + FullFS; - else - FullFS = "+sse2"; - } + std::string FullFS = X86_MC::ParseX86Triple(TargetTriple); + assert(!FullFS.empty() && "Failed to parse X86 triple"); + + // SSE2 should default to enabled in 64-bit mode, but can be turned off + // explicitly. + if (TargetTriple.isArch64Bit()) + FullFS += ",+sse2"; + + if (!FS.empty()) + FullFS = (Twine(FullFS) + "," + FS).str(); // Parse features string and set the CPU. ParseSubtargetFeatures(CPU, FullFS); @@ -251,17 +252,6 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (hasSSE42() || hasSSE4A()) IsUAMem16Slow = false; - // It's important to keep the MCSubtargetInfo feature bits in sync with - // target data structure which is shared with MC code emitter, etc. - if (In64BitMode) - ToggleFeature(X86::Mode64Bit); - else if (In32BitMode) - ToggleFeature(X86::Mode32Bit); - else if (In16BitMode) - ToggleFeature(X86::Mode16Bit); - else - llvm_unreachable("Not 16-bit, 32-bit or 64-bit mode!"); - LLVM_DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel << ", 3DNowLevel " << X863DNowLevel << ", 64bit " << HasX86_64 << "\n"); @@ -312,11 +302,6 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, TM(TM), TargetTriple(TT), StackAlignOverride(StackAlignOverride), PreferVectorWidthOverride(PreferVectorWidthOverride), RequiredVectorWidth(RequiredVectorWidth), - In64BitMode(TargetTriple.getArch() == Triple::x86_64), - In32BitMode(TargetTriple.getArch() == Triple::x86 && - TargetTriple.getEnvironment() != Triple::CODE16), - In16BitMode(TargetTriple.getArch() == Triple::x86 && - TargetTriple.getEnvironment() == Triple::CODE16), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) { // Determine the PICStyle based on the target selected. 
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 6f6823dea8ffd..54d7fbef7f94a 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -498,13 +498,13 @@ class X86Subtarget final : public X86GenSubtargetInfo { unsigned RequiredVectorWidth; /// True if compiling for 64-bit, false for 16-bit or 32-bit. - bool In64BitMode; + bool In64BitMode = false; /// True if compiling for 32-bit, false for 16-bit or 64-bit. - bool In32BitMode; + bool In32BitMode = false; /// True if compiling for 16-bit, false for 32-bit or 64-bit. - bool In16BitMode; + bool In16BitMode = false; /// Contains the Overhead of gather\scatter instructions int GatherOverhead = 1024; From 809600d6642773f71245f76995dab355effc73af Mon Sep 17 00:00:00 2001 From: Francesco Petrogalli Date: Tue, 7 Jul 2020 19:03:13 +0000 Subject: [PATCH 0028/1035] [llvm][sve] Reg + Imm addressing mode for ld1ro. Reviewers: kmclaughlin, efriedma, sdesmalen Subscribers: tschuett, hiraditya, psnobl, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D83357 --- .../Target/AArch64/AArch64ISelLowering.cpp | 3 + .../lib/Target/AArch64/AArch64InstrFormats.td | 5 +- llvm/lib/Target/AArch64/SVEInstrFormats.td | 10 +- ...ntrinsics-ld1ro-addressing-mode-reg-imm.ll | 174 ++++++++++++++++++ 4 files changed, 188 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve-intrinsics-ld1ro-addressing-mode-reg-imm.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index af77171e92849..cb5530077fdd7 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -12347,6 +12347,9 @@ static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) { "Unsupported opcode."); SDLoc DL(N); EVT VT = N->getValueType(0); + if (VT == MVT::nxv8bf16 && + !static_cast(DAG.getSubtarget()).hasBF16()) + return SDValue(); EVT LoadVT = VT; if (VT.isFloatingPoint()) diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 6df7970f4d82b..4f4ba692c2db4 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -495,6 +495,9 @@ def SImmS4XForm : SDNodeXFormgetTargetConstant(N->getSExtValue() / 16, SDLoc(N), MVT::i64); }]>; +def SImmS32XForm : SDNodeXFormgetTargetConstant(N->getSExtValue() / 32, SDLoc(N), MVT::i64); +}]>; // simm6sN predicate - True if the immediate is a multiple of N in the range // [-32 * N, 31 * N]. 
@@ -546,7 +549,7 @@ def simm4s16 : Operand, ImmLeaf, ImmLeaf=-256 && Imm <= 224 && (Imm % 32) == 0x0; }]> { +[{ return Imm >=-256 && Imm <= 224 && (Imm % 32) == 0x0; }], SImmS32XForm> { let PrintMethod = "printImmScale<32>"; let ParserMatchClass = SImm4s32Operand; let DecoderMethod = "DecodeSImm<4>"; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index ee36ac0168003..1f067908de6b4 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -7728,9 +7728,13 @@ multiclass sve_mem_ldor_si sz, string asm, RegisterOperand listty, (!cast(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), 0>; // Base addressing mode - def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), GPR64sp:$base)), - (!cast(NAME) PPR3bAny:$gp, GPR64sp:$base, (i64 0))>; - + def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), GPR64sp:$base)), + (!cast(NAME) PPR3bAny:$Pg, GPR64sp:$base, (i64 0))>; + let AddedComplexity = 2 in { + // Reg + Imm addressing mode + def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), (add GPR64:$base, (i64 simm4s32:$imm)))), + (!cast(NAME) $Pg, $base, simm4s32:$imm)>; + } } class sve_mem_ldor_ss sz, string asm, RegisterOperand VecList, diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1ro-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1ro-addressing-mode-reg-imm.ll new file mode 100644 index 0000000000000..e7edfc9d6bdd6 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1ro-addressing-mode-reg-imm.ll @@ -0,0 +1,174 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+f64mm -asm-verbose=0 < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; WARN-NOT: warning + +; +; LD1ROB +; + +define @ld1rob_i8( %pg, i8* %a) nounwind { +; CHECK-LABEL: ld1rob_i8: +; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0, #32] +; CHECK-NEXT: ret + %base = getelementptr i8, i8* %a, i64 32 + %load = call @llvm.aarch64.sve.ld1ro.nxv16i8( %pg, i8* %base) + ret %load +} + +; +; LD1ROH +; + +define @ld1roh_i16( %pg, i16* %a) nounwind { +; CHECK-LABEL: ld1roh_i16: +; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x0, #64] +; CHECK-NEXT: ret + %base = getelementptr i16, i16* %a, i64 32 + %load = call @llvm.aarch64.sve.ld1ro.nxv8i16( %pg, i16* %base) + ret %load +} + +define @ld1roh_f16( %pg, half* %a) nounwind { +; CHECK-LABEL: ld1roh_f16: +; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x0, #64] +; CHECK-NEXT: ret + %base = getelementptr half, half* %a, i64 32 + %load = call @llvm.aarch64.sve.ld1ro.nxv8f16( %pg, half* %base) + ret %load +} + +define @ld1roh_bf16( %pg, bfloat* %a) nounwind #0 { +; CHECK-LABEL: ld1roh_bf16: +; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x0, #64] +; CHECK-NEXT: ret + %base = getelementptr bfloat, bfloat* %a, i64 32 + %load = call @llvm.aarch64.sve.ld1ro.nxv8bf16( %pg, bfloat* %base) + ret %load +} + +; +; LD1ROW +; + +define @ld1row_i32( %pg, i32* %a) nounwind { +; CHECK-LABEL: ld1row_i32: +; CHECK-NEXT: ld1row { z0.s }, p0/z, [x0, #128] +; CHECK-NEXT: ret + %base = getelementptr i32, i32* %a, i64 32 + %load = call @llvm.aarch64.sve.ld1ro.nxv4i32( %pg, i32* %base) + ret %load +} + +define @ld1row_f32( %pg, float* %a) nounwind { +; CHECK-LABEL: ld1row_f32: +; CHECK-NEXT: ld1row { z0.s }, p0/z, [x0, #128] +; CHECK-NEXT: ret + %base = getelementptr float, float* %a, i64 32 + %load = call @llvm.aarch64.sve.ld1ro.nxv4f32( %pg, float* %base) + ret %load +} + +; +; LD1ROD +; + +define @ld1rod_i64( %pg, i64* %a) nounwind { +; CHECK-LABEL: ld1rod_i64: +; CHECK-NEXT: ld1rod { z0.d }, 
p0/z, [x0, #-64] +; CHECK-NEXT: ret + %base = getelementptr i64, i64* %a, i64 -8 + %load = call @llvm.aarch64.sve.ld1ro.nxv2i64( %pg, i64* %base) + ret %load +} + +define @ld1rod_f64( %pg, double* %a) nounwind { +; CHECK-LABEL: ld1rod_f64: +; CHECK-NEXT: ld1rod { z0.d }, p0/z, [x0, #-128] +; CHECK-NEXT: ret + %base = getelementptr double, double* %a, i64 -16 + %load = call @llvm.aarch64.sve.ld1ro.nxv2f64( %pg, double* %base) + ret %load +} + + +;;;;;;;;;;;;;; +; range checks: immediate must be a multiple of 32 in the range -256, ..., 224 + +; lower bound +define @ld1rob_i8_lower_bound( %pg, i8* %a) nounwind { +; CHECK-LABEL: ld1rob_i8_lower_bound: +; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0, #-256] +; CHECK-NEXT: ret + %base = getelementptr i8, i8* %a, i64 -256 + %load = call @llvm.aarch64.sve.ld1ro.nxv16i8( %pg, i8* %base) + ret %load +} + +; below lower bound +define @ld1roh_i16_below_lower_bound( %pg, i16* %a) nounwind { +; CHECK-LABEL: ld1roh_i16_below_lower_bound: +; CHECK-NEXT: sub x[[BASE:[0-9]+]], x0, #258 +; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x[[BASE]]] +; CHECK-NEXT: ret + %base = getelementptr i16, i16* %a, i64 -129 + %load = call @llvm.aarch64.sve.ld1ro.nxv8i16( %pg, i16* %base) + ret %load +} + +define @ld1rob_i8_below_lower_bound_01( %pg, i8* %a) nounwind { +; CHECK-LABEL: ld1rob_i8_below_lower_bound_01: +; CHECK-NEXT: mov x[[OFFSET:[0-9]+]], #-257 +; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret + %base = getelementptr i8, i8* %a, i64 -257 + %load = call @llvm.aarch64.sve.ld1ro.nxv16i8( %pg, i8* %base) + ret %load +} + +; not a multiple of 32 +define @ld1row_i32_not_multiple( %pg, i32* %a) nounwind { +; CHECK-LABEL: ld1row_i32_not_multiple: +; CHECK-NEXT: add x[[BASE:[0-9]+]], x0, #12 +; CHECK-NEXT: ld1row { z0.s }, p0/z, [x[[BASE]]] +; CHECK-NEXT: ret + %base = getelementptr i32, i32* %a, i64 3 + %load = call @llvm.aarch64.sve.ld1ro.nxv4i32( %pg, i32* %base) + ret %load +} + +; upper bound +define @ld1rod_i64_upper_bound( %pg, i64* %a) nounwind { +; CHECK-LABEL: ld1rod_i64_upper_bound: +; CHECK-NEXT: ld1rod { z0.d }, p0/z, [x0, #224] +; CHECK-NEXT: ret + %base = getelementptr i64, i64* %a, i64 28 + %load = call @llvm.aarch64.sve.ld1ro.nxv2i64( %pg, i64* %base) + ret %load +} + +define @ld1rob_i8_beyond_upper_bound( %pg, i8* %a) nounwind { +; CHECK-LABEL: ld1rob_i8_beyond_upper_bound: +; CHECK-NEXT: mov w[[OFFSET:[0-9]+]], #225 +; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret + %base = getelementptr i8, i8* %a, i64 225 + %load = call @llvm.aarch64.sve.ld1ro.nxv16i8( %pg, i8* %base) + ret %load +} + +declare @llvm.aarch64.sve.ld1ro.nxv16i8(, i8*) + +declare @llvm.aarch64.sve.ld1ro.nxv8i16(, i16*) +declare @llvm.aarch64.sve.ld1ro.nxv8f16(, half*) +declare @llvm.aarch64.sve.ld1ro.nxv8bf16(, bfloat*) + +declare @llvm.aarch64.sve.ld1ro.nxv4i32(, i32*) +declare @llvm.aarch64.sve.ld1ro.nxv4f32(, float*) + +declare @llvm.aarch64.sve.ld1ro.nxv2i64(, i64*) +declare @llvm.aarch64.sve.ld1ro.nxv2f64(, double*) + + +; +bf16 is required for the bfloat version. +attributes #0 = { "target-features"="+sve,+f64mm,+bf16" } From 945ed22f3397f52469618cd8a94207665f25bebd Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 24 Jul 2020 11:10:28 -0700 Subject: [PATCH 0029/1035] [X86] Move the implicit enabling of sse2 for 64-bit mode from X86Subtarget::initSubtargetFeatures to X86_MC::ParseX86Triple. ParseX86Triple already checks for 64-bit mode and produces a static string. We can just add +sse2 to the end of that static string. 
This avoids a potential reallocation when appending it to the std::string at runtime. This is a slight change to the behavior of tools that only use MC layer which weren't implicitly enabling sse2 before, but will now. I don't think we check for sse2 explicitly in any MC layer components so this shouldn't matter in practice. And if it did matter the new behavior is more correct. --- llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 6 ++++-- llvm/lib/Target/X86/X86Subtarget.cpp | 5 ----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 8a478354cb168..8679bafa088e1 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -44,8 +44,10 @@ using namespace llvm; std::string X86_MC::ParseX86Triple(const Triple &TT) { std::string FS; - if (TT.getArch() == Triple::x86_64) - FS = "+64bit-mode,-32bit-mode,-16bit-mode"; + // SSE2 should default to enabled in 64-bit mode, but can be turned off + // explicitly. + if (TT.isArch64Bit()) + FS = "+64bit-mode,-32bit-mode,-16bit-mode,+sse2"; else if (TT.getEnvironment() != Triple::CODE16) FS = "-64bit-mode,+32bit-mode,-16bit-mode"; else diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index 51665255ec06b..07e913e139111 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -234,11 +234,6 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { std::string FullFS = X86_MC::ParseX86Triple(TargetTriple); assert(!FullFS.empty() && "Failed to parse X86 triple"); - // SSE2 should default to enabled in 64-bit mode, but can be turned off - // explicitly. - if (TargetTriple.isArch64Bit()) - FullFS += ",+sse2"; - if (!FS.empty()) FullFS = (Twine(FullFS) + "," + FS).str(); From 4a577c3a22c4ae388adca821a91552296e0d2653 Mon Sep 17 00:00:00 2001 From: madhur13490 Date: Thu, 23 Jul 2020 09:48:03 +0000 Subject: [PATCH 0030/1035] [AMDGPU] Fix incorrect arch assert while setting up FlatScratchInit Reviewers: arsenm, foad, rampitec, scott.linder Reviewed By: arsenm Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D84391 --- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index a2e802009d098..a5b04570655a4 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -274,6 +274,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( return; } + // For GFX9. BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO) .addReg(FlatScrInitLo) .addReg(ScratchWaveOffsetReg); @@ -284,7 +285,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( return; } - assert(ST.getGeneration() < AMDGPUSubtarget::GFX10); + assert(ST.getGeneration() < AMDGPUSubtarget::GFX9); // Copy the size in bytes. BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO) From 5a9630b7774dbacb7a0bdba068c1b26c231558bc Mon Sep 17 00:00:00 2001 From: cgyurgyik Date: Fri, 24 Jul 2020 14:31:27 -0400 Subject: [PATCH 0031/1035] [libc] Adds implementation for memrchr. 
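memrchr is the GNU extension that scans the initial n bytes of a memory
area for the *last* occurrence of a byte. A minimal usage sketch (the
input below is borrowed from the new unit test; note that with glibc the
declaration requires _GNU_SOURCE):

  #include <string.h>

  const char *dups = "abc1def1ghi";
  // Points at the second '1', i.e. the tail "1ghi".
  void *p = memrchr(dups, '1', 11);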
Reviewed By: sivachandra Differential Revision: https://reviews.llvm.org/D84469 --- libc/config/linux/aarch64/entrypoints.txt | 1 + libc/config/linux/api.td | 3 +- libc/config/linux/x86_64/entrypoints.txt | 1 + libc/spec/gnu_ext.td | 16 ++- libc/src/string/CMakeLists.txt | 8 ++ libc/src/string/memrchr.cpp | 26 +++++ libc/src/string/memrchr.h | 20 ++++ libc/test/src/string/CMakeLists.txt | 10 ++ libc/test/src/string/memrchr_test.cpp | 114 ++++++++++++++++++++++ 9 files changed, 197 insertions(+), 2 deletions(-) create mode 100644 libc/src/string/memrchr.cpp create mode 100644 libc/src/string/memrchr.h create mode 100644 libc/test/src/string/memrchr_test.cpp diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 6a1ff3bd64a9b..b287a72d779b9 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -13,6 +13,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.string.strchr libc.src.string.strstr libc.src.string.strnlen + libc.src.string.memrchr ) set(TARGET_LIBM_ENTRYPOINTS diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td index 1ec1a024f85d0..5f7a858d5fa31 100644 --- a/libc/config/linux/api.td +++ b/libc/config/linux/api.td @@ -216,7 +216,8 @@ def StringAPI : PublicAPI<"string.h"> { "strtok", "strerror", "strlen", - "strnlen" + "strnlen", + "memrchr" ]; let TypeDeclarations = [ diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index b20f58c451847..db53005304896 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -31,6 +31,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.string.strchr libc.src.string.strstr libc.src.string.strnlen + libc.src.string.memrchr # sys/mman.h entrypoints libc.src.sys.mman.mmap diff --git a/libc/spec/gnu_ext.td b/libc/spec/gnu_ext.td index 7ac99783bc470..d85c562d9256a 100644 --- a/libc/spec/gnu_ext.td +++ b/libc/spec/gnu_ext.td @@ -12,8 +12,22 @@ def GnuExtensions : StandardSpec<"GNUExtensions"> { >, ] >; + + HeaderSpec String = HeaderSpec< + "string.h", + [], // Macros + [], // Types + [], // Enumerations + [ + FunctionSpec< + "memrchr", + RetValSpec, + [ArgSpec, ArgSpec, ArgSpec] + >, + ] + >; let Headers = [ - Math, + Math, String, ]; } diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt index 8bd7c1c045cf4..99450d5564593 100644 --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -78,6 +78,14 @@ add_entrypoint_object( .memchr ) +add_entrypoint_object( + memrchr + SRCS + memrchr.cpp + HDRS + memrchr.h +) + # Helper to define a function with multiple implementations # - Computes flags to satisfy required/rejected features and arch, # - Declares an entry point, diff --git a/libc/src/string/memrchr.cpp b/libc/src/string/memrchr.cpp new file mode 100644 index 0000000000000..81b034505202e --- /dev/null +++ b/libc/src/string/memrchr.cpp @@ -0,0 +1,26 @@ +//===-- Implementation of memrchr -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/string/memrchr.h" +#include "src/__support/common.h" +#include + +namespace __llvm_libc { + +void *LLVM_LIBC_ENTRYPOINT(memrchr)(const void *src, int c, size_t n) { + const unsigned char *str = reinterpret_cast(src); + const unsigned char ch = c; + for (; n != 0; --n) { + const unsigned char *s = str + n - 1; + if (*s == ch) + return const_cast(s); + } + return nullptr; +} + +} // namespace __llvm_libc diff --git a/libc/src/string/memrchr.h b/libc/src/string/memrchr.h new file mode 100644 index 0000000000000..8f43577e331fe --- /dev/null +++ b/libc/src/string/memrchr.h @@ -0,0 +1,20 @@ +//===-- Implementation header for memrchr -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_MEMRCHR_H +#define LLVM_LIBC_SRC_STRING_MEMRCHR_H + +#include + +namespace __llvm_libc { + +void *memrchr(const void *src, int c, size_t n); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_STRING_MEMRCHR_H diff --git a/libc/test/src/string/CMakeLists.txt b/libc/test/src/string/CMakeLists.txt index be43cc912b5a3..a116effef2719 100644 --- a/libc/test/src/string/CMakeLists.txt +++ b/libc/test/src/string/CMakeLists.txt @@ -82,6 +82,16 @@ add_libc_unittest( libc.src.string.strnlen ) +add_libc_unittest( + memrchr_test + SUITE + libc_string_unittests + SRCS + memrchr_test.cpp + DEPENDS + libc.src.string.memrchr +) + # Tests all implementations that can run on the host. function(add_libc_multi_impl_test name) get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations) diff --git a/libc/test/src/string/memrchr_test.cpp b/libc/test/src/string/memrchr_test.cpp new file mode 100644 index 0000000000000..5f5f7a0d01828 --- /dev/null +++ b/libc/test/src/string/memrchr_test.cpp @@ -0,0 +1,114 @@ +//===-- Unittests for memrchr ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/string/memrchr.h" +#include "utils/UnitTest/Test.h" +#include + +// A helper function that calls memrchr and abstracts away the explicit cast for +// readability purposes. +const char *call_memrchr(const void *src, int c, size_t size) { + return reinterpret_cast(__llvm_libc::memrchr(src, c, size)); +} + +TEST(MemRChrTest, FindsCharacterAfterNullTerminator) { + // memrchr should continue searching after a null terminator. + const size_t size = 6; + const unsigned char src[size] = {'a', '\0', 'b', 'c', 'd', '\0'}; + // Should return 'b', 'c', 'd', '\0' even when after null terminator. + ASSERT_STREQ(call_memrchr(src, 'b', size), "bcd"); +} + +TEST(MemRChrTest, FindsCharacterInNonNullTerminatedCollection) { + const size_t size = 3; + const unsigned char src[size] = {'a', 'b', 'c'}; + // Should return 'b', 'c'. 
+ const char *ret = call_memrchr(src, 'b', size); + ASSERT_EQ(ret[0], 'b'); + ASSERT_EQ(ret[1], 'c'); +} + +TEST(MemRChrTest, FindsFirstCharacter) { + const size_t size = 6; + const unsigned char src[size] = {'a', 'b', 'c', 'd', 'e', '\0'}; + // Should return original array since 'a' is the first character. + ASSERT_STREQ(call_memrchr(src, 'a', size), "abcde"); +} + +TEST(MemRChrTest, FindsMiddleCharacter) { + const size_t size = 6; + const unsigned char src[size] = {'a', 'b', 'c', 'd', 'e', '\0'}; + // Should return characters after (and including) 'c'. + ASSERT_STREQ(call_memrchr(src, 'c', size), "cde"); +} + +TEST(MemRChrTest, FindsLastCharacterThatIsNotNullTerminator) { + const size_t size = 6; + const unsigned char src[size] = {'a', 'b', 'c', 'd', 'e', '\0'}; + // Should return 'e' and null-terminator. + ASSERT_STREQ(call_memrchr(src, 'e', size), "e"); +} + +TEST(MemRChrTest, FindsNullTerminator) { + const size_t size = 6; + const unsigned char src[size] = {'a', 'b', 'c', 'd', 'e', '\0'}; + // Should return null terminator. + ASSERT_STREQ(call_memrchr(src, '\0', size), ""); +} + +TEST(MemRChrTest, CharacterNotWithinStringShouldReturnNullptr) { + const size_t size = 4; + const unsigned char src[size] = {'1', '2', '3', '?'}; + // Since 'z' is not within 'characters', should return nullptr. + ASSERT_STREQ(call_memrchr(src, 'z', size), nullptr); +} + +TEST(MemRChrTest, CharacterNotWithinSizeShouldReturnNullptr) { + const unsigned char src[5] = {'1', '2', '3', '4', '\0'}; + // Since '4' is not within the first 2 characters, this should return nullptr. + const size_t size = 2; + ASSERT_STREQ(call_memrchr(src, '4', size), nullptr); +} + +TEST(MemRChrTest, ShouldFindLastOfDuplicates) { + size_t size = 12; // 11 characters + null terminator. + const char *dups = "abc1def1ghi"; + // 1 is duplicated in 'dups', but it should find the last copy. + ASSERT_STREQ(call_memrchr(dups, '1', size), "1ghi"); + + const char *repeated = "XXXXX"; + size = 6; // 5 characters + null terminator. + // Should return the last X with the null terminator. + ASSERT_STREQ(call_memrchr(repeated, 'X', size), "X"); +} + +TEST(MemRChrTest, EmptyStringShouldOnlyMatchNullTerminator) { + const size_t size = 1; // Null terminator. + const char *empty_string = ""; + // Null terminator should match. + ASSERT_STREQ(call_memrchr(empty_string, '\0', size), ""); + // All other characters should not match. + ASSERT_STREQ(call_memrchr(empty_string, 'A', size), nullptr); + ASSERT_STREQ(call_memrchr(empty_string, '9', size), nullptr); + ASSERT_STREQ(call_memrchr(empty_string, '?', size), nullptr); +} + +TEST(MemRChrTest, SignedCharacterFound) { + char c = -1; + const size_t size = 1; + char src[size] = {c}; + const char *actual = call_memrchr(src, c, size); + // Should find the last character 'c'. + ASSERT_EQ(actual[0], c); +} + +TEST(MemRChrTest, ZeroLengthShouldReturnNullptr) { + const unsigned char src[4] = {'a', 'b', 'c', '\0'}; + // This will iterate over exactly zero characters, so should return nullptr. + ASSERT_STREQ(call_memrchr(src, 'd', 0), nullptr); +} From 43f09110402d382b9ce7b41d1a24b117d7941695 Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Fri, 24 Jul 2020 15:06:30 -0400 Subject: [PATCH 0032/1035] [openmp] Clean up OMPKinds.def remove OMP_DIRECTIVE This patch removes the OMP_DIRECTIVE definition from OMPKinds.def since they are now defined in OMP.td and OMP_DIRECTIVE is not used anymore in the code. 
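For reference, the directives now live in
llvm/include/llvm/Frontend/OpenMP/OMP.td as TableGen records. The exact
record shape is whatever OMP.td declares; roughly (illustrative only):

  def OMP_Parallel : Directive<"parallel"> {}
  def OMP_TargetData : Directive<"target data"> {}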
Reviewed By: jdenny Differential Revision: https://reviews.llvm.org/D84329 --- .../include/llvm/Frontend/OpenMP/OMPKinds.def | 94 +------------------ 1 file changed, 3 insertions(+), 91 deletions(-) diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index 4f2fcb8af5d1d..7771dcd72d6a9 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -10,100 +10,12 @@ /// This file defines the list of supported OpenMP directives, clauses, runtime /// calls, and other things that need to be listed in enums. /// +/// This file is under transition to OMP.td with TableGen code generation. +/// //===----------------------------------------------------------------------===// /// OpenMP Directives and combined directives -/// -///{ - -#ifndef OMP_DIRECTIVE -#define OMP_DIRECTIVE(Enum, Str) -#endif - -#define __OMP_DIRECTIVE_EXT(Name, Str) OMP_DIRECTIVE(OMPD_##Name, Str) -#define __OMP_DIRECTIVE(Name) __OMP_DIRECTIVE_EXT(Name, #Name) - -__OMP_DIRECTIVE(threadprivate) -__OMP_DIRECTIVE(parallel) -__OMP_DIRECTIVE(task) -__OMP_DIRECTIVE(simd) -__OMP_DIRECTIVE(for) -__OMP_DIRECTIVE(sections) -__OMP_DIRECTIVE(section) -__OMP_DIRECTIVE(single) -__OMP_DIRECTIVE(master) -__OMP_DIRECTIVE(critical) -__OMP_DIRECTIVE(taskyield) -__OMP_DIRECTIVE(barrier) -__OMP_DIRECTIVE(taskwait) -__OMP_DIRECTIVE(taskgroup) -__OMP_DIRECTIVE(flush) -__OMP_DIRECTIVE(ordered) -__OMP_DIRECTIVE(atomic) -__OMP_DIRECTIVE(target) -__OMP_DIRECTIVE(teams) -__OMP_DIRECTIVE(cancel) -__OMP_DIRECTIVE(requires) -__OMP_DIRECTIVE_EXT(target_data, "target data") -__OMP_DIRECTIVE_EXT(target_enter_data, "target enter data") -__OMP_DIRECTIVE_EXT(target_exit_data, "target exit data") -__OMP_DIRECTIVE_EXT(target_parallel, "target parallel") -__OMP_DIRECTIVE_EXT(target_parallel_for, "target parallel for") -__OMP_DIRECTIVE_EXT(target_update, "target update") -__OMP_DIRECTIVE_EXT(parallel_for, "parallel for") -__OMP_DIRECTIVE_EXT(parallel_for_simd, "parallel for simd") -__OMP_DIRECTIVE_EXT(parallel_master, "parallel master") -__OMP_DIRECTIVE_EXT(parallel_sections, "parallel sections") -__OMP_DIRECTIVE_EXT(for_simd, "for simd") -__OMP_DIRECTIVE_EXT(cancellation_point, "cancellation point") -__OMP_DIRECTIVE_EXT(declare_reduction, "declare reduction") -__OMP_DIRECTIVE_EXT(declare_mapper, "declare mapper") -__OMP_DIRECTIVE_EXT(declare_simd, "declare simd") -__OMP_DIRECTIVE(taskloop) -__OMP_DIRECTIVE_EXT(taskloop_simd, "taskloop simd") -__OMP_DIRECTIVE(distribute) -__OMP_DIRECTIVE_EXT(declare_target, "declare target") -__OMP_DIRECTIVE_EXT(end_declare_target, "end declare target") -__OMP_DIRECTIVE_EXT(distribute_parallel_for, "distribute parallel for") -__OMP_DIRECTIVE_EXT(distribute_parallel_for_simd, - "distribute parallel for simd") -__OMP_DIRECTIVE_EXT(distribute_simd, "distribute simd") -__OMP_DIRECTIVE_EXT(target_parallel_for_simd, "target parallel for simd") -__OMP_DIRECTIVE_EXT(target_simd, "target simd") -__OMP_DIRECTIVE_EXT(teams_distribute, "teams distribute") -__OMP_DIRECTIVE_EXT(teams_distribute_simd, "teams distribute simd") -__OMP_DIRECTIVE_EXT(teams_distribute_parallel_for_simd, - "teams distribute parallel for simd") -__OMP_DIRECTIVE_EXT(teams_distribute_parallel_for, - "teams distribute parallel for") -__OMP_DIRECTIVE_EXT(target_teams, "target teams") -__OMP_DIRECTIVE_EXT(target_teams_distribute, "target teams distribute") -__OMP_DIRECTIVE_EXT(target_teams_distribute_parallel_for, - "target teams distribute parallel for") 
-__OMP_DIRECTIVE_EXT(target_teams_distribute_parallel_for_simd, - "target teams distribute parallel for simd") -__OMP_DIRECTIVE_EXT(target_teams_distribute_simd, - "target teams distribute simd") -__OMP_DIRECTIVE(allocate) -__OMP_DIRECTIVE_EXT(declare_variant, "declare variant") -__OMP_DIRECTIVE_EXT(master_taskloop, "master taskloop") -__OMP_DIRECTIVE_EXT(parallel_master_taskloop, "parallel master taskloop") -__OMP_DIRECTIVE_EXT(master_taskloop_simd, "master taskloop simd") -__OMP_DIRECTIVE_EXT(parallel_master_taskloop_simd, - "parallel master taskloop simd") -__OMP_DIRECTIVE(depobj) -__OMP_DIRECTIVE(scan) -__OMP_DIRECTIVE_EXT(begin_declare_variant, "begin declare variant") -__OMP_DIRECTIVE_EXT(end_declare_variant, "end declare variant") - -// Has to be the last because Clang implicitly expects it to be. -__OMP_DIRECTIVE(unknown) - -#undef __OMP_DIRECTIVE_EXT -#undef __OMP_DIRECTIVE -#undef OMP_DIRECTIVE - -///} +/// - Moved to OMP.td /// OpenMP Clauses /// From 0b339c069266b7a4f3c82f80067d74620cbe19c4 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 24 Jul 2020 12:09:36 -0700 Subject: [PATCH 0033/1035] [lldb] Inform every language runtime of the modified modules When a process is notified that modules got loaded, currently only existing language runtimes are given a chance to deal with that. This means that if the runtime for a given language wasn't needed before it won't be informed of the module chance. This is wrong because the module change might be what triggers the need for a certain runtime. Instead, we should give the language runtime for every supported language a chance to deal with the modified modules. Differential revision: https://reviews.llvm.org/D84475 --- lldb/source/Target/Process.cpp | 40 ++++++++++------------------------ 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index b7694ec43f341..3776a90e546ae 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -5757,41 +5757,25 @@ addr_t Process::ResolveIndirectFunction(const Address *address, Status &error) { } void Process::ModulesDidLoad(ModuleList &module_list) { + // Inform the system runtime of the modified modules. SystemRuntime *sys_runtime = GetSystemRuntime(); - if (sys_runtime) { + if (sys_runtime) sys_runtime->ModulesDidLoad(module_list); - } GetJITLoaders().ModulesDidLoad(module_list); - // Give runtimes a chance to be created. + // Give the instrumentation runtimes a chance to be created before informing + // them of the modified modules. InstrumentationRuntime::ModulesDidLoad(module_list, this, m_instrumentation_runtimes); + for (auto &runtime : m_instrumentation_runtimes) + runtime.second->ModulesDidLoad(module_list); - // Tell runtimes about new modules. - for (auto pos = m_instrumentation_runtimes.begin(); - pos != m_instrumentation_runtimes.end(); ++pos) { - InstrumentationRuntimeSP runtime = pos->second; - runtime->ModulesDidLoad(module_list); - } - - // Let any language runtimes we have already created know about the modules - // that loaded. - - // Iterate over a copy of this language runtime list in case the language - // runtime ModulesDidLoad somehow causes the language runtime to be - // unloaded. 
- { - std::lock_guard guard(m_language_runtimes_mutex); - LanguageRuntimeCollection language_runtimes(m_language_runtimes); - for (const auto &pair : language_runtimes) { - // We must check language_runtime_sp to make sure it is not nullptr as we - // might cache the fact that we didn't have a language runtime for a - // language. - LanguageRuntimeSP language_runtime_sp = pair.second; - if (language_runtime_sp) - language_runtime_sp->ModulesDidLoad(module_list); - } + // Give the language runtimes a chance to be created before informing them of + // the modified modules. + for (const lldb::LanguageType lang_type : Language::GetSupportedLanguages()) { + if (LanguageRuntime *runtime = GetLanguageRuntime(lang_type)) + runtime->ModulesDidLoad(module_list); } // If we don't have an operating system plug-in, try to load one since @@ -5799,7 +5783,7 @@ void Process::ModulesDidLoad(ModuleList &module_list) { if (!m_os_up) LoadOperatingSystemPlugin(false); - // Give structured-data plugins a chance to see the modified modules. + // Inform the structured-data plugins of the modified modules. for (auto pair : m_structured_data_plugin_map) { if (pair.second) pair.second->ModulesDidLoad(*this, module_list); From 9bb6ce78bec79295a5ad2f0ef96bfde4cb3f714c Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Fri, 24 Jul 2020 11:54:13 -0700 Subject: [PATCH 0034/1035] Rename scoped-noalias -> scoped-noalias-aa Summary: To match NewPM name. Also the new name is clearer and more consistent. Subscribers: jvesely, nhaehnle, hiraditya, asbirlea, kerbowa, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D84542 --- llvm/lib/Analysis/ScopedNoAliasAA.cpp | 2 +- llvm/test/Analysis/ScopedNoAliasAA/basic-domains.ll | 2 +- llvm/test/Analysis/ScopedNoAliasAA/basic.ll | 2 +- llvm/test/Analysis/ScopedNoAliasAA/basic2.ll | 2 +- llvm/test/Transforms/GVN/noalias.ll | 2 +- llvm/test/Transforms/LICM/dropped-tbaa.ll | 2 +- llvm/test/Transforms/LICM/pr42969.ll | 2 +- llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll | 2 +- llvm/test/Transforms/LoopVectorize/noalias-md-licm.ll | 2 +- llvm/test/Transforms/LoopVectorize/noalias-md.ll | 2 +- llvm/test/Transforms/LoopVersioning/noalias-version-twice.ll | 2 +- llvm/test/Transforms/MemCpyOpt/load-store-to-memcpy.ll | 2 +- llvm/test/Transforms/NewGVN/noalias.ll | 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Analysis/ScopedNoAliasAA.cpp b/llvm/lib/Analysis/ScopedNoAliasAA.cpp index 8928678d6ab21..5e2aaab050afe 100644 --- a/llvm/lib/Analysis/ScopedNoAliasAA.cpp +++ b/llvm/lib/Analysis/ScopedNoAliasAA.cpp @@ -185,7 +185,7 @@ ScopedNoAliasAAResult ScopedNoAliasAA::run(Function &F, char ScopedNoAliasAAWrapperPass::ID = 0; -INITIALIZE_PASS(ScopedNoAliasAAWrapperPass, "scoped-noalias", +INITIALIZE_PASS(ScopedNoAliasAAWrapperPass, "scoped-noalias-aa", "Scoped NoAlias Alias Analysis", false, true) ImmutablePass *llvm::createScopedNoAliasAAWrapperPass() { diff --git a/llvm/test/Analysis/ScopedNoAliasAA/basic-domains.ll b/llvm/test/Analysis/ScopedNoAliasAA/basic-domains.ll index 5c66f03b87020..099a7aeb8a76a 100644 --- a/llvm/test/Analysis/ScopedNoAliasAA/basic-domains.ll +++ b/llvm/test/Analysis/ScopedNoAliasAA/basic-domains.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -basic-aa -scoped-noalias -aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -basic-aa -scoped-noalias-aa -aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s target 
datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Analysis/ScopedNoAliasAA/basic.ll b/llvm/test/Analysis/ScopedNoAliasAA/basic.ll index 9ca06a6826bdb..92cc0ccc32d3c 100644 --- a/llvm/test/Analysis/ScopedNoAliasAA/basic.ll +++ b/llvm/test/Analysis/ScopedNoAliasAA/basic.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -basic-aa -scoped-noalias -aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -basic-aa -scoped-noalias-aa -aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s ; RUN: opt < %s -aa-pipeline=basic-aa,scoped-noalias-aa -passes=aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Analysis/ScopedNoAliasAA/basic2.ll b/llvm/test/Analysis/ScopedNoAliasAA/basic2.ll index e0c16b1991157..8275cc0439e00 100644 --- a/llvm/test/Analysis/ScopedNoAliasAA/basic2.ll +++ b/llvm/test/Analysis/ScopedNoAliasAA/basic2.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -basic-aa -scoped-noalias -aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -basic-aa -scoped-noalias-aa -aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/GVN/noalias.ll b/llvm/test/Transforms/GVN/noalias.ll index 69c21f110b5ea..9cfcc9bbd9d4c 100644 --- a/llvm/test/Transforms/GVN/noalias.ll +++ b/llvm/test/Transforms/GVN/noalias.ll @@ -1,4 +1,4 @@ -; RUN: opt -scoped-noalias -basic-aa -gvn -S < %s | FileCheck %s +; RUN: opt -scoped-noalias-aa -basic-aa -gvn -S < %s | FileCheck %s define i32 @test1(i32* %p, i32* %q) { ; CHECK-LABEL: @test1(i32* %p, i32* %q) diff --git a/llvm/test/Transforms/LICM/dropped-tbaa.ll b/llvm/test/Transforms/LICM/dropped-tbaa.ll index 7d37ca55c1880..3297cc0b305a5 100644 --- a/llvm/test/Transforms/LICM/dropped-tbaa.ll +++ b/llvm/test/Transforms/LICM/dropped-tbaa.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -scoped-noalias -tbaa -licm -S | FileCheck %s +; RUN: opt < %s -scoped-noalias-aa -tbaa -licm -S | FileCheck %s ; This test case case is generated from the following C code with -fstrict-aliasing, ; and after passing through -inline -mem2reg -loop-rotate -instcombine diff --git a/llvm/test/Transforms/LICM/pr42969.ll b/llvm/test/Transforms/LICM/pr42969.ll index 7fa36f0cfe77d..3cb82dc8095a6 100644 --- a/llvm/test/Transforms/LICM/pr42969.ll +++ b/llvm/test/Transforms/LICM/pr42969.ll @@ -1,4 +1,4 @@ -; RUN: opt %s -S -scoped-noalias -enable-mssa-loop-dependency=true -licm | FileCheck %s +; RUN: opt %s -S -scoped-noalias-aa -enable-mssa-loop-dependency=true -licm | FileCheck %s define i16 @main(i1 %a_b_mayalias, i16* %a, i16* %b) { ; CHECK: scalar.body: diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll 
b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll index 8a0641ffe0d52..debf607e10759 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -basic-aa -scoped-noalias -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=SCOPE -check-prefix=ALL %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -basic-aa -scoped-noalias-aa -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=SCOPE -check-prefix=ALL %s ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basic-aa -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=NOSCOPE -check-prefix=ALL %s target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" diff --git a/llvm/test/Transforms/LoopVectorize/noalias-md-licm.ll b/llvm/test/Transforms/LoopVectorize/noalias-md-licm.ll index d54a9aefe73a1..c343bb8dd6a32 100644 --- a/llvm/test/Transforms/LoopVectorize/noalias-md-licm.ll +++ b/llvm/test/Transforms/LoopVectorize/noalias-md-licm.ll @@ -1,4 +1,4 @@ -; RUN: opt -basic-aa -scoped-noalias -loop-vectorize -licm -force-vector-width=2 \ +; RUN: opt -basic-aa -scoped-noalias-aa -loop-vectorize -licm -force-vector-width=2 \ ; RUN: -force-vector-interleave=1 -S < %s | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/LoopVectorize/noalias-md.ll b/llvm/test/Transforms/LoopVectorize/noalias-md.ll index b141998163aef..27192de5576fc 100644 --- a/llvm/test/Transforms/LoopVectorize/noalias-md.ll +++ b/llvm/test/Transforms/LoopVectorize/noalias-md.ll @@ -1,7 +1,7 @@ ; RUN: opt -basic-aa -loop-vectorize -force-vector-width=2 \ ; RUN: -force-vector-interleave=1 -S < %s \ ; RUN: | FileCheck %s -check-prefix=BOTH -check-prefix=LV -; RUN: opt -basic-aa -scoped-noalias -loop-vectorize -dse -force-vector-width=2 \ +; RUN: opt -basic-aa -scoped-noalias-aa -loop-vectorize -dse -force-vector-width=2 \ ; RUN: -force-vector-interleave=1 -S < %s \ ; RUN: | FileCheck %s -check-prefix=BOTH -check-prefix=DSE diff --git a/llvm/test/Transforms/LoopVersioning/noalias-version-twice.ll b/llvm/test/Transforms/LoopVersioning/noalias-version-twice.ll index 89e898a90c178..4420b71d1f452 100644 --- a/llvm/test/Transforms/LoopVersioning/noalias-version-twice.ll +++ b/llvm/test/Transforms/LoopVersioning/noalias-version-twice.ll @@ -1,4 +1,4 @@ -; RUN: opt -basic-aa -loop-distribute -enable-loop-distribute -loop-simplify -scoped-noalias \ +; RUN: opt -basic-aa -loop-distribute -enable-loop-distribute -loop-simplify -scoped-noalias-aa \ ; RUN: -loop-versioning -S < %s | FileCheck %s ; Test the metadata generated when versioning an already versioned loop. 
Here diff --git a/llvm/test/Transforms/MemCpyOpt/load-store-to-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/load-store-to-memcpy.ll index 81043d4f46ddf..c3f7a11272815 100644 --- a/llvm/test/Transforms/MemCpyOpt/load-store-to-memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/load-store-to-memcpy.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -basic-aa -scoped-noalias -memcpyopt -S %s | FileCheck %s +; RUN: opt -basic-aa -scoped-noalias-aa -memcpyopt -S %s | FileCheck %s %T = type { i8, i32 } diff --git a/llvm/test/Transforms/NewGVN/noalias.ll b/llvm/test/Transforms/NewGVN/noalias.ll index c5f23bfad89a5..23fc95d4ff8dc 100644 --- a/llvm/test/Transforms/NewGVN/noalias.ll +++ b/llvm/test/Transforms/NewGVN/noalias.ll @@ -1,4 +1,4 @@ -; RUN: opt -scoped-noalias -basic-aa -newgvn -S < %s | FileCheck %s +; RUN: opt -scoped-noalias-aa -basic-aa -newgvn -S < %s | FileCheck %s define i32 @test1(i32* %p, i32* %q) { ; CHECK-LABEL: @test1(i32* %p, i32* %q) From 4d09ed953b5b8c70d9ca0aeaed8f26a237b612c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Fri, 24 Jul 2020 00:05:55 +0300 Subject: [PATCH 0035/1035] [llvm-lib] Support adding short import library objects with llvm-lib This fixes PR 42837. Differential Revision: https://reviews.llvm.org/D84465 --- llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp | 6 ++++-- llvm/test/tools/llvm-lib/implibs.test | 12 ++++++++++++ llvm/test/tools/llvm-lib/invalid.test | 2 +- 3 files changed, 17 insertions(+), 3 deletions(-) create mode 100644 llvm/test/tools/llvm-lib/implibs.test diff --git a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp index c409012554245..cd39428b9c38e 100644 --- a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp +++ b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp @@ -191,9 +191,11 @@ static void appendFile(std::vector &Members, file_magic Magic = identify_magic(MB.getBuffer()); if (Magic != file_magic::coff_object && Magic != file_magic::bitcode && - Magic != file_magic::archive && Magic != file_magic::windows_resource) { + Magic != file_magic::archive && Magic != file_magic::windows_resource && + Magic != file_magic::coff_import_library) { llvm::errs() << MB.getBufferIdentifier() - << ": not a COFF object, bitcode, archive or resource file\n"; + << ": not a COFF object, bitcode, archive, import library or " + "resource file\n"; exit(1); } diff --git a/llvm/test/tools/llvm-lib/implibs.test b/llvm/test/tools/llvm-lib/implibs.test new file mode 100644 index 0000000000000..ebff4bb4608f2 --- /dev/null +++ b/llvm/test/tools/llvm-lib/implibs.test @@ -0,0 +1,12 @@ +Test that import libraries (and the members thereof) can be added to another +static library. 
+ +RUN: rm -rf %t +RUN: mkdir -p %t + +RUN: echo -e "EXPORTS\nMyFunc" > %t/lib.def +RUN: llvm-dlltool -m i386:x86-64 -l %t/lib.lib -d %t/lib.def -D lib.dll +RUN: llvm-lib -out:%t/newlib.lib %t/lib.lib + +RUN: llvm-ar t %t/newlib.lib | FileCheck %s +CHECK: lib.dll diff --git a/llvm/test/tools/llvm-lib/invalid.test b/llvm/test/tools/llvm-lib/invalid.test index 57266400cdc87..a4b06a03358b0 100644 --- a/llvm/test/tools/llvm-lib/invalid.test +++ b/llvm/test/tools/llvm-lib/invalid.test @@ -1,2 +1,2 @@ RUN: not llvm-lib %S/Inputs/cl-gl.obj 2>&1 | FileCheck %s -CHECK: not a COFF object, bitcode, archive or resource file +CHECK: not a COFF object, bitcode, archive, import library or resource file From 9e81d8bbf19d72fca3d87b7334c613d1aa2a5795 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 21 Jul 2020 23:39:37 +0300 Subject: [PATCH 0036/1035] [MC] [COFF] Make sure that weak external symbols are undefined symbols For comdats (e.g. caused by -ffunction-sections), Section is already set here; make sure it's null, for the weak external symbol to be undefined. This fixes PR46779. Differential Revision: https://reviews.llvm.org/D84507 --- llvm/lib/MC/WinCOFFObjectWriter.cpp | 1 + llvm/test/MC/COFF/weak-comdat.s | 34 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 llvm/test/MC/COFF/weak-comdat.s diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp index 4796ef531054b..8e7bf1eb01697 100644 --- a/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -375,6 +375,7 @@ void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &MCSym, COFFSymbol *Local = nullptr; if (cast(MCSym).isWeakExternal()) { Sym->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL; + Sym->Section = nullptr; COFFSymbol *WeakDefault = getLinkedSymbol(MCSym); if (!WeakDefault) { diff --git a/llvm/test/MC/COFF/weak-comdat.s b/llvm/test/MC/COFF/weak-comdat.s new file mode 100644 index 0000000000000..8605da6b521db --- /dev/null +++ b/llvm/test/MC/COFF/weak-comdat.s @@ -0,0 +1,34 @@ +// RUN: llvm-mc -filetype=obj -triple x86_64-pc-win32 %s -o %t.o +// RUN: llvm-readobj --symbols %t.o | FileCheck %s + +// Test that the weak symbol is properly undefined, while originally being +// the leader symbol for a comdat. (This can easily happen if building with +// -ffunction-sections). + + .section .text$func,"xr",one_only,func + .weak func +func: + ret + +// CHECK: Symbol { +// CHECK: Name: func +// CHECK-NEXT: Value: 0 +// CHECK-NEXT: Section: IMAGE_SYM_UNDEFINED (0) +// CHECK-NEXT: BaseType: Null (0x0) +// CHECK-NEXT: ComplexType: Null (0x0) +// CHECK-NEXT: StorageClass: WeakExternal (0x69) +// CHECK-NEXT: AuxSymbolCount: 1 +// CHECK-NEXT: AuxWeakExternal { +// CHECK-NEXT: Linked: .weak.func.default (10) +// CHECK-NEXT: Search: Alias (0x3) +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: Symbol { +// CHECK-NEXT: Name: .weak.func.default +// CHECK-NEXT: Value: 0 +// CHECK-NEXT: Section: .text$func (4) +// CHECK-NEXT: BaseType: Null (0x0) +// CHECK-NEXT: ComplexType: Null (0x0) +// CHECK-NEXT: StorageClass: External (0x2) +// CHECK-NEXT: AuxSymbolCount: 0 +// CHECK-NEXT: } From 032b78a0762bee129f33e4255ada6d374aa70c71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirst=C3=B3f=20Umann?= Date: Fri, 24 Jul 2020 19:10:50 +0200 Subject: [PATCH 0037/1035] [analyzer] Revert the accidental commit of D82122 Was accidentally squished into rGb6cbe6cb0399d4671e5384dcc326af56bc6bd122. The assert fires on the code snippet included in this commit. 
More discussion can be found in https://reviews.llvm.org/D82598. --- clang/lib/StaticAnalyzer/Core/Environment.cpp | 12 +-- clang/test/Analysis/live-stmts.mm | 101 ++++++++++++++++++ 2 files changed, 104 insertions(+), 9 deletions(-) create mode 100644 clang/test/Analysis/live-stmts.mm diff --git a/clang/lib/StaticAnalyzer/Core/Environment.cpp b/clang/lib/StaticAnalyzer/Core/Environment.cpp index 9e6d79bb7dcc9..1ccf4c6104a65 100644 --- a/clang/lib/StaticAnalyzer/Core/Environment.cpp +++ b/clang/lib/StaticAnalyzer/Core/Environment.cpp @@ -183,18 +183,12 @@ EnvironmentManager::removeDeadBindings(Environment Env, F.getTreeFactory()); // Iterate over the block-expr bindings. - for (Environment::iterator I = Env.begin(), E = Env.end(); I != E; ++I) { + for (Environment::iterator I = Env.begin(), E = Env.end(); + I != E; ++I) { const EnvironmentEntry &BlkExpr = I.getKey(); const SVal &X = I.getData(); - const bool IsBlkExprLive = - SymReaper.isLive(BlkExpr.getStmt(), BlkExpr.getLocationContext()); - - assert((isa(BlkExpr.getStmt()) || !IsBlkExprLive) && - "Only Exprs can be live, LivenessAnalysis argues about the liveness " - "of *values*!"); - - if (IsBlkExprLive) { + if (SymReaper.isLive(BlkExpr.getStmt(), BlkExpr.getLocationContext())) { // Copy the binding to the new map. EBMapRef = EBMapRef.add(BlkExpr, X); diff --git a/clang/test/Analysis/live-stmts.mm b/clang/test/Analysis/live-stmts.mm new file mode 100644 index 0000000000000..a6ddd03ca5d85 --- /dev/null +++ b/clang/test/Analysis/live-stmts.mm @@ -0,0 +1,101 @@ +// RUN: %clang_analyze_cc1 -w -fblocks %s \ +// RUN: -analyzer-checker=debug.DumpLiveStmts \ +// RUN: 2>&1 | FileCheck %s + +@interface Item +// ... +@end + +@interface Collection +// ... +@end + +typedef void (^Blk)(); + +struct RAII { + Blk blk; + +public: + RAII(Blk blk): blk(blk) {} + +// CHECK: [ B0 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B1 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B2 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: + + ~RAII() { blk(); } + +// CHECK-NEXT: [ B0 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B1 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B2 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: +}; + +void foo(Collection *coll) { + RAII raii(^{}); + for (Item *item in coll) {} +} +// CHECK-NEXT: [ B0 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B1 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B2 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-NEXT: DeclStmt {{.*}} +// CHECK-NEXT: `-VarDecl {{.*}} item 'Item *' +// CHECK-EMPTY: +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'Collection *' +// CHECK-NEXT: `-DeclRefExpr {{.*}} 'Collection *' lvalue ParmVar {{.*}} 'coll' 'Collection *' +// CHECK-EMPTY: +// CHECK-NEXT: CompoundStmt {{.*}} +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B3 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-NEXT: DeclStmt {{.*}} +// CHECK-NEXT: `-VarDecl {{.*}} item 'Item *' +// CHECK-EMPTY: +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'Collection *' +// CHECK-NEXT: `-DeclRefExpr {{.*}} 'Collection *' lvalue ParmVar {{.*}} 'coll' 'Collection *' +// CHECK-EMPTY: +// CHECK-NEXT: CompoundStmt {{.*}} +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B4 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-NEXT: 
DeclStmt {{.*}} +// CHECK-NEXT: `-VarDecl {{.*}} item 'Item *' +// CHECK-EMPTY: +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'Collection *' +// CHECK-NEXT: `-DeclRefExpr {{.*}} 'Collection *' lvalue ParmVar {{.*}} 'coll' 'Collection *' +// CHECK-EMPTY: +// CHECK-NEXT: CompoundStmt {{.*}} +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B5 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-NEXT: DeclStmt {{.*}} +// CHECK-NEXT: `-VarDecl {{.*}} item 'Item *' +// CHECK-EMPTY: +// CHECK-NEXT: CompoundStmt {{.*}} +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B0 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B1 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: + From 7d076e19e31a2a32e357cbdcf0183f88fe1fb0fb Mon Sep 17 00:00:00 2001 From: Nemanja Ivanovic Date: Fri, 24 Jul 2020 15:38:27 -0400 Subject: [PATCH 0038/1035] [PowerPC] Fix computation of offset for load-and-splat for permuted loads Unfortunately this is another regression from my canonicalization patch (1fed131660b2). The patch contained two implicit assumptions: 1. That we would have a permuted load only if we are loading a partial vector 2. That a partial vector load would necessarily be as wide as the splat However, assumption 2 is not correct since it is possible to do a wider load and only splat a half of it. This patch corrects this assumption by simply checking if the load is permuted and adjusting the offset if it is. --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 26 ++++-- .../PowerPC/canonical-merge-shuffles.ll | 88 +++++++++++++++++++ 2 files changed, 106 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index edc23b2673f38..c2ba7195509a1 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -9126,13 +9126,15 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { Op0.getOperand(1)); } -static const SDValue *getNormalLoadInput(const SDValue &Op) { +static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) { const SDValue *InputLoad = &Op; if (InputLoad->getOpcode() == ISD::BITCAST) InputLoad = &InputLoad->getOperand(0); if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR || - InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) + InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) { + IsPermuted = InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED; InputLoad = &InputLoad->getOperand(0); + } if (InputLoad->getOpcode() != ISD::LOAD) return nullptr; LoadSDNode *LD = cast(*InputLoad); @@ -9304,7 +9306,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, if (!BVNIsConstantSplat || SplatBitSize > 32) { - const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0)); + bool IsPermutedLoad = false; + const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0), IsPermutedLoad); // Handle load-and-splat patterns as we have instructions that will do this // in one go. if (InputLoad && DAG.isSplatValue(Op, true)) { @@ -9927,7 +9930,8 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // If this is a load-and-splat, we can do that with a single instruction // in some cases. However if the load has multiple uses, we don't want to // combine it because that will just produce multiple loads. 
- const SDValue *InputLoad = getNormalLoadInput(V1); + bool IsPermutedLoad = false; + const SDValue *InputLoad = getNormalLoadInput(V1, IsPermutedLoad); if (InputLoad && Subtarget.hasVSX() && V2.isUndef() && (PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) && InputLoad->hasOneUse()) { @@ -9935,6 +9939,16 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, int SplatIdx = PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG); + // The splat index for permuted loads will be in the left half of the vector + // which is strictly wider than the loaded value by 8 bytes. So we need to + // adjust the splat index to point to the correct address in memory. + if (IsPermutedLoad) { + assert(isLittleEndian && "Unexpected permuted load on big endian target"); + SplatIdx += IsFourByte ? 2 : 1; + assert(SplatIdx < IsFourByte ? 4 : 2 && + "Splat of a value outside of the loaded memory"); + } + LoadSDNode *LD = cast(*InputLoad); // For 4-byte load-and-splat, we need Power9. if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) { @@ -9944,10 +9958,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, else Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8; - // If we are loading a partial vector, it does not make sense to adjust - // the base pointer. This happens with (splat (s_to_v_permuted (ld))). - if (LD->getMemoryVT().getSizeInBits() == (IsFourByte ? 32 : 64)) - Offset = 0; SDValue BasePtr = LD->getBasePtr(); if (Offset != 0) BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll index 11bc2bae9871f..cdd04b33318ea 100644 --- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll +++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll @@ -446,5 +446,93 @@ entry: ret <16 x i8> %shuffle } +define dso_local <4 x i32> @testSplat4Low(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testSplat4Low: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: ld r3, 0(r3) +; CHECK-P8-NEXT: mtfprd f0, r3 +; CHECK-P8-NEXT: xxspltw v2, vs0, 0 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testSplat4Low: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: addi r3, r3, 4 +; CHECK-P9-NEXT: lxvwsx v2, 0, r3 +; CHECK-P9-NEXT: blr +; +; CHECK-NOVSX-LABEL: testSplat4Low: +; CHECK-NOVSX: # %bb.0: # %entry +; CHECK-NOVSX-NEXT: ld r3, 0(r3) +; CHECK-NOVSX-NEXT: addi r4, r1, -16 +; CHECK-NOVSX-NEXT: std r3, -16(r1) +; CHECK-NOVSX-NEXT: lvx v2, 0, r4 +; CHECK-NOVSX-NEXT: vspltw v2, v2, 2 +; CHECK-NOVSX-NEXT: blr +entry: + %0 = load <8 x i8>, <8 x i8>* %ptr, align 8 + %vecinit18 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> + %1 = bitcast <16 x i8> %vecinit18 to <4 x i32> + ret <4 x i32> %1 +} + +; Function Attrs: norecurse nounwind readonly +define dso_local <4 x i32> @testSplat4hi(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testSplat4hi: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: ld r3, 0(r3) +; CHECK-P8-NEXT: mtfprd f0, r3 +; CHECK-P8-NEXT: xxspltw v2, vs0, 1 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testSplat4hi: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: lxvwsx v2, 0, r3 +; CHECK-P9-NEXT: blr +; +; CHECK-NOVSX-LABEL: testSplat4hi: +; CHECK-NOVSX: # %bb.0: # %entry +; CHECK-NOVSX-NEXT: ld r3, 0(r3) +; CHECK-NOVSX-NEXT: addi r4, r1, -16 +; CHECK-NOVSX-NEXT: std r3, -16(r1) +; CHECK-NOVSX-NEXT: lvx v2, 0, r4 +; CHECK-NOVSX-NEXT: vspltw v2, v2, 3 +; 
CHECK-NOVSX-NEXT: blr +entry: + %0 = load <8 x i8>, <8 x i8>* %ptr, align 8 + %vecinit22 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> + %1 = bitcast <16 x i8> %vecinit22 to <4 x i32> + ret <4 x i32> %1 +} + +; Function Attrs: norecurse nounwind readonly +define dso_local <2 x i64> @testSplat8(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testSplat8: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: lxvdsx v2, 0, r3 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testSplat8: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: lxvdsx v2, 0, r3 +; CHECK-P9-NEXT: blr +; +; CHECK-NOVSX-LABEL: testSplat8: +; CHECK-NOVSX: # %bb.0: # %entry +; CHECK-NOVSX-NEXT: ld r3, 0(r3) +; CHECK-NOVSX-NEXT: addis r4, r2, .LCPI19_0@toc@ha +; CHECK-NOVSX-NEXT: addi r4, r4, .LCPI19_0@toc@l +; CHECK-NOVSX-NEXT: lvx v2, 0, r4 +; CHECK-NOVSX-NEXT: std r3, -16(r1) +; CHECK-NOVSX-NEXT: addi r3, r1, -16 +; CHECK-NOVSX-NEXT: lvx v3, 0, r3 +; CHECK-NOVSX-NEXT: vperm v2, v3, v3, v2 +; CHECK-NOVSX-NEXT: blr +entry: + %0 = load <8 x i8>, <8 x i8>* %ptr, align 8 + %vecinit30 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> + %1 = bitcast <16 x i8> %vecinit30 to <2 x i64> + ret <2 x i64> %1 +} + declare double @dummy() local_unnamed_addr attributes #0 = { nounwind } From 679158e662aa247282b8eea4c2d60b33204171fb Mon Sep 17 00:00:00 2001 From: Jon Chesterfield Date: Fri, 24 Jul 2020 20:50:25 +0100 Subject: [PATCH 0039/1035] Make hip math headers easier to use from C Summary: Make hip math headers easier to use from C Motivation is a step towards using the hip math headers to implement math.h for openmp, which needs to work with C as well as C++. NFC for C++ code. Reviewers: yaxunl, jdoerfert Reviewed By: yaxunl Subscribers: sstefan1, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D84476 --- clang/lib/Headers/__clang_hip_libdevice_declares.h | 6 +++++- clang/lib/Headers/__clang_hip_math.h | 10 ++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/clang/lib/Headers/__clang_hip_libdevice_declares.h b/clang/lib/Headers/__clang_hip_libdevice_declares.h index 7110404434405..2cf9cc7f1eb65 100644 --- a/clang/lib/Headers/__clang_hip_libdevice_declares.h +++ b/clang/lib/Headers/__clang_hip_libdevice_declares.h @@ -10,7 +10,9 @@ #ifndef __CLANG_HIP_LIBDEVICE_DECLARES_H__ #define __CLANG_HIP_LIBDEVICE_DECLARES_H__ +#ifdef __cplusplus extern "C" { +#endif // BEGIN FLOAT __device__ __attribute__((const)) float __ocml_acos_f32(float); @@ -316,7 +318,7 @@ __device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16); __device__ inline __2f16 __llvm_amdgcn_rcp_2f16(__2f16 __x) // Not currently exposed by ROCDL. 
{ - return __2f16{__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y)}; + return (__2f16){__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y)}; } __device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16); __device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16); @@ -325,6 +327,8 @@ __device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16); __device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16); __device__ __attribute__((const)) __2f16 __ocml_pown_2f16(__2f16, __2i16); +#ifdef __cplusplus } // extern "C" +#endif #endif // __CLANG_HIP_LIBDEVICE_DECLARES_H__ diff --git a/clang/lib/Headers/__clang_hip_math.h b/clang/lib/Headers/__clang_hip_math.h index 47d3c17175591..f9ca9bf606fb7 100644 --- a/clang/lib/Headers/__clang_hip_math.h +++ b/clang/lib/Headers/__clang_hip_math.h @@ -95,8 +95,10 @@ inline uint64_t __make_mantissa(const char *__tagp) { } // BEGIN FLOAT +#ifdef __cplusplus __DEVICE__ inline float abs(float __x) { return __ocml_fabs_f32(__x); } +#endif __DEVICE__ inline float acosf(float __x) { return __ocml_acos_f32(__x); } __DEVICE__ @@ -251,7 +253,7 @@ inline float nanf(const char *__tagp) { uint32_t sign : 1; } bits; - static_assert(sizeof(float) == sizeof(ieee_float), ""); + static_assert(sizeof(float) == sizeof(struct ieee_float), ""); } __tmp; __tmp.bits.sign = 0u; @@ -553,8 +555,10 @@ inline float __tanf(float __x) { return __ocml_tan_f32(__x); } // END FLOAT // BEGIN DOUBLE +#ifdef __cplusplus __DEVICE__ inline double abs(double __x) { return __ocml_fabs_f64(__x); } +#endif __DEVICE__ inline double acos(double __x) { return __ocml_acos_f64(__x); } __DEVICE__ @@ -712,7 +716,7 @@ inline double nan(const char *__tagp) { uint32_t exponent : 11; uint32_t sign : 1; } bits; - static_assert(sizeof(double) == sizeof(ieee_double), ""); + static_assert(sizeof(double) == sizeof(struct ieee_double), ""); } __tmp; __tmp.bits.sign = 0u; @@ -1178,6 +1182,7 @@ __host__ inline static int max(int __arg1, int __arg2) { return std::max(__arg1, __arg2); } +#ifdef __cplusplus __DEVICE__ inline float pow(float __base, int __iexp) { return powif(__base, __iexp); } @@ -1188,6 +1193,7 @@ __DEVICE__ inline _Float16 pow(_Float16 __base, int __iexp) { return __ocml_pown_f16(__base, __iexp); } +#endif #pragma pop_macro("__DEF_FUN1") #pragma pop_macro("__DEF_FUN2") From 51eeeb477fad47d4885f3fe226aa4941470dae26 Mon Sep 17 00:00:00 2001 From: cgyurgyik Date: Fri, 24 Jul 2020 15:40:19 -0400 Subject: [PATCH 0040/1035] [libc] [Obvious] Place entrypoints, specs alphabetically. 
--- libc/config/linux/aarch64/entrypoints.txt | 10 +++++----- libc/config/linux/api.td | 24 +++++++++++------------ libc/config/linux/x86_64/entrypoints.txt | 12 ++++++------ 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index b287a72d779b9..8cdc6ca68621f 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -4,16 +4,16 @@ set(TARGET_LIBC_ENTRYPOINTS # string.h entrypoints libc.src.string.bzero + libc.src.string.memchr libc.src.string.memcpy libc.src.string.memset - libc.src.string.strcpy + libc.src.string.memrchr libc.src.string.strcat - libc.src.string.strlen - libc.src.string.memchr libc.src.string.strchr - libc.src.string.strstr + libc.src.string.strcpy + libc.src.string.strlen libc.src.string.strnlen - libc.src.string.memrchr + libc.src.string.strstr ) set(TARGET_LIBM_ENTRYPOINTS diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td index 5f7a858d5fa31..3abdf9f55b07a 100644 --- a/libc/config/linux/api.td +++ b/libc/config/linux/api.td @@ -194,30 +194,30 @@ def MathAPI : PublicAPI<"math.h"> { def StringAPI : PublicAPI<"string.h"> { let Functions = [ "bzero", + "memchr", + "memcmp", "memcpy", "memmove", - "memcmp", - "memchr", + "memrchr", "memset", - "strcpy", - "strncpy", "strcat", - "strncat", + "strchr", "strcmp", "strcoll", + "strcpy", + "strcspn", + "strerror", + "strlen", + "strncat", "strncmp", - "strxfrm", - "strchr", - "strcspn", + "strncpy", + "strnlen", "strpbrk", "strrchr", "strspn", "strstr", "strtok", - "strerror", - "strlen", - "strnlen", - "memrchr" + "strxfrm", ]; let TypeDeclarations = [ diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index db53005304896..16dd702d0eb56 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -21,17 +21,17 @@ set(TARGET_LIBC_ENTRYPOINTS # string.h entrypoints libc.src.string.bzero + libc.src.string.memchr libc.src.string.memcpy + libc.src.string.memrchr libc.src.string.memset - libc.src.string.strcpy libc.src.string.strcat - libc.src.string.strlen - libc.src.string.strcmp - libc.src.string.memchr libc.src.string.strchr - libc.src.string.strstr + libc.src.string.strcmp + libc.src.string.strcpy + libc.src.string.strlen libc.src.string.strnlen - libc.src.string.memrchr + libc.src.string.strstr # sys/mman.h entrypoints libc.src.sys.mman.mmap From 1e77b3af125e4d1ba79e5d502b959d49f3addef9 Mon Sep 17 00:00:00 2001 From: Gui Andrade Date: Fri, 24 Jul 2020 20:00:02 +0000 Subject: [PATCH 0041/1035] [MSAN] Allow inserting array checks Flattens arrays by ORing together all their elements. 
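For example, the shadow of a [2 x i24] value is collapsed into a single
comparable scalar roughly as follows (illustrative IR, mirroring the new
check-array.ll test):

  %s0 = extractvalue [2 x i24] %shadow, 0
  %s1 = extractvalue [2 x i24] %shadow, 1
  %or = or i24 %s0, %s1
  %cmp = icmp ne i24 %or, 0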
Differential Revision: https://reviews.llvm.org/D84446 --- .../Instrumentation/MemorySanitizer.cpp | 79 +++++++++++-------- .../MemorySanitizer/check-array.ll | 19 +++++ 2 files changed, 65 insertions(+), 33 deletions(-) create mode 100644 llvm/test/Instrumentation/MemorySanitizer/check-array.ll diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index f4f62a31d89ee..427abde4277d4 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -1149,36 +1149,30 @@ struct MemorySanitizerVisitor : public InstVisitor { const DataLayout &DL = F.getParent()->getDataLayout(); const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment); unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType()); - if (Shadow->getType()->isArrayTy()) { - paintOrigin(IRB, updateOrigin(Origin, IRB), OriginPtr, StoreSize, - OriginAlignment); - } else { - Value *ConvertedShadow = convertShadowToScalar(Shadow, IRB); - if (auto *ConstantShadow = dyn_cast(ConvertedShadow)) { - if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) - paintOrigin(IRB, updateOrigin(Origin, IRB), OriginPtr, StoreSize, - OriginAlignment); - return; - } - - unsigned TypeSizeInBits = - DL.getTypeSizeInBits(ConvertedShadow->getType()); - unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits); - if (AsCall && SizeIndex < kNumberOfAccessSizes && !MS.CompileKernel) { - FunctionCallee Fn = MS.MaybeStoreOriginFn[SizeIndex]; - Value *ConvertedShadow2 = IRB.CreateZExt( - ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex))); - IRB.CreateCall(Fn, {ConvertedShadow2, - IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), - Origin}); - } else { - Value *Cmp = convertToBool(ConvertedShadow, IRB, "_mscmp"); - Instruction *CheckTerm = SplitBlockAndInsertIfThen( - Cmp, &*IRB.GetInsertPoint(), false, MS.OriginStoreWeights); - IRBuilder<> IRBNew(CheckTerm); - paintOrigin(IRBNew, updateOrigin(Origin, IRBNew), OriginPtr, StoreSize, + Value *ConvertedShadow = convertShadowToScalar(Shadow, IRB); + if (auto *ConstantShadow = dyn_cast(ConvertedShadow)) { + if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) + paintOrigin(IRB, updateOrigin(Origin, IRB), OriginPtr, StoreSize, OriginAlignment); - } + return; + } + + unsigned TypeSizeInBits = DL.getTypeSizeInBits(ConvertedShadow->getType()); + unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits); + if (AsCall && SizeIndex < kNumberOfAccessSizes && !MS.CompileKernel) { + FunctionCallee Fn = MS.MaybeStoreOriginFn[SizeIndex]; + Value *ConvertedShadow2 = + IRB.CreateZExt(ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex))); + IRB.CreateCall(Fn, + {ConvertedShadow2, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), Origin}); + } else { + Value *Cmp = convertToBool(ConvertedShadow, IRB, "_mscmp"); + Instruction *CheckTerm = SplitBlockAndInsertIfThen( + Cmp, &*IRB.GetInsertPoint(), false, MS.OriginStoreWeights); + IRBuilder<> IRBNew(CheckTerm); + paintOrigin(IRBNew, updateOrigin(Origin, IRBNew), OriginPtr, StoreSize, + OriginAlignment); } } @@ -1410,12 +1404,31 @@ struct MemorySanitizerVisitor : public InstVisitor { return Aggregator; } + // Extract combined shadow of array elements + Value *collapseArrayShadow(ArrayType *Array, Value *Shadow, + IRBuilder<> &IRB) { + if (!Array->getNumElements()) + return IRB.getIntN(/* width */ 1, /* value */ 0); + + Value *FirstItem = IRB.CreateExtractValue(Shadow, 0); + Value *Aggregator = 
convertShadowToScalar(FirstItem, IRB); + + for (unsigned Idx = 1; Idx < Array->getNumElements(); Idx++) { + Value *ShadowItem = IRB.CreateExtractValue(Shadow, Idx); + Value *ShadowInner = convertShadowToScalar(ShadowItem, IRB); + Aggregator = IRB.CreateOr(Aggregator, ShadowInner); + } + return Aggregator; + } + /// Convert a shadow value to its flattened variant. The resulting /// shadow may not necessarily have the same bit width as the input /// value, but it will always be comparable to zero. Value *convertShadowToScalar(Value *V, IRBuilder<> &IRB) { if (StructType *Struct = dyn_cast<StructType>(V->getType())) return collapseStructShadow(Struct, V, IRB); + if (ArrayType *Array = dyn_cast<ArrayType>(V->getType())) + return collapseArrayShadow(Array, V, IRB); Type *Ty = V->getType(); Type *NoVecTy = getShadowTyNoVec(Ty); if (Ty == NoVecTy) return V; @@ -1765,10 +1778,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (!InsertChecks) return; #ifndef NDEBUG Type *ShadowTy = Shadow->getType(); - assert( - (isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy) || - isa<StructType>(ShadowTy)) && - "Can only insert checks for integer, vector, and struct shadow types"); + assert((isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy) || + isa<StructType>(ShadowTy) || isa<ArrayType>(ShadowTy)) && + "Can only insert checks for integer, vector, and aggregate shadow " + "types"); #endif InstrumentationList.push_back( ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns)); diff --git a/llvm/test/Instrumentation/MemorySanitizer/check-array.ll b/llvm/test/Instrumentation/MemorySanitizer/check-array.ll new file mode 100644 index 0000000000000..6d1e517a186d4 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/check-array.ll @@ -0,0 +1,19 @@ +; RUN: opt < %s -msan-eager-checks -msan-check-access-address=0 -msan-track-origins=1 -S -passes='module(msan-module),function(msan)' 2>&1 | \ +; RUN: FileCheck -allow-deprecated-dag-overlap -check-prefixes=CHECK,CHECK-ORIGINS %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define noundef [2 x i24] @check_array([2 x i24]* %p) sanitize_memory { ; CHECK: @check_array([2 x i24]* [[P:%.*]]) ; CHECK: [[O:%.*]] = load [2 x i24], [2 x i24]* [[P]] + %o = load [2 x i24], [2 x i24]* %p ; CHECK: [[FIELD0:%.+]] = extractvalue [2 x i24] %_msld, 0 ; CHECK: [[FIELD1:%.+]] = extractvalue [2 x i24] %_msld, 1 ; CHECK: [[F1_OR:%.+]] = or i24 [[FIELD0]], [[FIELD1]] ; CHECK: %_mscmp = icmp ne i24 [[F1_OR]], 0 ; CHECK: br i1 %_mscmp ; CHECK: call void @__msan_warning ; CHECK: ret [2 x i24] [[O]] + ret [2 x i24] %o +} From 0db2934b0fa9e00ac98e2cb168adba96f6bcd0da Mon Sep 17 00:00:00 2001 From: shafik Date: Fri, 24 Jul 2020 13:11:59 -0700 Subject: [PATCH 0042/1035] [ASTImporter] Modify ImportDefinition for ObjCInterfaceDecl so that we always ImportDeclContext once we start the definition Once we start the definition of an ObjCInterfaceDecl, we won't attempt to ImportDeclContext later on. Unlike the RecordDecl case, which uses DefinitionCompleter to force completeDefinition, we don't seem to have a similar mechanism for ObjCInterfaceDecl. This fix was needed due to a bug we see in LLDB expression parsing, where an initial expression causes an ObjCInterfaceDecl to be defined and subsequent expressions during import do not call ImportDeclContext; we can end up in a situation where ivars are imported out of order and not all ivars are imported.
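For readers unfamiliar with the import flow, here is a hedged sketch of driving ASTImporter in the minimal-import mode LLDB uses (our own example, not part of the patch; it uses a plain record for brevity, while the fixed code path is the ObjCInterfaceDecl one, and the exact clang tooling signatures are assumed from this era's tree). With MinimalImport=true, members are normally imported lazily; the fix makes an ObjCInterfaceDecl pull in its DeclContext whenever its definition is imported, so ivars arrive complete and in source order.

  // Sketch: importing a definition between two ASTs with minimal import.
  #include "clang/AST/ASTImporter.h"
  #include "clang/Frontend/ASTUnit.h"
  #include "clang/Tooling/Tooling.h"
  #include "llvm/Support/Error.h"
  #include "llvm/Support/raw_ostream.h"

  int main() {
    std::unique_ptr<clang::ASTUnit> FromAST =
        clang::tooling::buildASTFromCode("struct S { int a; int b; };");
    std::unique_ptr<clang::ASTUnit> ToAST = clang::tooling::buildASTFromCode("");

    clang::ASTImporter Importer(
        ToAST->getASTContext(), ToAST->getFileManager(),
        FromAST->getASTContext(), FromAST->getFileManager(),
        /*MinimalImport=*/true); // LLDB-style lazy member import

    for (clang::Decl *D :
         FromAST->getASTContext().getTranslationUnitDecl()->decls()) {
      // ImportDefinition is the entry point patched above; restrict the call
      // to definition-bearing decls (TagDecl here, ObjCInterfaceDecl in LLDB).
      if (auto *Tag = llvm::dyn_cast<clang::TagDecl>(D))
        if (llvm::Error Err = Importer.ImportDefinition(Tag))
          llvm::logAllUnhandledErrors(std::move(Err), llvm::errs());
    }
  }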
Differential Revision: https://reviews.llvm.org/D83972 --- clang/lib/AST/ASTImporter.cpp | 9 ++++----- .../API/lang/objc/bitfield_ivars/TestBitfieldIvars.py | 5 ++--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index fcfaba625a722..e0bca8f08bb41 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -4758,11 +4758,10 @@ Error ASTNodeImporter::ImportDefinition( return ToImplOrErr.takeError(); } - if (shouldForceImportDeclContext(Kind)) { - // Import all of the members of this class. - if (Error Err = ImportDeclContext(From, /*ForceImport=*/true)) - return Err; - } + // Import all of the members of this class. + if (Error Err = ImportDeclContext(From, /*ForceImport=*/true)) + return Err; + return Error::success(); } diff --git a/lldb/test/API/lang/objc/bitfield_ivars/TestBitfieldIvars.py b/lldb/test/API/lang/objc/bitfield_ivars/TestBitfieldIvars.py index 6118854131024..4154bb144b350 100644 --- a/lldb/test/API/lang/objc/bitfield_ivars/TestBitfieldIvars.py +++ b/lldb/test/API/lang/objc/bitfield_ivars/TestBitfieldIvars.py @@ -14,9 +14,8 @@ def test(self): lldbutil.run_to_source_breakpoint(self, "// break here", lldb.SBFileSpec("main.m")) self.expect_expr("chb->hb->field1", result_type="unsigned int", result_value="0") - - ## FIXME field2 should have a value of 1 - self.expect("expr chb->hb->field2", matching=False, substrs = ["= 1"]) # this must happen second + ## This should happen second + self.expect_expr("chb->hb->field2", result_type="unsigned int", result_value="1") self.expect_expr("hb2->field1", result_type="unsigned int", result_value="10") self.expect_expr("hb2->field2", result_type="unsigned int", result_value="3") From dfa267a61c2b797b8fe9c345ee94742d496b39c6 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Fri, 24 Jul 2020 13:15:53 -0700 Subject: [PATCH 0043/1035] [mlir][shape] Fix missing dependency --- mlir/lib/Dialect/Shape/IR/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/lib/Dialect/Shape/IR/CMakeLists.txt b/mlir/lib/Dialect/Shape/IR/CMakeLists.txt index 84e085fbafdfa..e39f1c770f29f 100644 --- a/mlir/lib/Dialect/Shape/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/Shape/IR/CMakeLists.txt @@ -17,4 +17,5 @@ add_mlir_dialect_library(MLIRShape MLIRInferTypeOpInterface MLIRIR MLIRSideEffectInterfaces + MLIRStandardOps ) From 9c87466c3960145eb5baa2881d4239de1dcfbc6f Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Thu, 16 Jul 2020 16:55:09 -0500 Subject: [PATCH 0044/1035] [OpenMP] Use `abort` not `error` for fatal runtime exceptions See PR46515 for the rationale, but generally we want to *really* abort, not gracefully shut down.
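As a minimal stand-alone illustration of the difference (our example, not libomptarget code): exit(1) performs a graceful shutdown, running atexit handlers and flushing streams, whereas abort() raises SIGABRT, so the process terminates abnormally and the faulting state stays visible to debuggers, core dumps, and harnesses that trap the signal.

  // exit(1) vs. abort() on a fatal-error path.
  #include <cstdio>
  #include <cstdlib>

  int main() {
    std::atexit([] { std::puts("atexit handler ran (graceful shutdown)"); });
    std::fputs("fatal runtime error\n", stderr);
  #ifdef GRACEFUL
    std::exit(1);  // still runs the atexit handler above
  #else
    std::abort();  // SIGABRT: no handlers run, abnormal termination
  #endif
  }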
Reviewed By: grokos, ABataev Differential Revision: https://reviews.llvm.org/D83963 --- openmp/libomptarget/src/private.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h index 7772175cdca2e..ba2f161c7927a 100644 --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -93,17 +93,17 @@ typedef int (*TargetDataFuncPtrTy)(DeviceTy &, int32_t, void **, void **, fprintf(stderr, "Libomptarget message: " _str "\n", __VA_ARGS__); \ } while (0) -#define FATAL_MESSAGE0(_num, _str) \ - do { \ - fprintf(stderr, "Libomptarget fatal error %d: %s\n", _num, _str); \ - exit(1); \ +#define FATAL_MESSAGE0(_num, _str) \ + do { \ + fprintf(stderr, "Libomptarget fatal error %d: %s\n", _num, _str); \ + abort(); \ } while (0) -#define FATAL_MESSAGE(_num, _str, ...) \ - do { \ - fprintf(stderr, "Libomptarget fatal error %d:" _str "\n", _num, \ - __VA_ARGS__); \ - exit(1); \ +#define FATAL_MESSAGE(_num, _str, ...) \ + do { \ + fprintf(stderr, "Libomptarget fatal error %d:" _str "\n", _num, \ + __VA_ARGS__); \ + abort(); \ } while (0) // Implemented in libomp, they are called from within __tgt_* functions. From ce2d69b5577314ee97504876381273bbcddca3c5 Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Fri, 24 Jul 2020 13:10:55 -0500 Subject: [PATCH 0045/1035] [SROA][Mem2Reg] Do not crash on alloca + addrspacecast SROA knows that it can look through addrspacecast, but PromoteMemoryToRegister did not handle such casts. This caused an assertion error for the test case, exposed while running `Transforms/PhaseOrdering/inlining-alignment-assumptions.ll` with D83978 applied. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D84085 --- .../Utils/PromoteMemoryToRegister.cpp | 3 +++ .../Transforms/Mem2Reg/alloca_addrspace.ll | 19 +++++++++++++++++++ .../Transforms/SROA/alloca-address-space.ll | 13 +++++++++++++ 3 files changed, 35 insertions(+) create mode 100644 llvm/test/Transforms/Mem2Reg/alloca_addrspace.ll diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index c7e9c919ec471..f079f81a6e8f5 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -95,6 +95,9 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) { return false; if (!onlyUsedByLifetimeMarkers(GEPI)) return false; + } else if (const AddrSpaceCastInst *ASCI = dyn_cast<AddrSpaceCastInst>(U)) { + if (!onlyUsedByLifetimeMarkers(ASCI)) + return false; } else { return false; } diff --git a/llvm/test/Transforms/Mem2Reg/alloca_addrspace.ll b/llvm/test/Transforms/Mem2Reg/alloca_addrspace.ll new file mode 100644 index 0000000000000..95db828fd117e --- /dev/null +++ b/llvm/test/Transforms/Mem2Reg/alloca_addrspace.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mem2reg -S | FileCheck %s +; RUN: opt < %s -passes=mem2reg -S | FileCheck %s + +; Do not crash ;) + +target datalayout = "e-p:64:64-p5:32:32-A5" + +define amdgpu_kernel void @addressspace_alloca() { +; CHECK-LABEL: @addressspace_alloca( +; CHECK-NEXT: ret void +; + %alloca = alloca i8, align 8, addrspace(5) + %cast = addrspacecast i8 addrspace(5)* %alloca to i8* + call void @llvm.lifetime.start.p0i8(i64 2, i8* %cast) + ret void +} + +declare void @llvm.lifetime.start.p0i8(i64 %size, i8* nocapture %ptr) diff --git a/llvm/test/Transforms/SROA/alloca-address-space.ll
b/llvm/test/Transforms/SROA/alloca-address-space.ll index c8e22f5b25d8a..2ec59e66b9da6 100644 --- a/llvm/test/Transforms/SROA/alloca-address-space.ll +++ b/llvm/test/Transforms/SROA/alloca-address-space.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -sroa -S | FileCheck %s +; RUN: opt < %s -passes=sroa -S | FileCheck %s target datalayout = "e-p:64:64:64-p1:16:16:16-p2:32:32-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64-A2" declare void @llvm.memcpy.p2i8.p2i8.i32(i8 addrspace(2)* nocapture, i8 addrspace(2)* nocapture readonly, i32, i1) @@ -127,3 +128,15 @@ define void @test_load_store_diff_addr_space([2 x float] addrspace(1)* %complex1 store i64 %v2, i64 addrspace(1)* %p2 ret void } + +define void @addressspace_alloca_lifetime() { +; CHECK-LABEL: @addressspace_alloca_lifetime( +; CHECK-NEXT: ret void +; + %alloca = alloca i8, align 8, addrspace(2) + %cast = addrspacecast i8 addrspace(2)* %alloca to i8* + call void @llvm.lifetime.start.p0i8(i64 2, i8* %cast) + ret void +} + +declare void @llvm.lifetime.start.p0i8(i64 %size, i8* nocapture %ptr) From ce8928f2e4e5e9d192f2c603b37d32da92214ab1 Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Fri, 24 Jul 2020 13:11:19 -0500 Subject: [PATCH 0046/1035] [Mem2Reg] Teach promote to register about droppable instructions This is the first of two patches to address PR46753. We basically allow mem2reg to promote allocas that are used in droppable instructions; for now that means `llvm.assume`. The uses of the alloca (or a bitcast or zero-offset GEP from there) are replaced by `undef` in the droppable instructions. Reviewed By: Tyker Differential Revision: https://reviews.llvm.org/D83976 --- llvm/include/llvm/Analysis/ValueTracking.h | 4 + llvm/include/llvm/IR/Value.h | 3 + llvm/lib/Analysis/ValueTracking.cpp | 25 ++++-- llvm/lib/IR/Value.cpp | 4 + .../Utils/PromoteMemoryToRegister.cpp | 46 +++++++--- .../Transforms/Mem2Reg/ignore-droppable.ll | 85 +++++++++++++++++++ 6 files changed, 149 insertions(+), 18 deletions(-) create mode 100644 llvm/test/Transforms/Mem2Reg/ignore-droppable.ll diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index ce5aea2e8d34d..4d81bb692b47c 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -416,6 +416,10 @@ class Value; /// Return true if the only users of this pointer are lifetime markers. bool onlyUsedByLifetimeMarkers(const Value *V); + /// Return true if the only users of this pointer are lifetime markers or + /// droppable instructions. + bool onlyUsedByLifetimeMarkersOrDroppableInsts(const Value *V); + /// Return true if speculation of the given load must be suppressed to avoid /// ordering or interfering with an active sanitizer. If not suppressed, /// dereferenceability and alignment must be proven separately. Note: This diff --git a/llvm/include/llvm/IR/Value.h b/llvm/include/llvm/IR/Value.h index 04ca682746264..fa706a1b64086 100644 --- a/llvm/include/llvm/IR/Value.h +++ b/llvm/include/llvm/IR/Value.h @@ -470,6 +470,9 @@ class Value { void dropDroppableUses(llvm::function_ref<bool(const Use *)> ShouldDrop = [](const Use *) { return true; }); + /// Remove every use of \p User that can safely be removed. + void dropDroppableUsesByUser(const User &Usr); + /// Check if this value is used in the specified basic block.
bool isUsedInBasicBlock(const BasicBlock *BB) const; diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 116916a9be2d2..271200f7030a2 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -4302,18 +4302,33 @@ bool llvm::getUnderlyingObjectsForCodeGen(const Value *V, return true; } -/// Return true if the only users of this pointer are lifetime markers. -bool llvm::onlyUsedByLifetimeMarkers(const Value *V) { +static bool onlyUsedByLifetimeMarkersOrDroppableInstsHelper( + const Value *V, bool AllowLifetime, bool AllowDroppable) { for (const User *U : V->users()) { const IntrinsicInst *II = dyn_cast(U); - if (!II) return false; - - if (!II->isLifetimeStartOrEnd()) + if (!II) return false; + + if (AllowLifetime && II->isLifetimeStartOrEnd()) + continue; + + if (AllowDroppable && II->isDroppable()) + continue; + + return false; } return true; } +bool llvm::onlyUsedByLifetimeMarkers(const Value *V) { + return onlyUsedByLifetimeMarkersOrDroppableInstsHelper( + V, /* AllowLifetime */ true, /* AllowDroppable */ false); +} +bool llvm::onlyUsedByLifetimeMarkersOrDroppableInsts(const Value *V) { + return onlyUsedByLifetimeMarkersOrDroppableInstsHelper( + V, /* AllowLifetime */ true, /* AllowDroppable */ true); +} + bool llvm::mustSuppressSpeculation(const LoadInst &LI) { if (!LI.isUnordered()) return true; diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index efb8d53e8964b..8c1f9c5a3b36f 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -192,6 +192,10 @@ void Value::dropDroppableUses( } } +void Value::dropDroppableUsesByUser(const User &Usr) { + dropDroppableUses([&](const Use *U) { return U->getUser() == &Usr; }); +} + bool Value::isUsedInBasicBlock(const BasicBlock *BB) const { // This can be computed either by scanning the instructions in BB, or by // scanning the use list of this Value. Both lists can be very long, but diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index f079f81a6e8f5..33904e54ac237 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -62,10 +62,6 @@ STATISTIC(NumDeadAlloca, "Number of dead alloca's removed"); STATISTIC(NumPHIInsert, "Number of PHI nodes inserted"); bool llvm::isAllocaPromotable(const AllocaInst *AI) { - // FIXME: If the memory unit is of pointer or integer type, we can permit - // assignments to subsections of the memory unit. - unsigned AS = AI->getType()->getAddressSpace(); - // Only allow direct and non-volatile loads and stores... 
for (const User *U : AI->users()) { if (const LoadInst *LI = dyn_cast(U)) { @@ -81,19 +77,15 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) { if (SI->isVolatile()) return false; } else if (const IntrinsicInst *II = dyn_cast(U)) { - if (!II->isLifetimeStartOrEnd()) + if (!II->isLifetimeStartOrEnd() && !II->isDroppable()) return false; } else if (const BitCastInst *BCI = dyn_cast(U)) { - if (BCI->getType() != Type::getInt8PtrTy(U->getContext(), AS)) - return false; - if (!onlyUsedByLifetimeMarkers(BCI)) + if (!onlyUsedByLifetimeMarkersOrDroppableInsts(BCI)) return false; } else if (const GetElementPtrInst *GEPI = dyn_cast(U)) { - if (GEPI->getType() != Type::getInt8PtrTy(U->getContext(), AS)) - return false; if (!GEPI->hasAllZeroIndices()) return false; - if (!onlyUsedByLifetimeMarkers(GEPI)) + if (!onlyUsedByLifetimeMarkersOrDroppableInsts(GEPI)) return false; } else if (const AddrSpaceCastInst *ASCI = dyn_cast(U)) { if (!onlyUsedByLifetimeMarkers(ASCI)) @@ -315,16 +307,38 @@ static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) { AC->registerAssumption(CI); } -static void removeLifetimeIntrinsicUsers(AllocaInst *AI) { +static void removeIntrinsicUsers(AllocaInst *AI) { // Knowing that this alloca is promotable, we know that it's safe to kill all // instructions except for load and store. + // Helper to drop the uses of \p I in \p UserI. + auto DropUsesIn = [](Instruction *UserI, Instruction *I, + Instruction::user_iterator &UI, + const Instruction::user_iterator &UE) { + // TODO For now we forget assumed information, this can be improved. + assert(isa(UserI) && + cast(UserI)->getIntrinsicID() == Intrinsic::assume && + "Expected assume"); + + // Skip ahead if User has multiple uses of I. + while (UI != UE && *UI == UserI) + ++UI; + + I->dropDroppableUsesByUser(*UserI); + }; + for (auto UI = AI->user_begin(), UE = AI->user_end(); UI != UE;) { Instruction *I = cast(*UI); ++UI; if (isa(I) || isa(I)) continue; + // Drop the use of AI in droppable instructions. + if (I->isDroppable()) { + DropUsesIn(I, AI, UI, UE); + continue; + } + if (!I->getType()->isVoidTy()) { // The only users of this bitcast/GEP instruction are lifetime intrinsics. // Follow the use/def chain to erase them now instead of leaving it for @@ -332,6 +346,12 @@ static void removeLifetimeIntrinsicUsers(AllocaInst *AI) { for (auto UUI = I->user_begin(), UUE = I->user_end(); UUI != UUE;) { Instruction *Inst = cast(*UUI); ++UUI; + + // Drop the use of I in droppable instructions. + if (Inst->isDroppable()) { + DropUsesIn(Inst, I, UUI, UUE); + continue; + } Inst->eraseFromParent(); } } @@ -547,7 +567,7 @@ void PromoteMem2Reg::run() { assert(AI->getParent()->getParent() == &F && "All allocas should be in the same function, which is same as DF!"); - removeLifetimeIntrinsicUsers(AI); + removeIntrinsicUsers(AI); if (AI->use_empty()) { // If there are no uses of the alloca, just delete it now. 
diff --git a/llvm/test/Transforms/Mem2Reg/ignore-droppable.ll b/llvm/test/Transforms/Mem2Reg/ignore-droppable.ll new file mode 100644 index 0000000000000..ecad226e1d0f0 --- /dev/null +++ b/llvm/test/Transforms/Mem2Reg/ignore-droppable.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mem2reg -S -o - < %s | FileCheck %s +; RUN: opt -passes=mem2reg -S -o - < %s | FileCheck %s + +declare void @llvm.assume(i1) +declare void @llvm.lifetime.start.p0i8(i64 %size, i8* nocapture %ptr) +declare void @llvm.lifetime.end.p0i8(i64 %size, i8* nocapture %ptr) + +define void @positive_assume_uses(i32* %arg) { +; CHECK-LABEL: @positive_assume_uses( +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "nonnull"(i32* [[ARG:%.*]]), "ignore"(i32* undef, i64 2) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(i32* undef, i64 8), "nonnull"(i32* [[ARG]]) ] +; CHECK-NEXT: ret void +; + %A = alloca i32 + call void @llvm.assume(i1 true) ["nonnull"(i32* %arg), "align"(i32* %A, i64 2)] + store i32 1, i32* %A + call void @llvm.assume(i1 true) ["align"(i32* %A, i64 8), "nonnull"(i32* %arg)] + ret void +} + +define void @negative_assume_condition_use() { +; CHECK-LABEL: @negative_assume_condition_use( +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = bitcast i32* [[A]] to i8* +; CHECK-NEXT: [[CND:%.*]] = icmp eq i8* [[B]], null +; CHECK-NEXT: call void @llvm.assume(i1 [[CND]]) +; CHECK-NEXT: store i32 1, i32* [[A]], align 4 +; CHECK-NEXT: ret void +; + %A = alloca i32 + %B = bitcast i32* %A to i8* + %cnd = icmp eq i8* %B, null + call void @llvm.assume(i1 %cnd) + store i32 1, i32* %A + ret void +} + +define void @positive_multiple_assume_uses() { +; CHECK-LABEL: @positive_multiple_assume_uses( +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"({ i8, i16 }* undef, i64 8), "ignore"({ i8, i16 }* undef, i64 16) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"({ i8, i16 }* undef), "ignore"({ i8, i16 }* undef, i64 2) ] +; CHECK-NEXT: ret void +; + %A = alloca {i8, i16} + call void @llvm.assume(i1 true) ["align"({i8, i16}* %A, i64 8), "align"({i8, i16}* %A, i64 16)] + store {i8, i16} zeroinitializer, {i8, i16}* %A + call void @llvm.assume(i1 true) ["nonnull"({i8, i16}* %A), "align"({i8, i16}* %A, i64 2)] + ret void +} + +define void @positive_gep_assume_uses() { +; CHECK-LABEL: @positive_gep_assume_uses( +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(i8* undef, i64 8), "ignore"(i8* undef, i64 16) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(i8* undef), "ignore"(i8* undef, i64 2) ] +; CHECK-NEXT: ret void +; + %A = alloca {i8, i16} + %B = getelementptr {i8, i16}, {i8, i16}* %A, i32 0, i32 0 + call void @llvm.lifetime.start.p0i8(i64 2, i8* %B) + call void @llvm.assume(i1 true) ["align"(i8* %B, i64 8), "align"(i8* %B, i64 16)] + store {i8, i16} zeroinitializer, {i8, i16}* %A + call void @llvm.lifetime.end.p0i8(i64 2, i8* %B) + call void @llvm.assume(i1 true) ["nonnull"(i8* %B), "align"(i8* %B, i64 2)] + ret void +} + +define void @positive_mixed_assume_uses() { +; CHECK-LABEL: @positive_mixed_assume_uses( +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(i8* undef), "ignore"(i8* undef, i64 8), "ignore"(i8* undef, i64 16) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(i8* undef), "ignore"(i8* undef, i64 2), "ignore"(i8* undef) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(i32* undef), "ignore"(i32* undef, i64 2), "ignore"(i8* undef) ] +; CHECK-NEXT: ret void +; + %A = 
alloca i8 + %B = getelementptr i8, i8* %A, i32 0 + %C = bitcast i8* %A to i32* + call void @llvm.lifetime.start.p0i8(i64 2, i8* %B) + call void @llvm.assume(i1 true) ["nonnull"(i8* %B), "align"(i8* %A, i64 8), "align"(i8* %B, i64 16)] + store i8 1, i8* %A + call void @llvm.lifetime.end.p0i8(i64 2, i8* %B) + call void @llvm.assume(i1 true) ["nonnull"(i8* %B), "align"(i8* %A, i64 2), "nonnull"(i8* %A)] + call void @llvm.assume(i1 true) ["nonnull"(i32* %C), "align"(i32* %C, i64 2), "nonnull"(i8* %A)] + ret void +} From aa09db495a9b40de456e196c0deb6b98324779b9 Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Fri, 24 Jul 2020 14:06:27 -0500 Subject: [PATCH 0047/1035] [SROA] Teach promote to register about droppable instructions This is the second of two patches to address PR46753. We basically allow SROA to promote allocas that are used in droppable instructions; for now that means `llvm.assume`. The (transitive) uses are replaced by `undef` in the droppable instructions. See also D83976. Reviewed By: Tyker Differential Revision: https://reviews.llvm.org/D83978 --- llvm/lib/Transforms/Scalar/SROA.cpp | 21 +++-- llvm/test/Transforms/SROA/ignore-droppable.ll | 88 +++++++++++++++++++ 2 files changed, 104 insertions(+), 5 deletions(-) create mode 100644 llvm/test/Transforms/SROA/ignore-droppable.ll diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 89f324deef9fd..8eb2853347bdb 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -920,6 +920,9 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> { // FIXME: What about debug intrinsics? This matches old behavior, but // doesn't make sense. void visitIntrinsicInst(IntrinsicInst &II) { + if (II.isDroppable()) + return; + if (!IsOffsetKnown) return PI.setAborted(&II); @@ -1825,7 +1828,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, if (!S.isSplittable()) return false; // Skip any unsplittable intrinsics. } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { - if (!II->isLifetimeStartOrEnd()) + if (!II->isLifetimeStartOrEnd() && !II->isDroppable()) return false; } else if (U->get()->getType()->getPointerElementType()->isStructTy()) { // Disable vector promotion when there are loads or stores of an FCA. @@ -2058,7 +2061,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S, if (!S.isSplittable()) return false; // Skip any unsplittable intrinsics. } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { - if (!II->isLifetimeStartOrEnd() ) + if (!II->isLifetimeStartOrEnd() && !II->isDroppable()) return false; } else { return false; @@ -2778,7 +2781,7 @@ class llvm::sroa::AllocaSliceRewriter Type *AllocaTy = NewAI.getAllocatedType(); Type *ScalarTy = AllocaTy->getScalarType(); - + const bool CanContinue = [&]() { if (VecTy || IntTy) return true; @@ -3074,13 +3077,21 @@ class llvm::sroa::AllocaSliceRewriter } bool visitIntrinsicInst(IntrinsicInst &II) { - assert(II.isLifetimeStartOrEnd()); + assert((II.isLifetimeStartOrEnd() || II.isDroppable()) && + "Unexpected intrinsic!"); LLVM_DEBUG(dbgs() << " original: " << II << "\n"); - assert(II.getArgOperand(1) == OldPtr); // Record this instruction for deletion. Pass.DeadInsts.insert(&II); + if (II.isDroppable()) { + assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume"); + // TODO For now we forget assumed information, this can be improved.
+ OldPtr->dropDroppableUsesByUser(II); + return true; + } + + assert(II.getArgOperand(1) == OldPtr); // Lifetime intrinsics are only promotable if they cover the whole alloca. // Therefore, we drop lifetime intrinsics which don't cover the whole // alloca. diff --git a/llvm/test/Transforms/SROA/ignore-droppable.ll b/llvm/test/Transforms/SROA/ignore-droppable.ll new file mode 100644 index 0000000000000..85f3d8a5f39f4 --- /dev/null +++ b/llvm/test/Transforms/SROA/ignore-droppable.ll @@ -0,0 +1,88 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -sroa -S -o - < %s | FileCheck %s +; RUN: opt -passes=sroa -S -o - < %s | FileCheck %s + +declare void @llvm.assume(i1) +declare void @llvm.lifetime.start.p0i8(i64 %size, i8* nocapture %ptr) +declare void @llvm.lifetime.end.p0i8(i64 %size, i8* nocapture %ptr) + +define void @positive_assume_uses(i32* %arg) { +; CHECK-LABEL: @positive_assume_uses( +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "nonnull"(i32* [[ARG:%.*]]), "ignore"(i32* undef, i64 2) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(i32* undef, i64 8), "nonnull"(i32* [[ARG]]) ] +; CHECK-NEXT: ret void +; + %A = alloca i32 + call void @llvm.assume(i1 true) ["nonnull"(i32* %arg), "align"(i32* %A, i64 2)] + store i32 1, i32* %A + call void @llvm.assume(i1 true) ["align"(i32* %A, i64 8), "nonnull"(i32* %arg)] + ret void +} + +define void @negative_assume_condition_use() { +; CHECK-LABEL: @negative_assume_condition_use( +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = bitcast i32* [[A]] to i8* +; CHECK-NEXT: [[CND:%.*]] = icmp eq i8* [[B]], null +; CHECK-NEXT: call void @llvm.assume(i1 [[CND]]) +; CHECK-NEXT: store i32 1, i32* [[A]], align 4 +; CHECK-NEXT: ret void +; + %A = alloca i32 + %B = bitcast i32* %A to i8* + %cnd = icmp eq i8* %B, null + call void @llvm.assume(i1 %cnd) + store i32 1, i32* %A + ret void +} + +define void @positive_multiple_assume_uses() { +; CHECK-LABEL: @positive_multiple_assume_uses( +; CHECK-NEXT: [[A:%.*]] = alloca { i8, i16 }, align 8 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"({ i8, i16 }* [[A]], i64 8), "align"({ i8, i16 }* [[A]], i64 16) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "nonnull"({ i8, i16 }* [[A]]), "align"({ i8, i16 }* [[A]], i64 2) ] +; CHECK-NEXT: ret void +; + %A = alloca {i8, i16} + call void @llvm.assume(i1 true) ["align"({i8, i16}* %A, i64 8), "align"({i8, i16}* %A, i64 16)] + store {i8, i16} zeroinitializer, {i8, i16}* %A + call void @llvm.assume(i1 true) ["nonnull"({i8, i16}* %A), "align"({i8, i16}* %A, i64 2)] + ret void +} + +define void @positive_gep_assume_uses() { +; CHECK-LABEL: @positive_gep_assume_uses( +; CHECK-NEXT: [[A:%.*]] = alloca { i8, i16 }, align 8 +; CHECK-NEXT: [[B:%.*]] = getelementptr { i8, i16 }, { i8, i16 }* [[A]], i32 0, i32 0 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8* [[B]], i64 8), "align"(i8* [[B]], i64 16) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "nonnull"(i8* [[B]]), "align"(i8* [[B]], i64 2) ] +; CHECK-NEXT: ret void +; + %A = alloca {i8, i16} + %B = getelementptr {i8, i16}, {i8, i16}* %A, i32 0, i32 0 + call void @llvm.lifetime.start.p0i8(i64 2, i8* %B) + call void @llvm.assume(i1 true) ["align"(i8* %B, i64 8), "align"(i8* %B, i64 16)] + store {i8, i16} zeroinitializer, {i8, i16}* %A + call void @llvm.lifetime.end.p0i8(i64 2, i8* %B) + call void @llvm.assume(i1 true) ["nonnull"(i8* %B), "align"(i8* %B, i64 2)] + ret void +} + +define void @positive_mixed_assume_uses() { +; 
CHECK-LABEL: @positive_mixed_assume_uses( +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(i8* undef), "ignore"(i8* undef, i64 8), "ignore"(i8* undef, i64 16) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(i8* undef), "ignore"(i8* undef, i64 2), "ignore"(i8* undef) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(i32* undef), "ignore"(i32* undef, i64 2), "ignore"(i8* undef) ] +; CHECK-NEXT: ret void +; + %A = alloca i8 + %B = getelementptr i8, i8* %A, i32 0 + %C = bitcast i8* %A to i32* + call void @llvm.lifetime.start.p0i8(i64 2, i8* %B) + call void @llvm.assume(i1 true) ["nonnull"(i8* %B), "align"(i8* %A, i64 8), "align"(i8* %B, i64 16)] + store i8 1, i8* %A + call void @llvm.lifetime.end.p0i8(i64 2, i8* %B) + call void @llvm.assume(i1 true) ["nonnull"(i8* %B), "align"(i8* %A, i64 2), "nonnull"(i8* %A)] + call void @llvm.assume(i1 true) ["nonnull"(i32* %C), "align"(i32* %C, i64 2), "nonnull"(i8* %A)] + ret void +} From c02aa53ecb25189bfdecd852a251e1c17ed0ee24 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Thu, 23 Jul 2020 12:52:46 -0700 Subject: [PATCH 0048/1035] [AArch64][SVE] Add "fast" fcmp operations. dacf8d3 added support for most fcmp operations, but there are some extra variations I hadn't considered: SelectionDAG supports float comparisons that are neither ordered nor unordered. Add support for the missing operations. Differential Revision: https://reviews.llvm.org/D84460 --- .../Target/AArch64/AArch64ISelLowering.cpp | 2 + .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 21 +++++-- llvm/test/CodeGen/AArch64/sve-fcmp.ll | 55 +++++++++++++++++++ 3 files changed, 74 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index cb5530077fdd7..f080abd8e6271 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -230,7 +230,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, MVT::nxv2f64 }) { setCondCodeAction(ISD::SETO, VT, Expand); setCondCodeAction(ISD::SETOLT, VT, Expand); + setCondCodeAction(ISD::SETLT, VT, Expand); setCondCodeAction(ISD::SETOLE, VT, Expand); + setCondCodeAction(ISD::SETLE, VT, Expand); setCondCodeAction(ISD::SETULT, VT, Expand); setCondCodeAction(ISD::SETULE, VT, Expand); setCondCodeAction(ISD::SETUGE, VT, Expand); diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index dc501a9536b9a..7c39268a4441f 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -210,6 +210,19 @@ def index_vector : SDNode<"AArch64ISD::INDEX_VECTOR", SDT_IndexVector, []>; def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>; +def setoge_or_setge : PatFrags<(ops node:$lhs, node:$rhs), + [(setoge node:$lhs, node:$rhs), + (setge node:$lhs, node:$rhs)]>; +def setogt_or_setgt : PatFrags<(ops node:$lhs, node:$rhs), + [(setogt node:$lhs, node:$rhs), + (setgt node:$lhs, node:$rhs)]>; +def setoeq_or_seteq : PatFrags<(ops node:$lhs, node:$rhs), + [(setoeq node:$lhs, node:$rhs), + (seteq node:$lhs, node:$rhs)]>; +def setone_or_setne : PatFrags<(ops node:$lhs, node:$rhs), + [(setone node:$lhs, node:$rhs), + (setne node:$lhs, node:$rhs)]>; + let Predicates = [HasSVE] in { defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>; def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">; @@ -1172,10 +1185,10 @@ multiclass sve_prefetch; defm CMPLS_PPzZI 
: sve_int_ucmp_vi<0b11, "cmpls", SETULE, SETUGE>; - defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, setoge>; - defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, setogt>; - defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, setoeq>; - defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, setone>; + defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, setoge_or_setge>; + defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, setogt_or_setgt>; + defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, setoeq_or_seteq>; + defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, setone_or_setne>; defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", int_aarch64_sve_fcmpuo, setuo>; defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>; defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>; diff --git a/llvm/test/CodeGen/AArch64/sve-fcmp.ll b/llvm/test/CodeGen/AArch64/sve-fcmp.ll index 86fff734f1883..3bb3627e23939 100644 --- a/llvm/test/CodeGen/AArch64/sve-fcmp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcmp.ll @@ -257,3 +257,58 @@ define @oeq_4f32_zext( %x, %y to ret %r } + +define @eq_fast( %x, %x2) { +; CHECK-LABEL: eq_fast: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ret + %y = fcmp fast oeq %x, %x2 + ret %y +} +define @gt_fast( %x, %x2) { +; CHECK-LABEL: gt_fast: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ret + %y = fcmp fast ogt %x, %x2 + ret %y +} +define @ge_fast( %x, %x2) { +; CHECK-LABEL: ge_fast: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmge p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ret + %y = fcmp fast oge %x, %x2 + ret %y +} +define @lt_fast( %x, %x2) { +; CHECK-LABEL: lt_fast: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmgt p0.s, p0/z, z1.s, z0.s +; CHECK-NEXT: ret + %y = fcmp fast olt %x, %x2 + ret %y +} +define @le_fast( %x, %x2) { +; CHECK-LABEL: le_fast: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmge p0.s, p0/z, z1.s, z0.s +; CHECK-NEXT: ret + %y = fcmp fast ole %x, %x2 + ret %y +} +define @ne_fast( %x, %x2) { +; CHECK-LABEL: ne_fast: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ret + %y = fcmp fast one %x, %x2 + ret %y +} From 986e3af53bfe591e88a1ae4f82ea1cc0a15819a3 Mon Sep 17 00:00:00 2001 From: Yifan Shen Date: Fri, 24 Jul 2020 12:45:41 -0700 Subject: [PATCH 0049/1035] Add Debug Info Size to Symbol Status Summary: If a module has debug info, the size of debug symbol will be displayed after the Symbols Loaded Message for each module in the VScode modules view.{F12335461} Reviewers: wallace, clayborg Reviewed By: wallace, clayborg Subscribers: cfe-commits, aprantl, lldb-commits Tags: #lldb, #clang Differential Revision: https://reviews.llvm.org/D83731 --- clang/tools/clang-format/git-clang-format | 585 ------------------ .../tools/lldb-vscode/lldbvscode_testcase.py | 9 +- .../API/tools/lldb-vscode/module/Makefile | 10 +- .../lldb-vscode/module/TestVSCode_module.py | 67 +- lldb/tools/lldb-vscode/JSONUtils.cpp | 61 +- 5 files changed, 121 insertions(+), 611 deletions(-) diff --git a/clang/tools/clang-format/git-clang-format b/clang/tools/clang-format/git-clang-format index f3cd585e7f4a0..e69de29bb2d1d 100755 --- 
a/clang/tools/clang-format/git-clang-format +++ b/clang/tools/clang-format/git-clang-format @@ -1,585 +0,0 @@ -#!/usr/bin/env python -# -#===- git-clang-format - ClangFormat Git Integration ---------*- python -*--===# -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#===------------------------------------------------------------------------===# - -r""" -clang-format git integration -============================ - -This file provides a clang-format integration for git. Put it somewhere in your -path and ensure that it is executable. Then, "git clang-format" will invoke -clang-format on the changes in current files or a specific commit. - -For further details, run: -git clang-format -h - -Requires Python 2.7 or Python 3 -""" - -from __future__ import absolute_import, division, print_function -import argparse -import collections -import contextlib -import errno -import os -import re -import subprocess -import sys - -usage = 'git clang-format [OPTIONS] [] [] [--] [...]' - -desc = ''' -If zero or one commits are given, run clang-format on all lines that differ -between the working directory and , which defaults to HEAD. Changes are -only applied to the working directory. - -If two commits are given (requires --diff), run clang-format on all lines in the -second that differ from the first . - -The following git-config settings set the default of the corresponding option: - clangFormat.binary - clangFormat.commit - clangFormat.extensions - clangFormat.style -''' - -# Name of the temporary index file in which save the output of clang-format. -# This file is created within the .git directory. -temp_index_basename = 'clang-format-index' - - -Range = collections.namedtuple('Range', 'start, count') - - -def main(): - config = load_git_config() - - # In order to keep '--' yet allow options after positionals, we need to - # check for '--' ourselves. (Setting nargs='*' throws away the '--', while - # nargs=argparse.REMAINDER disallows options after positionals.) 
- argv = sys.argv[1:] - try: - idx = argv.index('--') - except ValueError: - dash_dash = [] - else: - dash_dash = argv[idx:] - argv = argv[:idx] - - default_extensions = ','.join([ - # From clang/lib/Frontend/FrontendOptions.cpp, all lower case - 'c', 'h', # C - 'm', # ObjC - 'mm', # ObjC++ - 'cc', 'cp', 'cpp', 'c++', 'cxx', 'hh', 'hpp', 'hxx', # C++ - 'cu', # CUDA - # Other languages that clang-format supports - 'proto', 'protodevel', # Protocol Buffers - 'java', # Java - 'js', # JavaScript - 'ts', # TypeScript - 'cs', # C Sharp - ]) - - p = argparse.ArgumentParser( - usage=usage, formatter_class=argparse.RawDescriptionHelpFormatter, - description=desc) - p.add_argument('--binary', - default=config.get('clangformat.binary', 'clang-format'), - help='path to clang-format'), - p.add_argument('--commit', - default=config.get('clangformat.commit', 'HEAD'), - help='default commit to use if none is specified'), - p.add_argument('--diff', action='store_true', - help='print a diff instead of applying the changes') - p.add_argument('--extensions', - default=config.get('clangformat.extensions', - default_extensions), - help=('comma-separated list of file extensions to format, ' - 'excluding the period and case-insensitive')), - p.add_argument('-f', '--force', action='store_true', - help='allow changes to unstaged files') - p.add_argument('-p', '--patch', action='store_true', - help='select hunks interactively') - p.add_argument('-q', '--quiet', action='count', default=0, - help='print less information') - p.add_argument('--style', - default=config.get('clangformat.style', None), - help='passed to clang-format'), - p.add_argument('-v', '--verbose', action='count', default=0, - help='print extra information') - # We gather all the remaining positional arguments into 'args' since we need - # to use some heuristics to determine whether or not was present. - # However, to print pretty messages, we make use of metavar and help. - p.add_argument('args', nargs='*', metavar='', - help='revision from which to compute the diff') - p.add_argument('ignored', nargs='*', metavar='...', - help='if specified, only consider differences in these files') - opts = p.parse_args(argv) - - opts.verbose -= opts.quiet - del opts.quiet - - commits, files = interpret_args(opts.args, dash_dash, opts.commit) - if len(commits) > 1: - if not opts.diff: - die('--diff is required when two commits are given') - else: - if len(commits) > 2: - die('at most two commits allowed; %d given' % len(commits)) - changed_lines = compute_diff_and_extract_lines(commits, files) - if opts.verbose >= 1: - ignored_files = set(changed_lines) - filter_by_extension(changed_lines, opts.extensions.lower().split(',')) - if opts.verbose >= 1: - ignored_files.difference_update(changed_lines) - if ignored_files: - print('Ignoring changes in the following files (wrong extension):') - for filename in ignored_files: - print(' %s' % filename) - if changed_lines: - print('Running clang-format on the following files:') - for filename in changed_lines: - print(' %s' % filename) - if not changed_lines: - print('no modified files to format') - return - # The computed diff outputs absolute paths, so we must cd before accessing - # those files. 
- cd_to_toplevel() - if len(commits) > 1: - old_tree = commits[1] - new_tree = run_clang_format_and_save_to_tree(changed_lines, - revision=commits[1], - binary=opts.binary, - style=opts.style) - else: - old_tree = create_tree_from_workdir(changed_lines) - new_tree = run_clang_format_and_save_to_tree(changed_lines, - binary=opts.binary, - style=opts.style) - if opts.verbose >= 1: - print('old tree: %s' % old_tree) - print('new tree: %s' % new_tree) - if old_tree == new_tree: - if opts.verbose >= 0: - print('clang-format did not modify any files') - elif opts.diff: - print_diff(old_tree, new_tree) - else: - changed_files = apply_changes(old_tree, new_tree, force=opts.force, - patch_mode=opts.patch) - if (opts.verbose >= 0 and not opts.patch) or opts.verbose >= 1: - print('changed files:') - for filename in changed_files: - print(' %s' % filename) - - -def load_git_config(non_string_options=None): - """Return the git configuration as a dictionary. - - All options are assumed to be strings unless in `non_string_options`, in which - is a dictionary mapping option name (in lower case) to either "--bool" or - "--int".""" - if non_string_options is None: - non_string_options = {} - out = {} - for entry in run('git', 'config', '--list', '--null').split('\0'): - if entry: - if '\n' in entry: - name, value = entry.split('\n', 1) - else: - # A setting with no '=' ('\n' with --null) is implicitly 'true' - name = entry - value = 'true' - if name in non_string_options: - value = run('git', 'config', non_string_options[name], name) - out[name] = value - return out - - -def interpret_args(args, dash_dash, default_commit): - """Interpret `args` as "[commits] [--] [files]" and return (commits, files). - - It is assumed that "--" and everything that follows has been removed from - args and placed in `dash_dash`. - - If "--" is present (i.e., `dash_dash` is non-empty), the arguments to its - left (if present) are taken as commits. Otherwise, the arguments are checked - from left to right if they are commits or files. If commits are not given, - a list with `default_commit` is used.""" - if dash_dash: - if len(args) == 0: - commits = [default_commit] - else: - commits = args - for commit in commits: - object_type = get_object_type(commit) - if object_type not in ('commit', 'tag'): - if object_type is None: - die("'%s' is not a commit" % commit) - else: - die("'%s' is a %s, but a commit was expected" % (commit, object_type)) - files = dash_dash[1:] - elif args: - commits = [] - while args: - if not disambiguate_revision(args[0]): - break - commits.append(args.pop(0)) - if not commits: - commits = [default_commit] - files = args - else: - commits = [default_commit] - files = [] - return commits, files - - -def disambiguate_revision(value): - """Returns True if `value` is a revision, False if it is a file, or dies.""" - # If `value` is ambiguous (neither a commit nor a file), the following - # command will die with an appropriate error message. 
- run('git', 'rev-parse', value, verbose=False) - object_type = get_object_type(value) - if object_type is None: - return False - if object_type in ('commit', 'tag'): - return True - die('`%s` is a %s, but a commit or filename was expected' % - (value, object_type)) - - -def get_object_type(value): - """Returns a string description of an object's type, or None if it is not - a valid git object.""" - cmd = ['git', 'cat-file', '-t', value] - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate() - if p.returncode != 0: - return None - return convert_string(stdout.strip()) - - -def compute_diff_and_extract_lines(commits, files): - """Calls compute_diff() followed by extract_lines().""" - diff_process = compute_diff(commits, files) - changed_lines = extract_lines(diff_process.stdout) - diff_process.stdout.close() - diff_process.wait() - if diff_process.returncode != 0: - # Assume error was already printed to stderr. - sys.exit(2) - return changed_lines - - -def compute_diff(commits, files): - """Return a subprocess object producing the diff from `commits`. - - The return value's `stdin` file object will produce a patch with the - differences between the working directory and the first commit if a single - one was specified, or the difference between both specified commits, filtered - on `files` (if non-empty). Zero context lines are used in the patch.""" - git_tool = 'diff-index' - if len(commits) > 1: - git_tool = 'diff-tree' - cmd = ['git', git_tool, '-p', '-U0'] + commits + ['--'] - cmd.extend(files) - p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) - p.stdin.close() - return p - - -def extract_lines(patch_file): - """Extract the changed lines in `patch_file`. - - The return value is a dictionary mapping filename to a list of (start_line, - line_count) pairs. - - The input must have been produced with ``-U0``, meaning unidiff format with - zero lines of context. The return value is a dict mapping filename to a - list of line `Range`s.""" - matches = {} - for line in patch_file: - line = convert_string(line) - match = re.search(r'^\+\+\+\ [^/]+/(.*)', line) - if match: - filename = match.group(1).rstrip('\r\n') - match = re.search(r'^@@ -[0-9,]+ \+(\d+)(,(\d+))?', line) - if match: - start_line = int(match.group(1)) - line_count = 1 - if match.group(3): - line_count = int(match.group(3)) - if line_count > 0: - matches.setdefault(filename, []).append(Range(start_line, line_count)) - return matches - - -def filter_by_extension(dictionary, allowed_extensions): - """Delete every key in `dictionary` that doesn't have an allowed extension. - - `allowed_extensions` must be a collection of lowercase file extensions, - excluding the period.""" - allowed_extensions = frozenset(allowed_extensions) - for filename in list(dictionary.keys()): - base_ext = filename.rsplit('.', 1) - if len(base_ext) == 1 and '' in allowed_extensions: - continue - if len(base_ext) == 1 or base_ext[1].lower() not in allowed_extensions: - del dictionary[filename] - - -def cd_to_toplevel(): - """Change to the top level of the git repository.""" - toplevel = run('git', 'rev-parse', '--show-toplevel') - os.chdir(toplevel) - - -def create_tree_from_workdir(filenames): - """Create a new git tree with the given files from the working directory. 
- - Returns the object ID (SHA-1) of the created tree.""" - return create_tree(filenames, '--stdin') - - -def run_clang_format_and_save_to_tree(changed_lines, revision=None, - binary='clang-format', style=None): - """Run clang-format on each file and save the result to a git tree. - - Returns the object ID (SHA-1) of the created tree.""" - def iteritems(container): - try: - return container.iteritems() # Python 2 - except AttributeError: - return container.items() # Python 3 - def index_info_generator(): - for filename, line_ranges in iteritems(changed_lines): - if revision: - git_metadata_cmd = ['git', 'ls-tree', - '%s:%s' % (revision, os.path.dirname(filename)), - os.path.basename(filename)] - git_metadata = subprocess.Popen(git_metadata_cmd, stdin=subprocess.PIPE, - stdout=subprocess.PIPE) - stdout = git_metadata.communicate()[0] - mode = oct(int(stdout.split()[0], 8)) - else: - mode = oct(os.stat(filename).st_mode) - # Adjust python3 octal format so that it matches what git expects - if mode.startswith('0o'): - mode = '0' + mode[2:] - blob_id = clang_format_to_blob(filename, line_ranges, - revision=revision, - binary=binary, - style=style) - yield '%s %s\t%s' % (mode, blob_id, filename) - return create_tree(index_info_generator(), '--index-info') - - -def create_tree(input_lines, mode): - """Create a tree object from the given input. - - If mode is '--stdin', it must be a list of filenames. If mode is - '--index-info' is must be a list of values suitable for "git update-index - --index-info", such as " ". Any other mode - is invalid.""" - assert mode in ('--stdin', '--index-info') - cmd = ['git', 'update-index', '--add', '-z', mode] - with temporary_index_file(): - p = subprocess.Popen(cmd, stdin=subprocess.PIPE) - for line in input_lines: - p.stdin.write(to_bytes('%s\0' % line)) - p.stdin.close() - if p.wait() != 0: - die('`%s` failed' % ' '.join(cmd)) - tree_id = run('git', 'write-tree') - return tree_id - - -def clang_format_to_blob(filename, line_ranges, revision=None, - binary='clang-format', style=None): - """Run clang-format on the given file and save the result to a git blob. - - Runs on the file in `revision` if not None, or on the file in the working - directory if `revision` is None. 
- - Returns the object ID (SHA-1) of the created blob.""" - clang_format_cmd = [binary] - if style: - clang_format_cmd.extend(['-style='+style]) - clang_format_cmd.extend([ - '-lines=%s:%s' % (start_line, start_line+line_count-1) - for start_line, line_count in line_ranges]) - if revision: - clang_format_cmd.extend(['-assume-filename='+filename]) - git_show_cmd = ['git', 'cat-file', 'blob', '%s:%s' % (revision, filename)] - git_show = subprocess.Popen(git_show_cmd, stdin=subprocess.PIPE, - stdout=subprocess.PIPE) - git_show.stdin.close() - clang_format_stdin = git_show.stdout - else: - clang_format_cmd.extend([filename]) - git_show = None - clang_format_stdin = subprocess.PIPE - try: - clang_format = subprocess.Popen(clang_format_cmd, stdin=clang_format_stdin, - stdout=subprocess.PIPE) - if clang_format_stdin == subprocess.PIPE: - clang_format_stdin = clang_format.stdin - except OSError as e: - if e.errno == errno.ENOENT: - die('cannot find executable "%s"' % binary) - else: - raise - clang_format_stdin.close() - hash_object_cmd = ['git', 'hash-object', '-w', '--path='+filename, '--stdin'] - hash_object = subprocess.Popen(hash_object_cmd, stdin=clang_format.stdout, - stdout=subprocess.PIPE) - clang_format.stdout.close() - stdout = hash_object.communicate()[0] - if hash_object.returncode != 0: - die('`%s` failed' % ' '.join(hash_object_cmd)) - if clang_format.wait() != 0: - die('`%s` failed' % ' '.join(clang_format_cmd)) - if git_show and git_show.wait() != 0: - die('`%s` failed' % ' '.join(git_show_cmd)) - return convert_string(stdout).rstrip('\r\n') - - -@contextlib.contextmanager -def temporary_index_file(tree=None): - """Context manager for setting GIT_INDEX_FILE to a temporary file and deleting - the file afterward.""" - index_path = create_temporary_index(tree) - old_index_path = os.environ.get('GIT_INDEX_FILE') - os.environ['GIT_INDEX_FILE'] = index_path - try: - yield - finally: - if old_index_path is None: - del os.environ['GIT_INDEX_FILE'] - else: - os.environ['GIT_INDEX_FILE'] = old_index_path - os.remove(index_path) - - -def create_temporary_index(tree=None): - """Create a temporary index file and return the created file's path. - - If `tree` is not None, use that as the tree to read in. Otherwise, an - empty index is created.""" - gitdir = run('git', 'rev-parse', '--git-dir') - path = os.path.join(gitdir, temp_index_basename) - if tree is None: - tree = '--empty' - run('git', 'read-tree', '--index-output='+path, tree) - return path - - -def print_diff(old_tree, new_tree): - """Print the diff between the two trees to stdout.""" - # We use the porcelain 'diff' and not plumbing 'diff-tree' because the output - # is expected to be viewed by the user, and only the former does nice things - # like color and pagination. - # - # We also only print modified files since `new_tree` only contains the files - # that were modified, so unmodified files would show as deleted without the - # filter. - subprocess.check_call(['git', 'diff', '--diff-filter=M', old_tree, new_tree, - '--']) - - -def apply_changes(old_tree, new_tree, force=False, patch_mode=False): - """Apply the changes in `new_tree` to the working directory. - - Bails if there are local changes in those files and not `force`. 
If - `patch_mode`, runs `git checkout --patch` to select hunks interactively.""" - changed_files = run('git', 'diff-tree', '--diff-filter=M', '-r', '-z', - '--name-only', old_tree, - new_tree).rstrip('\0').split('\0') - if not force: - unstaged_files = run('git', 'diff-files', '--name-status', *changed_files) - if unstaged_files: - print('The following files would be modified but ' - 'have unstaged changes:', file=sys.stderr) - print(unstaged_files, file=sys.stderr) - print('Please commit, stage, or stash them first.', file=sys.stderr) - sys.exit(2) - if patch_mode: - # In patch mode, we could just as well create an index from the new tree - # and checkout from that, but then the user will be presented with a - # message saying "Discard ... from worktree". Instead, we use the old - # tree as the index and checkout from new_tree, which gives the slightly - # better message, "Apply ... to index and worktree". This is not quite - # right, since it won't be applied to the user's index, but oh well. - with temporary_index_file(old_tree): - subprocess.check_call(['git', 'checkout', '--patch', new_tree]) - index_tree = old_tree - else: - with temporary_index_file(new_tree): - run('git', 'checkout-index', '-a', '-f') - return changed_files - - -def run(*args, **kwargs): - stdin = kwargs.pop('stdin', '') - verbose = kwargs.pop('verbose', True) - strip = kwargs.pop('strip', True) - for name in kwargs: - raise TypeError("run() got an unexpected keyword argument '%s'" % name) - p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, - stdin=subprocess.PIPE) - stdout, stderr = p.communicate(input=stdin) - - stdout = convert_string(stdout) - stderr = convert_string(stderr) - - if p.returncode == 0: - if stderr: - if verbose: - print('`%s` printed to stderr:' % ' '.join(args), file=sys.stderr) - print(stderr.rstrip(), file=sys.stderr) - if strip: - stdout = stdout.rstrip('\r\n') - return stdout - if verbose: - print('`%s` returned %s' % (' '.join(args), p.returncode), file=sys.stderr) - if stderr: - print(stderr.rstrip(), file=sys.stderr) - sys.exit(2) - - -def die(message): - print('error:', message, file=sys.stderr) - sys.exit(2) - - -def to_bytes(str_input): - # Encode to UTF-8 to get binary data. - if isinstance(str_input, bytes): - return str_input - return str_input.encode('utf-8') - - -def to_string(bytes_input): - if isinstance(bytes_input, str): - return bytes_input - return bytes_input.encode('utf-8') - - -def convert_string(bytes_input): - try: - return to_string(bytes_input.decode('utf-8')) - except AttributeError: # 'str' object has no attribute 'decode'. 
- return str(bytes_input) - except UnicodeError: - return str(bytes_input) - -if __name__ == '__main__': - main() diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py index c1b33c220b4bd..fa5a9c0db1ebd 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py @@ -53,11 +53,12 @@ def set_function_breakpoints(self, functions, condition=None, breakpoint_ids.append('%i' % (breakpoint['id'])) return breakpoint_ids - def waitUntil(self, condition): - while True: - if condition(): - break + def waitUntil(self, condition_callback): + for _ in range(20): + if condition_callback(): + return True time.sleep(0.5) + return False def verify_breakpoint_hit(self, breakpoint_ids): '''Wait for the process we are debugging to stop, and verify we hit diff --git a/lldb/test/API/tools/lldb-vscode/module/Makefile b/lldb/test/API/tools/lldb-vscode/module/Makefile index 1fb944b138937..b30baf48b972e 100644 --- a/lldb/test/API/tools/lldb-vscode/module/Makefile +++ b/lldb/test/API/tools/lldb-vscode/module/Makefile @@ -2,12 +2,16 @@ DYLIB_NAME := foo DYLIB_CXX_SOURCES := foo.cpp CXX_SOURCES := main.cpp -all: a.out.stripped +LD_EXTRAS := -Wl,-rpath "-Wl,$(shell pwd)" +USE_LIBDL :=1 include Makefile.rules -a.out.stripped: a.out.dSYM +all: a.out.stripped + +a.out.stripped: strip -o a.out.stripped a.out + ifneq "$(CODESIGN)" "" $(CODESIGN) -fs - a.out.stripped -endif +endif \ No newline at end of file diff --git a/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py b/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py index 40c4145b38e36..a16430fccae1d 100644 --- a/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py +++ b/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py @@ -10,56 +10,93 @@ from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil import lldbvscode_testcase +import re class TestVSCode_module(lldbvscode_testcase.VSCodeTestCaseBase): mydir = TestBase.compute_mydir(__file__) - - @skipIfWindows - @skipUnlessDarwin - @skipIfRemote - def test_modules_event(self): + def run_test(self, symbol_basename, expect_debug_info_size): program_basename = "a.out.stripped" - program= self.getBuildArtifact(program_basename) + program = self.getBuildArtifact(program_basename) self.build_and_launch(program) functions = ['foo'] breakpoint_ids = self.set_function_breakpoints(functions) - self.assertEquals(len(breakpoint_ids), len(functions), - 'expect one breakpoint') + self.assertEquals(len(breakpoint_ids), len(functions), 'expect one breakpoint') self.continue_to_breakpoints(breakpoint_ids) active_modules = self.vscode.get_active_modules() - self.assertIn(program_basename, active_modules, '%s module is in active modules' % (program_basename)) program_module = active_modules[program_basename] + self.assertIn(program_basename, active_modules, '%s module is in active modules' % (program_basename)) self.assertIn('name', program_module, 'make sure name is in module') self.assertEqual(program_basename, program_module['name']) self.assertIn('path', program_module, 'make sure path is in module') self.assertEqual(program, program_module['path']) self.assertTrue('symbolFilePath' not in program_module, 'Make sure a.out.stripped has no debug info') self.assertEqual('Symbols not found.', program_module['symbolStatus']) - symbol_path = 
self.getBuildArtifact("a.out") - self.vscode.request_evaluate('`%s' % ('target symbols add -s "%s" "%s"' % (program, symbol_path))) + symbols_path = self.getBuildArtifact(symbol_basename) + self.vscode.request_evaluate('`%s' % ('target symbols add -s "%s" "%s"' % (program, symbols_path))) def checkSymbolsLoaded(): active_modules = self.vscode.get_active_modules() program_module = active_modules[program_basename] return 'Symbols loaded.' == program_module['symbolStatus'] - self.waitUntil(checkSymbolsLoaded) + def checkSymbolsLoadedWithSize(): + active_modules = self.vscode.get_active_modules() + program_module = active_modules[program_basename] + symbolsStatus = program_module['symbolStatus'] + symbol_regex = re.compile(r"Symbols loaded. \([0-9]+(\.[0-9]*)?[KMG]?B\)") + return symbol_regex.match(program_module['symbolStatus']) + + if expect_debug_info_size: + self.waitUntil(checkSymbolsLoadedWithSize) + else: + self.waitUntil(checkSymbolsLoaded) active_modules = self.vscode.get_active_modules() program_module = active_modules[program_basename] self.assertEqual(program_basename, program_module['name']) self.assertEqual(program, program_module['path']) - self.assertEqual('Symbols loaded.', program_module['symbolStatus']) self.assertIn('symbolFilePath', program_module) - self.assertEqual(symbol_path, program_module['symbolFilePath']) + self.assertIn(symbols_path, program_module['symbolFilePath']) self.assertIn('addressRange', program_module) + @skipIfWindows + @skipUnlessDarwin + @skipIfRemote + #TODO: Update the Makefile so that this test runs on Linux + def test_module_event(self): + ''' + Mac or linux. + + On mac, if we load a.out as our symbol file, we will use DWARF with .o files and we will + have debug symbols, but we won't see any debug info size because all of the DWARF + sections are in .o files. + + On other platforms, we expect a.out to have debug info, so we will expect a size. + expect_debug_info_size = platform.system() != 'Darwin' + return self.run_test("a.out", expect_debug_info_size) + ''' + expect_debug_info_size = platform.system() != 'Darwin' + return self.run_test("a.out", expect_debug_info_size) + + @skipIfWindows + @skipUnlessDarwin + @skipIfRemote + def test_module_event_dsym(self): + ''' + Darwin only test with dSYM file. + + On mac, if we load a.out.dSYM as our symbol file, we will have debug symbols and we + will have DWARF sections added to the module, so we will expect a size. 
+ return self.run_test("a.out.dSYM", True) + ''' + return self.run_test("a.out.dSYM", True) + @skipIfWindows @skipUnlessDarwin @skipIfRemote def test_compile_units(self): - program= self.getBuildArtifact("a.out") + program = self.getBuildArtifact("a.out") self.build_and_launch(program) source = "main.cpp" main_source_path = self.getSourcePath(source) diff --git a/lldb/tools/lldb-vscode/JSONUtils.cpp b/lldb/tools/lldb-vscode/JSONUtils.cpp index 1ebaa5c377121..f6cdcf5a46cfc 100644 --- a/lldb/tools/lldb-vscode/JSONUtils.cpp +++ b/lldb/tools/lldb-vscode/JSONUtils.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include +#include +#include #include "llvm/ADT/Optional.h" #include "llvm/Support/FormatAdapters.h" @@ -327,6 +329,50 @@ llvm::json::Value CreateBreakpoint(lldb::SBBreakpoint &bp, return llvm::json::Value(std::move(object)); } +static uint64_t GetDebugInfoSizeInSection(lldb::SBSection section) { + uint64_t debug_info_size = 0; + llvm::StringRef section_name(section.GetName()); + if (section_name.startswith(".debug") || section_name.startswith("__debug") || + section_name.startswith(".apple") || section_name.startswith("__apple")) + debug_info_size += section.GetFileByteSize(); + size_t num_sub_sections = section.GetNumSubSections(); + for (size_t i = 0; i < num_sub_sections; i++) { + debug_info_size += + GetDebugInfoSizeInSection(section.GetSubSectionAtIndex(i)); + } + return debug_info_size; +} + +static uint64_t GetDebugInfoSize(lldb::SBModule module) { + uint64_t debug_info_size = 0; + size_t num_sections = module.GetNumSections(); + for (size_t i = 0; i < num_sections; i++) { + debug_info_size += GetDebugInfoSizeInSection(module.GetSectionAtIndex(i)); + } + return debug_info_size; +} + +static std::string ConvertDebugInfoSizeToString(uint64_t debug_info) { + std::ostringstream oss; + oss << " ("; + oss << std::fixed << std::setprecision(1); + + if (debug_info < 1024) { + oss << debug_info << "B"; + } else if (debug_info < 1024 * 1024) { + double kb = double(debug_info) / 1024.0; + oss << kb << "KB"; + } else if (debug_info < 1024 * 1024 * 1024) { + double mb = double(debug_info) / (1024.0 * 1024.0); + oss << mb << "MB"; + } else { + double gb = double(debug_info) / (1024.0 * 1024.0 * 1024.0); + oss << gb << "GB"; + ; + } + oss << ")"; + return oss.str(); +} llvm::json::Value CreateModule(lldb::SBModule &module) { llvm::json::Object object; if (!module.IsValid()) @@ -339,9 +385,15 @@ llvm::json::Value CreateModule(lldb::SBModule &module) { std::string module_path(module_path_arr); object.try_emplace("path", module_path); if (module.GetNumCompileUnits() > 0) { - object.try_emplace("symbolStatus", "Symbols loaded."); + std::string symbol_str = "Symbols loaded."; + uint64_t debug_info = GetDebugInfoSize(module); + if (debug_info > 0) { + symbol_str += ConvertDebugInfoSizeToString(debug_info); + } + object.try_emplace("symbolStatus", symbol_str); char symbol_path_arr[PATH_MAX]; - module.GetSymbolFileSpec().GetPath(symbol_path_arr, sizeof(symbol_path_arr)); + module.GetSymbolFileSpec().GetPath(symbol_path_arr, + sizeof(symbol_path_arr)); std::string symbol_path(symbol_path_arr); object.try_emplace("symbolFilePath", symbol_path); } else { @@ -352,8 +404,9 @@ llvm::json::Value CreateModule(lldb::SBModule &module) { object.try_emplace("addressRange", loaded_addr); std::string version_str; uint32_t version_nums[3]; - uint32_t num_versions = module.GetVersion(version_nums, sizeof(version_nums)/sizeof(uint32_t)); - for (uint32_t i=0; i Date: 
Fri, 24 Jul 2020 13:28:29 -0700 Subject: [PATCH 0050/1035] Revert "Add Debug Info Size to Symbol Status" This reverts commit 986e3af53bfe591e88a1ae4f82ea1cc0a15819a3. It incorrectly deleted clang/tools/clang-format/git-clang-format --- clang/tools/clang-format/git-clang-format | 585 ++++++++++++++++++ .../tools/lldb-vscode/lldbvscode_testcase.py | 9 +- .../API/tools/lldb-vscode/module/Makefile | 10 +- .../lldb-vscode/module/TestVSCode_module.py | 67 +- lldb/tools/lldb-vscode/JSONUtils.cpp | 61 +- 5 files changed, 611 insertions(+), 121 deletions(-) diff --git a/clang/tools/clang-format/git-clang-format b/clang/tools/clang-format/git-clang-format index e69de29bb2d1d..f3cd585e7f4a0 100755 --- a/clang/tools/clang-format/git-clang-format +++ b/clang/tools/clang-format/git-clang-format @@ -0,0 +1,585 @@ +#!/usr/bin/env python +# +#===- git-clang-format - ClangFormat Git Integration ---------*- python -*--===# +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +#===------------------------------------------------------------------------===# + +r""" +clang-format git integration +============================ + +This file provides a clang-format integration for git. Put it somewhere in your +path and ensure that it is executable. Then, "git clang-format" will invoke +clang-format on the changes in current files or a specific commit. + +For further details, run: +git clang-format -h + +Requires Python 2.7 or Python 3 +""" + +from __future__ import absolute_import, division, print_function +import argparse +import collections +import contextlib +import errno +import os +import re +import subprocess +import sys + +usage = 'git clang-format [OPTIONS] [] [] [--] [...]' + +desc = ''' +If zero or one commits are given, run clang-format on all lines that differ +between the working directory and , which defaults to HEAD. Changes are +only applied to the working directory. + +If two commits are given (requires --diff), run clang-format on all lines in the +second that differ from the first . + +The following git-config settings set the default of the corresponding option: + clangFormat.binary + clangFormat.commit + clangFormat.extensions + clangFormat.style +''' + +# Name of the temporary index file in which save the output of clang-format. +# This file is created within the .git directory. +temp_index_basename = 'clang-format-index' + + +Range = collections.namedtuple('Range', 'start, count') + + +def main(): + config = load_git_config() + + # In order to keep '--' yet allow options after positionals, we need to + # check for '--' ourselves. (Setting nargs='*' throws away the '--', while + # nargs=argparse.REMAINDER disallows options after positionals.) 
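+  # For example, "git clang-format HEAD~1 -- foo.cpp" arrives here as
+  # ['HEAD~1', '--', 'foo.cpp'] and is split below into positionals
+  # ['HEAD~1'] and dash_dash ['--', 'foo.cpp'].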
+ argv = sys.argv[1:] + try: + idx = argv.index('--') + except ValueError: + dash_dash = [] + else: + dash_dash = argv[idx:] + argv = argv[:idx] + + default_extensions = ','.join([ + # From clang/lib/Frontend/FrontendOptions.cpp, all lower case + 'c', 'h', # C + 'm', # ObjC + 'mm', # ObjC++ + 'cc', 'cp', 'cpp', 'c++', 'cxx', 'hh', 'hpp', 'hxx', # C++ + 'cu', # CUDA + # Other languages that clang-format supports + 'proto', 'protodevel', # Protocol Buffers + 'java', # Java + 'js', # JavaScript + 'ts', # TypeScript + 'cs', # C Sharp + ]) + + p = argparse.ArgumentParser( + usage=usage, formatter_class=argparse.RawDescriptionHelpFormatter, + description=desc) + p.add_argument('--binary', + default=config.get('clangformat.binary', 'clang-format'), + help='path to clang-format'), + p.add_argument('--commit', + default=config.get('clangformat.commit', 'HEAD'), + help='default commit to use if none is specified'), + p.add_argument('--diff', action='store_true', + help='print a diff instead of applying the changes') + p.add_argument('--extensions', + default=config.get('clangformat.extensions', + default_extensions), + help=('comma-separated list of file extensions to format, ' + 'excluding the period and case-insensitive')), + p.add_argument('-f', '--force', action='store_true', + help='allow changes to unstaged files') + p.add_argument('-p', '--patch', action='store_true', + help='select hunks interactively') + p.add_argument('-q', '--quiet', action='count', default=0, + help='print less information') + p.add_argument('--style', + default=config.get('clangformat.style', None), + help='passed to clang-format'), + p.add_argument('-v', '--verbose', action='count', default=0, + help='print extra information') + # We gather all the remaining positional arguments into 'args' since we need + # to use some heuristics to determine whether or not was present. + # However, to print pretty messages, we make use of metavar and help. + p.add_argument('args', nargs='*', metavar='', + help='revision from which to compute the diff') + p.add_argument('ignored', nargs='*', metavar='...', + help='if specified, only consider differences in these files') + opts = p.parse_args(argv) + + opts.verbose -= opts.quiet + del opts.quiet + + commits, files = interpret_args(opts.args, dash_dash, opts.commit) + if len(commits) > 1: + if not opts.diff: + die('--diff is required when two commits are given') + else: + if len(commits) > 2: + die('at most two commits allowed; %d given' % len(commits)) + changed_lines = compute_diff_and_extract_lines(commits, files) + if opts.verbose >= 1: + ignored_files = set(changed_lines) + filter_by_extension(changed_lines, opts.extensions.lower().split(',')) + if opts.verbose >= 1: + ignored_files.difference_update(changed_lines) + if ignored_files: + print('Ignoring changes in the following files (wrong extension):') + for filename in ignored_files: + print(' %s' % filename) + if changed_lines: + print('Running clang-format on the following files:') + for filename in changed_lines: + print(' %s' % filename) + if not changed_lines: + print('no modified files to format') + return + # The computed diff outputs absolute paths, so we must cd before accessing + # those files. 
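+  # From here on, main() snapshots the pre-format state as a git tree, runs
+  # clang-format into a second tree, and then either prints the diff between
+  # the two trees or checks the new tree out into the working directory.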
+ cd_to_toplevel() + if len(commits) > 1: + old_tree = commits[1] + new_tree = run_clang_format_and_save_to_tree(changed_lines, + revision=commits[1], + binary=opts.binary, + style=opts.style) + else: + old_tree = create_tree_from_workdir(changed_lines) + new_tree = run_clang_format_and_save_to_tree(changed_lines, + binary=opts.binary, + style=opts.style) + if opts.verbose >= 1: + print('old tree: %s' % old_tree) + print('new tree: %s' % new_tree) + if old_tree == new_tree: + if opts.verbose >= 0: + print('clang-format did not modify any files') + elif opts.diff: + print_diff(old_tree, new_tree) + else: + changed_files = apply_changes(old_tree, new_tree, force=opts.force, + patch_mode=opts.patch) + if (opts.verbose >= 0 and not opts.patch) or opts.verbose >= 1: + print('changed files:') + for filename in changed_files: + print(' %s' % filename) + + +def load_git_config(non_string_options=None): + """Return the git configuration as a dictionary. + + All options are assumed to be strings unless in `non_string_options`, in which + is a dictionary mapping option name (in lower case) to either "--bool" or + "--int".""" + if non_string_options is None: + non_string_options = {} + out = {} + for entry in run('git', 'config', '--list', '--null').split('\0'): + if entry: + if '\n' in entry: + name, value = entry.split('\n', 1) + else: + # A setting with no '=' ('\n' with --null) is implicitly 'true' + name = entry + value = 'true' + if name in non_string_options: + value = run('git', 'config', non_string_options[name], name) + out[name] = value + return out + + +def interpret_args(args, dash_dash, default_commit): + """Interpret `args` as "[commits] [--] [files]" and return (commits, files). + + It is assumed that "--" and everything that follows has been removed from + args and placed in `dash_dash`. + + If "--" is present (i.e., `dash_dash` is non-empty), the arguments to its + left (if present) are taken as commits. Otherwise, the arguments are checked + from left to right if they are commits or files. If commits are not given, + a list with `default_commit` is used.""" + if dash_dash: + if len(args) == 0: + commits = [default_commit] + else: + commits = args + for commit in commits: + object_type = get_object_type(commit) + if object_type not in ('commit', 'tag'): + if object_type is None: + die("'%s' is not a commit" % commit) + else: + die("'%s' is a %s, but a commit was expected" % (commit, object_type)) + files = dash_dash[1:] + elif args: + commits = [] + while args: + if not disambiguate_revision(args[0]): + break + commits.append(args.pop(0)) + if not commits: + commits = [default_commit] + files = args + else: + commits = [default_commit] + files = [] + return commits, files + + +def disambiguate_revision(value): + """Returns True if `value` is a revision, False if it is a file, or dies.""" + # If `value` is ambiguous (neither a commit nor a file), the following + # command will die with an appropriate error message. 
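+  # (run() aborts the whole script with exit status 2 when the command fails.)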
+ run('git', 'rev-parse', value, verbose=False) + object_type = get_object_type(value) + if object_type is None: + return False + if object_type in ('commit', 'tag'): + return True + die('`%s` is a %s, but a commit or filename was expected' % + (value, object_type)) + + +def get_object_type(value): + """Returns a string description of an object's type, or None if it is not + a valid git object.""" + cmd = ['git', 'cat-file', '-t', value] + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = p.communicate() + if p.returncode != 0: + return None + return convert_string(stdout.strip()) + + +def compute_diff_and_extract_lines(commits, files): + """Calls compute_diff() followed by extract_lines().""" + diff_process = compute_diff(commits, files) + changed_lines = extract_lines(diff_process.stdout) + diff_process.stdout.close() + diff_process.wait() + if diff_process.returncode != 0: + # Assume error was already printed to stderr. + sys.exit(2) + return changed_lines + + +def compute_diff(commits, files): + """Return a subprocess object producing the diff from `commits`. + + The return value's `stdin` file object will produce a patch with the + differences between the working directory and the first commit if a single + one was specified, or the difference between both specified commits, filtered + on `files` (if non-empty). Zero context lines are used in the patch.""" + git_tool = 'diff-index' + if len(commits) > 1: + git_tool = 'diff-tree' + cmd = ['git', git_tool, '-p', '-U0'] + commits + ['--'] + cmd.extend(files) + p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) + p.stdin.close() + return p + + +def extract_lines(patch_file): + """Extract the changed lines in `patch_file`. + + The return value is a dictionary mapping filename to a list of (start_line, + line_count) pairs. + + The input must have been produced with ``-U0``, meaning unidiff format with + zero lines of context. The return value is a dict mapping filename to a + list of line `Range`s.""" + matches = {} + for line in patch_file: + line = convert_string(line) + match = re.search(r'^\+\+\+\ [^/]+/(.*)', line) + if match: + filename = match.group(1).rstrip('\r\n') + match = re.search(r'^@@ -[0-9,]+ \+(\d+)(,(\d+))?', line) + if match: + start_line = int(match.group(1)) + line_count = 1 + if match.group(3): + line_count = int(match.group(3)) + if line_count > 0: + matches.setdefault(filename, []).append(Range(start_line, line_count)) + return matches + + +def filter_by_extension(dictionary, allowed_extensions): + """Delete every key in `dictionary` that doesn't have an allowed extension. + + `allowed_extensions` must be a collection of lowercase file extensions, + excluding the period.""" + allowed_extensions = frozenset(allowed_extensions) + for filename in list(dictionary.keys()): + base_ext = filename.rsplit('.', 1) + if len(base_ext) == 1 and '' in allowed_extensions: + continue + if len(base_ext) == 1 or base_ext[1].lower() not in allowed_extensions: + del dictionary[filename] + + +def cd_to_toplevel(): + """Change to the top level of the git repository.""" + toplevel = run('git', 'rev-parse', '--show-toplevel') + os.chdir(toplevel) + + +def create_tree_from_workdir(filenames): + """Create a new git tree with the given files from the working directory. 
+ + Returns the object ID (SHA-1) of the created tree.""" + return create_tree(filenames, '--stdin') + + +def run_clang_format_and_save_to_tree(changed_lines, revision=None, + binary='clang-format', style=None): + """Run clang-format on each file and save the result to a git tree. + + Returns the object ID (SHA-1) of the created tree.""" + def iteritems(container): + try: + return container.iteritems() # Python 2 + except AttributeError: + return container.items() # Python 3 + def index_info_generator(): + for filename, line_ranges in iteritems(changed_lines): + if revision: + git_metadata_cmd = ['git', 'ls-tree', + '%s:%s' % (revision, os.path.dirname(filename)), + os.path.basename(filename)] + git_metadata = subprocess.Popen(git_metadata_cmd, stdin=subprocess.PIPE, + stdout=subprocess.PIPE) + stdout = git_metadata.communicate()[0] + mode = oct(int(stdout.split()[0], 8)) + else: + mode = oct(os.stat(filename).st_mode) + # Adjust python3 octal format so that it matches what git expects + if mode.startswith('0o'): + mode = '0' + mode[2:] + blob_id = clang_format_to_blob(filename, line_ranges, + revision=revision, + binary=binary, + style=style) + yield '%s %s\t%s' % (mode, blob_id, filename) + return create_tree(index_info_generator(), '--index-info') + + +def create_tree(input_lines, mode): + """Create a tree object from the given input. + + If mode is '--stdin', it must be a list of filenames. If mode is + '--index-info' is must be a list of values suitable for "git update-index + --index-info", such as " ". Any other mode + is invalid.""" + assert mode in ('--stdin', '--index-info') + cmd = ['git', 'update-index', '--add', '-z', mode] + with temporary_index_file(): + p = subprocess.Popen(cmd, stdin=subprocess.PIPE) + for line in input_lines: + p.stdin.write(to_bytes('%s\0' % line)) + p.stdin.close() + if p.wait() != 0: + die('`%s` failed' % ' '.join(cmd)) + tree_id = run('git', 'write-tree') + return tree_id + + +def clang_format_to_blob(filename, line_ranges, revision=None, + binary='clang-format', style=None): + """Run clang-format on the given file and save the result to a git blob. + + Runs on the file in `revision` if not None, or on the file in the working + directory if `revision` is None. 
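+  In the `revision` case the file contents are piped in from
+  `git cat-file blob`, so the working tree is never read.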
+ + Returns the object ID (SHA-1) of the created blob.""" + clang_format_cmd = [binary] + if style: + clang_format_cmd.extend(['-style='+style]) + clang_format_cmd.extend([ + '-lines=%s:%s' % (start_line, start_line+line_count-1) + for start_line, line_count in line_ranges]) + if revision: + clang_format_cmd.extend(['-assume-filename='+filename]) + git_show_cmd = ['git', 'cat-file', 'blob', '%s:%s' % (revision, filename)] + git_show = subprocess.Popen(git_show_cmd, stdin=subprocess.PIPE, + stdout=subprocess.PIPE) + git_show.stdin.close() + clang_format_stdin = git_show.stdout + else: + clang_format_cmd.extend([filename]) + git_show = None + clang_format_stdin = subprocess.PIPE + try: + clang_format = subprocess.Popen(clang_format_cmd, stdin=clang_format_stdin, + stdout=subprocess.PIPE) + if clang_format_stdin == subprocess.PIPE: + clang_format_stdin = clang_format.stdin + except OSError as e: + if e.errno == errno.ENOENT: + die('cannot find executable "%s"' % binary) + else: + raise + clang_format_stdin.close() + hash_object_cmd = ['git', 'hash-object', '-w', '--path='+filename, '--stdin'] + hash_object = subprocess.Popen(hash_object_cmd, stdin=clang_format.stdout, + stdout=subprocess.PIPE) + clang_format.stdout.close() + stdout = hash_object.communicate()[0] + if hash_object.returncode != 0: + die('`%s` failed' % ' '.join(hash_object_cmd)) + if clang_format.wait() != 0: + die('`%s` failed' % ' '.join(clang_format_cmd)) + if git_show and git_show.wait() != 0: + die('`%s` failed' % ' '.join(git_show_cmd)) + return convert_string(stdout).rstrip('\r\n') + + +@contextlib.contextmanager +def temporary_index_file(tree=None): + """Context manager for setting GIT_INDEX_FILE to a temporary file and deleting + the file afterward.""" + index_path = create_temporary_index(tree) + old_index_path = os.environ.get('GIT_INDEX_FILE') + os.environ['GIT_INDEX_FILE'] = index_path + try: + yield + finally: + if old_index_path is None: + del os.environ['GIT_INDEX_FILE'] + else: + os.environ['GIT_INDEX_FILE'] = old_index_path + os.remove(index_path) + + +def create_temporary_index(tree=None): + """Create a temporary index file and return the created file's path. + + If `tree` is not None, use that as the tree to read in. Otherwise, an + empty index is created.""" + gitdir = run('git', 'rev-parse', '--git-dir') + path = os.path.join(gitdir, temp_index_basename) + if tree is None: + tree = '--empty' + run('git', 'read-tree', '--index-output='+path, tree) + return path + + +def print_diff(old_tree, new_tree): + """Print the diff between the two trees to stdout.""" + # We use the porcelain 'diff' and not plumbing 'diff-tree' because the output + # is expected to be viewed by the user, and only the former does nice things + # like color and pagination. + # + # We also only print modified files since `new_tree` only contains the files + # that were modified, so unmodified files would show as deleted without the + # filter. + subprocess.check_call(['git', 'diff', '--diff-filter=M', old_tree, new_tree, + '--']) + + +def apply_changes(old_tree, new_tree, force=False, patch_mode=False): + """Apply the changes in `new_tree` to the working directory. + + Bails if there are local changes in those files and not `force`. 
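+  Returns the list of files that were changed.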
If + `patch_mode`, runs `git checkout --patch` to select hunks interactively.""" + changed_files = run('git', 'diff-tree', '--diff-filter=M', '-r', '-z', + '--name-only', old_tree, + new_tree).rstrip('\0').split('\0') + if not force: + unstaged_files = run('git', 'diff-files', '--name-status', *changed_files) + if unstaged_files: + print('The following files would be modified but ' + 'have unstaged changes:', file=sys.stderr) + print(unstaged_files, file=sys.stderr) + print('Please commit, stage, or stash them first.', file=sys.stderr) + sys.exit(2) + if patch_mode: + # In patch mode, we could just as well create an index from the new tree + # and checkout from that, but then the user will be presented with a + # message saying "Discard ... from worktree". Instead, we use the old + # tree as the index and checkout from new_tree, which gives the slightly + # better message, "Apply ... to index and worktree". This is not quite + # right, since it won't be applied to the user's index, but oh well. + with temporary_index_file(old_tree): + subprocess.check_call(['git', 'checkout', '--patch', new_tree]) + index_tree = old_tree + else: + with temporary_index_file(new_tree): + run('git', 'checkout-index', '-a', '-f') + return changed_files + + +def run(*args, **kwargs): + stdin = kwargs.pop('stdin', '') + verbose = kwargs.pop('verbose', True) + strip = kwargs.pop('strip', True) + for name in kwargs: + raise TypeError("run() got an unexpected keyword argument '%s'" % name) + p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + stdin=subprocess.PIPE) + stdout, stderr = p.communicate(input=stdin) + + stdout = convert_string(stdout) + stderr = convert_string(stderr) + + if p.returncode == 0: + if stderr: + if verbose: + print('`%s` printed to stderr:' % ' '.join(args), file=sys.stderr) + print(stderr.rstrip(), file=sys.stderr) + if strip: + stdout = stdout.rstrip('\r\n') + return stdout + if verbose: + print('`%s` returned %s' % (' '.join(args), p.returncode), file=sys.stderr) + if stderr: + print(stderr.rstrip(), file=sys.stderr) + sys.exit(2) + + +def die(message): + print('error:', message, file=sys.stderr) + sys.exit(2) + + +def to_bytes(str_input): + # Encode to UTF-8 to get binary data. + if isinstance(str_input, bytes): + return str_input + return str_input.encode('utf-8') + + +def to_string(bytes_input): + if isinstance(bytes_input, str): + return bytes_input + return bytes_input.encode('utf-8') + + +def convert_string(bytes_input): + try: + return to_string(bytes_input.decode('utf-8')) + except AttributeError: # 'str' object has no attribute 'decode'. 
+ return str(bytes_input) + except UnicodeError: + return str(bytes_input) + +if __name__ == '__main__': + main() diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py index fa5a9c0db1ebd..c1b33c220b4bd 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py @@ -53,12 +53,11 @@ def set_function_breakpoints(self, functions, condition=None, breakpoint_ids.append('%i' % (breakpoint['id'])) return breakpoint_ids - def waitUntil(self, condition_callback): - for _ in range(20): - if condition_callback(): - return True + def waitUntil(self, condition): + while True: + if condition(): + break time.sleep(0.5) - return False def verify_breakpoint_hit(self, breakpoint_ids): '''Wait for the process we are debugging to stop, and verify we hit diff --git a/lldb/test/API/tools/lldb-vscode/module/Makefile b/lldb/test/API/tools/lldb-vscode/module/Makefile index b30baf48b972e..1fb944b138937 100644 --- a/lldb/test/API/tools/lldb-vscode/module/Makefile +++ b/lldb/test/API/tools/lldb-vscode/module/Makefile @@ -2,16 +2,12 @@ DYLIB_NAME := foo DYLIB_CXX_SOURCES := foo.cpp CXX_SOURCES := main.cpp -LD_EXTRAS := -Wl,-rpath "-Wl,$(shell pwd)" -USE_LIBDL :=1 +all: a.out.stripped include Makefile.rules -all: a.out.stripped - -a.out.stripped: +a.out.stripped: a.out.dSYM strip -o a.out.stripped a.out - ifneq "$(CODESIGN)" "" $(CODESIGN) -fs - a.out.stripped -endif \ No newline at end of file +endif diff --git a/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py b/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py index a16430fccae1d..40c4145b38e36 100644 --- a/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py +++ b/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py @@ -10,93 +10,56 @@ from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil import lldbvscode_testcase -import re class TestVSCode_module(lldbvscode_testcase.VSCodeTestCaseBase): mydir = TestBase.compute_mydir(__file__) - def run_test(self, symbol_basename, expect_debug_info_size): + + @skipIfWindows + @skipUnlessDarwin + @skipIfRemote + def test_modules_event(self): program_basename = "a.out.stripped" - program = self.getBuildArtifact(program_basename) + program= self.getBuildArtifact(program_basename) self.build_and_launch(program) functions = ['foo'] breakpoint_ids = self.set_function_breakpoints(functions) - self.assertEquals(len(breakpoint_ids), len(functions), 'expect one breakpoint') + self.assertEquals(len(breakpoint_ids), len(functions), + 'expect one breakpoint') self.continue_to_breakpoints(breakpoint_ids) active_modules = self.vscode.get_active_modules() - program_module = active_modules[program_basename] self.assertIn(program_basename, active_modules, '%s module is in active modules' % (program_basename)) + program_module = active_modules[program_basename] self.assertIn('name', program_module, 'make sure name is in module') self.assertEqual(program_basename, program_module['name']) self.assertIn('path', program_module, 'make sure path is in module') self.assertEqual(program, program_module['path']) self.assertTrue('symbolFilePath' not in program_module, 'Make sure a.out.stripped has no debug info') self.assertEqual('Symbols not found.', program_module['symbolStatus']) - symbols_path = self.getBuildArtifact(symbol_basename) - self.vscode.request_evaluate('`%s' % 
('target symbols add -s "%s" "%s"' % (program, symbols_path))) + symbol_path = self.getBuildArtifact("a.out") + self.vscode.request_evaluate('`%s' % ('target symbols add -s "%s" "%s"' % (program, symbol_path))) def checkSymbolsLoaded(): active_modules = self.vscode.get_active_modules() program_module = active_modules[program_basename] return 'Symbols loaded.' == program_module['symbolStatus'] + self.waitUntil(checkSymbolsLoaded) - def checkSymbolsLoadedWithSize(): - active_modules = self.vscode.get_active_modules() - program_module = active_modules[program_basename] - symbolsStatus = program_module['symbolStatus'] - symbol_regex = re.compile(r"Symbols loaded. \([0-9]+(\.[0-9]*)?[KMG]?B\)") - return symbol_regex.match(program_module['symbolStatus']) - - if expect_debug_info_size: - self.waitUntil(checkSymbolsLoadedWithSize) - else: - self.waitUntil(checkSymbolsLoaded) active_modules = self.vscode.get_active_modules() program_module = active_modules[program_basename] self.assertEqual(program_basename, program_module['name']) self.assertEqual(program, program_module['path']) + self.assertEqual('Symbols loaded.', program_module['symbolStatus']) self.assertIn('symbolFilePath', program_module) - self.assertIn(symbols_path, program_module['symbolFilePath']) + self.assertEqual(symbol_path, program_module['symbolFilePath']) self.assertIn('addressRange', program_module) - @skipIfWindows - @skipUnlessDarwin - @skipIfRemote - #TODO: Update the Makefile so that this test runs on Linux - def test_module_event(self): - ''' - Mac or linux. - - On mac, if we load a.out as our symbol file, we will use DWARF with .o files and we will - have debug symbols, but we won't see any debug info size because all of the DWARF - sections are in .o files. - - On other platforms, we expect a.out to have debug info, so we will expect a size. - expect_debug_info_size = platform.system() != 'Darwin' - return self.run_test("a.out", expect_debug_info_size) - ''' - expect_debug_info_size = platform.system() != 'Darwin' - return self.run_test("a.out", expect_debug_info_size) - - @skipIfWindows - @skipUnlessDarwin - @skipIfRemote - def test_module_event_dsym(self): - ''' - Darwin only test with dSYM file. - - On mac, if we load a.out.dSYM as our symbol file, we will have debug symbols and we - will have DWARF sections added to the module, so we will expect a size. 
- return self.run_test("a.out.dSYM", True) - ''' - return self.run_test("a.out.dSYM", True) - @skipIfWindows @skipUnlessDarwin @skipIfRemote def test_compile_units(self): - program = self.getBuildArtifact("a.out") + program= self.getBuildArtifact("a.out") self.build_and_launch(program) source = "main.cpp" main_source_path = self.getSourcePath(source) diff --git a/lldb/tools/lldb-vscode/JSONUtils.cpp b/lldb/tools/lldb-vscode/JSONUtils.cpp index f6cdcf5a46cfc..1ebaa5c377121 100644 --- a/lldb/tools/lldb-vscode/JSONUtils.cpp +++ b/lldb/tools/lldb-vscode/JSONUtils.cpp @@ -7,8 +7,6 @@ //===----------------------------------------------------------------------===// #include -#include -#include #include "llvm/ADT/Optional.h" #include "llvm/Support/FormatAdapters.h" @@ -329,50 +327,6 @@ llvm::json::Value CreateBreakpoint(lldb::SBBreakpoint &bp, return llvm::json::Value(std::move(object)); } -static uint64_t GetDebugInfoSizeInSection(lldb::SBSection section) { - uint64_t debug_info_size = 0; - llvm::StringRef section_name(section.GetName()); - if (section_name.startswith(".debug") || section_name.startswith("__debug") || - section_name.startswith(".apple") || section_name.startswith("__apple")) - debug_info_size += section.GetFileByteSize(); - size_t num_sub_sections = section.GetNumSubSections(); - for (size_t i = 0; i < num_sub_sections; i++) { - debug_info_size += - GetDebugInfoSizeInSection(section.GetSubSectionAtIndex(i)); - } - return debug_info_size; -} - -static uint64_t GetDebugInfoSize(lldb::SBModule module) { - uint64_t debug_info_size = 0; - size_t num_sections = module.GetNumSections(); - for (size_t i = 0; i < num_sections; i++) { - debug_info_size += GetDebugInfoSizeInSection(module.GetSectionAtIndex(i)); - } - return debug_info_size; -} - -static std::string ConvertDebugInfoSizeToString(uint64_t debug_info) { - std::ostringstream oss; - oss << " ("; - oss << std::fixed << std::setprecision(1); - - if (debug_info < 1024) { - oss << debug_info << "B"; - } else if (debug_info < 1024 * 1024) { - double kb = double(debug_info) / 1024.0; - oss << kb << "KB"; - } else if (debug_info < 1024 * 1024 * 1024) { - double mb = double(debug_info) / (1024.0 * 1024.0); - oss << mb << "MB"; - } else { - double gb = double(debug_info) / (1024.0 * 1024.0 * 1024.0); - oss << gb << "GB"; - ; - } - oss << ")"; - return oss.str(); -} llvm::json::Value CreateModule(lldb::SBModule &module) { llvm::json::Object object; if (!module.IsValid()) @@ -385,15 +339,9 @@ llvm::json::Value CreateModule(lldb::SBModule &module) { std::string module_path(module_path_arr); object.try_emplace("path", module_path); if (module.GetNumCompileUnits() > 0) { - std::string symbol_str = "Symbols loaded."; - uint64_t debug_info = GetDebugInfoSize(module); - if (debug_info > 0) { - symbol_str += ConvertDebugInfoSizeToString(debug_info); - } - object.try_emplace("symbolStatus", symbol_str); + object.try_emplace("symbolStatus", "Symbols loaded."); char symbol_path_arr[PATH_MAX]; - module.GetSymbolFileSpec().GetPath(symbol_path_arr, - sizeof(symbol_path_arr)); + module.GetSymbolFileSpec().GetPath(symbol_path_arr, sizeof(symbol_path_arr)); std::string symbol_path(symbol_path_arr); object.try_emplace("symbolFilePath", symbol_path); } else { @@ -404,9 +352,8 @@ llvm::json::Value CreateModule(lldb::SBModule &module) { object.try_emplace("addressRange", loaded_addr); std::string version_str; uint32_t version_nums[3]; - uint32_t num_versions = - module.GetVersion(version_nums, sizeof(version_nums) / sizeof(uint32_t)); - for (uint32_t i = 0; i 
< num_versions; ++i) { + uint32_t num_versions = module.GetVersion(version_nums, sizeof(version_nums)/sizeof(uint32_t)); + for (uint32_t i=0; i Date: Fri, 24 Jul 2020 13:30:04 -0700 Subject: [PATCH 0051/1035] Add Debug Info Size to Symbol Status If a module has debug info, the size of debug symbol will be displayed after the Symbols Loaded Message for each module in the VScode modules view.{F12335461} Reviewed By: wallace, clayborg Differential Revision: https://reviews.llvm.org/D83731 --- .../tools/lldb-vscode/lldbvscode_testcase.py | 9 +-- .../API/tools/lldb-vscode/module/Makefile | 10 ++- .../lldb-vscode/module/TestVSCode_module.py | 67 ++++++++++++++----- lldb/tools/lldb-vscode/JSONUtils.cpp | 61 +++++++++++++++-- 4 files changed, 121 insertions(+), 26 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py index c1b33c220b4bd..fa5a9c0db1ebd 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py @@ -53,11 +53,12 @@ def set_function_breakpoints(self, functions, condition=None, breakpoint_ids.append('%i' % (breakpoint['id'])) return breakpoint_ids - def waitUntil(self, condition): - while True: - if condition(): - break + def waitUntil(self, condition_callback): + for _ in range(20): + if condition_callback(): + return True time.sleep(0.5) + return False def verify_breakpoint_hit(self, breakpoint_ids): '''Wait for the process we are debugging to stop, and verify we hit diff --git a/lldb/test/API/tools/lldb-vscode/module/Makefile b/lldb/test/API/tools/lldb-vscode/module/Makefile index 1fb944b138937..b30baf48b972e 100644 --- a/lldb/test/API/tools/lldb-vscode/module/Makefile +++ b/lldb/test/API/tools/lldb-vscode/module/Makefile @@ -2,12 +2,16 @@ DYLIB_NAME := foo DYLIB_CXX_SOURCES := foo.cpp CXX_SOURCES := main.cpp -all: a.out.stripped +LD_EXTRAS := -Wl,-rpath "-Wl,$(shell pwd)" +USE_LIBDL :=1 include Makefile.rules -a.out.stripped: a.out.dSYM +all: a.out.stripped + +a.out.stripped: strip -o a.out.stripped a.out + ifneq "$(CODESIGN)" "" $(CODESIGN) -fs - a.out.stripped -endif +endif \ No newline at end of file diff --git a/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py b/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py index 40c4145b38e36..a16430fccae1d 100644 --- a/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py +++ b/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py @@ -10,56 +10,93 @@ from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil import lldbvscode_testcase +import re class TestVSCode_module(lldbvscode_testcase.VSCodeTestCaseBase): mydir = TestBase.compute_mydir(__file__) - - @skipIfWindows - @skipUnlessDarwin - @skipIfRemote - def test_modules_event(self): + def run_test(self, symbol_basename, expect_debug_info_size): program_basename = "a.out.stripped" - program= self.getBuildArtifact(program_basename) + program = self.getBuildArtifact(program_basename) self.build_and_launch(program) functions = ['foo'] breakpoint_ids = self.set_function_breakpoints(functions) - self.assertEquals(len(breakpoint_ids), len(functions), - 'expect one breakpoint') + self.assertEquals(len(breakpoint_ids), len(functions), 'expect one breakpoint') self.continue_to_breakpoints(breakpoint_ids) active_modules = self.vscode.get_active_modules() - self.assertIn(program_basename, active_modules, '%s 
module is in active modules' % (program_basename)) program_module = active_modules[program_basename] + self.assertIn(program_basename, active_modules, '%s module is in active modules' % (program_basename)) self.assertIn('name', program_module, 'make sure name is in module') self.assertEqual(program_basename, program_module['name']) self.assertIn('path', program_module, 'make sure path is in module') self.assertEqual(program, program_module['path']) self.assertTrue('symbolFilePath' not in program_module, 'Make sure a.out.stripped has no debug info') self.assertEqual('Symbols not found.', program_module['symbolStatus']) - symbol_path = self.getBuildArtifact("a.out") - self.vscode.request_evaluate('`%s' % ('target symbols add -s "%s" "%s"' % (program, symbol_path))) + symbols_path = self.getBuildArtifact(symbol_basename) + self.vscode.request_evaluate('`%s' % ('target symbols add -s "%s" "%s"' % (program, symbols_path))) def checkSymbolsLoaded(): active_modules = self.vscode.get_active_modules() program_module = active_modules[program_basename] return 'Symbols loaded.' == program_module['symbolStatus'] - self.waitUntil(checkSymbolsLoaded) + def checkSymbolsLoadedWithSize(): + active_modules = self.vscode.get_active_modules() + program_module = active_modules[program_basename] + symbolsStatus = program_module['symbolStatus'] + symbol_regex = re.compile(r"Symbols loaded. \([0-9]+(\.[0-9]*)?[KMG]?B\)") + return symbol_regex.match(program_module['symbolStatus']) + + if expect_debug_info_size: + self.waitUntil(checkSymbolsLoadedWithSize) + else: + self.waitUntil(checkSymbolsLoaded) active_modules = self.vscode.get_active_modules() program_module = active_modules[program_basename] self.assertEqual(program_basename, program_module['name']) self.assertEqual(program, program_module['path']) - self.assertEqual('Symbols loaded.', program_module['symbolStatus']) self.assertIn('symbolFilePath', program_module) - self.assertEqual(symbol_path, program_module['symbolFilePath']) + self.assertIn(symbols_path, program_module['symbolFilePath']) self.assertIn('addressRange', program_module) + @skipIfWindows + @skipUnlessDarwin + @skipIfRemote + #TODO: Update the Makefile so that this test runs on Linux + def test_module_event(self): + ''' + Mac or linux. + + On mac, if we load a.out as our symbol file, we will use DWARF with .o files and we will + have debug symbols, but we won't see any debug info size because all of the DWARF + sections are in .o files. + + On other platforms, we expect a.out to have debug info, so we will expect a size. + expect_debug_info_size = platform.system() != 'Darwin' + return self.run_test("a.out", expect_debug_info_size) + ''' + expect_debug_info_size = platform.system() != 'Darwin' + return self.run_test("a.out", expect_debug_info_size) + + @skipIfWindows + @skipUnlessDarwin + @skipIfRemote + def test_module_event_dsym(self): + ''' + Darwin only test with dSYM file. + + On mac, if we load a.out.dSYM as our symbol file, we will have debug symbols and we + will have DWARF sections added to the module, so we will expect a size. 
+ return self.run_test("a.out.dSYM", True) + ''' + return self.run_test("a.out.dSYM", True) + @skipIfWindows @skipUnlessDarwin @skipIfRemote def test_compile_units(self): - program= self.getBuildArtifact("a.out") + program = self.getBuildArtifact("a.out") self.build_and_launch(program) source = "main.cpp" main_source_path = self.getSourcePath(source) diff --git a/lldb/tools/lldb-vscode/JSONUtils.cpp b/lldb/tools/lldb-vscode/JSONUtils.cpp index 1ebaa5c377121..f6cdcf5a46cfc 100644 --- a/lldb/tools/lldb-vscode/JSONUtils.cpp +++ b/lldb/tools/lldb-vscode/JSONUtils.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include +#include +#include #include "llvm/ADT/Optional.h" #include "llvm/Support/FormatAdapters.h" @@ -327,6 +329,50 @@ llvm::json::Value CreateBreakpoint(lldb::SBBreakpoint &bp, return llvm::json::Value(std::move(object)); } +static uint64_t GetDebugInfoSizeInSection(lldb::SBSection section) { + uint64_t debug_info_size = 0; + llvm::StringRef section_name(section.GetName()); + if (section_name.startswith(".debug") || section_name.startswith("__debug") || + section_name.startswith(".apple") || section_name.startswith("__apple")) + debug_info_size += section.GetFileByteSize(); + size_t num_sub_sections = section.GetNumSubSections(); + for (size_t i = 0; i < num_sub_sections; i++) { + debug_info_size += + GetDebugInfoSizeInSection(section.GetSubSectionAtIndex(i)); + } + return debug_info_size; +} + +static uint64_t GetDebugInfoSize(lldb::SBModule module) { + uint64_t debug_info_size = 0; + size_t num_sections = module.GetNumSections(); + for (size_t i = 0; i < num_sections; i++) { + debug_info_size += GetDebugInfoSizeInSection(module.GetSectionAtIndex(i)); + } + return debug_info_size; +} + +static std::string ConvertDebugInfoSizeToString(uint64_t debug_info) { + std::ostringstream oss; + oss << " ("; + oss << std::fixed << std::setprecision(1); + + if (debug_info < 1024) { + oss << debug_info << "B"; + } else if (debug_info < 1024 * 1024) { + double kb = double(debug_info) / 1024.0; + oss << kb << "KB"; + } else if (debug_info < 1024 * 1024 * 1024) { + double mb = double(debug_info) / (1024.0 * 1024.0); + oss << mb << "MB"; + } else { + double gb = double(debug_info) / (1024.0 * 1024.0 * 1024.0); + oss << gb << "GB"; + ; + } + oss << ")"; + return oss.str(); +} llvm::json::Value CreateModule(lldb::SBModule &module) { llvm::json::Object object; if (!module.IsValid()) @@ -339,9 +385,15 @@ llvm::json::Value CreateModule(lldb::SBModule &module) { std::string module_path(module_path_arr); object.try_emplace("path", module_path); if (module.GetNumCompileUnits() > 0) { - object.try_emplace("symbolStatus", "Symbols loaded."); + std::string symbol_str = "Symbols loaded."; + uint64_t debug_info = GetDebugInfoSize(module); + if (debug_info > 0) { + symbol_str += ConvertDebugInfoSizeToString(debug_info); + } + object.try_emplace("symbolStatus", symbol_str); char symbol_path_arr[PATH_MAX]; - module.GetSymbolFileSpec().GetPath(symbol_path_arr, sizeof(symbol_path_arr)); + module.GetSymbolFileSpec().GetPath(symbol_path_arr, + sizeof(symbol_path_arr)); std::string symbol_path(symbol_path_arr); object.try_emplace("symbolFilePath", symbol_path); } else { @@ -352,8 +404,9 @@ llvm::json::Value CreateModule(lldb::SBModule &module) { object.try_emplace("addressRange", loaded_addr); std::string version_str; uint32_t version_nums[3]; - uint32_t num_versions = module.GetVersion(version_nums, sizeof(version_nums)/sizeof(uint32_t)); - for (uint32_t i=0; i Date: 
Fri, 24 Jul 2020 13:36:13 -0700 Subject: [PATCH 0052/1035] [CMake] Find zlib when building lldb as standalone This addresses the issue introduced by 10b1b4a. --- lldb/cmake/modules/LLDBStandalone.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lldb/cmake/modules/LLDBStandalone.cmake b/lldb/cmake/modules/LLDBStandalone.cmake index 752113bcc6c41..edd2b34ec8655 100644 --- a/lldb/cmake/modules/LLDBStandalone.cmake +++ b/lldb/cmake/modules/LLDBStandalone.cmake @@ -73,6 +73,11 @@ endif() # We append the directory in which LLVMConfig.cmake lives. We expect LLVM's # CMake modules to be in that directory as well. list(APPEND CMAKE_MODULE_PATH "${LLVM_DIR}") + +if(LLVM_ENABLE_ZLIB) + find_package(ZLIB) +endif() + include(AddLLVM) include(TableGen) include(HandleLLVMOptions) From 2bd72abef0f24d9f7da13af3a43a25409159a87e Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 22 Jul 2020 21:24:21 -0400 Subject: [PATCH 0053/1035] AMDGPU: Skip other terminators before inserting s_cbranch_exec[n]z PHIElimination/createPHISourceCopy inserts non-branch terminators after the control flow pseudo if a successor phi reads a register defined by the control flow pseudo. If this happens, we need to split the expansion of the control flow pseudo to ensure all the branches are after all of the other mask management instructions. GlobalISel hit this in testscases that happened to be tail duplicated. The original testcase still does not work, since the same problem appears to be present in a later pass. --- llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 24 +- .../lower-control-flow-other-terminators.mir | 246 ++++++++++++++++++ .../AMDGPU/si-if-lower-user-terminators.mir | 75 ------ 3 files changed, 269 insertions(+), 76 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir delete mode 100644 llvm/test/CodeGen/AMDGPU/si-if-lower-user-terminators.mir diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 36d52ac3ee891..140e1f0e122a9 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -122,6 +122,19 @@ class SILowerControlFlow : public MachineFunctionPass { skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const; + /// Find the insertion point for a new conditional branch. + MachineBasicBlock::iterator + skipToUncondBrOrEnd(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + assert(I->isTerminator()); + + // FIXME: What if we had multiple pre-existing conditional branches? + MachineBasicBlock::iterator End = MBB.end(); + while (I != End && !I->isUnconditionalBranch()) + ++I; + return I; + } + // Remove redundant SI_END_CF instructions. void optimizeEndCf(); @@ -275,6 +288,10 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec) .addReg(Tmp, RegState::Kill); + // Skip ahead to the unconditional branch in case there are other terminators + // present. + I = skipToUncondBrOrEnd(MBB, I); + // Insert the S_CBRANCH_EXECZ instruction which will be optimized later // during SIRemoveShortExecBranches. MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) @@ -353,6 +370,10 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { .addReg(Exec) .addReg(DstReg); + // Skip ahead to the unconditional branch in case there are other terminators + // present. 
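+  // This keeps every branch after all of the other exec mask management
+  // instructions.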
+ ElsePt = skipToUncondBrOrEnd(MBB, ElsePt); + MachineInstr *Branch = BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) .addMBB(DestBB); @@ -435,8 +456,9 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) { .addReg(Exec) .add(MI.getOperand(0)); + auto BranchPt = skipToUncondBrOrEnd(MBB, MI.getIterator()); MachineInstr *Branch = - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + BuildMI(MBB, BranchPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) .add(MI.getOperand(1)); if (LIS) { diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir new file mode 100644 index 0000000000000..08e6f1a067ac5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir @@ -0,0 +1,246 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass=si-lower-control-flow -o - %s | FileCheck %s + +# Test si-lower-control-flow insertion points when other terminator +# instructions are present besides the control flow pseudo and a +# branch. + + +# There's another terminator instruction between SI_IF and +# S_BRANCH. The S_CBRANCH_EXECZ should be inserted immediately before +# S_BRANCH. +--- +name: other_terminator_sbranch_after_si_if +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: other_terminator_sbranch_after_si_if + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec + ; CHECK: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc + ; CHECK: $exec = S_MOV_B64_term killed [[S_AND_B64_]] + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY1]], implicit $exec + ; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.2: + ; CHECK: S_ENDPGM 0, implicit [[S_MOV_B64_term]] + bb.0: + successors: %bb.2, %bb.1 + liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5 + + %0:vgpr_32 = COPY killed $vgpr0 + %1:sreg_64_xexec = COPY $sgpr4_sgpr5 + %2:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %0, implicit $exec + %3:sreg_64_xexec = SI_IF %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %4:sreg_64_xexec = S_MOV_B64_term killed %1, implicit $exec + S_BRANCH %bb.2 + + bb.1: + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0, implicit %4 + +... 
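+# With a fallthrough instead of an S_BRANCH, skipToUncondBrOrEnd runs to the
+# end of the terminator list: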
+ +# S_CBRANCH_EXECZ should be inserted after the other terminator +--- +name: other_terminator_fallthrough_after_si_if +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: other_terminator_fallthrough_after_si_if + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec + ; CHECK: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc + ; CHECK: $exec = S_MOV_B64_term killed [[S_AND_B64_]] + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY1]], implicit $exec + ; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.2: + ; CHECK: S_ENDPGM 0, implicit [[S_MOV_B64_term]] + bb.0: + successors: %bb.2, %bb.1 + liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5 + + %0:vgpr_32 = COPY killed $vgpr0 + %1:sreg_64_xexec = COPY $sgpr4_sgpr5 + %2:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %0, implicit $exec + %3:sreg_64_xexec = SI_IF %2, %bb.2, implicit-def $exec, implicit-def dead $scc, implicit $exec + %4:sreg_64_xexec = S_MOV_B64_term killed %1, implicit $exec + + bb.1: + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0, implicit %4 + +... + +--- +name: other_terminator_sbranch_after_si_else +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: other_terminator_sbranch_after_si_else + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5 + ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY %2 + ; CHECK: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_SAVEEXEC_B64 [[COPY]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY1]], implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY2]], implicit $exec + ; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.2: + ; CHECK: S_ENDPGM 0, implicit [[S_MOV_B64_term]] + bb.0: + successors: %bb.2, %bb.1 + liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5 + + %0:vgpr_32 = COPY killed $vgpr0 + %1:sreg_64_xexec = COPY $sgpr4_sgpr5 + %2:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %0, implicit $exec + %3:sreg_64_xexec = SI_ELSE %2, %bb.1, 0, implicit-def $exec, implicit-def dead $scc, implicit $exec + %4:sreg_64_xexec = S_MOV_B64_term killed %1, implicit $exec + S_BRANCH %bb.2 + + bb.1: + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0, implicit %4 + +... 
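+# The same rule applies to SI_LOOP: the S_CBRANCH_EXECNZ is inserted just
+# before the S_BRANCH, after the S_MOV_B64_term copy.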
+ +--- +name: other_terminator_sbranch_after_si_loop +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: other_terminator_sbranch_after_si_loop + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec + ; CHECK: $exec = S_ANDN2_B64_term $exec, [[V_CMP_EQ_U32_e64_]], implicit-def $scc + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY1]], implicit $exec + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.2: + ; CHECK: S_ENDPGM 0, implicit [[S_MOV_B64_term]] + bb.0: + successors: %bb.2, %bb.1 + liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5 + + %0:vgpr_32 = COPY killed $vgpr0 + %1:sreg_64_xexec = COPY $sgpr4_sgpr5 + %2:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %0, implicit $exec + SI_LOOP %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %4:sreg_64_xexec = S_MOV_B64_term killed %1, implicit $exec + S_BRANCH %bb.2 + + bb.1: + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0, implicit %4 + +... + +# The save exec result register of SI_IF is used by other terminators +# inserted to behave as a lowered phi. The output register of SI_IF +# was ignored, and the def was removed, so the S_MOV_B64_term uses +# would fail the verifier. + +--- +name: si_if_use +alignment: 1 +legalized: true +regBankSelected: true +selected: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: si_if_use + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 killed [[COPY]], killed [[COPY1]], implicit $exec + ; CHECK: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc + ; CHECK: $exec = S_MOV_B64_term killed [[S_AND_B64_]] + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec + ; CHECK: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec + ; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term1]] + ; CHECK: dead %7:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1) + ; CHECK: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[COPY3]] + ; CHECK: bb.2: + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: [[COPY5:%[0-9]+]]:sreg_64_xexec = COPY [[COPY4]] + ; CHECK: $exec = S_OR_B64 $exec, killed [[COPY5]], implicit-def $scc + ; CHECK: S_SLEEP 1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec + ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY6]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY6]], 
implicit-def dead $scc + ; CHECK: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] + ; CHECK: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec + ; CHECK: [[S_MOV_B64_term2:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec + ; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec + ; CHECK: S_BRANCH %bb.2 + bb.0: + liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + + %0:vgpr_32 = COPY killed $vgpr0 + %1:vgpr_32 = COPY killed $vgpr1 + %3:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %0, killed %1, implicit $exec + %10:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %14:sreg_64_xexec = S_MOV_B64_term %10, implicit $exec + %13:sreg_64_xexec = S_MOV_B64_term %10, implicit $exec + S_BRANCH %bb.2 + + bb.1: + %11:sreg_64_xexec = COPY %13 + dead %6:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1) + %14:sreg_64_xexec = COPY %11 + + bb.2: + %12:sreg_64_xexec = COPY %14 + SI_END_CF killed %12, implicit-def $exec, implicit-def dead $scc, implicit $exec + S_SLEEP 1 + %9:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %14:sreg_64_xexec = S_MOV_B64_term %9, implicit $exec + %13:sreg_64_xexec = S_MOV_B64_term %9, implicit $exec + S_BRANCH %bb.2 + +... diff --git a/llvm/test/CodeGen/AMDGPU/si-if-lower-user-terminators.mir b/llvm/test/CodeGen/AMDGPU/si-if-lower-user-terminators.mir deleted file mode 100644 index 5850a3b27bce8..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/si-if-lower-user-terminators.mir +++ /dev/null @@ -1,75 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=si-lower-control-flow -verify-machineinstrs -o - %s | FileCheck %s - -# The save exec result register of SI_IF is used by other terminators -# inserted to behave as a lowered phi. The output register of SI_IF -# was ignored, and the def was removed, so the S_MOV_B64_term uses -# would fail the verifier. 
- ---- -name: si_if_use -alignment: 1 -legalized: true -regBankSelected: true -selected: true -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: si_if_use - ; CHECK: bb.0: - ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 killed [[COPY]], killed [[COPY1]], implicit $exec - ; CHECK: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc - ; CHECK: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec - ; CHECK: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec - ; CHECK: S_BRANCH %bb.2 - ; CHECK: bb.1: - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term1]] - ; CHECK: dead %7:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1) - ; CHECK: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[COPY3]] - ; CHECK: bb.2: - ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; CHECK: [[COPY5:%[0-9]+]]:sreg_64_xexec = COPY [[COPY4]] - ; CHECK: $exec = S_OR_B64 $exec, killed [[COPY5]], implicit-def $scc - ; CHECK: S_SLEEP 1 - ; CHECK: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY6]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY6]], implicit-def dead $scc - ; CHECK: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec - ; CHECK: [[S_MOV_B64_term2:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec - ; CHECK: S_BRANCH %bb.2 - bb.0: - liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 - - %0:vgpr_32 = COPY killed $vgpr0 - %1:vgpr_32 = COPY killed $vgpr1 - %3:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %0, killed %1, implicit $exec - %10:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec - %14:sreg_64_xexec = S_MOV_B64_term %10, implicit $exec - %13:sreg_64_xexec = S_MOV_B64_term %10, implicit $exec - S_BRANCH %bb.2 - - bb.1: - %11:sreg_64_xexec = COPY %13 - dead %6:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1) - %14:sreg_64_xexec = COPY %11 - - bb.2: - %12:sreg_64_xexec = COPY %14 - SI_END_CF killed %12, implicit-def $exec, implicit-def dead $scc, implicit $exec - S_SLEEP 1 - %9:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec - %14:sreg_64_xexec = S_MOV_B64_term %9, implicit $exec - %13:sreg_64_xexec = S_MOV_B64_term %9, implicit $exec - S_BRANCH %bb.2 - -... 
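
The CHECK lines in the tests above all hinge on where the lowered S_CBRANCH_EXECZ lands relative to the other terminators. A rough C++ sketch of that insertion-point search follows, assuming LLVM's MachineBasicBlock/MachineInstr API; the helper name is hypothetical, and this is a sketch rather than the actual SILowerControlFlow code:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

// Hypothetical helper, not the pass's real code: starting at the exec-mask
// update, walk past any remaining terminators (e.g. the lowered-phi
// S_MOV_B64_term copies) and stop at an unconditional branch or the block
// end. Inserting S_CBRANCH_EXECZ at the returned iterator keeps the
// terminator copies ahead of the branch, so they execute on both paths.
static MachineBasicBlock::iterator
findCondBranchInsertPt(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) {
  while (I != MBB.end() && I->isTerminator() && !I->isUnconditionalBranch())
    ++I;
  return I;
}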
From 4dc3014c51fda2a3189318c4ae54c4da9cfc6a0e Mon Sep 17 00:00:00 2001
From: Jon Roelofs
Date: Fri, 24 Jul 2020 14:54:17 -0600
Subject: [PATCH 0054/1035] [compiler-rt][fuzzer] Disable bcmp.test on darwin

It broke one of the buildbots:

http://lab.llvm.org:8080/green/job/clang-stage1-RA/13026/console
---
 compiler-rt/test/fuzzer/bcmp.test | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/test/fuzzer/bcmp.test b/compiler-rt/test/fuzzer/bcmp.test
index 5bbbe9845beb2..8c1e532b16aba 100644
--- a/compiler-rt/test/fuzzer/bcmp.test
+++ b/compiler-rt/test/fuzzer/bcmp.test
@@ -1,4 +1,4 @@
-UNSUPPORTED: freebsd, windows
+UNSUPPORTED: darwin, freebsd, windows
 RUN: %cpp_compiler -DMEMCMP=bcmp %S/MemcmpTest.cpp -o %t
 RUN: not %run %t -seed=1 -runs=10000000 2>&1 | FileCheck %s
 CHECK: BINGO

From 3554cf4f382c7f18beb1265f5199651470c438d8 Mon Sep 17 00:00:00 2001
From: Jinsong Ji
Date: Fri, 24 Jul 2020 20:55:52 +0000
Subject: [PATCH 0055/1035] [compiler-rt][CMake] Remove unused -stdlib when
 passing -nostdinc++

We added -nostdinc++ to clang_rt.profile in https://reviews.llvm.org/D84205.
This will cause warnings when building with LLVM_ENABLE_LIBCXX, and failures
if -Werror is on.

This patch fixes it by removing the unused -stdlib, similar to what we have
done in https://reviews.llvm.org/D42238.

Reviewed By: phosek

Differential Revision: https://reviews.llvm.org/D84543
---
 compiler-rt/lib/profile/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt
index 5ff0e10182b4d..a968009f9ea90 100644
--- a/compiler-rt/lib/profile/CMakeLists.txt
+++ b/compiler-rt/lib/profile/CMakeLists.txt
@@ -113,6 +113,8 @@ endif()
 
 # We don't use the C++ Standard Library here, so avoid including it by mistake.
 append_list_if(COMPILER_RT_HAS_NOSTDINCXX_FLAG -nostdinc++ EXTRA_FLAGS)
+# Remove -stdlib= which is unused when passing -nostdinc++.
+string(REGEX REPLACE "-stdlib=[a-zA-Z+]*" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
 
 # This appears to be a C-only warning banning the use of locals in aggregate
 # initializers. All other compilers accept this, though.

From 8bf4c1f4fb257774f66c8cda07adc6c5e8668326 Mon Sep 17 00:00:00 2001
From: Alina Sbirlea
Date: Thu, 9 Apr 2020 18:29:40 -0700
Subject: [PATCH 0056/1035] Reapply "[DomTree] Replace ChildrenGetter with
 GraphTraits over GraphDiff."

This is the part of the patch that moves the Updates to a CFGDiff object.
It is split off from the clean-up work merging the two branches when BUI is
null.
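
For intuition, a self-contained sketch of the snapshot idea (illustrative only, not the LLVM implementation): a node's children in the pre-view CFG are its current children, minus any edge the pending updates insert, plus any edge they delete.

#include <algorithm>
#include <vector>

enum class Kind { Insert, Delete };
struct Update { int From, To; Kind K; };

// Children of N as they looked *before* Updates were applied: hide edges the
// batch inserted (they exist now but not in the snapshot) and re-add edges it
// deleted (gone now, but present in the snapshot).
std::vector<int> snapshotChildren(int N, std::vector<int> Children,
                                  const std::vector<Update> &Updates) {
  for (const Update &U : Updates) {
    if (U.From != N)
      continue;
    if (U.K == Kind::Insert)
      Children.erase(std::remove(Children.begin(), Children.end(), U.To),
                     Children.end());
    else
      Children.push_back(U.To);
  }
  return Children;
}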
Differential Revision: https://reviews.llvm.org/D77341 --- llvm/include/llvm/IR/Dominators.h | 9 +- llvm/include/llvm/Support/CFGDiff.h | 42 ++++- llvm/include/llvm/Support/GenericDomTree.h | 11 +- .../llvm/Support/GenericDomTreeConstruction.h | 170 +++++------------- llvm/lib/IR/Dominators.cpp | 5 +- 5 files changed, 98 insertions(+), 139 deletions(-) diff --git a/llvm/include/llvm/IR/Dominators.h b/llvm/include/llvm/IR/Dominators.h index 71595cb15df48..7b34309cb1319 100644 --- a/llvm/include/llvm/IR/Dominators.h +++ b/llvm/include/llvm/IR/Dominators.h @@ -44,6 +44,9 @@ using BBPostDomTree = PostDomTreeBase; using BBUpdates = ArrayRef>; +using BBDomTreeGraphDiff = GraphDiff; +using BBPostDomTreeGraphDiff = GraphDiff; + extern template void Calculate(BBDomTree &DT); extern template void CalculateWithUpdates(BBDomTree &DT, BBUpdates U); @@ -62,8 +65,10 @@ extern template void DeleteEdge(BBPostDomTree &DT, BasicBlock *From, BasicBlock *To); -extern template void ApplyUpdates(BBDomTree &DT, BBUpdates); -extern template void ApplyUpdates(BBPostDomTree &DT, BBUpdates); +extern template void ApplyUpdates(BBDomTree &DT, + BBDomTreeGraphDiff &); +extern template void ApplyUpdates(BBPostDomTree &DT, + BBPostDomTreeGraphDiff &); extern template bool Verify(const BBDomTree &DT, BBDomTree::VerificationLevel VL); diff --git a/llvm/include/llvm/Support/CFGDiff.h b/llvm/include/llvm/Support/CFGDiff.h index 94734ce70e02c..269984872bfac 100644 --- a/llvm/include/llvm/Support/CFGDiff.h +++ b/llvm/include/llvm/Support/CFGDiff.h @@ -70,6 +70,23 @@ namespace llvm { +namespace detail { +template +auto reverse_if_helper(Range &&R, std::integral_constant) { + return std::forward(R); +} + +template +auto reverse_if_helper(Range &&R, std::integral_constant) { + return llvm::reverse(std::forward(R)); +} + +template auto reverse_if(Range &&R) { + return reverse_if_helper(std::forward(R), + std::integral_constant{}); +} +} // namespace detail + // GraphDiff defines a CFG snapshot: given a set of Update, provide // utilities to skip edges marked as deleted and return a set of edges marked as // newly inserted. The current diff treats the CFG as a graph rather than a @@ -113,8 +130,7 @@ template class GraphDiff { GraphDiff() : UpdatedAreReverseApplied(false) {} GraphDiff(ArrayRef> Updates, bool ReverseApplyUpdates = false) { - cfg::LegalizeUpdates(Updates, LegalizedUpdates, InverseGraph, - /*ReverseResultOrder=*/true); + cfg::LegalizeUpdates(Updates, LegalizedUpdates, InverseGraph); // The legalized updates are stored in reverse so we can pop_back when doing // incremental updates. for (auto U : LegalizedUpdates) { @@ -174,6 +190,25 @@ template class GraphDiff { return make_range(It->second.begin(), It->second.end()); } + using VectRet = SmallVector; + + template VectRet getChildren(NodePtr N) const { + using DirectedNodeT = + std::conditional_t, NodePtr>; + auto R = children(N); + auto CurrentCFGChildren = detail::reverse_if(R); + + VectRet UpdatedCFGChildren; + for (auto Child : CurrentCFGChildren) + if (Child && !ignoreChild(N, Child, InverseEdge)) + UpdatedCFGChildren.push_back(Child); + + auto AddedCFGChildren = getAddedChildren(N, InverseEdge); + UpdatedCFGChildren.insert(UpdatedCFGChildren.end(), + AddedCFGChildren.begin(), AddedCFGChildren.end()); + return UpdatedCFGChildren; + } + void print(raw_ostream &OS) const { OS << "===== GraphDiff: CFG edge changes to create a CFG snapshot. 
\n" "===== (Note: notion of children/inverse_children depends on " @@ -210,9 +245,10 @@ struct CFGViewChildren { // filter iterator init: auto R = make_range(GT::child_begin(N.second), GT::child_end(N.second)); + auto RR = detail::reverse_if(R); // This lambda is copied into the iterators and persists to callers, ensure // captures are by value or otherwise have sufficient lifetime. - auto First = make_filter_range(makeChildRange(R, N.first), [N](NodeRef C) { + auto First = make_filter_range(makeChildRange(RR, N.first), [N](NodeRef C) { return !C.first->ignoreChild(N.second, C.second, InverseEdge); }); diff --git a/llvm/include/llvm/Support/GenericDomTree.h b/llvm/include/llvm/Support/GenericDomTree.h index 10e591a69d369..325365824b3b9 100644 --- a/llvm/include/llvm/Support/GenericDomTree.h +++ b/llvm/include/llvm/Support/GenericDomTree.h @@ -28,6 +28,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/CFGDiff.h" #include "llvm/Support/CFGUpdate.h" #include "llvm/Support/raw_ostream.h" #include @@ -211,7 +212,8 @@ void DeleteEdge(DomTreeT &DT, typename DomTreeT::NodePtr From, template void ApplyUpdates(DomTreeT &DT, - ArrayRef Updates); + GraphDiff &PreViewCFG); template bool Verify(const DomTreeT &DT, typename DomTreeT::VerificationLevel VL); @@ -535,10 +537,13 @@ class DominatorTreeBase { /// The type of updates is the same for DomTreeBase and PostDomTreeBase /// with the same template parameter T. /// - /// \param Updates An unordered sequence of updates to perform. + /// \param Updates An unordered sequence of updates to perform. The current + /// CFG and the reverse of these updates provides the pre-view of the CFG. /// void applyUpdates(ArrayRef Updates) { - DomTreeBuilder::ApplyUpdates(*this, Updates); + GraphDiff PreViewCFG( + Updates, /*ReverseApplyUpdates=*/true); + DomTreeBuilder::ApplyUpdates(*this, PreViewCFG); } /// Inform the dominator tree about a CFG edge insertion and update the tree. diff --git a/llvm/include/llvm/Support/GenericDomTreeConstruction.h b/llvm/include/llvm/Support/GenericDomTreeConstruction.h index 464de4e2b3ba1..709276ab7a29f 100644 --- a/llvm/include/llvm/Support/GenericDomTreeConstruction.h +++ b/llvm/include/llvm/Support/GenericDomTreeConstruction.h @@ -58,6 +58,7 @@ struct SemiNCAInfo { using TreeNodePtr = DomTreeNodeBase *; using RootsT = decltype(DomTreeT::Roots); static constexpr bool IsPostDom = DomTreeT::IsPostDominator; + using GraphDiffT = GraphDiff; // Information record used by Semi-NCA during tree construction. struct InfoRec { @@ -77,28 +78,27 @@ struct SemiNCAInfo { using UpdateT = typename DomTreeT::UpdateType; using UpdateKind = typename DomTreeT::UpdateKind; struct BatchUpdateInfo { - SmallVector Updates; - using NodePtrAndKind = PointerIntPair; - - // In order to be able to walk a CFG that is out of sync with the CFG - // DominatorTree last knew about, use the list of updates to reconstruct - // previous CFG versions of the current CFG. For each node, we store a set - // of its virtually added/deleted future successors and predecessors. - // Note that these children are from the future relative to what the - // DominatorTree knows about -- using them to gets us some snapshot of the - // CFG from the past (relative to the state of the CFG). - DenseMap> FutureSuccessors; - DenseMap> FuturePredecessors; + // Note: Updates inside PreViewCFG are aleady legalized. 
+ BatchUpdateInfo(GraphDiffT &PreViewCFG) + : PreViewCFG(PreViewCFG), + NumLegalized(PreViewCFG.getNumLegalizedUpdates()) {} + // Remembers if the whole tree was recalculated at some point during the // current batch update. bool IsRecalculated = false; + GraphDiffT &PreViewCFG; + const size_t NumLegalized; }; BatchUpdateInfo *BatchUpdates; using BatchUpdatePtr = BatchUpdateInfo *; + std::unique_ptr EmptyGD; // If BUI is a nullptr, then there's no batch update in progress. - SemiNCAInfo(BatchUpdatePtr BUI) : BatchUpdates(BUI) {} + SemiNCAInfo(BatchUpdatePtr BUI) : BatchUpdates(BUI) { + if (!BatchUpdates) + EmptyGD = std::make_unique(); + } void clear() { NumToNode = {nullptr}; // Restore to initial state with a dummy start node. @@ -107,8 +107,7 @@ struct SemiNCAInfo { // in progress, we need this information to continue it. } - template - struct ChildrenGetter { + template struct ChildrenGetter { using ResultTy = SmallVector; static ResultTy Get(NodePtr N, std::integral_constant) { @@ -121,50 +120,16 @@ struct SemiNCAInfo { return ResultTy(IChildren.begin(), IChildren.end()); } - using Tag = std::integral_constant; + using Tag = std::integral_constant; // The function below is the core part of the batch updater. It allows the // Depth Based Search algorithm to perform incremental updates in lockstep // with updates to the CFG. We emulated lockstep CFG updates by getting its // next snapshots by reverse-applying future updates. static ResultTy Get(NodePtr N, BatchUpdatePtr BUI) { - ResultTy Res = Get(N, Tag()); - // If there's no batch update in progress, simply return node's children. - if (!BUI) return Res; - - // CFG children are actually its *most current* children, and we have to - // reverse-apply the future updates to get the node's children at the - // point in time the update was performed. - auto &FutureChildren = (Inverse != IsPostDom) ? BUI->FuturePredecessors - : BUI->FutureSuccessors; - auto FCIt = FutureChildren.find(N); - if (FCIt == FutureChildren.end()) return Res; - - for (auto ChildAndKind : FCIt->second) { - const NodePtr Child = ChildAndKind.getPointer(); - const UpdateKind UK = ChildAndKind.getInt(); - - // Reverse-apply the future update. - if (UK == UpdateKind::Insert) { - // If there's an insertion in the future, it means that the edge must - // exist in the current CFG, but was not present in it before. - assert(llvm::find(Res, Child) != Res.end() - && "Expected child not found in the CFG"); - Res.erase(std::remove(Res.begin(), Res.end(), Child), Res.end()); - LLVM_DEBUG(dbgs() << "\tHiding edge " << BlockNamePrinter(N) << " -> " - << BlockNamePrinter(Child) << "\n"); - } else { - // If there's an deletion in the future, it means that the edge cannot - // exist in the current CFG, but existed in it before. 
- assert(llvm::find(Res, Child) == Res.end() && - "Unexpected child found in the CFG"); - LLVM_DEBUG(dbgs() << "\tShowing virtual edge " << BlockNamePrinter(N) - << " -> " << BlockNamePrinter(Child) << "\n"); - Res.push_back(Child); - } - } - - return Res; + if (!BUI) + return Get(N, Tag()); + return BUI->PreViewCFG.template getChildren(N); } }; @@ -1005,15 +970,14 @@ struct SemiNCAInfo { const TreeNodePtr TN) { LLVM_DEBUG(dbgs() << "IsReachableFromIDom " << BlockNamePrinter(TN) << "\n"); - for (const NodePtr Pred : - ChildrenGetter::Get(TN->getBlock(), BUI)) { + auto TNB = TN->getBlock(); + for (const NodePtr Pred : ChildrenGetter::Get(TNB, BUI)) { LLVM_DEBUG(dbgs() << "\tPred " << BlockNamePrinter(Pred) << "\n"); if (!DT.getNode(Pred)) continue; - const NodePtr Support = - DT.findNearestCommonDominator(TN->getBlock(), Pred); + const NodePtr Support = DT.findNearestCommonDominator(TNB, Pred); LLVM_DEBUG(dbgs() << "\tSupport " << BlockNamePrinter(Support) << "\n"); - if (Support != TN->getBlock()) { + if (Support != TNB) { LLVM_DEBUG(dbgs() << "\t" << BlockNamePrinter(TN) << " is reachable from support " << BlockNamePrinter(Support) << "\n"); @@ -1144,53 +1108,23 @@ struct SemiNCAInfo { //===--------------------- DomTree Batch Updater --------------------------=== //~~ - static void ApplyUpdates(DomTreeT &DT, ArrayRef Updates) { - const size_t NumUpdates = Updates.size(); + static void ApplyUpdates(DomTreeT &DT, GraphDiffT &PreViewCFG) { + const size_t NumUpdates = PreViewCFG.getNumLegalizedUpdates(); if (NumUpdates == 0) return; // Take the fast path for a single update and avoid running the batch update // machinery. if (NumUpdates == 1) { - const auto &Update = Updates.front(); + UpdateT Update = PreViewCFG.popUpdateForIncrementalUpdates(); if (Update.getKind() == UpdateKind::Insert) - DT.insertEdge(Update.getFrom(), Update.getTo()); + InsertEdge(DT, /*BUI=*/nullptr, Update.getFrom(), Update.getTo()); else - DT.deleteEdge(Update.getFrom(), Update.getTo()); - + DeleteEdge(DT, /*BUI=*/nullptr, Update.getFrom(), Update.getTo()); return; } - BatchUpdateInfo BUI; - LLVM_DEBUG(dbgs() << "Legalizing " << BUI.Updates.size() << " updates\n"); - cfg::LegalizeUpdates(Updates, BUI.Updates, IsPostDom); - - const size_t NumLegalized = BUI.Updates.size(); - BUI.FutureSuccessors.reserve(NumLegalized); - BUI.FuturePredecessors.reserve(NumLegalized); - - // Use the legalized future updates to initialize future successors and - // predecessors. Note that these sets will only decrease size over time, as - // the next CFG snapshots slowly approach the actual (current) CFG. - for (UpdateT &U : BUI.Updates) { - BUI.FutureSuccessors[U.getFrom()].push_back({U.getTo(), U.getKind()}); - BUI.FuturePredecessors[U.getTo()].push_back({U.getFrom(), U.getKind()}); - } - -#if 0 - // FIXME: The LLVM_DEBUG macro only plays well with a modular - // build of LLVM when the header is marked as textual, but doing - // so causes redefinition errors. - LLVM_DEBUG(dbgs() << "About to apply " << NumLegalized << " updates\n"); - LLVM_DEBUG(if (NumLegalized < 32) for (const auto &U - : reverse(BUI.Updates)) { - dbgs() << "\t"; - U.dump(); - dbgs() << "\n"; - }); - LLVM_DEBUG(dbgs() << "\n"); -#endif - + BatchUpdateInfo BUI(PreViewCFG); // Recalculate the DominatorTree when the number of updates // exceeds a threshold, which usually makes direct updating slower than // recalculation. 
We select this threshold proportional to the @@ -1200,21 +1134,21 @@ struct SemiNCAInfo { // Make unittests of the incremental algorithm work if (DT.DomTreeNodes.size() <= 100) { - if (NumLegalized > DT.DomTreeNodes.size()) + if (BUI.NumLegalized > DT.DomTreeNodes.size()) CalculateFromScratch(DT, &BUI); - } else if (NumLegalized > DT.DomTreeNodes.size() / 40) + } else if (BUI.NumLegalized > DT.DomTreeNodes.size() / 40) CalculateFromScratch(DT, &BUI); // If the DominatorTree was recalculated at some point, stop the batch // updates. Full recalculations ignore batch updates and look at the actual // CFG. - for (size_t i = 0; i < NumLegalized && !BUI.IsRecalculated; ++i) + for (size_t i = 0; i < BUI.NumLegalized && !BUI.IsRecalculated; ++i) ApplyNextUpdate(DT, BUI); } static void ApplyNextUpdate(DomTreeT &DT, BatchUpdateInfo &BUI) { - assert(!BUI.Updates.empty() && "No updates to apply!"); - UpdateT CurrentUpdate = BUI.Updates.pop_back_val(); + // Popping the next update, will move the PreViewCFG to the next snapshot. + UpdateT CurrentUpdate = BUI.PreViewCFG.popUpdateForIncrementalUpdates(); #if 0 // FIXME: The LLVM_DEBUG macro only plays well with a modular // build of LLVM when the header is marked as textual, but doing @@ -1223,21 +1157,6 @@ struct SemiNCAInfo { LLVM_DEBUG(CurrentUpdate.dump(); dbgs() << "\n"); #endif - // Move to the next snapshot of the CFG by removing the reverse-applied - // current update. Since updates are performed in the same order they are - // legalized it's sufficient to pop the last item here. - auto &FS = BUI.FutureSuccessors[CurrentUpdate.getFrom()]; - assert(FS.back().getPointer() == CurrentUpdate.getTo() && - FS.back().getInt() == CurrentUpdate.getKind()); - FS.pop_back(); - if (FS.empty()) BUI.FutureSuccessors.erase(CurrentUpdate.getFrom()); - - auto &FP = BUI.FuturePredecessors[CurrentUpdate.getTo()]; - assert(FP.back().getPointer() == CurrentUpdate.getFrom() && - FP.back().getInt() == CurrentUpdate.getKind()); - FP.pop_back(); - if (FP.empty()) BUI.FuturePredecessors.erase(CurrentUpdate.getTo()); - if (CurrentUpdate.getKind() == UpdateKind::Insert) InsertEdge(DT, &BUI, CurrentUpdate.getFrom(), CurrentUpdate.getTo()); else @@ -1596,19 +1515,11 @@ void Calculate(DomTreeT &DT) { template void CalculateWithUpdates(DomTreeT &DT, ArrayRef Updates) { - // TODO: Move BUI creation in common method, reuse in ApplyUpdates. - typename SemiNCAInfo::BatchUpdateInfo BUI; - LLVM_DEBUG(dbgs() << "Legalizing " << BUI.Updates.size() << " updates\n"); - cfg::LegalizeUpdates(Updates, BUI.Updates, - DomTreeT::IsPostDominator); - const size_t NumLegalized = BUI.Updates.size(); - BUI.FutureSuccessors.reserve(NumLegalized); - BUI.FuturePredecessors.reserve(NumLegalized); - for (auto &U : BUI.Updates) { - BUI.FutureSuccessors[U.getFrom()].push_back({U.getTo(), U.getKind()}); - BUI.FuturePredecessors[U.getTo()].push_back({U.getFrom(), U.getKind()}); - } - + // FIXME: Updated to use the PreViewCFG and behave the same as until now. + // This behavior is however incorrect; this actually needs the PostViewCFG. 
+ GraphDiff PreViewCFG( + Updates, /*ReverseApplyUpdates=*/true); + typename SemiNCAInfo::BatchUpdateInfo BUI(PreViewCFG); SemiNCAInfo::CalculateFromScratch(DT, &BUI); } @@ -1628,8 +1539,9 @@ void DeleteEdge(DomTreeT &DT, typename DomTreeT::NodePtr From, template void ApplyUpdates(DomTreeT &DT, - ArrayRef Updates) { - SemiNCAInfo::ApplyUpdates(DT, Updates); + GraphDiff &PreViewCFG) { + SemiNCAInfo::ApplyUpdates(DT, PreViewCFG); } template diff --git a/llvm/lib/IR/Dominators.cpp b/llvm/lib/IR/Dominators.cpp index bb1cc347dcb12..f0b169ea1337f 100644 --- a/llvm/lib/IR/Dominators.cpp +++ b/llvm/lib/IR/Dominators.cpp @@ -90,9 +90,10 @@ template void llvm::DomTreeBuilder::DeleteEdge( DomTreeBuilder::BBPostDomTree &DT, BasicBlock *From, BasicBlock *To); template void llvm::DomTreeBuilder::ApplyUpdates( - DomTreeBuilder::BBDomTree &DT, DomTreeBuilder::BBUpdates); + DomTreeBuilder::BBDomTree &DT, DomTreeBuilder::BBDomTreeGraphDiff &); template void llvm::DomTreeBuilder::ApplyUpdates( - DomTreeBuilder::BBPostDomTree &DT, DomTreeBuilder::BBUpdates); + DomTreeBuilder::BBPostDomTree &DT, + DomTreeBuilder::BBPostDomTreeGraphDiff &); template bool llvm::DomTreeBuilder::Verify( const DomTreeBuilder::BBDomTree &DT, From 31d58858425f6021d380eff879dd8983e25a5715 Mon Sep 17 00:00:00 2001 From: Jez Ng Date: Fri, 24 Jul 2020 15:55:25 -0700 Subject: [PATCH 0057/1035] [lld-macho] Partial support for weak definitions This diff adds support for weak definitions, though it doesn't handle weak symbols in dylibs quite correctly -- we need to emit binding opcodes for them in the weak binding section rather than the lazy binding section. What *is* covered in this diff: 1. Reading the weak flag from symbol table / export trie, and writing it to the export trie 2. Refining the symbol table's rules for choosing one symbol definition over another. Wrote a few dozen test cases to make sure we were matching ld64's behavior. We can now link basic C++ programs. 
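
As a source-level illustration of the precedence rules (an assumed example split across three files, not taken from the patch):

// weak.cpp -- weak definition, marked with the Clang/GCC attribute
__attribute__((weak)) int flavor() { return 1; }

// strong.cpp -- non-weak definition of the same symbol
int flavor() { return 2; }

// main.cpp
#include <cstdio>
int flavor();
int main() { std::printf("%d\n", flavor()); }

Linking all three objects should print 2, since the non-weak definition wins; dropping strong.o leaves only the weak definition and prints 1.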
Reviewed By: #lld-macho, compnerd Differential Revision: https://reviews.llvm.org/D83532 --- lld/MachO/Arch/X86_64.cpp | 1 + lld/MachO/ExportTrie.cpp | 14 +-- lld/MachO/InputFiles.cpp | 16 ++-- lld/MachO/SymbolTable.cpp | 28 ++++-- lld/MachO/SymbolTable.h | 11 ++- lld/MachO/Symbols.h | 25 ++++-- lld/MachO/SyntheticSections.cpp | 3 +- lld/test/MachO/weak-definition-direct-fetch.s | 90 +++++++++++++++++++ .../MachO/weak-definition-indirect-fetch.s | 42 +++++++++ lld/test/MachO/weak-definition-order.s | 36 ++++++++ lld/test/MachO/weak-definition-over-dysym.s | 39 ++++++++ 11 files changed, 275 insertions(+), 30 deletions(-) create mode 100644 lld/test/MachO/weak-definition-direct-fetch.s create mode 100644 lld/test/MachO/weak-definition-indirect-fetch.s create mode 100644 lld/test/MachO/weak-definition-order.s create mode 100644 lld/test/MachO/weak-definition-over-dysym.s diff --git a/lld/MachO/Arch/X86_64.cpp b/lld/MachO/Arch/X86_64.cpp index 36f686ca2f1d8..458dad805b4a2 100644 --- a/lld/MachO/Arch/X86_64.cpp +++ b/lld/MachO/Arch/X86_64.cpp @@ -218,6 +218,7 @@ void X86_64::prepareSymbolRelocation(lld::macho::Symbol &sym, in.got->addEntry(sym); break; case X86_64_RELOC_BRANCH: { + // TODO: weak dysyms should go into the weak binding section instead if (auto *dysym = dyn_cast(&sym)) in.stubs->addEntry(*dysym); break; diff --git a/lld/MachO/ExportTrie.cpp b/lld/MachO/ExportTrie.cpp index 7cc81bcfd5f18..993a552435325 100644 --- a/lld/MachO/ExportTrie.cpp +++ b/lld/MachO/ExportTrie.cpp @@ -59,6 +59,10 @@ struct Edge { struct ExportInfo { uint64_t address; + uint8_t flags; + explicit ExportInfo(const Symbol &sym) + : address(sym.getVA()), + flags(sym.isWeakDef() ? EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION : 0) {} // TODO: Add proper support for re-exports & stub-and-resolver flags. }; @@ -83,9 +87,8 @@ bool TrieNode::updateOffset(size_t &nextOffset) { // node. size_t nodeSize; if (info) { - uint64_t flags = 0; uint32_t terminalSize = - getULEB128Size(flags) + getULEB128Size(info->address); + getULEB128Size(info->flags) + getULEB128Size(info->address); // Overall node size so far is the uleb128 size of the length of the symbol // info + the symbol info itself. nodeSize = terminalSize + getULEB128Size(terminalSize); @@ -110,11 +113,10 @@ void TrieNode::writeTo(uint8_t *buf) const { buf += offset; if (info) { // TrieNodes with Symbol info: size, flags address - uint64_t flags = 0; // TODO: emit proper flags uint32_t terminalSize = - getULEB128Size(flags) + getULEB128Size(info->address); + getULEB128Size(info->flags) + getULEB128Size(info->address); buf += encodeULEB128(terminalSize, buf); - buf += encodeULEB128(flags, buf); + buf += encodeULEB128(info->flags, buf); buf += encodeULEB128(info->address, buf); } else { // TrieNode with no Symbol info. 
@@ -194,7 +196,7 @@ void TrieBuilder::sortAndBuild(MutableArrayRef vec, if (isTerminal) { assert(j - i == 1); // no duplicate symbols - node->info = {pivotSymbol->getVA()}; + node->info = ExportInfo(*pivotSymbol); } else { // This is the tail-call-optimized version of the following: // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1); diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp index 46fe82f988222..f1afc187aca23 100644 --- a/lld/MachO/InputFiles.cpp +++ b/lld/MachO/InputFiles.cpp @@ -228,10 +228,9 @@ void InputFile::parseSymbols(ArrayRef nList, StringRef name = strtab + sym.n_strx; if (sym.n_type & N_EXT) // Global defined symbol - return symtab->addDefined(name, isec, value); - else - // Local defined symbol - return make(name, isec, value); + return symtab->addDefined(name, isec, value, sym.n_desc & N_WEAK_DEF); + // Local defined symbol + return make(name, isec, value, sym.n_desc & N_WEAK_DEF); }; for (size_t i = 0, n = nList.size(); i < n; ++i) { @@ -351,7 +350,9 @@ DylibFile::DylibFile(MemoryBufferRef mb, DylibFile *umbrella) auto *c = reinterpret_cast(cmd); parseTrie(buf + c->export_off, c->export_size, [&](const Twine &name, uint64_t flags) { - symbols.push_back(symtab->addDylib(saver.save(name), umbrella)); + bool isWeakDef = flags & EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION; + symbols.push_back( + symtab->addDylib(saver.save(name), umbrella, isWeakDef)); }); } else { error("LC_DYLD_INFO_ONLY not found in " + getName()); @@ -390,10 +391,11 @@ DylibFile::DylibFile(std::shared_ptr interface, dylibName = saver.save(interface->getInstallName()); // TODO(compnerd) filter out symbols based on the target platform + // TODO: handle weak defs for (const auto symbol : interface->symbols()) if (symbol->getArchitectures().has(config->arch)) - symbols.push_back( - symtab->addDylib(saver.save(symbol->getName()), umbrella)); + symbols.push_back(symtab->addDylib(saver.save(symbol->getName()), + umbrella, /*isWeakDef=*/false)); // TODO(compnerd) properly represent the hierarchy of the documents as it is // in theory possible to have re-exported dylibs from re-exported dylibs which // should be parent'ed to the child. diff --git a/lld/MachO/SymbolTable.cpp b/lld/MachO/SymbolTable.cpp index 80e870d79890c..061642d73f441 100644 --- a/lld/MachO/SymbolTable.cpp +++ b/lld/MachO/SymbolTable.cpp @@ -37,15 +37,23 @@ std::pair SymbolTable::insert(StringRef name) { } Symbol *SymbolTable::addDefined(StringRef name, InputSection *isec, - uint32_t value) { + uint32_t value, bool isWeakDef) { Symbol *s; bool wasInserted; std::tie(s, wasInserted) = insert(name); - if (!wasInserted && isa(s)) - error("duplicate symbol: " + name); - - replaceSymbol(s, name, isec, value); + if (!wasInserted) { + if (auto *defined = dyn_cast(s)) { + if (isWeakDef) + return s; + if (!defined->isWeakDef()) + error("duplicate symbol: " + name); + } + // Defined symbols take priority over other types of symbols, so in case + // of a name conflict, we fall through to the replaceSymbol() call below. 
+ } + + replaceSymbol(s, name, isec, value, isWeakDef); return s; } @@ -61,13 +69,15 @@ Symbol *SymbolTable::addUndefined(StringRef name) { return s; } -Symbol *SymbolTable::addDylib(StringRef name, DylibFile *file) { +Symbol *SymbolTable::addDylib(StringRef name, DylibFile *file, bool isWeakDef) { Symbol *s; bool wasInserted; std::tie(s, wasInserted) = insert(name); - if (wasInserted || isa(s)) - replaceSymbol(s, file, name); + if (wasInserted || isa(s) || + (isa(s) && !isWeakDef && s->isWeakDef())) + replaceSymbol(s, file, name, isWeakDef); + return s; } @@ -79,7 +89,7 @@ Symbol *SymbolTable::addLazy(StringRef name, ArchiveFile *file, if (wasInserted) replaceSymbol(s, file, sym); - else if (isa(s)) + else if (isa(s) || (isa(s) && s->isWeakDef())) file->fetch(sym); return s; } diff --git a/lld/MachO/SymbolTable.h b/lld/MachO/SymbolTable.h index 2379008db56da..088b0e97c8406 100644 --- a/lld/MachO/SymbolTable.h +++ b/lld/MachO/SymbolTable.h @@ -22,13 +22,20 @@ class DylibFile; class InputSection; class Symbol; +/* + * Note that the SymbolTable handles name collisions by calling + * replaceSymbol(), which does an in-place update of the Symbol via `placement + * new`. Therefore, there is no need to update any relocations that hold + * pointers the "old" Symbol -- they will automatically point to the new one. + */ class SymbolTable { public: - Symbol *addDefined(StringRef name, InputSection *isec, uint32_t value); + Symbol *addDefined(StringRef name, InputSection *isec, uint32_t value, + bool isWeakDef); Symbol *addUndefined(StringRef name); - Symbol *addDylib(StringRef name, DylibFile *file); + Symbol *addDylib(StringRef name, DylibFile *file, bool isWeakDef); Symbol *addLazy(StringRef name, ArchiveFile *file, const llvm::object::Archive::Symbol &sym); diff --git a/lld/MachO/Symbols.h b/lld/MachO/Symbols.h index 63748ee483245..2dcccd03a8d02 100644 --- a/lld/MachO/Symbols.h +++ b/lld/MachO/Symbols.h @@ -39,6 +39,8 @@ class Symbol { LazyKind, }; + virtual ~Symbol() {} + Kind kind() const { return static_cast(symbolKind); } StringRef getName() const { return {name.data, name.size}; } @@ -47,6 +49,8 @@ class Symbol { uint64_t getFileOffset() const; + virtual bool isWeakDef() const { llvm_unreachable("cannot be weak"); } + uint32_t gotIndex = UINT32_MAX; protected: @@ -58,13 +62,19 @@ class Symbol { class Defined : public Symbol { public: - Defined(StringRefZ name, InputSection *isec, uint32_t value) - : Symbol(DefinedKind, name), isec(isec), value(value) {} + Defined(StringRefZ name, InputSection *isec, uint32_t value, bool isWeakDef) + : Symbol(DefinedKind, name), isec(isec), value(value), + weakDef(isWeakDef) {} + + bool isWeakDef() const override { return weakDef; } + + static bool classof(const Symbol *s) { return s->kind() == DefinedKind; } InputSection *isec; uint32_t value; - static bool classof(const Symbol *s) { return s->kind() == DefinedKind; } +private: + const bool weakDef; }; class Undefined : public Symbol { @@ -76,14 +86,19 @@ class Undefined : public Symbol { class DylibSymbol : public Symbol { public: - DylibSymbol(DylibFile *file, StringRefZ name) - : Symbol(DylibKind, name), file(file) {} + DylibSymbol(DylibFile *file, StringRefZ name, bool isWeakDef) + : Symbol(DylibKind, name), file(file), weakDef(isWeakDef) {} + + bool isWeakDef() const override { return weakDef; } static bool classof(const Symbol *s) { return s->kind() == DylibKind; } DylibFile *file; uint32_t stubsIndex = UINT32_MAX; uint32_t lazyBindOffset = UINT32_MAX; + +private: + const bool weakDef; }; class 
LazySymbol : public Symbol { diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index cc0d5a93c40de..a2d8bf42e9edf 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -264,7 +264,8 @@ void StubHelperSection::setup() { in.got->addEntry(*stubBinder); inputSections.push_back(in.imageLoaderCache); - symtab->addDefined("__dyld_private", in.imageLoaderCache, 0); + symtab->addDefined("__dyld_private", in.imageLoaderCache, 0, + /*isWeakDef=*/false); } ImageLoaderCacheSection::ImageLoaderCacheSection() { diff --git a/lld/test/MachO/weak-definition-direct-fetch.s b/lld/test/MachO/weak-definition-direct-fetch.s new file mode 100644 index 0000000000000..04c022e9c086d --- /dev/null +++ b/lld/test/MachO/weak-definition-direct-fetch.s @@ -0,0 +1,90 @@ +# REQUIRES: x86 +# RUN: mkdir -p %t + +## This test exercises the various possible combinations of weak and non-weak +## symbols that get referenced directly by a relocation in an object file. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/test.o +# RUN: echo ".globl _foo; .section __TEXT,nonweak; _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/foo.o +# RUN: echo ".globl _foo; .weak_definition _foo; .section __TEXT,weak; _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/weakfoo.o + +# RUN: lld -flavor darwinnew -dylib -install_name \ +# RUN: @executable_path/libfoo.dylib %t/foo.o -o %t/libfoo.dylib +# RUN: lld -flavor darwinnew -dylib -install_name \ +# RUN: @executable_path/libweakfoo.dylib %t/weakfoo.o -o %t/libweakfoo.dylib + +# RUN: llvm-objdump --macho --exports-trie %t/libweakfoo.dylib | FileCheck %s --check-prefix WEAK-DYLIB-CHECK +# WEAK-DYLIB-CHECK: _foo [weak_def] + +## Make sure we are using the export trie and not the symbol table when linking +## against these dylibs. +# RUN: llvm-strip %t/libfoo.dylib +# RUN: llvm-strip %t/libweakfoo.dylib +# RUN: llvm-nm %t/libfoo.dylib 2>&1 | FileCheck %s --check-prefix=NOSYM +# RUN: llvm-nm %t/libweakfoo.dylib 2>&1 | FileCheck %s --check-prefix=NOSYM +# NOSYM: no symbols + +# RUN: rm -f %t/foo.a +# RUN: llvm-ar --format=darwin rcs %t/foo.a %t/foo.o +# RUN: rm -f %t/weakfoo.a +# RUN: llvm-ar --format=darwin rcs %t/weakfoo.a %t/weakfoo.o + +## End of input file setup. The following lines check which symbol "wins" when +## there are multiple definitions. + +# PREFER-NONWEAK-DYLIB: __DATA __la_symbol_ptr 0x{{[0-9a-f]+}} libfoo _foo +# PREFER-WEAK-OBJECT: O __TEXT,weak _foo +# PREFER-NONWEAK-OBJECT: O __TEXT,nonweak _foo + +## First, we test the cases where the symbols are of the same type (both from a +## dylib, or both from an archive, etc.) +## +## For dylibs and object files, the non-weak symbol always wins. But the weak +## flag has no effect when we are dealing with two archive symbols. 
+ +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-nonweak-dylibs -Z -L%t -lweakfoo -lfoo %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-nonweak-dylibs | FileCheck %s --check-prefix=PREFER-NONWEAK-DYLIB +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-weak-dylibs -Z -L%t -lfoo -lweakfoo %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/nonweak-weak-dylibs | FileCheck %s --check-prefix=PREFER-NONWEAK-DYLIB + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-nonweak-objs -Z -L%t %t/weakfoo.o %t/foo.o %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-nonweak-objs | FileCheck %s --check-prefix=PREFER-NONWEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-weak-objs -Z -L%t %t/foo.o %t/weakfoo.o %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/nonweak-weak-objs | FileCheck %s --check-prefix=PREFER-NONWEAK-OBJECT + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-nonweak-archives -Z -L%t %t/weakfoo.a %t/foo.a %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-nonweak-archives | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-weak-archives -Z -L%t %t/foo.a %t/weakfoo.a %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/nonweak-weak-archives | FileCheck %s --check-prefix=PREFER-NONWEAK-OBJECT + +## The remaining lines test symbol pairs of different types. + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-dylib-weak-ar -Z -L%t -lweakfoo %t/weakfoo.a %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-dylib-weak-ar | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-ar-weak-dylib -Z -L%t %t/weakfoo.a -lweakfoo %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-ar-weak-dylib | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-ar-nonweak-dylib -Z -L%t %t/weakfoo.a -lfoo %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-ar-nonweak-dylib | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-dylib-weak-ar -Z -L%t -lfoo %t/weakfoo.a %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/nonweak-dylib-weak-ar | FileCheck %s --check-prefix=PREFER-NONWEAK-DYLIB + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-dylib-weak-obj -Z -L%t -lweakfoo %t/weakfoo.o %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-dylib-weak-obj | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-obj-weak-dylib -Z -L%t %t/weakfoo.o -lweakfoo %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-obj-weak-dylib | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-obj-nonweak-dylib -Z -L%t %t/weakfoo.o -lfoo %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-obj-nonweak-dylib | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-dylib-weak-obj -Z -L%t -lfoo %t/weakfoo.o %t/test.o +# 
RUN: llvm-objdump --macho --lazy-bind --syms %t/nonweak-dylib-weak-obj | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-obj-nonweak-ar -Z -L%t %t/weakfoo.o %t/foo.a %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-obj-nonweak-ar | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-ar-weak-obj -Z -L%t %t/foo.a %t/weakfoo.o %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/nonweak-ar-weak-obj | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT + +.globl _main +_main: + callq _foo + ret diff --git a/lld/test/MachO/weak-definition-indirect-fetch.s b/lld/test/MachO/weak-definition-indirect-fetch.s new file mode 100644 index 0000000000000..d22e0a370d5e0 --- /dev/null +++ b/lld/test/MachO/weak-definition-indirect-fetch.s @@ -0,0 +1,42 @@ +# REQUIRES: x86 +# RUN: mkdir -p %t + +## This tests examines the effect of .weak_definition on symbols in an archive +## that are not referenced directly, but which are still loaded due to some +## other symbol in the archive member being referenced. +## +## In this particular test, _foo isn't referenced directly, but both archives +## will be fetched when linking against the main test file due to its references +## to _bar and _baz. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/test.o +# RUN: echo ".globl _foo, _bar; .section __TEXT,nonweak; _bar: _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/foo.o +# RUN: echo ".globl _foo, _baz; .weak_definition _foo; .section __TEXT,weak; _baz: _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/weakfoo.o + +# RUN: rm -f %t/foo.a +# RUN: llvm-ar --format=darwin rcs %t/foo.a %t/foo.o +# RUN: rm -f %t/weakfoo.a +# RUN: llvm-ar --format=darwin rcs %t/weakfoo.a %t/weakfoo.o + +# PREFER-NONWEAK-OBJECT: O __TEXT,nonweak _foo + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-nonweak-archives -Z -L%t %t/weakfoo.a %t/foo.a %t/test.o +# RUN: llvm-objdump --syms %t/weak-nonweak-archives | FileCheck %s --check-prefix=PREFER-NONWEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-weak-archives -Z -L%t %t/foo.a %t/weakfoo.a %t/test.o +# RUN: llvm-objdump --syms %t/nonweak-weak-archives | FileCheck %s --check-prefix=PREFER-NONWEAK-OBJECT + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-nonweak-objs -Z -L%t %t/weakfoo.o %t/foo.o %t/test.o +# RUN: llvm-objdump --syms %t/weak-nonweak-objs | FileCheck %s --check-prefix=PREFER-NONWEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-weak-objs -Z -L%t %t/foo.o %t/weakfoo.o %t/test.o +# RUN: llvm-objdump --syms %t/nonweak-weak-objs | FileCheck %s --check-prefix=PREFER-NONWEAK-OBJECT + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-obj-nonweak-ar -Z -L%t %t/weakfoo.o %t/foo.a %t/test.o +# RUN: llvm-objdump --syms %t/weak-obj-nonweak-ar | FileCheck %s --check-prefix=PREFER-NONWEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-ar-weak-obj -Z -L%t %t/foo.a %t/weakfoo.o %t/test.o +# RUN: llvm-objdump --syms %t/nonweak-ar-weak-obj | FileCheck %s --check-prefix=PREFER-NONWEAK-OBJECT + +.globl _main +_main: + callq _bar + callq _baz + ret diff --git a/lld/test/MachO/weak-definition-order.s b/lld/test/MachO/weak-definition-order.s new file 
mode 100644 index 0000000000000..6770a5f76b391 --- /dev/null +++ b/lld/test/MachO/weak-definition-order.s @@ -0,0 +1,36 @@ +# REQUIRES: x86 +# RUN: mkdir -p %t + +## This test demonstrates that when we have two weak symbols of the same type, +## we pick the one whose containing file appears earlier in the command-line +## invocation. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/test.o +# RUN: echo ".globl _foo; .weak_definition _foo; .section __TEXT,weak1; _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/weak1.o +# RUN: echo ".globl _foo; .weak_definition _foo; .section __TEXT,weak2; _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/weak2.o + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/obj12 -Z -L%t %t/weak1.o %t/weak2.o %t/test.o +# RUN: llvm-objdump --syms %t/obj12 | FileCheck %s --check-prefix=WEAK1 +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/obj21 -Z -L%t %t/weak2.o %t/weak1.o %t/test.o +# RUN: llvm-objdump --syms %t/obj21 | FileCheck %s --check-prefix=WEAK2 + +# WEAK1: O __TEXT,weak1 _foo +# WEAK2: O __TEXT,weak2 _foo + +# RUN: lld -flavor darwinnew -dylib -install_name \ +# RUN: @executable_path/libweak1.dylib %t/weak1.o -o %t/libweak1.dylib +# RUN: lld -flavor darwinnew -dylib -install_name \ +# RUN: @executable_path/libweak2.dylib %t/weak2.o -o %t/libweak2.dylib + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/dylib12 -Z -L%t -lweak1 -lweak2 %t/test.o +# RUN: llvm-objdump --macho --lazy-bind %t/dylib12 | FileCheck %s --check-prefix=DYLIB1 +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/dylib21 -Z -L%t -lweak2 -lweak1 %t/test.o +# RUN: llvm-objdump --macho --lazy-bind %t/dylib21 | FileCheck %s --check-prefix=DYLIB2 +## TODO: these should really be in the weak binding section, not the lazy binding section +# DYLIB1: __DATA __la_symbol_ptr 0x{{[0-9a-f]*}} libweak1 _foo +# DYLIB2: __DATA __la_symbol_ptr 0x{{[0-9a-f]*}} libweak2 _foo + +.globl _main +_main: + callq _foo + ret diff --git a/lld/test/MachO/weak-definition-over-dysym.s b/lld/test/MachO/weak-definition-over-dysym.s new file mode 100644 index 0000000000000..e3cf030b7149f --- /dev/null +++ b/lld/test/MachO/weak-definition-over-dysym.s @@ -0,0 +1,39 @@ +# REQUIRES: x86 +# RUN: mkdir -p %t + +## This test demonstrates that when an archive file is fetched, its symbols +## always override any conflicting dylib symbols, regardless of any weak +## definition flags. 
+ +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/test.o +# RUN: echo ".globl _foo; _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/libfoo.o +# RUN: lld -flavor darwinnew -dylib -install_name \ +# RUN: @executable_path/libfoo.dylib %t/libfoo.o -o %t/libfoo.dylib + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/test.o +# RUN: echo ".globl _foo, _bar; .section __TEXT,nonweak; _bar: _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/foo.o +# RUN: echo ".globl _foo, _bar; .weak_definition _foo; .section __TEXT,weak; _bar: _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/weakfoo.o + +# RUN: rm -f %t/foo.a +# RUN: llvm-ar --format=darwin rcs %t/foo.a %t/foo.o +# RUN: rm -f %t/weakfoo.a +# RUN: llvm-ar --format=darwin rcs %t/weakfoo.a %t/weakfoo.o + +# PREFER-WEAK-OBJECT: O __TEXT,weak _foo +# PREFER-NONWEAK-OBJECT: O __TEXT,nonweak _foo + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-dylib-weak-ar -Z -L%t -lfoo %t/weakfoo.a %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/nonweak-dylib-weak-ar | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-ar-nonweak-dylib -Z -L%t %t/weakfoo.a -lfoo %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-ar-nonweak-dylib | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-obj-nonweak-dylib -Z -L%t %t/weakfoo.o -lfoo %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-obj-nonweak-dylib | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-dylib-weak-obj -Z -L%t -lfoo %t/weakfoo.o %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/nonweak-dylib-weak-obj | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT + +.globl _main +_main: + callq _foo + callq _bar + ret From 06a0dd2467d5c4726699eea3589a6444e00a2eef Mon Sep 17 00:00:00 2001 From: Jez Ng Date: Fri, 24 Jul 2020 15:55:14 -0700 Subject: [PATCH 0058/1035] [lld-macho] Ignore -dependency_info and its argument XCode passes in this flag, which we do not yet implement. Skip over the argument for now so we can at least successfully parse the linker invocation. 
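
The mechanical difference is Flag vs. Separate in the option table: a Separate option consumes the token that follows it as its value. A generic sketch of that distinction (illustrative only, not lld's actual option parser):

#include <string>
#include <vector>

struct ParsedArgs {
  bool DumpDepInfo = false;
  std::string DepInfoPath;
};

// Treating -dependency_info as a "separate" option: the next token is its
// value. Declared as a bare flag instead, "/path/to/x.dat" would be left in
// the argument stream and rejected as an unknown input.
ParsedArgs parse(const std::vector<std::string> &Args) {
  ParsedArgs P;
  for (size_t I = 0; I < Args.size(); ++I) {
    if (Args[I] == "-dependency_info" && I + 1 < Args.size()) {
      P.DumpDepInfo = true;
      P.DepInfoPath = Args[++I]; // swallow the following path argument
    }
  }
  return P;
}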
Reviewed By: #lld-macho, compnerd

Differential Revision: https://reviews.llvm.org/D84485
---
 lld/MachO/Options.td              | 9 +++++----
 lld/test/MachO/silent-ignore.test | 1 +
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index 1e42542b9ac47..7d81ca7c14270 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -456,6 +456,11 @@ def map : Separate<["-"], "map">,
     HelpText<"Writes all symbols and their addresses to ">,
     Flags<[HelpHidden]>,
     Group;
+def dependency_info : Separate<["-"], "dependency_info">,
+    MetaVarName<"">,
+    HelpText<"Dump dependency info">,
+    Flags<[HelpHidden]>,
+    Group;
 
 def grp_symtab : OptionGroup<"symtab">, HelpText<"SYMBOL TABLE OPTIMIZATIONS">;
 
@@ -1098,10 +1103,6 @@ def demangle : Flag<["-"], "demangle">,
     HelpText<"This option is undocumented in ld64">,
     Flags<[HelpHidden]>,
     Group;
-def dependency_info : Flag<["-"], "dependency_info">,
-    HelpText<"This option is undocumented in ld64">,
-    Flags<[HelpHidden]>,
-    Group;
 def dyld_env : Flag<["-"], "dyld_env">,
     HelpText<"This option is undocumented in ld64">,
     Flags<[HelpHidden]>,
diff --git a/lld/test/MachO/silent-ignore.test b/lld/test/MachO/silent-ignore.test
index ae68dd8fe81ff..0d2086386fa8f 100644
--- a/lld/test/MachO/silent-ignore.test
+++ b/lld/test/MachO/silent-ignore.test
@@ -4,6 +4,7 @@ RUN:    -dynamic \
 RUN:    -no_deduplicate \
 RUN:    -lto_library /lib/foo \
 RUN:    -macosx_version_min 0 \
+RUN:    -dependency_info /path/to/dependency_info.dat \
 RUN:    -syslibroot /path/to/MacOSX.platform/Developer/SDKs/MacOSX.sdk
 RUN: not lld -flavor darwinnew -v --not-an-ignored-argument 2>&1 | FileCheck %s
 CHECK: error: unknown argument: --not-an-ignored-argument

From 99996213ebd9655bb0488387f906d5438bcd37f8 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere
Date: Fri, 24 Jul 2020 15:10:05 -0700
Subject: [PATCH 0059/1035] [lldb] Don't wrap and release raw pointer in
 unique_ptr (NFC)

---
 lldb/source/Target/LanguageRuntime.cpp | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/lldb/source/Target/LanguageRuntime.cpp b/lldb/source/Target/LanguageRuntime.cpp
index 58ad70c2b9028..f8143839bb64c 100644
--- a/lldb/source/Target/LanguageRuntime.cpp
+++ b/lldb/source/Target/LanguageRuntime.cpp
@@ -202,20 +202,15 @@ class ExceptionBreakpointResolver : public BreakpointResolver {
 
 LanguageRuntime *LanguageRuntime::FindPlugin(Process *process,
                                              lldb::LanguageType language) {
-  std::unique_ptr<LanguageRuntime> language_runtime_up;
   LanguageRuntimeCreateInstance create_callback;
-
   for (uint32_t idx = 0;
        (create_callback =
            PluginManager::GetLanguageRuntimeCreateCallbackAtIndex(idx)) !=
        nullptr;
       ++idx) {
-    language_runtime_up.reset(create_callback(process, language));
-
-    if (language_runtime_up)
-      return language_runtime_up.release();
+    if (LanguageRuntime *runtime = create_callback(process, language))
+      return runtime;
   }
-
   return nullptr;
 }
 

From 34d4c8a53e569b1b83a0672015a19f8ca9bb3c35 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere
Date: Fri, 24 Jul 2020 16:20:55 -0700
Subject: [PATCH 0060/1035] [lldb] Have LanguageRuntime and SystemRuntime share
 a base class (NFC)

LanguageRuntime and SystemRuntime now both inherit from Runtime.
--- lldb/include/lldb/Target/LanguageRuntime.h | 12 ++------ lldb/include/lldb/Target/Runtime.h | 33 ++++++++++++++++++++++ lldb/include/lldb/Target/SystemRuntime.h | 10 +++---- lldb/source/Target/LanguageRuntime.cpp | 2 +- lldb/source/Target/SystemRuntime.cpp | 6 ++-- 5 files changed, 43 insertions(+), 20 deletions(-) create mode 100644 lldb/include/lldb/Target/Runtime.h diff --git a/lldb/include/lldb/Target/LanguageRuntime.h b/lldb/include/lldb/Target/LanguageRuntime.h index b0b9b919911a1..da3cb9702392d 100644 --- a/lldb/include/lldb/Target/LanguageRuntime.h +++ b/lldb/include/lldb/Target/LanguageRuntime.h @@ -18,6 +18,7 @@ #include "lldb/Expression/LLVMUserExpression.h" #include "lldb/Symbol/DeclVendor.h" #include "lldb/Target/ExecutionContextScope.h" +#include "lldb/Target/Runtime.h" #include "lldb/lldb-private.h" #include "lldb/lldb-public.h" @@ -56,7 +57,7 @@ class ExceptionSearchFilter : public SearchFilter { void UpdateModuleListIfNeeded(); }; -class LanguageRuntime : public PluginInterface { +class LanguageRuntime : public Runtime, public PluginInterface { public: ~LanguageRuntime() override; @@ -127,10 +128,6 @@ class LanguageRuntime : public PluginInterface { return lldb::ThreadSP(); } - Process *GetProcess() { return m_process; } - - Target &GetTargetRef() { return m_process->GetTarget(); } - virtual DeclVendor *GetDeclVendor() { return nullptr; } virtual lldb::BreakpointResolverSP @@ -159,7 +156,7 @@ class LanguageRuntime : public PluginInterface { return llvm::None; } - virtual void ModulesDidLoad(const ModuleList &module_list) {} + virtual void ModulesDidLoad(const ModuleList &module_list) override {} // Called by ClangExpressionParser::PrepareForExecution to query for any // custom LLVM IR passes that need to be run before an expression is @@ -179,10 +176,7 @@ class LanguageRuntime : public PluginInterface { static char ID; protected: - // Classes that inherit from LanguageRuntime can see and modify these - LanguageRuntime(Process *process); - Process *m_process; private: LanguageRuntime(const LanguageRuntime &) = delete; diff --git a/lldb/include/lldb/Target/Runtime.h b/lldb/include/lldb/Target/Runtime.h new file mode 100644 index 0000000000000..06f0b610e40ba --- /dev/null +++ b/lldb/include/lldb/Target/Runtime.h @@ -0,0 +1,33 @@ +//===-- Runtime.h -----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_TARGET_RUNTIME_H +#define LLDB_TARGET_RUNTIME_H + +#include "lldb/Target/Process.h" + +namespace lldb_private { +class Runtime { +public: + Runtime(Process *process) : m_process(process) {} + virtual ~Runtime() = default; + Runtime(const Runtime &) = delete; + const Runtime &operator=(const Runtime &) = delete; + + Process *GetProcess() { return m_process; } + Target &GetTargetRef() { return m_process->GetTarget(); } + + /// Called when modules have been loaded in the process. 
+ virtual void ModulesDidLoad(const ModuleList &module_list) = 0; + +protected: + Process *m_process; +}; +} // namespace lldb_private + +#endif // LLDB_TARGET_RUNTIME_H diff --git a/lldb/include/lldb/Target/SystemRuntime.h b/lldb/include/lldb/Target/SystemRuntime.h index 4f07d7ab52e5b..0ec0793e95f9b 100644 --- a/lldb/include/lldb/Target/SystemRuntime.h +++ b/lldb/include/lldb/Target/SystemRuntime.h @@ -15,6 +15,7 @@ #include "lldb/Core/PluginInterface.h" #include "lldb/Target/QueueItem.h" #include "lldb/Target/QueueList.h" +#include "lldb/Target/Runtime.h" #include "lldb/Utility/ConstString.h" #include "lldb/Utility/StructuredData.h" #include "lldb/lldb-private.h" @@ -39,7 +40,7 @@ namespace lldb_private { /// can be asked to provide that information. /// -class SystemRuntime : public PluginInterface { +class SystemRuntime : public Runtime, public PluginInterface { public: /// Find a system runtime plugin for a given process. /// @@ -52,7 +53,7 @@ class SystemRuntime : public PluginInterface { static SystemRuntime *FindPlugin(Process *process); /// Construct with a process. - SystemRuntime(lldb_private::Process *process); + SystemRuntime(Process *process); /// Destructor. /// @@ -76,7 +77,7 @@ class SystemRuntime : public PluginInterface { /// /// Allow the SystemRuntime plugin to enable logging features in the system /// runtime libraries. - virtual void ModulesDidLoad(lldb_private::ModuleList &module_list); + virtual void ModulesDidLoad(const ModuleList &module_list) override; /// Called before detaching from a process. /// @@ -294,9 +295,6 @@ class SystemRuntime : public PluginInterface { } protected: - // Member variables. - Process *m_process; - std::vector m_types; private: diff --git a/lldb/source/Target/LanguageRuntime.cpp b/lldb/source/Target/LanguageRuntime.cpp index f8143839bb64c..0bbb9660f7412 100644 --- a/lldb/source/Target/LanguageRuntime.cpp +++ b/lldb/source/Target/LanguageRuntime.cpp @@ -214,7 +214,7 @@ LanguageRuntime *LanguageRuntime::FindPlugin(Process *process, return nullptr; } -LanguageRuntime::LanguageRuntime(Process *process) : m_process(process) {} +LanguageRuntime::LanguageRuntime(Process *process) : Runtime(process) {} LanguageRuntime::~LanguageRuntime() = default; diff --git a/lldb/source/Target/SystemRuntime.cpp b/lldb/source/Target/SystemRuntime.cpp index cd3d8ba2c7b09..6d8a2ef55225f 100644 --- a/lldb/source/Target/SystemRuntime.cpp +++ b/lldb/source/Target/SystemRuntime.cpp @@ -27,9 +27,7 @@ SystemRuntime *SystemRuntime::FindPlugin(Process *process) { return nullptr; } -// SystemRuntime constructor -SystemRuntime::SystemRuntime(Process *process) - : m_process(process), m_types() {} +SystemRuntime::SystemRuntime(Process *process) : Runtime(process), m_types() {} SystemRuntime::~SystemRuntime() = default; @@ -39,7 +37,7 @@ void SystemRuntime::DidLaunch() {} void SystemRuntime::Detach() {} -void SystemRuntime::ModulesDidLoad(ModuleList &module_list) {} +void SystemRuntime::ModulesDidLoad(const ModuleList &module_list) {} const std::vector &SystemRuntime::GetExtendedBacktraceTypes() { return m_types; From f320f83f3ac2d9b04fcab9975031fe8afef20253 Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Fri, 24 Jul 2020 16:43:55 -0700 Subject: [PATCH 0061/1035] [AArch64][GlobalISel] Promote G_UITOFP vector operands to same elt size as result. Fixes legalization failures. 
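
Per lane, the promoted sequence extends the narrow source element to the result's element width before converting; it is roughly the equivalent of this C++ (an analogy for illustration, not compiler code):

#include <cstdint>

// One lane of a <2 x s1> -> <2 x s64> G_UITOFP after promotion:
// G_ZEXT widens the 1-bit value to 64 bits, then the conversion
// runs with matching source and result element widths.
double uitofpLane(bool Lane) {
  uint64_t Widened = static_cast<uint64_t>(Lane); // G_ZEXT
  return static_cast<double>(Widened);            // G_UITOFP
}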
--- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 2 +- .../AArch64/GlobalISel/legalize-itofp.mir | 37 +++++++++++++++++++ .../GlobalISel/legalizer-info-validation.mir | 8 ++-- 3 files changed, 42 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 2eaec0b970fa6..bbceb0e169039 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -399,7 +399,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) .clampScalar(1, s32, s64) - .widenScalarToNextPow2(1) + .minScalarSameAs(1, 0) .clampScalar(0, s32, s64) .widenScalarToNextPow2(0); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-itofp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-itofp.mir index e0a20e030710e..e348c0e454b80 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-itofp.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-itofp.mir @@ -24,6 +24,9 @@ define void @test_sitofp_v4s32() { ret void } define void @test_uitofp_v4s32() { ret void } + define void @test_uitofp_v2s64_v2i1() { ret void } + define void @test_sitofp_v2s64_v2i1() { ret void } + define void @test_sitofp_s32_s16() { ret void } define void @test_uitofp_s32_s16() { ret void } ... @@ -239,6 +242,40 @@ body: | $q0 = COPY %1 ... +--- +name: test_uitofp_v2s64_v2i1 +body: | + bb.0: + liveins: $q0 + ; CHECK-LABEL: name: test_uitofp_v2s64_v2i1 + ; CHECK: [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:_(s1) = COPY [[DEF]](s1) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s1>) = G_BUILD_VECTOR [[DEF]](s1), [[COPY]](s1) + ; CHECK: [[ZEXT:%[0-9]+]]:_(<2 x s64>) = G_ZEXT [[BUILD_VECTOR]](<2 x s1>) + ; CHECK: [[UITOFP:%[0-9]+]]:_(<2 x s64>) = G_UITOFP [[ZEXT]](<2 x s64>) + ; CHECK: $q0 = COPY [[UITOFP]](<2 x s64>) + %0:_(<2 x s1>) = G_IMPLICIT_DEF + %1:_(<2 x s64>) = G_UITOFP %0 + $q0 = COPY %1 +... + +--- +name: test_sitofp_v2s64_v2i1 +body: | + bb.0: + liveins: $q0 + ; CHECK-LABEL: name: test_sitofp_v2s64_v2i1 + ; CHECK: [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:_(s1) = COPY [[DEF]](s1) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s1>) = G_BUILD_VECTOR [[DEF]](s1), [[COPY]](s1) + ; CHECK: [[SEXT:%[0-9]+]]:_(<2 x s64>) = G_SEXT [[BUILD_VECTOR]](<2 x s1>) + ; CHECK: [[SITOFP:%[0-9]+]]:_(<2 x s64>) = G_SITOFP [[SEXT]](<2 x s64>) + ; CHECK: $q0 = COPY [[SITOFP]](<2 x s64>) + %0:_(<2 x s1>) = G_IMPLICIT_DEF + %1:_(<2 x s64>) = G_SITOFP %0 + $q0 = COPY %1 +... + --- name: test_sitofp_s32_s16 body: | diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index dcd310fba7e9e..320ec99a51892 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -407,12 +407,12 @@ # DEBUG-NEXT: .. the first uncovered type index: 2, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_SITOFP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices -# DEBUG-NEXT: .. the first uncovered type index: 2, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_UITOFP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. the first uncovered type index: 2, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FABS (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected From 4b53072ee526ea41ef918f2e4505d9314bd7fa56 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sun, 19 Jul 2020 13:09:48 -0400 Subject: [PATCH 0062/1035] GlobalISel: Define mulfix/divfix opcodes The full expansion involves the funnel shifts, which depend on another patch to expand those. --- .../llvm/CodeGen/GlobalISel/IRTranslator.h | 2 + llvm/include/llvm/Support/TargetOpcodes.def | 24 +++ llvm/include/llvm/Target/GenericOpcodes.td | 71 +++++++++ .../Target/GlobalISel/SelectionDAGCompat.td | 8 + llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 26 ++++ .../irtranslator-fixed-point-intrinsics.ll | 142 ++++++++++++++++++ .../GlobalISel/legalizer-info-validation.mir | 24 +++ 7 files changed, 297 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-fixed-point-intrinsics.ll diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h index 751ab67c4e973..928743a6cbd7d 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h @@ -223,6 +223,8 @@ class IRTranslator : public MachineFunctionPass { bool translateOverflowIntrinsic(const CallInst &CI, unsigned Op, MachineIRBuilder &MIRBuilder); + bool translateFixedPointIntrinsic(unsigned Op, const CallInst &CI, + MachineIRBuilder &MIRBuilder); /// Helper function for translateSimpleIntrinsic. /// \return The generic opcode for \p IntrinsicID if \p IntrinsicID is a diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index f61b3f9acb3aa..5eb3398562ba2 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -469,6 +469,30 @@ HANDLE_TARGET_OPCODE(G_USUBSAT) /// Generic saturating signed subtraction. HANDLE_TARGET_OPCODE(G_SSUBSAT) +// Perform signed fixed point multiplication +HANDLE_TARGET_OPCODE(G_SMULFIX) + +// Perform unsigned fixed point multiplication +HANDLE_TARGET_OPCODE(G_UMULFIX) + +// Perform signed, saturating fixed point multiplication +HANDLE_TARGET_OPCODE(G_SMULFIXSAT) + +// Perform unsigned, saturating fixed point multiplication +HANDLE_TARGET_OPCODE(G_UMULFIXSAT) + +// Perform signed fixed point division +HANDLE_TARGET_OPCODE(G_SDIVFIX) + +// Perform unsigned fixed point division +HANDLE_TARGET_OPCODE(G_UDIVFIX) + +// Perform signed, saturating fixed point division +HANDLE_TARGET_OPCODE(G_SDIVFIXSAT) + +// Perform unsigned, saturating fixed point division +HANDLE_TARGET_OPCODE(G_UDIVFIXSAT) + /// Generic FP addition. 
HANDLE_TARGET_OPCODE(G_FADD) diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index 03795c0050a2c..5b04713d40e81 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -545,6 +545,77 @@ def G_SSUBSAT : GenericInstruction { let isCommutable = 0; } +/// RESULT = [US]MULFIX(LHS, RHS, SCALE) - Perform fixed point +/// multiplication on 2 integers with the same width and scale. SCALE +/// represents the scale of both operands as fixed point numbers. This +/// SCALE parameter must be a constant integer. A scale of zero is +/// effectively performing multiplication on 2 integers. +def G_SMULFIX : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, untyped_imm_0:$scale); + let hasSideEffects = 0; + let isCommutable = 1; +} + +def G_UMULFIX : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, untyped_imm_0:$scale); + let hasSideEffects = 0; + let isCommutable = 1; +} + +/// Same as the corresponding unsaturated fixed point instructions, but the +/// result is clamped between the min and max values representable by the +/// bits of the first 2 operands. +def G_SMULFIXSAT : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, untyped_imm_0:$scale); + let hasSideEffects = 0; + let isCommutable = 1; +} + +def G_UMULFIXSAT : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, untyped_imm_0:$scale); + let hasSideEffects = 0; + let isCommutable = 1; +} + +/// RESULT = [US]DIVFIX(LHS, RHS, SCALE) - Perform fixed point division on +/// 2 integers with the same width and scale. SCALE represents the scale +/// of both operands as fixed point numbers. This SCALE parameter must be a +/// constant integer. +def G_SDIVFIX : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, untyped_imm_0:$scale); + let hasSideEffects = 0; + let isCommutable = 0; +} + +def G_UDIVFIX : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, untyped_imm_0:$scale); + let hasSideEffects = 0; + let isCommutable = 0; +} + +/// Same as the corresponding unsaturated fixed point instructions, +/// but the result is clamped between the min and max values +/// representable by the bits of the first 2 operands. +def G_SDIVFIXSAT : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, untyped_imm_0:$scale); + let hasSideEffects = 0; + let isCommutable = 0; +} + +def G_UDIVFIXSAT : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, untyped_imm_0:$scale); + let hasSideEffects = 0; + let isCommutable = 0; +} + //------------------------------------------------------------------------------ // Floating Point Unary Ops. 
//------------------------------------------------------------------------------ diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index 150834e65b2dc..14718556810f5 100644 --- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -71,6 +71,14 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index aa898d5a61896..e6b3d51bf1ec4 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1214,6 +1214,16 @@ bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op, return true; } +bool IRTranslator::translateFixedPointIntrinsic(unsigned Op, const CallInst &CI, + MachineIRBuilder &MIRBuilder) { + Register Dst = getOrCreateVReg(CI); + Register Src0 = getOrCreateVReg(*CI.getOperand(0)); + Register Src1 = getOrCreateVReg(*CI.getOperand(1)); + uint64_t Scale = cast(CI.getOperand(2))->getZExtValue(); + MIRBuilder.buildInstr(Op, {Dst}, { Src0, Src1, Scale }); + return true; +} + unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) { switch (ID) { default: @@ -1494,6 +1504,22 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, return translateBinaryOp(TargetOpcode::G_SMIN, CI, MIRBuilder); case Intrinsic::smax: return translateBinaryOp(TargetOpcode::G_SMAX, CI, MIRBuilder); + case Intrinsic::smul_fix: + return translateFixedPointIntrinsic(TargetOpcode::G_SMULFIX, CI, MIRBuilder); + case Intrinsic::umul_fix: + return translateFixedPointIntrinsic(TargetOpcode::G_UMULFIX, CI, MIRBuilder); + case Intrinsic::smul_fix_sat: + return translateFixedPointIntrinsic(TargetOpcode::G_SMULFIXSAT, CI, MIRBuilder); + case Intrinsic::umul_fix_sat: + return translateFixedPointIntrinsic(TargetOpcode::G_UMULFIXSAT, CI, MIRBuilder); + case Intrinsic::sdiv_fix: + return translateFixedPointIntrinsic(TargetOpcode::G_SDIVFIX, CI, MIRBuilder); + case Intrinsic::udiv_fix: + return translateFixedPointIntrinsic(TargetOpcode::G_UDIVFIX, CI, MIRBuilder); + case Intrinsic::sdiv_fix_sat: + return translateFixedPointIntrinsic(TargetOpcode::G_SDIVFIXSAT, CI, MIRBuilder); + case Intrinsic::udiv_fix_sat: + return translateFixedPointIntrinsic(TargetOpcode::G_UDIVFIXSAT, CI, MIRBuilder); case Intrinsic::fmuladd: { const TargetMachine &TM = MF->getTarget(); const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-fixed-point-intrinsics.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-fixed-point-intrinsics.ll new file mode 100644 index 0000000000000..c70b93e50ded8 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-fixed-point-intrinsics.ll @@ -0,0 +1,142 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -stop-after=irtranslator -mtriple=aarch64-- -verify-machineinstrs -o - %s | FileCheck %s + +define i16 @smul_fix(i16 %arg0, i16 %arg1) { + ; CHECK-LABEL: name: smul_fix + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0, $w1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + 
; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[SMULFIX:%[0-9]+]]:_(s16) = G_SMULFIX [[TRUNC]], [[TRUNC1]], 7 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMULFIX]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %res = call i16 @llvm.smul.fix.i16(i16 %arg0, i16 %arg1, i32 7) + ret i16 %res +} + +define i16 @umul_fix(i16 %arg0, i16 %arg1) { + ; CHECK-LABEL: name: umul_fix + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0, $w1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[UMULFIX:%[0-9]+]]:_(s16) = G_UMULFIX [[TRUNC]], [[TRUNC1]], 7 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMULFIX]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %res = call i16 @llvm.umul.fix.i16(i16 %arg0, i16 %arg1, i32 7) + ret i16 %res +} + +define i16 @smul_fix_sat(i16 %arg0, i16 %arg1) { + ; CHECK-LABEL: name: smul_fix_sat + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0, $w1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[SMULFIXSAT:%[0-9]+]]:_(s16) = G_SMULFIXSAT [[TRUNC]], [[TRUNC1]], 7 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMULFIXSAT]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %res = call i16 @llvm.smul.fix.sat.i16(i16 %arg0, i16 %arg1, i32 7) + ret i16 %res +} + +define i16 @umul_fix_sat(i16 %arg0, i16 %arg1) { + ; CHECK-LABEL: name: umul_fix_sat + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0, $w1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[UMULFIXSAT:%[0-9]+]]:_(s16) = G_UMULFIXSAT [[TRUNC]], [[TRUNC1]], 7 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMULFIXSAT]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %res = call i16 @llvm.umul.fix.sat.i16(i16 %arg0, i16 %arg1, i32 7) + ret i16 %res +} + +define i16 @sdiv_fix(i16 %arg0, i16 %arg1) { + ; CHECK-LABEL: name: sdiv_fix + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0, $w1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[SDIVFIX:%[0-9]+]]:_(s16) = G_SDIVFIX [[TRUNC]], [[TRUNC1]], 7 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SDIVFIX]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %res = call i16 @llvm.sdiv.fix.i16(i16 %arg0, i16 %arg1, i32 7) + ret i16 %res +} + +define i16 @udiv_fix(i16 %arg0, i16 %arg1) { + ; CHECK-LABEL: name: udiv_fix + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0, $w1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[UDIVFIX:%[0-9]+]]:_(s16) = G_UDIVFIX [[TRUNC]], [[TRUNC1]], 7 
+ ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UDIVFIX]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %res = call i16 @llvm.udiv.fix.i16(i16 %arg0, i16 %arg1, i32 7) + ret i16 %res +} + +define i16 @sdiv_fix_sat(i16 %arg0, i16 %arg1) { + ; CHECK-LABEL: name: sdiv_fix_sat + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0, $w1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[SDIVFIXSAT:%[0-9]+]]:_(s16) = G_SDIVFIXSAT [[TRUNC]], [[TRUNC1]], 7 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SDIVFIXSAT]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %res = call i16 @llvm.sdiv.fix.sat.i16(i16 %arg0, i16 %arg1, i32 7) + ret i16 %res +} + +define i16 @udiv_fix_sat(i16 %arg0, i16 %arg1) { + ; CHECK-LABEL: name: udiv_fix_sat + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0, $w1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[UDIVFIXSAT:%[0-9]+]]:_(s16) = G_UDIVFIXSAT [[TRUNC]], [[TRUNC1]], 7 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UDIVFIXSAT]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %res = call i16 @llvm.udiv.fix.sat.i16(i16 %arg0, i16 %arg1, i32 7) + ret i16 %res +} + +declare i16 @llvm.smul.fix.i16(i16, i16, i32 immarg) #0 +declare i16 @llvm.umul.fix.i16(i16, i16, i32 immarg) #0 +declare i16 @llvm.smul.fix.sat.i16(i16, i16, i32 immarg) #0 +declare i16 @llvm.umul.fix.sat.i16(i16, i16, i32 immarg) #0 +declare i16 @llvm.sdiv.fix.i16(i16, i16, i32 immarg) #1 +declare i16 @llvm.udiv.fix.i16(i16, i16, i32 immarg) #1 +declare i16 @llvm.sdiv.fix.sat.i16(i16, i16, i32 immarg) #1 +declare i16 @llvm.udiv.fix.sat.i16(i16, i16, i32 immarg) #1 + +attributes #0 = { nounwind readnone speculatable willreturn } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index 320ec99a51892..b21f66e53fd48 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -337,6 +337,30 @@ # DEBUG-NEXT: G_SSUBSAT (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SMULFIX (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_UMULFIX (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SMULFIXSAT (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_UMULFIXSAT (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: G_SDIVFIX (opcode {{[0-9]+}}): 1 type index, 1 imm index
+# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: G_UDIVFIX (opcode {{[0-9]+}}): 1 type index, 1 imm index
+# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: G_SDIVFIXSAT (opcode {{[0-9]+}}): 1 type index, 1 imm index
+# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: G_UDIVFIXSAT (opcode {{[0-9]+}}): 1 type index, 1 imm index
+# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
# DEBUG-NEXT: G_FADD (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. the first uncovered type index: 1, OK
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
From 6fdc6f6c7d34af60c45c405f448370a684ef6f2a Mon Sep 17 00:00:00 2001
From: Rong Xu
Date: Fri, 24 Jul 2020 17:13:58 -0700
Subject: [PATCH 0063/1035] [PGO][InstrProf] Do not promote count if the exit
 blocks contains ret instruction

Skip profile count promotion if any of the ExitBlocks contains a ret
instruction. This is to prevent dumping of an incomplete profile -- if the
loop is a long running loop and dump is called in the middle of the loop,
the resulting profile is incomplete.

An exit block containing a ret instruction is an indication of a long
running loop -- early exit to error handling code.

Differential Revision: https://reviews.llvm.org/D84379
---
 .../Instrumentation/PGOInstrumentation.cpp    | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 67bf264e8eeaf..c4a43abaa53cc 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -1131,11 +1131,18 @@ bool PGOUseFunc::setInstrumentedCounts(
   if (NumCounters != CountFromProfile.size()) {
     return false;
   }
+  auto *FuncEntry = &*F.begin();
+
   // Set the profile count to the Instrumented BBs.
   uint32_t I = 0;
   for (BasicBlock *InstrBB : InstrumentBBs) {
     uint64_t CountValue = CountFromProfile[I++];
     UseBBInfo &Info = getBBInfo(InstrBB);
+    // If we reach here, we know that we have some nonzero count
+    // values in this function. The entry count should not be 0.
+    // Fix it if necessary.
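+    // (The entry counter can legitimately read zero if the profile counters
+    // were reset after the function was entered; see D84378.)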
+  if (FuncMaxCount > 0 && FuncEntryCount == 0)
+    FuncEntryCount = 1;
+  F.setEntryCount(ProfileCount(FuncEntryCount, Function::PCT_Real));
   markFunctionAttributes(FuncEntryCount, FuncMaxCount);

   // Now annotate select instructions
From fcc55c0952f4e4b30326badd6fb11fa06be2b6fd Mon Sep 17 00:00:00 2001
From: Jessica Paquette
Date: Fri, 24 Jul 2020 16:57:37 -0700
Subject: [PATCH 0064/1035] [AArch64][GlobalISel] Use wzr/xzr for 16 and 8 bit
 stores of zero

We weren't performing this optimization on 16 and 8 bit stores. SDAG happily
does this though. e.g. https://godbolt.org/z/cWocKr

This saves about 0.2% in code size on CTMark at -O3.

Differential Revision: https://reviews.llvm.org/D84568
---
 .../GISel/AArch64InstructionSelector.cpp      | 15 ++++++---
 .../AArch64/GlobalISel/select-store.mir       | 33 +++++++++++++++++++
 2 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index eb6a4aa3d8266..0b0de09fff29e 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -2306,10 +2306,17 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     // If we're storing a 0, use WZR/XZR.
     if (auto CVal = getConstantVRegVal(ValReg, MRI)) {
       if (*CVal == 0 && Opcode == TargetOpcode::G_STORE) {
-        if (I.getOpcode() == AArch64::STRWui)
-          I.getOperand(0).setReg(AArch64::WZR);
-        else if (I.getOpcode() == AArch64::STRXui)
-          I.getOperand(0).setReg(AArch64::XZR);
+        unsigned Opc = I.getOpcode();
+        switch(Opc) {
+        case AArch64::STRWui:
+        case AArch64::STRHHui:
+        case AArch64::STRBBui:
+          I.getOperand(0).setReg(AArch64::WZR);
+          break;
+        case AArch64::STRXui:
+          I.getOperand(0).setReg(AArch64::XZR);
+          break;
+        }
       }
     }

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir
index b72b8e5d0d700..d60a34ef6631a 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir
@@ -11,6 +11,8 @@
     define void @store_zero_s64_gpr(i64* %addr) { ret void }
     define void @store_zero_s32_gpr(i32* %addr) { ret void }
+    define void @store_zero_s16(i32* %addr) { ret void }
+    define void @store_zero_s8(i32* %addr) { ret void }

     define void @store_fi_s64_gpr() {
       %ptr0 = alloca i64
@@ -176,6 +178,37 @@ body: |

 ...

+---
+name: store_zero_s16
+legalized: true
+regBankSelected: true
+body: |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: store_zero_s16
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK: STRHHui $wzr, [[COPY]], 0 :: (store 2)
+    %0:gpr(p0) = COPY $x0
+    %1:gpr(s16) = G_CONSTANT i16 0
+    G_STORE %1(s16), %0(p0) :: (store 2)
+
+...
+
+---
+name: store_zero_s8
+legalized: true
+regBankSelected: true
+body: |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: store_zero_s8
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK: STRBBui $wzr, [[COPY]], 0 :: (store 1)
+    %0:gpr(p0) = COPY $x0
+    %1:gpr(s8) = G_CONSTANT i8 0
+    G_STORE %1(s8), %0(p0) :: (store 1)
+...
+
 ---
 name: store_fi_s64_gpr
 legalized: true
From 74790a5dde9ae01b7e96bea0b2596ef37b5325bd Mon Sep 17 00:00:00 2001
From: Amy Kwan
Date: Fri, 24 Jul 2020 17:41:50 -0500
Subject: [PATCH 0065/1035] [PowerPC] Implement Truncate and Store VSX Vector
 Builtins

This patch implements the `vec_xst_trunc` function in altivec.h in order to
utilize the Store VSX Vector Rightmost [byte | half | word | doubleword]
Indexed instructions introduced in Power10.
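A usage sketch (not from the patch; it assumes a Power10 toolchain such as clang -mcpu=pwr10, and the function name is illustrative). Each overload truncates the low element and stores it through the typed pointer:

#include <altivec.h>

// Per the header code below, this stores (signed int)vec[0] at ptr + offset.
// With this patch the word overload is expected to select stxvrwx rather
// than an extract-then-store sequence.
void store_low_word(vector signed __int128 vec, signed long long offset,
                    signed int *ptr) {
  vec_xst_trunc(vec, offset, ptr);
}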
Differential Revision: https://reviews.llvm.org/D82467 --- clang/lib/Headers/altivec.h | 52 +++++ clang/test/CodeGen/builtins-ppc-p10vector.c | 48 ++++ llvm/lib/Target/PowerPC/PPCInstrPrefix.td | 11 + .../CodeGen/PowerPC/builtins-ppc-p10vsx.ll | 205 +++++++++++++++++- 4 files changed, 315 insertions(+), 1 deletion(-) diff --git a/clang/lib/Headers/altivec.h b/clang/lib/Headers/altivec.h index ac5f43836316e..4e25ec118072b 100644 --- a/clang/lib/Headers/altivec.h +++ b/clang/lib/Headers/altivec.h @@ -16597,6 +16597,58 @@ static inline __ATTRS_o_ai void vec_xst(vector unsigned __int128 __vec, } #endif +/* vec_xst_trunc */ + +#if defined(__POWER10_VECTOR__) && defined(__VSX__) +static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + signed long long __offset, + signed char *__ptr) { + *(__ptr + __offset) = (signed char)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, + signed long long __offset, + unsigned char *__ptr) { + *(__ptr + __offset) = (unsigned char)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + signed long long __offset, + signed short *__ptr) { + *(__ptr + __offset) = (signed short)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, + signed long long __offset, + unsigned short *__ptr) { + *(__ptr + __offset) = (unsigned short)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + signed long long __offset, + signed int *__ptr) { + *(__ptr + __offset) = (signed int)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, + signed long long __offset, + unsigned int *__ptr) { + *(__ptr + __offset) = (unsigned int)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + signed long long __offset, + signed long long *__ptr) { + *(__ptr + __offset) = (signed long long)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, + signed long long __offset, + unsigned long long *__ptr) { + *(__ptr + __offset) = (unsigned long long)__vec[0]; +} +#endif + /* vec_xst_be */ #ifdef __LITTLE_ENDIAN__ diff --git a/clang/test/CodeGen/builtins-ppc-p10vector.c b/clang/test/CodeGen/builtins-ppc-p10vector.c index 6f38ac77ee242..2182a19f2452d 100644 --- a/clang/test/CodeGen/builtins-ppc-p10vector.c +++ b/clang/test/CodeGen/builtins-ppc-p10vector.c @@ -582,6 +582,54 @@ vector float test_vec_vec_splati_ins_f(void) { return vec_splati_ins(vfa, 0, 1.0f); } +void test_vec_xst_trunc_sc(vector signed __int128 __a, signed long long __b, + signed char *__c) { + // CHECK: store i8 %{{.+}}, i8* %{{.+}}, align 1 + vec_xst_trunc(__a, __b, __c); +} + +void test_vec_xst_trunc_uc(vector unsigned __int128 __a, signed long long __b, + unsigned char *__c) { + // CHECK: store i8 %{{.+}}, i8* %{{.+}}, align 1 + vec_xst_trunc(__a, __b, __c); +} + +void test_vec_xst_trunc_ss(vector signed __int128 __a, signed long long __b, + signed short *__c) { + // CHECK: store i16 %{{.+}}, i16* %{{.+}}, align 2 + vec_xst_trunc(__a, __b, __c); +} + +void test_vec_xst_trunc_us(vector unsigned __int128 __a, signed long long __b, + unsigned short *__c) { + // CHECK: store i16 %{{.+}}, i16* %{{.+}}, align 2 + vec_xst_trunc(__a, __b, __c); +} + +void test_vec_xst_trunc_si(vector signed __int128 __a, signed long long __b, + signed int *__c) { + // CHECK: store i32 %{{.+}}, i32* %{{.+}}, align 4 + vec_xst_trunc(__a, __b, __c); +} + +void 
test_vec_xst_trunc_ui(vector unsigned __int128 __a, signed long long __b, + unsigned int *__c) { + // CHECK: store i32 %{{.+}}, i32* %{{.+}}, align 4 + vec_xst_trunc(__a, __b, __c); +} + +void test_vec_xst_trunc_sll(vector signed __int128 __a, signed long long __b, + signed long long *__c) { + // CHECK: store i64 %{{.+}}, i64* %{{.+}}, align 8 + vec_xst_trunc(__a, __b, __c); +} + +void test_vec_xst_trunc_ull(vector unsigned __int128 __a, signed long long __b, + unsigned long long *__c) { + // CHECK: store i64 %{{.+}}, i64* %{{.+}}, align 8 + vec_xst_trunc(__a, __b, __c); +} + int test_vec_test_lsbb_all_ones(void) { // CHECK: @llvm.ppc.vsx.xvtlsbb(<16 x i8> %{{.+}}, i32 1 // CHECK-NEXT: ret i32 diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td index b468a8f318ee3..4e048ee9930e9 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -1100,6 +1100,17 @@ let Predicates = [IsISA3_1] in { (EXTRACT_SUBREG (XVTLSBB (COPY_TO_REGCLASS $XB, VSRC)), sub_eq)>; } +let AddedComplexity = 400, Predicates = [IsISA3_1] in { + def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$rS, 0)), xoaddr:$src), + (STXVRBX (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$src)>; + def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$rS, 0)), xoaddr:$src), + (STXVRHX (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$src)>; + def : Pat<(store (i32 (vector_extract v4i32:$rS, 0)), xoaddr:$src), + (STXVRWX (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$src)>; + def : Pat<(store (i64 (vector_extract v2i64:$rS, 0)), xoaddr:$src), + (STXVRDX (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$src)>; +} + let AddedComplexity = 400, Predicates = [PrefixInstrs] in { def : Pat<(v4i32 (build_vector i32immNonAllOneNonZero:$A, i32immNonAllOneNonZero:$A, diff --git a/llvm/test/CodeGen/PowerPC/builtins-ppc-p10vsx.ll b/llvm/test/CodeGen/PowerPC/builtins-ppc-p10vsx.ll index 2ac1b2b7514bc..faddb5b4cc7f0 100644 --- a/llvm/test/CodeGen/PowerPC/builtins-ppc-p10vsx.ll +++ b/llvm/test/CodeGen/PowerPC/builtins-ppc-p10vsx.ll @@ -2,9 +2,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ ; RUN: FileCheck %s -; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O0 \ +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ ; RUN: FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O0 \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-O0 ; These test cases aims to test the builtins for the Power10 VSX vector ; instructions introduced in ISA 3.1. 
@@ -19,6 +22,14 @@ define signext i32 @test_vec_test_lsbb_all_ones(<16 x i8> %vuca) { ; CHECK-NEXT: srwi r3, r3, 31 ; CHECK-NEXT: extsw r3, r3 ; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: test_vec_test_lsbb_all_ones: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: xvtlsbb cr0, v2 +; CHECK-O0-NEXT: mfocrf r3, 128 +; CHECK-O0-NEXT: srwi r3, r3, 31 +; CHECK-O0-NEXT: extsw r3, r3 +; CHECK-O0-NEXT: blr entry: %0 = tail call i32 @llvm.ppc.vsx.xvtlsbb(<16 x i8> %vuca, i32 1) ret i32 %0 @@ -32,7 +43,199 @@ define signext i32 @test_vec_test_lsbb_all_zeros(<16 x i8> %vuca) { ; CHECK-NEXT: rlwinm r3, r3, 3, 31, 31 ; CHECK-NEXT: extsw r3, r3 ; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: test_vec_test_lsbb_all_zeros: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: xvtlsbb cr0, v2 +; CHECK-O0-NEXT: mfocrf r3, 128 +; CHECK-O0-NEXT: rlwinm r3, r3, 3, 31, 31 +; CHECK-O0-NEXT: extsw r3, r3 +; CHECK-O0-NEXT: blr entry: %0 = tail call i32 @llvm.ppc.vsx.xvtlsbb(<16 x i8> %vuca, i32 0) ret i32 %0 } + +define void @vec_xst_trunc_sc(<1 x i128> %__vec, i64 %__offset, i8* nocapture %__ptr) { +; CHECK-LABEL: vec_xst_trunc_sc: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stxvrbx v2, r6, r5 +; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: vec_xst_trunc_sc: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: li r3, 0 +; CHECK-O0-NEXT: vextubrx r3, r3, v2 +; CHECK-O0-NEXT: # kill: def $r3 killed $r3 killed $x3 +; CHECK-O0-NEXT: add r4, r6, r5 +; CHECK-O0-NEXT: stb r3, 0(r4) +; CHECK-O0-NEXT: blr +entry: + %0 = bitcast <1 x i128> %__vec to <16 x i8> + %conv = extractelement <16 x i8> %0, i32 0 + %add.ptr = getelementptr inbounds i8, i8* %__ptr, i64 %__offset + store i8 %conv, i8* %add.ptr, align 1 + ret void +} + +define void @vec_xst_trunc_uc(<1 x i128> %__vec, i64 %__offset, i8* nocapture %__ptr) { +; CHECK-LABEL: vec_xst_trunc_uc: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stxvrbx v2, r6, r5 +; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: vec_xst_trunc_uc: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: li r3, 0 +; CHECK-O0-NEXT: vextubrx r3, r3, v2 +; CHECK-O0-NEXT: # kill: def $r3 killed $r3 killed $x3 +; CHECK-O0-NEXT: add r4, r6, r5 +; CHECK-O0-NEXT: stb r3, 0(r4) +; CHECK-O0-NEXT: blr +entry: + %0 = bitcast <1 x i128> %__vec to <16 x i8> + %conv = extractelement <16 x i8> %0, i32 0 + %add.ptr = getelementptr inbounds i8, i8* %__ptr, i64 %__offset + store i8 %conv, i8* %add.ptr, align 1 + ret void +} + +define void @vec_xst_trunc_ss(<1 x i128> %__vec, i64 %__offset, i16* nocapture %__ptr) { +; CHECK-LABEL: vec_xst_trunc_ss: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi r3, r5, 1 +; CHECK-NEXT: stxvrhx v2, r6, r3 +; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: vec_xst_trunc_ss: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: li r3, 0 +; CHECK-O0-NEXT: vextuhrx r3, r3, v2 +; CHECK-O0-NEXT: # kill: def $r3 killed $r3 killed $x3 +; CHECK-O0-NEXT: sldi r4, r5, 1 +; CHECK-O0-NEXT: add r4, r6, r4 +; CHECK-O0-NEXT: sth r3, 0(r4) +; CHECK-O0-NEXT: blr +entry: + %0 = bitcast <1 x i128> %__vec to <8 x i16> + %conv = extractelement <8 x i16> %0, i32 0 + %add.ptr = getelementptr inbounds i16, i16* %__ptr, i64 %__offset + store i16 %conv, i16* %add.ptr, align 2 + ret void +} + +define void @vec_xst_trunc_us(<1 x i128> %__vec, i64 %__offset, i16* nocapture %__ptr) { +; CHECK-LABEL: vec_xst_trunc_us: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi r3, r5, 1 +; CHECK-NEXT: stxvrhx v2, r6, r3 +; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: vec_xst_trunc_us: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: li r3, 0 +; CHECK-O0-NEXT: vextuhrx r3, r3, v2 +; CHECK-O0-NEXT: 
# kill: def $r3 killed $r3 killed $x3 +; CHECK-O0-NEXT: sldi r4, r5, 1 +; CHECK-O0-NEXT: add r4, r6, r4 +; CHECK-O0-NEXT: sth r3, 0(r4) +; CHECK-O0-NEXT: blr +entry: + %0 = bitcast <1 x i128> %__vec to <8 x i16> + %conv = extractelement <8 x i16> %0, i32 0 + %add.ptr = getelementptr inbounds i16, i16* %__ptr, i64 %__offset + store i16 %conv, i16* %add.ptr, align 2 + ret void +} + +define void @vec_xst_trunc_si(<1 x i128> %__vec, i64 %__offset, i32* nocapture %__ptr) { +; CHECK-LABEL: vec_xst_trunc_si: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi r3, r5, 2 +; CHECK-NEXT: stxvrwx v2, r6, r3 +; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: vec_xst_trunc_si: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: li r3, 0 +; CHECK-O0-NEXT: vextuwrx r3, r3, v2 +; CHECK-O0-NEXT: # kill: def $r3 killed $r3 killed $x3 +; CHECK-O0-NEXT: sldi r4, r5, 2 +; CHECK-O0-NEXT: add r4, r6, r4 +; CHECK-O0-NEXT: stw r3, 0(r4) +; CHECK-O0-NEXT: blr +entry: + %0 = bitcast <1 x i128> %__vec to <4 x i32> + %conv = extractelement <4 x i32> %0, i32 0 + %add.ptr = getelementptr inbounds i32, i32* %__ptr, i64 %__offset + store i32 %conv, i32* %add.ptr, align 4 + ret void +} + +define void @vec_xst_trunc_ui(<1 x i128> %__vec, i64 %__offset, i32* nocapture %__ptr) { +; CHECK-LABEL: vec_xst_trunc_ui: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi r3, r5, 2 +; CHECK-NEXT: stxvrwx v2, r6, r3 +; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: vec_xst_trunc_ui: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: li r3, 0 +; CHECK-O0-NEXT: vextuwrx r3, r3, v2 +; CHECK-O0-NEXT: # kill: def $r3 killed $r3 killed $x3 +; CHECK-O0-NEXT: sldi r4, r5, 2 +; CHECK-O0-NEXT: add r4, r6, r4 +; CHECK-O0-NEXT: stw r3, 0(r4) +; CHECK-O0-NEXT: blr +entry: + %0 = bitcast <1 x i128> %__vec to <4 x i32> + %conv = extractelement <4 x i32> %0, i32 0 + %add.ptr = getelementptr inbounds i32, i32* %__ptr, i64 %__offset + store i32 %conv, i32* %add.ptr, align 4 + ret void +} + +define void @vec_xst_trunc_sll(<1 x i128> %__vec, i64 %__offset, i64* nocapture %__ptr) { +; CHECK-LABEL: vec_xst_trunc_sll: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi r3, r5, 3 +; CHECK-NEXT: stxvrdx v2, r6, r3 +; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: vec_xst_trunc_sll: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: mfvsrld r3, v2 +; CHECK-O0-NEXT: sldi r4, r5, 3 +; CHECK-O0-NEXT: add r4, r6, r4 +; CHECK-O0-NEXT: std r3, 0(r4) +; CHECK-O0-NEXT: blr +entry: + %0 = bitcast <1 x i128> %__vec to <2 x i64> + %conv = extractelement <2 x i64> %0, i32 0 + %add.ptr = getelementptr inbounds i64, i64* %__ptr, i64 %__offset + store i64 %conv, i64* %add.ptr, align 8 + ret void +} + +define void @vec_xst_trunc_ull(<1 x i128> %__vec, i64 %__offset, i64* nocapture %__ptr) { +; CHECK-LABEL: vec_xst_trunc_ull: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi r3, r5, 3 +; CHECK-NEXT: stxvrdx v2, r6, r3 +; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: vec_xst_trunc_ull: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: mfvsrld r3, v2 +; CHECK-O0-NEXT: sldi r4, r5, 3 +; CHECK-O0-NEXT: add r4, r6, r4 +; CHECK-O0-NEXT: std r3, 0(r4) +; CHECK-O0-NEXT: blr +entry: + %0 = bitcast <1 x i128> %__vec to <2 x i64> + %conv = extractelement <2 x i64> %0, i32 0 + %add.ptr = getelementptr inbounds i64, i64* %__ptr, i64 %__offset + store i64 %conv, i64* %add.ptr, align 8 + ret void +} From 867ef4472d8e57384c929e4f06b74d1ac8883a99 Mon Sep 17 00:00:00 2001 From: Rong Xu Date: Fri, 24 Jul 2020 17:16:25 -0700 Subject: [PATCH 0066/1035] [PGO][InstrProf] Do not promote count if the exit blocks contains ret instruction Forgot including the 
tests in the commit 6fdc6f6c7d34af60c4. --- .../Inputs/fix_entry_count.proftext | 8 +++++ .../Transforms/PGOProfile/fix_entry_count.ll | 34 +++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 llvm/test/Transforms/PGOProfile/Inputs/fix_entry_count.proftext create mode 100644 llvm/test/Transforms/PGOProfile/fix_entry_count.ll diff --git a/llvm/test/Transforms/PGOProfile/Inputs/fix_entry_count.proftext b/llvm/test/Transforms/PGOProfile/Inputs/fix_entry_count.proftext new file mode 100644 index 0000000000000..de4771beaacc2 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/Inputs/fix_entry_count.proftext @@ -0,0 +1,8 @@ +:ir +:entry_first +test_simple_for +34137660316 +2 +0 +96 + diff --git a/llvm/test/Transforms/PGOProfile/fix_entry_count.ll b/llvm/test/Transforms/PGOProfile/fix_entry_count.ll new file mode 100644 index 0000000000000..5923bc923a6a8 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/fix_entry_count.ll @@ -0,0 +1,34 @@ +; RUN: llvm-profdata merge %S/Inputs/fix_entry_count.proftext -o %t.profdata +; RUN: opt < %s -pgo-instr-use -pgo-instrument-entry=true -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE +; RUN: opt < %s -passes=pgo-instr-use -pgo-instrument-entry=true -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @test_simple_for(i32 %n) { +; USE: define i32 @test_simple_for(i32 %n) +; USE-SAME: !prof ![[ENTRY_COUNT:[0-9]*]] +entry: + br label %for.cond + +for.cond: + %i = phi i32 [ 0, %entry ], [ %inc1, %for.inc ] + %sum = phi i32 [ 1, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i, %n + br i1 %cmp, label %for.body, label %for.end +; USE: br i1 %cmp, label %for.body, label %for.end +; USE-SAME: !prof ![[BW_FOR_COND:[0-9]+]] + +for.body: + %inc = add nsw i32 %sum, 1 + br label %for.inc + +for.inc: + %inc1 = add nsw i32 %i, 1 + br label %for.cond + +for.end: + ret i32 %sum +} +; USE: ![[ENTRY_COUNT]] = !{!"function_entry_count", i64 1} +; USE: ![[BW_FOR_COND]] = !{!"branch_weights", i32 96, i32 1} From dcf1bca0de3b2690c017c545466234198573b4dd Mon Sep 17 00:00:00 2001 From: Rong Xu Date: Fri, 24 Jul 2020 17:33:49 -0700 Subject: [PATCH 0067/1035] Revert "[PGO][InstrProf] Do not promote count if the exit blocks contains ret instruction" This reverts commit 867ef4472d8e57384c929e4f06b74d1ac8883a99. 
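(This revert and the next back out 867ef4472d8e and 6fdc6f6c7d34, which landed the function entry count change and its tests under the counter promotion subject line; the counter promotion change and the entry count fix are then re-landed separately in 31bd15c5 and 1dd39b11 below, each under a matching description.)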
--- .../Inputs/fix_entry_count.proftext | 8 ----- .../Transforms/PGOProfile/fix_entry_count.ll | 34 ------------------- 2 files changed, 42 deletions(-) delete mode 100644 llvm/test/Transforms/PGOProfile/Inputs/fix_entry_count.proftext delete mode 100644 llvm/test/Transforms/PGOProfile/fix_entry_count.ll diff --git a/llvm/test/Transforms/PGOProfile/Inputs/fix_entry_count.proftext b/llvm/test/Transforms/PGOProfile/Inputs/fix_entry_count.proftext deleted file mode 100644 index de4771beaacc2..0000000000000 --- a/llvm/test/Transforms/PGOProfile/Inputs/fix_entry_count.proftext +++ /dev/null @@ -1,8 +0,0 @@ -:ir -:entry_first -test_simple_for -34137660316 -2 -0 -96 - diff --git a/llvm/test/Transforms/PGOProfile/fix_entry_count.ll b/llvm/test/Transforms/PGOProfile/fix_entry_count.ll deleted file mode 100644 index 5923bc923a6a8..0000000000000 --- a/llvm/test/Transforms/PGOProfile/fix_entry_count.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: llvm-profdata merge %S/Inputs/fix_entry_count.proftext -o %t.profdata -; RUN: opt < %s -pgo-instr-use -pgo-instrument-entry=true -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE -; RUN: opt < %s -passes=pgo-instr-use -pgo-instrument-entry=true -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE - -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define i32 @test_simple_for(i32 %n) { -; USE: define i32 @test_simple_for(i32 %n) -; USE-SAME: !prof ![[ENTRY_COUNT:[0-9]*]] -entry: - br label %for.cond - -for.cond: - %i = phi i32 [ 0, %entry ], [ %inc1, %for.inc ] - %sum = phi i32 [ 1, %entry ], [ %inc, %for.inc ] - %cmp = icmp slt i32 %i, %n - br i1 %cmp, label %for.body, label %for.end -; USE: br i1 %cmp, label %for.body, label %for.end -; USE-SAME: !prof ![[BW_FOR_COND:[0-9]+]] - -for.body: - %inc = add nsw i32 %sum, 1 - br label %for.inc - -for.inc: - %inc1 = add nsw i32 %i, 1 - br label %for.cond - -for.end: - ret i32 %sum -} -; USE: ![[ENTRY_COUNT]] = !{!"function_entry_count", i64 1} -; USE: ![[BW_FOR_COND]] = !{!"branch_weights", i32 96, i32 1} From 5546c2ab42d474dde9b490035ed69e96989f22c5 Mon Sep 17 00:00:00 2001 From: Rong Xu Date: Fri, 24 Jul 2020 17:35:44 -0700 Subject: [PATCH 0068/1035] Revert "[PGO][InstrProf] Do not promote count if the exit blocks contains ret instruction" This reverts commit 6fdc6f6c7d34af60c45c405f448370a684ef6f2a. --- .../Instrumentation/PGOInstrumentation.cpp | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index c4a43abaa53cc..67bf264e8eeaf 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -1131,18 +1131,11 @@ bool PGOUseFunc::setInstrumentedCounts( if (NumCounters != CountFromProfile.size()) { return false; } - auto *FuncEntry = &*F.begin(); - // Set the profile count to the Instrumented BBs. uint32_t I = 0; for (BasicBlock *InstrBB : InstrumentBBs) { uint64_t CountValue = CountFromProfile[I++]; UseBBInfo &Info = getBBInfo(InstrBB); - // If we reach here, we know that we have some nonzero count - // values in this function. The entry count should not be 0. - // Fix it if necessary. 
-    if (InstrBB == FuncEntry && CountValue == 0)
-      CountValue = 1;
     Info.setBBInfoCount(CountValue);
   }
   ProfileCountSize = CountFromProfile.size();
@@ -1333,6 +1326,7 @@ void PGOUseFunc::populateCounters() {
   }
 #endif
   uint64_t FuncEntryCount = getBBInfo(&*F.begin()).CountValue;
+  F.setEntryCount(ProfileCount(FuncEntryCount, Function::PCT_Real));
   uint64_t FuncMaxCount = FuncEntryCount;
   for (auto &BB : F) {
     auto BI = findBBInfo(&BB);
@@ -1340,11 +1334,6 @@
     continue;
     FuncMaxCount = std::max(FuncMaxCount, BI->CountValue);
   }
-
-  // Fix the obviously inconsistent entry count.
-  if (FuncMaxCount > 0 && FuncEntryCount == 0)
-    FuncEntryCount = 1;
-  F.setEntryCount(ProfileCount(FuncEntryCount, Function::PCT_Real));
   markFunctionAttributes(FuncEntryCount, FuncMaxCount);

   // Now annotate select instructions
From 31bd15c562449954d8211c067fc38b1907d60615 Mon Sep 17 00:00:00 2001
From: Rong Xu
Date: Fri, 24 Jul 2020 17:38:31 -0700
Subject: [PATCH 0069/1035] [PGO][InstrProf] Do not promote count if the exit
 blocks contains ret instruction

Skip profile count promotion if any of the ExitBlocks contains a ret
instruction. This is to prevent dumping of an incomplete profile -- if the
loop is a long running loop and dump is called in the middle of the loop,
the resulting profile is incomplete.

An exit block containing a ret instruction is an indication of a long
running loop -- early exit to error handling code.

Differential Revision: https://reviews.llvm.org/D84379
---
 .../Instrumentation/InstrProfiling.cpp        | 16 ++++++++
 .../Transforms/PGOProfile/counter_promo.ll    | 10 ++---
 .../PGOProfile/not_promote_ret_exit.ll        | 38 +++++++++++++++++++
 3 files changed, 59 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/Transforms/PGOProfile/not_promote_ret_exit.ll

diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 7b03bbfcdfe4b..8279716002864 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -150,6 +150,10 @@ cl::opt<bool> IterativeCounterPromotion(
     cl::ZeroOrMore, "iterative-counter-promotion", cl::init(true),
     cl::desc("Allow counter promotion across the whole loop nest."));

+cl::opt<bool> SkipRetExitBlock(
+    cl::ZeroOrMore, "skip-ret-exit-block", cl::init(true),
+    cl::desc("Suppress counter promotion if exit blocks contain ret."));
+
 class InstrProfilingLegacyPass : public ModulePass {
   InstrProfiling InstrProf;

@@ -272,6 +276,18 @@ class PGOCounterPromoter {
     // Skip 'infinite' loops:
     if (ExitBlocks.size() == 0)
       return false;
+
+    // Skip if any of the ExitBlocks contains a ret instruction.
+    // This is to prevent dumping of an incomplete profile -- if the
+    // loop is a long running loop and dump is called in the middle
+    // of the loop, the resulting profile is incomplete.
+    // FIXME: add other heuristics to detect long running loops.
+ if (SkipRetExitBlock) { + for (auto BB : ExitBlocks) + if (dyn_cast(BB->getTerminator()) != nullptr) + return false; + } + unsigned MaxProm = getMaxNumOfPromotionsInLoop(&L); if (MaxProm == 0) return false; diff --git a/llvm/test/Transforms/PGOProfile/counter_promo.ll b/llvm/test/Transforms/PGOProfile/counter_promo.ll index 812d0fefaa79e..55930c1be1660 100644 --- a/llvm/test/Transforms/PGOProfile/counter_promo.ll +++ b/llvm/test/Transforms/PGOProfile/counter_promo.ll @@ -1,12 +1,12 @@ -; RUN: opt < %s -pgo-instr-gen -instrprof -do-counter-promotion=true -S | FileCheck --check-prefix=PROMO --check-prefix=NONATOMIC_PROMO %s -; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -S | FileCheck --check-prefix=PROMO --check-prefix=NONATOMIC_PROMO %s -; RUN: opt < %s -pgo-instr-gen -instrprof -do-counter-promotion=true -atomic-counter-update-promoted -S | FileCheck --check-prefix=PROMO --check-prefix=ATOMIC_PROMO %s -; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -atomic-counter-update-promoted -S | FileCheck --check-prefix=PROMO --check-prefix=ATOMIC_PROMO %s +; RUN: opt < %s -pgo-instr-gen -instrprof -do-counter-promotion=true -skip-ret-exit-block=0 -S | FileCheck --check-prefix=PROMO --check-prefix=NONATOMIC_PROMO %s +; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -skip-ret-exit-block=0 -S | FileCheck --check-prefix=PROMO --check-prefix=NONATOMIC_PROMO %s +; RUN: opt < %s -pgo-instr-gen -instrprof -do-counter-promotion=true -atomic-counter-update-promoted -skip-ret-exit-block=0 -S | FileCheck --check-prefix=PROMO --check-prefix=ATOMIC_PROMO %s +; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -atomic-counter-update-promoted -skip-ret-exit-block=0 -S | FileCheck --check-prefix=PROMO --check-prefix=ATOMIC_PROMO %s define void @foo(i32 %n, i32 %N) { ; PROMO-LABEL: @foo ; PROMO: {{.*}} = load {{.*}} @__profc_foo{{.*}} 3) -; PROMO-NEXT: add +; PROMO-NEXT: add ; PROMO-NEXT: store {{.*}}@__profc_foo{{.*}}3) bb: %tmp = add nsw i32 %n, 1 diff --git a/llvm/test/Transforms/PGOProfile/not_promote_ret_exit.ll b/llvm/test/Transforms/PGOProfile/not_promote_ret_exit.ll new file mode 100644 index 0000000000000..25d9784e23b1f --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/not_promote_ret_exit.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -instrprof -S -o - -do-counter-promotion=1 -skip-ret-exit-block=1 | FileCheck %s --check-prefixes=CHECK,SKIP +; RUN: opt < %s -instrprof -S -o - -do-counter-promotion=1 -skip-ret-exit-block=0 | FileCheck %s --check-prefixes=CHECK,NOTSKIP + +$__llvm_profile_raw_version = comdat any + +@bar = dso_local local_unnamed_addr global i32 0, align 4 +@__llvm_profile_raw_version = constant i64 72057594037927941, comdat +@__profn_foo = private constant [3 x i8] c"foo" + +define dso_local void @foo(i32 %n) { +entry: + call void @llvm.instrprof.increment(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @__profn_foo, i32 0, i32 0), i64 29212902728, i32 2, i32 1) + br label %for.cond + +for.cond: + %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ] + %cmp = icmp slt i32 %i.0, %n + %0 = load i32, i32* @bar, align 4 + %tobool.not = icmp eq i32 %0, 0 + %or.cond = and i1 %cmp, %tobool.not + br i1 %or.cond, label %if.end, label %cleanup + +if.end: + call void @llvm.instrprof.increment(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @__profn_foo, i32 0, i32 0), i64 29212902728, i32 2, i32 0) + call void (...) 
@bar2() + %inc = add nuw nsw i32 %i.0, 1 + br label %for.cond + +cleanup: +; CHECK: cleanup: +; SKIP-NOT: %pgocount.promoted +; NOTSKIP: %pgocount.promoted + ret void +} + +declare dso_local void @bar2(...) + +declare void @llvm.instrprof.increment(i8*, i64, i32, i32) From 1dd39b1133136c58847d9f2d3c73d2ad8e76e25d Mon Sep 17 00:00:00 2001 From: Rong Xu Date: Fri, 24 Jul 2020 17:39:55 -0700 Subject: [PATCH 0070/1035] [PGO] Fix incorrect function entry count Function entry count might be zero after the profile counts reset and before reentry to the function. Zero profile entry count is very bad as the profile count from BFI will be wrong. A simple fix is to set the profile entry count to 1 if there are non-zero profile counts in this function. Differential Revision: https://reviews.llvm.org/D84378 --- .../Instrumentation/PGOInstrumentation.cpp | 13 ++++++- .../Inputs/fix_entry_count.proftext | 8 +++++ .../Transforms/PGOProfile/fix_entry_count.ll | 34 +++++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/PGOProfile/Inputs/fix_entry_count.proftext create mode 100644 llvm/test/Transforms/PGOProfile/fix_entry_count.ll diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 67bf264e8eeaf..c4a43abaa53cc 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -1131,11 +1131,18 @@ bool PGOUseFunc::setInstrumentedCounts( if (NumCounters != CountFromProfile.size()) { return false; } + auto *FuncEntry = &*F.begin(); + // Set the profile count to the Instrumented BBs. uint32_t I = 0; for (BasicBlock *InstrBB : InstrumentBBs) { uint64_t CountValue = CountFromProfile[I++]; UseBBInfo &Info = getBBInfo(InstrBB); + // If we reach here, we know that we have some nonzero count + // values in this function. The entry count should not be 0. + // Fix it if necessary. + if (InstrBB == FuncEntry && CountValue == 0) + CountValue = 1; Info.setBBInfoCount(CountValue); } ProfileCountSize = CountFromProfile.size(); @@ -1326,7 +1333,6 @@ void PGOUseFunc::populateCounters() { } #endif uint64_t FuncEntryCount = getBBInfo(&*F.begin()).CountValue; - F.setEntryCount(ProfileCount(FuncEntryCount, Function::PCT_Real)); uint64_t FuncMaxCount = FuncEntryCount; for (auto &BB : F) { auto BI = findBBInfo(&BB); @@ -1334,6 +1340,11 @@ void PGOUseFunc::populateCounters() { continue; FuncMaxCount = std::max(FuncMaxCount, BI->CountValue); } + + // Fix the obviously inconsistent entry count. 
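+  // (FuncMaxCount > 0 with a zero entry count means the function did run but
+  // its entry counter was reset mid-execution; clamping the entry count to 1
+  // keeps BFI-derived counts from collapsing to zero.)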
+ if (FuncMaxCount > 0 && FuncEntryCount == 0) + FuncEntryCount = 1; + F.setEntryCount(ProfileCount(FuncEntryCount, Function::PCT_Real)); markFunctionAttributes(FuncEntryCount, FuncMaxCount); // Now annotate select instructions diff --git a/llvm/test/Transforms/PGOProfile/Inputs/fix_entry_count.proftext b/llvm/test/Transforms/PGOProfile/Inputs/fix_entry_count.proftext new file mode 100644 index 0000000000000..de4771beaacc2 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/Inputs/fix_entry_count.proftext @@ -0,0 +1,8 @@ +:ir +:entry_first +test_simple_for +34137660316 +2 +0 +96 + diff --git a/llvm/test/Transforms/PGOProfile/fix_entry_count.ll b/llvm/test/Transforms/PGOProfile/fix_entry_count.ll new file mode 100644 index 0000000000000..5923bc923a6a8 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/fix_entry_count.ll @@ -0,0 +1,34 @@ +; RUN: llvm-profdata merge %S/Inputs/fix_entry_count.proftext -o %t.profdata +; RUN: opt < %s -pgo-instr-use -pgo-instrument-entry=true -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE +; RUN: opt < %s -passes=pgo-instr-use -pgo-instrument-entry=true -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @test_simple_for(i32 %n) { +; USE: define i32 @test_simple_for(i32 %n) +; USE-SAME: !prof ![[ENTRY_COUNT:[0-9]*]] +entry: + br label %for.cond + +for.cond: + %i = phi i32 [ 0, %entry ], [ %inc1, %for.inc ] + %sum = phi i32 [ 1, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i, %n + br i1 %cmp, label %for.body, label %for.end +; USE: br i1 %cmp, label %for.body, label %for.end +; USE-SAME: !prof ![[BW_FOR_COND:[0-9]+]] + +for.body: + %inc = add nsw i32 %sum, 1 + br label %for.inc + +for.inc: + %inc1 = add nsw i32 %i, 1 + br label %for.cond + +for.end: + ret i32 %sum +} +; USE: ![[ENTRY_COUNT]] = !{!"function_entry_count", i64 1} +; USE: ![[BW_FOR_COND]] = !{!"branch_weights", i32 96, i32 1} From e937840dbdce20f2ab7ca4dcd9f04c3fd89e56e3 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Fri, 24 Jul 2020 17:59:28 -0700 Subject: [PATCH 0071/1035] Upstream macCatalyst support in ArchSpec and associated unit tests. --- lldb/source/Utility/ArchSpec.cpp | 36 ++++++++++++++++++++++--- lldb/unittests/Utility/ArchSpecTest.cpp | 34 +++++++++++++++++++++++ 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp index f220f4e30b29e..a77ae8633070e 100644 --- a/lldb/source/Utility/ArchSpec.cpp +++ b/lldb/source/Utility/ArchSpec.cpp @@ -846,6 +846,15 @@ bool ArchSpec::ContainsOnlyArch(const llvm::Triple &normalized_triple) { } void ArchSpec::MergeFrom(const ArchSpec &other) { + // ios-macabi always wins over macosx. 
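+  // (A macCatalyst spec carries an *-apple-ios-macabi triple; when the other
+  // spec is macabi and this one is only macosx or unknown, adopt the more
+  // specific macabi spec wholesale.)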
+ if ((GetTriple().getOS() == llvm::Triple::MacOSX || + GetTriple().getOS() == llvm::Triple::UnknownOS) && + other.GetTriple().getOS() == llvm::Triple::IOS && + other.GetTriple().getEnvironment() == llvm::Triple::MacABI) { + (*this) = other; + return; + } + if (!TripleVendorWasSpecified() && other.TripleVendorWasSpecified()) GetTriple().setVendor(other.GetTriple().getVendor()); if (!TripleOSWasSpecified() && other.TripleOSWasSpecified()) @@ -1031,6 +1040,22 @@ bool ArchSpec::IsEqualTo(const ArchSpec &rhs, bool exact_match) const { const llvm::Triple::OSType lhs_triple_os = lhs_triple.getOS(); const llvm::Triple::OSType rhs_triple_os = rhs_triple.getOS(); + const llvm::Triple::EnvironmentType lhs_triple_env = + lhs_triple.getEnvironment(); + const llvm::Triple::EnvironmentType rhs_triple_env = + rhs_triple.getEnvironment(); + + if (!exact_match) { + // x86_64-apple-ios-macabi, x86_64-apple-macosx are compatible, no match. + if ((lhs_triple_os == llvm::Triple::IOS && + lhs_triple_env == llvm::Triple::MacABI && + rhs_triple_os == llvm::Triple::MacOSX) || + (lhs_triple_os == llvm::Triple::MacOSX && + rhs_triple_os == llvm::Triple::IOS && + rhs_triple_env == llvm::Triple::MacABI)) + return true; + } + if (lhs_triple_os != rhs_triple_os) { const bool rhs_os_specified = rhs.TripleOSWasSpecified(); const bool lhs_os_specified = TripleOSWasSpecified(); @@ -1045,10 +1070,13 @@ bool ArchSpec::IsEqualTo(const ArchSpec &rhs, bool exact_match) const { return false; } - const llvm::Triple::EnvironmentType lhs_triple_env = - lhs_triple.getEnvironment(); - const llvm::Triple::EnvironmentType rhs_triple_env = - rhs_triple.getEnvironment(); + // x86_64-apple-ios-macabi and x86_64-apple-ios are not compatible. + if (lhs_triple_os == llvm::Triple::IOS && + rhs_triple_os == llvm::Triple::IOS && + (lhs_triple_env == llvm::Triple::MacABI || + rhs_triple_env == llvm::Triple::MacABI) && + lhs_triple_env != rhs_triple_env) + return false; return IsCompatibleEnvironment(lhs_triple_env, rhs_triple_env); } diff --git a/lldb/unittests/Utility/ArchSpecTest.cpp b/lldb/unittests/Utility/ArchSpecTest.cpp index 4e8e2f3c34d71..a8f43ed7dc7c9 100644 --- a/lldb/unittests/Utility/ArchSpecTest.cpp +++ b/lldb/unittests/Utility/ArchSpecTest.cpp @@ -328,6 +328,40 @@ TEST(ArchSpecTest, Compatibility) { ASSERT_TRUE(A.IsExactMatch(B)); ASSERT_TRUE(A.IsCompatibleMatch(B)); } + { + ArchSpec A("x86_64"); + ArchSpec B("x86_64-apple-ios12.0.0-macabi"); + // FIXME: The exact match also looks unintuitive. + ASSERT_TRUE(A.IsExactMatch(B)); + ASSERT_TRUE(A.IsCompatibleMatch(B)); + } + { + ArchSpec A("x86_64-apple-ios12.0.0"); + ArchSpec B("x86_64-apple-ios12.0.0-macabi"); + ASSERT_FALSE(A.IsExactMatch(B)); + ASSERT_FALSE(A.IsCompatibleMatch(B)); + } + { + ArchSpec A("x86_64-apple-macosx10.14.2"); + ArchSpec B("x86_64-apple-ios12.0.0-macabi"); + ASSERT_FALSE(A.IsExactMatch(B)); + ASSERT_TRUE(A.IsCompatibleMatch(B)); + } + { + ArchSpec A("x86_64-apple-macosx10.14.2"); + ArchSpec B("x86_64-apple-ios12.0.0-macabi"); + // ios-macabi wins. + A.MergeFrom(B); + ASSERT_TRUE(A.IsExactMatch(B)); + } + { + ArchSpec A("x86_64-apple-macosx10.14.2"); + ArchSpec B("x86_64-apple-ios12.0.0-macabi"); + ArchSpec C(B); + // ios-macabi wins. 
+ B.MergeFrom(A); + ASSERT_TRUE(B.IsExactMatch(C)); + } } TEST(ArchSpecTest, OperatorBool) { From 739cd2638b12c37d6bf867d68cd86d18bc5e2e42 Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Fri, 24 Jul 2020 20:57:57 -0500 Subject: [PATCH 0072/1035] [PowerPC] Exploit the High Order Vector Multiply Instructions on Power10 This patch aims to exploit the following vector multiply high instructions on Power10. vmulhsw VRT, VRA, VRB vmulhsd VRT, VRA, VRB vmulhuw VRT, VRA, VRB vmulhud VRT, VRA, VRB Differential Revision: https://reviews.llvm.org/D82584 --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 4 ++ llvm/lib/Target/PowerPC/PPCInstrPrefix.td | 12 ++-- .../CodeGen/PowerPC/p10-vector-multiply.ll | 59 +++++++++++++++++++ 3 files changed, 71 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index c2ba7195509a1..8c28ead9f6041 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -811,6 +811,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, if (Subtarget.isISA3_1()) { setOperationAction(ISD::MUL, MVT::v2i64, Legal); + setOperationAction(ISD::MULHS, MVT::v2i64, Legal); + setOperationAction(ISD::MULHU, MVT::v2i64, Legal); + setOperationAction(ISD::MULHS, MVT::v4i32, Legal); + setOperationAction(ISD::MULHU, MVT::v4i32, Legal); setOperationAction(ISD::UDIV, MVT::v2i64, Legal); setOperationAction(ISD::SDIV, MVT::v2i64, Legal); setOperationAction(ISD::UDIV, MVT::v4i32, Legal); diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td index 4e048ee9930e9..22839e697381b 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -976,13 +976,17 @@ let Predicates = [IsISA3_1] in { "vmulld $vD, $vA, $vB", IIC_VecGeneral, [(set v2i64:$vD, (mul v2i64:$vA, v2i64:$vB))]>; def VMULHSW : VXForm_1<905, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmulhsw $vD, $vA, $vB", IIC_VecGeneral, []>; + "vmulhsw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (mulhs v4i32:$vA, v4i32:$vB))]>; def VMULHUW : VXForm_1<649, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmulhuw $vD, $vA, $vB", IIC_VecGeneral, []>; + "vmulhuw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (mulhu v4i32:$vA, v4i32:$vB))]>; def VMULHSD : VXForm_1<969, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmulhsd $vD, $vA, $vB", IIC_VecGeneral, []>; + "vmulhsd $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (mulhs v2i64:$vA, v2i64:$vB))]>; def VMULHUD : VXForm_1<713, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmulhud $vD, $vA, $vB", IIC_VecGeneral, []>; + "vmulhud $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (mulhu v2i64:$vA, v2i64:$vB))]>; def VMODSW : VXForm_1<1931, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vmodsw $vD, $vA, $vB", IIC_VecGeneral, [(set v4i32:$vD, (srem v4i32:$vA, v4i32:$vB))]>; diff --git a/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll b/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll index e8f77574f66cc..75c6d8c24038e 100644 --- a/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll +++ b/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll @@ -7,6 +7,9 @@ ; RUN: FileCheck %s ; This test case aims to test the vector multiply instructions on Power10. +; This includes the low order and high order versions of vector multiply. +; The low order version operates on doublewords, whereas the high order version +; operates on signed and unsigned words and doublewords. 
define <2 x i64> @test_vmulld(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vmulld: @@ -17,3 +20,59 @@ entry: %mul = mul <2 x i64> %b, %a ret <2 x i64> %mul } + +define <2 x i64> @test_vmulhsd(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vmulhsd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmulhsd v2, v3, v2 +; CHECK-NEXT: blr +entry: + %0 = sext <2 x i64> %a to <2 x i128> + %1 = sext <2 x i64> %b to <2 x i128> + %mul = mul <2 x i128> %1, %0 + %shr = lshr <2 x i128> %mul, + %tr = trunc <2 x i128> %shr to <2 x i64> + ret <2 x i64> %tr +} + +define <2 x i64> @test_vmulhud(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vmulhud: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmulhud v2, v3, v2 +; CHECK-NEXT: blr +entry: + %0 = zext <2 x i64> %a to <2 x i128> + %1 = zext <2 x i64> %b to <2 x i128> + %mul = mul <2 x i128> %1, %0 + %shr = lshr <2 x i128> %mul, + %tr = trunc <2 x i128> %shr to <2 x i64> + ret <2 x i64> %tr +} + +define <4 x i32> @test_vmulhsw(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmulhsw: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmulhsw v2, v3, v2 +; CHECK-NEXT: blr +entry: + %0 = sext <4 x i32> %a to <4 x i64> + %1 = sext <4 x i32> %b to <4 x i64> + %mul = mul <4 x i64> %1, %0 + %shr = lshr <4 x i64> %mul, + %tr = trunc <4 x i64> %shr to <4 x i32> + ret <4 x i32> %tr +} + +define <4 x i32> @test_vmulhuw(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmulhuw: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmulhuw v2, v3, v2 +; CHECK-NEXT: blr +entry: + %0 = zext <4 x i32> %a to <4 x i64> + %1 = zext <4 x i32> %b to <4 x i64> + %mul = mul <4 x i64> %1, %0 + %shr = lshr <4 x i64> %mul, + %tr = trunc <4 x i64> %shr to <4 x i32> + ret <4 x i32> %tr +} From 33d9c4109ac234bcf17501ba16880ce80622cc9c Mon Sep 17 00:00:00 2001 From: Kuba Mracek Date: Fri, 24 Jul 2020 20:14:00 -0700 Subject: [PATCH 0073/1035] [tsan] Allow TSan in the Clang driver for Apple Silicon Macs Differential Revision: https://reviews.llvm.org/D84082 --- clang/lib/Driver/ToolChains/Darwin.cpp | 6 +++--- clang/test/Driver/fsanitize.c | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index f910c88fa9674..325dcb7df545c 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -2714,6 +2714,7 @@ void Darwin::CheckObjCARC() const { SanitizerMask Darwin::getSupportedSanitizers() const { const bool IsX86_64 = getTriple().getArch() == llvm::Triple::x86_64; + const bool IsAArch64 = getTriple().getArch() == llvm::Triple::aarch64; SanitizerMask Res = ToolChain::getSupportedSanitizers(); Res |= SanitizerKind::Address; Res |= SanitizerKind::PointerCompare; @@ -2731,9 +2732,8 @@ SanitizerMask Darwin::getSupportedSanitizers() const { && !(isTargetIPhoneOS() && isIPhoneOSVersionLT(5, 0))) Res |= SanitizerKind::Vptr; - if (isTargetMacOS()) { - if (IsX86_64) - Res |= SanitizerKind::Thread; + if ((IsX86_64 || IsAArch64) && isTargetMacOS()) { + Res |= SanitizerKind::Thread; } else if (isTargetIOSSimulator() || isTargetTvOSSimulator()) { if (IsX86_64) Res |= SanitizerKind::Thread; diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c index 7340bfb35e40c..cfefd3fb632cb 100644 --- a/clang/test/Driver/fsanitize.c +++ b/clang/test/Driver/fsanitize.c @@ -458,6 +458,10 @@ // RUN: %clang -target x86_64-apple-darwin -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-X86-64-DARWIN // CHECK-TSAN-X86-64-DARWIN-NOT: unsupported option +// RUN: 
%clang -target x86_64-apple-macos -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-X86-64-MACOS
+// CHECK-TSAN-X86-64-MACOS-NOT: unsupported option
+// RUN: %clang -target arm64-apple-macos -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-ARM64-MACOS
+// CHECK-TSAN-ARM64-MACOS-NOT: unsupported option
 // RUN: %clang -target x86_64-apple-iossimulator -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-X86-64-IOSSIMULATOR
 // CHECK-TSAN-X86-64-IOSSIMULATOR-NOT: unsupported option

From 604e33e83a55c854db5fb9594e667f114423bb58 Mon Sep 17 00:00:00 2001
From: Jessica Paquette
Date: Fri, 24 Jul 2020 18:14:41 -0700
Subject: [PATCH 0074/1035] [AArch64][GlobalISel] Look through constants when
 selecting stores of 0

Very minor code size improvements (hits 8 times in Bullet at -O3), but
still something.

Also a very minor NFC change to make sure we only search for a 0 constant
when selecting a store. Before, we'd do this for loads as well.

Differential Revision: https://reviews.llvm.org/D84573
---
 .../GISel/AArch64InstructionSelector.cpp      | 25 +++++++++++--------
 .../AArch64/GlobalISel/select-store.mir       | 17 +++++++++++++
 2 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 0b0de09fff29e..6de5e5ea5b955 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -2304,18 +2304,21 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     I.addOperand(MachineOperand::CreateImm(Offset));

     // If we're storing a 0, use WZR/XZR.
-    if (auto CVal = getConstantVRegVal(ValReg, MRI)) {
-      if (*CVal == 0 && Opcode == TargetOpcode::G_STORE) {
+    if (Opcode == TargetOpcode::G_STORE) {
+      auto CVal = getConstantVRegValWithLookThrough(
+          ValReg, MRI, /*LookThroughInstrs = */ true,
+          /*HandleFConstants = */ false);
+      if (CVal && CVal->Value == 0) {
         unsigned Opc = I.getOpcode();
-        switch(Opc) {
-        case AArch64::STRWui:
-        case AArch64::STRHHui:
-        case AArch64::STRBBui:
-          I.getOperand(0).setReg(AArch64::WZR);
-          break;
-        case AArch64::STRXui:
-          I.getOperand(0).setReg(AArch64::XZR);
-          break;
+        switch (Opc) {
+        case AArch64::STRWui:
+        case AArch64::STRHHui:
+        case AArch64::STRBBui:
+          I.getOperand(0).setReg(AArch64::WZR);
+          break;
+        case AArch64::STRXui:
+          I.getOperand(0).setReg(AArch64::XZR);
+          break;
         }
       }
     }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir
index d60a34ef6631a..db355dfc151f5 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir
@@ -13,6 +13,7 @@
     define void @store_zero_s32_gpr(i32* %addr) { ret void }
     define void @store_zero_s16(i32* %addr) { ret void }
     define void @store_zero_s8(i32* %addr) { ret void }
+    define void @store_zero_look_through_cst(i32* %addr) { ret void }

     define void @store_fi_s64_gpr() {
       %ptr0 = alloca i64
@@ -209,6 +210,22 @@ body: |
     G_STORE %1(s8), %0(p0) :: (store 1)

 ...
+---
+name: store_zero_look_through_cst
+legalized: true
+regBankSelected: true
+body: |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: store_zero_look_through_cst
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK: STRXui $xzr, [[COPY]], 0 :: (store 8 into %ir.addr)
+    %0:gpr(p0) = COPY $x0
+    %1:gpr(s32) = G_CONSTANT i32 0
+    %2:gpr(s64) = G_ZEXT %1
+    G_STORE %2, %0 :: (store 8 into %ir.addr)
+...
+
 ---
 name: store_fi_s64_gpr
 legalized: true

From 6c25fc35e096790a216fa5ddeda8c514c7536818 Mon Sep 17 00:00:00 2001
From: Nathan James
Date: Sat, 25 Jul 2020 10:37:33 +0100
Subject: [PATCH 0075/1035] [ADT] Add a range-based version of std::move

Adds a range-based version of `std::move`, i.e. the overload that moves a
range, not the one that creates r-value references.

Reviewed By: dblaikie, gamesh411

Differential Revision: https://reviews.llvm.org/D83902
---
 llvm/include/llvm/ADT/STLExtras.h    |  7 ++++
 llvm/unittests/ADT/STLExtrasTest.cpp | 52 ++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)

diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h
index 2e7a097174763..92eea4e83f693 100644
--- a/llvm/include/llvm/ADT/STLExtras.h
+++ b/llvm/include/llvm/ADT/STLExtras.h
@@ -1535,6 +1535,13 @@ OutputIt copy(R &&Range, OutputIt Out) {
   return std::copy(adl_begin(Range), adl_end(Range), Out);
 }

+/// Provide wrappers to std::move which take ranges instead of having to
+/// pass begin/end explicitly.
+template <typename R, typename OutputIt>
+OutputIt move(R &&Range, OutputIt Out) {
+  return std::move(adl_begin(Range), adl_end(Range), Out);
+}
+
 /// Wrapper function around std::find to detect if an element exists
 /// in a container.
 template <typename R, typename T>
diff --git a/llvm/unittests/ADT/STLExtrasTest.cpp b/llvm/unittests/ADT/STLExtrasTest.cpp
index 1d275d1db9094..f09d5986c0090 100644
--- a/llvm/unittests/ADT/STLExtrasTest.cpp
+++ b/llvm/unittests/ADT/STLExtrasTest.cpp
@@ -568,4 +568,56 @@ TEST(STLExtras, hasNItemsOrLess) {
   EXPECT_FALSE(
       hasNItemsOrLess(V3.begin(), V3.end(), 2, [](int x) { return x < 10; }));
 }
+
+TEST(STLExtras, MoveRange) {
+  class Foo {
+    bool A;
+
+  public:
+    Foo() : A(true) {}
+    Foo(const Foo &) = delete;
+    Foo(Foo &&Other) : A(Other.A) { Other.A = false; }
+    Foo &operator=(const Foo &) = delete;
+    Foo &operator=(Foo &&Other) {
+      if (this != &Other) {
+        A = Other.A;
+        Other.A = false;
+      }
+      return *this;
+    }
+    operator bool() const { return A; }
+  };
+
+  constexpr size_t ItemCount = 4;
+  SmallVector<Foo, ItemCount> V1, V2, V3, V4;
+  auto HasVal = [](const Foo &Item) { return static_cast<bool>(Item); };
+  auto Build = [&] {
+    SmallVector<Foo, ItemCount> Foos;
+    Foos.resize(ItemCount);
+    return Foos;
+  };
+
+  V1.resize(ItemCount);
+  EXPECT_TRUE(llvm::all_of(V1, HasVal));
+
+  llvm::move(V1, std::back_inserter(V2));
+
+  // Ensure input container is same size, but its contents were moved out.
+  EXPECT_EQ(V1.size(), ItemCount);
+  EXPECT_TRUE(llvm::none_of(V1, HasVal));
+
+  // Ensure output container has the contents of the input container.
+  EXPECT_EQ(V2.size(), ItemCount);
+  EXPECT_TRUE(llvm::all_of(V2, HasVal));
+
+  llvm::move(std::move(V2), std::back_inserter(V3));
+
+  EXPECT_TRUE(llvm::none_of(V2, HasVal));
+  EXPECT_EQ(V3.size(), ItemCount);
+  EXPECT_TRUE(llvm::all_of(V3, HasVal));
+
+  llvm::move(Build(), std::back_inserter(V4));
+  EXPECT_EQ(V4.size(), ItemCount);
+  EXPECT_TRUE(llvm::all_of(V4, HasVal));
+}
 } // namespace

From 66998ae59f4e334b285f735bb2d927976cdd323c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sat, 25 Jul 2020 10:50:56 +0100
Subject: [PATCH 0076/1035] [X86][SSE] getFauxShuffle - ignore undemanded
 sources for PACKSS/PACKUS faux shuffles

If we don't care about an entire LHS/RHS of the PACK op, then we can just
treat it the same as undef (we don't care if it saturates), and it is safe
to treat as a shuffle.

This can happen if we attempt to decode as a faux shuffle before
SimplifyDemandedVectorElts has been called on the PACK, which should
replace the source with UNDEF entirely.
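For illustration, a minimal sketch of the new reasoning (the helper name
here is hypothetical; the real change is in getFauxShuffleMask in the diff
below): a PACK source only needs the input-saturation check when at least
one of its lanes is actually demanded.

  // Sketch only, assuming LLVM's SDValue/APInt types: a PACK input may be
  // treated as undef for shuffle decoding if it is literally undef or if
  // none of its elements are demanded, because saturation in an
  // undemanded lane can never be observed.
  static bool canTreatPackSourceAsUndef(SDValue Src,
                                        const APInt &DemandedSrcElts) {
    return Src.isUndef() || DemandedSrcElts.isNullValue();
  }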
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 12 +-- llvm/test/CodeGen/X86/vector-reduce-mul.ll | 106 +++++++++------------ 2 files changed, 51 insertions(+), 67 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2e97bef6c106a..4dc68d469bbae 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7595,19 +7595,19 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, APInt EltsLHS, EltsRHS; getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS); - // If we know input saturation won't happen we can treat this - // as a truncation shuffle. + // If we know input saturation won't happen (or we don't care for particular + // lanes), we can treat this as a truncation shuffle. if (Opcode == X86ISD::PACKSS) { - if ((!N0.isUndef() && + if ((!(N0.isUndef() || EltsLHS.isNullValue()) && DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) || - (!N1.isUndef() && + (!(N1.isUndef() || EltsRHS.isNullValue()) && DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt)) return false; } else { APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt); - if ((!N0.isUndef() && + if ((!(N0.isUndef() || EltsLHS.isNullValue()) && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) || - (!N1.isUndef() && + (!(N1.isUndef() || EltsRHS.isNullValue()) && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1))) return false; } diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll index 08abd66e8eb77..bafe112c5dfc8 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -1697,9 +1697,8 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 ; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -1718,17 +1717,16 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; SSE41-NEXT: packuswb %xmm3, %xmm0 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: pmullw %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: packuswb %xmm3, %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pmullw %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: packuswb %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: packuswb %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $8, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pmullw %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; @@ -1741,10 +1739,8 
@@ define i8 @test_v16i8(<16 x i8> %a0) { ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $8, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -1889,9 +1885,8 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 ; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -1919,16 +1914,15 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; SSE41-NEXT: packuswb %xmm3, %xmm2 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; SSE41-NEXT: pmullw %xmm0, %xmm2 +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: packuswb %xmm3, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: packuswb %xmm3, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: pmullw %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: packuswb %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: pmullw %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $8, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pmullw %xmm2, %xmm0 +; SSE41-NEXT: pmullw %xmm1, %xmm0 ; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq @@ -1947,10 +1941,8 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $8, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -2130,9 +2122,8 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 ; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -2173,14 +2164,13 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; SSE41-NEXT: packuswb %xmm3, %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: pmullw %xmm0, %xmm1 +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: packuswb %xmm3, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: packuswb %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $8, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: pmullw %xmm1, %xmm0 ; SSE41-NEXT: movd %xmm0, %eax @@ -2210,10 +2200,8 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $8, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -2447,9 +2435,8 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: pmullw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax @@ -2514,17 +2501,16 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE41-NEXT: packuswb %xmm2, %xmm0 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: packuswb %xmm2, %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: packuswb %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm4 +; SSE41-NEXT: packuswb %xmm2, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero ; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $8, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pmullw %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; @@ -2569,10 +2555,8 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: 
vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrld $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax

From 4363ea61058b67fd6a86e3798ff7c21631270c6d Mon Sep 17 00:00:00 2001
From: Nathan James
Date: Sat, 25 Jul 2020 11:03:59 +0100
Subject: [PATCH 0077/1035] Fix C2975 error under MSVC

Apparently, a constexpr value isn't a compile-time constant under certain
versions of MSVC.
---
 llvm/unittests/ADT/STLExtrasTest.cpp | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/llvm/unittests/ADT/STLExtrasTest.cpp b/llvm/unittests/ADT/STLExtrasTest.cpp
index f09d5986c0090..aaef46a2cf05e 100644
--- a/llvm/unittests/ADT/STLExtrasTest.cpp
+++ b/llvm/unittests/ADT/STLExtrasTest.cpp
@@ -587,37 +587,35 @@ TEST(STLExtras, MoveRange) {
     }
     operator bool() const { return A; }
   };
-
-  constexpr size_t ItemCount = 4;
-  SmallVector<Foo, ItemCount> V1, V2, V3, V4;
+  SmallVector<Foo, 4U> V1, V2, V3, V4;
   auto HasVal = [](const Foo &Item) { return static_cast<bool>(Item); };
   auto Build = [&] {
-    SmallVector<Foo, ItemCount> Foos;
-    Foos.resize(ItemCount);
+    SmallVector<Foo, 4U> Foos;
+    Foos.resize(4U);
     return Foos;
   };

-  V1.resize(ItemCount);
+  V1.resize(4U);
   EXPECT_TRUE(llvm::all_of(V1, HasVal));

   llvm::move(V1, std::back_inserter(V2));

   // Ensure input container is same size, but its contents were moved out.
-  EXPECT_EQ(V1.size(), ItemCount);
+  EXPECT_EQ(V1.size(), 4U);
   EXPECT_TRUE(llvm::none_of(V1, HasVal));

   // Ensure output container has the contents of the input container.
-  EXPECT_EQ(V2.size(), ItemCount);
+  EXPECT_EQ(V2.size(), 4U);
   EXPECT_TRUE(llvm::all_of(V2, HasVal));

   llvm::move(std::move(V2), std::back_inserter(V3));

   EXPECT_TRUE(llvm::none_of(V2, HasVal));
-  EXPECT_EQ(V3.size(), ItemCount);
+  EXPECT_EQ(V3.size(), 4U);
   EXPECT_TRUE(llvm::all_of(V3, HasVal));

   llvm::move(Build(), std::back_inserter(V4));
-  EXPECT_EQ(V4.size(), ItemCount);
+  EXPECT_EQ(V4.size(), 4U);
   EXPECT_TRUE(llvm::all_of(V4, HasVal));
 }
 } // namespace

From 3c1476d26c769cd97a631a129b30c62232ac96b6 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Sat, 25 Jul 2020 11:52:14 +0100
Subject: [PATCH 0078/1035] [IPSCCP] Drop argmemonly after replacing pointer
 argument.

This patch updates IPSCCP to drop argmemonly and
inaccessiblemem_or_argmemonly if it replaces a pointer argument.

Fixes PR46717.
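As a hand-written C++ analogue of the problem (illustrative only, not
taken from PR46717; all names here are made up):

  static int g;
  // Before IPSCCP: the callee only writes memory reachable through its
  // pointer argument, so 'argmemonly' is accurate.
  static void callee(int *arg) { *arg = 10; }
  // After IPSCCP proves arg == &g and folds the argument to the constant,
  // the body is effectively a store to a global:
  static void calleeSpecialized() { g = 10; }
  // The store no longer goes through an argument, so keeping 'argmemonly'
  // (or inaccessiblemem_or_argmemonly) would let later passes draw wrong
  // aliasing conclusions.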
Reviewers: efriedma, davide, nikic, jdoerfert Reviewed By: efriedma, jdoerfert Differential Revision: https://reviews.llvm.org/D84432 --- llvm/lib/Transforms/Scalar/SCCP.cpp | 30 +++++++++++++++---- .../Transforms/SCCP/ipscp-drop-argmemonly.ll | 17 +++++------ 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index 32dc14e5ec195..428f9675e088c 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -1911,15 +1911,35 @@ bool llvm::runIPSCCP( SmallVector BlocksToErase; - if (Solver.isBlockExecutable(&F.front())) - for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E; - ++AI) { - if (!AI->use_empty() && tryToReplaceWithConstant(Solver, &*AI)) { + if (Solver.isBlockExecutable(&F.front())) { + bool ReplacedPointerArg = false; + for (Argument &Arg : F.args()) { + if (!Arg.use_empty() && tryToReplaceWithConstant(Solver, &Arg)) { + ReplacedPointerArg |= Arg.getType()->isPointerTy(); ++IPNumArgsElimed; - continue; } } + // If we replaced an argument, the argmemonly and + // inaccessiblemem_or_argmemonly attributes do not hold any longer. Remove + // them from both the function and callsites. + if (ReplacedPointerArg) { + SmallVector AttributesToRemove = { + Attribute::ArgMemOnly, Attribute::InaccessibleMemOrArgMemOnly}; + for (auto Attr : AttributesToRemove) + F.removeFnAttr(Attr); + + for (User *U : F.users()) { + auto *CB = dyn_cast(U); + if (!CB || CB->getCalledFunction() != &F) + continue; + + for (auto Attr : AttributesToRemove) + CB->removeAttribute(AttributeList::FunctionIndex, Attr); + } + } + } + SmallPtrSet InsertedValues; for (BasicBlock &BB : F) { if (!Solver.isBlockExecutable(&BB)) { diff --git a/llvm/test/Transforms/SCCP/ipscp-drop-argmemonly.ll b/llvm/test/Transforms/SCCP/ipscp-drop-argmemonly.ll index a110476eb4302..2e3a35779a15c 100644 --- a/llvm/test/Transforms/SCCP/ipscp-drop-argmemonly.ll +++ b/llvm/test/Transforms/SCCP/ipscp-drop-argmemonly.ll @@ -11,7 +11,7 @@ ; Here the pointer argument %arg will be replaced by a constant. We need to ; drop argmemonly. define internal void @ptrarg.1(i32* %arg, i32 %val) argmemonly nounwind { -; CHECK: Function Attrs: argmemonly nounwind +; CHECK: Function Attrs: nounwind ; CHECK-LABEL: @ptrarg.1( ; CHECK-NEXT: store i32 10, i32* @g, align 4 ; CHECK-NEXT: ret void @@ -59,7 +59,7 @@ define void @caller.2(i32* %ptr) { ; Here the pointer argument %arg will be replaced by a constant. We need to ; drop inaccessiblemem_or_argmemonly. define internal void @ptrarg.3(i32* %arg, i32 %val) inaccessiblemem_or_argmemonly nounwind { -; CHECK: Function Attrs: inaccessiblemem_or_argmemonly nounwind +; CHECK: Function Attrs: nounwind ; CHECK-LABEL: @ptrarg.3( ; CHECK-NEXT: store i32 10, i32* @g, align 4 ; CHECK-NEXT: ret void @@ -107,7 +107,7 @@ define void @caller.4(i32* %ptr) { ; Here the pointer argument %arg will be replaced by a constant. We need to ; drop inaccessiblemem_or_argmemonly. 
define internal void @ptrarg.5(i32* %arg, i32 %val) argmemonly inaccessiblemem_or_argmemonly nounwind { -; CHECK: Function Attrs: argmemonly inaccessiblemem_or_argmemonly nounwind +; CHECK: Function Attrs: nounwind ; CHECK-LABEL: @ptrarg.5( ; CHECK-NEXT: store i32 10, i32* @g, align 4 ; CHECK-NEXT: ret void @@ -143,9 +143,9 @@ define internal void @ptrarg.6.cs.attributes(i32* %arg, i32 %val) { define i32 @caller.6.cs.attributes(i32 %n) { ; CHECK-LABEL: @caller.6.cs.attributes( ; CHECK-NEXT: store i32 1, i32* @g, align 4 -; CHECK-NEXT: tail call void @ptrarg.5(i32* @g, i32 10) [[ARGMEMONLY_INACCESSIBLEMEM_OR_ARGMEMONLY_NOUNWIND:#[0-9]+]] -; CHECK-NEXT: tail call void @ptrarg.5(i32* @g, i32 10) [[INACCESSIBLEMEM_OR_ARGMEMONLY_NOUNWIND:#[0-9]+]] -; CHECK-NEXT: tail call void @ptrarg.5(i32* @g, i32 10) [[ARGMEMONLY_NOUNWIND:#[0-9]+]] +; CHECK-NEXT: tail call void @ptrarg.5(i32* @g, i32 10) [[NOUNWIND:#[0-9]+]] +; CHECK-NEXT: tail call void @ptrarg.5(i32* @g, i32 10) [[NOUNWIND:#[0-9]+]] +; CHECK-NEXT: tail call void @ptrarg.5(i32* @g, i32 10) [[NOUNWIND:#[0-9]+]] ; CHECK-NEXT: tail call void @ptrarg.5(i32* @g, i32 10) [[NOUNWIND:#[0-9]+]] ; CHECK-NEXT: [[G_VAL:%.*]] = load i32, i32* @g, align 4 ; CHECK-NEXT: ret i32 [[G_VAL]] @@ -159,7 +159,4 @@ define i32 @caller.6.cs.attributes(i32 %n) { ret i32 %g.val } -; CHECK-DAG: [[ARGMEMONLY_INACCESSIBLEMEM_OR_ARGMEMONLY_NOUNWIND]] = { argmemonly inaccessiblemem_or_argmemonly nounwind } -; CHECK-DAG: [[INACCESSIBLEMEM_OR_ARGMEMONLY_NOUNWIND]] = { inaccessiblemem_or_argmemonly nounwind } -; CHECK-DAG: [[ARGMEMONLY_NOUNWIND]] = { argmemonly nounwind } -; CHECK-DAG: [[NOUNWIND]] = { nounwind } +; CHECK: [[NOUNWIND]] = { nounwind } From 18d481cdf9d0e4a8ac5ad05007f34c944327f3d8 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 25 Jul 2020 11:35:47 +0100 Subject: [PATCH 0079/1035] SymbolRemappingReader.h - pass Twine by reference not value. NFCI. --- llvm/include/llvm/Support/SymbolRemappingReader.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/Support/SymbolRemappingReader.h b/llvm/include/llvm/Support/SymbolRemappingReader.h index 2b9ab570eb8b3..820cf9e021920 100644 --- a/llvm/include/llvm/Support/SymbolRemappingReader.h +++ b/llvm/include/llvm/Support/SymbolRemappingReader.h @@ -68,7 +68,7 @@ namespace llvm { class SymbolRemappingParseError : public ErrorInfo { public: - SymbolRemappingParseError(StringRef File, int64_t Line, Twine Message) + SymbolRemappingParseError(StringRef File, int64_t Line, const Twine &Message) : File(File), Line(Line), Message(Message.str()) {} void log(llvm::raw_ostream &OS) const override { From 3b21823e4abb501ca0b0de7777493be56a11cb67 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 25 Jul 2020 12:08:06 +0100 Subject: [PATCH 0080/1035] [X86][SSE] combineX86ShufflesRecursively - move all Root node asserts to the same location. NFCI. Minor tidyup for some upcoming shuffle combine improvements. 
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4dc68d469bbae..24bc264df129f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35346,6 +35346,9 @@ static SDValue combineX86ShufflesRecursively(
   assert(RootMask.size() > 0 &&
          (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
          "Illegal shuffle root mask");
+  assert(Root.getSimpleValueType().isVector() &&
+         "Shuffles operate on vector types!");
+  unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();

   // Bound the depth of our recursive combine because this is ultimately
   // quadratic in nature.
@@ -35361,9 +35364,6 @@ static SDValue combineX86ShufflesRecursively(
   if (!VT.isVector())
     return SDValue(); // Bail if we hit a non-vector.

-  assert(Root.getSimpleValueType().isVector() &&
-         "Shuffles operate on vector types!");
-  unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
   assert(VT.getSizeInBits() == RootSizeInBits &&
          "Can only combine shuffles of the same vector register size.");

From b5e14d78f16960ebb94f0a22e2a8c10f421f8757 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sat, 25 Jul 2020 12:58:39 +0100
Subject: [PATCH 0081/1035] SimplifyLibCalls - remove unnecessary header and
 forward declaration. NFC.

We include TargetLibraryInfo.h, so we don't need to forward declare it,
and we don't need to include TargetLibraryInfo.h in SimplifyLibCalls.cpp
as well.
---
 llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h | 1 -
 llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp        | 1 -
 2 files changed, 2 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
index d6ee19365c729..2819a3468766d 100644
--- a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
+++ b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
@@ -24,7 +24,6 @@ class CallInst;
 class DataLayout;
 class Instruction;
 class IRBuilderBase;
-class TargetLibraryInfo;
 class Function;
 class OptimizationRemarkEmitter;
 class BlockFrequencyInfo;
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index cfcc3454a2102..b03389aec7bf4 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -20,7 +20,6 @@
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/CaptureTracking.h"

From ad16e71c953c92dbb4ffe9581c4e31a4cde70ccf Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Tue, 21 Jul 2020 21:26:30 +0200
Subject: [PATCH 0082/1035] Reapply [SCCP] Directly remove non-feasible edges

Reapply with DTU update moved after CFG update, which is a requirement
of the API.

-----

Non-feasible control-flow edges are currently removed by replacing
the branch condition with a constant and then calling
ConstantFoldTerminator. This happens in a rather roundabout manner,
by inspecting the users (effectively: predecessors) of unreachable
blocks, and further complicated by the need to explicitly materialize
the condition for "forced" edges.
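Roughly, the status quo looks like this (a simplified sketch covering only
the conditional-branch case; the real code, removed below, also handles
switch and indirectbr):

  // Materialize a constant condition that selects the surviving edge,
  // then let the generic utility rewrite the terminator and drop the
  // now-dead edge.
  BI->setCondition(ConstantInt::getFalse(BI->getContext()));
  ConstantFoldTerminator(BI->getParent(), /*DeleteDeadConditions=*/false,
                         /*TLI=*/nullptr, &DTU);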
I would like to extend SCCP to discard switch conditions that are non-feasible based on range information, but this is incompatible with the current approach (as there is no single constant we could use.) Instead, this patch explicitly removes non-feasible edges. It currently only needs to handle the case where there is a single feasible edge. The llvm_unreachable() branch will need to be implemented for the aforementioned switch improvement. Differential Revision: https://reviews.llvm.org/D84264 --- llvm/lib/Transforms/Scalar/SCCP.cpp | 118 +++++++----------- .../test/Transforms/SCCP/conditions-ranges.ll | 4 +- llvm/test/Transforms/SCCP/domtree-update.ll | 41 ++++++ .../Transforms/SCCP/predicateinfo-cond.ll | 2 +- .../SCCP/resolvedundefsin-tracked-fn.ll | 8 +- .../SCCP/switch-constantfold-crash.ll | 12 +- llvm/test/Transforms/SCCP/switch.ll | 8 +- llvm/test/Transforms/SCCP/widening.ll | 14 +-- 8 files changed, 110 insertions(+), 97 deletions(-) create mode 100644 llvm/test/Transforms/SCCP/domtree-update.ll diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index 428f9675e088c..270524a009596 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -276,7 +276,7 @@ class SCCPSolver : public InstVisitor { // isEdgeFeasible - Return true if the control flow edge from the 'From' basic // block to the 'To' basic block is currently feasible. - bool isEdgeFeasible(BasicBlock *From, BasicBlock *To); + bool isEdgeFeasible(BasicBlock *From, BasicBlock *To) const; std::vector getStructLatticeValueFor(Value *V) const { std::vector StructValues; @@ -705,7 +705,7 @@ void SCCPSolver::getFeasibleSuccessors(Instruction &TI, // isEdgeFeasible - Return true if the control flow edge from the 'From' basic // block to the 'To' basic block is currently feasible. -bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { +bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) const { // Check if we've called markEdgeExecutable on the edge yet. (We could // be more aggressive and try to consider edges which haven't been marked // yet, but there isn't any need.) @@ -1807,39 +1807,51 @@ static void findReturnsToZap(Function &F, } } -// Update the condition for terminators that are branching on indeterminate -// values, forcing them to use a specific edge. -static void forceIndeterminateEdge(Instruction* I, SCCPSolver &Solver) { - BasicBlock *Dest = nullptr; - Constant *C = nullptr; - if (SwitchInst *SI = dyn_cast(I)) { - if (!isa(SI->getCondition())) { - // Indeterminate switch; use first case value. - Dest = SI->case_begin()->getCaseSuccessor(); - C = SI->case_begin()->getCaseValue(); - } - } else if (BranchInst *BI = dyn_cast(I)) { - if (!isa(BI->getCondition())) { - // Indeterminate branch; use false. - Dest = BI->getSuccessor(1); - C = ConstantInt::getFalse(BI->getContext()); - } - } else if (IndirectBrInst *IBR = dyn_cast(I)) { - if (!isa(IBR->getAddress()->stripPointerCasts())) { - // Indeterminate indirectbr; use successor 0. 
- Dest = IBR->getSuccessor(0); - C = BlockAddress::get(IBR->getSuccessor(0)); - } - } else { - llvm_unreachable("Unexpected terminator instruction"); +static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB, + DomTreeUpdater &DTU) { + SmallPtrSet FeasibleSuccessors; + bool HasNonFeasibleEdges = false; + for (BasicBlock *Succ : successors(BB)) { + if (Solver.isEdgeFeasible(BB, Succ)) + FeasibleSuccessors.insert(Succ); + else + HasNonFeasibleEdges = true; } - if (C) { - assert(Solver.isEdgeFeasible(I->getParent(), Dest) && - "Didn't find feasible edge?"); - (void)Dest; - I->setOperand(0, C); + // All edges feasible, nothing to do. + if (!HasNonFeasibleEdges) + return false; + + // SCCP can only determine non-feasible edges for br, switch and indirectbr. + Instruction *TI = BB->getTerminator(); + assert((isa(TI) || isa(TI) || + isa(TI)) && + "Terminator must be a br, switch or indirectbr"); + + if (FeasibleSuccessors.size() == 1) { + // Replace with an unconditional branch to the only feasible successor. + BasicBlock *OnlyFeasibleSuccessor = *FeasibleSuccessors.begin(); + SmallVector Updates; + bool HaveSeenOnlyFeasibleSuccessor = false; + for (BasicBlock *Succ : successors(BB)) { + if (Succ == OnlyFeasibleSuccessor && !HaveSeenOnlyFeasibleSuccessor) { + // Don't remove the edge to the only feasible successor the first time + // we see it. We still do need to remove any multi-edges to it though. + HaveSeenOnlyFeasibleSuccessor = true; + continue; + } + + Succ->removePredecessor(BB); + Updates.push_back({DominatorTree::Delete, BB, Succ}); + } + + BranchInst::Create(OnlyFeasibleSuccessor, BB); + TI->eraseFromParent(); + DTU.applyUpdatesPermissive(Updates); + } else { + llvm_unreachable("Either all successors are feasible, or exactly one is"); } + return true; } bool llvm::runIPSCCP( @@ -1972,45 +1984,11 @@ bool llvm::runIPSCCP( /*UseLLVMTrap=*/false, /*PreserveLCSSA=*/false, &DTU); - // Now that all instructions in the function are constant folded, - // use ConstantFoldTerminator to get rid of in-edges, record DT updates and - // delete dead BBs. - for (BasicBlock *DeadBB : BlocksToErase) { - // If there are any PHI nodes in this successor, drop entries for BB now. - for (Value::user_iterator UI = DeadBB->user_begin(), - UE = DeadBB->user_end(); - UI != UE;) { - // Grab the user and then increment the iterator early, as the user - // will be deleted. Step past all adjacent uses from the same user. - auto *I = dyn_cast(*UI); - do { ++UI; } while (UI != UE && *UI == I); - - // Ignore blockaddress users; BasicBlock's dtor will handle them. - if (!I) continue; - - // If we have forced an edge for an indeterminate value, then force the - // terminator to fold to that edge. - forceIndeterminateEdge(I, Solver); - BasicBlock *InstBB = I->getParent(); - bool Folded = ConstantFoldTerminator(InstBB, - /*DeleteDeadConditions=*/false, - /*TLI=*/nullptr, &DTU); - assert(Folded && - "Expect TermInst on constantint or blockaddress to be folded"); - (void) Folded; - // If we folded the terminator to an unconditional branch to another - // dead block, replace it with Unreachable, to avoid trying to fold that - // branch again. - BranchInst *BI = cast(InstBB->getTerminator()); - if (BI && BI->isUnconditional() && - !Solver.isBlockExecutable(BI->getSuccessor(0))) { - InstBB->getTerminator()->eraseFromParent(); - new UnreachableInst(InstBB->getContext(), InstBB); - } - } - // Mark dead BB for deletion. 
+ for (BasicBlock &BB : F) + removeNonFeasibleEdges(Solver, &BB, DTU); + + for (BasicBlock *DeadBB : BlocksToErase) DTU.deleteBB(DeadBB); - } for (BasicBlock &BB : F) { for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) { diff --git a/llvm/test/Transforms/SCCP/conditions-ranges.ll b/llvm/test/Transforms/SCCP/conditions-ranges.ll index 612a38f008fc5..dada59099d81b 100644 --- a/llvm/test/Transforms/SCCP/conditions-ranges.ll +++ b/llvm/test/Transforms/SCCP/conditions-ranges.ll @@ -231,12 +231,12 @@ define void @f7_nested_conds(i32* %a, i32 %b) { ; CHECK-NEXT: [[C_1:%.*]] = icmp ne i32 [[A_V]], 0 ; CHECK-NEXT: br i1 [[C_1]], label [[TRUE:%.*]], label [[FALSE:%.*]] ; CHECK: false: -; CHECK-NEXT: br i1 true, label [[TRUE_2:%.*]], label [[TRUE]] +; CHECK-NEXT: br label [[TRUE_2:%.*]] ; CHECK: true.2: ; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: ret void ; CHECK: true: -; CHECK-NEXT: store i32 [[B:%.*]], i32* [[A]] +; CHECK-NEXT: store i32 [[B:%.*]], i32* [[A]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SCCP/domtree-update.ll b/llvm/test/Transforms/SCCP/domtree-update.ll new file mode 100644 index 0000000000000..32adbde300e88 --- /dev/null +++ b/llvm/test/Transforms/SCCP/domtree-update.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -ipsccp < %s | FileCheck %s +; RUN: opt -S -passes='ipsccp,function(verify)' < %s | FileCheck %s + +; DTU should not crash. + +define i32 @test() { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: br label [[IF_THEN2:%.*]] +; CHECK: if.then2: +; CHECK-NEXT: br label [[FOR_INC:%.*]] +; CHECK: for.inc: +; CHECK-NEXT: unreachable +; +entry: + br label %for.body + +for.body: ; preds = %entry + br i1 true, label %if.then2, label %if.else + +if.then2: ; preds = %for.body + br label %for.inc + +if.else: ; preds = %for.body + br i1 undef, label %lor.rhs, label %if.then19.critedge + +lor.rhs: ; preds = %if.else + br i1 undef, label %if.then19, label %for.inc + +if.then19.critedge: ; preds = %if.else + br label %if.then19 + +if.then19: ; preds = %if.then19.critedge, %lor.rhs + unreachable + +for.inc: ; preds = %lor.rhs, %if.then2 + unreachable +} diff --git a/llvm/test/Transforms/SCCP/predicateinfo-cond.ll b/llvm/test/Transforms/SCCP/predicateinfo-cond.ll index 8ed96ec9301f5..1443cc72c2ef8 100644 --- a/llvm/test/Transforms/SCCP/predicateinfo-cond.ll +++ b/llvm/test/Transforms/SCCP/predicateinfo-cond.ll @@ -105,7 +105,7 @@ define void @pr46814(i32 %a) { ; CHECK-NEXT: [[C3:%.*]] = and i1 [[C1]], [[C2]] ; CHECK-NEXT: br i1 [[C3]], label [[IF_1:%.*]], label [[EXIT:%.*]] ; CHECK: if.1: -; CHECK-NEXT: br i1 true, label [[IF_2:%.*]], label [[EXIT]] +; CHECK-NEXT: br label [[IF_2:%.*]] ; CHECK: if.2: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[EXIT]] ; CHECK: exit: diff --git a/llvm/test/Transforms/SCCP/resolvedundefsin-tracked-fn.ll b/llvm/test/Transforms/SCCP/resolvedundefsin-tracked-fn.ll index e1c7b3d5662d0..9e9d1256c4cc0 100644 --- a/llvm/test/Transforms/SCCP/resolvedundefsin-tracked-fn.ll +++ b/llvm/test/Transforms/SCCP/resolvedundefsin-tracked-fn.ll @@ -136,13 +136,12 @@ define internal i1 @test2_g(%t1* %h, i32 %i) { ; CHECK-LABEL: define {{[^@]+}}@test2_g ; CHECK-SAME: (%t1* [[H:%.*]], i32 [[I:%.*]]) ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 true, label [[LAND_RHS:%.*]], label [[LAND_END:%.*]] +; CHECK-NEXT: br label [[LAND_RHS:%.*]] ; CHECK: land.rhs: ; CHECK-NEXT: [[CALL:%.*]] = call 
i32 (...) @test2_j() ; CHECK-NEXT: [[TOBOOL1:%.*]] = icmp ne i32 [[CALL]], 0 -; CHECK-NEXT: br label [[LAND_END]] +; CHECK-NEXT: br label [[LAND_END:%.*]] ; CHECK: land.end: -; CHECK-NEXT: [[TMP0:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[TOBOOL1]], [[LAND_RHS]] ] ; CHECK-NEXT: ret i1 undef ; entry: @@ -196,10 +195,9 @@ define internal i32 @test3_k(i8 %h, i32 %i) { ; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[CONV]] to %t1* ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[PHI:%.*]] = phi i1 [ undef, [[ENTRY:%.*]] ], [ false, [[LOOP]] ] ; CHECK-NEXT: [[CALL:%.*]] = call i1 @test3_g(%t1* [[TMP1]], i32 0) ; CHECK-NEXT: call void @use.1(i1 false) -; CHECK-NEXT: br i1 false, label [[LOOP]], label [[EXIT:%.*]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret i32 undef ; diff --git a/llvm/test/Transforms/SCCP/switch-constantfold-crash.ll b/llvm/test/Transforms/SCCP/switch-constantfold-crash.ll index 7596a56b81229..17b37f000407c 100644 --- a/llvm/test/Transforms/SCCP/switch-constantfold-crash.ll +++ b/llvm/test/Transforms/SCCP/switch-constantfold-crash.ll @@ -5,11 +5,11 @@ define void @barney() { ; CHECK-LABEL: @barney( ; CHECK-NEXT: bb: -; CHECK-NEXT: br label %bb9 +; CHECK-NEXT: br label [[BB9:%.*]] ; CHECK: bb6: ; CHECK-NEXT: unreachable ; CHECK: bb9: -; CHECK-NEXT: unreachable +; CHECK-NEXT: br label [[BB6:%.*]] ; bb: br label %bb9 @@ -29,9 +29,9 @@ bb9: ; preds = %bb define void @blam() { ; CHECK-LABEL: @blam( ; CHECK-NEXT: bb: -; CHECK-NEXT: br label %bb16 +; CHECK-NEXT: br label [[BB16:%.*]] ; CHECK: bb16: -; CHECK-NEXT: br label %bb38 +; CHECK-NEXT: br label [[BB38:%.*]] ; CHECK: bb38: ; CHECK-NEXT: unreachable ; @@ -62,9 +62,9 @@ bb38: ; preds = %bb16 define void @hoge() { ; CHECK-LABEL: @hoge( ; CHECK-NEXT: bb: -; CHECK-NEXT: br label %bb2 +; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: unreachable +; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb3: ; CHECK-NEXT: unreachable ; diff --git a/llvm/test/Transforms/SCCP/switch.ll b/llvm/test/Transforms/SCCP/switch.ll index 5f607a3afd89d..3587587bcb91d 100644 --- a/llvm/test/Transforms/SCCP/switch.ll +++ b/llvm/test/Transforms/SCCP/switch.ll @@ -23,15 +23,11 @@ define i32 @test_duplicate_successors_phi(i1 %c, i32 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C:%.*]], label [[SWITCH:%.*]], label [[END:%.*]] ; CHECK: switch: -; CHECK-NEXT: switch i32 -1, label [[SWITCH_DEFAULT:%.*]] [ -; CHECK-NEXT: i32 0, label [[END]] -; CHECK-NEXT: i32 1, label [[END]] -; CHECK-NEXT: ] +; CHECK-NEXT: br label [[SWITCH_DEFAULT:%.*]] ; CHECK: switch.default: ; CHECK-NEXT: ret i32 -1 ; CHECK: end: -; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[X:%.*]], [[ENTRY:%.*]] ], [ 1, [[SWITCH]] ], [ 1, [[SWITCH]] ] -; CHECK-NEXT: ret i32 [[PHI]] +; CHECK-NEXT: ret i32 [[X:%.*]] ; entry: br i1 %c, label %switch, label %end diff --git a/llvm/test/Transforms/SCCP/widening.ll b/llvm/test/Transforms/SCCP/widening.ll index 2703bdb27dff4..23a88c35a93ea 100644 --- a/llvm/test/Transforms/SCCP/widening.ll +++ b/llvm/test/Transforms/SCCP/widening.ll @@ -216,11 +216,11 @@ define void @rotated_loop_2(i32 %x) { ; IPSCCP: bb3: ; IPSCCP-NEXT: br label [[EXIT]] ; IPSCCP: exit: -; IPSCCP-NEXT: [[P:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ 3, [[BB1]] ], [ 2, [[BB2]] ], [ 5, [[BB3]] ], [ [[A:%.*]], [[EXIT]] ] -; IPSCCP-NEXT: [[A]] = add i32 [[P]], 1 +; IPSCCP-NEXT: [[P:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ 3, [[BB1]] ], [ 2, [[BB2]] ], [ 5, [[BB3]] ] +; IPSCCP-NEXT: [[A:%.*]] = add i32 [[P]], 1 ; IPSCCP-NEXT: call void @use(i1 true) ; 
IPSCCP-NEXT: call void @use(i1 false) -; IPSCCP-NEXT: br i1 false, label [[EXIT]], label [[EXIT_1:%.*]] +; IPSCCP-NEXT: br label [[EXIT_1:%.*]] ; IPSCCP: exit.1: ; IPSCCP-NEXT: ret void ; @@ -451,10 +451,10 @@ define void @foo(i64* %arg) { ; SCCP-NEXT: [[TMP7:%.*]] = sub i64 3, [[TMP6]] ; SCCP-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 1 ; SCCP-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 -; SCCP-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +; SCCP-NEXT: [[TMP0:%.*]] = zext i32 [[TMP9]] to i64 ; SCCP-NEXT: br label [[BB11:%.*]] ; SCCP: bb11: -; SCCP-NEXT: [[TMP12:%.*]] = phi i64 [ [[TMP10]], [[BB4]] ], [ [[TMP17:%.*]], [[BB18:%.*]] ] +; SCCP-NEXT: [[TMP12:%.*]] = phi i64 [ [[TMP0]], [[BB4]] ], [ [[TMP17:%.*]], [[BB18:%.*]] ] ; SCCP-NEXT: br label [[BB13:%.*]] ; SCCP: bb13: ; SCCP-NEXT: [[C_1:%.*]] = icmp eq i64 [[TMP12]], 6 @@ -489,10 +489,10 @@ define void @foo(i64* %arg) { ; IPSCCP-NEXT: [[TMP7:%.*]] = sub i64 3, [[TMP6]] ; IPSCCP-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 1 ; IPSCCP-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 -; IPSCCP-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +; IPSCCP-NEXT: [[TMP0:%.*]] = zext i32 [[TMP9]] to i64 ; IPSCCP-NEXT: br label [[BB11:%.*]] ; IPSCCP: bb11: -; IPSCCP-NEXT: [[TMP12:%.*]] = phi i64 [ [[TMP10]], [[BB4]] ], [ [[TMP17:%.*]], [[BB18:%.*]] ] +; IPSCCP-NEXT: [[TMP12:%.*]] = phi i64 [ [[TMP0]], [[BB4]] ], [ [[TMP17:%.*]], [[BB18:%.*]] ] ; IPSCCP-NEXT: br label [[BB13:%.*]] ; IPSCCP: bb13: ; IPSCCP-NEXT: [[C_1:%.*]] = icmp eq i64 [[TMP12]], 6 From 632a89e866f38e8fdfae7785ffa94fb2f302eefc Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Sat, 25 Jul 2020 15:10:48 +0200 Subject: [PATCH 0083/1035] [SCCP] Restore the change reporting as well Reapply 5db5b4bc4394ca247c9eb665e03b851848aa2fbf. --- llvm/lib/Transforms/Scalar/SCCP.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index 270524a009596..c4f5c522e4509 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -1985,7 +1985,7 @@ bool llvm::runIPSCCP( /*PreserveLCSSA=*/false, &DTU); for (BasicBlock &BB : F) - removeNonFeasibleEdges(Solver, &BB, DTU); + MadeChanges |= removeNonFeasibleEdges(Solver, &BB, DTU); for (BasicBlock *DeadBB : BlocksToErase) DTU.deleteBB(DeadBB); From f4199b8f0bdbea12ff69dbb78336a05894f81a8c Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Sat, 25 Jul 2020 16:02:15 +0200 Subject: [PATCH 0084/1035] [SCCP] Add assume non null test (NFC) --- llvm/test/Transforms/SCCP/assume.ll | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/llvm/test/Transforms/SCCP/assume.ll b/llvm/test/Transforms/SCCP/assume.ll index 764c1737c2874..dc827f03c0abe 100644 --- a/llvm/test/Transforms/SCCP/assume.ll +++ b/llvm/test/Transforms/SCCP/assume.ll @@ -46,3 +46,30 @@ define void @basic(i32 %v) { call void @use(i1 %c8) ret void } + +define void @nonnull(i32* %v) { +; CHECK-LABEL: @nonnull( +; CHECK-NEXT: [[A:%.*]] = icmp ne i32* [[V:%.*]], null +; CHECK-NEXT: call void @llvm.assume(i1 [[A]]) +; CHECK-NEXT: [[C1:%.*]] = icmp eq i32* [[V]], null +; CHECK-NEXT: call void @use(i1 [[C1]]) +; CHECK-NEXT: [[C2:%.*]] = icmp ne i32* [[V]], null +; CHECK-NEXT: call void @use(i1 [[C2]]) +; CHECK-NEXT: [[C3:%.*]] = icmp eq i32* null, [[V]] +; CHECK-NEXT: call void @use(i1 [[C3]]) +; CHECK-NEXT: [[C4:%.*]] = icmp ne i32* null, [[V]] +; CHECK-NEXT: call void @use(i1 [[C4]]) +; CHECK-NEXT: ret void +; + %a = icmp ne i32* %v, null + call void @llvm.assume(i1 %a) + 
%c1 = icmp eq i32* %v, null
+  call void @use(i1 %c1)
+  %c2 = icmp ne i32* %v, null
+  call void @use(i1 %c2)
+  %c3 = icmp eq i32* null, %v
+  call void @use(i1 %c3)
+  %c4 = icmp ne i32* null, %v
+  call void @use(i1 %c4)
+  ret void
+}

From 392b969c3203001d076eafc080d6d014827d39ca Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Mon, 15 Jun 2020 20:13:24 -0400
Subject: [PATCH 0085/1035] AMDGPU/GlobalISel: Don't assert on G_INSERT > 128-bits

Just fall back for now. Really, tablegen needs to generate all of the
subregister index handling we need.
---
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |  4 +++
 .../GlobalISel/inst-select-insert.xfail.mir   | 26 ++++++++++++++++---
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index a637442002932..4740a58519996 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -736,6 +736,10 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
   if (Offset % 32 != 0 || InsSize % 32 != 0)
     return false;
 
+  // Currently not handled by getSubRegFromChannel.
+  if (InsSize > 128)
+    return false;
+
   unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
   if (SubReg == AMDGPU::NoSubRegister)
     return false;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.xfail.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.xfail.mir
index 5e58e8b633ec4..150b341561f97 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.xfail.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.xfail.mir
@@ -1,9 +1,13 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s
-# FIXME: This should not be legal and this test should be deleted
-# ERR: remark: <unknown>:0:0: cannot select: %3:sgpr(<4 x s16>) = G_INSERT %0:sgpr, %2:sgpr(s16), 0 (in function: insert_sgpr_2s16_to_v4s16_offset0)
+# ERR: remark: <unknown>:0:0: cannot select: %3:sgpr(<4 x s16>) = G_INSERT %0:sgpr, %2:sgpr(s16), 0 (in function: insert_sgpr_s16_to_v4s16_offset0)
+# ERR-NEXT: remark: <unknown>:0:0: cannot select: %2:sgpr(<16 x s32>) = G_INSERT %0:sgpr, %1:sgpr(<8 x s32>), 0 (in function: insert_sgpr_v8s32_to_v16s32_offset0)
+# ERR-NOT: remark
+
+# FIXME: This 16-bit insert source should not be legal and this test
+# should be deleted
 
 ---
-name: insert_sgpr_2s16_to_v4s16_offset0
+name: insert_sgpr_s16_to_v4s16_offset0
 legalized: true
 regBankSelected: true
 
@@ -17,3 +21,19 @@ body: |
     S_ENDPGM 0, implicit %3
 
 ...
+
+# getSubRegFromChannel currently does not handle cases > 128-bits
+---
+name: insert_sgpr_v8s32_to_v16s32_offset0
+legalized: true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    %0:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    %1:sgpr(<8 x s32>) = COPY $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+    %2:sgpr(<16 x s32>) = G_INSERT %0, %1, 0
+    S_ENDPGM 0, implicit %2
+
+...
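The guard added in the patch above is the standard GlobalISel escape hatch: returning false from a select routine leaves the instruction unselected, and with -global-isel-abort=2 the backend emits the "cannot select" remark and falls back instead of asserting inside getSubRegFromChannel. A minimal sketch of that shape, using a hypothetical stand-in function rather than the real selector API:

// Sketch only: models the bail-out pattern of selectG_INSERT above.
// Returning false means "not selected", which triggers the fallback path.
bool selectInsertLike(unsigned Offset, unsigned InsSize) {
  // Sub-register indices are addressed in 32-bit channels.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;
  // The getSubRegFromChannel-style tables stop at 128 bits, so wider
  // inserts are rejected up front rather than asserting later.
  if (InsSize > 128)
    return false;
  return true; // safe to proceed with sub-register based selection
}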
From bc79ed7e16003c8550da8710b321d6d5d4243faf Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Sat, 25 Jul 2020 16:32:22 +0200 Subject: [PATCH 0086/1035] [LVI] Don't require operand number for range (NFC) Pass the Value* instead of the operand number, rename I to CxtI. This makes the function a bit more generally useful. --- llvm/lib/Analysis/LazyValueInfo.cpp | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index f5ffa7286b3b8..34cc81c4bf2a3 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -388,8 +388,8 @@ class LazyValueInfoImpl { BasicBlock *BB); Optional solveBlockValueSelect(SelectInst *S, BasicBlock *BB); - Optional getRangeForOperand(unsigned Op, Instruction *I, - BasicBlock *BB); + Optional getRangeFor(Value *V, Instruction *CxtI, + BasicBlock *BB); Optional solveBlockValueBinaryOpImpl( Instruction *I, BasicBlock *BB, std::function LazyValueInfoImpl::solveBlockValueSelect( return Result; } -Optional LazyValueInfoImpl::getRangeForOperand(unsigned Op, - Instruction *I, - BasicBlock *BB) { - Optional OptVal = getBlockValue(I->getOperand(Op), BB); +Optional LazyValueInfoImpl::getRangeFor(Value *V, + Instruction *CxtI, + BasicBlock *BB) { + Optional OptVal = getBlockValue(V, BB); if (!OptVal) return None; ValueLatticeElement &Val = *OptVal; - intersectAssumeOrGuardBlockValueConstantRange(I->getOperand(Op), Val, I); + intersectAssumeOrGuardBlockValueConstantRange(V, Val, CxtI); if (Val.isConstantRange()) return Val.getConstantRange(); - const unsigned OperandBitWidth = - DL.getTypeSizeInBits(I->getOperand(Op)->getType()); + const unsigned OperandBitWidth = DL.getTypeSizeInBits(V->getType()); return ConstantRange::getFull(OperandBitWidth); } @@ -962,7 +961,7 @@ Optional LazyValueInfoImpl::solveBlockValueCast( // Figure out the range of the LHS. If that fails, we still apply the // transfer rule on the full set since we may be able to locally infer // interesting facts. - Optional LHSRes = getRangeForOperand(0, CI, BB); + Optional LHSRes = getRangeFor(CI->getOperand(0), CI, BB); if (!LHSRes.hasValue()) // More work to do before applying this transfer rule. return None; @@ -985,8 +984,8 @@ Optional LazyValueInfoImpl::solveBlockValueBinaryOpImpl( // conservative range, but apply the transfer rule anyways. This // lets us pick up facts from expressions like "and i32 (call i32 // @foo()), 32" - Optional LHSRes = getRangeForOperand(0, I, BB); - Optional RHSRes = getRangeForOperand(1, I, BB); + Optional LHSRes = getRangeFor(I->getOperand(0), I, BB); + Optional RHSRes = getRangeFor(I->getOperand(1), I, BB); if (!LHSRes.hasValue() || !RHSRes.hasValue()) // More work to do before applying this transfer rule. return None; From c09a10845b429307a38a93799e7520c0e16850fd Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 25 Jul 2020 15:45:24 +0100 Subject: [PATCH 0087/1035] [X86] Remove stress-scheduledagrrlist.ll. This test seems to take quite a long time with EXPENSIVE_CHECKS. Remove it. 
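Returning to the getRangeFor refactor in the LVI patch above: passing the Value plus an explicit context instruction is the usual generalization of an operand-index API, since any value, not just an operand of I, can now be queried. A runnable toy model with simplified stand-in types (not LLVM's real Value/Instruction classes):

#include <optional>

struct Value {};
struct Instruction { Value *Ops[2] = {nullptr, nullptr}; };
struct Range { int Lo = 0, Hi = 0; };

// New shape: any Value can be queried; CxtI only scopes assume/guard info.
std::optional<Range> getRangeFor(Value *V, Instruction *CxtI) {
  (void)V; (void)CxtI; // a real implementation would consult block values
  return Range{};
}

int main() {
  Instruction I;
  Value A, B;
  I.Ops[0] = &A;
  auto R1 = getRangeFor(I.Ops[0], &I); // the old getRangeForOperand(0, I) case
  auto R2 = getRangeFor(&B, &I);       // not an operand of I: newly expressible
  (void)R1; (void)R2;
  return 0;
}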
--- llvm/test/CodeGen/X86/stress-scheduledagrrlist.ll | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 llvm/test/CodeGen/X86/stress-scheduledagrrlist.ll diff --git a/llvm/test/CodeGen/X86/stress-scheduledagrrlist.ll b/llvm/test/CodeGen/X86/stress-scheduledagrrlist.ll deleted file mode 100644 index a699134a8c7c5..0000000000000 --- a/llvm/test/CodeGen/X86/stress-scheduledagrrlist.ll +++ /dev/null @@ -1,12 +0,0 @@ -; RUN: llc -O0 -mtriple=x86_64-apple-macosx %s -o %t.s - -; Stress test for the list scheduler. The store will be expanded to a very -; large number of stores during isel, stressing ScheduleDAGRRList. It should -; compiles in a reasonable amount of time. Run with -O0, to disable most other -; optimizations. - -define void @test(i1000000* %ptr) { -entry: - store i1000000 0, i1000000* %ptr, align 4 - ret void -} From 1d9b860fb6a85df33fd52fcacc6a5efb421621bd Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Sat, 25 Jul 2020 08:27:21 -0700 Subject: [PATCH 0088/1035] Unify the return value of GetByteSize to an llvm::Optional (NFC-ish) This cleanup patch unifies all methods called GetByteSize() in the ValueObject hierarchy to return an optional, like the methods in CompilerType do. This means fewer magic 0 values, which could fix bugs down the road in languages where types can have a size of zero, such as Swift and C (but not C++). Differential Revision: https://reviews.llvm.org/D84285 --- lldb/include/lldb/Core/ValueObject.h | 2 +- lldb/include/lldb/Core/ValueObjectCast.h | 2 +- lldb/include/lldb/Core/ValueObjectChild.h | 2 +- .../lldb/Core/ValueObjectConstResult.h | 4 ++-- .../lldb/Core/ValueObjectDynamicValue.h | 2 +- lldb/include/lldb/Core/ValueObjectMemory.h | 2 +- lldb/include/lldb/Core/ValueObjectRegister.h | 4 ++-- .../lldb/Core/ValueObjectSyntheticFilter.h | 2 +- lldb/include/lldb/Core/ValueObjectVariable.h | 2 +- .../lldb/Expression/ExpressionVariable.h | 2 +- .../lldb/Target/StackFrameRecognizer.h | 4 +++- lldb/source/API/SBValue.cpp | 2 +- .../Commands/CommandObjectWatchpoint.cpp | 2 +- lldb/source/Core/ValueObject.cpp | 12 +++++----- lldb/source/Core/ValueObjectCast.cpp | 2 +- lldb/source/Core/ValueObjectConstResult.cpp | 10 ++++---- lldb/source/Core/ValueObjectDynamicValue.cpp | 2 +- lldb/source/Core/ValueObjectMemory.cpp | 8 +++---- lldb/source/Core/ValueObjectRegister.cpp | 6 +++-- .../Core/ValueObjectSyntheticFilter.cpp | 4 +++- lldb/source/Core/ValueObjectVariable.cpp | 6 ++--- lldb/source/Expression/ExpressionVariable.cpp | 8 +++---- lldb/source/Expression/Materializer.cpp | 23 +++++++++++-------- lldb/source/Target/StackFrame.cpp | 6 ++--- 24 files changed, 62 insertions(+), 57 deletions(-) diff --git a/lldb/include/lldb/Core/ValueObject.h b/lldb/include/lldb/Core/ValueObject.h index 0080368fd9965..a557d69f3ae30 100644 --- a/lldb/include/lldb/Core/ValueObject.h +++ b/lldb/include/lldb/Core/ValueObject.h @@ -358,7 +358,7 @@ class ValueObject : public UserID { virtual bool CanProvideValue(); // Subclasses must implement the functions below. 
- virtual uint64_t GetByteSize() = 0; + virtual llvm::Optional GetByteSize() = 0; virtual lldb::ValueType GetValueType() const = 0; diff --git a/lldb/include/lldb/Core/ValueObjectCast.h b/lldb/include/lldb/Core/ValueObjectCast.h index d91ca6a92be8d..342803f8ca63a 100644 --- a/lldb/include/lldb/Core/ValueObjectCast.h +++ b/lldb/include/lldb/Core/ValueObjectCast.h @@ -30,7 +30,7 @@ class ValueObjectCast : public ValueObject { ConstString name, const CompilerType &cast_type); - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; size_t CalculateNumChildren(uint32_t max) override; diff --git a/lldb/include/lldb/Core/ValueObjectChild.h b/lldb/include/lldb/Core/ValueObjectChild.h index c6f44a29b0591..9a9fd9294261a 100644 --- a/lldb/include/lldb/Core/ValueObjectChild.h +++ b/lldb/include/lldb/Core/ValueObjectChild.h @@ -30,7 +30,7 @@ class ValueObjectChild : public ValueObject { public: ~ValueObjectChild() override; - uint64_t GetByteSize() override { return m_byte_size; } + llvm::Optional GetByteSize() override { return m_byte_size; } lldb::offset_t GetByteOffset() override { return m_byte_offset; } diff --git a/lldb/include/lldb/Core/ValueObjectConstResult.h b/lldb/include/lldb/Core/ValueObjectConstResult.h index 0e868c687e931..8d823baa0b7b4 100644 --- a/lldb/include/lldb/Core/ValueObjectConstResult.h +++ b/lldb/include/lldb/Core/ValueObjectConstResult.h @@ -62,7 +62,7 @@ class ValueObjectConstResult : public ValueObject { static lldb::ValueObjectSP Create(ExecutionContextScope *exe_scope, const Status &error); - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; lldb::ValueType GetValueType() const override; @@ -113,7 +113,7 @@ class ValueObjectConstResult : public ValueObject { CompilerType GetCompilerTypeImpl() override; ConstString m_type_name; - uint64_t m_byte_size; + llvm::Optional m_byte_size; ValueObjectConstResultImpl m_impl; diff --git a/lldb/include/lldb/Core/ValueObjectDynamicValue.h b/lldb/include/lldb/Core/ValueObjectDynamicValue.h index 9f5304b55e934..2806857339efb 100644 --- a/lldb/include/lldb/Core/ValueObjectDynamicValue.h +++ b/lldb/include/lldb/Core/ValueObjectDynamicValue.h @@ -34,7 +34,7 @@ class ValueObjectDynamicValue : public ValueObject { public: ~ValueObjectDynamicValue() override; - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; ConstString GetTypeName() override; diff --git a/lldb/include/lldb/Core/ValueObjectMemory.h b/lldb/include/lldb/Core/ValueObjectMemory.h index d1cd6ae41445d..b5d5e6ecf4c0e 100644 --- a/lldb/include/lldb/Core/ValueObjectMemory.h +++ b/lldb/include/lldb/Core/ValueObjectMemory.h @@ -40,7 +40,7 @@ class ValueObjectMemory : public ValueObject { const Address &address, const CompilerType &ast_type); - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; ConstString GetTypeName() override; diff --git a/lldb/include/lldb/Core/ValueObjectRegister.h b/lldb/include/lldb/Core/ValueObjectRegister.h index 41051d93b707e..3968584ad5185 100644 --- a/lldb/include/lldb/Core/ValueObjectRegister.h +++ b/lldb/include/lldb/Core/ValueObjectRegister.h @@ -36,7 +36,7 @@ class ValueObjectRegisterSet : public ValueObject { lldb::RegisterContextSP ®_ctx_sp, uint32_t set_idx); - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; lldb::ValueType GetValueType() const override { return lldb::eValueTypeRegisterSet; @@ -86,7 +86,7 @@ class ValueObjectRegister : public ValueObject { lldb::RegisterContextSP ®_ctx_sp, uint32_t reg_num); - uint64_t GetByteSize() 
override; + llvm::Optional GetByteSize() override; lldb::ValueType GetValueType() const override { return lldb::eValueTypeRegister; diff --git a/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h b/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h index cb471657aec9b..41c461ce13f0d 100644 --- a/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h +++ b/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h @@ -36,7 +36,7 @@ class ValueObjectSynthetic : public ValueObject { public: ~ValueObjectSynthetic() override; - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; ConstString GetTypeName() override; diff --git a/lldb/include/lldb/Core/ValueObjectVariable.h b/lldb/include/lldb/Core/ValueObjectVariable.h index b7e262574a14d..23fdedbf5a4a6 100644 --- a/lldb/include/lldb/Core/ValueObjectVariable.h +++ b/lldb/include/lldb/Core/ValueObjectVariable.h @@ -37,7 +37,7 @@ class ValueObjectVariable : public ValueObject { static lldb::ValueObjectSP Create(ExecutionContextScope *exe_scope, const lldb::VariableSP &var_sp); - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; ConstString GetTypeName() override; diff --git a/lldb/include/lldb/Expression/ExpressionVariable.h b/lldb/include/lldb/Expression/ExpressionVariable.h index 60062d212badf..4259e6395da47 100644 --- a/lldb/include/lldb/Expression/ExpressionVariable.h +++ b/lldb/include/lldb/Expression/ExpressionVariable.h @@ -32,7 +32,7 @@ class ExpressionVariable virtual ~ExpressionVariable(); - size_t GetByteSize() { return m_frozen_sp->GetByteSize(); } + llvm::Optional GetByteSize() { return m_frozen_sp->GetByteSize(); } ConstString GetName() { return m_frozen_sp->GetName(); } diff --git a/lldb/include/lldb/Target/StackFrameRecognizer.h b/lldb/include/lldb/Target/StackFrameRecognizer.h index 302b56bec907b..baffc890bb065 100644 --- a/lldb/include/lldb/Target/StackFrameRecognizer.h +++ b/lldb/include/lldb/Target/StackFrameRecognizer.h @@ -154,7 +154,9 @@ class ValueObjectRecognizerSynthesizedValue : public ValueObject { SetName(parent.GetName()); } - uint64_t GetByteSize() override { return m_parent->GetByteSize(); } + llvm::Optional GetByteSize() override { + return m_parent->GetByteSize(); + } lldb::ValueType GetValueType() const override { return m_type; } bool UpdateValue() override { if (!m_parent->UpdateValueIfNeeded()) return false; diff --git a/lldb/source/API/SBValue.cpp b/lldb/source/API/SBValue.cpp index 7485b0ee1838e..686d1f23a75a8 100644 --- a/lldb/source/API/SBValue.cpp +++ b/lldb/source/API/SBValue.cpp @@ -333,7 +333,7 @@ size_t SBValue::GetByteSize() { ValueLocker locker; lldb::ValueObjectSP value_sp(GetSP(locker)); if (value_sp) { - result = value_sp->GetByteSize(); + result = value_sp->GetByteSize().getValueOr(0); } return result; diff --git a/lldb/source/Commands/CommandObjectWatchpoint.cpp b/lldb/source/Commands/CommandObjectWatchpoint.cpp index ce4662930a7c2..c2a008af79d6f 100644 --- a/lldb/source/Commands/CommandObjectWatchpoint.cpp +++ b/lldb/source/Commands/CommandObjectWatchpoint.cpp @@ -905,7 +905,7 @@ corresponding to the byte size of the data type."); // We're in business. // Find out the size of this variable. size = m_option_watchpoint.watch_size == 0 - ? valobj_sp->GetByteSize() + ? 
valobj_sp->GetByteSize().getValueOr(0) : m_option_watchpoint.watch_size; } compiler_type = valobj_sp->GetCompilerType(); diff --git a/lldb/source/Core/ValueObject.cpp b/lldb/source/Core/ValueObject.cpp index 3a775b07e5e1f..aedefd0cf0fd9 100644 --- a/lldb/source/Core/ValueObject.cpp +++ b/lldb/source/Core/ValueObject.cpp @@ -849,7 +849,7 @@ bool ValueObject::SetData(DataExtractor &data, Status &error) { uint64_t count = 0; const Encoding encoding = GetCompilerType().GetEncoding(count); - const size_t byte_size = GetByteSize(); + const size_t byte_size = GetByteSize().getValueOr(0); Value::ValueType value_type = m_value.GetValueType(); @@ -1524,7 +1524,7 @@ bool ValueObject::SetValueFromCString(const char *value_str, Status &error) { uint64_t count = 0; const Encoding encoding = GetCompilerType().GetEncoding(count); - const size_t byte_size = GetByteSize(); + const size_t byte_size = GetByteSize().getValueOr(0); Value::ValueType value_type = m_value.GetValueType(); @@ -1741,13 +1741,13 @@ ValueObjectSP ValueObject::GetSyntheticBitFieldChild(uint32_t from, uint32_t to, uint32_t bit_field_offset = from; if (GetDataExtractor().GetByteOrder() == eByteOrderBig) bit_field_offset = - GetByteSize() * 8 - bit_field_size - bit_field_offset; + GetByteSize().getValueOr(0) * 8 - bit_field_size - bit_field_offset; // We haven't made a synthetic array member for INDEX yet, so lets make // one and cache it for any future reference. ValueObjectChild *synthetic_child = new ValueObjectChild( - *this, GetCompilerType(), index_const_str, GetByteSize(), 0, - bit_field_size, bit_field_offset, false, false, eAddressTypeInvalid, - 0); + *this, GetCompilerType(), index_const_str, + GetByteSize().getValueOr(0), 0, bit_field_size, bit_field_offset, + false, false, eAddressTypeInvalid, 0); // Cache the value if we got one back... if (synthetic_child) { diff --git a/lldb/source/Core/ValueObjectCast.cpp b/lldb/source/Core/ValueObjectCast.cpp index 22e856be539b5..7b6d3591faf44 100644 --- a/lldb/source/Core/ValueObjectCast.cpp +++ b/lldb/source/Core/ValueObjectCast.cpp @@ -47,7 +47,7 @@ size_t ValueObjectCast::CalculateNumChildren(uint32_t max) { return children_count <= max ? 
children_count : max; } -uint64_t ValueObjectCast::GetByteSize() { +llvm::Optional ValueObjectCast::GetByteSize() { ExecutionContext exe_ctx(GetExecutionContextRef()); return m_value.GetValueByteSize(nullptr, &exe_ctx); } diff --git a/lldb/source/Core/ValueObjectConstResult.cpp b/lldb/source/Core/ValueObjectConstResult.cpp index 8d84f8e62ccc5..fd31ddc676b43 100644 --- a/lldb/source/Core/ValueObjectConstResult.cpp +++ b/lldb/source/Core/ValueObjectConstResult.cpp @@ -179,8 +179,7 @@ ValueObjectSP ValueObjectConstResult::Create(ExecutionContextScope *exe_scope, ValueObjectConstResult::ValueObjectConstResult(ExecutionContextScope *exe_scope, ValueObjectManager &manager, const Status &error) - : ValueObject(exe_scope, manager), m_type_name(), m_byte_size(0), - m_impl(this) { + : ValueObject(exe_scope, manager), m_impl(this) { m_error = error; SetIsConstant(); } @@ -189,8 +188,7 @@ ValueObjectConstResult::ValueObjectConstResult(ExecutionContextScope *exe_scope, ValueObjectManager &manager, const Value &value, ConstString name, Module *module) - : ValueObject(exe_scope, manager), m_type_name(), m_byte_size(0), - m_impl(this) { + : ValueObject(exe_scope, manager), m_impl(this) { m_value = value; m_name = name; ExecutionContext exe_ctx; @@ -208,9 +206,9 @@ lldb::ValueType ValueObjectConstResult::GetValueType() const { return eValueTypeConstResult; } -uint64_t ValueObjectConstResult::GetByteSize() { +llvm::Optional ValueObjectConstResult::GetByteSize() { ExecutionContext exe_ctx(GetExecutionContextRef()); - if (m_byte_size == 0) { + if (!m_byte_size) { if (auto size = GetCompilerType().GetByteSize(exe_ctx.GetBestExecutionContextScope())) SetByteSize(*size); diff --git a/lldb/source/Core/ValueObjectDynamicValue.cpp b/lldb/source/Core/ValueObjectDynamicValue.cpp index ca66740cb55d4..1c25b8c85a059 100644 --- a/lldb/source/Core/ValueObjectDynamicValue.cpp +++ b/lldb/source/Core/ValueObjectDynamicValue.cpp @@ -98,7 +98,7 @@ size_t ValueObjectDynamicValue::CalculateNumChildren(uint32_t max) { return m_parent->GetNumChildren(max); } -uint64_t ValueObjectDynamicValue::GetByteSize() { +llvm::Optional ValueObjectDynamicValue::GetByteSize() { const bool success = UpdateValueIfNeeded(false); if (success && m_dynamic_type_info.HasType()) { ExecutionContext exe_ctx(GetExecutionContextRef()); diff --git a/lldb/source/Core/ValueObjectMemory.cpp b/lldb/source/Core/ValueObjectMemory.cpp index 8e7d3ebc93f69..17fade9e5fdc3 100644 --- a/lldb/source/Core/ValueObjectMemory.cpp +++ b/lldb/source/Core/ValueObjectMemory.cpp @@ -139,13 +139,11 @@ size_t ValueObjectMemory::CalculateNumChildren(uint32_t max) { return child_count <= max ? 
child_count : max; } -uint64_t ValueObjectMemory::GetByteSize() { +llvm::Optional ValueObjectMemory::GetByteSize() { ExecutionContext exe_ctx(GetExecutionContextRef()); if (m_type_sp) - return m_type_sp->GetByteSize(exe_ctx.GetBestExecutionContextScope()) - .getValueOr(0); - return m_compiler_type.GetByteSize(exe_ctx.GetBestExecutionContextScope()) - .getValueOr(0); + return m_type_sp->GetByteSize(exe_ctx.GetBestExecutionContextScope()); + return m_compiler_type.GetByteSize(exe_ctx.GetBestExecutionContextScope()); } lldb::ValueType ValueObjectMemory::GetValueType() const { diff --git a/lldb/source/Core/ValueObjectRegister.cpp b/lldb/source/Core/ValueObjectRegister.cpp index ec87c38fb3679..27461e9cebc41 100644 --- a/lldb/source/Core/ValueObjectRegister.cpp +++ b/lldb/source/Core/ValueObjectRegister.cpp @@ -81,7 +81,7 @@ size_t ValueObjectRegisterSet::CalculateNumChildren(uint32_t max) { return 0; } -uint64_t ValueObjectRegisterSet::GetByteSize() { return 0; } +llvm::Optional ValueObjectRegisterSet::GetByteSize() { return 0; } bool ValueObjectRegisterSet::UpdateValue() { m_error.Clear(); @@ -229,7 +229,9 @@ size_t ValueObjectRegister::CalculateNumChildren(uint32_t max) { return children_count <= max ? children_count : max; } -uint64_t ValueObjectRegister::GetByteSize() { return m_reg_info.byte_size; } +llvm::Optional ValueObjectRegister::GetByteSize() { + return m_reg_info.byte_size; +} bool ValueObjectRegister::UpdateValue() { m_error.Clear(); diff --git a/lldb/source/Core/ValueObjectSyntheticFilter.cpp b/lldb/source/Core/ValueObjectSyntheticFilter.cpp index 32d1e6ab8368c..fb2d32e602cea 100644 --- a/lldb/source/Core/ValueObjectSyntheticFilter.cpp +++ b/lldb/source/Core/ValueObjectSyntheticFilter.cpp @@ -121,7 +121,9 @@ bool ValueObjectSynthetic::MightHaveChildren() { return (m_might_have_children != eLazyBoolNo); } -uint64_t ValueObjectSynthetic::GetByteSize() { return m_parent->GetByteSize(); } +llvm::Optional ValueObjectSynthetic::GetByteSize() { + return m_parent->GetByteSize(); +} lldb::ValueType ValueObjectSynthetic::GetValueType() const { return m_parent->GetValueType(); diff --git a/lldb/source/Core/ValueObjectVariable.cpp b/lldb/source/Core/ValueObjectVariable.cpp index 0d1e7b047a0ac..ab67e3038cf0a 100644 --- a/lldb/source/Core/ValueObjectVariable.cpp +++ b/lldb/source/Core/ValueObjectVariable.cpp @@ -105,15 +105,15 @@ size_t ValueObjectVariable::CalculateNumChildren(uint32_t max) { return child_count <= max ? 
child_count : max; } -uint64_t ValueObjectVariable::GetByteSize() { +llvm::Optional ValueObjectVariable::GetByteSize() { ExecutionContext exe_ctx(GetExecutionContextRef()); CompilerType type(GetCompilerType()); if (!type.IsValid()) - return 0; + return {}; - return type.GetByteSize(exe_ctx.GetBestExecutionContextScope()).getValueOr(0); + return type.GetByteSize(exe_ctx.GetBestExecutionContextScope()); } lldb::ValueType ValueObjectVariable::GetValueType() const { diff --git a/lldb/source/Expression/ExpressionVariable.cpp b/lldb/source/Expression/ExpressionVariable.cpp index d95f0745cf4ba..8b3dda7b2fe10 100644 --- a/lldb/source/Expression/ExpressionVariable.cpp +++ b/lldb/source/Expression/ExpressionVariable.cpp @@ -16,10 +16,10 @@ using namespace lldb_private; ExpressionVariable::~ExpressionVariable() {} uint8_t *ExpressionVariable::GetValueBytes() { - const size_t byte_size = m_frozen_sp->GetByteSize(); - if (byte_size > 0) { - if (m_frozen_sp->GetDataExtractor().GetByteSize() < byte_size) { - m_frozen_sp->GetValue().ResizeData(byte_size); + llvm::Optional byte_size = m_frozen_sp->GetByteSize(); + if (byte_size && *byte_size) { + if (m_frozen_sp->GetDataExtractor().GetByteSize() < *byte_size) { + m_frozen_sp->GetValue().ResizeData(*byte_size); m_frozen_sp->GetValue().GetData(m_frozen_sp->GetDataExtractor()); } return const_cast( diff --git a/lldb/source/Expression/Materializer.cpp b/lldb/source/Expression/Materializer.cpp index 6f8d9b154570a..327e15a26266f 100644 --- a/lldb/source/Expression/Materializer.cpp +++ b/lldb/source/Expression/Materializer.cpp @@ -67,7 +67,7 @@ class EntityPersistentVariable : public Materializer::Entity { const bool zero_memory = false; lldb::addr_t mem = map.Malloc( - m_persistent_variable_sp->GetByteSize(), 8, + m_persistent_variable_sp->GetByteSize().getValueOr(0), 8, lldb::ePermissionsReadable | lldb::ePermissionsWritable, IRMemoryMap::eAllocationPolicyMirror, zero_memory, allocate_error); @@ -106,7 +106,8 @@ class EntityPersistentVariable : public Materializer::Entity { Status write_error; map.WriteMemory(mem, m_persistent_variable_sp->GetValueBytes(), - m_persistent_variable_sp->GetByteSize(), write_error); + m_persistent_variable_sp->GetByteSize().getValueOr(0), + write_error); if (!write_error.Success()) { err.SetErrorStringWithFormat( @@ -234,7 +235,7 @@ class EntityPersistentVariable : public Materializer::Entity { map.GetBestExecutionContextScope(), m_persistent_variable_sp.get()->GetCompilerType(), m_persistent_variable_sp->GetName(), location, eAddressTypeLoad, - m_persistent_variable_sp->GetByteSize()); + m_persistent_variable_sp->GetByteSize().getValueOr(0)); if (frame_top != LLDB_INVALID_ADDRESS && frame_bottom != LLDB_INVALID_ADDRESS && location >= frame_bottom && @@ -279,7 +280,8 @@ class EntityPersistentVariable : public Materializer::Entity { LLDB_LOGF(log, "Dematerializing %s from 0x%" PRIx64 " (size = %llu)", m_persistent_variable_sp->GetName().GetCString(), (uint64_t)mem, - (unsigned long long)m_persistent_variable_sp->GetByteSize()); + (unsigned long long)m_persistent_variable_sp->GetByteSize() + .getValueOr(0)); // Read the contents of the spare memory area @@ -288,7 +290,7 @@ class EntityPersistentVariable : public Materializer::Entity { Status read_error; map.ReadMemory(m_persistent_variable_sp->GetValueBytes(), mem, - m_persistent_variable_sp->GetByteSize(), read_error); + m_persistent_variable_sp->GetByteSize().getValueOr(0), read_error); if (!read_error.Success()) { err.SetErrorStringWithFormat( @@ -369,10 +371,11 @@ class 
EntityPersistentVariable : public Materializer::Entity { if (!err.Success()) { dump_stream.Printf(" \n"); } else { - DataBufferHeap data(m_persistent_variable_sp->GetByteSize(), 0); + DataBufferHeap data( + m_persistent_variable_sp->GetByteSize().getValueOr(0), 0); map.ReadMemory(data.GetBytes(), target_address, - m_persistent_variable_sp->GetByteSize(), err); + m_persistent_variable_sp->GetByteSize().getValueOr(0), err); if (!err.Success()) { dump_stream.Printf(" \n"); @@ -621,8 +624,8 @@ class EntityVariable : public Materializer::Entity { Status extract_error; - map.GetMemoryData(data, m_temporary_allocation, valobj_sp->GetByteSize(), - extract_error); + map.GetMemoryData(data, m_temporary_allocation, + valobj_sp->GetByteSize().getValueOr(0), extract_error); if (!extract_error.Success()) { err.SetErrorStringWithFormat("couldn't get the data for variable %s", @@ -919,7 +922,7 @@ class EntityResultVariable : public Materializer::Entity { ret->ValueUpdated(); - const size_t pvar_byte_size = ret->GetByteSize(); + const size_t pvar_byte_size = ret->GetByteSize().getValueOr(0); uint8_t *pvar_data = ret->GetValueBytes(); map.ReadMemory(pvar_data, address, pvar_byte_size, read_error); diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp index 098aed9cd8125..22bca52d7f98a 100644 --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -1408,7 +1408,7 @@ ValueObjectSP GetValueForOffset(StackFrame &frame, ValueObjectSP &parent, } int64_t child_offset = child_sp->GetByteOffset(); - int64_t child_size = child_sp->GetByteSize(); + int64_t child_size = child_sp->GetByteSize().getValueOr(0); if (offset >= child_offset && offset < (child_offset + child_size)) { return GetValueForOffset(frame, child_sp, offset - child_offset); @@ -1441,8 +1441,8 @@ ValueObjectSP GetValueForDereferincingOffset(StackFrame &frame, } if (offset >= 0 && uint64_t(offset) >= pointee->GetByteSize()) { - int64_t index = offset / pointee->GetByteSize(); - offset = offset % pointee->GetByteSize(); + int64_t index = offset / pointee->GetByteSize().getValueOr(1); + offset = offset % pointee->GetByteSize().getValueOr(1); const bool can_create = true; pointee = base->GetSyntheticArrayMember(index, can_create); } From 136c8f50e96381ee9daf0ac3dbe524ba63d51560 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Sat, 25 Jul 2020 21:43:36 +0300 Subject: [PATCH 0089/1035] [Reduce] Try turning function definitions into declarations first, NFCI-ish ReduceFunctions could do it, but it also replaces *all* calls with undef, so if any of undef replacements makes reduction uninteresting, it won't work. ReduceBasicBlocks also could do it, but well, it may take many guesses for all the blocks of a function to happen to be out-of-chunk, which is not a very efficient way to go about it. So let's just do this first. 
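The mechanism, shown in full below, is the chunk-oracle pattern shared by the other delta passes: each function definition asks the oracle whether its chunk is kept, and an unkept body is deleted, turning the definition into a declaration while every call site stays intact. A runnable toy model of that loop, with simplified Oracle and Function types standing in for the real llvm-reduce classes:

#include <cstdio>
#include <vector>

struct Oracle {
  std::vector<bool> Keep; // one decision per candidate, from the chunk list
  unsigned Next = 0;
  bool shouldKeep() { return Keep.at(Next++); }
};

struct Function {
  const char *Name;
  bool HasBody;
  void deleteBody() { HasBody = false; } // definition -> declaration
};

int main() {
  std::vector<Function> Funcs = {{"callee", true}, {"caller", true}};
  Oracle O{{false, true}, 0}; // pretend chunking kept only the second body
  for (Function &F : Funcs)
    if (F.HasBody && !O.shouldKeep())
      F.deleteBody(); // calls to F are untouched; only the body goes
  for (const Function &F : Funcs)
    std::printf("%s: %s\n", F.Name, F.HasBody ? "define" : "declare");
  return 0;
}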
--- llvm/test/Reduce/remove-function-bodies.ll | 17 ++++++ llvm/tools/llvm-reduce/CMakeLists.txt | 1 + llvm/tools/llvm-reduce/DeltaManager.h | 2 + .../deltas/ReduceFunctionBodies.cpp | 54 +++++++++++++++++++ .../llvm-reduce/deltas/ReduceFunctionBodies.h | 18 +++++++ 5 files changed, 92 insertions(+) create mode 100644 llvm/test/Reduce/remove-function-bodies.ll create mode 100644 llvm/tools/llvm-reduce/deltas/ReduceFunctionBodies.cpp create mode 100644 llvm/tools/llvm-reduce/deltas/ReduceFunctionBodies.h diff --git a/llvm/test/Reduce/remove-function-bodies.ll b/llvm/test/Reduce/remove-function-bodies.ll new file mode 100644 index 0000000000000..c70df0e38f3dd --- /dev/null +++ b/llvm/test/Reduce/remove-function-bodies.ll @@ -0,0 +1,17 @@ +; RUN: llvm-reduce --test FileCheck --test-arg --check-prefixes=CHECK-ALL,CHECK-INTERESTINGNESS --test-arg %s --test-arg --input-file %s -o %t +; RUN: cat %t | FileCheck --check-prefixes=CHECK-ALL,CHECK-FINAL %s + +; CHECK-INTERESTINGNESS: @callee( +; CHECK-FINAL: declare void @callee() +define void @callee() { + ret void +} + +; CHECK-ALL: define void @caller() +define void @caller() { +entry: +; CHECK-ALL: call void @callee() +; CHECK-ALL: ret void + call void @callee() + ret void +} diff --git a/llvm/tools/llvm-reduce/CMakeLists.txt b/llvm/tools/llvm-reduce/CMakeLists.txt index 01b9d0b4afe1a..81b4e95eece59 100644 --- a/llvm/tools/llvm-reduce/CMakeLists.txt +++ b/llvm/tools/llvm-reduce/CMakeLists.txt @@ -16,6 +16,7 @@ add_llvm_tool(llvm-reduce deltas/ReduceArguments.cpp deltas/ReduceAttributes.cpp deltas/ReduceBasicBlocks.cpp + deltas/ReduceFunctionBodies.cpp deltas/ReduceFunctions.cpp deltas/ReduceGlobalVars.cpp deltas/ReduceInstructions.cpp diff --git a/llvm/tools/llvm-reduce/DeltaManager.h b/llvm/tools/llvm-reduce/DeltaManager.h index b1a4ee0df4dbe..83278c88791b0 100644 --- a/llvm/tools/llvm-reduce/DeltaManager.h +++ b/llvm/tools/llvm-reduce/DeltaManager.h @@ -16,6 +16,7 @@ #include "deltas/ReduceArguments.h" #include "deltas/ReduceAttributes.h" #include "deltas/ReduceBasicBlocks.h" +#include "deltas/ReduceFunctionBodies.h" #include "deltas/ReduceFunctions.h" #include "deltas/ReduceGlobalVars.h" #include "deltas/ReduceInstructions.h" @@ -26,6 +27,7 @@ namespace llvm { // TODO: Add CLI option to run only specified Passes (for unit tests) inline void runDeltaPasses(TestRunner &Tester) { + reduceFunctionBodiesDeltaPass(Tester); reduceFunctionsDeltaPass(Tester); reduceBasicBlocksDeltaPass(Tester); reduceGlobalsDeltaPass(Tester); diff --git a/llvm/tools/llvm-reduce/deltas/ReduceFunctionBodies.cpp b/llvm/tools/llvm-reduce/deltas/ReduceFunctionBodies.cpp new file mode 100644 index 0000000000000..a047d42b50c56 --- /dev/null +++ b/llvm/tools/llvm-reduce/deltas/ReduceFunctionBodies.cpp @@ -0,0 +1,54 @@ +//===- ReduceFunctions.cpp - Specialized Delta Pass -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a function which calls the Generic Delta pass in order +// to reduce function bodies in the provided Module. 
+// +//===----------------------------------------------------------------------===// + +#include "ReduceFunctionBodies.h" +#include "Delta.h" + +using namespace llvm; + +/// Removes all the bodies of defined functions that aren't inside any of the +/// desired Chunks. +static void +extractFunctionBodiesFromModule(const std::vector &ChunksToKeep, + Module *Program) { + Oracle O(ChunksToKeep); + + // Delete out-of-chunk function bodies + std::vector FuncDefsToReduce; + for (auto &F : *Program) + if (!F.isDeclaration() && !O.shouldKeep()) + F.deleteBody(); +} + +/// Counts the amount of non-declaration functions and prints their +/// respective name & index +static int countFunctionDefinitions(Module *Program) { + // TODO: Silence index with --quiet flag + errs() << "----------------------------\n"; + errs() << "Function Definition Index Reference:\n"; + int FunctionDefinitionCount = 0; + for (auto &F : *Program) + if (!F.isDeclaration()) + errs() << "\t" << ++FunctionDefinitionCount << ": " << F.getName() + << "\n"; + + errs() << "----------------------------\n"; + return FunctionDefinitionCount; +} + +void llvm::reduceFunctionBodiesDeltaPass(TestRunner &Test) { + errs() << "*** Reducing Function Bodies...\n"; + int Functions = countFunctionDefinitions(Test.getProgram()); + runDeltaPass(Test, Functions, extractFunctionBodiesFromModule); + errs() << "----------------------------\n"; +} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceFunctionBodies.h b/llvm/tools/llvm-reduce/deltas/ReduceFunctionBodies.h new file mode 100644 index 0000000000000..8c06c2e4a1a93 --- /dev/null +++ b/llvm/tools/llvm-reduce/deltas/ReduceFunctionBodies.h @@ -0,0 +1,18 @@ +//===- ReduceFunctionBodies.h - Specialized Delta Pass --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a function which calls the Generic Delta pass in order +// to reduce function bodies in the provided Module. +// +//===----------------------------------------------------------------------===// + +#include "Delta.h" + +namespace llvm { +void reduceFunctionBodiesDeltaPass(TestRunner &Test); +} // namespace llvm From 48c3228c5cf7ba4605180b9d6b81fa6e575d964d Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Sat, 25 Jul 2020 18:51:58 +0000 Subject: [PATCH 0090/1035] [gn build] Port 136c8f50e96 --- llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn index a8648d73ca0d1..14c38d87fe5f8 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn @@ -14,6 +14,7 @@ executable("llvm-reduce") { "deltas/ReduceArguments.cpp", "deltas/ReduceAttributes.cpp", "deltas/ReduceBasicBlocks.cpp", + "deltas/ReduceFunctionBodies.cpp", "deltas/ReduceFunctions.cpp", "deltas/ReduceGlobalVars.cpp", "deltas/ReduceInstructions.cpp", From 6a75496836ea14bcfd2f4b59d35a1cad4ac58cee Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jul 2020 12:33:18 -0700 Subject: [PATCH 0091/1035] [Driver] Define LinkOption and fix forwarded options to GCC for linking Many driver options are neither 'DriverOption' nor 'LinkerInput'. 
When gcc is used for linking, these options get forwarded even if they don't have anything to do with linking. Among these options, clang-specific ones can cause gcc to error. Just use 'OPT_Link_Group' and a new flag 'LinkOption' for options which already have a group. gfortran support apparently bit rots (which does not seem to make much sense). XFAIL the test. --- clang/include/clang/Driver/Options.h | 3 ++- clang/include/clang/Driver/Options.td | 24 ++++++++++++++---------- clang/lib/Driver/ToolChains/Gnu.cpp | 22 +--------------------- clang/test/Driver/gcc_forward.c | 17 ++++++++--------- clang/test/Driver/gfortran.f90 | 1 + 5 files changed, 26 insertions(+), 41 deletions(-) diff --git a/clang/include/clang/Driver/Options.h b/clang/include/clang/Driver/Options.h index 7c5cddd9e8960..9831efda4e580 100644 --- a/clang/include/clang/Driver/Options.h +++ b/clang/include/clang/Driver/Options.h @@ -33,7 +33,8 @@ enum ClangFlags { CC1Option = (1 << 10), CC1AsOption = (1 << 11), NoDriverOption = (1 << 12), - Ignored = (1 << 13) + LinkOption = (1 << 13), + Ignored = (1 << 14), }; enum ID { diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 700a5c4578f68..b6e31700c0a62 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -52,6 +52,10 @@ def CC1AsOption : OptionFlag; // NoDriverOption - This option should not be accepted by the driver. def NoDriverOption : OptionFlag; +// If an option affects linking, but has a primary group (so Link_Group cannot +// be used), add this flag. +def LinkOption : OptionFlag; + // A short name to show in documentation. The name will be interpreted as rST. class DocName { string DocName = name; } @@ -573,7 +577,7 @@ def config_system_dir_EQ : Joined<["--"], "config-system-dir=">, Flags<[DriverOp HelpText<"System directory for configuration files">; def config_user_dir_EQ : Joined<["--"], "config-user-dir=">, Flags<[DriverOption, HelpHidden]>, HelpText<"User directory for configuration files">; -def coverage : Flag<["-", "--"], "coverage">, Flags<[CoreOption]>; +def coverage : Flag<["-", "--"], "coverage">, Group, Flags<[CoreOption]>; def cpp_precomp : Flag<["-"], "cpp-precomp">, Group; def current__version : JoinedOrSeparate<["-"], "current_version">; def cxx_isystem : JoinedOrSeparate<["-"], "cxx-isystem">, Group, @@ -1747,7 +1751,7 @@ def fpass_plugin_EQ : Joined<["-"], "fpass-plugin=">, HelpText<"Load pass plugin from a dynamic shared object file (only with new pass manager).">; defm preserve_as_comments : OptOutFFlag<"preserve-as-comments", "", "Do not preserve comments in inline assembly">; -def fprofile_arcs : Flag<["-"], "fprofile-arcs">, Group; +def fprofile_arcs : Flag<["-"], "fprofile-arcs">, Group, Flags<[LinkOption]>; def fno_profile_arcs : Flag<["-"], "fno-profile-arcs">, Group; def framework : Separate<["-"], "framework">, Flags<[LinkerInput]>; def frandom_seed_EQ : Joined<["-"], "frandom-seed=">, Group; @@ -2724,7 +2728,7 @@ def nostdinc : Flag<["-"], "nostdinc">, Flags<[CoreOption]>; def nostdlibinc : Flag<["-"], "nostdlibinc">; def nostdincxx : Flag<["-"], "nostdinc++">, Flags<[CC1Option]>, HelpText<"Disable standard #include directories for the C++ standard library">; -def nostdlib : Flag<["-"], "nostdlib">; +def nostdlib : Flag<["-"], "nostdlib">, Group; def nostdlibxx : Flag<["-"], "nostdlib++">; def object : Flag<["-"], "object">; def o : JoinedOrSeparate<["-"], "o">, Flags<[DriverOption, RenderAsInput, CC1Option, CC1AsOption]>, @@ -2768,15 +2772,15 @@ def 
pthread : Flag<["-"], "pthread">, Flags<[CC1Option]>, HelpText<"Support POSIX threads in generated code">; def no_pthread : Flag<["-"], "no-pthread">, Flags<[CC1Option]>; def p : Flag<["-"], "p">; -def pie : Flag<["-"], "pie">; -def static_pie : Flag<["-"], "static-pie">; +def pie : Flag<["-"], "pie">, Group; +def static_pie : Flag<["-"], "static-pie">, Group; def read__only__relocs : Separate<["-"], "read_only_relocs">; def remap : Flag<["-"], "remap">; def rewrite_objc : Flag<["-"], "rewrite-objc">, Flags<[DriverOption,CC1Option]>, HelpText<"Rewrite Objective-C source to C++">, Group; def rewrite_legacy_objc : Flag<["-"], "rewrite-legacy-objc">, Flags<[DriverOption]>, HelpText<"Rewrite Legacy Objective-C source to C++">; -def rdynamic : Flag<["-"], "rdynamic">; +def rdynamic : Flag<["-"], "rdynamic">, Group; def resource_dir : Separate<["-"], "resource-dir">, Flags<[DriverOption, CC1Option, CoreOption, HelpHidden]>, HelpText<"The directory which holds the compiler resource files">; @@ -2818,13 +2822,13 @@ def segs__read__only__addr : Separate<["-"], "segs_read_only_addr">; def segs__read__write__addr : Separate<["-"], "segs_read_write_addr">; def segs__read__ : Joined<["-"], "segs_read_">; def shared_libgcc : Flag<["-"], "shared-libgcc">; -def shared : Flag<["-", "--"], "shared">; +def shared : Flag<["-", "--"], "shared">, Group; def single__module : Flag<["-"], "single_module">; def specs_EQ : Joined<["-", "--"], "specs=">; def specs : Separate<["-", "--"], "specs">, Flags<[Unsupported]>; def static_libgcc : Flag<["-"], "static-libgcc">; def static_libstdcxx : Flag<["-"], "static-libstdc++">; -def static : Flag<["-", "--"], "static">, Flags<[NoArgumentUnused]>; +def static : Flag<["-", "--"], "static">, Group, Flags<[NoArgumentUnused]>; def std_default_EQ : Joined<["-"], "std-default=">; def std_EQ : Joined<["-", "--"], "std=">, Flags<[CC1Option]>, Group, HelpText<"Language standard to compile for">, @@ -3283,8 +3287,8 @@ defm : BooleanFFlag<"keep-inline-functions">, Group, Group; def falign_labels_EQ : Joined<["-"], "falign-labels=">, Group; diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 9ea07e44e37ec..1806c14c395dc 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -38,10 +38,7 @@ using tools::addMultilibFlag; using tools::addPathIfExists; static bool forwardToGCC(const Option &O) { - // Don't forward inputs from the original command line. They are added from - // InputInfoList. - return O.getKind() != Option::InputClass && - !O.hasFlag(options::DriverOption) && !O.hasFlag(options::LinkerInput); + return O.matches(options::OPT_Link_Group) || O.hasFlag(options::LinkOption); } // Switch CPU names not recognized by GNU assembler to a close CPU that it does @@ -76,23 +73,6 @@ void tools::gcc::Common::ConstructJob(Compilation &C, const JobAction &JA, // to get to the assembler. A->claim(); - // Don't forward any -g arguments to assembly steps. - if (isa(JA) && - A->getOption().matches(options::OPT_g_Group)) - continue; - - // Don't forward any -W arguments to assembly and link steps. - if ((isa(JA) || isa(JA)) && - A->getOption().matches(options::OPT_W_Group)) - continue; - - // Don't forward -mno-unaligned-access since GCC doesn't understand - // it and because it doesn't affect the assembly or link steps. 
- if ((isa(JA) || isa(JA)) && - (A->getOption().matches(options::OPT_munaligned_access) || - A->getOption().matches(options::OPT_mno_unaligned_access))) - continue; - A->render(Args, CmdArgs); } } diff --git a/clang/test/Driver/gcc_forward.c b/clang/test/Driver/gcc_forward.c index f75b1c738b05c..9579d0d60d1cf 100644 --- a/clang/test/Driver/gcc_forward.c +++ b/clang/test/Driver/gcc_forward.c @@ -1,3 +1,8 @@ +// RUN: %clang -### %s -target aarch64-none-elf \ +// RUN: --coverage -fuse-ld=lld --ld-path=ld -nostdlib -r -rdynamic -static -static-pie \ +// RUN: 2>&1 | FileCheck --check-prefix=FORWARD %s +// FORWARD: gcc{{[^"]*}}" "--coverage" "-fuse-ld=lld" "--ld-path=ld" "-nostdlib" "-r" "-rdynamic" "-static" "-static-pie" + // Check that we don't try to forward -Xclang or -mlinker-version to GCC. // PR12920 -- Check also we may not forward W_Group options to GCC. // @@ -5,7 +10,7 @@ // RUN: %s \ // RUN: -Wall -Wdocumentation \ // RUN: -Xclang foo-bar \ -// RUN: -march=x86-64 \ +// RUN: -pie -march=x86-64 \ // RUN: -mlinker-version=10 -### 2> %t // RUN: FileCheck < %t %s // @@ -15,13 +20,13 @@ // CHECK: "-o" "{{[^"]+}}.o" // // gcc as ld. -// CHECK: gcc{{[^"]*}}" +// CHECK: gcc{{[^"]*}}" "-pie" // CHECK-NOT: "-mlinker-version=10" // CHECK-NOT: "-Xclang" // CHECK-NOT: "foo-bar" // CHECK-NOT: "-Wall" // CHECK-NOT: "-Wdocumentation" -// CHECK: -march +// CHECK-NOT: -march // CHECK-NOT: "-mlinker-version=10" // CHECK-NOT: "-Xclang" // CHECK-NOT: "foo-bar" @@ -34,9 +39,3 @@ // RUN: | FileCheck --check-prefix=CHECK-ASM %s // CHECK-ASM: as // CHECK-ASM-NOT: "-g" - -// Check that we're not forwarding -mno-unaligned-access. -// RUN: %clang -target aarch64-none-elf -mno-unaligned-access %s -### 2>&1 \ -// RUN: | FileCheck --check-prefix=CHECK-ARM %s -// CHECK-ARM: gcc{{[^"]*}}" -// CHECK-ARM-NOT: -mno-unaligned-access diff --git a/clang/test/Driver/gfortran.f90 b/clang/test/Driver/gfortran.f90 index d2f90b47a3902..6f972cc333ae0 100644 --- a/clang/test/Driver/gfortran.f90 +++ b/clang/test/Driver/gfortran.f90 @@ -1,3 +1,4 @@ +! XFAIL: * ! Test that Clang can forward all of the flags which are documented as ! being supported by gfortran to GCC when falling back to GCC for ! a fortran input file. From 60a5799e6e8b1206640a2321eab60dcf1b65975d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 25 Jul 2020 12:12:16 -0700 Subject: [PATCH 0092/1035] [X86] Fix intrinsic names in strict fp80 tests to use f80 in their names instead of x86_fp80. The type is called x86_fp80, but when it is printed in the intrinsic name it should be f80. The parser doesn't seem to care that the name was wrong. 
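The suffix matters because overloaded intrinsic names are mangled from their overload types, and the mangled form of x86_fp80 is f80 (its MVT name), not the IR spelling. A short sketch that asks the C++ API for the canonical name, assuming the Intrinsic::getName(ID, ArrayRef<Type *>) overload available in this era of the tree:

#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <iostream>
#include <string>

int main() {
  llvm::LLVMContext Ctx;
  // Only the floating-point operand type is overloaded for the constrained
  // compares, so exactly one suffix is appended to the base name.
  std::string Name = llvm::Intrinsic::getName(
      llvm::Intrinsic::experimental_constrained_fcmp,
      {llvm::Type::getX86_FP80Ty(Ctx)});
  std::cout << Name << "\n"; // llvm.experimental.constrained.fcmp.f80
  return 0;
}

As the commit message notes, the IR parser accepted the misspelled names anyway, which is why the tests still passed; the rename only brings them in line with what the mangler would produce.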
--- .../CodeGen/X86/fp80-strict-scalar-cmp.ll | 60 ++++----- llvm/test/CodeGen/X86/fp80-strict-scalar.ll | 116 +++++++++--------- 2 files changed, 88 insertions(+), 88 deletions(-) diff --git a/llvm/test/CodeGen/X86/fp80-strict-scalar-cmp.ll b/llvm/test/CodeGen/X86/fp80-strict-scalar-cmp.ll index 1e38b6744f3c0..a3951f826fe09 100644 --- a/llvm/test/CodeGen/X86/fp80-strict-scalar-cmp.ll +++ b/llvm/test/CodeGen/X86/fp80-strict-scalar-cmp.ll @@ -33,7 +33,7 @@ define i32 @test_oeq_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: cmovnel %esi, %eax ; X87-64-NEXT: cmovpl %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmp.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"oeq", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -70,7 +70,7 @@ define i32 @test_ogt_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmovbel %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmp.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"ogt", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -107,7 +107,7 @@ define i32 @test_oge_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmovbl %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmp.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"oge", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -144,7 +144,7 @@ define i32 @test_olt_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmovbel %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmp.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"olt", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -181,7 +181,7 @@ define i32 @test_ole_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmovbl %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmp.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"ole", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -218,7 +218,7 @@ define i32 @test_one_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmovel %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmp.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"one", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -255,7 +255,7 @@ define i32 @test_ord_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmovpl %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmp.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"ord", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -292,7 +292,7 @@ define i32 @test_ueq_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmovnel %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80( + %cond = call i1 
@llvm.experimental.constrained.fcmp.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"ueq", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -329,7 +329,7 @@ define i32 @test_ugt_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmovael %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmp.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"ugt", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -366,7 +366,7 @@ define i32 @test_uge_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmoval %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmp.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"uge", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -403,7 +403,7 @@ define i32 @test_ult_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmovael %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmp.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"ult", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -440,7 +440,7 @@ define i32 @test_ule_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmoval %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmp.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"ule", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -478,7 +478,7 @@ define i32 @test_une_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: cmovnel %edi, %eax ; X87-64-NEXT: cmovpl %edi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmp.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"une", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -515,7 +515,7 @@ define i32 @test_uno_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmovnpl %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmp.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"uno", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -553,7 +553,7 @@ define i32 @test_oeq_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: cmovnel %esi, %eax ; X87-64-NEXT: cmovpl %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmps.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"oeq", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -590,7 +590,7 @@ define i32 @test_ogt_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmovbel %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmps.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"ogt", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -627,7 +627,7 @@ define i32 @test_oge_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: 
cmovbl %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmps.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"oge", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -664,7 +664,7 @@ define i32 @test_olt_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmovbel %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmps.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"olt", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -701,7 +701,7 @@ define i32 @test_ole_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmovbl %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmps.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"ole", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -738,7 +738,7 @@ define i32 @test_one_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmovel %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmps.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"one", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -775,7 +775,7 @@ define i32 @test_ord_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmovpl %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmps.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"ord", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -812,7 +812,7 @@ define i32 @test_ueq_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmovnel %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmps.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"ueq", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -849,7 +849,7 @@ define i32 @test_ugt_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmovael %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmps.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"ugt", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -886,7 +886,7 @@ define i32 @test_uge_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmoval %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmps.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"uge", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -923,7 +923,7 @@ define i32 @test_ult_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 { ; X87-64-NEXT: wait ; X87-64-NEXT: cmovael %esi, %eax ; X87-64-NEXT: retq - %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80( + %cond = call i1 @llvm.experimental.constrained.fcmps.f80( x86_fp80 %f1, x86_fp80 %f2, metadata !"ult", metadata !"fpexcept.strict") #0 %res = select i1 %cond, i32 %a, i32 %b @@ -960,7 +960,7 @@ 
define i32 @test_ule_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmoval %esi, %eax
 ; X87-64-NEXT:    retq
-  %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80(
+  %cond = call i1 @llvm.experimental.constrained.fcmps.f80(
                                                x86_fp80 %f1, x86_fp80 %f2, metadata !"ule",
                                                metadata !"fpexcept.strict") #0
   %res = select i1 %cond, i32 %a, i32 %b
@@ -998,7 +998,7 @@ define i32 @test_une_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    cmovnel %edi, %eax
 ; X87-64-NEXT:    cmovpl %edi, %eax
 ; X87-64-NEXT:    retq
-  %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80(
+  %cond = call i1 @llvm.experimental.constrained.fcmps.f80(
                                                x86_fp80 %f1, x86_fp80 %f2, metadata !"une",
                                                metadata !"fpexcept.strict") #0
   %res = select i1 %cond, i32 %a, i32 %b
@@ -1035,7 +1035,7 @@ define i32 @test_uno_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovnpl %esi, %eax
 ; X87-64-NEXT:    retq
-  %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80(
+  %cond = call i1 @llvm.experimental.constrained.fcmps.f80(
                                                x86_fp80 %f1, x86_fp80 %f2, metadata !"uno",
                                                metadata !"fpexcept.strict") #0
   %res = select i1 %cond, i32 %a, i32 %b
@@ -1044,5 +1044,5 @@ define i32 @test_uno_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 attributes #0 = { strictfp }
 
-declare i1 @llvm.experimental.constrained.fcmp.x86_fp80(x86_fp80, x86_fp80, metadata, metadata)
-declare i1 @llvm.experimental.constrained.fcmps.x86_fp80(x86_fp80, x86_fp80, metadata, metadata)
+declare i1 @llvm.experimental.constrained.fcmp.f80(x86_fp80, x86_fp80, metadata, metadata)
+declare i1 @llvm.experimental.constrained.fcmps.f80(x86_fp80, x86_fp80, metadata, metadata)
diff --git a/llvm/test/CodeGen/X86/fp80-strict-scalar.ll b/llvm/test/CodeGen/X86/fp80-strict-scalar.ll
index cf4a51fd6920a..221bebea2957e 100644
--- a/llvm/test/CodeGen/X86/fp80-strict-scalar.ll
+++ b/llvm/test/CodeGen/X86/fp80-strict-scalar.ll
@@ -2,35 +2,35 @@
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -O3 | FileCheck %s --check-prefixes=CHECK,X86
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O3 | FileCheck %s --check-prefixes=CHECK,X64
 
-declare x86_fp80 @llvm.experimental.constrained.fadd.x86_fp80(x86_fp80, x86_fp80, metadata, metadata)
-declare x86_fp80 @llvm.experimental.constrained.fsub.x86_fp80(x86_fp80, x86_fp80, metadata, metadata)
-declare x86_fp80 @llvm.experimental.constrained.fmul.x86_fp80(x86_fp80, x86_fp80, metadata, metadata)
-declare x86_fp80 @llvm.experimental.constrained.fdiv.x86_fp80(x86_fp80, x86_fp80, metadata, metadata)
-declare x86_fp80 @llvm.experimental.constrained.fpext.x86_fp80.f32(float, metadata)
-declare x86_fp80 @llvm.experimental.constrained.fpext.x86_fp80.f64(double, metadata)
-declare x86_fp80 @llvm.experimental.constrained.sqrt.x86_fp80(x86_fp80, metadata, metadata)
-declare float @llvm.experimental.constrained.fptrunc.f32.x86_fp80(x86_fp80, metadata, metadata)
-declare double @llvm.experimental.constrained.fptrunc.f64.x86_fp80(x86_fp80, metadata, metadata)
-declare i1 @llvm.experimental.constrained.fptosi.i1.x86_fp80(x86_fp80, metadata)
-declare i8 @llvm.experimental.constrained.fptosi.i8.x86_fp80(x86_fp80, metadata)
-declare i16 @llvm.experimental.constrained.fptosi.i16.x86_fp80(x86_fp80, metadata)
-declare i32 @llvm.experimental.constrained.fptosi.i32.x86_fp80(x86_fp80, metadata)
-declare i64 @llvm.experimental.constrained.fptosi.i64.x86_fp80(x86_fp80, metadata)
-declare i1 @llvm.experimental.constrained.fptoui.i1.x86_fp80(x86_fp80, metadata)
-declare i8 @llvm.experimental.constrained.fptoui.i8.x86_fp80(x86_fp80, metadata)
-declare i16 @llvm.experimental.constrained.fptoui.i16.x86_fp80(x86_fp80, metadata)
-declare i32 @llvm.experimental.constrained.fptoui.i32.x86_fp80(x86_fp80, metadata)
-declare i64 @llvm.experimental.constrained.fptoui.i64.x86_fp80(x86_fp80, metadata)
-declare x86_fp80 @llvm.experimental.constrained.sitofp.x86_fp80.i1(i1, metadata, metadata)
-declare x86_fp80 @llvm.experimental.constrained.sitofp.x86_fp80.i8(i8, metadata, metadata)
-declare x86_fp80 @llvm.experimental.constrained.sitofp.x86_fp80.i16(i16, metadata, metadata)
-declare x86_fp80 @llvm.experimental.constrained.sitofp.x86_fp80.i32(i32, metadata, metadata)
-declare x86_fp80 @llvm.experimental.constrained.sitofp.x86_fp80.i64(i64, metadata, metadata)
-declare x86_fp80 @llvm.experimental.constrained.uitofp.x86_fp80.i1(i1, metadata, metadata)
-declare x86_fp80 @llvm.experimental.constrained.uitofp.x86_fp80.i8(i8, metadata, metadata)
-declare x86_fp80 @llvm.experimental.constrained.uitofp.x86_fp80.i16(i16, metadata, metadata)
-declare x86_fp80 @llvm.experimental.constrained.uitofp.x86_fp80.i32(i32, metadata, metadata)
-declare x86_fp80 @llvm.experimental.constrained.uitofp.x86_fp80.i64(i64, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80, x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.fsub.f80(x86_fp80, x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80, x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80, x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.fpext.f80.f32(float, metadata)
+declare x86_fp80 @llvm.experimental.constrained.fpext.f80.f64(double, metadata)
+declare x86_fp80 @llvm.experimental.constrained.sqrt.f80(x86_fp80, metadata, metadata)
+declare float @llvm.experimental.constrained.fptrunc.f32.f80(x86_fp80, metadata, metadata)
+declare double @llvm.experimental.constrained.fptrunc.f64.f80(x86_fp80, metadata, metadata)
+declare i1 @llvm.experimental.constrained.fptosi.i1.f80(x86_fp80, metadata)
+declare i8 @llvm.experimental.constrained.fptosi.i8.f80(x86_fp80, metadata)
+declare i16 @llvm.experimental.constrained.fptosi.i16.f80(x86_fp80, metadata)
+declare i32 @llvm.experimental.constrained.fptosi.i32.f80(x86_fp80, metadata)
+declare i64 @llvm.experimental.constrained.fptosi.i64.f80(x86_fp80, metadata)
+declare i1 @llvm.experimental.constrained.fptoui.i1.f80(x86_fp80, metadata)
+declare i8 @llvm.experimental.constrained.fptoui.i8.f80(x86_fp80, metadata)
+declare i16 @llvm.experimental.constrained.fptoui.i16.f80(x86_fp80, metadata)
+declare i32 @llvm.experimental.constrained.fptoui.i32.f80(x86_fp80, metadata)
+declare i64 @llvm.experimental.constrained.fptoui.i64.f80(x86_fp80, metadata)
+declare x86_fp80 @llvm.experimental.constrained.sitofp.f80.i1(i1, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.sitofp.f80.i8(i8, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.sitofp.f80.i16(i16, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.sitofp.f80.i32(i32, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.sitofp.f80.i64(i64, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.uitofp.f80.i1(i1, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.uitofp.f80.i8(i8, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.uitofp.f80.i16(i16, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.uitofp.f80.i32(i32, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.uitofp.f80.i64(i64, metadata, metadata)
 
 define x86_fp80 @fadd_fp80(x86_fp80 %a, x86_fp80 %b) nounwind strictfp {
 ; X86-LABEL: fadd_fp80:
@@ -48,7 +48,7 @@ define x86_fp80 @fadd_fp80(x86_fp80 %a, x86_fp80 %b) nounwind strictfp {
 ; X64-NEXT:    faddp %st, %st(1)
 ; X64-NEXT:    wait
 ; X64-NEXT:    retq
-  %ret = call x86_fp80 @llvm.experimental.constrained.fadd.x86_fp80(x86_fp80 %a, x86_fp80 %b,
+  %ret = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 %a, x86_fp80 %b,
                                                metadata !"round.dynamic",
                                                metadata !"fpexcept.strict") #0
   ret x86_fp80 %ret
@@ -70,7 +70,7 @@ define x86_fp80 @fsub_fp80(x86_fp80 %a, x86_fp80 %b) nounwind strictfp {
 ; X64-NEXT:    fsubp %st, %st(1)
 ; X64-NEXT:    wait
 ; X64-NEXT:    retq
-  %ret = call x86_fp80 @llvm.experimental.constrained.fsub.x86_fp80(x86_fp80 %a, x86_fp80 %b,
+  %ret = call x86_fp80 @llvm.experimental.constrained.fsub.f80(x86_fp80 %a, x86_fp80 %b,
                                                metadata !"round.dynamic",
                                                metadata !"fpexcept.strict") #0
   ret x86_fp80 %ret
@@ -92,7 +92,7 @@ define x86_fp80 @fmul_fp80(x86_fp80 %a, x86_fp80 %b) nounwind strictfp {
 ; X64-NEXT:    fmulp %st, %st(1)
 ; X64-NEXT:    wait
 ; X64-NEXT:    retq
-  %ret = call x86_fp80 @llvm.experimental.constrained.fmul.x86_fp80(x86_fp80 %a, x86_fp80 %b,
+  %ret = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 %a, x86_fp80 %b,
                                                metadata !"round.dynamic",
                                                metadata !"fpexcept.strict") #0
   ret x86_fp80 %ret
@@ -114,7 +114,7 @@ define x86_fp80 @fdiv_fp80(x86_fp80 %a, x86_fp80 %b) nounwind strictfp {
 ; X64-NEXT:    fdivp %st, %st(1)
 ; X64-NEXT:    wait
 ; X64-NEXT:    retq
-  %ret = call x86_fp80 @llvm.experimental.constrained.fdiv.x86_fp80(x86_fp80 %a, x86_fp80 %b,
+  %ret = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 %a, x86_fp80 %b,
                                                metadata !"round.dynamic",
                                                metadata !"fpexcept.strict") #0
   ret x86_fp80 %ret
@@ -133,7 +133,7 @@ define x86_fp80 @fpext_f32_to_fp80(float %a) nounwind strictfp {
 ; X64-NEXT:    flds -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    wait
 ; X64-NEXT:    retq
-  %ret = call x86_fp80 @llvm.experimental.constrained.fpext.x86_fp80.f32(float %a,
+  %ret = call x86_fp80 @llvm.experimental.constrained.fpext.f80.f32(float %a,
                                                metadata !"fpexcept.strict") #0
   ret x86_fp80 %ret
 }
@@ -151,7 +151,7 @@ define x86_fp80 @fpext_f64_to_fp80(double %a) nounwind strictfp {
 ; X64-NEXT:    fldl -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    wait
 ; X64-NEXT:    retq
-  %ret = call x86_fp80 @llvm.experimental.constrained.fpext.x86_fp80.f64(double %a,
+  %ret = call x86_fp80 @llvm.experimental.constrained.fpext.f80.f64(double %a,
                                                metadata !"fpexcept.strict") #0
   ret x86_fp80 %ret
 }
@@ -174,7 +174,7 @@ define float @fptrunc_fp80_to_f32(x86_fp80 %a) nounwind strictfp {
 ; X64-NEXT:    wait
 ; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT:    retq
-  %ret = call float @llvm.experimental.constrained.fptrunc.f32.x86_fp80(x86_fp80 %a,
+  %ret = call float @llvm.experimental.constrained.fptrunc.f32.f80(x86_fp80 %a,
                                                metadata !"round.dynamic",
                                                metadata !"fpexcept.strict") #0
   ret float %ret
@@ -202,7 +202,7 @@ define double @fptrunc_fp80_to_f64(x86_fp80 %a) nounwind strictfp {
 ; X64-NEXT:    wait
 ; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    retq
-  %ret = call double @llvm.experimental.constrained.fptrunc.f64.x86_fp80(x86_fp80 %a,
+  %ret = call double @llvm.experimental.constrained.fptrunc.f64.f80(x86_fp80 %a,
                                                metadata !"round.dynamic",
                                                metadata !"fpexcept.strict") #0
   ret double %ret
@@ -222,7 +222,7 @@ define
x86_fp80 @fsqrt_fp80(x86_fp80 %a) nounwind strictfp { ; X64-NEXT: fsqrt ; X64-NEXT: wait ; X64-NEXT: retq - %ret = call x86_fp80 @llvm.experimental.constrained.sqrt.x86_fp80(x86_fp80 %a, + %ret = call x86_fp80 @llvm.experimental.constrained.sqrt.f80(x86_fp80 %a, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret x86_fp80 %ret @@ -260,7 +260,7 @@ define i1 @fp80_to_sint1(x86_fp80 %x) #0 { ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) ; X64-NEXT: movb -{{[0-9]+}}(%rsp), %al ; X64-NEXT: retq - %result = call i1 @llvm.experimental.constrained.fptosi.i1.x86_fp80(x86_fp80 %x, + %result = call i1 @llvm.experimental.constrained.fptosi.i1.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0 ret i1 %result } @@ -297,7 +297,7 @@ define i8 @fp80_to_sint8(x86_fp80 %x) #0 { ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) ; X64-NEXT: movb -{{[0-9]+}}(%rsp), %al ; X64-NEXT: retq - %result = call i8 @llvm.experimental.constrained.fptosi.i8.x86_fp80(x86_fp80 %x, + %result = call i8 @llvm.experimental.constrained.fptosi.i8.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0 ret i8 %result } @@ -334,7 +334,7 @@ define i16 @fp80_to_sint16(x86_fp80 %x) #0 { ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) ; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: retq - %result = call i16 @llvm.experimental.constrained.fptosi.i16.x86_fp80(x86_fp80 %x, + %result = call i16 @llvm.experimental.constrained.fptosi.i16.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0 ret i16 %result } @@ -372,7 +372,7 @@ define i32 @fp80_to_sint32(x86_fp80 %x) #0 { ; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: retq entry: - %result = call i32 @llvm.experimental.constrained.fptosi.i32.x86_fp80(x86_fp80 %x, + %result = call i32 @llvm.experimental.constrained.fptosi.i32.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0 ret i32 %result } @@ -416,7 +416,7 @@ define i64 @fp80_to_sint64(x86_fp80 %x) #0 { ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; X64-NEXT: retq - %result = call i64 @llvm.experimental.constrained.fptosi.i64.x86_fp80(x86_fp80 %x, + %result = call i64 @llvm.experimental.constrained.fptosi.i64.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0 ret i64 %result } @@ -453,7 +453,7 @@ define i1 @fp80_to_uint1(x86_fp80 %x) #0 { ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) ; X64-NEXT: movb -{{[0-9]+}}(%rsp), %al ; X64-NEXT: retq - %result = call i1 @llvm.experimental.constrained.fptoui.i1.x86_fp80(x86_fp80 %x, + %result = call i1 @llvm.experimental.constrained.fptoui.i1.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0 ret i1 %result } @@ -490,7 +490,7 @@ define i8 @fp80_to_uint8(x86_fp80 %x) #0 { ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) ; X64-NEXT: movb -{{[0-9]+}}(%rsp), %al ; X64-NEXT: retq - %result = call i8 @llvm.experimental.constrained.fptoui.i8.x86_fp80(x86_fp80 %x, + %result = call i8 @llvm.experimental.constrained.fptoui.i8.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0 ret i8 %result } @@ -529,7 +529,7 @@ define i16 @fp80_to_uint16(x86_fp80 %x) #0 { ; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq - %result = call i16 @llvm.experimental.constrained.fptoui.i16.x86_fp80(x86_fp80 %x, + %result = call i16 @llvm.experimental.constrained.fptoui.i16.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0 ret i16 %result } @@ -572,7 +572,7 @@ define i32 @fp80_to_uint32(x86_fp80 %x) #0 { ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) ; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: retq - %result = call i32 @llvm.experimental.constrained.fptoui.i32.x86_fp80(x86_fp80 %x, + %result = call i32 
@llvm.experimental.constrained.fptoui.i32.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0 ret i32 %result } @@ -647,7 +647,7 @@ define i64 @fp80_to_uint64(x86_fp80 %x) #0 { ; X64-NEXT: shlq $63, %rax ; X64-NEXT: xorq -{{[0-9]+}}(%rsp), %rax ; X64-NEXT: retq - %result = call i64 @llvm.experimental.constrained.fptoui.i64.x86_fp80(x86_fp80 %x, + %result = call i64 @llvm.experimental.constrained.fptoui.i64.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0 ret i64 %result } @@ -677,7 +677,7 @@ define x86_fp80 @sint1_to_fp80(i1 %x) #0 { ; X64-NEXT: filds -{{[0-9]+}}(%rsp) ; X64-NEXT: wait ; X64-NEXT: retq - %result = call x86_fp80 @llvm.experimental.constrained.sitofp.x86_fp80.i1(i1 %x, + %result = call x86_fp80 @llvm.experimental.constrained.sitofp.f80.i1(i1 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret x86_fp80 %result @@ -703,7 +703,7 @@ define x86_fp80 @sint8_to_fp80(i8 %x) #0 { ; X64-NEXT: filds -{{[0-9]+}}(%rsp) ; X64-NEXT: wait ; X64-NEXT: retq - %result = call x86_fp80 @llvm.experimental.constrained.sitofp.x86_fp80.i8(i8 %x, + %result = call x86_fp80 @llvm.experimental.constrained.sitofp.f80.i8(i8 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret x86_fp80 %result @@ -728,7 +728,7 @@ define x86_fp80 @sint16_to_fp80(i16 %x) #0 { ; X64-NEXT: filds -{{[0-9]+}}(%rsp) ; X64-NEXT: wait ; X64-NEXT: retq - %result = call x86_fp80 @llvm.experimental.constrained.sitofp.x86_fp80.i16(i16 %x, + %result = call x86_fp80 @llvm.experimental.constrained.sitofp.f80.i16(i16 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret x86_fp80 %result @@ -753,7 +753,7 @@ define x86_fp80 @sint32_to_fp80(i32 %x) #0 { ; X64-NEXT: fildl -{{[0-9]+}}(%rsp) ; X64-NEXT: wait ; X64-NEXT: retq - %result = call x86_fp80 @llvm.experimental.constrained.sitofp.x86_fp80.i32(i32 %x, + %result = call x86_fp80 @llvm.experimental.constrained.sitofp.f80.i32(i32 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret x86_fp80 %result @@ -772,7 +772,7 @@ define x86_fp80 @sint64_to_fp80(i64 %x) #0 { ; X64-NEXT: fildll -{{[0-9]+}}(%rsp) ; X64-NEXT: wait ; X64-NEXT: retq - %result = call x86_fp80 @llvm.experimental.constrained.sitofp.x86_fp80.i64(i64 %x, + %result = call x86_fp80 @llvm.experimental.constrained.sitofp.f80.i64(i64 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret x86_fp80 %result @@ -800,7 +800,7 @@ define x86_fp80 @uint1_to_fp80(i1 %x) #0 { ; X64-NEXT: filds -{{[0-9]+}}(%rsp) ; X64-NEXT: wait ; X64-NEXT: retq - %result = call x86_fp80 @llvm.experimental.constrained.uitofp.x86_fp80.i1(i1 %x, + %result = call x86_fp80 @llvm.experimental.constrained.uitofp.f80.i1(i1 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret x86_fp80 %result @@ -826,7 +826,7 @@ define x86_fp80 @uint8_to_fp80(i8 %x) #0 { ; X64-NEXT: filds -{{[0-9]+}}(%rsp) ; X64-NEXT: wait ; X64-NEXT: retq - %result = call x86_fp80 @llvm.experimental.constrained.uitofp.x86_fp80.i8(i8 %x, + %result = call x86_fp80 @llvm.experimental.constrained.uitofp.f80.i8(i8 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret x86_fp80 %result @@ -852,7 +852,7 @@ define x86_fp80 @uint16_to_fp80(i16 %x) #0 { ; X64-NEXT: fildl -{{[0-9]+}}(%rsp) ; X64-NEXT: wait ; X64-NEXT: retq - %result = call x86_fp80 @llvm.experimental.constrained.uitofp.x86_fp80.i16(i16 %x, + %result = call x86_fp80 @llvm.experimental.constrained.uitofp.f80.i16(i16 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret x86_fp80 %result @@ -885,7 +885,7 @@ define x86_fp80 @uint32_to_fp80(i32 %x) #0 
{
 ; X64-NEXT:    fildll -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    wait
 ; X64-NEXT:    retq
-  %result = call x86_fp80 @llvm.experimental.constrained.uitofp.x86_fp80.i32(i32 %x,
+  %result = call x86_fp80 @llvm.experimental.constrained.uitofp.f80.i32(i32 %x,
                                                metadata !"round.dynamic",
                                                metadata !"fpexcept.strict") #0
   ret x86_fp80 %result
@@ -924,7 +924,7 @@ define x86_fp80 @uint64_to_fp80(i64 %x) #0 {
 ; X64-NEXT:    fadds {{\.LCPI.*}}(,%rax,4)
 ; X64-NEXT:    wait
 ; X64-NEXT:    retq
-  %result = call x86_fp80 @llvm.experimental.constrained.uitofp.x86_fp80.i64(i64 %x,
+  %result = call x86_fp80 @llvm.experimental.constrained.uitofp.f80.i64(i64 %x,
                                                metadata !"round.dynamic",
                                                metadata !"fpexcept.strict") #0
   ret x86_fp80 %result

From 9182dc78145b9f1505d7fcc34b818f6d8aabcfda Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sat, 25 Jul 2020 13:24:58 -0700
Subject: [PATCH 0093/1035] [X86] Add llvm.roundeven test cases. Add f80 test
 cases for constrained intrinsics that lower to libcalls. NFC

---
 llvm/test/CodeGen/X86/fp-cvt.ll               |   52 +
 llvm/test/CodeGen/X86/fp-roundeven.ll         | 1044 +++++++++++++++++
 .../CodeGen/X86/fp-strict-scalar-round.ll     |  184 +++
 .../test/CodeGen/X86/fp128-libcalls-strict.ll |   42 +
 llvm/test/CodeGen/X86/fp80-strict-libcalls.ll |  657 +++++++++++
 5 files changed, 1979 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/fp-roundeven.ll
 create mode 100644 llvm/test/CodeGen/X86/fp80-strict-libcalls.ll

diff --git a/llvm/test/CodeGen/X86/fp-cvt.ll b/llvm/test/CodeGen/X86/fp-cvt.ll
index 667c2d414ed1b..cedbfd2e9bff4 100644
--- a/llvm/test/CodeGen/X86/fp-cvt.ll
+++ b/llvm/test/CodeGen/X86/fp-cvt.ll
@@ -1090,3 +1090,55 @@ define x86_fp80 @rint_fp80_ld(x86_fp80 *%a0) nounwind {
 }
 
 declare x86_fp80 @llvm.rint.f80(x86_fp80 %p)
+
+;
+; roundeven
+;
+
+define x86_fp80 @roundeven_fp80(x86_fp80 %a0) nounwind {
+; X86-LABEL: roundeven_fp80:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    calll roundevenl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: roundeven_fp80:
+; X64:       # %bb.0:
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    callq roundevenl
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+  %1 = call x86_fp80 @llvm.roundeven.f80(x86_fp80 %a0)
+  ret x86_fp80 %1
+}
+
+define x86_fp80 @roundeven_fp80_ld(x86_fp80 *%a0) nounwind {
+; X86-LABEL: roundeven_fp80_ld:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    fldt (%eax)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    calll roundevenl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: roundeven_fp80_ld:
+; X64:       # %bb.0:
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt (%rdi)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    callq roundevenl
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+  %1 = load x86_fp80, x86_fp80 *%a0
+  %2 = call x86_fp80 @llvm.roundeven.f80(x86_fp80 %1)
+  ret x86_fp80 %2
+}
+
+declare x86_fp80 @llvm.roundeven.f80(x86_fp80 %p)
diff --git a/llvm/test/CodeGen/X86/fp-roundeven.ll b/llvm/test/CodeGen/X86/fp-roundeven.ll
new file mode 100644
index 0000000000000..a3eae0137f3ec
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fp-roundeven.ll
@@ -0,0 +1,1044 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx |
FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512 + +define float @roundeven_f32(float %x) { +; SSE2-LABEL: roundeven_f32: +; SSE2: ## %bb.0: +; SSE2-NEXT: jmp _roundevenf ## TAILCALL +; +; SSE41-LABEL: roundeven_f32: +; SSE41: ## %bb.0: +; SSE41-NEXT: jmp _roundevenf ## TAILCALL +; +; AVX-LABEL: roundeven_f32: +; AVX: ## %bb.0: +; AVX-NEXT: jmp _roundevenf ## TAILCALL + %a = call float @llvm.roundeven.f32(float %x) + ret float %a +} + +define double @roundeven_f64(double %x) { +; SSE2-LABEL: roundeven_f64: +; SSE2: ## %bb.0: +; SSE2-NEXT: jmp _roundeven ## TAILCALL +; +; SSE41-LABEL: roundeven_f64: +; SSE41: ## %bb.0: +; SSE41-NEXT: jmp _roundeven ## TAILCALL +; +; AVX-LABEL: roundeven_f64: +; AVX: ## %bb.0: +; AVX-NEXT: jmp _roundeven ## TAILCALL + %a = call double @llvm.roundeven.f64(double %x) + ret double %a +} + +define <4 x float> @roundeven_v4f32(<4 x float> %x) { +; SSE2-LABEL: roundeven_v4f32: +; SSE2: ## %bb.0: +; SSE2-NEXT: subq $56, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 64 +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: addq $56, %rsp +; SSE2-NEXT: retq +; +; SSE41-LABEL: roundeven_v4f32: +; SSE41: ## %bb.0: +; SSE41-NEXT: subq $40, %rsp +; SSE41-NEXT: .cfi_def_cfa_offset 48 +; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE41-NEXT: movshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE41-NEXT: ## xmm0 = mem[1,1,3,3] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3] +; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3] +; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: addq $40, %rsp +; 
SSE41-NEXT: retq +; +; AVX-LABEL: roundeven_v4f32: +; AVX: ## %bb.0: +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX-NEXT: callq _roundevenf +; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX-NEXT: ## xmm0 = mem[1,1,3,3] +; AVX-NEXT: callq _roundevenf +; AVX-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX-NEXT: ## xmm0 = mem[1,0] +; AVX-NEXT: callq _roundevenf +; AVX-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX-NEXT: ## xmm0 = mem[3,1,2,3] +; AVX-NEXT: callq _roundevenf +; AVX-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: retq + %a = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %x) + ret <4 x float> %a +} + +define <2 x double> @roundeven_v2f64(<2 x double> %x) { +; SSE2-LABEL: roundeven_v2f64: +; SSE2: ## %bb.0: +; SSE2-NEXT: subq $40, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 48 +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: callq _roundeven +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq _roundeven +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: addq $40, %rsp +; SSE2-NEXT: retq +; +; SSE41-LABEL: roundeven_v2f64: +; SSE41: ## %bb.0: +; SSE41-NEXT: subq $40, %rsp +; SSE41-NEXT: .cfi_def_cfa_offset 48 +; SSE41-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE41-NEXT: callq _roundeven +; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: callq _roundeven +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: addq $40, %rsp +; SSE41-NEXT: retq +; +; AVX-LABEL: roundeven_v2f64: +; AVX: ## %bb.0: +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 +; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX-NEXT: callq _roundeven +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 ## 16-byte Folded Reload +; AVX-NEXT: ## xmm0 = mem[1,0] +; AVX-NEXT: callq _roundeven +; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: retq + %a = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %x) + ret <2 x double> %a +} + +define <8 x float> @roundeven_v8f32(<8 x float> %x) { +; SSE2-LABEL: roundeven_v8f32: +; SSE2: ## %bb.0: +; SSE2-NEXT: subq $72, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 80 +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 
16-byte Spill +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: addq $72, %rsp +; SSE2-NEXT: retq +; +; SSE41-LABEL: roundeven_v8f32: +; SSE41: ## %bb.0: +; SSE41-NEXT: subq $56, %rsp +; SSE41-NEXT: .cfi_def_cfa_offset 64 +; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movshdup (%rsp), %xmm0 ## 16-byte Folded Reload +; SSE41-NEXT: ## xmm0 = mem[1,1,3,3] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3] +; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3] +; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] +; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE41-NEXT: movshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE41-NEXT: ## xmm0 = mem[1,1,3,3] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3] +; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3] +; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: addq $56, %rsp +; SSE41-NEXT: retq +; +; AVX-LABEL: roundeven_v8f32: +; AVX: ## %bb.0: +; AVX-NEXT: subq $88, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 96 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq _roundevenf +; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX-NEXT: ## xmm0 = mem[1,1,3,3] +; AVX-NEXT: callq _roundevenf +; AVX-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX-NEXT: ## xmm0 = mem[1,0] +; AVX-NEXT: callq _roundevenf +; AVX-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX-NEXT: ## xmm0 = mem[3,1,2,3] +; AVX-NEXT: callq _roundevenf +; AVX-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload +; AVX-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq _roundevenf +; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX-NEXT: ## xmm0 = mem[1,1,3,3] +; AVX-NEXT: callq _roundevenf +; AVX-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX-NEXT: ## xmm0 = mem[1,0] +; AVX-NEXT: callq _roundevenf +; AVX-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX-NEXT: ## xmm0 = mem[3,1,2,3] +; AVX-NEXT: callq _roundevenf +; AVX-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload +; AVX-NEXT: addq $88, %rsp +; AVX-NEXT: retq + %a = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %x) + ret <8 x float> %a +} + +define <4 x double> @roundeven_v4f64(<4 x double> %x) { +; SSE2-LABEL: roundeven_v4f64: +; SSE2: ## %bb.0: +; SSE2-NEXT: subq $56, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 64 +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: callq _roundeven +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq _roundeven +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: callq _roundeven +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq _roundeven +; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: addq $56, %rsp +; SSE2-NEXT: retq +; +; SSE41-LABEL: roundeven_v4f64: +; SSE41: ## %bb.0: +; SSE41-NEXT: subq $56, %rsp +; SSE41-NEXT: .cfi_def_cfa_offset 64 +; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE41-NEXT: callq _roundeven +; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: callq _roundeven +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: callq _roundeven +; SSE41-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: callq _roundeven +; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: addq $56, %rsp +; SSE41-NEXT: retq +; +; AVX-LABEL: roundeven_v4f64: +; AVX: ## %bb.0: +; AVX-NEXT: subq $88, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 96 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq _roundeven +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 ## 16-byte Folded Reload +; AVX-NEXT: ## xmm0 = mem[1,0] +; 
AVX-NEXT: callq _roundeven +; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload +; AVX-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq _roundeven +; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX-NEXT: ## xmm0 = mem[1,0] +; AVX-NEXT: callq _roundeven +; AVX-NEXT: vmovapd (%rsp), %xmm1 ## 16-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload +; AVX-NEXT: addq $88, %rsp +; AVX-NEXT: retq + %a = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %x) + ret <4 x double> %a +} + +define <16 x float> @roundeven_v16f32(<16 x float> %x) { +; SSE2-LABEL: roundeven_v16f32: +; SSE2: ## %bb.0: +; SSE2-NEXT: subq $104, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 112 +; SSE2-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: unpcklpd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm3 = xmm3[0],mem[0] +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload +; SSE2-NEXT: addq $104, %rsp +; SSE2-NEXT: retq +; +; SSE41-LABEL: roundeven_v16f32: +; SSE41: ## %bb.0: +; SSE41-NEXT: subq $88, %rsp +; SSE41-NEXT: .cfi_def_cfa_offset 96 +; SSE41-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill +; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE41-NEXT: ## xmm0 = mem[1,1,3,3] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3] +; SSE41-NEXT: movaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3] +; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] +; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movshdup (%rsp), %xmm0 ## 16-byte Folded Reload +; SSE41-NEXT: ## xmm0 = mem[1,1,3,3] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3] +; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3] +; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] +; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE41-NEXT: movshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE41-NEXT: ## xmm0 = mem[1,1,3,3] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3] +; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3] +; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] +; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE41-NEXT: ## xmm0 = mem[1,1,3,3] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3] 
+; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3] +; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE41-NEXT: callq _roundevenf +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload +; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm0[0] +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE41-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload +; SSE41-NEXT: addq $88, %rsp +; SSE41-NEXT: retq +; +; AVX1-LABEL: roundeven_v16f32: +; AVX1: ## %bb.0: +; AVX1-NEXT: subq $152, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 160 +; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq _roundevenf +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX1-NEXT: ## xmm0 = mem[1,1,3,3] +; AVX1-NEXT: callq _roundevenf +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX1-NEXT: ## xmm0 = mem[1,0] +; AVX1-NEXT: callq _roundevenf +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX1-NEXT: ## xmm0 = mem[3,1,2,3] +; AVX1-NEXT: callq _roundevenf +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload +; AVX1-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq _roundevenf +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX1-NEXT: ## xmm0 = mem[1,1,3,3] +; AVX1-NEXT: callq _roundevenf +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX1-NEXT: ## xmm0 = mem[1,0] +; AVX1-NEXT: callq _roundevenf +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: 
vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX1-NEXT: ## xmm0 = mem[3,1,2,3] +; AVX1-NEXT: callq _roundevenf +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq _roundevenf +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX1-NEXT: ## xmm0 = mem[1,1,3,3] +; AVX1-NEXT: callq _roundevenf +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX1-NEXT: ## xmm0 = mem[1,0] +; AVX1-NEXT: callq _roundevenf +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX1-NEXT: ## xmm0 = mem[3,1,2,3] +; AVX1-NEXT: callq _roundevenf +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload +; AVX1-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq _roundevenf +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX1-NEXT: ## xmm0 = mem[1,1,3,3] +; AVX1-NEXT: callq _roundevenf +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX1-NEXT: ## xmm0 = mem[1,0] +; AVX1-NEXT: callq _roundevenf +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX1-NEXT: ## xmm0 = mem[3,1,2,3] +; AVX1-NEXT: callq _roundevenf +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 ## 16-byte Folded Reload +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload +; AVX1-NEXT: addq $152, %rsp +; AVX1-NEXT: retq +; +; AVX512-LABEL: roundeven_v16f32: +; AVX512: ## %bb.0: +; AVX512-NEXT: subq $184, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 192 +; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; AVX512-NEXT: vextractf32x4 $3, 
%zmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq _roundevenf +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX512-NEXT: ## xmm0 = mem[1,1,3,3] +; AVX512-NEXT: callq _roundevenf +; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX512-NEXT: ## xmm0 = mem[1,0] +; AVX512-NEXT: callq _roundevenf +; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX512-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX512-NEXT: ## xmm0 = mem[3,1,2,3] +; AVX512-NEXT: callq _roundevenf +; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 ## 64-byte Reload +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq _roundevenf +; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX512-NEXT: ## xmm0 = mem[1,1,3,3] +; AVX512-NEXT: callq _roundevenf +; AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX512-NEXT: ## xmm0 = mem[1,0] +; AVX512-NEXT: callq _roundevenf +; AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX512-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX512-NEXT: ## xmm0 = mem[3,1,2,3] +; AVX512-NEXT: callq _roundevenf +; AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload +; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 ## 64-byte Reload +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq _roundevenf +; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX512-NEXT: ## xmm0 = mem[1,1,3,3] +; AVX512-NEXT: callq _roundevenf +; AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX512-NEXT: ## xmm0 = mem[1,0] +; AVX512-NEXT: callq 
_roundevenf +; AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX512-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX512-NEXT: ## xmm0 = mem[3,1,2,3] +; AVX512-NEXT: callq _roundevenf +; AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 ## 64-byte Reload +; AVX512-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq _roundevenf +; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX512-NEXT: ## xmm0 = mem[1,1,3,3] +; AVX512-NEXT: callq _roundevenf +; AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX512-NEXT: ## xmm0 = mem[1,0] +; AVX512-NEXT: callq _roundevenf +; AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX512-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX512-NEXT: ## xmm0 = mem[3,1,2,3] +; AVX512-NEXT: callq _roundevenf +; AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload +; AVX512-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 ## 32-byte Folded Reload +; AVX512-NEXT: addq $184, %rsp +; AVX512-NEXT: retq + %a = call <16 x float> @llvm.roundeven.v16f32(<16 x float> %x) + ret <16 x float> %a +} + +define <8 x double> @roundeven_v8f64(<8 x double> %x) { +; SSE2-LABEL: roundeven_v8f64: +; SSE2: ## %bb.0: +; SSE2-NEXT: subq $88, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 96 +; SSE2-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: callq _roundeven +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq _roundeven +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: callq _roundeven +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq _roundeven +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: callq _roundeven +; 
SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq _roundeven +; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: callq _roundeven +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq _roundeven +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload +; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload +; SSE2-NEXT: addq $88, %rsp +; SSE2-NEXT: retq +; +; SSE41-LABEL: roundeven_v8f64: +; SSE41: ## %bb.0: +; SSE41-NEXT: subq $88, %rsp +; SSE41-NEXT: .cfi_def_cfa_offset 96 +; SSE41-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill +; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: callq _roundeven +; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: callq _roundeven +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE41-NEXT: callq _roundeven +; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: callq _roundeven +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: callq _roundeven +; SSE41-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: callq _roundeven +; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: callq _roundeven +; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: callq _roundeven +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload +; SSE41-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE41-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload +; SSE41-NEXT: addq $88, %rsp +; SSE41-NEXT: 
retq +; +; AVX1-LABEL: roundeven_v8f64: +; AVX1: ## %bb.0: +; AVX1-NEXT: subq $120, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 128 +; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq _roundeven +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 ## 16-byte Folded Reload +; AVX1-NEXT: ## xmm0 = mem[1,0] +; AVX1-NEXT: callq _roundeven +; AVX1-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload +; AVX1-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq _roundeven +; AVX1-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX1-NEXT: ## xmm0 = mem[1,0] +; AVX1-NEXT: callq _roundeven +; AVX1-NEXT: vmovapd (%rsp), %xmm1 ## 16-byte Reload +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq _roundeven +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 ## 16-byte Folded Reload +; AVX1-NEXT: ## xmm0 = mem[1,0] +; AVX1-NEXT: callq _roundeven +; AVX1-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload +; AVX1-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq _roundeven +; AVX1-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX1-NEXT: ## xmm0 = mem[1,0] +; AVX1-NEXT: callq _roundeven +; AVX1-NEXT: vmovapd (%rsp), %xmm1 ## 16-byte Reload +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 ## 16-byte Folded Reload +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload +; AVX1-NEXT: addq $120, %rsp +; AVX1-NEXT: retq +; +; AVX512-LABEL: roundeven_v8f64: +; AVX512: ## %bb.0: +; AVX512-NEXT: subq $184, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 192 +; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq _roundeven +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 ## 16-byte Folded Reload +; AVX512-NEXT: ## xmm0 = mem[1,0] +; AVX512-NEXT: callq _roundeven +; AVX512-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: 
vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 ## 64-byte Reload +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq _roundeven +; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX512-NEXT: ## xmm0 = mem[1,0] +; AVX512-NEXT: callq _roundeven +; AVX512-NEXT: vmovapd (%rsp), %xmm1 ## 16-byte Reload +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload +; AVX512-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 ## 64-byte Reload +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq _roundeven +; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX512-NEXT: ## xmm0 = mem[1,0] +; AVX512-NEXT: callq _roundeven +; AVX512-NEXT: vmovapd (%rsp), %xmm1 ## 16-byte Reload +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vmovapd %xmm0, (%rsp) ## 16-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 ## 64-byte Reload +; AVX512-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq _roundeven +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; AVX512-NEXT: ## xmm0 = mem[1,0] +; AVX512-NEXT: callq _roundeven +; AVX512-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 ## 16-byte Folded Reload +; AVX512-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 ## 32-byte Folded Reload +; AVX512-NEXT: addq $184, %rsp +; AVX512-NEXT: retq + %a = call <8 x double> @llvm.roundeven.v8f64(<8 x double> %x) + ret <8 x double> %a +} + +declare float @llvm.roundeven.f32(float) +declare double @llvm.roundeven.f64(double) +declare <4 x float> @llvm.roundeven.v4f32(<4 x float>) +declare <2 x double> @llvm.roundeven.v2f64(<2 x double>) +declare <8 x float> @llvm.roundeven.v8f32(<8 x float>) +declare <4 x double> @llvm.roundeven.v4f64(<4 x double>) +declare <16 x float> @llvm.roundeven.v16f32(<16 x float>) +declare <8 x double> @llvm.roundeven.v8f64(<8 x double>) diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll index da05e8be432eb..f5a6af9c4d657 100644 --- a/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll @@ -16,6 +16,10 @@ declare float @llvm.experimental.constrained.rint.f32(float, metadata, metadata) declare double @llvm.experimental.constrained.rint.f64(double, metadata, metadata) declare float @llvm.experimental.constrained.nearbyint.f32(float, metadata, metadata) declare double @llvm.experimental.constrained.nearbyint.f64(double, metadata, metadata) +declare float @llvm.experimental.constrained.round.f32(float, metadata) +declare double @llvm.experimental.constrained.round.f64(double, metadata) +declare float 
@llvm.experimental.constrained.roundeven.f32(float, metadata) +declare double @llvm.experimental.constrained.roundeven.f64(double, metadata) define float @fceil32(float %f) #0 { ; SSE41-X86-LABEL: fceil32: @@ -491,4 +495,184 @@ define double @fnearbyintf64(double %f) #0 { ret double %res } +define float @fround32(float %f) #0 { +; SSE41-X86-LABEL: fround32: +; SSE41-X86: # %bb.0: +; SSE41-X86-NEXT: pushl %eax +; SSE41-X86-NEXT: .cfi_def_cfa_offset 8 +; SSE41-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-X86-NEXT: movss %xmm0, (%esp) +; SSE41-X86-NEXT: calll roundf +; SSE41-X86-NEXT: popl %eax +; SSE41-X86-NEXT: .cfi_def_cfa_offset 4 +; SSE41-X86-NEXT: retl +; +; SSE41-X64-LABEL: fround32: +; SSE41-X64: # %bb.0: +; SSE41-X64-NEXT: pushq %rax +; SSE41-X64-NEXT: .cfi_def_cfa_offset 16 +; SSE41-X64-NEXT: callq roundf +; SSE41-X64-NEXT: popq %rax +; SSE41-X64-NEXT: .cfi_def_cfa_offset 8 +; SSE41-X64-NEXT: retq +; +; AVX-X86-LABEL: fround32: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: pushl %eax +; AVX-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-X86-NEXT: vmovss %xmm0, (%esp) +; AVX-X86-NEXT: calll roundf +; AVX-X86-NEXT: popl %eax +; AVX-X86-NEXT: .cfi_def_cfa_offset 4 +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: fround32: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: pushq %rax +; AVX-X64-NEXT: .cfi_def_cfa_offset 16 +; AVX-X64-NEXT: callq roundf +; AVX-X64-NEXT: popq %rax +; AVX-X64-NEXT: .cfi_def_cfa_offset 8 +; AVX-X64-NEXT: retq + %res = call float @llvm.experimental.constrained.round.f32( + float %f, metadata !"fpexcept.strict") #0 + ret float %res +} + +define double @froundf64(double %f) #0 { +; SSE41-X86-LABEL: froundf64: +; SSE41-X86: # %bb.0: +; SSE41-X86-NEXT: subl $8, %esp +; SSE41-X86-NEXT: .cfi_def_cfa_offset 12 +; SSE41-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE41-X86-NEXT: movsd %xmm0, (%esp) +; SSE41-X86-NEXT: calll round +; SSE41-X86-NEXT: addl $8, %esp +; SSE41-X86-NEXT: .cfi_def_cfa_offset 4 +; SSE41-X86-NEXT: retl +; +; SSE41-X64-LABEL: froundf64: +; SSE41-X64: # %bb.0: +; SSE41-X64-NEXT: pushq %rax +; SSE41-X64-NEXT: .cfi_def_cfa_offset 16 +; SSE41-X64-NEXT: callq round +; SSE41-X64-NEXT: popq %rax +; SSE41-X64-NEXT: .cfi_def_cfa_offset 8 +; SSE41-X64-NEXT: retq +; +; AVX-X86-LABEL: froundf64: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: subl $8, %esp +; AVX-X86-NEXT: .cfi_def_cfa_offset 12 +; AVX-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-X86-NEXT: vmovsd %xmm0, (%esp) +; AVX-X86-NEXT: calll round +; AVX-X86-NEXT: addl $8, %esp +; AVX-X86-NEXT: .cfi_def_cfa_offset 4 +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: froundf64: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: pushq %rax +; AVX-X64-NEXT: .cfi_def_cfa_offset 16 +; AVX-X64-NEXT: callq round +; AVX-X64-NEXT: popq %rax +; AVX-X64-NEXT: .cfi_def_cfa_offset 8 +; AVX-X64-NEXT: retq + %res = call double @llvm.experimental.constrained.round.f64( + double %f, metadata !"fpexcept.strict") #0 + ret double %res +} + +define float @froundeven32(float %f) #0 { +; SSE41-X86-LABEL: froundeven32: +; SSE41-X86: # %bb.0: +; SSE41-X86-NEXT: pushl %eax +; SSE41-X86-NEXT: .cfi_def_cfa_offset 8 +; SSE41-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-X86-NEXT: movss %xmm0, (%esp) +; SSE41-X86-NEXT: calll roundevenf +; SSE41-X86-NEXT: popl %eax +; SSE41-X86-NEXT: .cfi_def_cfa_offset 4 +; SSE41-X86-NEXT: retl +; +; SSE41-X64-LABEL: froundeven32: +; SSE41-X64: # %bb.0: +; SSE41-X64-NEXT: pushq %rax +; SSE41-X64-NEXT: .cfi_def_cfa_offset 16 +; SSE41-X64-NEXT: callq 
roundevenf +; SSE41-X64-NEXT: popq %rax +; SSE41-X64-NEXT: .cfi_def_cfa_offset 8 +; SSE41-X64-NEXT: retq +; +; AVX-X86-LABEL: froundeven32: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: pushl %eax +; AVX-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-X86-NEXT: vmovss %xmm0, (%esp) +; AVX-X86-NEXT: calll roundevenf +; AVX-X86-NEXT: popl %eax +; AVX-X86-NEXT: .cfi_def_cfa_offset 4 +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: froundeven32: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: pushq %rax +; AVX-X64-NEXT: .cfi_def_cfa_offset 16 +; AVX-X64-NEXT: callq roundevenf +; AVX-X64-NEXT: popq %rax +; AVX-X64-NEXT: .cfi_def_cfa_offset 8 +; AVX-X64-NEXT: retq + %res = call float @llvm.experimental.constrained.roundeven.f32( + float %f, metadata !"fpexcept.strict") #0 + ret float %res +} + +define double @froundevenf64(double %f) #0 { +; SSE41-X86-LABEL: froundevenf64: +; SSE41-X86: # %bb.0: +; SSE41-X86-NEXT: subl $8, %esp +; SSE41-X86-NEXT: .cfi_def_cfa_offset 12 +; SSE41-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE41-X86-NEXT: movsd %xmm0, (%esp) +; SSE41-X86-NEXT: calll roundeven +; SSE41-X86-NEXT: addl $8, %esp +; SSE41-X86-NEXT: .cfi_def_cfa_offset 4 +; SSE41-X86-NEXT: retl +; +; SSE41-X64-LABEL: froundevenf64: +; SSE41-X64: # %bb.0: +; SSE41-X64-NEXT: pushq %rax +; SSE41-X64-NEXT: .cfi_def_cfa_offset 16 +; SSE41-X64-NEXT: callq roundeven +; SSE41-X64-NEXT: popq %rax +; SSE41-X64-NEXT: .cfi_def_cfa_offset 8 +; SSE41-X64-NEXT: retq +; +; AVX-X86-LABEL: froundevenf64: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: subl $8, %esp +; AVX-X86-NEXT: .cfi_def_cfa_offset 12 +; AVX-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-X86-NEXT: vmovsd %xmm0, (%esp) +; AVX-X86-NEXT: calll roundeven +; AVX-X86-NEXT: addl $8, %esp +; AVX-X86-NEXT: .cfi_def_cfa_offset 4 +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: froundevenf64: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: pushq %rax +; AVX-X64-NEXT: .cfi_def_cfa_offset 16 +; AVX-X64-NEXT: callq roundeven +; AVX-X64-NEXT: popq %rax +; AVX-X64-NEXT: .cfi_def_cfa_offset 8 +; AVX-X64-NEXT: retq + %res = call double @llvm.experimental.constrained.roundeven.f64( + double %f, metadata !"fpexcept.strict") #0 + ret double %res +} + attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll index b705c760287ed..d2be7fb68900e 100644 --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -913,6 +913,47 @@ entry: ret fp128 %round } +define fp128 @roundeven(fp128 %x) nounwind strictfp { +; CHECK-LABEL: roundeven: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq roundevenl +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq +; +; X86-LABEL: roundeven: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $20, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl $12, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %eax +; X86-NEXT: calll roundevenl +; X86-NEXT: addl $28, %esp +; X86-NEXT: movl (%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) +; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: 
addl $20, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl $4 +entry: + %roundeven = call fp128 @llvm.experimental.constrained.roundeven.f128(fp128 %x, metadata !"fpexcept.strict") #0 + ret fp128 %roundeven +} + define fp128 @sin(fp128 %x) nounwind strictfp { ; CHECK-LABEL: sin: ; CHECK: # %bb.0: # %entry @@ -1409,6 +1450,7 @@ declare fp128 @llvm.experimental.constrained.pow.f128(fp128, fp128, metadata, me declare fp128 @llvm.experimental.constrained.powi.f128(fp128, i32, metadata, metadata) declare fp128 @llvm.experimental.constrained.rint.f128(fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.round.f128(fp128, metadata) +declare fp128 @llvm.experimental.constrained.roundeven.f128(fp128, metadata) declare fp128 @llvm.experimental.constrained.sin.f128(fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.sqrt.f128(fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.trunc.f128(fp128, metadata) diff --git a/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll b/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll new file mode 100644 index 0000000000000..c199352d14239 --- /dev/null +++ b/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll @@ -0,0 +1,657 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -O3 | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O3 | FileCheck %s --check-prefixes=CHECK,X64 + +define x86_fp80 @fma(x86_fp80 %x, x86_fp80 %y, x86_fp80 %z) nounwind strictfp { +; X86-LABEL: fma: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $36, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll fmal +; X86-NEXT: addl $36, %esp +; X86-NEXT: retl +; +; X64-LABEL: fma: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $56, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq fmal +; X64-NEXT: addq $56, %rsp +; X64-NEXT: retq +entry: + %fma = call x86_fp80 @llvm.experimental.constrained.fma.f80(x86_fp80 %x, x86_fp80 %y, x86_fp80 %z, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret x86_fp80 %fma +} + +define x86_fp80 @frem(x86_fp80 %x, x86_fp80 %y) nounwind strictfp { +; X86-LABEL: frem: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $24, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll fmodl +; X86-NEXT: addl $24, %esp +; X86-NEXT: retl +; +; X64-LABEL: frem: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $40, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq fmodl +; X64-NEXT: addq $40, %rsp +; X64-NEXT: retq +entry: + %div = call x86_fp80 @llvm.experimental.constrained.frem.f80(x86_fp80 %x, x86_fp80 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret x86_fp80 %div +} + +define x86_fp80 @ceil(x86_fp80 %x) nounwind strictfp { +; X86-LABEL: ceil: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: 
wait +; X86-NEXT: calll ceill +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; X64-LABEL: ceil: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq ceill +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %ceil = call x86_fp80 @llvm.experimental.constrained.ceil.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0 + ret x86_fp80 %ceil +} + +define x86_fp80 @cos(x86_fp80 %x) nounwind strictfp { +; X86-LABEL: cos: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll cosl +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; X64-LABEL: cos: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq cosl +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %cos = call x86_fp80 @llvm.experimental.constrained.cos.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret x86_fp80 %cos +} + +define x86_fp80 @exp(x86_fp80 %x) nounwind strictfp { +; X86-LABEL: exp: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll expl +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; X64-LABEL: exp: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq expl +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %exp = call x86_fp80 @llvm.experimental.constrained.exp.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret x86_fp80 %exp +} + +define x86_fp80 @exp2(x86_fp80 %x) nounwind strictfp { +; X86-LABEL: exp2: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll exp2l +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; X64-LABEL: exp2: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq exp2l +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %exp2 = call x86_fp80 @llvm.experimental.constrained.exp2.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret x86_fp80 %exp2 +} + +define x86_fp80 @floor(x86_fp80 %x) nounwind strictfp { +; X86-LABEL: floor: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll floorl +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; X64-LABEL: floor: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq floorl +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %floor = call x86_fp80 @llvm.experimental.constrained.floor.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0 + ret x86_fp80 %floor +} + +define x86_fp80 @log(x86_fp80 %x) nounwind strictfp { +; X86-LABEL: log: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll logl +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; X64-LABEL: log: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; 
X64-NEXT: callq logl +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %log = call x86_fp80 @llvm.experimental.constrained.log.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret x86_fp80 %log +} + +define x86_fp80 @log10(x86_fp80 %x) nounwind strictfp { +; X86-LABEL: log10: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll log10l +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; X64-LABEL: log10: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq log10l +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %log10 = call x86_fp80 @llvm.experimental.constrained.log10.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret x86_fp80 %log10 +} + +define x86_fp80 @log2(x86_fp80 %x) nounwind strictfp { +; X86-LABEL: log2: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll log2l +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; X64-LABEL: log2: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq log2l +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %log2 = call x86_fp80 @llvm.experimental.constrained.log2.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret x86_fp80 %log2 +} + +define x86_fp80 @maxnum(x86_fp80 %x, x86_fp80 %y) nounwind strictfp { +; X86-LABEL: maxnum: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $24, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll fmaxl +; X86-NEXT: addl $24, %esp +; X86-NEXT: retl +; +; X64-LABEL: maxnum: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $40, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq fmaxl +; X64-NEXT: addq $40, %rsp +; X64-NEXT: retq +entry: + %maxnum = call x86_fp80 @llvm.experimental.constrained.maxnum.f80(x86_fp80 %x, x86_fp80 %y, metadata !"fpexcept.strict") #0 + ret x86_fp80 %maxnum +} + +define x86_fp80 @minnum(x86_fp80 %x, x86_fp80 %y) nounwind strictfp { +; X86-LABEL: minnum: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $24, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll fminl +; X86-NEXT: addl $24, %esp +; X86-NEXT: retl +; +; X64-LABEL: minnum: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $40, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq fminl +; X64-NEXT: addq $40, %rsp +; X64-NEXT: retq +entry: + %minnum = call x86_fp80 @llvm.experimental.constrained.minnum.f80(x86_fp80 %x, x86_fp80 %y, metadata !"fpexcept.strict") #0 + ret x86_fp80 %minnum +} + +define x86_fp80 @nearbyint(x86_fp80 %x) nounwind strictfp { +; X86-LABEL: nearbyint: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll nearbyintl +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; 
+; X64-LABEL: nearbyint: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq nearbyintl +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %nearbyint = call x86_fp80 @llvm.experimental.constrained.nearbyint.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret x86_fp80 %nearbyint +} + +define x86_fp80 @pow(x86_fp80 %x, x86_fp80 %y) nounwind strictfp { +; X86-LABEL: pow: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $24, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll powl +; X86-NEXT: addl $24, %esp +; X86-NEXT: retl +; +; X64-LABEL: pow: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $40, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq powl +; X64-NEXT: addq $40, %rsp +; X64-NEXT: retq +entry: + %pow = call x86_fp80 @llvm.experimental.constrained.pow.f80(x86_fp80 %x, x86_fp80 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret x86_fp80 %pow +} + +define x86_fp80 @powi(x86_fp80 %x, i32 %y) nounwind strictfp { +; X86-LABEL: powi: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $16, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: wait +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll __powixf2 +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: powi: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq __powixf2 +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %powi = call x86_fp80 @llvm.experimental.constrained.powi.f80(x86_fp80 %x, i32 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret x86_fp80 %powi +} + +define x86_fp80 @rint(x86_fp80 %x) nounwind strictfp { +; X86-LABEL: rint: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll rintl +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; X64-LABEL: rint: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq rintl +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %rint = call x86_fp80 @llvm.experimental.constrained.rint.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret x86_fp80 %rint +} + +define x86_fp80 @round(x86_fp80 %x) nounwind strictfp { +; X86-LABEL: round: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll roundl +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; X64-LABEL: round: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq roundl +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %round = call x86_fp80 @llvm.experimental.constrained.round.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0 + ret x86_fp80 %round +} + +define x86_fp80 @roundeven(x86_fp80 %x) nounwind strictfp { +; X86-LABEL: roundeven: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt 
{{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll roundevenl +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; X64-LABEL: roundeven: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq roundevenl +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %roundeven = call x86_fp80 @llvm.experimental.constrained.roundeven.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0 + ret x86_fp80 %roundeven +} + +define x86_fp80 @sin(x86_fp80 %x) nounwind strictfp { +; X86-LABEL: sin: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll sinl +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; X64-LABEL: sin: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq sinl +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %sin = call x86_fp80 @llvm.experimental.constrained.sin.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret x86_fp80 %sin +} + +define x86_fp80 @trunc(x86_fp80 %x) nounwind strictfp { +; X86-LABEL: trunc: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll truncl +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; X64-LABEL: trunc: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq truncl +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %trunc = call x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0 + ret x86_fp80 %trunc +} + +define i32 @lrint(x86_fp80 %x) nounwind strictfp { +; X86-LABEL: lrint: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll lrintl +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; X64-LABEL: lrint: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq lrintl +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %rint = call i32 @llvm.experimental.constrained.lrint.i32.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret i32 %rint +} + +define i64 @llrint(x86_fp80 %x) nounwind strictfp { +; X86-LABEL: llrint: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll llrintl +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; X64-LABEL: llrint: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq llrintl +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %rint = call i64 @llvm.experimental.constrained.llrint.i64.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret i64 %rint +} + +define i32 @lround(x86_fp80 %x) nounwind strictfp { +; X86-LABEL: lround: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll lroundl +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; X64-LABEL: lround: +; X64: # %bb.0: # %entry +; X64-NEXT: 
subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq lroundl +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %round = call i32 @llvm.experimental.constrained.lround.i32.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0 + ret i32 %round +} + +define i64 @llround(x86_fp80 %x) nounwind strictfp { +; X86-LABEL: llround: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll llroundl +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; X64-LABEL: llround: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq llroundl +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %round = call i64 @llvm.experimental.constrained.llround.i64.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0 + ret i64 %round +} + +attributes #0 = { strictfp } + +declare x86_fp80 @llvm.experimental.constrained.fma.f80(x86_fp80, x86_fp80, x86_fp80, metadata, metadata) +declare x86_fp80 @llvm.experimental.constrained.frem.f80(x86_fp80, x86_fp80, metadata, metadata) +declare x86_fp80 @llvm.experimental.constrained.ceil.f80(x86_fp80, metadata) +declare x86_fp80 @llvm.experimental.constrained.cos.f80(x86_fp80, metadata, metadata) +declare x86_fp80 @llvm.experimental.constrained.exp.f80(x86_fp80, metadata, metadata) +declare x86_fp80 @llvm.experimental.constrained.exp2.f80(x86_fp80, metadata, metadata) +declare x86_fp80 @llvm.experimental.constrained.floor.f80(x86_fp80, metadata) +declare x86_fp80 @llvm.experimental.constrained.log.f80(x86_fp80, metadata, metadata) +declare x86_fp80 @llvm.experimental.constrained.log10.f80(x86_fp80, metadata, metadata) +declare x86_fp80 @llvm.experimental.constrained.log2.f80(x86_fp80, metadata, metadata) +declare x86_fp80 @llvm.experimental.constrained.maxnum.f80(x86_fp80, x86_fp80, metadata) +declare x86_fp80 @llvm.experimental.constrained.minnum.f80(x86_fp80, x86_fp80, metadata) +declare x86_fp80 @llvm.experimental.constrained.nearbyint.f80(x86_fp80, metadata, metadata) +declare x86_fp80 @llvm.experimental.constrained.pow.f80(x86_fp80, x86_fp80, metadata, metadata) +declare x86_fp80 @llvm.experimental.constrained.powi.f80(x86_fp80, i32, metadata, metadata) +declare x86_fp80 @llvm.experimental.constrained.rint.f80(x86_fp80, metadata, metadata) +declare x86_fp80 @llvm.experimental.constrained.round.f80(x86_fp80, metadata) +declare x86_fp80 @llvm.experimental.constrained.roundeven.f80(x86_fp80, metadata) +declare x86_fp80 @llvm.experimental.constrained.sin.f80(x86_fp80, metadata, metadata) +declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata) +declare i32 @llvm.experimental.constrained.lrint.i32.f80(x86_fp80, metadata, metadata) +declare i64 @llvm.experimental.constrained.llrint.i64.f80(x86_fp80, metadata, metadata) +declare i32 @llvm.experimental.constrained.lround.i32.f80(x86_fp80, metadata) +declare i64 @llvm.experimental.constrained.llround.i64.f80(x86_fp80, metadata) From 3da1a9634eb9d92b5ffa2571215c350a9641d07b Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Sat, 11 Jul 2020 10:50:34 -0700 Subject: [PATCH 0094/1035] [Statepoints] Support lowering gc relocations to virtual registers (Disabled under flag for the moment) This is part of a larger project wherein we are finally integrating lowering of gc live operands with the register allocator. Today, we force spill all operands in SelectionDAG. 
The code to do so is distinctly non-optimal. The approach this patch is
working towards is to instead lower the relocations directly into the MI
form, and let the register allocator pick which ones get spilled and which
stack slots they get spilled to. In terms of performance, the latter part
is actually more important as it avoids redundant shuffling of values
between stack slots.

This particular change adds ISEL support to produce the variadic def
STATEPOINT form required by the above. In particular, the first N gc
pointers are lowered to variadic tied def/use pairs. So the new statepoint
looks like this:

reloc1,reloc2,... = STATEPOINT ..., base1, derived1, base2, derived2, ...

N is limited by the maximal number of tied registers a machine instruction
can have (15 at the moment).

The current patch is restricted to handling relocations within a single
basic block. Cross-block relocations (e.g. invokes) are handled via the
legacy mechanism. This restriction will be relaxed in future patches.

Patch By: dantrushin

Differential Revision: https://reviews.llvm.org/D81648
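For illustration, here is a minimal sketch of the MIR this produces for a
single relocated pointer, adapted from the test_relocate case in the
statepoint-vreg.ll test added below (the register classes and the leading
ID/flag operands are specific to that test, not part of the general form):

  %0:gr64 = COPY $rdi
  ; %1 is the relocated value of %0: the trailing %0(tied-def 0) use is
  ; tied to def index 0 by the manual tieOperands() loop in InstrEmitter.
  %1:gr64 = STATEPOINT 0, 0, 0, @return_i1, 2, 0, 2, 0, 2, 0, %0, %0(tied-def 0), csr_64, implicit-def $rsp, implicit-def $ssp, implicit-def $al
  %2:gr8 = COPY $al

After register allocation the relocated value simply lives wherever the
allocator put it, a register or a spill slot, instead of a stack slot
forced at ISEL time.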
---
 .../lib/CodeGen/SelectionDAG/InstrEmitter.cpp |  40 +
 .../SelectionDAG/ScheduleDAGSDNodes.cpp       |   3 +-
 .../SelectionDAG/StatepointLowering.cpp       | 132 ++-
 .../CodeGen/SelectionDAG/StatepointLowering.h |   4 +
 llvm/lib/CodeGen/TargetLoweringBase.cpp       |  12 +-
 llvm/test/CodeGen/X86/statepoint-vreg.ll      | 907 ++++++++++++++++++
 6 files changed, 1074 insertions(+), 24 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/statepoint-vreg.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 0e4e99214aa24..ff84fdd62075c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -82,6 +82,28 @@ static unsigned countOperands(SDNode *Node, unsigned NumExpUses,
   return N;
 }
 
+/// Return starting index of GC operand list.
+// FIXME: need a better place for this. Put it in StackMaps?
+static unsigned getStatepointGCArgStartIdx(MachineInstr *MI) {
+  assert(MI->getOpcode() == TargetOpcode::STATEPOINT &&
+         "STATEPOINT node expected");
+  unsigned OperIdx = StatepointOpers(MI).getNumDeoptArgsIdx();
+  unsigned NumDeopts = MI->getOperand(OperIdx).getImm();
+  // At this point stack references have not been lowered yet, so they
+  // take a single operand.
+  ++OperIdx;
+  while (NumDeopts--) {
+    MachineOperand &MO = MI->getOperand(OperIdx);
+    if (MO.isImm() && MO.getImm() == StackMaps::ConstantOp) {
+      ++OperIdx;
+      assert(MI->getOperand(OperIdx).isImm() &&
+             "Unexpected statepoint operand");
+    }
+    ++OperIdx;
+  }
+  return OperIdx;
+}
+
 /// EmitCopyFromReg - Generate machine code for a CopyFromReg node or an
 /// implicit physical register output.
 void InstrEmitter::
@@ -200,6 +222,8 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
   bool HasVRegVariadicDefs = !MF->getTarget().usesPhysRegsForValues() &&
                              II.isVariadic() && II.variadicOpsAreDefs();
   unsigned NumVRegs = HasVRegVariadicDefs ? NumResults : II.getNumDefs();
+  if (Node->getMachineOpcode() == TargetOpcode::STATEPOINT)
+    NumVRegs = NumResults;
   for (unsigned i = 0; i < NumVRegs; ++i) {
     // If the specific node value is only used by a CopyToReg and the dest reg
     // is a vreg in the same register class, use the CopyToReg'd destination
@@ -821,6 +845,8 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
       NumDefs = NumResults;
     }
     ScratchRegs = TLI->getScratchRegisters((CallingConv::ID) CC);
+  } else if (Opc == TargetOpcode::STATEPOINT) {
+    NumDefs = NumResults;
   }
 
   unsigned NumImpUses = 0;
@@ -970,6 +996,20 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
   if (!UsedRegs.empty() || II.getImplicitDefs() || II.hasOptionalDef())
     MIB->setPhysRegsDeadExcept(UsedRegs, *TRI);
 
+  // STATEPOINT is too 'dynamic' to have a meaningful machine description.
+  // We have to manually tie operands.
+  if (Opc == TargetOpcode::STATEPOINT && NumDefs > 0) {
+    assert(!HasPhysRegOuts && "STATEPOINT mishandled");
+    MachineInstr *MI = MIB;
+    unsigned Def = 0;
+    unsigned Use = getStatepointGCArgStartIdx(MI) + 1;
+    while (Def < NumDefs) {
+      if (MI->getOperand(Use).isReg())
+        MI->tieOperands(Def++, Use);
+      Use += 2;
+    }
+  }
+
   // Run post-isel target hook to adjust this instruction if needed.
   if (II.hasPostISelHook())
     TLI->AdjustInstrPostInstrSelection(*MIB, Node);
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 17c68f2bf73b3..76fb0340a7a88 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -125,8 +125,7 @@ static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op,
     PhysReg = Reg;
   } else if (Def->isMachineOpcode()) {
     const MCInstrDesc &II = TII->get(Def->getMachineOpcode());
-    if (ResNo >= II.getNumDefs() &&
-        II.ImplicitDefs[ResNo - II.getNumDefs()] == Reg)
+    if (ResNo >= II.getNumDefs() && II.hasImplicitDefOfPhysReg(Reg))
       PhysReg = Reg;
   }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index 2cb57c1d1ccc8..b8c4c73bcccee 100644
--- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -67,6 +67,10 @@ cl::opt<bool> UseRegistersForDeoptValues(
     "use-registers-for-deopt-values", cl::Hidden, cl::init(false),
     cl::desc("Allow using registers for non pointer deopt args"));
 
+cl::opt<unsigned> MaxRegistersForGCPointers(
+    "max-registers-for-gc-values", cl::Hidden, cl::init(0),
+    cl::desc("Max number of VRegs allowed to pass GC pointer meta args in"));
+
 static void pushStackMapConstant(SmallVectorImpl<SDValue>& Ops,
                                  SelectionDAGBuilder &Builder, uint64_t Value) {
   SDLoc L = Builder.getCurSDLoc();
@@ -86,11 +90,13 @@ void StatepointLoweringState::startNewStatepoint(SelectionDAGBuilder &Builder) {
   // FunctionLoweringInfo.  Also need to ensure used bits get cleared.
   AllocatedStackSlots.clear();
   AllocatedStackSlots.resize(Builder.FuncInfo.StatepointStackSlots.size());
+  DerivedPtrMap.clear();
 }
 
 void StatepointLoweringState::clear() {
   Locations.clear();
   AllocatedStackSlots.clear();
+  DerivedPtrMap.clear();
   assert(PendingGCRelocateCalls.empty() &&
          "cleared before statepoint sequence completed");
 }
@@ -221,7 +227,6 @@ static Optional<int> findPreviousSpillSlot(const Value *Val,
   return None;
 }
 
-
 /// Return true if-and-only-if the given SDValue can be lowered as either a
 /// constant argument or a stack reference.  The key point is that the value
 /// doesn't need to be spilled or tracked as a vreg use.
@@ -242,7 +247,6 @@ static bool willLowerDirectly(SDValue Incoming) {
          Incoming.isUndef());
 }
 
-
 /// Try to find existing copies of the incoming values in stack slots used for
 /// statepoint spilling.  If we can find a spill slot for the incoming value,
 /// mark that slot as allocated, and reuse the same slot for this safepoint.
@@ -388,7 +392,7 @@ spillIncomingStatepointValue(SDValue Incoming, SDValue Chain,
                                                              StoreMMO);
 
     MMO = getMachineMemOperand(MF, *cast<FrameIndexSDNode>(Loc));
-
+
     Builder.StatepointLowering.setLocation(Incoming, Loc);
   }
 
@@ -485,7 +489,9 @@ lowerIncomingStatepointValue(SDValue Incoming, bool RequireSpillSlot,
 /// will be set to the last value spilled (if any were).
 static void
 lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
-                        SmallVectorImpl<MachineMemOperand*> &MemRefs, SelectionDAGBuilder::StatepointLoweringInfo &SI,
+                        SmallVectorImpl<MachineMemOperand*> &MemRefs,
+                        DenseMap<SDValue, int> &LowerAsVReg,
+                        SelectionDAGBuilder::StatepointLoweringInfo &SI,
                         SelectionDAGBuilder &Builder) {
   // Lower the deopt and gc arguments for this statepoint.  Layout will be:
   // deopt argument length, deopt arguments.., gc arguments...
@@ -531,6 +537,37 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
   const bool LiveInDeopt =
     SI.StatepointFlags & (uint64_t)StatepointFlags::DeoptLiveIn;
 
+  // Decide which derived pointers will go on VRegs
+  const unsigned MaxTiedRegs = 15; // Max number of tied regs MI can have.
+  unsigned MaxVRegPtrs =
+      std::min(MaxTiedRegs, MaxRegistersForGCPointers.getValue());
+  // Use old spill scheme for cross-block relocates.
+  if (SI.StatepointInstr) {
+    const BasicBlock *BB = SI.StatepointInstr->getParent();
+    bool NonLocalReloc =
+        llvm::any_of(SI.GCRelocates, [BB](const GCRelocateInst *R) {
+          return R->getParent() != BB;
+        });
+    if (NonLocalReloc)
+      MaxVRegPtrs = 0;
+  }
+
+  LLVM_DEBUG(dbgs() << "Deciding how to lower GC Pointers:\n");
+  unsigned CurNumVRegs = 0;
+  for (const Value *P : SI.Ptrs) {
+    if (LowerAsVReg.size() == MaxVRegPtrs)
+      break;
+    SDValue PtrSD = Builder.getValue(P);
+    if (willLowerDirectly(PtrSD) || P->getType()->isVectorTy()) {
+      LLVM_DEBUG(dbgs() << "direct/spill "; PtrSD.dump(&Builder.DAG));
+      continue;
+    }
+    LLVM_DEBUG(dbgs() << "vreg "; PtrSD.dump(&Builder.DAG));
+    LowerAsVReg[PtrSD] = CurNumVRegs++;
+  }
+  LLVM_DEBUG(dbgs() << LowerAsVReg.size()
+                    << " derived pointers will go in vregs\n");
+
   auto isGCValue = [&](const Value *V) {
     auto *Ty = V->getType();
     if (!Ty->isPtrOrPtrVectorTy())
@@ -542,7 +579,9 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
   };
 
   auto requireSpillSlot = [&](const Value *V) {
-    return !(LiveInDeopt || UseRegistersForDeoptValues) || isGCValue(V);
+    if (isGCValue(V))
+      return !LowerAsVReg.count(Builder.getValue(V));
+    return !(LiveInDeopt || UseRegistersForDeoptValues);
   };
 
   // Before we actually start lowering (and allocating spill slots for values),
@@ -554,9 +593,14 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
     if (requireSpillSlot(V))
       reservePreviousStackSlotForValue(V, Builder);
   }
+
   for (unsigned i = 0; i < SI.Bases.size(); ++i) {
-    reservePreviousStackSlotForValue(SI.Bases[i], Builder);
-    reservePreviousStackSlotForValue(SI.Ptrs[i], Builder);
+    SDValue SDV = Builder.getValue(SI.Bases[i]);
+    if (!LowerAsVReg.count(SDV))
+      reservePreviousStackSlotForValue(SI.Bases[i], Builder);
+    SDV = Builder.getValue(SI.Ptrs[i]);
+    if (!LowerAsVReg.count(SDV))
+      reservePreviousStackSlotForValue(SI.Ptrs[i], Builder);
   }
 
   // First, prefix the list with the number of unique values to be
@@ -567,6 +611,7 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
   // The vm state arguments are lowered in an opaque manner.  We do not know
   // what type of values are contained within.
+  LLVM_DEBUG(dbgs() << "Lowering deopt state\n");
   for (const Value *V : SI.DeoptState) {
     SDValue Incoming;
     // If this is a function argument at a static frame index, generate it as
     // the frame index.
@@ -578,6 +623,8 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
     }
     if (!Incoming.getNode())
       Incoming = Builder.getValue(V);
+    LLVM_DEBUG(dbgs() << "Value " << *V
+                      << " requireSpillSlot = " << requireSpillSlot(V) << "\n");
     lowerIncomingStatepointValue(Incoming, requireSpillSlot(V), Ops, MemRefs,
                                  Builder);
   }
@@ -588,14 +635,15 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
   // its (lowered) derived pointer, i.e.
   // (base[0], ptr[0], base[1], ptr[1], ...)
   for (unsigned i = 0; i < SI.Bases.size(); ++i) {
-    const Value *Base = SI.Bases[i];
-    lowerIncomingStatepointValue(Builder.getValue(Base),
-                                 /*RequireSpillSlot*/ true, Ops, MemRefs,
+    bool RequireSpillSlot;
+    SDValue Base = Builder.getValue(SI.Bases[i]);
+    RequireSpillSlot = !LowerAsVReg.count(Base);
+    lowerIncomingStatepointValue(Base, RequireSpillSlot, Ops, MemRefs,
                                  Builder);
 
-    const Value *Ptr = SI.Ptrs[i];
-    lowerIncomingStatepointValue(Builder.getValue(Ptr),
-                                 /*RequireSpillSlot*/ true, Ops, MemRefs,
+    SDValue Derived = Builder.getValue(SI.Ptrs[i]);
+    RequireSpillSlot = !LowerAsVReg.count(Derived);
+    lowerIncomingStatepointValue(Derived, RequireSpillSlot, Ops, MemRefs,
                                  Builder);
   }
 
@@ -630,7 +678,9 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
     SDValue SDV = Builder.getValue(V);
     SDValue Loc = Builder.StatepointLowering.getLocation(SDV);
 
-    if (Loc.getNode()) {
+    if (LowerAsVReg.count(SDV)) {
+      SpillMap[V] = None;
+    } else if (Loc.getNode()) {
       SpillMap[V] = cast<FrameIndexSDNode>(Loc)->getIndex();
     } else {
       // Record value as visited, but not spilled. This is the case for allocas
@@ -665,6 +715,7 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
   assert(SI.Bases.size() == SI.Ptrs.size() &&
          SI.Ptrs.size() <= SI.GCRelocates.size());
 
+  LLVM_DEBUG(dbgs() << "Lowering statepoint " << *SI.StatepointInstr << "\n");
 #ifndef NDEBUG
   for (auto *Reloc : SI.GCRelocates)
     if (Reloc->getParent() == SI.StatepointInstr->getParent())
@@ -674,7 +725,9 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
   // Lower statepoint vmstate and gcstate arguments
   SmallVector<SDValue, 10> LoweredMetaArgs;
   SmallVector<MachineMemOperand*, 16> MemRefs;
-  lowerStatepointMetaArgs(LoweredMetaArgs, MemRefs, SI, *this);
+  // Maps derived pointer SDValue to statepoint result of relocated pointer.
+  DenseMap<SDValue, int> LowerAsVReg;
+  lowerStatepointMetaArgs(LoweredMetaArgs, MemRefs, LowerAsVReg, SI, *this);
 
   // Now that we've emitted the spills, we need to update the root so that the
   // call sequence is ordered correctly.
@@ -788,14 +841,35 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
   // Compute return values.  Provide a glue output since we consume one as
   // input.  This allows someone else to chain off us as needed.
-  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<EVT, 8> NodeTys;
+  for (auto &Ptr : SI.Ptrs) {
+    SDValue SD = getValue(Ptr);
+    if (LowerAsVReg.count(SD)) {
+      NodeTys.push_back(SD.getValueType());
+    }
+  }
+  LLVM_DEBUG(dbgs() << "Statepoint has " << NodeTys.size() << " results\n");
+  assert(NodeTys.size() == LowerAsVReg.size() && "Inconsistent GC Ptr lowering");
+  NodeTys.push_back(MVT::Other);
+  NodeTys.push_back(MVT::Glue);
+  unsigned NumResults = NodeTys.size();
   MachineSDNode *StatepointMCNode =
       DAG.getMachineNode(TargetOpcode::STATEPOINT, getCurSDLoc(), NodeTys, Ops);
   DAG.setNodeMemRefs(StatepointMCNode, MemRefs);
 
   SDNode *SinkNode = StatepointMCNode;
 
+  // Fill mapping from derived pointer to statepoint result denoting its
+  // relocated value.
+  auto &DPtrMap = StatepointLowering.DerivedPtrMap;
+  for (const auto *Relocate : SI.GCRelocates) {
+    Value *Derived = Relocate->getDerivedPtr();
+    SDValue SD = getValue(Derived);
+    if (LowerAsVReg.count(SD))
+      DPtrMap[Derived] = SDValue(StatepointMCNode, LowerAsVReg[SD]);
+  }
+
   // Build the GC_TRANSITION_END node if necessary.
   //
   // See the comment above regarding GC_TRANSITION_START for the layout of
@@ -804,7 +878,7 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
     SmallVector<SDValue, 8> TEOps;
 
     // Add chain
-    TEOps.push_back(SDValue(StatepointMCNode, 0));
+    TEOps.push_back(SDValue(StatepointMCNode, NumResults - 2));
 
     // Add GC transition arguments
     for (const Value *V : SI.GCTransitionArgs) {
@@ -814,7 +888,7 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
     }
 
     // Add glue
-    TEOps.push_back(SDValue(StatepointMCNode, 1));
+    TEOps.push_back(SDValue(StatepointMCNode, NumResults - 1));
 
     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
 
@@ -825,7 +899,12 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
   }
 
   // Replace original call
-  DAG.ReplaceAllUsesWith(CallNode, SinkNode); // This may update Root
+  // Call: ch,glue = CALL ...
+  // Statepoint: [gc relocates],ch,glue = STATEPOINT ...
+  unsigned NumSinkValues = SinkNode->getNumValues();
+  SDValue StatepointValues[2] = {SDValue(SinkNode, NumSinkValues - 2),
+                                 SDValue(SinkNode, NumSinkValues - 1)};
+  DAG.ReplaceAllUsesWith(CallNode, StatepointValues);
 
   // Remove original call node
   DAG.DeleteNode(CallNode);
@@ -927,7 +1006,7 @@ SelectionDAGBuilder::LowerStatepoint(const GCStatepointInst &I,
     setValue(&I, ReturnValue);
     return;
   }
-
+
   // Result value will be used in a different basic block so we need to export
   // it now.  Default exporting mechanism will not work here because statepoint
   // call has a different type than the actual call.  It means that by default
@@ -1010,12 +1089,13 @@ void SelectionDAGBuilder::visitGCResult(const GCResultInst &CI) {
 }
 
 void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
+  const BasicBlock *StatepointBB = Relocate.getStatepoint()->getParent();
 #ifndef NDEBUG
   // Consistency check
   // We skip this check for relocates not in the same basic block as their
   // statepoint. It would be too expensive to preserve validation info through
   // different basic blocks.
-  if (Relocate.getStatepoint()->getParent() == Relocate.getParent())
+  if (StatepointBB == Relocate.getParent())
     StatepointLowering.relocCallVisited(Relocate);
 
   auto *Ty = Relocate.getType()->getScalarType();
@@ -1033,6 +1113,16 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
     return;
   }
 
+  // Relocate is local to statepoint block and its pointer was assigned
+  // to VReg. Use corresponding statepoint result.
+  auto &DPtrMap = StatepointLowering.DerivedPtrMap;
+  auto It = DPtrMap.find(DerivedPtr);
+  if (It != DPtrMap.end()) {
+    setValue(&Relocate, It->second);
+    assert(Relocate.getParent() == StatepointBB && "unexpected DPtrMap entry");
+    return;
+  }
+
   auto &SpillMap = FuncInfo.StatepointSpillMaps[Relocate.getStatepoint()];
   auto SlotIt = SpillMap.find(DerivedPtr);
   assert(SlotIt != SpillMap.end() && "Relocating not lowered gc value");
diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.h b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.h
index 634ef87f3840e..d6c18379c5add 100644
--- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.h
+++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.h
@@ -103,6 +103,10 @@ class StatepointLoweringState {
     return AllocatedStackSlots.test(Offset);
   }

+  /// For each statepoint keep mapping from original derived pointer to
+  /// the statepoint node result defining its new value.
+  DenseMap<const Value *, SDValue> DerivedPtrMap;
+
 private:
   /// Maps pre-relocation value (gc pointer directly incoming into statepoint)
   /// into its location (currently only stack slots)
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 2c94c2c62e5f0..db4fcf7494c7f 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1041,9 +1041,19 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI,
   // Inherit previous memory operands.
   MIB.cloneMemRefs(*MI);

-  for (auto &MO : MI->operands()) {
+  for (unsigned i = 0; i < MI->getNumOperands(); ++i) {
+    MachineOperand &MO = MI->getOperand(i);
     if (!MO.isFI()) {
+      // Index of the Def operand this Use is tied to.
+      // Since Defs come before Uses, if the Use is tied, then the
+      // index of the Def must be smaller than the index of that Use.
+      // Also, Defs preserve their position in the new MI.
+      unsigned TiedTo = i;
+      if (MO.isReg() && MO.isTied())
+        TiedTo = MI->findTiedOperandIdx(i);
       MIB.add(MO);
+      if (TiedTo < i)
+        MIB->tieOperands(TiedTo, MIB->getNumOperands() - 1);
       continue;
     }
diff --git a/llvm/test/CodeGen/X86/statepoint-vreg.ll b/llvm/test/CodeGen/X86/statepoint-vreg.ll
new file mode 100644
index 0000000000000..bb86e9e1f1cfb
--- /dev/null
+++ b/llvm/test/CodeGen/X86/statepoint-vreg.ll
@@ -0,0 +1,907 @@
+; This run is to demonstrate what MIR SSA looks like.
+; RUN: llc -max-registers-for-gc-values=4 -stop-after finalize-isel < %s | FileCheck --check-prefix=CHECK-VREG %s
+; This run is to demonstrate register allocator work.
+; RUN: llc -max-registers-for-gc-values=4 -stop-after virtregrewriter < %s | FileCheck --check-prefix=CHECK-PREG %s
+; This run is to demonstrate resulting assembly/stackmaps.
+; NOTE: When D81647 is landed this run line will need to be adjusted!
+; RUN: llc -max-registers-for-gc-values=4 < %s | FileCheck --check-prefix=CHECK-ASM %s + +target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" + +declare i1 @return_i1() +declare void @func() +declare void @consume(i32 addrspace(1)*) +declare void @consume2(i32 addrspace(1)*, i32 addrspace(1)*) +declare void @consume5(i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*) +declare void @use1(i32 addrspace(1)*, i8 addrspace(1)*) + +; test most simple relocate +define i1 @test_relocate(i32 addrspace(1)* %a) gc "statepoint-example" { +; CHECK-VREG-LABEL: name: test_relocate +; CHECK-VREG: %0:gr64 = COPY $rdi +; CHECK-VREG: %1:gr64 = STATEPOINT 0, 0, 0, @return_i1, 2, 0, 2, 0, 2, 0, %0, %0(tied-def 0), csr_64, implicit-def $rsp, implicit-def $ssp, implicit-def $al +; CHECK-VREG: %2:gr8 = COPY $al +; CHECK-VREG: $rdi = COPY %1 +; CHECK-VREG: CALL64pcrel32 @consume, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + +; CHECK-PREG-LABEL: name: test_relocate +; CHECK-PREG: renamable $rbx = COPY $rdi +; CHECK-PREG: renamable $rbx = STATEPOINT 0, 0, 0, @return_i1, 2, 0, 2, 0, 2, 0, killed renamable $rbx, renamable $rbx(tied-def 0), csr_64, implicit-def $rsp, implicit-def $ssp, implicit-def $al +; CHECK-PREG: renamable $bpl = COPY killed $al +; CHECK-PREG: $rdi = COPY killed renamable $rbx +; CHECK-PREG: CALL64pcrel32 @consume, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + +; CHECK-ASM-LABEL: test_relocate: +; CHECK-ASM: # %bb.0: +; CHECK-ASM-NEXT: pushq %rbp +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ASM-NEXT: pushq %rbx +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 24 +; CHECK-ASM-NEXT: pushq %rax +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 32 +; CHECK-ASM-NEXT: .cfi_offset %rbx, -24 +; CHECK-ASM-NEXT: .cfi_offset %rbp, -16 +; CHECK-ASM-NEXT: movq %rdi, %rbx +; CHECK-ASM-NEXT: callq return_i1 +; CHECK-ASM-NEXT: .Ltmp0: +; CHECK-ASM-NEXT: movl %eax, %ebp +; CHECK-ASM-NEXT: movq %rbx, %rdi +; CHECK-ASM-NEXT: callq consume +; CHECK-ASM-NEXT: movl %ebp, %eax +; CHECK-ASM-NEXT: addq $8, %rsp +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 24 +; CHECK-ASM-NEXT: popq %rbx +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ASM-NEXT: popq %rbp +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 8 +; CHECK-ASM-NEXT: retq +entry: + %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* %a)] + %rel1 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 0) + %res1 = call zeroext i1 @llvm.experimental.gc.result.i1(token %safepoint_token) + call void @consume(i32 addrspace(1)* %rel1) + ret i1 %res1 +} +; test pointer variables intermixed with pointer constants +define void @test_mixed(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) gc "statepoint-example" { +; CHECK-VREG-LABEL: name: test_mixed +; CHECK-VREG: %2:gr64 = COPY $rdx +; CHECK-VREG: %1:gr64 = COPY $rsi +; CHECK-VREG: %0:gr64 = COPY $rdi +; CHECK-VREG: %3:gr64, %4:gr64, %5:gr64 = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, %2, %2(tied-def 0), 2, 0, 2, 0, %1, %1(tied-def 1), %0, %0(tied-def 2), csr_64 +; CHECK-VREG: %6:gr32 = MOV32r0 implicit-def dead $eflags +; CHECK-VREG: %7:gr64 = SUBREG_TO_REG 0, killed %6, %subreg.sub_32bit +; CHECK-VREG: $rdi = COPY %5 +; CHECK-VREG: $rsi = COPY %7 +; CHECK-VREG: $rdx = COPY %4 +; CHECK-VREG: $rcx = COPY %7 +; CHECK-VREG: $r8 = COPY %3 +; CHECK-VREG: CALL64pcrel32 @consume5, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit $rsi, implicit $rdx, implicit $rcx, implicit $r8, implicit-def $rsp, implicit-def $ssp + +; CHECK-PREG-LABEL: name: test_mixed +; CHECK-PREG: renamable $r14 = COPY $rdx +; CHECK-PREG: renamable $r15 = COPY $rsi +; CHECK-PREG: renamable $rbx = COPY $rdi +; CHECK-PREG: renamable $r14, renamable $r15, renamable $rbx = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, killed renamable $r14, renamable $r14(tied-def 0), 2, 0, 2, 0, killed renamable $r15, renamable $r15(tied-def 1), killed renamable $rbx, renamable $rbx(tied-def 2), csr_64, implicit-def $rsp, implicit-def $ssp +; CHECK-PREG: $rdi = COPY killed renamable $rbx +; CHECK-PREG: dead $esi = MOV32r0 implicit-def dead $eflags, implicit-def $rsi +; CHECK-PREG: $rdx = COPY killed renamable $r15 +; CHECK-PREG: dead $ecx = MOV32r0 implicit-def dead $eflags, implicit-def $rcx +; CHECK-PREG: $r8 = COPY killed renamable $r14 +; CHECK-PREG: CALL64pcrel32 @consume5, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit $rsi, implicit $rdx, implicit killed $rcx, implicit killed $r8, implicit-def $rsp, implicit-def $ssp + +; CHECK-ASM-LABEL: test_mixed: +; CHECK-ASM: # %bb.0: # %entry +; CHECK-ASM-NEXT: pushq %r15 +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ASM-NEXT: pushq %r14 +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 24 +; CHECK-ASM-NEXT: pushq %rbx +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 32 +; CHECK-ASM-NEXT: .cfi_offset %rbx, -32 +; CHECK-ASM-NEXT: .cfi_offset %r14, -24 +; CHECK-ASM-NEXT: .cfi_offset %r15, -16 +; CHECK-ASM-NEXT: movq %rdx, %r14 +; CHECK-ASM-NEXT: movq %rsi, %r15 +; CHECK-ASM-NEXT: movq %rdi, %rbx +; CHECK-ASM-NEXT: callq func +; CHECK-ASM-NEXT:.Ltmp1: +; CHECK-ASM-NEXT: movq %rbx, %rdi +; CHECK-ASM-NEXT: xorl %esi, %esi +; CHECK-ASM-NEXT: movq %r15, %rdx +; CHECK-ASM-NEXT: xorl %ecx, %ecx +; CHECK-ASM-NEXT: movq %r14, %r8 +; CHECK-ASM-NEXT: callq consume5 +; CHECK-ASM-NEXT: popq %rbx +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 24 +; CHECK-ASM-NEXT: popq %r14 +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ASM-NEXT: popq %r15 +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 8 +; CHECK-ASM-NEXT: retq +entry: + %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @func, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* %a, i32 addrspace(1)* null, i32 addrspace(1)* %b, i32 addrspace(1)* null, i32 addrspace(1)* %c)] + %rel1 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 0) + %rel2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 1, i32 1) + %rel3 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 2, i32 2) + %rel4 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 3, i32 3) + %rel5 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 4, i32 4) + call void @consume5(i32 addrspace(1)* %rel1, i32 addrspace(1)* %rel2, i32 addrspace(1)* %rel3, i32 addrspace(1)* %rel4, i32 addrspace(1)* %rel5) + ret void +} + +; same as above, but for alloca +define i32 addrspace(1)* @test_alloca(i32 addrspace(1)* %ptr) gc "statepoint-example" { +; CHECK-VREG-LABEL: name: test_alloca +; CHECK-VREG: %0:gr64 = COPY $rdi +; CHECK-VREG: MOV64mr %stack.0.alloca, 1, $noreg, 0, $noreg, %0 :: (store 8 into %ir.alloca) +; CHECK-VREG: %1:gr64 = STATEPOINT 0, 0, 0, @return_i1, 2, 0, 2, 0, 2, 0, %0, %0(tied-def 0), 0, %stack.0.alloca, 0, csr_64, implicit-def $rsp, implicit-def $ssp, implicit-def $al :: (volatile load store 8 on %stack.0.alloca) +; CHECK-VREG: %2:gr8 = COPY $al +; CHECK-VREG: %3:gr64 = MOV64rm %stack.0.alloca, 1, $noreg, 0, $noreg :: (dereferenceable load 8 from %ir.alloca) +; CHECK-VREG: $rdi = COPY %1 +; CHECK-VREG: CALL64pcrel32 @consume, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + +; CHECK-PREG-LABEL: name: test_alloca +; CHECK-PREG: renamable $rbx = COPY $rdi +; CHECK-PREG: MOV64mr %stack.0.alloca, 1, $noreg, 0, $noreg, renamable $rbx :: (store 8 into %ir.alloca) +; CHECK-PREG: renamable $rbx = STATEPOINT 0, 0, 0, @return_i1, 2, 0, 2, 0, 2, 0, killed renamable $rbx, renamable $rbx(tied-def 0), 0, %stack.0.alloca, 0, csr_64, implicit-def $rsp, implicit-def $ssp, implicit-def dead $al :: (volatile load store 8 on %stack.0.alloca) +; CHECK-PREG: renamable $r14 = MOV64rm %stack.0.alloca, 1, $noreg, 0, $noreg :: (dereferenceable load 8 from %ir.alloca) +; CHECK-PREG: $rdi = COPY killed renamable $rbx +; CHECK-PREG: CALL64pcrel32 @consume, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + +; CHECK-ASM-LABEL: test_alloca: +; CHECK-ASM: # %bb.0: # %entry +; CHECK-ASM-NEXT: pushq %r14 +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ASM-NEXT: pushq %rbx +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 24 +; CHECK-ASM-NEXT: pushq %rax +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 32 +; CHECK-ASM-NEXT: .cfi_offset %rbx, -24 +; CHECK-ASM-NEXT: .cfi_offset %r14, -16 +; CHECK-ASM-NEXT: movq %rdi, %rbx +; CHECK-ASM-NEXT: movq %rdi, (%rsp) +; CHECK-ASM-NEXT: callq return_i1 +; CHECK-ASM-NEXT: .Ltmp2: +; CHECK-ASM-NEXT: movq (%rsp), %r14 +; CHECK-ASM-NEXT: movq %rbx, %rdi +; CHECK-ASM-NEXT: callq consume +; CHECK-ASM-NEXT: movq %r14, %rax +; CHECK-ASM-NEXT: addq $8, %rsp +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 24 +; CHECK-ASM-NEXT: popq %rbx +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ASM-NEXT: popq %r14 +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 8 +; CHECK-ASM-NEXT: retq +entry: + %alloca = alloca i32 addrspace(1)*, align 8 + store i32 addrspace(1)* %ptr, i32 addrspace(1)** %alloca + %safepoint_token = call token (i64, i32, i1 
()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)** %alloca, i32 addrspace(1)* %ptr)] + %rel1 = load i32 addrspace(1)*, i32 addrspace(1)** %alloca + %rel2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 1, i32 1) + call void @consume(i32 addrspace(1)* %rel2) + ret i32 addrspace(1)* %rel1 +} + +; test base != derived +define void @test_base_derived(i32 addrspace(1)* %base, i32 addrspace(1)* %derived) gc "statepoint-example" { +; CHECK-VREG-LABEL: name: test_base_derived +; CHECK-VREG: %1:gr64 = COPY $rsi +; CHECK-VREG: %0:gr64 = COPY $rdi +; CHECK-VREG: MOV64mr %stack.0, 1, $noreg, 0, $noreg, %0 :: (store 8 into %stack.0) +; CHECK-VREG: %2:gr64 = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 1, 8, %stack.0, 0, %1(tied-def 0), csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store 8 on %stack.0) +; CHECK-VREG: $rdi = COPY %2 +; CHECK-VREG: CALL64pcrel32 @consume, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + +; CHECK-PREG-LABEL: name: test_base_derived +; CHECK-PREG: renamable $rbx = COPY $rsi +; CHECK-PREG: MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $rdi :: (store 8 into %stack.0) +; CHECK-PREG: renamable $rbx = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 1, 8, %stack.0, 0, killed renamable $rbx(tied-def 0), csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store 8 on %stack.0) +; CHECK-PREG: $rdi = COPY killed renamable $rbx +; CHECK-PREG: CALL64pcrel32 @consume, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + +; CHECK-ASM-LABEL: test_base_derived: +; CHECK-ASM: # %bb.0: +; CHECK-ASM-NEXT: pushq %rbx +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ASM-NEXT: subq $16, %rsp +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 32 +; CHECK-ASM-NEXT: .cfi_offset %rbx, -16 +; CHECK-ASM-NEXT: movq %rsi, %rbx +; CHECK-ASM-NEXT: movq %rdi, 8(%rsp) +; CHECK-ASM-NEXT: callq func +; CHECK-ASM-NEXT: .Ltmp3: +; CHECK-ASM-NEXT: movq %rbx, %rdi +; CHECK-ASM-NEXT: callq consume +; CHECK-ASM-NEXT: addq $16, %rsp +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ASM-NEXT: popq %rbx +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 8 +; CHECK-ASM-NEXT: retq + %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @func, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* %base, i32 addrspace(1)* %derived)] + %reloc = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 1) + call void @consume(i32 addrspace(1)* %reloc) + ret void +} + +; deopt GC pointer not present in GC args must be spilled +define void @test_deopt_gcpointer(i32 addrspace(1)* %a, i32 addrspace(1)* %b) gc "statepoint-example" { +; CHECK-VREG-LABEL: name: test_deopt_gcpointer +; CHECK-VREG: %1:gr64 = COPY $rsi +; CHECK-VREG: %0:gr64 = COPY $rdi +; CHECK-VREG: MOV64mr %stack.0, 1, $noreg, 0, $noreg, %0 :: (store 8 into %stack.0) +; CHECK-VREG: %2:gr64 = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 1, 1, 8, %stack.0, 0, %1, %1(tied-def 0), csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store 8 on %stack.0) +; CHECK-VREG: $rdi = COPY %2 +; CHECK-VREG: CALL64pcrel32 @consume, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp +; CHECK-VREG: RET 0 + +; CHECK-PREG-LABEL: name: test_deopt_gcpointer +; CHECK-PREG: renamable $rbx = COPY $rsi +; CHECK-PREG: MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $rdi :: (store 8 into %stack.0) +; CHECK-PREG: renamable $rbx = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 1, 1, 8, %stack.0, 0, killed renamable $rbx, renamable $rbx(tied-def 0), csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store 8 on %stack.0) +; CHECK-PREG: $rdi = COPY killed renamable $rbx +; CHECK-PREG: CALL64pcrel32 @consume, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + +; CHECK-ASM-LABEL: test_deopt_gcpointer: +; CHECK-ASM: # %bb.0: +; CHECK-ASM-NEXT: pushq %rbx +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ASM-NEXT: subq $16, %rsp +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 32 +; CHECK-ASM-NEXT: .cfi_offset %rbx, -16 +; CHECK-ASM-NEXT: movq %rsi, %rbx +; CHECK-ASM-NEXT: movq %rdi, 8(%rsp) +; CHECK-ASM-NEXT: callq func +; CHECK-ASM-NEXT: .Ltmp4: +; CHECK-ASM-NEXT: movq %rbx, %rdi +; CHECK-ASM-NEXT: callq consume +; CHECK-ASM-NEXT: addq $16, %rsp +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ASM-NEXT: popq %rbx +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 8 +; CHECK-ASM-NEXT: retq + %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @func, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 addrspace(1)* %a), "gc-live" (i32 addrspace(1)* %b)] + %rel = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 0) + call void @consume(i32 addrspace(1)* %rel) + ret void +} + +;; Two gc.relocates of the same input, should require only a single spill/fill +define void @test_gcrelocate_uniqueing(i32 addrspace(1)* %ptr) gc "statepoint-example" { +; CHECK-VREG-LABEL: name: test_gcrelocate_uniqueing +; CHECK-VREG: %0:gr64 = COPY $rdi +; CHECK-VREG: %1:gr64 = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 2, %0, 2, 4278124286, %0, %0(tied-def 0), csr_64, implicit-def $rsp, implicit-def $ssp +; CHECK-VREG: $rdi = COPY %1 +; CHECK-VREG: $rsi = COPY %1 +; CHECK-VREG: CALL64pcrel32 @consume2, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit $rsi, implicit-def $rsp, implicit-def $ssp + +; CHECK-PREG-LABEL: name: test_gcrelocate_uniqueing +; CHECK-PREG: renamable $rbx = COPY $rdi +; CHECK-PREG: renamable $rbx = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 2, killed renamable $rbx, 2, 4278124286, renamable $rbx, renamable $rbx(tied-def 0), csr_64, implicit-def $rsp, implicit-def $ssp +; CHECK-PREG: $rdi = COPY renamable $rbx +; CHECK-PREG: $rsi = COPY killed renamable $rbx +; CHECK-PREG: CALL64pcrel32 @consume2, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit killed $rsi, implicit-def $rsp, implicit-def $ssp + +; CHECK-ASM-LABEL: test_gcrelocate_uniqueing: +; CHECK-ASM: # %bb.0: +; CHECK-ASM-NEXT: pushq %rbx +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ASM-NEXT: .cfi_offset %rbx, -16 +; CHECK-ASM-NEXT: movq %rdi, %rbx +; CHECK-ASM-NEXT: callq func +; CHECK-ASM-NEXT: .Ltmp5: +; CHECK-ASM-NEXT: movq %rbx, %rdi +; CHECK-ASM-NEXT: movq %rbx, %rsi +; CHECK-ASM-NEXT: callq consume2 +; CHECK-ASM-NEXT: popq %rbx +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 8 +; CHECK-ASM-NEXT: retq + %tok = tail call token (i64, i32, void ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @func, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 addrspace(1)* %ptr, i32 undef), "gc-live" (i32 addrspace(1)* %ptr, i32 addrspace(1)* %ptr)] + %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %tok, i32 0, i32 0) + %b = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %tok, i32 1, i32 1) + call void @consume2(i32 addrspace(1)* %a, i32 addrspace(1)* %b) + ret void +} + +; Two gc.relocates of a bitcasted pointer should only require a single spill/fill +define void @test_gcptr_uniqueing(i32 addrspace(1)* %ptr) gc "statepoint-example" { +; CHECK-VREG-LABEL: name: test_gcptr_uniqueing +; CHECK-VREG: %0:gr64 = COPY $rdi +; CHECK-VREG: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp +; CHECK-VREG: %1:gr64 = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 2, %0, 2, 4278124286, %0, %0(tied-def 0), csr_64, implicit-def $rsp, implicit-def $ssp +; CHECK-VREG: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp +; CHECK-VREG: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp +; CHECK-VREG: $rdi = COPY %1 +; CHECK-VREG: $rsi = COPY %1 +; CHECK-VREG: CALL64pcrel32 @use1, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit $rsi, implicit-def $rsp, implicit-def $ssp + +; CHECK-PREG-LABEL: name: test_gcptr_uniqueing +; CHECK-PREG: renamable $rbx = COPY $rdi +; CHECK-PREG: renamable $rbx = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 2, killed renamable $rbx, 2, 4278124286, renamable $rbx, renamable $rbx(tied-def 0), csr_64, implicit-def $rsp, implicit-def $ssp +; CHECK-PREG: $rdi = COPY renamable $rbx +; CHECK-PREG: $rsi = COPY killed renamable $rbx +; CHECK-PREG: CALL64pcrel32 @use1, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit killed $rsi, implicit-def $rsp, implicit-def $ssp + +; CHECK-ASM-LABEL: test_gcptr_uniqueing: +; CHECK-ASM: # %bb.0: +; CHECK-ASM-NEXT: pushq %rbx +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ASM-NEXT: .cfi_offset %rbx, -16 +; CHECK-ASM-NEXT: movq %rdi, %rbx +; CHECK-ASM-NEXT: callq func +; CHECK-ASM-NEXT: .Ltmp6: +; CHECK-ASM-NEXT: movq %rbx, %rdi +; CHECK-ASM-NEXT: movq %rbx, %rsi +; CHECK-ASM-NEXT: callq use1 +; CHECK-ASM-NEXT: popq %rbx +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 8 +; CHECK-ASM-NEXT: retq + %ptr2 = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)* + %tok = tail call token (i64, i32, void ()*, i32, i32, ...) + @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @func, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 addrspace(1)* %ptr, i32 undef), "gc-live" (i32 addrspace(1)* %ptr, i8 addrspace(1)* %ptr2)] + %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %tok, i32 0, i32 0) + %b = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %tok, i32 1, i32 1) + call void @use1(i32 addrspace(1)* %a, i8 addrspace(1)* %b) + ret void +} + +; +; Cross-basicblock relocates are handled with spilling for now. 
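+; A relocate outside the statepoint's own block cannot use the statepoint's
+; vreg results (DerivedPtrMap is consulted only for same-block relocates),
+; so the value is exported via a stack slot instead.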
+; No need to check post-RA output +define i1 @test_cross_bb(i32 addrspace(1)* %a, i1 %external_cond) gc "statepoint-example" { +; CHECK-VREG-LABEL: name: test_cross_bb +; CHECK-VREG: bb.0.entry: +; CHECK-VREG: %1:gr32 = COPY $esi +; CHECK-VREG-NEXT: %0:gr64 = COPY $rdi +; CHECK-VREG-NEXT: %3:gr8 = COPY %1.sub_8bit +; CHECK-VREG-NEXT: MOV64mr %stack.0, 1, $noreg, 0, $noreg, %0 :: (store 8 into %stack.0) +; CHECK-VREG-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp +; CHECK-VREG-NEXT: STATEPOINT 0, 0, 0, @return_i1, 2, 0, 2, 0, 2, 0, 1, 8, %stack.0, 0, 1, 8, %stack.0, 0, csr_64, implicit-def $rsp, implicit-def $ssp, implicit-def $al :: (volatile load store 8 on %stack.0) +; CHECK-VREG-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp +; CHECK-VREG-NEXT: %4:gr8 = COPY $al +; CHECK-VREG-NEXT: %2:gr8 = COPY %4 +; CHECK-VREG-NEXT: TEST8ri killed %3, 1, implicit-def $eflags +; CHECK-VREG-NEXT: JCC_1 %bb.2, 4, implicit $eflags +; CHECK-VREG-NEXT: JMP_1 %bb.1 +; CHECK-VREG: bb.1.left: +; CHECK-VREG-NEXT: %6:gr64 = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load 8 from %stack.0) +; CHECK-VREG-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp +; CHECK-VREG-NEXT: $rdi = COPY %6 +; CHECK-VREG-NEXT: CALL64pcrel32 @consume, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp +; CHECK-VREG-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp +; CHECK-VREG-NEXT: $al = COPY %2 +; CHECK-VREG-NEXT: RET 0, $al +; CHECK-VREG: bb.2.right: +; CHECK-VREG-NEXT: %5:gr8 = MOV8ri 1 +; CHECK-VREG-NEXT: $al = COPY %5 +; CHECK-VREG-NEXT: RET 0, $al + +entry: + %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* %a)] + br i1 %external_cond, label %left, label %right + +left: + %call1 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 0) + %call2 = call zeroext i1 @llvm.experimental.gc.result.i1(token %safepoint_token) + call void @consume(i32 addrspace(1)* %call1) + ret i1 %call2 + +right: + ret i1 true +} + +; No need to check post-regalloc output as it is the same +define i1 @duplicate_reloc() gc "statepoint-example" { +; CHECK-VREG-LABEL: name: duplicate_reloc +; CHECK-VREG: bb.0.entry: +; CHECK-VREG: STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, csr_64, implicit-def $rsp, implicit-def $ssp +; CHECK-VREG: STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, csr_64, implicit-def $rsp, implicit-def $ssp +; CHECK-VREG: %0:gr8 = MOV8ri 1 +; CHECK-VREG: $al = COPY %0 +; CHECK-VREG: RET 0, $al + +; CHECK-ASM-LABEL: duplicate_reloc: +; CHECK-ASM: # %bb.0: # %entry +; CHECK-ASM-NEXT: pushq %rax +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ASM-NEXT: callq func +; CHECK-ASM-NEXT: .Ltmp8: +; CHECK-ASM-NEXT: callq func +; CHECK-ASM-NEXT: .Ltmp9: +; CHECK-ASM-NEXT: movb $1, %al +; CHECK-ASM-NEXT: popq %rcx +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 8 +; CHECK-ASM-NEXT: retq +entry: + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @func, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* null, i32 addrspace(1)* null)] + %base = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 0) + %derived = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 1) + %safepoint_token2 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @func, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* %base, i32 addrspace(1)* %derived)] + %base_reloc = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token2, i32 0, i32 0) + %derived_reloc = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token2, i32 0, i32 1) + %cmp1 = icmp eq i32 addrspace(1)* %base_reloc, null + %cmp2 = icmp eq i32 addrspace(1)* %derived_reloc, null + %cmp = and i1 %cmp1, %cmp2 + ret i1 %cmp +} + +; Vectors cannot go in VRegs +; No need to check post-regalloc output as it is lowered using old scheme +define <2 x i8 addrspace(1)*> @test_vector(<2 x i8 addrspace(1)*> %obj) gc "statepoint-example" { +; CHECK-VREG-LABEL: name: test_vector +; CHECK-VREG: %0:vr128 = COPY $xmm0 +; CHECK-VREG: MOVAPSmr %stack.0, 1, $noreg, 0, $noreg, %0 :: (store 16 into %stack.0) +; CHECK-VREG: STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 1, 16, %stack.0, 0, 1, 16, %stack.0, 0, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store 16 on %stack.0) +; CHECK-VREG: %1:vr128 = MOVAPSrm %stack.0, 1, $noreg, 0, $noreg :: (load 16 from %stack.0) +; CHECK-VREG: $xmm0 = COPY %1 +; CHECK-VREG: RET 0, $xmm0 + +; CHECK-ASM-LABEL: test_vector: +; CHECK-ASM: # %bb.0: # %entry +; CHECK-ASM-NEXT: subq $24, %rsp +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 32 +; CHECK-ASM-NEXT: movaps %xmm0, (%rsp) +; CHECK-ASM-NEXT: callq func +; CHECK-ASM-NEXT: .Ltmp10: +; CHECK-ASM-NEXT: movaps (%rsp), %xmm0 +; CHECK-ASM-NEXT: addq $24, %rsp +; CHECK-ASM-NEXT: .cfi_def_cfa_offset 8 +; CHECK-ASM-NEXT: retq +entry: + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @func, i32 0, i32 0, i32 0, i32 0) ["gc-live" (<2 x i8 addrspace(1)*> %obj)] + %obj.relocated = call coldcc <2 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v2p1i8(token %safepoint_token, i32 0, i32 0) ; (%obj, %obj) + ret <2 x i8 addrspace(1)*> %obj.relocated +} + + +; test limit on amount of vregs +define void @test_limit(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c, i32 addrspace(1)* %d, i32 addrspace(1)* %e) gc "statepoint-example" { +; CHECK-VREG-LABEL: name: test_limit +; CHECK-VREG: %4:gr64 = COPY $r8 +; CHECK-VREG: %3:gr64 = COPY $rcx +; CHECK-VREG: %2:gr64 = COPY $rdx +; CHECK-VREG: %1:gr64 = COPY $rsi +; CHECK-VREG: %0:gr64 = COPY $rdi +; CHECK-VREG: MOV64mr %stack.0, 1, $noreg, 0, $noreg, %0 :: (store 8 into %stack.0) +; CHECK-VREG: %5:gr64, %6:gr64, %7:gr64, %8:gr64 = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, %4, %4(tied-def 0), %3, %3(tied-def 1), %2, %2(tied-def 2), %1, %1(tied-def 3), 1, 8, %stack.0, 0, 1, 8, %stack.0, 0, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store 8 on %stack.0) +; CHECK-VREG: %9:gr64 = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load 8 from %stack.0) +; CHECK-VREG: $rdi = COPY %9 +; CHECK-VREG: $rsi = COPY %8 +; CHECK-VREG: $rdx = COPY %7 +; CHECK-VREG: $rcx = COPY %6 +; CHECK-VREG: $r8 = COPY %5 +; CHECK-VREG: CALL64pcrel32 @consume5, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit $rsi, implicit $rdx, implicit $rcx, implicit $r8, implicit-def $rsp, implicit-def $ssp +; CHECK-VREG: RET 0 +entry: + %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @func, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c, i32 addrspace(1)* %d, i32 addrspace(1)* %e)] + %rel1 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 0, i32 0) + %rel2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 1, i32 1) + %rel3 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 2, i32 2) + %rel4 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 3, i32 3) + %rel5 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 4, i32 4) + call void @consume5(i32 addrspace(1)* %rel1, i32 addrspace(1)* %rel2, i32 addrspace(1)* %rel3, i32 addrspace(1)* %rel4, i32 addrspace(1)* %rel5) + ret void +} + +declare token @llvm.experimental.gc.statepoint.p0f_i1f(i64, i32, i1 ()*, i32, i32, ...) +declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...) 
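+; The two i32 arguments of gc.relocate below are indices into the
+; statepoint's "gc-live" bundle: the first selects the base pointer and
+; the second the derived pointer being relocated.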
+declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32) +declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32, i32) +declare <2 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v2p1i8(token, i32, i32) +declare i1 @llvm.experimental.gc.result.i1(token) + +; CHECK-ASM-LABEL: .section .llvm_stackmaps +; CHECK-ASM-NEXT: __LLVM_StackMaps: +; Entry for test_relocate +; CHECK-ASM: .quad 0 +; CHECK-ASM-NEXT: .long .Ltmp0-test_relocate +; CHECK-ASM-NEXT: .short 0 +; Num locations +; CHECK-ASM-NEXT: .short 5 +; Location 1 Constant 0 +; CHECK-ASM-NEXT: .byte 4 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 2 Constant 0 +; CHECK-ASM-NEXT: .byte 4 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 3 Constant 0 +; CHECK-ASM-NEXT: .byte 4 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 4 Register $rbx +; CHECK-ASM-NEXT: .byte 1 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 3 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 5 Register $rbx +; CHECK-ASM-NEXT: .byte 1 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 3 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Entry for test_mixed +; CHECK-ASM: .quad 0 +; CHECK-ASM-NEXT: .long .Ltmp1-test_mixed +; CHECK-ASM-NEXT: .short 0 +; Num locations +; CHECK-ASM-NEXT: .short 11 +; Location 1 Constant 0 +; CHECK-ASM-NEXT: .byte 4 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 2 Constant 0 +; CHECK-ASM-NEXT: .byte 4 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 3 Constant 0 +; CHECK-ASM-NEXT: .byte 4 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 4 Register $r14 +; CHECK-ASM-NEXT: .byte 1 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 14 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 5 Register $r14 +; CHECK-ASM-NEXT: .byte 1 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 14 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 6 Constant 0 +; CHECK-ASM-NEXT: .byte 4 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 7 Constant 0 +; CHECK-ASM-NEXT: .byte 4 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 8 Register $r15 +; CHECK-ASM-NEXT: .byte 1 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 15 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 9 Register $r15 +; CHECK-ASM-NEXT: .byte 1 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 15 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 10 Register $rbx +; CHECK-ASM-NEXT: .byte 1 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 3 +; CHECK-ASM-NEXT: 
.short 0
+; CHECK-ASM-NEXT: .long 0
+; Location 11 Register $rbx
+; CHECK-ASM-NEXT: .byte 1
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 3
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 0
+; Entry for test_alloca
+; CHECK-ASM: .quad 0
+; CHECK-ASM-NEXT: .long .Ltmp2-test_alloca
+; CHECK-ASM-NEXT: .short 0
+; Num locations
+; CHECK-ASM-NEXT: .short 6
+; Location 1 Constant 0
+; CHECK-ASM-NEXT: .byte 4
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 0
+; Location 2 Constant 0
+; CHECK-ASM-NEXT: .byte 4
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 0
+; Location 3 Constant 0
+; CHECK-ASM-NEXT: .byte 4
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 0
+; Location 4 Register $rbx
+; CHECK-ASM-NEXT: .byte 1
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 3
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 0
+; Location 5 Register $rbx
+; CHECK-ASM-NEXT: .byte 1
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 3
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 0
+; Location 6 Direct $rsp + 0
+; CHECK-ASM-NEXT: .byte 2
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 7
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 0
+; Entry for test_base_derived
+; CHECK-ASM: .quad 0
+; CHECK-ASM-NEXT: .long .Ltmp3-test_base_derived
+; CHECK-ASM-NEXT: .short 0
+; Num locations
+; CHECK-ASM-NEXT: .short 5
+; Location 1 Constant 0
+; CHECK-ASM-NEXT: .byte 4
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 0
+; Location 2 Constant 0
+; CHECK-ASM-NEXT: .byte 4
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 0
+; Location 3 Constant 0
+; CHECK-ASM-NEXT: .byte 4
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 0
+; Location 4 Indirect $rsp + 8
+; CHECK-ASM-NEXT: .byte 3
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 7
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 8
+; Location 5 Register $rbx
+; CHECK-ASM-NEXT: .byte 1
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 3
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 0
+; Entry for test_deopt_gcpointer
+; CHECK-ASM: .quad 0
+; CHECK-ASM-NEXT: .long .Ltmp4-test_deopt_gcpointer
+; CHECK-ASM-NEXT: .short 0
+; Num locations
+; CHECK-ASM-NEXT: .short 6
+; Location 1 Constant 0
+; CHECK-ASM-NEXT: .byte 4
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 0
+; Location 2 Constant 0
+; CHECK-ASM-NEXT: .byte 4
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 0
+; Location 3 Constant 1
+; CHECK-ASM-NEXT: .byte 4
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 1
+; Location 4 Indirect $rsp + 8
+; CHECK-ASM-NEXT: .byte 3
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+;
CHECK-ASM-NEXT: .short 7 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 8 +; Location 5 Register $rbx +; CHECK-ASM-NEXT: .byte 1 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 3 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 6 +; CHECK-ASM-NEXT: .byte 1 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 3 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Entry for test_gcrelocate_uniqueing +; CHECK-ASM: .quad 0 +; CHECK-ASM-NEXT: .long .Ltmp5-test_gcrelocate_uniqueing +; CHECK-ASM-NEXT: .short 0 +; Num locations +; CHECK-ASM-NEXT: .short 7 +; Location 1 Constant 0 +; CHECK-ASM-NEXT: .byte 4 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 2 Constant 0 +; CHECK-ASM-NEXT: .byte 4 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 3 Constant 2 +; CHECK-ASM-NEXT: .byte 4 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 2 +; Location 4 Register $rbx +; CHECK-ASM-NEXT: .byte 1 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 3 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 5 Constant Index 0 +; CHECK-ASM-NEXT: .byte 5 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 6 Register $rbx +; CHECK-ASM-NEXT: .byte 1 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 3 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 7 Register $rbx +; CHECK-ASM-NEXT: .byte 1 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 3 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Entry for test_gcptr_uniqueing +; CHECK-ASM: .long .Ltmp6-test_gcptr_uniqueing +; CHECK-ASM-NEXT: .short 0 +; Num locations +; CHECK-ASM-NEXT: .short 7 +; Location 1 Constant 0 +; CHECK-ASM-NEXT: .byte 4 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 2 Constant 0 +; CHECK-ASM-NEXT: .byte 4 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 3 Constant 2 +; CHECK-ASM-NEXT: .byte 4 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 2 +; Location 4 Register $rbx +; CHECK-ASM-NEXT: .byte 1 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 3 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 5 Constant Index 0 +; CHECK-ASM-NEXT: .byte 5 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 6 Register $rbx +; CHECK-ASM-NEXT: .byte 1 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 3 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Location 7 Register $rbx +; CHECK-ASM-NEXT: .byte 1 +; CHECK-ASM-NEXT: .byte 0 +; CHECK-ASM-NEXT: .short 8 +; CHECK-ASM-NEXT: .short 3 +; CHECK-ASM-NEXT: .short 0 +; CHECK-ASM-NEXT: .long 0 +; Entry for test_cross_bb +; CHECK-ASM: .quad 0 +; 
CHECK-ASM-NEXT: .long .Ltmp7-test_cross_bb
+; CHECK-ASM-NEXT: .short 0
+; Num locations
+; CHECK-ASM-NEXT: .short 5
+; Location 1 Constant 0
+; CHECK-ASM-NEXT: .byte 4
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 0
+; Location 2 Constant 0
+; CHECK-ASM-NEXT: .byte 4
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 0
+; Location 3 Constant 0
+; CHECK-ASM-NEXT: .byte 4
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 0
+; Location 4 Indirect $rsp + 0
+; CHECK-ASM-NEXT: .byte 3
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 7
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 0
+; Location 5 Indirect $rsp + 0
+; CHECK-ASM-NEXT: .byte 3
+; CHECK-ASM-NEXT: .byte 0
+; CHECK-ASM-NEXT: .short 8
+; CHECK-ASM-NEXT: .short 7
+; CHECK-ASM-NEXT: .short 0
+; CHECK-ASM-NEXT: .long 0

From 55ced04d6bc13fd0f9396a0cfc393b44378d8784 Mon Sep 17 00:00:00 2001
From: Frederik Gossen
Date: Sat, 25 Jul 2020 14:39:18 -0700
Subject: [PATCH 0095/1035] [MLIR][Shape] Allow `num_elements` to operate on extent tensors

Differential Revision: https://reviews.llvm.org/D84445
---
 mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td | 16 ++++++++--------
 mlir/test/Dialect/Shape/canonicalize.mlir      |  4 ++--
 mlir/test/Dialect/Shape/invalid.mlir           | 16 ++++++++++++++++
 mlir/test/Dialect/Shape/ops.mlir               | 11 +++++++++++
 mlir/test/Dialect/Shape/shape-to-shape.mlir    |  6 +++---
 5 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
index 797dc0bc0cb6a..abbd8f0931091 100644
--- a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
+++ b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
@@ -333,19 +333,19 @@ def Shape_NumElementsOp : Shape_Op<"num_elements", [NoSideEffect]> {
   let summary = "Returns the number of elements for a given shape";
   let description = [{
     Returns the number of elements for a given shape which is the product of its
-    dimensions.
-
-    ```mlir
-    %product = shape.mul %lhs, %rhs
-    ```
+    extents. If the argument is of type `shape` then the result will be of type
+    `size` and potential errors will be propagated. Otherwise, if the argument
+    is an extent tensor `tensor<?xindex>` then the result will be of type
+    `index`.
}]; - let arguments = (ins Shape_ShapeType:$shape); - let results = (outs Shape_SizeType:$result); + let arguments = (ins Shape_ShapeOrExtentTensorType:$shape); + let results = (outs Shape_SizeOrIndexType:$result); - let assemblyFormat = "$shape attr-dict"; + let assemblyFormat = "$shape `:` type($shape) `->` type($result) attr-dict"; let hasFolder = 1; + let verifier = [{ return ::verifySizeOrIndexOp(*this); }]; } def Shape_ReduceOp : Shape_Op<"reduce", diff --git a/mlir/test/Dialect/Shape/canonicalize.mlir b/mlir/test/Dialect/Shape/canonicalize.mlir index 577656a0b362a..e147fbeb81ac2 100644 --- a/mlir/test/Dialect/Shape/canonicalize.mlir +++ b/mlir/test/Dialect/Shape/canonicalize.mlir @@ -217,7 +217,7 @@ func @num_elements() -> !shape.size { // CHECK-NOT: shape.const_shape %shape = shape.const_shape [4, 5, 6] : !shape.shape // CHECK-NOT: shape.num_elements - %num_elements = shape.num_elements %shape + %num_elements = shape.num_elements %shape : !shape.shape -> !shape.size // CHECK: %[[NUM:.*]] = shape.const_size 120 // CHECK-NEXT: return %[[NUM]] : !shape.size return %num_elements : !shape.size @@ -229,7 +229,7 @@ func @num_elements() -> !shape.size { // CHECK-LABEL: func @nonfoldable_num_elements func @nonfoldable_num_elements(%shape : !shape.shape) -> !shape.size { // CHECK-NOT: shape.const_{{.*}} - %num_elements = shape.num_elements %shape + %num_elements = shape.num_elements %shape : !shape.shape -> !shape.size return %num_elements : !shape.size } diff --git a/mlir/test/Dialect/Shape/invalid.mlir b/mlir/test/Dialect/Shape/invalid.mlir index b4900e491fb82..0bbd6cec777dc 100644 --- a/mlir/test/Dialect/Shape/invalid.mlir +++ b/mlir/test/Dialect/Shape/invalid.mlir @@ -146,3 +146,19 @@ func @mul_error_possible(%lhs : !shape.size, %rhs : index) -> index { return %result : index } +// ----- + +func @num_elements_error_free(%arg : tensor) -> !shape.size { + // expected-error@+1 {{if none of the operands can hold error values then the result must be of type `index`}} + %result = shape.num_elements %arg : tensor -> !shape.size + return %result : !shape.size +} + +// ----- + +func @num_elements_error_possible(%arg : !shape.shape) -> index { + // expected-error@+1 {{if at least one of the operands can hold error values then the result must be of type `size` to propagate them}} + %result = shape.num_elements %arg : !shape.shape -> index + return %result : index +} + diff --git a/mlir/test/Dialect/Shape/ops.mlir b/mlir/test/Dialect/Shape/ops.mlir index 3a0cb7781ec72..f57826097d34f 100644 --- a/mlir/test/Dialect/Shape/ops.mlir +++ b/mlir/test/Dialect/Shape/ops.mlir @@ -195,3 +195,14 @@ func @any() { return } +func @num_elements_extent_tensor(%arg : tensor) -> index { + %result = shape.num_elements %arg : tensor -> index + return %result : index +} + +func @num_elements_shape(%arg : !shape.shape) -> !shape.size { + %result = shape.num_elements %arg : !shape.shape -> !shape.size + return %result : !shape.size +} + + diff --git a/mlir/test/Dialect/Shape/shape-to-shape.mlir b/mlir/test/Dialect/Shape/shape-to-shape.mlir index 9a75f0b9ca1bc..d1b00bc12a22c 100644 --- a/mlir/test/Dialect/Shape/shape-to-shape.mlir +++ b/mlir/test/Dialect/Shape/shape-to-shape.mlir @@ -1,9 +1,9 @@ // RUN: mlir-opt -shape-to-shape-lowering -split-input-file %s | FileCheck %s -// CHECK-LABEL: func @num_elements_to_reduce( -// CHECK-SAME: [[ARG:%.*]]: !shape.shape) -> !shape.size { +// CHECK-LABEL: func @num_elements_to_reduce +// CHECK-SAME: ([[ARG:%.*]]: !shape.shape) -> !shape.size func @num_elements_to_reduce(%shape : 
!shape.shape) -> !shape.size { - %num_elements = shape.num_elements %shape + %num_elements = shape.num_elements %shape : !shape.shape -> !shape.size return %num_elements : !shape.size } // CHECK: [[C1:%.*]] = shape.const_size 1 From 7bfecd773968668b17fddf3865b1d611325942a8 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Sat, 25 Jul 2020 14:47:57 -0700 Subject: [PATCH 0096/1035] Revert "[MLIR][Shape] Allow `num_elements` to operate on extent tensors" This reverts commit 55ced04d6bc13fd0f9396a0cfc393b44378d8784. Forgot to submit depend change first. --- mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td | 16 ++++++++-------- mlir/test/Dialect/Shape/canonicalize.mlir | 4 ++-- mlir/test/Dialect/Shape/invalid.mlir | 16 ---------------- mlir/test/Dialect/Shape/ops.mlir | 11 ----------- mlir/test/Dialect/Shape/shape-to-shape.mlir | 6 +++--- 5 files changed, 13 insertions(+), 40 deletions(-) diff --git a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td index abbd8f0931091..797dc0bc0cb6a 100644 --- a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td +++ b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td @@ -333,19 +333,19 @@ def Shape_NumElementsOp : Shape_Op<"num_elements", [NoSideEffect]> { let summary = "Returns the number of elements for a given shape"; let description = [{ Returns the number of elements for a given shape which is the product of its - extents. If the argument is of type `shape` then the result will be of type - `size` and potential errors will be propagated. Otherwise, if the argument - is and extent tensor `tensor` then the result will be of type - `index`. + dimensions. + + ```mlir + %product = shape.mul %lhs, %rhs + ``` }]; - let arguments = (ins Shape_ShapeOrExtentTensorType:$shape); - let results = (outs Shape_SizeOrIndexType:$result); + let arguments = (ins Shape_ShapeType:$shape); + let results = (outs Shape_SizeType:$result); - let assemblyFormat = "$shape `:` type($shape) `->` type($result) attr-dict"; + let assemblyFormat = "$shape attr-dict"; let hasFolder = 1; - let verifier = [{ return ::verifySizeOrIndexOp(*this); }]; } def Shape_ReduceOp : Shape_Op<"reduce", diff --git a/mlir/test/Dialect/Shape/canonicalize.mlir b/mlir/test/Dialect/Shape/canonicalize.mlir index e147fbeb81ac2..577656a0b362a 100644 --- a/mlir/test/Dialect/Shape/canonicalize.mlir +++ b/mlir/test/Dialect/Shape/canonicalize.mlir @@ -217,7 +217,7 @@ func @num_elements() -> !shape.size { // CHECK-NOT: shape.const_shape %shape = shape.const_shape [4, 5, 6] : !shape.shape // CHECK-NOT: shape.num_elements - %num_elements = shape.num_elements %shape : !shape.shape -> !shape.size + %num_elements = shape.num_elements %shape // CHECK: %[[NUM:.*]] = shape.const_size 120 // CHECK-NEXT: return %[[NUM]] : !shape.size return %num_elements : !shape.size @@ -229,7 +229,7 @@ func @num_elements() -> !shape.size { // CHECK-LABEL: func @nonfoldable_num_elements func @nonfoldable_num_elements(%shape : !shape.shape) -> !shape.size { // CHECK-NOT: shape.const_{{.*}} - %num_elements = shape.num_elements %shape : !shape.shape -> !shape.size + %num_elements = shape.num_elements %shape return %num_elements : !shape.size } diff --git a/mlir/test/Dialect/Shape/invalid.mlir b/mlir/test/Dialect/Shape/invalid.mlir index 0bbd6cec777dc..b4900e491fb82 100644 --- a/mlir/test/Dialect/Shape/invalid.mlir +++ b/mlir/test/Dialect/Shape/invalid.mlir @@ -146,19 +146,3 @@ func @mul_error_possible(%lhs : !shape.size, %rhs : index) -> index { return %result : index } -// ----- - -func 
@num_elements_error_free(%arg : tensor) -> !shape.size { - // expected-error@+1 {{if none of the operands can hold error values then the result must be of type `index`}} - %result = shape.num_elements %arg : tensor -> !shape.size - return %result : !shape.size -} - -// ----- - -func @num_elements_error_possible(%arg : !shape.shape) -> index { - // expected-error@+1 {{if at least one of the operands can hold error values then the result must be of type `size` to propagate them}} - %result = shape.num_elements %arg : !shape.shape -> index - return %result : index -} - diff --git a/mlir/test/Dialect/Shape/ops.mlir b/mlir/test/Dialect/Shape/ops.mlir index f57826097d34f..3a0cb7781ec72 100644 --- a/mlir/test/Dialect/Shape/ops.mlir +++ b/mlir/test/Dialect/Shape/ops.mlir @@ -195,14 +195,3 @@ func @any() { return } -func @num_elements_extent_tensor(%arg : tensor) -> index { - %result = shape.num_elements %arg : tensor -> index - return %result : index -} - -func @num_elements_shape(%arg : !shape.shape) -> !shape.size { - %result = shape.num_elements %arg : !shape.shape -> !shape.size - return %result : !shape.size -} - - diff --git a/mlir/test/Dialect/Shape/shape-to-shape.mlir b/mlir/test/Dialect/Shape/shape-to-shape.mlir index d1b00bc12a22c..9a75f0b9ca1bc 100644 --- a/mlir/test/Dialect/Shape/shape-to-shape.mlir +++ b/mlir/test/Dialect/Shape/shape-to-shape.mlir @@ -1,9 +1,9 @@ // RUN: mlir-opt -shape-to-shape-lowering -split-input-file %s | FileCheck %s -// CHECK-LABEL: func @num_elements_to_reduce -// CHECK-SAME: ([[ARG:%.*]]: !shape.shape) -> !shape.size +// CHECK-LABEL: func @num_elements_to_reduce( +// CHECK-SAME: [[ARG:%.*]]: !shape.shape) -> !shape.size { func @num_elements_to_reduce(%shape : !shape.shape) -> !shape.size { - %num_elements = shape.num_elements %shape : !shape.shape -> !shape.size + %num_elements = shape.num_elements %shape return %num_elements : !shape.size } // CHECK: [[C1:%.*]] = shape.const_size 1 From 5142448a5e2aeeffefb3aabdb48f19033025bc09 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Sat, 25 Jul 2020 14:55:19 -0700 Subject: [PATCH 0097/1035] [MLIR][Shape] Refactor verification Based on https://reviews.llvm.org/D84439 but less restrictive, else we don't allow shape_of to be able to produce a ranked output and doesn't allow for iterative refinement here. We can consider making it more restrictive later. 
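As an illustration (a sketch with a hypothetical function name, not taken
from the test suite), the relaxed verifier accepts a `shape_of` whose
error-free operand yields a ranked extent tensor:

```mlir
// Accepted after this change: the operand cannot hold an error value,
// so the result is not required to be of type `shape`.
func @shape_of_ranked(%arg : tensor<1x2x3xf32>) -> tensor<3xindex> {
  %0 = shape.shape_of %arg : tensor<1x2x3xf32> -> tensor<3xindex>
  return %0 : tensor<3xindex>
}
```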
--- .../include/mlir/Dialect/Shape/IR/ShapeOps.td | 8 +- mlir/lib/Dialect/Shape/IR/Shape.cpp | 82 ++++++------------- mlir/test/Dialect/Shape/invalid.mlir | 30 +------ 3 files changed, 31 insertions(+), 89 deletions(-) diff --git a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td index 797dc0bc0cb6a..8c32faee55f90 100644 --- a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td +++ b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td @@ -207,7 +207,7 @@ def Shape_RankOp : Shape_Op<"rank", [NoSideEffect]> { let hasFolder = 1; let hasCanonicalizer = 1; - let verifier = [{ return ::verify(*this); }]; + let verifier = [{ return ::verifySizeOrIndexOp(*this); }]; } def Shape_ToExtentTensorOp : Shape_Op<"to_extent_tensor", [NoSideEffect]> { @@ -252,7 +252,7 @@ def Shape_GetExtentOp : Shape_Op<"get_extent", [NoSideEffect]> { }]; let hasFolder = 1; - let verifier = [{ return ::verify(*this); }]; + let verifier = [{ return ::verifySizeOrIndexOp(*this); }]; } def Shape_IndexToSizeOp : Shape_Op<"index_to_size", [NoSideEffect]> { @@ -325,7 +325,7 @@ def Shape_MulOp : Shape_Op<"mul", [Commutative, NoSideEffect]> { $lhs `,` $rhs `:` type($lhs) `,` type($rhs) `->` type($result) attr-dict }]; - let verifier = [{ return ::verify(*this); }]; + let verifier = [{ return ::verifySizeOrIndexOp(*this); }]; let hasFolder = 1; } @@ -412,7 +412,7 @@ def Shape_ShapeOfOp : Shape_Op<"shape_of", [NoSideEffect]> { let assemblyFormat = "$arg `:` type($arg) `->` type($result) attr-dict"; - let verifier = [{ return ::verify(*this); }]; + let verifier = [{ return ::verifyShapeOrExtentTensorOp(*this); }]; let hasFolder = 1; } diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp index d2b0dbdedb052..104ab46c55813 100644 --- a/mlir/lib/Dialect/Shape/IR/Shape.cpp +++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp @@ -28,13 +28,37 @@ static RankedTensorType getExtentTensorType(MLIRContext *ctx) { return RankedTensorType::get({ShapedType::kDynamicSize}, IndexType::get(ctx)); } -static bool isErrorPropagationPossible(ArrayRef operandTypes) { +static bool isErrorPropagationPossible(TypeRange operandTypes) { for (Type ty : operandTypes) if (ty.isa() || ty.isa() || ty.isa()) return true; return false; } +static LogicalResult verifySizeOrIndexOp(Operation *op) { + assert(op != nullptr && op->getNumResults() == 1); + Type resultTy = op->getResultTypes().front(); + if (isErrorPropagationPossible(op->getOperandTypes())) { + if (!resultTy.isa()) + return op->emitOpError() + << "if at least one of the operands can hold error values then " + "the result must be of type `size` to propagate them"; + } + return success(); +} + +static LogicalResult verifyShapeOrExtentTensorOp(Operation *op) { + assert(op != nullptr && op->getNumResults() == 1); + Type resultTy = op->getResultTypes().front(); + if (isErrorPropagationPossible(op->getOperandTypes())) { + if (!resultTy.isa()) + return op->emitOpError() + << "if at least one of the operands can hold error values then " + "the result must be of type `shape` to propagate them"; + } + return success(); +} + ShapeDialect::ShapeDialect(MLIRContext *context) : Dialect(getDialectNamespace(), context) { addOperations< @@ -542,23 +566,6 @@ OpFoldResult FromExtentsOp::fold(ArrayRef operands) { // GetExtentOp //===----------------------------------------------------------------------===// -static LogicalResult verify(GetExtentOp op) { - Type shapeTy = op.shape().getType(); - Type dimTy = op.dim().getType(); - Type extentTy = op.extent().getType(); 
- if (isErrorPropagationPossible({shapeTy, dimTy})) { - if (!extentTy.isa()) - op.emitError() - << "if at least one of the operands can hold error values then the " - "result must be of type `size` to propagate them"; - } else { - if (extentTy.isa()) - op.emitError() << "if none of the operands can hold error values then " - "the result must be of type `index`"; - } - return success(); -} - Optional GetExtentOp::getConstantDim() { if (auto constSizeOp = dim().getDefiningOp()) return constSizeOp.value().getLimitedValue(); @@ -597,15 +604,6 @@ void GetExtentOp::build(OpBuilder &builder, OperationState &result, Value shape, // RankOp //===----------------------------------------------------------------------===// -static LogicalResult verify(shape::RankOp op) { - if (op.shape().getType().isa() && - !op.rank().getType().isa()) - return op.emitOpError() - << "if operand is of type `shape` then the result must be of type " - "`size` to propagate potential errors"; - return success(); -} - OpFoldResult shape::RankOp::fold(ArrayRef operands) { auto shape = operands[0].dyn_cast_or_null(); if (!shape) @@ -680,21 +678,6 @@ OpFoldResult NumElementsOp::fold(ArrayRef operands) { // MulOp //===----------------------------------------------------------------------===// -static LogicalResult verify(MulOp op) { - Type resultTy = op.result().getType(); - if (isErrorPropagationPossible({op.lhs().getType(), op.rhs().getType()})) { - if (!resultTy.isa()) - return op.emitOpError() - << "if at least one of the operands can hold error values then " - "the result must be of type `size` to propagate them"; - } else { - if (resultTy.isa()) - return op.emitError() << "if none of the operands can hold error values " - "then the result must be of type `index`"; - } - return success(); -} - OpFoldResult MulOp::fold(ArrayRef operands) { auto lhs = operands[0].dyn_cast_or_null(); if (!lhs) @@ -719,21 +702,6 @@ OpFoldResult ShapeOfOp::fold(ArrayRef) { return builder.getIndexTensorAttr(type.getShape()); } -static LogicalResult verify(ShapeOfOp op) { - Type resultTy = op.result().getType(); - if (isErrorPropagationPossible(op.arg().getType())) { - if (!resultTy.isa()) - return op.emitOpError() - << "if operand is of type `value_shape` then the result must be " - "of type `shape` to propagate potential error shapes"; - } else { - if (resultTy != getExtentTensorType(op.getContext())) - return op.emitOpError() << "if operand is a shaped type then the result " - "must be an extent tensor"; - } - return success(); -} - //===----------------------------------------------------------------------===// // SizeToIndexOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Shape/invalid.mlir b/mlir/test/Dialect/Shape/invalid.mlir index b4900e491fb82..20f4e877a2a90 100644 --- a/mlir/test/Dialect/Shape/invalid.mlir +++ b/mlir/test/Dialect/Shape/invalid.mlir @@ -90,39 +90,21 @@ func @assuming_all_op_too_few_operands() { func @shape_of(%value_arg : !shape.value_shape, %shaped_arg : tensor) { - // expected-error@+1 {{if operand is of type `value_shape` then the result must be of type `shape` to propagate potential error shapes}} + // expected-error@+1 {{if at least one of the operands can hold error values then the result must be of type `shape` to propagate them}} %0 = shape.shape_of %value_arg : !shape.value_shape -> tensor return } // ----- -func @shape_of(%value_arg : !shape.value_shape, - %shaped_arg : tensor) { - // expected-error@+1 {{if operand is a shaped type then the 
result must be an extent tensor}} - %1 = shape.shape_of %shaped_arg : tensor -> !shape.shape - return -} - -// ----- - func @rank(%arg : !shape.shape) { - // expected-error@+1 {{if operand is of type `shape` then the result must be of type `size` to propagate potential errors}} + // expected-error@+1 {{if at least one of the operands can hold error values then the result must be of type `size` to propagate them}} %0 = shape.rank %arg : !shape.shape -> index return } // ----- -func @get_extent_error_free(%arg : tensor) -> !shape.size { - %c0 = constant 0 : index - // expected-error@+1 {{if none of the operands can hold error values then the result must be of type `index`}} - %result = shape.get_extent %arg, %c0 : tensor, index -> !shape.size - return %result : !shape.size -} - -// ----- - func @get_extent_error_possible(%arg : tensor) -> index { %c0 = shape.const_size 0 // expected-error@+1 {{if at least one of the operands can hold error values then the result must be of type `size` to propagate them}} @@ -132,14 +114,6 @@ func @get_extent_error_possible(%arg : tensor) -> index { // ----- -func @mul_error_free(%arg : index) -> !shape.size { - // expected-error@+1 {{if none of the operands can hold error values then the result must be of type `index`}} - %result = shape.mul %arg, %arg : index, index -> !shape.size - return %result : !shape.size -} - -// ----- - func @mul_error_possible(%lhs : !shape.size, %rhs : index) -> index { // expected-error@+1 {{if at least one of the operands can hold error values then the result must be of type `size` to propagate them}} %result = shape.mul %lhs, %rhs : !shape.size, index -> index From 07f227c0eb8c5628842e7f7aa30001b24b8aede9 Mon Sep 17 00:00:00 2001 From: Frederik Gossen Date: Sat, 25 Jul 2020 15:01:21 -0700 Subject: [PATCH 0098/1035] [MLIR][Shape] Allow `num_elements` to operate on extent tensors Re-landing now that the dependent change has landed, and with the error condition relaxed. Beyond that change to the error condition, this is exactly https://reviews.llvm.org/D84445. --- mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td | 16 ++++++++-------- mlir/test/Dialect/Shape/canonicalize.mlir | 4 ++-- mlir/test/Dialect/Shape/invalid.mlir | 8 ++++++++ mlir/test/Dialect/Shape/ops.mlir | 11 +++++++++++ mlir/test/Dialect/Shape/shape-to-shape.mlir | 6 +++--- 5 files changed, 32 insertions(+), 13 deletions(-) diff --git a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td index 8c32faee55f90..3c50a4f8b39f4 100644 --- a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td +++ b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td @@ -333,19 +333,19 @@ def Shape_NumElementsOp : Shape_Op<"num_elements", [NoSideEffect]> { let summary = "Returns the number of elements for a given shape"; let description = [{ Returns the number of elements for a given shape which is the product of its - dimensions. - - ```mlir - %product = shape.mul %lhs, %rhs - ``` + extents. If the argument is of type `shape` then the result will be of type + `size` and potential errors will be propagated. Otherwise, if the argument + is an extent tensor `tensor` then the result will be of type + `index`. 
}]; - let arguments = (ins Shape_ShapeType:$shape); - let results = (outs Shape_SizeType:$result); + let arguments = (ins Shape_ShapeOrExtentTensorType:$shape); + let results = (outs Shape_SizeOrIndexType:$result); - let assemblyFormat = "$shape attr-dict"; + let assemblyFormat = "$shape `:` type($shape) `->` type($result) attr-dict"; let hasFolder = 1; + let verifier = [{ return ::verifySizeOrIndexOp(*this); }]; } def Shape_ReduceOp : Shape_Op<"reduce", diff --git a/mlir/test/Dialect/Shape/canonicalize.mlir b/mlir/test/Dialect/Shape/canonicalize.mlir index 577656a0b362a..e147fbeb81ac2 100644 --- a/mlir/test/Dialect/Shape/canonicalize.mlir +++ b/mlir/test/Dialect/Shape/canonicalize.mlir @@ -217,7 +217,7 @@ func @num_elements() -> !shape.size { // CHECK-NOT: shape.const_shape %shape = shape.const_shape [4, 5, 6] : !shape.shape // CHECK-NOT: shape.num_elements - %num_elements = shape.num_elements %shape + %num_elements = shape.num_elements %shape : !shape.shape -> !shape.size // CHECK: %[[NUM:.*]] = shape.const_size 120 // CHECK-NEXT: return %[[NUM]] : !shape.size return %num_elements : !shape.size @@ -229,7 +229,7 @@ func @num_elements() -> !shape.size { // CHECK-LABEL: func @nonfoldable_num_elements func @nonfoldable_num_elements(%shape : !shape.shape) -> !shape.size { // CHECK-NOT: shape.const_{{.*}} - %num_elements = shape.num_elements %shape + %num_elements = shape.num_elements %shape : !shape.shape -> !shape.size return %num_elements : !shape.size } diff --git a/mlir/test/Dialect/Shape/invalid.mlir b/mlir/test/Dialect/Shape/invalid.mlir index 20f4e877a2a90..4a45181d4587c 100644 --- a/mlir/test/Dialect/Shape/invalid.mlir +++ b/mlir/test/Dialect/Shape/invalid.mlir @@ -120,3 +120,11 @@ func @mul_error_possible(%lhs : !shape.size, %rhs : index) -> index { return %result : index } +// ----- + +func @num_elements_error_possible(%arg : !shape.shape) -> index { + // expected-error@+1 {{if at least one of the operands can hold error values then the result must be of type `size` to propagate them}} + %result = shape.num_elements %arg : !shape.shape -> index + return %result : index +} + diff --git a/mlir/test/Dialect/Shape/ops.mlir b/mlir/test/Dialect/Shape/ops.mlir index 3a0cb7781ec72..f57826097d34f 100644 --- a/mlir/test/Dialect/Shape/ops.mlir +++ b/mlir/test/Dialect/Shape/ops.mlir @@ -195,3 +195,14 @@ func @any() { return } +func @num_elements_extent_tensor(%arg : tensor) -> index { + %result = shape.num_elements %arg : tensor -> index + return %result : index +} + +func @num_elements_shape(%arg : !shape.shape) -> !shape.size { + %result = shape.num_elements %arg : !shape.shape -> !shape.size + return %result : !shape.size +} + + diff --git a/mlir/test/Dialect/Shape/shape-to-shape.mlir b/mlir/test/Dialect/Shape/shape-to-shape.mlir index 9a75f0b9ca1bc..d1b00bc12a22c 100644 --- a/mlir/test/Dialect/Shape/shape-to-shape.mlir +++ b/mlir/test/Dialect/Shape/shape-to-shape.mlir @@ -1,9 +1,9 @@ // RUN: mlir-opt -shape-to-shape-lowering -split-input-file %s | FileCheck %s -// CHECK-LABEL: func @num_elements_to_reduce( -// CHECK-SAME: [[ARG:%.*]]: !shape.shape) -> !shape.size { +// CHECK-LABEL: func @num_elements_to_reduce +// CHECK-SAME: ([[ARG:%.*]]: !shape.shape) -> !shape.size func @num_elements_to_reduce(%shape : !shape.shape) -> !shape.size { - %num_elements = shape.num_elements %shape + %num_elements = shape.num_elements %shape : !shape.shape -> !shape.size return %num_elements : !shape.size } // CHECK: [[C1:%.*]] = shape.const_size 1 From 11d5316afd102871027c103a4b8bfe6c072d4368 Mon Sep 17 
00:00:00 2001 From: Lang Hames Date: Fri, 24 Jul 2020 21:17:37 -0700 Subject: [PATCH 0099/1035] [ORC] Don't require PageSize or Triple during TargetProcessControl construction Subclasses will commonly gather that information from a remote during construction, in which case they won't have meaningful values to pass to TargetProcessControl's constructor. --- .../llvm/ExecutionEngine/Orc/TargetProcessControl.h | 1 - llvm/lib/ExecutionEngine/Orc/TargetProcessControl.cpp | 9 ++++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcessControl.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcessControl.h index 37bfa5a45b6c0..887ac94e4d5ce 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcessControl.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcessControl.h @@ -152,7 +152,6 @@ class TargetProcessControl { virtual Expected lookupSymbols(LookupRequest Request) = 0; protected: - TargetProcessControl(Triple TT, unsigned PageSize); Triple TT; unsigned PageSize = 0; diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcessControl.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcessControl.cpp index ab07d3ad90b11..3f9a938db339a 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcessControl.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcessControl.cpp @@ -19,13 +19,12 @@ namespace orc { TargetProcessControl::MemoryAccess::~MemoryAccess() {} -TargetProcessControl::TargetProcessControl(Triple TT, unsigned PageSize) - : TT(std::move(TT)), PageSize(PageSize) {} - TargetProcessControl::~TargetProcessControl() {} -SelfTargetProcessControl::SelfTargetProcessControl(Triple TT, unsigned PageSize) - : TargetProcessControl(std::move(TT), PageSize) { +SelfTargetProcessControl::SelfTargetProcessControl(Triple TT, + unsigned PageSize) { + this->TT = std::move(TT); + this->PageSize = PageSize; this->MemMgr = IPMM.get(); this->MemAccess = this; if (this->TT.isOSBinFormatMachO()) From a01c4ee71cb2bcdd1fd93396af2ed6dc25f5f828 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Sat, 25 Jul 2020 14:18:52 -0700 Subject: [PATCH 0100/1035] [ORC] Rename TargetProcessControl DynamicLibraryHandle and loadLibrary. The new names, DylibHandle and loadDylib, are more concise and make clear that these utilities are for loading dynamic libraries, not static ones. --- .../Orc/TPCDynamicLibrarySearchGenerator.h | 9 +++--- .../Orc/TargetProcessControl.h | 28 ++++++++++--------- .../Orc/TPCDynamicLibrarySearchGenerator.cpp | 4 +-- .../Orc/TargetProcessControl.cpp | 6 ++-- 4 files changed, 24 insertions(+), 23 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TPCDynamicLibrarySearchGenerator.h b/llvm/include/llvm/ExecutionEngine/Orc/TPCDynamicLibrarySearchGenerator.h index 6c95e22a4257d..d35c8abc84a2e 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/TPCDynamicLibrarySearchGenerator.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/TPCDynamicLibrarySearchGenerator.h @@ -27,10 +27,9 @@ class TPCDynamicLibrarySearchGenerator : public JITDylib::DefinitionGenerator { /// If the Allow predicate is given then only symbols matching the predicate /// will be searched for. If the predicate is not given then all symbols will /// be searched for. 
- TPCDynamicLibrarySearchGenerator( - TargetProcessControl &TPC, - TargetProcessControl::DynamicLibraryHandle DylibHandle) - : TPC(TPC), DylibHandle(DylibHandle) {} + TPCDynamicLibrarySearchGenerator(TargetProcessControl &TPC, + TargetProcessControl::DylibHandle H) + : TPC(TPC), H(H) {} /// Permanently loads the library at the given path and, on success, returns /// a DynamicLibrarySearchGenerator that will search it for symbol definitions @@ -51,7 +50,7 @@ class TPCDynamicLibrarySearchGenerator : public JITDylib::DefinitionGenerator { private: TargetProcessControl &TPC; - TargetProcessControl::DynamicLibraryHandle DylibHandle; + TargetProcessControl::DylibHandle H; }; } // end namespace orc diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcessControl.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcessControl.h index 887ac94e4d5ce..e260c64bee512 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcessControl.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcessControl.h @@ -110,15 +110,18 @@ class TargetProcessControl { } }; - using DynamicLibraryHandle = JITTargetAddress; - - /// Request lookup within a single library. - /// If Library is None then the whole target process should be searched. + /// A handle for a library opened via loadDylib. + /// + /// Note that this handle does not necessarily represent a JITDylib: it may + /// be a regular dynamic library or shared object (e.g. one opened via a + /// dlopen in the target process). + using DylibHandle = JITTargetAddress; + + /// Request lookup within the given DylibHandle. struct LookupRequestElement { - LookupRequestElement(DynamicLibraryHandle Handle, - const SymbolLookupSet &Symbols) + LookupRequestElement(DylibHandle Handle, const SymbolLookupSet &Symbols) : Handle(Handle), Symbols(Symbols) {} - DynamicLibraryHandle Handle; + DylibHandle Handle; const SymbolLookupSet &Symbols; }; @@ -140,11 +143,10 @@ class TargetProcessControl { /// Return a MemoryAccess object for the target process. MemoryAccess &getMemoryAccess() const { return *MemAccess; } - /// Load the library at the given path. Returns a handle to the loaded - /// library. If LibraryPath is null this function will return the global - /// handle for the target process. - virtual Expected - loadLibrary(const char *LibraryPath) = 0; + /// Load the dynamic library at the given path and return a handle to it. + /// If LibraryPath is null this function will return the global handle for + /// the target process. + virtual Expected loadDylib(const char *DylibPath) = 0; /// Search for symbols in the target process. 
/// The result of the lookup is a 2-dimensional array of target addresses @@ -167,7 +169,7 @@ class SelfTargetProcessControl : public TargetProcessControl, static Expected> Create(); - Expected loadLibrary(const char *LibraryPath) override; + Expected loadDylib(const char *DylibPath) override; Expected lookupSymbols(LookupRequest Request) override; diff --git a/llvm/lib/ExecutionEngine/Orc/TPCDynamicLibrarySearchGenerator.cpp b/llvm/lib/ExecutionEngine/Orc/TPCDynamicLibrarySearchGenerator.cpp index ea8bde971d1d4..18de5b616eec8 100644 --- a/llvm/lib/ExecutionEngine/Orc/TPCDynamicLibrarySearchGenerator.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TPCDynamicLibrarySearchGenerator.cpp @@ -14,7 +14,7 @@ namespace orc { Expected> TPCDynamicLibrarySearchGenerator::Load(TargetProcessControl &TPC, const char *LibraryPath) { - auto Handle = TPC.loadLibrary(LibraryPath); + auto Handle = TPC.loadDylib(LibraryPath); if (!Handle) return Handle.takeError(); @@ -30,7 +30,7 @@ Error TPCDynamicLibrarySearchGenerator::tryToGenerate( SymbolMap NewSymbols; - TargetProcessControl::LookupRequestElement Request(DylibHandle, Symbols); + TargetProcessControl::LookupRequestElement Request(H, Symbols); auto Result = TPC.lookupSymbols(Request); if (!Result) return Result.takeError(); diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcessControl.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcessControl.cpp index 3f9a938db339a..f17f5bf32856f 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcessControl.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcessControl.cpp @@ -42,11 +42,11 @@ SelfTargetProcessControl::Create() { return std::make_unique(std::move(TT), *PageSize); } -Expected -SelfTargetProcessControl::loadLibrary(const char *LibraryPath) { +Expected +SelfTargetProcessControl::loadDylib(const char *DylibPath) { std::string ErrMsg; auto Dylib = std::make_unique( - sys::DynamicLibrary::getPermanentLibrary(LibraryPath, &ErrMsg)); + sys::DynamicLibrary::getPermanentLibrary(DylibPath, &ErrMsg)); if (!Dylib->isValid()) return make_error(std::move(ErrMsg), inconvertibleErrorCode()); DynamicLibraries.push_back(std::move(Dylib)); From af1dd0b1adc40e62bd922673af3741ad66989eee Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Sat, 25 Jul 2020 22:31:05 +0300 Subject: [PATCH 0101/1035] [Reduce] Basic block reduction: do properly handle invoke insts (PR46818) The terminator may have a returned value, so we need to replace its uses, and in general handle an invoke as a branch inst. I'm not sure this is the best handling, but IMO a poorly reduced input is much better than a crashing reduction tool. A (previously-crashing!) test added. 
Fixes https://bugs.llvm.org/show_bug.cgi?id=46818 --- llvm/test/Reduce/remove-bbs-unwinded-to.ll | 39 +++++++++++++++++++ .../llvm-reduce/deltas/ReduceBasicBlocks.cpp | 3 +- 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Reduce/remove-bbs-unwinded-to.ll diff --git a/llvm/test/Reduce/remove-bbs-unwinded-to.ll b/llvm/test/Reduce/remove-bbs-unwinded-to.ll new file mode 100644 index 0000000000000..375678ec99c9d --- /dev/null +++ b/llvm/test/Reduce/remove-bbs-unwinded-to.ll @@ -0,0 +1,39 @@ +; RUN: llvm-reduce --test FileCheck --test-arg --check-prefixes=CHECK-ALL,CHECK-INTERESTINGNESS --test-arg %s --test-arg --input-file %s -o %t +; RUN: cat %t | FileCheck --check-prefixes=CHECK-ALL,CHECK-FINAL %s + +declare i32 @maybe_throwing_callee() + +; CHECK-ALL: declare void @did_not_throw(i32) +declare void @did_not_throw(i32) + +declare void @thrown() + +; CHECK-ALL: define void @caller() +define void @caller() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK-ALL: bb: +bb: +; CHECK-INTERESTINGNESS: label %bb3 +; CHECK-FINAL: br label %bb3 + %i0 = invoke i32 @maybe_throwing_callee() + to label %bb3 unwind label %bb1 + +bb1: + landingpad { i8*, i32 } catch i8* null + call void @thrown() + br label %bb4 + +; CHECK-ALL: bb3: +bb3: +; CHECK-INTERESTINGNESS: call void @did_not_throw(i32 +; CHECK-FINAL: call void @did_not_throw(i32 undef) +; CHECK-ALL: br label %bb4 + call void @did_not_throw(i32 %i0) + br label %bb4 + +; CHECK-ALL: bb4: +; CHECK-ALL: ret void +bb4: + ret void +} + +declare i32 @__gxx_personality_v0(...) diff --git a/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp b/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp index 002d81a674872..9dee738d4906b 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp @@ -36,11 +36,12 @@ static void replaceBranchTerminator(BasicBlock &BB, if (ChunkSucessors.size() == Term->getNumSuccessors()) return; - bool IsBranch = isa(Term); + bool IsBranch = isa(Term) || isa(Term); Value *Address = nullptr; if (auto IndBI = dyn_cast(Term)) Address = IndBI->getAddress(); + Term->replaceAllUsesWith(UndefValue::get(Term->getType())); Term->eraseFromParent(); if (ChunkSucessors.empty()) { From 9932d74740b19030f12b82ae93a9e61af32f5931 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Sat, 25 Jul 2020 23:24:13 +0300 Subject: [PATCH 0102/1035] [Reduce] Argument reduction: do properly handle invoke insts (PR46819) replaceFunctionCalls() is very non-exhaustive: it only handles CallInsts. Which means that, by the time we drop the old function, there may still be uses of it lurking around. Let's instead whack-a-mole them all by replacing them with undef. I'm not sure this is the best handling, especially for calls, but IMO a poorly reduced input is much better than a crashing reduction tool. A (previously-crashing!) test added. 
Fixes https://bugs.llvm.org/show_bug.cgi?id=46819 --- llvm/test/Reduce/remove-invoked-functions.ll | 55 +++++++++++++++++++ .../llvm-reduce/deltas/ReduceArguments.cpp | 1 + 2 files changed, 56 insertions(+) create mode 100644 llvm/test/Reduce/remove-invoked-functions.ll diff --git a/llvm/test/Reduce/remove-invoked-functions.ll b/llvm/test/Reduce/remove-invoked-functions.ll new file mode 100644 index 0000000000000..e4458e662fee3 --- /dev/null +++ b/llvm/test/Reduce/remove-invoked-functions.ll @@ -0,0 +1,55 @@ +; RUN: llvm-reduce --test FileCheck --test-arg --check-prefixes=CHECK-ALL,CHECK-INTERESTINGNESS --test-arg %s --test-arg --input-file %s -o %t +; RUN: cat %t | FileCheck --check-prefixes=CHECK-ALL,CHECK-FINAL %s + +; CHECK-INTERESTINGNESS: define i32 @maybe_throwing_callee( +; CHECK-FINAL: define i32 @maybe_throwing_callee() +define i32 @maybe_throwing_callee(i32 %arg) { +; CHECK-ALL: call void @thrown() +; CHECK-INTERESTINGNESS: ret i32 +; CHECK-FINAL: ret i32 undef + call void @thrown() + ret i32 %arg +} + +; CHECK-ALL: declare void @did_not_throw(i32) +declare void @did_not_throw(i32) + +; CHECK-ALL: declare void @thrown() +declare void @thrown() + +; CHECK-INTERESTINGNESS: define void @caller( +; CHECK-FINAL: define void @caller(i32 %arg) +define void @caller(i32 %arg) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK-ALL: bb: +bb: +; CHECK-INTERESTINGNESS: %i0 = invoke i32 +; CHECK-FINAL: %i0 = invoke i32 undef(i32 %arg) +; CHECK-ALL: to label %bb3 unwind label %bb1 + %i0 = invoke i32 @maybe_throwing_callee(i32 %arg) + to label %bb3 unwind label %bb1 + +; CHECK-ALL: bb1: +bb1: +; CHECK-ALL: landingpad { i8*, i32 } +; CHECK-ALL: catch i8* null +; CHECK-ALL: call void @thrown() +; CHECK-ALL: br label %bb4 + landingpad { i8*, i32 } + catch i8* null + call void @thrown() + br label %bb4 + +; CHECK-ALL: bb3: +bb3: +; CHECK-ALL: call void @did_not_throw(i32 %i0) +; CHECK-ALL: br label %bb4 + call void @did_not_throw(i32 %i0) + br label %bb4 + +; CHECK-ALL: bb4: +; CHECK-ALL: ret void +bb4: + ret void +} + +declare i32 @__gxx_personality_v0(...) diff --git a/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp b/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp index e0e1d1c225670..1eafc2c560dea 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp @@ -94,6 +94,7 @@ static void extractArgumentsFromModule(std::vector ChunksToKeep, replaceFunctionCalls(*F, *ClonedFunc, ArgIndexesToKeep); // Rename Cloned Function to Old's name std::string FName = std::string(F->getName()); + F->replaceAllUsesWith(UndefValue::get(F->getType())); F->eraseFromParent(); ClonedFunc->setName(FName); } From 96d74530c09edd95452295bb4e300ab310a9bb2d Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Sun, 26 Jul 2020 00:56:36 +0300 Subject: [PATCH 0103/1035] [Reduce] Argument reduction: do deal with function declarations We can happily turn function definitions into declarations, thus preventing their arguments from being elided by this pass. I don't believe there is a good reason to just ignore declarations, likely not even proper LLVM intrinsic ones; at worst, the input becomes uninteresting. The other question here is that all these transforms are all-or-nothing; in some cases, should we be treating each use separately? The main blocker here seemed to be that llvm::CloneFunctionInto() does `&OldFunc->front()`, which inserts a nullptr into a DenseMap that is not happy about it and asserts. 
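A minimal sketch of that failure mode and of the guard this commit adds, using hypothetical stand-ins (`Fn`, `cloneBody`) rather than the real llvm::CloneFunctionInto signature: a declaration has no entry block, so the early return below keeps anything like `&OldFunc->front()` from ever being formed.

```cpp
#include <cassert>
#include <list>

// Hypothetical stand-in for llvm::Function: a declaration has an empty
// basic-block list, so there is no front() block to take the address of.
struct Fn {
  std::list<int> Blocks; // stand-in for the basic block list
  bool isDeclaration() const { return Blocks.empty(); }
};

// Sketch of the early-out added in this patch: bail out before any of the
// body-cloning code below can touch OldFunc.Blocks.front().
void cloneBody(Fn &NewFunc, const Fn &OldFunc) {
  if (OldFunc.isDeclaration())
    return; // a body-less declaration leaves nothing to clone
  for (int B : OldFunc.Blocks)
    NewFunc.Blocks.push_back(B); // stand-in for per-block cloning
}

int main() {
  Fn Decl, Clone;
  cloneBody(Clone, Decl); // with the guard, cloning a declaration is a no-op
  assert(Clone.isDeclaration());
  return 0;
}
```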
--- llvm/lib/Transforms/Utils/CloneFunction.cpp | 5 ++++ ...ultiple-use-of-args-in-same-instruction.py | 13 ---------- ...-use-of-global-vars-in-same-instruction.py | 13 ---------- .../Reduce/remove-args-from-declaration.ll | 24 +++++++++++++++++++ ...ultiple-use-of-args-in-same-instruction.ll | 9 +++---- ...-use-of-global-vars-in-same-instruction.ll | 12 +++++----- .../llvm-reduce/deltas/ReduceArguments.cpp | 4 ++-- 7 files changed, 42 insertions(+), 38 deletions(-) delete mode 100644 llvm/test/Reduce/Inputs/remove-multiple-use-of-args-in-same-instruction.py delete mode 100644 llvm/test/Reduce/Inputs/remove-multiple-use-of-global-vars-in-same-instruction.py create mode 100644 llvm/test/Reduce/remove-args-from-declaration.ll diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index 788983c156903..957e4028bae77 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -147,6 +147,11 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, TypeMapper, Materializer)); } + // Everything else beyond this point deals with function instructions, + // so if we are dealing with a function declaration, we're done. + if (OldFunc->isDeclaration()) + return; + // When we remap instructions, we want to avoid duplicating inlined // DISubprograms, so record all subprograms we find as we duplicate // instructions and then freeze them in the MD map. diff --git a/llvm/test/Reduce/Inputs/remove-multiple-use-of-args-in-same-instruction.py b/llvm/test/Reduce/Inputs/remove-multiple-use-of-args-in-same-instruction.py deleted file mode 100644 index 93fa5c0bc29e7..0000000000000 --- a/llvm/test/Reduce/Inputs/remove-multiple-use-of-args-in-same-instruction.py +++ /dev/null @@ -1,13 +0,0 @@ -import sys - -FunctionCallPresent = False - -input = open(sys.argv[1], "r") -for line in input: - if "call void @use" in line: - FunctionCallPresent = True - -if FunctionCallPresent: - sys.exit(0) # Interesting! - -sys.exit(1) diff --git a/llvm/test/Reduce/Inputs/remove-multiple-use-of-global-vars-in-same-instruction.py b/llvm/test/Reduce/Inputs/remove-multiple-use-of-global-vars-in-same-instruction.py deleted file mode 100644 index 93fa5c0bc29e7..0000000000000 --- a/llvm/test/Reduce/Inputs/remove-multiple-use-of-global-vars-in-same-instruction.py +++ /dev/null @@ -1,13 +0,0 @@ -import sys - -FunctionCallPresent = False - -input = open(sys.argv[1], "r") -for line in input: - if "call void @use" in line: - FunctionCallPresent = True - -if FunctionCallPresent: - sys.exit(0) # Interesting! 
- -sys.exit(1) diff --git a/llvm/test/Reduce/remove-args-from-declaration.ll b/llvm/test/Reduce/remove-args-from-declaration.ll new file mode 100644 index 0000000000000..f476495c57314 --- /dev/null +++ b/llvm/test/Reduce/remove-args-from-declaration.ll @@ -0,0 +1,24 @@ +; RUN: llvm-reduce --test FileCheck --test-arg --check-prefixes=CHECK-ALL,CHECK-INTERESTINGNESS --test-arg %s --test-arg --input-file %s -o %t +; RUN: cat %t | FileCheck --check-prefixes=CHECK-ALL,CHECK-FINAL %s + +; CHECK-INTERESTINGNESS-LABEL: @interesting( +; CHECK-INTERESTINGNESS-SAME: i32 +; CHECK-FINAL: declare void @interesting(i32) +declare void @interesting(i32 %uninteresting1, i32 %interesting, i32 %uninteresting2) + +; CHECK-INTERESTINGNESS-LABEL: @interesting2( +; CHECK-INTERESTINGNESS-SAME: i32 +; CHECK-FINAL: declare void @interesting2(i32) +declare void @interesting2(i32 %uninteresting1, i32 %interesting, i32 %uninteresting2) + +; CHECK-INTERESTINGNESS-LABEL: @callee( +; CHECK-INTERESTINGNESS-SAME: i32 %interesting +; CHECK-FINAL: define void @callee(i32 %interesting) { +define void @callee(i32 %uninteresting1, i32 %interesting, i32 %uninteresting2) { +; CHECK-INTERESTINGNESS: call void @interesting2( +; CHECK-INTERESTINGNESS-SAME: i32 %interesting +; CHECK-FINAL: call void @interesting2(i32 %interesting) + call void @interesting2(i32 %uninteresting1, i32 %interesting, i32 %uninteresting2) +; CHECK-ALL: ret void + ret void +} diff --git a/llvm/test/Reduce/remove-multiple-use-of-args-in-same-instruction.ll b/llvm/test/Reduce/remove-multiple-use-of-args-in-same-instruction.ll index 21a638f1e6bce..cd23d6e616022 100644 --- a/llvm/test/Reduce/remove-multiple-use-of-args-in-same-instruction.ll +++ b/llvm/test/Reduce/remove-multiple-use-of-args-in-same-instruction.ll @@ -1,14 +1,15 @@ ; Test that llvm-reduce can remove uninteresting function arguments from function definitions as well as their calls. 
; -; RUN: llvm-reduce --test %python --test-arg %p/Inputs/remove-multiple-use-of-args-in-same-instruction.py %s -o %t -; RUN: cat %t | FileCheck -implicit-check-not=uninteresting %s +; RUN: llvm-reduce --test FileCheck --test-arg --check-prefixes=CHECK-ALL,CHECK-INTERESTINGNESS --test-arg %s --test-arg --input-file %s -o %t +; RUN: cat %t | FileCheck --check-prefixes=CHECK-ALL,CHECK-FINAL %s +; CHECK-ALL: declare void @use(i32, i32, i32) declare void @use(i32, i32, i32) -; CHECK-LABEL: @interesting(i32 %uninteresting1, i32 %uninteresting2, i32 %uninteresting3 +; CHECK-ALL: @interesting(i32 %uninteresting1, i32 %uninteresting2, i32 %uninteresting3 define void @interesting(i32 %uninteresting1, i32 %uninteresting2, i32 %uninteresting3) { entry: - ; CHECK: call void @use(i32 %uninteresting1, i32 %uninteresting2, i32 %uninteresting3) + ; CHECK-ALL: call void @use(i32 %uninteresting1, i32 %uninteresting2, i32 %uninteresting3) call void @use(i32 %uninteresting1, i32 %uninteresting2, i32 %uninteresting3) call void @use(i32 %uninteresting1, i32 %uninteresting2, i32 %uninteresting3) call void @use(i32 %uninteresting1, i32 %uninteresting2, i32 %uninteresting3) diff --git a/llvm/test/Reduce/remove-multiple-use-of-global-vars-in-same-instruction.ll b/llvm/test/Reduce/remove-multiple-use-of-global-vars-in-same-instruction.ll index 4400bc818e554..6d62bd2938d7a 100644 --- a/llvm/test/Reduce/remove-multiple-use-of-global-vars-in-same-instruction.ll +++ b/llvm/test/Reduce/remove-multiple-use-of-global-vars-in-same-instruction.ll @@ -1,11 +1,11 @@ ; Test that llvm-reduce can remove uninteresting function arguments from function definitions as well as their calls. ; -; RUN: llvm-reduce --test %python --test-arg %p/Inputs/remove-multiple-use-of-global-vars-in-same-instruction.py %s -o %t -; RUN: cat %t | FileCheck -implicit-check-not=uninteresting %s +; RUN: llvm-reduce --test FileCheck --test-arg --check-prefixes=CHECK-ALL,CHECK-INTERESTINGNESS --test-arg %s --test-arg --input-file %s -o %t +; RUN: cat %t | FileCheck --check-prefixes=CHECK-ALL,CHECK-FINAL %s -; CHECK: @uninteresting1 = global -; CHECK: @uninteresting2 = global -; CHECK: @uninteresting3 = global +; CHECK-ALL: @uninteresting1 = global +; CHECK-ALL: @uninteresting2 = global +; CHECK-ALL: @uninteresting3 = global @uninteresting1 = global i32 0, align 4 @uninteresting2 = global i32 0, align 4 @uninteresting3 = global i32 0, align 4 @@ -15,7 +15,7 @@ declare void @use(i32*, i32*, i32*) ; CHECK-LABEL: @interesting() define void @interesting() { entry: - ; CHECK: call void @use(i32* @uninteresting1, i32* @uninteresting2, i32* @uninteresting3) + ; CHECK-ALL: call void @use(i32* @uninteresting1, i32* @uninteresting2, i32* @uninteresting3) call void @use(i32* @uninteresting1, i32* @uninteresting2, i32* @uninteresting3) call void @use(i32* @uninteresting1, i32* @uninteresting2, i32* @uninteresting3) call void @use(i32* @uninteresting1, i32* @uninteresting2, i32* @uninteresting3) diff --git a/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp b/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp index 1eafc2c560dea..9488d71b71c35 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp @@ -48,7 +48,7 @@ static void extractArgumentsFromModule(std::vector ChunksToKeep, std::vector Funcs; // Get inside-chunk arguments, as well as their parent function for (auto &F : *Program) - if (!F.isDeclaration()) { + if (!F.arg_empty()) { Funcs.push_back(&F); for (auto &A : F.args()) if (O.shouldKeep()) @@ 
-108,7 +108,7 @@ static int countArguments(Module *Program) { outs() << "Param Index Reference:\n"; int ArgsCount = 0; for (auto &F : *Program) - if (!F.isDeclaration() && F.arg_size()) { + if (!F.arg_empty()) { outs() << " " << F.getName() << "\n"; for (auto &A : F.args()) outs() << "\t" << ++ArgsCount << ": " << A.getName() << "\n"; From c5b23714368eabfb22fcd7f7567cf2a9830c8d8b Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 25 Jul 2020 16:36:33 -0700 Subject: [PATCH 0104/1035] [X86] Add masked versions of the VPTERNLOG test cases added for D83630. NFC We don't handle these yet and D83630 won't improve that, but at least we'll have the tests. --- llvm/test/CodeGen/X86/avx512-logic.ll | 136 ++++++++++++ llvm/test/CodeGen/X86/avx512vl-logic.ll | 270 ++++++++++++++++++++++++ 2 files changed, 406 insertions(+) diff --git a/llvm/test/CodeGen/X86/avx512-logic.ll b/llvm/test/CodeGen/X86/avx512-logic.ll index 88a3b5aea9bd4..30607214f56d5 100644 --- a/llvm/test/CodeGen/X86/avx512-logic.ll +++ b/llvm/test/CodeGen/X86/avx512-logic.ll @@ -919,3 +919,139 @@ define <8 x i64> @ternlog_xor_and_mask(<8 x i64> %x, <8 x i64> %y) { %b = xor <8 x i64> %a, %y ret <8 x i64> %b } + +define <16 x i32> @ternlog_maskz_or_and_mask(<16 x i32> %x, <16 x i32> %y, <16 x i32> %mask) { +; KNL-LABEL: ternlog_maskz_or_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtd %zmm2, %zmm3, %k1 +; KNL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; KNL-NEXT: vpord %zmm1, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskz_or_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovd2m %zmm2, %k1 +; SKX-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0 +; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq + %m = icmp slt <16 x i32> %mask, zeroinitializer + %a = and <16 x i32> %x, + %b = or <16 x i32> %a, %y + %c = select <16 x i1> %m, <16 x i32> %b, <16 x i32> zeroinitializer + ret <16 x i32> %c +} + +define <8 x i64> @ternlog_maskz_xor_and_mask(<8 x i64> %x, <8 x i64> %y, <8 x i64> %mask) { +; KNL-LABEL: ternlog_maskz_xor_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtq %zmm2, %zmm3, %k1 +; KNL-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 +; KNL-NEXT: vpxorq %zmm1, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskz_xor_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovq2m %zmm2, %k1 +; SKX-NEXT: vandpd {{.*}}(%rip), %zmm0, %zmm0 +; SKX-NEXT: vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq + %m = icmp slt <8 x i64> %mask, zeroinitializer + %a = and <8 x i64> %x, + %b = xor <8 x i64> %a, %y + %c = select <8 x i1> %m, <8 x i64> %b, <8 x i64> zeroinitializer + ret <8 x i64> %c +} + +define <16 x i32> @ternlog_maskx_or_and_mask(<16 x i32> %x, <16 x i32> %y, <16 x i32> %mask) { +; KNL-LABEL: ternlog_maskx_or_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtd %zmm2, %zmm3, %k1 +; KNL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm2 +; KNL-NEXT: vpord %zmm1, %zmm2, %zmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskx_or_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovd2m %zmm2, %k1 +; SKX-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm2 +; SKX-NEXT: vorps %zmm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: retq + %m = icmp slt <16 x i32> %mask, zeroinitializer + %a = and <16 x i32> %x, + %b = or <16 x i32> %a, %y + %c = select <16 x i1> %m, <16 x i32> %b, <16 x i32> %x + ret <16 x i32> %c +} + +define <16 x i32> @ternlog_masky_or_and_mask(<16 x i32> %x, <16 x i32> %y, <16 x i32> %mask) { +; KNL-LABEL: 
ternlog_masky_or_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtd %zmm2, %zmm3, %k1 +; KNL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; KNL-NEXT: vpord %zmm1, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_masky_or_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovd2m %zmm2, %k1 +; SKX-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0 +; SKX-NEXT: vorps %zmm1, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: retq + %m = icmp slt <16 x i32> %mask, zeroinitializer + %a = and <16 x i32> %x, + %b = or <16 x i32> %a, %y + %c = select <16 x i1> %m, <16 x i32> %b, <16 x i32> %y + ret <16 x i32> %c +} + +define <8 x i64> @ternlog_maskx_xor_and_mask(<8 x i64> %x, <8 x i64> %y, <8 x i64> %mask) { +; KNL-LABEL: ternlog_maskx_xor_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtq %zmm2, %zmm3, %k1 +; KNL-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm2 +; KNL-NEXT: vpxorq %zmm1, %zmm2, %zmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskx_xor_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovq2m %zmm2, %k1 +; SKX-NEXT: vandpd {{.*}}(%rip), %zmm0, %zmm2 +; SKX-NEXT: vxorpd %zmm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: retq + %m = icmp slt <8 x i64> %mask, zeroinitializer + %a = and <8 x i64> %x, + %b = xor <8 x i64> %a, %y + %c = select <8 x i1> %m, <8 x i64> %b, <8 x i64> %x + ret <8 x i64> %c +} + +define <8 x i64> @ternlog_masky_xor_and_mask(<8 x i64> %x, <8 x i64> %y, <8 x i64> %mask) { +; KNL-LABEL: ternlog_masky_xor_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtq %zmm2, %zmm3, %k1 +; KNL-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 +; KNL-NEXT: vpxorq %zmm1, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_masky_xor_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovq2m %zmm2, %k1 +; SKX-NEXT: vandpd {{.*}}(%rip), %zmm0, %zmm0 +; SKX-NEXT: vxorpd %zmm1, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq + %m = icmp slt <8 x i64> %mask, zeroinitializer + %a = and <8 x i64> %x, + %b = xor <8 x i64> %a, %y + %c = select <8 x i1> %m, <8 x i64> %b, <8 x i64> %y + ret <8 x i64> %c +} diff --git a/llvm/test/CodeGen/X86/avx512vl-logic.ll b/llvm/test/CodeGen/X86/avx512vl-logic.ll index 26d905ebeae77..3f0ce30928478 100644 --- a/llvm/test/CodeGen/X86/avx512vl-logic.ll +++ b/llvm/test/CodeGen/X86/avx512vl-logic.ll @@ -1031,3 +1031,273 @@ define <4 x i64> @ternlog_xor_and_mask_ymm(<4 x i64> %x, <4 x i64> %y) { %b = xor <4 x i64> %a, %y ret <4 x i64> %b } + +define <4 x i32> @ternlog_maskz_or_and_mask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %mask) { +; KNL-LABEL: ternlog_maskz_or_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpgtd %xmm3, %xmm2, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpord %xmm1, %xmm0, %xmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskz_or_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovd2m %xmm3, %k1 +; SKX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vorps %xmm1, %xmm0, %xmm0 {%k1} {z} +; SKX-NEXT: retq + %m = icmp slt <4 x i32> %mask, zeroinitializer + %a = and <4 x i32> %x, + %b = or <4 x i32> %a, %y + %c = select <4 x i1> %m, <4 x i32> %b, <4 x i32> zeroinitializer + ret <4 x i32> %c +} + +define <8 x i32> @ternlog_maskz_or_and_mask_ymm(<8 x i32> %x, <8 x i32> %y, <8 x i32> %mask) { +; KNL-LABEL: ternlog_maskz_or_and_mask_ymm: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; 
KNL-NEXT: vpcmpgtd %ymm2, %ymm3, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpord %ymm1, %ymm0, %ymm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskz_or_and_mask_ymm: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovd2m %ymm2, %k1 +; SKX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vorps %ymm1, %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq + %m = icmp slt <8 x i32> %mask, zeroinitializer + %a = and <8 x i32> %x, + %b = or <8 x i32> %a, %y + %c = select <8 x i1> %m, <8 x i32> %b, <8 x i32> zeroinitializer + ret <8 x i32> %c +} + +define <2 x i64> @ternlog_maskz_xor_and_mask(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) { +; KNL-LABEL: ternlog_maskz_xor_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtq %xmm2, %xmm3, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpxorq %xmm1, %xmm0, %xmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskz_xor_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovq2m %xmm2, %k1 +; SKX-NEXT: vandpd {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vxorpd %xmm1, %xmm0, %xmm0 {%k1} {z} +; SKX-NEXT: retq + %m = icmp slt <2 x i64> %mask, zeroinitializer + %a = and <2 x i64> %x, + %b = xor <2 x i64> %a, %y + %c = select <2 x i1> %m, <2 x i64> %b, <2 x i64> zeroinitializer + ret <2 x i64> %c +} + +define <4 x i64> @ternlog_maskz_xor_and_mask_ymm(<4 x i64> %x, <4 x i64> %y, <4 x i64> %mask) { +; KNL-LABEL: ternlog_maskz_xor_and_mask_ymm: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtq %ymm2, %ymm3, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpxorq %ymm1, %ymm0, %ymm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskz_xor_and_mask_ymm: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovq2m %ymm2, %k1 +; SKX-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vxorpd %ymm1, %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq + %m = icmp slt <4 x i64> %mask, zeroinitializer + %a = and <4 x i64> %x, + %b = xor <4 x i64> %a, %y + %c = select <4 x i1> %m, <4 x i64> %b, <4 x i64> zeroinitializer + ret <4 x i64> %c +} + +define <4 x i32> @ternlog_maskx_or_and_mask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %mask) { +; KNL-LABEL: ternlog_maskx_or_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpgtd %xmm3, %xmm2, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm2 +; KNL-NEXT: vpord %xmm1, %xmm2, %xmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskx_or_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovd2m %xmm3, %k1 +; SKX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm2 +; SKX-NEXT: vorps %xmm1, %xmm2, %xmm0 {%k1} +; SKX-NEXT: retq + %m = icmp slt <4 x i32> %mask, zeroinitializer + %a = and <4 x i32> %x, + %b = or <4 x i32> %a, %y + %c = select <4 x i1> %m, <4 x i32> %b, <4 x i32> %x + ret <4 x i32> %c +} + +define <8 x i32> @ternlog_maskx_or_and_mask_ymm(<8 x i32> %x, <8 x i32> %y, <8 x i32> %mask) { +; KNL-LABEL: ternlog_maskx_or_and_mask_ymm: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtd %ymm2, %ymm3, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm2 +; KNL-NEXT: vpord %ymm1, %ymm2, %ymm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskx_or_and_mask_ymm: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovd2m %ymm2, %k1 +; SKX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm2 +; SKX-NEXT: vorps %ymm1, %ymm2, %ymm0 {%k1} +; SKX-NEXT: retq + %m = icmp slt <8 x i32> %mask, zeroinitializer + %a = and <8 x i32> %x, + %b = or <8 x i32> %a, %y + %c = select <8 x i1> %m, <8 x i32> %b, <8 x i32> %x + ret <8 x i32> %c +} + +define <2 x 
i64> @ternlog_maskx_xor_and_mask(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) { +; KNL-LABEL: ternlog_maskx_xor_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtq %xmm2, %xmm3, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm2 +; KNL-NEXT: vpxorq %xmm1, %xmm2, %xmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskx_xor_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovq2m %xmm2, %k1 +; SKX-NEXT: vandpd {{.*}}(%rip), %xmm0, %xmm2 +; SKX-NEXT: vxorpd %xmm1, %xmm2, %xmm0 {%k1} +; SKX-NEXT: retq + %m = icmp slt <2 x i64> %mask, zeroinitializer + %a = and <2 x i64> %x, + %b = xor <2 x i64> %a, %y + %c = select <2 x i1> %m, <2 x i64> %b, <2 x i64> %x + ret <2 x i64> %c +} + +define <4 x i64> @ternlog_maskx_xor_and_mask_ymm(<4 x i64> %x, <4 x i64> %y, <4 x i64> %mask) { +; KNL-LABEL: ternlog_maskx_xor_and_mask_ymm: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtq %ymm2, %ymm3, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm2 +; KNL-NEXT: vpxorq %ymm1, %ymm2, %ymm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskx_xor_and_mask_ymm: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovq2m %ymm2, %k1 +; SKX-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm2 +; SKX-NEXT: vxorpd %ymm1, %ymm2, %ymm0 {%k1} +; SKX-NEXT: retq + %m = icmp slt <4 x i64> %mask, zeroinitializer + %a = and <4 x i64> %x, + %b = xor <4 x i64> %a, %y + %c = select <4 x i1> %m, <4 x i64> %b, <4 x i64> %x + ret <4 x i64> %c +} + +define <4 x i32> @ternlog_masky_or_and_mask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %mask) { +; KNL-LABEL: ternlog_masky_or_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpgtd %xmm3, %xmm2, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpord %xmm1, %xmm0, %xmm1 {%k1} +; KNL-NEXT: vmovdqa %xmm1, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_masky_or_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovd2m %xmm3, %k1 +; SKX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vorps %xmm1, %xmm0, %xmm1 {%k1} +; SKX-NEXT: vmovaps %xmm1, %xmm0 +; SKX-NEXT: retq + %m = icmp slt <4 x i32> %mask, zeroinitializer + %a = and <4 x i32> %x, + %b = or <4 x i32> %a, %y + %c = select <4 x i1> %m, <4 x i32> %b, <4 x i32> %y + ret <4 x i32> %c +} + +define <8 x i32> @ternlog_masky_or_and_mask_ymm(<8 x i32> %x, <8 x i32> %y, <8 x i32> %mask) { +; KNL-LABEL: ternlog_masky_or_and_mask_ymm: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtd %ymm2, %ymm3, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm2 +; KNL-NEXT: vpord %ymm1, %ymm2, %ymm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_masky_or_and_mask_ymm: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovd2m %ymm2, %k1 +; SKX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm2 +; SKX-NEXT: vorps %ymm1, %ymm2, %ymm0 {%k1} +; SKX-NEXT: retq + %m = icmp slt <8 x i32> %mask, zeroinitializer + %a = and <8 x i32> %x, + %b = or <8 x i32> %a, %y + %c = select <8 x i1> %m, <8 x i32> %b, <8 x i32> %x + ret <8 x i32> %c +} + +define <2 x i64> @ternlog_masky_xor_and_mask(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) { +; KNL-LABEL: ternlog_masky_xor_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtq %xmm2, %xmm3, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpxorq %xmm1, %xmm0, %xmm1 {%k1} +; KNL-NEXT: vmovdqa %xmm1, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_masky_xor_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovq2m %xmm2, %k1 +; SKX-NEXT: vandpd {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vxorpd %xmm1, %xmm0, 
%xmm1 {%k1} +; SKX-NEXT: vmovapd %xmm1, %xmm0 +; SKX-NEXT: retq + %m = icmp slt <2 x i64> %mask, zeroinitializer + %a = and <2 x i64> %x, + %b = xor <2 x i64> %a, %y + %c = select <2 x i1> %m, <2 x i64> %b, <2 x i64> %y + ret <2 x i64> %c +} + +define <4 x i64> @ternlog_masky_xor_and_mask_ymm(<4 x i64> %x, <4 x i64> %y, <4 x i64> %mask) { +; KNL-LABEL: ternlog_masky_xor_and_mask_ymm: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtq %ymm2, %ymm3, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpxorq %ymm1, %ymm0, %ymm1 {%k1} +; KNL-NEXT: vmovdqa %ymm1, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_masky_xor_and_mask_ymm: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovq2m %ymm2, %k1 +; SKX-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vxorpd %ymm1, %ymm0, %ymm1 {%k1} +; SKX-NEXT: vmovapd %ymm1, %ymm0 +; SKX-NEXT: retq + %m = icmp slt <4 x i64> %mask, zeroinitializer + %a = and <4 x i64> %x, + %b = xor <4 x i64> %a, %y + %c = select <4 x i1> %m, <4 x i64> %b, <4 x i64> %y + ret <4 x i64> %c +} From 55dae9c20ce3f1149cf4c773258536f972e411c0 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Sat, 25 Jul 2020 16:40:06 -0700 Subject: [PATCH 0105/1035] [Statepoints] Style cleanup after 3da1a963 [NFC] Just fixing a few minor stylistic issues. --- .../SelectionDAG/StatepointLowering.cpp | 40 +++++++++---------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index b8c4c73bcccee..3063993ba97d8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -678,27 +678,24 @@ lowerStatepointMetaArgs(SmallVectorImpl &Ops, SDValue SDV = Builder.getValue(V); SDValue Loc = Builder.StatepointLowering.getLocation(SDV); - if (LowerAsVReg.count(SDV)) { - SpillMap[V] = None; - } else if (Loc.getNode()) { + if (Loc.getNode()) { + // If this is a value we spilled, remember where for when we visit the + // gc.relocate corresponding to this gc.statepoint SpillMap[V] = cast(Loc)->getIndex(); } else { - // Record value as visited, but not spilled. This is case for allocas - // and constants. For this values we can avoid emitting spill load while - // visiting corresponding gc_relocate. - // Actually we do not need to record them in this map at all. - // We do this only to check that we are not relocating any unvisited - // value. + // If we didn't spill the value - allocas, constants, and values lowered + // as tied vregs - mark them as visited, but not spilled. Marking them + // visited (as opposed to simply missing in the map), allows tighter + // assertion checking. SpillMap[V] = None; - // Default llvm mechanisms for exporting values which are used in - // different basic blocks does not work for gc relocates. - // Note that it would be incorrect to teach llvm that all relocates are - // uses of the corresponding values so that it would automatically - // export them. Relocates of the spilled values does not use original - // value. - if (Relocate->getParent() != StatepointInstr->getParent()) + // Conservatively export all values used by gc.relocates outside this + // block. This is currently only needed for expressions which don't need + // relocation, but will likely be extended for vreg case shortly. 
+ if (Relocate->getParent() != StatepointInstr->getParent()) { Builder.ExportFromCurrentBlock(V); + assert(!LowerAsVReg.count(SDV)); + } } } } @@ -844,9 +841,9 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT( SmallVector NodeTys; for (auto &Ptr : SI.Ptrs) { SDValue SD = getValue(Ptr); - if (LowerAsVReg.count(SD)) { - NodeTys.push_back(SD.getValueType()); - } + if (!LowerAsVReg.count(SD)) + continue; + NodeTys.push_back(SD.getValueType()); } LLVM_DEBUG(dbgs() << "Statepoint has " << NodeTys.size() << " results\n"); assert(NodeTys.size() == LowerAsVReg.size() && "Inconsistent GC Ptr lowering"); @@ -866,8 +863,9 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT( for (const auto *Relocate : SI.GCRelocates) { Value *Derived = Relocate->getDerivedPtr(); SDValue SD = getValue(Derived); - if (LowerAsVReg.count(SD)) - DPtrMap[Derived] = SDValue(StatepointMCNode, LowerAsVReg[SD]); + if (!LowerAsVReg.count(SD)) + continue; + DPtrMap[Derived] = SDValue(StatepointMCNode, LowerAsVReg[SD]); } // Build the GC_TRANSITION_END node if necessary. From cdead4f89c0eecf11f50092bc088e3a9c6511825 Mon Sep 17 00:00:00 2001 From: Nemanja Ivanovic Date: Sat, 25 Jul 2020 20:28:52 -0400 Subject: [PATCH 0106/1035] [PowerPC][NFC] Fix an assert that cannot trip from 7d076e19e31a I mixed up the precedence of operators in the assert and thought I had it right since there was no compiler warning. This just adds the parentheses in the expression as needed. --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 8c28ead9f6041..ae840a9fa37de 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -9311,7 +9311,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, if (!BVNIsConstantSplat || SplatBitSize > 32) { bool IsPermutedLoad = false; - const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0), IsPermutedLoad); + const SDValue *InputLoad = + getNormalLoadInput(Op.getOperand(0), IsPermutedLoad); // Handle load-and-splat patterns as we have instructions that will do this // in one go. if (InputLoad && DAG.isSplatValue(Op, true)) { @@ -9949,7 +9950,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, if (IsPermutedLoad) { assert(isLittleEndian && "Unexpected permuted load on big endian target"); SplatIdx += IsFourByte ? 2 : 1; - assert(SplatIdx < IsFourByte ? 4 : 2 && + assert((SplatIdx < (IsFourByte ? 4 : 2)) && "Splat of a value outside of the loaded memory"); } From 18975762c1974047baeeacb17879416410012a31 Mon Sep 17 00:00:00 2001 From: Eric Christopher Date: Sat, 25 Jul 2020 18:34:02 -0700 Subject: [PATCH 0107/1035] Fold StatepointBB into checks as it's only used from an NDEBUG or ASSERT context, fixing an unused variable warning. 
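A minimal sketch of the warning being fixed, with hypothetical names (`square`, `Squared`) rather than anything from the patch: under NDEBUG, `assert` expands to nothing, so a local that is only read inside asserts becomes unused and warns; folding the expression into the assert itself removes the named local. The alternative, wrapping the local in `#ifndef NDEBUG`, is noisier.

```cpp
#include <cassert>

int square(int X) {
  // Before the fold: a named local that is only read inside an assert.
  //   int Squared = X * X; // -Wunused-variable once NDEBUG strips the assert
  //   assert(Squared >= 0);
  assert(X * X >= 0); // after the fold: no named local is left to warn about
  return X * X;
}

int main() { return square(4) == 16 ? 0 : 1; }
```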
--- llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index 3063993ba97d8..4d93dec9e1fad 100644 --- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -1087,13 +1087,12 @@ void SelectionDAGBuilder::visitGCResult(const GCResultInst &CI) { } void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) { - const BasicBlock *StatepointBB = Relocate.getStatepoint()->getParent(); #ifndef NDEBUG // Consistency check // We skip this check for relocates not in the same basic block as their // statepoint. It would be too expensive to preserve validation info through // different basic blocks. - if (StatepointBB == Relocate.getParent()) + if (Relocate.getStatepoint()->getParent() == Relocate.getParent()) StatepointLowering.relocCallVisited(Relocate); auto *Ty = Relocate.getType()->getScalarType(); @@ -1117,7 +1116,8 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) { auto It = DPtrMap.find(DerivedPtr); if (It != DPtrMap.end()) { setValue(&Relocate, It->second); - assert(Relocate.getParent() == StatepointBB && "unexpected DPtrMap entry"); + assert(Relocate.getParent() == Relocate.getStatepoint()->getParent() && + "unexpected DPtrMap entry"); return; } From 4b14ef33e81c01632e848e7a67ccc6b11fb4c595 Mon Sep 17 00:00:00 2001 From: Eric Christopher Date: Sat, 25 Jul 2020 18:42:04 -0700 Subject: [PATCH 0108/1035] Temporarily Revert "Unify the return value of GetByteSize to an llvm::Optional (NFC-ish)" as it's causing numerous (176) test failures on linux. This reverts commit 1d9b860fb6a85df33fd52fcacc6a5efb421621bd. --- lldb/include/lldb/Core/ValueObject.h | 2 +- lldb/include/lldb/Core/ValueObjectCast.h | 2 +- lldb/include/lldb/Core/ValueObjectChild.h | 2 +- .../lldb/Core/ValueObjectConstResult.h | 4 ++-- .../lldb/Core/ValueObjectDynamicValue.h | 2 +- lldb/include/lldb/Core/ValueObjectMemory.h | 2 +- lldb/include/lldb/Core/ValueObjectRegister.h | 4 ++-- .../lldb/Core/ValueObjectSyntheticFilter.h | 2 +- lldb/include/lldb/Core/ValueObjectVariable.h | 2 +- .../lldb/Expression/ExpressionVariable.h | 2 +- .../lldb/Target/StackFrameRecognizer.h | 4 +--- lldb/source/API/SBValue.cpp | 2 +- .../Commands/CommandObjectWatchpoint.cpp | 2 +- lldb/source/Core/ValueObject.cpp | 12 +++++----- lldb/source/Core/ValueObjectCast.cpp | 2 +- lldb/source/Core/ValueObjectConstResult.cpp | 10 ++++---- lldb/source/Core/ValueObjectDynamicValue.cpp | 2 +- lldb/source/Core/ValueObjectMemory.cpp | 8 ++++--- lldb/source/Core/ValueObjectRegister.cpp | 6 ++--- .../Core/ValueObjectSyntheticFilter.cpp | 4 +--- lldb/source/Core/ValueObjectVariable.cpp | 6 ++--- lldb/source/Expression/ExpressionVariable.cpp | 8 +++---- lldb/source/Expression/Materializer.cpp | 23 ++++++++----------- lldb/source/Target/StackFrame.cpp | 6 ++--- 24 files changed, 57 insertions(+), 62 deletions(-) diff --git a/lldb/include/lldb/Core/ValueObject.h b/lldb/include/lldb/Core/ValueObject.h index a557d69f3ae30..0080368fd9965 100644 --- a/lldb/include/lldb/Core/ValueObject.h +++ b/lldb/include/lldb/Core/ValueObject.h @@ -358,7 +358,7 @@ class ValueObject : public UserID { virtual bool CanProvideValue(); // Subclasses must implement the functions below. 
- virtual llvm::Optional GetByteSize() = 0; + virtual uint64_t GetByteSize() = 0; virtual lldb::ValueType GetValueType() const = 0; diff --git a/lldb/include/lldb/Core/ValueObjectCast.h b/lldb/include/lldb/Core/ValueObjectCast.h index 342803f8ca63a..d91ca6a92be8d 100644 --- a/lldb/include/lldb/Core/ValueObjectCast.h +++ b/lldb/include/lldb/Core/ValueObjectCast.h @@ -30,7 +30,7 @@ class ValueObjectCast : public ValueObject { ConstString name, const CompilerType &cast_type); - llvm::Optional GetByteSize() override; + uint64_t GetByteSize() override; size_t CalculateNumChildren(uint32_t max) override; diff --git a/lldb/include/lldb/Core/ValueObjectChild.h b/lldb/include/lldb/Core/ValueObjectChild.h index 9a9fd9294261a..c6f44a29b0591 100644 --- a/lldb/include/lldb/Core/ValueObjectChild.h +++ b/lldb/include/lldb/Core/ValueObjectChild.h @@ -30,7 +30,7 @@ class ValueObjectChild : public ValueObject { public: ~ValueObjectChild() override; - llvm::Optional GetByteSize() override { return m_byte_size; } + uint64_t GetByteSize() override { return m_byte_size; } lldb::offset_t GetByteOffset() override { return m_byte_offset; } diff --git a/lldb/include/lldb/Core/ValueObjectConstResult.h b/lldb/include/lldb/Core/ValueObjectConstResult.h index 8d823baa0b7b4..0e868c687e931 100644 --- a/lldb/include/lldb/Core/ValueObjectConstResult.h +++ b/lldb/include/lldb/Core/ValueObjectConstResult.h @@ -62,7 +62,7 @@ class ValueObjectConstResult : public ValueObject { static lldb::ValueObjectSP Create(ExecutionContextScope *exe_scope, const Status &error); - llvm::Optional GetByteSize() override; + uint64_t GetByteSize() override; lldb::ValueType GetValueType() const override; @@ -113,7 +113,7 @@ class ValueObjectConstResult : public ValueObject { CompilerType GetCompilerTypeImpl() override; ConstString m_type_name; - llvm::Optional m_byte_size; + uint64_t m_byte_size; ValueObjectConstResultImpl m_impl; diff --git a/lldb/include/lldb/Core/ValueObjectDynamicValue.h b/lldb/include/lldb/Core/ValueObjectDynamicValue.h index 2806857339efb..9f5304b55e934 100644 --- a/lldb/include/lldb/Core/ValueObjectDynamicValue.h +++ b/lldb/include/lldb/Core/ValueObjectDynamicValue.h @@ -34,7 +34,7 @@ class ValueObjectDynamicValue : public ValueObject { public: ~ValueObjectDynamicValue() override; - llvm::Optional GetByteSize() override; + uint64_t GetByteSize() override; ConstString GetTypeName() override; diff --git a/lldb/include/lldb/Core/ValueObjectMemory.h b/lldb/include/lldb/Core/ValueObjectMemory.h index b5d5e6ecf4c0e..d1cd6ae41445d 100644 --- a/lldb/include/lldb/Core/ValueObjectMemory.h +++ b/lldb/include/lldb/Core/ValueObjectMemory.h @@ -40,7 +40,7 @@ class ValueObjectMemory : public ValueObject { const Address &address, const CompilerType &ast_type); - llvm::Optional GetByteSize() override; + uint64_t GetByteSize() override; ConstString GetTypeName() override; diff --git a/lldb/include/lldb/Core/ValueObjectRegister.h b/lldb/include/lldb/Core/ValueObjectRegister.h index 3968584ad5185..41051d93b707e 100644 --- a/lldb/include/lldb/Core/ValueObjectRegister.h +++ b/lldb/include/lldb/Core/ValueObjectRegister.h @@ -36,7 +36,7 @@ class ValueObjectRegisterSet : public ValueObject { lldb::RegisterContextSP ®_ctx_sp, uint32_t set_idx); - llvm::Optional GetByteSize() override; + uint64_t GetByteSize() override; lldb::ValueType GetValueType() const override { return lldb::eValueTypeRegisterSet; @@ -86,7 +86,7 @@ class ValueObjectRegister : public ValueObject { lldb::RegisterContextSP ®_ctx_sp, uint32_t reg_num); - llvm::Optional 
GetByteSize() override; + uint64_t GetByteSize() override; lldb::ValueType GetValueType() const override { return lldb::eValueTypeRegister; diff --git a/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h b/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h index 41c461ce13f0d..cb471657aec9b 100644 --- a/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h +++ b/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h @@ -36,7 +36,7 @@ class ValueObjectSynthetic : public ValueObject { public: ~ValueObjectSynthetic() override; - llvm::Optional GetByteSize() override; + uint64_t GetByteSize() override; ConstString GetTypeName() override; diff --git a/lldb/include/lldb/Core/ValueObjectVariable.h b/lldb/include/lldb/Core/ValueObjectVariable.h index 23fdedbf5a4a6..b7e262574a14d 100644 --- a/lldb/include/lldb/Core/ValueObjectVariable.h +++ b/lldb/include/lldb/Core/ValueObjectVariable.h @@ -37,7 +37,7 @@ class ValueObjectVariable : public ValueObject { static lldb::ValueObjectSP Create(ExecutionContextScope *exe_scope, const lldb::VariableSP &var_sp); - llvm::Optional GetByteSize() override; + uint64_t GetByteSize() override; ConstString GetTypeName() override; diff --git a/lldb/include/lldb/Expression/ExpressionVariable.h b/lldb/include/lldb/Expression/ExpressionVariable.h index 4259e6395da47..60062d212badf 100644 --- a/lldb/include/lldb/Expression/ExpressionVariable.h +++ b/lldb/include/lldb/Expression/ExpressionVariable.h @@ -32,7 +32,7 @@ class ExpressionVariable virtual ~ExpressionVariable(); - llvm::Optional GetByteSize() { return m_frozen_sp->GetByteSize(); } + size_t GetByteSize() { return m_frozen_sp->GetByteSize(); } ConstString GetName() { return m_frozen_sp->GetName(); } diff --git a/lldb/include/lldb/Target/StackFrameRecognizer.h b/lldb/include/lldb/Target/StackFrameRecognizer.h index baffc890bb065..302b56bec907b 100644 --- a/lldb/include/lldb/Target/StackFrameRecognizer.h +++ b/lldb/include/lldb/Target/StackFrameRecognizer.h @@ -154,9 +154,7 @@ class ValueObjectRecognizerSynthesizedValue : public ValueObject { SetName(parent.GetName()); } - llvm::Optional GetByteSize() override { - return m_parent->GetByteSize(); - } + uint64_t GetByteSize() override { return m_parent->GetByteSize(); } lldb::ValueType GetValueType() const override { return m_type; } bool UpdateValue() override { if (!m_parent->UpdateValueIfNeeded()) return false; diff --git a/lldb/source/API/SBValue.cpp b/lldb/source/API/SBValue.cpp index 686d1f23a75a8..7485b0ee1838e 100644 --- a/lldb/source/API/SBValue.cpp +++ b/lldb/source/API/SBValue.cpp @@ -333,7 +333,7 @@ size_t SBValue::GetByteSize() { ValueLocker locker; lldb::ValueObjectSP value_sp(GetSP(locker)); if (value_sp) { - result = value_sp->GetByteSize().getValueOr(0); + result = value_sp->GetByteSize(); } return result; diff --git a/lldb/source/Commands/CommandObjectWatchpoint.cpp b/lldb/source/Commands/CommandObjectWatchpoint.cpp index c2a008af79d6f..ce4662930a7c2 100644 --- a/lldb/source/Commands/CommandObjectWatchpoint.cpp +++ b/lldb/source/Commands/CommandObjectWatchpoint.cpp @@ -905,7 +905,7 @@ corresponding to the byte size of the data type."); // We're in business. // Find out the size of this variable. size = m_option_watchpoint.watch_size == 0 - ? valobj_sp->GetByteSize().getValueOr(0) + ? 
valobj_sp->GetByteSize() : m_option_watchpoint.watch_size; } compiler_type = valobj_sp->GetCompilerType(); diff --git a/lldb/source/Core/ValueObject.cpp b/lldb/source/Core/ValueObject.cpp index aedefd0cf0fd9..3a775b07e5e1f 100644 --- a/lldb/source/Core/ValueObject.cpp +++ b/lldb/source/Core/ValueObject.cpp @@ -849,7 +849,7 @@ bool ValueObject::SetData(DataExtractor &data, Status &error) { uint64_t count = 0; const Encoding encoding = GetCompilerType().GetEncoding(count); - const size_t byte_size = GetByteSize().getValueOr(0); + const size_t byte_size = GetByteSize(); Value::ValueType value_type = m_value.GetValueType(); @@ -1524,7 +1524,7 @@ bool ValueObject::SetValueFromCString(const char *value_str, Status &error) { uint64_t count = 0; const Encoding encoding = GetCompilerType().GetEncoding(count); - const size_t byte_size = GetByteSize().getValueOr(0); + const size_t byte_size = GetByteSize(); Value::ValueType value_type = m_value.GetValueType(); @@ -1741,13 +1741,13 @@ ValueObjectSP ValueObject::GetSyntheticBitFieldChild(uint32_t from, uint32_t to, uint32_t bit_field_offset = from; if (GetDataExtractor().GetByteOrder() == eByteOrderBig) bit_field_offset = - GetByteSize().getValueOr(0) * 8 - bit_field_size - bit_field_offset; + GetByteSize() * 8 - bit_field_size - bit_field_offset; // We haven't made a synthetic array member for INDEX yet, so lets make // one and cache it for any future reference. ValueObjectChild *synthetic_child = new ValueObjectChild( - *this, GetCompilerType(), index_const_str, - GetByteSize().getValueOr(0), 0, bit_field_size, bit_field_offset, - false, false, eAddressTypeInvalid, 0); + *this, GetCompilerType(), index_const_str, GetByteSize(), 0, + bit_field_size, bit_field_offset, false, false, eAddressTypeInvalid, + 0); // Cache the value if we got one back... if (synthetic_child) { diff --git a/lldb/source/Core/ValueObjectCast.cpp b/lldb/source/Core/ValueObjectCast.cpp index 7b6d3591faf44..22e856be539b5 100644 --- a/lldb/source/Core/ValueObjectCast.cpp +++ b/lldb/source/Core/ValueObjectCast.cpp @@ -47,7 +47,7 @@ size_t ValueObjectCast::CalculateNumChildren(uint32_t max) { return children_count <= max ? 
children_count : max; } -llvm::Optional ValueObjectCast::GetByteSize() { +uint64_t ValueObjectCast::GetByteSize() { ExecutionContext exe_ctx(GetExecutionContextRef()); return m_value.GetValueByteSize(nullptr, &exe_ctx); } diff --git a/lldb/source/Core/ValueObjectConstResult.cpp b/lldb/source/Core/ValueObjectConstResult.cpp index fd31ddc676b43..8d84f8e62ccc5 100644 --- a/lldb/source/Core/ValueObjectConstResult.cpp +++ b/lldb/source/Core/ValueObjectConstResult.cpp @@ -179,7 +179,8 @@ ValueObjectSP ValueObjectConstResult::Create(ExecutionContextScope *exe_scope, ValueObjectConstResult::ValueObjectConstResult(ExecutionContextScope *exe_scope, ValueObjectManager &manager, const Status &error) - : ValueObject(exe_scope, manager), m_impl(this) { + : ValueObject(exe_scope, manager), m_type_name(), m_byte_size(0), + m_impl(this) { m_error = error; SetIsConstant(); } @@ -188,7 +189,8 @@ ValueObjectConstResult::ValueObjectConstResult(ExecutionContextScope *exe_scope, ValueObjectManager &manager, const Value &value, ConstString name, Module *module) - : ValueObject(exe_scope, manager), m_impl(this) { + : ValueObject(exe_scope, manager), m_type_name(), m_byte_size(0), + m_impl(this) { m_value = value; m_name = name; ExecutionContext exe_ctx; @@ -206,9 +208,9 @@ lldb::ValueType ValueObjectConstResult::GetValueType() const { return eValueTypeConstResult; } -llvm::Optional ValueObjectConstResult::GetByteSize() { +uint64_t ValueObjectConstResult::GetByteSize() { ExecutionContext exe_ctx(GetExecutionContextRef()); - if (!m_byte_size) { + if (m_byte_size == 0) { if (auto size = GetCompilerType().GetByteSize(exe_ctx.GetBestExecutionContextScope())) SetByteSize(*size); diff --git a/lldb/source/Core/ValueObjectDynamicValue.cpp b/lldb/source/Core/ValueObjectDynamicValue.cpp index 1c25b8c85a059..ca66740cb55d4 100644 --- a/lldb/source/Core/ValueObjectDynamicValue.cpp +++ b/lldb/source/Core/ValueObjectDynamicValue.cpp @@ -98,7 +98,7 @@ size_t ValueObjectDynamicValue::CalculateNumChildren(uint32_t max) { return m_parent->GetNumChildren(max); } -llvm::Optional ValueObjectDynamicValue::GetByteSize() { +uint64_t ValueObjectDynamicValue::GetByteSize() { const bool success = UpdateValueIfNeeded(false); if (success && m_dynamic_type_info.HasType()) { ExecutionContext exe_ctx(GetExecutionContextRef()); diff --git a/lldb/source/Core/ValueObjectMemory.cpp b/lldb/source/Core/ValueObjectMemory.cpp index 17fade9e5fdc3..8e7d3ebc93f69 100644 --- a/lldb/source/Core/ValueObjectMemory.cpp +++ b/lldb/source/Core/ValueObjectMemory.cpp @@ -139,11 +139,13 @@ size_t ValueObjectMemory::CalculateNumChildren(uint32_t max) { return child_count <= max ? 
child_count : max; } -llvm::Optional ValueObjectMemory::GetByteSize() { +uint64_t ValueObjectMemory::GetByteSize() { ExecutionContext exe_ctx(GetExecutionContextRef()); if (m_type_sp) - return m_type_sp->GetByteSize(exe_ctx.GetBestExecutionContextScope()); - return m_compiler_type.GetByteSize(exe_ctx.GetBestExecutionContextScope()); + return m_type_sp->GetByteSize(exe_ctx.GetBestExecutionContextScope()) + .getValueOr(0); + return m_compiler_type.GetByteSize(exe_ctx.GetBestExecutionContextScope()) + .getValueOr(0); } lldb::ValueType ValueObjectMemory::GetValueType() const { diff --git a/lldb/source/Core/ValueObjectRegister.cpp b/lldb/source/Core/ValueObjectRegister.cpp index 27461e9cebc41..ec87c38fb3679 100644 --- a/lldb/source/Core/ValueObjectRegister.cpp +++ b/lldb/source/Core/ValueObjectRegister.cpp @@ -81,7 +81,7 @@ size_t ValueObjectRegisterSet::CalculateNumChildren(uint32_t max) { return 0; } -llvm::Optional ValueObjectRegisterSet::GetByteSize() { return 0; } +uint64_t ValueObjectRegisterSet::GetByteSize() { return 0; } bool ValueObjectRegisterSet::UpdateValue() { m_error.Clear(); @@ -229,9 +229,7 @@ size_t ValueObjectRegister::CalculateNumChildren(uint32_t max) { return children_count <= max ? children_count : max; } -llvm::Optional ValueObjectRegister::GetByteSize() { - return m_reg_info.byte_size; -} +uint64_t ValueObjectRegister::GetByteSize() { return m_reg_info.byte_size; } bool ValueObjectRegister::UpdateValue() { m_error.Clear(); diff --git a/lldb/source/Core/ValueObjectSyntheticFilter.cpp b/lldb/source/Core/ValueObjectSyntheticFilter.cpp index fb2d32e602cea..32d1e6ab8368c 100644 --- a/lldb/source/Core/ValueObjectSyntheticFilter.cpp +++ b/lldb/source/Core/ValueObjectSyntheticFilter.cpp @@ -121,9 +121,7 @@ bool ValueObjectSynthetic::MightHaveChildren() { return (m_might_have_children != eLazyBoolNo); } -llvm::Optional ValueObjectSynthetic::GetByteSize() { - return m_parent->GetByteSize(); -} +uint64_t ValueObjectSynthetic::GetByteSize() { return m_parent->GetByteSize(); } lldb::ValueType ValueObjectSynthetic::GetValueType() const { return m_parent->GetValueType(); diff --git a/lldb/source/Core/ValueObjectVariable.cpp b/lldb/source/Core/ValueObjectVariable.cpp index ab67e3038cf0a..0d1e7b047a0ac 100644 --- a/lldb/source/Core/ValueObjectVariable.cpp +++ b/lldb/source/Core/ValueObjectVariable.cpp @@ -105,15 +105,15 @@ size_t ValueObjectVariable::CalculateNumChildren(uint32_t max) { return child_count <= max ? 
child_count : max; } -llvm::Optional ValueObjectVariable::GetByteSize() { +uint64_t ValueObjectVariable::GetByteSize() { ExecutionContext exe_ctx(GetExecutionContextRef()); CompilerType type(GetCompilerType()); if (!type.IsValid()) - return {}; + return 0; - return type.GetByteSize(exe_ctx.GetBestExecutionContextScope()); + return type.GetByteSize(exe_ctx.GetBestExecutionContextScope()).getValueOr(0); } lldb::ValueType ValueObjectVariable::GetValueType() const { diff --git a/lldb/source/Expression/ExpressionVariable.cpp b/lldb/source/Expression/ExpressionVariable.cpp index 8b3dda7b2fe10..d95f0745cf4ba 100644 --- a/lldb/source/Expression/ExpressionVariable.cpp +++ b/lldb/source/Expression/ExpressionVariable.cpp @@ -16,10 +16,10 @@ using namespace lldb_private; ExpressionVariable::~ExpressionVariable() {} uint8_t *ExpressionVariable::GetValueBytes() { - llvm::Optional byte_size = m_frozen_sp->GetByteSize(); - if (byte_size && *byte_size) { - if (m_frozen_sp->GetDataExtractor().GetByteSize() < *byte_size) { - m_frozen_sp->GetValue().ResizeData(*byte_size); + const size_t byte_size = m_frozen_sp->GetByteSize(); + if (byte_size > 0) { + if (m_frozen_sp->GetDataExtractor().GetByteSize() < byte_size) { + m_frozen_sp->GetValue().ResizeData(byte_size); m_frozen_sp->GetValue().GetData(m_frozen_sp->GetDataExtractor()); } return const_cast( diff --git a/lldb/source/Expression/Materializer.cpp b/lldb/source/Expression/Materializer.cpp index 327e15a26266f..6f8d9b154570a 100644 --- a/lldb/source/Expression/Materializer.cpp +++ b/lldb/source/Expression/Materializer.cpp @@ -67,7 +67,7 @@ class EntityPersistentVariable : public Materializer::Entity { const bool zero_memory = false; lldb::addr_t mem = map.Malloc( - m_persistent_variable_sp->GetByteSize().getValueOr(0), 8, + m_persistent_variable_sp->GetByteSize(), 8, lldb::ePermissionsReadable | lldb::ePermissionsWritable, IRMemoryMap::eAllocationPolicyMirror, zero_memory, allocate_error); @@ -106,8 +106,7 @@ class EntityPersistentVariable : public Materializer::Entity { Status write_error; map.WriteMemory(mem, m_persistent_variable_sp->GetValueBytes(), - m_persistent_variable_sp->GetByteSize().getValueOr(0), - write_error); + m_persistent_variable_sp->GetByteSize(), write_error); if (!write_error.Success()) { err.SetErrorStringWithFormat( @@ -235,7 +234,7 @@ class EntityPersistentVariable : public Materializer::Entity { map.GetBestExecutionContextScope(), m_persistent_variable_sp.get()->GetCompilerType(), m_persistent_variable_sp->GetName(), location, eAddressTypeLoad, - m_persistent_variable_sp->GetByteSize().getValueOr(0)); + m_persistent_variable_sp->GetByteSize()); if (frame_top != LLDB_INVALID_ADDRESS && frame_bottom != LLDB_INVALID_ADDRESS && location >= frame_bottom && @@ -280,8 +279,7 @@ class EntityPersistentVariable : public Materializer::Entity { LLDB_LOGF(log, "Dematerializing %s from 0x%" PRIx64 " (size = %llu)", m_persistent_variable_sp->GetName().GetCString(), (uint64_t)mem, - (unsigned long long)m_persistent_variable_sp->GetByteSize() - .getValueOr(0)); + (unsigned long long)m_persistent_variable_sp->GetByteSize()); // Read the contents of the spare memory area @@ -290,7 +288,7 @@ class EntityPersistentVariable : public Materializer::Entity { Status read_error; map.ReadMemory(m_persistent_variable_sp->GetValueBytes(), mem, - m_persistent_variable_sp->GetByteSize().getValueOr(0), read_error); + m_persistent_variable_sp->GetByteSize(), read_error); if (!read_error.Success()) { err.SetErrorStringWithFormat( @@ -371,11 +369,10 @@ class 
EntityPersistentVariable : public Materializer::Entity { if (!err.Success()) { dump_stream.Printf(" \n"); } else { - DataBufferHeap data( - m_persistent_variable_sp->GetByteSize().getValueOr(0), 0); + DataBufferHeap data(m_persistent_variable_sp->GetByteSize(), 0); map.ReadMemory(data.GetBytes(), target_address, - m_persistent_variable_sp->GetByteSize().getValueOr(0), err); + m_persistent_variable_sp->GetByteSize(), err); if (!err.Success()) { dump_stream.Printf(" \n"); @@ -624,8 +621,8 @@ class EntityVariable : public Materializer::Entity { Status extract_error; - map.GetMemoryData(data, m_temporary_allocation, - valobj_sp->GetByteSize().getValueOr(0), extract_error); + map.GetMemoryData(data, m_temporary_allocation, valobj_sp->GetByteSize(), + extract_error); if (!extract_error.Success()) { err.SetErrorStringWithFormat("couldn't get the data for variable %s", @@ -922,7 +919,7 @@ class EntityResultVariable : public Materializer::Entity { ret->ValueUpdated(); - const size_t pvar_byte_size = ret->GetByteSize().getValueOr(0); + const size_t pvar_byte_size = ret->GetByteSize(); uint8_t *pvar_data = ret->GetValueBytes(); map.ReadMemory(pvar_data, address, pvar_byte_size, read_error); diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp index 22bca52d7f98a..098aed9cd8125 100644 --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -1408,7 +1408,7 @@ ValueObjectSP GetValueForOffset(StackFrame &frame, ValueObjectSP &parent, } int64_t child_offset = child_sp->GetByteOffset(); - int64_t child_size = child_sp->GetByteSize().getValueOr(0); + int64_t child_size = child_sp->GetByteSize(); if (offset >= child_offset && offset < (child_offset + child_size)) { return GetValueForOffset(frame, child_sp, offset - child_offset); @@ -1441,8 +1441,8 @@ ValueObjectSP GetValueForDereferincingOffset(StackFrame &frame, } if (offset >= 0 && uint64_t(offset) >= pointee->GetByteSize()) { - int64_t index = offset / pointee->GetByteSize().getValueOr(1); - offset = offset % pointee->GetByteSize().getValueOr(1); + int64_t index = offset / pointee->GetByteSize(); + offset = offset % pointee->GetByteSize(); const bool can_create = true; pointee = base->GetSyntheticArrayMember(index, can_create); } From 1df8804ce57576ff0c655073df082042cdf970f6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 25 Jul 2020 20:46:42 -0700 Subject: [PATCH 0109/1035] [X86] Replace a use of ProcIntelSLM with FeatureFast7ByteNOP. --- llvm/lib/Target/X86/X86MCInstLower.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 9ce2a4637e2ea..b4db72e150601 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1083,7 +1083,7 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, /// target cpu. 15-bytes is the longest single NOP instruction, but some /// platforms can't decode the longest forms efficiently. 
static unsigned maxLongNopLength(const X86Subtarget *Subtarget) {
-  if (Subtarget->getFeatureBits()[X86::ProcIntelSLM])
+  if (Subtarget->getFeatureBits()[X86::FeatureFast7ByteNOP])
     return 7;
   if (Subtarget->getFeatureBits()[X86::FeatureFast15ByteNOP])
     return 15;

From 9162b70e510475a044ee98332c9cdabf094b0e9b Mon Sep 17 00:00:00 2001
From: Changpeng Fang
Date: Sat, 25 Jul 2020 21:20:59 -0700
Subject: [PATCH 0110/1035] DAGCombiner: Don't simplify the token factor if the
 node's number of operands already exceeds TokenFactorInlineLimit

Summary:
In parallelizeChainedStores, a TokenFactor could be created with more than
3000 operands. We found that DAGCombiner::visitTokenFactor will consume a
huge amount of time on such nodes. Since the number of operands already
exceeds TokenFactorInlineLimit, give up on the simplification for the sake
of compile time.

Reviewers: @spatel, @arsenm

Differential Revision: https://reviews.llvm.org/D84204
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  4 ++
 .../AMDGPU/token-factor-inline-limit-test.ll  | 58 +++++++++++++++++++
 2 files changed, 62 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a026d39600264..8932503b9564b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1805,6 +1805,10 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
   if (OptLevel == CodeGenOpt::None)
     return SDValue();

+  // Don't simplify the token factor if the node itself has too many operands.
+  if (N->getNumOperands() > TokenFactorInlineLimit)
+    return SDValue();
+
   // If the sole user is a token factor, we should make sure we have a
   // chance to merge them together. This prevents TF chains from inhibiting
   // optimizations.
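Why a node with thousands of operands is pathological: when nested token factors are inlined, each incoming operand is checked against the operands gathered so far, so the merging work grows roughly quadratically with the operand count. A standalone sketch of that cost model follows; the function and its bookkeeping are illustrative assumptions, not LLVM code.

    #include <cstdint>
    #include <vector>

    // Rough model: inlining nested token factors compares every incoming
    // operand against the operands already gathered, to deduplicate chains.
    uint64_t estimatedMergeWork(const std::vector<uint32_t> &NestedCounts) {
      uint64_t Gathered = 0, Work = 0;
      for (uint32_t N : NestedCounts) {
        Work += Gathered * N; // each new operand vs. all previously gathered
        Gathered += N;
      }
      return Work;
    }

For 3000 operands arriving in batches of 100 this already amounts to several million comparisons, all of which the early exit above skips once the node itself is past TokenFactorInlineLimit.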
diff --git a/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll b/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll new file mode 100644 index 0000000000000..88d9eeb8f2667 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll @@ -0,0 +1,58 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-TFILD %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -combiner-tokenfactor-inline-limit=7 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-TFIL7 %s + + +; GCN-LABEL: {{^}}token_factor_inline_limit_test: + +; GCN-TFILD: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 +; GCN-TFILD: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 +; GCN-TFILD: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 +; GCN-TFILD: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 +; GCN-TFILD: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 +; GCN-TFILD: buffer_store_dword [[REG8]], {{.*$}} +; GCN-TFILD: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN-TFILD: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN-TFILD: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN-TFILD: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN-TFILD: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 +; GCN-TFILD: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 +; GCN-TFILD: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 +; GCN-TFILD: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN-TFILD: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN-TFILD: buffer_store_dword [[REG15]], {{.*}} offset:28 + +; GCN-TFIL7: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 +; GCN-TFIL7: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 +; GCN-TFIL7: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 +; GCN-TFIL7: buffer_store_dword [[REG15]], {{.*}} offset:28 +; GCN-TFIL7: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN-TFIL7: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN-TFIL7: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 +; GCN-TFIL7: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 +; GCN-TFIL7: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 +; GCN-TFIL7: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 +; GCN-TFIL7: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 +; GCN-TFIL7: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN-TFIL7: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN-TFIL7: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN-TFIL7: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN-TFIL7: buffer_store_dword [[REG8]], {{.*$}} + +; GCN: v_mov_b32_e32 v31, 7 +; GCN: s_getpc +define void @token_factor_inline_limit_test() { +entry: + call void @external_void_func_8xv5i32( + <5 x i32>, + <5 x i32>, + <5 x i32>, + <5 x i32>, + <5 x i32>, + <5 x i32>, + <5 x i32>, + <5 x i32>) + ret void +} + +declare hidden void @external_void_func_8xv5i32(<5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>, + <5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>) From 595d214f47e484ffe517a4294d3ac042d6c7d25d Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Sat, 25 Jul 2020 21:37:15 -0700 Subject: [PATCH 0111/1035] [mlir][shape] Further operand and result type generalization Previous changes generalized some of the operands and results. Complete a larger group of those to simplify progressive lowering. Also update some of the declarative asm form due to generalization. Tried to keep it mostly mechanical. 
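The mechanical pattern behind the generalization is visible in the Shape.cpp hunks below: the new builders derive the result type from the operand type, so a single op works on both !shape.shape values and extent tensors. Condensed into a standalone helper (a sketch paraphrasing the ShapeOfOp::build change; the wrapper name and the exact includes are assumptions):

    #include "mlir/Dialect/Shape/IR/Shape.h"
    #include "mlir/IR/Builders.h"

    using namespace mlir;

    // Illustrative wrapper: ranked-tensor operands get an extent-tensor
    // result type, anything else falls back to the opaque !shape.shape type.
    static void buildShapeOf(OpBuilder &builder, OperationState &result,
                             Value arg) {
      if (arg.getType().isa<ShapedType>()) {
        auto type = RankedTensorType::get({ShapedType::kDynamicSize},
                                          builder.getIndexType());
        shape::ShapeOfOp::build(builder, result, type, arg);
        return;
      }
      shape::ShapeOfOp::build(builder, result,
                              shape::ShapeType::get(builder.getContext()),
                              arg);
    }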
--- .../include/mlir/Dialect/Shape/IR/ShapeOps.td | 38 ++++++++++------- mlir/lib/Dialect/Shape/IR/Shape.cpp | 42 +++++++++++++++++++ .../ShapeToStandard/shape-to-standard.mlir | 6 +-- mlir/test/Dialect/Shape/canonicalize.mlir | 30 ++++++------- mlir/test/Dialect/Shape/ops.mlir | 10 ++--- 5 files changed, 87 insertions(+), 39 deletions(-) diff --git a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td index 3c50a4f8b39f4..7b676a2b05981 100644 --- a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td +++ b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td @@ -86,11 +86,12 @@ def Shape_BroadcastOp : Shape_Op<"broadcast", [Commutative]> { broadcastable output shape possible for the given inputs. }]; - let arguments = (ins Shape_ShapeType:$lhs, Shape_ShapeType:$rhs, - OptionalAttr:$error); + let arguments = (ins Shape_ShapeOrExtentTensorType:$lhs, + Shape_ShapeOrExtentTensorType:$rhs, + OptionalAttr:$error); let results = (outs Shape_ShapeType:$result); - let assemblyFormat = "$lhs `,` $rhs attr-dict"; + let assemblyFormat = "$lhs `,` $rhs attr-dict `:` type($lhs) `,` type($rhs)"; let hasFolder = 1; } @@ -220,10 +221,10 @@ def Shape_ToExtentTensorOp : Shape_Op<"to_extent_tensor", [NoSideEffect]> { If the shape represents an error, this op's behavior is undefined. }]; - let arguments = (ins Shape_ShapeType:$input); + let arguments = (ins Shape_ShapeOrExtentTensorType:$input); let results = (outs IndexTensor:$result); - let assemblyFormat = "attr-dict $input `:` type($result)"; + let assemblyFormat = "attr-dict $input `:` type($input) `->` type($result)"; let hasFolder = 1; } @@ -342,6 +343,10 @@ def Shape_NumElementsOp : Shape_Op<"num_elements", [NoSideEffect]> { let arguments = (ins Shape_ShapeOrExtentTensorType:$shape); let results = (outs Shape_SizeOrIndexType:$result); + let builders = [ + OpBuilder<"OpBuilder &builder, OperationState &result, Value shape">, + ]; + let assemblyFormat = "$shape `:` type($shape) `->` type($result) attr-dict"; let hasFolder = 1; @@ -412,23 +417,28 @@ def Shape_ShapeOfOp : Shape_Op<"shape_of", [NoSideEffect]> { let assemblyFormat = "$arg `:` type($arg) `->` type($result) attr-dict"; + let builders = [ + OpBuilder<"OpBuilder &builder, OperationState &result, Value arg"> + ]; + let verifier = [{ return ::verifyShapeOrExtentTensorOp(*this); }]; + let hasCanonicalizer = 1; let hasFolder = 1; } def Shape_SizeToIndexOp : Shape_Op<"size_to_index", [NoSideEffect]> { let summary = "Casts between index types of the shape and standard dialect"; let description = [{ - Converts a `shape.size` to a standard index. - This operation and its inverse, `index_to_size`, facilitate index conversion - between the standard and the shape dialect. - The behavior is undefined for unknown and invalid arguments. + Converts a `shape.size` to a standard index. This operation and its + inverse, `index_to_size`, facilitate index conversion between the standard + and the shape dialect. The behavior is undefined for unknown and invalid + arguments. 
}]; - let arguments = (ins Shape_SizeType:$arg); + let arguments = (outs Shape_SizeOrIndexType:$arg); let results = (outs Index:$result); - let assemblyFormat = "$arg attr-dict"; + let assemblyFormat = "$arg attr-dict `:` type($arg)"; let hasFolder = 1; let hasCanonicalizer = 1; @@ -490,7 +500,7 @@ def Shape_SplitAtOp : Shape_Op<"split_at", []> { - `index` is in the range [-rank(operand),rank(operand)] }]; - let arguments = (ins Shape_ShapeType:$operand, I32:$index); + let arguments = (ins Shape_ShapeOrExtentTensorType:$operand, I32:$index); let results = (outs Shape_ShapeType:$head, Shape_ShapeType:$tail); let hasFolder = 1; } @@ -520,8 +530,7 @@ def Shape_ConcatOp : Shape_Op<"concat", []> { // TODO: Move the code below and witnesses to a different file. def Shape_AnyOp : Shape_Op<"any", [Commutative, - NoSideEffect, - SameOperandsAndResultType]> { + NoSideEffect]> { let summary = "Return any combination of the input shapes"; let description = [{ This operation takes multiple input shapes or extent tensors and returns @@ -541,7 +550,6 @@ def Shape_AnyOp : Shape_Op<"any", [Commutative, let arguments = (ins Variadic:$inputs); let results = (outs Shape_ShapeOrExtentTensorType:$result); - let assemblyFormat = "$inputs `:` type($result) attr-dict"; let hasFolder = 1; } diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp index 104ab46c55813..4887c87c1e5f1 100644 --- a/mlir/lib/Dialect/Shape/IR/Shape.cpp +++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp @@ -674,6 +674,16 @@ OpFoldResult NumElementsOp::fold(ArrayRef operands) { return builder.getIndexAttr(product.getLimitedValue()); } +void NumElementsOp::build(OpBuilder &builder, OperationState &result, + Value shape) { + if (shape.getType().isa()) { + auto type = builder.getIndexType(); + return build(builder, result, type, shape); + } + auto type = SizeType::get(builder.getContext()); + return build(builder, result, type, shape); +} + //===----------------------------------------------------------------------===// // MulOp //===----------------------------------------------------------------------===// @@ -702,6 +712,38 @@ OpFoldResult ShapeOfOp::fold(ArrayRef) { return builder.getIndexTensorAttr(type.getShape()); } +void ShapeOfOp::build(OpBuilder &builder, OperationState &result, Value arg) { + if (arg.getType().isa()) { + auto type = RankedTensorType::get({ShapedType::kDynamicSize}, + builder.getIndexType()); + return ShapeOfOp::build(builder, result, type, arg); + } + auto type = ShapeType::get(builder.getContext()); + return ShapeOfOp::build(builder, result, type, arg); +} + +namespace { +struct ShapeOfWithTensor : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(shape::ShapeOfOp op, + PatternRewriter &rewriter) const override { + if (!op.arg().getType().isa()) + return failure(); + if (op.getType().isa()) + return failure(); + + rewriter.replaceOpWithNewOp(op.getOperation(), op.arg()); + return success(); + } +}; +} // namespace + +void ShapeOfOp::getCanonicalizationPatterns(OwningRewritePatternList &patterns, + MLIRContext *context) { + patterns.insert(context); +} + //===----------------------------------------------------------------------===// // SizeToIndexOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir b/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir index 8236c6f279755..9336402d86da4 100644 --- 
a/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir +++ b/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir @@ -50,7 +50,6 @@ func @shape_of_stat(%arg : tensor<1x2x3xf32>) { // CHECK-DAG: %[[C2:.*]] = constant 2 : index // CHECK-DAG: %[[C3:.*]] = constant 3 : index // CHECK-DAG: %[[SHAPE_UNCASTED:.*]] = tensor_from_elements(%[[C1]], %[[C2]], %[[C3]]) : tensor<3xindex> - // CHECK-DAG: %[[SHAPE:.*]] = tensor_cast %[[SHAPE_UNCASTED]] : tensor<3xindex> to tensor %shape = shape.shape_of %arg : tensor<1x2x3xf32> -> tensor return } @@ -66,7 +65,6 @@ func @shape_of_dyn(%arg : tensor<1x5x?xf32>) { // CHECK-DAG: %[[C2:.*]] = constant 2 : index // CHECK-DAG: %[[DYN_DIM:.*]] = dim %[[ARG]], %[[C2]] : tensor<1x5x?xf32> // CHECK-DAG: %[[SHAPE_UNCASTED:.*]] = tensor_from_elements(%[[C1]], %[[C5]], %[[DYN_DIM]]) : tensor<3xindex> - // CHECK-DAG: %[[SHAPE:.*]] = tensor_cast %[[SHAPE_UNCASTED]] : tensor<3xindex> to tensor %shape = shape.shape_of %arg : tensor<1x5x?xf32> -> tensor return } @@ -120,7 +118,7 @@ func @any_of_three(%a : tensor, %b : tensor, %c : tensor) -> tensor { // CHECK: return %[[A]] : tensor - %result = shape.any %a, %b, %c : tensor + %result = "shape.any"(%a, %b, %c) : (tensor, tensor, tensor) -> tensor return %result : tensor } @@ -131,7 +129,7 @@ func @any_of_three(%a : tensor, // CHECK-SAME: (%[[A:.*]]: tensor) -> tensor func @any_of_one(%a : tensor) -> tensor { // CHECK: return %[[A]] : tensor - %result = shape.any %a : tensor + %result = "shape.any"(%a) : (tensor) -> tensor return %result : tensor } diff --git a/mlir/test/Dialect/Shape/canonicalize.mlir b/mlir/test/Dialect/Shape/canonicalize.mlir index e147fbeb81ac2..5fe2ac108a69d 100644 --- a/mlir/test/Dialect/Shape/canonicalize.mlir +++ b/mlir/test/Dialect/Shape/canonicalize.mlir @@ -54,7 +54,7 @@ func @f() -> !shape.shape { // CHECK: shape.const_shape [7, 2] : !shape.shape %0 = shape.const_shape [1, 2] : !shape.shape %1 = shape.const_shape [7, 1] : !shape.shape - %2 = shape.broadcast %0, %1 + %2 = shape.broadcast %0, %1 : !shape.shape, !shape.shape return %2 : !shape.shape } @@ -65,7 +65,7 @@ func @f() -> !shape.shape { func @f(%arg0 : !shape.shape) -> !shape.shape { // CHECK: return %arg0 %0 = shape.const_shape [] : !shape.shape - %1 = shape.broadcast %arg0, %0 + %1 = shape.broadcast %arg0, %0 : !shape.shape, !shape.shape return %1 : !shape.shape } @@ -76,7 +76,7 @@ func @f(%arg0 : !shape.shape) -> !shape.shape { func @f(%arg0 : !shape.shape) -> !shape.shape { // CHECK: return %arg0 %0 = shape.const_shape [] : !shape.shape - %1 = shape.broadcast %0, %arg0 + %1 = shape.broadcast %0, %arg0 : !shape.shape, !shape.shape return %1 : !shape.shape } @@ -89,7 +89,7 @@ func @f() -> !shape.shape { // CHECK: return %[[CST]] %0 = shape.const_shape [] : !shape.shape %1 = shape.const_shape [1, 2, 3] : !shape.shape - %2 = shape.broadcast %0, %1 + %2 = shape.broadcast %0, %1 : !shape.shape, !shape.shape return %2 : !shape.shape } @@ -101,7 +101,7 @@ func @f() -> !shape.shape { // CHECK: shape.broadcast %0 = shape.const_shape [2] : !shape.shape %1 = shape.const_shape [7] : !shape.shape - %2 = shape.broadcast %0, %1 + %2 = shape.broadcast %0, %1 : !shape.shape, !shape.shape return %2 : !shape.shape } @@ -124,7 +124,7 @@ func @f() -> !shape.shape { func @f() -> tensor<2xindex> { // CHECK: constant dense<[0, 1]> : tensor<2xindex> %cs = shape.const_shape [0, 1] : !shape.shape - %0 = shape.to_extent_tensor %cs : tensor<2xindex> + %0 = shape.to_extent_tensor %cs : !shape.shape -> tensor<2xindex> return %0 : tensor<2xindex> } @@ 
-159,7 +159,7 @@ func @const_size_to_index() -> index { // CHECK-NOT: shape.index_cast %cs = shape.const_size 123 // CHECK: constant 123 : index - %ci = shape.size_to_index %cs + %ci = shape.size_to_index %cs : !shape.size return %ci : index } @@ -185,7 +185,7 @@ func @const_index_to_size_to_index() -> index { %cs0 = shape.index_to_size %ci0 // CHECK: %[[CI:.*]] = constant 123 : index // CHECK-NEXT: return %[[CI]] : index - %ci1 = shape.size_to_index %cs0 + %ci1 = shape.size_to_index %cs0 : !shape.size return %ci1 : index } @@ -195,7 +195,7 @@ func @const_index_to_size_to_index() -> index { // CHECK-LABEL: func @nonfoldable_size_to_index func @nonfoldable_size_to_index(%cs : !shape.size) -> index { // CHECK: shape.size_to_index - %ci = shape.size_to_index %cs + %ci = shape.size_to_index %cs : !shape.size return %ci : index } @@ -403,7 +403,7 @@ func @f(%arg : !shape.shape) -> !shape.shape { // CHECK-NEXT: %[[CS:.*]] = shape.const_shape // CHECK-NEXT: return %[[CS]] %0 = shape.const_shape [2, 3, 4] : !shape.shape - %1 = shape.any %0, %arg : !shape.shape + %1 = "shape.any"(%0, %arg) : (!shape.shape, !shape.shape) -> !shape.shape return %1 : !shape.shape } @@ -415,7 +415,7 @@ func @f(%arg : tensor) -> tensor { // CHECK-NEXT: %[[CS:.*]] = shape.const_shape [2, 3, 4] : tensor // CHECK-NEXT: return %[[CS]] : tensor %0 = shape.const_shape [2, 3, 4] : tensor - %1 = shape.any %0, %arg : tensor + %1 = "shape.any"(%0, %arg) : (tensor, tensor) -> tensor return %1 : tensor } @@ -424,9 +424,9 @@ func @f(%arg : tensor) -> tensor { // Folding of any with partially constant operands is not yet implemented. // CHECK-LABEL: func @f func @f(%arg0 : !shape.shape, %arg1 : !shape.shape) -> !shape.shape { - // CHECK-NEXT: %[[CS:.*]] = shape.any + // CHECK-NEXT: %[[CS:.*]] = "shape.any" // CHECK-NEXT: return %[[CS]] - %1 = shape.any %arg0, %arg1 : !shape.shape + %1 = "shape.any"(%arg0, %arg1) : (!shape.shape, !shape.shape) -> !shape.shape return %1 : !shape.shape } @@ -619,7 +619,7 @@ func @dont_canonicalize_rank(%arg : tensor<*xf32>) -> index { func @index_to_size_to_index(%index : index) -> index { // CHECK: return %[[IDX]] : index %size = shape.index_to_size %index - %result = shape.size_to_index %size + %result = shape.size_to_index %size : !shape.size return %result : index } @@ -630,7 +630,7 @@ func @index_to_size_to_index(%index : index) -> index { // CHECK-SAME: (%[[SIZE:.*]]: !shape.size) -> !shape.size func @size_to_index_to_size(%size : !shape.size) -> !shape.size { // CHECK: return %[[SIZE]] : !shape.size - %idx = shape.size_to_index %size + %idx = shape.size_to_index %size : !shape.size %result = shape.index_to_size %idx return %result : !shape.size } diff --git a/mlir/test/Dialect/Shape/ops.mlir b/mlir/test/Dialect/Shape/ops.mlir index f57826097d34f..87af623fe0f7b 100644 --- a/mlir/test/Dialect/Shape/ops.mlir +++ b/mlir/test/Dialect/Shape/ops.mlir @@ -49,7 +49,7 @@ func @test_shape_num_elements_fixed() { func @test_broadcast_fixed() { %0 = shape.const_shape [10, 1, 57, 92] : !shape.shape %1 = shape.const_shape [4, 57, 92] : !shape.shape - %2 = shape.broadcast %0, %1 + %2 = shape.broadcast %0, %1 : !shape.shape, !shape.shape %3 = "shape.print"(%2) : (!shape.shape) -> !shape.shape return } @@ -99,7 +99,7 @@ func @test_constraints() { %w3 = shape.const_witness false %w4 = shape.assuming_all %w0, %w1, %w2, %w3 shape.assuming %w4 -> !shape.shape { - %2 = shape.any %0, %1 : !shape.shape + %2 = "shape.any"(%0, %1) : (!shape.shape, !shape.shape) -> !shape.shape shape.assuming_yield %2 : !shape.shape } return 
@@ -131,7 +131,7 @@ func @const_size() { } func @test_to_extent_tensor(%arg: !shape.shape) -> tensor<3xindex> { - %0 = shape.to_extent_tensor %arg : tensor<3xindex> + %0 = shape.to_extent_tensor %arg : !shape.shape -> tensor<3xindex> return %0 : tensor<3xindex> } @@ -188,10 +188,10 @@ func @get_extent_on_mixed_operands(%arg : tensor) -> !shape.size { func @any() { %0 = shape.const_shape [1, 2, 3] : !shape.shape %1 = shape.const_shape [4, 5, 6] : !shape.shape - %2 = shape.any %0, %1 : !shape.shape + %2 = "shape.any"(%0, %1) : (!shape.shape, !shape.shape) -> !shape.shape %3 = shape.const_shape [1, 2, 3] : tensor %4 = shape.const_shape [4, 5, 6] : tensor - %5 = shape.any %3, %4 : tensor + %5 = "shape.any"(%3, %4) : (tensor, tensor) -> tensor return } From 14c59b4577658655e56671bf2f17cca9c0d952d9 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 25 Jul 2020 20:48:46 -0700 Subject: [PATCH 0112/1035] [X86] Remove getProcFamily() method from X86Subtarget. NFC This isn't used and we've decided in the past that a CPU enum for tuning is not a good idea. --- llvm/lib/Target/X86/X86Subtarget.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 54d7fbef7f94a..5b5ab4b969aac 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -787,8 +787,6 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool isXRaySupported() const override { return is64Bit(); } - X86ProcFamilyEnum getProcFamily() const { return X86ProcFamily; } - /// TODO: to be removed later and replaced with suitable properties bool isAtom() const { return X86ProcFamily == IntelAtom; } bool isSLM() const { return X86ProcFamily == IntelSLM; } From 1a1448e6568d9b11f198e510fa9c4cb6b1f4216a Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 25 Jul 2020 22:05:46 -0700 Subject: [PATCH 0113/1035] [X86] Merge X86MCInstLowering's maxLongNopLength into emitNop and remove check for FeatureNOPL. The switch in emitNop uses 64-bit registers for nops exceeding 2 bytes. This isn't valid outside 64-bit mode. We could fix this easily enough, but there are no users that ask for more than 2 bytes outside 64-bit mode. Inlining the method to make the coupling between the two methods more explicit. --- llvm/lib/Target/X86/X86MCInstLower.cpp | 34 ++++++++++++-------------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index b4db72e150601..8f3e32727371b 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1079,29 +1079,27 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, } } -/// Return the longest nop which can be efficiently decoded for the given -/// target cpu. 15-bytes is the longest single NOP instruction, but some -/// platforms can't decode the longest forms efficiently. -static unsigned maxLongNopLength(const X86Subtarget *Subtarget) { - if (Subtarget->getFeatureBits()[X86::FeatureFast7ByteNOP]) - return 7; - if (Subtarget->getFeatureBits()[X86::FeatureFast15ByteNOP]) - return 15; - if (Subtarget->getFeatureBits()[X86::FeatureFast11ByteNOP]) - return 11; - if (Subtarget->getFeatureBits()[X86::FeatureNOPL] || Subtarget->is64Bit()) - return 10; - if (Subtarget->is32Bit()) - return 2; - return 1; -} - /// Emit the largest nop instruction smaller than or equal to \p NumBytes /// bytes. Return the size of nop emitted. 
static unsigned emitNop(MCStreamer &OS, unsigned NumBytes, const X86Subtarget *Subtarget) { + unsigned MaxNopLength = 1; + if (Subtarget->is64Bit()) { + // FIXME: We can use NOOPL on 32-bit targets with FeatureNOPL, but the + // IndexReg/BaseReg below need to be updated. + if (Subtarget->hasFeature(X86::FeatureFast7ByteNOP)) + MaxNopLength = 7; + else if (Subtarget->hasFeature(X86::FeatureFast15ByteNOP)) + MaxNopLength = 15; + else if (Subtarget->hasFeature(X86::FeatureFast11ByteNOP)) + MaxNopLength = 11; + else + MaxNopLength = 10; + } if (Subtarget->is32Bit()) + MaxNopLength = 2; + // Cap a single nop emission at the profitable value for the target - NumBytes = std::min(NumBytes, maxLongNopLength(Subtarget)); + NumBytes = std::min(NumBytes, MaxNopLength); unsigned NopSize; unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg; From 9b19400004dfee9d07a90aa11d448bade9ee71a2 Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Sun, 26 Jul 2020 00:46:29 -0700 Subject: [PATCH 0114/1035] [AArch64][GlobalISel] Make <8 x s16> and <16 x s8> legal types for G_SHUFFLE_VECTOR and G_IMPLICIT_DEF. Trivial change, we're still missing support for rev matching for these types in the combiner. --- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 4 +- .../GlobalISel/legalize-shuffle-vector.mir | 44 +++++++++++++++++++ .../AArch64/GlobalISel/legalize-undef.mir | 22 ++++++++++ llvm/test/CodeGen/AArch64/arm64-rev.ll | 36 +++++++-------- 4 files changed, 84 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index bbceb0e169039..54e368cd7768f 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -62,7 +62,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) } getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) - .legalFor({p0, s1, s8, s16, s32, s64, v2s32, v4s32, v2s64}) + .legalFor({p0, s1, s8, s16, s32, s64, v2s32, v4s32, v2s64, v16s8, v8s16}) .clampScalar(0, s1, s64) .widenScalarToNextPow2(0, 8) .fewerElementsIf( @@ -596,7 +596,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) // to be the same size as the dest. if (DstTy != SrcTy) return false; - for (auto &Ty : {v2s32, v4s32, v2s64}) { + for (auto &Ty : {v2s32, v4s32, v2s64, v16s8, v8s16}) { if (DstTy == Ty) return true; } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir index 7b4ae3d56ab0f..22d9406ac4f6f 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir @@ -43,6 +43,50 @@ body: | $q0 = COPY %2(<2 x s64>) RET_ReallyLR implicit $q0 +... 
+--- +name: shuffle_v16i8 +alignment: 4 +tracksRegLiveness: true +body: | + bb.1: + liveins: $q0, $q1 + + ; CHECK-LABEL: name: shuffle_v16i8 + ; CHECK: liveins: $q0, $q1 + ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<16 x s8>) = COPY $q1 + ; CHECK: [[SHUF:%[0-9]+]]:_(<16 x s8>) = G_SHUFFLE_VECTOR [[COPY]](<16 x s8>), [[COPY1]], shufflemask(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) + ; CHECK: $q0 = COPY [[SHUF]](<16 x s8>) + ; CHECK: RET_ReallyLR implicit $q0 + %0:_(<16 x s8>) = COPY $q0 + %1:_(<16 x s8>) = COPY $q1 + %2:_(<16 x s8>) = G_SHUFFLE_VECTOR %0(<16 x s8>), %1, shufflemask(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) + $q0 = COPY %2(<16 x s8>) + RET_ReallyLR implicit $q0 + +... +--- +name: shuffle_v8i16 +alignment: 4 +tracksRegLiveness: true +body: | + bb.1: + liveins: $q0, $q1 + + ; CHECK-LABEL: name: shuffle_v8i16 + ; CHECK: liveins: $q0, $q1 + ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<8 x s16>) = COPY $q1 + ; CHECK: [[SHUF:%[0-9]+]]:_(<8 x s16>) = G_SHUFFLE_VECTOR [[COPY]](<8 x s16>), [[COPY1]], shufflemask(0, 0, 0, 0, 0, 0, 0, 0) + ; CHECK: $q0 = COPY [[SHUF]](<8 x s16>) + ; CHECK: RET_ReallyLR implicit $q0 + %0:_(<8 x s16>) = COPY $q0 + %1:_(<8 x s16>) = COPY $q1 + %2:_(<8 x s16>) = G_SHUFFLE_VECTOR %0(<8 x s16>), %1, shufflemask(0, 0, 0, 0, 0, 0, 0, 0) + $q0 = COPY %2(<8 x s16>) + RET_ReallyLR implicit $q0 + ... --- name: shuffle_1elt_mask diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir index 2e7c1cbb33914..984909d342072 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir @@ -77,3 +77,25 @@ body: | $w0 = COPY %1 $w1 = COPY %2 ... +--- +name: test_implicit_def_v16s8 +body: | + bb.0: + + ; CHECK-LABEL: name: test_implicit_def_v16s8 + ; CHECK: [[DEF:%[0-9]+]]:_(<16 x s8>) = G_IMPLICIT_DEF + ; CHECK: $q0 = COPY [[DEF]](<16 x s8>) + %0:_(<16 x s8>) = G_IMPLICIT_DEF + $q0 = COPY %0 +... +--- +name: test_implicit_def_v8s16 +body: | + bb.0: + + ; CHECK-LABEL: name: test_implicit_def_v8s16 + ; CHECK: [[DEF:%[0-9]+]]:_(<8 x s16>) = G_IMPLICIT_DEF + ; CHECK: $q0 = COPY [[DEF]](<8 x s16>) + %0:_(<8 x s16>) = G_IMPLICIT_DEF + $q0 = COPY %0 +... 
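For reference, the legality rule these MIR tests exercise reduces to a same-type check plus a fixed list of vector types. Modeled standalone below; SimpleLLT is an invented stand-in for GlobalISel's LLT, kept only so the sketch compiles on its own:

    // Standalone model of the shuffle legality check after this patch: the
    // source and destination types must match and be one of the listed
    // vector types.
    struct SimpleLLT {
      unsigned NumElts, EltBits;
    };

    static bool operator==(SimpleLLT A, SimpleLLT B) {
      return A.NumElts == B.NumElts && A.EltBits == B.EltBits;
    }

    static bool shuffleVectorIsLegal(SimpleLLT DstTy, SimpleLLT SrcTy) {
      if (!(DstTy == SrcTy))
        return false;
      const SimpleLLT Legal[] = {{2, 32}, {4, 32}, {2, 64}, {16, 8}, {8, 16}};
      for (SimpleLLT Ty : Legal)
        if (DstTy == Ty)
          return true; // v2s32, v4s32, v2s64, and now v16s8, v8s16
      return false;
    }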
diff --git a/llvm/test/CodeGen/AArch64/arm64-rev.ll b/llvm/test/CodeGen/AArch64/arm64-rev.ll
index 0a6c7a14a3352..5f76f0a1c2714 100644
--- a/llvm/test/CodeGen/AArch64/arm64-rev.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-rev.ll
@@ -426,11 +426,10 @@ define <16 x i8> @test_vrev32Q8(<16 x i8>* %A) nounwind {
 ; CHECK-NEXT: rev32.16b v0, v0
 ; CHECK-NEXT: ret
 ;
-; FALLBACK-LABEL: test_vrev32Q8:
-; FALLBACK: // %bb.0:
-; FALLBACK-NEXT: ldr q0, [x0]
-; FALLBACK-NEXT: rev32.16b v0, v0
-; FALLBACK-NEXT: ret
+; GISEL-LABEL: test_vrev32Q8:
+; GISEL: // %bb.0:
+; GISEL: tbl.16b v0, { v0, v1 }, v2
+; GISEL: ret
 %tmp1 = load <16 x i8>, <16 x i8>* %A
 %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32>
 ret <16 x i8> %tmp2
@@ -443,11 +442,10 @@ define <8 x i16> @test_vrev32Q16(<8 x i16>* %A) nounwind {
 ; CHECK-NEXT: rev32.8h v0, v0
 ; CHECK-NEXT: ret
 ;
-; FALLBACK-LABEL: test_vrev32Q16:
-; FALLBACK: // %bb.0:
-; FALLBACK-NEXT: ldr q0, [x0]
-; FALLBACK-NEXT: rev32.8h v0, v0
-; FALLBACK-NEXT: ret
+; GISEL-LABEL: test_vrev32Q16:
+; GISEL: // %bb.0:
+; GISEL: tbl.16b v0, { v0, v1 }, v2
+; GISEL: ret
 %tmp1 = load <8 x i16>, <8 x i16>* %A
 %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32>
 ret <8 x i16> %tmp2
@@ -477,11 +475,10 @@ define <16 x i8> @test_vrev16Q8(<16 x i8>* %A) nounwind {
 ; CHECK-NEXT: rev16.16b v0, v0
 ; CHECK-NEXT: ret
 ;
-; FALLBACK-LABEL: test_vrev16Q8:
-; FALLBACK: // %bb.0:
-; FALLBACK-NEXT: ldr q0, [x0]
-; FALLBACK-NEXT: rev16.16b v0, v0
-; FALLBACK-NEXT: ret
+; GISEL-LABEL: test_vrev16Q8:
+; GISEL: // %bb.0:
+; GISEL: tbl.16b v0, { v0, v1 }, v2
+; GISEL: ret
 %tmp1 = load <16 x i8>, <16 x i8>* %A
 %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32>
 ret <16 x i8> %tmp2
@@ -513,11 +510,10 @@ define <8 x i16> @test_vrev32Q16_undef(<8 x i16>* %A) nounwind {
 ; CHECK-NEXT: rev32.8h v0, v0
 ; CHECK-NEXT: ret
 ;
-; FALLBACK-LABEL: test_vrev32Q16_undef:
-; FALLBACK: // %bb.0:
-; FALLBACK-NEXT: ldr q0, [x0]
-; FALLBACK-NEXT: rev32.8h v0, v0
-; FALLBACK-NEXT: ret
+; GISEL-LABEL: test_vrev32Q16_undef:
+; GISEL: // %bb.0:
+; GISEL: tbl.16b v0, { v0, v1 }, v2
+; GISEL: ret
 %tmp1 = load <8 x i16>, <8 x i16>* %A
 %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32>
 ret <8 x i16> %tmp2

From 98b61112c32e1d2f4a0037698e9ac1d46c23d6c7 Mon Sep 17 00:00:00 2001
From: Alex Richardson
Date: Sun, 26 Jul 2020 11:37:47 +0100
Subject: [PATCH 0115/1035] [asan] Mark the strstr test as UNSUPPORTED on
 FreeBSD

Like Android, FreeBSD's libc calls memchr, which causes this test to fail.

Reviewed By: emaste

Differential Revision: https://reviews.llvm.org/D84541
---
 compiler-rt/test/asan/TestCases/strstr_strict.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/compiler-rt/test/asan/TestCases/strstr_strict.c b/compiler-rt/test/asan/TestCases/strstr_strict.c
index 7d2c4ec2f3258..0d9482723f5f6 100644
--- a/compiler-rt/test/asan/TestCases/strstr_strict.c
+++ b/compiler-rt/test/asan/TestCases/strstr_strict.c
@@ -3,8 +3,8 @@
 // Newer versions of Android's strstr() uses memchr() internally, which actually
 // does trigger a heap-buffer-overflow (as it tries to find the
-// null-terminator).
-// UNSUPPORTED: android
+// null-terminator). The same applies to FreeBSD.
+// UNSUPPORTED: android, freebsd

 // RUN: %env_asan_opts=strict_string_checks=false %run %t 2>&1
 // RUN: %env_asan_opts=strict_string_checks=true not %run %t 2>&1 | FileCheck %s

From 18df607dbeb41451f5842f46e9b7a67baf759d5e Mon Sep 17 00:00:00 2001
From: Alex Richardson
Date: Sun, 26 Jul 2020 11:39:22 +0100
Subject: [PATCH 0116/1035] [lit] Don't include tests skipped due to sharding
 in reports

When running multiple shards, don't include skipped tests in the xunit
output since merging the files will result in duplicates.

In our CHERI Jenkins CI, I configured the libc++ tests to run using
sharding (since we are testing using a single-CPU QEMU). We then merge the
generated XUnit XML files to produce a final result, but if the individual
XMLs report tests excluded due to sharding, each test is included N times
in the final result. This also makes it difficult to find the tests that
were skipped due to missing REQUIRES: etc.

Reviewed By: yln

Differential Revision: https://reviews.llvm.org/D84235
---
 llvm/utils/lit/lit/main.py           | 6 +++++-
 llvm/utils/lit/lit/reports.py        | 2 +-
 llvm/utils/lit/tests/xunit-output.py | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/llvm/utils/lit/lit/main.py b/llvm/utils/lit/lit/main.py
index c47bdede3176c..d94d7280809da 100755
--- a/llvm/utils/lit/lit/main.py
+++ b/llvm/utils/lit/lit/main.py
@@ -80,9 +80,13 @@ def main(builtin_params={}):
                              'error.\n')
             sys.exit(2)

+    # When running multiple shards, don't include skipped tests in the xunit
+    # output since merging the files will result in duplicates.
+    tests_for_report = discovered_tests
     if opts.shard:
         (run, shards) = opts.shard
         selected_tests = filter_by_shard(selected_tests, run, shards, lit_config)
+        tests_for_report = selected_tests
         if not selected_tests:
             sys.stderr.write('warning: shard does not contain any tests. '
                              'Consider decreasing the number of shards.\n')
@@ -102,7 +106,7 @@ def main(builtin_params={}):
     print_results(discovered_tests, elapsed, opts)

     for report in opts.reports:
-        report.write_results(discovered_tests, elapsed)
+        report.write_results(tests_for_report, elapsed)

     if lit_config.numErrors:
         sys.stderr.write('\n%d error(s) in tests\n' % lit_config.numErrors)

diff --git a/llvm/utils/lit/lit/reports.py b/llvm/utils/lit/lit/reports.py
index 9631d5fe3b358..3ce961b44029e 100755
--- a/llvm/utils/lit/lit/reports.py
+++ b/llvm/utils/lit/lit/reports.py
@@ -127,7 +127,7 @@ def _write_test(self, file, test, suite_name):
     def _get_skip_reason(self, test):
         code = test.result.code
         if code == lit.Test.EXCLUDED:
-            return 'Test not selected (--filter, --max-tests, --run-shard)'
+            return 'Test not selected (--filter, --max-tests)'

         if code == lit.Test.SKIPPED:
             return 'User interrupt'

diff --git a/llvm/utils/lit/tests/xunit-output.py b/llvm/utils/lit/tests/xunit-output.py
index 81d8525f33b46..92b693256c69b 100644
--- a/llvm/utils/lit/tests/xunit-output.py
+++ b/llvm/utils/lit/tests/xunit-output.py
@@ -14,7 +14,7 @@
# CHECK-NEXT: ]]]]> &"]]>
# CHECK-NEXT:
# CHECK-NEXT:
-# CHECK-NEXT:
+# CHECK-NEXT:
# CHECK-NEXT:
# CHECK-NEXT:
# CHECK-NEXT:

From ed5a6b9305145fd073d046d6e9814ffb70c5fdbe Mon Sep 17 00:00:00 2001
From: Roman Lebedev
Date: Sun, 26 Jul 2020 14:00:15 +0300
Subject: [PATCH 0117/1035] [NFC][XRay] Account: decouple getStats() interface
 from underlying data structure

It doesn't really need to know where Timings are stored; it just needs to
be able to sort them, so a MutableArrayRef is enough. This uncovered an
interesting quirk: the code relied on an implicit double->int conversion
when calculating the percentile offsets.
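To make the quirk concrete, here is a standalone sketch of the pattern in getStats() (signature simplified; the element type is an assumption): std::floor returns double, so both the iterator offset and the final indexing compile only through implicit floating-point to integer conversions, one of which the patch below turns into an explicit cast.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Sketch of the percentile computation: Off has type double throughout.
    static uint64_t percentile(std::vector<uint64_t> &Timings,
                               double Fraction) {
      auto Off = std::floor(Timings.size() * Fraction); // Off is a double
      // Without the explicit cast, the iterator arithmetic relies on an
      // implicit double->integer conversion.
      std::nth_element(Timings.begin(), Timings.begin() + (uint64_t)Off,
                       Timings.end());
      return Timings[Off]; // indexing still converts double->size_t implicitly
    }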
---
 llvm/tools/llvm-xray/xray-account.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/tools/llvm-xray/xray-account.cpp b/llvm/tools/llvm-xray/xray-account.cpp
index fcac33b23d4d6..21da53f362b09 100644
--- a/llvm/tools/llvm-xray/xray-account.cpp
+++ b/llvm/tools/llvm-xray/xray-account.cpp
@@ -226,7 +226,7 @@ struct ResultRow {
   std::string Function;
 };

-ResultRow getStats(std::vector &Timings) {
+ResultRow getStats(MutableArrayRef Timings) {
   assert(!Timings.empty());
   ResultRow R;
   R.Sum = std::accumulate(Timings.begin(), Timings.end(), 0.0);
@@ -240,11 +240,13 @@ ResultRow getStats(std::vector &Timings) {
   R.Median = Timings[MedianOff];

   auto Pct90Off = std::floor(Timings.size() * 0.9);
-  std::nth_element(Timings.begin(), Timings.begin() + Pct90Off, Timings.end());
+  std::nth_element(Timings.begin(), Timings.begin() + (uint64_t)Pct90Off,
+                   Timings.end());
   R.Pct90 = Timings[Pct90Off];

   auto Pct99Off = std::floor(Timings.size() * 0.99);
-  std::nth_element(Timings.begin(), Timings.begin() + Pct99Off, Timings.end());
+  std::nth_element(Timings.begin(), Timings.begin() + (uint64_t)Pct99Off,
+                   Timings.end());
   R.Pct99 = Timings[Pct99Off];
   return R;
 }

From b1210c059d1ef084ecd275ed0ffb8343ac3cdfad Mon Sep 17 00:00:00 2001
From: Roman Lebedev
Date: Sun, 26 Jul 2020 14:05:00 +0300
Subject: [PATCH 0118/1035] [NFC][XRay] Account: migrate to DenseMap +
 SmallVector, -16% faster on large (3.8G) input

DenseMap is a single allocation underneath, so this has the expected
performance impact on large-ish (3.8G) xray log processing time.
---
 llvm/tools/llvm-xray/xray-account.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/llvm/tools/llvm-xray/xray-account.h b/llvm/tools/llvm-xray/xray-account.h
index b63ecc59b71ad..575114d6096a2 100644
--- a/llvm/tools/llvm-xray/xray-account.h
+++ b/llvm/tools/llvm-xray/xray-account.h
@@ -27,12 +27,14 @@ namespace xray {

 class LatencyAccountant {
 public:
-  typedef std::map> FunctionLatencyMap;
-  typedef std::map> PerThreadMinMaxTSCMap;
-  typedef std::map> PerCPUMinMaxTSCMap;
-  typedef std::vector> FunctionStack;
-  typedef std::map PerThreadFunctionStackMap;
+  typedef llvm::DenseMap> FunctionLatencyMap;
+  typedef llvm::DenseMap> PerThreadMinMaxTSCMap;
+  typedef llvm::DenseMap> PerCPUMinMaxTSCMap;
+  typedef llvm::SmallVector, 32> FunctionStack;
+  typedef llvm::DenseMap PerThreadFunctionStackMap;

 private:
   PerThreadFunctionStackMap PerThreadFunctionStack;

From 02dadab1b459a3ed4f440b2de3ab21eba7bd5a2e Mon Sep 17 00:00:00 2001
From: Juneyoung Lee
Date: Sun, 26 Jul 2020 20:47:19 +0900
Subject: [PATCH 0119/1035] NFC; add an example that subtracts pointers to two
 global vars
---
 llvm/test/Transforms/InstSimplify/freeze.ll | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/llvm/test/Transforms/InstSimplify/freeze.ll b/llvm/test/Transforms/InstSimplify/freeze.ll
index 3c4ca7a1afb95..3aa77854f6020 100644
--- a/llvm/test/Transforms/InstSimplify/freeze.ll
+++ b/llvm/test/Transforms/InstSimplify/freeze.ll
@@ -111,8 +111,7 @@ define <2 x float> @constvector_FP_noopt() {
 }

 @g = external global i16, align 1
-
-; Negative test
+@g2 = external global i16, align 1

 define float @constant_expr() {
 ; CHECK-LABEL: @constant_expr(
@@ -138,6 +137,18 @@ define i32* @constant_expr3() {
   ret i32* %r
 }

+define i64 @ptrdiff() {
+; CHECK-LABEL: @ptrdiff(
+; CHECK-NEXT: [[R:%.*]] = freeze i64 sub (i64 ptrtoint (i16* @g
to i64), i64 ptrtoint (i16* @g2 to i64)) +; CHECK-NEXT: ret i64 [[R]] +; + %i = ptrtoint i16* @g to i64 + %i2 = ptrtoint i16* @g2 to i64 + %diff = sub i64 %i, %i2 + %r = freeze i64 %diff + ret i64 %r +} + ; Negative test define <2 x i31> @vector_element_constant_expr() { From 1b802fe34d716f2a3542fad3e8ba74f77685590d Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Sun, 26 Jul 2020 21:02:31 +0900 Subject: [PATCH 0120/1035] NFC; add a test for freeze's constprop --- llvm/test/Analysis/ConstantFolding/freeze.ll | 30 ++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 llvm/test/Analysis/ConstantFolding/freeze.ll diff --git a/llvm/test/Analysis/ConstantFolding/freeze.ll b/llvm/test/Analysis/ConstantFolding/freeze.ll new file mode 100644 index 0000000000000..c2da041308456 --- /dev/null +++ b/llvm/test/Analysis/ConstantFolding/freeze.ll @@ -0,0 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -constprop -S | FileCheck %s + +@g = external global i16, align 1 +@g2 = external global i16, align 1 + +define i64 @ptrdiff1() { +; CHECK-LABEL: @ptrdiff1( +; CHECK-NEXT: [[R:%.*]] = freeze i64 sub (i64 ptrtoint (i16* @g to i64), i64 ptrtoint (i16* @g2 to i64)) +; CHECK-NEXT: ret i64 [[R]] +; + %i = ptrtoint i16* @g to i64 + %i2 = ptrtoint i16* @g2 to i64 + %diff = sub i64 %i, %i2 + %r = freeze i64 %diff + ret i64 %r +} + +define i64 @ptrdiff2() { +; CHECK-LABEL: @ptrdiff2( +; CHECK-NEXT: [[R:%.*]] = freeze i64 -2 +; CHECK-NEXT: ret i64 [[R]] +; + %i = ptrtoint i16* @g to i64 + %gep = getelementptr i16, i16* @g, i64 1 + %i2 = ptrtoint i16* %gep to i64 + %diff = sub i64 %i, %i2 + %r = freeze i64 %diff + ret i64 %r +} From 9f074214b7a3bc9d88caadbf7bd6116305977a4e Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Sun, 26 Jul 2020 21:48:51 +0900 Subject: [PATCH 0121/1035] [ValueTracking] Instruction::isBinaryOp should be used for constexprs This is a simple patch that makes canCreateUndefOrPoison use Instruction::isBinaryOp because BinaryOperator inherits Instruction. Reviewed By: nikic Differential Revision: https://reviews.llvm.org/D84596 --- llvm/lib/Analysis/ValueTracking.cpp | 2 +- llvm/test/Transforms/InstSimplify/freeze.ll | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 271200f7030a2..3bf09b30b2f9c 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -4773,7 +4773,7 @@ static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly) { const auto *CE = dyn_cast(Op); if (isa(Op) || (CE && CE->isCast())) return false; - else if (isa(Op)) + else if (Instruction::isBinaryOp(Opcode)) return false; // Be conservative and return true. 
return true; diff --git a/llvm/test/Transforms/InstSimplify/freeze.ll b/llvm/test/Transforms/InstSimplify/freeze.ll index 3aa77854f6020..66ec5d61a3c1c 100644 --- a/llvm/test/Transforms/InstSimplify/freeze.ll +++ b/llvm/test/Transforms/InstSimplify/freeze.ll @@ -139,8 +139,7 @@ define i32* @constant_expr3() { define i64 @ptrdiff() { ; CHECK-LABEL: @ptrdiff( -; CHECK-NEXT: [[R:%.*]] = freeze i64 sub (i64 ptrtoint (i16* @g to i64), i64 ptrtoint (i16* @g2 to i64)) -; CHECK-NEXT: ret i64 [[R]] +; CHECK-NEXT: ret i64 sub (i64 ptrtoint (i16* @g to i64), i64 ptrtoint (i16* @g2 to i64)) ; %i = ptrtoint i16* @g to i64 %i2 = ptrtoint i16* @g2 to i64 From 32088f4f7fc734474d1249d288a34894a4e901a9 Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Sun, 26 Jul 2020 21:54:44 +0900 Subject: [PATCH 0122/1035] [ConstantFolding] Fold freeze if it is never undef or poison This is a simple patch that adds constant folding for freeze instruction. IIUC, it isn't needed to update ConstantFold.cpp because there is no freeze constexpr. Reviewed By: nikic Differential Revision: https://reviews.llvm.org/D84597 --- llvm/lib/Analysis/ConstantFolding.cpp | 2 ++ llvm/test/Analysis/ConstantFolding/freeze.ll | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 794edeb714fd2..7e4293d72a0e5 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1071,6 +1071,8 @@ Constant *ConstantFoldInstOperandsImpl(const Value *InstOrCE, unsigned Opcode, default: return nullptr; case Instruction::ICmp: case Instruction::FCmp: llvm_unreachable("Invalid for compares"); + case Instruction::Freeze: + return isGuaranteedNotToBeUndefOrPoison(Ops[0]) ? Ops[0] : nullptr; case Instruction::Call: if (auto *F = dyn_cast(Ops.back())) { const auto *Call = cast(InstOrCE); diff --git a/llvm/test/Analysis/ConstantFolding/freeze.ll b/llvm/test/Analysis/ConstantFolding/freeze.ll index c2da041308456..4908f71b924ec 100644 --- a/llvm/test/Analysis/ConstantFolding/freeze.ll +++ b/llvm/test/Analysis/ConstantFolding/freeze.ll @@ -6,8 +6,7 @@ define i64 @ptrdiff1() { ; CHECK-LABEL: @ptrdiff1( -; CHECK-NEXT: [[R:%.*]] = freeze i64 sub (i64 ptrtoint (i16* @g to i64), i64 ptrtoint (i16* @g2 to i64)) -; CHECK-NEXT: ret i64 [[R]] +; CHECK-NEXT: ret i64 sub (i64 ptrtoint (i16* @g to i64), i64 ptrtoint (i16* @g2 to i64)) ; %i = ptrtoint i16* @g to i64 %i2 = ptrtoint i16* @g2 to i64 @@ -18,8 +17,7 @@ define i64 @ptrdiff1() { define i64 @ptrdiff2() { ; CHECK-LABEL: @ptrdiff2( -; CHECK-NEXT: [[R:%.*]] = freeze i64 -2 -; CHECK-NEXT: ret i64 [[R]] +; CHECK-NEXT: ret i64 -2 ; %i = ptrtoint i16* @g to i64 %gep = getelementptr i16, i16* @g, i64 1 From 920e26797451774b29debfaa3faaca2cb18cfb1e Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Sun, 26 Jul 2020 22:00:01 +0900 Subject: [PATCH 0123/1035] [JumpThreading] Add a test for D84598; NFC --- llvm/test/Transforms/JumpThreading/freeze.ll | 94 ++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 llvm/test/Transforms/JumpThreading/freeze.ll diff --git a/llvm/test/Transforms/JumpThreading/freeze.ll b/llvm/test/Transforms/JumpThreading/freeze.ll new file mode 100644 index 0000000000000..37d2ad672f789 --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/freeze.ll @@ -0,0 +1,94 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -jump-threading -S < %s | FileCheck %s + +declare i32 @f1() +declare i32 @f2() +declare void @f3() + 
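; NOTE: These are pre-commit tests for D84598, which (as the checks suggest)
; is expected to let JumpThreading look through "%A.fr = freeze i1 %A" when
; %A is a phi of constants. The assertions below deliberately pin today's
; unthreaded output and should change once D84598 lands.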
+define i32 @test1(i1 %cond) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: br i1 [[COND:%.*]], label [[T1:%.*]], label [[F1:%.*]] +; CHECK: T1: +; CHECK-NEXT: [[V1:%.*]] = call i32 @f1() +; CHECK-NEXT: br label [[MERGE:%.*]] +; CHECK: F1: +; CHECK-NEXT: [[V2:%.*]] = call i32 @f2() +; CHECK-NEXT: br label [[MERGE]] +; CHECK: Merge: +; CHECK-NEXT: [[A:%.*]] = phi i1 [ true, [[T1]] ], [ false, [[F1]] ] +; CHECK-NEXT: [[B:%.*]] = phi i32 [ [[V1]], [[T1]] ], [ [[V2]], [[F1]] ] +; CHECK-NEXT: [[A_FR:%.*]] = freeze i1 [[A]] +; CHECK-NEXT: br i1 [[A_FR]], label [[T2:%.*]], label [[F2:%.*]] +; CHECK: T2: +; CHECK-NEXT: call void @f3() +; CHECK-NEXT: ret i32 [[B]] +; CHECK: F2: +; CHECK-NEXT: ret i32 [[B]] +; + br i1 %cond, label %T1, label %F1 + +T1: + %v1 = call i32 @f1() + br label %Merge + +F1: + %v2 = call i32 @f2() + br label %Merge + +Merge: + %A = phi i1 [true, %T1], [false, %F1] + %B = phi i32 [%v1, %T1], [%v2, %F1] + %A.fr = freeze i1 %A + br i1 %A.fr, label %T2, label %F2 + +T2: + call void @f3() + ret i32 %B + +F2: + ret i32 %B +} + +define i32 @test2(i1 %cond, i1 %cond2) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: br i1 [[COND:%.*]], label [[T1:%.*]], label [[F1:%.*]] +; CHECK: T1: +; CHECK-NEXT: [[V1:%.*]] = call i32 @f1() +; CHECK-NEXT: br label [[MERGE:%.*]] +; CHECK: F1: +; CHECK-NEXT: [[V2:%.*]] = call i32 @f2() +; CHECK-NEXT: br label [[MERGE]] +; CHECK: Merge: +; CHECK-NEXT: [[A:%.*]] = phi i1 [ true, [[T1]] ], [ [[COND2:%.*]], [[F1]] ] +; CHECK-NEXT: [[B:%.*]] = phi i32 [ [[V1]], [[T1]] ], [ [[V2]], [[F1]] ] +; CHECK-NEXT: [[A_FR:%.*]] = freeze i1 [[A]] +; CHECK-NEXT: br i1 [[A_FR]], label [[T2:%.*]], label [[F2:%.*]] +; CHECK: T2: +; CHECK-NEXT: call void @f3() +; CHECK-NEXT: ret i32 [[B]] +; CHECK: F2: +; CHECK-NEXT: ret i32 [[B]] +; + br i1 %cond, label %T1, label %F1 + +T1: + %v1 = call i32 @f1() + br label %Merge + +F1: + %v2 = call i32 @f2() + br label %Merge + +Merge: + %A = phi i1 [true, %T1], [%cond2, %F1] + %B = phi i32 [%v1, %T1], [%v2, %F1] + %A.fr = freeze i1 %A + br i1 %A.fr, label %T2, label %F2 + +T2: + call void @f3() + ret i32 %B + +F2: + ret i32 %B +} From 912e9e526233c54aa08b082c957499fa9124eece Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Fri, 24 Jul 2020 14:45:50 -0400 Subject: [PATCH 0124/1035] [InstSimplify] add tests for fcmp with infinity constant; NFC --- .../InstSimplify/floating-point-compare.ll | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/llvm/test/Transforms/InstSimplify/floating-point-compare.ll b/llvm/test/Transforms/InstSimplify/floating-point-compare.ll index 718a4427e15e3..6ce5c5e67b08a 100644 --- a/llvm/test/Transforms/InstSimplify/floating-point-compare.ll +++ b/llvm/test/Transforms/InstSimplify/floating-point-compare.ll @@ -1070,6 +1070,28 @@ define i1 @is_infinite_or_nan(float %x) { ret i1 %r } +define i1 @is_infinite_or_nan2(float %x) { +; CHECK-LABEL: @is_infinite_or_nan2( +; CHECK-NEXT: [[XABS:%.*]] = call nnan ninf float @llvm.fabs.f32(float [[X:%.*]]) +; CHECK-NEXT: [[R:%.*]] = fcmp ueq float [[XABS]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[R]] +; + %xabs = call nnan ninf float @llvm.fabs.f32(float %x) + %r = fcmp ueq float %xabs, 0x7FF0000000000000 + ret i1 %r +} + +define <2 x i1> @is_infinite_neg_or_nan(<2 x float> %x) { +; CHECK-LABEL: @is_infinite_neg_or_nan( +; CHECK-NEXT: [[X42:%.*]] = fadd nnan ninf <2 x float> [[X:%.*]], +; CHECK-NEXT: [[R:%.*]] = fcmp ueq <2 x float> [[X42]], +; CHECK-NEXT: ret <2 x i1> [[R]] +; + %x42 = fadd nnan ninf <2 x float> %x, + %r = fcmp ueq <2 x float> %x42, + ret <2 x 
i1> %r +} + define i1 @is_finite_or_nan(i1 %c, double %x) { ; CHECK-LABEL: @is_finite_or_nan( ; CHECK-NEXT: ret i1 true @@ -1101,3 +1123,27 @@ define i1 @is_finite_and_ordered(double %x) { %r = fcmp one double %xx, 0x7FF0000000000000 ret i1 %r } + +define i1 @is_finite(i1 %c, double %x) { +; CHECK-LABEL: @is_finite( +; CHECK-NEXT: [[XX:%.*]] = fmul nnan ninf double [[X:%.*]], [[X]] +; CHECK-NEXT: [[S:%.*]] = select i1 [[C:%.*]], double 4.200000e+01, double [[XX]] +; CHECK-NEXT: [[R:%.*]] = fcmp one double [[S]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[R]] +; + %xx = fmul nnan ninf double %x, %x + %s = select i1 %c, double 42.0, double %xx + %r = fcmp one double %s, 0x7FF0000000000000 + ret i1 %r +} + +define <2 x i1> @is_finite_commute(<2 x i8> %x) { +; CHECK-LABEL: @is_finite_commute( +; CHECK-NEXT: [[CAST:%.*]] = uitofp <2 x i8> [[X:%.*]] to <2 x float> +; CHECK-NEXT: [[R:%.*]] = fcmp one <2 x float> , [[CAST]] +; CHECK-NEXT: ret <2 x i1> [[R]] +; + %cast = uitofp <2 x i8> %x to <2 x float> + %r = fcmp one <2 x float> , %cast + ret <2 x i1> %r +} From b89ae102e6f5ed3760f1ae5788bd76ef8e9d9490 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Fri, 24 Jul 2020 15:11:02 -0400 Subject: [PATCH 0125/1035] [InstSimplify] fold fcmp using isKnownNeverInfinity + isKnownNeverNaN Follow-up to D84035 / rG7393d7574c09. This sidesteps a question of FMF/poison on fcmp raised in PR46077: http://bugs.llvm.org/PR46077 https://alive2.llvm.org/ce/z/TCsyzD define i1 @src(float %x) { %0: %x42 = fadd nnan ninf float %x, 42.000000 %r = fcmp ueq float %x42, inf ret i1 %r } => define i1 @tgt(float %x) { %0: ret i1 0 } Transformation seems to be correct! https://alive2.llvm.org/ce/z/FQaH7a define i1 @src(i8 %x) { %0: %cast = uitofp i8 %x to float %r = fcmp one float inf, %cast ret i1 %r } => define i1 @tgt(i8 %x) { %0: ret i1 1 } Transformation seems to be correct! 
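Note: the fold uses only the predicate semantics: ueq is unordered-or-equal
(true if either operand is NaN or the operands compare equal), and one is
ordered-and-not-equal. A minimal standalone C++ sketch of that reasoning,
an illustration only and not LLVM code:

  #include <cassert>
  #include <cmath>
  #include <limits>

  static bool fcmpUEQ(double L, double R) { // unordered or equal
    return std::isnan(L) || std::isnan(R) || L == R;
  }
  static bool fcmpONE(double L, double R) { // ordered and not equal
    return !std::isnan(L) && !std::isnan(R) && L != R;
  }

  int main() {
    const double Inf = std::numeric_limits<double>::infinity();
    // Stand-ins for a value proven to be neither NaN nor +/-infinity:
    for (double X : {0.0, 1.0, -42.5}) {
      assert(!fcmpUEQ(X, Inf)); // "LHS == Inf || LHS == NaN" -> false
      assert(fcmpONE(X, Inf));  // "LHS != Inf && LHS != NaN" -> true
    }
  }

The same holds lane-wise, which is why the vector tests below fold to
all-false and all-true results.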
--- llvm/lib/Analysis/InstructionSimplify.cpp | 8 ++++++++ .../InstSimplify/floating-point-compare.ll | 17 ++++------------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 49346860f251e..396fc22920cdf 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -3701,6 +3701,14 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, // LHS != Inf if (Pred == FCmpInst::FCMP_UNE && isKnownNeverInfinity(LHS, Q.TLI)) return getTrue(RetTy); + // LHS == Inf || LHS == NaN + if (Pred == FCmpInst::FCMP_UEQ && isKnownNeverInfinity(LHS, Q.TLI) && + isKnownNeverNaN(LHS, Q.TLI)) + return getFalse(RetTy); + // LHS != Inf && LHS != NaN + if (Pred == FCmpInst::FCMP_ONE && isKnownNeverInfinity(LHS, Q.TLI) && + isKnownNeverNaN(LHS, Q.TLI)) + return getTrue(RetTy); } if (C->isNegative() && !C->isNegZero()) { assert(!C->isNaN() && "Unexpected NaN constant!"); diff --git a/llvm/test/Transforms/InstSimplify/floating-point-compare.ll b/llvm/test/Transforms/InstSimplify/floating-point-compare.ll index 6ce5c5e67b08a..d0864e78570b3 100644 --- a/llvm/test/Transforms/InstSimplify/floating-point-compare.ll +++ b/llvm/test/Transforms/InstSimplify/floating-point-compare.ll @@ -1072,9 +1072,7 @@ define i1 @is_infinite_or_nan(float %x) { define i1 @is_infinite_or_nan2(float %x) { ; CHECK-LABEL: @is_infinite_or_nan2( -; CHECK-NEXT: [[XABS:%.*]] = call nnan ninf float @llvm.fabs.f32(float [[X:%.*]]) -; CHECK-NEXT: [[R:%.*]] = fcmp ueq float [[XABS]], 0x7FF0000000000000 -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 false ; %xabs = call nnan ninf float @llvm.fabs.f32(float %x) %r = fcmp ueq float %xabs, 0x7FF0000000000000 @@ -1083,9 +1081,7 @@ define i1 @is_infinite_or_nan2(float %x) { define <2 x i1> @is_infinite_neg_or_nan(<2 x float> %x) { ; CHECK-LABEL: @is_infinite_neg_or_nan( -; CHECK-NEXT: [[X42:%.*]] = fadd nnan ninf <2 x float> [[X:%.*]], -; CHECK-NEXT: [[R:%.*]] = fcmp ueq <2 x float> [[X42]], -; CHECK-NEXT: ret <2 x i1> [[R]] +; CHECK-NEXT: ret <2 x i1> zeroinitializer ; %x42 = fadd nnan ninf <2 x float> %x, %r = fcmp ueq <2 x float> %x42, @@ -1126,10 +1122,7 @@ define i1 @is_finite_and_ordered(double %x) { define i1 @is_finite(i1 %c, double %x) { ; CHECK-LABEL: @is_finite( -; CHECK-NEXT: [[XX:%.*]] = fmul nnan ninf double [[X:%.*]], [[X]] -; CHECK-NEXT: [[S:%.*]] = select i1 [[C:%.*]], double 4.200000e+01, double [[XX]] -; CHECK-NEXT: [[R:%.*]] = fcmp one double [[S]], 0x7FF0000000000000 -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 true ; %xx = fmul nnan ninf double %x, %x %s = select i1 %c, double 42.0, double %xx @@ -1139,9 +1132,7 @@ define i1 @is_finite(i1 %c, double %x) { define <2 x i1> @is_finite_commute(<2 x i8> %x) { ; CHECK-LABEL: @is_finite_commute( -; CHECK-NEXT: [[CAST:%.*]] = uitofp <2 x i8> [[X:%.*]] to <2 x float> -; CHECK-NEXT: [[R:%.*]] = fcmp one <2 x float> , [[CAST]] -; CHECK-NEXT: ret <2 x i1> [[R]] +; CHECK-NEXT: ret <2 x i1> ; %cast = uitofp <2 x i8> %x to <2 x float> %r = fcmp one <2 x float> , %cast From c6cf71107af8b8110572e8fe42d4624e1fc83549 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Sat, 25 Jul 2020 16:40:14 -0400 Subject: [PATCH 0126/1035] [InstSimplify] add tests for min/max intrinsics; NFC --- .../InstSimplify/maxmin_intrinsics.ll | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 llvm/test/Transforms/InstSimplify/maxmin_intrinsics.ll diff --git 
a/llvm/test/Transforms/InstSimplify/maxmin_intrinsics.ll b/llvm/test/Transforms/InstSimplify/maxmin_intrinsics.ll new file mode 100644 index 0000000000000..fe940ef7bc179 --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/maxmin_intrinsics.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instsimplify -S | FileCheck %s + +declare i8 @llvm.smax.i8(i8, i8) +declare <2 x i8> @llvm.smax.v2i8(<2 x i8>, <2 x i8>) +declare i8 @llvm.smin.i8(i8, i8) +declare <2 x i8> @llvm.smin.v2i8(<2 x i8>, <2 x i8>) +declare i8 @llvm.umax.i8(i8, i8) +declare <2 x i8> @llvm.umax.v2i8(<2 x i8>, <2 x i8>) +declare i8 @llvm.umin.i8(i8, i8) +declare <2 x i8> @llvm.umin.v2i8(<2 x i8>, <2 x i8>) + +define i8 @smax_maxval(i8 %x) { +; CHECK-LABEL: @smax_maxval( +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.smax.i8(i8 [[X:%.*]], i8 127) +; CHECK-NEXT: ret i8 [[R]] +; + %r = call i8 @llvm.smax.i8(i8 %x, i8 127) + ret i8 %r +} + +define <2 x i8> @smax_maxval_commute(<2 x i8> %x) { +; CHECK-LABEL: @smax_maxval_commute( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.smax.v2i8(<2 x i8> , <2 x i8> [[X:%.*]]) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> @llvm.smax.v2i8(<2 x i8> , <2 x i8> %x) + ret <2 x i8> %r +} + +define i8 @smin_minval(i8 %x) { +; CHECK-LABEL: @smin_minval( +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.smin.i8(i8 -128, i8 [[X:%.*]]) +; CHECK-NEXT: ret i8 [[R]] +; + %r = call i8 @llvm.smin.i8(i8 -128, i8 %x) + ret i8 %r +} + +define <2 x i8> @smin_minval_commute(<2 x i8> %x) { +; CHECK-LABEL: @smin_minval_commute( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.smin.v2i8(<2 x i8> [[X:%.*]], <2 x i8> ) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> @llvm.smin.v2i8(<2 x i8> %x, <2 x i8> ) + ret <2 x i8> %r +} + +define i8 @umax_maxval(i8 %x) { +; CHECK-LABEL: @umax_maxval( +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 -1) +; CHECK-NEXT: ret i8 [[R]] +; + %r = call i8 @llvm.umax.i8(i8 %x, i8 255) + ret i8 %r +} + +define <2 x i8> @umax_maxval_commute(<2 x i8> %x) { +; CHECK-LABEL: @umax_maxval_commute( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.umax.v2i8(<2 x i8> , <2 x i8> [[X:%.*]]) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> @llvm.umax.v2i8(<2 x i8> , <2 x i8> %x) + ret <2 x i8> %r +} + +define i8 @umin_minval(i8 %x) { +; CHECK-LABEL: @umin_minval( +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.umin.i8(i8 0, i8 [[X:%.*]]) +; CHECK-NEXT: ret i8 [[R]] +; + %r = call i8 @llvm.umin.i8(i8 0, i8 %x) + ret i8 %r +} + +define <2 x i8> @umin_minval_commute(<2 x i8> %x) { +; CHECK-LABEL: @umin_minval_commute( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.umin.v2i8(<2 x i8> [[X:%.*]], <2 x i8> zeroinitializer) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> @llvm.umin.v2i8(<2 x i8> %x, <2 x i8> zeroinitializer) + ret <2 x i8> %r +} From b1731da871593a7bd5acad9830dc0797c5e6dec9 Mon Sep 17 00:00:00 2001 From: Xing GUO Date: Sun, 26 Jul 2020 16:01:22 +0800 Subject: [PATCH 0127/1035] [DWARFYAML] Rename getUsedSectionNames() to getNonEmptySectionNames(). This patch renames getUsedSectionNames() to getNonEmptySectionNames. NFC. 
--- llvm/include/llvm/ObjectYAML/DWARFYAML.h | 2 +- llvm/lib/ObjectYAML/DWARFYAML.cpp | 2 +- llvm/lib/ObjectYAML/ELFEmitter.cpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/ObjectYAML/DWARFYAML.h b/llvm/include/llvm/ObjectYAML/DWARFYAML.h index 7b18ceebdb1cf..88ac404b21b13 100644 --- a/llvm/include/llvm/ObjectYAML/DWARFYAML.h +++ b/llvm/include/llvm/ObjectYAML/DWARFYAML.h @@ -225,7 +225,7 @@ struct Data { bool isEmpty() const; - SetVector getUsedSectionNames() const; + SetVector getNonEmptySectionNames() const; }; } // end namespace DWARFYAML diff --git a/llvm/lib/ObjectYAML/DWARFYAML.cpp b/llvm/lib/ObjectYAML/DWARFYAML.cpp index 186ec8ac4bdc3..e5c77bc3721fb 100644 --- a/llvm/lib/ObjectYAML/DWARFYAML.cpp +++ b/llvm/lib/ObjectYAML/DWARFYAML.cpp @@ -22,7 +22,7 @@ bool DWARFYAML::Data::isEmpty() const { !GNUPubTypes && CompileUnits.empty() && DebugLines.empty(); } -SetVector DWARFYAML::Data::getUsedSectionNames() const { +SetVector DWARFYAML::Data::getNonEmptySectionNames() const { SetVector SecNames; if (!DebugStrings.empty()) SecNames.insert("debug_str"); diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp index fc80839064ee0..65b03050c7cdb 100644 --- a/llvm/lib/ObjectYAML/ELFEmitter.cpp +++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp @@ -355,7 +355,7 @@ ELFState::ELFState(ELFYAML::Object &D, yaml::ErrorHandler EH) if (Doc.Symbols) ImplicitSections.push_back(".symtab"); if (Doc.DWARF) - for (StringRef DebugSecName : Doc.DWARF->getUsedSectionNames()) { + for (StringRef DebugSecName : Doc.DWARF->getNonEmptySectionNames()) { std::string SecName = ("." + DebugSecName).str(); ImplicitSections.push_back(StringRef(SecName).copy(StringAlloc)); } @@ -931,7 +931,7 @@ void ELFState::initStrtabSectionHeader(Elf_Shdr &SHeader, StringRef Name, } static bool shouldEmitDWARF(DWARFYAML::Data &DWARF, StringRef Name) { - SetVector DebugSecNames = DWARF.getUsedSectionNames(); + SetVector DebugSecNames = DWARF.getNonEmptySectionNames(); return Name.consume_front(".") && DebugSecNames.count(Name); } From 4f6502ab3356f51f711b57001ab596f1bc66fd3a Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 25 Jul 2020 13:21:31 -0400 Subject: [PATCH 0128/1035] AMDGPU/GlobalISel: Replace selection tests for G_CONSTANT/G_FCONSTANT Split into separate tests and make more consistent with the others. 
--- .../GlobalISel/inst-select-constant.mir | 166 ++++++++++++++---- .../GlobalISel/inst-select-fconstant.mir | 159 +++++++++++++++++ 2 files changed, 288 insertions(+), 37 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir index db37495f052d8..2f2c305cfac79 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir @@ -1,55 +1,147 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GCN --- - -name: constant +name: constant_v_s32 legalized: true regBankSelected: true tracksRegLiveness: true - body: | bb.0: - liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GCN-LABEL: name: constant - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(p1) = COPY $vgpr2_vgpr3 - - ; GCN: %{{[0-9]+}}:sreg_32 = S_MOV_B32 1 - %2:sreg_32(s32) = G_CONSTANT i32 1 - - ; GCN: [[LO0:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN: [[HI0:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; GCN: %{{[0-9]+}}:sreg_64 = REG_SEQUENCE [[LO0]], %subreg.sub0, [[HI0]], %subreg.sub1 - %3:sgpr(s64) = G_CONSTANT i64 4294967296 - - ; GCN: %{{[0-9]+}}:sreg_32 = S_MOV_B32 1065353216 - %4:sgpr(s32) = G_FCONSTANT float 1.0 + ; GCN-LABEL: name: constant_v_s32 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec + ; GCN: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967242, implicit $exec + ; GCN: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec + ; GCN: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]] + %0:vgpr(s32) = G_CONSTANT i32 0 + %1:vgpr(s32) = G_CONSTANT i32 1 + %2:vgpr(s32) = G_CONSTANT i32 -1 + %3:vgpr(s32) = G_CONSTANT i32 -54 + %4:vgpr(s32) = G_CONSTANT i32 27 + S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4 +... - ; GCN: %5:sreg_64 = S_MOV_B64 4607182418800017408 - %5:sgpr(s64) = G_FCONSTANT double 1.0 +--- +name: constant_s_s32 +legalized: true +regBankSelected: true +tracksRegLiveness: true - ; GCN: [[LO1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN: [[HI1:%[0-9]+]]:sreg_32 = S_MOV_B32 1076101120 - ; GCN: %{{[0-9]+}}:sreg_64 = REG_SEQUENCE [[LO1]], %subreg.sub0, [[HI1]], %subreg.sub1 - %6:sgpr(s64) = G_FCONSTANT double 10.0 +body: | + bb.0: + ; GCN-LABEL: name: constant_s_s32 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967242 + ; GCN: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27 + ; GCN: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]] + %0:sgpr(s32) = G_CONSTANT i32 0 + %1:sgpr(s32) = G_CONSTANT i32 1 + %2:sgpr(s32) = G_CONSTANT i32 -1 + %3:sgpr(s32) = G_CONSTANT i32 -54 + %4:sgpr(s32) = G_CONSTANT i32 27 + S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4 +... 
- ; GCN: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_e32 1 - %7:vgpr(s32) = G_CONSTANT i32 1 +# FIXME +# --- +# name: constant_v_s16 +# legalized: true +# regBankSelected: true +# tracksRegLiveness: true - ; GCN: [[LO2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0 - ; GCN: [[HI2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1 - ; GCN: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE [[LO2]], %subreg.sub0, [[HI2]], %subreg.sub1 - %8:vgpr(s64) = G_CONSTANT i64 4294967296 +# body: | +# bb.0: +# %0:vgpry(s16) = G_CONSTANT i16 0 +# %1:vgpr(s16) = G_CONSTANT i16 1 +# %2:vgpr(s16) = G_CONSTANT i16 -1 +# %3:vgpr(s16) = G_CONSTANT i16 -54 +# %4:vgpr(s16) = G_CONSTANT i16 27 +# S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4 +# ... - ; GCN: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_e32 1065353216 - %9:vgpr(s32) = G_FCONSTANT float 1.0 +--- +name: constant_v_s64 +legalized: true +regBankSelected: true +tracksRegLiveness: true - ; GCN: [[LO3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0 - ; GCN: [[HI3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1072693248 - ; GCN: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE [[LO3]], %subreg.sub0, [[HI3]], %subreg.sub1 - %10:vgpr(s64) = G_FCONSTANT double 1.0 +body: | + bb.0: + ; GCN-LABEL: name: constant_v_s64 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec + ; GCN: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; GCN: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967242, implicit $exec + ; GCN: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; GCN: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_8:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec + ; GCN: [[V_MOV_B32_e32_9:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_8]], %subreg.sub0, [[V_MOV_B32_e32_9]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_10:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec + ; GCN: [[V_MOV_B32_e32_11:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_10]], %subreg.sub0, [[V_MOV_B32_e32_11]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_12:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[V_MOV_B32_e32_13:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_12]], %subreg.sub0, [[V_MOV_B32_e32_13]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_14:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec + ; GCN: [[V_MOV_B32_e32_15:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec + ; GCN: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_14]], %subreg.sub0, [[V_MOV_B32_e32_15]], %subreg.sub1 + ; GCN: S_ENDPGM 0, implicit 
[[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]], implicit [[REG_SEQUENCE4]], implicit [[REG_SEQUENCE5]], implicit [[REG_SEQUENCE6]], implicit [[REG_SEQUENCE7]] + %0:vgpr(s64) = G_CONSTANT i64 0 + %1:vgpr(s64) = G_CONSTANT i64 1 + %2:vgpr(s64) = G_CONSTANT i64 -1 + %3:vgpr(s64) = G_CONSTANT i64 -54 + %4:vgpr(s64) = G_CONSTANT i64 27 + %5:vgpr(s64) = G_CONSTANT i64 4294967295 + %6:vgpr(s64) = G_CONSTANT i64 4294967296 + %7:vgpr(s64) = G_CONSTANT i64 18446744004990098135 + S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7 +... - S_ENDPGM 0, implicit %2, implicit %4, implicit %5, implicit %6, implicit %8, implicit %3, implicit %5, implicit %7, implicit %9, implicit %10 +--- +name: constant_s_s64 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: constant_s_s64 + ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GCN: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1 + ; GCN: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967242 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GCN: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27 + ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 + ; GCN: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GCN: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_4]], %subreg.sub0, [[S_MOV_B32_5]], %subreg.sub1 + ; GCN: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 23255 + ; GCN: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 -16 + ; GCN: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_6]], %subreg.sub0, [[S_MOV_B32_7]], %subreg.sub1 + ; GCN: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_3]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]] + %0:sgpr(s64) = G_CONSTANT i64 0 + %1:sgpr(s64) = G_CONSTANT i64 1 + %2:sgpr(s64) = G_CONSTANT i64 -1 + %3:sgpr(s64) = G_CONSTANT i64 -54 + %4:sgpr(s64) = G_CONSTANT i64 27 + %5:sgpr(s64) = G_CONSTANT i64 4294967295 + %6:sgpr(s64) = G_CONSTANT i64 4294967296 + %7:sgpr(s64) = G_CONSTANT i64 18446744004990098135 + S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir new file mode 100644 index 0000000000000..9afa4b08c0ecb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir @@ -0,0 +1,159 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: fconstant_v_s32 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + ; GCN-LABEL: name: fconstant_v_s32 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec + ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1090519040, implicit $exec + ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec + ; GCN: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1090519040, implicit $exec + ; GCN: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]] + %0:vgpr(s32) = G_FCONSTANT float 1.0 + %1:vgpr(s32) = G_FCONSTANT float 8.0 + %2:vgpr(s32) = G_FCONSTANT float 1.0 + %3:vgpr(s32) = G_FCONSTANT float 8.0 + S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2 , implicit %3 +... + +--- +name: fconstant_s_s32 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + ; GCN-LABEL: name: fconstant_s_s32 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1065353216 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1090519040 + ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 3212836864 + ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 3238002688 + ; GCN: $sgpr0 = COPY [[S_MOV_B32_]] + ; GCN: $sgpr1 = COPY [[S_MOV_B32_1]] + ; GCN: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]] + %0:sgpr(s32) = G_FCONSTANT float 1.0 + %1:sgpr(s32) = G_FCONSTANT float 8.0 + %2:sgpr(s32) = G_FCONSTANT float -1.0 + %3:sgpr(s32) = G_FCONSTANT float -8.0 + $sgpr0 = COPY %0 + $sgpr1 = COPY %1 + S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2 , implicit %3 + +... 
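# NOTE: The f32 cases above are moves of the raw IEEE-754 bit pattern, e.g.
# 1.0f -> 0x3F800000 = 1065353216, 8.0f -> 0x41000000 = 1090519040,
# -1.0f -> 0xBF800000 = 3212836864, -8.0f -> 0xC1000000 = 3238002688.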
+ +--- +name: fconstant_v_s64 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + ; GCN-LABEL: name: fconstant_v_s64 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1072693248, implicit $exec + ; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1075838976, implicit $exec + ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1073741824, implicit $exec + ; GCN: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1076101120, implicit $exec + ; GCN: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1 + ; GCN: $vgpr0_vgpr1 = COPY [[REG_SEQUENCE]] + ; GCN: $vgpr2_vgpr3 = COPY [[REG_SEQUENCE1]] + ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]] + %0:vgpr(s64) = G_FCONSTANT double 1.0 + %1:vgpr(s64) = G_FCONSTANT double 8.0 + %2:vgpr(s64) = G_FCONSTANT double -2.0 + %3:vgpr(s64) = G_FCONSTANT double 10.0 + $vgpr0_vgpr1 = COPY %0 + $vgpr2_vgpr3 = COPY %1 + S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2 , implicit %3 + +... + +--- +name: fconstant_s_s64 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + ; GCN-LABEL: name: fconstant_s_s64 + ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 4607182418800017408 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1075838976 + ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GCN: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 -4611686018427387904 + ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -1071382528 + ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 + ; GCN: $sgpr0_sgpr1 = COPY [[S_MOV_B64_]] + ; GCN: $sgpr2_sgpr3 = COPY [[REG_SEQUENCE]] + ; GCN: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_1]], implicit [[REG_SEQUENCE1]] + %0:sgpr(s64) = G_FCONSTANT double 1.0 + %1:sgpr(s64) = G_FCONSTANT double 8.0 + %2:sgpr(s64) = G_FCONSTANT double -2.0 + %3:sgpr(s64) = G_FCONSTANT double -10.0 + $sgpr0_sgpr1 = COPY %0 + $sgpr2_sgpr3 = COPY %1 + S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2 , implicit %3 +... 
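# NOTE: f16 constants are materialized in 32-bit registers, so half 1.0
# (0x3C00 = 15360) and half 8.0 (0x4800 = 18432) select to ordinary 32-bit
# moves in the tests below.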
+ +--- +name: fconstant_v_s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + ; GCN-LABEL: name: fconstant_v_s16 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 15360, implicit $exec + ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 18432, implicit $exec + ; GCN: $vgpr0 = COPY [[V_MOV_B32_e32_]] + ; GCN: $vgpr1 = COPY [[V_MOV_B32_e32_1]] + %0:vgpr(s16) = G_FCONSTANT half 1.0 + %1:vgpr(s16) = G_FCONSTANT half 8.0 + %2:vgpr(s32) = G_ANYEXT %0 + %3:vgpr(s32) = G_ANYEXT %1 + $vgpr0 = COPY %2 + $vgpr1 = COPY %3 + +... + +--- +name: fconstant_s_s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + ; GCN-LABEL: name: fconstant_s_s16 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 15360 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 18432 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GCN: $sgpr0 = COPY [[COPY]] + ; GCN: $sgpr1 = COPY [[COPY1]] + %0:sgpr(s16) = G_FCONSTANT half 1.0 + %1:sgpr(s16) = G_FCONSTANT half 8.0 + %2:vgpr(s32) = G_ANYEXT %0 + %3:vgpr(s32) = G_ANYEXT %1 + $sgpr0 = COPY %2 + $sgpr1 = COPY %3 + +... + From 4033aa1467d6ea0cb5289ed9db2f4e3cc015eac1 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 25 Jul 2020 14:37:29 -0400 Subject: [PATCH 0129/1035] AMDGPU/GlobalISel: Sign extend integer constants This matches the DAG behavior and fixes immediate folding --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 2 +- .../CodeGen/AMDGPU/GlobalISel/add.v2i16.ll | 8 +- .../GlobalISel/divergent-control-flow.ll | 26 +- .../AMDGPU/GlobalISel/inst-select-add.mir | 2 +- .../AMDGPU/GlobalISel/inst-select-and.mir | 24 +- .../GlobalISel/inst-select-constant.mir | 8 +- .../inst-select-extract-vector-elt.mir | 12 +- .../GlobalISel/inst-select-load-local.mir | 6 +- .../GlobalISel/inst-select-load-private.mir | 28 +- .../AMDGPU/GlobalISel/inst-select-or.mir | 24 +- .../AMDGPU/GlobalISel/inst-select-ptrmask.mir | 30 +- .../AMDGPU/GlobalISel/inst-select-xor.mir | 24 +- .../GlobalISel/llvm.amdgcn.s.buffer.load.ll | 26 +- .../AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll | 8 +- .../AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll | 8 +- .../CodeGen/AMDGPU/GlobalISel/localizer.ll | 8 +- .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 244 +++--- .../CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll | 2 +- .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 17 +- .../CodeGen/AMDGPU/GlobalISel/srem.i32.ll | 2 +- .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 15 +- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 750 +++++++++--------- .../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll | 330 ++++---- .../CodeGen/AMDGPU/GlobalISel/udiv.i32.ll | 2 +- .../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 19 +- .../CodeGen/AMDGPU/GlobalISel/urem.i32.ll | 2 +- .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 19 +- 27 files changed, 796 insertions(+), 850 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 4740a58519996..a126ed1daf17f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1952,7 +1952,7 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); ImmOp.ChangeToImmediate(Imm.getZExtValue()); } else if (ImmOp.isCImm()) { - ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); + ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue()); } 
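// NOTE: Sketch of why the sign extension matters, illustration only: an i32
// constant -64 is the bit pattern 0xFFFFFFC0. Widening it with getZExtValue()
// yields the 64-bit immediate 4294967232, which no longer looks like a small
// signed value, so inline-immediate folding fails. For a hypothetical
// ConstantInt *CI holding i32 -64:
//   CI->getZExtValue(); // 4294967232 (0x00000000FFFFFFC0)
//   CI->getSExtValue(); // -64        (0xFFFFFFFFFFFFFFC0)
// Sign-extending matches the SelectionDAG path, which is why the test
// updates below flip immediates such as 4294967232 to -64.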
Register DstReg = I.getOperand(0).getReg(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll index ccf846a933abd..c6c0eb7c4a937 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -91,7 +91,7 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) { ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0xffffffc0 +; GFX9-NEXT: s_movk_i32 s4, 0xffc0 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 ; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -99,7 +99,7 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) { ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0xffc0 +; GFX8-NEXT: s_movk_i32 s4, 0xffc0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_add_u16_e32 v1, s4, v0 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -140,7 +140,7 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) { ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_hi: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, 0xffc0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffc0 ; GFX8-NEXT: v_add_u16_e32 v2, 4, v0 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 @@ -152,7 +152,7 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) { define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_splat: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s1, 0xffffffc0 +; GFX9-NEXT: s_movk_i32 s1, 0xffc0 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s1 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll index 2695952bfd193..1f9c3bc60876e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -135,24 +135,24 @@ define void @constrained_if_register_class() { ; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s4, 0 -; CHECK-NEXT: s_cselect_b32 s4, 1, 0 -; CHECK-NEXT: s_xor_b32 s4, s4, 1 -; CHECK-NEXT: s_and_b32 s4, s4, 1 -; CHECK-NEXT: s_cmp_lg_u32 s4, 0 +; CHECK-NEXT: s_cselect_b32 s5, 1, 0 +; CHECK-NEXT: s_xor_b32 s5, s5, -1 +; CHECK-NEXT: s_and_b32 s5, s5, 1 +; CHECK-NEXT: s_mov_b32 s4, -1 +; CHECK-NEXT: s_cmp_lg_u32 s5, 0 ; CHECK-NEXT: s_cbranch_scc0 BB4_6 ; CHECK-NEXT: ; %bb.1: ; %bb2 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, const.ptr@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, const.ptr@gotpcrel32@hi+4 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, const.ptr@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, const.ptr@gotpcrel32@hi+4 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: 
v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: flat_load_dword v0, v[0:1] -; CHECK-NEXT: s_mov_b32 s4, -1 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, 1.0, v0 ; CHECK-NEXT: s_xor_b64 s[8:9], vcc, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir index 51a116a944ad6..baed490c07581 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir @@ -94,7 +94,7 @@ body: | ; GFX6-LABEL: name: add_neg_inline_const_64_to_sub_s32_v ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967232, implicit $exec + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -64, implicit $exec ; GFX6: %2:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX6: S_ENDPGM 0, implicit %2 ; GFX9-LABEL: name: add_neg_inline_const_64_to_sub_s32_v diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir index ecfb9b618f5ed..81437acbbbc53 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir @@ -20,7 +20,7 @@ body: | ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec ; WAVE64: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec - ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]] + ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc ; WAVE64: S_ENDPGM 0, implicit [[S_AND_B64_]] ; WAVE32-LABEL: name: and_s1_vcc_vcc_vcc ; WAVE32: liveins: $vgpr0, $vgpr1 @@ -30,7 +30,7 @@ body: | ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec ; WAVE32: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec - ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]] + ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc ; WAVE32: S_ENDPGM 0, implicit [[S_AND_B32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 @@ -386,7 +386,7 @@ body: | ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec - ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE64: S_ENDPGM 0, implicit [[S_AND_B64_]] ; WAVE32-LABEL: name: and_s1_vcc_copy_to_vcc ; WAVE32: liveins: $vgpr0, $vgpr1 @@ -397,7 +397,7 @@ body: | ; WAVE32: 
[[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec - ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE32: S_ENDPGM 0, implicit [[S_AND_B32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 @@ -425,24 +425,24 @@ body: | ; WAVE64-LABEL: name: copy_select_constrain_vcc_result_reg_wave32 ; WAVE64: liveins: $vgpr0 ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; WAVE64: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE64: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_AND_B64_]] ; WAVE64: S_ENDPGM 0, implicit [[COPY1]] ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave32 ; WAVE32: liveins: $vgpr0 ; WAVE32: $vcc_hi = IMPLICIT_DEF ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; WAVE32: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE32: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE32: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE32: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_AND_B32_1]] ; WAVE32: S_ENDPGM 0, implicit [[COPY1]] %1:vgpr(s32) = COPY $vgpr0 @@ -471,24 +471,24 @@ body: | ; WAVE64-LABEL: name: copy_select_constrain_vcc_result_reg_wave64 ; WAVE64: liveins: $vgpr0 ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; WAVE64: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def 
dead $scc ; WAVE64: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_AND_B64_]] ; WAVE64: S_ENDPGM 0, implicit [[COPY1]] ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave64 ; WAVE32: liveins: $vgpr0 ; WAVE32: $vcc_hi = IMPLICIT_DEF ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; WAVE32: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE32: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE32: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE32: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_AND_B32_1]] ; WAVE32: S_ENDPGM 0, implicit [[COPY1]] %1:vgpr(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir index 2f2c305cfac79..c8762c0d578eb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir @@ -12,8 +12,8 @@ body: | ; GCN-LABEL: name: constant_v_s32 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec - ; GCN: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967242, implicit $exec + ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; GCN: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -54, implicit $exec ; GCN: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec ; GCN: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]] %0:vgpr(s32) = G_CONSTANT i32 0 @@ -35,8 +35,8 @@ body: | ; GCN-LABEL: name: constant_s_s32 ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 - ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967242 + ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -54 ; GCN: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27 ; GCN: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]] %0:sgpr(s32) = G_CONSTANT i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract-vector-elt.mir index da730b0c9fa9a..10e4cbdc14676 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract-vector-elt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract-vector-elt.mir @@ -315,7 +315,7 @@ body: | ; MOVREL-LABEL: name: extract_vector_elt_s_s32_v8s32_idx_offset_m1 ; MOVREL: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 - ; MOVREL: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; MOVREL: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc ; MOVREL: $m0 = COPY [[S_ADD_I32_]] ; MOVREL: [[S_MOVRELS_B32_:%[0-9]+]]:sreg_32 = S_MOVRELS_B32 [[COPY]].sub0, implicit $m0, implicit [[COPY]] @@ -323,7 +323,7 @@ body: | ; GPRIDX-LABEL: name: extract_vector_elt_s_s32_v8s32_idx_offset_m1 ; GPRIDX: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 - ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; GPRIDX: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc ; GPRIDX: $m0 = COPY [[S_ADD_I32_]] ; GPRIDX: [[S_MOVRELS_B32_:%[0-9]+]]:sreg_32 = S_MOVRELS_B32 [[COPY]].sub0, implicit $m0, implicit [[COPY]] @@ -468,7 +468,7 @@ body: | ; MOVREL-LABEL: name: extract_vector_elt_s_s64_v8s64_idx_offset_m1 ; MOVREL: [[COPY:%[0-9]+]]:sgpr_512 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 - ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; MOVREL: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc ; MOVREL: $m0 = COPY [[S_ADD_I32_]] ; MOVREL: [[S_MOVRELS_B64_:%[0-9]+]]:sreg_64 = S_MOVRELS_B64 [[COPY]].sub0_sub1, implicit $m0, implicit [[COPY]] @@ -476,7 +476,7 @@ body: | ; GPRIDX-LABEL: name: extract_vector_elt_s_s64_v8s64_idx_offset_m1 ; GPRIDX: [[COPY:%[0-9]+]]:sgpr_512 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 - ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; GPRIDX: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc ; GPRIDX: $m0 = COPY [[S_ADD_I32_]] ; GPRIDX: [[S_MOVRELS_B64_:%[0-9]+]]:sreg_64 = S_MOVRELS_B64 [[COPY]].sub0_sub1, implicit $m0, implicit [[COPY]] @@ -699,7 +699,7 @@ body: | ; MOVREL-LABEL: name: extract_vector_elt_v_s32_v8s32_idx_offset_m1 ; MOVREL: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 - ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; MOVREL: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc ; MOVREL: $m0 = COPY [[S_ADD_I32_]] ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 undef [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]] @@ -707,7 +707,7 @@ body: | ; GPRIDX-LABEL: name: extract_vector_elt_v_s32_v8s32_idx_offset_m1 ; GPRIDX: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 - ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; GPRIDX: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc ; GPRIDX: S_SET_GPR_IDX_ON [[S_ADD_I32_]], 1, implicit-def $m0, implicit-def $mode, implicit $m0, implicit $mode ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef 
[[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir
index a80ad208b5898..45d74ad38e532 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir
@@ -681,7 +681,7 @@ body: |
 ; GFX7-LABEL: name: load_local_s32_from_1_gep_m1
 ; GFX7: liveins: $vgpr0
 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
+ ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
 ; GFX7: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; GFX7: $m0 = S_MOV_B32 -1
 ; GFX7: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
@@ -689,14 +689,14 @@ body: |
 ; GFX9-LABEL: name: load_local_s32_from_1_gep_m1
 ; GFX9: liveins: $vgpr0
 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
+ ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
 ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; GFX9: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (load 1, addrspace 3)
 ; GFX9: $vgpr0 = COPY [[DS_READ_U8_gfx9_]]
 ; GFX6-LABEL: name: load_local_s32_from_1_gep_m1
 ; GFX6: liveins: $vgpr0
 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
+ ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
 ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; GFX6: $m0 = S_MOV_B32 -1
 ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
index 162dd01de66d1..9f6d10722143d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
@@ -311,14 +311,14 @@ body: |
 ; GFX6-LABEL: name: load_private_s32_from_1_gep_m2047
 ; GFX6: liveins: $vgpr0
 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
+ ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2047, implicit $exec
 ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
 ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
 ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2047
 ; GFX9: liveins: $vgpr0
 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
+ ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2047, implicit $exec
 ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
 ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
@@ -347,14 +347,14 @@ body: |
 ; GFX6-LABEL: name: load_private_s32_from_1_gep_m2048
 ; GFX6: liveins: $vgpr0
 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
+ ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2048, implicit $exec
 ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
 ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
 ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2048
 ; GFX9: liveins: $vgpr0
 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
+ ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2048, implicit $exec
 ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
 ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
@@ -453,14 +453,14 @@ body: |
 ; GFX6-LABEL: name: load_private_s32_from_1_gep_m4095
 ; GFX6: liveins: $vgpr0
 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
+ ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4095, implicit $exec
 ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
 ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
 ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4095
 ; GFX9: liveins: $vgpr0
 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
+ ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4095, implicit $exec
 ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
 ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
@@ -489,14 +489,14 @@ body: |
 ; GFX6-LABEL: name: load_private_s32_from_1_gep_m4096
 ; GFX6: liveins: $vgpr0
 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
+ ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4096, implicit $exec
 ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
 ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
 ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4096
 ; GFX9: liveins: $vgpr0
 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
+ ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4096, implicit $exec
 ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
 ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
@@ -597,14 +597,14 @@ body: |
 ; GFX6-LABEL: name: load_private_s32_from_1_gep_m8191
 ; GFX6: liveins: $vgpr0
 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
+ ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8191, implicit $exec
 ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
 ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
 ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8191
 ; GFX9: liveins: $vgpr0
 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
+ ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8191, implicit $exec
 ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
 ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
@@ -633,14 +633,14 @@ body: |
 ; GFX6-LABEL: name: load_private_s32_from_1_gep_m8192
 ; GFX6: liveins: $vgpr0
 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
+ ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8192, implicit $exec
 ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
 ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
 ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8192
 ; GFX9: liveins: $vgpr0
 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
+ ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8192, implicit $exec
 ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
 ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
 ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
@@ -860,11 +860,11 @@ body: |
 bb.0:
 ; GFX6-LABEL: name: load_private_s32_from_neg1
- ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
+ ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
 ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, -1, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
 ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
 ; GFX9-LABEL: name: load_private_s32_from_neg1
- ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
+ ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
 ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, -1, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
 ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
 %0:vgpr(p5) = G_CONSTANT i32 -1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir
index 40b7b69f83b7f..7f1f52d2c522a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir
@@ -20,7 +20,7 @@ body: |
 ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
 ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
 ; WAVE64: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
- ; WAVE64: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]]
+ ; WAVE64: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc
 ; WAVE64: S_ENDPGM 0, implicit [[S_OR_B64_]]
 ; WAVE32-LABEL: name: or_s1_vcc_vcc_vcc
 ; WAVE32: liveins: $vgpr0, $vgpr1
@@ -30,7 +30,7 @@ body: |
 ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
 ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
 ; WAVE32: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
- ; WAVE32: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]]
+ ; WAVE32: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc
 ; WAVE32: S_ENDPGM 0, implicit [[S_OR_B32_]]
 %0:vgpr(s32) = COPY $vgpr0
 %1:vgpr(s32) = COPY $vgpr1
@@ -386,7 +386,7 @@ body: |
 ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
 ; WAVE64: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec
 ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec
- ; WAVE64: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]]
+ ; WAVE64: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
 ; WAVE64: S_ENDPGM 0, implicit [[S_OR_B64_]]
 ; WAVE32-LABEL: name: or_s1_vcc_copy_to_vcc
 ; WAVE32: liveins: $vgpr0, $vgpr1
@@ -397,7 +397,7 @@ body: |
 ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
 ; WAVE32: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec
 ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec
- ; WAVE32: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]]
+ ; WAVE32: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
 ; WAVE32: S_ENDPGM 0, implicit [[S_OR_B32_]]
 %0:vgpr(s32) = COPY $vgpr0
 %1:vgpr(s32) = COPY $vgpr1
@@ -425,24 +425,24 @@ body: |
 ; WAVE64-LABEL: name: copy_select_constrain_vcc_result_reg_wave32
 ; WAVE64: liveins: $vgpr0
 ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
 ; WAVE64: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec
 ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
 ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc
 ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
- ; WAVE64: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]]
+ ; WAVE64: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
 ; WAVE64: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_OR_B64_]]
 ; WAVE64: S_ENDPGM 0, implicit [[COPY1]]
 ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave32
 ; WAVE32: liveins: $vgpr0
 ; WAVE32: $vcc_hi = IMPLICIT_DEF
 ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
 ; WAVE32: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec
 ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
 ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc
 ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
- ; WAVE32: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]]
+ ; WAVE32: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
 ; WAVE32: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_OR_B32_]]
 ; WAVE32: S_ENDPGM 0, implicit [[COPY1]]
 %1:vgpr(s32) = COPY $vgpr0
@@ -471,24 +471,24 @@ body: |
 ; WAVE64-LABEL: name: copy_select_constrain_vcc_result_reg_wave64
 ; WAVE64: liveins: $vgpr0
 ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
 ; WAVE64: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec
 ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
 ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc
 ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
- ; WAVE64: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]]
+ ; WAVE64: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
 ; WAVE64: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_OR_B64_]]
 ; WAVE64: S_ENDPGM 0, implicit [[COPY1]]
 ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave64
 ; WAVE32: liveins: $vgpr0
 ; WAVE32: $vcc_hi = IMPLICIT_DEF
 ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
 ; WAVE32: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec
 ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
 ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc
 ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
- ; WAVE32: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]]
+ ; WAVE32: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
 ; WAVE32: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_OR_B32_]]
 ; WAVE32: S_ENDPGM 0, implicit [[COPY1]]
 %1:vgpr(s32) = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir
index bbb885c705ed6..a7f875fcdd428 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir
@@ -33,7 +33,7 @@ body: |
 ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_0xf0f0f0f0
 ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; CHECK: %const:sreg_32 = S_MOV_B32 4042322160
+ ; CHECK: %const:sreg_32 = S_MOV_B32 -252645136
 ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], %const, implicit-def $scc
 ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
 %0:sgpr(p3) = COPY $sgpr0
@@ -54,7 +54,7 @@ body: |
 ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_0xffffffff
 ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; CHECK: %const:sreg_32 = S_MOV_B32 4294967295
+ ; CHECK: %const:sreg_32 = S_MOV_B32 -1
 ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], %const, implicit-def $scc
 ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
 %0:sgpr(p3) = COPY $sgpr0
@@ -96,7 +96,7 @@ body: |
 ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearhi1
 ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; CHECK: %const:sreg_32 = S_MOV_B32 2147483648
+ ; CHECK: %const:sreg_32 = S_MOV_B32 -2147483648
 ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], %const, implicit-def $scc
 ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
 %0:sgpr(p3) = COPY $sgpr0
@@ -117,7 +117,7 @@ body: |
 ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearhi2
 ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; CHECK: %const:sreg_32 = S_MOV_B32 3221225472
+ ; CHECK: %const:sreg_32 = S_MOV_B32 -1073741824
 ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], %const, implicit-def $scc
 ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
 %0:sgpr(p3) = COPY $sgpr0
@@ -138,7 +138,7 @@ body: |
 ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearlo1
 ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; CHECK: %const:sreg_32 = S_MOV_B32 4294967294
+ ; CHECK: %const:sreg_32 = S_MOV_B32 -2
 ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], %const, implicit-def $scc
 ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
 %0:sgpr(p3) = COPY $sgpr0
@@ -159,7 +159,7 @@ body: |
 ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearlo2
 ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; CHECK: %const:sreg_32 = S_MOV_B32 4294967292
+ ; CHECK: %const:sreg_32 = S_MOV_B32 -4
 ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], %const, implicit-def $scc
 ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
 %0:sgpr(p3) = COPY $sgpr0
@@ -180,7 +180,7 @@ body: |
 ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearlo3
 ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; CHECK: %const:sreg_32 = S_MOV_B32 4294967288
+ ; CHECK: %const:sreg_32 = S_MOV_B32 -8
 ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], %const, implicit-def $scc
 ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
 %0:sgpr(p3) = COPY $sgpr0
@@ -201,7 +201,7 @@ body: |
 ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearlo4
 ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; CHECK: %const:sreg_32 = S_MOV_B32 4294967280
+ ; CHECK: %const:sreg_32 = S_MOV_B32 -16
 ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], %const, implicit-def $scc
 ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
 %0:sgpr(p3) = COPY $sgpr0
@@ -222,7 +222,7 @@ body: |
 ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearlo29
 ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; CHECK: %const:sreg_32 = S_MOV_B32 3758096384
+ ; CHECK: %const:sreg_32 = S_MOV_B32 -536870912
 ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], %const, implicit-def $scc
 ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
 %0:sgpr(p3) = COPY $sgpr0
@@ -560,7 +560,7 @@ body: |
 ; CHECK-LABEL: name: ptrmask_p3_vgpr_vgpr_0xf0f0f0f0
 ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 4042322160, implicit $exec
+ ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 -252645136, implicit $exec
 ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], %const, implicit $exec
 ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
 %0:vgpr(p3) = COPY $vgpr0
@@ -581,7 +581,7 @@ body: |
 ; CHECK-LABEL: name: ptrmask_p3_vgpr_vgpr_clearlo1
 ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 4294967294, implicit $exec
+ ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 -2, implicit $exec
 ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], %const, implicit $exec
 ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
 %0:vgpr(p3) = COPY $vgpr0
@@ -602,7 +602,7 @@ body: |
 ; CHECK-LABEL: name: ptrmask_p3_vgpr_vgpr_clearlo2
 ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec
+ ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 -4, implicit $exec
 ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], %const, implicit $exec
 ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
 %0:vgpr(p3) = COPY $vgpr0
@@ -623,7 +623,7 @@ body: |
 ; CHECK-LABEL: name: ptrmask_p3_vgpr_vgpr_clearlo3
 ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 4294967288, implicit $exec
+ ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 -8, implicit $exec
 ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], %const, implicit $exec
 ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
 %0:vgpr(p3) = COPY $vgpr0
@@ -644,7 +644,7 @@ body: |
 ; CHECK-LABEL: name: ptrmask_p3_vgpr_vgpr_clearlo4
 ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 4294967280, implicit $exec
+ ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
 ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], %const, implicit $exec
 ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
 %0:vgpr(p3) = COPY $vgpr0
@@ -665,7 +665,7 @@ body: |
 ; CHECK-LABEL: name: ptrmask_p3_vgpr_vgpr_clearlo29
 ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 3758096384, implicit $exec
+ ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 -536870912, implicit $exec
 ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], %const, implicit $exec
 ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
 %0:vgpr(p3) = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir
index 1c03557cc4953..f923a4c9f02b8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir
@@ -20,7 +20,7 @@ body: |
 ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
 ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
 ; WAVE64: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
- ; WAVE64: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]]
+ ; WAVE64: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc
 ; WAVE64: S_ENDPGM 0, implicit [[S_XOR_B64_]]
 ; WAVE32-LABEL: name: xor_s1_vcc_vcc_vcc
 ; WAVE32: liveins: $vgpr0, $vgpr1
@@ -30,7 +30,7 @@ body: |
 ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
 ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
 ; WAVE32: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
- ; WAVE32: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]]
+ ; WAVE32: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc
 ; WAVE32: S_ENDPGM 0, implicit [[S_XOR_B32_]]
 %0:vgpr(s32) = COPY $vgpr0
 %1:vgpr(s32) = COPY $vgpr1
@@ -387,7 +387,7 @@ body: |
 ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
 ; WAVE64: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec
 ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec
- ; WAVE64: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]]
+ ; WAVE64: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
 ; WAVE64: S_ENDPGM 0, implicit [[S_XOR_B64_]]
 ; WAVE32-LABEL: name: xor_s1_vcc_copy_to_vcc
 ; WAVE32: liveins: $vgpr0, $vgpr1
@@ -398,7 +398,7 @@ body: |
 ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
 ; WAVE32: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec
 ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec
- ; WAVE32: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]]
+ ; WAVE32: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
 ; WAVE32: S_ENDPGM 0, implicit [[S_XOR_B32_]]
 %0:vgpr(s32) = COPY $vgpr0
 %1:vgpr(s32) = COPY $vgpr1
@@ -426,24 +426,24 @@ body: |
 ; WAVE64-LABEL: name: copy_select_constrain_vcc_result_reg_wave32
 ; WAVE64: liveins: $vgpr0
 ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
 ; WAVE64: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec
 ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
 ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc
 ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
- ; WAVE64: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]]
+ ; WAVE64: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
 ; WAVE64: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_XOR_B64_]]
 ; WAVE64: S_ENDPGM 0, implicit [[COPY1]]
 ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave32
 ; WAVE32: liveins: $vgpr0
 ; WAVE32: $vcc_hi = IMPLICIT_DEF
 ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
 ; WAVE32: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec
 ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
 ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc
 ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
- ; WAVE32: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]]
+ ; WAVE32: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
 ; WAVE32: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_XOR_B32_]]
 ; WAVE32: S_ENDPGM 0, implicit [[COPY1]]
 %1:vgpr(s32) = COPY $vgpr0
@@ -472,24 +472,24 @@ body: |
 ; WAVE64-LABEL: name: copy_select_constrain_vcc_result_reg_wave64
 ; WAVE64: liveins: $vgpr0
 ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
 ; WAVE64: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec
 ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
 ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc
 ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
- ; WAVE64: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]]
+ ; WAVE64: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
 ; WAVE64: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_XOR_B64_]]
 ; WAVE64: S_ENDPGM 0, implicit [[COPY1]]
 ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave64
 ; WAVE32: liveins: $vgpr0
 ; WAVE32: $vcc_hi = IMPLICIT_DEF
 ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
 ; WAVE32: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec
 ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
 ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc
 ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
- ; WAVE32: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]]
+ ; WAVE32: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
 ; WAVE32: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_XOR_B32_]]
 ; WAVE32: S_ENDPGM 0, implicit [[COPY1]]
 %1:vgpr(s32) = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
index 66425c27a19fe..aba47890f61cb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
@@ -973,7 +973,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg1(<4 x i32> inreg %desc) {
 ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
 ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295
+ ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
 ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
 ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
 ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
@@ -987,7 +987,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg1(<4 x i32> inreg %desc) {
 ; GFX7: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
 ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295
+ ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
 ; GFX7: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
 ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
 ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
@@ -1001,7 +1001,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg1(<4 x i32> inreg %desc) {
 ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
 ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295
+ ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
 ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
 ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
 ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
@@ -1020,7 +1020,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg4(<4 x i32> inreg %desc) {
 ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
 ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967292
+ ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4
 ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
 ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
 ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
@@ -1047,7 +1047,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg4(<4 x i32> inreg %desc) {
 ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
 ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967292
+ ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4
 ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
 ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
 ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
@@ -1066,7 +1066,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg8(<4 x i32> inreg %desc) {
 ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
 ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967288
+ ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -8
 ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
 ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
 ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
@@ -1093,7 +1093,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg8(<4 x i32> inreg %desc) {
 ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
 ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967288
+ ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -8
 ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
 ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
 ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
@@ -1112,7 +1112,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit31(<4 x i32> inreg %desc) {
 ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
 ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
+ ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
 ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
 ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
 ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
@@ -1139,7 +1139,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit31(<4 x i32> inreg %desc) {
 ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
 ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
+ ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
 ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
 ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
 ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
@@ -1342,7 +1342,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit20(<4 x i32> inreg %desc)
 ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
 ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4293918720
+ ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1048576
 ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
 ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
 ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
@@ -1369,7 +1369,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit20(<4 x i32> inreg %desc)
 ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
 ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4293918720
+ ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1048576
 ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
 ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
 ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
@@ -1433,7 +1433,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit19(<4 x i32> inreg %desc)
 ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
 ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294443008
+ ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -524288
 ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
 ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
 ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
@@ -1460,7 +1460,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit19(<4 x i32> inreg %desc)
 ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
 ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294443008
+ ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -524288
 ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
 ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
 ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
index 7d116f8e8925f..e5d26476e9424 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
@@ -414,8 +414,8 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out)
 ; GFX6-LABEL: bfe_i32_constant_fold_test_4:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT: s_bfe_i32 s3, -1, 0x10000
-; GFX6-NEXT: v_mov_b32_e32 v0, s3
+; GFX6-NEXT: s_bfe_i32 s2, -1, 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
 ; GFX6-NEXT: s_mov_b32 s2, -1
 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -617,8 +617,8 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out)
 ; GFX6-LABEL: bfe_i32_constant_fold_test_16:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT: s_bfe_i32 s3, -1, 0x70001
-; GFX6-NEXT: v_mov_b32_e32 v0, s3
+; GFX6-NEXT: s_bfe_i32 s2, -1, 0x70001
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
 ; GFX6-NEXT: s_mov_b32 s2, -1
 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
index a5737e8233af3..6fc1cd5753083 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
@@ -694,8 +694,8 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out)
 ; GFX6-LABEL: bfe_u32_constant_fold_test_4:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT: s_bfe_u32 s3, -1, 0x10000
-; GFX6-NEXT: v_mov_b32_e32 v0, s3
+; GFX6-NEXT: s_bfe_u32 s2, -1, 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
 ; GFX6-NEXT: s_mov_b32 s2, -1
 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -897,8 +897,8 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out)
 ; GFX6-LABEL: bfe_u32_constant_fold_test_16:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT: s_bfe_u32 s3, -1, 0x70001
-; GFX6-NEXT: v_mov_b32_e32 v0, s3
+; GFX6-NEXT: s_bfe_u32 s2, -1, 0x70001
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
 ; GFX6-NEXT: s_mov_b32 s2, -1
 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 06bf7f7949309..23398f8ecf6e4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -8,9 +8,9 @@ define amdgpu_kernel void @localize_constants(i1 %cond) {
 ; GFX9-LABEL: localize_constants:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s0, 1
+; GFX9-NEXT: s_mov_b32 s0, -1
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_xor_b32 s1, s1, 1
+; GFX9-NEXT: s_xor_b32 s1, s1, -1
 ; GFX9-NEXT: s_and_b32 s1, s1, 1
 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT: s_cbranch_scc0 BB0_2
@@ -83,9 +83,9 @@ define amdgpu_kernel void @localize_globals(i1 %cond) {
 ; GFX9-LABEL: localize_globals:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s0, 1
+; GFX9-NEXT: s_mov_b32 s0, -1
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_xor_b32 s1, s1, 1
+; GFX9-NEXT: s_xor_b32 s1, s1, -1
 ; GFX9-NEXT: s_and_b32 s1, s1, 1
 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT: s_cbranch_scc0 BB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 80ff9bb2b5752..b2e7f1ea326f6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -101,7 +101,7 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
 ; GFX8-NEXT: s_cmp_lt_i32 s3, s4
 ; GFX8-NEXT: s_cselect_b32 s3, s3, s4
-; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
+; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3
 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
 ; GFX8-NEXT: s_cmp_gt_i32 s3, s1
@@ -127,7 +127,7 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX9-NEXT: s_sub_i32 s5, 0x7fff, s5
 ; GFX9-NEXT: s_cmp_lt_i32 s3, s4
 ; GFX9-NEXT: s_cselect_b32 s3, s3, s4
-; GFX9-NEXT: s_sub_i32 s3, 0x8000, s3
+; GFX9-NEXT: s_sub_i32 s3, 0xffff8000, s3
 ; GFX9-NEXT: s_sext_i32_i16 s3, s3
 ; GFX9-NEXT: s_sext_i32_i16 s1, s1
 ; GFX9-NEXT: s_cmp_gt_i32 s3, s1
@@ -155,7 +155,7 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX10-NEXT: s_sub_i32 s5, 0x7fff, s5
 ; GFX10-NEXT: s_cmp_lt_i32 s3, s4
 ; GFX10-NEXT: s_cselect_b32 s3, s3, s4
-; GFX10-NEXT: s_sub_i32 s3, 0x8000, s3
+; GFX10-NEXT: s_sub_i32 s3, 0xffff8000, s3
 ; GFX10-NEXT: s_sext_i32_i16 s3, s3
 ; GFX10-NEXT: s_cmp_gt_i32 s3, s1
 ; GFX10-NEXT: s_cselect_b32 s1, s3, s1
@@ -268,7 +268,7 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
 ; GFX8-NEXT: s_cmp_lt_i32 s3, s4
 ; GFX8-NEXT: s_cselect_b32 s3, s3, s4
-; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
+; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3
 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
 ; GFX8-NEXT: s_cmp_gt_i32 s3, s1
@@ -294,7 +294,7 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX9-NEXT: s_sub_i32 s5, 0x7fff, s5
 ; GFX9-NEXT: s_cmp_lt_i32 s3, s4
 ; GFX9-NEXT: s_cselect_b32 s3, s3, s4
-; GFX9-NEXT: s_sub_i32 s3, 0x8000, s3
+; GFX9-NEXT: s_sub_i32 s3, 0xffff8000, s3
 ; GFX9-NEXT: s_sext_i32_i16 s3, s3
 ; GFX9-NEXT: s_sext_i32_i16 s1, s1
 ; GFX9-NEXT: s_cmp_gt_i32 s3, s1
@@ -322,7 +322,7 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX10-NEXT: s_sub_i32 s5, 0x7fff, s5
 ; GFX10-NEXT: s_cmp_lt_i32 s3, s4
 ; GFX10-NEXT: s_cselect_b32 s3, s3, s4
-; GFX10-NEXT: s_sub_i32 s3, 0x8000, s3
+; GFX10-NEXT: s_sub_i32 s3, 0xffff8000, s3
 ; GFX10-NEXT: s_sext_i32_i16 s3, s3
 ; GFX10-NEXT: s_cmp_gt_i32 s3, s1
 ; GFX10-NEXT: s_cselect_b32 s1, s3, s1
@@ -344,7 +344,7 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0
 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX6-NEXT: s_mov_b32 s5, 0x80000000
+; GFX6-NEXT: s_brev_b32 s5, 1
 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0
 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
@@ -379,7 +379,7 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT: v_mov_b32_e32 v2, 8
 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT: s_mov_b32 s5, 0x8000
+; GFX8-NEXT: s_movk_i32 s5, 0x8000
 ; GFX8-NEXT: v_min_i16_e32 v5, 0, v0
 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -409,7 +409,7 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT: s_mov_b32 s4, 8
 ; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT: s_mov_b32 s5, 0x8000
+; GFX9-NEXT: s_movk_i32 s5, 0x8000
 ; GFX9-NEXT: v_min_i16_e32 v5, 0, v0
 ; GFX9-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -440,7 +440,7 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT: s_mov_b32 s4, 8
 ; GFX10-NEXT: v_lshlrev_b16_e64 v2, 8, v0
 ; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: s_mov_b32 s5, 0x8000
+; GFX10-NEXT: s_movk_i32 s5, 0x8000
 ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1
 ; GFX10-NEXT: v_min_i16_e64 v4, v2, 0
@@ -483,7 +483,7 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX6-NEXT: s_cselect_b32 s6, s0, 0
 ; GFX6-NEXT: s_sub_i32 s6, s4, s6
 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0
-; GFX6-NEXT: s_mov_b32 s5, 0x80000000
+; GFX6-NEXT: s_brev_b32 s5, 1
 ; GFX6-NEXT: s_cselect_b32 s7, s0, 0
 ; GFX6-NEXT: s_sub_i32 s7, s5, s7
 ; GFX6-NEXT: s_cmp_gt_i32 s7, s1
@@ -527,7 +527,7 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT: s_cselect_b32 s9, s7, s8
 ; GFX8-NEXT: s_sub_i32 s9, s5, s9
 ; GFX8-NEXT: s_cmp_lt_i32 s7, s8
-; GFX8-NEXT: s_mov_b32 s6, 0x8000
+; GFX8-NEXT: s_movk_i32 s6, 0x8000
 ; GFX8-NEXT: s_cselect_b32 s7, s7, s8
 ; GFX8-NEXT: s_sub_i32 s7, s6, s7
 ; GFX8-NEXT: s_sext_i32_i16 s7, s7
@@ -582,7 +582,7 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-NEXT: s_cselect_b32 s9, s7, s8
 ; GFX9-NEXT: s_sub_i32 s9, s5, s9
 ; GFX9-NEXT: s_cmp_lt_i32 s7, s8
-; GFX9-NEXT: s_mov_b32 s6, 0x8000
+; GFX9-NEXT: s_movk_i32 s6, 0x8000
 ; GFX9-NEXT: s_cselect_b32 s7, s7, s8
 ; GFX9-NEXT: s_sub_i32 s7, s6, s7
 ; GFX9-NEXT: s_sext_i32_i16 s7, s7
@@ -635,7 +635,7 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX10-NEXT: s_cmp_gt_i32 s5, s6
 ; GFX10-NEXT: s_movk_i32 s7, 0x7fff
 ; GFX10-NEXT: s_cselect_b32 s8, s5, s6
-; GFX10-NEXT: s_mov_b32 s9, 0x8000
+; GFX10-NEXT: s_movk_i32 s9, 0x8000
 ; GFX10-NEXT: s_sub_i32 s8, s7, s8
 ; GFX10-NEXT: s_cmp_lt_i32 s5, s6
 ; GFX10-NEXT: s_sext_i32_i16 s1, s1
@@ -693,7 +693,7 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0
 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX6-NEXT: s_mov_b32 s5, 0x80000000
+; GFX6-NEXT: s_brev_b32 s5, 1
 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v0
 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1
 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1
@@ -726,7 +726,7 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v5
 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4
-; GFX6-NEXT: v_mov_b32_e32 v11, 0x80000000
+; GFX6-NEXT: v_bfrev_b32_e32 v11, 1
 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3
 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3
 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1
@@ -760,7 +760,7 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0
 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT: s_mov_b32 s5, 0x8000
+; GFX8-NEXT: s_movk_i32 s5, 0x8000
 ; GFX8-NEXT: v_min_i16_e32 v10, 0, v0
 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1
@@ -818,7 +818,7 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT: s_mov_b32 s5, 0x8000
+; GFX9-NEXT: s_movk_i32 s5, 0x8000
 ; GFX9-NEXT: v_min_i16_e32 v10, 0, v0
 ; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1
@@ -878,7 +878,7 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT: v_min_i16_e64 v8, v4, 0
 ; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: s_mov_b32 s5, 0x8000
+; GFX10-NEXT: s_movk_i32 s5, 0x8000
 ; GFX10-NEXT: v_min_i16_e64 v9, v2, 0
 ; GFX10-NEXT: v_lshlrev_b16_e64 v7, 8, v1
 ; GFX10-NEXT: v_sub_nc_u16_e64 v8, s5, v8
@@ -946,7 +946,7 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT: s_cselect_b32 s10, s0, 0
 ; GFX6-NEXT: s_sub_i32 s10, s8, s10
 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0
-; GFX6-NEXT: s_mov_b32 s9, 0x80000000
+; GFX6-NEXT: s_brev_b32 s9, 1
 ; GFX6-NEXT: s_cselect_b32 s11, s0, 0
 ; GFX6-NEXT: s_sub_i32 s11, s9, s11
 ; GFX6-NEXT: s_cmp_gt_i32 s11, s1
@@ -1028,7 +1028,7 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT: s_cselect_b32 s13, s11, s12
 ; GFX8-NEXT: s_sub_i32 s13, s9, s13
 ; GFX8-NEXT: s_cmp_lt_i32 s11, s12
-; GFX8-NEXT: s_mov_b32 s10, 0x8000
+; GFX8-NEXT: s_movk_i32 s10, 0x8000
 ; GFX8-NEXT: s_cselect_b32 s11, s11, s12
 ; GFX8-NEXT: s_sub_i32 s11, s10, s11
 ; GFX8-NEXT: s_sext_i32_i16 s11, s11
@@ -1133,7 +1133,7 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX9-NEXT: s_cselect_b32 s13, s11, s12
 ; GFX9-NEXT: s_sub_i32 s13, s9, s13
 ; GFX9-NEXT: s_cmp_lt_i32 s11, s12
-; GFX9-NEXT: s_mov_b32 s10, 0x8000
+; GFX9-NEXT: s_movk_i32 s10, 0x8000
 ; GFX9-NEXT: s_cselect_b32 s11, s11, s12
 ; GFX9-NEXT: s_sub_i32 s11, s10, s11
 ; GFX9-NEXT: s_sext_i32_i16 s11, s11
@@ -1236,7 +1236,7 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX10-NEXT: s_cmp_gt_i32 s9, s10
 ; GFX10-NEXT: s_movk_i32 s11, 0x7fff
 ; GFX10-NEXT: s_cselect_b32 s12, s9, s10
-; GFX10-NEXT: s_mov_b32 s13, 0x8000
+; GFX10-NEXT: s_movk_i32 s13, 0x8000
 ; GFX10-NEXT: s_sub_i32 s12, s11, s12
 ; GFX10-NEXT: s_cmp_lt_i32 s9, s10
 ; GFX10-NEXT: s_sext_i32_i16 s1, s1
@@ -1723,7 +1723,7 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v2i32:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s5, 0x80000000
+; GFX6-NEXT: s_brev_b32 s5, 1
 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0
 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5
 ; GFX6-NEXT: s_brev_b32 s4, -2
@@ -1744,7 +1744,7 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX8-LABEL: v_saddsat_v2i32:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s5, 0x80000000
+; GFX8-NEXT: s_brev_b32 s5, 1
 ; GFX8-NEXT: v_min_i32_e32 v5, 0, v0
 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s5, v5
 ; GFX8-NEXT: s_brev_b32 s4, -2
@@ -1765,7 +1765,7 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX9-LABEL: v_saddsat_v2i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s5, 0x80000000
+; GFX9-NEXT: s_brev_b32 s5, 1
 ; GFX9-NEXT: v_min_i32_e32 v5, 0, v0
 ; GFX9-NEXT: v_sub_u32_e32 v5, s5, v5
 ; GFX9-NEXT: s_brev_b32 s4, -2
@@ -1789,7 +1789,7 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_min_i32_e32 v4, 0, v0
 ; GFX10-NEXT: v_min_i32_e32 v5, 0, v1
-; GFX10-NEXT: s_mov_b32 s4, 0x80000000
+; GFX10-NEXT: s_brev_b32 s4, 1
 ; GFX10-NEXT: v_max_i32_e32 v6, 0, v0
 ; GFX10-NEXT: v_max_i32_e32 v7, 0, v1
 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, s4, v4
@@ -1817,7 +1817,7 @@ define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 ; GFX6-NEXT: s_cselect_b32 s6, s0, 0
 ; GFX6-NEXT: s_sub_i32 s6, s4, s6
 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0
-; GFX6-NEXT: s_mov_b32 s5, 0x80000000
+; GFX6-NEXT: s_brev_b32 s5, 1
 ; GFX6-NEXT: s_cselect_b32 s7, s0, 0
 ; GFX6-NEXT: s_sub_i32 s7, s5, s7
 ; GFX6-NEXT: s_cmp_gt_i32 s7, s2
@@ -1845,7 +1845,7 @@ define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 ; GFX8-NEXT: s_cselect_b32 s6, s0, 0
 ; GFX8-NEXT: s_sub_i32 s6, s4, s6
 ; GFX8-NEXT: s_cmp_lt_i32 s0, 0
-; GFX8-NEXT: s_mov_b32 s5, 0x80000000
+; GFX8-NEXT: s_brev_b32 s5, 1
 ; GFX8-NEXT: s_cselect_b32 s7, s0, 0
 ; GFX8-NEXT: s_sub_i32 s7, s5, s7
 ; GFX8-NEXT: s_cmp_gt_i32 s7, s2
@@ -1873,7 +1873,7 @@ define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 ; GFX9-NEXT: s_cselect_b32 s6, s0, 0
 ; GFX9-NEXT: s_sub_i32 s6, s4, s6
 ; GFX9-NEXT: s_cmp_lt_i32 s0, 0
-; GFX9-NEXT: s_mov_b32 s5, 0x80000000
+; GFX9-NEXT: s_brev_b32 s5, 1
 ; GFX9-NEXT: s_cselect_b32 s7, s0, 0
 ; GFX9-NEXT: s_sub_i32 s7, s5, s7
 ; GFX9-NEXT: s_cmp_gt_i32 s7, s2
@@ -1899,7 +1899,7 @@ define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 ; GFX10-NEXT: s_cmp_gt_i32 s0, 0
 ; GFX10-NEXT: s_brev_b32 s4, -2
 ; GFX10-NEXT: s_cselect_b32 s5, s0, 0
-; GFX10-NEXT: s_mov_b32 s6, 0x80000000
+; GFX10-NEXT: s_brev_b32 s6, 1
 ; GFX10-NEXT: s_sub_i32 s5, s4, s5
 ; GFX10-NEXT: s_cmp_lt_i32 s0, 0
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
@@ -1930,7 +1930,7 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v3i32:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s5, 0x80000000
+; GFX6-NEXT: s_brev_b32 s5, 1
 ; GFX6-NEXT: v_min_i32_e32 v7, 0, v0
 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, s5, v7
 ; GFX6-NEXT: s_brev_b32 s4, -2
@@ -1958,7 +1958,7 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX8-LABEL: v_saddsat_v3i32:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s5, 0x80000000
+; GFX8-NEXT: s_brev_b32 s5, 1
 ; GFX8-NEXT: v_min_i32_e32 v7, 0, v0
 ; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s5, v7
 ; GFX8-NEXT: s_brev_b32 s4, -2
@@ -1986,7 +1986,7 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX9-LABEL: v_saddsat_v3i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s5, 0x80000000
+; GFX9-NEXT: s_brev_b32 s5, 1
 ; GFX9-NEXT: v_min_i32_e32 v7, 0, v0
 ; GFX9-NEXT: v_sub_u32_e32 v7, s5, v7
 ; GFX9-NEXT: s_brev_b32 s4, -2
@@ -2018,7 +2018,7 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX10-NEXT: v_min_i32_e32 v7, 0, v0
 ; GFX10-NEXT: v_min_i32_e32 v8, 0, v1
 ; GFX10-NEXT: v_min_i32_e32 v9, 0, v2
-; GFX10-NEXT: s_mov_b32 s5, 0x80000000
+; GFX10-NEXT: s_brev_b32 s5, 1
 ; GFX10-NEXT: v_max_i32_e32 v6, 0, v0
 ; GFX10-NEXT: v_sub_nc_u32_e32 v14, s5, v7
 ; GFX10-NEXT: v_sub_nc_u32_e32 v15, s5, v8
@@ -2052,7 +2052,7 @@ define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 ; GFX6-NEXT: s_cselect_b32 s8, s0, 0
 ; GFX6-NEXT: s_sub_i32 s8, s6, s8
 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0
-; GFX6-NEXT: s_mov_b32 s7, 0x80000000
+; GFX6-NEXT: s_brev_b32 s7, 1
 ; GFX6-NEXT: s_cselect_b32 s9, s0, 0
 ; GFX6-NEXT: s_sub_i32 s9, s7, s9
 ; GFX6-NEXT: s_cmp_gt_i32 s9, s3
@@ -2091,7 +2091,7 @@ define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 ; GFX8-NEXT: s_cselect_b32 s8, s0, 0
 ; GFX8-NEXT: s_sub_i32 s8, s6, s8
 ; GFX8-NEXT: s_cmp_lt_i32 s0, 0
-; GFX8-NEXT: s_mov_b32 s7, 0x80000000
+; GFX8-NEXT: s_brev_b32 s7, 1
 ; GFX8-NEXT: s_cselect_b32 s9, s0, 0
 ; GFX8-NEXT: s_sub_i32 s9, s7, s9
 ; GFX8-NEXT: s_cmp_gt_i32 s9, s3
@@ -2130,7 +2130,7 @@ define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 ; GFX9-NEXT: s_cselect_b32 s8, s0, 0
 ; GFX9-NEXT: s_sub_i32 s8, s6, s8
 ; GFX9-NEXT: s_cmp_lt_i32 s0, 0
-; GFX9-NEXT: s_mov_b32 s7, 0x80000000
+; GFX9-NEXT: s_brev_b32 s7, 1
 ; GFX9-NEXT: s_cselect_b32 s9, s0, 0
 ; GFX9-NEXT: s_sub_i32 s9, s7, s9
 ; GFX9-NEXT: s_cmp_gt_i32 s9, s3
@@ -2167,7 +2167,7 @@ define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 ; GFX10-NEXT: s_cmp_gt_i32 s0, 0
 ; GFX10-NEXT: s_brev_b32 s6, -2
 ; GFX10-NEXT: s_cselect_b32 s7, s0, 0
-; GFX10-NEXT: s_mov_b32 s8, 0x80000000
+; GFX10-NEXT: s_brev_b32 s8, 1
 ; GFX10-NEXT: s_sub_i32 s7, s6, s7
 ; GFX10-NEXT: s_cmp_lt_i32 s0, 0
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
@@ -2209,7 +2209,7 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v4i32:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s5, 0x80000000
+; GFX6-NEXT: s_brev_b32 s5, 1
 ; GFX6-NEXT: v_min_i32_e32 v9, 0, v0
 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, s5, v9
 ; GFX6-NEXT: s_brev_b32 s4, -2
@@ -2244,7 +2244,7 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX8-LABEL: v_saddsat_v4i32:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s5, 0x80000000
+; GFX8-NEXT: s_brev_b32 s5, 1
 ; GFX8-NEXT: v_min_i32_e32 v9, 0, v0
 ; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s5, v9
 ; GFX8-NEXT: s_brev_b32 s4, -2
@@ -2279,7 +2279,7 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX9-LABEL: v_saddsat_v4i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s5, 0x80000000
+; GFX9-NEXT: s_brev_b32 s5, 1
 ; GFX9-NEXT: v_min_i32_e32 v9, 0, v0
 ; GFX9-NEXT: v_sub_u32_e32 v9, s5, v9
 ; GFX9-NEXT: s_brev_b32 s4, -2
@@ -2316,7 +2316,7 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_min_i32_e32 v8, 0, v0
-; GFX10-NEXT: s_mov_b32 s4, 0x80000000
+; GFX10-NEXT: s_brev_b32 s4, 1
 ; GFX10-NEXT: v_min_i32_e32 v11, 0, v1
 ; GFX10-NEXT: v_min_i32_e32 v12, 0, v3
 ; GFX10-NEXT: v_max_i32_e32 v9, 0, v0
@@ -2359,7 +2359,7 @@ define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 ; GFX6-NEXT: s_cselect_b32 s10, s0, 0
 ; GFX6-NEXT: s_sub_i32 s10, s8, s10
 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0
-; GFX6-NEXT: s_mov_b32 s9, 0x80000000
+; GFX6-NEXT: s_brev_b32 s9, 1
 ; GFX6-NEXT: s_cselect_b32 s11, s0, 0
 ; GFX6-NEXT: s_sub_i32 s11, s9, s11
 ; GFX6-NEXT: s_cmp_gt_i32 s11, s4
@@ -2409,7 +2409,7 @@ define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 ; GFX8-NEXT: s_cselect_b32 s10, s0, 0
 ; GFX8-NEXT: s_sub_i32 s10, s8, s10
 ; GFX8-NEXT: s_cmp_lt_i32 s0, 0
-; GFX8-NEXT: s_mov_b32 s9, 0x80000000
+; GFX8-NEXT: s_brev_b32 s9, 1
 ; GFX8-NEXT: s_cselect_b32 s11, s0, 0
 ; GFX8-NEXT: s_sub_i32 s11, s9, s11
 ; GFX8-NEXT: s_cmp_gt_i32 s11, s4
@@ -2459,7 +2459,7 @@ define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 ; GFX9-NEXT: s_cselect_b32 s10, s0, 0
 ; GFX9-NEXT: s_sub_i32 s10, s8, s10
 ; GFX9-NEXT: s_cmp_lt_i32 s0, 0
-; GFX9-NEXT: s_mov_b32 s9, 0x80000000
+; GFX9-NEXT: s_brev_b32 s9, 1
 ; GFX9-NEXT: s_cselect_b32 s11, s0, 0
 ; GFX9-NEXT: s_sub_i32 s11, s9, s11
 ; GFX9-NEXT: s_cmp_gt_i32 s11, s4
@@ -2507,7 +2507,7 @@ define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 ; GFX10-NEXT: s_cmp_gt_i32 s0, 0
 ; GFX10-NEXT: s_brev_b32 s8, -2
 ; GFX10-NEXT: s_cselect_b32 s9, s0, 0
-; GFX10-NEXT: s_mov_b32 s10, 0x80000000
+; GFX10-NEXT: s_brev_b32 s10, 1
 ; GFX10-NEXT: s_sub_i32 s9, s8, s9
 ; GFX10-NEXT: s_cmp_lt_i32 s0, 0
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
@@ -2560,7 +2560,7 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v5i32:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s5, 0x80000000
+; GFX6-NEXT: s_brev_b32 s5, 1
 ; GFX6-NEXT: v_min_i32_e32 v12, 0, v0
 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s5, v12
 ; GFX6-NEXT: s_brev_b32 s4, -2
@@ -2582,7 +2582,7 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5
 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v7
 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5
-; GFX6-NEXT: v_mov_b32_e32 v13, 0x80000000
+; GFX6-NEXT: v_bfrev_b32_e32 v13, 1
 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3
 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v13, v6
 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
@@ -2604,7 +2604,7 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX8-LABEL: v_saddsat_v5i32:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s5, 0x80000000
+; GFX8-NEXT: s_brev_b32 s5, 1
 ; GFX8-NEXT: v_min_i32_e32 v12, 0, v0
 ; GFX8-NEXT: v_sub_u32_e32 v12, vcc, s5, v12
 ; GFX8-NEXT: s_brev_b32 s4, -2
@@ -2626,7 +2626,7 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s4, v5
 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v7
 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5
-; GFX8-NEXT: v_mov_b32_e32 v13, 0x80000000
+; GFX8-NEXT: v_bfrev_b32_e32 v13, 1
 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v3
 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v13, v6
 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
@@ -2648,7 +2648,7 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX9-LABEL: v_saddsat_v5i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s5, 0x80000000
+; GFX9-NEXT: s_brev_b32 s5, 1
 ; GFX9-NEXT: v_min_i32_e32 v12, 0, v0
 ; GFX9-NEXT: v_sub_u32_e32 v12, s5, v12
 ; GFX9-NEXT: s_brev_b32 s4, -2
@@ -2670,7 +2670,7 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX9-NEXT: v_sub_u32_e32 v5, s4, v5
 ; GFX9-NEXT: v_max_i32_e32 v6, v6, v7
 ; GFX9-NEXT: v_min_i32_e32 v5, v6, v5
-; GFX9-NEXT: v_mov_b32_e32 v13, 0x80000000
+; GFX9-NEXT: v_bfrev_b32_e32 v13, 1
 ; GFX9-NEXT: v_min_i32_e32 v6, 0, v3
 ; GFX9-NEXT: v_sub_u32_e32 v6, v13, v6
 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5
@@ -2694,10 +2694,10 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_min_i32_e32 v13, 0, v1
-; GFX10-NEXT: s_mov_b32 s5, 0x80000000
+; GFX10-NEXT: s_brev_b32 s5, 1
 ; GFX10-NEXT: v_min_i32_e32 v10, 0, v0
 ; GFX10-NEXT: v_min_i32_e32 v16, 0, v2
-; GFX10-NEXT: v_mov_b32_e32 v15, 0x80000000
+; GFX10-NEXT: v_bfrev_b32_e32 v15, 1
 ; GFX10-NEXT: v_sub_nc_u32_e32 v13, s5, v13
 ; GFX10-NEXT: v_min_i32_e32 v17, 0, v4
 ; GFX10-NEXT: v_sub_nc_u32_e32 v10, s5, v10
@@ -2746,7 +2746,7 @@ define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 ; GFX6-NEXT: s_cselect_b32 s12, s0, 0
 ; GFX6-NEXT: s_sub_i32 s12, s10, s12
 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0
-; GFX6-NEXT: s_mov_b32 s11, 0x80000000
+; GFX6-NEXT: s_brev_b32 s11, 1
 ; GFX6-NEXT: s_cselect_b32 s13, s0, 0
 ; GFX6-NEXT: s_sub_i32 s13, s11, s13
 ; GFX6-NEXT: s_cmp_gt_i32 s13, s5
@@ -2807,7 +2807,7 @@ define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 ; GFX8-NEXT: s_cselect_b32 s12, s0, 0
 ; GFX8-NEXT: s_sub_i32 s12, s10, s12
 ; GFX8-NEXT: s_cmp_lt_i32 s0, 0
-; GFX8-NEXT: s_mov_b32 s11, 0x80000000
+; GFX8-NEXT: s_brev_b32 s11, 1
 ; GFX8-NEXT: s_cselect_b32 s13, s0, 0
 ; GFX8-NEXT: s_sub_i32 s13, s11, s13
 ; GFX8-NEXT: s_cmp_gt_i32 s13, s5
@@ -2868,7 +2868,7 @@ define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 ; GFX9-NEXT: s_cselect_b32 s12, s0, 0
 ; GFX9-NEXT: s_sub_i32 s12, s10, s12
 ; GFX9-NEXT: s_cmp_lt_i32 s0, 0
-; GFX9-NEXT: s_mov_b32 s11, 0x80000000
+; GFX9-NEXT: s_brev_b32 s11, 1
 ; GFX9-NEXT: s_cselect_b32 s13, s0, 0
 ; GFX9-NEXT: s_sub_i32 s13, s11, s13
 ; GFX9-NEXT: s_cmp_gt_i32 s13, s5
@@ -2927,7 +2927,7 @@ define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 ; GFX10-NEXT: s_cmp_gt_i32 s0, 0
 ; GFX10-NEXT: s_brev_b32 s10, -2
 ; GFX10-NEXT: s_cselect_b32 s11, s0, 0
-; GFX10-NEXT: s_mov_b32 s12, 0x80000000
+; GFX10-NEXT: s_brev_b32 s12, 1
 ; GFX10-NEXT: s_sub_i32 s11, s10, s11
 ; GFX10-NEXT: s_cmp_lt_i32 s0, 0
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
@@ -2991,7 +2991,7 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v16i32:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s4, 0x80000000
+; GFX6-NEXT: s_brev_b32 s4, 1
 ; GFX6-NEXT: v_min_i32_e32 v32, 0, v0
 ; GFX6-NEXT: v_sub_i32_e32 v32, vcc, s4, v32
 ; GFX6-NEXT: v_max_i32_e32 v16, v32, v16
@@ -3014,7 +3014,7 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, s5, v17
 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v16
-; GFX6-NEXT: v_mov_b32_e32 v16, 0x80000000
+; GFX6-NEXT: v_bfrev_b32_e32 v16, 1
 ; GFX6-NEXT: v_min_i32_e32 v17, 0, v3
 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v19
@@ -3112,7 +3112,7 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-LABEL: v_saddsat_v16i32:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s4, 0x80000000
+; GFX8-NEXT: s_brev_b32 s4, 1
 ; GFX8-NEXT: v_min_i32_e32 v32, 0, v0
 ; GFX8-NEXT: v_sub_u32_e32 v32, vcc, s4, v32
 ; GFX8-NEXT: v_max_i32_e32 v16, v32, v16
@@ -3135,7 +3135,7 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, s5, v17
 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v16
-; GFX8-NEXT: v_mov_b32_e32 v16, 0x80000000
+;
GFX8-NEXT: v_bfrev_b32_e32 v16, 1 ; GFX8-NEXT: v_min_i32_e32 v17, 0, v3 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 @@ -3233,7 +3233,7 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX9-LABEL: v_saddsat_v16i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0x80000000 +; GFX9-NEXT: s_brev_b32 s4, 1 ; GFX9-NEXT: v_min_i32_e32 v32, 0, v0 ; GFX9-NEXT: v_sub_u32_e32 v32, s4, v32 ; GFX9-NEXT: v_max_i32_e32 v16, v32, v16 @@ -3256,7 +3256,7 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX9-NEXT: v_sub_u32_e32 v17, s5, v17 ; GFX9-NEXT: v_min_i32_e32 v16, v16, v17 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v16 -; GFX9-NEXT: v_mov_b32_e32 v16, 0x80000000 +; GFX9-NEXT: v_bfrev_b32_e32 v16, 1 ; GFX9-NEXT: v_min_i32_e32 v17, 0, v3 ; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17 ; GFX9-NEXT: v_max_i32_e32 v17, v17, v19 @@ -3356,7 +3356,7 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_min_i32_e32 v32, 0, v0 -; GFX10-NEXT: s_mov_b32 s4, 0x80000000 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_max_i32_e32 v33, 0, v0 ; GFX10-NEXT: s_brev_b32 s5, -2 ; GFX10-NEXT: v_min_i32_e32 v36, 0, v2 @@ -3367,7 +3367,7 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX10-NEXT: v_sub_nc_u32_e32 v36, s4, v36 ; GFX10-NEXT: v_max_i32_e32 v16, v35, v16 ; GFX10-NEXT: v_sub_nc_u32_e32 v32, s4, v32 -; GFX10-NEXT: v_mov_b32_e32 v35, 0x80000000 +; GFX10-NEXT: v_bfrev_b32_e32 v35, 1 ; GFX10-NEXT: v_min_i32_e32 v38, 0, v3 ; GFX10-NEXT: v_max_i32_e32 v18, v36, v18 ; GFX10-NEXT: v_min_i32_e32 v16, v16, v33 @@ -3485,7 +3485,7 @@ define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; GFX6-NEXT: s_cselect_b32 s34, s0, 0 ; GFX6-NEXT: s_sub_i32 s34, s32, s34 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s33, 0x80000000 +; GFX6-NEXT: s_brev_b32 s33, 1 ; GFX6-NEXT: s_cselect_b32 s35, s0, 0 ; GFX6-NEXT: s_sub_i32 s35, s33, s35 ; GFX6-NEXT: s_cmp_gt_i32 s35, s16 @@ -3667,7 +3667,7 @@ define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; GFX8-NEXT: s_cselect_b32 s34, s0, 0 ; GFX8-NEXT: s_sub_i32 s34, s32, s34 ; GFX8-NEXT: s_cmp_lt_i32 s0, 0 -; GFX8-NEXT: s_mov_b32 s33, 0x80000000 +; GFX8-NEXT: s_brev_b32 s33, 1 ; GFX8-NEXT: s_cselect_b32 s35, s0, 0 ; GFX8-NEXT: s_sub_i32 s35, s33, s35 ; GFX8-NEXT: s_cmp_gt_i32 s35, s16 @@ -3849,7 +3849,7 @@ define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; GFX9-NEXT: s_cselect_b32 s34, s0, 0 ; GFX9-NEXT: s_sub_i32 s34, s32, s34 ; GFX9-NEXT: s_cmp_lt_i32 s0, 0 -; GFX9-NEXT: s_mov_b32 s33, 0x80000000 +; GFX9-NEXT: s_brev_b32 s33, 1 ; GFX9-NEXT: s_cselect_b32 s35, s0, 0 ; GFX9-NEXT: s_sub_i32 s35, s33, s35 ; GFX9-NEXT: s_cmp_gt_i32 s35, s16 @@ -4029,7 +4029,7 @@ define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; GFX10-NEXT: s_cmp_gt_i32 s0, 0 ; GFX10-NEXT: s_brev_b32 s32, -2 ; GFX10-NEXT: s_cselect_b32 s33, s0, 0 -; GFX10-NEXT: s_mov_b32 s34, 0x80000000 +; GFX10-NEXT: s_brev_b32 s34, 1 ; GFX10-NEXT: s_sub_i32 s46, s32, s33 ; GFX10-NEXT: s_cmp_lt_i32 s0, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi @@ -4295,7 +4295,7 @@ define amdgpu_ps i16 @s_saddsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_cmp_lt_i32 s2, s3 ; GFX8-NEXT: s_cselect_b32 s2, 
s2, s3 -; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2 +; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_cmp_gt_i32 s2, s1 @@ -4316,7 +4316,7 @@ define amdgpu_ps i16 @s_saddsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX9-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX9-NEXT: s_cmp_lt_i32 s2, s3 ; GFX9-NEXT: s_cselect_b32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s2, 0x8000, s2 +; GFX9-NEXT: s_sub_i32 s2, 0xffff8000, s2 ; GFX9-NEXT: s_sext_i32_i16 s2, s2 ; GFX9-NEXT: s_sext_i32_i16 s1, s1 ; GFX9-NEXT: s_cmp_gt_i32 s2, s1 @@ -4339,7 +4339,7 @@ define amdgpu_ps i16 @s_saddsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX10-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX10-NEXT: s_cmp_lt_i32 s3, s2 ; GFX10-NEXT: s_cselect_b32 s2, s3, s2 -; GFX10-NEXT: s_sub_i32 s2, 0x8000, s2 +; GFX10-NEXT: s_sub_i32 s2, 0xffff8000, s2 ; GFX10-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-NEXT: s_cmp_gt_i32 s2, s1 ; GFX10-NEXT: s_cselect_b32 s1, s2, s1 @@ -4379,7 +4379,7 @@ define amdgpu_ps half @saddsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3 ; GFX8-NEXT: s_cmp_lt_i32 s1, s2 ; GFX8-NEXT: s_cselect_b32 s1, s1, s2 -; GFX8-NEXT: s_sub_i32 s1, 0x8000, s1 +; GFX8-NEXT: s_sub_i32 s1, 0xffff8000, s1 ; GFX8-NEXT: v_max_i16_e32 v0, s1, v0 ; GFX8-NEXT: v_min_i16_e32 v0, s3, v0 ; GFX8-NEXT: v_add_u16_e32 v0, s0, v0 @@ -4394,7 +4394,7 @@ define amdgpu_ps half @saddsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; GFX9-NEXT: s_sub_i32 s3, 0x7fff, s3 ; GFX9-NEXT: s_cmp_lt_i32 s1, s2 ; GFX9-NEXT: s_cselect_b32 s1, s1, s2 -; GFX9-NEXT: s_sub_i32 s1, 0x8000, s1 +; GFX9-NEXT: s_sub_i32 s1, 0xffff8000, s1 ; GFX9-NEXT: v_max_i16_e32 v0, s1, v0 ; GFX9-NEXT: v_min_i16_e32 v0, s3, v0 ; GFX9-NEXT: v_add_u16_e32 v0, s0, v0 @@ -4410,7 +4410,7 @@ define amdgpu_ps half @saddsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; GFX10-NEXT: s_sub_i32 s3, 0x7fff, s3 ; GFX10-NEXT: s_cmp_lt_i32 s1, s2 ; GFX10-NEXT: s_cselect_b32 s1, s1, s2 -; GFX10-NEXT: s_sub_i32 s1, 0x8000, s1 +; GFX10-NEXT: s_sub_i32 s1, 0xffff8000, s1 ; GFX10-NEXT: v_max_i16_e64 v0, s1, v0 ; GFX10-NEXT: v_min_i16_e64 v0, v0, s3 ; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, v0 @@ -4478,7 +4478,7 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 @@ -4504,7 +4504,7 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-LABEL: v_saddsat_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s5, 0x8000 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v4, 0, v0 ; GFX8-NEXT: v_sub_u16_e32 v4, s5, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4527,7 +4527,7 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX9-LABEL: v_saddsat_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s5, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s5, 0x8000 ; GFX9-NEXT: s_pack_ll_b32_b16 s6, 0, 0 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 @@ -4546,7 +4546,7 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_pack_ll_b32_b16 s4, 0, 0 -; 
GFX10-NEXT: s_mov_b32 s5, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s5, 0x8000 ; GFX10-NEXT: v_pk_min_i16 v2, v0, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s5 ; GFX10-NEXT: v_pk_max_i16 v3, v0, s4 @@ -4573,7 +4573,7 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX6-NEXT: s_cselect_b32 s6, s0, 0 ; GFX6-NEXT: s_sub_i32 s6, s4, s6 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: s_cselect_b32 s7, s0, 0 ; GFX6-NEXT: s_sub_i32 s7, s5, s7 ; GFX6-NEXT: s_cmp_gt_i32 s7, s2 @@ -4614,7 +4614,7 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX8-NEXT: s_cselect_b32 s8, s6, s7 ; GFX8-NEXT: s_sub_i32 s8, s4, s8 ; GFX8-NEXT: s_cmp_lt_i32 s6, s7 -; GFX8-NEXT: s_mov_b32 s5, 0x8000 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: s_cselect_b32 s6, s6, s7 ; GFX8-NEXT: s_sub_i32 s6, s5, s6 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 @@ -4669,7 +4669,7 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX9-NEXT: s_cmp_lt_i32 s5, s7 ; GFX9-NEXT: s_cselect_b32 s5, s5, s7 ; GFX9-NEXT: s_cmp_lt_i32 s6, s4 -; GFX9-NEXT: s_mov_b32 s3, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s3, 0x8000 ; GFX9-NEXT: s_cselect_b32 s4, s6, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s3 @@ -4726,7 +4726,7 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX10-NEXT: s_cmp_lt_i32 s3, s5 ; GFX10-NEXT: s_cselect_b32 s3, s3, s5 ; GFX10-NEXT: s_cmp_lt_i32 s4, s2 -; GFX10-NEXT: s_mov_b32 s5, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s5, 0x8000 ; GFX10-NEXT: s_cselect_b32 s2, s4, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s4, s5, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s2 @@ -4774,7 +4774,7 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX6-NEXT: s_cselect_b32 s4, s0, 0 ; GFX6-NEXT: s_sub_i32 s4, s2, s4 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s3, 0x80000000 +; GFX6-NEXT: s_brev_b32 s3, 1 ; GFX6-NEXT: s_cselect_b32 s5, s0, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_sub_i32 s5, s3, s5 @@ -4811,7 +4811,7 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX8-NEXT: s_cselect_b32 s6, s4, s5 ; GFX8-NEXT: s_sub_i32 s6, s2, s6 ; GFX8-NEXT: s_cmp_lt_i32 s4, s5 -; GFX8-NEXT: s_mov_b32 s3, 0x8000 +; GFX8-NEXT: s_movk_i32 s3, 0x8000 ; GFX8-NEXT: s_cselect_b32 s4, s4, s5 ; GFX8-NEXT: s_sub_i32 s4, s3, s4 ; GFX8-NEXT: v_max_i16_e32 v1, s4, v0 @@ -4853,7 +4853,7 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX9-NEXT: s_cmp_lt_i32 s4, s6 ; GFX9-NEXT: s_cselect_b32 s4, s4, s6 ; GFX9-NEXT: s_cmp_lt_i32 s5, s3 -; GFX9-NEXT: s_mov_b32 s2, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s2, 0x8000 ; GFX9-NEXT: s_cselect_b32 s3, s5, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s2 @@ -4890,7 +4890,7 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX10-NEXT: s_cmp_lt_i32 s2, s4 ; GFX10-NEXT: s_cselect_b32 s2, s2, s4 ; GFX10-NEXT: s_cmp_lt_i32 s3, s1 -; GFX10-NEXT: s_mov_b32 s4, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s4, 0x8000 ; GFX10-NEXT: s_cselect_b32 s1, s3, s1 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s4, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s2, s1 @@ -4913,7 +4913,7 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: saddsat_v2i16_vs: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshlrev_b32_e32 
v0, 16, v0 -; GFX6-NEXT: s_mov_b32 s3, 0x80000000 +; GFX6-NEXT: s_brev_b32 s3, 1 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3 @@ -4943,7 +4943,7 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX8-LABEL: saddsat_v2i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s3, 0x8000 +; GFX8-NEXT: s_movk_i32 s3, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v3, 0, v0 ; GFX8-NEXT: v_sub_u16_e32 v3, s3, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -4966,7 +4966,7 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX9-LABEL: saddsat_v2i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s2, 0x8000 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, 0, 0 ; GFX9-NEXT: s_movk_i32 s1, 0x7fff ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s2 @@ -4983,7 +4983,7 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX10-LABEL: saddsat_v2i16_vs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_pack_ll_b32_b16 s1, 0, 0 -; GFX10-NEXT: s_mov_b32 s2, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s2, 0x8000 ; GFX10-NEXT: v_pk_min_i16 v1, v0, s1 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s2 ; GFX10-NEXT: v_pk_max_i16 v2, v0, s1 @@ -5017,7 +5017,7 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 @@ -5046,7 +5046,7 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 -; GFX6-NEXT: v_mov_b32_e32 v11, 0x80000000 +; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 @@ -5074,7 +5074,7 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-LABEL: v_saddsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s5, 0x8000 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v7, 0, v0 ; GFX8-NEXT: v_sub_u16_e32 v7, s5, v7 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 @@ -5113,7 +5113,7 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX9-LABEL: v_saddsat_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s5, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s5, 0x8000 ; GFX9-NEXT: s_pack_ll_b32_b16 s6, 0, 0 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 @@ -5139,7 +5139,7 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_pack_ll_b32_b16 s5, 0, 0 -; GFX10-NEXT: s_mov_b32 s6, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s6, 0x8000 ; GFX10-NEXT: v_pk_min_i16 v4, v0, s5 ; GFX10-NEXT: v_pk_min_i16 v5, v1, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 @@ -5174,7 +5174,7 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX6-NEXT: s_cselect_b32 s10, s0, 0 ; GFX6-NEXT: s_sub_i32 s10, s8, s10 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s9, 0x80000000 +; GFX6-NEXT: s_brev_b32 s9, 1 ; 
GFX6-NEXT: s_cselect_b32 s11, s0, 0 ; GFX6-NEXT: s_sub_i32 s11, s9, s11 ; GFX6-NEXT: s_cmp_gt_i32 s11, s4 @@ -5249,7 +5249,7 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX8-NEXT: s_cselect_b32 s12, s10, s11 ; GFX8-NEXT: s_sub_i32 s12, s8, s12 ; GFX8-NEXT: s_cmp_lt_i32 s10, s11 -; GFX8-NEXT: s_mov_b32 s9, 0x8000 +; GFX8-NEXT: s_movk_i32 s9, 0x8000 ; GFX8-NEXT: s_cselect_b32 s10, s10, s11 ; GFX8-NEXT: s_sub_i32 s10, s9, s10 ; GFX8-NEXT: s_sext_i32_i16 s10, s10 @@ -5340,7 +5340,7 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX9-NEXT: s_cmp_lt_i32 s7, s9 ; GFX9-NEXT: s_cselect_b32 s7, s7, s9 ; GFX9-NEXT: s_cmp_lt_i32 s8, s6 -; GFX9-NEXT: s_mov_b32 s5, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s5, 0x8000 ; GFX9-NEXT: s_cselect_b32 s8, s8, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 @@ -5431,7 +5431,7 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX10-NEXT: s_cmp_gt_i32 s6, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s9 ; GFX10-NEXT: s_cselect_b32 s10, s6, s4 -; GFX10-NEXT: s_mov_b32 s12, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s12, 0x8000 ; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s10 ; GFX10-NEXT: s_lshr_b32 s10, s9, 16 ; GFX10-NEXT: s_lshr_b32 s11, s8, 16 @@ -5538,7 +5538,7 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v14, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, s5, v14 @@ -5567,7 +5567,7 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 -; GFX6-NEXT: v_mov_b32_e32 v15, 0x80000000 +; GFX6-NEXT: v_bfrev_b32_e32 v15, 1 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v3 @@ -5619,7 +5619,7 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX8-LABEL: v_saddsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s5, 0x8000 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v11, 0, v0 ; GFX8-NEXT: v_sub_u16_e32 v11, s5, v11 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 @@ -5646,7 +5646,7 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX8-NEXT: v_sub_u16_e32 v14, s5, v14 ; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_e32 v13, s4, v13 -; GFX8-NEXT: v_mov_b32_e32 v12, 0x8000 +; GFX8-NEXT: v_mov_b32_e32 v12, 0xffff8000 ; GFX8-NEXT: v_min_i16_e32 v14, 0, v2 ; GFX8-NEXT: v_sub_u16_e32 v14, v12, v14 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v13 @@ -5676,7 +5676,7 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX9-LABEL: v_saddsat_v6i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s5, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s5, 0x8000 ; GFX9-NEXT: s_pack_ll_b32_b16 s6, 0, 0 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 @@ -5709,7 +5709,7 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_pack_ll_b32_b16 s5, 0, 0 -; GFX10-NEXT: s_mov_b32 s6, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s6, 0x8000 ; GFX10-NEXT: v_pk_min_i16 v7, v0, s5 ; GFX10-NEXT: v_pk_min_i16 v8, v1, s5 ; GFX10-NEXT: v_pk_min_i16 v9, v2, s5 @@ -5751,7 +5751,7 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX6-NEXT: s_cselect_b32 s14, s0, 0 ; GFX6-NEXT: s_sub_i32 s14, s12, s14 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s13, 0x80000000 +; GFX6-NEXT: s_brev_b32 s13, 1 ; GFX6-NEXT: s_cselect_b32 s15, s0, 0 ; GFX6-NEXT: s_sub_i32 s15, s13, s15 ; GFX6-NEXT: s_cmp_gt_i32 s15, s6 @@ -5860,7 +5860,7 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_cselect_b32 s16, s14, s15 ; GFX8-NEXT: s_sub_i32 s16, s12, s16 ; GFX8-NEXT: s_cmp_lt_i32 s14, s15 -; GFX8-NEXT: s_mov_b32 s13, 0x8000 +; GFX8-NEXT: s_movk_i32 s13, 0x8000 ; GFX8-NEXT: s_cselect_b32 s14, s14, s15 ; GFX8-NEXT: s_sub_i32 s14, s13, s14 ; GFX8-NEXT: s_sext_i32_i16 s14, s14 @@ -5987,7 +5987,7 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX9-NEXT: s_cmp_lt_i32 s9, s11 ; GFX9-NEXT: s_cselect_b32 s9, s9, s11 ; GFX9-NEXT: s_cmp_lt_i32 s10, s8 -; GFX9-NEXT: s_mov_b32 s7, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s7, 0x8000 ; GFX9-NEXT: s_cselect_b32 s10, s10, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 @@ -6121,7 +6121,7 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX10-NEXT: s_cmp_gt_i32 s8, s6 ; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s11 ; GFX10-NEXT: s_cselect_b32 s12, s8, s6 -; GFX10-NEXT: s_mov_b32 s14, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s14, 0x8000 ; GFX10-NEXT: s_pack_ll_b32_b16 s10, s10, s12 ; GFX10-NEXT: s_lshr_b32 s12, s11, 16 ; GFX10-NEXT: s_lshr_b32 s13, s10, 16 @@ -6260,7 +6260,7 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v18, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX6-NEXT: v_sub_i32_e32 v18, vcc, s5, v18 @@ -6289,7 +6289,7 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 -; GFX6-NEXT: v_mov_b32_e32 v19, 0x80000000 +; GFX6-NEXT: v_bfrev_b32_e32 v19, 1 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v3 @@ -6365,7 +6365,7 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX8-LABEL: v_saddsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s5, 0x8000 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v14, 0, v0 ; GFX8-NEXT: v_sub_u16_e32 v14, s5, v14 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 @@ -6392,7 +6392,7 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX8-NEXT: v_sub_u16_e32 v17, s5, v17 ; GFX8-NEXT: v_max_i16_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_e32 v16, s4, v16 -; GFX8-NEXT: v_mov_b32_e32 v15, 0x8000 +; GFX8-NEXT: v_mov_b32_e32 v15, 0xffff8000 ; GFX8-NEXT: v_min_i16_e32 v17, 0, v2 ; GFX8-NEXT: v_sub_u16_e32 v17, v15, v17 ; GFX8-NEXT: v_lshrrev_b32_e32 
v10, 16, v2 @@ -6438,7 +6438,7 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX9-LABEL: v_saddsat_v8i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s5, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s5, 0x8000 ; GFX9-NEXT: s_pack_ll_b32_b16 s6, 0, 0 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 @@ -6478,7 +6478,7 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_pack_ll_b32_b16 s4, 0, 0 -; GFX10-NEXT: s_mov_b32 s5, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s5, 0x8000 ; GFX10-NEXT: v_pk_min_i16 v8, v0, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s5 ; GFX10-NEXT: v_pk_min_i16 v11, v1, s4 @@ -6527,7 +6527,7 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX6-NEXT: s_cselect_b32 s18, s0, 0 ; GFX6-NEXT: s_sub_i32 s18, s16, s18 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s17, 0x80000000 +; GFX6-NEXT: s_brev_b32 s17, 1 ; GFX6-NEXT: s_cselect_b32 s19, s0, 0 ; GFX6-NEXT: s_sub_i32 s19, s17, s19 ; GFX6-NEXT: s_cmp_gt_i32 s19, s8 @@ -6670,7 +6670,7 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_cselect_b32 s20, s18, s19 ; GFX8-NEXT: s_sub_i32 s20, s16, s20 ; GFX8-NEXT: s_cmp_lt_i32 s18, s19 -; GFX8-NEXT: s_mov_b32 s17, 0x8000 +; GFX8-NEXT: s_movk_i32 s17, 0x8000 ; GFX8-NEXT: s_cselect_b32 s18, s18, s19 ; GFX8-NEXT: s_sub_i32 s18, s17, s18 ; GFX8-NEXT: s_sext_i32_i16 s18, s18 @@ -6833,7 +6833,7 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX9-NEXT: s_cmp_lt_i32 s11, s13 ; GFX9-NEXT: s_cselect_b32 s11, s11, s13 ; GFX9-NEXT: s_cmp_lt_i32 s12, s10 -; GFX9-NEXT: s_mov_b32 s9, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s9, 0x8000 ; GFX9-NEXT: s_cselect_b32 s12, s12, s10 ; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s9 ; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 @@ -7010,7 +7010,7 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX10-NEXT: s_cmp_gt_i32 s10, s8 ; GFX10-NEXT: s_pack_ll_b32_b16 s13, s13, s13 ; GFX10-NEXT: s_cselect_b32 s14, s10, s8 -; GFX10-NEXT: s_mov_b32 s16, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s16, 0x8000 ; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s14 ; GFX10-NEXT: s_lshr_b32 s14, s13, 16 ; GFX10-NEXT: s_lshr_b32 s15, s12, 16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll index f68465faf61cd..57737aeb886fa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -399,7 +399,7 @@ define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) { ; CGP-NEXT: s_movk_i32 s4, 0x1000 ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; CGP-NEXT: v_mov_b32_e32 v3, 0x1000 -; CGP-NEXT: s_mov_b32 s5, 0xfffff000 +; CGP-NEXT: s_movk_i32 s5, 0xf000 ; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 1813c33019ae8..f6565fe1b6e24 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -200,13 +200,12 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-LABEL: s_sdiv_i64: ; CHECK: ; %bb.0: -; 
CHECK-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] -; CHECK-NEXT: s_mov_b32 s6, 0 -; CHECK-NEXT: s_mov_b32 s7, -1 -; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[0:1], 0 +; CHECK-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_mov_b32 s1, -1 +; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[0:1] +; CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[6:7], 0 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 -; CHECK-NEXT: s_mov_b32 s0, 1 ; CHECK-NEXT: s_xor_b64 vcc, s[6:7], s[8:9] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_cbranch_vccz BB1_2 @@ -354,13 +353,13 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 -; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: BB1_2: ; %Flow -; CHECK-NEXT: s_and_b32 s0, s0, 1 +; CHECK-NEXT: s_and_b32 s0, s1, 1 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc0 BB1_4 ; CHECK-NEXT: ; %bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll index 43f79f4b207d0..320d814be8a94 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -369,7 +369,7 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) { ; CGP-NEXT: s_movk_i32 s4, 0x1000 ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; CGP-NEXT: v_mov_b32_e32 v3, 0x1000 -; CGP-NEXT: s_mov_b32 s5, 0xfffff000 +; CGP-NEXT: s_movk_i32 s5, 0xf000 ; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 438388ebf7136..06d46321a59b6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -196,13 +196,12 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-LABEL: s_srem_i64: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] -; CHECK-NEXT: s_mov_b32 s6, 0 -; CHECK-NEXT: s_mov_b32 s7, -1 -; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[0:1], 0 +; CHECK-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_mov_b32 s1, -1 +; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[0:1] +; CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[6:7], 0 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 -; CHECK-NEXT: s_mov_b32 s0, 1 ; CHECK-NEXT: s_xor_b64 vcc, s[6:7], s[8:9] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_cbranch_vccz BB1_2 @@ -352,9 +351,9 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, s6, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 -; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: BB1_2: ; %Flow -; CHECK-NEXT: s_and_b32 s0, s0, 1 +; CHECK-NEXT: s_and_b32 s0, s1, 1 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc0 BB1_4 ; CHECK-NEXT: ; %bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index a7154131e3c07..3e1778bcb881e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -24,9 +24,8 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_max_i16_e32 v2, s4, v0 -; GFX8-NEXT: v_min_i16_e32 v3, s4, v0 +; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 +; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1 ; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 ; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 @@ -40,9 +39,8 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0 -; GFX9-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NEXT: v_max_i16_e32 v2, s4, v0 -; GFX9-NEXT: v_min_i16_e32 v3, s4, v0 +; GFX9-NEXT: v_max_i16_e32 v2, -1, v0 +; GFX9-NEXT: v_min_i16_e32 v3, -1, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1 ; GFX9-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 ; GFX9-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 @@ -57,11 +55,10 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 9, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xffff ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 9, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_max_i16_e64 v2, v0, s4 -; GFX10-NEXT: v_min_i16_e64 v3, v0, s4 +; GFX10-NEXT: v_max_i16_e64 v2, v0, -1 +; GFX10-NEXT: v_min_i16_e64 v3, v0, -1 ; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, 0x7fff ; GFX10-NEXT: v_sub_nc_u16_e64 v3, v3, 0x8000 ; GFX10-NEXT: v_max_i16_e64 v1, v2, v1 @@ -98,13 +95,13 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s0 -; GFX8-NEXT: s_sext_i32_i16 s4, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s4, -1 ; GFX8-NEXT: s_cmp_gt_i32 s3, s4 ; GFX8-NEXT: s_cselect_b32 s5, s3, s4 ; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fff ; GFX8-NEXT: s_cmp_lt_i32 s3, s4 ; GFX8-NEXT: s_cselect_b32 s3, s3, s4 -; GFX8-NEXT: s_sub_i32 s3, s3, 0x8000 +; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 ; GFX8-NEXT: s_sext_i32_i16 s4, s5 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_cmp_gt_i32 s4, s1 @@ -124,13 +121,13 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 ; GFX9-NEXT: s_sext_i32_i16 s3, s0 -; GFX9-NEXT: s_sext_i32_i16 s4, 0xffff +; GFX9-NEXT: s_sext_i32_i16 s4, -1 ; GFX9-NEXT: s_cmp_gt_i32 s3, s4 ; GFX9-NEXT: s_cselect_b32 s5, s3, s4 ; GFX9-NEXT: s_sub_i32 s5, s5, 0x7fff ; GFX9-NEXT: s_cmp_lt_i32 s3, s4 ; GFX9-NEXT: s_cselect_b32 s3, s3, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, 0x8000 +; GFX9-NEXT: s_sub_i32 s3, s3, 0xffff8000 ; GFX9-NEXT: s_sext_i32_i16 s4, s5 ; GFX9-NEXT: s_sext_i32_i16 s1, s1 ; GFX9-NEXT: s_cmp_gt_i32 s4, s1 @@ -147,7 +144,7 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX10-LABEL: s_ssubsat_i7: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_bfe_u32 s2, 9, 0x100000 -; GFX10-NEXT: s_sext_i32_i16 s4, 0xffff +; GFX10-NEXT: s_sext_i32_i16 s4, -1 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10-NEXT: s_sext_i32_i16 s3, s0 @@ -159,7 +156,7 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX10-NEXT: s_cmp_lt_i32 s3, s4 ; GFX10-NEXT: 
s_cselect_b32 s3, s3, s4 ; GFX10-NEXT: s_sext_i32_i16 s4, s5 -; GFX10-NEXT: s_sub_i32 s3, s3, 0x8000 +; GFX10-NEXT: s_sub_i32 s3, s3, 0xffff8000 ; GFX10-NEXT: s_cmp_gt_i32 s4, s1 ; GFX10-NEXT: s_sext_i32_i16 s3, s3 ; GFX10-NEXT: s_cselect_b32 s1, s4, s1 @@ -194,9 +191,8 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_max_i16_e32 v2, s4, v0 -; GFX8-NEXT: v_min_i16_e32 v3, s4, v0 +; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 +; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 ; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 @@ -210,9 +206,8 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NEXT: v_max_i16_e32 v2, s4, v0 -; GFX9-NEXT: v_min_i16_e32 v3, s4, v0 +; GFX9-NEXT: v_max_i16_e32 v2, -1, v0 +; GFX9-NEXT: v_min_i16_e32 v3, -1, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 ; GFX9-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 @@ -227,11 +222,10 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xffff ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_max_i16_e64 v2, v0, s4 -; GFX10-NEXT: v_min_i16_e64 v3, v0, s4 +; GFX10-NEXT: v_max_i16_e64 v2, v0, -1 +; GFX10-NEXT: v_min_i16_e64 v3, v0, -1 ; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, 0x7fff ; GFX10-NEXT: v_sub_nc_u16_e64 v3, v3, 0x8000 ; GFX10-NEXT: v_max_i16_e64 v1, v2, v1 @@ -268,13 +262,13 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s0 -; GFX8-NEXT: s_sext_i32_i16 s4, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s4, -1 ; GFX8-NEXT: s_cmp_gt_i32 s3, s4 ; GFX8-NEXT: s_cselect_b32 s5, s3, s4 ; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fff ; GFX8-NEXT: s_cmp_lt_i32 s3, s4 ; GFX8-NEXT: s_cselect_b32 s3, s3, s4 -; GFX8-NEXT: s_sub_i32 s3, s3, 0x8000 +; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 ; GFX8-NEXT: s_sext_i32_i16 s4, s5 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_cmp_gt_i32 s4, s1 @@ -294,13 +288,13 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 ; GFX9-NEXT: s_sext_i32_i16 s3, s0 -; GFX9-NEXT: s_sext_i32_i16 s4, 0xffff +; GFX9-NEXT: s_sext_i32_i16 s4, -1 ; GFX9-NEXT: s_cmp_gt_i32 s3, s4 ; GFX9-NEXT: s_cselect_b32 s5, s3, s4 ; GFX9-NEXT: s_sub_i32 s5, s5, 0x7fff ; GFX9-NEXT: s_cmp_lt_i32 s3, s4 ; GFX9-NEXT: s_cselect_b32 s3, s3, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, 0x8000 +; GFX9-NEXT: s_sub_i32 s3, s3, 0xffff8000 ; GFX9-NEXT: s_sext_i32_i16 s4, s5 ; GFX9-NEXT: s_sext_i32_i16 s1, s1 ; GFX9-NEXT: s_cmp_gt_i32 s4, s1 @@ -317,7 +311,7 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX10-LABEL: s_ssubsat_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX10-NEXT: s_sext_i32_i16 s4, 0xffff +; GFX10-NEXT: s_sext_i32_i16 s4, -1 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10-NEXT: s_sext_i32_i16 s3, s0 @@ -329,7 +323,7 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, 
i8 inreg %rhs) { ; GFX10-NEXT: s_cmp_lt_i32 s3, s4 ; GFX10-NEXT: s_cselect_b32 s3, s3, s4 ; GFX10-NEXT: s_sext_i32_i16 s4, s5 -; GFX10-NEXT: s_sub_i32 s3, s3, 0x8000 +; GFX10-NEXT: s_sub_i32 s3, s3, 0xffff8000 ; GFX10-NEXT: s_cmp_gt_i32 s4, s1 ; GFX10-NEXT: s_sext_i32_i16 s3, s3 ; GFX10-NEXT: s_cselect_b32 s1, s4, s1 @@ -355,7 +349,7 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5 ; GFX6-NEXT: v_max_i32_e32 v1, v4, v1 @@ -385,21 +379,20 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v4, s6, v0 +; GFX8-NEXT: v_max_i16_e32 v4, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_subrev_u16_e32 v4, s4, v4 -; GFX8-NEXT: s_mov_b32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v5, s6, v0 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 +; GFX8-NEXT: v_min_i16_e32 v5, -1, v0 ; GFX8-NEXT: v_max_i16_e32 v1, v4, v1 ; GFX8-NEXT: v_subrev_u16_e32 v5, s5, v5 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v5 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 -; GFX8-NEXT: v_max_i16_e32 v1, s6, v3 +; GFX8-NEXT: v_max_i16_e32 v1, -1, v3 ; GFX8-NEXT: v_subrev_u16_e32 v1, s4, v1 -; GFX8-NEXT: v_min_i16_e32 v4, s6, v3 +; GFX8-NEXT: v_min_i16_e32 v4, -1, v3 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX8-NEXT: v_subrev_u16_e32 v4, s5, v4 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 @@ -416,21 +409,20 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: s_mov_b32 s4, 8 ; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_mov_b32 s6, 0xffff ; GFX9-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_max_i16_e32 v4, s6, v0 +; GFX9-NEXT: v_max_i16_e32 v4, -1, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_subrev_u16_e32 v4, s4, v4 -; GFX9-NEXT: s_mov_b32 s5, 0x8000 -; GFX9-NEXT: v_min_i16_e32 v5, s6, v0 +; GFX9-NEXT: s_movk_i32 s5, 0x8000 +; GFX9-NEXT: v_min_i16_e32 v5, -1, v0 ; GFX9-NEXT: v_max_i16_e32 v1, v4, v1 ; GFX9-NEXT: v_subrev_u16_e32 v5, s5, v5 ; GFX9-NEXT: v_min_i16_e32 v1, v1, v5 ; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_max_i16_e32 v1, s6, v2 +; GFX9-NEXT: v_max_i16_e32 v1, -1, v2 ; GFX9-NEXT: v_subrev_u16_e32 v1, s4, v1 -; GFX9-NEXT: v_min_i16_e32 v4, s6, v2 +; GFX9-NEXT: v_min_i16_e32 v4, -1, v2 ; GFX9-NEXT: v_subrev_u16_e32 v4, s5, v4 ; GFX9-NEXT: v_max_i16_e32 v1, v1, v3 ; GFX9-NEXT: v_min_i16_e32 v1, v1, v4 @@ -448,23 +440,22 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: s_mov_b32 s4, 8 ; GFX10-NEXT: v_lshlrev_b16_e64 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_mov_b32 s6, 0xffff ; GFX10-NEXT: s_movk_i32 s5, 0x7fff ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_max_i16_e64 v4, v2, s6 -; GFX10-NEXT: v_max_i16_e64 v5, v0, s6 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1 -; GFX10-NEXT: v_min_i16_e64 v6, v2, s6 -; GFX10-NEXT: v_min_i16_e64 v7, v0, s6 +; GFX10-NEXT: v_max_i16_e64 v4, v2, -1 +; GFX10-NEXT: v_max_i16_e64 v5, v0, -1 +; GFX10-NEXT: v_min_i16_e64 v6, v2, -1 +; GFX10-NEXT: v_min_i16_e64 v7, v0, -1 +; GFX10-NEXT: s_movk_i32 s4, 0x8000 ; GFX10-NEXT: v_sub_nc_u16_e64 v4, v4, s5 ; GFX10-NEXT: v_sub_nc_u16_e64 v5, v5, s5 -; GFX10-NEXT: s_mov_b32 s4, 0x8000 -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_sub_nc_u16_e64 v6, v6, s4 -; GFX10-NEXT: v_max_i16_e64 v1, v4, v1 ; GFX10-NEXT: v_sub_nc_u16_e64 v7, v7, s4 -; GFX10-NEXT: v_max_i16_e64 v10, v5, v3 ; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_max_i16_e64 v1, v4, v1 +; GFX10-NEXT: v_max_i16_e64 v10, v5, v3 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_min_i16_e64 v1, v1, v6 ; GFX10-NEXT: v_min_i16_e64 v3, v10, v7 ; GFX10-NEXT: v_sub_nc_u16_e64 v1, v2, v1 @@ -492,7 +483,7 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX6-NEXT: s_cselect_b32 s6, s0, -1 ; GFX6-NEXT: s_sub_i32 s6, s6, s4 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: s_cselect_b32 s7, s0, -1 ; GFX6-NEXT: s_sub_i32 s7, s7, s5 ; GFX6-NEXT: s_cmp_gt_i32 s6, s1 @@ -530,13 +521,13 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 ; GFX8-NEXT: s_sext_i32_i16 s7, s0 -; GFX8-NEXT: s_sext_i32_i16 s8, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s8, -1 ; GFX8-NEXT: s_cmp_gt_i32 s7, s8 ; GFX8-NEXT: s_movk_i32 s5, 0x7fff ; GFX8-NEXT: s_cselect_b32 s9, s7, s8 ; GFX8-NEXT: s_sub_i32 s9, s9, s5 ; GFX8-NEXT: s_cmp_lt_i32 s7, s8 -; GFX8-NEXT: s_mov_b32 s6, 0x8000 +; GFX8-NEXT: s_movk_i32 s6, 0x8000 ; GFX8-NEXT: s_cselect_b32 s7, s7, s8 ; GFX8-NEXT: s_sub_i32 s7, s7, s6 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 @@ -585,13 +576,13 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_lshl_b32 s1, s1, s4 ; GFX9-NEXT: s_sext_i32_i16 s7, s0 -; GFX9-NEXT: s_sext_i32_i16 s8, 0xffff +; GFX9-NEXT: s_sext_i32_i16 s8, -1 ; GFX9-NEXT: s_cmp_gt_i32 s7, s8 ; GFX9-NEXT: s_movk_i32 s5, 0x7fff ; GFX9-NEXT: s_cselect_b32 s9, s7, s8 ; GFX9-NEXT: s_sub_i32 s9, s9, s5 ; GFX9-NEXT: s_cmp_lt_i32 s7, s8 -; GFX9-NEXT: s_mov_b32 s6, 0x8000 +; GFX9-NEXT: s_movk_i32 s6, 0x8000 ; GFX9-NEXT: s_cselect_b32 s7, s7, s8 ; GFX9-NEXT: s_sub_i32 s7, s7, s6 ; GFX9-NEXT: s_sext_i32_i16 s9, s9 @@ -637,14 +628,14 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_sext_i32_i16 s6, 0xffff +; GFX10-NEXT: s_sext_i32_i16 s6, -1 ; GFX10-NEXT: s_sext_i32_i16 s5, s0 ; GFX10-NEXT: s_lshr_b32 s4, s1, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10-NEXT: s_cmp_gt_i32 s5, s6 ; GFX10-NEXT: s_movk_i32 s7, 0x7fff ; GFX10-NEXT: s_cselect_b32 s8, s5, s6 -; GFX10-NEXT: s_mov_b32 s9, 0x8000 +; GFX10-NEXT: s_movk_i32 s9, 0x8000 ; GFX10-NEXT: s_sub_i32 s8, s8, s7 ; GFX10-NEXT: s_cmp_lt_i32 s5, s6 ; GFX10-NEXT: s_sext_i32_i16 s8, s8 @@ -709,7 +700,7 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: 
v_subrev_i32_e32 v8, vcc, s4, v8 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10 ; GFX6-NEXT: v_max_i32_e32 v1, v8, v1 @@ -736,7 +727,7 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX6-NEXT: v_mov_b32_e32 v11, 0x80000000 +; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 ; GFX6-NEXT: s_movk_i32 s4, 0xff @@ -769,43 +760,41 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v9, s6, v0 +; GFX8-NEXT: v_max_i16_e32 v8, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_subrev_u16_e32 v9, s4, v9 -; GFX8-NEXT: s_mov_b32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v11, s6, v0 -; GFX8-NEXT: v_max_i16_e32 v1, v9, v1 -; GFX8-NEXT: v_subrev_u16_e32 v11, s5, v11 -; GFX8-NEXT: v_min_i16_e32 v1, v1, v11 +; GFX8-NEXT: v_subrev_u16_e32 v8, s4, v8 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 +; GFX8-NEXT: v_min_i16_e32 v10, -1, v0 +; GFX8-NEXT: v_max_i16_e32 v1, v8, v1 +; GFX8-NEXT: v_subrev_u16_e32 v10, s5, v10 +; GFX8-NEXT: v_min_i16_e32 v1, v1, v10 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 -; GFX8-NEXT: v_max_i16_e32 v1, s6, v3 +; GFX8-NEXT: v_max_i16_e32 v1, -1, v3 ; GFX8-NEXT: v_subrev_u16_e32 v1, s4, v1 -; GFX8-NEXT: v_min_i16_e32 v9, s6, v3 +; GFX8-NEXT: v_min_i16_e32 v8, -1, v3 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 -; GFX8-NEXT: v_subrev_u16_e32 v9, s5, v9 -; GFX8-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4 -; GFX8-NEXT: v_min_i16_e32 v1, v1, v9 -; GFX8-NEXT: v_mov_b32_e32 v10, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v4, v2, v8 +; GFX8-NEXT: v_subrev_u16_e32 v8, s5, v8 +; GFX8-NEXT: v_min_i16_e32 v1, v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v9, 0x7fff +; GFX8-NEXT: v_max_i16_e32 v4, -1, v2 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX8-NEXT: v_min_i16_e32 v6, v2, v8 -; GFX8-NEXT: v_sub_u16_e32 v4, v4, v10 +; GFX8-NEXT: v_min_i16_e32 v6, -1, v2 +; GFX8-NEXT: v_sub_u16_e32 v4, v4, v9 ; GFX8-NEXT: v_max_i16_e32 v3, v4, v3 ; GFX8-NEXT: v_subrev_u16_e32 v6, s5, v6 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v6 ; GFX8-NEXT: v_sub_u16_e32 v2, v2, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5 -; GFX8-NEXT: v_max_i16_e32 v5, v3, v8 -; GFX8-NEXT: v_min_i16_e32 v6, v3, v8 +; GFX8-NEXT: v_max_i16_e32 v5, -1, v3 +; GFX8-NEXT: v_min_i16_e32 v6, -1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7 -; GFX8-NEXT: v_sub_u16_e32 v5, v5, v10 +; GFX8-NEXT: v_sub_u16_e32 v5, v5, v9 ; GFX8-NEXT: v_subrev_u16_e32 v6, 0x8000, v6 ; GFX8-NEXT: v_max_i16_e32 v4, v5, v4 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v6 @@ -829,43 +818,41 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_mov_b32 s6, 0xffff ; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: 
s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_max_i16_e32 v9, s6, v0 +; GFX9-NEXT: v_max_i16_e32 v8, -1, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_subrev_u16_e32 v9, s4, v9 -; GFX9-NEXT: s_mov_b32 s5, 0x8000 -; GFX9-NEXT: v_min_i16_e32 v11, s6, v0 -; GFX9-NEXT: v_max_i16_e32 v1, v9, v1 -; GFX9-NEXT: v_subrev_u16_e32 v11, s5, v11 -; GFX9-NEXT: v_min_i16_e32 v1, v1, v11 +; GFX9-NEXT: v_subrev_u16_e32 v8, s4, v8 +; GFX9-NEXT: s_movk_i32 s5, 0x8000 +; GFX9-NEXT: v_min_i16_e32 v10, -1, v0 +; GFX9-NEXT: v_max_i16_e32 v1, v8, v1 +; GFX9-NEXT: v_subrev_u16_e32 v10, s5, v10 +; GFX9-NEXT: v_min_i16_e32 v1, v1, v10 ; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_max_i16_e32 v1, s6, v2 +; GFX9-NEXT: v_max_i16_e32 v1, -1, v2 ; GFX9-NEXT: v_subrev_u16_e32 v1, s4, v1 -; GFX9-NEXT: v_min_i16_e32 v9, s6, v2 -; GFX9-NEXT: v_subrev_u16_e32 v9, s5, v9 +; GFX9-NEXT: v_min_i16_e32 v8, -1, v2 +; GFX9-NEXT: v_subrev_u16_e32 v8, s5, v8 ; GFX9-NEXT: v_max_i16_e32 v1, v1, v5 -; GFX9-NEXT: v_min_i16_e32 v1, v1, v9 +; GFX9-NEXT: v_min_i16_e32 v1, v1, v8 ; GFX9-NEXT: v_sub_u16_e32 v1, v2, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fff -; GFX9-NEXT: v_max_i16_e32 v5, v2, v8 +; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fff +; GFX9-NEXT: v_max_i16_e32 v5, -1, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX9-NEXT: v_min_i16_e32 v6, v2, v8 -; GFX9-NEXT: v_sub_u16_e32 v5, v5, v10 +; GFX9-NEXT: v_min_i16_e32 v6, -1, v2 +; GFX9-NEXT: v_sub_u16_e32 v5, v5, v9 ; GFX9-NEXT: v_subrev_u16_e32 v6, s5, v6 ; GFX9-NEXT: v_max_i16_e32 v3, v5, v3 ; GFX9-NEXT: v_min_i16_e32 v3, v3, v6 ; GFX9-NEXT: v_sub_u16_e32 v2, v2, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v4 -; GFX9-NEXT: v_max_i16_e32 v5, v3, v8 -; GFX9-NEXT: v_min_i16_e32 v6, v3, v8 +; GFX9-NEXT: v_max_i16_e32 v5, -1, v3 +; GFX9-NEXT: v_min_i16_e32 v6, -1, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v7 -; GFX9-NEXT: v_sub_u16_e32 v5, v5, v10 +; GFX9-NEXT: v_sub_u16_e32 v5, v5, v9 ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_subrev_u16_e32 v6, 0x8000, v6 ; GFX9-NEXT: v_max_i16_e32 v4, v5, v4 @@ -884,59 +871,57 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b16_e64 v4, 8, v0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: s_mov_b32 s5, 16 -; GFX10-NEXT: s_mov_b32 s6, 24 +; GFX10-NEXT: v_lshlrev_b16_e64 v7, 8, v1 ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v19, s6, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_mov_b32 s4, 0xffff +; GFX10-NEXT: v_max_i16_e64 v8, v4, -1 +; GFX10-NEXT: s_movk_i32 s4, 0x7fff +; GFX10-NEXT: v_min_i16_e64 v10, v4, -1 +; GFX10-NEXT: v_max_i16_e64 v9, v2, -1 +; GFX10-NEXT: s_mov_b32 s5, 16 +; GFX10-NEXT: v_sub_nc_u16_e64 v8, v8, s4 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: s_mov_b32 s6, 24 ; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s5, v1 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_movk_i32 s5, 0x7fff -; GFX10-NEXT: v_max_i16_e64 v8, v0, s4 -; GFX10-NEXT: v_lshrrev_b32_sdwa v7, s6, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_max_i16_e64 v9, v2, s4 -; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1 -; GFX10-NEXT: v_min_i16_e64 v10, v0, s4 -; GFX10-NEXT: v_sub_nc_u16_e64 v8, v8, s5 -; GFX10-NEXT: s_mov_b32 s6, 0x8000 -; GFX10-NEXT: v_sub_nc_u16_e64 v15, v9, s5 -; GFX10-NEXT: v_min_i16_e64 v11, v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v12, 0xffff -; GFX10-NEXT: v_max_i16_e64 v1, v8, v1 -; GFX10-NEXT: v_sub_nc_u16_e64 v10, v10, s6 +; GFX10-NEXT: s_movk_i32 s5, 0x8000 +; GFX10-NEXT: v_sub_nc_u16_e64 v15, v9, s4 +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s6, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_min_i16_e64 v11, v2, -1 +; GFX10-NEXT: v_max_i16_e64 v7, v8, v7 +; GFX10-NEXT: v_sub_nc_u16_e64 v10, v10, s5 ; GFX10-NEXT: v_max_i16_e64 v5, v15, v5 -; GFX10-NEXT: v_sub_nc_u16_e64 v8, v11, s6 ; GFX10-NEXT: v_mov_b32_e32 v9, 0x7fff -; GFX10-NEXT: v_max_i16_e64 v11, v3, v12 -; GFX10-NEXT: v_min_i16_e64 v1, v1, v10 -; GFX10-NEXT: v_max_i16_e64 v10, v19, v12 +; GFX10-NEXT: v_sub_nc_u16_e64 v8, v11, s5 +; GFX10-NEXT: v_max_i16_e64 v11, v3, -1 +; GFX10-NEXT: v_min_i16_e64 v7, v7, v10 +; GFX10-NEXT: v_max_i16_e64 v10, v0, -1 +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s6, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_min_i16_e64 v5, v5, v8 -; GFX10-NEXT: v_min_i16_e64 v8, v3, v12 ; GFX10-NEXT: v_sub_nc_u16_e64 v11, v11, v9 -; GFX10-NEXT: v_min_i16_e64 v12, v19, v12 -; GFX10-NEXT: v_sub_nc_u16_e64 v9, v10, v9 +; GFX10-NEXT: v_min_i16_e64 v8, v3, -1 +; GFX10-NEXT: v_sub_nc_u16_e64 v15, v10, v9 +; GFX10-NEXT: v_min_i16_e64 v12, v0, -1 ; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, v5 -; GFX10-NEXT: v_sub_nc_u16_e64 v5, v8, s6 ; GFX10-NEXT: v_max_i16_e64 v6, v11, v6 +; GFX10-NEXT: v_sub_nc_u16_e64 v5, v8, s5 +; GFX10-NEXT: v_max_i16_e64 v1, v15, v1 ; GFX10-NEXT: v_sub_nc_u16_e64 v8, v12, 0x8000 -; GFX10-NEXT: v_max_i16_e64 v7, v9, v7 ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, sext(v2), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_min_i16_e64 v2, v6, v5 -; GFX10-NEXT: v_min_i16_e64 v5, v7, v8 +; GFX10-NEXT: v_sub_nc_u16_e64 v4, v4, v7 +; GFX10-NEXT: v_and_b32_sdwa v2, sext(v2), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_min_i16_e64 v5, v6, v5 +; GFX10-NEXT: v_min_i16_e64 v1, v1, v8 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX10-NEXT: v_sub_nc_u16_e64 v2, v3, v2 -; GFX10-NEXT: v_sub_nc_u16_e64 v3, v19, v5 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, sext(v2), s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v2, sext(v3), s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX10-NEXT: v_ashrrev_i16_e64 v4, 8, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX10-NEXT: v_sub_nc_u16_e64 v3, v3, v5 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 +; GFX10-NEXT: v_and_or_b32 v1, v4, s4, v2 +; GFX10-NEXT: v_and_b32_sdwa v2, sext(v3), s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s4 
dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> @@ -961,7 +946,7 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX6-NEXT: s_cselect_b32 s10, s0, -1 ; GFX6-NEXT: s_sub_i32 s10, s10, s8 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s9, 0x80000000 +; GFX6-NEXT: s_brev_b32 s9, 1 ; GFX6-NEXT: s_cselect_b32 s11, s0, -1 ; GFX6-NEXT: s_sub_i32 s11, s11, s9 ; GFX6-NEXT: s_cmp_gt_i32 s10, s1 @@ -1037,13 +1022,13 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, s8 ; GFX8-NEXT: s_sext_i32_i16 s11, s0 -; GFX8-NEXT: s_sext_i32_i16 s12, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s12, -1 ; GFX8-NEXT: s_cmp_gt_i32 s11, s12 ; GFX8-NEXT: s_movk_i32 s9, 0x7fff ; GFX8-NEXT: s_cselect_b32 s13, s11, s12 ; GFX8-NEXT: s_sub_i32 s13, s13, s9 ; GFX8-NEXT: s_cmp_lt_i32 s11, s12 -; GFX8-NEXT: s_mov_b32 s10, 0x8000 +; GFX8-NEXT: s_movk_i32 s10, 0x8000 ; GFX8-NEXT: s_cselect_b32 s11, s11, s12 ; GFX8-NEXT: s_sub_i32 s11, s11, s10 ; GFX8-NEXT: s_sext_i32_i16 s13, s13 @@ -1142,13 +1127,13 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX9-NEXT: s_lshr_b32 s7, s1, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, s8 ; GFX9-NEXT: s_sext_i32_i16 s11, s0 -; GFX9-NEXT: s_sext_i32_i16 s12, 0xffff +; GFX9-NEXT: s_sext_i32_i16 s12, -1 ; GFX9-NEXT: s_cmp_gt_i32 s11, s12 ; GFX9-NEXT: s_movk_i32 s9, 0x7fff ; GFX9-NEXT: s_cselect_b32 s13, s11, s12 ; GFX9-NEXT: s_sub_i32 s13, s13, s9 ; GFX9-NEXT: s_cmp_lt_i32 s11, s12 -; GFX9-NEXT: s_mov_b32 s10, 0x8000 +; GFX9-NEXT: s_movk_i32 s10, 0x8000 ; GFX9-NEXT: s_cselect_b32 s11, s11, s12 ; GFX9-NEXT: s_sub_i32 s11, s11, s10 ; GFX9-NEXT: s_sext_i32_i16 s13, s13 @@ -1242,7 +1227,7 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_lshl_b32 s0, s0, s6 -; GFX10-NEXT: s_sext_i32_i16 s10, 0xffff +; GFX10-NEXT: s_sext_i32_i16 s10, -1 ; GFX10-NEXT: s_sext_i32_i16 s9, s0 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s7, s1, 16 @@ -1251,7 +1236,7 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX10-NEXT: s_cmp_gt_i32 s9, s10 ; GFX10-NEXT: s_movk_i32 s11, 0x7fff ; GFX10-NEXT: s_cselect_b32 s12, s9, s10 -; GFX10-NEXT: s_mov_b32 s13, 0x8000 +; GFX10-NEXT: s_movk_i32 s13, 0x8000 ; GFX10-NEXT: s_sub_i32 s12, s12, s11 ; GFX10-NEXT: s_cmp_lt_i32 s9, s10 ; GFX10-NEXT: s_sext_i32_i16 s12, s12 @@ -1726,7 +1711,7 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5 @@ -1747,7 +1732,7 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: s_mov_b32 s5, 0x80000000 +; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v0 ; GFX8-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s5, v5 @@ -1768,7 +1753,7 @@ define <2 x i32> 
@v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX9-NEXT: s_brev_b32 s4, -2 ; GFX9-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v4 -; GFX9-NEXT: s_mov_b32 s5, 0x80000000 +; GFX9-NEXT: s_brev_b32 s5, 1 ; GFX9-NEXT: v_min_i32_e32 v5, -1, v0 ; GFX9-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX9-NEXT: v_subrev_u32_e32 v5, s5, v5 @@ -1794,7 +1779,7 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX10-NEXT: v_min_i32_e32 v7, -1, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s4, v4 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s4, v5 -; GFX10-NEXT: s_mov_b32 s4, 0x80000000 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v6 ; GFX10-NEXT: v_max_i32_e32 v11, v4, v2 @@ -1817,7 +1802,7 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre ; GFX6-NEXT: s_cselect_b32 s6, s0, -1 ; GFX6-NEXT: s_sub_i32 s6, s6, s4 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: s_cselect_b32 s7, s0, -1 ; GFX6-NEXT: s_sub_i32 s7, s7, s5 ; GFX6-NEXT: s_cmp_gt_i32 s6, s2 @@ -1845,7 +1830,7 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre ; GFX8-NEXT: s_cselect_b32 s6, s0, -1 ; GFX8-NEXT: s_sub_i32 s6, s6, s4 ; GFX8-NEXT: s_cmp_lt_i32 s0, -1 -; GFX8-NEXT: s_mov_b32 s5, 0x80000000 +; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: s_cselect_b32 s7, s0, -1 ; GFX8-NEXT: s_sub_i32 s7, s7, s5 ; GFX8-NEXT: s_cmp_gt_i32 s6, s2 @@ -1873,7 +1858,7 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre ; GFX9-NEXT: s_cselect_b32 s6, s0, -1 ; GFX9-NEXT: s_sub_i32 s6, s6, s4 ; GFX9-NEXT: s_cmp_lt_i32 s0, -1 -; GFX9-NEXT: s_mov_b32 s5, 0x80000000 +; GFX9-NEXT: s_brev_b32 s5, 1 ; GFX9-NEXT: s_cselect_b32 s7, s0, -1 ; GFX9-NEXT: s_sub_i32 s7, s7, s5 ; GFX9-NEXT: s_cmp_gt_i32 s6, s2 @@ -1899,7 +1884,7 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre ; GFX10-NEXT: s_cmp_gt_i32 s0, -1 ; GFX10-NEXT: s_brev_b32 s4, -2 ; GFX10-NEXT: s_cselect_b32 s5, s0, -1 -; GFX10-NEXT: s_mov_b32 s6, 0x80000000 +; GFX10-NEXT: s_brev_b32 s6, 1 ; GFX10-NEXT: s_sub_i32 s5, s5, s4 ; GFX10-NEXT: s_cmp_lt_i32 s0, -1 ; GFX10-NEXT: ; implicit-def: $vcc_hi @@ -1933,7 +1918,7 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v6, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s4, v6 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v7, -1, v0 ; GFX6-NEXT: v_max_i32_e32 v3, v6, v3 ; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, s5, v7 @@ -1961,7 +1946,7 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v6, -1, v0 ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s4, v6 -; GFX8-NEXT: s_mov_b32 s5, 0x80000000 +; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v7, -1, v0 ; GFX8-NEXT: v_max_i32_e32 v3, v6, v3 ; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s5, v7 @@ -1989,7 +1974,7 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX9-NEXT: s_brev_b32 s4, -2 ; GFX9-NEXT: v_max_i32_e32 v6, -1, v0 ; GFX9-NEXT: v_subrev_u32_e32 v6, s4, v6 -; GFX9-NEXT: s_mov_b32 s5, 0x80000000 +; GFX9-NEXT: s_brev_b32 s5, 1 ; GFX9-NEXT: v_min_i32_e32 v7, -1, v0 ; GFX9-NEXT: v_max_i32_e32 v3, v6, v3 ; GFX9-NEXT: v_subrev_u32_e32 v7, s5, v7 @@ -2025,7 +2010,7 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 
x i32> %rhs) { ; GFX10-NEXT: v_subrev_nc_u32_e32 v19, s4, v9 ; GFX10-NEXT: v_min_i32_e32 v10, -1, v1 ; GFX10-NEXT: v_min_i32_e32 v11, -1, v2 -; GFX10-NEXT: s_mov_b32 s5, 0x80000000 +; GFX10-NEXT: s_brev_b32 s5, 1 ; GFX10-NEXT: v_max_i32_e32 v14, v6, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s5, v7 ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s5, v10 @@ -2052,7 +2037,7 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre ; GFX6-NEXT: s_cselect_b32 s8, s0, -1 ; GFX6-NEXT: s_sub_i32 s8, s8, s6 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s7, 0x80000000 +; GFX6-NEXT: s_brev_b32 s7, 1 ; GFX6-NEXT: s_cselect_b32 s9, s0, -1 ; GFX6-NEXT: s_sub_i32 s9, s9, s7 ; GFX6-NEXT: s_cmp_gt_i32 s8, s3 @@ -2091,7 +2076,7 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre ; GFX8-NEXT: s_cselect_b32 s8, s0, -1 ; GFX8-NEXT: s_sub_i32 s8, s8, s6 ; GFX8-NEXT: s_cmp_lt_i32 s0, -1 -; GFX8-NEXT: s_mov_b32 s7, 0x80000000 +; GFX8-NEXT: s_brev_b32 s7, 1 ; GFX8-NEXT: s_cselect_b32 s9, s0, -1 ; GFX8-NEXT: s_sub_i32 s9, s9, s7 ; GFX8-NEXT: s_cmp_gt_i32 s8, s3 @@ -2130,7 +2115,7 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre ; GFX9-NEXT: s_cselect_b32 s8, s0, -1 ; GFX9-NEXT: s_sub_i32 s8, s8, s6 ; GFX9-NEXT: s_cmp_lt_i32 s0, -1 -; GFX9-NEXT: s_mov_b32 s7, 0x80000000 +; GFX9-NEXT: s_brev_b32 s7, 1 ; GFX9-NEXT: s_cselect_b32 s9, s0, -1 ; GFX9-NEXT: s_sub_i32 s9, s9, s7 ; GFX9-NEXT: s_cmp_gt_i32 s8, s3 @@ -2167,7 +2152,7 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre ; GFX10-NEXT: s_cmp_gt_i32 s0, -1 ; GFX10-NEXT: s_brev_b32 s6, -2 ; GFX10-NEXT: s_cselect_b32 s7, s0, -1 -; GFX10-NEXT: s_mov_b32 s8, 0x80000000 +; GFX10-NEXT: s_brev_b32 s8, 1 ; GFX10-NEXT: s_sub_i32 s7, s7, s6 ; GFX10-NEXT: s_cmp_lt_i32 s0, -1 ; GFX10-NEXT: ; implicit-def: $vcc_hi @@ -2212,7 +2197,7 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v9, -1, v0 ; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX6-NEXT: v_subrev_i32_e32 v9, vcc, s5, v9 @@ -2247,7 +2232,7 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v8, -1, v0 ; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s4, v8 -; GFX8-NEXT: s_mov_b32 s5, 0x80000000 +; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v9, -1, v0 ; GFX8-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s5, v9 @@ -2282,7 +2267,7 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX9-NEXT: s_brev_b32 s4, -2 ; GFX9-NEXT: v_max_i32_e32 v8, -1, v0 ; GFX9-NEXT: v_subrev_u32_e32 v8, s4, v8 -; GFX9-NEXT: s_mov_b32 s5, 0x80000000 +; GFX9-NEXT: s_brev_b32 s5, 1 ; GFX9-NEXT: v_min_i32_e32 v9, -1, v0 ; GFX9-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX9-NEXT: v_subrev_u32_e32 v9, s5, v9 @@ -2328,7 +2313,7 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX10-NEXT: v_min_i32_e32 v13, -1, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s4, v8 ; GFX10-NEXT: v_min_i32_e32 v14, -1, v3 -; GFX10-NEXT: s_mov_b32 s5, 0x80000000 +; GFX10-NEXT: s_brev_b32 s5, 1 ; GFX10-NEXT: v_max_i32_e32 v4, v15, v4 ; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s5, v9 ; GFX10-NEXT: v_max_i32_e32 v5, v10, v5 @@ -2359,7 +2344,7 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg 
%lhs, <4 x i32> inre ; GFX6-NEXT: s_cselect_b32 s10, s0, -1 ; GFX6-NEXT: s_sub_i32 s10, s10, s8 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s9, 0x80000000 +; GFX6-NEXT: s_brev_b32 s9, 1 ; GFX6-NEXT: s_cselect_b32 s11, s0, -1 ; GFX6-NEXT: s_sub_i32 s11, s11, s9 ; GFX6-NEXT: s_cmp_gt_i32 s10, s4 @@ -2409,7 +2394,7 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre ; GFX8-NEXT: s_cselect_b32 s10, s0, -1 ; GFX8-NEXT: s_sub_i32 s10, s10, s8 ; GFX8-NEXT: s_cmp_lt_i32 s0, -1 -; GFX8-NEXT: s_mov_b32 s9, 0x80000000 +; GFX8-NEXT: s_brev_b32 s9, 1 ; GFX8-NEXT: s_cselect_b32 s11, s0, -1 ; GFX8-NEXT: s_sub_i32 s11, s11, s9 ; GFX8-NEXT: s_cmp_gt_i32 s10, s4 @@ -2459,7 +2444,7 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre ; GFX9-NEXT: s_cselect_b32 s10, s0, -1 ; GFX9-NEXT: s_sub_i32 s10, s10, s8 ; GFX9-NEXT: s_cmp_lt_i32 s0, -1 -; GFX9-NEXT: s_mov_b32 s9, 0x80000000 +; GFX9-NEXT: s_brev_b32 s9, 1 ; GFX9-NEXT: s_cselect_b32 s11, s0, -1 ; GFX9-NEXT: s_sub_i32 s11, s11, s9 ; GFX9-NEXT: s_cmp_gt_i32 s10, s4 @@ -2507,7 +2492,7 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre ; GFX10-NEXT: s_cmp_gt_i32 s0, -1 ; GFX10-NEXT: s_brev_b32 s8, -2 ; GFX10-NEXT: s_cselect_b32 s9, s0, -1 -; GFX10-NEXT: s_mov_b32 s10, 0x80000000 +; GFX10-NEXT: s_brev_b32 s10, 1 ; GFX10-NEXT: s_sub_i32 s9, s9, s8 ; GFX10-NEXT: s_cmp_lt_i32 s0, -1 ; GFX10-NEXT: ; implicit-def: $vcc_hi @@ -2563,7 +2548,7 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s4, v10 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v12, -1, v0 ; GFX6-NEXT: v_max_i32_e32 v5, v10, v5 ; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s5, v12 @@ -2586,7 +2571,7 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX6-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11 -; GFX6-NEXT: v_mov_b32_e32 v13, 0x80000000 +; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v13 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v8 @@ -2607,7 +2592,7 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v10, -1, v0 ; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s4, v10 -; GFX8-NEXT: s_mov_b32 s5, 0x80000000 +; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v12, -1, v0 ; GFX8-NEXT: v_max_i32_e32 v5, v10, v5 ; GFX8-NEXT: v_subrev_u32_e32 v12, vcc, s5, v12 @@ -2630,7 +2615,7 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX8-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v3 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11 -; GFX8-NEXT: v_mov_b32_e32 v13, 0x80000000 +; GFX8-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v3 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v13 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v8 @@ -2651,7 +2636,7 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX9-NEXT: s_brev_b32 s4, -2 ; GFX9-NEXT: v_max_i32_e32 v10, -1, v0 ; GFX9-NEXT: v_subrev_u32_e32 v10, s4, v10 -; GFX9-NEXT: s_mov_b32 s5, 0x80000000 +; GFX9-NEXT: s_brev_b32 s5, 1 ; GFX9-NEXT: v_min_i32_e32 v12, -1, v0 ; GFX9-NEXT: v_max_i32_e32 v5, v10, v5 ; GFX9-NEXT: v_subrev_u32_e32 v12, s5, v12 @@ -2674,7 +2659,7 @@ define <5 x i32> 
@v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX9-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX9-NEXT: v_max_i32_e32 v5, -1, v3 ; GFX9-NEXT: v_sub_u32_e32 v5, v5, v11 -; GFX9-NEXT: v_mov_b32_e32 v13, 0x80000000 +; GFX9-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX9-NEXT: v_min_i32_e32 v6, -1, v3 ; GFX9-NEXT: v_sub_u32_e32 v6, v6, v13 ; GFX9-NEXT: v_max_i32_e32 v5, v5, v8 @@ -2701,7 +2686,7 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s4, v10 ; GFX10-NEXT: v_subrev_nc_u32_e32 v13, s4, v13 ; GFX10-NEXT: v_min_i32_e32 v12, -1, v0 -; GFX10-NEXT: v_mov_b32_e32 v14, 0x80000000 +; GFX10-NEXT: v_bfrev_b32_e32 v14, 1 ; GFX10-NEXT: v_min_i32_e32 v15, -1, v1 ; GFX10-NEXT: v_max_i32_e32 v5, v10, v5 ; GFX10-NEXT: v_max_i32_e32 v10, -1, v2 @@ -2713,7 +2698,7 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX10-NEXT: v_min_i32_e32 v19, -1, v4 ; GFX10-NEXT: v_sub_nc_u32_e32 v13, v13, v11 ; GFX10-NEXT: v_sub_nc_u32_e32 v11, v17, v11 -; GFX10-NEXT: s_mov_b32 s5, 0x80000000 +; GFX10-NEXT: s_brev_b32 s5, 1 ; GFX10-NEXT: v_max_i32_e32 v7, v10, v7 ; GFX10-NEXT: v_subrev_nc_u32_e32 v12, s5, v12 ; GFX10-NEXT: v_subrev_nc_u32_e32 v15, s5, v15 @@ -2746,7 +2731,7 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; GFX6-NEXT: s_cselect_b32 s12, s0, -1 ; GFX6-NEXT: s_sub_i32 s12, s12, s10 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s11, 0x80000000 +; GFX6-NEXT: s_brev_b32 s11, 1 ; GFX6-NEXT: s_cselect_b32 s13, s0, -1 ; GFX6-NEXT: s_sub_i32 s13, s13, s11 ; GFX6-NEXT: s_cmp_gt_i32 s12, s5 @@ -2807,7 +2792,7 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; GFX8-NEXT: s_cselect_b32 s12, s0, -1 ; GFX8-NEXT: s_sub_i32 s12, s12, s10 ; GFX8-NEXT: s_cmp_lt_i32 s0, -1 -; GFX8-NEXT: s_mov_b32 s11, 0x80000000 +; GFX8-NEXT: s_brev_b32 s11, 1 ; GFX8-NEXT: s_cselect_b32 s13, s0, -1 ; GFX8-NEXT: s_sub_i32 s13, s13, s11 ; GFX8-NEXT: s_cmp_gt_i32 s12, s5 @@ -2868,7 +2853,7 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; GFX9-NEXT: s_cselect_b32 s12, s0, -1 ; GFX9-NEXT: s_sub_i32 s12, s12, s10 ; GFX9-NEXT: s_cmp_lt_i32 s0, -1 -; GFX9-NEXT: s_mov_b32 s11, 0x80000000 +; GFX9-NEXT: s_brev_b32 s11, 1 ; GFX9-NEXT: s_cselect_b32 s13, s0, -1 ; GFX9-NEXT: s_sub_i32 s13, s13, s11 ; GFX9-NEXT: s_cmp_gt_i32 s12, s5 @@ -2927,7 +2912,7 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; GFX10-NEXT: s_cmp_gt_i32 s0, -1 ; GFX10-NEXT: s_brev_b32 s10, -2 ; GFX10-NEXT: s_cselect_b32 s11, s0, -1 -; GFX10-NEXT: s_mov_b32 s12, 0x80000000 +; GFX10-NEXT: s_brev_b32 s12, 1 ; GFX10-NEXT: s_sub_i32 s11, s11, s10 ; GFX10-NEXT: s_cmp_lt_i32 s0, -1 ; GFX10-NEXT: ; implicit-def: $vcc_hi @@ -2995,7 +2980,7 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v32, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v32, vcc, s4, v32 ; GFX6-NEXT: v_max_i32_e32 v16, v32, v16 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v32, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v32, vcc, s5, v32 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v32 @@ -3018,7 +3003,7 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v17, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX6-NEXT: v_mov_b32_e32 v18, 0x80000000 +; GFX6-NEXT: v_bfrev_b32_e32 v18, 1 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v3 ; 
GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 @@ -3116,7 +3101,7 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_max_i32_e32 v32, -1, v0 ; GFX8-NEXT: v_subrev_u32_e32 v32, vcc, s4, v32 ; GFX8-NEXT: v_max_i32_e32 v16, v32, v16 -; GFX8-NEXT: s_mov_b32 s5, 0x80000000 +; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v32, -1, v0 ; GFX8-NEXT: v_subrev_u32_e32 v32, vcc, s5, v32 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v32 @@ -3139,7 +3124,7 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_max_i32_e32 v17, -1, v3 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX8-NEXT: v_mov_b32_e32 v18, 0x80000000 +; GFX8-NEXT: v_bfrev_b32_e32 v18, 1 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v3 ; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 @@ -3237,7 +3222,7 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX9-NEXT: v_max_i32_e32 v32, -1, v0 ; GFX9-NEXT: v_subrev_u32_e32 v32, s4, v32 ; GFX9-NEXT: v_max_i32_e32 v16, v32, v16 -; GFX9-NEXT: s_mov_b32 s5, 0x80000000 +; GFX9-NEXT: s_brev_b32 s5, 1 ; GFX9-NEXT: v_min_i32_e32 v32, -1, v0 ; GFX9-NEXT: v_subrev_u32_e32 v32, s5, v32 ; GFX9-NEXT: v_min_i32_e32 v16, v16, v32 @@ -3260,7 +3245,7 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX9-NEXT: v_max_i32_e32 v17, -1, v3 ; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 ; GFX9-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_mov_b32_e32 v18, 0x80000000 +; GFX9-NEXT: v_bfrev_b32_e32 v18, 1 ; GFX9-NEXT: v_min_i32_e32 v19, -1, v3 ; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 ; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 @@ -3358,7 +3343,7 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX10-NEXT: v_max_i32_e32 v32, -1, v0 ; GFX10-NEXT: s_brev_b32 s4, -2 ; GFX10-NEXT: v_min_i32_e32 v33, -1, v0 -; GFX10-NEXT: s_mov_b32 s5, 0x80000000 +; GFX10-NEXT: s_brev_b32 s5, 1 ; GFX10-NEXT: v_max_i32_e32 v36, -1, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v35, s4, v32 ; GFX10-NEXT: v_max_i32_e32 v32, -1, v1 @@ -3375,7 +3360,7 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX10-NEXT: v_max_i32_e32 v38, v32, v17 ; GFX10-NEXT: v_max_i32_e32 v17, -1, v4 ; GFX10-NEXT: v_sub_nc_u32_e32 v36, v39, v34 -; GFX10-NEXT: v_mov_b32_e32 v35, 0x80000000 +; GFX10-NEXT: v_bfrev_b32_e32 v35, 1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v32, s5, v33 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v16 ; GFX10-NEXT: v_min_i32_e32 v33, -1, v3 @@ -3485,7 +3470,7 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; GFX6-NEXT: s_cselect_b32 s34, s0, -1 ; GFX6-NEXT: s_sub_i32 s34, s34, s32 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s33, 0x80000000 +; GFX6-NEXT: s_brev_b32 s33, 1 ; GFX6-NEXT: s_cselect_b32 s35, s0, -1 ; GFX6-NEXT: s_sub_i32 s35, s35, s33 ; GFX6-NEXT: s_cmp_gt_i32 s34, s16 @@ -3667,7 +3652,7 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; GFX8-NEXT: s_cselect_b32 s34, s0, -1 ; GFX8-NEXT: s_sub_i32 s34, s34, s32 ; GFX8-NEXT: s_cmp_lt_i32 s0, -1 -; GFX8-NEXT: s_mov_b32 s33, 0x80000000 +; GFX8-NEXT: s_brev_b32 s33, 1 ; GFX8-NEXT: s_cselect_b32 s35, s0, -1 ; GFX8-NEXT: s_sub_i32 s35, s35, s33 ; GFX8-NEXT: s_cmp_gt_i32 s34, s16 @@ -3849,7 +3834,7 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; GFX9-NEXT: s_cselect_b32 s34, s0, -1 ; GFX9-NEXT: s_sub_i32 s34, 
s34, s32 ; GFX9-NEXT: s_cmp_lt_i32 s0, -1 -; GFX9-NEXT: s_mov_b32 s33, 0x80000000 +; GFX9-NEXT: s_brev_b32 s33, 1 ; GFX9-NEXT: s_cselect_b32 s35, s0, -1 ; GFX9-NEXT: s_sub_i32 s35, s35, s33 ; GFX9-NEXT: s_cmp_gt_i32 s34, s16 @@ -4029,7 +4014,7 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; GFX10-NEXT: s_cmp_gt_i32 s0, -1 ; GFX10-NEXT: s_brev_b32 s46, -2 ; GFX10-NEXT: s_cselect_b32 s33, s0, -1 -; GFX10-NEXT: s_mov_b32 s34, 0x80000000 +; GFX10-NEXT: s_brev_b32 s34, 1 ; GFX10-NEXT: s_sub_i32 s47, s33, s46 ; GFX10-NEXT: s_cmp_lt_i32 s0, -1 ; GFX10-NEXT: ; implicit-def: $vcc_hi @@ -4229,9 +4214,8 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) { ; GFX8-LABEL: v_ssubsat_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_max_i16_e32 v2, s4, v0 -; GFX8-NEXT: v_min_i16_e32 v3, s4, v0 +; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 +; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 ; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 ; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_max_i16_e32 v1, v2, v1 @@ -4242,9 +4226,8 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) { ; GFX9-LABEL: v_ssubsat_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NEXT: v_max_i16_e32 v2, s4, v0 -; GFX9-NEXT: v_min_i16_e32 v3, s4, v0 +; GFX9-NEXT: v_max_i16_e32 v2, -1, v0 +; GFX9-NEXT: v_min_i16_e32 v3, -1, v0 ; GFX9-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 ; GFX9-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 ; GFX9-NEXT: v_max_i16_e32 v1, v2, v1 @@ -4256,10 +4239,9 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0xffff +; GFX10-NEXT: v_max_i16_e64 v2, v0, -1 +; GFX10-NEXT: v_min_i16_e64 v3, v0, -1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_max_i16_e64 v2, v0, s4 -; GFX10-NEXT: v_min_i16_e64 v3, v0, s4 ; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, 0x7fff ; GFX10-NEXT: v_sub_nc_u16_e64 v3, v3, 0x8000 ; GFX10-NEXT: v_max_i16_e64 v1, v2, v1 @@ -4292,13 +4274,13 @@ define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX8-LABEL: s_ssubsat_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sext_i32_i16 s2, s0 -; GFX8-NEXT: s_sext_i32_i16 s3, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s3, -1 ; GFX8-NEXT: s_cmp_gt_i32 s2, s3 ; GFX8-NEXT: s_cselect_b32 s4, s2, s3 ; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff ; GFX8-NEXT: s_cmp_lt_i32 s2, s3 ; GFX8-NEXT: s_cselect_b32 s2, s2, s3 -; GFX8-NEXT: s_sub_i32 s2, s2, 0x8000 +; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 ; GFX8-NEXT: s_sext_i32_i16 s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_cmp_gt_i32 s3, s1 @@ -4313,13 +4295,13 @@ define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX9-LABEL: s_ssubsat_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sext_i32_i16 s2, s0 -; GFX9-NEXT: s_sext_i32_i16 s3, 0xffff +; GFX9-NEXT: s_sext_i32_i16 s3, -1 ; GFX9-NEXT: s_cmp_gt_i32 s2, s3 ; GFX9-NEXT: s_cselect_b32 s4, s2, s3 ; GFX9-NEXT: s_sub_i32 s4, s4, 0x7fff ; GFX9-NEXT: s_cmp_lt_i32 s2, s3 ; GFX9-NEXT: s_cselect_b32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, 0x8000 +; GFX9-NEXT: s_sub_i32 s2, s2, 0xffff8000 ; GFX9-NEXT: s_sext_i32_i16 s3, s4 ; GFX9-NEXT: s_sext_i32_i16 s1, s1 ; GFX9-NEXT: s_cmp_gt_i32 s3, s1 @@ -4333,7 +4315,7 @@ define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; ; GFX10-LABEL: s_ssubsat_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sext_i32_i16 s2, 
0xffff +; GFX10-NEXT: s_sext_i32_i16 s2, -1 ; GFX10-NEXT: s_sext_i32_i16 s3, s0 ; GFX10-NEXT: s_sext_i32_i16 s1, s1 ; GFX10-NEXT: s_cmp_gt_i32 s3, s2 @@ -4343,7 +4325,7 @@ define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX10-NEXT: s_cmp_lt_i32 s3, s2 ; GFX10-NEXT: s_cselect_b32 s2, s3, s2 ; GFX10-NEXT: s_sext_i32_i16 s3, s4 -; GFX10-NEXT: s_sub_i32 s2, s2, 0x8000 +; GFX10-NEXT: s_sub_i32 s2, s2, 0xffff8000 ; GFX10-NEXT: s_cmp_gt_i32 s3, s1 ; GFX10-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-NEXT: s_cselect_b32 s1, s3, s1 @@ -4376,13 +4358,13 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; GFX8-LABEL: ssubsat_i16_sv: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sext_i32_i16 s1, s0 -; GFX8-NEXT: s_sext_i32_i16 s2, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s2, -1 ; GFX8-NEXT: s_cmp_gt_i32 s1, s2 ; GFX8-NEXT: s_cselect_b32 s3, s1, s2 ; GFX8-NEXT: s_sub_i32 s3, s3, 0x7fff ; GFX8-NEXT: s_cmp_lt_i32 s1, s2 ; GFX8-NEXT: s_cselect_b32 s1, s1, s2 -; GFX8-NEXT: s_sub_i32 s1, s1, 0x8000 +; GFX8-NEXT: s_sub_i32 s1, s1, 0xffff8000 ; GFX8-NEXT: v_max_i16_e32 v0, s3, v0 ; GFX8-NEXT: v_min_i16_e32 v0, s1, v0 ; GFX8-NEXT: v_sub_u16_e32 v0, s0, v0 @@ -4391,13 +4373,13 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; GFX9-LABEL: ssubsat_i16_sv: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sext_i32_i16 s1, s0 -; GFX9-NEXT: s_sext_i32_i16 s2, 0xffff +; GFX9-NEXT: s_sext_i32_i16 s2, -1 ; GFX9-NEXT: s_cmp_gt_i32 s1, s2 ; GFX9-NEXT: s_cselect_b32 s3, s1, s2 ; GFX9-NEXT: s_sub_i32 s3, s3, 0x7fff ; GFX9-NEXT: s_cmp_lt_i32 s1, s2 ; GFX9-NEXT: s_cselect_b32 s1, s1, s2 -; GFX9-NEXT: s_sub_i32 s1, s1, 0x8000 +; GFX9-NEXT: s_sub_i32 s1, s1, 0xffff8000 ; GFX9-NEXT: v_max_i16_e32 v0, s3, v0 ; GFX9-NEXT: v_min_i16_e32 v0, s1, v0 ; GFX9-NEXT: v_sub_u16_e32 v0, s0, v0 @@ -4406,7 +4388,7 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; GFX10-LABEL: ssubsat_i16_sv: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sext_i32_i16 s1, s0 -; GFX10-NEXT: s_sext_i32_i16 s2, 0xffff +; GFX10-NEXT: s_sext_i32_i16 s2, -1 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_cmp_gt_i32 s1, s2 ; GFX10-NEXT: s_cselect_b32 s3, s1, s2 @@ -4414,7 +4396,7 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; GFX10-NEXT: s_cmp_lt_i32 s1, s2 ; GFX10-NEXT: v_max_i16_e64 v0, s3, v0 ; GFX10-NEXT: s_cselect_b32 s1, s1, s2 -; GFX10-NEXT: s_sub_i32 s1, s1, 0x8000 +; GFX10-NEXT: s_sub_i32 s1, s1, 0xffff8000 ; GFX10-NEXT: v_min_i16_e64 v0, v0, s1 ; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -4440,10 +4422,9 @@ define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; ; GFX8-LABEL: ssubsat_i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s1, 0xffff -; GFX8-NEXT: v_max_i16_e32 v1, s1, v0 +; GFX8-NEXT: v_max_i16_e32 v1, -1, v0 ; GFX8-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1 -; GFX8-NEXT: v_min_i16_e32 v2, s1, v0 +; GFX8-NEXT: v_min_i16_e32 v2, -1, v0 ; GFX8-NEXT: v_subrev_u16_e32 v2, 0x8000, v2 ; GFX8-NEXT: v_max_i16_e32 v1, s0, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v2 @@ -4452,10 +4433,9 @@ define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; ; GFX9-LABEL: ssubsat_i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s1, 0xffff -; GFX9-NEXT: v_max_i16_e32 v1, s1, v0 +; GFX9-NEXT: v_max_i16_e32 v1, -1, v0 ; GFX9-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1 -; GFX9-NEXT: v_min_i16_e32 v2, s1, v0 +; GFX9-NEXT: v_min_i16_e32 v2, -1, v0 ; GFX9-NEXT: v_subrev_u16_e32 v2, 0x8000, v2 ; GFX9-NEXT: v_max_i16_e32 v1, s0, v1 ; GFX9-NEXT: v_min_i16_e32 v1, v1, v2 
@@ -4464,10 +4444,9 @@ define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; ; GFX10-LABEL: ssubsat_i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s1, 0xffff +; GFX10-NEXT: v_max_i16_e64 v1, v0, -1 +; GFX10-NEXT: v_min_i16_e64 v2, v0, -1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_max_i16_e64 v1, v0, s1 -; GFX10-NEXT: v_min_i16_e64 v2, v0, s1 ; GFX10-NEXT: v_sub_nc_u16_e64 v1, v1, 0x7fff ; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, 0x8000 ; GFX10-NEXT: v_max_i16_e64 v1, v1, s0 @@ -4488,7 +4467,7 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 @@ -4510,18 +4489,17 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-LABEL: v_ssubsat_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v3, s6, v0 +; GFX8-NEXT: v_max_i16_e32 v3, -1, v0 ; GFX8-NEXT: v_subrev_u16_e32 v3, s4, v3 -; GFX8-NEXT: s_mov_b32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v4, s6, v0 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 +; GFX8-NEXT: v_min_i16_e32 v4, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_subrev_u16_e32 v4, s5, v4 ; GFX8-NEXT: v_max_i16_e32 v3, v3, v1 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v4 -; GFX8-NEXT: v_max_i16_e32 v4, s6, v2 -; GFX8-NEXT: v_min_i16_e32 v5, s6, v2 +; GFX8-NEXT: v_max_i16_e32 v4, -1, v2 +; GFX8-NEXT: v_min_i16_e32 v5, -1, v2 ; GFX8-NEXT: v_subrev_u16_e32 v4, s4, v4 ; GFX8-NEXT: v_subrev_u16_e32 v5, s5, v5 ; GFX8-NEXT: v_max_i16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -4536,7 +4514,7 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX9-NEXT: s_mov_b32 s5, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s5, 0x8000 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 ; GFX9-NEXT: v_pk_max_i16 v2, v0, s6 ; GFX9-NEXT: v_pk_sub_i16 v2, v2, s4 @@ -4557,7 +4535,7 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX10-NEXT: v_pk_max_i16 v2, v0, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s5 ; GFX10-NEXT: v_pk_min_i16 v3, v0, s4 -; GFX10-NEXT: s_mov_b32 s6, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s6, 0x8000 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_pk_sub_i16 v2, v2, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s6 @@ -4580,7 +4558,7 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX6-NEXT: s_cselect_b32 s6, s0, -1 ; GFX6-NEXT: s_sub_i32 s6, s6, s4 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: s_cselect_b32 s7, s0, -1 ; GFX6-NEXT: s_sub_i32 s7, s7, s5 ; GFX6-NEXT: s_cmp_gt_i32 s6, s2 @@ -4615,13 +4593,13 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_sext_i32_i16 s6, s0 -; GFX8-NEXT: s_sext_i32_i16 s7, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s7, -1 ; GFX8-NEXT: s_cmp_gt_i32 s6, s7 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: s_cselect_b32 s8, s6, s7 ; GFX8-NEXT: 
s_sub_i32 s8, s8, s4 ; GFX8-NEXT: s_cmp_lt_i32 s6, s7 -; GFX8-NEXT: s_mov_b32 s5, 0x8000 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: s_cselect_b32 s6, s6, s7 ; GFX8-NEXT: s_sub_i32 s6, s6, s5 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -4676,7 +4654,7 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX9-NEXT: s_cmp_lt_i32 s5, s7 ; GFX9-NEXT: s_cselect_b32 s5, s5, s7 ; GFX9-NEXT: s_cmp_lt_i32 s6, s4 -; GFX9-NEXT: s_mov_b32 s3, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s3, 0x8000 ; GFX9-NEXT: s_cselect_b32 s4, s6, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s3 @@ -4733,7 +4711,7 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX10-NEXT: s_cmp_lt_i32 s3, s5 ; GFX10-NEXT: s_cselect_b32 s3, s3, s5 ; GFX10-NEXT: s_cmp_lt_i32 s4, s2 -; GFX10-NEXT: s_mov_b32 s5, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s5, 0x8000 ; GFX10-NEXT: s_cselect_b32 s2, s4, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s4, s5, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s2 @@ -4782,7 +4760,7 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX6-NEXT: s_sub_i32 s4, s4, s2 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_mov_b32 s3, 0x80000000 +; GFX6-NEXT: s_brev_b32 s3, 1 ; GFX6-NEXT: s_cselect_b32 s5, s0, -1 ; GFX6-NEXT: s_sub_i32 s5, s5, s3 ; GFX6-NEXT: v_max_i32_e32 v0, s4, v0 @@ -4812,13 +4790,13 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_sext_i32_i16 s4, s0 -; GFX8-NEXT: s_sext_i32_i16 s5, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s5, -1 ; GFX8-NEXT: s_cmp_gt_i32 s4, s5 ; GFX8-NEXT: s_movk_i32 s2, 0x7fff ; GFX8-NEXT: s_cselect_b32 s6, s4, s5 ; GFX8-NEXT: s_sub_i32 s6, s6, s2 ; GFX8-NEXT: s_cmp_lt_i32 s4, s5 -; GFX8-NEXT: s_mov_b32 s3, 0x8000 +; GFX8-NEXT: s_movk_i32 s3, 0x8000 ; GFX8-NEXT: s_cselect_b32 s4, s4, s5 ; GFX8-NEXT: s_sub_i32 s4, s4, s3 ; GFX8-NEXT: v_max_i16_e32 v1, s6, v0 @@ -4860,7 +4838,7 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX9-NEXT: s_cmp_lt_i32 s4, s6 ; GFX9-NEXT: s_cselect_b32 s4, s4, s6 ; GFX9-NEXT: s_cmp_lt_i32 s5, s3 -; GFX9-NEXT: s_mov_b32 s2, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s2, 0x8000 ; GFX9-NEXT: s_cselect_b32 s3, s5, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s2 @@ -4897,7 +4875,7 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX10-NEXT: s_cmp_lt_i32 s2, s4 ; GFX10-NEXT: s_cselect_b32 s2, s2, s4 ; GFX10-NEXT: s_cmp_lt_i32 s3, s1 -; GFX10-NEXT: s_mov_b32 s4, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s4, 0x8000 ; GFX10-NEXT: s_cselect_b32 s1, s3, s1 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s4, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s2, s1 @@ -4924,7 +4902,7 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v2 -; GFX6-NEXT: s_mov_b32 s3, 0x80000000 +; GFX6-NEXT: s_brev_b32 s3, 1 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v3 @@ -4950,18 +4928,17 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX8-LABEL: ssubsat_v2i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: s_movk_i32 s2, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v2, s4, 
v0 +; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 ; GFX8-NEXT: v_subrev_u16_e32 v2, s2, v2 -; GFX8-NEXT: s_mov_b32 s3, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v3, s4, v0 +; GFX8-NEXT: s_movk_i32 s3, 0x8000 +; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: v_subrev_u16_e32 v3, s3, v3 ; GFX8-NEXT: v_max_i16_e32 v2, s0, v2 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v3 -; GFX8-NEXT: v_max_i16_e32 v3, s4, v1 -; GFX8-NEXT: v_min_i16_e32 v4, s4, v1 +; GFX8-NEXT: v_max_i16_e32 v3, -1, v1 +; GFX8-NEXT: v_min_i16_e32 v4, -1, v1 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: v_subrev_u16_e32 v3, s2, v3 ; GFX8-NEXT: v_subrev_u16_e32 v4, s3, v4 @@ -4976,7 +4953,7 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_movk_i32 s1, 0x7fff ; GFX9-NEXT: s_pack_ll_b32_b16 s3, -1, -1 -; GFX9-NEXT: s_mov_b32 s2, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s2, 0x8000 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s1 ; GFX9-NEXT: v_pk_max_i16 v1, v0, s3 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, s1 @@ -4995,7 +4972,7 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX10-NEXT: v_pk_max_i16 v1, v0, s1 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s2 ; GFX10-NEXT: v_pk_min_i16 v2, v0, s1 -; GFX10-NEXT: s_mov_b32 s3, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s3, 0x8000 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_pk_sub_i16 v1, v1, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s3 @@ -5029,7 +5006,7 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10 ; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 @@ -5056,7 +5033,7 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_mov_b32_e32 v11, 0x80000000 +; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 @@ -5082,30 +5059,29 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-LABEL: v_ssubsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v6, s6, v0 +; GFX8-NEXT: v_max_i16_e32 v6, -1, v0 ; GFX8-NEXT: v_subrev_u16_e32 v6, s4, v6 -; GFX8-NEXT: s_mov_b32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v7, s6, v0 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 +; GFX8-NEXT: v_min_i16_e32 v7, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: v_subrev_u16_e32 v7, s5, v7 ; GFX8-NEXT: v_max_i16_e32 v6, v6, v2 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v7 -; GFX8-NEXT: v_max_i16_e32 v7, s6, v4 -; GFX8-NEXT: v_min_i16_e32 v8, s6, v4 +; GFX8-NEXT: v_max_i16_e32 v7, -1, v4 +; GFX8-NEXT: v_min_i16_e32 v8, -1, v4 ; GFX8-NEXT: v_subrev_u16_e32 v7, s4, v7 ; GFX8-NEXT: v_max_i16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v7, s6, v1 +; GFX8-NEXT: v_max_i16_e32 v7, -1, v1 ; GFX8-NEXT: v_subrev_u16_e32 v8, s5, v8 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v8 ; GFX8-NEXT: v_subrev_u16_e32 v7, s4, v7 -; GFX8-NEXT: v_min_i16_e32 v8, s6, v1 +; GFX8-NEXT: 
v_min_i16_e32 v8, -1, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX8-NEXT: v_subrev_u16_e32 v8, s5, v8 ; GFX8-NEXT: v_max_i16_e32 v7, v7, v3 ; GFX8-NEXT: v_min_i16_e32 v7, v7, v8 -; GFX8-NEXT: v_max_i16_e32 v8, s6, v5 +; GFX8-NEXT: v_max_i16_e32 v8, -1, v5 ; GFX8-NEXT: v_min_i16_e32 v9, -1, v5 ; GFX8-NEXT: v_subrev_u16_e32 v8, s4, v8 ; GFX8-NEXT: v_subrev_u16_e32 v9, s5, v9 @@ -5124,7 +5100,7 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX9-NEXT: s_mov_b32 s5, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s5, 0x8000 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 ; GFX9-NEXT: v_pk_max_i16 v4, v0, s6 ; GFX9-NEXT: v_pk_sub_i16 v4, v4, s4 @@ -5156,7 +5132,7 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX10-NEXT: v_pk_min_i16 v7, v1, s5 ; GFX10-NEXT: v_pk_sub_i16 v4, v4, s4 ; GFX10-NEXT: v_pk_sub_i16 v5, v5, s4 -; GFX10-NEXT: s_mov_b32 s6, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s6, 0x8000 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 ; GFX10-NEXT: v_pk_max_i16 v11, v4, v2 @@ -5183,7 +5159,7 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX6-NEXT: s_cselect_b32 s10, s0, -1 ; GFX6-NEXT: s_sub_i32 s10, s10, s8 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s9, 0x80000000 +; GFX6-NEXT: s_brev_b32 s9, 1 ; GFX6-NEXT: s_cselect_b32 s11, s0, -1 ; GFX6-NEXT: s_sub_i32 s11, s11, s9 ; GFX6-NEXT: s_cmp_gt_i32 s10, s4 @@ -5252,13 +5228,13 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 ; GFX8-NEXT: s_sext_i32_i16 s10, s0 -; GFX8-NEXT: s_sext_i32_i16 s11, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s11, -1 ; GFX8-NEXT: s_cmp_gt_i32 s10, s11 ; GFX8-NEXT: s_movk_i32 s8, 0x7fff ; GFX8-NEXT: s_cselect_b32 s12, s10, s11 ; GFX8-NEXT: s_sub_i32 s12, s12, s8 ; GFX8-NEXT: s_cmp_lt_i32 s10, s11 -; GFX8-NEXT: s_mov_b32 s9, 0x8000 +; GFX8-NEXT: s_movk_i32 s9, 0x8000 ; GFX8-NEXT: s_cselect_b32 s10, s10, s11 ; GFX8-NEXT: s_sub_i32 s10, s10, s9 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 @@ -5349,7 +5325,7 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX9-NEXT: s_cmp_lt_i32 s7, s9 ; GFX9-NEXT: s_cselect_b32 s7, s7, s9 ; GFX9-NEXT: s_cmp_lt_i32 s8, s6 -; GFX9-NEXT: s_mov_b32 s5, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s5, 0x8000 ; GFX9-NEXT: s_cselect_b32 s8, s8, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 @@ -5438,7 +5414,7 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX10-NEXT: s_movk_i32 s10, 0x7fff ; GFX10-NEXT: s_cselect_b32 s8, s5, s7 ; GFX10-NEXT: s_cmp_gt_i32 s6, s4 -; GFX10-NEXT: s_mov_b32 s12, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s12, 0x8000 ; GFX10-NEXT: s_cselect_b32 s9, s6, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s12 ; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s9 @@ -5551,7 +5527,7 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v12, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s4, v12 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v14, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, s5, v14 ; GFX6-NEXT: v_max_i32_e32 v6, v12, v6 @@ -5578,7 +5554,7 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> 
%lhs, <6 x i16> %rhs) { ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_mov_b32_e32 v15, 0x80000000 +; GFX6-NEXT: v_bfrev_b32_e32 v15, 1 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 @@ -5628,58 +5604,56 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX8-LABEL: v_ssubsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v10, s6, v0 -; GFX8-NEXT: v_subrev_u16_e32 v10, s4, v10 -; GFX8-NEXT: s_mov_b32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v12, s6, v0 +; GFX8-NEXT: v_max_i16_e32 v9, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v9, s4, v9 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 +; GFX8-NEXT: v_min_i16_e32 v11, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v12, s5, v12 -; GFX8-NEXT: v_max_i16_e32 v10, v10, v3 -; GFX8-NEXT: v_min_i16_e32 v10, v10, v12 -; GFX8-NEXT: v_max_i16_e32 v12, s6, v6 -; GFX8-NEXT: v_min_i16_e32 v14, s6, v6 -; GFX8-NEXT: v_subrev_u16_e32 v12, s4, v12 -; GFX8-NEXT: v_max_i16_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v12, s6, v1 -; GFX8-NEXT: v_subrev_u16_e32 v14, s5, v14 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v14 -; GFX8-NEXT: v_subrev_u16_e32 v12, s4, v12 -; GFX8-NEXT: v_min_i16_e32 v14, s6, v1 +; GFX8-NEXT: v_subrev_u16_e32 v11, s5, v11 +; GFX8-NEXT: v_max_i16_e32 v9, v9, v3 +; GFX8-NEXT: v_min_i16_e32 v9, v9, v11 +; GFX8-NEXT: v_max_i16_e32 v11, -1, v6 +; GFX8-NEXT: v_min_i16_e32 v13, -1, v6 +; GFX8-NEXT: v_subrev_u16_e32 v11, s4, v11 +; GFX8-NEXT: v_max_i16_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v11, -1, v1 +; GFX8-NEXT: v_subrev_u16_e32 v13, s5, v13 +; GFX8-NEXT: v_min_i16_e32 v3, v3, v13 +; GFX8-NEXT: v_subrev_u16_e32 v11, s4, v11 +; GFX8-NEXT: v_min_i16_e32 v13, -1, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX8-NEXT: v_subrev_u16_e32 v13, s5, v13 +; GFX8-NEXT: v_max_i16_e32 v11, v11, v4 +; GFX8-NEXT: v_min_i16_e32 v11, v11, v13 +; GFX8-NEXT: v_max_i16_e32 v13, -1, v7 +; GFX8-NEXT: v_min_i16_e32 v14, -1, v7 +; GFX8-NEXT: v_subrev_u16_e32 v13, s4, v13 +; GFX8-NEXT: v_max_i16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_subrev_u16_e32 v14, s5, v14 -; GFX8-NEXT: v_max_i16_e32 v12, v12, v4 -; GFX8-NEXT: v_min_i16_e32 v12, v12, v14 -; GFX8-NEXT: v_max_i16_e32 v14, s6, v7 -; GFX8-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX8-NEXT: v_min_i16_e32 v15, v7, v9 -; GFX8-NEXT: v_subrev_u16_e32 v14, s4, v14 -; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_subrev_u16_e32 v15, s5, v15 -; GFX8-NEXT: v_mov_b32_e32 v11, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v14, v2, v9 -; GFX8-NEXT: v_sub_u16_e32 v14, v14, v11 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v15 -; GFX8-NEXT: v_mov_b32_e32 v13, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v15, v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v10, 0x7fff +; GFX8-NEXT: v_max_i16_e32 v13, -1, v2 +; GFX8-NEXT: v_sub_u16_e32 v13, v13, v10 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v14 +; GFX8-NEXT: v_mov_b32_e32 v12, 0xffff8000 +; GFX8-NEXT: v_min_i16_e32 v14, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: v_sub_u16_e32 v15, v15, v13 -; GFX8-NEXT: v_max_i16_e32 v14, v14, 
v5 -; GFX8-NEXT: v_min_i16_e32 v14, v14, v15 -; GFX8-NEXT: v_max_i16_e32 v15, v8, v9 -; GFX8-NEXT: v_min_i16_e32 v9, v8, v9 -; GFX8-NEXT: v_sub_u16_e32 v11, v15, v11 -; GFX8-NEXT: v_sub_u16_e32 v9, v9, v13 -; GFX8-NEXT: v_max_i16_sdwa v5, v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v10 +; GFX8-NEXT: v_sub_u16_e32 v14, v14, v12 +; GFX8-NEXT: v_max_i16_e32 v13, v13, v5 +; GFX8-NEXT: v_min_i16_e32 v13, v13, v14 +; GFX8-NEXT: v_max_i16_e32 v14, -1, v8 +; GFX8-NEXT: v_sub_u16_e32 v10, v14, v10 +; GFX8-NEXT: v_min_i16_e32 v14, -1, v8 +; GFX8-NEXT: v_sub_u16_e32 v12, v14, v12 +; GFX8-NEXT: v_max_i16_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e32 v0, v0, v9 ; GFX8-NEXT: v_sub_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_min_i16_e32 v5, v5, v9 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v12 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v12 +; GFX8-NEXT: v_sub_u16_e32 v1, v1, v11 ; GFX8-NEXT: v_sub_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_sub_u16_e32 v2, v2, v14 +; GFX8-NEXT: v_sub_u16_e32 v2, v2, v13 ; GFX8-NEXT: v_sub_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5689,7 +5663,7 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX9-NEXT: s_mov_b32 s5, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s5, 0x8000 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 ; GFX9-NEXT: v_pk_max_i16 v6, v0, s6 ; GFX9-NEXT: v_pk_sub_i16 v6, v6, s4 @@ -5731,7 +5705,7 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX10-NEXT: v_pk_sub_i16 v19, v9, s4 ; GFX10-NEXT: v_pk_min_i16 v10, v1, s5 ; GFX10-NEXT: v_pk_min_i16 v11, v2, s5 -; GFX10-NEXT: s_mov_b32 s6, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s6, 0x8000 ; GFX10-NEXT: v_pk_max_i16 v14, v6, v3 ; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 ; GFX10-NEXT: v_pk_max_i16 v4, v15, v4 @@ -5762,7 +5736,7 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX6-NEXT: s_cselect_b32 s14, s0, -1 ; GFX6-NEXT: s_sub_i32 s14, s14, s12 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s13, 0x80000000 +; GFX6-NEXT: s_brev_b32 s13, 1 ; GFX6-NEXT: s_cselect_b32 s15, s0, -1 ; GFX6-NEXT: s_sub_i32 s15, s15, s13 ; GFX6-NEXT: s_cmp_gt_i32 s14, s6 @@ -5865,13 +5839,13 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16 ; GFX8-NEXT: s_sext_i32_i16 s14, s0 -; GFX8-NEXT: s_sext_i32_i16 s15, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s15, -1 ; GFX8-NEXT: s_cmp_gt_i32 s14, s15 ; GFX8-NEXT: s_movk_i32 s12, 0x7fff ; GFX8-NEXT: s_cselect_b32 s16, s14, s15 ; GFX8-NEXT: s_sub_i32 s16, s16, s12 ; GFX8-NEXT: s_cmp_lt_i32 s14, s15 -; GFX8-NEXT: s_mov_b32 s13, 0x8000 +; GFX8-NEXT: s_movk_i32 s13, 0x8000 ; GFX8-NEXT: s_cselect_b32 s14, s14, s15 ; GFX8-NEXT: s_sub_i32 s14, s14, s13 ; GFX8-NEXT: s_sext_i32_i16 s16, s16 @@ -5998,7 +5972,7 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX9-NEXT: s_cmp_lt_i32 s9, s11 ; GFX9-NEXT: s_cselect_b32 s9, s9, s11 ; 
GFX9-NEXT: s_cmp_lt_i32 s10, s8 -; GFX9-NEXT: s_mov_b32 s7, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s7, 0x8000 ; GFX9-NEXT: s_cselect_b32 s10, s10, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s7 @@ -6130,7 +6104,7 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX10-NEXT: s_movk_i32 s12, 0x7fff ; GFX10-NEXT: s_cselect_b32 s10, s7, s9 ; GFX10-NEXT: s_cmp_gt_i32 s8, s6 -; GFX10-NEXT: s_mov_b32 s14, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s14, 0x8000 ; GFX10-NEXT: s_cselect_b32 s11, s8, s6 ; GFX10-NEXT: s_pack_ll_b32_b16 s14, s14, s14 ; GFX10-NEXT: s_pack_ll_b32_b16 s10, s10, s11 @@ -6275,7 +6249,7 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v16, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s4, v16 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v18, vcc, s5, v18 ; GFX6-NEXT: v_max_i32_e32 v8, v16, v8 @@ -6302,7 +6276,7 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GFX6-NEXT: v_mov_b32_e32 v19, 0x80000000 +; GFX6-NEXT: v_bfrev_b32_e32 v19, 1 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 @@ -6376,74 +6350,72 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX8-LABEL: v_ssubsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v13, s6, v0 -; GFX8-NEXT: v_subrev_u16_e32 v13, s4, v13 -; GFX8-NEXT: s_mov_b32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v15, s6, v0 +; GFX8-NEXT: v_max_i16_e32 v12, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v12, s4, v12 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 +; GFX8-NEXT: v_min_i16_e32 v14, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v15, s5, v15 -; GFX8-NEXT: v_max_i16_e32 v13, v13, v4 -; GFX8-NEXT: v_min_i16_e32 v13, v13, v15 -; GFX8-NEXT: v_max_i16_e32 v15, s6, v8 -; GFX8-NEXT: v_min_i16_e32 v17, s6, v8 -; GFX8-NEXT: v_subrev_u16_e32 v15, s4, v15 -; GFX8-NEXT: v_max_i16_sdwa v4, v15, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v15, s6, v1 -; GFX8-NEXT: v_subrev_u16_e32 v17, s5, v17 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v17 -; GFX8-NEXT: v_subrev_u16_e32 v15, s4, v15 -; GFX8-NEXT: v_min_i16_e32 v17, s6, v1 +; GFX8-NEXT: v_subrev_u16_e32 v14, s5, v14 +; GFX8-NEXT: v_max_i16_e32 v12, v12, v4 +; GFX8-NEXT: v_min_i16_e32 v12, v12, v14 +; GFX8-NEXT: v_max_i16_e32 v14, -1, v8 +; GFX8-NEXT: v_min_i16_e32 v16, -1, v8 +; GFX8-NEXT: v_subrev_u16_e32 v14, s4, v14 +; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v14, -1, v1 +; GFX8-NEXT: v_subrev_u16_e32 v16, s5, v16 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v16 +; GFX8-NEXT: v_subrev_u16_e32 v14, s4, v14 +; GFX8-NEXT: v_min_i16_e32 v16, -1, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX8-NEXT: v_subrev_u16_e32 v16, s5, v16 +; GFX8-NEXT: v_max_i16_e32 v14, v14, v5 +; GFX8-NEXT: v_min_i16_e32 v14, v14, v16 +; GFX8-NEXT: v_max_i16_e32 v16, -1, v9 +; GFX8-NEXT: v_min_i16_e32 v17, -1, v9 +; GFX8-NEXT: v_subrev_u16_e32 v16, s4, v16 +; 
GFX8-NEXT: v_max_i16_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_subrev_u16_e32 v17, s5, v17 -; GFX8-NEXT: v_max_i16_e32 v15, v15, v5 -; GFX8-NEXT: v_min_i16_e32 v15, v15, v17 -; GFX8-NEXT: v_max_i16_e32 v17, s6, v9 -; GFX8-NEXT: v_mov_b32_e32 v12, 0xffff -; GFX8-NEXT: v_min_i16_e32 v18, v9, v12 -; GFX8-NEXT: v_subrev_u16_e32 v17, s4, v17 -; GFX8-NEXT: v_max_i16_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_subrev_u16_e32 v18, s5, v18 -; GFX8-NEXT: v_mov_b32_e32 v14, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v17, v2, v12 -; GFX8-NEXT: v_sub_u16_e32 v17, v17, v14 -; GFX8-NEXT: v_min_i16_e32 v5, v5, v18 -; GFX8-NEXT: v_mov_b32_e32 v16, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v18, v2, v12 +; GFX8-NEXT: v_mov_b32_e32 v13, 0x7fff +; GFX8-NEXT: v_max_i16_e32 v16, -1, v2 +; GFX8-NEXT: v_sub_u16_e32 v16, v16, v13 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v17 +; GFX8-NEXT: v_mov_b32_e32 v15, 0xffff8000 +; GFX8-NEXT: v_min_i16_e32 v17, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_sub_u16_e32 v18, v18, v16 -; GFX8-NEXT: v_max_i16_e32 v17, v17, v6 -; GFX8-NEXT: v_min_i16_e32 v17, v17, v18 -; GFX8-NEXT: v_max_i16_e32 v18, v10, v12 -; GFX8-NEXT: v_min_i16_e32 v19, v10, v12 -; GFX8-NEXT: v_sub_u16_e32 v18, v18, v14 -; GFX8-NEXT: v_max_i16_sdwa v6, v18, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v18, v3, v12 -; GFX8-NEXT: v_sub_u16_e32 v19, v19, v16 -; GFX8-NEXT: v_sub_u16_e32 v18, v18, v14 -; GFX8-NEXT: v_min_i16_e32 v6, v6, v19 -; GFX8-NEXT: v_min_i16_e32 v19, v3, v12 +; GFX8-NEXT: v_sub_u16_e32 v17, v17, v15 +; GFX8-NEXT: v_max_i16_e32 v16, v16, v6 +; GFX8-NEXT: v_min_i16_e32 v16, v16, v17 +; GFX8-NEXT: v_max_i16_e32 v17, -1, v10 +; GFX8-NEXT: v_min_i16_e32 v18, -1, v10 +; GFX8-NEXT: v_sub_u16_e32 v17, v17, v13 +; GFX8-NEXT: v_max_i16_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v17, -1, v3 +; GFX8-NEXT: v_sub_u16_e32 v18, v18, v15 +; GFX8-NEXT: v_sub_u16_e32 v17, v17, v13 +; GFX8-NEXT: v_min_i16_e32 v6, v6, v18 +; GFX8-NEXT: v_min_i16_e32 v18, -1, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX8-NEXT: v_sub_u16_e32 v19, v19, v16 -; GFX8-NEXT: v_max_i16_e32 v18, v18, v7 -; GFX8-NEXT: v_min_i16_e32 v18, v18, v19 -; GFX8-NEXT: v_max_i16_e32 v19, v11, v12 -; GFX8-NEXT: v_min_i16_e32 v12, v11, v12 -; GFX8-NEXT: v_sub_u16_e32 v14, v19, v14 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v13 +; GFX8-NEXT: v_sub_u16_e32 v18, v18, v15 +; GFX8-NEXT: v_max_i16_e32 v17, v17, v7 +; GFX8-NEXT: v_min_i16_e32 v17, v17, v18 +; GFX8-NEXT: v_max_i16_e32 v18, -1, v11 +; GFX8-NEXT: v_sub_u16_e32 v13, v18, v13 +; GFX8-NEXT: v_min_i16_e32 v18, -1, v11 +; GFX8-NEXT: v_sub_u16_e32 v0, v0, v12 ; GFX8-NEXT: v_sub_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_sub_u16_e32 v12, v12, v16 -; GFX8-NEXT: v_max_i16_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v15 +; GFX8-NEXT: v_sub_u16_e32 v15, v18, v15 +; GFX8-NEXT: v_max_i16_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e32 v1, v1, v14 ; GFX8-NEXT: v_sub_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_min_i16_e32 v7, v7, v12 -; GFX8-NEXT: 
v_sub_u16_e32 v2, v2, v17 +; GFX8-NEXT: v_min_i16_e32 v7, v7, v15 +; GFX8-NEXT: v_sub_u16_e32 v2, v2, v16 ; GFX8-NEXT: v_sub_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_u16_e32 v3, v3, v18 +; GFX8-NEXT: v_sub_u16_e32 v3, v3, v17 ; GFX8-NEXT: v_sub_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -6453,7 +6425,7 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX9-NEXT: s_mov_b32 s5, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s5, 0x8000 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 ; GFX9-NEXT: v_pk_max_i16 v8, v0, s6 ; GFX9-NEXT: v_pk_sub_i16 v8, v8, s4 @@ -6505,7 +6477,7 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX10-NEXT: v_pk_min_i16 v13, v2, s4 ; GFX10-NEXT: v_pk_sub_i16 v8, v8, s5 ; GFX10-NEXT: v_pk_min_i16 v14, v3, s4 -; GFX10-NEXT: s_mov_b32 s6, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s6, 0x8000 ; GFX10-NEXT: v_pk_max_i16 v4, v15, v4 ; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 ; GFX10-NEXT: v_pk_max_i16 v5, v10, v5 @@ -6540,7 +6512,7 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX6-NEXT: s_cselect_b32 s18, s0, -1 ; GFX6-NEXT: s_sub_i32 s18, s18, s16 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s17, 0x80000000 +; GFX6-NEXT: s_brev_b32 s17, 1 ; GFX6-NEXT: s_cselect_b32 s19, s0, -1 ; GFX6-NEXT: s_sub_i32 s19, s19, s17 ; GFX6-NEXT: s_cmp_gt_i32 s18, s8 @@ -6677,13 +6649,13 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 ; GFX8-NEXT: s_sext_i32_i16 s18, s0 -; GFX8-NEXT: s_sext_i32_i16 s19, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s19, -1 ; GFX8-NEXT: s_cmp_gt_i32 s18, s19 ; GFX8-NEXT: s_movk_i32 s16, 0x7fff ; GFX8-NEXT: s_cselect_b32 s20, s18, s19 ; GFX8-NEXT: s_sub_i32 s20, s20, s16 ; GFX8-NEXT: s_cmp_lt_i32 s18, s19 -; GFX8-NEXT: s_mov_b32 s17, 0x8000 +; GFX8-NEXT: s_movk_i32 s17, 0x8000 ; GFX8-NEXT: s_cselect_b32 s18, s18, s19 ; GFX8-NEXT: s_sub_i32 s18, s18, s17 ; GFX8-NEXT: s_sext_i32_i16 s20, s20 @@ -6846,7 +6818,7 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX9-NEXT: s_cmp_lt_i32 s11, s13 ; GFX9-NEXT: s_cselect_b32 s11, s11, s13 ; GFX9-NEXT: s_cmp_lt_i32 s12, s10 -; GFX9-NEXT: s_mov_b32 s9, 0xffff8000 +; GFX9-NEXT: s_movk_i32 s9, 0x8000 ; GFX9-NEXT: s_cselect_b32 s12, s12, s10 ; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 ; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s9 @@ -7021,7 +6993,7 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX10-NEXT: s_movk_i32 s14, 0x7fff ; GFX10-NEXT: s_cselect_b32 s12, s9, s11 ; GFX10-NEXT: s_cmp_gt_i32 s10, s8 -; GFX10-NEXT: s_mov_b32 s16, 0xffff8000 +; GFX10-NEXT: s_movk_i32 s16, 0x8000 ; GFX10-NEXT: s_cselect_b32 s13, s10, s8 ; GFX10-NEXT: s_pack_ll_b32_b16 s16, s16, s16 ; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s13 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index 413bc71038470..5570309a5be7d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -21,7 +21,7 @@ define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) { ; 
GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff, v0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 ; GFX8-NEXT: v_min_u16_e32 v1, v2, v1 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0 @@ -32,7 +32,7 @@ define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, 0xffff, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v0 ; GFX9-NEXT: v_min_u16_e32 v1, v2, v1 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 @@ -45,7 +45,7 @@ define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) { ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 9, v0 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 9, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v2, 0xffff, v0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v0 ; GFX10-NEXT: v_min_u16_e64 v1, v2, v1 ; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 ; GFX10-NEXT: v_lshrrev_b16_e64 v0, 9, v0 @@ -71,7 +71,7 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX8-NEXT: s_bfe_u32 s2, 9, 0x100000 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 -; GFX8-NEXT: s_xor_b32 s3, s0, 0xffff +; GFX8-NEXT: s_xor_b32 s3, s0, -1 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_cmp_lt_u32 s3, s1 @@ -86,7 +86,7 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_xor_b32 s3, s0, 0xffff +; GFX9-NEXT: s_xor_b32 s3, s0, -1 ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_cmp_lt_u32 s3, s1 @@ -102,7 +102,7 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_xor_b32 s3, s0, 0xffff +; GFX10-NEXT: s_xor_b32 s3, s0, -1 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX10-NEXT: s_cmp_lt_u32 s3, s1 @@ -132,7 +132,7 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff, v0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 ; GFX8-NEXT: v_min_u16_e32 v1, v2, v1 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 @@ -143,7 +143,7 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, 0xffff, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v0 ; GFX9-NEXT: v_min_u16_e32 v1, v2, v1 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 @@ -156,7 +156,7 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) { ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v2, 0xffff, v0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v0 ; GFX10-NEXT: v_min_u16_e64 v1, v2, v1 ; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 ; GFX10-NEXT: v_lshrrev_b16_e64 v0, 8, v0 @@ -182,7 +182,7 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX8-NEXT: 
s_bfe_u32 s2, 8, 0x100000 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 -; GFX8-NEXT: s_xor_b32 s3, s0, 0xffff +; GFX8-NEXT: s_xor_b32 s3, s0, -1 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_cmp_lt_u32 s3, s1 @@ -197,7 +197,7 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_xor_b32 s3, s0, 0xffff +; GFX9-NEXT: s_xor_b32 s3, s0, -1 ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_cmp_lt_u32 s3, s1 @@ -213,7 +213,7 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_xor_b32 s3, s0, 0xffff +; GFX10-NEXT: s_xor_b32 s3, s0, -1 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX10-NEXT: s_cmp_lt_u32 s3, s1 @@ -257,13 +257,12 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v0 ; GFX8-NEXT: v_min_u16_e32 v1, v4, v1 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v3 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v3 ; GFX8-NEXT: v_min_u16_e32 v1, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xff ; GFX8-NEXT: v_add_u16_e32 v1, v3, v1 @@ -277,14 +276,13 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s4, 8 ; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v0 ; GFX9-NEXT: v_min_u16_e32 v1, v4, v1 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_xor_b32_e32 v1, s4, v2 +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v2 ; GFX9-NEXT: v_min_u16_e32 v1, v1, v3 ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_add_u16_e32 v1, v2, v1 @@ -300,12 +298,11 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: s_mov_b32 s4, 8 ; GFX10-NEXT: v_lshlrev_b16_e64 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_mov_b32 s5, 0xffff ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1 -; GFX10-NEXT: v_xor_b32_e32 v4, s5, v2 -; GFX10-NEXT: v_xor_b32_e32 v5, s5, v0 ; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v2 +; GFX10-NEXT: v_xor_b32_e32 v5, -1, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_min_u16_e64 v1, v4, v1 ; GFX10-NEXT: v_min_u16_e64 v3, v5, 
v3 @@ -353,19 +350,18 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_mov_b32 s5, 0xffff -; GFX8-NEXT: s_xor_b32 s6, s0, s5 +; GFX8-NEXT: s_xor_b32 s5, s0, -1 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s6, s1 -; GFX8-NEXT: s_cselect_b32 s1, s6, s1 +; GFX8-NEXT: s_cmp_lt_u32 s5, s1 +; GFX8-NEXT: s_cselect_b32 s1, s5, s1 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s4 ; GFX8-NEXT: s_lshl_b32 s2, s3, s4 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_xor_b32 s3, s1, s5 +; GFX8-NEXT: s_xor_b32 s3, s1, -1 ; GFX8-NEXT: s_lshr_b32 s0, s0, s4 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 @@ -386,19 +382,18 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-NEXT: s_bfe_u32 s4, 8, 0x100000 ; GFX9-NEXT: s_lshr_b32 s2, s0, 8 ; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_mov_b32 s5, 0xffff -; GFX9-NEXT: s_xor_b32 s6, s0, s5 +; GFX9-NEXT: s_xor_b32 s5, s0, -1 ; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s6, s1 -; GFX9-NEXT: s_cselect_b32 s1, s6, s1 +; GFX9-NEXT: s_cmp_lt_u32 s5, s1 +; GFX9-NEXT: s_cselect_b32 s1, s5, s1 ; GFX9-NEXT: s_add_i32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s2, s4 ; GFX9-NEXT: s_lshl_b32 s2, s3, s4 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_xor_b32 s3, s1, s5 +; GFX9-NEXT: s_xor_b32 s3, s1, -1 ; GFX9-NEXT: s_lshr_b32 s0, s0, s4 ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 @@ -417,20 +412,19 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX10-LABEL: s_uaddsat_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX10-NEXT: s_mov_b32 s4, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_lshl_b32 s3, s0, s2 -; GFX10-NEXT: s_lshl_b32 s6, s1, s2 -; GFX10-NEXT: s_xor_b32 s5, s3, s4 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX10-NEXT: s_lshl_b32 s5, s1, s2 +; GFX10-NEXT: s_xor_b32 s4, s3, -1 ; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX10-NEXT: s_lshr_b32 s0, s0, 8 ; GFX10-NEXT: s_lshr_b32 s1, s1, 8 -; GFX10-NEXT: s_cmp_lt_u32 s5, s6 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s5, s5, s6 +; GFX10-NEXT: s_cmp_lt_u32 s4, s5 +; GFX10-NEXT: s_cselect_b32 s4, s4, s5 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_add_i32 s3, s3, s5 -; GFX10-NEXT: s_xor_b32 s4, s0, s4 +; GFX10-NEXT: s_add_i32 s3, s3, s4 +; GFX10-NEXT: s_xor_b32 s4, s0, -1 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 @@ -509,25 +503,24 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: 
v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_xor_b32_e32 v8, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v8, -1, v0 ; GFX8-NEXT: v_min_u16_e32 v1, v8, v1 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v3 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v3 ; GFX8-NEXT: v_min_u16_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4 ; GFX8-NEXT: v_add_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v2 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v2 ; GFX8-NEXT: v_min_u16_e32 v3, v4, v3 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7 -; GFX8-NEXT: v_xor_b32_e32 v5, s4, v3 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v3 ; GFX8-NEXT: v_min_u16_e32 v4, v5, v4 ; GFX8-NEXT: v_add_u16_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff @@ -546,29 +539,28 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s4, 8 ; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_xor_b32_e32 v8, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v8, -1, v0 ; GFX9-NEXT: v_min_u16_e32 v1, v8, v1 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_xor_b32_e32 v1, s4, v2 +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v2 ; GFX9-NEXT: v_min_u16_e32 v1, v1, v5 ; GFX9-NEXT: v_add_u16_e32 v1, v2, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX9-NEXT: v_xor_b32_e32 v5, s4, v2 +; GFX9-NEXT: v_xor_b32_e32 v5, -1, v2 ; GFX9-NEXT: v_min_u16_e32 v3, v5, v3 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v4 -; GFX9-NEXT: v_xor_b32_e32 v5, s4, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v7 ; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v7 +; GFX9-NEXT: v_xor_b32_e32 v5, -1, v3 ; GFX9-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX9-NEXT: v_min_u16_e32 v4, v5, v4 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 @@ -585,30 +577,29 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: s_mov_b32 s7, 0xffff +; GFX10-NEXT: v_lshlrev_b16_e64 v4, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b16_e64 v4, 8, v0 ; GFX10-NEXT: s_mov_b32 s5, 16 ; GFX10-NEXT: s_mov_b32 s6, 24 -; GFX10-NEXT: v_xor_b32_e32 v5, s7, v2 ; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_xor_b32_e32 v5, -1, v2 ; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s6, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b16_e64 v7, 8, v1 -; GFX10-NEXT: 
v_xor_b32_e32 v8, s7, v4 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4 +; GFX10-NEXT: v_xor_b32_e32 v11, -1, v6 ; GFX10-NEXT: v_min_u16_e64 v3, v5, v3 ; GFX10-NEXT: v_lshrrev_b32_sdwa v5, s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_xor_b32_e32 v11, s7, v6 ; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s6, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_min_u16_e64 v7, v8, v7 -; GFX10-NEXT: v_add_nc_u16_e64 v2, v2, v3 -; GFX10-NEXT: v_xor_b32_e32 v3, s7, v0 ; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_add_nc_u16_e64 v2, v2, v3 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v0 ; GFX10-NEXT: v_min_u16_e64 v5, v11, v5 ; GFX10-NEXT: v_add_nc_u16_e64 v4, v4, v7 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_and_b32_sdwa v2, v2, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX10-NEXT: v_min_u16_e64 v1, v3, v1 -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_nc_u16_e64 v3, v6, v5 ; GFX10-NEXT: v_lshrrev_b16_e64 v4, 8, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 @@ -682,21 +673,20 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 ; GFX8-NEXT: s_lshl_b32 s0, s0, s8 -; GFX8-NEXT: s_mov_b32 s9, 0xffff -; GFX8-NEXT: s_xor_b32 s10, s0, s9 +; GFX8-NEXT: s_xor_b32 s9, s0, -1 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, s8 -; GFX8-NEXT: s_bfe_u32 s10, s10, 0x100000 +; GFX8-NEXT: s_bfe_u32 s9, s9, 0x100000 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s10, s1 -; GFX8-NEXT: s_cselect_b32 s1, s10, s1 +; GFX8-NEXT: s_cmp_lt_u32 s9, s1 +; GFX8-NEXT: s_cselect_b32 s1, s9, s1 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s8 ; GFX8-NEXT: s_lshl_b32 s2, s5, s8 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_xor_b32 s5, s1, s9 +; GFX8-NEXT: s_xor_b32 s5, s1, -1 ; GFX8-NEXT: s_lshr_b32 s0, s0, s8 ; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 @@ -706,7 +696,7 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_lshl_b32 s2, s3, s8 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_lshl_b32 s3, s6, s8 -; GFX8-NEXT: s_xor_b32 s5, s2, s9 +; GFX8-NEXT: s_xor_b32 s5, s2, -1 ; GFX8-NEXT: s_lshr_b32 s1, s1, s8 ; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 @@ -716,7 +706,7 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_lshl_b32 s3, s4, s8 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_lshl_b32 s4, s7, s8 -; GFX8-NEXT: s_xor_b32 s5, s3, s9 +; GFX8-NEXT: s_xor_b32 s5, s3, -1 ; GFX8-NEXT: s_lshr_b32 s2, s2, s8 ; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 @@ -745,21 +735,20 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_lshr_b32 s4, s0, 24 ; GFX9-NEXT: s_lshl_b32 s0, s0, s8 -; GFX9-NEXT: s_mov_b32 s9, 0xffff -; GFX9-NEXT: s_xor_b32 s10, s0, s9 +; GFX9-NEXT: s_xor_b32 s9, s0, -1 ; GFX9-NEXT: s_lshr_b32 s5, s1, 8 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, s8 -; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 +; GFX9-NEXT: s_bfe_u32 s9, s9, 0x100000 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s10, s1 -; GFX9-NEXT: 
s_cselect_b32 s1, s10, s1 +; GFX9-NEXT: s_cmp_lt_u32 s9, s1 +; GFX9-NEXT: s_cselect_b32 s1, s9, s1 ; GFX9-NEXT: s_add_i32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s2, s8 ; GFX9-NEXT: s_lshl_b32 s2, s5, s8 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_xor_b32 s5, s1, s9 +; GFX9-NEXT: s_xor_b32 s5, s1, -1 ; GFX9-NEXT: s_lshr_b32 s0, s0, s8 ; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 @@ -769,7 +758,7 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX9-NEXT: s_lshl_b32 s2, s3, s8 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_lshl_b32 s3, s6, s8 -; GFX9-NEXT: s_xor_b32 s5, s2, s9 +; GFX9-NEXT: s_xor_b32 s5, s2, -1 ; GFX9-NEXT: s_lshr_b32 s1, s1, s8 ; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 @@ -779,7 +768,7 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX9-NEXT: s_lshl_b32 s3, s4, s8 ; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX9-NEXT: s_lshl_b32 s4, s7, s8 -; GFX9-NEXT: s_xor_b32 s5, s3, s9 +; GFX9-NEXT: s_xor_b32 s5, s3, -1 ; GFX9-NEXT: s_lshr_b32 s2, s2, s8 ; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 @@ -808,40 +797,39 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_lshl_b32 s0, s0, s5 -; GFX10-NEXT: s_mov_b32 s7, 0xffff -; GFX10-NEXT: s_lshl_b32 s10, s1, s5 -; GFX10-NEXT: s_xor_b32 s9, s0, s7 -; GFX10-NEXT: s_bfe_u32 s10, s10, 0x100000 +; GFX10-NEXT: s_lshl_b32 s9, s1, s5 +; GFX10-NEXT: s_xor_b32 s8, s0, -1 ; GFX10-NEXT: s_bfe_u32 s9, s9, 0x100000 +; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: s_lshr_b32 s8, s1, 16 +; GFX10-NEXT: s_lshr_b32 s7, s1, 16 ; GFX10-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-NEXT: s_cmp_lt_u32 s9, s10 +; GFX10-NEXT: s_cmp_lt_u32 s8, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s9, s9, s10 +; GFX10-NEXT: s_cselect_b32 s8, s8, s9 ; GFX10-NEXT: s_lshl_b32 s2, s2, s5 -; GFX10-NEXT: s_add_i32 s0, s0, s9 -; GFX10-NEXT: s_xor_b32 s9, s2, s7 +; GFX10-NEXT: s_add_i32 s0, s0, s8 +; GFX10-NEXT: s_xor_b32 s8, s2, -1 ; GFX10-NEXT: s_lshl_b32 s6, s6, s5 ; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_bfe_u32 s9, s9, 0x100000 +; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX10-NEXT: s_lshr_b32 s0, s0, s5 -; GFX10-NEXT: s_cmp_lt_u32 s9, s6 -; GFX10-NEXT: s_cselect_b32 s6, s9, s6 +; GFX10-NEXT: s_cmp_lt_u32 s8, s6 +; GFX10-NEXT: s_cselect_b32 s6, s8, s6 ; GFX10-NEXT: s_lshl_b32 s3, s3, s5 ; GFX10-NEXT: s_add_i32 s2, s2, s6 -; GFX10-NEXT: s_xor_b32 s6, s3, s7 -; GFX10-NEXT: s_lshl_b32 s8, s8, s5 +; GFX10-NEXT: s_xor_b32 s6, s3, -1 +; GFX10-NEXT: s_lshl_b32 s7, s7, s5 ; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX10-NEXT: s_lshr_b32 s2, s2, s5 -; GFX10-NEXT: s_cmp_lt_u32 s6, s8 -; GFX10-NEXT: s_cselect_b32 s6, s6, s8 +; GFX10-NEXT: s_cmp_lt_u32 s6, s7 +; GFX10-NEXT: s_cselect_b32 s6, s6, s7 ; GFX10-NEXT: s_lshl_b32 s4, s4, s5 ; GFX10-NEXT: s_add_i32 s3, s3, s6 -; GFX10-NEXT: s_xor_b32 s6, s4, s7 +; GFX10-NEXT: s_xor_b32 s6, s4, -1 ; GFX10-NEXT: s_lshl_b32 s1, s1, s5 ; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 @@ -2203,7 +2191,7 @@ define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) { ; GFX8-LABEL: 
v_uaddsat_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff, v0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 ; GFX8-NEXT: v_min_u16_e32 v1, v2, v1 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2211,7 +2199,7 @@ define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) { ; GFX9-LABEL: v_uaddsat_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v2, 0xffff, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v0 ; GFX9-NEXT: v_min_u16_e32 v1, v2, v1 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2220,7 +2208,7 @@ define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v2, 0xffff, v0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_min_u16_e64 v1, v2, v1 ; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 @@ -2243,7 +2231,7 @@ define amdgpu_ps i16 @s_uaddsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; ; GFX8-LABEL: s_uaddsat_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_xor_b32 s2, s0, 0xffff +; GFX8-NEXT: s_xor_b32 s2, s0, -1 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_cmp_lt_u32 s2, s1 @@ -2253,7 +2241,7 @@ define amdgpu_ps i16 @s_uaddsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; ; GFX9-LABEL: s_uaddsat_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_xor_b32 s2, s0, 0xffff +; GFX9-NEXT: s_xor_b32 s2, s0, -1 ; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_cmp_lt_u32 s2, s1 @@ -2263,7 +2251,7 @@ define amdgpu_ps i16 @s_uaddsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; ; GFX10-LABEL: s_uaddsat_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_xor_b32 s2, s0, 0xffff +; GFX10-NEXT: s_xor_b32 s2, s0, -1 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX10-NEXT: ; implicit-def: $vcc_hi @@ -2288,21 +2276,21 @@ define amdgpu_ps half @uaddsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; ; GFX8-LABEL: uaddsat_i16_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_xor_b32 s1, s0, 0xffff +; GFX8-NEXT: s_xor_b32 s1, s0, -1 ; GFX8-NEXT: v_min_u16_e32 v0, s1, v0 ; GFX8-NEXT: v_add_u16_e32 v0, s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_xor_b32 s1, s0, 0xffff +; GFX9-NEXT: s_xor_b32 s1, s0, -1 ; GFX9-NEXT: v_min_u16_e32 v0, s1, v0 ; GFX9-NEXT: v_add_u16_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: uaddsat_i16_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_xor_b32 s1, s0, 0xffff +; GFX10-NEXT: s_xor_b32 s1, s0, -1 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_min_u16_e64 v0, s1, v0 ; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, v0 @@ -2325,21 +2313,21 @@ define amdgpu_ps half @uaddsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; ; GFX8-LABEL: uaddsat_i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX8-NEXT: v_min_u16_e32 v1, s0, v1 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_xor_b32_e32 v1, 0xffff, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX9-NEXT: v_min_u16_e32 v1, s0, v1 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: uaddsat_i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor_b32_e32 v1, 0xffff, v0 +; GFX10-NEXT: v_xor_b32_e32 
v1, -1, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_min_u16_e64 v1, v1, s0 ; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 @@ -2370,10 +2358,9 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-LABEL: v_uaddsat_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v3, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v0 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v2 ; GFX8-NEXT: v_min_u16_e32 v3, v3, v1 ; GFX8-NEXT: v_min_u16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v3 @@ -2430,16 +2417,15 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; ; GFX8-LABEL: s_uaddsat_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: s_xor_b32 s5, s0, s4 +; GFX8-NEXT: s_xor_b32 s4, s0, -1 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s1 -; GFX8-NEXT: s_cselect_b32 s1, s5, s1 +; GFX8-NEXT: s_cmp_lt_u32 s4, s1 +; GFX8-NEXT: s_cselect_b32 s1, s4, s1 ; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_xor_b32 s1, s2, s4 +; GFX8-NEXT: s_xor_b32 s1, s2, -1 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_cmp_lt_u32 s1, s3 @@ -2522,12 +2508,11 @@ define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; ; GFX8-LABEL: uaddsat_v2i16_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_xor_b32 s2, s0, -1 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_xor_b32 s3, s0, s2 -; GFX8-NEXT: s_xor_b32 s2, s1, s2 +; GFX8-NEXT: v_min_u16_e32 v1, s2, v0 +; GFX8-NEXT: s_xor_b32 s2, s1, -1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_min_u16_e32 v1, s3, v0 ; GFX8-NEXT: v_min_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_add_u16_e32 v1, s0, v1 @@ -2580,11 +2565,10 @@ define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX8-LABEL: uaddsat_v2i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: v_xor_b32_e32 v3, s2, v1 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v1 ; GFX8-NEXT: v_min_u16_e32 v2, s0, v2 ; GFX8-NEXT: v_min_u16_e32 v3, s1, v3 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v2 @@ -2666,15 +2650,14 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-LABEL: v_uaddsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v6, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v6, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_xor_b32_e32 v7, s4, v4 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v4 ; GFX8-NEXT: v_min_u16_e32 v6, v6, v2 ; GFX8-NEXT: v_min_u16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v7, s4, v1 -; GFX8-NEXT: v_xor_b32_e32 v8, s4, v5 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v1 +; GFX8-NEXT: v_xor_b32_e32 v8, -1, v5 ; GFX8-NEXT: v_min_u16_e32 v7, v7, v3 
; GFX8-NEXT: v_min_u16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v6 @@ -2759,30 +2742,29 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; ; GFX8-LABEL: s_uaddsat_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s8, 0xffff -; GFX8-NEXT: s_xor_b32 s9, s0, s8 +; GFX8-NEXT: s_xor_b32 s8, s0, -1 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s3, 16 -; GFX8-NEXT: s_bfe_u32 s9, s9, 0x100000 +; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s9, s2 -; GFX8-NEXT: s_cselect_b32 s2, s9, s2 +; GFX8-NEXT: s_cmp_lt_u32 s8, s2 +; GFX8-NEXT: s_cselect_b32 s2, s8, s2 ; GFX8-NEXT: s_add_i32 s0, s0, s2 -; GFX8-NEXT: s_xor_b32 s2, s4, s8 +; GFX8-NEXT: s_xor_b32 s2, s4, -1 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX8-NEXT: s_cmp_lt_u32 s2, s6 ; GFX8-NEXT: s_cselect_b32 s2, s2, s6 ; GFX8-NEXT: s_add_i32 s4, s4, s2 -; GFX8-NEXT: s_xor_b32 s2, s1, s8 +; GFX8-NEXT: s_xor_b32 s2, s1, -1 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_cmp_lt_u32 s2, s3 ; GFX8-NEXT: s_cselect_b32 s2, s2, s3 ; GFX8-NEXT: s_add_i32 s1, s1, s2 -; GFX8-NEXT: s_xor_b32 s2, s5, s8 +; GFX8-NEXT: s_xor_b32 s2, s5, -1 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_bfe_u32 s3, s7, 0x100000 ; GFX8-NEXT: s_cmp_lt_u32 s2, s3 @@ -2944,20 +2926,19 @@ define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX8-LABEL: v_uaddsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v9, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v9, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX8-NEXT: v_xor_b32_e32 v10, s4, v6 +; GFX8-NEXT: v_xor_b32_e32 v10, -1, v6 ; GFX8-NEXT: v_min_u16_e32 v9, v9, v3 ; GFX8-NEXT: v_min_u16_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v10, s4, v1 +; GFX8-NEXT: v_xor_b32_e32 v10, -1, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: v_xor_b32_e32 v11, s4, v7 +; GFX8-NEXT: v_xor_b32_e32 v11, -1, v7 ; GFX8-NEXT: v_min_u16_e32 v10, v10, v4 ; GFX8-NEXT: v_min_u16_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v11, s4, v2 -; GFX8-NEXT: v_xor_b32_e32 v12, s4, v8 +; GFX8-NEXT: v_xor_b32_e32 v11, -1, v2 +; GFX8-NEXT: v_xor_b32_e32 v12, -1, v8 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v9 ; GFX8-NEXT: v_add_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_u16_e32 v11, v11, v5 @@ -3069,44 +3050,43 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; ; GFX8-LABEL: s_uaddsat_v6i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s12, 0xffff -; GFX8-NEXT: s_xor_b32 s13, s0, s12 +; GFX8-NEXT: s_xor_b32 s12, s0, -1 ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16 ; GFX8-NEXT: s_lshr_b32 s10, s4, 16 ; GFX8-NEXT: s_lshr_b32 s11, s5, 16 -; GFX8-NEXT: s_bfe_u32 s13, s13, 0x100000 +; GFX8-NEXT: s_bfe_u32 s12, s12, 0x100000 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s13, s3 -; GFX8-NEXT: s_cselect_b32 s3, s13, s3 +; GFX8-NEXT: s_cmp_lt_u32 
s12, s3 +; GFX8-NEXT: s_cselect_b32 s3, s12, s3 ; GFX8-NEXT: s_add_i32 s0, s0, s3 -; GFX8-NEXT: s_xor_b32 s3, s6, s12 +; GFX8-NEXT: s_xor_b32 s3, s6, -1 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_bfe_u32 s9, s9, 0x100000 ; GFX8-NEXT: s_cmp_lt_u32 s3, s9 ; GFX8-NEXT: s_cselect_b32 s3, s3, s9 ; GFX8-NEXT: s_add_i32 s6, s6, s3 -; GFX8-NEXT: s_xor_b32 s3, s1, s12 +; GFX8-NEXT: s_xor_b32 s3, s1, -1 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_cmp_lt_u32 s3, s4 ; GFX8-NEXT: s_cselect_b32 s3, s3, s4 ; GFX8-NEXT: s_add_i32 s1, s1, s3 -; GFX8-NEXT: s_xor_b32 s3, s7, s12 +; GFX8-NEXT: s_xor_b32 s3, s7, -1 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_bfe_u32 s4, s10, 0x100000 ; GFX8-NEXT: s_cmp_lt_u32 s3, s4 ; GFX8-NEXT: s_cselect_b32 s3, s3, s4 ; GFX8-NEXT: s_add_i32 s7, s7, s3 -; GFX8-NEXT: s_xor_b32 s3, s2, s12 +; GFX8-NEXT: s_xor_b32 s3, s2, -1 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 ; GFX8-NEXT: s_cmp_lt_u32 s3, s4 ; GFX8-NEXT: s_cselect_b32 s3, s3, s4 ; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: s_xor_b32 s3, s8, s12 +; GFX8-NEXT: s_xor_b32 s3, s8, -1 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_bfe_u32 s4, s11, 0x100000 ; GFX8-NEXT: s_cmp_lt_u32 s3, s4 @@ -3307,28 +3287,27 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX8-LABEL: v_uaddsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v12, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v12, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX8-NEXT: v_xor_b32_e32 v13, s4, v8 +; GFX8-NEXT: v_xor_b32_e32 v13, -1, v8 ; GFX8-NEXT: v_min_u16_e32 v12, v12, v4 ; GFX8-NEXT: v_min_u16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v13, s4, v1 +; GFX8-NEXT: v_xor_b32_e32 v13, -1, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_xor_b32_e32 v14, s4, v9 +; GFX8-NEXT: v_xor_b32_e32 v14, -1, v9 ; GFX8-NEXT: v_min_u16_e32 v13, v13, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX8-NEXT: v_min_u16_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v14, s4, v2 -; GFX8-NEXT: v_xor_b32_e32 v15, s4, v10 +; GFX8-NEXT: v_xor_b32_e32 v14, -1, v2 +; GFX8-NEXT: v_xor_b32_e32 v15, -1, v10 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v12 ; GFX8-NEXT: v_add_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_u16_e32 v14, v14, v6 ; GFX8-NEXT: v_min_u16_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v15, s4, v3 +; GFX8-NEXT: v_xor_b32_e32 v15, -1, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_xor_b32_e32 v16, s4, v11 +; GFX8-NEXT: v_xor_b32_e32 v16, -1, v11 ; GFX8-NEXT: v_add_u16_e32 v1, v1, v13 ; GFX8-NEXT: v_add_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_u16_e32 v15, v15, v7 @@ -3464,8 +3443,7 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; ; GFX8-LABEL: s_uaddsat_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s16, 0xffff -; GFX8-NEXT: s_xor_b32 s17, s0, s16 +; GFX8-NEXT: s_xor_b32 s16, s0, -1 ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 @@ -3474,48 +3452,48 @@ define amdgpu_ps 
<4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_lshr_b32 s15, s7, 16 -; GFX8-NEXT: s_bfe_u32 s17, s17, 0x100000 +; GFX8-NEXT: s_bfe_u32 s16, s16, 0x100000 ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s17, s4 -; GFX8-NEXT: s_cselect_b32 s4, s17, s4 +; GFX8-NEXT: s_cmp_lt_u32 s16, s4 +; GFX8-NEXT: s_cselect_b32 s4, s16, s4 ; GFX8-NEXT: s_add_i32 s0, s0, s4 -; GFX8-NEXT: s_xor_b32 s4, s8, s16 +; GFX8-NEXT: s_xor_b32 s4, s8, -1 ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_bfe_u32 s12, s12, 0x100000 ; GFX8-NEXT: s_cmp_lt_u32 s4, s12 ; GFX8-NEXT: s_cselect_b32 s4, s4, s12 ; GFX8-NEXT: s_add_i32 s8, s8, s4 -; GFX8-NEXT: s_xor_b32 s4, s1, s16 +; GFX8-NEXT: s_xor_b32 s4, s1, -1 ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX8-NEXT: s_cmp_lt_u32 s4, s5 ; GFX8-NEXT: s_cselect_b32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s1, s1, s4 -; GFX8-NEXT: s_xor_b32 s4, s9, s16 +; GFX8-NEXT: s_xor_b32 s4, s9, -1 ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_bfe_u32 s5, s13, 0x100000 ; GFX8-NEXT: s_cmp_lt_u32 s4, s5 ; GFX8-NEXT: s_cselect_b32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s9, s9, s4 -; GFX8-NEXT: s_xor_b32 s4, s2, s16 +; GFX8-NEXT: s_xor_b32 s4, s2, -1 ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_bfe_u32 s5, s6, 0x100000 ; GFX8-NEXT: s_cmp_lt_u32 s4, s5 ; GFX8-NEXT: s_cselect_b32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s2, s2, s4 -; GFX8-NEXT: s_xor_b32 s4, s10, s16 +; GFX8-NEXT: s_xor_b32 s4, s10, -1 ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_bfe_u32 s5, s14, 0x100000 ; GFX8-NEXT: s_cmp_lt_u32 s4, s5 ; GFX8-NEXT: s_cselect_b32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s10, s10, s4 -; GFX8-NEXT: s_xor_b32 s4, s3, s16 +; GFX8-NEXT: s_xor_b32 s4, s3, -1 ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_bfe_u32 s5, s7, 0x100000 ; GFX8-NEXT: s_cmp_lt_u32 s4, s5 ; GFX8-NEXT: s_cselect_b32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s3, s3, s4 -; GFX8-NEXT: s_xor_b32 s4, s11, s16 +; GFX8-NEXT: s_xor_b32 s4, s11, -1 ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_bfe_u32 s5, s15, 0x100000 ; GFX8-NEXT: s_cmp_lt_u32 s4, s5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll index 25eafb45f9309..54eebc9205796 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll @@ -311,7 +311,7 @@ define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: s_movk_i32 s4, 0x1000 ; CGP-NEXT: v_mov_b32_e32 v2, 0x1000 -; CGP-NEXT: s_mov_b32 s5, 0xfffff000 +; CGP-NEXT: s_movk_i32 s5, 0xf000 ; CGP-NEXT: v_mov_b32_e32 v3, 0xfffff000 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, s4 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index e956af93bc6f1..59b4318012e4c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -185,14 +185,13 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-LABEL: s_udiv_i64: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_or_b64 s[4:5], s[0:1], s[2:3] -; CHECK-NEXT: s_mov_b32 s6, 0 -; CHECK-NEXT: s_mov_b32 s7, -1 -; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; CHECK-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], 0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[6:7], 
0, 1 -; CHECK-NEXT: s_xor_b64 vcc, s[4:5], s[6:7] -; CHECK-NEXT: s_mov_b32 s4, 1 +; CHECK-NEXT: s_or_b64 s[6:7], s[0:1], s[2:3] +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_mov_b32 s5, -1 +; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[6:7], 0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 +; CHECK-NEXT: s_xor_b64 vcc, s[6:7], s[8:9] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_cbranch_vccz BB1_2 ; CHECK-NEXT: ; %bb.1: @@ -324,9 +323,9 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v4, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: BB1_2: ; %Flow -; CHECK-NEXT: s_and_b32 s1, s4, 1 +; CHECK-NEXT: s_and_b32 s1, s5, 1 ; CHECK-NEXT: s_cmp_lg_u32 s1, 0 ; CHECK-NEXT: s_cbranch_scc0 BB1_4 ; CHECK-NEXT: ; %bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll index 68a83a91c62f8..f331deea89e54 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll @@ -286,7 +286,7 @@ define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) { ; CGP-NEXT: s_movk_i32 s4, 0x1000 ; CGP-NEXT: v_mov_b32_e32 v2, 0x1000 ; CGP-NEXT: s_mov_b32 s5, 0x4f7ffffe -; CGP-NEXT: s_mov_b32 s6, 0xfffff000 +; CGP-NEXT: s_movk_i32 s6, 0xf000 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, s4 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index 6b9357043b3ca..a01ba29cc3276 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -182,14 +182,13 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-LABEL: s_urem_i64: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_or_b64 s[4:5], s[0:1], s[2:3] -; CHECK-NEXT: s_mov_b32 s6, 0 -; CHECK-NEXT: s_mov_b32 s7, -1 -; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; CHECK-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], 0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 -; CHECK-NEXT: s_xor_b64 vcc, s[4:5], s[6:7] -; CHECK-NEXT: s_mov_b32 s4, 1 +; CHECK-NEXT: s_or_b64 s[6:7], s[0:1], s[2:3] +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_mov_b32 s5, -1 +; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[6:7], 0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 +; CHECK-NEXT: s_xor_b64 vcc, s[6:7], s[8:9] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_cbranch_vccz BB1_2 ; CHECK-NEXT: ; %bb.1: @@ -320,9 +319,9 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: BB1_2: ; %Flow -; CHECK-NEXT: s_and_b32 s1, s4, 1 +; CHECK-NEXT: s_and_b32 s1, s5, 1 ; CHECK-NEXT: s_cmp_lg_u32 s1, 0 ; CHECK-NEXT: s_cbranch_scc0 BB1_4 ; CHECK-NEXT: ; %bb.3: From 61ced4b87a80b65d89b2b84418038efde704b9ed Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sun, 26 Jul 2020 09:26:48 -0400 Subject: [PATCH 0130/1035] GlobalISel: Handle 'n' inline asm constraint --- llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp | 1 + 
.../CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp index 502f3cb85726c..7acf9c843235e 100644 --- a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp @@ -658,6 +658,7 @@ bool InlineAsmLowering::lowerAsmOperandForConstraint( default: return false; case 'i': // Simple Integer or Relocatable Constant + case 'n': // immediate integer with a known value. if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) { assert(CI->getBitWidth() <= 64 && "expected immediate to fit into 64-bits"); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll index 07b8dced57132..1315f1bc275eb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll @@ -326,4 +326,13 @@ entry: ret i32 %asm1 } +define amdgpu_kernel void @asm_constraint_n_n() { + ; CHECK-LABEL: name: asm_constraint_n_n + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: INLINEASM &"s_trap ${0:n}", 1 /* sideeffect attdialect */, 13 /* imm */, 10 + ; CHECK: S_ENDPGM 0 + tail call void asm sideeffect "s_trap ${0:n}", "n"(i32 10) #1 + ret void +} + !0 = !{i32 70} From 0481e1ae3c17a8faa6c8e319495a51a76b77b76b Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Sun, 26 Jul 2020 09:33:13 -0400 Subject: [PATCH 0131/1035] [InstSimplify] fold integer min/max intrinsics with limit constant --- llvm/lib/Analysis/InstructionSimplify.cpp | 21 ++++++++++++++++ .../InstSimplify/maxmin_intrinsics.ll | 24 +++++++------------ 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 396fc22920cdf..c920fb3f52bea 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -5254,6 +5254,27 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1, Intrinsic::ID IID = F->getIntrinsicID(); Type *ReturnType = F->getReturnType(); switch (IID) { + case Intrinsic::smax: + case Intrinsic::smin: + case Intrinsic::umax: + case Intrinsic::umin: { + // Canonicalize constant operand as Op1. + if (isa<Constant>(Op0)) + std::swap(Op0, Op1); + + // TODO: Allow partial undef vector constants.
+ const APInt *C; + if (!match(Op1, m_APInt(C))) + break; + + if ((IID == Intrinsic::smax && C->isMaxSignedValue()) || + (IID == Intrinsic::smin && C->isMinSignedValue()) || + (IID == Intrinsic::umax && C->isMaxValue()) || + (IID == Intrinsic::umin && C->isMinValue())) + return Op1; + + break; + } case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: // X - X -> { 0, false } diff --git a/llvm/test/Transforms/InstSimplify/maxmin_intrinsics.ll b/llvm/test/Transforms/InstSimplify/maxmin_intrinsics.ll index fe940ef7bc179..99a8656b34aa6 100644 --- a/llvm/test/Transforms/InstSimplify/maxmin_intrinsics.ll +++ b/llvm/test/Transforms/InstSimplify/maxmin_intrinsics.ll @@ -12,8 +12,7 @@ declare <2 x i8> @llvm.umin.v2i8(<2 x i8>, <2 x i8>) define i8 @smax_maxval(i8 %x) { ; CHECK-LABEL: @smax_maxval( -; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.smax.i8(i8 [[X:%.*]], i8 127) -; CHECK-NEXT: ret i8 [[R]] +; CHECK-NEXT: ret i8 127 ; %r = call i8 @llvm.smax.i8(i8 %x, i8 127) ret i8 %r @@ -21,8 +20,7 @@ define i8 @smax_maxval(i8 %x) { define <2 x i8> @smax_maxval_commute(<2 x i8> %x) { ; CHECK-LABEL: @smax_maxval_commute( -; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.smax.v2i8(<2 x i8> <i8 127, i8 127>, <2 x i8> [[X:%.*]]) -; CHECK-NEXT: ret <2 x i8> [[R]] +; CHECK-NEXT: ret <2 x i8> <i8 127, i8 127> ; %r = call <2 x i8> @llvm.smax.v2i8(<2 x i8> <i8 127, i8 127>, <2 x i8> %x) ret <2 x i8> %r @@ -30,8 +28,7 @@ define <2 x i8> @smax_maxval_commute(<2 x i8> %x) { define i8 @smin_minval(i8 %x) { ; CHECK-LABEL: @smin_minval( -; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.smin.i8(i8 -128, i8 [[X:%.*]]) -; CHECK-NEXT: ret i8 [[R]] +; CHECK-NEXT: ret i8 -128 ; %r = call i8 @llvm.smin.i8(i8 -128, i8 %x) ret i8 %r @@ -39,8 +36,7 @@ define i8 @smin_minval(i8 %x) { define <2 x i8> @smin_minval_commute(<2 x i8> %x) { ; CHECK-LABEL: @smin_minval_commute( -; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.smin.v2i8(<2 x i8> [[X:%.*]], <2 x i8> <i8 -128, i8 -128>) -; CHECK-NEXT: ret <2 x i8> [[R]] +; CHECK-NEXT: ret <2 x i8> <i8 -128, i8 -128> ; %r = call <2 x i8> @llvm.smin.v2i8(<2 x i8> %x, <2 x i8> <i8 -128, i8 -128>) ret <2 x i8> %r @@ -48,8 +44,7 @@ define <2 x i8> @smin_minval_commute(<2 x i8> %x) { define i8 @umax_maxval(i8 %x) { ; CHECK-LABEL: @umax_maxval( -; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 -1) -; CHECK-NEXT: ret i8 [[R]] +; CHECK-NEXT: ret i8 -1 ; %r = call i8 @llvm.umax.i8(i8 %x, i8 255) ret i8 %r @@ -57,8 +52,7 @@ define i8 @umax_maxval(i8 %x) { define <2 x i8> @umax_maxval_commute(<2 x i8> %x) { ; CHECK-LABEL: @umax_maxval_commute( -; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.umax.v2i8(<2 x i8> <i8 -1, i8 -1>, <2 x i8> [[X:%.*]]) -; CHECK-NEXT: ret <2 x i8> [[R]] +; CHECK-NEXT: ret <2 x i8> <i8 -1, i8 -1> ; %r = call <2 x i8> @llvm.umax.v2i8(<2 x i8> <i8 255, i8 255>, <2 x i8> %x) ret <2 x i8> %r @@ -66,8 +60,7 @@ define <2 x i8> @umax_maxval_commute(<2 x i8> %x) { define i8 @umin_minval(i8 %x) { ; CHECK-LABEL: @umin_minval( -; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.umin.i8(i8 0, i8 [[X:%.*]]) -; CHECK-NEXT: ret i8 [[R]] +; CHECK-NEXT: ret i8 0 ; %r = call i8 @llvm.umin.i8(i8 0, i8 %x) ret i8 %r @@ -75,8 +68,7 @@ define i8 @umin_minval(i8 %x) { define <2 x i8> @umin_minval_commute(<2 x i8> %x) { ; CHECK-LABEL: @umin_minval_commute( -; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.umin.v2i8(<2 x i8> [[X:%.*]], <2 x i8> zeroinitializer) -; CHECK-NEXT: ret <2 x i8> [[R]] +; CHECK-NEXT: ret <2 x i8> zeroinitializer ; %r = call <2 x i8> @llvm.umin.v2i8(<2 x i8> %x, <2 x i8> zeroinitializer) ret <2 x i8> %r From 5819159995657091e4e21e538509b2af210fd48d Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 18 Jul 2020 15:30:59
-0400 Subject: [PATCH 0132/1035] AMDGPU/GlobalISel: Pack constant G_BUILD_VECTOR_TRUNCs when selecting --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 21 +- .../CodeGen/AMDGPU/GlobalISel/add.v2i16.ll | 44 +- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 3 +- .../inst-select-build-vector-trunc.v2s16.mir | 270 +++ .../AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll | 46 +- .../AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll | 46 +- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 3 +- .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 1700 ++++++++--------- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 3 +- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 1536 +++++++-------- .../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll | 426 ++--- llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll | 10 +- 12 files changed, 2059 insertions(+), 2049 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index a126ed1daf17f..8bc597664634a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -616,11 +616,6 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { return true; } -static bool isZero(Register Reg, const MachineRegisterInfo &MRI) { - int64_t Val; - return mi_match(Reg, MRI, m_ICst(Val)) && Val == 0; -} - bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( MachineInstr &MI) const { if (selectImpl(MI, *CoverageInfo)) @@ -645,6 +640,20 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock *BB = MI.getParent(); + auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true); + if (ConstSrc1) { + auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true); + if (ConstSrc0) { + uint32_t Lo16 = static_cast<uint32_t>(ConstSrc0->Value) & 0xffff; + uint32_t Hi16 = static_cast<uint32_t>(ConstSrc1->Value) & 0xffff; + + BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst) + .addImm(Lo16 | (Hi16 << 16)); + MI.eraseFromParent(); + return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI); + } + } + // TODO: This should probably be a combine somewhere // (build_vector_trunc $src0, undef -> copy $src0 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI); @@ -686,7 +695,7 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( } else if (Shift1) { Opc = AMDGPU::S_PACK_LH_B32_B16; MI.getOperand(2).setReg(ShiftSrc1); - } else if (Shift0 && isZero(Src1, *MRI)) { + } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) { // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst) .addReg(ShiftSrc0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll index c6c0eb7c4a937..2205bfe3c71d7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -91,9 +91,8 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) { ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0xffc0 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffc0ffc0 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat: @@ -113,8 +112,8 @@ define <2 x i16>
@v_add_v2i16_neg_inline_imm_lo(<2 x i16> %a) { ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_lo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 0xffffffc0, 4 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4ffc0 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_lo: @@ -133,8 +132,8 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) { ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 4, 0xffffffc0 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffc00004 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_hi: @@ -152,13 +151,10 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) { define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_splat: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s1, 0xffc0 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s1 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: s_add_i32 s0, s0, 0xffc0ffc0 +; GFX9-NEXT: s_add_i32 s1, s1, 0xffc0 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat: @@ -182,12 +178,10 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s1, 0xffffffc0, 4 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: s_add_i32 s0, s0, 0x4ffc0 +; GFX9-NEXT: s_add_i32 s1, s1, 4 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo: @@ -210,12 +204,10 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s1, 4, 0xffffffc0 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: s_add_i32 s0, s0, 0xffc00004 +; GFX9-NEXT: s_add_i32 s1, s1, 0xffc0 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index 71ee562f0ecc2..c1896f81ef296 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -522,8 +522,7 @@ define <2 x i16> @v_ashr_v2i16_15(<2 x i16> %value) { ; GFX9-LABEL: v_ashr_v2i16_15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 15, 15 -; GFX9-NEXT: v_pk_ashrrev_i16 v0, s4, v0 +; 
GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = ashr <2 x i16> %value, <i16 15, i16 15> ret <2 x i16> %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir index c380d3c77defc..056ea79a98988 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir @@ -430,3 +430,273 @@ body: | %5:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %3, %4 S_ENDPGM 0, implicit %5 ... + +--- +name: test_build_vector_trunc_s_v2s16_constant_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_constant_constant + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 29884539 + ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]] + %0:sgpr(s32) = G_CONSTANT i32 123 + %1:sgpr(s32) = G_CONSTANT i32 456 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_constant_impdef +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_constant_impdef + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]] + %0:sgpr(s32) = G_CONSTANT i32 123 + %1:sgpr(s32) = G_IMPLICIT_DEF + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_impdef_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_impdef_constant + ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[DEF]], [[S_MOV_B32_]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = G_IMPLICIT_DEF + %1:sgpr(s32) = G_CONSTANT i32 123 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_impdef_impdef +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_impdef_impdef + ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX9: S_ENDPGM 0, implicit [[DEF]] + %0:sgpr(s32) = G_IMPLICIT_DEF + %1:sgpr(s32) = G_IMPLICIT_DEF + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_zext_constant_zext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_zext_constant_zext_constant + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 29884539 + ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]] + %0:sgpr(s16) = G_CONSTANT i16 123 + %1:sgpr(s16) = G_CONSTANT i16 456 + %2:sgpr(s32) = G_ZEXT %0 + %3:sgpr(s32) = G_ZEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +...
+ +--- +name: test_build_vector_trunc_s_v2s16_zext_impdef_zext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_zext_impdef_zext_constant + ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[DEF]], 1048576, implicit-def $scc + ; GFX9: [[S_BFE_U32_1:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_MOV_B32_]], 1048576, implicit-def $scc + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_BFE_U32_]], [[S_BFE_U32_1]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s16) = G_IMPLICIT_DEF + %1:sgpr(s16) = G_CONSTANT i16 123 + %2:sgpr(s32) = G_ZEXT %0 + %3:sgpr(s32) = G_ZEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: test_build_vector_trunc_s_v2s16_sext_constant_sext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_sext_constant_sext_constant + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294836208 + ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]] + %0:sgpr(s16) = G_CONSTANT i16 -16 + %1:sgpr(s16) = G_CONSTANT i16 -3 + %2:sgpr(s32) = G_SEXT %0 + %3:sgpr(s32) = G_SEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: test_build_vector_trunc_s_v2s16_anyext_constant_anyext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_anyext_constant_anyext_constant + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 456 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[S_MOV_B32_1]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s16) = G_CONSTANT i16 123 + %1:sgpr(s16) = G_CONSTANT i16 456 + %2:sgpr(s32) = G_ANYEXT %0 + %3:sgpr(s32) = G_ANYEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: test_build_vector_trunc_s_v2s16_anyext_impdef_anyext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_anyext_impdef_anyext_constant + ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[DEF]], [[S_MOV_B32_]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s16) = G_IMPLICIT_DEF + %1:sgpr(s16) = G_CONSTANT i16 123 + %2:sgpr(s32) = G_ANYEXT %0 + %3:sgpr(s32) = G_ANYEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: test_build_vector_trunc_s_v2s16_var_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_var_constant + ; GFX9: liveins: $sgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 456 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[COPY]], [[S_MOV_B32_]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = G_CONSTANT i32 456 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... 
+ +--- +name: test_build_vector_trunc_s_v2s16_constant_var +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_constant_var + ; GFX9: liveins: $sgpr0 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 456 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[COPY]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = G_CONSTANT i32 456 + %1:sgpr(s32) = COPY $sgpr0 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_var_0 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_var_0 + ; GFX9: liveins: $sgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[COPY]], [[S_MOV_B32_]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = G_CONSTANT i32 0 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_0_var +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_0_var + ; GFX9: liveins: $sgpr0 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[COPY]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = G_CONSTANT i32 0 + %1:sgpr(s32) = COPY $sgpr0 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll index fdcf0f1515f91..172656f08aefb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll @@ -82,24 +82,21 @@ define i32 @v_sdot2_inline_literal_a(<2 x i16> %b, i32 %c) { ; GFX906-LABEL: v_sdot2_inline_literal_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 +; GFX906-NEXT: v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_inline_literal_a: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 +; GFX908-NEXT: v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4 +; GFX10-NEXT: v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 4, i16 4>, <2 x i16> %b, i32 %c, i1 false) ret i32 %r @@ -109,24 +106,21 @@ define i32 @v_sdot2_inline_literal_b(<2 x i16> %a, i32 %c) { ; GFX906-LABEL: v_sdot2_inline_literal_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, s4, v1 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_inline_literal_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, s4, v1 +; GFX908-NEXT: v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, s4, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false) ret i32 %r @@ -136,29 +130,21 @@ define i32 @v_sdot2_inline_literal_a_b(<2 x i16> %a, i32 %c) { ; GFX906-LABEL: v_sdot2_inline_literal_a_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s5 -; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 +; GFX906-NEXT: v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_inline_literal_a_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s5 -; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 +; GFX908-NEXT: v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_a_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX10-NEXT: v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_i32_i16 v0, s4, s5, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false) ret i32 %r @@ -168,29 +154,21 @@ define i32 @v_sdot2_inline_literal_a_b_c() { ; GFX906-LABEL: v_sdot2_inline_literal_a_b_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s5 -; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, 8 +; GFX906-NEXT: v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_inline_literal_a_b_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s5 -; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, 8 +; GFX908-NEXT: v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_a_b_c: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX10-NEXT: v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_i32_i16 v0, s4, s5, 8 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 8, i1 false) ret i32 %r diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll index d285ee132cc21..976536c728838 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll @@ -82,24 +82,21 @@ define i32 @v_udot2_inline_literal_a(<2 x i16> %b, i32 %c) { ; GFX906-LABEL: v_udot2_inline_literal_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 +; GFX906-NEXT: v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_inline_literal_a: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 +; GFX908-NEXT: v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_inline_literal_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4 +; GFX10-NEXT: v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 4, i16 4>, <2 x i16> %b, i32 %c, i1 false) ret i32 %r @@ -109,24 +106,21 @@ define i32 @v_udot2_inline_literal_b(<2 x i16> %a, i32 %c) { ; GFX906-LABEL: v_udot2_inline_literal_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, s4, v1 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, 4, v1 
op_sel_hi:[1,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_inline_literal_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX908-NEXT: v_dot2_u32_u16 v0, v0, s4, v1 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_inline_literal_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, s4, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false) ret i32 %r @@ -136,29 +130,21 @@ define i32 @v_udot2_inline_literal_a_b(<2 x i16> %a, i32 %c) { ; GFX906-LABEL: v_udot2_inline_literal_a_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s5 -; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 +; GFX906-NEXT: v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_inline_literal_a_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s5 -; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 +; GFX908-NEXT: v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_inline_literal_a_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX10-NEXT: v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_u32_u16 v0, s4, s5, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false) ret i32 %r @@ -168,29 +154,21 @@ define i32 @v_udot2_inline_literal_a_b_c() { ; GFX906-LABEL: v_udot2_inline_literal_a_b_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s5 -; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, 8 +; GFX906-NEXT: v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_inline_literal_a_b_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s5 -; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, 8 +; GFX908-NEXT: v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_inline_literal_a_b_c: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX10-NEXT: v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_u32_u16 v0, s4, s5, 8 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 8, i1 false) ret i32 %r diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 9d82396bbc364..ea2631cbcb294 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -533,8 +533,7 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) { ; GFX9-LABEL: v_lshr_v2i16_15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 15, 15 -; GFX9-NEXT: v_pk_lshrrev_b16 v0, s4, v0 +; GFX9-NEXT: v_pk_lshrrev_b16 v0, 15, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = lshr <2 x i16> %value, <i16 15, i16 15> ret <2 x i16> %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index b2e7f1ea326f6..ba672883fa562 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -4527,15 +4527,12 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX9-LABEL: v_saddsat_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s5, 0x8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, 0, 0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: v_pk_min_i16 v3, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v3, s5, v3 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_max_i16 v2, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v2, s4, v2 +; GFX9-NEXT: v_pk_max_i16 v2, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fff7fff +; GFX9-NEXT: v_pk_sub_i16 v2, v3, v2 +; GFX9-NEXT: v_pk_min_i16 v3, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x80008000 +; GFX9-NEXT: v_pk_sub_i16 v3, v4, v3 ; GFX9-NEXT: v_pk_max_i16 v1, v3, v1 ; GFX9-NEXT: v_pk_min_i16 v1, v1, v2 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 @@ -4545,16 +4542,11 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 0, 0 -; GFX10-NEXT: s_movk_i32 s5, 0x8000 -; GFX10-NEXT: v_pk_min_i16 v2, v0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX10-NEXT: v_pk_max_i16 v3, v0, s4 -; GFX10-NEXT: s_movk_i32 s6, 0x7fff +; GFX10-NEXT: v_pk_min_i16 v2, v0, 0 +; GFX10-NEXT: v_pk_max_i16 v3, v0, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v2, s5, v2 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s6 -; GFX10-NEXT: v_pk_sub_i16 v3, s4, v3 +; GFX10-NEXT: v_pk_sub_i16 v2, 0x80008000, v2 +; GFX10-NEXT: v_pk_sub_i16 v3, 0x7fff7fff, v3 ; GFX10-NEXT: v_pk_max_i16 v1, v2, v1 ; GFX10-NEXT: v_pk_min_i16 v1, v1, v3 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 @@ -4650,53 +4642,45 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; ; GFX9-LABEL: s_saddsat_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 0, 0 -; GFX9-NEXT: s_sext_i32_i16 s7, s4 -; GFX9-NEXT: s_sext_i32_i16 s5, s0 -; GFX9-NEXT: s_ashr_i32 s6, s0, 16 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_gt_i32 s5, s7 -; GFX9-NEXT: s_cselect_b32 s8, s5, s7 -; GFX9-NEXT: s_cmp_gt_i32 s6, s4 -; GFX9-NEXT: s_movk_i32 s2, 0x7fff -; GFX9-NEXT: s_cselect_b32 s9, s6, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX9-NEXT: s_lshr_b32 s9, s2, 16 -; GFX9-NEXT: s_lshr_b32 s10, s8, 16 -; GFX9-NEXT: s_sub_i32 s2, s2, s8 -; GFX9-NEXT: s_sub_i32 s8, s9, s10 -; GFX9-NEXT: s_cmp_lt_i32 s5, s7 -; GFX9-NEXT: s_cselect_b32 s5, s5, s7 -; GFX9-NEXT: s_cmp_lt_i32 s6, s4 -; GFX9-NEXT: 
s_movk_i32 s3, 0x8000 -; GFX9-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s3 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 -; GFX9-NEXT: s_sub_i32 s4, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-NEXT: s_sext_i32_i16 s4, s3 -; GFX9-NEXT: s_sext_i32_i16 s5, s1 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_ashr_i32 s1, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_cmp_gt_i32 s3, s1 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s1 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s8 -; GFX9-NEXT: s_sext_i32_i16 s3, s1 -; GFX9-NEXT: s_sext_i32_i16 s4, s2 -; GFX9-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NEXT: s_sext_i32_i16 s2, s0 +; GFX9-NEXT: s_ashr_i32 s3, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s4, 0 +; GFX9-NEXT: s_cmp_gt_i32 s2, s4 +; GFX9-NEXT: s_cselect_b32 s5, s2, s4 +; GFX9-NEXT: s_cmp_gt_i32 s3, 0 +; GFX9-NEXT: s_cselect_b32 s6, s3, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_lshr_b32 s6, s5, 16 +; GFX9-NEXT: s_sub_i32 s5, 0x7fff7fff, s5 +; GFX9-NEXT: s_sub_i32 s6, 0x7fff, s6 +; GFX9-NEXT: s_cmp_lt_i32 s2, s4 +; GFX9-NEXT: s_cselect_b32 s2, s2, s4 +; GFX9-NEXT: s_cmp_lt_i32 s3, 0 +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: s_sub_i32 s2, 0x80008000, s2 +; GFX9-NEXT: s_sub_i32 s3, 0x8000, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX9-NEXT: s_sext_i32_i16 s3, s2 +; GFX9-NEXT: s_sext_i32_i16 s4, s1 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_cmp_lt_i32 s3, s4 +; GFX9-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NEXT: s_cmp_gt_i32 s3, s4 ; GFX9-NEXT: s_cselect_b32 s3, s3, s4 -; GFX9-NEXT: s_cmp_lt_i32 s1, s2 -; GFX9-NEXT: s_cselect_b32 s1, s1, s2 +; GFX9-NEXT: s_cmp_gt_i32 s2, s1 +; GFX9-NEXT: s_cselect_b32 s1, s2, s1 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s3, s1 +; GFX9-NEXT: s_sext_i32_i16 s2, s1 +; GFX9-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NEXT: s_sext_i32_i16 s3, s5 +; GFX9-NEXT: s_ashr_i32 s4, s5, 16 +; GFX9-NEXT: s_cmp_lt_i32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s2, s3 +; GFX9-NEXT: s_cmp_lt_i32 s1, s4 +; GFX9-NEXT: s_cselect_b32 s1, s1, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s1 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 ; GFX9-NEXT: s_add_i32 s0, s0, s1 @@ -4706,55 +4690,47 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; ; GFX10-LABEL: s_saddsat_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s2, 0, 0 -; GFX10-NEXT: s_sext_i32_i16 s3, s0 -; GFX10-NEXT: s_sext_i32_i16 s5, s2 +; GFX10-NEXT: s_sext_i32_i16 s2, s0 +; GFX10-NEXT: s_sext_i32_i16 s3, 0 ; GFX10-NEXT: s_ashr_i32 s4, s0, 16 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_cmp_gt_i32 s3, s5 -; GFX10-NEXT: s_movk_i32 s7, 0x7fff -; GFX10-NEXT: s_cselect_b32 s6, s3, s5 -; GFX10-NEXT: s_cmp_gt_i32 s4, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s7 -; GFX10-NEXT: s_cselect_b32 s8, s4, s2 +; GFX10-NEXT: s_cmp_gt_i32 s2, s3 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s8 -; GFX10-NEXT: s_lshr_b32 s8, s7, 16 -; GFX10-NEXT: s_lshr_b32 s9, s6, 16 -; GFX10-NEXT: s_sub_i32 s6, s7, s6 -; GFX10-NEXT: s_sub_i32 s7, s8, s9 -; GFX10-NEXT: s_cmp_lt_i32 s3, s5 -; GFX10-NEXT: s_cselect_b32 s3, s3, s5 -; GFX10-NEXT: s_cmp_lt_i32 s4, s2 -; GFX10-NEXT: s_movk_i32 s5, 0x8000 
-; GFX10-NEXT: s_cselect_b32 s2, s4, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s5, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s2 -; GFX10-NEXT: s_lshr_b32 s3, s4, 16 -; GFX10-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10-NEXT: s_sub_i32 s2, s4, s2 -; GFX10-NEXT: s_sub_i32 s3, s3, s5 +; GFX10-NEXT: s_cselect_b32 s5, s2, s3 +; GFX10-NEXT: s_cmp_gt_i32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s6, s4, 0 +; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX10-NEXT: s_lshr_b32 s6, s5, 16 +; GFX10-NEXT: s_sub_i32 s5, 0x7fff7fff, s5 +; GFX10-NEXT: s_sub_i32 s6, 0x7fff, s6 +; GFX10-NEXT: s_cmp_lt_i32 s2, s3 +; GFX10-NEXT: s_cselect_b32 s2, s2, s3 +; GFX10-NEXT: s_cmp_lt_i32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s3, s4, 0 ; GFX10-NEXT: s_sext_i32_i16 s4, s1 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s3 ; GFX10-NEXT: s_ashr_i32 s1, s1, 16 +; GFX10-NEXT: s_lshr_b32 s3, s2, 16 +; GFX10-NEXT: s_sub_i32 s2, 0x80008000, s2 +; GFX10-NEXT: s_sub_i32 s3, 0x8000, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s3 ; GFX10-NEXT: s_sext_i32_i16 s3, s2 ; GFX10-NEXT: s_ashr_i32 s2, s2, 16 ; GFX10-NEXT: s_cmp_gt_i32 s3, s4 ; GFX10-NEXT: s_cselect_b32 s3, s3, s4 ; GFX10-NEXT: s_cmp_gt_i32 s2, s1 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s5, s6 ; GFX10-NEXT: s_cselect_b32 s1, s2, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s6, s7 +; GFX10-NEXT: s_sext_i32_i16 s2, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s1 -; GFX10-NEXT: s_sext_i32_i16 s4, s2 -; GFX10-NEXT: s_sext_i32_i16 s3, s1 +; GFX10-NEXT: s_ashr_i32 s3, s4, 16 +; GFX10-NEXT: s_sext_i32_i16 s4, s1 ; GFX10-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_cmp_lt_i32 s3, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s4 -; GFX10-NEXT: s_cmp_lt_i32 s1, s2 -; GFX10-NEXT: s_cselect_b32 s1, s1, s2 +; GFX10-NEXT: s_cmp_lt_i32 s4, s2 +; GFX10-NEXT: s_cselect_b32 s2, s4, s2 +; GFX10-NEXT: s_cmp_lt_i32 s1, s3 +; GFX10-NEXT: s_cselect_b32 s1, s1, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s2, s1 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s1 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16 ; GFX10-NEXT: s_add_i32 s0, s0, s1 ; GFX10-NEXT: s_add_i32 s2, s2, s3 @@ -4834,73 +4810,57 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; ; GFX9-LABEL: saddsat_v2i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s3, 0, 0 -; GFX9-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NEXT: s_sext_i32_i16 s4, s0 -; GFX9-NEXT: s_ashr_i32 s5, s0, 16 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s7, s4, s6 -; GFX9-NEXT: s_cmp_gt_i32 s5, s3 -; GFX9-NEXT: s_movk_i32 s1, 0x7fff -; GFX9-NEXT: s_cselect_b32 s8, s5, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s1 -; GFX9-NEXT: s_lshr_b32 s8, s1, 16 -; GFX9-NEXT: s_lshr_b32 s9, s7, 16 -; GFX9-NEXT: s_sub_i32 s1, s1, s7 -; GFX9-NEXT: s_sub_i32 s7, s8, s9 -; GFX9-NEXT: s_cmp_lt_i32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s4, s4, s6 -; GFX9-NEXT: s_cmp_lt_i32 s5, s3 -; GFX9-NEXT: s_movk_i32 s2, 0x8000 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s3, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: v_pk_max_i16 v0, s2, v0 -; GFX9-NEXT: v_pk_min_i16 v0, v0, s1 +; GFX9-NEXT: s_sext_i32_i16 s1, s0 +; GFX9-NEXT: s_ashr_i32 s2, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s3, 0 +; 
GFX9-NEXT: s_cmp_gt_i32 s1, s3 +; GFX9-NEXT: s_cselect_b32 s4, s1, s3 +; GFX9-NEXT: s_cmp_gt_i32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s5, s2, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 +; GFX9-NEXT: s_sub_i32 s4, 0x7fff7fff, s4 +; GFX9-NEXT: s_sub_i32 s5, 0x7fff, s5 +; GFX9-NEXT: s_cmp_lt_i32 s1, s3 +; GFX9-NEXT: s_cselect_b32 s1, s1, s3 +; GFX9-NEXT: s_cmp_lt_i32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-NEXT: s_sub_i32 s1, 0x80008000, s1 +; GFX9-NEXT: s_sub_i32 s2, 0x8000, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: v_pk_max_i16 v0, s1, v0 +; GFX9-NEXT: v_pk_min_i16 v0, v0, s4 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: saddsat_v2i16_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s1, 0, 0 -; GFX10-NEXT: s_sext_i32_i16 s2, s0 -; GFX10-NEXT: s_sext_i32_i16 s4, s1 +; GFX10-NEXT: s_sext_i32_i16 s1, s0 +; GFX10-NEXT: s_sext_i32_i16 s2, 0 ; GFX10-NEXT: s_ashr_i32 s3, s0, 16 -; GFX10-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s2, s4 -; GFX10-NEXT: s_movk_i32 s6, 0x7fff -; GFX10-NEXT: s_cselect_b32 s5, s2, s4 -; GFX10-NEXT: s_cmp_gt_i32 s3, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 -; GFX10-NEXT: s_cselect_b32 s7, s3, s1 +; GFX10-NEXT: s_cmp_gt_i32 s1, s2 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s7 -; GFX10-NEXT: s_lshr_b32 s7, s6, 16 -; GFX10-NEXT: s_lshr_b32 s8, s5, 16 -; GFX10-NEXT: s_sub_i32 s5, s6, s5 -; GFX10-NEXT: s_sub_i32 s6, s7, s8 -; GFX10-NEXT: s_cmp_lt_i32 s2, s4 -; GFX10-NEXT: s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_cmp_lt_i32 s3, s1 -; GFX10-NEXT: s_movk_i32 s4, 0x8000 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s4, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s2, s1 -; GFX10-NEXT: s_lshr_b32 s2, s3, 16 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_sub_i32 s1, s3, s1 -; GFX10-NEXT: s_sub_i32 s2, s2, s4 +; GFX10-NEXT: s_cselect_b32 s4, s1, s2 +; GFX10-NEXT: s_cmp_gt_i32 s3, 0 +; GFX10-NEXT: s_cselect_b32 s5, s3, 0 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX10-NEXT: s_lshr_b32 s5, s4, 16 +; GFX10-NEXT: s_sub_i32 s4, 0x7fff7fff, s4 +; GFX10-NEXT: s_sub_i32 s5, 0x7fff, s5 +; GFX10-NEXT: s_cmp_lt_i32 s1, s2 +; GFX10-NEXT: s_cselect_b32 s1, s1, s2 +; GFX10-NEXT: s_cmp_lt_i32 s3, 0 +; GFX10-NEXT: s_cselect_b32 s2, s3, 0 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: s_lshr_b32 s2, s1, 16 +; GFX10-NEXT: s_sub_i32 s1, 0x80008000, s1 +; GFX10-NEXT: s_sub_i32 s2, 0x8000, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX10-NEXT: v_pk_max_i16 v0, s1, v0 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s5, s6 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s4, s5 ; GFX10-NEXT: v_pk_min_i16 v0, v0, s1 ; GFX10-NEXT: v_pk_add_u16 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -4966,15 +4926,12 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX9-LABEL: saddsat_v2i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s2, 0x8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, 0, 0 -; GFX9-NEXT: s_movk_i32 s1, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX9-NEXT: v_pk_min_i16 v2, v0, s3 -; GFX9-NEXT: v_pk_sub_i16 v2, s2, v2 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s1 -; GFX9-NEXT: v_pk_max_i16 v1, v0, s3 -; GFX9-NEXT: v_pk_sub_i16 v1, s1, v1 +; GFX9-NEXT: v_pk_max_i16 v1, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fff7fff +; 
GFX9-NEXT: v_pk_sub_i16 v1, v2, v1 +; GFX9-NEXT: v_pk_min_i16 v2, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x80008000 +; GFX9-NEXT: v_pk_sub_i16 v2, v3, v2 ; GFX9-NEXT: v_pk_max_i16 v2, v2, s0 ; GFX9-NEXT: v_pk_min_i16 v1, v2, v1 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 @@ -4982,16 +4939,11 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX10-LABEL: saddsat_v2i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s1, 0, 0 -; GFX10-NEXT: s_movk_i32 s2, 0x8000 -; GFX10-NEXT: v_pk_min_i16 v1, v0, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX10-NEXT: v_pk_max_i16 v2, v0, s1 -; GFX10-NEXT: s_movk_i32 s3, 0x7fff +; GFX10-NEXT: v_pk_min_i16 v1, v0, 0 +; GFX10-NEXT: v_pk_max_i16 v2, v0, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v1, s2, v1 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s3 -; GFX10-NEXT: v_pk_sub_i16 v2, s1, v2 +; GFX10-NEXT: v_pk_sub_i16 v1, 0x80008000, v1 +; GFX10-NEXT: v_pk_sub_i16 v2, 0x7fff7fff, v2 ; GFX10-NEXT: v_pk_max_i16 v1, v1, s0 ; GFX10-NEXT: v_pk_min_i16 v1, v1, v2 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 @@ -5113,22 +5065,19 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX9-LABEL: v_saddsat_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s5, 0x8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, 0, 0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: v_pk_min_i16 v5, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v5, s5, v5 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_max_i16 v4, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, s4, v4 -; GFX9-NEXT: v_pk_max_i16 v2, v5, v2 +; GFX9-NEXT: v_pk_min_i16 v6, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0x80008000 +; GFX9-NEXT: v_pk_sub_i16 v6, v7, v6 +; GFX9-NEXT: v_pk_max_i16 v4, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fff7fff +; GFX9-NEXT: v_pk_sub_i16 v4, v5, v4 +; GFX9-NEXT: v_pk_max_i16 v2, v6, v2 ; GFX9-NEXT: v_pk_min_i16 v2, v2, v4 -; GFX9-NEXT: v_pk_min_i16 v4, v1, s6 +; GFX9-NEXT: v_pk_min_i16 v4, v1, 0 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 -; GFX9-NEXT: v_pk_max_i16 v2, v1, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, s5, v4 -; GFX9-NEXT: v_pk_sub_i16 v2, s4, v2 +; GFX9-NEXT: v_pk_max_i16 v2, v1, 0 +; GFX9-NEXT: v_pk_sub_i16 v4, v7, v4 +; GFX9-NEXT: v_pk_sub_i16 v2, v5, v2 ; GFX9-NEXT: v_pk_max_i16 v3, v4, v3 ; GFX9-NEXT: v_pk_min_i16 v2, v3, v2 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 @@ -5138,24 +5087,19 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 0, 0 -; GFX10-NEXT: s_movk_i32 s6, 0x8000 -; GFX10-NEXT: v_pk_min_i16 v4, v0, s5 -; GFX10-NEXT: v_pk_min_i16 v5, v1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 -; GFX10-NEXT: v_pk_max_i16 v6, v0, s5 -; GFX10-NEXT: v_pk_max_i16 v7, v1, s5 -; GFX10-NEXT: v_pk_sub_i16 v4, s6, v4 -; GFX10-NEXT: v_pk_sub_i16 v5, s6, v5 -; GFX10-NEXT: s_movk_i32 s4, 0x7fff +; GFX10-NEXT: v_pk_min_i16 v4, v0, 0 +; GFX10-NEXT: v_pk_min_i16 v5, v1, 0 +; GFX10-NEXT: v_pk_max_i16 v6, v0, 0 +; GFX10-NEXT: v_pk_max_i16 v7, v1, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX10-NEXT: v_pk_sub_i16 v4, 0x80008000, v4 +; GFX10-NEXT: v_pk_sub_i16 v5, 0x80008000, v5 +; GFX10-NEXT: v_pk_sub_i16 v6, 0x7fff7fff, v6 +; GFX10-NEXT: v_pk_sub_i16 v7, 0x7fff7fff, v7 ; GFX10-NEXT: v_pk_max_i16 v11, v4, v2 -; GFX10-NEXT: v_pk_sub_i16 v6, s4, v6 -; 
GFX10-NEXT: v_pk_sub_i16 v4, s4, v7 -; GFX10-NEXT: v_pk_max_i16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_i16 v10, v5, v3 ; GFX10-NEXT: v_pk_min_i16 v2, v11, v6 -; GFX10-NEXT: v_pk_min_i16 v3, v3, v4 +; GFX10-NEXT: v_pk_min_i16 v3, v10, v7 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v2 ; GFX10-NEXT: v_pk_add_u16 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5321,76 +5265,72 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; ; GFX9-LABEL: s_saddsat_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s6, 0, 0 -; GFX9-NEXT: s_sext_i32_i16 s9, s6 -; GFX9-NEXT: s_sext_i32_i16 s7, s0 -; GFX9-NEXT: s_ashr_i32 s8, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s6, s0 +; GFX9-NEXT: s_ashr_i32 s7, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s8, 0 +; GFX9-NEXT: s_cmp_gt_i32 s6, s8 +; GFX9-NEXT: s_cselect_b32 s9, s6, s8 +; GFX9-NEXT: s_cmp_gt_i32 s7, 0 +; GFX9-NEXT: s_cselect_b32 s10, s7, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff +; GFX9-NEXT: s_lshr_b32 s11, s9, 16 +; GFX9-NEXT: s_movk_i32 s10, 0x7fff +; GFX9-NEXT: s_sub_i32 s9, s4, s9 +; GFX9-NEXT: s_sub_i32 s11, s10, s11 +; GFX9-NEXT: s_cmp_lt_i32 s6, s8 +; GFX9-NEXT: s_cselect_b32 s6, s6, s8 +; GFX9-NEXT: s_cmp_lt_i32 s7, 0 +; GFX9-NEXT: s_cselect_b32 s7, s7, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s11 +; GFX9-NEXT: s_mov_b32 s5, 0x80008000 +; GFX9-NEXT: s_lshr_b32 s11, s6, 16 +; GFX9-NEXT: s_mov_b32 s7, 0x8000 +; GFX9-NEXT: s_sub_i32 s6, s5, s6 +; GFX9-NEXT: s_sub_i32 s11, s7, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s11 +; GFX9-NEXT: s_sext_i32_i16 s11, s6 +; GFX9-NEXT: s_sext_i32_i16 s12, s2 ; GFX9-NEXT: s_ashr_i32 s6, s6, 16 -; GFX9-NEXT: s_cmp_gt_i32 s7, s9 -; GFX9-NEXT: s_cselect_b32 s10, s7, s9 -; GFX9-NEXT: s_cmp_gt_i32 s8, s6 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_cselect_b32 s11, s8, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: s_lshr_b32 s12, s10, 16 -; GFX9-NEXT: s_lshr_b32 s11, s4, 16 -; GFX9-NEXT: s_sub_i32 s10, s4, s10 -; GFX9-NEXT: s_sub_i32 s12, s11, s12 -; GFX9-NEXT: s_cmp_lt_i32 s7, s9 -; GFX9-NEXT: s_cselect_b32 s7, s7, s9 -; GFX9-NEXT: s_cmp_lt_i32 s8, s6 -; GFX9-NEXT: s_movk_i32 s5, 0x8000 -; GFX9-NEXT: s_cselect_b32 s8, s8, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s12 -; GFX9-NEXT: s_lshr_b32 s12, s7, 16 -; GFX9-NEXT: s_lshr_b32 s8, s5, 16 -; GFX9-NEXT: s_sub_i32 s7, s5, s7 -; GFX9-NEXT: s_sub_i32 s12, s8, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 -; GFX9-NEXT: s_sext_i32_i16 s12, s7 -; GFX9-NEXT: s_sext_i32_i16 s13, s2 -; GFX9-NEXT: s_ashr_i32 s7, s7, 16 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_cmp_gt_i32 s12, s13 -; GFX9-NEXT: s_cselect_b32 s12, s12, s13 -; GFX9-NEXT: s_cmp_gt_i32 s7, s2 -; GFX9-NEXT: s_cselect_b32 s2, s7, s2 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s12, s2 -; GFX9-NEXT: s_sext_i32_i16 s7, s2 -; GFX9-NEXT: s_sext_i32_i16 s12, s10 +; GFX9-NEXT: s_cmp_gt_i32 s11, s12 +; GFX9-NEXT: s_cselect_b32 s11, s11, s12 +; GFX9-NEXT: s_cmp_gt_i32 s6, s2 +; GFX9-NEXT: s_cselect_b32 s2, s6, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 +; GFX9-NEXT: s_sext_i32_i16 s6, s2 +; GFX9-NEXT: s_sext_i32_i16 s11, s9 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_ashr_i32 s10, s10, 16 -; GFX9-NEXT: s_cmp_lt_i32 s7, s12 -; GFX9-NEXT: s_cselect_b32 s7, s7, s12 -; GFX9-NEXT: s_cmp_lt_i32 s2, s10 -; GFX9-NEXT: s_cselect_b32 s2, s2, s10 -; GFX9-NEXT: 
s_pack_ll_b32_b16 s2, s7, s2 -; GFX9-NEXT: s_lshr_b32 s7, s0, 16 -; GFX9-NEXT: s_lshr_b32 s10, s2, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s2 -; GFX9-NEXT: s_add_i32 s7, s7, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX9-NEXT: s_sext_i32_i16 s2, s1 -; GFX9-NEXT: s_ashr_i32 s7, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s2, s9 -; GFX9-NEXT: s_cselect_b32 s10, s2, s9 -; GFX9-NEXT: s_cmp_gt_i32 s7, s6 -; GFX9-NEXT: s_cselect_b32 s12, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s12 -; GFX9-NEXT: s_lshr_b32 s12, s10, 16 -; GFX9-NEXT: s_sub_i32 s4, s4, s10 -; GFX9-NEXT: s_sub_i32 s10, s11, s12 +; GFX9-NEXT: s_ashr_i32 s9, s9, 16 +; GFX9-NEXT: s_cmp_lt_i32 s6, s11 +; GFX9-NEXT: s_cselect_b32 s6, s6, s11 ; GFX9-NEXT: s_cmp_lt_i32 s2, s9 ; GFX9-NEXT: s_cselect_b32 s2, s2, s9 -; GFX9-NEXT: s_cmp_lt_i32 s7, s6 -; GFX9-NEXT: s_cselect_b32 s6, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s6, s2 +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: s_lshr_b32 s9, s2, 16 +; GFX9-NEXT: s_add_i32 s0, s0, s2 +; GFX9-NEXT: s_add_i32 s6, s6, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX9-NEXT: s_sext_i32_i16 s2, s1 +; GFX9-NEXT: s_ashr_i32 s6, s1, 16 +; GFX9-NEXT: s_cmp_gt_i32 s2, s8 +; GFX9-NEXT: s_cselect_b32 s9, s2, s8 +; GFX9-NEXT: s_cmp_gt_i32 s6, 0 +; GFX9-NEXT: s_cselect_b32 s11, s6, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s11 +; GFX9-NEXT: s_lshr_b32 s11, s9, 16 +; GFX9-NEXT: s_sub_i32 s4, s4, s9 +; GFX9-NEXT: s_sub_i32 s9, s10, s11 +; GFX9-NEXT: s_cmp_lt_i32 s2, s8 +; GFX9-NEXT: s_cselect_b32 s2, s2, s8 +; GFX9-NEXT: s_cmp_lt_i32 s6, 0 +; GFX9-NEXT: s_cselect_b32 s6, s6, 0 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6 ; GFX9-NEXT: s_lshr_b32 s6, s2, 16 ; GFX9-NEXT: s_sub_i32 s2, s5, s2 -; GFX9-NEXT: s_sub_i32 s5, s8, s6 +; GFX9-NEXT: s_sub_i32 s5, s7, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5 ; GFX9-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NEXT: s_sext_i32_i16 s6, s3 @@ -5401,7 +5341,7 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX9-NEXT: s_cmp_gt_i32 s2, s3 ; GFX9-NEXT: s_cselect_b32 s2, s2, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s2 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s9 ; GFX9-NEXT: s_sext_i32_i16 s3, s2 ; GFX9-NEXT: s_sext_i32_i16 s5, s4 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16 @@ -5420,94 +5360,90 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; ; GFX10-LABEL: s_saddsat_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 0, 0 -; GFX10-NEXT: s_sext_i32_i16 s5, s0 -; GFX10-NEXT: s_sext_i32_i16 s7, s4 +; GFX10-NEXT: s_sext_i32_i16 s4, s0 +; GFX10-NEXT: s_sext_i32_i16 s5, 0 ; GFX10-NEXT: s_ashr_i32 s6, s0, 16 -; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_cmp_gt_i32 s5, s7 -; GFX10-NEXT: s_movk_i32 s9, 0x7fff -; GFX10-NEXT: s_cselect_b32 s8, s5, s7 -; GFX10-NEXT: s_cmp_gt_i32 s6, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s9 -; GFX10-NEXT: s_cselect_b32 s10, s6, s4 -; GFX10-NEXT: s_movk_i32 s12, 0x8000 -; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s10 -; GFX10-NEXT: s_lshr_b32 s10, s9, 16 -; GFX10-NEXT: s_lshr_b32 s11, s8, 16 -; GFX10-NEXT: s_sub_i32 s8, s9, s8 -; GFX10-NEXT: s_sub_i32 s11, s10, s11 -; GFX10-NEXT: s_cmp_lt_i32 s5, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s12 -; GFX10-NEXT: s_cselect_b32 s5, s5, s7 -; GFX10-NEXT: s_cmp_lt_i32 s6, s4 -; GFX10-NEXT: s_sext_i32_i16 s14, s2 -; GFX10-NEXT: s_cselect_b32 s6, s6, s4 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-NEXT: s_lshr_b32 s6, s12, 16 -; GFX10-NEXT: 
s_lshr_b32 s13, s5, 16 -; GFX10-NEXT: s_sub_i32 s5, s12, s5 -; GFX10-NEXT: s_sub_i32 s13, s6, s13 +; GFX10-NEXT: s_cmp_gt_i32 s4, s5 +; GFX10-NEXT: s_mov_b32 s9, 0x7fff7fff +; GFX10-NEXT: s_cselect_b32 s7, s4, s5 +; GFX10-NEXT: s_cmp_gt_i32 s6, 0 +; GFX10-NEXT: s_mov_b32 s11, 0x80008000 +; GFX10-NEXT: s_cselect_b32 s8, s6, 0 +; GFX10-NEXT: s_sext_i32_i16 s13, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX10-NEXT: s_movk_i32 s8, 0x7fff +; GFX10-NEXT: s_lshr_b32 s10, s7, 16 +; GFX10-NEXT: s_sub_i32 s7, s9, s7 +; GFX10-NEXT: s_sub_i32 s10, s8, s10 +; GFX10-NEXT: s_cmp_lt_i32 s4, s5 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s13 -; GFX10-NEXT: s_sext_i32_i16 s13, s5 -; GFX10-NEXT: s_ashr_i32 s5, s5, 16 -; GFX10-NEXT: s_cmp_gt_i32 s13, s14 -; GFX10-NEXT: s_cselect_b32 s13, s13, s14 -; GFX10-NEXT: s_cmp_gt_i32 s5, s2 -; GFX10-NEXT: s_cselect_b32 s2, s5, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s8, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s13, s2 -; GFX10-NEXT: s_sext_i32_i16 s11, s5 -; GFX10-NEXT: s_sext_i32_i16 s8, s2 +; GFX10-NEXT: s_cselect_b32 s4, s4, s5 +; GFX10-NEXT: s_cmp_lt_i32 s6, 0 +; GFX10-NEXT: s_cselect_b32 s6, s6, 0 ; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_ashr_i32 s5, s5, 16 -; GFX10-NEXT: s_cmp_lt_i32 s8, s11 -; GFX10-NEXT: s_cselect_b32 s8, s8, s11 -; GFX10-NEXT: s_cmp_lt_i32 s2, s5 -; GFX10-NEXT: s_cselect_b32 s2, s2, s5 -; GFX10-NEXT: s_lshr_b32 s5, s0, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s8, s2 -; GFX10-NEXT: s_lshr_b32 s8, s2, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX10-NEXT: s_mov_b32 s6, 0x8000 +; GFX10-NEXT: s_lshr_b32 s12, s4, 16 +; GFX10-NEXT: s_sub_i32 s4, s11, s4 +; GFX10-NEXT: s_sub_i32 s12, s6, s12 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s12 +; GFX10-NEXT: s_sext_i32_i16 s12, s4 +; GFX10-NEXT: s_ashr_i32 s4, s4, 16 +; GFX10-NEXT: s_cmp_gt_i32 s12, s13 +; GFX10-NEXT: s_cselect_b32 s12, s12, s13 +; GFX10-NEXT: s_cmp_gt_i32 s4, s2 +; GFX10-NEXT: s_cselect_b32 s2, s4, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s7, s10 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s12, s2 +; GFX10-NEXT: s_sext_i32_i16 s10, s4 +; GFX10-NEXT: s_sext_i32_i16 s7, s2 +; GFX10-NEXT: s_ashr_i32 s2, s2, 16 +; GFX10-NEXT: s_ashr_i32 s4, s4, 16 +; GFX10-NEXT: s_cmp_lt_i32 s7, s10 +; GFX10-NEXT: s_cselect_b32 s7, s7, s10 +; GFX10-NEXT: s_cmp_lt_i32 s2, s4 +; GFX10-NEXT: s_cselect_b32 s2, s2, s4 +; GFX10-NEXT: s_sext_i32_i16 s4, s1 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s7, s2 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_lshr_b32 s10, s2, 16 ; GFX10-NEXT: s_add_i32 s0, s0, s2 -; GFX10-NEXT: s_sext_i32_i16 s2, s1 -; GFX10-NEXT: s_add_i32 s5, s5, s8 -; GFX10-NEXT: s_ashr_i32 s8, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s2, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s5 -; GFX10-NEXT: s_cselect_b32 s11, s2, s7 -; GFX10-NEXT: s_cmp_gt_i32 s8, s4 -; GFX10-NEXT: s_cselect_b32 s13, s8, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s13 -; GFX10-NEXT: s_lshr_b32 s13, s11, 16 -; GFX10-NEXT: s_sub_i32 s9, s9, s11 -; GFX10-NEXT: s_sub_i32 s10, s10, s13 -; GFX10-NEXT: s_cmp_lt_i32 s2, s7 -; GFX10-NEXT: s_cselect_b32 s2, s2, s7 -; GFX10-NEXT: s_cmp_lt_i32 s8, s4 -; GFX10-NEXT: s_cselect_b32 s4, s8, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_add_i32 s7, s7, s10 +; GFX10-NEXT: s_ashr_i32 s2, s1, 16 +; GFX10-NEXT: s_cmp_gt_i32 s4, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX10-NEXT: s_cselect_b32 s10, s4, s5 +; GFX10-NEXT: s_cmp_gt_i32 s2, 0 +; GFX10-NEXT: s_cselect_b32 s12, s2, 0 +; GFX10-NEXT: s_pack_ll_b32_b16 s10, s10, s12 
+; GFX10-NEXT: s_lshr_b32 s12, s10, 16 +; GFX10-NEXT: s_sub_i32 s9, s9, s10 +; GFX10-NEXT: s_sub_i32 s8, s8, s12 +; GFX10-NEXT: s_cmp_lt_i32 s4, s5 +; GFX10-NEXT: s_cselect_b32 s4, s4, s5 +; GFX10-NEXT: s_cmp_lt_i32 s2, 0 +; GFX10-NEXT: s_sext_i32_i16 s5, s3 +; GFX10-NEXT: s_cselect_b32 s2, s2, 0 +; GFX10-NEXT: s_ashr_i32 s3, s3, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s4, s2 ; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_sub_i32 s2, s12, s2 +; GFX10-NEXT: s_sub_i32 s2, s11, s2 ; GFX10-NEXT: s_sub_i32 s4, s6, s4 -; GFX10-NEXT: s_sext_i32_i16 s6, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX10-NEXT: s_ashr_i32 s3, s3, 16 ; GFX10-NEXT: s_sext_i32_i16 s4, s2 ; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_cmp_gt_i32 s4, s6 -; GFX10-NEXT: s_cselect_b32 s4, s4, s6 +; GFX10-NEXT: s_cmp_gt_i32 s4, s5 +; GFX10-NEXT: s_cselect_b32 s4, s4, s5 ; GFX10-NEXT: s_cmp_gt_i32 s2, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s9, s10 +; GFX10-NEXT: s_pack_ll_b32_b16 s5, s9, s8 ; GFX10-NEXT: s_cselect_b32 s2, s2, s3 -; GFX10-NEXT: s_sext_i32_i16 s3, s6 +; GFX10-NEXT: s_sext_i32_i16 s3, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s4, s2 -; GFX10-NEXT: s_ashr_i32 s4, s6, 16 -; GFX10-NEXT: s_sext_i32_i16 s6, s2 +; GFX10-NEXT: s_ashr_i32 s4, s5, 16 +; GFX10-NEXT: s_sext_i32_i16 s5, s2 ; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_cmp_lt_i32 s6, s3 -; GFX10-NEXT: s_cselect_b32 s3, s6, s3 +; GFX10-NEXT: s_cmp_lt_i32 s5, s3 +; GFX10-NEXT: s_cselect_b32 s3, s5, s3 ; GFX10-NEXT: s_cmp_lt_i32 s2, s4 ; GFX10-NEXT: s_cselect_b32 s2, s2, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s2 @@ -5676,29 +5612,26 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX9-LABEL: v_saddsat_v6i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s5, 0x8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, 0, 0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: v_pk_min_i16 v7, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v7, s5, v7 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_max_i16 v6, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v6, s4, v6 -; GFX9-NEXT: v_pk_max_i16 v3, v7, v3 +; GFX9-NEXT: v_pk_min_i16 v8, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0x80008000 +; GFX9-NEXT: v_pk_sub_i16 v8, v9, v8 +; GFX9-NEXT: v_pk_max_i16 v6, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fff7fff +; GFX9-NEXT: v_pk_sub_i16 v6, v7, v6 +; GFX9-NEXT: v_pk_max_i16 v3, v8, v3 ; GFX9-NEXT: v_pk_min_i16 v3, v3, v6 -; GFX9-NEXT: v_pk_min_i16 v6, v1, s6 +; GFX9-NEXT: v_pk_min_i16 v6, v1, 0 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v3 -; GFX9-NEXT: v_pk_max_i16 v3, v1, s6 -; GFX9-NEXT: v_pk_sub_i16 v6, s5, v6 -; GFX9-NEXT: v_pk_sub_i16 v3, s4, v3 +; GFX9-NEXT: v_pk_max_i16 v3, v1, 0 +; GFX9-NEXT: v_pk_sub_i16 v6, v9, v6 +; GFX9-NEXT: v_pk_sub_i16 v3, v7, v3 ; GFX9-NEXT: v_pk_max_i16 v4, v6, v4 ; GFX9-NEXT: v_pk_min_i16 v3, v4, v3 -; GFX9-NEXT: v_pk_min_i16 v4, v2, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, s5, v4 +; GFX9-NEXT: v_pk_min_i16 v4, v2, 0 +; GFX9-NEXT: v_pk_sub_i16 v4, v9, v4 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 -; GFX9-NEXT: v_pk_max_i16 v3, v2, s6 -; GFX9-NEXT: v_pk_sub_i16 v3, s4, v3 +; GFX9-NEXT: v_pk_max_i16 v3, v2, 0 +; GFX9-NEXT: v_pk_sub_i16 v3, v7, v3 ; GFX9-NEXT: v_pk_max_i16 v4, v4, v5 ; GFX9-NEXT: v_pk_min_i16 v3, v4, v3 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v3 @@ -5708,28 +5641,23 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 
0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 0, 0 -; GFX10-NEXT: s_movk_i32 s6, 0x8000 -; GFX10-NEXT: v_pk_min_i16 v7, v0, s5 -; GFX10-NEXT: v_pk_min_i16 v8, v1, s5 -; GFX10-NEXT: v_pk_min_i16 v9, v2, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 -; GFX10-NEXT: v_pk_max_i16 v6, v0, s5 -; GFX10-NEXT: v_pk_sub_i16 v14, s6, v7 -; GFX10-NEXT: v_pk_sub_i16 v15, s6, v8 -; GFX10-NEXT: v_pk_sub_i16 v19, s6, v9 -; GFX10-NEXT: v_pk_max_i16 v10, v1, s5 -; GFX10-NEXT: v_pk_max_i16 v11, v2, s5 -; GFX10-NEXT: s_movk_i32 s4, 0x7fff +; GFX10-NEXT: v_pk_min_i16 v7, v0, 0 +; GFX10-NEXT: v_pk_min_i16 v8, v1, 0 +; GFX10-NEXT: v_pk_min_i16 v9, v2, 0 +; GFX10-NEXT: v_pk_max_i16 v6, v0, 0 +; GFX10-NEXT: v_pk_max_i16 v10, v1, 0 +; GFX10-NEXT: v_pk_sub_i16 v14, 0x80008000, v7 +; GFX10-NEXT: v_pk_sub_i16 v15, 0x80008000, v8 +; GFX10-NEXT: v_pk_max_i16 v11, v2, 0 +; GFX10-NEXT: v_pk_sub_i16 v19, 0x80008000, v9 +; GFX10-NEXT: v_pk_sub_i16 v6, 0x7fff7fff, v6 ; GFX10-NEXT: v_pk_max_i16 v3, v14, v3 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX10-NEXT: v_pk_sub_i16 v7, 0x7fff7fff, v10 ; GFX10-NEXT: v_pk_max_i16 v4, v15, v4 -; GFX10-NEXT: v_pk_sub_i16 v6, s4, v6 -; GFX10-NEXT: v_pk_sub_i16 v7, s4, v10 -; GFX10-NEXT: v_pk_sub_i16 v8, s4, v11 +; GFX10-NEXT: v_pk_sub_i16 v8, 0x7fff7fff, v11 ; GFX10-NEXT: v_pk_max_i16 v5, v19, v5 -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_pk_min_i16 v3, v3, v6 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_pk_min_i16 v4, v4, v7 ; GFX10-NEXT: v_pk_min_i16 v5, v5, v8 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v3 @@ -5968,119 +5896,115 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; ; GFX9-LABEL: s_saddsat_v6i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s8, 0, 0 -; GFX9-NEXT: s_sext_i32_i16 s11, s8 -; GFX9-NEXT: s_sext_i32_i16 s9, s0 -; GFX9-NEXT: s_ashr_i32 s10, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s8, s0 +; GFX9-NEXT: s_ashr_i32 s9, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s10, 0 +; GFX9-NEXT: s_cmp_gt_i32 s8, s10 +; GFX9-NEXT: s_cselect_b32 s11, s8, s10 +; GFX9-NEXT: s_cmp_gt_i32 s9, 0 +; GFX9-NEXT: s_cselect_b32 s12, s9, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX9-NEXT: s_mov_b32 s6, 0x7fff7fff +; GFX9-NEXT: s_lshr_b32 s13, s11, 16 +; GFX9-NEXT: s_movk_i32 s12, 0x7fff +; GFX9-NEXT: s_sub_i32 s11, s6, s11 +; GFX9-NEXT: s_sub_i32 s13, s12, s13 +; GFX9-NEXT: s_cmp_lt_i32 s8, s10 +; GFX9-NEXT: s_cselect_b32 s8, s8, s10 +; GFX9-NEXT: s_cmp_lt_i32 s9, 0 +; GFX9-NEXT: s_cselect_b32 s9, s9, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13 +; GFX9-NEXT: s_mov_b32 s7, 0x80008000 +; GFX9-NEXT: s_lshr_b32 s13, s8, 16 +; GFX9-NEXT: s_mov_b32 s9, 0x8000 +; GFX9-NEXT: s_sub_i32 s8, s7, s8 +; GFX9-NEXT: s_sub_i32 s13, s9, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s13 +; GFX9-NEXT: s_sext_i32_i16 s13, s8 +; GFX9-NEXT: s_sext_i32_i16 s14, s3 ; GFX9-NEXT: s_ashr_i32 s8, s8, 16 -; GFX9-NEXT: s_cmp_gt_i32 s9, s11 -; GFX9-NEXT: s_cselect_b32 s12, s9, s11 -; GFX9-NEXT: s_cmp_gt_i32 s10, s8 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: s_cselect_b32 s13, s10, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s6 -; GFX9-NEXT: s_lshr_b32 s14, s12, 16 -; GFX9-NEXT: s_lshr_b32 s13, s6, 16 -; GFX9-NEXT: s_sub_i32 s12, s6, s12 -; GFX9-NEXT: s_sub_i32 s14, s13, s14 -; GFX9-NEXT: s_cmp_lt_i32 s9, s11 -; GFX9-NEXT: s_cselect_b32 s9, s9, s11 -; GFX9-NEXT: s_cmp_lt_i32 s10, s8 -; GFX9-NEXT: s_movk_i32 s7, 0x8000 -; GFX9-NEXT: s_cselect_b32 s10, s10, s8 -; GFX9-NEXT: 
s_pack_ll_b32_b16 s7, s7, s7 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s14 -; GFX9-NEXT: s_lshr_b32 s14, s9, 16 -; GFX9-NEXT: s_lshr_b32 s10, s7, 16 -; GFX9-NEXT: s_sub_i32 s9, s7, s9 -; GFX9-NEXT: s_sub_i32 s14, s10, s14 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s14 -; GFX9-NEXT: s_sext_i32_i16 s14, s9 -; GFX9-NEXT: s_sext_i32_i16 s15, s3 -; GFX9-NEXT: s_ashr_i32 s9, s9, 16 ; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_cmp_gt_i32 s14, s15 -; GFX9-NEXT: s_cselect_b32 s14, s14, s15 -; GFX9-NEXT: s_cmp_gt_i32 s9, s3 -; GFX9-NEXT: s_cselect_b32 s3, s9, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s14, s3 -; GFX9-NEXT: s_sext_i32_i16 s9, s3 -; GFX9-NEXT: s_sext_i32_i16 s14, s12 +; GFX9-NEXT: s_cmp_gt_i32 s13, s14 +; GFX9-NEXT: s_cselect_b32 s13, s13, s14 +; GFX9-NEXT: s_cmp_gt_i32 s8, s3 +; GFX9-NEXT: s_cselect_b32 s3, s8, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s13, s3 +; GFX9-NEXT: s_sext_i32_i16 s8, s3 +; GFX9-NEXT: s_sext_i32_i16 s13, s11 ; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_ashr_i32 s12, s12, 16 -; GFX9-NEXT: s_cmp_lt_i32 s9, s14 -; GFX9-NEXT: s_cselect_b32 s9, s9, s14 -; GFX9-NEXT: s_cmp_lt_i32 s3, s12 -; GFX9-NEXT: s_cselect_b32 s3, s3, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s9, s3 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_lshr_b32 s12, s3, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s3 -; GFX9-NEXT: s_add_i32 s9, s9, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s9 -; GFX9-NEXT: s_sext_i32_i16 s3, s1 -; GFX9-NEXT: s_ashr_i32 s9, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s3, s11 -; GFX9-NEXT: s_cselect_b32 s12, s3, s11 -; GFX9-NEXT: s_cmp_gt_i32 s9, s8 -; GFX9-NEXT: s_cselect_b32 s14, s9, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s14 -; GFX9-NEXT: s_lshr_b32 s14, s12, 16 -; GFX9-NEXT: s_sub_i32 s12, s6, s12 -; GFX9-NEXT: s_sub_i32 s14, s13, s14 +; GFX9-NEXT: s_ashr_i32 s11, s11, 16 +; GFX9-NEXT: s_cmp_lt_i32 s8, s13 +; GFX9-NEXT: s_cselect_b32 s8, s8, s13 ; GFX9-NEXT: s_cmp_lt_i32 s3, s11 ; GFX9-NEXT: s_cselect_b32 s3, s3, s11 -; GFX9-NEXT: s_cmp_lt_i32 s9, s8 -; GFX9-NEXT: s_cselect_b32 s9, s9, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s9 -; GFX9-NEXT: s_lshr_b32 s9, s3, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s8, s3 +; GFX9-NEXT: s_lshr_b32 s8, s0, 16 +; GFX9-NEXT: s_lshr_b32 s11, s3, 16 +; GFX9-NEXT: s_add_i32 s0, s0, s3 +; GFX9-NEXT: s_add_i32 s8, s8, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX9-NEXT: s_sext_i32_i16 s3, s1 +; GFX9-NEXT: s_ashr_i32 s8, s1, 16 +; GFX9-NEXT: s_cmp_gt_i32 s3, s10 +; GFX9-NEXT: s_cselect_b32 s11, s3, s10 +; GFX9-NEXT: s_cmp_gt_i32 s8, 0 +; GFX9-NEXT: s_cselect_b32 s13, s8, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13 +; GFX9-NEXT: s_lshr_b32 s13, s11, 16 +; GFX9-NEXT: s_sub_i32 s11, s6, s11 +; GFX9-NEXT: s_sub_i32 s13, s12, s13 +; GFX9-NEXT: s_cmp_lt_i32 s3, s10 +; GFX9-NEXT: s_cselect_b32 s3, s3, s10 +; GFX9-NEXT: s_cmp_lt_i32 s8, 0 +; GFX9-NEXT: s_cselect_b32 s8, s8, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s8 +; GFX9-NEXT: s_lshr_b32 s8, s3, 16 ; GFX9-NEXT: s_sub_i32 s3, s7, s3 -; GFX9-NEXT: s_sub_i32 s9, s10, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s14 -; GFX9-NEXT: s_sext_i32_i16 s9, s3 -; GFX9-NEXT: s_sext_i32_i16 s14, s4 +; GFX9-NEXT: s_sub_i32 s8, s9, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13 +; GFX9-NEXT: s_sext_i32_i16 s8, s3 +; GFX9-NEXT: s_sext_i32_i16 s13, s4 ; GFX9-NEXT: s_ashr_i32 s3, s3, 16 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_gt_i32 s9, s14 -; GFX9-NEXT: 
s_cselect_b32 s9, s9, s14
+; GFX9-NEXT:    s_cmp_gt_i32 s8, s13
+; GFX9-NEXT:    s_cselect_b32 s8, s8, s13
 ; GFX9-NEXT:    s_cmp_gt_i32 s3, s4
 ; GFX9-NEXT:    s_cselect_b32 s3, s3, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s9, s3
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s8, s3
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s3
-; GFX9-NEXT:    s_sext_i32_i16 s9, s12
+; GFX9-NEXT:    s_sext_i32_i16 s8, s11
 ; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NEXT:    s_ashr_i32 s12, s12, 16
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s9
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s9
-; GFX9-NEXT:    s_cmp_lt_i32 s3, s12
-; GFX9-NEXT:    s_cselect_b32 s3, s3, s12
+; GFX9-NEXT:    s_ashr_i32 s11, s11, 16
+; GFX9-NEXT:    s_cmp_lt_i32 s4, s8
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s8
+; GFX9-NEXT:    s_cmp_lt_i32 s3, s11
+; GFX9-NEXT:    s_cselect_b32 s3, s3, s11
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s3
 ; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s9, s3, 16
+; GFX9-NEXT:    s_lshr_b32 s8, s3, 16
 ; GFX9-NEXT:    s_add_i32 s1, s1, s3
-; GFX9-NEXT:    s_add_i32 s4, s4, s9
+; GFX9-NEXT:    s_add_i32 s4, s4, s8
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
 ; GFX9-NEXT:    s_sext_i32_i16 s3, s2
 ; GFX9-NEXT:    s_ashr_i32 s4, s2, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s3, s11
-; GFX9-NEXT:    s_cselect_b32 s9, s3, s11
-; GFX9-NEXT:    s_cmp_gt_i32 s4, s8
-; GFX9-NEXT:    s_cselect_b32 s12, s4, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s9, s9, s12
-; GFX9-NEXT:    s_lshr_b32 s12, s9, 16
-; GFX9-NEXT:    s_sub_i32 s6, s6, s9
-; GFX9-NEXT:    s_sub_i32 s9, s13, s12
-; GFX9-NEXT:    s_cmp_lt_i32 s3, s11
-; GFX9-NEXT:    s_cselect_b32 s3, s3, s11
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s8
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s8
+; GFX9-NEXT:    s_cmp_gt_i32 s3, s10
+; GFX9-NEXT:    s_cselect_b32 s8, s3, s10
+; GFX9-NEXT:    s_cmp_gt_i32 s4, 0
+; GFX9-NEXT:    s_cselect_b32 s11, s4, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s11
+; GFX9-NEXT:    s_lshr_b32 s11, s8, 16
+; GFX9-NEXT:    s_sub_i32 s6, s6, s8
+; GFX9-NEXT:    s_sub_i32 s8, s12, s11
+; GFX9-NEXT:    s_cmp_lt_i32 s3, s10
+; GFX9-NEXT:    s_cselect_b32 s3, s3, s10
+; GFX9-NEXT:    s_cmp_lt_i32 s4, 0
+; GFX9-NEXT:    s_cselect_b32 s4, s4, 0
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
 ; GFX9-NEXT:    s_sub_i32 s3, s7, s3
-; GFX9-NEXT:    s_sub_i32 s4, s10, s4
+; GFX9-NEXT:    s_sub_i32 s4, s9, s4
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s3
 ; GFX9-NEXT:    s_sext_i32_i16 s7, s5
@@ -6091,7 +6015,7 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX9-NEXT:    s_cmp_gt_i32 s3, s5
 ; GFX9-NEXT:    s_cselect_b32 s3, s3, s5
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s9
+; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s8
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s3
 ; GFX9-NEXT:    s_sext_i32_i16 s5, s6
 ; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
@@ -6110,141 +6034,137 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ;
 ; GFX10-LABEL: s_saddsat_v6i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, 0, 0
-; GFX10-NEXT:    s_sext_i32_i16 s7, s0
-; GFX10-NEXT:    s_sext_i32_i16 s9, s6
+; GFX10-NEXT:    s_sext_i32_i16 s6, s0
+; GFX10-NEXT:    s_sext_i32_i16 s7, 0
 ; GFX10-NEXT:    s_ashr_i32 s8, s0, 16
-; GFX10-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s7, s9
-; GFX10-NEXT:    s_movk_i32 s11, 0x7fff
-; GFX10-NEXT:    s_cselect_b32 s10, s7, s9
-; GFX10-NEXT:    s_cmp_gt_i32 s8, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s11, s11, s11
-; GFX10-NEXT:    s_cselect_b32 s12, s8, s6
-; GFX10-NEXT:    s_movk_i32 s14, 0x8000
-; GFX10-NEXT:    s_pack_ll_b32_b16 s10, s10, s12
-; GFX10-NEXT:    s_lshr_b32 s12, s11, 16
-; GFX10-NEXT:    s_lshr_b32 s13, s10, 16
-; GFX10-NEXT:    s_sub_i32 s10, s11, s10
-; GFX10-NEXT:    s_sub_i32 s13, s12, s13
-; GFX10-NEXT:    s_cmp_lt_i32 s7, s9
-; GFX10-NEXT:    s_pack_ll_b32_b16 s14, s14, s14
-; GFX10-NEXT:    s_cselect_b32 s7, s7, s9
-; GFX10-NEXT:    s_cmp_lt_i32 s8, s6
-; GFX10-NEXT:    s_sext_i32_i16 s16, s3
-; GFX10-NEXT:    s_cselect_b32 s8, s8, s6
-; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX10-NEXT:    s_lshr_b32 s8, s14, 16
-; GFX10-NEXT:    s_lshr_b32 s15, s7, 16
-; GFX10-NEXT:    s_sub_i32 s7, s14, s7
-; GFX10-NEXT:    s_sub_i32 s15, s8, s15
+; GFX10-NEXT:    s_cmp_gt_i32 s6, s7
+; GFX10-NEXT:    s_mov_b32 s11, 0x7fff7fff
+; GFX10-NEXT:    s_cselect_b32 s9, s6, s7
+; GFX10-NEXT:    s_cmp_gt_i32 s8, 0
+; GFX10-NEXT:    s_mov_b32 s13, 0x80008000
+; GFX10-NEXT:    s_cselect_b32 s10, s8, 0
+; GFX10-NEXT:    s_sext_i32_i16 s15, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s9, s10
+; GFX10-NEXT:    s_movk_i32 s10, 0x7fff
+; GFX10-NEXT:    s_lshr_b32 s12, s9, 16
+; GFX10-NEXT:    s_sub_i32 s9, s11, s9
+; GFX10-NEXT:    s_sub_i32 s12, s10, s12
+; GFX10-NEXT:    s_cmp_lt_i32 s6, s7
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s7, s15
-; GFX10-NEXT:    s_sext_i32_i16 s15, s7
-; GFX10-NEXT:    s_ashr_i32 s7, s7, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s15, s16
-; GFX10-NEXT:    s_cselect_b32 s15, s15, s16
-; GFX10-NEXT:    s_cmp_gt_i32 s7, s3
-; GFX10-NEXT:    s_sext_i32_i16 s16, s4
-; GFX10-NEXT:    s_cselect_b32 s3, s7, s3
-; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s10, s13
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s15, s3
-; GFX10-NEXT:    s_sext_i32_i16 s13, s7
-; GFX10-NEXT:    s_sext_i32_i16 s10, s3
+; GFX10-NEXT:    s_cselect_b32 s6, s6, s7
+; GFX10-NEXT:    s_cmp_lt_i32 s8, 0
+; GFX10-NEXT:    s_cselect_b32 s8, s8, 0
 ; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX10-NEXT:    s_ashr_i32 s7, s7, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s10, s13
-; GFX10-NEXT:    s_cselect_b32 s10, s10, s13
-; GFX10-NEXT:    s_cmp_lt_i32 s3, s7
-; GFX10-NEXT:    s_cselect_b32 s3, s3, s7
-; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s10, s3
-; GFX10-NEXT:    s_lshr_b32 s10, s3, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s8
+; GFX10-NEXT:    s_mov_b32 s8, 0x8000
+; GFX10-NEXT:    s_lshr_b32 s14, s6, 16
+; GFX10-NEXT:    s_sub_i32 s6, s13, s6
+; GFX10-NEXT:    s_sub_i32 s14, s8, s14
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s14
+; GFX10-NEXT:    s_sext_i32_i16 s14, s6
+; GFX10-NEXT:    s_ashr_i32 s6, s6, 16
+; GFX10-NEXT:    s_cmp_gt_i32 s14, s15
+; GFX10-NEXT:    s_cselect_b32 s14, s14, s15
+; GFX10-NEXT:    s_cmp_gt_i32 s6, s3
+; GFX10-NEXT:    s_sext_i32_i16 s15, s4
+; GFX10-NEXT:    s_cselect_b32 s3, s6, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s9, s12
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s14, s3
+; GFX10-NEXT:    s_sext_i32_i16 s12, s6
+; GFX10-NEXT:    s_sext_i32_i16 s9, s3
+; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
+; GFX10-NEXT:    s_ashr_i32 s6, s6, 16
+; GFX10-NEXT:    s_cmp_lt_i32 s9, s12
+; GFX10-NEXT:    s_cselect_b32 s9, s9, s12
+; GFX10-NEXT:    s_cmp_lt_i32 s3, s6
+; GFX10-NEXT:    s_cselect_b32 s3, s3, s6
+; GFX10-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s9, s3
+; GFX10-NEXT:    s_lshr_b32 s9, s3, 16
 ; GFX10-NEXT:    s_add_i32 s0, s0, s3
 ; GFX10-NEXT:    s_sext_i32_i16 s3, s1
-; GFX10-NEXT:    s_add_i32 s7, s7, s10
-; GFX10-NEXT:    s_ashr_i32 s10, s1, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s3, s9
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s7
-; GFX10-NEXT:    s_cselect_b32 s13, s3, s9
-; GFX10-NEXT:    s_cmp_gt_i32 s10, s6
-; GFX10-NEXT:    s_cselect_b32 s15, s10, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s13, s13, s15
-; GFX10-NEXT:    s_lshr_b32 s15, s13, 16
-; GFX10-NEXT:    s_sub_i32 s13, s11, s13
-; GFX10-NEXT:    s_sub_i32 s15, s12, s15
-; GFX10-NEXT:    s_cmp_lt_i32 s3, s9
-; GFX10-NEXT:    s_cselect_b32 s3, s3, s9
-; GFX10-NEXT:    s_cmp_lt_i32 s10, s6
-; GFX10-NEXT:    s_cselect_b32 s10, s10, s6
+; GFX10-NEXT:    s_add_i32 s6, s6, s9
+; GFX10-NEXT:    s_ashr_i32 s9, s1, 16
+; GFX10-NEXT:    s_cmp_gt_i32 s3, s7
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
+; GFX10-NEXT:    s_cselect_b32 s12, s3, s7
+; GFX10-NEXT:    s_cmp_gt_i32 s9, 0
+; GFX10-NEXT:    s_cselect_b32 s14, s9, 0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s12, s12, s14
+; GFX10-NEXT:    s_lshr_b32 s14, s12, 16
+; GFX10-NEXT:    s_sub_i32 s12, s11, s12
+; GFX10-NEXT:    s_sub_i32 s14, s10, s14
+; GFX10-NEXT:    s_cmp_lt_i32 s3, s7
+; GFX10-NEXT:    s_pack_ll_b32_b16 s12, s12, s14
+; GFX10-NEXT:    s_cselect_b32 s3, s3, s7
+; GFX10-NEXT:    s_cmp_lt_i32 s9, 0
+; GFX10-NEXT:    s_cselect_b32 s9, s9, 0
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s10
-; GFX10-NEXT:    s_lshr_b32 s10, s3, 16
-; GFX10-NEXT:    s_sub_i32 s3, s14, s3
-; GFX10-NEXT:    s_sub_i32 s10, s8, s10
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s10
-; GFX10-NEXT:    s_sext_i32_i16 s10, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s9
+; GFX10-NEXT:    s_lshr_b32 s9, s3, 16
+; GFX10-NEXT:    s_sub_i32 s3, s13, s3
+; GFX10-NEXT:    s_sub_i32 s9, s8, s9
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s9
+; GFX10-NEXT:    s_sext_i32_i16 s9, s3
 ; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s10, s16
-; GFX10-NEXT:    s_cselect_b32 s10, s10, s16
+; GFX10-NEXT:    s_cmp_gt_i32 s9, s15
+; GFX10-NEXT:    s_cselect_b32 s9, s9, s15
 ; GFX10-NEXT:    s_cmp_gt_i32 s3, s4
 ; GFX10-NEXT:    s_cselect_b32 s3, s3, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s13, s15
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s10, s3
-; GFX10-NEXT:    s_sext_i32_i16 s13, s4
-; GFX10-NEXT:    s_sext_i32_i16 s10, s3
+; GFX10-NEXT:    s_sext_i32_i16 s4, s12
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s9, s3
+; GFX10-NEXT:    s_ashr_i32 s9, s12, 16
+; GFX10-NEXT:    s_sext_i32_i16 s12, s3
 ; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s10, s13
-; GFX10-NEXT:    s_cselect_b32 s10, s10, s13
-; GFX10-NEXT:    s_cmp_lt_i32 s3, s4
-; GFX10-NEXT:    s_cselect_b32 s3, s3, s4
-; GFX10-NEXT:    s_sext_i32_i16 s4, s2
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s10, s3
-; GFX10-NEXT:    s_lshr_b32 s10, s1, 16
-; GFX10-NEXT:    s_lshr_b32 s13, s3, 16
-; GFX10-NEXT:    s_add_i32 s1, s1, s3
-; GFX10-NEXT:    s_add_i32 s10, s10, s13
-; GFX10-NEXT:    s_ashr_i32 s3, s2, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s4, s9
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s10
-; GFX10-NEXT:    s_cselect_b32 s13, s4, s9
-; GFX10-NEXT:    s_cmp_gt_i32 s3, s6
-; GFX10-NEXT:    s_cselect_b32 s15, s3, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s13, s13, s15
-; GFX10-NEXT:    s_lshr_b32 s15, s13, 16
-; GFX10-NEXT:    s_sub_i32 s11, s11, s13
-; GFX10-NEXT:    s_sub_i32 s12, s12, s15
-; GFX10-NEXT:    s_cmp_lt_i32 s4, s9
-; GFX10-NEXT:    s_cselect_b32 s4, s4, s9
-; GFX10-NEXT:    s_cmp_lt_i32 s3, s6
-; GFX10-NEXT:    s_cselect_b32 s3, s3, s6
-; GFX10-NEXT:    s_sext_i32_i16 s6, s5
+; GFX10-NEXT:    s_cmp_lt_i32 s12, s4
+; GFX10-NEXT:    s_cselect_b32 s4, s12, s4
+; GFX10-NEXT:    s_cmp_lt_i32 s3, s9
+; GFX10-NEXT:    s_sext_i32_i16 s12, s2
+; GFX10-NEXT:    s_cselect_b32 s3, s3, s9
+; GFX10-NEXT:    s_lshr_b32 s9, s1, 16
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s4, s3
-; GFX10-NEXT:    s_lshr_b32 s4, s3, 16
-; GFX10-NEXT:    s_sub_i32 s3, s14, s3
-; GFX10-NEXT:    s_sub_i32 s4, s8, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
-; GFX10-NEXT:    s_ashr_i32 s4, s5, 16
+; GFX10-NEXT:    s_ashr_i32 s4, s2, 16
+; GFX10-NEXT:    s_lshr_b32 s14, s3, 16
+; GFX10-NEXT:    s_add_i32 s1, s1, s3
+; GFX10-NEXT:    s_add_i32 s9, s9, s14
+; GFX10-NEXT:    s_cmp_gt_i32 s12, s7
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s9
+; GFX10-NEXT:    s_cselect_b32 s3, s12, s7
+; GFX10-NEXT:    s_cmp_gt_i32 s4, 0
+; GFX10-NEXT:    s_cselect_b32 s14, s4, 0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s14
+; GFX10-NEXT:    s_lshr_b32 s14, s3, 16
+; GFX10-NEXT:    s_sub_i32 s3, s11, s3
+; GFX10-NEXT:    s_sub_i32 s10, s10, s14
+; GFX10-NEXT:    s_cmp_lt_i32 s12, s7
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s10
+; GFX10-NEXT:    s_cselect_b32 s7, s12, s7
+; GFX10-NEXT:    s_cmp_lt_i32 s4, 0
+; GFX10-NEXT:    s_cselect_b32 s4, s4, 0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s7, s4
+; GFX10-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX10-NEXT:    s_sub_i32 s4, s13, s4
+; GFX10-NEXT:    s_sub_i32 s7, s8, s7
+; GFX10-NEXT:    s_sext_i32_i16 s8, s5
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s7
+; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
+; GFX10-NEXT:    s_sext_i32_i16 s7, s4
+; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX10-NEXT:    s_cmp_gt_i32 s7, s8
+; GFX10-NEXT:    s_cselect_b32 s7, s7, s8
+; GFX10-NEXT:    s_cmp_gt_i32 s4, s5
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s5
 ; GFX10-NEXT:    s_sext_i32_i16 s5, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s7, s4
 ; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s5, s6
-; GFX10-NEXT:    s_cselect_b32 s5, s5, s6
-; GFX10-NEXT:    s_cmp_gt_i32 s3, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s11, s12
-; GFX10-NEXT:    s_cselect_b32 s3, s3, s4
-; GFX10-NEXT:    s_sext_i32_i16 s4, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s5, s3
-; GFX10-NEXT:    s_ashr_i32 s5, s6, 16
-; GFX10-NEXT:    s_sext_i32_i16 s6, s3
-; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s6, s4
-; GFX10-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX10-NEXT:    s_cmp_lt_i32 s3, s5
-; GFX10-NEXT:    s_cselect_b32 s3, s3, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s4, s3
+; GFX10-NEXT:    s_sext_i32_i16 s7, s4
+; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX10-NEXT:    s_cmp_lt_i32 s7, s5
+; GFX10-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX10-NEXT:    s_cmp_lt_i32 s4, s3
+; GFX10-NEXT:    s_cselect_b32 s3, s4, s3
 ; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s5, s3
 ; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
 ; GFX10-NEXT:    s_add_i32 s2, s2, s3
 ; GFX10-NEXT:    s_add_i32 s4, s4, s5
@@ -6438,36 +6358,33 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX9-LABEL: v_saddsat_v8i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s5, 0x8000
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, 0, 0
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX9-NEXT:    v_pk_min_i16 v9, v0, s6
-; GFX9-NEXT:    v_pk_sub_i16 v9, s5, v9
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
-; GFX9-NEXT:    v_pk_max_i16 v8, v0, s6
-; GFX9-NEXT:    v_pk_sub_i16 v8, s4, v8
-; GFX9-NEXT:    v_pk_max_i16 v4, v9, v4
+; GFX9-NEXT:    v_pk_min_i16 v10, v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v11, 0x80008000
+; GFX9-NEXT:    v_pk_sub_i16 v10, v11, v10
+; GFX9-NEXT:    v_pk_max_i16 v8, v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0x7fff7fff
+; GFX9-NEXT:    v_pk_sub_i16 v8, v9, v8
+; GFX9-NEXT:    v_pk_max_i16 v4, v10, v4
 ; GFX9-NEXT:    v_pk_min_i16 v4, v4, v8
-; GFX9-NEXT:    v_pk_min_i16 v8, v1, s6
+; GFX9-NEXT:    v_pk_min_i16 v8, v1, 0
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v4
-; GFX9-NEXT:    v_pk_max_i16 v4, v1, s6
-; GFX9-NEXT:    v_pk_sub_i16 v8, s5, v8
-; GFX9-NEXT:    v_pk_sub_i16 v4, s4, v4
+; GFX9-NEXT:    v_pk_max_i16 v4, v1, 0
+; GFX9-NEXT:    v_pk_sub_i16 v8, v11, v8
+; GFX9-NEXT:    v_pk_sub_i16 v4, v9, v4
 ; GFX9-NEXT:    v_pk_max_i16 v5, v8, v5
 ; GFX9-NEXT:    v_pk_min_i16 v4, v5, v4
-; GFX9-NEXT:    v_pk_min_i16 v5, v2, s6
-; GFX9-NEXT:    v_pk_sub_i16 v5, s5, v5
+; GFX9-NEXT:    v_pk_min_i16 v5, v2, 0
+; GFX9-NEXT:    v_pk_sub_i16 v5, v11, v5
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v4
-; GFX9-NEXT:    v_pk_max_i16 v4, v2, s6
-; GFX9-NEXT:    v_pk_sub_i16 v4, s4, v4
+; GFX9-NEXT:    v_pk_max_i16 v4, v2, 0
+; GFX9-NEXT:    v_pk_sub_i16 v4, v9, v4
 ; GFX9-NEXT:    v_pk_max_i16 v5, v5, v6
 ; GFX9-NEXT:    v_pk_min_i16 v4, v5, v4
-; GFX9-NEXT:    v_pk_min_i16 v5, v3, s6
-; GFX9-NEXT:    v_pk_sub_i16 v5, s5, v5
+; GFX9-NEXT:    v_pk_min_i16 v5, v3, 0
+; GFX9-NEXT:    v_pk_sub_i16 v5, v11, v5
 ; GFX9-NEXT:    v_pk_add_u16 v2, v2, v4
-; GFX9-NEXT:    v_pk_max_i16 v4, v3, s6
-; GFX9-NEXT:    v_pk_sub_i16 v4, s4, v4
+; GFX9-NEXT:    v_pk_max_i16 v4, v3, 0
+; GFX9-NEXT:    v_pk_sub_i16 v4, v9, v4
 ; GFX9-NEXT:    v_pk_max_i16 v5, v5, v7
 ; GFX9-NEXT:    v_pk_min_i16 v4, v5, v4
 ; GFX9-NEXT:    v_pk_add_u16 v3, v3, v4
@@ -6477,30 +6394,25 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 0, 0
-; GFX10-NEXT:    s_movk_i32 s5, 0x8000
-; GFX10-NEXT:    v_pk_min_i16 v8, v0, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX10-NEXT:    v_pk_min_i16 v11, v1, s4
-; GFX10-NEXT:    v_pk_min_i16 v12, v3, s4
-; GFX10-NEXT:    v_pk_max_i16 v9, v0, s4
-; GFX10-NEXT:    v_pk_sub_i16 v15, s5, v8
-; GFX10-NEXT:    v_pk_min_i16 v8, v2, s4
-; GFX10-NEXT:    v_pk_sub_i16 v11, s5, v11
-; GFX10-NEXT:    v_pk_sub_i16 v12, s5, v12
-; GFX10-NEXT:    v_pk_max_i16 v10, v1, s4
-; GFX10-NEXT:    v_pk_max_i16 v13, v2, s4
-; GFX10-NEXT:    v_pk_sub_i16 v8, s5, v8
-; GFX10-NEXT:    v_pk_max_i16 v14, v3, s4
-; GFX10-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX10-NEXT:    v_pk_max_i16 v4, v15, v4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s6
+; GFX10-NEXT:    v_pk_min_i16 v8, v0, 0
+; GFX10-NEXT:    v_pk_min_i16 v11, v1, 0
+; GFX10-NEXT:    v_pk_min_i16 v12, v3, 0
+; GFX10-NEXT:    v_pk_max_i16 v9, v0, 0
+; GFX10-NEXT:    v_pk_max_i16 v10, v1, 0
+; GFX10-NEXT:    v_pk_sub_i16 v15, 0x80008000, v8
+; GFX10-NEXT:    v_pk_min_i16 v8, v2, 0
+; GFX10-NEXT:    v_pk_sub_i16 v11, 0x80008000, v11
+; GFX10-NEXT:    v_pk_sub_i16 v12, 0x80008000, v12
+; GFX10-NEXT:    v_pk_max_i16 v13, v2, 0
+; GFX10-NEXT:    v_pk_max_i16 v14, v3, 0
+; GFX10-NEXT:    v_pk_sub_i16 v8, 0x80008000, v8
 ; GFX10-NEXT:    v_pk_max_i16 v5, v11, v5
-; GFX10-NEXT:    v_pk_sub_i16 v9, s6, v9
-; GFX10-NEXT:    v_pk_sub_i16 v10, s6, v10
+; GFX10-NEXT:    v_pk_sub_i16 v10, 0x7fff7fff, v10
+; GFX10-NEXT:    v_pk_sub_i16 v9, 0x7fff7fff, v9
+; GFX10-NEXT:    v_pk_max_i16 v4, v15, v4
 ; GFX10-NEXT:    v_pk_max_i16 v6, v8, v6
-; GFX10-NEXT:    v_pk_sub_i16 v11, s6, v13
-; GFX10-NEXT:    v_pk_sub_i16 v8, s6, v14
+; GFX10-NEXT:    v_pk_sub_i16 v11, 0x7fff7fff, v13
+; GFX10-NEXT:    v_pk_sub_i16 v8, 0x7fff7fff, v14
 ; GFX10-NEXT:    v_pk_max_i16 v7, v12, v7
 ; GFX10-NEXT:    v_pk_min_i16 v15, v4, v9
 ; GFX10-NEXT:    v_pk_min_i16 v19, v5, v10
@@ -6814,138 +6726,134 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ;
 ; GFX9-LABEL: s_saddsat_v8i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s10, 0, 0
-; GFX9-NEXT:    s_sext_i32_i16 s13, s10
-; GFX9-NEXT:    s_sext_i32_i16 s11, s0
-; GFX9-NEXT:    s_ashr_i32 s12, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s10, s0
+; GFX9-NEXT:    s_ashr_i32 s11, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s12, 0
+; GFX9-NEXT:    s_cmp_gt_i32 s10, s12
+; GFX9-NEXT:    s_cselect_b32 s13, s10, s12
+; GFX9-NEXT:    s_cmp_gt_i32 s11, 0
+; GFX9-NEXT:    s_cselect_b32 s14, s11, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s13, s13, s14
+; GFX9-NEXT:    s_mov_b32 s8, 0x7fff7fff
+; GFX9-NEXT:    s_lshr_b32 s15, s13, 16
+; GFX9-NEXT:    s_movk_i32 s14, 0x7fff
+; GFX9-NEXT:    s_sub_i32 s13, s8, s13
+; GFX9-NEXT:    s_sub_i32 s15, s14, s15
+; GFX9-NEXT:    s_cmp_lt_i32 s10, s12
+; GFX9-NEXT:    s_cselect_b32 s10, s10, s12
+; GFX9-NEXT:    s_cmp_lt_i32 s11, 0
+; GFX9-NEXT:    s_cselect_b32 s11, s11, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s11
+; GFX9-NEXT:    s_pack_ll_b32_b16 s13, s13, s15
+; GFX9-NEXT:    s_mov_b32 s9, 0x80008000
+; GFX9-NEXT:    s_lshr_b32 s15, s10, 16
+; GFX9-NEXT:    s_mov_b32 s11, 0x8000
+; GFX9-NEXT:    s_sub_i32 s10, s9, s10
+; GFX9-NEXT:    s_sub_i32 s15, s11, s15
+; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s15
+; GFX9-NEXT:    s_sext_i32_i16 s15, s10
+; GFX9-NEXT:    s_sext_i32_i16 s16, s4
 ; GFX9-NEXT:    s_ashr_i32 s10, s10, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s11, s13
-; GFX9-NEXT:    s_cselect_b32 s14, s11, s13
-; GFX9-NEXT:    s_cmp_gt_i32 s12, s10
-; GFX9-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX9-NEXT:    s_cselect_b32 s15, s12, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s14, s14, s15
-; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s8
-; GFX9-NEXT:    s_lshr_b32 s16, s14, 16
-; GFX9-NEXT:    s_lshr_b32 s15, s8, 16
-; GFX9-NEXT:    s_sub_i32 s14, s8, s14
-; GFX9-NEXT:    s_sub_i32 s16, s15, s16
-; GFX9-NEXT:    s_cmp_lt_i32 s11, s13
-; GFX9-NEXT:    s_cselect_b32 s11, s11, s13
-; GFX9-NEXT:    s_cmp_lt_i32 s12, s10
-; GFX9-NEXT:    s_movk_i32 s9, 0x8000
-; GFX9-NEXT:    s_cselect_b32 s12, s12, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s9, s9, s9
-; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s12
-; GFX9-NEXT:    s_pack_ll_b32_b16 s14, s14, s16
-; GFX9-NEXT:    s_lshr_b32 s16, s11, 16
-; GFX9-NEXT:    s_lshr_b32 s12, s9, 16
-; GFX9-NEXT:    s_sub_i32 s11, s9, s11
-; GFX9-NEXT:    s_sub_i32 s16, s12, s16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s16
-; GFX9-NEXT:    s_sext_i32_i16 s16, s11
-; GFX9-NEXT:    s_sext_i32_i16 s17, s4
-; GFX9-NEXT:    s_ashr_i32 s11, s11, 16
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s16, s17
-; GFX9-NEXT:    s_cselect_b32 s16, s16, s17
-; GFX9-NEXT:    s_cmp_gt_i32 s11, s4
-; GFX9-NEXT:    s_cselect_b32 s4, s11, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s16, s4
-; GFX9-NEXT:    s_sext_i32_i16 s11, s4
-; GFX9-NEXT:    s_sext_i32_i16 s16, s14
+; GFX9-NEXT:    s_cmp_gt_i32 s15, s16
+; GFX9-NEXT:    s_cselect_b32 s15, s15, s16
+; GFX9-NEXT:    s_cmp_gt_i32 s10, s4
+; GFX9-NEXT:    s_cselect_b32 s4, s10, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s15, s4
+; GFX9-NEXT:    s_sext_i32_i16 s10, s4
+; GFX9-NEXT:    s_sext_i32_i16 s15, s13
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX9-NEXT:    s_ashr_i32 s14, s14, 16
-; GFX9-NEXT:    s_cmp_lt_i32 s11, s16
-; GFX9-NEXT:    s_cselect_b32 s11, s11, s16
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s14
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s14
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s11, s4
-; GFX9-NEXT:    s_lshr_b32 s11, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s14, s4, 16
-; GFX9-NEXT:    s_add_i32 s0, s0, s4
-; GFX9-NEXT:    s_add_i32 s11, s11, s14
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s11
-; GFX9-NEXT:    s_sext_i32_i16 s4, s1
-; GFX9-NEXT:    s_ashr_i32 s11, s1, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s4, s13
-; GFX9-NEXT:    s_cselect_b32 s14, s4, s13
-; GFX9-NEXT:    s_cmp_gt_i32 s11, s10
-; GFX9-NEXT:    s_cselect_b32 s16, s11, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s14, s14, s16
-; GFX9-NEXT:    s_lshr_b32 s16, s14, 16
-; GFX9-NEXT:    s_sub_i32 s14, s8, s14
-; GFX9-NEXT:    s_sub_i32 s16, s15, s16
+; GFX9-NEXT:    s_ashr_i32 s13, s13, 16
+; GFX9-NEXT:    s_cmp_lt_i32 s10, s15
+; GFX9-NEXT:    s_cselect_b32 s10, s10, s15
 ; GFX9-NEXT:    s_cmp_lt_i32 s4, s13
 ; GFX9-NEXT:    s_cselect_b32 s4, s4, s13
-; GFX9-NEXT:    s_cmp_lt_i32 s11, s10
-; GFX9-NEXT:    s_cselect_b32 s11, s11, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s11
-; GFX9-NEXT:    s_lshr_b32 s11, s4, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s10, s4
+; GFX9-NEXT:    s_lshr_b32 s10, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s13, s4, 16
+; GFX9-NEXT:    s_add_i32 s0, s0, s4
+; GFX9-NEXT:    s_add_i32 s10, s10, s13
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s10
+; GFX9-NEXT:    s_sext_i32_i16 s4, s1
+; GFX9-NEXT:    s_ashr_i32 s10, s1, 16
+; GFX9-NEXT:    s_cmp_gt_i32 s4, s12
+; GFX9-NEXT:    s_cselect_b32 s13, s4, s12
+; GFX9-NEXT:    s_cmp_gt_i32 s10, 0
+; GFX9-NEXT:    s_cselect_b32 s15, s10, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s13, s13, s15
+; GFX9-NEXT:    s_lshr_b32 s15, s13, 16
+; GFX9-NEXT:    s_sub_i32 s13, s8, s13
+; GFX9-NEXT:    s_sub_i32 s15, s14, s15
+; GFX9-NEXT:    s_cmp_lt_i32 s4, s12
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s12
+; GFX9-NEXT:    s_cmp_lt_i32 s10, 0
+; GFX9-NEXT:    s_cselect_b32 s10, s10, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s10
+; GFX9-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX9-NEXT:    s_sub_i32 s4, s9, s4
-; GFX9-NEXT:    s_sub_i32 s11, s12, s11
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s11
-; GFX9-NEXT:    s_pack_ll_b32_b16 s14, s14, s16
-; GFX9-NEXT:    s_sext_i32_i16 s11, s4
-; GFX9-NEXT:    s_sext_i32_i16 s16, s5
+; GFX9-NEXT:    s_sub_i32 s10, s11, s10
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s10
+; GFX9-NEXT:    s_pack_ll_b32_b16 s13, s13, s15
+; GFX9-NEXT:    s_sext_i32_i16 s10, s4
+; GFX9-NEXT:    s_sext_i32_i16 s15, s5
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX9-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s11, s16
-; GFX9-NEXT:    s_cselect_b32 s11, s11, s16
+; GFX9-NEXT:    s_cmp_gt_i32 s10, s15
+; GFX9-NEXT:    s_cselect_b32 s10, s10, s15
 ; GFX9-NEXT:    s_cmp_gt_i32 s4, s5
 ; GFX9-NEXT:    s_cselect_b32 s4, s4, s5
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s11, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s10, s4
 ; GFX9-NEXT:    s_sext_i32_i16 s5, s4
-; GFX9-NEXT:    s_sext_i32_i16 s11, s14
+; GFX9-NEXT:    s_sext_i32_i16 s10, s13
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX9-NEXT:    s_ashr_i32 s14, s14, 16
-; GFX9-NEXT:    s_cmp_lt_i32 s5, s11
-; GFX9-NEXT:    s_cselect_b32 s5, s5, s11
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s14
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s14
+; GFX9-NEXT:    s_ashr_i32 s13, s13, 16
+; GFX9-NEXT:    s_cmp_lt_i32 s5, s10
+; GFX9-NEXT:    s_cselect_b32 s5, s5, s10
+; GFX9-NEXT:    s_cmp_lt_i32 s4, s13
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s13
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s5, s4
 ; GFX9-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s11, s4, 16
+; GFX9-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX9-NEXT:    s_add_i32 s1, s1, s4
-; GFX9-NEXT:    s_add_i32 s5, s5, s11
+; GFX9-NEXT:    s_add_i32 s5, s5, s10
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s2
 ; GFX9-NEXT:    s_ashr_i32 s5, s2, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s4, s13
-; GFX9-NEXT:    s_cselect_b32 s11, s4, s13
-; GFX9-NEXT:    s_cmp_gt_i32 s5, s10
-; GFX9-NEXT:    s_cselect_b32 s14, s5, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s14
-; GFX9-NEXT:    s_lshr_b32 s14, s11, 16
-; GFX9-NEXT:    s_sub_i32 s11, s8, s11
-; GFX9-NEXT:    s_sub_i32 s14, s15, s14
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s13
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s13
-; GFX9-NEXT:    s_cmp_lt_i32 s5, s10
-; GFX9-NEXT:    s_cselect_b32 s5, s5, s10
+; GFX9-NEXT:    s_cmp_gt_i32 s4, s12
+; GFX9-NEXT:    s_cselect_b32 s10, s4, s12
+; GFX9-NEXT:    s_cmp_gt_i32 s5, 0
+; GFX9-NEXT:    s_cselect_b32 s13, s5, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s13
+; GFX9-NEXT:    s_lshr_b32 s13, s10, 16
+; GFX9-NEXT:    s_sub_i32 s10, s8, s10
+; GFX9-NEXT:    s_sub_i32 s13, s14, s13
+; GFX9-NEXT:    s_cmp_lt_i32 s4, s12
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s12
+; GFX9-NEXT:    s_cmp_lt_i32 s5, 0
+; GFX9-NEXT:    s_cselect_b32 s5, s5, 0
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
 ; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
 ; GFX9-NEXT:    s_sub_i32 s4, s9, s4
-; GFX9-NEXT:    s_sub_i32 s5, s12, s5
+; GFX9-NEXT:    s_sub_i32 s5, s11, s5
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s14
+; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s13
 ; GFX9-NEXT:    s_sext_i32_i16 s5, s4
-; GFX9-NEXT:    s_sext_i32_i16 s14, s6
+; GFX9-NEXT:    s_sext_i32_i16 s13, s6
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX9-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s5, s14
-; GFX9-NEXT:    s_cselect_b32 s5, s5, s14
+; GFX9-NEXT:    s_cmp_gt_i32 s5, s13
+; GFX9-NEXT:    s_cselect_b32 s5, s5, s13
 ; GFX9-NEXT:    s_cmp_gt_i32 s4, s6
 ; GFX9-NEXT:    s_cselect_b32 s4, s4, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s5, s4
 ; GFX9-NEXT:    s_sext_i32_i16 s5, s4
-; GFX9-NEXT:    s_sext_i32_i16 s6, s11
+; GFX9-NEXT:    s_sext_i32_i16 s6, s10
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX9-NEXT:    s_ashr_i32 s11, s11, 16
+; GFX9-NEXT:    s_ashr_i32 s10, s10, 16
 ; GFX9-NEXT:    s_cmp_lt_i32 s5, s6
 ; GFX9-NEXT:    s_cselect_b32 s5, s5, s6
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s11
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s11
+; GFX9-NEXT:    s_cmp_lt_i32 s4, s10
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s10
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s5, s4
 ; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX9-NEXT:    s_lshr_b32 s6, s4, 16
@@ -6954,22 +6862,22 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s3
 ; GFX9-NEXT:    s_ashr_i32 s5, s3, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s4, s13
-; GFX9-NEXT:    s_cselect_b32 s6, s4, s13
-; GFX9-NEXT:    s_cmp_gt_i32 s5, s10
-; GFX9-NEXT:    s_cselect_b32 s11, s5, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s11
-; GFX9-NEXT:    s_lshr_b32 s11, s6, 16
+; GFX9-NEXT:    s_cmp_gt_i32 s4, s12
+; GFX9-NEXT:    s_cselect_b32 s6, s4, s12
+; GFX9-NEXT:    s_cmp_gt_i32 s5, 0
+; GFX9-NEXT:    s_cselect_b32 s10, s5, 0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s10
+; GFX9-NEXT:    s_lshr_b32 s10, s6, 16
 ; GFX9-NEXT:    s_sub_i32 s6, s8, s6
-; GFX9-NEXT:    s_sub_i32 s8, s15, s11
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s13
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s13
-; GFX9-NEXT:    s_cmp_lt_i32 s5, s10
-; GFX9-NEXT:    s_cselect_b32 s5, s5, s10
+; GFX9-NEXT:    s_sub_i32 s8, s14, s10
+; GFX9-NEXT:    s_cmp_lt_i32 s4, s12
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s12
+; GFX9-NEXT:    s_cmp_lt_i32 s5, 0
+; GFX9-NEXT:    s_cselect_b32 s5, s5, 0
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
 ; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
 ; GFX9-NEXT:    s_sub_i32 s4, s9, s4
-; GFX9-NEXT:    s_sub_i32 s5, s12, s5
+; GFX9-NEXT:    s_sub_i32 s5, s11, s5
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s8
 ; GFX9-NEXT:    s_sext_i32_i16 s5, s4
@@ -6999,188 +6907,184 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ;
 ; GFX10-LABEL: s_saddsat_v8i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s8, 0, 0
-; GFX10-NEXT:    s_sext_i32_i16 s9, s0
-; GFX10-NEXT:    s_sext_i32_i16 s11, s8
+; GFX10-NEXT:    s_sext_i32_i16 s8, s0
+; GFX10-NEXT:    s_sext_i32_i16 s9, 0
 ; GFX10-NEXT:    s_ashr_i32 s10, s0, 16
-; GFX10-NEXT:    s_ashr_i32 s8, s8, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s9, s11
-; GFX10-NEXT:    s_movk_i32 s13, 0x7fff
-; GFX10-NEXT:    s_cselect_b32 s12, s9, s11
-; GFX10-NEXT:    s_cmp_gt_i32 s10, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s13, s13, s13
-; GFX10-NEXT:    s_cselect_b32 s14, s10, s8
-; GFX10-NEXT:    s_movk_i32 s16, 0x8000
-; GFX10-NEXT:    s_pack_ll_b32_b16 s12, s12, s14
-; GFX10-NEXT:    s_lshr_b32 s14, s13, 16
-; GFX10-NEXT:    s_lshr_b32 s15, s12, 16
-; GFX10-NEXT:    s_sub_i32 s12, s13, s12
-; GFX10-NEXT:    s_sub_i32 s15, s14, s15
-; GFX10-NEXT:    s_cmp_lt_i32 s9, s11
-; GFX10-NEXT:    s_pack_ll_b32_b16 s16, s16, s16
-; GFX10-NEXT:    s_cselect_b32 s9, s9, s11
-; GFX10-NEXT:    s_cmp_lt_i32 s10, s8
-; GFX10-NEXT:    s_sext_i32_i16 s18, s4
-; GFX10-NEXT:    s_cselect_b32 s10, s10, s8
-; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s9, s10
-; GFX10-NEXT:    s_lshr_b32 s10, s16, 16
-; GFX10-NEXT:    s_lshr_b32 s17, s9, 16
-; GFX10-NEXT:    s_sub_i32 s9, s16, s9
-; GFX10-NEXT:    s_sub_i32 s17, s10, s17
+; GFX10-NEXT:    s_cmp_gt_i32 s8, s9
+; GFX10-NEXT:    s_mov_b32 s13, 0x7fff7fff
+; GFX10-NEXT:    s_cselect_b32 s11, s8, s9
+; GFX10-NEXT:    s_cmp_gt_i32 s10, 0
+; GFX10-NEXT:    s_mov_b32 s15, 0x80008000
+; GFX10-NEXT:    s_cselect_b32 s12, s10, 0
+; GFX10-NEXT:    s_sext_i32_i16 s17, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s11, s11, s12
+; GFX10-NEXT:    s_movk_i32 s12, 0x7fff
+; GFX10-NEXT:    s_lshr_b32 s14, s11, 16
+; GFX10-NEXT:    s_sub_i32 s11, s13, s11
+; GFX10-NEXT:    s_sub_i32 s14, s12, s14
+; GFX10-NEXT:    s_cmp_lt_i32 s8, s9
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s9, s17
-; GFX10-NEXT:    s_sext_i32_i16 s17, s9
-; GFX10-NEXT:    s_ashr_i32 s9, s9, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s17, s18
-; GFX10-NEXT:    s_cselect_b32 s17, s17, s18
-; GFX10-NEXT:    s_cmp_gt_i32 s9, s4
-; GFX10-NEXT:    s_sext_i32_i16 s18, s5
-; GFX10-NEXT:    s_cselect_b32 s4, s9, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s12, s15
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s17, s4
-; GFX10-NEXT:    s_sext_i32_i16 s15, s9
-; GFX10-NEXT:    s_sext_i32_i16 s12, s4
+; GFX10-NEXT:    s_cselect_b32 s8, s8, s9
+; GFX10-NEXT:    s_cmp_lt_i32 s10, 0
+; GFX10-NEXT:    s_cselect_b32 s10, s10, 0
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_ashr_i32 s9, s9, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s12, s15
-; GFX10-NEXT:    s_cselect_b32 s12, s12, s15
-; GFX10-NEXT:    s_cmp_lt_i32 s4, s9
-; GFX10-NEXT:    s_cselect_b32 s4, s4, s9
-; GFX10-NEXT:    s_lshr_b32 s9, s0, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s12, s4
-; GFX10-NEXT:    s_lshr_b32 s12, s4, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s8, s10
+; GFX10-NEXT:    s_mov_b32 s10, 0x8000
+; GFX10-NEXT:    s_lshr_b32 s16, s8, 16
+; GFX10-NEXT:    s_sub_i32 s8, s15, s8
+; GFX10-NEXT:    s_sub_i32 s16, s10, s16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s8, s16
+; GFX10-NEXT:    s_sext_i32_i16 s16, s8
+; GFX10-NEXT:    s_ashr_i32 s8, s8, 16
+; GFX10-NEXT:    s_cmp_gt_i32 s16, s17
+; GFX10-NEXT:    s_cselect_b32 s16, s16, s17
+; GFX10-NEXT:    s_cmp_gt_i32 s8, s4
+; GFX10-NEXT:    s_sext_i32_i16 s17, s5
+; GFX10-NEXT:    s_cselect_b32 s4, s8, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s11, s14
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s16, s4
+; GFX10-NEXT:    s_sext_i32_i16 s14, s8
+; GFX10-NEXT:    s_sext_i32_i16 s11, s4
+; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX10-NEXT:    s_ashr_i32 s8, s8, 16
+; GFX10-NEXT:    s_cmp_lt_i32 s11, s14
+; GFX10-NEXT:    s_cselect_b32 s11, s11, s14
+; GFX10-NEXT:    s_cmp_lt_i32 s4, s8
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s8
+; GFX10-NEXT:    s_lshr_b32 s8, s0, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s11, s4
+; GFX10-NEXT:    s_lshr_b32 s11, s4, 16
 ; GFX10-NEXT:    s_add_i32 s0, s0, s4
 ; GFX10-NEXT:    s_sext_i32_i16 s4, s1
-; GFX10-NEXT:    s_add_i32 s9, s9, s12
-; GFX10-NEXT:    s_ashr_i32 s12, s1, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s4, s11
-; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s9
-; GFX10-NEXT:    s_cselect_b32 s15, s4, s11
-; GFX10-NEXT:    s_cmp_gt_i32 s12, s8
-; GFX10-NEXT:    s_cselect_b32 s17, s12, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s15, s15, s17
-; GFX10-NEXT:    s_lshr_b32 s17, s15, 16
-; GFX10-NEXT:    s_sub_i32 s15, s13, s15
-; GFX10-NEXT:    s_sub_i32 s17, s14, s17
-; GFX10-NEXT:    s_cmp_lt_i32 s4, s11
-; GFX10-NEXT:    s_cselect_b32 s4, s4, s11
-; GFX10-NEXT:    s_cmp_lt_i32 s12, s8
-; GFX10-NEXT:    s_cselect_b32 s12, s12, s8
+; GFX10-NEXT:    s_add_i32 s8, s8, s11
+; GFX10-NEXT:    s_ashr_i32 s11, s1, 16
+; GFX10-NEXT:    s_cmp_gt_i32 s4, s9
+; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s8
+; GFX10-NEXT:    s_cselect_b32 s14, s4, s9
+; GFX10-NEXT:    s_cmp_gt_i32 s11, 0
+; GFX10-NEXT:    s_cselect_b32 s16, s11, 0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s14, s14, s16
+; GFX10-NEXT:    s_lshr_b32 s16, s14, 16
+; GFX10-NEXT:    s_sub_i32 s14, s13, s14
+; GFX10-NEXT:    s_sub_i32 s16, s12, s16
+; GFX10-NEXT:    s_cmp_lt_i32 s4, s9
+; GFX10-NEXT:    s_pack_ll_b32_b16 s14, s14, s16
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s9
+; GFX10-NEXT:    s_cmp_lt_i32 s11, 0
+; GFX10-NEXT:    s_cselect_b32 s11, s11, 0
 ; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s12
-; GFX10-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX10-NEXT:    s_sub_i32 s4, s16, s4
-; GFX10-NEXT:    s_sub_i32 s12, s10, s12
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s12
-; GFX10-NEXT:    s_sext_i32_i16 s12, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s11
+; GFX10-NEXT:    s_lshr_b32 s11, s4, 16
+; GFX10-NEXT:    s_sub_i32 s4, s15, s4
+; GFX10-NEXT:    s_sub_i32 s11, s10, s11
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s11
+; GFX10-NEXT:    s_sext_i32_i16 s11, s4
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s12, s18
-; GFX10-NEXT:    s_cselect_b32 s12, s12, s18
+; GFX10-NEXT:    s_cmp_gt_i32 s11, s17
+; GFX10-NEXT:    s_cselect_b32 s11, s11, s17
 ; GFX10-NEXT:    s_cmp_gt_i32 s4, s5
-; GFX10-NEXT:    s_sext_i32_i16 s18, s6
+; GFX10-NEXT:    s_sext_i32_i16 s17, s6
 ; GFX10-NEXT:    s_cselect_b32 s4, s4, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s15, s17
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s12, s4
-; GFX10-NEXT:    s_sext_i32_i16 s15, s5
-; GFX10-NEXT:    s_sext_i32_i16 s12, s4
+; GFX10-NEXT:    s_sext_i32_i16 s5, s14
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s11, s4
+; GFX10-NEXT:    s_ashr_i32 s11, s14, 16
+; GFX10-NEXT:    s_sext_i32_i16 s14, s4
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s12, s15
-; GFX10-NEXT:    s_cselect_b32 s12, s12, s15
-; GFX10-NEXT:    s_cmp_lt_i32 s4, s5
-; GFX10-NEXT:    s_cselect_b32 s4, s4, s5
-; GFX10-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s12, s4
-; GFX10-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX10-NEXT:    s_add_i32 s1, s1, s4
-; GFX10-NEXT:    s_sext_i32_i16 s4, s2
-; GFX10-NEXT:    s_add_i32 s5, s5, s12
-; GFX10-NEXT:    s_ashr_i32 s12, s2, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s4, s11
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
-; GFX10-NEXT:    s_cselect_b32 s15, s4, s11
-; GFX10-NEXT:    s_cmp_gt_i32 s12, s8
-; GFX10-NEXT:    s_cselect_b32 s17, s12, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s15, s15, s17
-; GFX10-NEXT:    s_lshr_b32 s17, s15, 16
-; GFX10-NEXT:    s_sub_i32 s15, s13, s15
-; GFX10-NEXT:    s_sub_i32 s17, s14, s17
+; GFX10-NEXT:    s_cmp_lt_i32 s14, s5
+; GFX10-NEXT:    s_cselect_b32 s5, s14, s5
 ; GFX10-NEXT:    s_cmp_lt_i32 s4, s11
-; GFX10-NEXT:    s_pack_ll_b32_b16 s15, s15, s17
+; GFX10-NEXT:    s_sext_i32_i16 s14, s2
 ; GFX10-NEXT:    s_cselect_b32 s4, s4, s11
-; GFX10-NEXT:    s_cmp_lt_i32 s12, s8
-; GFX10-NEXT:    s_cselect_b32 s12, s12, s8
+; GFX10-NEXT:    s_lshr_b32 s11, s1, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s5, s4
+; GFX10-NEXT:    s_ashr_i32 s5, s2, 16
+; GFX10-NEXT:    s_lshr_b32 s16, s4, 16
+; GFX10-NEXT:    s_add_i32 s1, s1, s4
+; GFX10-NEXT:    s_add_i32 s11, s11, s16
+; GFX10-NEXT:    s_cmp_gt_i32 s14, s9
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s11
+; GFX10-NEXT:    s_cselect_b32 s4, s14, s9
+; GFX10-NEXT:    s_cmp_gt_i32 s5, 0
+; GFX10-NEXT:    s_cselect_b32 s16, s5, 0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s16
+; GFX10-NEXT:    s_lshr_b32 s16, s4, 16
+; GFX10-NEXT:    s_sub_i32 s4, s13, s4
+; GFX10-NEXT:    s_sub_i32 s16, s12, s16
+; GFX10-NEXT:    s_cmp_lt_i32 s14, s9
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s16
+; GFX10-NEXT:    s_cselect_b32 s14, s14, s9
+; GFX10-NEXT:    s_cmp_lt_i32 s5, 0
+; GFX10-NEXT:    s_cselect_b32 s5, s5, 0
 ; GFX10-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s12
-; GFX10-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX10-NEXT:    s_sub_i32 s4, s16, s4
-; GFX10-NEXT:    s_sub_i32 s12, s10, s12
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s12
-; GFX10-NEXT:    s_sext_i32_i16 s12, s4
-; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s12, s18
-; GFX10-NEXT:    s_cselect_b32 s12, s12, s18
-; GFX10-NEXT:    s_cmp_gt_i32 s4, s6
-; GFX10-NEXT:    s_cselect_b32 s4, s4, s6
-; GFX10-NEXT:    s_sext_i32_i16 s6, s15
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s12, s4
-; GFX10-NEXT:    s_ashr_i32 s12, s15, 16
-; GFX10-NEXT:    s_sext_i32_i16 s15, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s14, s5
+; GFX10-NEXT:    s_lshr_b32 s14, s5, 16
+; GFX10-NEXT:    s_sub_i32 s5, s15, s5
+; GFX10-NEXT:    s_sub_i32 s14, s10, s14
+; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s14
+; GFX10-NEXT:    s_sext_i32_i16 s14, s5
+; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
+; GFX10-NEXT:    s_cmp_gt_i32 s14, s17
+; GFX10-NEXT:    s_cselect_b32 s14, s14, s17
+; GFX10-NEXT:    s_cmp_gt_i32 s5, s6
+; GFX10-NEXT:    s_cselect_b32 s5, s5, s6
+; GFX10-NEXT:    s_sext_i32_i16 s6, s4
+; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s14, s5
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s15, s6
-; GFX10-NEXT:    s_cselect_b32 s6, s15, s6
-; GFX10-NEXT:    s_cmp_lt_i32 s4, s12
-; GFX10-NEXT:    s_sext_i32_i16 s15, s3
-; GFX10-NEXT:    s_cselect_b32 s4, s4, s12
-; GFX10-NEXT:    s_lshr_b32 s12, s2, 16
+; GFX10-NEXT:    s_sext_i32_i16 s14, s5
+; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
+; GFX10-NEXT:    s_cmp_lt_i32 s14, s6
+; GFX10-NEXT:    s_cselect_b32 s6, s14, s6
+; GFX10-NEXT:    s_cmp_lt_i32 s5, s4
+; GFX10-NEXT:    s_sext_i32_i16 s14, s3
+; GFX10-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX10-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s6, s4
 ; GFX10-NEXT:    s_ashr_i32 s6, s3, 16
-; GFX10-NEXT:    s_lshr_b32 s17, s4, 16
+; GFX10-NEXT:    s_lshr_b32 s16, s4, 16
 ; GFX10-NEXT:    s_add_i32 s2, s2, s4
-; GFX10-NEXT:    s_add_i32 s12, s12, s17
-; GFX10-NEXT:    s_cmp_gt_i32 s15, s11
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s12
-; GFX10-NEXT:    s_cselect_b32 s4, s15, s11
-; GFX10-NEXT:    s_cmp_gt_i32 s6, s8
-; GFX10-NEXT:    s_cselect_b32 s17, s6, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s17
-; GFX10-NEXT:    s_lshr_b32 s17, s4, 16
+; GFX10-NEXT:    s_add_i32 s5, s5, s16
+; GFX10-NEXT:    s_cmp_gt_i32 s14, s9
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
+; GFX10-NEXT:    s_cselect_b32 s4, s14, s9
+; GFX10-NEXT:    s_cmp_gt_i32 s6, 0
+; GFX10-NEXT:    s_cselect_b32 s16, s6, 0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s16
+; GFX10-NEXT:    s_lshr_b32 s16, s4, 16
 ; GFX10-NEXT:    s_sub_i32 s4, s13, s4
-; GFX10-NEXT:    s_sub_i32 s13, s14, s17
-; GFX10-NEXT:    s_cmp_lt_i32 s15, s11
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s13
-; GFX10-NEXT:    s_cselect_b32 s11, s15, s11
-; GFX10-NEXT:    s_cmp_lt_i32 s6, s8
-; GFX10-NEXT:    s_cselect_b32 s6, s6, s8
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s11, s6
-; GFX10-NEXT:    s_lshr_b32 s8, s6, 16
-; GFX10-NEXT:    s_sub_i32 s6, s16, s6
-; GFX10-NEXT:    s_sub_i32 s8, s10, s8
+; GFX10-NEXT:    s_sub_i32 s12, s12, s16
+; GFX10-NEXT:    s_cmp_lt_i32 s14, s9
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s12
+; GFX10-NEXT:    s_cselect_b32 s9, s14, s9
+; GFX10-NEXT:    s_cmp_lt_i32 s6, 0
+; GFX10-NEXT:    s_cselect_b32 s6, s6, 0
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s9, s6
+; GFX10-NEXT:    s_lshr_b32 s9, s6, 16
+; GFX10-NEXT:    s_sub_i32 s6, s15, s6
+; GFX10-NEXT:    s_sub_i32 s9, s10, s9
 ; GFX10-NEXT:    s_sext_i32_i16 s10, s7
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s8
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s9
 ; GFX10-NEXT:    s_ashr_i32 s7, s7, 16
-; GFX10-NEXT:    s_sext_i32_i16 s8, s6
+; GFX10-NEXT:    s_sext_i32_i16 s9, s6
 ; GFX10-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s8, s10
-; GFX10-NEXT:    s_cselect_b32 s8, s8, s10
+; GFX10-NEXT:    s_cmp_gt_i32 s9, s10
+; GFX10-NEXT:    s_cselect_b32 s9, s9, s10
 ; GFX10-NEXT:    s_cmp_gt_i32 s6, s7
 ; GFX10-NEXT:    s_cselect_b32 s6, s6, s7
 ; GFX10-NEXT:    s_sext_i32_i16 s7, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s8, s6
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s9, s6
 ; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_sext_i32_i16 s8, s6
+; GFX10-NEXT:    s_sext_i32_i16 s9, s6
 ; GFX10-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s8, s7
-; GFX10-NEXT:    s_cselect_b32 s7, s8, s7
+; GFX10-NEXT:    s_cmp_lt_i32 s9, s7
+; GFX10-NEXT:    s_cselect_b32 s7, s9, s7
 ; GFX10-NEXT:    s_cmp_lt_i32 s6, s4
 ; GFX10-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
+; GFX10-NEXT:    s_lshr_b32 s6, s3, 16
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s7, s4
-; GFX10-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX10-NEXT:    s_lshr_b32 s7, s4, 16
 ; GFX10-NEXT:    s_add_i32 s3, s3, s4
-; GFX10-NEXT:    s_add_i32 s5, s5, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
+; GFX10-NEXT:    s_add_i32 s6, s6, s7
+; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
   %cast = bitcast <8 x i16> %result to <4 x i32>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index ed1fe7af5f365..015f6b5de8b04 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -529,8 +529,7 @@ define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) {
 ; GFX9-LABEL: v_shl_v2i16_15:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, 15, 15
-; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s4, v0
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = shl <2 x i16> %value, <i16 15, i16 15>
   ret <2 x i16> %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 3e1778bcb881e..ac2a75383cba3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -4512,15 +4512,12 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX9-LABEL: v_ssubsat_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, -1, -1
-; GFX9-NEXT:    s_movk_i32 s5, 0x8000
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
-; GFX9-NEXT:    v_pk_max_i16 v2, v0, s6
-; GFX9-NEXT:    v_pk_sub_i16 v2, v2, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX9-NEXT:    v_pk_min_i16 v3, v0, s6
-; GFX9-NEXT:    v_pk_sub_i16 v3, v3, s5
+; GFX9-NEXT:    v_pk_max_i16 v2, v0, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fff7fff
+; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v3
+; GFX9-NEXT:    v_pk_min_i16 v3, v0, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x80008000
+; GFX9-NEXT:    v_pk_sub_i16 v3, v3, v4
 ; GFX9-NEXT:    v_pk_max_i16 v1, v2, v1
 ; GFX9-NEXT:    v_pk_min_i16 v1, v1, v3
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
@@ -4530,16 +4527,11 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, -1, -1
-; GFX10-NEXT:    s_movk_i32 s5, 0x7fff
-; GFX10-NEXT:    v_pk_max_i16 v2, v0, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX10-NEXT:    v_pk_min_i16 v3, v0, s4
-; GFX10-NEXT:    s_movk_i32 s6, 0x8000
+; GFX10-NEXT:    v_pk_max_i16 v2, v0, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_min_i16 v3, v0, -1 op_sel_hi:[1,0]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_pk_sub_i16 v2, v2, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s6, s6
-; GFX10-NEXT:    v_pk_sub_i16 v3, v3, s4
+; GFX10-NEXT:    v_pk_sub_i16 v2, v2, 0x7fff7fff
+; GFX10-NEXT:    v_pk_sub_i16 v3, v3, 0x80008000
 ; GFX10-NEXT:    v_pk_max_i16 v1, v2, v1
 ; GFX10-NEXT:    v_pk_min_i16 v1, v1, v3
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1
@@ -4635,53 +4627,45 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ;
 ; GFX9-LABEL: s_ssubsat_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, -1, -1
-; GFX9-NEXT:    s_sext_i32_i16 s7, s4
-; GFX9-NEXT:    s_sext_i32_i16 s5, s0
-; GFX9-NEXT:    s_ashr_i32 s6, s0, 16
-; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s5, s7
-; GFX9-NEXT:    s_cselect_b32 s8, s5, s7
-; GFX9-NEXT:    s_cmp_gt_i32 s6, s4
-; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX9-NEXT:    s_cselect_b32 s9, s6, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s2
-; GFX9-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX9-NEXT:    s_lshr_b32 s9, s8, 16
-; GFX9-NEXT:    s_sub_i32 s2, s8, s2
-; GFX9-NEXT:    s_sub_i32 s8, s9, s10
-; GFX9-NEXT:    s_cmp_lt_i32 s5, s7
-; GFX9-NEXT:    s_cselect_b32 s5, s5, s7
-; GFX9-NEXT:    s_cmp_lt_i32 s6, s4
-; GFX9-NEXT:    s_movk_i32 s3, 0x8000
-; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s5, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s3
-; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
-; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
-; GFX9-NEXT:    s_sub_i32 s3, s4, s3
-; GFX9-NEXT:    s_sub_i32 s4, s5, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
-; GFX9-NEXT:    s_sext_i32_i16 s4, s2
+; GFX9-NEXT:    s_sext_i32_i16 s2, s0
+; GFX9-NEXT:    s_ashr_i32 s3, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s4, -1
+; GFX9-NEXT:    s_cmp_gt_i32 s2, s4
+; GFX9-NEXT:    s_cselect_b32 s5, s2, s4
+; GFX9-NEXT:    s_cmp_gt_i32 s3, -1
+; GFX9-NEXT:    s_cselect_b32 s6, s3, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX9-NEXT:    s_lshr_b32 s6, s5, 16
+; GFX9-NEXT:    s_sub_i32 s5, s5, 0x7fff7fff
+; GFX9-NEXT:    s_sub_i32 s6, s6, 0x7fff
+; GFX9-NEXT:    s_cmp_lt_i32 s2, s4
+; GFX9-NEXT:    s_cselect_b32 s2, s2, s4
+; GFX9-NEXT:    s_cmp_lt_i32 s3, -1
+; GFX9-NEXT:    s_cselect_b32 s3, s3, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
+; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX9-NEXT:    s_sub_i32 s2, s2, 0x80008000
+; GFX9-NEXT:    s_sub_i32 s3, s3, 0x8000
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
+; GFX9-NEXT:    s_sext_i32_i16 s3, s5
+; GFX9-NEXT:    s_ashr_i32 s4, s5, 16
 ; GFX9-NEXT:    s_sext_i32_i16 s5, s1
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX9-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s4, s5
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s5
-; GFX9-NEXT:    s_cmp_gt_i32 s2, s1
-; GFX9-NEXT:    s_cselect_b32 s1, s2, s1
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s4, s1
-; GFX9-NEXT:    s_sext_i32_i16 s2, s1
-; GFX9-NEXT:    s_sext_i32_i16 s4, s3
+; GFX9-NEXT:    s_cmp_gt_i32 s3, s5
+; GFX9-NEXT:    s_cselect_b32 s3, s3, s5
+; GFX9-NEXT:    s_cmp_gt_i32 s4, s1
+; GFX9-NEXT:    s_cselect_b32 s1, s4, s1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s3, s1
+; GFX9-NEXT:    s_sext_i32_i16 s3, s1
+; GFX9-NEXT:    s_sext_i32_i16 s4, s2
 ; GFX9-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NEXT:    s_cmp_lt_i32 s2, s4
-; GFX9-NEXT:    s_cselect_b32 s2, s2, s4
-; GFX9-NEXT:    s_cmp_lt_i32 s1, s3
-; GFX9-NEXT:    s_cselect_b32 s1, s1, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s2, s1
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX9-NEXT:    s_cmp_lt_i32 s3, s4
+; GFX9-NEXT:    s_cselect_b32 s3, s3, s4
+; GFX9-NEXT:    s_cmp_lt_i32 s1, s2
+; GFX9-NEXT:    s_cselect_b32 s1, s1, s2
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s3, s1
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX9-NEXT:    s_sub_i32 s0, s0, s1
@@ -4691,47 +4675,39 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ;
 ; GFX10-LABEL: s_ssubsat_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, -1, -1
-; GFX10-NEXT:    s_sext_i32_i16 s3, s0
-; GFX10-NEXT:    s_sext_i32_i16 s5, s2
+; GFX10-NEXT:    s_sext_i32_i16 s2, s0
+; GFX10-NEXT:    s_sext_i32_i16 s3, -1
 ; GFX10-NEXT:    s_ashr_i32 s4, s0, 16
-; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s3, s5
-; GFX10-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX10-NEXT:    s_cselect_b32 s6, s3, s5
-; GFX10-NEXT:    s_cmp_gt_i32 s4, s2
+; GFX10-NEXT:    s_cmp_gt_i32 s2, s3
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_cselect_b32 s7, s4, s2
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
-; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s8, s8
-; GFX10-NEXT:    s_lshr_b32 s8, s6, 16
-; GFX10-NEXT:    s_lshr_b32 s9, s7, 16
-; GFX10-NEXT:    s_sub_i32 s6, s6, s7
-; GFX10-NEXT:    s_sub_i32 s7, s8, s9
-; GFX10-NEXT:    s_cmp_lt_i32 s3, s5
-; GFX10-NEXT:    s_cselect_b32 s3, s3, s5
-; GFX10-NEXT:    s_cmp_lt_i32 s4, s2
-; GFX10-NEXT:    s_movk_i32 s5, 0x8000
-; GFX10-NEXT:    s_cselect_b32 s2, s4, s2
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s5, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s6, s7
-; GFX10-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX10-NEXT:    s_lshr_b32 s6, s4, 16
-; GFX10-NEXT:    s_sub_i32 s2, s2, s4
-; GFX10-NEXT:    s_sub_i32 s4, s5, s6
-; GFX10-NEXT:    s_sext_i32_i16 s5, s3
+; GFX10-NEXT:    s_cselect_b32 s5, s2, s3
+; GFX10-NEXT:    s_cmp_gt_i32 s4, -1
+; GFX10-NEXT:    s_cselect_b32 s6, s4, -1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
 ; GFX10-NEXT:    s_sext_i32_i16 s6, s1
-; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
+; GFX10-NEXT:    s_lshr_b32 s7, s5, 16
+; GFX10-NEXT:    s_sub_i32 s5, s5, 0x7fff7fff
+; GFX10-NEXT:    s_sub_i32 s7, s7, 0x7fff
+; GFX10-NEXT:    s_cmp_lt_i32 s2, s3
+; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s7
+; GFX10-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX10-NEXT:    s_cmp_lt_i32 s4, -1
+; GFX10-NEXT:    s_sext_i32_i16 s3, s5
+; GFX10-NEXT:    s_cselect_b32 s4, s4, -1
+; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
 ; GFX10-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s5, s6
+; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX10-NEXT:    s_sub_i32 s2, s2, 0x80008000
+; GFX10-NEXT:    s_sub_i32 s4, s4, 0x8000
+; GFX10-NEXT:    s_cmp_gt_i32 s3, s6
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
-; GFX10-NEXT:    s_cselect_b32 s5, s5, s6
-; GFX10-NEXT:    s_cmp_gt_i32 s3, s1
+; GFX10-NEXT:    s_cselect_b32 s3, s3, s6
+; GFX10-NEXT:    s_cmp_gt_i32 s5, s1
 ; GFX10-NEXT:    s_sext_i32_i16 s4, s2
-; GFX10-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX10-NEXT:    s_cselect_b32 s1, s5, s1
 ; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s5, s1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s3, s1
 ; GFX10-NEXT:    s_sext_i32_i16 s3, s1
 ; GFX10-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX10-NEXT:    s_cmp_lt_i32 s3, s4
@@ -4819,72 +4795,56 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ;
 ; GFX9-LABEL: ssubsat_v2i16_sv:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, -1, -1
-; GFX9-NEXT:    s_sext_i32_i16 s6, s3
-; GFX9-NEXT:    s_sext_i32_i16 s4, s0
-; GFX9-NEXT:    s_ashr_i32 s5, s0, 16
-; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s4, s6
-; GFX9-NEXT:    s_cselect_b32 s7, s4, s6
-; GFX9-NEXT:    s_cmp_gt_i32 s5, s3
-; GFX9-NEXT:    s_movk_i32 s1, 0x7fff
-; GFX9-NEXT:    s_cselect_b32 s8, s5, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s1
-; GFX9-NEXT:    s_lshr_b32 s9, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s8, s7, 16
-; GFX9-NEXT:    s_sub_i32 s1, s7, s1
-; GFX9-NEXT:    s_sub_i32 s7, s8, s9
-; GFX9-NEXT:    s_cmp_lt_i32 s4, s6
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s6
-; GFX9-NEXT:    s_cmp_lt_i32 s5, s3
-; GFX9-NEXT:    s_movk_i32 s2, 0x8000
-; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s2
-; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
-; GFX9-NEXT:    s_sub_i32 s2, s3, s2
-; GFX9-NEXT:    s_sub_i32 s3, s4, s5
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
-; GFX9-NEXT:    v_pk_max_i16 v0, s1, v0
-; GFX9-NEXT:    v_pk_min_i16 v0, v0, s2
+; GFX9-NEXT:    s_sext_i32_i16 s1, s0
+; GFX9-NEXT:    s_ashr_i32 s2, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s3, -1
+; GFX9-NEXT:    s_cmp_gt_i32 s1, s3
+; GFX9-NEXT:    s_cselect_b32 s4, s1, s3
+; GFX9-NEXT:    s_cmp_gt_i32 s2, -1
+; GFX9-NEXT:    s_cselect_b32 s5, s2, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX9-NEXT:    s_lshr_b32 s5, s4, 16
+; GFX9-NEXT:    s_sub_i32 s4, s4, 0x7fff7fff
+; GFX9-NEXT:    s_sub_i32 s5, s5, 0x7fff
+; GFX9-NEXT:    s_cmp_lt_i32 s1, s3
+; GFX9-NEXT:    s_cselect_b32 s1, s1, s3
+; GFX9-NEXT:    s_cmp_lt_i32 s2, -1
+; GFX9-NEXT:    s_cselect_b32 s2, s2, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
+; GFX9-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX9-NEXT:    s_sub_i32 s1, s1, 0x80008000
+; GFX9-NEXT:    s_sub_i32 s2, s2, 0x8000
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
+; GFX9-NEXT:    v_pk_max_i16 v0, s4, v0
+; GFX9-NEXT:    v_pk_min_i16 v0, v0, s1
 ; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: ssubsat_v2i16_sv:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, -1, -1
-; GFX10-NEXT:    s_sext_i32_i16 s2, s0
-; GFX10-NEXT:    s_sext_i32_i16 s4, s1
+; GFX10-NEXT:    s_sext_i32_i16 s1, s0
+; GFX10-NEXT:    s_sext_i32_i16 s2, -1
 ; GFX10-NEXT:    s_ashr_i32 s3, s0, 16
-; GFX10-NEXT:    s_ashr_i32 s1, s1, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s2, s4
-; GFX10-NEXT:    s_movk_i32 s7, 0x7fff
-; GFX10-NEXT:    s_cselect_b32 s5, s2, s4
-; GFX10-NEXT:    s_cmp_gt_i32 s3, s1
+; GFX10-NEXT:    s_cmp_gt_i32 s1, s2
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_cselect_b32 s6, s3, s1
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s7, s7
-; GFX10-NEXT:    s_lshr_b32 s7, s5, 16
-; GFX10-NEXT:    s_lshr_b32 s8, s6, 16
-; GFX10-NEXT:    s_sub_i32 s5, s5, s6
-; GFX10-NEXT:    s_sub_i32 s6, s7, s8
-; GFX10-NEXT:    s_cmp_lt_i32 s2, s4
-; GFX10-NEXT:    s_cselect_b32 s2, s2, s4
-; GFX10-NEXT:    s_cmp_lt_i32 s3, s1
-; GFX10-NEXT:    s_movk_i32 s4, 0x8000
-; GFX10-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s4, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s2, s1
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s5, s6
-; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
+; GFX10-NEXT:    s_cselect_b32 s4, s1, s2
+; GFX10-NEXT:    s_cmp_gt_i32 s3, -1
+; GFX10-NEXT:    s_cselect_b32 s5, s3, -1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX10-NEXT:    s_lshr_b32 s5, s4, 16
+; GFX10-NEXT:    s_sub_i32 s4, s4, 0x7fff7fff
+; GFX10-NEXT:    s_sub_i32 s5, s5, 0x7fff
+; GFX10-NEXT:    s_cmp_lt_i32 s1, s2
+; GFX10-NEXT:    s_cselect_b32 s1, s1, s2
+; GFX10-NEXT:    s_cmp_lt_i32 s3, -1
+; GFX10-NEXT:    s_cselect_b32 s2, s3, -1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s4, s5
+; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX10-NEXT:    v_pk_max_i16 v0, s2, v0
-; GFX10-NEXT:    s_sub_i32 s1, s1, s3
-; GFX10-NEXT:    s_sub_i32 s2, s4, s5
+; GFX10-NEXT:    s_sub_i32 s1, s1, 0x80008000
+; GFX10-NEXT:    s_sub_i32 s2, s3, 0x8000
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
 ; GFX10-NEXT:    v_pk_min_i16 v0, v0, s1
 ; GFX10-NEXT:    v_pk_sub_i16 v0, s0, v0
@@ -4951,15 +4911,12 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ;
 ; GFX9-LABEL: ssubsat_v2i16_vs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_movk_i32 s1, 0x7fff
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, -1, -1
-; GFX9-NEXT:    s_movk_i32 s2, 0x8000
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s1
-; GFX9-NEXT:    v_pk_max_i16 v1, v0, s3
-; GFX9-NEXT:    v_pk_sub_i16 v1, v1, s1
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s2
-; GFX9-NEXT:    v_pk_min_i16 v2, v0, s3
-; GFX9-NEXT:    v_pk_sub_i16 v2, v2, s2
+; GFX9-NEXT:    v_pk_max_i16 v1, v0, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fff7fff
+; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v2
+; GFX9-NEXT:    v_pk_min_i16 v2, v0, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x80008000
+; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v3
 ; GFX9-NEXT:    v_pk_max_i16 v1, v1, s0
 ; GFX9-NEXT:    v_pk_min_i16 v1, v1, v2
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
@@ -4967,16 +4924,11 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ;
 ; GFX10-LABEL: ssubsat_v2i16_vs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, -1, -1
-; GFX10-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX10-NEXT:    v_pk_max_i16 v1, v0, s1
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s2
-; GFX10-NEXT:    v_pk_min_i16 v2, v0, s1
-; GFX10-NEXT:    s_movk_i32 s3, 0x8000
+; GFX10-NEXT:    v_pk_max_i16 v1, v0, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_min_i16 v2, v0, -1 op_sel_hi:[1,0]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_pk_sub_i16 v1, v1, s2
-; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s3, s3
-; GFX10-NEXT:    v_pk_sub_i16 v2, v2, s1
+; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 0x7fff7fff
+; GFX10-NEXT:    v_pk_sub_i16 v2, v2, 0x80008000
 ; GFX10-NEXT:    v_pk_max_i16 v1, v1, s0
 ; GFX10-NEXT:    v_pk_min_i16 v1, v1, v2
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1
@@ -5098,22 +5050,19 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX9-LABEL: v_ssubsat_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, -1, -1
-; GFX9-NEXT:    s_movk_i32 s5, 0x8000
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
-; GFX9-NEXT:    v_pk_max_i16 v4, v0, s6
-; GFX9-NEXT:    v_pk_sub_i16 v4, v4, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX9-NEXT:    v_pk_min_i16 v5, v0, s6
+; GFX9-NEXT:    v_pk_max_i16 v4, v0, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fff7fff
+; GFX9-NEXT:    v_pk_sub_i16 v4, v4, v5
+; GFX9-NEXT:    v_pk_min_i16 v6, v0, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x80008000
 ; GFX9-NEXT:    v_pk_max_i16 v2, v4, v2
-; GFX9-NEXT:    v_pk_sub_i16 v5, v5, s5
-; GFX9-NEXT:    v_pk_min_i16 v2, v2, v5
+; GFX9-NEXT:    v_pk_sub_i16 v6, v6, v7
+; GFX9-NEXT:    v_pk_min_i16 v2, v2, v6
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v2
-; GFX9-NEXT:    v_pk_max_i16 v2, v1, s6
-; GFX9-NEXT:    v_pk_sub_i16 v2, v2, s4
-; GFX9-NEXT:    v_pk_min_i16 v4, v1, s6
-; GFX9-NEXT:    v_pk_sub_i16 v4, v4, s5
+; GFX9-NEXT:    v_pk_max_i16 v2, v1, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v5
+; GFX9-NEXT:    v_pk_min_i16 v4, v1, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_sub_i16 v4, v4, v7
 ; GFX9-NEXT:    v_pk_max_i16 v2, v2, v3
 ; GFX9-NEXT:    v_pk_min_i16 v2, v2, v4
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v2
@@ -5123,24 +5072,19 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, -1, -1
-; GFX10-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX10-NEXT:    v_pk_max_i16 v4, v0, s5
-; GFX10-NEXT:    v_pk_max_i16 v5, v1, s5
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
-; GFX10-NEXT:    v_pk_min_i16 v6, v0, s5
-; GFX10-NEXT:    v_pk_min_i16 v7, v1, s5
-; GFX10-NEXT:    v_pk_sub_i16 v4, v4, s4
-; GFX10-NEXT:    v_pk_sub_i16 v5, v5, s4
-; GFX10-NEXT:    s_movk_i32 s6, 0x8000
+; GFX10-NEXT:    v_pk_max_i16 v4, v0, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_max_i16 v5, v1, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_min_i16 v6, v0, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_min_i16 v7, v1, -1 op_sel_hi:[1,0]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s6
+; GFX10-NEXT:    v_pk_sub_i16 v4, v4, 0x7fff7fff
+; GFX10-NEXT:    v_pk_sub_i16 v5, v5, 0x7fff7fff
+; GFX10-NEXT:    v_pk_sub_i16 v6, v6, 0x80008000
+; GFX10-NEXT:    v_pk_sub_i16 v7, v7, 0x80008000
 ; GFX10-NEXT:    v_pk_max_i16 v11, v4, v2
-; GFX10-NEXT:    v_pk_sub_i16 v6, v6, s6
-; GFX10-NEXT:    v_pk_sub_i16 v4, v7, s6
-; GFX10-NEXT:    v_pk_max_i16 v3, v5, v3
+; GFX10-NEXT:    v_pk_max_i16 v10, v5, v3
 ; GFX10-NEXT:    v_pk_min_i16 v2, v11, v6
-; GFX10-NEXT:    v_pk_min_i16 v3, v3, v4
+; GFX10-NEXT:    v_pk_min_i16 v3, v10, v7
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v2
 ; GFX10-NEXT:    v_pk_sub_i16 v1, v1, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -5306,77 +5250,73 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ;
 ; GFX9-LABEL: s_ssubsat_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, -1, -1
+; GFX9-NEXT:    s_sext_i32_i16 s6, s0
+; GFX9-NEXT:    s_ashr_i32 s7, s0, 16
+; GFX9-NEXT:    s_sext_i32_i16 s8, -1
+; GFX9-NEXT:    s_cmp_gt_i32 s6, s8
+; GFX9-NEXT:    s_cselect_b32 s9, s6, s8
+; GFX9-NEXT:    s_cmp_gt_i32 s7, -1
+; GFX9-NEXT:    s_cselect_b32 s10, s7, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s9, s9, s10
+; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
+; GFX9-NEXT:    s_lshr_b32 s10, s9, 16
+; GFX9-NEXT:    s_movk_i32 s11, 0x7fff
+; GFX9-NEXT:    s_sub_i32 s9, s9, s4
+; GFX9-NEXT:    s_sub_i32 s10, s10, s11
+; GFX9-NEXT:    s_cmp_lt_i32 s6, s8
+; GFX9-NEXT:    s_cselect_b32 s6, s6, s8
+; GFX9-NEXT:    s_cmp_lt_i32 s7, -1
+; GFX9-NEXT:    s_cselect_b32 s7, s7, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
+; GFX9-NEXT:    s_pack_ll_b32_b16 s9, s9, s10
+; GFX9-NEXT:    s_mov_b32 s5, 0x80008000
+; GFX9-NEXT:    s_lshr_b32 s7, s6, 16
+; GFX9-NEXT:    s_mov_b32 s10, 0x8000
+; GFX9-NEXT:    s_sub_i32 s6, s6, s5
+; GFX9-NEXT:    s_sub_i32 s7, s7, s10
+; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
+; GFX9-NEXT:    s_sext_i32_i16 s7, s9
+; GFX9-NEXT:    s_sext_i32_i16 s12, s2
+; GFX9-NEXT:    s_ashr_i32 s9, s9, 16
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX9-NEXT:    s_cmp_gt_i32 s7, s12
+; GFX9-NEXT:    s_cselect_b32 s7, s7, s12
+; GFX9-NEXT:    s_cmp_gt_i32 s9, s2
+; GFX9-NEXT:    s_cselect_b32 s2, s9, s2
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s7, s2
+; GFX9-NEXT:    s_sext_i32_i16 s7, s2
 ; GFX9-NEXT:    s_sext_i32_i16 s9, s6
-; GFX9-NEXT:    s_sext_i32_i16 s7, s0
-; GFX9-NEXT:    s_ashr_i32 s8, s0, 16
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX9-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s7, s9
-; GFX9-NEXT:    s_cselect_b32 s10, s7, s9
-; GFX9-NEXT:    s_cmp_gt_i32 s8, s6
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    s_cselect_b32 s11, s8, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s11
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
-; GFX9-NEXT:    s_lshr_b32 s11, s10, 16
-; GFX9-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX9-NEXT:    s_sub_i32 s10, s10, s4
-; GFX9-NEXT:    s_sub_i32 s11, s11, s12
 ; GFX9-NEXT:    s_cmp_lt_i32 s7, s9
 ; GFX9-NEXT:    s_cselect_b32 s7, s7, s9
-; GFX9-NEXT:    s_cmp_lt_i32 s8, s6
-; GFX9-NEXT:    s_movk_i32 s5, 0x8000
-; GFX9-NEXT:    s_cselect_b32 s8, s8, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX9-NEXT:    s_pack_ll_b32_b16 s10, s10, s11
-; GFX9-NEXT:    s_lshr_b32 s8, s7, 16
-; GFX9-NEXT:    s_lshr_b32 s11, s5, 16
-; GFX9-NEXT:    s_sub_i32 s7, s7, s5
-; GFX9-NEXT:    s_sub_i32 s8, s8, s11
-; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX9-NEXT:    s_sext_i32_i16 s8, s10
-; GFX9-NEXT:    s_sext_i32_i16 s13, s2
-; GFX9-NEXT:    s_ashr_i32 s10, s10, 16
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s8, s13
-; GFX9-NEXT:    s_cselect_b32 s8, s8, s13
-; GFX9-NEXT:    s_cmp_gt_i32 s10, s2
-; GFX9-NEXT:    s_cselect_b32 s2, s10, s2
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s8, s2
-; GFX9-NEXT:    s_sext_i32_i16 s8, s2
-; GFX9-NEXT:    s_sext_i32_i16 s10, s7
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-NEXT:    s_ashr_i32 s7, s7, 16
-; GFX9-NEXT:    s_cmp_lt_i32 s8, s10
-; GFX9-NEXT:    s_cselect_b32 s8, s8, s10
-; GFX9-NEXT:    s_cmp_lt_i32 s2, s7
-; GFX9-NEXT:    s_cselect_b32 s2, s2, s7
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s8, s2
-; GFX9-NEXT:    s_lshr_b32 s7, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s8, s2, 16
+; GFX9-NEXT:    s_cmp_lt_i32 s2, s6
+; GFX9-NEXT:    s_cselect_b32 s2, s2, s6
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s7, s2
+; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s7, s2, 16
 ; GFX9-NEXT:    s_sub_i32 s0, s0, s2
-; GFX9-NEXT:    s_sub_i32 s2, s7, s8
+; GFX9-NEXT:    s_sub_i32 s2, s6, s7
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX9-NEXT:    s_sext_i32_i16 s2, s1
-; GFX9-NEXT:    s_ashr_i32 s7, s1, 16
-; GFX9-NEXT:    s_cmp_gt_i32 s2, s9
-; GFX9-NEXT:    s_cselect_b32 s8, s2, s9
-; GFX9-NEXT:    s_cmp_gt_i32 s7, s6
-; GFX9-NEXT:    s_cselect_b32 s10, s7, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s8, s10
-; GFX9-NEXT:    s_lshr_b32 s10, s8, 16
-; GFX9-NEXT:    s_sub_i32 s4, s8, s4
-; GFX9-NEXT:    s_sub_i32 s8, s10, s12
-; GFX9-NEXT:    s_cmp_lt_i32 s2, s9
-; GFX9-NEXT:    s_cselect_b32 s2, s2, s9
-; GFX9-NEXT:    s_cmp_lt_i32 s7, s6
-; GFX9-NEXT:    s_cselect_b32 s6, s7, s6
+; GFX9-NEXT:    s_ashr_i32 s6, s1, 16
+; GFX9-NEXT:    s_cmp_gt_i32 s2, s8
+; GFX9-NEXT:    s_cselect_b32 s7, s2, s8
+; GFX9-NEXT:    s_cmp_gt_i32 s6, -1
+; GFX9-NEXT:    s_cselect_b32 s9, s6, -1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s9
+; GFX9-NEXT:    s_lshr_b32 s9, s7, 16
+; GFX9-NEXT:    s_sub_i32 s4, s7, s4
+; GFX9-NEXT:    s_sub_i32 s7, s9, s11
+; GFX9-NEXT:    s_cmp_lt_i32 s2, s8
+; GFX9-NEXT:    s_cselect_b32 s2, s2, s8
+; GFX9-NEXT:    s_cmp_lt_i32 s6, -1
+; GFX9-NEXT:    s_cselect_b32 s6, s6, -1
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s6
 ; GFX9-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX9-NEXT:    s_sub_i32 s2, s2, s5
-; GFX9-NEXT:    s_sub_i32 s5, s6, s11
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s8
+; GFX9-NEXT:    s_sub_i32 s5, s6, s10
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s7
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
 ; GFX9-NEXT:    s_sext_i32_i16 s5, s4
 ; GFX9-NEXT:    s_sext_i32_i16 s6, s3
@@ -5405,80 +5345,76 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ;
 ; GFX10-LABEL: s_ssubsat_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, -1, -1
-; GFX10-NEXT:    s_sext_i32_i16 s5, s0
-; GFX10-NEXT:    s_sext_i32_i16 s7, s4
+; GFX10-NEXT:    s_sext_i32_i16 s4, s0
+; GFX10-NEXT:    s_sext_i32_i16 s5, -1
 ; GFX10-NEXT:    s_ashr_i32 s6, s0, 16
-; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s5, s7
+; GFX10-NEXT:    s_cmp_gt_i32 s4, s5
 ; GFX10-NEXT:    s_movk_i32 s10, 0x7fff
-; GFX10-NEXT:    s_cselect_b32 s8, s5, s7
-; GFX10-NEXT:    s_cmp_gt_i32 s6, s4
-; GFX10-NEXT:    s_movk_i32 s12, 0x8000
-; GFX10-NEXT:    s_cselect_b32 s9, s6, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s12, s12, s12
-; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
-; GFX10-NEXT:    s_pack_ll_b32_b16 s9, s10, s10
-; GFX10-NEXT:    s_lshr_b32 s10, s8, 16
-; GFX10-NEXT:    s_lshr_b32 s11, s9, 16
-; GFX10-NEXT:    s_sub_i32 s8, s8, s9
-; GFX10-NEXT:    s_sub_i32 s10, s10, s11
-; GFX10-NEXT:    s_cmp_lt_i32 s5, s7
-; GFX10-NEXT:    s_sext_i32_i16 s14, s2
-; GFX10-NEXT:    s_cselect_b32 s5, s5, s7
-; GFX10-NEXT:    s_cmp_lt_i32 s6, s4
+; GFX10-NEXT:    s_cselect_b32 s7, s4, s5
+; GFX10-NEXT:    s_cmp_gt_i32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s11, 0x80008000
+; GFX10-NEXT:    s_cselect_b32 s8, s6, -1
+; GFX10-NEXT:    s_sext_i32_i16 s13, s2
+; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
+; GFX10-NEXT:    s_mov_b32 s8, 0x7fff7fff
+; GFX10-NEXT:    s_lshr_b32 s9, s7, 16
+; GFX10-NEXT:    s_sub_i32 s7, s7, s8
+; GFX10-NEXT:    s_sub_i32 s9, s9, s10
+; GFX10-NEXT:    s_cmp_lt_i32 s4, s5
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_cselect_b32 s6, s6, s4
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX10-NEXT:    s_cmp_lt_i32 s6, -1
+; GFX10-NEXT:    s_cselect_b32 s6, s6, -1
 ; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s8, s10
-; GFX10-NEXT:    s_lshr_b32 s8, s5, 16
-; GFX10-NEXT:    s_lshr_b32 s10, s12, 16
-; GFX10-NEXT:    s_sext_i32_i16 s13, s6
-; GFX10-NEXT:    s_sub_i32 s5, s5, s12
-; GFX10-NEXT:    s_sub_i32 s8, s8, s10
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s6
+; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s7, s9
+; GFX10-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX10-NEXT:    s_mov_b32 s9, 0x8000
+; GFX10-NEXT:    s_sext_i32_i16 s12, s6
+; GFX10-NEXT:    s_sub_i32 s4, s4, s11
+; GFX10-NEXT:    s_sub_i32 s7, s7, s9
 ; GFX10-NEXT:    s_ashr_i32 s6, s6, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s13, s14
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s5, s8
-; GFX10-NEXT:    s_cselect_b32 s13, s13, s14
+; GFX10-NEXT:    s_cmp_gt_i32 s12, s13
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s7
+; GFX10-NEXT:    s_cselect_b32 s12, s12, s13
 ; GFX10-NEXT:    s_cmp_gt_i32 s6, s2
-; GFX10-NEXT:    s_sext_i32_i16 s8, s5
+; GFX10-NEXT:    s_sext_i32_i16 s7, s4
 ; GFX10-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s13, s2
+; GFX10-NEXT:    s_ashr_i32 s4, s4, 16
+; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s12, s2
 ; GFX10-NEXT:    s_sext_i32_i16 s6, s2
 ; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX10-NEXT:    s_cmp_lt_i32 s6, s8
-; GFX10-NEXT:    s_cselect_b32 s6, s6, s8
-; GFX10-NEXT:    s_cmp_lt_i32 s2, s5
-; GFX10-NEXT:    s_cselect_b32 s2, s2, s5
-; GFX10-NEXT:    s_lshr_b32 s5, s0, 16
+; GFX10-NEXT:    s_cmp_lt_i32 s6, s7
+; GFX10-NEXT:    s_cselect_b32 s6, s6, s7
+; GFX10-NEXT:    s_cmp_lt_i32 s2, s4
+; GFX10-NEXT:    s_cselect_b32 s2, s2, s4
+; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s6, s2
 ; GFX10-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX10-NEXT:    s_sub_i32 s0, s0, s2
-; GFX10-NEXT:    s_sub_i32 s2, s5, s6
-; GFX10-NEXT:    s_sext_i32_i16 s5, s1
+; GFX10-NEXT:    s_sub_i32 s2, s4, s6
+; GFX10-NEXT:    s_sext_i32_i16 s4, s1
 ; GFX10-NEXT:    s_ashr_i32 s6, s1, 16
-; GFX10-NEXT:    s_cmp_gt_i32 s5, s7
+; GFX10-NEXT:    s_cmp_gt_i32 s4, s5
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
-; GFX10-NEXT:    s_cselect_b32 s8, s5, s7
-; GFX10-NEXT:    s_cmp_gt_i32 s6, s4
-; GFX10-NEXT:    s_cselect_b32 s13, s6, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s8, s13
-; GFX10-NEXT:    s_lshr_b32 s13, s8, 16
-; GFX10-NEXT:    s_sub_i32 s8, s8, s9
-; GFX10-NEXT:    s_sub_i32 s9, s13, s11
-; GFX10-NEXT:    s_cmp_lt_i32 s5, s7
-; GFX10-NEXT:    s_cselect_b32 s5, s5, s7
-; GFX10-NEXT:    s_cmp_lt_i32 s6, s4
-; GFX10-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s5, s4
-; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s8, s9
+; GFX10-NEXT:    s_cselect_b32 s7, s4, s5
+; GFX10-NEXT:    s_cmp_gt_i32 s6, -1
+; GFX10-NEXT:    s_cselect_b32 s12, s6, -1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s7, s12
+; GFX10-NEXT:    s_lshr_b32 s12, s7, 16
+; GFX10-NEXT:    s_sub_i32 s7, s7, s8
+; GFX10-NEXT:    s_sub_i32 s8, s12, s10
+; GFX10-NEXT:    s_cmp_lt_i32 s4, s5
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX10-NEXT:    s_cmp_lt_i32 s6, -1
+; GFX10-NEXT:    s_cselect_b32 s5, s6, -1
+; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX10-NEXT:    s_pack_ll_b32_b16 s5, s7, s8
 ; GFX10-NEXT:    s_lshr_b32 s6, s4, 16
 ; GFX10-NEXT:    s_sext_i32_i16 s7, s5
 ; GFX10-NEXT:    s_sext_i32_i16 s8, s3
-; GFX10-NEXT:    s_sub_i32 s4, s4, s12
-; GFX10-NEXT:    s_sub_i32 s6, s6, s10
+; GFX10-NEXT:    s_sub_i32 s4, s4, s11
+; GFX10-NEXT:    s_sub_i32 s6, s6, s9
 ; GFX10-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX10-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX10-NEXT:    s_cmp_gt_i32 s7, s8
@@ -5661,29 +5597,26 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX9-LABEL: v_ssubsat_v6i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, -1, -1
-; GFX9-NEXT:    s_movk_i32 s5, 0x8000
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
-; GFX9-NEXT:    v_pk_max_i16 v6, v0, s6
-; GFX9-NEXT:    v_pk_sub_i16 v6, v6, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s5
-; GFX9-NEXT:    v_pk_min_i16 v7, v0, s6
+; GFX9-NEXT:    v_pk_max_i16 v6, v0, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fff7fff
+; GFX9-NEXT:    v_pk_sub_i16 v6, v6, v7
+; GFX9-NEXT:    v_pk_min_i16 v8, v0, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0x80008000
 ; GFX9-NEXT:    v_pk_max_i16 v3, v6, v3
-; GFX9-NEXT:    v_pk_sub_i16 v7, v7, s5
-; GFX9-NEXT:    v_pk_min_i16 v3, v3, v7
+; GFX9-NEXT:    v_pk_sub_i16 v8, v8, v9
+; GFX9-NEXT:    v_pk_min_i16 v3, v3, v8
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v3
-; GFX9-NEXT:    v_pk_max_i16 v3, v1, s6
-; GFX9-NEXT:    v_pk_sub_i16 v3, v3, s4
-; GFX9-NEXT:    v_pk_min_i16 v6, v1, s6
+; GFX9-NEXT:    v_pk_max_i16 v3, v1, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_sub_i16 v3, v3, v7
+; GFX9-NEXT:    v_pk_min_i16 v6, v1, -1 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_max_i16 v3, v3, v4
-; GFX9-NEXT:    v_pk_sub_i16 v6, v6, s5
+; GFX9-NEXT:    v_pk_sub_i16 v6, v6, v9
 ; GFX9-NEXT:    v_pk_min_i16 v3, v3, v6
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v3
-; GFX9-NEXT:    v_pk_max_i16 v3, v2, s6
-; GFX9-NEXT:    v_pk_sub_i16 v3, v3, s4
-; GFX9-NEXT:    v_pk_min_i16 v4, v2, s6
-; GFX9-NEXT:    v_pk_sub_i16 v4, v4, s5
+; GFX9-NEXT:    v_pk_max_i16 v3, v2, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_sub_i16 v3, v3, v7
+; GFX9-NEXT:    v_pk_min_i16 v4, v2, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_sub_i16 v4, v4, v9
 ; GFX9-NEXT:    v_pk_max_i16 v3, v3, v5
 ; GFX9-NEXT:    v_pk_min_i16 v3, v3, v4
 ; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v3
@@ -5693,28 +5626,23 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:
s_pack_ll_b32_b16 s5, -1, -1 -; GFX10-NEXT: s_movk_i32 s4, 0x7fff -; GFX10-NEXT: v_pk_max_i16 v6, v0, s5 -; GFX10-NEXT: v_pk_max_i16 v8, v1, s5 -; GFX10-NEXT: v_pk_max_i16 v9, v2, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX10-NEXT: v_pk_min_i16 v7, v0, s5 -; GFX10-NEXT: v_pk_sub_i16 v6, v6, s4 -; GFX10-NEXT: v_pk_sub_i16 v15, v8, s4 -; GFX10-NEXT: v_pk_sub_i16 v19, v9, s4 -; GFX10-NEXT: v_pk_min_i16 v10, v1, s5 -; GFX10-NEXT: v_pk_min_i16 v11, v2, s5 -; GFX10-NEXT: s_movk_i32 s6, 0x8000 +; GFX10-NEXT: v_pk_max_i16 v6, v0, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_max_i16 v8, v1, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_max_i16 v9, v2, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_min_i16 v7, v0, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_min_i16 v10, v1, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v6, v6, 0x7fff7fff +; GFX10-NEXT: v_pk_sub_i16 v15, v8, 0x7fff7fff +; GFX10-NEXT: v_pk_min_i16 v11, v2, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v19, v9, 0x7fff7fff +; GFX10-NEXT: v_pk_sub_i16 v7, v7, 0x80008000 ; GFX10-NEXT: v_pk_max_i16 v14, v6, v3 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 +; GFX10-NEXT: v_pk_sub_i16 v6, v10, 0x80008000 ; GFX10-NEXT: v_pk_max_i16 v4, v15, v4 -; GFX10-NEXT: v_pk_sub_i16 v7, v7, s6 -; GFX10-NEXT: v_pk_sub_i16 v6, v10, s6 -; GFX10-NEXT: v_pk_sub_i16 v8, v11, s6 +; GFX10-NEXT: v_pk_sub_i16 v8, v11, 0x80008000 ; GFX10-NEXT: v_pk_max_i16 v5, v19, v5 -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_pk_min_i16 v3, v14, v7 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_pk_min_i16 v4, v4, v6 ; GFX10-NEXT: v_pk_min_i16 v5, v5, v8 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v3 @@ -5953,120 +5881,116 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; ; GFX9-LABEL: s_ssubsat_v6i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s8, -1, -1 +; GFX9-NEXT: s_sext_i32_i16 s8, s0 +; GFX9-NEXT: s_ashr_i32 s9, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s10, -1 +; GFX9-NEXT: s_cmp_gt_i32 s8, s10 +; GFX9-NEXT: s_cselect_b32 s11, s8, s10 +; GFX9-NEXT: s_cmp_gt_i32 s9, -1 +; GFX9-NEXT: s_cselect_b32 s12, s9, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX9-NEXT: s_mov_b32 s6, 0x7fff7fff +; GFX9-NEXT: s_lshr_b32 s12, s11, 16 +; GFX9-NEXT: s_movk_i32 s13, 0x7fff +; GFX9-NEXT: s_sub_i32 s11, s11, s6 +; GFX9-NEXT: s_sub_i32 s12, s12, s13 +; GFX9-NEXT: s_cmp_lt_i32 s8, s10 +; GFX9-NEXT: s_cselect_b32 s8, s8, s10 +; GFX9-NEXT: s_cmp_lt_i32 s9, -1 +; GFX9-NEXT: s_cselect_b32 s9, s9, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX9-NEXT: s_mov_b32 s7, 0x80008000 +; GFX9-NEXT: s_lshr_b32 s9, s8, 16 +; GFX9-NEXT: s_mov_b32 s12, 0x8000 +; GFX9-NEXT: s_sub_i32 s8, s8, s7 +; GFX9-NEXT: s_sub_i32 s9, s9, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX9-NEXT: s_sext_i32_i16 s9, s11 +; GFX9-NEXT: s_sext_i32_i16 s14, s3 +; GFX9-NEXT: s_ashr_i32 s11, s11, 16 +; GFX9-NEXT: s_ashr_i32 s3, s3, 16 +; GFX9-NEXT: s_cmp_gt_i32 s9, s14 +; GFX9-NEXT: s_cselect_b32 s9, s9, s14 +; GFX9-NEXT: s_cmp_gt_i32 s11, s3 +; GFX9-NEXT: s_cselect_b32 s3, s11, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s9, s3 +; GFX9-NEXT: s_sext_i32_i16 s9, s3 ; GFX9-NEXT: s_sext_i32_i16 s11, s8 -; GFX9-NEXT: s_sext_i32_i16 s9, s0 -; GFX9-NEXT: s_ashr_i32 s10, s0, 16 +; GFX9-NEXT: s_ashr_i32 s3, s3, 16 ; GFX9-NEXT: s_ashr_i32 s8, s8, 16 -; GFX9-NEXT: s_cmp_gt_i32 s9, s11 -; GFX9-NEXT: s_cselect_b32 s12, s9, s11 -; GFX9-NEXT: s_cmp_gt_i32 s10, s8 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: s_cselect_b32 s13, s10, s8 -; GFX9-NEXT: 
s_pack_ll_b32_b16 s12, s12, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s6 -; GFX9-NEXT: s_lshr_b32 s13, s12, 16 -; GFX9-NEXT: s_lshr_b32 s14, s6, 16 -; GFX9-NEXT: s_sub_i32 s12, s12, s6 -; GFX9-NEXT: s_sub_i32 s13, s13, s14 ; GFX9-NEXT: s_cmp_lt_i32 s9, s11 ; GFX9-NEXT: s_cselect_b32 s9, s9, s11 -; GFX9-NEXT: s_cmp_lt_i32 s10, s8 -; GFX9-NEXT: s_movk_i32 s7, 0x8000 -; GFX9-NEXT: s_cselect_b32 s10, s10, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s7 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s13 -; GFX9-NEXT: s_lshr_b32 s10, s9, 16 -; GFX9-NEXT: s_lshr_b32 s13, s7, 16 -; GFX9-NEXT: s_sub_i32 s9, s9, s7 -; GFX9-NEXT: s_sub_i32 s10, s10, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX9-NEXT: s_sext_i32_i16 s10, s12 -; GFX9-NEXT: s_sext_i32_i16 s15, s3 -; GFX9-NEXT: s_ashr_i32 s12, s12, 16 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_cmp_gt_i32 s10, s15 -; GFX9-NEXT: s_cselect_b32 s10, s10, s15 -; GFX9-NEXT: s_cmp_gt_i32 s12, s3 -; GFX9-NEXT: s_cselect_b32 s3, s12, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s10, s3 -; GFX9-NEXT: s_sext_i32_i16 s10, s3 -; GFX9-NEXT: s_sext_i32_i16 s12, s9 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_ashr_i32 s9, s9, 16 -; GFX9-NEXT: s_cmp_lt_i32 s10, s12 -; GFX9-NEXT: s_cselect_b32 s10, s10, s12 -; GFX9-NEXT: s_cmp_lt_i32 s3, s9 -; GFX9-NEXT: s_cselect_b32 s3, s3, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s10, s3 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_lshr_b32 s10, s3, 16 +; GFX9-NEXT: s_cmp_lt_i32 s3, s8 +; GFX9-NEXT: s_cselect_b32 s3, s3, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s9, s3 +; GFX9-NEXT: s_lshr_b32 s8, s0, 16 +; GFX9-NEXT: s_lshr_b32 s9, s3, 16 ; GFX9-NEXT: s_sub_i32 s0, s0, s3 -; GFX9-NEXT: s_sub_i32 s3, s9, s10 +; GFX9-NEXT: s_sub_i32 s3, s8, s9 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_sext_i32_i16 s3, s1 -; GFX9-NEXT: s_ashr_i32 s9, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s3, s11 -; GFX9-NEXT: s_cselect_b32 s10, s3, s11 -; GFX9-NEXT: s_cmp_gt_i32 s9, s8 -; GFX9-NEXT: s_cselect_b32 s12, s9, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s12 -; GFX9-NEXT: s_lshr_b32 s12, s10, 16 -; GFX9-NEXT: s_sub_i32 s10, s10, s6 -; GFX9-NEXT: s_sub_i32 s12, s12, s14 -; GFX9-NEXT: s_cmp_lt_i32 s3, s11 -; GFX9-NEXT: s_cselect_b32 s3, s3, s11 -; GFX9-NEXT: s_cmp_lt_i32 s9, s8 -; GFX9-NEXT: s_cselect_b32 s9, s9, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s9 -; GFX9-NEXT: s_lshr_b32 s9, s3, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s12 +; GFX9-NEXT: s_ashr_i32 s8, s1, 16 +; GFX9-NEXT: s_cmp_gt_i32 s3, s10 +; GFX9-NEXT: s_cselect_b32 s9, s3, s10 +; GFX9-NEXT: s_cmp_gt_i32 s8, -1 +; GFX9-NEXT: s_cselect_b32 s11, s8, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s11 +; GFX9-NEXT: s_lshr_b32 s11, s9, 16 +; GFX9-NEXT: s_sub_i32 s9, s9, s6 +; GFX9-NEXT: s_sub_i32 s11, s11, s13 +; GFX9-NEXT: s_cmp_lt_i32 s3, s10 +; GFX9-NEXT: s_cselect_b32 s3, s3, s10 +; GFX9-NEXT: s_cmp_lt_i32 s8, -1 +; GFX9-NEXT: s_cselect_b32 s8, s8, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s8 +; GFX9-NEXT: s_lshr_b32 s8, s3, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s11 ; GFX9-NEXT: s_sub_i32 s3, s3, s7 -; GFX9-NEXT: s_sub_i32 s9, s9, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s9 -; GFX9-NEXT: s_sext_i32_i16 s9, s10 -; GFX9-NEXT: s_sext_i32_i16 s12, s4 -; GFX9-NEXT: s_ashr_i32 s10, s10, 16 +; GFX9-NEXT: s_sub_i32 s8, s8, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s8 +; GFX9-NEXT: s_sext_i32_i16 s8, s9 +; GFX9-NEXT: s_sext_i32_i16 s11, s4 +; GFX9-NEXT: s_ashr_i32 s9, s9, 16 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: 
s_cmp_gt_i32 s9, s12 -; GFX9-NEXT: s_cselect_b32 s9, s9, s12 -; GFX9-NEXT: s_cmp_gt_i32 s10, s4 -; GFX9-NEXT: s_cselect_b32 s4, s10, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s9, s4 -; GFX9-NEXT: s_sext_i32_i16 s9, s4 -; GFX9-NEXT: s_sext_i32_i16 s10, s3 +; GFX9-NEXT: s_cmp_gt_i32 s8, s11 +; GFX9-NEXT: s_cselect_b32 s8, s8, s11 +; GFX9-NEXT: s_cmp_gt_i32 s9, s4 +; GFX9-NEXT: s_cselect_b32 s4, s9, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s4 +; GFX9-NEXT: s_sext_i32_i16 s8, s4 +; GFX9-NEXT: s_sext_i32_i16 s9, s3 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 ; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_cmp_lt_i32 s9, s10 -; GFX9-NEXT: s_cselect_b32 s9, s9, s10 +; GFX9-NEXT: s_cmp_lt_i32 s8, s9 +; GFX9-NEXT: s_cselect_b32 s8, s8, s9 ; GFX9-NEXT: s_cmp_lt_i32 s4, s3 ; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s9, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s8, s3 ; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_lshr_b32 s9, s3, 16 +; GFX9-NEXT: s_lshr_b32 s8, s3, 16 ; GFX9-NEXT: s_sub_i32 s1, s1, s3 -; GFX9-NEXT: s_sub_i32 s3, s4, s9 +; GFX9-NEXT: s_sub_i32 s3, s4, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX9-NEXT: s_sext_i32_i16 s3, s2 ; GFX9-NEXT: s_ashr_i32 s4, s2, 16 -; GFX9-NEXT: s_cmp_gt_i32 s3, s11 -; GFX9-NEXT: s_cselect_b32 s9, s3, s11 -; GFX9-NEXT: s_cmp_gt_i32 s4, s8 -; GFX9-NEXT: s_cselect_b32 s10, s4, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX9-NEXT: s_lshr_b32 s10, s9, 16 -; GFX9-NEXT: s_sub_i32 s6, s9, s6 -; GFX9-NEXT: s_sub_i32 s9, s10, s14 -; GFX9-NEXT: s_cmp_lt_i32 s3, s11 -; GFX9-NEXT: s_cselect_b32 s3, s3, s11 -; GFX9-NEXT: s_cmp_lt_i32 s4, s8 -; GFX9-NEXT: s_cselect_b32 s4, s4, s8 +; GFX9-NEXT: s_cmp_gt_i32 s3, s10 +; GFX9-NEXT: s_cselect_b32 s8, s3, s10 +; GFX9-NEXT: s_cmp_gt_i32 s4, -1 +; GFX9-NEXT: s_cselect_b32 s9, s4, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX9-NEXT: s_lshr_b32 s9, s8, 16 +; GFX9-NEXT: s_sub_i32 s6, s8, s6 +; GFX9-NEXT: s_sub_i32 s8, s9, s13 +; GFX9-NEXT: s_cmp_lt_i32 s3, s10 +; GFX9-NEXT: s_cselect_b32 s3, s3, s10 +; GFX9-NEXT: s_cmp_lt_i32 s4, -1 +; GFX9-NEXT: s_cselect_b32 s4, s4, -1 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_sub_i32 s3, s3, s7 -; GFX9-NEXT: s_sub_i32 s4, s4, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s9 +; GFX9-NEXT: s_sub_i32 s4, s4, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: s_sext_i32_i16 s4, s6 ; GFX9-NEXT: s_sext_i32_i16 s7, s5 @@ -6095,123 +6019,119 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; ; GFX10-LABEL: s_ssubsat_v6i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX10-NEXT: s_sext_i32_i16 s7, s0 -; GFX10-NEXT: s_sext_i32_i16 s9, s6 +; GFX10-NEXT: s_sext_i32_i16 s6, s0 +; GFX10-NEXT: s_sext_i32_i16 s7, -1 ; GFX10-NEXT: s_ashr_i32 s8, s0, 16 -; GFX10-NEXT: s_ashr_i32 s6, s6, 16 -; GFX10-NEXT: s_cmp_gt_i32 s7, s9 +; GFX10-NEXT: s_cmp_gt_i32 s6, s7 ; GFX10-NEXT: s_movk_i32 s12, 0x7fff -; GFX10-NEXT: s_cselect_b32 s10, s7, s9 -; GFX10-NEXT: s_cmp_gt_i32 s8, s6 -; GFX10-NEXT: s_movk_i32 s14, 0x8000 -; GFX10-NEXT: s_cselect_b32 s11, s8, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s14, s14, s14 -; GFX10-NEXT: s_pack_ll_b32_b16 s10, s10, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s11, s12, s12 -; GFX10-NEXT: s_lshr_b32 s12, s10, 16 -; GFX10-NEXT: s_lshr_b32 s13, s11, 16 -; GFX10-NEXT: s_sub_i32 s10, s10, s11 -; GFX10-NEXT: s_sub_i32 s12, s12, s13 -; GFX10-NEXT: s_cmp_lt_i32 s7, s9 -; GFX10-NEXT: s_sext_i32_i16 s16, s3 -; 
GFX10-NEXT: s_cselect_b32 s7, s7, s9 -; GFX10-NEXT: s_cmp_lt_i32 s8, s6 +; GFX10-NEXT: s_cselect_b32 s9, s6, s7 +; GFX10-NEXT: s_cmp_gt_i32 s8, -1 +; GFX10-NEXT: s_mov_b32 s13, 0x80008000 +; GFX10-NEXT: s_cselect_b32 s10, s8, -1 +; GFX10-NEXT: s_sext_i32_i16 s15, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX10-NEXT: s_mov_b32 s10, 0x7fff7fff +; GFX10-NEXT: s_lshr_b32 s11, s9, 16 +; GFX10-NEXT: s_sub_i32 s9, s9, s10 +; GFX10-NEXT: s_sub_i32 s11, s11, s12 +; GFX10-NEXT: s_cmp_lt_i32 s6, s7 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s8, s8, s6 +; GFX10-NEXT: s_cselect_b32 s6, s6, s7 +; GFX10-NEXT: s_cmp_lt_i32 s8, -1 +; GFX10-NEXT: s_cselect_b32 s8, s8, -1 ; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s8, s10, s12 -; GFX10-NEXT: s_lshr_b32 s10, s7, 16 -; GFX10-NEXT: s_lshr_b32 s12, s14, 16 -; GFX10-NEXT: s_sext_i32_i16 s15, s8 -; GFX10-NEXT: s_sub_i32 s7, s7, s14 -; GFX10-NEXT: s_sub_i32 s10, s10, s12 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s8 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s9, s11 +; GFX10-NEXT: s_lshr_b32 s9, s6, 16 +; GFX10-NEXT: s_mov_b32 s11, 0x8000 +; GFX10-NEXT: s_sext_i32_i16 s14, s8 +; GFX10-NEXT: s_sub_i32 s6, s6, s13 +; GFX10-NEXT: s_sub_i32 s9, s9, s11 ; GFX10-NEXT: s_ashr_i32 s8, s8, 16 -; GFX10-NEXT: s_cmp_gt_i32 s15, s16 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s10 -; GFX10-NEXT: s_cselect_b32 s15, s15, s16 +; GFX10-NEXT: s_cmp_gt_i32 s14, s15 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s9 +; GFX10-NEXT: s_cselect_b32 s14, s14, s15 ; GFX10-NEXT: s_cmp_gt_i32 s8, s3 -; GFX10-NEXT: s_sext_i32_i16 s10, s7 +; GFX10-NEXT: s_sext_i32_i16 s9, s6 ; GFX10-NEXT: s_cselect_b32 s3, s8, s3 -; GFX10-NEXT: s_ashr_i32 s7, s7, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s15, s3 -; GFX10-NEXT: s_sext_i32_i16 s16, s4 +; GFX10-NEXT: s_ashr_i32 s6, s6, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s14, s3 +; GFX10-NEXT: s_sext_i32_i16 s15, s4 ; GFX10-NEXT: s_sext_i32_i16 s8, s3 ; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_cmp_lt_i32 s8, s10 -; GFX10-NEXT: s_cselect_b32 s8, s8, s10 -; GFX10-NEXT: s_cmp_lt_i32 s3, s7 -; GFX10-NEXT: s_cselect_b32 s3, s3, s7 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_cmp_lt_i32 s8, s9 +; GFX10-NEXT: s_cselect_b32 s8, s8, s9 +; GFX10-NEXT: s_cmp_lt_i32 s3, s6 +; GFX10-NEXT: s_cselect_b32 s3, s3, s6 +; GFX10-NEXT: s_lshr_b32 s6, s0, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s8, s3 ; GFX10-NEXT: s_lshr_b32 s8, s3, 16 ; GFX10-NEXT: s_sub_i32 s0, s0, s3 -; GFX10-NEXT: s_sub_i32 s3, s7, s8 -; GFX10-NEXT: s_sext_i32_i16 s7, s1 +; GFX10-NEXT: s_sub_i32 s3, s6, s8 +; GFX10-NEXT: s_sext_i32_i16 s6, s1 ; GFX10-NEXT: s_ashr_i32 s8, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s7, s9 +; GFX10-NEXT: s_cmp_gt_i32 s6, s7 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: s_cselect_b32 s10, s7, s9 -; GFX10-NEXT: s_cmp_gt_i32 s8, s6 -; GFX10-NEXT: s_cselect_b32 s15, s8, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s10, s10, s15 -; GFX10-NEXT: s_lshr_b32 s15, s10, 16 -; GFX10-NEXT: s_sub_i32 s10, s10, s11 -; GFX10-NEXT: s_sub_i32 s15, s15, s13 -; GFX10-NEXT: s_cmp_lt_i32 s7, s9 -; GFX10-NEXT: s_cselect_b32 s7, s7, s9 -; GFX10-NEXT: s_cmp_lt_i32 s8, s6 -; GFX10-NEXT: s_cselect_b32 s8, s8, s6 +; GFX10-NEXT: s_cselect_b32 s9, s6, s7 +; GFX10-NEXT: s_cmp_gt_i32 s8, -1 +; GFX10-NEXT: s_cselect_b32 s14, s8, -1 +; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX10-NEXT: s_lshr_b32 s14, s9, 16 +; GFX10-NEXT: s_sub_i32 s9, s9, s10 +; GFX10-NEXT: s_sub_i32 s14, s14, s12 +; GFX10-NEXT: s_cmp_lt_i32 s6, 
s7 +; GFX10-NEXT: s_cselect_b32 s6, s6, s7 +; GFX10-NEXT: s_cmp_lt_i32 s8, -1 +; GFX10-NEXT: s_cselect_b32 s8, s8, -1 ; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s8, s10, s15 -; GFX10-NEXT: s_lshr_b32 s10, s7, 16 -; GFX10-NEXT: s_sext_i32_i16 s15, s8 -; GFX10-NEXT: s_sub_i32 s7, s7, s14 -; GFX10-NEXT: s_sub_i32 s10, s10, s12 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s8 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s9, s14 +; GFX10-NEXT: s_lshr_b32 s9, s6, 16 +; GFX10-NEXT: s_sext_i32_i16 s14, s8 +; GFX10-NEXT: s_sub_i32 s6, s6, s13 +; GFX10-NEXT: s_sub_i32 s9, s9, s11 ; GFX10-NEXT: s_ashr_i32 s8, s8, 16 -; GFX10-NEXT: s_cmp_gt_i32 s15, s16 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s10 -; GFX10-NEXT: s_cselect_b32 s15, s15, s16 +; GFX10-NEXT: s_cmp_gt_i32 s14, s15 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s9 +; GFX10-NEXT: s_cselect_b32 s14, s14, s15 ; GFX10-NEXT: s_cmp_gt_i32 s8, s4 -; GFX10-NEXT: s_sext_i32_i16 s10, s7 +; GFX10-NEXT: s_sext_i32_i16 s9, s6 ; GFX10-NEXT: s_cselect_b32 s4, s8, s4 -; GFX10-NEXT: s_ashr_i32 s7, s7, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s15, s4 +; GFX10-NEXT: s_ashr_i32 s6, s6, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s14, s4 ; GFX10-NEXT: s_sext_i32_i16 s8, s4 ; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_i32 s8, s10 -; GFX10-NEXT: s_cselect_b32 s8, s8, s10 -; GFX10-NEXT: s_cmp_lt_i32 s4, s7 -; GFX10-NEXT: s_cselect_b32 s4, s4, s7 -; GFX10-NEXT: s_lshr_b32 s7, s1, 16 +; GFX10-NEXT: s_cmp_lt_i32 s8, s9 +; GFX10-NEXT: s_cselect_b32 s8, s8, s9 +; GFX10-NEXT: s_cmp_lt_i32 s4, s6 +; GFX10-NEXT: s_cselect_b32 s4, s4, s6 +; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s4, s8, s4 ; GFX10-NEXT: s_lshr_b32 s8, s4, 16 ; GFX10-NEXT: s_sub_i32 s1, s1, s4 -; GFX10-NEXT: s_sub_i32 s4, s7, s8 -; GFX10-NEXT: s_sext_i32_i16 s7, s2 +; GFX10-NEXT: s_sub_i32 s4, s6, s8 +; GFX10-NEXT: s_sext_i32_i16 s6, s2 ; GFX10-NEXT: s_ashr_i32 s8, s2, 16 -; GFX10-NEXT: s_cmp_gt_i32 s7, s9 +; GFX10-NEXT: s_cmp_gt_i32 s6, s7 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX10-NEXT: s_cselect_b32 s10, s7, s9 -; GFX10-NEXT: s_cmp_gt_i32 s8, s6 -; GFX10-NEXT: s_cselect_b32 s15, s8, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s10, s10, s15 -; GFX10-NEXT: s_lshr_b32 s15, s10, 16 -; GFX10-NEXT: s_sub_i32 s10, s10, s11 -; GFX10-NEXT: s_sub_i32 s11, s15, s13 -; GFX10-NEXT: s_cmp_lt_i32 s7, s9 -; GFX10-NEXT: s_cselect_b32 s7, s7, s9 -; GFX10-NEXT: s_cmp_lt_i32 s8, s6 -; GFX10-NEXT: s_cselect_b32 s6, s8, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s7, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s10, s11 +; GFX10-NEXT: s_cselect_b32 s9, s6, s7 +; GFX10-NEXT: s_cmp_gt_i32 s8, -1 +; GFX10-NEXT: s_cselect_b32 s14, s8, -1 +; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX10-NEXT: s_lshr_b32 s14, s9, 16 +; GFX10-NEXT: s_sub_i32 s9, s9, s10 +; GFX10-NEXT: s_sub_i32 s10, s14, s12 +; GFX10-NEXT: s_cmp_lt_i32 s6, s7 +; GFX10-NEXT: s_cselect_b32 s6, s6, s7 +; GFX10-NEXT: s_cmp_lt_i32 s8, -1 +; GFX10-NEXT: s_cselect_b32 s7, s8, -1 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s7, s9, s10 ; GFX10-NEXT: s_lshr_b32 s8, s6, 16 ; GFX10-NEXT: s_sext_i32_i16 s9, s7 ; GFX10-NEXT: s_sext_i32_i16 s10, s5 -; GFX10-NEXT: s_sub_i32 s6, s6, s14 -; GFX10-NEXT: s_sub_i32 s8, s8, s12 +; GFX10-NEXT: s_sub_i32 s6, s6, s13 +; GFX10-NEXT: s_sub_i32 s8, s8, s11 ; GFX10-NEXT: s_ashr_i32 s7, s7, 16 ; GFX10-NEXT: s_ashr_i32 s5, s5, 16 ; GFX10-NEXT: s_cmp_gt_i32 s9, s10 @@ -6423,36 +6343,33 @@ define <4 x float> @v_ssubsat_v8i16(<8 x 
i16> %lhs, <8 x i16> %rhs) { ; GFX9-LABEL: v_ssubsat_v8i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX9-NEXT: s_movk_i32 s5, 0x8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_max_i16 v8, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v8, v8, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: v_pk_min_i16 v9, v0, s6 +; GFX9-NEXT: v_pk_max_i16 v8, v0, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fff7fff +; GFX9-NEXT: v_pk_sub_i16 v8, v8, v9 +; GFX9-NEXT: v_pk_min_i16 v10, v0, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v11, 0x80008000 ; GFX9-NEXT: v_pk_max_i16 v4, v8, v4 -; GFX9-NEXT: v_pk_sub_i16 v9, v9, s5 -; GFX9-NEXT: v_pk_min_i16 v4, v4, v9 +; GFX9-NEXT: v_pk_sub_i16 v10, v10, v11 +; GFX9-NEXT: v_pk_min_i16 v4, v4, v10 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v4 -; GFX9-NEXT: v_pk_max_i16 v4, v1, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, v4, s4 -; GFX9-NEXT: v_pk_min_i16 v8, v1, s6 +; GFX9-NEXT: v_pk_max_i16 v4, v1, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_i16 v4, v4, v9 +; GFX9-NEXT: v_pk_min_i16 v8, v1, -1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_max_i16 v4, v4, v5 -; GFX9-NEXT: v_pk_sub_i16 v8, v8, s5 +; GFX9-NEXT: v_pk_sub_i16 v8, v8, v11 ; GFX9-NEXT: v_pk_min_i16 v4, v4, v8 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, v4 -; GFX9-NEXT: v_pk_max_i16 v4, v2, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, v4, s4 -; GFX9-NEXT: v_pk_min_i16 v5, v2, s6 -; GFX9-NEXT: v_pk_sub_i16 v5, v5, s5 +; GFX9-NEXT: v_pk_max_i16 v4, v2, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_i16 v4, v4, v9 +; GFX9-NEXT: v_pk_min_i16 v5, v2, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_i16 v5, v5, v11 ; GFX9-NEXT: v_pk_max_i16 v4, v4, v6 ; GFX9-NEXT: v_pk_min_i16 v4, v4, v5 ; GFX9-NEXT: v_pk_sub_i16 v2, v2, v4 -; GFX9-NEXT: v_pk_max_i16 v4, v3, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, v4, s4 -; GFX9-NEXT: v_pk_min_i16 v5, v3, s6 -; GFX9-NEXT: v_pk_sub_i16 v5, v5, s5 +; GFX9-NEXT: v_pk_max_i16 v4, v3, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_i16 v4, v4, v9 +; GFX9-NEXT: v_pk_min_i16 v5, v3, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_i16 v5, v5, v11 ; GFX9-NEXT: v_pk_max_i16 v4, v4, v7 ; GFX9-NEXT: v_pk_min_i16 v4, v4, v5 ; GFX9-NEXT: v_pk_sub_i16 v3, v3, v4 @@ -6462,30 +6379,25 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX10-NEXT: s_movk_i32 s5, 0x7fff -; GFX10-NEXT: v_pk_max_i16 v8, v0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX10-NEXT: v_pk_max_i16 v10, v1, s4 -; GFX10-NEXT: v_pk_max_i16 v12, v3, s4 -; GFX10-NEXT: v_pk_min_i16 v9, v0, s4 -; GFX10-NEXT: v_pk_sub_i16 v15, v8, s5 -; GFX10-NEXT: v_pk_max_i16 v8, v2, s4 -; GFX10-NEXT: v_pk_sub_i16 v10, v10, s5 -; GFX10-NEXT: v_pk_sub_i16 v12, v12, s5 -; GFX10-NEXT: v_pk_min_i16 v11, v1, s4 -; GFX10-NEXT: v_pk_min_i16 v13, v2, s4 -; GFX10-NEXT: v_pk_sub_i16 v8, v8, s5 -; GFX10-NEXT: v_pk_min_i16 v14, v3, s4 -; GFX10-NEXT: s_movk_i32 s6, 0x8000 +; GFX10-NEXT: v_pk_max_i16 v8, v0, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_max_i16 v10, v1, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_max_i16 v12, v3, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_min_i16 v9, v0, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_min_i16 v11, v1, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v15, v8, 0x7fff7fff +; GFX10-NEXT: v_pk_max_i16 v8, v2, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v10, v10, 0x7fff7fff +; GFX10-NEXT: 
v_pk_sub_i16 v12, v12, 0x7fff7fff +; GFX10-NEXT: v_pk_min_i16 v13, v2, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_min_i16 v14, v3, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v8, v8, 0x7fff7fff ; GFX10-NEXT: v_pk_max_i16 v4, v15, v4 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 +; GFX10-NEXT: v_pk_sub_i16 v9, v9, 0x80008000 ; GFX10-NEXT: v_pk_max_i16 v5, v10, v5 -; GFX10-NEXT: v_pk_sub_i16 v11, v11, s6 -; GFX10-NEXT: v_pk_sub_i16 v9, v9, s6 +; GFX10-NEXT: v_pk_sub_i16 v11, v11, 0x80008000 ; GFX10-NEXT: v_pk_max_i16 v15, v8, v6 -; GFX10-NEXT: v_pk_sub_i16 v10, v13, s6 -; GFX10-NEXT: v_pk_sub_i16 v8, v14, s6 +; GFX10-NEXT: v_pk_sub_i16 v10, v13, 0x80008000 +; GFX10-NEXT: v_pk_sub_i16 v8, v14, 0x80008000 ; GFX10-NEXT: v_pk_max_i16 v7, v12, v7 ; GFX10-NEXT: v_pk_min_i16 v19, v4, v9 ; GFX10-NEXT: v_pk_min_i16 v11, v5, v11 @@ -6799,136 +6711,132 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; ; GFX9-LABEL: s_ssubsat_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s10, -1, -1 +; GFX9-NEXT: s_sext_i32_i16 s10, s0 +; GFX9-NEXT: s_ashr_i32 s11, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s12, -1 +; GFX9-NEXT: s_cmp_gt_i32 s10, s12 +; GFX9-NEXT: s_cselect_b32 s13, s10, s12 +; GFX9-NEXT: s_cmp_gt_i32 s11, -1 +; GFX9-NEXT: s_cselect_b32 s14, s11, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14 +; GFX9-NEXT: s_mov_b32 s8, 0x7fff7fff +; GFX9-NEXT: s_lshr_b32 s14, s13, 16 +; GFX9-NEXT: s_movk_i32 s15, 0x7fff +; GFX9-NEXT: s_sub_i32 s13, s13, s8 +; GFX9-NEXT: s_sub_i32 s14, s14, s15 +; GFX9-NEXT: s_cmp_lt_i32 s10, s12 +; GFX9-NEXT: s_cselect_b32 s10, s10, s12 +; GFX9-NEXT: s_cmp_lt_i32 s11, -1 +; GFX9-NEXT: s_cselect_b32 s11, s11, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14 +; GFX9-NEXT: s_mov_b32 s9, 0x80008000 +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: s_mov_b32 s14, 0x8000 +; GFX9-NEXT: s_sub_i32 s10, s10, s9 +; GFX9-NEXT: s_sub_i32 s11, s11, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 +; GFX9-NEXT: s_sext_i32_i16 s11, s13 +; GFX9-NEXT: s_sext_i32_i16 s16, s4 +; GFX9-NEXT: s_ashr_i32 s13, s13, 16 +; GFX9-NEXT: s_ashr_i32 s4, s4, 16 +; GFX9-NEXT: s_cmp_gt_i32 s11, s16 +; GFX9-NEXT: s_cselect_b32 s11, s11, s16 +; GFX9-NEXT: s_cmp_gt_i32 s13, s4 +; GFX9-NEXT: s_cselect_b32 s4, s13, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX9-NEXT: s_sext_i32_i16 s11, s4 ; GFX9-NEXT: s_sext_i32_i16 s13, s10 -; GFX9-NEXT: s_sext_i32_i16 s11, s0 -; GFX9-NEXT: s_ashr_i32 s12, s0, 16 +; GFX9-NEXT: s_ashr_i32 s4, s4, 16 ; GFX9-NEXT: s_ashr_i32 s10, s10, 16 -; GFX9-NEXT: s_cmp_gt_i32 s11, s13 -; GFX9-NEXT: s_cselect_b32 s14, s11, s13 -; GFX9-NEXT: s_cmp_gt_i32 s12, s10 -; GFX9-NEXT: s_movk_i32 s8, 0x7fff -; GFX9-NEXT: s_cselect_b32 s15, s12, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s8 -; GFX9-NEXT: s_lshr_b32 s15, s14, 16 -; GFX9-NEXT: s_lshr_b32 s16, s8, 16 -; GFX9-NEXT: s_sub_i32 s14, s14, s8 -; GFX9-NEXT: s_sub_i32 s15, s15, s16 ; GFX9-NEXT: s_cmp_lt_i32 s11, s13 ; GFX9-NEXT: s_cselect_b32 s11, s11, s13 -; GFX9-NEXT: s_cmp_lt_i32 s12, s10 -; GFX9-NEXT: s_movk_i32 s9, 0x8000 -; GFX9-NEXT: s_cselect_b32 s12, s12, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s15 -; GFX9-NEXT: s_lshr_b32 s12, s11, 16 -; GFX9-NEXT: s_lshr_b32 s15, s9, 16 -; GFX9-NEXT: s_sub_i32 s11, s11, s9 -; GFX9-NEXT: s_sub_i32 s12, s12, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 -; GFX9-NEXT: s_sext_i32_i16 
s12, s14 -; GFX9-NEXT: s_sext_i32_i16 s17, s4 -; GFX9-NEXT: s_ashr_i32 s14, s14, 16 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_gt_i32 s12, s17 -; GFX9-NEXT: s_cselect_b32 s12, s12, s17 -; GFX9-NEXT: s_cmp_gt_i32 s14, s4 -; GFX9-NEXT: s_cselect_b32 s4, s14, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s12, s4 -; GFX9-NEXT: s_sext_i32_i16 s12, s4 -; GFX9-NEXT: s_sext_i32_i16 s14, s11 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_ashr_i32 s11, s11, 16 -; GFX9-NEXT: s_cmp_lt_i32 s12, s14 -; GFX9-NEXT: s_cselect_b32 s12, s12, s14 -; GFX9-NEXT: s_cmp_lt_i32 s4, s11 -; GFX9-NEXT: s_cselect_b32 s4, s4, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s12, s4 -; GFX9-NEXT: s_lshr_b32 s11, s0, 16 -; GFX9-NEXT: s_lshr_b32 s12, s4, 16 +; GFX9-NEXT: s_cmp_lt_i32 s4, s10 +; GFX9-NEXT: s_cselect_b32 s4, s4, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX9-NEXT: s_lshr_b32 s10, s0, 16 +; GFX9-NEXT: s_lshr_b32 s11, s4, 16 ; GFX9-NEXT: s_sub_i32 s0, s0, s4 -; GFX9-NEXT: s_sub_i32 s4, s11, s12 +; GFX9-NEXT: s_sub_i32 s4, s10, s11 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX9-NEXT: s_sext_i32_i16 s4, s1 -; GFX9-NEXT: s_ashr_i32 s11, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s12, s4, s13 -; GFX9-NEXT: s_cmp_gt_i32 s11, s10 -; GFX9-NEXT: s_cselect_b32 s14, s11, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s14 -; GFX9-NEXT: s_lshr_b32 s14, s12, 16 -; GFX9-NEXT: s_sub_i32 s12, s12, s8 -; GFX9-NEXT: s_sub_i32 s14, s14, s16 -; GFX9-NEXT: s_cmp_lt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s4, s4, s13 -; GFX9-NEXT: s_cmp_lt_i32 s11, s10 -; GFX9-NEXT: s_cselect_b32 s11, s11, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s11 -; GFX9-NEXT: s_lshr_b32 s11, s4, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s14 +; GFX9-NEXT: s_ashr_i32 s10, s1, 16 +; GFX9-NEXT: s_cmp_gt_i32 s4, s12 +; GFX9-NEXT: s_cselect_b32 s11, s4, s12 +; GFX9-NEXT: s_cmp_gt_i32 s10, -1 +; GFX9-NEXT: s_cselect_b32 s13, s10, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13 +; GFX9-NEXT: s_lshr_b32 s13, s11, 16 +; GFX9-NEXT: s_sub_i32 s11, s11, s8 +; GFX9-NEXT: s_sub_i32 s13, s13, s15 +; GFX9-NEXT: s_cmp_lt_i32 s4, s12 +; GFX9-NEXT: s_cselect_b32 s4, s4, s12 +; GFX9-NEXT: s_cmp_lt_i32 s10, -1 +; GFX9-NEXT: s_cselect_b32 s10, s10, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX9-NEXT: s_lshr_b32 s10, s4, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13 ; GFX9-NEXT: s_sub_i32 s4, s4, s9 -; GFX9-NEXT: s_sub_i32 s11, s11, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s11 -; GFX9-NEXT: s_sext_i32_i16 s11, s12 -; GFX9-NEXT: s_sext_i32_i16 s14, s5 -; GFX9-NEXT: s_ashr_i32 s12, s12, 16 +; GFX9-NEXT: s_sub_i32 s10, s10, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX9-NEXT: s_sext_i32_i16 s10, s11 +; GFX9-NEXT: s_sext_i32_i16 s13, s5 +; GFX9-NEXT: s_ashr_i32 s11, s11, 16 ; GFX9-NEXT: s_ashr_i32 s5, s5, 16 -; GFX9-NEXT: s_cmp_gt_i32 s11, s14 -; GFX9-NEXT: s_cselect_b32 s11, s11, s14 -; GFX9-NEXT: s_cmp_gt_i32 s12, s5 -; GFX9-NEXT: s_cselect_b32 s5, s12, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s11, s5 -; GFX9-NEXT: s_sext_i32_i16 s11, s5 -; GFX9-NEXT: s_sext_i32_i16 s12, s4 +; GFX9-NEXT: s_cmp_gt_i32 s10, s13 +; GFX9-NEXT: s_cselect_b32 s10, s10, s13 +; GFX9-NEXT: s_cmp_gt_i32 s11, s5 +; GFX9-NEXT: s_cselect_b32 s5, s11, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s10, s5 +; GFX9-NEXT: s_sext_i32_i16 s10, s5 +; GFX9-NEXT: s_sext_i32_i16 s11, s4 ; GFX9-NEXT: s_ashr_i32 s5, s5, 16 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_lt_i32 s11, s12 -; GFX9-NEXT: s_cselect_b32 s11, s11, s12 +; GFX9-NEXT: s_cmp_lt_i32 s10, 
s11 +; GFX9-NEXT: s_cselect_b32 s10, s10, s11 ; GFX9-NEXT: s_cmp_lt_i32 s5, s4 ; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s10, s4 ; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_lshr_b32 s11, s4, 16 +; GFX9-NEXT: s_lshr_b32 s10, s4, 16 ; GFX9-NEXT: s_sub_i32 s1, s1, s4 -; GFX9-NEXT: s_sub_i32 s4, s5, s11 +; GFX9-NEXT: s_sub_i32 s4, s5, s10 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-NEXT: s_sext_i32_i16 s4, s2 ; GFX9-NEXT: s_ashr_i32 s5, s2, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s11, s4, s13 -; GFX9-NEXT: s_cmp_gt_i32 s5, s10 -; GFX9-NEXT: s_cselect_b32 s12, s5, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 -; GFX9-NEXT: s_lshr_b32 s12, s11, 16 -; GFX9-NEXT: s_sub_i32 s11, s11, s8 -; GFX9-NEXT: s_sub_i32 s12, s12, s16 -; GFX9-NEXT: s_cmp_lt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s4, s4, s13 -; GFX9-NEXT: s_cmp_lt_i32 s5, s10 -; GFX9-NEXT: s_cselect_b32 s5, s5, s10 +; GFX9-NEXT: s_cmp_gt_i32 s4, s12 +; GFX9-NEXT: s_cselect_b32 s10, s4, s12 +; GFX9-NEXT: s_cmp_gt_i32 s5, -1 +; GFX9-NEXT: s_cselect_b32 s11, s5, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: s_sub_i32 s10, s10, s8 +; GFX9-NEXT: s_sub_i32 s11, s11, s15 +; GFX9-NEXT: s_cmp_lt_i32 s4, s12 +; GFX9-NEXT: s_cselect_b32 s4, s4, s12 +; GFX9-NEXT: s_cmp_lt_i32 s5, -1 +; GFX9-NEXT: s_cselect_b32 s5, s5, -1 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 ; GFX9-NEXT: s_sub_i32 s4, s4, s9 -; GFX9-NEXT: s_sub_i32 s5, s5, s15 +; GFX9-NEXT: s_sub_i32 s5, s5, s14 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_sext_i32_i16 s5, s11 -; GFX9-NEXT: s_sext_i32_i16 s12, s6 -; GFX9-NEXT: s_ashr_i32 s11, s11, 16 +; GFX9-NEXT: s_sext_i32_i16 s5, s10 +; GFX9-NEXT: s_sext_i32_i16 s11, s6 +; GFX9-NEXT: s_ashr_i32 s10, s10, 16 ; GFX9-NEXT: s_ashr_i32 s6, s6, 16 -; GFX9-NEXT: s_cmp_gt_i32 s5, s12 -; GFX9-NEXT: s_cselect_b32 s5, s5, s12 -; GFX9-NEXT: s_cmp_gt_i32 s11, s6 -; GFX9-NEXT: s_cselect_b32 s6, s11, s6 +; GFX9-NEXT: s_cmp_gt_i32 s5, s11 +; GFX9-NEXT: s_cselect_b32 s5, s5, s11 +; GFX9-NEXT: s_cmp_gt_i32 s10, s6 +; GFX9-NEXT: s_cselect_b32 s6, s10, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX9-NEXT: s_sext_i32_i16 s6, s5 -; GFX9-NEXT: s_sext_i32_i16 s11, s4 +; GFX9-NEXT: s_sext_i32_i16 s10, s4 ; GFX9-NEXT: s_ashr_i32 s5, s5, 16 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_lt_i32 s6, s11 -; GFX9-NEXT: s_cselect_b32 s6, s6, s11 +; GFX9-NEXT: s_cmp_lt_i32 s6, s10 +; GFX9-NEXT: s_cselect_b32 s6, s6, s10 ; GFX9-NEXT: s_cmp_lt_i32 s5, s4 ; GFX9-NEXT: s_cselect_b32 s4, s5, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s4 @@ -6939,23 +6847,23 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 ; GFX9-NEXT: s_sext_i32_i16 s4, s3 ; GFX9-NEXT: s_ashr_i32 s5, s3, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s6, s4, s13 -; GFX9-NEXT: s_cmp_gt_i32 s5, s10 -; GFX9-NEXT: s_cselect_b32 s11, s5, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s11 -; GFX9-NEXT: s_lshr_b32 s11, s6, 16 +; GFX9-NEXT: s_cmp_gt_i32 s4, s12 +; GFX9-NEXT: s_cselect_b32 s6, s4, s12 +; GFX9-NEXT: s_cmp_gt_i32 s5, -1 +; GFX9-NEXT: s_cselect_b32 s10, s5, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s10 +; GFX9-NEXT: s_lshr_b32 s10, s6, 16 ; GFX9-NEXT: s_sub_i32 s6, s6, s8 -; GFX9-NEXT: s_sub_i32 s8, 
s11, s16 -; GFX9-NEXT: s_cmp_lt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s4, s4, s13 -; GFX9-NEXT: s_cmp_lt_i32 s5, s10 -; GFX9-NEXT: s_cselect_b32 s5, s5, s10 +; GFX9-NEXT: s_sub_i32 s8, s10, s15 +; GFX9-NEXT: s_cmp_lt_i32 s4, s12 +; GFX9-NEXT: s_cselect_b32 s4, s4, s12 +; GFX9-NEXT: s_cmp_lt_i32 s5, -1 +; GFX9-NEXT: s_cselect_b32 s5, s5, -1 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_lshr_b32 s5, s4, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s8 ; GFX9-NEXT: s_sub_i32 s4, s4, s9 -; GFX9-NEXT: s_sub_i32 s5, s5, s15 +; GFX9-NEXT: s_sub_i32 s5, s5, s14 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_sext_i32_i16 s5, s6 ; GFX9-NEXT: s_sext_i32_i16 s8, s7 @@ -6984,166 +6892,162 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; ; GFX10-LABEL: s_ssubsat_v8i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s8, -1, -1 -; GFX10-NEXT: s_sext_i32_i16 s9, s0 -; GFX10-NEXT: s_sext_i32_i16 s11, s8 +; GFX10-NEXT: s_sext_i32_i16 s8, s0 +; GFX10-NEXT: s_sext_i32_i16 s9, -1 ; GFX10-NEXT: s_ashr_i32 s10, s0, 16 -; GFX10-NEXT: s_ashr_i32 s8, s8, 16 -; GFX10-NEXT: s_cmp_gt_i32 s9, s11 +; GFX10-NEXT: s_cmp_gt_i32 s8, s9 ; GFX10-NEXT: s_movk_i32 s14, 0x7fff -; GFX10-NEXT: s_cselect_b32 s12, s9, s11 -; GFX10-NEXT: s_cmp_gt_i32 s10, s8 -; GFX10-NEXT: s_movk_i32 s16, 0x8000 -; GFX10-NEXT: s_cselect_b32 s13, s10, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s16, s16, s16 -; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s13 -; GFX10-NEXT: s_pack_ll_b32_b16 s13, s14, s14 -; GFX10-NEXT: s_lshr_b32 s14, s12, 16 -; GFX10-NEXT: s_lshr_b32 s15, s13, 16 -; GFX10-NEXT: s_sub_i32 s12, s12, s13 -; GFX10-NEXT: s_sub_i32 s14, s14, s15 -; GFX10-NEXT: s_cmp_lt_i32 s9, s11 -; GFX10-NEXT: s_sext_i32_i16 s18, s4 -; GFX10-NEXT: s_cselect_b32 s9, s9, s11 -; GFX10-NEXT: s_cmp_lt_i32 s10, s8 +; GFX10-NEXT: s_cselect_b32 s11, s8, s9 +; GFX10-NEXT: s_cmp_gt_i32 s10, -1 +; GFX10-NEXT: s_mov_b32 s15, 0x80008000 +; GFX10-NEXT: s_cselect_b32 s12, s10, -1 +; GFX10-NEXT: s_sext_i32_i16 s17, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX10-NEXT: s_mov_b32 s12, 0x7fff7fff +; GFX10-NEXT: s_lshr_b32 s13, s11, 16 +; GFX10-NEXT: s_sub_i32 s11, s11, s12 +; GFX10-NEXT: s_sub_i32 s13, s13, s14 +; GFX10-NEXT: s_cmp_lt_i32 s8, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s10, s10, s8 +; GFX10-NEXT: s_cselect_b32 s8, s8, s9 +; GFX10-NEXT: s_cmp_lt_i32 s10, -1 +; GFX10-NEXT: s_cselect_b32 s10, s10, -1 ; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX10-NEXT: s_pack_ll_b32_b16 s10, s12, s14 -; GFX10-NEXT: s_lshr_b32 s12, s9, 16 -; GFX10-NEXT: s_lshr_b32 s14, s16, 16 -; GFX10-NEXT: s_sext_i32_i16 s17, s10 -; GFX10-NEXT: s_sub_i32 s9, s9, s16 -; GFX10-NEXT: s_sub_i32 s12, s12, s14 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s10 +; GFX10-NEXT: s_pack_ll_b32_b16 s10, s11, s13 +; GFX10-NEXT: s_lshr_b32 s11, s8, 16 +; GFX10-NEXT: s_mov_b32 s13, 0x8000 +; GFX10-NEXT: s_sext_i32_i16 s16, s10 +; GFX10-NEXT: s_sub_i32 s8, s8, s15 +; GFX10-NEXT: s_sub_i32 s11, s11, s13 ; GFX10-NEXT: s_ashr_i32 s10, s10, 16 -; GFX10-NEXT: s_cmp_gt_i32 s17, s18 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s12 -; GFX10-NEXT: s_cselect_b32 s17, s17, s18 +; GFX10-NEXT: s_cmp_gt_i32 s16, s17 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s11 +; GFX10-NEXT: s_cselect_b32 s16, s16, s17 ; GFX10-NEXT: s_cmp_gt_i32 s10, s4 -; GFX10-NEXT: s_sext_i32_i16 s12, s9 +; GFX10-NEXT: s_sext_i32_i16 s11, s8 ; GFX10-NEXT: s_cselect_b32 s4, s10, s4 -; GFX10-NEXT: s_ashr_i32 s9, s9, 16 -; GFX10-NEXT: 
s_pack_ll_b32_b16 s4, s17, s4 -; GFX10-NEXT: s_sext_i32_i16 s18, s5 +; GFX10-NEXT: s_ashr_i32 s8, s8, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX10-NEXT: s_sext_i32_i16 s17, s5 ; GFX10-NEXT: s_sext_i32_i16 s10, s4 ; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_i32 s10, s12 -; GFX10-NEXT: s_cselect_b32 s10, s10, s12 -; GFX10-NEXT: s_cmp_lt_i32 s4, s9 -; GFX10-NEXT: s_cselect_b32 s4, s4, s9 -; GFX10-NEXT: s_lshr_b32 s9, s0, 16 +; GFX10-NEXT: s_cmp_lt_i32 s10, s11 +; GFX10-NEXT: s_cselect_b32 s10, s10, s11 +; GFX10-NEXT: s_cmp_lt_i32 s4, s8 +; GFX10-NEXT: s_cselect_b32 s4, s4, s8 +; GFX10-NEXT: s_lshr_b32 s8, s0, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s4, s10, s4 ; GFX10-NEXT: s_lshr_b32 s10, s4, 16 ; GFX10-NEXT: s_sub_i32 s0, s0, s4 -; GFX10-NEXT: s_sub_i32 s4, s9, s10 -; GFX10-NEXT: s_sext_i32_i16 s9, s1 +; GFX10-NEXT: s_sub_i32 s4, s8, s10 +; GFX10-NEXT: s_sext_i32_i16 s8, s1 ; GFX10-NEXT: s_ashr_i32 s10, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s9, s11 +; GFX10-NEXT: s_cmp_gt_i32 s8, s9 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_cselect_b32 s12, s9, s11 -; GFX10-NEXT: s_cmp_gt_i32 s10, s8 -; GFX10-NEXT: s_cselect_b32 s17, s10, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s17 -; GFX10-NEXT: s_lshr_b32 s17, s12, 16 -; GFX10-NEXT: s_sub_i32 s12, s12, s13 -; GFX10-NEXT: s_sub_i32 s17, s17, s15 -; GFX10-NEXT: s_cmp_lt_i32 s9, s11 -; GFX10-NEXT: s_cselect_b32 s9, s9, s11 -; GFX10-NEXT: s_cmp_lt_i32 s10, s8 -; GFX10-NEXT: s_cselect_b32 s10, s10, s8 +; GFX10-NEXT: s_cselect_b32 s11, s8, s9 +; GFX10-NEXT: s_cmp_gt_i32 s10, -1 +; GFX10-NEXT: s_cselect_b32 s16, s10, -1 +; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s16 +; GFX10-NEXT: s_lshr_b32 s16, s11, 16 +; GFX10-NEXT: s_sub_i32 s11, s11, s12 +; GFX10-NEXT: s_sub_i32 s16, s16, s14 +; GFX10-NEXT: s_cmp_lt_i32 s8, s9 +; GFX10-NEXT: s_cselect_b32 s8, s8, s9 +; GFX10-NEXT: s_cmp_lt_i32 s10, -1 +; GFX10-NEXT: s_cselect_b32 s10, s10, -1 ; GFX10-NEXT: s_ashr_i32 s5, s5, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX10-NEXT: s_pack_ll_b32_b16 s10, s12, s17 -; GFX10-NEXT: s_lshr_b32 s12, s9, 16 -; GFX10-NEXT: s_sext_i32_i16 s17, s10 -; GFX10-NEXT: s_sub_i32 s9, s9, s16 -; GFX10-NEXT: s_sub_i32 s12, s12, s14 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s10 +; GFX10-NEXT: s_pack_ll_b32_b16 s10, s11, s16 +; GFX10-NEXT: s_lshr_b32 s11, s8, 16 +; GFX10-NEXT: s_sext_i32_i16 s16, s10 +; GFX10-NEXT: s_sub_i32 s8, s8, s15 +; GFX10-NEXT: s_sub_i32 s11, s11, s13 ; GFX10-NEXT: s_ashr_i32 s10, s10, 16 -; GFX10-NEXT: s_cmp_gt_i32 s17, s18 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s12 -; GFX10-NEXT: s_cselect_b32 s17, s17, s18 +; GFX10-NEXT: s_cmp_gt_i32 s16, s17 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s11 +; GFX10-NEXT: s_cselect_b32 s16, s16, s17 ; GFX10-NEXT: s_cmp_gt_i32 s10, s5 -; GFX10-NEXT: s_sext_i32_i16 s12, s9 +; GFX10-NEXT: s_sext_i32_i16 s11, s8 ; GFX10-NEXT: s_cselect_b32 s5, s10, s5 -; GFX10-NEXT: s_ashr_i32 s9, s9, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s17, s5 -; GFX10-NEXT: s_sext_i32_i16 s18, s6 +; GFX10-NEXT: s_ashr_i32 s8, s8, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX10-NEXT: s_sext_i32_i16 s17, s6 ; GFX10-NEXT: s_sext_i32_i16 s10, s5 ; GFX10-NEXT: s_ashr_i32 s5, s5, 16 -; GFX10-NEXT: s_cmp_lt_i32 s10, s12 -; GFX10-NEXT: s_cselect_b32 s10, s10, s12 -; GFX10-NEXT: s_cmp_lt_i32 s5, s9 -; GFX10-NEXT: s_cselect_b32 s5, s5, s9 -; GFX10-NEXT: s_lshr_b32 s9, s1, 16 +; GFX10-NEXT: s_cmp_lt_i32 s10, s11 +; GFX10-NEXT: s_cselect_b32 s10, s10, s11 +; GFX10-NEXT: s_cmp_lt_i32 s5, s8 +; GFX10-NEXT: s_cselect_b32 s5, 
s5, s8 +; GFX10-NEXT: s_lshr_b32 s8, s1, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s5, s10, s5 ; GFX10-NEXT: s_lshr_b32 s10, s5, 16 ; GFX10-NEXT: s_sub_i32 s1, s1, s5 -; GFX10-NEXT: s_sub_i32 s5, s9, s10 -; GFX10-NEXT: s_sext_i32_i16 s9, s2 +; GFX10-NEXT: s_sub_i32 s5, s8, s10 +; GFX10-NEXT: s_sext_i32_i16 s8, s2 ; GFX10-NEXT: s_ashr_i32 s10, s2, 16 -; GFX10-NEXT: s_cmp_gt_i32 s9, s11 +; GFX10-NEXT: s_cmp_gt_i32 s8, s9 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_cselect_b32 s12, s9, s11 -; GFX10-NEXT: s_cmp_gt_i32 s10, s8 -; GFX10-NEXT: s_cselect_b32 s17, s10, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s17 -; GFX10-NEXT: s_lshr_b32 s17, s12, 16 -; GFX10-NEXT: s_sub_i32 s12, s12, s13 -; GFX10-NEXT: s_sub_i32 s17, s17, s15 -; GFX10-NEXT: s_cmp_lt_i32 s9, s11 -; GFX10-NEXT: s_cselect_b32 s9, s9, s11 -; GFX10-NEXT: s_cmp_lt_i32 s10, s8 -; GFX10-NEXT: s_cselect_b32 s10, s10, s8 +; GFX10-NEXT: s_cselect_b32 s11, s8, s9 +; GFX10-NEXT: s_cmp_gt_i32 s10, -1 +; GFX10-NEXT: s_cselect_b32 s16, s10, -1 +; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s16 +; GFX10-NEXT: s_lshr_b32 s16, s11, 16 +; GFX10-NEXT: s_sub_i32 s11, s11, s12 +; GFX10-NEXT: s_sub_i32 s16, s16, s14 +; GFX10-NEXT: s_cmp_lt_i32 s8, s9 +; GFX10-NEXT: s_cselect_b32 s8, s8, s9 +; GFX10-NEXT: s_cmp_lt_i32 s10, -1 +; GFX10-NEXT: s_cselect_b32 s10, s10, -1 ; GFX10-NEXT: s_ashr_i32 s6, s6, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX10-NEXT: s_pack_ll_b32_b16 s10, s12, s17 -; GFX10-NEXT: s_lshr_b32 s12, s9, 16 -; GFX10-NEXT: s_sext_i32_i16 s17, s10 -; GFX10-NEXT: s_sub_i32 s9, s9, s16 -; GFX10-NEXT: s_sub_i32 s12, s12, s14 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s10 +; GFX10-NEXT: s_pack_ll_b32_b16 s10, s11, s16 +; GFX10-NEXT: s_lshr_b32 s11, s8, 16 +; GFX10-NEXT: s_sext_i32_i16 s16, s10 +; GFX10-NEXT: s_sub_i32 s8, s8, s15 +; GFX10-NEXT: s_sub_i32 s11, s11, s13 ; GFX10-NEXT: s_ashr_i32 s10, s10, 16 -; GFX10-NEXT: s_cmp_gt_i32 s17, s18 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s12 -; GFX10-NEXT: s_cselect_b32 s17, s17, s18 +; GFX10-NEXT: s_cmp_gt_i32 s16, s17 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s11 +; GFX10-NEXT: s_cselect_b32 s16, s16, s17 ; GFX10-NEXT: s_cmp_gt_i32 s10, s6 -; GFX10-NEXT: s_sext_i32_i16 s12, s9 +; GFX10-NEXT: s_sext_i32_i16 s11, s8 ; GFX10-NEXT: s_cselect_b32 s6, s10, s6 -; GFX10-NEXT: s_ashr_i32 s9, s9, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX10-NEXT: s_ashr_i32 s8, s8, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX10-NEXT: s_sext_i32_i16 s10, s6 ; GFX10-NEXT: s_ashr_i32 s6, s6, 16 -; GFX10-NEXT: s_cmp_lt_i32 s10, s12 -; GFX10-NEXT: s_cselect_b32 s10, s10, s12 -; GFX10-NEXT: s_cmp_lt_i32 s6, s9 -; GFX10-NEXT: s_cselect_b32 s6, s6, s9 -; GFX10-NEXT: s_lshr_b32 s9, s2, 16 +; GFX10-NEXT: s_cmp_lt_i32 s10, s11 +; GFX10-NEXT: s_cselect_b32 s10, s10, s11 +; GFX10-NEXT: s_cmp_lt_i32 s6, s8 +; GFX10-NEXT: s_cselect_b32 s6, s6, s8 +; GFX10-NEXT: s_lshr_b32 s8, s2, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s6, s10, s6 ; GFX10-NEXT: s_lshr_b32 s10, s6, 16 ; GFX10-NEXT: s_sub_i32 s2, s2, s6 -; GFX10-NEXT: s_sub_i32 s6, s9, s10 -; GFX10-NEXT: s_sext_i32_i16 s9, s3 +; GFX10-NEXT: s_sub_i32 s6, s8, s10 +; GFX10-NEXT: s_sext_i32_i16 s8, s3 ; GFX10-NEXT: s_ashr_i32 s10, s3, 16 -; GFX10-NEXT: s_cmp_gt_i32 s9, s11 +; GFX10-NEXT: s_cmp_gt_i32 s8, s9 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s12, s9, s11 -; GFX10-NEXT: s_cmp_gt_i32 s10, s8 -; GFX10-NEXT: s_cselect_b32 s17, s10, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s17 -; GFX10-NEXT: s_lshr_b32 s17, s12, 16 -; 
GFX10-NEXT: s_sub_i32 s12, s12, s13 -; GFX10-NEXT: s_sub_i32 s13, s17, s15 -; GFX10-NEXT: s_cmp_lt_i32 s9, s11 -; GFX10-NEXT: s_cselect_b32 s9, s9, s11 -; GFX10-NEXT: s_cmp_lt_i32 s10, s8 -; GFX10-NEXT: s_cselect_b32 s8, s10, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s8, s9, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s12, s13 +; GFX10-NEXT: s_cselect_b32 s11, s8, s9 +; GFX10-NEXT: s_cmp_gt_i32 s10, -1 +; GFX10-NEXT: s_cselect_b32 s16, s10, -1 +; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s16 +; GFX10-NEXT: s_lshr_b32 s16, s11, 16 +; GFX10-NEXT: s_sub_i32 s11, s11, s12 +; GFX10-NEXT: s_sub_i32 s12, s16, s14 +; GFX10-NEXT: s_cmp_lt_i32 s8, s9 +; GFX10-NEXT: s_cselect_b32 s8, s8, s9 +; GFX10-NEXT: s_cmp_lt_i32 s10, -1 +; GFX10-NEXT: s_cselect_b32 s9, s10, -1 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX10-NEXT: s_pack_ll_b32_b16 s9, s11, s12 ; GFX10-NEXT: s_lshr_b32 s10, s8, 16 ; GFX10-NEXT: s_sext_i32_i16 s11, s9 ; GFX10-NEXT: s_sext_i32_i16 s12, s7 -; GFX10-NEXT: s_sub_i32 s8, s8, s16 -; GFX10-NEXT: s_sub_i32 s10, s10, s14 +; GFX10-NEXT: s_sub_i32 s8, s8, s15 +; GFX10-NEXT: s_sub_i32 s10, s10, s13 ; GFX10-NEXT: s_ashr_i32 s9, s9, 16 ; GFX10-NEXT: s_ashr_i32 s7, s7, 16 ; GFX10-NEXT: s_cmp_gt_i32 s11, s12 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index 5570309a5be7d..3a742fbcbd919 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -2371,8 +2371,7 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX9-LABEL: v_uaddsat_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX9-NEXT: v_xor_b32_e32 v2, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v0 ; GFX9-NEXT: v_pk_min_u16 v1, v2, v1 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2381,9 +2380,8 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v2, s4, v0 ; GFX10-NEXT: v_pk_min_u16 v1, v2, v1 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -2439,8 +2437,7 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; ; GFX9-LABEL: s_uaddsat_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s2, -1, -1 -; GFX9-NEXT: s_xor_b32 s2, s0, s2 +; GFX9-NEXT: s_xor_b32 s2, s0, -1 ; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 ; GFX9-NEXT: s_lshr_b32 s5, s1, 16 @@ -2460,15 +2457,14 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; ; GFX10-LABEL: s_uaddsat_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s2, -1, -1 +; GFX10-NEXT: s_xor_b32 s2, s0, -1 ; GFX10-NEXT: s_mov_b32 s3, 0xffff -; GFX10-NEXT: s_xor_b32 s2, s0, s2 -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_lshr_b32 s4, s2, 16 ; GFX10-NEXT: s_and_b32 s2, s2, s3 ; GFX10-NEXT: s_and_b32 s3, s1, s3 ; GFX10-NEXT: s_lshr_b32 s1, s1, 16 ; GFX10-NEXT: s_cmp_lt_u32 s2, s3 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_cselect_b32 s2, s2, s3 ; GFX10-NEXT: s_cmp_lt_u32 s4, s1 ; GFX10-NEXT: s_cselect_b32 s1, s4, s1 @@ -2522,17 +2518,15 @@ define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; ; GFX9-LABEL: uaddsat_v2i16_sv: ; 
GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s1, -1, -1 -; GFX9-NEXT: s_xor_b32 s1, s0, s1 +; GFX9-NEXT: s_xor_b32 s1, s0, -1 ; GFX9-NEXT: v_pk_min_u16 v0, s1, v0 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: uaddsat_v2i16_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s1, -1, -1 +; GFX10-NEXT: s_xor_b32 s1, s0, -1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_xor_b32 s1, s0, s1 ; GFX10-NEXT: v_pk_min_u16 v0, s1, v0 ; GFX10-NEXT: v_pk_add_u16 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -2578,17 +2572,15 @@ define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX9-LABEL: uaddsat_v2i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s1, -1, -1 -; GFX9-NEXT: v_xor_b32_e32 v1, s1, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX9-NEXT: v_pk_min_u16 v1, v1, s0 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: uaddsat_v2i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s1, -1, -1 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v1, s1, v0 ; GFX10-NEXT: v_pk_min_u16 v1, v1, s0 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -2671,11 +2663,10 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX9-LABEL: v_uaddsat_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v0 ; GFX9-NEXT: v_pk_min_u16 v2, v4, v2 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v1 ; GFX9-NEXT: v_pk_min_u16 v2, v2, v3 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2684,10 +2675,9 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v0 +; GFX10-NEXT: v_xor_b32_e32 v5, -1, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v4, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v5, s4, v1 ; GFX10-NEXT: v_pk_min_u16 v2, v4, v2 ; GFX10-NEXT: v_pk_min_u16 v3, v5, v3 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v2 @@ -2782,28 +2772,27 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; ; GFX9-LABEL: s_uaddsat_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX9-NEXT: s_xor_b32 s5, s0, s4 -; GFX9-NEXT: s_mov_b32 s7, 0xffff -; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: s_lshr_b32 s8, s2, 16 -; GFX9-NEXT: s_and_b32 s5, s5, s7 -; GFX9-NEXT: s_and_b32 s2, s2, s7 -; GFX9-NEXT: s_cmp_lt_u32 s5, s2 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_cmp_lt_u32 s6, s8 -; GFX9-NEXT: s_cselect_b32 s5, s6, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5 -; GFX9-NEXT: s_lshr_b32 s5, s0, 16 -; GFX9-NEXT: s_lshr_b32 s6, s2, 16 +; GFX9-NEXT: s_xor_b32 s4, s0, -1 +; GFX9-NEXT: s_mov_b32 s6, 0xffff +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 +; GFX9-NEXT: s_lshr_b32 s7, s2, 16 +; GFX9-NEXT: s_and_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s2, s2, s6 +; GFX9-NEXT: s_cmp_lt_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s2, s4, s2 +; GFX9-NEXT: s_cmp_lt_u32 s5, s7 +; GFX9-NEXT: s_cselect_b32 s4, s5, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; 
GFX9-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-NEXT: s_add_i32 s0, s0, s2 -; GFX9-NEXT: s_xor_b32 s2, s1, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX9-NEXT: s_add_i32 s4, s4, s5 +; GFX9-NEXT: s_xor_b32 s2, s1, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX9-NEXT: s_lshr_b32 s4, s2, 16 ; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s7 -; GFX9-NEXT: s_and_b32 s3, s3, s7 +; GFX9-NEXT: s_and_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b32 s3, s3, s6 ; GFX9-NEXT: s_cmp_lt_u32 s2, s3 ; GFX9-NEXT: s_cselect_b32 s2, s2, s3 ; GFX9-NEXT: s_cmp_lt_u32 s4, s5 @@ -2818,38 +2807,37 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; ; GFX10-LABEL: s_uaddsat_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX10-NEXT: s_mov_b32 s6, 0xffff -; GFX10-NEXT: s_xor_b32 s5, s0, s4 -; GFX10-NEXT: s_and_b32 s8, s2, s6 -; GFX10-NEXT: s_lshr_b32 s7, s5, 16 -; GFX10-NEXT: s_and_b32 s5, s5, s6 +; GFX10-NEXT: s_xor_b32 s4, s0, -1 +; GFX10-NEXT: s_mov_b32 s5, 0xffff +; GFX10-NEXT: s_lshr_b32 s6, s4, 16 +; GFX10-NEXT: s_and_b32 s7, s2, s5 +; GFX10-NEXT: s_and_b32 s4, s4, s5 ; GFX10-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-NEXT: s_cmp_lt_u32 s5, s8 +; GFX10-NEXT: s_cmp_lt_u32 s4, s7 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s5, s5, s8 -; GFX10-NEXT: s_cmp_lt_u32 s7, s2 -; GFX10-NEXT: s_cselect_b32 s2, s7, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s5, s2 -; GFX10-NEXT: s_lshr_b32 s5, s0, 16 -; GFX10-NEXT: s_lshr_b32 s7, s2, 16 +; GFX10-NEXT: s_cselect_b32 s4, s4, s7 +; GFX10-NEXT: s_cmp_lt_u32 s6, s2 +; GFX10-NEXT: s_cselect_b32 s2, s6, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s4, s2 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s6, s2, 16 ; GFX10-NEXT: s_add_i32 s0, s0, s2 -; GFX10-NEXT: s_xor_b32 s2, s1, s4 -; GFX10-NEXT: s_add_i32 s5, s5, s7 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s2, s6 -; GFX10-NEXT: s_and_b32 s6, s3, s6 +; GFX10-NEXT: s_xor_b32 s2, s1, -1 +; GFX10-NEXT: s_add_i32 s4, s4, s6 +; GFX10-NEXT: s_lshr_b32 s6, s2, 16 +; GFX10-NEXT: s_and_b32 s2, s2, s5 +; GFX10-NEXT: s_and_b32 s5, s3, s5 ; GFX10-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-NEXT: s_cmp_lt_u32 s2, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s5 -; GFX10-NEXT: s_cselect_b32 s2, s2, s6 -; GFX10-NEXT: s_cmp_lt_u32 s4, s3 -; GFX10-NEXT: s_cselect_b32 s3, s4, s3 +; GFX10-NEXT: s_cmp_lt_u32 s2, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX10-NEXT: s_cselect_b32 s2, s2, s5 +; GFX10-NEXT: s_cmp_lt_u32 s6, s3 +; GFX10-NEXT: s_cselect_b32 s3, s6, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s3 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_lshr_b32 s5, s2, 16 ; GFX10-NEXT: s_add_i32 s1, s1, s2 -; GFX10-NEXT: s_add_i32 s3, s3, s4 +; GFX10-NEXT: s_add_i32 s3, s3, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: ; return to shader part epilog %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) @@ -2955,14 +2943,13 @@ define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX9-LABEL: v_uaddsat_v6i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX9-NEXT: v_xor_b32_e32 v6, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v6, -1, v0 ; GFX9-NEXT: v_pk_min_u16 v3, v6, v3 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v3 -; GFX9-NEXT: v_xor_b32_e32 v3, s4, v1 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v1 ; GFX9-NEXT: v_pk_min_u16 v3, v3, v4 ; GFX9-NEXT: 
v_pk_add_u16 v1, v1, v3 -; GFX9-NEXT: v_xor_b32_e32 v3, s4, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX9-NEXT: v_pk_min_u16 v3, v3, v5 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2971,11 +2958,10 @@ define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v0 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v1 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v6, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v7, s4, v1 -; GFX10-NEXT: v_xor_b32_e32 v8, s4, v2 ; GFX10-NEXT: v_pk_min_u16 v3, v6, v3 ; GFX10-NEXT: v_pk_min_u16 v4, v7, v4 ; GFX10-NEXT: v_pk_min_u16 v5, v8, v5 @@ -3108,43 +3094,42 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; ; GFX9-LABEL: s_uaddsat_v6i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX9-NEXT: s_xor_b32 s7, s0, s6 -; GFX9-NEXT: s_mov_b32 s9, 0xffff -; GFX9-NEXT: s_lshr_b32 s8, s7, 16 -; GFX9-NEXT: s_lshr_b32 s10, s3, 16 -; GFX9-NEXT: s_and_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s3, s3, s9 -; GFX9-NEXT: s_cmp_lt_u32 s7, s3 -; GFX9-NEXT: s_cselect_b32 s3, s7, s3 -; GFX9-NEXT: s_cmp_lt_u32 s8, s10 -; GFX9-NEXT: s_cselect_b32 s7, s8, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s7 -; GFX9-NEXT: s_lshr_b32 s7, s0, 16 -; GFX9-NEXT: s_lshr_b32 s8, s3, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s3 -; GFX9-NEXT: s_add_i32 s7, s7, s8 -; GFX9-NEXT: s_xor_b32 s3, s1, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX9-NEXT: s_xor_b32 s6, s0, -1 +; GFX9-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NEXT: s_lshr_b32 s7, s6, 16 +; GFX9-NEXT: s_lshr_b32 s9, s3, 16 +; GFX9-NEXT: s_and_b32 s6, s6, s8 +; GFX9-NEXT: s_and_b32 s3, s3, s8 +; GFX9-NEXT: s_cmp_lt_u32 s6, s3 +; GFX9-NEXT: s_cselect_b32 s3, s6, s3 +; GFX9-NEXT: s_cmp_lt_u32 s7, s9 +; GFX9-NEXT: s_cselect_b32 s6, s7, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-NEXT: s_lshr_b32 s8, s4, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s9 -; GFX9-NEXT: s_and_b32 s4, s4, s9 +; GFX9-NEXT: s_add_i32 s0, s0, s3 +; GFX9-NEXT: s_add_i32 s6, s6, s7 +; GFX9-NEXT: s_xor_b32 s3, s1, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NEXT: s_lshr_b32 s7, s4, 16 +; GFX9-NEXT: s_and_b32 s3, s3, s8 +; GFX9-NEXT: s_and_b32 s4, s4, s8 ; GFX9-NEXT: s_cmp_lt_u32 s3, s4 ; GFX9-NEXT: s_cselect_b32 s3, s3, s4 -; GFX9-NEXT: s_cmp_lt_u32 s7, s8 -; GFX9-NEXT: s_cselect_b32 s4, s7, s8 +; GFX9-NEXT: s_cmp_lt_u32 s6, s7 +; GFX9-NEXT: s_cselect_b32 s4, s6, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_lshr_b32 s7, s3, 16 +; GFX9-NEXT: s_lshr_b32 s6, s3, 16 ; GFX9-NEXT: s_add_i32 s1, s1, s3 -; GFX9-NEXT: s_add_i32 s4, s4, s7 -; GFX9-NEXT: s_xor_b32 s3, s2, s6 +; GFX9-NEXT: s_add_i32 s4, s4, s6 +; GFX9-NEXT: s_xor_b32 s3, s2, -1 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s9 -; GFX9-NEXT: s_and_b32 s5, s5, s9 +; GFX9-NEXT: s_and_b32 s3, s3, s8 +; GFX9-NEXT: s_and_b32 s5, s5, s8 ; GFX9-NEXT: s_cmp_lt_u32 s3, s5 ; GFX9-NEXT: s_cselect_b32 s3, s3, s5 ; GFX9-NEXT: s_cmp_lt_u32 s4, s6 @@ -3159,48 +3144,47 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; ; 
GFX10-LABEL: s_uaddsat_v6i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX10-NEXT: s_mov_b32 s8, 0xffff -; GFX10-NEXT: s_xor_b32 s7, s0, s6 -; GFX10-NEXT: s_and_b32 s10, s3, s8 -; GFX10-NEXT: s_lshr_b32 s9, s7, 16 -; GFX10-NEXT: s_and_b32 s7, s7, s8 +; GFX10-NEXT: s_xor_b32 s6, s0, -1 +; GFX10-NEXT: s_mov_b32 s7, 0xffff +; GFX10-NEXT: s_lshr_b32 s8, s6, 16 +; GFX10-NEXT: s_and_b32 s9, s3, s7 +; GFX10-NEXT: s_and_b32 s6, s6, s7 ; GFX10-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-NEXT: s_cmp_lt_u32 s7, s10 +; GFX10-NEXT: s_cmp_lt_u32 s6, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s7, s7, s10 -; GFX10-NEXT: s_cmp_lt_u32 s9, s3 -; GFX10-NEXT: s_cselect_b32 s3, s9, s3 -; GFX10-NEXT: s_and_b32 s10, s4, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s7, s3 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 -; GFX10-NEXT: s_lshr_b32 s9, s3, 16 +; GFX10-NEXT: s_cselect_b32 s6, s6, s9 +; GFX10-NEXT: s_cmp_lt_u32 s8, s3 +; GFX10-NEXT: s_cselect_b32 s3, s8, s3 +; GFX10-NEXT: s_and_b32 s9, s4, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s3 +; GFX10-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-NEXT: s_lshr_b32 s8, s3, 16 ; GFX10-NEXT: s_add_i32 s0, s0, s3 -; GFX10-NEXT: s_xor_b32 s3, s1, s6 -; GFX10-NEXT: s_add_i32 s7, s7, s9 -; GFX10-NEXT: s_lshr_b32 s9, s3, 16 -; GFX10-NEXT: s_and_b32 s3, s3, s8 +; GFX10-NEXT: s_xor_b32 s3, s1, -1 +; GFX10-NEXT: s_add_i32 s6, s6, s8 +; GFX10-NEXT: s_lshr_b32 s8, s3, 16 +; GFX10-NEXT: s_and_b32 s3, s3, s7 ; GFX10-NEXT: s_lshr_b32 s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_u32 s3, s10 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX10-NEXT: s_cselect_b32 s3, s3, s10 -; GFX10-NEXT: s_cmp_lt_u32 s9, s4 -; GFX10-NEXT: s_cselect_b32 s4, s9, s4 +; GFX10-NEXT: s_cmp_lt_u32 s3, s9 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX10-NEXT: s_cselect_b32 s3, s3, s9 +; GFX10-NEXT: s_cmp_lt_u32 s8, s4 +; GFX10-NEXT: s_cselect_b32 s4, s8, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshr_b32 s9, s3, 16 +; GFX10-NEXT: s_lshr_b32 s8, s3, 16 ; GFX10-NEXT: s_add_i32 s1, s1, s3 -; GFX10-NEXT: s_xor_b32 s3, s2, s6 -; GFX10-NEXT: s_add_i32 s4, s4, s9 -; GFX10-NEXT: s_lshr_b32 s6, s3, 16 -; GFX10-NEXT: s_and_b32 s3, s3, s8 -; GFX10-NEXT: s_and_b32 s8, s5, s8 +; GFX10-NEXT: s_xor_b32 s3, s2, -1 +; GFX10-NEXT: s_add_i32 s4, s4, s8 +; GFX10-NEXT: s_lshr_b32 s8, s3, 16 +; GFX10-NEXT: s_and_b32 s3, s3, s7 +; GFX10-NEXT: s_and_b32 s7, s5, s7 ; GFX10-NEXT: s_lshr_b32 s5, s5, 16 -; GFX10-NEXT: s_cmp_lt_u32 s3, s8 +; GFX10-NEXT: s_cmp_lt_u32 s3, s7 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s8 -; GFX10-NEXT: s_cmp_lt_u32 s6, s5 -; GFX10-NEXT: s_cselect_b32 s5, s6, s5 +; GFX10-NEXT: s_cselect_b32 s3, s3, s7 +; GFX10-NEXT: s_cmp_lt_u32 s8, s5 +; GFX10-NEXT: s_cselect_b32 s5, s8, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX10-NEXT: s_lshr_b32 s5, s2, 16 ; GFX10-NEXT: s_lshr_b32 s6, s3, 16 @@ -3324,17 +3308,16 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX9-LABEL: v_uaddsat_v8i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX9-NEXT: v_xor_b32_e32 v8, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v8, -1, v0 ; GFX9-NEXT: v_pk_min_u16 v4, v8, v4 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v1 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v1 ; GFX9-NEXT: v_pk_min_u16 v4, v4, v5 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v2 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v2 
; GFX9-NEXT: v_pk_min_u16 v4, v4, v6 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v3 ; GFX9-NEXT: v_pk_min_u16 v4, v4, v7 ; GFX9-NEXT: v_pk_add_u16 v3, v3, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3343,12 +3326,11 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1 +; GFX10-NEXT: v_xor_b32_e32 v15, -1, v0 +; GFX10-NEXT: v_xor_b32_e32 v19, -1, v1 +; GFX10-NEXT: v_xor_b32_e32 v23, -1, v2 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v3 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v15, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v19, s4, v1 -; GFX10-NEXT: v_xor_b32_e32 v23, s4, v2 -; GFX10-NEXT: v_xor_b32_e32 v10, s4, v3 ; GFX10-NEXT: v_pk_min_u16 v11, v15, v4 ; GFX10-NEXT: v_pk_min_u16 v15, v19, v5 ; GFX10-NEXT: v_pk_min_u16 v19, v23, v6 @@ -3519,58 +3501,57 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; ; GFX9-LABEL: s_uaddsat_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s8, -1, -1 -; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_mov_b32 s11, 0xffff -; GFX9-NEXT: s_lshr_b32 s10, s9, 16 -; GFX9-NEXT: s_lshr_b32 s12, s4, 16 -; GFX9-NEXT: s_and_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s4, s4, s11 -; GFX9-NEXT: s_cmp_lt_u32 s9, s4 -; GFX9-NEXT: s_cselect_b32 s4, s9, s4 -; GFX9-NEXT: s_cmp_lt_u32 s10, s12 -; GFX9-NEXT: s_cselect_b32 s9, s10, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s9 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_lshr_b32 s10, s4, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s4 -; GFX9-NEXT: s_add_i32 s9, s9, s10 -; GFX9-NEXT: s_xor_b32 s4, s1, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX9-NEXT: s_xor_b32 s8, s0, -1 +; GFX9-NEXT: s_mov_b32 s10, 0xffff +; GFX9-NEXT: s_lshr_b32 s9, s8, 16 +; GFX9-NEXT: s_lshr_b32 s11, s4, 16 +; GFX9-NEXT: s_and_b32 s8, s8, s10 +; GFX9-NEXT: s_and_b32 s4, s4, s10 +; GFX9-NEXT: s_cmp_lt_u32 s8, s4 +; GFX9-NEXT: s_cselect_b32 s4, s8, s4 +; GFX9-NEXT: s_cmp_lt_u32 s9, s11 +; GFX9-NEXT: s_cselect_b32 s8, s9, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s8 +; GFX9-NEXT: s_lshr_b32 s8, s0, 16 ; GFX9-NEXT: s_lshr_b32 s9, s4, 16 -; GFX9-NEXT: s_lshr_b32 s10, s5, 16 -; GFX9-NEXT: s_and_b32 s4, s4, s11 -; GFX9-NEXT: s_and_b32 s5, s5, s11 +; GFX9-NEXT: s_add_i32 s0, s0, s4 +; GFX9-NEXT: s_add_i32 s8, s8, s9 +; GFX9-NEXT: s_xor_b32 s4, s1, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX9-NEXT: s_lshr_b32 s8, s4, 16 +; GFX9-NEXT: s_lshr_b32 s9, s5, 16 +; GFX9-NEXT: s_and_b32 s4, s4, s10 +; GFX9-NEXT: s_and_b32 s5, s5, s10 ; GFX9-NEXT: s_cmp_lt_u32 s4, s5 ; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_cmp_lt_u32 s9, s10 -; GFX9-NEXT: s_cselect_b32 s5, s9, s10 +; GFX9-NEXT: s_cmp_lt_u32 s8, s9 +; GFX9-NEXT: s_cselect_b32 s5, s8, s9 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_lshr_b32 s9, s4, 16 +; GFX9-NEXT: s_lshr_b32 s8, s4, 16 ; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s9 -; GFX9-NEXT: s_xor_b32 s4, s2, s8 +; GFX9-NEXT: s_add_i32 s5, s5, s8 +; GFX9-NEXT: s_xor_b32 s4, s2, -1 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 ; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_lshr_b32 s9, s6, 16 -; GFX9-NEXT: s_and_b32 s4, s4, s11 -; GFX9-NEXT: s_and_b32 s6, s6, s11 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_and_b32 s4, s4, s10 +; GFX9-NEXT: s_and_b32 s6, s6, s10 ; GFX9-NEXT: 
s_cmp_lt_u32 s4, s6 ; GFX9-NEXT: s_cselect_b32 s4, s4, s6 -; GFX9-NEXT: s_cmp_lt_u32 s5, s9 -; GFX9-NEXT: s_cselect_b32 s5, s5, s9 +; GFX9-NEXT: s_cmp_lt_u32 s5, s8 +; GFX9-NEXT: s_cselect_b32 s5, s5, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-NEXT: s_lshr_b32 s6, s4, 16 ; GFX9-NEXT: s_add_i32 s2, s2, s4 ; GFX9-NEXT: s_add_i32 s5, s5, s6 -; GFX9-NEXT: s_xor_b32 s4, s3, s8 +; GFX9-NEXT: s_xor_b32 s4, s3, -1 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5 ; GFX9-NEXT: s_lshr_b32 s5, s4, 16 ; GFX9-NEXT: s_lshr_b32 s6, s7, 16 -; GFX9-NEXT: s_and_b32 s4, s4, s11 -; GFX9-NEXT: s_and_b32 s7, s7, s11 +; GFX9-NEXT: s_and_b32 s4, s4, s10 +; GFX9-NEXT: s_and_b32 s7, s7, s10 ; GFX9-NEXT: s_cmp_lt_u32 s4, s7 ; GFX9-NEXT: s_cselect_b32 s4, s4, s7 ; GFX9-NEXT: s_cmp_lt_u32 s5, s6 @@ -3585,63 +3566,62 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; ; GFX10-LABEL: s_uaddsat_v8i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s8, -1, -1 -; GFX10-NEXT: s_mov_b32 s10, 0xffff -; GFX10-NEXT: s_xor_b32 s9, s0, s8 -; GFX10-NEXT: s_and_b32 s12, s4, s10 -; GFX10-NEXT: s_lshr_b32 s11, s9, 16 -; GFX10-NEXT: s_and_b32 s9, s9, s10 +; GFX10-NEXT: s_xor_b32 s8, s0, -1 +; GFX10-NEXT: s_mov_b32 s9, 0xffff +; GFX10-NEXT: s_lshr_b32 s10, s8, 16 +; GFX10-NEXT: s_and_b32 s11, s4, s9 +; GFX10-NEXT: s_and_b32 s8, s8, s9 ; GFX10-NEXT: s_lshr_b32 s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_u32 s9, s12 +; GFX10-NEXT: s_cmp_lt_u32 s8, s11 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s9, s9, s12 -; GFX10-NEXT: s_cmp_lt_u32 s11, s4 -; GFX10-NEXT: s_cselect_b32 s4, s11, s4 -; GFX10-NEXT: s_and_b32 s12, s5, s10 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s9, s4 -; GFX10-NEXT: s_lshr_b32 s9, s0, 16 -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 +; GFX10-NEXT: s_cselect_b32 s8, s8, s11 +; GFX10-NEXT: s_cmp_lt_u32 s10, s4 +; GFX10-NEXT: s_cselect_b32 s4, s10, s4 +; GFX10-NEXT: s_and_b32 s11, s5, s9 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s8, s4 +; GFX10-NEXT: s_lshr_b32 s8, s0, 16 +; GFX10-NEXT: s_lshr_b32 s10, s4, 16 ; GFX10-NEXT: s_add_i32 s0, s0, s4 -; GFX10-NEXT: s_xor_b32 s4, s1, s8 -; GFX10-NEXT: s_add_i32 s9, s9, s11 -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 -; GFX10-NEXT: s_and_b32 s4, s4, s10 +; GFX10-NEXT: s_xor_b32 s4, s1, -1 +; GFX10-NEXT: s_add_i32 s8, s8, s10 +; GFX10-NEXT: s_lshr_b32 s10, s4, 16 +; GFX10-NEXT: s_and_b32 s4, s4, s9 ; GFX10-NEXT: s_lshr_b32 s5, s5, 16 -; GFX10-NEXT: s_cmp_lt_u32 s4, s12 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s9 -; GFX10-NEXT: s_cselect_b32 s4, s4, s12 -; GFX10-NEXT: s_cmp_lt_u32 s11, s5 -; GFX10-NEXT: s_cselect_b32 s5, s11, s5 -; GFX10-NEXT: s_and_b32 s12, s6, s10 +; GFX10-NEXT: s_cmp_lt_u32 s4, s11 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX10-NEXT: s_cselect_b32 s4, s4, s11 +; GFX10-NEXT: s_cmp_lt_u32 s10, s5 +; GFX10-NEXT: s_cselect_b32 s5, s10, s5 +; GFX10-NEXT: s_and_b32 s11, s6, s9 ; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 +; GFX10-NEXT: s_lshr_b32 s10, s4, 16 ; GFX10-NEXT: s_add_i32 s1, s1, s4 -; GFX10-NEXT: s_xor_b32 s4, s2, s8 -; GFX10-NEXT: s_add_i32 s5, s5, s11 -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 -; GFX10-NEXT: s_and_b32 s4, s4, s10 +; GFX10-NEXT: s_xor_b32 s4, s2, -1 +; GFX10-NEXT: s_add_i32 s5, s5, s10 +; GFX10-NEXT: s_lshr_b32 s10, s4, 16 +; GFX10-NEXT: s_and_b32 s4, s4, s9 ; GFX10-NEXT: s_lshr_b32 s6, s6, 16 -; GFX10-NEXT: s_cmp_lt_u32 s4, s12 +; GFX10-NEXT: s_cmp_lt_u32 s4, s11 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; 
GFX10-NEXT: s_cselect_b32 s4, s4, s12 -; GFX10-NEXT: s_cmp_lt_u32 s11, s6 -; GFX10-NEXT: s_cselect_b32 s6, s11, s6 +; GFX10-NEXT: s_cselect_b32 s4, s4, s11 +; GFX10-NEXT: s_cmp_lt_u32 s10, s6 +; GFX10-NEXT: s_cselect_b32 s6, s10, s6 ; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s6 ; GFX10-NEXT: s_lshr_b32 s6, s2, 16 -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 +; GFX10-NEXT: s_lshr_b32 s10, s4, 16 ; GFX10-NEXT: s_add_i32 s2, s2, s4 -; GFX10-NEXT: s_xor_b32 s4, s3, s8 -; GFX10-NEXT: s_add_i32 s6, s6, s11 -; GFX10-NEXT: s_lshr_b32 s8, s4, 16 -; GFX10-NEXT: s_and_b32 s4, s4, s10 -; GFX10-NEXT: s_and_b32 s10, s7, s10 +; GFX10-NEXT: s_xor_b32 s4, s3, -1 +; GFX10-NEXT: s_add_i32 s6, s6, s10 +; GFX10-NEXT: s_lshr_b32 s10, s4, 16 +; GFX10-NEXT: s_and_b32 s4, s4, s9 +; GFX10-NEXT: s_and_b32 s9, s7, s9 ; GFX10-NEXT: s_lshr_b32 s7, s7, 16 -; GFX10-NEXT: s_cmp_lt_u32 s4, s10 +; GFX10-NEXT: s_cmp_lt_u32 s4, s9 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s4, s4, s10 -; GFX10-NEXT: s_cmp_lt_u32 s8, s7 -; GFX10-NEXT: s_cselect_b32 s7, s8, s7 +; GFX10-NEXT: s_cselect_b32 s4, s4, s9 +; GFX10-NEXT: s_cmp_lt_u32 s10, s7 +; GFX10-NEXT: s_cselect_b32 s7, s10, s7 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s7 ; GFX10-NEXT: s_lshr_b32 s7, s4, 16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll index 2512aaaeb082c..474f6655bda2c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -50,16 +50,14 @@ define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> in ; ; GFX900-LABEL: scalar_xnor_v2i16_one_use: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_pack_ll_b32_b16 s2, -1, -1 ; GFX900-NEXT: s_xor_b32 s0, s0, s1 -; GFX900-NEXT: s_xor_b32 s0, s0, s2 +; GFX900-NEXT: s_xor_b32 s0, s0, -1 ; GFX900-NEXT: ; return to shader part epilog ; ; GFX906-LABEL: scalar_xnor_v2i16_one_use: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_pack_ll_b32_b16 s2, -1, -1 ; GFX906-NEXT: s_xor_b32 s0, s0, s1 -; GFX906-NEXT: s_xor_b32 s0, s0, s2 +; GFX906-NEXT: s_xor_b32 s0, s0, -1 ; GFX906-NEXT: ; return to shader part epilog entry: %xor = xor <2 x i16> %a, %b @@ -150,7 +148,7 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in ; ; GFX900-LABEL: scalar_xnor_v4i16_one_use: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_pack_ll_b32_b16 s4, -1, -1 +; GFX900-NEXT: s_mov_b32 s4, -1 ; GFX900-NEXT: s_mov_b32 s5, s4 ; GFX900-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX900-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] @@ -158,7 +156,7 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in ; ; GFX906-LABEL: scalar_xnor_v4i16_one_use: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_pack_ll_b32_b16 s4, -1, -1 +; GFX906-NEXT: s_mov_b32 s4, -1 ; GFX906-NEXT: s_mov_b32 s5, s4 ; GFX906-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX906-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] From 6f961a1e7ec2373dceda74a1a49f93b009db1b72 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 25 Jul 2020 15:41:58 -0400 Subject: [PATCH 0133/1035] AMDGPU/GlobalISel: Legalize GDS atomics I noticed these don't use the _gfx9, non-m0-reading variants, but I'm not sure whether that's a bug. It's the same in the DAG. 
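For reference, the legalizer side of this is small: it only adds the region (GDS) pointer, address space 2 in the AMDGPU model, to the (value type, pointer type) pairs that were already legal. A condensed sketch of the resulting rules (this just restates the diff below; Atomics, RegionPtr and the S32/S64 LLTs are the names already used in AMDGPULegalizerInfo.cpp):

  // 32/64-bit RMW atomics: legal on global and LDS pointers, and now on
  // region (GDS) pointers as well.
  Atomics.legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                    {S64, GlobalPtr}, {S64, LocalPtr},
                    {S32, RegionPtr}, {S64, RegionPtr}});

  // Targets with LDS FP atomics also get the region form of FADD.
  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});

Selection then reuses the existing DS paths; the new MIR tests check that pre-gfx9 targets initialize m0 to -1 first, while gfx9+ only carries the implicit m0 use.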
--- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 5 +- .../inst-select-atomic-cmpxchg-region.mir | 173 ++++++++++++++++++ .../inst-select-atomicrmw-fadd-region.mir | 116 ++++++++++++ .../inst-select-atomicrmw-xchg-region.mir | 83 +++++++++ 4 files changed, 375 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-region.mir create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-fadd-region.mir create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-xchg-region.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 80633bbc6179d..f72db8a61aab8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1196,14 +1196,15 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, G_ATOMICRMW_UMIN}) .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, - {S64, GlobalPtr}, {S64, LocalPtr}}); + {S64, GlobalPtr}, {S64, LocalPtr}, + {S32, RegionPtr}, {S64, RegionPtr}}); if (ST.hasFlatAddressSpace()) { Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); } if (ST.hasLDSFPAtomics()) { getActionDefinitionsBuilder(G_ATOMICRMW_FADD) - .legalFor({{S32, LocalPtr}}); + .legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); } // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-region.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-region.mir new file mode 100644 index 0000000000000..834380d617912 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-region.mir @@ -0,0 +1,173 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s + + +--- +name: atomic_cmpxchg_s32_region +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: atomic_cmpxchg_s32_region + ; GFX6: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX6: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] + ; GFX7-LABEL: name: atomic_cmpxchg_s32_region + ; GFX7: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX7: $m0 = 
S_MOV_B32 -1 + ; GFX7: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX7: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] + ; GFX9-LABEL: name: atomic_cmpxchg_s32_region + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX9: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] + %0:vgpr(p2) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_ATOMIC_CMPXCHG %0, %1, %2 :: (load store seq_cst 4, addrspace 2) + $vgpr0 = COPY %3 + +... + +--- +name: atomic_cmpxchg_s32_region_gep4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: atomic_cmpxchg_s32_region_gep4 + ; GFX6: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec + ; GFX6: %4:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 %4, [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX6: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] + ; GFX7-LABEL: name: atomic_cmpxchg_s32_region_gep4 + ; GFX7: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 4, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX7: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] + ; GFX9-LABEL: name: atomic_cmpxchg_s32_region_gep4 + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 4, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX9: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] + %0:vgpr(p2) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_CONSTANT i32 4 + %4:vgpr(p2) = G_PTR_ADD %0, %3 + %5:vgpr(s32) = G_ATOMIC_CMPXCHG %4, %1, %2 :: (load store seq_cst 4, addrspace 2) + $vgpr0 = COPY %5 + +... 
+ +--- +name: atomic_cmpxchg_s64_region +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + + ; GFX6-LABEL: name: atomic_cmpxchg_s64_region + ; GFX6: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 2) + ; GFX6: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] + ; GFX7-LABEL: name: atomic_cmpxchg_s64_region + ; GFX7: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX7: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 2) + ; GFX7: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] + ; GFX9-LABEL: name: atomic_cmpxchg_s64_region + ; GFX9: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX9: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 2) + ; GFX9: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] + %0:vgpr(p2) = COPY $vgpr0 + %1:vgpr(s64) = COPY $vgpr1_vgpr2 + %2:vgpr(s64) = COPY $vgpr3_vgpr4 + %3:vgpr(s64) = G_ATOMIC_CMPXCHG %0, %1, %2 :: (load store seq_cst 8, addrspace 2) + $vgpr0_vgpr1 = COPY %3 + +... 
+ +--- +name: atomic_cmpxchg_s64_region_gep4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + + ; GFX6-LABEL: name: atomic_cmpxchg_s64_region_gep4 + ; GFX6: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 2) + ; GFX6: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] + ; GFX7-LABEL: name: atomic_cmpxchg_s64_region_gep4 + ; GFX7: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX7: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 2) + ; GFX7: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] + ; GFX9-LABEL: name: atomic_cmpxchg_s64_region_gep4 + ; GFX9: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX9: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 2) + ; GFX9: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] + %0:vgpr(p2) = COPY $vgpr0 + %1:vgpr(s64) = COPY $vgpr1_vgpr2 + %2:vgpr(s64) = COPY $vgpr3_vgpr4 + %3:vgpr(s32) = G_CONSTANT i32 4 + %4:vgpr(p2) = G_PTR_ADD %0, %3 + %5:vgpr(s64) = G_ATOMIC_CMPXCHG %0, %1, %2 :: (load store seq_cst 8, addrspace 2) + $vgpr0_vgpr1 = COPY %5 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-fadd-region.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-fadd-region.mir new file mode 100644 index 0000000000000..26619a1753eb5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-fadd-region.mir @@ -0,0 +1,116 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s + +# GFX6/7 selection should fail. 
+# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX6 %s + +--- +name: atomicrmw_fadd_s32_region +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: atomicrmw_fadd_s32_region + ; GFX8: liveins: $vgpr0, $vgpr1 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: $m0 = S_MOV_B32 -1 + ; GFX8: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX8: $vgpr0 = COPY [[DS_ADD_RTN_F32_]] + ; GFX9-LABEL: name: atomicrmw_fadd_s32_region + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX9: $vgpr0 = COPY [[DS_ADD_RTN_F32_]] + ; GFX6-LABEL: name: atomicrmw_fadd_s32_region + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(p2) = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[ATOMICRMW_FADD:%[0-9]+]]:vgpr_32(s32) = G_ATOMICRMW_FADD [[COPY]](p2), [[COPY1]] :: (load store seq_cst 4, addrspace 2) + ; GFX6: $vgpr0 = COPY [[ATOMICRMW_FADD]](s32) + %0:vgpr(p2) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = G_ATOMICRMW_FADD %0(p2), %1 :: (load store seq_cst 4, addrspace 2) + $vgpr0 = COPY %2 + +... + +--- +name: atomicrmw_fadd_s32_region_noret +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: atomicrmw_fadd_s32_region_noret + ; GFX8: liveins: $vgpr0, $vgpr1 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: $m0 = S_MOV_B32 -1 + ; GFX8: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX9-LABEL: name: atomicrmw_fadd_s32_region_noret + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX6-LABEL: name: atomicrmw_fadd_s32_region_noret + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(p2) = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[ATOMICRMW_FADD:%[0-9]+]]:vgpr(s32) = G_ATOMICRMW_FADD [[COPY]](p2), [[COPY1]] :: (load store seq_cst 4, addrspace 2) + %0:vgpr(p2) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = G_ATOMICRMW_FADD %0(p2), %1 :: (load store seq_cst 4, addrspace 2) + +... 
+ +--- +name: atomicrmw_fadd_s32_region_gep4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: atomicrmw_fadd_s32_region_gep4 + ; GFX8: liveins: $vgpr0, $vgpr1 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: $m0 = S_MOV_B32 -1 + ; GFX8: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 4, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX8: $vgpr0 = COPY [[DS_ADD_RTN_F32_]] + ; GFX9-LABEL: name: atomicrmw_fadd_s32_region_gep4 + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 4, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX9: $vgpr0 = COPY [[DS_ADD_RTN_F32_]] + ; GFX6-LABEL: name: atomicrmw_fadd_s32_region_gep4 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(p2) = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4 + ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p2) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[ATOMICRMW_FADD:%[0-9]+]]:vgpr_32(s32) = G_ATOMICRMW_FADD [[PTR_ADD]](p2), [[COPY1]] :: (load store seq_cst 4, addrspace 2) + ; GFX6: $vgpr0 = COPY [[ATOMICRMW_FADD]](s32) + %0:vgpr(p2) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = G_CONSTANT i32 4 + %3:vgpr(p2) = G_PTR_ADD %0, %2 + %4:vgpr(s32) = G_ATOMICRMW_FADD %3(p2), %1 :: (load store seq_cst 4, addrspace 2) + $vgpr0 = COPY %4 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-xchg-region.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-xchg-region.mir new file mode 100644 index 0000000000000..0552551f35105 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-xchg-region.mir @@ -0,0 +1,83 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s + + +--- +name: atomicrmw_xchg_s32_region +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX6-LABEL: name: atomicrmw_xchg_s32_region + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX6: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] + ; GFX7-LABEL: name: atomicrmw_xchg_s32_region + ; GFX7: liveins: $vgpr0, $vgpr1 + ; 
GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX7: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] + ; GFX9-LABEL: name: atomicrmw_xchg_s32_region + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX9: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] + %0:vgpr(p2) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = G_ATOMICRMW_XCHG %0(p2), %1 :: (load store seq_cst 4, addrspace 2) + $vgpr0 = COPY %2 + +... + +--- +name: atomicrmw_xchg_s32_region_gep4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX6-LABEL: name: atomicrmw_xchg_s32_region_gep4 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec + ; GFX6: %3:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 %3, [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX6: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] + ; GFX7-LABEL: name: atomicrmw_xchg_s32_region_gep4 + ; GFX7: liveins: $vgpr0, $vgpr1 + ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 [[COPY]], [[COPY1]], 4, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX7: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] + ; GFX9-LABEL: name: atomicrmw_xchg_s32_region_gep4 + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 [[COPY]], [[COPY1]], 4, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX9: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] + %0:vgpr(p2) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = G_CONSTANT i32 4 + %3:vgpr(p2) = G_PTR_ADD %0, %2 + %4:vgpr(s32) = G_ATOMICRMW_XCHG %3(p2), %1 :: (load store seq_cst 4, addrspace 2) + $vgpr0 = COPY %4 + +... 
From bcf5184a68d1d851895692bae6eed16a74b519db Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 25 Jul 2020 17:22:22 -0400 Subject: [PATCH 0134/1035] AMDGPU/GlobalISel: Make sure <2 x s1> phis are scalarized --- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 3 +- .../AMDGPU/GlobalISel/legalize-phi.mir | 91 +++++++++++++++++++ 2 files changed, 93 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index f72db8a61aab8..c21414d59ba0c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -415,11 +415,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalFor(AllS64Vectors) .legalFor(AddrSpaces64) .legalFor(AddrSpaces32) + .legalIf(isPointer(0)) .clampScalar(0, S32, S256) .widenScalarToNextPow2(0, 32) .clampMaxNumElements(0, S32, 16) .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) - .legalIf(isPointer(0)); + .scalarize(0); if (ST.hasVOP3PInsts()) { assert(ST.hasIntClamp() && "all targets with VOP3P should support clamp"); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-phi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-phi.mir index 81408b79b11f7..9a91d908bb7b0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-phi.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-phi.mir @@ -1547,3 +1547,94 @@ body: | S_SETPC_B64 undef $sgpr30_sgpr31 ... +--- +name: test_phi_v2s1 +tracksRegLiveness: true + +body: | + ; CHECK-LABEL: name: test_phi_v2s1 + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND]](s32), [[AND1]] + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND2]](s32), [[AND3]] + ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] + ; CHECK: G_BRCOND [[ICMP2]](s1), %bb.1 + ; CHECK: G_BR %bb.2 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C3]](s32) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C3]](s32) + ; CHECK: 
[[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C4]] + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C4]] + ; CHECK: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[AND4]](s32), [[AND5]] + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C4]] + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C4]] + ; CHECK: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[AND6]](s32), [[AND7]] + ; CHECK: G_BR %bb.2 + ; CHECK: bb.2: + ; CHECK: [[PHI:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP3]](s1), %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.0, [[ICMP4]](s1), %bb.1 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[PHI]](s1) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[PHI1]](s1) + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[ANYEXT]](s32) + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] + ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[ANYEXT1]](s32) + ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C5]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND8]](s32), [[AND9]](s32) + ; CHECK: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; CHECK: S_SETPC_B64 undef $sgpr30_sgpr31 + bb.0: + successors: %bb.1, %bb.2 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(<2 x s16>) = COPY $vgpr1 + %2:_(<2 x s16>) = COPY $vgpr2 + %3:_(s32) = COPY $vgpr1 + %4:_(s32) = G_CONSTANT i32 0 + %5:_(<2 x s1>) = G_ICMP intpred(eq), %0, %1 + %6:_(s1) = G_ICMP intpred(eq), %3, %4 + G_BRCOND %6, %bb.1 + G_BR %bb.2 + + bb.1: + successors: %bb.2 + + %7:_(<2 x s1>) = G_ICMP intpred(ne), %0, %2 + G_BR %bb.2 + + bb.2: + %8:_(<2 x s1>) = G_PHI %5, %bb.0, %7, %bb.1 + %9:_(<2 x s32>) = G_ZEXT %8 + $vgpr0_vgpr1 = COPY %9 + S_SETPC_B64 undef $sgpr30_sgpr31 +... From 7c09c173a294698d06bcd1cd8c95fd331a31ccdc Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 25 Jul 2020 11:14:27 -0400 Subject: [PATCH 0135/1035] AMDGPU/GlobalISel: Reorder G_CONSTANT legality rules The legal cases should be the first rules. 
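The reason the ordering matters: rules in an action-definitions builder are tried in order, and the first rule whose predicate matches decides the action, so a mutating rule listed ahead of a legality check can rewrite a case that was already legal. A rough sketch of the intended shape (mirroring the diff below):

  getActionDefinitionsBuilder(G_CONSTANT)
      // Legality first: the scalar cases and the explicitly named pointers...
      .legalFor({S1, S32, S64, S16, GlobalPtr,
                 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      // ...plus any remaining pointer type.
      .legalIf(isPointer(0))
      // Only a still-illegal scalar reaches the mutating rules.
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0);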
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index c21414d59ba0c..c2200f4b2d616 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -525,9 +525,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder(G_CONSTANT) .legalFor({S1, S32, S64, S16, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) + .legalIf(isPointer(0)) .clampScalar(0, S32, S64) - .widenScalarToNextPow2(0) - .legalIf(isPointer(0)); + .widenScalarToNextPow2(0); getActionDefinitionsBuilder(G_FCONSTANT) .legalFor({S32, S64, S16}) From 3e8bb7a00027c97fcf0025c9e068593d8c0ed71d Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 25 Jul 2020 10:47:33 -0400 Subject: [PATCH 0136/1035] GlobalISel: Handle fewerElementsVector for G_PTR_ADD --- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 1 + .../AMDGPU/GlobalISel/legalize-ptr-add.mir | 42 +++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 5dcb5b3271d8a..49d0dceb34d37 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -3469,6 +3469,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_ADD: case G_SUB: case G_MUL: + case G_PTR_ADD: case G_SMULH: case G_UMULH: case G_FADD: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptr-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptr-add.mir index 709e9a24493c6..1d76ddc61f4b6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptr-add.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptr-add.mir @@ -143,3 +143,45 @@ body: | %2:_(p999) = G_PTR_ADD %0, %1 $vgpr0_vgpr1 = COPY %2 ... + +--- +name: test_gep_v2p1_v2i64 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 + + ; CHECK-LABEL: name: test_gep_v2p1_v2i64 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x p1>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[UV:%[0-9]+]]:_(p1), [[UV1:%[0-9]+]]:_(p1) = G_UNMERGE_VALUES [[COPY]](<2 x p1>) + ; CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>) + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[UV]], [[UV2]](s64) + ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[UV1]], [[UV3]](s64) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[PTR_ADD]](p1), [[PTR_ADD1]](p1) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + %0:_(<2 x p1>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + %2:_(<2 x p1>) = G_PTR_ADD %0, %1 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %2 +... 
+ +--- +name: test_gep_v2p3_v2s32 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + ; CHECK-LABEL: name: test_gep_v2p3_v2s32 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 + ; CHECK: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY]](<2 x p3>) + ; CHECK: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[UV]], [[UV2]](s32) + ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[UV1]], [[UV3]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[PTR_ADD]](p3), [[PTR_ADD1]](p3) + ; CHECK: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + %0:_(<2 x p3>) = COPY $vgpr0_vgpr1 + %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 + %2:_(<2 x p3>) = G_PTR_ADD %0, %1 + $vgpr0_vgpr1 = COPY %2 +... From f6176f8a5f02d7c3ee6d3b70c46045cfdce230d4 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 25 Jul 2020 11:00:35 -0400 Subject: [PATCH 0137/1035] GlobalISel: Handle G_PTR_ADD in narrowScalar --- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 1 + .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 6 +- .../AMDGPU/GlobalISel/legalize-ptr-add.mir | 192 ++++++++++++++++++ 3 files changed, 196 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 49d0dceb34d37..441e5ca6d2098 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -1204,6 +1204,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, MI.eraseFromParent(); return Legalized; } + case TargetOpcode::G_PTR_ADD: case TargetOpcode::G_PTRMASK: { if (TypeIdx != 1) return UnableToLegalize; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index c2200f4b2d616..673c5fc1e840c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -733,10 +733,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); } - // FIXME: Clamp offset operand. getActionDefinitionsBuilder(G_PTR_ADD) - .legalIf(isPointer(0)) - .scalarize(0); + .legalIf(all(isPointer(0), sameSize(0, 1))) + .scalarize(0) + .scalarSameSizeAs(1, 0); getActionDefinitionsBuilder(G_PTRMASK) .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptr-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptr-add.mir index 1d76ddc61f4b6..e014376446a94 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptr-add.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptr-add.mir @@ -185,3 +185,195 @@ body: | %2:_(<2 x p3>) = G_PTR_ADD %0, %1 $vgpr0_vgpr1 = COPY %2 ... + +--- +name: test_gep_global_s16_idx +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_gep_global_s16_idx + ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY1]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ANYEXT]], 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[SEXT_INREG]](s64) + ; CHECK: $vgpr0_vgpr1 = COPY [[PTR_ADD]](p1) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s16) = G_TRUNC %1 + %3:_(p1) = G_PTR_ADD %0, %2 + $vgpr0_vgpr1 = COPY %3 +... 
+ +--- +name: test_gep_global_s32_idx +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_gep_global_s32_idx + ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32) + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[SEXT]](s64) + ; CHECK: $vgpr0_vgpr1 = COPY [[PTR_ADD]](p1) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(p1) = G_PTR_ADD %0, %1 + $vgpr0_vgpr1 = COPY %2 +... + +--- +name: test_gep_global_s96_idx +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 + + ; CHECK-LABEL: name: test_gep_global_s96_idx + ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr2_vgpr3_vgpr4 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s64) = G_TRUNC [[COPY1]](s96) + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[TRUNC]](s64) + ; CHECK: $vgpr0_vgpr1 = COPY [[PTR_ADD]](p1) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s96) = COPY $vgpr2_vgpr3_vgpr4 + %2:_(p1) = G_PTR_ADD %0, %1 + $vgpr0_vgpr1 = COPY %2 +... + +--- +name: test_gep_local_i16_idx +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: test_gep_local_i16_idx + ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[SEXT_INREG]](s32) + ; CHECK: $vgpr0 = COPY [[PTR_ADD]](p3) + %0:_(p3) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %1 + %3:_(p3) = G_PTR_ADD %0, %2 + $vgpr0 = COPY %3 +... + +--- +name: test_gep_local_i64_idx +body: | + bb.0: + liveins: $vgpr0, $vgpr1_vgpr2 + + ; CHECK-LABEL: name: test_gep_local_i64_idx + ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr1_vgpr2 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[TRUNC]](s32) + ; CHECK: $vgpr0 = COPY [[PTR_ADD]](p3) + %0:_(p3) = COPY $vgpr0 + %1:_(s64) = COPY $vgpr1_vgpr2 + %2:_(p3) = G_PTR_ADD %0, %1 + $vgpr0 = COPY %2 +... + +--- +name: test_gep_v2p1_v2i32 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5 + + ; CHECK-LABEL: name: test_gep_v2p1_v2i32 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x p1>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr4_vgpr5 + ; CHECK: [[UV:%[0-9]+]]:_(p1), [[UV1:%[0-9]+]]:_(p1) = G_UNMERGE_VALUES [[COPY]](<2 x p1>) + ; CHECK: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; CHECK: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[UV2]](s32) + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[UV]], [[SEXT]](s64) + ; CHECK: [[SEXT1:%[0-9]+]]:_(s64) = G_SEXT [[UV3]](s32) + ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[UV1]], [[SEXT1]](s64) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[PTR_ADD]](p1), [[PTR_ADD1]](p1) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + %0:_(<2 x p1>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(<2 x s32>) = COPY $vgpr4_vgpr5 + %2:_(<2 x p1>) = G_PTR_ADD %0, %1 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %2 +... 
+ +--- +name: test_gep_v2p1_v2i96 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6, $vgpr7_vgpr8_vgpr9 + + ; CHECK-LABEL: name: test_gep_v2p1_v2i96 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x p1>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr4_vgpr5_vgpr6 + ; CHECK: [[COPY2:%[0-9]+]]:_(s96) = COPY $vgpr7_vgpr8_vgpr9 + ; CHECK: [[UV:%[0-9]+]]:_(p1), [[UV1:%[0-9]+]]:_(p1) = G_UNMERGE_VALUES [[COPY]](<2 x p1>) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s64) = G_TRUNC [[COPY1]](s96) + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[UV]], [[TRUNC]](s64) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s64) = G_TRUNC [[COPY2]](s96) + ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[UV1]], [[TRUNC1]](s64) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[PTR_ADD]](p1), [[PTR_ADD1]](p1) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + %0:_(<2 x p1>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(s96) = COPY $vgpr4_vgpr5_vgpr6 + %2:_(s96) = COPY $vgpr7_vgpr8_vgpr9 + %3:_(<2 x s96>) = G_BUILD_VECTOR %1, %2 + %4:_(<2 x p1>) = G_PTR_ADD %0, %3 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %4 +... + +--- +name: test_gep_v2p3_v2s16 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_gep_v2p3_v2s16 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; CHECK: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY]](<2 x p3>) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[UV]], [[SEXT_INREG]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16 + ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[UV1]], [[SEXT_INREG1]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[PTR_ADD]](p3), [[PTR_ADD1]](p3) + ; CHECK: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + %0:_(<2 x p3>) = COPY $vgpr0_vgpr1 + %1:_(<2 x s16>) = COPY $vgpr2 + %2:_(<2 x p3>) = G_PTR_ADD %0, %1 + $vgpr0_vgpr1 = COPY %2 +... + +--- +name: test_gep_v2p3_v2s64 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 + + ; CHECK-LABEL: name: test_gep_v2p3_v2s64 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + ; CHECK: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY]](<2 x p3>) + ; CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[UV2]](s64) + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[UV]], [[TRUNC]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[UV3]](s64) + ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[UV1]], [[TRUNC1]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[PTR_ADD]](p3), [[PTR_ADD1]](p3) + ; CHECK: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + %0:_(<2 x p3>) = COPY $vgpr0_vgpr1 + %1:_(<2 x s64>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + %2:_(<2 x p3>) = G_PTR_ADD %0, %1 + $vgpr0_vgpr1 = COPY %2 +... 
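Taken together with the AMDGPULegalizerInfo change above, the practical effect is that a G_PTR_ADD byte-offset operand (type index 1) is always brought to the width of the pointer before selection: a narrower index is sign-extended (offsets are signed) and a wider one truncated, which is what the new legalize-ptr-add.mir cases check. A condensed view of the rule as it now reads:

  getActionDefinitionsBuilder(G_PTR_ADD)
      .legalIf(all(isPointer(0), sameSize(0, 1))) // offset must match pointer width
      .scalarize(0)                               // split vector geps first
      .scalarSameSizeAs(1, 0);                    // then resize the offset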
From d35e2c101d22770a4fd5e387f6ae29bc94437426 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 25 Jul 2020 11:56:33 -0400 Subject: [PATCH 0138/1035] AMDGPU/GlobalISel: Fix not constraining ds_append/consume operands --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 7 +++++-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll | 11 +++++++++++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll | 11 +++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 8bc597664634a..777c8c6c2ee69 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1313,12 +1313,15 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) .addReg(PtrBase); - BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg()) + if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI)) + return false; + + auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg()) .addImm(Offset) .addImm(IsGDS ? -1 : 0) .cloneMemRefs(MI); MI.eraseFromParent(); - return true; + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); } static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll index ce1551e44e51b..2da96c4480608 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll @@ -133,6 +133,17 @@ define amdgpu_kernel void @ds_append_lds_m0_restore(i32 addrspace(3)* %lds, i32 ret void } +; Make sure this selects successfully with no use. The result register needs to be constrained. +; GCN-LABEL: {{^}}ds_append_lds_no_use: +; GCN: s_load_dword [[PTR:s[0-9]+]] +; GCN: s_mov_b32 m0, [[PTR]] +; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532{{$}} +define amdgpu_kernel void @ds_append_lds_no_use(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 { + %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383 + %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false) + ret void +} + declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1 immarg) #1 declare i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* nocapture, i1 immarg) #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll index 175c0cf7760ac..40f20bc795222 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll @@ -127,6 +127,17 @@ define amdgpu_kernel void @ds_consume_lds_m0_restore(i32 addrspace(3)* %lds, i32 ret void } +; Make sure this selects successfully with no use. The result register needs to be constrained. 
+; GCN-LABEL: {{^}}ds_consume_lds_no_use:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532{{$}}
+define amdgpu_kernel void @ds_consume_lds_no_use(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
+  %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i1 false)
+  ret void
+}
+
 declare i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* nocapture, i1 immarg) #1
 declare i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* nocapture, i1 immarg) #1

From d135744c34dc7a6315f1d2d65a969a2791a97534 Mon Sep 17 00:00:00 2001
From: Vincent Zhao
Date: Sun, 26 Jul 2020 20:10:07 +0530
Subject: [PATCH 0139/1035] [MLIR][Affine] Add test for non-hyperrectangular
 loop tiling

This diff provides a concrete test case for the error that is raised
when the iteration space is non-hyperrectangular. The emission of this
error message has been changed as well, from a debug print to a proper
diagnostic on the root loop op.

Differential Revision: https://reviews.llvm.org/D84531
---
 mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp |  5 ++---
 .../Dialect/Affine/loop-tiling-unsupported.mlir   | 15 +++++++++++++++
 2 files changed, 17 insertions(+), 3 deletions(-)
 create mode 100644 mlir/test/Dialect/Affine/loop-tiling-unsupported.mlir

diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp
index ddb00bdd8f0e5..d9b2b6ac30438 100644
--- a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp
@@ -218,9 +218,8 @@ mlir::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
   FlatAffineConstraints cst;
   getIndexSet(input, &cst);
   if (!cst.isHyperRectangular(0, width)) {
-    llvm::dbgs() << "tiled code generation unimplemented for the "
-                    "non-hyperrectangular case, op:"
-                 << *rootAffineForOp << "\n";
+    rootAffineForOp.emitError("tiled code generation unimplemented for the "
+                              "non-hyperrectangular case");
     return failure();
   }
 
diff --git a/mlir/test/Dialect/Affine/loop-tiling-unsupported.mlir b/mlir/test/Dialect/Affine/loop-tiling-unsupported.mlir
new file mode 100644
index 0000000000000..cb558848b21cd
--- /dev/null
+++ b/mlir/test/Dialect/Affine/loop-tiling-unsupported.mlir
@@ -0,0 +1,15 @@
+// RUN: mlir-opt %s -affine-loop-tile="tile-size=32" -split-input-file -verify-diagnostics
+
+// -----
+
+#ub = affine_map<(d0)[s0] -> (d0, s0)>
+func @non_hyperrect_loop() {
+  %N = constant 128 : index
+  // expected-error@+1 {{tiled code generation unimplemented for the non-hyperrectangular case}}
+  affine.for %i = 0 to %N {
+    affine.for %j = 0 to min #ub(%i)[%N] {
+      affine.yield
+    }
+  }
+  return
+}

From 17eafe0841d6e523d410771c8d4de99d5881c59d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 26 Jul 2020 16:03:53 +0100
Subject: [PATCH 0140/1035] [X86][SSE] lowerV2I64Shuffle - use undef elements
 in PSHUFD mask widening

If we lower a v2i64 shuffle to PSHUFD, we currently clamp undef elements
to 0 (elements 0,1 of the v4i32), which can result in the shuffle
referencing more elements of the source vector than expected, affecting
later shuffle combines and KnownBits/SimplifyDemanded calls.

By ensuring we widen undef mask elements as undef, we allow
getV4X86ShuffleImm8 to use inline elements as the default, which are
more likely to fold.
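
To make the mask change concrete, a standalone C++ sketch of the
widening before and after this patch (function names and the Undef
constant are illustrative; the real code uses the SM_SentinelUndef
sentinel, which is likewise -1):

  #include <algorithm>
  #include <array>

  constexpr int Undef = -1; // stand-in for the shuffle mask sentinel

  // Before: undef v2i64 lanes were clamped to 0, so a <undef,0> mask
  // widened to <0,1,0,1> and referenced source elements it didn't need.
  std::array<int, 4> widenClampingUndef(int M0, int M1) {
    return {std::max(M0, 0) * 2, std::max(M0, 0) * 2 + 1,
            std::max(M1, 0) * 2, std::max(M1, 0) * 2 + 1};
  }

  // After: undef lanes stay undef (<undef,0> -> <undef,undef,0,1>),
  // letting getV4X86ShuffleImm8 pick whichever elements fold best.
  std::array<int, 4> widenKeepingUndef(int M0, int M1) {
    return {M0 < 0 ? Undef : M0 * 2, M0 < 0 ? Undef : M0 * 2 + 1,
            M1 < 0 ? Undef : M1 * 2, M1 < 0 ? Undef : M1 * 2 + 1};
  }
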
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 7 +- llvm/test/CodeGen/X86/avg.ll | 32 ++-- llvm/test/CodeGen/X86/avx-cvt.ll | 2 +- llvm/test/CodeGen/X86/avx512-hadd-hsub.ll | 8 +- .../X86/avx512-intrinsics-fast-isel.ll | 32 ++-- llvm/test/CodeGen/X86/buildvec-extract.ll | 6 +- llvm/test/CodeGen/X86/cast-vsel.ll | 14 +- .../X86/clear_upper_vector_element_bits.ll | 4 +- llvm/test/CodeGen/X86/combine-movmsk-avx.ll | 4 +- llvm/test/CodeGen/X86/combine-sdiv.ll | 6 +- llvm/test/CodeGen/X86/combine-shl.ll | 6 +- llvm/test/CodeGen/X86/combine-sra.ll | 4 +- llvm/test/CodeGen/X86/combine-srl.ll | 2 +- llvm/test/CodeGen/X86/combine-udiv.ll | 12 +- llvm/test/CodeGen/X86/combine-urem.ll | 2 +- .../X86/div-rem-pair-recomposition-signed.ll | 16 +- .../div-rem-pair-recomposition-unsigned.ll | 16 +- llvm/test/CodeGen/X86/extract-store.ll | 2 +- llvm/test/CodeGen/X86/extractelement-index.ll | 6 +- llvm/test/CodeGen/X86/extractelement-load.ll | 38 ++-- llvm/test/CodeGen/X86/gather-addresses.ll | 8 +- llvm/test/CodeGen/X86/haddsub-2.ll | 36 ++-- ...st-and-by-const-from-lshr-in-eqcmp-zero.ll | 6 +- .../test/CodeGen/X86/horizontal-reduce-add.ll | 42 ++--- .../CodeGen/X86/horizontal-reduce-smax.ll | 140 +++++++------- .../CodeGen/X86/horizontal-reduce-smin.ll | 140 +++++++------- .../CodeGen/X86/horizontal-reduce-umax.ll | 142 +++++++------- .../CodeGen/X86/horizontal-reduce-umin.ll | 142 +++++++------- llvm/test/CodeGen/X86/i128-add.ll | 2 +- llvm/test/CodeGen/X86/inline-asm-x-i128.ll | 2 +- llvm/test/CodeGen/X86/known-bits-vector.ll | 4 +- .../test/CodeGen/X86/known-signbits-vector.ll | 6 +- llvm/test/CodeGen/X86/madd.ll | 110 +++++------ llvm/test/CodeGen/X86/masked_compressstore.ll | 16 +- llvm/test/CodeGen/X86/masked_gather.ll | 110 +++++------ llvm/test/CodeGen/X86/masked_load.ll | 16 +- llvm/test/CodeGen/X86/masked_store.ll | 16 +- llvm/test/CodeGen/X86/masked_store_trunc.ll | 6 +- .../CodeGen/X86/masked_store_trunc_ssat.ll | 6 +- .../CodeGen/X86/masked_store_trunc_usat.ll | 6 +- .../X86/merge-consecutive-stores-nt.ll | 16 +- .../CodeGen/X86/min-legal-vector-width.ll | 16 +- llvm/test/CodeGen/X86/nontemporal-2.ll | 4 +- llvm/test/CodeGen/X86/oddshuffles.ll | 22 +-- llvm/test/CodeGen/X86/phaddsub-extract.ll | 76 ++++---- llvm/test/CodeGen/X86/pmul.ll | 10 +- llvm/test/CodeGen/X86/pmulh.ll | 102 +++++----- llvm/test/CodeGen/X86/pr15267.ll | 2 +- llvm/test/CodeGen/X86/pr39733.ll | 2 +- llvm/test/CodeGen/X86/pr42452.ll | 2 +- llvm/test/CodeGen/X86/pr42905.ll | 2 +- llvm/test/CodeGen/X86/pr44976.ll | 2 +- llvm/test/CodeGen/X86/pr45378.ll | 2 +- llvm/test/CodeGen/X86/pr46189.ll | 2 +- llvm/test/CodeGen/X86/pr46455.ll | 2 +- .../CodeGen/X86/prefer-avx256-mask-shuffle.ll | 2 +- llvm/test/CodeGen/X86/psubus.ll | 18 +- llvm/test/CodeGen/X86/sad.ll | 60 +++--- llvm/test/CodeGen/X86/sdiv_fix.ll | 12 +- llvm/test/CodeGen/X86/sdiv_fix_sat.ll | 24 +-- llvm/test/CodeGen/X86/setcc-wide-types.ll | 48 ++--- llvm/test/CodeGen/X86/shrink_vmul.ll | 16 +- llvm/test/CodeGen/X86/slow-pmulld.ll | 80 ++++---- llvm/test/CodeGen/X86/smul_fix_sat.ll | 8 +- .../CodeGen/X86/split-extend-vector-inreg.ll | 2 +- llvm/test/CodeGen/X86/split-vector-rem.ll | 16 +- .../CodeGen/X86/sse-intrinsics-fast-isel.ll | 4 +- llvm/test/CodeGen/X86/sse41.ll | 36 ++-- llvm/test/CodeGen/X86/trunc-subvector.ll | 4 +- llvm/test/CodeGen/X86/udiv_fix.ll | 8 +- llvm/test/CodeGen/X86/udiv_fix_sat.ll | 8 +- llvm/test/CodeGen/X86/uint_to_fp-3.ll | 4 +- llvm/test/CodeGen/X86/umul_fix_sat.ll | 8 +- .../CodeGen/X86/urem-seteq-vec-nonsplat.ll | 4 +- 
llvm/test/CodeGen/X86/var-permute-128.ll | 12 +- .../CodeGen/X86/vec-strict-inttofp-128.ll | 34 ++-- .../CodeGen/X86/vec-strict-inttofp-256.ll | 2 +- llvm/test/CodeGen/X86/vec_cast2.ll | 2 +- llvm/test/CodeGen/X86/vec_int_to_fp.ll | 120 ++++++------ llvm/test/CodeGen/X86/vec_saddo.ll | 28 +-- llvm/test/CodeGen/X86/vec_smulo.ll | 136 +++++++------- llvm/test/CodeGen/X86/vec_ssubo.ll | 28 +-- llvm/test/CodeGen/X86/vec_uaddo.ll | 28 +-- llvm/test/CodeGen/X86/vec_umulo.ll | 78 ++++---- llvm/test/CodeGen/X86/vec_usubo.ll | 28 +-- .../X86/vector-constrained-fp-intrinsics.ll | 30 +-- llvm/test/CodeGen/X86/vector-fshl-128.ll | 24 +-- llvm/test/CodeGen/X86/vector-fshl-256.ll | 8 +- llvm/test/CodeGen/X86/vector-fshl-rot-128.ll | 20 +- llvm/test/CodeGen/X86/vector-fshl-rot-256.ll | 10 +- llvm/test/CodeGen/X86/vector-fshr-128.ll | 24 +-- llvm/test/CodeGen/X86/vector-fshr-256.ll | 8 +- llvm/test/CodeGen/X86/vector-fshr-rot-128.ll | 20 +- llvm/test/CodeGen/X86/vector-fshr-rot-256.ll | 10 +- llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll | 20 +- llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll | 16 +- llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll | 4 +- llvm/test/CodeGen/X86/vector-pcmp.ll | 2 +- llvm/test/CodeGen/X86/vector-reduce-add.ll | 132 ++++++------- .../test/CodeGen/X86/vector-reduce-and-cmp.ll | 112 +++++------ llvm/test/CodeGen/X86/vector-reduce-and.ll | 112 +++++------ llvm/test/CodeGen/X86/vector-reduce-mul.ll | 176 +++++++++--------- llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll | 2 +- llvm/test/CodeGen/X86/vector-reduce-or.ll | 112 +++++------ llvm/test/CodeGen/X86/vector-reduce-smax.ll | 108 +++++------ llvm/test/CodeGen/X86/vector-reduce-smin.ll | 108 +++++------ llvm/test/CodeGen/X86/vector-reduce-umax.ll | 108 +++++------ llvm/test/CodeGen/X86/vector-reduce-umin.ll | 108 +++++------ llvm/test/CodeGen/X86/vector-reduce-xor.ll | 112 +++++------ llvm/test/CodeGen/X86/vector-rem.ll | 8 +- llvm/test/CodeGen/X86/vector-rotate-128.ll | 18 +- llvm/test/CodeGen/X86/vector-rotate-256.ll | 8 +- llvm/test/CodeGen/X86/vector-sext.ll | 130 ++++++------- .../test/CodeGen/X86/vector-shift-ashr-128.ll | 14 +- .../test/CodeGen/X86/vector-shift-ashr-256.ll | 8 +- .../CodeGen/X86/vector-shift-ashr-sub128.ll | 6 +- .../test/CodeGen/X86/vector-shift-lshr-128.ll | 14 +- .../test/CodeGen/X86/vector-shift-lshr-256.ll | 8 +- .../CodeGen/X86/vector-shift-lshr-sub128.ll | 6 +- llvm/test/CodeGen/X86/vector-shift-shl-128.ll | 8 +- llvm/test/CodeGen/X86/vector-shift-shl-256.ll | 8 +- .../test/CodeGen/X86/vector-shuffle-128-v8.ll | 22 +-- .../CodeGen/X86/vector-shuffle-256-v16.ll | 20 +- .../CodeGen/X86/vector-shuffle-256-v32.ll | 4 +- .../test/CodeGen/X86/vector-shuffle-256-v8.ll | 2 +- .../CodeGen/X86/vector-shuffle-512-v32.ll | 2 +- .../CodeGen/X86/vector-shuffle-512-v64.ll | 4 +- .../CodeGen/X86/vector-shuffle-combining.ll | 2 +- llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll | 4 +- llvm/test/CodeGen/X86/vector-zext.ll | 62 +++--- llvm/test/CodeGen/X86/vsel-cmp-load.ll | 2 +- llvm/test/CodeGen/X86/vselect-avx.ll | 2 +- llvm/test/CodeGen/X86/vselect-pcmp.ll | 4 +- llvm/test/CodeGen/X86/vshift-4.ll | 4 +- llvm/test/CodeGen/X86/widen_conv-4.ll | 4 +- .../X86/x86-setcc-int-to-fp-combine.ll | 2 +- llvm/test/CodeGen/X86/xor.ll | 2 +- 137 files changed, 2028 insertions(+), 2031 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 24bc264df129f..7c134a8c7cb92 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ 
-13723,9 +13723,10 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef Mask, // onward this has a single fast instruction with no scary immediates. // We have to map the mask as it is actually a v4i32 shuffle instruction. V1 = DAG.getBitcast(MVT::v4i32, V1); - int WidenedMask[4] = { - std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, - std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; + int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2), + Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1), + Mask[1] < 0 ? -1 : (Mask[1] * 2), + Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)}; return DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index d2638a1681e85..1411318d8176d 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -160,9 +160,9 @@ define void @avg_v24i8(<24 x i8>* %a, <24 x i8>* %b) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero @@ -172,10 +172,10 @@ define void @avg_v24i8(<24 x i8>* %a, <24 x i8>* %b) nounwind { ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; AVX1-NEXT: vmovdqa (%rsi), %xmm6 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3] @@ -454,24 +454,24 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; 
AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,3] @@ -482,10 +482,10 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { ; AVX1-NEXT: vmovdqa (%rsi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm4 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm12 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm10 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,3] @@ -493,10 +493,10 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm9 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm8 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm3, %xmm15, %xmm15 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm7, %xmm11, %xmm7 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] @@ -504,10 +504,10 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { ; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm14 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm13 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] ; 
AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,3] diff --git a/llvm/test/CodeGen/X86/avx-cvt.ll b/llvm/test/CodeGen/X86/avx-cvt.ll index 653a88edd26a0..b3fa8ac9aeeda 100644 --- a/llvm/test/CodeGen/X86/avx-cvt.ll +++ b/llvm/test/CodeGen/X86/avx-cvt.ll @@ -33,7 +33,7 @@ define <8 x float> @sitofp02(<8 x i16> %a) { ; AVX-LABEL: sitofp02: ; AVX: # %bb.0: ; AVX-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll index 1fd3e15c3e019..b504646336def 100644 --- a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll +++ b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll @@ -5,7 +5,7 @@ define i32 @hadd_16(<16 x i32> %x225) { ; KNL-LABEL: hadd_16: ; KNL: # %bb.0: -; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -14,7 +14,7 @@ define i32 @hadd_16(<16 x i32> %x225) { ; ; SKX-LABEL: hadd_16: ; SKX: # %bb.0: -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -32,7 +32,7 @@ define i32 @hadd_16(<16 x i32> %x225) { define i32 @hsub_16(<16 x i32> %x225) { ; KNL-LABEL: hsub_16: ; KNL: # %bb.0: -; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; KNL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 @@ -41,7 +41,7 @@ define i32 @hsub_16(<16 x i32> %x225) { ; ; SKX-LABEL: hsub_16: ; SKX: # %bb.0: -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index f115f9a6ef382..650bbe23b86e5 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -6550,7 +6550,7 @@ define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) { ; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -6563,7 +6563,7 @@ define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) { ; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextracti128 
$1, %ymm0, %xmm1 ; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -6602,7 +6602,7 @@ define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) { ; X86-NEXT: vpsllq $32, %xmm2, %xmm2 ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 ; X86-NEXT: vpsrlq $32, %xmm0, %xmm3 @@ -6636,7 +6636,7 @@ define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) { ; X64-NEXT: vpsllq $32, %xmm2, %xmm2 ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm3 @@ -6668,7 +6668,7 @@ define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) { ; X86-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-NEXT: vpor %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -6681,7 +6681,7 @@ define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) { ; X64-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -6706,7 +6706,7 @@ define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) { ; X86-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -6719,7 +6719,7 @@ define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) { ; X64-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -6747,7 +6747,7 @@ define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -6762,7 +6762,7 @@ define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] ; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -6807,7 +6807,7 @@ define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpsllq $32, %xmm2, %xmm2 ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 ; X86-NEXT: vpsrlq $32, %xmm0, %xmm3 @@ -6844,7 +6844,7 @@ define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpsllq $32, %xmm2, %xmm2 ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm3 @@ -6882,7 +6882,7 @@ define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpand %ymm0, %ymm1, %ymm0 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -6898,7 +6898,7 @@ define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpand %ymm0, %ymm1, %ymm0 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -6928,7 +6928,7 @@ define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-NEXT: vpor %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -6943,7 +6943,7 @@ define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll index 863ab4dee1238..2195526f94c39 100644 --- a/llvm/test/CodeGen/X86/buildvec-extract.ll +++ b/llvm/test/CodeGen/X86/buildvec-extract.ll @@ -114,21 +114,21 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) { define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) { ; SSE2-LABEL: extract2_i32_zext_insert0_i64_zero: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: extract2_i32_zext_insert0_i64_zero: ; SSE41: # %bb.0: -; 
SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX-LABEL: extract2_i32_zext_insert0_i64_zero: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/cast-vsel.ll b/llvm/test/CodeGen/X86/cast-vsel.ll index 2e044548404eb..6e9e4fd00636d 100644 --- a/llvm/test/CodeGen/X86/cast-vsel.ll +++ b/llvm/test/CodeGen/X86/cast-vsel.ll @@ -31,7 +31,7 @@ define <8 x i32> @sext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> % ; SSE41-NEXT: packssdw %xmm1, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm5 ; SSE41-NEXT: pmovsxwd %xmm5, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm1, %xmm1 ; SSE41-NEXT: retq ; @@ -39,9 +39,9 @@ define <8 x i32> @sext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> % ; AVX1: # %bb.0: ; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm4, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm5, %xmm5 ; AVX1-NEXT: vblendvps %xmm1, %xmm4, %xmm5, %xmm1 ; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2 @@ -87,7 +87,7 @@ define <8 x i32> @zext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> % ; SSE41-NEXT: packssdw %xmm1, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm5 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE41-NEXT: retq ; @@ -459,7 +459,7 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind { ; SSE41-NEXT: packssdw %xmm3, %xmm0 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 ; SSE41-NEXT: pmovsxwd %xmm3, %xmm3 ; SSE41-NEXT: movdqa %xmm3, dj+4096(%rax) @@ -480,9 +480,9 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind { ; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000 ; AVX1-NEXT: vpmovsxwd %xmm2, %xmm0 ; AVX1-NEXT: vpmovsxwd %xmm3, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3 ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB6_1: # %vector.body diff --git a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll index b96f44ec3073f..16a993316d7e5 100644 --- a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll +++ b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll @@ -696,7 +696,7 @@ define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind { ; SSE2-LABEL: _clearupper16xi8b: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq 
%rbx -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm1, %r10 ; SSE2-NEXT: movq %r10, %r8 ; SSE2-NEXT: shrq $56, %r8 @@ -878,7 +878,7 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind { ; SSE2-LABEL: _clearupper32xi8b: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm2, %r10 ; SSE2-NEXT: movq %r10, %r8 ; SSE2-NEXT: shrq $56, %r8 diff --git a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll index b18c7246104d4..17d01e1d3362c 100644 --- a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll +++ b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll @@ -93,7 +93,7 @@ define i32 @movmskps_sext_v4i64(<4 x i32> %a0) { ; AVX1-LABEL: movmskps_sext_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vmovmskpd %ymm0, %eax @@ -116,7 +116,7 @@ define i32 @movmskps_sext_v8i32(<8 x i16> %a0) { ; AVX1-LABEL: movmskps_sext_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vmovmskps %ymm0, %eax diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 1a52ebfc6cd90..ce411b5e8f06b 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -2959,7 +2959,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) { ; ; SSE41-LABEL: pr38658: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxbw %xmm1, %xmm2 ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 @@ -2984,7 +2984,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) { ; ; AVX1-LABEL: pr38658: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 @@ -3058,7 +3058,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) { ; ; XOP-LABEL: pr38658: ; XOP: # %bb.0: -; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; XOP-NEXT: vpmovsxbw %xmm1, %xmm1 ; XOP-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll index a6950873daf55..383d1866aa1d9 100644 --- a/llvm/test/CodeGen/X86/combine-shl.ll +++ b/llvm/test/CodeGen/X86/combine-shl.ll @@ -311,7 +311,7 @@ define <8 x i32> @combine_vec_shl_ext_shl2(<8 x i16> %x) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmovsxwd %xmm0, %xmm2 ; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 ; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -344,7 +344,7 @@ define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) { ; SSE41: # %bb.0: ; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -377,7 +377,7 @@ define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) { ; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm0 ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll index e2f3f2dc7523f..28a73cdb6a41e 100644 --- a/llvm/test/CodeGen/X86/combine-sra.ll +++ b/llvm/test/CodeGen/X86/combine-sra.ll @@ -152,7 +152,7 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) { ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: psrad %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: psrad %xmm4, %xmm5 @@ -272,7 +272,7 @@ define <4 x i32> @combine_vec_ashr_positive(<4 x i32> %x, <4 x i32> %y) { ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: psrld %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: psrld %xmm4, %xmm5 diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll index 36fbdaf537025..2e886defafd4c 100644 --- a/llvm/test/CodeGen/X86/combine-srl.ll +++ b/llvm/test/CodeGen/X86/combine-srl.ll @@ -400,7 +400,7 @@ define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) { ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: psrld %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: psrld %xmm4, %xmm5 diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll index e6d7aac926162..c44342d00357a 100644 --- a/llvm/test/CodeGen/X86/combine-udiv.ll +++ b/llvm/test/CodeGen/X86/combine-udiv.ll @@ -233,7 +233,7 @@ define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrld %xmm4, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrld %xmm3, %xmm4 @@ -249,7 +249,7 @@ define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) { ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psrld %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: 
pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 @@ -307,7 +307,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrld %xmm4, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrld %xmm3, %xmm4 @@ -324,7 +324,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psrld %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 @@ -384,7 +384,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrld %xmm4, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrld %xmm3, %xmm4 @@ -401,7 +401,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) { ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psrld %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 diff --git a/llvm/test/CodeGen/X86/combine-urem.ll b/llvm/test/CodeGen/X86/combine-urem.ll index b21ed8ec60cef..cd0b21d02969c 100644 --- a/llvm/test/CodeGen/X86/combine-urem.ll +++ b/llvm/test/CodeGen/X86/combine-urem.ll @@ -213,7 +213,7 @@ define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) { ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: psrld %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: psrld %xmm5, %xmm6 diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll index 753aee85f319c..59101503b5a93 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -565,9 +565,9 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst ; X86-NEXT: cltd ; X86-NEXT: idivl %esi ; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X86-NEXT: movd %xmm3, %eax -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; X86-NEXT: movd %xmm3, %esi ; X86-NEXT: cltd ; X86-NEXT: idivl %esi @@ -608,9 +608,9 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst ; X64-NEXT: cltd ; X64-NEXT: idivl %ecx ; X64-NEXT: movd %eax, %xmm2 -; X64-NEXT: 
pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X64-NEXT: movd %xmm3, %eax -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; X64-NEXT: movd %xmm3, %ecx ; X64-NEXT: cltd ; X64-NEXT: idivl %ecx @@ -657,11 +657,11 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] ; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] ; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-NEXT: movd %xmm1, (%esp) ; X86-NEXT: calll __divdi3 ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload @@ -707,9 +707,9 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst ; X64-NEXT: cqto ; X64-NEXT: idivq %rcx ; X64-NEXT: movq %rax, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X64-NEXT: movq %xmm3, %rax -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; X64-NEXT: movq %xmm3, %rcx ; X64-NEXT: cqto ; X64-NEXT: idivq %rcx diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll index 43fc23c836a9b..d787f91ababba 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -565,9 +565,9 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi ; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X86-NEXT: movd %xmm3, %eax -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; X86-NEXT: movd %xmm3, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi @@ -608,9 +608,9 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divl %ecx ; X64-NEXT: movd %eax, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X64-NEXT: movd %xmm3, %eax -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; X64-NEXT: movd %xmm3, %ecx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divl %ecx @@ -657,11 +657,11 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] ; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] ; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-NEXT: movd %xmm1, (%esp) ; X86-NEXT: calll __udivdi3 ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload @@ -707,9 +707,9 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst ; X64-NEXT: xorl %edx, %edx ; 
X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X64-NEXT: movq %xmm3, %rax -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; X64-NEXT: movq %xmm3, %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx diff --git a/llvm/test/CodeGen/X86/extract-store.ll b/llvm/test/CodeGen/X86/extract-store.ll index c29fac61af3fe..d50c2ed920753 100644 --- a/llvm/test/CodeGen/X86/extract-store.ll +++ b/llvm/test/CodeGen/X86/extract-store.ll @@ -314,7 +314,7 @@ define void @extract_i64_1(i64* nocapture %dst, <2 x i64> %foo) nounwind { ; ; SSE2-X64-LABEL: extract_i64_1: ; SSE2-X64: # %bb.0: -; SSE2-X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-X64-NEXT: movq %xmm0, (%rdi) ; SSE2-X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/extractelement-index.ll b/llvm/test/CodeGen/X86/extractelement-index.ll index cf06f8dcb13e1..f2e01e93361ef 100644 --- a/llvm/test/CodeGen/X86/extractelement-index.ll +++ b/llvm/test/CodeGen/X86/extractelement-index.ll @@ -351,7 +351,7 @@ define i64 @extractelement_v2i64_0(<2 x i64> %a, i256 %i) nounwind { define i64 @extractelement_v2i64_1(<2 x i64> %a, i256 %i) nounwind { ; SSE2-LABEL: extractelement_v2i64_1: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: retq ; @@ -371,7 +371,7 @@ define i64 @extractelement_v2i64_1(<2 x i64> %a, i256 %i) nounwind { define i64 @extractelement_v4i64_1(<4 x i64> %a, i256 %i) nounwind { ; SSE2-LABEL: extractelement_v4i64_1: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: retq ; @@ -392,7 +392,7 @@ define i64 @extractelement_v4i64_1(<4 x i64> %a, i256 %i) nounwind { define i64 @extractelement_v4i64_3(<4 x i64> %a, i256 %i) nounwind { ; SSE2-LABEL: extractelement_v4i64_3: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll index 94628c70d989e..b694859b757c0 100644 --- a/llvm/test/CodeGen/X86/extractelement-load.ll +++ b/llvm/test/CodeGen/X86/extractelement-load.ll @@ -10,13 +10,13 @@ define i32 @t(<2 x i64>* %val) nounwind { ; X32-SSE2-LABEL: t: ; X32-SSE2: # %bb.0: ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: pshufd $78, (%eax), %xmm0 # xmm0 = mem[2,3,0,1] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,2,3] ; X32-SSE2-NEXT: movd %xmm0, %eax ; X32-SSE2-NEXT: retl ; ; X64-SSSE3-LABEL: t: ; X64-SSSE3: # %bb.0: -; X64-SSSE3-NEXT: pshufd $78, (%rdi), %xmm0 # xmm0 = mem[2,3,0,1] +; X64-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,2,3] ; X64-SSSE3-NEXT: movd %xmm0, %eax ; X64-SSSE3-NEXT: retq ; @@ -60,13 +60,13 @@ define void @t3(<2 x double>* %a0) { ; ; X64-SSSE3-LABEL: t3: ; X64-SSSE3: # %bb.0: # %bb -; X64-SSSE3-NEXT: movsd 8(%rdi), %xmm0 # xmm0 = mem[0],zero +; X64-SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X64-SSSE3-NEXT: movsd %xmm0, (%rax) ; X64-SSSE3-NEXT: retq ; ; X64-AVX-LABEL: t3: ; X64-AVX: # %bb.0: # %bb -; X64-AVX-NEXT: vmovsd 8(%rdi), %xmm0 # xmm0 = mem[0],zero +; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X64-AVX-NEXT: vmovsd %xmm0, (%rax) ; X64-AVX-NEXT: 
 retq
 bb:
@@ -139,7 +139,7 @@ define float @t6(<8 x float> *%a0) {
 ; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X32-SSE2-NEXT: xorps %xmm1, %xmm1
 ; X32-SSE2-NEXT: cmpeqss %xmm0, %xmm1
-; X32-SSE2-NEXT: movss {{\.LCPI.*}}, %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X32-SSE2-NEXT: andps %xmm1, %xmm2
 ; X32-SSE2-NEXT: andnps %xmm0, %xmm1
 ; X32-SSE2-NEXT: orps %xmm2, %xmm1
@@ -151,10 +151,10 @@ define float @t6(<8 x float> *%a0) {
 ;
 ; X64-SSSE3-LABEL: t6:
 ; X64-SSSE3: # %bb.0:
-; X64-SSSE3-NEXT: movshdup (%rdi), %xmm1 # xmm1 = mem[1,1,3,3]
+; X64-SSSE3-NEXT: movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
 ; X64-SSSE3-NEXT: xorps %xmm0, %xmm0
 ; X64-SSSE3-NEXT: cmpeqss %xmm1, %xmm0
-; X64-SSSE3-NEXT: movss {{.*}}(%rip), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X64-SSSE3-NEXT: andps %xmm0, %xmm2
 ; X64-SSSE3-NEXT: andnps %xmm1, %xmm0
 ; X64-SSSE3-NEXT: orps %xmm2, %xmm0
@@ -162,10 +162,10 @@ define float @t6(<8 x float> *%a0) {
 ;
 ; X64-AVX-LABEL: t6:
 ; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovss 4(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1
-; X64-AVX-NEXT: vmovss {{.*}}(%rip), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X64-AVX-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
 ; X64-AVX-NEXT: retq
 %vecload = load <8 x float>, <8 x float>* %a0, align 32
@@ -184,7 +184,7 @@ define void @PR43971(<8 x float> *%a0, float *%a1) {
 ; X32-SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; X32-SSE2-NEXT: xorps %xmm1, %xmm1
 ; X32-SSE2-NEXT: cmpltss %xmm0, %xmm1
-; X32-SSE2-NEXT: movss (%eax), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X32-SSE2-NEXT: andps %xmm1, %xmm2
 ; X32-SSE2-NEXT: andnps %xmm0, %xmm1
 ; X32-SSE2-NEXT: orps %xmm2, %xmm1
@@ -193,10 +193,10 @@ define void @PR43971(<8 x float> *%a0, float *%a1) {
 ;
 ; X64-SSSE3-LABEL: PR43971:
 ; X64-SSSE3: # %bb.0: # %entry
-; X64-SSSE3-NEXT: movss 24(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-SSSE3-NEXT: xorps %xmm1, %xmm1
 ; X64-SSSE3-NEXT: cmpltss %xmm0, %xmm1
-; X64-SSSE3-NEXT: movss (%rsi), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X64-SSSE3-NEXT: andps %xmm1, %xmm2
 ; X64-SSSE3-NEXT: andnps %xmm0, %xmm1
 ; X64-SSSE3-NEXT: orps %xmm2, %xmm1
@@ -205,10 +205,10 @@ define void @PR43971(<8 x float> *%a0, float *%a1) {
 ;
 ; X64-AVX-LABEL: PR43971:
 ; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: vmovss 24(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX-NEXT: vcmpltss %xmm0, %xmm1, %xmm1
-; X64-AVX-NEXT: vmovss (%rsi), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X64-AVX-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
 ; X64-AVX-NEXT: vmovss %xmm0, (%rsi)
 ; X64-AVX-NEXT: retq
@@ -231,7 +231,7 @@ define float @PR43971_1(<8 x float> *%a0) nounwind {
 ; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X32-SSE2-NEXT: xorps %xmm1, %xmm1
 ; X32-SSE2-NEXT: cmpeqss %xmm0, %xmm1
-; X32-SSE2-NEXT: movss {{\.LCPI.*}}, %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X32-SSE2-NEXT: andps %xmm1, %xmm2
 ; X32-SSE2-NEXT: andnps %xmm0, %xmm1
 ; X32-SSE2-NEXT: orps %xmm2, %xmm1
@@ -242,10 +242,10 @@ define float @PR43971_1(<8 x float> *%a0) nounwind {
 ;
 ; X64-SSSE3-LABEL: PR43971_1:
 ; X64-SSSE3: # %bb.0: # %entry
-; X64-SSSE3-NEXT: movshdup (%rdi), %xmm1 # xmm1 = mem[1,1,3,3]
+; X64-SSSE3-NEXT: movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
 ; X64-SSSE3-NEXT: xorps %xmm0, %xmm0
 ; X64-SSSE3-NEXT: cmpeqss %xmm1, %xmm0
-; X64-SSSE3-NEXT: movss {{.*}}(%rip), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X64-SSSE3-NEXT: andps %xmm0, %xmm2
 ; X64-SSSE3-NEXT: andnps %xmm1, %xmm0
 ; X64-SSSE3-NEXT: orps %xmm2, %xmm0
@@ -253,10 +253,10 @@ define float @PR43971_1(<8 x float> *%a0) nounwind {
 ;
 ; X64-AVX-LABEL: PR43971_1:
 ; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: vmovss 4(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1
-; X64-AVX-NEXT: vmovss {{.*}}(%rip), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X64-AVX-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
 ; X64-AVX-NEXT: retq
 entry:
diff --git a/llvm/test/CodeGen/X86/gather-addresses.ll b/llvm/test/CodeGen/X86/gather-addresses.ll
index 1cd85e6e582c4..00f84a6e4b159 100644
--- a/llvm/test/CodeGen/X86/gather-addresses.ll
+++ b/llvm/test/CodeGen/X86/gather-addresses.ll
@@ -17,7 +17,7 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
 ; LIN-SSE2-NEXT: movd %xmm0, %eax
 ; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; LIN-SSE2-NEXT: movd %xmm1, %ecx
-; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; LIN-SSE2-NEXT: movd %xmm1, %edx
 ; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; LIN-SSE2-NEXT: movd %xmm0, %esi
@@ -56,7 +56,7 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
 ; WIN-SSE2-NEXT: movd %xmm0, %r8d
 ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; WIN-SSE2-NEXT: movd %xmm1, %r9d
-; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; WIN-SSE2-NEXT: movd %xmm1, %r10d
 ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; WIN-SSE2-NEXT: movd %xmm0, %edx
@@ -141,7 +141,7 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
 ; LIN-SSE2-NEXT: movd %xmm0, %eax
 ; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; LIN-SSE2-NEXT: movd %xmm1, %edx
-; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; LIN-SSE2-NEXT: movd %xmm1, %esi
 ; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; LIN-SSE2-NEXT: movd %xmm0, %edi
@@ -184,7 +184,7 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
 ; WIN-SSE2-NEXT: movd %xmm0, %eax
 ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; WIN-SSE2-NEXT: movd %xmm1, %ecx
-; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; WIN-SSE2-NEXT: movd %xmm1, %r8d
 ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; WIN-SSE2-NEXT: movd %xmm0, %edx
diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll
index c4d470a6cd69e..e36c0479448ea 100644
--- a/llvm/test/CodeGen/X86/haddsub-2.ll
+++ b/llvm/test/CodeGen/X86/haddsub-2.ll
@@ -127,7 +127,7 @@ define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
 ; SSE3-NEXT: movd %xmm2, %ecx
 ; SSE3-NEXT: addl %eax, %ecx
-; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; SSE3-NEXT: movd %xmm2, %eax
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %edx
@@ -136,7 +136,7 @@ define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %esi
 ; SSE3-NEXT: addl %eax, %esi
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %eax
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %edi
@@ -181,7 +181,7 @@ define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
 define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
 ; SSE3-LABEL: phadd_d_test2:
 ; SSE3: # %bb.0:
-; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; SSE3-NEXT: movd %xmm2, %eax
 ; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
 ; SSE3-NEXT: movd %xmm2, %ecx
@@ -192,7 +192,7 @@ define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
 ; SSE3-NEXT: addl %eax, %edx
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %eax
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %esi
 ; SSE3-NEXT: addl %eax, %esi
 ; SSE3-NEXT: movd %esi, %xmm0
@@ -243,7 +243,7 @@ define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
 ; SSE3-NEXT: movd %xmm2, %ecx
 ; SSE3-NEXT: subl %ecx, %eax
-; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; SSE3-NEXT: movd %xmm2, %ecx
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %edx
@@ -252,7 +252,7 @@ define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %esi
 ; SSE3-NEXT: subl %esi, %edx
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %esi
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %edi
@@ -297,7 +297,7 @@ define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
 define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
 ; SSE3-LABEL: phsub_d_test2:
 ; SSE3: # %bb.0:
-; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; SSE3-NEXT: movd %xmm2, %eax
 ; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
 ; SSE3-NEXT: movd %xmm2, %ecx
@@ -306,7 +306,7 @@ define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %edx
 ; SSE3-NEXT: subl %edx, %ecx
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %edx
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %esi
@@ -513,7 +513,7 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
 ; SSE3-NEXT: movd %xmm4, %r8d
 ; SSE3-NEXT: addl %ecx, %r8d
-; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
 ; SSE3-NEXT: movd %xmm4, %edx
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %r9d
@@ -522,7 +522,7 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %esi
 ; SSE3-NEXT: addl %edx, %esi
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %edx
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %edi
@@ -531,7 +531,7 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %r10d
 ; SSE3-NEXT: addl %eax, %r10d
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %eax
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %ecx
@@ -540,7 +540,7 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %edx
 ; SSE3-NEXT: addl %eax, %edx
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %r11d
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %eax
@@ -819,7 +819,7 @@ define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
 ; SSE-NEXT: movd %xmm2, %ecx
 ; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; SSE-NEXT: movd %xmm2, %ecx
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE-NEXT: movd %xmm0, %edx
@@ -830,7 +830,7 @@ define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
 ; SSE-NEXT: subl %esi, %edx
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; SSE-NEXT: movd %xmm0, %esi
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE-NEXT: movd %xmm0, %edi
 ; SSE-NEXT: subl %edi, %esi
 ; SSE-NEXT: movd %esi, %xmm0
@@ -1133,7 +1133,7 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
 ; SSE3-NEXT: movd %xmm4, %r8d
 ; SSE3-NEXT: addl %ecx, %r8d
-; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
 ; SSE3-NEXT: movd %xmm4, %edx
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %r9d
@@ -1142,7 +1142,7 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %esi
 ; SSE3-NEXT: addl %edx, %esi
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %edx
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %edi
@@ -1151,7 +1151,7 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %r10d
 ; SSE3-NEXT: addl %eax, %r10d
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %eax
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %ecx
@@ -1160,7 +1160,7 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %edx
 ; SSE3-NEXT: addl %eax, %edx
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
 ; SSE3-NEXT: movd %xmm0, %r11d
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
 ; SSE3-NEXT: movd %xmm0, %eax
diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
index 6663459f49d56..740e4b291f5f9 100644
--- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
@@ -554,7 +554,7 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; SSE2-NEXT: movdqa %xmm3, %xmm5
 ; SSE2-NEXT: psrld %xmm2, %xmm5
 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE2-NEXT: movdqa %xmm3, %xmm4
 ; SSE2-NEXT: psrld %xmm2, %xmm4
@@ -640,7 +640,7 @@ define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwi
 ; SSE2-NEXT: movdqa %xmm3, %xmm5
 ; SSE2-NEXT: psrld %xmm2, %xmm5
 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE2-NEXT: movdqa %xmm3, %xmm4
 ; SSE2-NEXT: psrld %xmm2, %xmm4
@@ -677,7 +677,7 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi
 ; SSE2-NEXT: movdqa %xmm3, %xmm5
 ; SSE2-NEXT: psrld %xmm2, %xmm5
 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
 ; SSE2-NEXT: movdqa %xmm3, %xmm4
 ; SSE2-NEXT: psrld %xmm2, %xmm4
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-add.ll b/llvm/test/CodeGen/X86/horizontal-reduce-add.ll
index 64d8de9aead78..dab7785c85cc1 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-add.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-add.ll
@@ -11,7 +11,7 @@
 define i32 @PR37890_v4i32(<4 x i32> %a) {
 ; SSE2-LABEL: PR37890_v4i32:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -20,7 +20,7 @@ define i32 @PR37890_v4i32(<4 x i32> %a) {
 ;
 ; SSSE3-SLOW-LABEL: PR37890_v4i32:
 ; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
@@ -36,7 +36,7 @@ define i32 @PR37890_v4i32(<4 x i32> %a) {
 ;
 ; AVX1-SLOW-LABEL: PR37890_v4i32:
 ; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -52,7 +52,7 @@ define i32 @PR37890_v4i32(<4 x i32> %a) {
 ;
 ; AVX2-LABEL: PR37890_v4i32:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -70,7 +70,7 @@ define i32 @PR37890_v4i32(<4 x i32> %a) {
 define i16 @PR37890_v8i16(<8 x i16> %a) {
 ; SSE2-LABEL: PR37890_v8i16:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: paddw %xmm0, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE2-NEXT: paddw %xmm1, %xmm0
@@ -83,7 +83,7 @@ define i16 @PR37890_v8i16(<8 x i16> %a) {
 ;
 ; SSSE3-SLOW-LABEL: PR37890_v8i16:
 ; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSSE3-SLOW-NEXT: paddw %xmm0, %xmm1
 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
@@ -105,7 +105,7 @@ define i16 @PR37890_v8i16(<8 x i16> %a) {
 ;
 ; AVX1-SLOW-LABEL: PR37890_v8i16:
 ; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -126,7 +126,7 @@ define i16 @PR37890_v8i16(<8 x i16> %a) {
 ;
 ; AVX2-LABEL: PR37890_v8i16:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -151,7 +151,7 @@ define i32 @PR37890_v8i32(<8 x i32> %a) {
 ; SSE2-LABEL: PR37890_v8i32:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -161,7 +161,7 @@ define i32 @PR37890_v8i32(<8 x i32> %a) {
 ; SSSE3-SLOW-LABEL: PR37890_v8i32:
 ; SSSE3-SLOW: # %bb.0:
 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
@@ -180,7 +180,7 @@ define i32 @PR37890_v8i32(<8 x i32> %a) {
 ; AVX1-SLOW: # %bb.0:
 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -202,7 +202,7 @@ define i32 @PR37890_v8i32(<8 x i32> %a) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -225,7 +225,7 @@ define i16 @PR37890_v16i16(<16 x i16> %a) {
 ; SSE2-LABEL: PR37890_v16i16:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: paddw %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: paddw %xmm0, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE2-NEXT: paddw %xmm1, %xmm0
@@ -239,7 +239,7 @@ define i16 @PR37890_v16i16(<16 x i16> %a) {
 ; SSSE3-SLOW-LABEL: PR37890_v16i16:
 ; SSSE3-SLOW: # %bb.0:
 ; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSSE3-SLOW-NEXT: paddw %xmm0, %xmm1
 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
@@ -264,7 +264,7 @@ define i16 @PR37890_v16i16(<16 x i16> %a) {
 ; AVX1-SLOW: # %bb.0:
 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -291,7 +291,7 @@ define i16 @PR37890_v16i16(<16 x i16> %a) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -322,7 +322,7 @@ define i32 @PR37890_v16i32(<16 x i32> %a) {
 ; SSE2-NEXT: paddd %xmm3, %xmm1
 ; SSE2-NEXT: paddd %xmm2, %xmm1
 ; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: paddd %xmm1, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -334,7 +334,7 @@ define i32 @PR37890_v16i32(<16 x i32> %a) {
 ; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1
 ; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1
 ; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
@@ -346,7 +346,7 @@ define i32 @PR37890_v16i32(<16 x i32> %a) {
 ; SSSE3-FAST-NEXT: paddd %xmm3, %xmm1
 ; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1
 ; SSSE3-FAST-NEXT: paddd %xmm0, %xmm1
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSSE3-FAST-NEXT: paddd %xmm1, %xmm0
 ; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
 ; SSSE3-FAST-NEXT: movd %xmm0, %eax
@@ -359,7 +359,7 @@ define i32 @PR37890_v16i32(<16 x i32> %a) {
 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -385,7 +385,7 @@ define i32 @PR37890_v16i32(<16 x i32> %a) {
 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
index dc8f60248c672..a17b1db2c1783 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
@@ -16,7 +16,7 @@
 define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v2i64:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -40,7 +40,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X86-SSE42-LABEL: test_reduce_v2i64:
 ; X86-SSE42: ## %bb.0:
 ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
 ; X86-SSE42-NEXT: movd %xmm2, %eax
@@ -49,7 +49,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X86-AVX-LABEL: test_reduce_v2i64:
 ; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT: vmovd %xmm0, %eax
@@ -58,7 +58,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v2i64:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -80,7 +80,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X64-SSE42-LABEL: test_reduce_v2i64:
 ; X64-SSE42: ## %bb.0:
 ; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
 ; X64-SSE42-NEXT: movq %xmm2, %rax
@@ -88,7 +88,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X64-AVX1-LABEL: test_reduce_v2i64:
 ; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X64-AVX1-NEXT: vmovq %xmm0, %rax
@@ -96,7 +96,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X64-AVX2-LABEL: test_reduce_v2i64:
 ; X64-AVX2: ## %bb.0:
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X64-AVX2-NEXT: vmovq %xmm0, %rax
@@ -104,7 +104,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X64-AVX512-LABEL: test_reduce_v2i64:
 ; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vmovq %xmm0, %rax
 ; X64-AVX512-NEXT: retq
@@ -118,7 +118,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v4i32:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
 ; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -135,7 +135,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X86-SSE42-LABEL: test_reduce_v4i32:
 ; X86-SSE42: ## %bb.0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
@@ -144,7 +144,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X86-AVX-LABEL: test_reduce_v4i32:
 ; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -153,7 +153,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v4i32:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
 ; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
 ; X64-SSE2-NEXT: pand %xmm2, %xmm0
@@ -170,7 +170,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X64-SSE42-LABEL: test_reduce_v4i32:
 ; X64-SSE42: ## %bb.0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
@@ -179,7 +179,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X64-AVX-LABEL: test_reduce_v4i32:
 ; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -198,7 +198,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v8i16:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -229,7 +229,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v8i16:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -273,7 +273,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 define i8 @test_reduce_v16i8(<16 x i8> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v16i8:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
 ; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -328,7 +328,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v16i8:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
 ; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
 ; X64-SSE2-NEXT: pand %xmm2, %xmm0
@@ -419,7 +419,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-SSE2-NEXT: pand %xmm4, %xmm0
 ; X86-SSE2-NEXT: pandn %xmm1, %xmm4
 ; X86-SSE2-NEXT: por %xmm0, %xmm4
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT: pxor %xmm0, %xmm2
@@ -444,7 +444,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
 ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -457,7 +457,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX1-NEXT: vmovd %xmm0, %eax
@@ -470,7 +470,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX2-NEXT: vmovd %xmm0, %eax
@@ -496,7 +496,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-SSE2-NEXT: pand %xmm4, %xmm0
 ; X64-SSE2-NEXT: pandn %xmm1, %xmm4
 ; X64-SSE2-NEXT: por %xmm0, %xmm4
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT: pxor %xmm0, %xmm2
@@ -519,7 +519,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
 ; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -531,7 +531,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X64-AVX1-NEXT: vmovq %xmm0, %rax
@@ -543,7 +543,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X64-AVX2-NEXT: vmovq %xmm0, %rax
@@ -554,7 +554,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-AVX512: ## %bb.0:
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vmovq %xmm0, %rax
 ; X64-AVX512-NEXT: vzeroupper
@@ -577,7 +577,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-SSE2-NEXT: pand %xmm2, %xmm0
 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2
 ; X86-SSE2-NEXT: por %xmm0, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
 ; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
 ; X86-SSE2-NEXT: pand %xmm1, %xmm2
@@ -595,7 +595,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-SSE42-LABEL: test_reduce_v8i32:
 ; X86-SSE42: ## %bb.0:
 ; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
@@ -606,7 +606,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-AVX1: ## %bb.0:
 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -618,7 +618,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-AVX2: ## %bb.0:
 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -633,7 +633,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-SSE2-NEXT: pand %xmm2, %xmm0
 ; X64-SSE2-NEXT: pandn %xmm1, %xmm2
 ; X64-SSE2-NEXT: por %xmm0, %xmm2
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
 ; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
 ; X64-SSE2-NEXT: pand %xmm1, %xmm2
@@ -651,7 +651,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-SSE42-LABEL: test_reduce_v8i32:
 ; X64-SSE42: ## %bb.0:
 ; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
@@ -662,7 +662,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-AVX1: ## %bb.0:
 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -674,7 +674,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-AVX2: ## %bb.0:
 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -686,7 +686,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-AVX512: ## %bb.0:
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -710,7 +710,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v16i16:
 ; X86-SSE2: ## %bb.0:
 ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -758,7 +758,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
 ; X64-SSE2-LABEL: test_reduce_v16i16:
 ; X64-SSE2: ## %bb.0:
 ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -838,7 +838,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
 ; X86-SSE2-NEXT: pand %xmm2, %xmm0
 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2
 ; X86-SSE2-NEXT: por %xmm0, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
 ; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
 ; X86-SSE2-NEXT: pand %xmm1, %xmm2
@@ -916,7 +916,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
 ; X64-SSE2-NEXT: pand %xmm2, %xmm0
 ; X64-SSE2-NEXT: pandn %xmm1, %xmm2
 ; X64-SSE2-NEXT: por %xmm0, %xmm2
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
 ; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
 ; X64-SSE2-NEXT: pand %xmm1, %xmm2
@@ -1072,7 +1072,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE2-NEXT: pand %xmm1, %xmm5
 ; X86-SSE2-NEXT: pandn %xmm0, %xmm1
 ; X86-SSE2-NEXT: por %xmm5, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2
 ; X86-SSE2-NEXT: pxor %xmm0, %xmm4
@@ -1104,7 +1104,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE42-NEXT: movapd %xmm2, %xmm0
 ; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
 ; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
@@ -1122,7 +1122,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1
 ; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX1-NEXT: vmovd %xmm0, %eax
@@ -1137,7 +1137,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX2-NEXT: vmovd %xmm0, %eax
@@ -1193,7 +1193,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-SSE2-NEXT: pand %xmm1, %xmm5
 ; X64-SSE2-NEXT: pandn %xmm2, %xmm1
 ; X64-SSE2-NEXT: por %xmm5, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
 ; X64-SSE2-NEXT: pxor %xmm4, %xmm2
 ; X64-SSE2-NEXT: pxor %xmm0, %xmm4
@@ -1223,7 +1223,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-SSE42-NEXT: movapd %xmm2, %xmm0
 ; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
 ; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
@@ -1240,7 +1240,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1
 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X64-AVX1-NEXT: vmovq %xmm0, %rax
@@ -1254,7 +1254,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X64-AVX2-NEXT: vmovq %xmm0, %rax
@@ -1267,7 +1267,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vmovq %xmm0, %rax
 ; X64-AVX512-NEXT: vzeroupper
@@ -1303,7 +1303,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-SSE2-NEXT: pand %xmm0, %xmm1
 ; X86-SSE2-NEXT: pandn %xmm4, %xmm0
 ; X86-SSE2-NEXT: por %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
 ; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -1323,7 +1323,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-SSE42-NEXT: pmaxsd %xmm3, %xmm1
 ; X86-SSE42-NEXT: pmaxsd %xmm2, %xmm1
 ; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
@@ -1337,7 +1337,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
 ; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -1350,7 +1350,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -1375,7 +1375,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-SSE2-NEXT: pand %xmm0, %xmm1
 ; X64-SSE2-NEXT: pandn %xmm4, %xmm0
 ; X64-SSE2-NEXT: por %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
 ; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
 ; X64-SSE2-NEXT: pand %xmm2, %xmm0
@@ -1395,7 +1395,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-SSE42-NEXT: pmaxsd %xmm3, %xmm1
 ; X64-SSE42-NEXT: pmaxsd %xmm2, %xmm1
 ; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
@@ -1409,7 +1409,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
 ; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -1422,7 +1422,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -1436,7 +1436,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -1465,7 +1465,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X86-SSE2-NEXT: pmaxsw %xmm3, %xmm1
 ; X86-SSE2-NEXT: pmaxsw %xmm2, %xmm1
 ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
@@ -1521,7 +1521,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X64-SSE2-NEXT: pmaxsw %xmm3, %xmm1
 ; X64-SSE2-NEXT: pmaxsw %xmm2, %xmm1
 ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
@@ -1622,7 +1622,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X86-SSE2-NEXT: pand %xmm0, %xmm1
 ; X86-SSE2-NEXT: pandn %xmm4, %xmm0
 ; X86-SSE2-NEXT: por %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
 ; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -1716,7 +1716,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X64-SSE2-NEXT: pand %xmm0, %xmm1
 ; X64-SSE2-NEXT: pandn %xmm4, %xmm0
 ; X64-SSE2-NEXT: por %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
 ; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
 ; X64-SSE2-NEXT: pand %xmm2, %xmm0
@@ -1837,7 +1837,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v16i16_v8i16:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -1869,7 +1869,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -1914,7 +1914,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
 define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v32i16_v8i16:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -1946,7 +1946,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
@@ -1991,7 +1991,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
 define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v32i8_v16i8:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
 ; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -2047,7 +2047,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
 ; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
 ; X64-SSE2-NEXT: pand %xmm2, %xmm0
@@ -2119,7 +2119,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
 define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v64i8_v16i8:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
 ; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -2175,7 +2175,7 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
 ; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
 ; X64-SSE2-NEXT: pand %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
index 987da0f68082c..17c3a9fd4a010 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
@@ -16,7 +16,7 @@
 define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v2i64:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -40,7 +40,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X86-SSE42-LABEL: test_reduce_v2i64:
 ; X86-SSE42: ## %bb.0:
 ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X86-SSE42-NEXT: movdqa %xmm2, %xmm0
 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -50,7 +50,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X86-AVX-LABEL: test_reduce_v2i64:
 ; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
 ; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT: vmovd %xmm0, %eax
@@ -59,7 +59,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v2i64:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm3
@@ -81,7 +81,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X64-SSE42-LABEL: test_reduce_v2i64:
 ; X64-SSE42: ## %bb.0:
 ; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X64-SSE42-NEXT: movdqa %xmm2, %xmm0
 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -90,7 +90,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X64-AVX1-LABEL: test_reduce_v2i64:
 ; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X64-AVX1-NEXT: vmovq %xmm0, %rax
@@ -98,7 +98,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X64-AVX2-LABEL: test_reduce_v2i64:
 ; X64-AVX2: ## %bb.0:
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X64-AVX2-NEXT: vmovq %xmm0, %rax
@@ -106,7 +106,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ;
 ; X64-AVX512-LABEL: test_reduce_v2i64:
 ; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vmovq %xmm0, %rax
 ; X64-AVX512-NEXT: retq
@@ -120,7 +120,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v4i32:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
 ; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
 ; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -137,7 +137,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X86-SSE42-LABEL: test_reduce_v4i32:
 ; X86-SSE42: ## %bb.0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
@@ -146,7 +146,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X86-AVX-LABEL: test_reduce_v4i32:
 ; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -155,7 +155,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v4i32:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
 ; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
 ; X64-SSE2-NEXT: pand %xmm2, %xmm0
@@ -172,7 +172,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X64-SSE42-LABEL: test_reduce_v4i32:
 ; X64-SSE42: ## %bb.0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
@@ -181,7 +181,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ;
 ; X64-AVX-LABEL: test_reduce_v4i32:
 ; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -200,7 +200,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v8i16:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -231,7 +231,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v8i16:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -275,7 +275,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 define i8 @test_reduce_v16i8(<16 x i8> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v16i8:
 ; X86-SSE2: ## %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
 ; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
 ; X86-SSE2-NEXT: pand %xmm2, %xmm0
@@ -330,7 +330,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
 ;
 ; X64-SSE2-LABEL: test_reduce_v16i8:
 ; X64-SSE2: ## %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
 ; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
 ; X64-SSE2-NEXT: pand %xmm2, %xmm0
@@ -421,7 +421,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-SSE2-NEXT: pand %xmm4, %xmm0
 ; X86-SSE2-NEXT: pandn %xmm1, %xmm4
 ; X86-SSE2-NEXT: por %xmm0, %xmm4
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT: pxor %xmm0, %xmm2
@@ -447,7 +447,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
 ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; X86-SSE42-NEXT: movdqa %xmm2, %xmm0
 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -460,7 +460,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX1-NEXT: vmovd %xmm0, %eax
@@ -473,7 +473,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX2-NEXT: vmovd %xmm0, %eax
@@ -499,7 +499,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-SSE2-NEXT: pand %xmm4, %xmm0
 ; X64-SSE2-NEXT: pandn %xmm1, %xmm4
 ; X64-SSE2-NEXT: por %xmm0, %xmm4
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT: pxor %xmm0, %xmm2
@@ -523,7 +523,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
 ; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; X64-SSE42-NEXT: movdqa %xmm2, %xmm0
 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -535,7 +535,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X64-AVX1-NEXT: vmovq %xmm0, %rax
@@ -547,7 +547,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X64-AVX2-NEXT: vmovq %xmm0, %rax
@@ -558,7 +558,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-AVX512: ## %bb.0:
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vmovq %xmm0, %rax
 ; X64-AVX512-NEXT: vzeroupper
@@ -581,7 +581,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-SSE2-NEXT: pand %xmm2, %xmm0
 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2
 ; X86-SSE2-NEXT: por %xmm0, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
 ; X86-SSE2-NEXT: pand %xmm1, %xmm2
@@ -599,7 +599,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-SSE42-LABEL: test_reduce_v8i32:
 ; X86-SSE42: ## %bb.0:
 ; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
@@ -610,7 +610,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-AVX1: ## %bb.0:
 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -622,7 +622,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-AVX2: ## %bb.0:
 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -637,7 +637,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-SSE2-NEXT: pand %xmm2, %xmm0
 ; X64-SSE2-NEXT: pandn %xmm1, %xmm2
 ; X64-SSE2-NEXT: por %xmm0, %xmm2
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
 ; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
 ; X64-SSE2-NEXT: pand %xmm1, %xmm2
@@ -655,7 +655,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-SSE42-LABEL: test_reduce_v8i32:
 ; X64-SSE42: ## %bb.0:
 ; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
@@ -666,7 +666,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-AVX1: ## %bb.0:
 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -678,7 +678,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-AVX2: ## %bb.0:
 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -690,7 +690,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X64-AVX512: ## %bb.0:
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -714,7 +714,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v16i16:
 ; X86-SSE2: ## %bb.0:
 ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -762,7 +762,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
 ; X64-SSE2-LABEL: test_reduce_v16i16:
 ; X64-SSE2: ## %bb.0:
 ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
@@ -842,7 +842,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
 ; X86-SSE2-NEXT: pand %xmm2, %xmm0
 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2
 ; X86-SSE2-NEXT: por %xmm0, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
 ; X86-SSE2-NEXT: pand %xmm1, %xmm2
@@ -920,7 +920,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
 ; X64-SSE2-NEXT: pand %xmm2, %xmm0
 ; X64-SSE2-NEXT: pandn %xmm1, %xmm2
 ; X64-SSE2-NEXT: por %xmm0, %xmm2
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
 ; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
 ; X64-SSE2-NEXT: pand %xmm1, %xmm2
@@ -1076,7 +1076,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE2-NEXT: pand %xmm0, %xmm1
 ; X86-SSE2-NEXT: pandn %xmm5, %xmm0
 ; X86-SSE2-NEXT: por %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2
 ; X86-SSE2-NEXT: pxor %xmm1, %xmm4
@@ -1108,7 +1108,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE42-NEXT: movapd %xmm3, %xmm0
 ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
 ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
 ; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
@@ -1126,7 +1126,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm3, %xmm1
 ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX1-NEXT: vmovd %xmm0, %eax
@@ -1141,7 +1141,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX2-NEXT: vmovd %xmm0, %eax
@@ -1197,7 +1197,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-SSE2-NEXT: pand %xmm1, %xmm3
 ; X64-SSE2-NEXT: pandn %xmm5, %xmm1
 ; X64-SSE2-NEXT: por %xmm3, %xmm1
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
 ; X64-SSE2-NEXT: pxor %xmm4, %xmm2
 ; X64-SSE2-NEXT: pxor %xmm0, %xmm4
@@ -1227,7 +1227,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-SSE42-NEXT: movapd %xmm3, %xmm0
 ; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
 ; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
 ; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
@@ -1244,7 +1244,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm3, %xmm1
 ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X64-AVX1-NEXT: vmovq %xmm0, %rax
@@ -1258,7 +1258,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X64-AVX2-NEXT: vmovq %xmm0, %rax
@@ -1271,7 +1271,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vmovq %xmm0, %rax
 ; X64-AVX512-NEXT: vzeroupper
@@ -1307,7 +1307,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-SSE2-NEXT: pand %xmm1, %xmm4
 ; X86-SSE2-NEXT: pandn %xmm0, %xmm1
 ; X86-SSE2-NEXT: por %xmm4, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
 ; X86-SSE2-NEXT: pand %xmm2, %xmm1
@@ -1327,7 +1327,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-SSE42-NEXT: pminsd %xmm3, %xmm1
 ; X86-SSE42-NEXT: pminsd %xmm2, %xmm1
 ; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
@@
-1341,7 +1341,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X86-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2 ; X86-AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -1354,7 +1354,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -1379,7 +1379,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-SSE2-NEXT: pand %xmm1, %xmm4 ; X64-SSE2-NEXT: pandn %xmm0, %xmm1 ; X64-SSE2-NEXT: por %xmm4, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X64-SSE2-NEXT: pand %xmm2, %xmm1 @@ -1399,7 +1399,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-SSE42-NEXT: pminsd %xmm3, %xmm1 ; X64-SSE42-NEXT: pminsd %xmm2, %xmm1 ; X64-SSE42-NEXT: pminsd %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X64-SSE42-NEXT: pminsd %xmm1, %xmm0 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-SSE42-NEXT: pminsd %xmm0, %xmm1 @@ -1413,7 +1413,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2 ; X64-AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -1426,7 +1426,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -1440,7 +1440,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -1469,7 +1469,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X86-SSE2-NEXT: pminsw %xmm3, %xmm1 ; X86-SSE2-NEXT: pminsw %xmm2, %xmm1 ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; 
X86-SSE2-NEXT: pminsw %xmm0, %xmm1 @@ -1525,7 +1525,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X64-SSE2-NEXT: pminsw %xmm3, %xmm1 ; X64-SSE2-NEXT: pminsw %xmm2, %xmm1 ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 @@ -1626,7 +1626,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm1, %xmm4 ; X86-SSE2-NEXT: pandn %xmm0, %xmm1 ; X86-SSE2-NEXT: por %xmm4, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 @@ -1720,7 +1720,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm1, %xmm4 ; X64-SSE2-NEXT: pandn %xmm0, %xmm1 ; X64-SSE2-NEXT: por %xmm4, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; X64-SSE2-NEXT: pand %xmm2, %xmm1 @@ -1841,7 +1841,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -1873,7 +1873,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -1918,7 +1918,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -1950,7 +1950,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -1995,7 +1995,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; X86-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm0 @@ -2051,7 +2051,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: 
movdqa %xmm1, %xmm2 ; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 ; X64-SSE2-NEXT: pand %xmm2, %xmm0 @@ -2123,7 +2123,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; X86-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm0 @@ -2179,7 +2179,7 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 ; X64-SSE2-NEXT: pand %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll index 6e59cd046cb0a..c69551aa3d883 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -16,7 +16,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X86-SSE2-LABEL: test_reduce_v2i64: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 @@ -40,7 +40,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X86-SSE42-LABEL: test_reduce_v2i64: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] ; X86-SSE42-NEXT: pxor %xmm3, %xmm0 ; X86-SSE42-NEXT: pxor %xmm2, %xmm3 @@ -52,7 +52,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X86-AVX1-LABEL: test_reduce_v2i64: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X86-AVX1-NEXT: ## xmm2 = mem[0,0] ; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 @@ -65,7 +65,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X86-AVX2-LABEL: test_reduce_v2i64: ; X86-AVX2: ## %bb.0: -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 @@ -77,7 +77,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v2i64: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X64-SSE2-NEXT: pxor %xmm2, %xmm3 @@ -99,7 +99,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X64-SSE42-LABEL: test_reduce_v2i64: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] ; X64-SSE42-NEXT: pxor %xmm3, %xmm0 ; 
X64-SSE42-NEXT: pxor %xmm2, %xmm3 @@ -110,7 +110,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-AVX1-LABEL: test_reduce_v2i64: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 @@ -121,7 +121,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-AVX2-LABEL: test_reduce_v2i64: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 @@ -132,7 +132,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-AVX512-LABEL: test_reduce_v2i64: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: retq @@ -146,7 +146,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; X86-SSE2-LABEL: test_reduce_v4i32: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 @@ -169,7 +169,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; ; X86-SSE42-LABEL: test_reduce_v4i32: ; X86-SSE42: ## %bb.0: -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0 @@ -178,7 +178,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; ; X86-AVX-LABEL: test_reduce_v4i32: ; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -187,7 +187,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v4i32: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X64-SSE2-NEXT: pxor %xmm2, %xmm3 @@ -210,7 +210,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; ; X64-SSE42-LABEL: test_reduce_v4i32: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0 @@ -219,7 +219,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; ; X64-AVX-LABEL: test_reduce_v4i32: ; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ 
-238,7 +238,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v8i16: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -275,7 +275,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v8i16: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -344,7 +344,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; X86-SSE2-LABEL: test_reduce_v16i8: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 @@ -385,7 +385,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v16i8: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 @@ -485,7 +485,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm4, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm4 ; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 @@ -514,7 +514,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE42-NEXT: pxor %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE42-NEXT: pxor %xmm3, %xmm0 ; X86-SSE42-NEXT: pxor %xmm2, %xmm3 @@ -533,7 +533,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -551,7 +551,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -579,7 +579,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-SSE2-NEXT: pand %xmm4, %xmm0 ; X64-SSE2-NEXT: pandn %xmm1, %xmm4 ; X64-SSE2-NEXT: por %xmm0, %xmm4 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm4[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pxor %xmm0, %xmm2 @@ -606,7 +606,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-SSE42-NEXT: pxor %xmm3, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X64-SSE42-NEXT: pxor %xmm3, %xmm0 ; X64-SSE42-NEXT: pxor %xmm2, %xmm3 @@ -623,7 +623,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -640,7 +640,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -653,7 +653,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: vzeroupper @@ -680,7 +680,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X86-SSE2-NEXT: pand %xmm4, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm4 ; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -703,7 +703,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X86-SSE42-LABEL: test_reduce_v8i32: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0 @@ -714,7 +714,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -726,7 +726,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 
; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -745,7 +745,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X64-SSE2-NEXT: pand %xmm4, %xmm0 ; X64-SSE2-NEXT: pandn %xmm1, %xmm4 ; X64-SSE2-NEXT: por %xmm0, %xmm4 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -768,7 +768,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X64-SSE42-LABEL: test_reduce_v8i32: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0 @@ -779,7 +779,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X64-AVX1: ## %bb.0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -791,7 +791,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -803,7 +803,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -830,7 +830,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -885,7 +885,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -965,7 +965,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-SSE2-LABEL: test_reduce_v32i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 @@ -1026,7 +1026,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-SSE2-LABEL: test_reduce_v32i8: ; X64-SSE2: ## %bb.0: ; X64-SSE2-NEXT: pmaxub 
%xmm1, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 @@ -1169,7 +1169,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm1, %xmm5 ; X86-SSE2-NEXT: pandn %xmm0, %xmm1 ; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 ; X86-SSE2-NEXT: pxor %xmm0, %xmm4 @@ -1210,7 +1210,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 ; X86-SSE42-NEXT: pxor %xmm5, %xmm0 ; X86-SSE42-NEXT: pxor %xmm1, %xmm5 @@ -1238,7 +1238,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 ; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1 ; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -1260,7 +1260,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -1318,7 +1318,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-SSE2-NEXT: pand %xmm1, %xmm5 ; X64-SSE2-NEXT: pandn %xmm2, %xmm1 ; X64-SSE2-NEXT: por %xmm5, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X64-SSE2-NEXT: pxor %xmm4, %xmm2 ; X64-SSE2-NEXT: pxor %xmm0, %xmm4 @@ -1357,7 +1357,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-SSE42-NEXT: xorpd %xmm5, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 ; X64-SSE42-NEXT: pxor %xmm5, %xmm0 ; X64-SSE42-NEXT: pxor %xmm1, %xmm5 @@ -1383,7 +1383,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -1404,7 +1404,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpermilps 
{{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -1419,7 +1419,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: vzeroupper @@ -1465,7 +1465,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X86-SSE2-NEXT: pand %xmm1, %xmm5 ; X86-SSE2-NEXT: pandn %xmm2, %xmm1 ; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -1490,7 +1490,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X86-SSE42-NEXT: pmaxud %xmm3, %xmm1 ; X86-SSE42-NEXT: pmaxud %xmm2, %xmm1 ; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1 @@ -1504,7 +1504,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2 ; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1517,7 +1517,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1552,7 +1552,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-SSE2-NEXT: pand %xmm1, %xmm5 ; X64-SSE2-NEXT: pandn %xmm2, %xmm1 ; X64-SSE2-NEXT: por %xmm5, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X64-SSE2-NEXT: pxor %xmm4, %xmm2 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -1577,7 +1577,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-SSE42-NEXT: pmaxud %xmm3, %xmm1 ; X64-SSE42-NEXT: pmaxud %xmm2, %xmm1 ; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1 @@ -1591,7 +1591,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2 ; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; 
X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1604,7 +1604,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1618,7 +1618,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1652,7 +1652,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm0 ; X86-SSE2-NEXT: pmaxsw %xmm2, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -1717,7 +1717,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm2 ; X64-SSE2-NEXT: pxor %xmm4, %xmm0 ; X64-SSE2-NEXT: pmaxsw %xmm2, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -1810,7 +1810,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-SSE2-NEXT: pmaxub %xmm3, %xmm1 ; X86-SSE2-NEXT: pmaxub %xmm2, %xmm1 ; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 @@ -1879,7 +1879,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-SSE2-NEXT: pmaxub %xmm3, %xmm1 ; X64-SSE2-NEXT: pmaxub %xmm2, %xmm1 ; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 @@ -1987,7 +1987,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -2025,7 +2025,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; 
X64-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -2097,7 +2097,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -2135,7 +2135,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -2207,7 +2207,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; X86-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 @@ -2249,7 +2249,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 @@ -2332,7 +2332,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; X86-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 @@ -2374,7 +2374,7 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll index 413b5f2ac4aa5..5f33520200d24 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -16,7 +16,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X86-SSE2-LABEL: test_reduce_v2i64: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 @@ -40,7 +40,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X86-SSE42-LABEL: test_reduce_v2i64: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648] ; X86-SSE42-NEXT: 
movdqa %xmm1, %xmm3 ; X86-SSE42-NEXT: pxor %xmm0, %xmm3 @@ -53,7 +53,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X86-AVX1-LABEL: test_reduce_v2i64: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X86-AVX1-NEXT: ## xmm2 = mem[0,0] ; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 @@ -66,7 +66,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X86-AVX2-LABEL: test_reduce_v2i64: ; X86-AVX2: ## %bb.0: -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 @@ -78,7 +78,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v2i64: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X64-SSE2-NEXT: pxor %xmm2, %xmm3 @@ -100,7 +100,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X64-SSE42-LABEL: test_reduce_v2i64: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] ; X64-SSE42-NEXT: movdqa %xmm1, %xmm3 ; X64-SSE42-NEXT: pxor %xmm0, %xmm3 @@ -112,7 +112,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-AVX1-LABEL: test_reduce_v2i64: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 @@ -123,7 +123,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-AVX2-LABEL: test_reduce_v2i64: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 @@ -134,7 +134,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-AVX512-LABEL: test_reduce_v2i64: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: retq @@ -148,7 +148,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; X86-SSE2-LABEL: test_reduce_v4i32: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 @@ -171,7 +171,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; ; X86-SSE42-LABEL: test_reduce_v4i32: ; X86-SSE42: ## %bb.0: -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] ; X86-SSE42-NEXT: pminud %xmm0, %xmm1 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE42-NEXT: pminud %xmm1, %xmm0 @@ -180,7 +180,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; ; X86-AVX-LABEL: test_reduce_v4i32: ; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -189,7 +189,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v4i32: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X64-SSE2-NEXT: pxor %xmm2, %xmm3 @@ -212,7 +212,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; ; X64-SSE42-LABEL: test_reduce_v4i32: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE42-NEXT: pminud %xmm0, %xmm1 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE42-NEXT: pminud %xmm1, %xmm0 @@ -221,7 +221,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; ; X64-AVX-LABEL: test_reduce_v4i32: ; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -240,7 +240,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v8i16: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -271,7 +271,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v8i16: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -315,7 +315,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; X86-SSE2-LABEL: test_reduce_v16i8: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pminub %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pminub %xmm1, %xmm0 @@ -350,7 +350,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v16i8: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pminub %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pminub %xmm1, %xmm0 @@ -421,7 +421,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm4, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm4 ; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm4[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 @@ -451,7 +451,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE42-NEXT: pxor %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE42-NEXT: pxor %xmm3, %xmm0 ; X86-SSE42-NEXT: pxor %xmm2, %xmm3 @@ -471,7 +471,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-AVX1-NEXT: vxorps %xmm1, %xmm3, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3 ; X86-AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 @@ -489,7 +489,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -517,7 +517,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-SSE2-NEXT: pand %xmm4, %xmm0 ; X64-SSE2-NEXT: pandn %xmm1, %xmm4 ; X64-SSE2-NEXT: por %xmm0, %xmm4 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pxor %xmm0, %xmm2 @@ -545,7 +545,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-SSE42-NEXT: pxor %xmm3, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X64-SSE42-NEXT: pxor %xmm3, %xmm0 ; X64-SSE42-NEXT: pxor %xmm2, %xmm3 @@ -563,7 +563,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3 ; X64-AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 @@ -580,7 +580,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -593,7 +593,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, 
%rax ; X64-AVX512-NEXT: vzeroupper @@ -620,7 +620,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X86-SSE2-NEXT: pand %xmm4, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm4 ; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -643,7 +643,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X86-SSE42-LABEL: test_reduce_v8i32: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pminud %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: pminud %xmm0, %xmm1 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE42-NEXT: pminud %xmm1, %xmm0 @@ -654,7 +654,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -666,7 +666,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -685,7 +685,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X64-SSE2-NEXT: pand %xmm4, %xmm0 ; X64-SSE2-NEXT: pandn %xmm1, %xmm4 ; X64-SSE2-NEXT: por %xmm0, %xmm4 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -708,7 +708,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X64-SSE42-LABEL: test_reduce_v8i32: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pminud %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE42-NEXT: pminud %xmm0, %xmm1 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE42-NEXT: pminud %xmm1, %xmm0 @@ -719,7 +719,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X64-AVX1: ## %bb.0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -731,7 +731,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -743,7 +743,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminud 
%xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -770,7 +770,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -816,7 +816,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -885,7 +885,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-SSE2-LABEL: test_reduce_v32i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pminub %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pminub %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pminub %xmm1, %xmm0 @@ -937,7 +937,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-SSE2-LABEL: test_reduce_v32i8: ; X64-SSE2: ## %bb.0: ; X64-SSE2-NEXT: pminub %xmm1, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pminub %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pminub %xmm1, %xmm0 @@ -1069,7 +1069,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: pandn %xmm5, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 ; X86-SSE2-NEXT: pxor %xmm1, %xmm4 @@ -1111,7 +1111,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 ; X86-SSE42-NEXT: pxor %xmm4, %xmm0 ; X86-SSE42-NEXT: pxor %xmm1, %xmm4 @@ -1140,7 +1140,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 ; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -1162,7 +1162,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, 
%xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -1220,7 +1220,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-SSE2-NEXT: pand %xmm1, %xmm3 ; X64-SSE2-NEXT: pandn %xmm5, %xmm1 ; X64-SSE2-NEXT: por %xmm3, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X64-SSE2-NEXT: pxor %xmm4, %xmm2 ; X64-SSE2-NEXT: pxor %xmm0, %xmm4 @@ -1260,7 +1260,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-SSE42-NEXT: xorpd %xmm4, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 ; X64-SSE42-NEXT: pxor %xmm4, %xmm0 ; X64-SSE42-NEXT: pxor %xmm1, %xmm4 @@ -1287,7 +1287,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4 ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 ; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -1308,7 +1308,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -1323,7 +1323,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: vzeroupper @@ -1369,7 +1369,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X86-SSE2-NEXT: pand %xmm1, %xmm3 ; X86-SSE2-NEXT: pandn %xmm6, %xmm1 ; X86-SSE2-NEXT: por %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -1394,7 +1394,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X86-SSE42-NEXT: pminud %xmm3, %xmm1 ; X86-SSE42-NEXT: pminud %xmm2, %xmm1 ; X86-SSE42-NEXT: pminud %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X86-SSE42-NEXT: pminud %xmm1, %xmm0 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-SSE42-NEXT: pminud %xmm0, %xmm1 @@ -1408,7 +1408,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X86-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2 ; X86-AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 
@@ -1421,7 +1421,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1456,7 +1456,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-SSE2-NEXT: pand %xmm1, %xmm3 ; X64-SSE2-NEXT: pandn %xmm6, %xmm1 ; X64-SSE2-NEXT: por %xmm3, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X64-SSE2-NEXT: pxor %xmm4, %xmm2 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -1481,7 +1481,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-SSE42-NEXT: pminud %xmm3, %xmm1 ; X64-SSE42-NEXT: pminud %xmm2, %xmm1 ; X64-SSE42-NEXT: pminud %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X64-SSE42-NEXT: pminud %xmm1, %xmm0 ; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-SSE42-NEXT: pminud %xmm0, %xmm1 @@ -1495,7 +1495,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2 ; X64-AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1508,7 +1508,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1522,7 +1522,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1556,7 +1556,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X86-SSE2-NEXT: pminsw %xmm1, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm0 ; X86-SSE2-NEXT: pminsw %xmm2, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -1612,7 +1612,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X64-SSE2-NEXT: pminsw %xmm1, %xmm2 ; X64-SSE2-NEXT: pxor %xmm4, %xmm0 ; X64-SSE2-NEXT: pminsw %xmm2, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -1694,7 +1694,7 @@ 
define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-SSE2-NEXT: pminub %xmm3, %xmm1 ; X86-SSE2-NEXT: pminub %xmm2, %xmm1 ; X86-SSE2-NEXT: pminub %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X86-SSE2-NEXT: pminub %xmm1, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X86-SSE2-NEXT: pminub %xmm0, %xmm1 @@ -1754,7 +1754,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-SSE2-NEXT: pminub %xmm3, %xmm1 ; X64-SSE2-NEXT: pminub %xmm2, %xmm1 ; X64-SSE2-NEXT: pminub %xmm0, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X64-SSE2-NEXT: pminub %xmm1, %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-SSE2-NEXT: pminub %xmm0, %xmm1 @@ -1851,7 +1851,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -1883,7 +1883,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -1928,7 +1928,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -1960,7 +1960,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -2005,7 +2005,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; X86-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pminub %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pminub %xmm1, %xmm0 @@ -2041,7 +2041,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pminub %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pminub %xmm1, %xmm0 @@ -2093,7 +2093,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; X86-SSE2-LABEL: test_reduce_v64i8_v16i8: ; 
X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pminub %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X86-SSE2-NEXT: pminub %xmm1, %xmm0 @@ -2129,7 +2129,7 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X64-SSE2: ## %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pminub %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; X64-SSE2-NEXT: pminub %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/i128-add.ll b/llvm/test/CodeGen/X86/i128-add.ll index 8e75c4a575ee4..d128d75e64573 100644 --- a/llvm/test/CodeGen/X86/i128-add.ll +++ b/llvm/test/CodeGen/X86/i128-add.ll @@ -77,7 +77,7 @@ define <1 x i128> @add_v1i128(<1 x i128> %x, <1 x i128> %y) nounwind { ; X64-NEXT: movq %rax, %xmm0 ; X64-NEXT: movq %rsi, %xmm1 ; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: addq $1, %rax ; X64-NEXT: adcq $0, %rdx diff --git a/llvm/test/CodeGen/X86/inline-asm-x-i128.ll b/llvm/test/CodeGen/X86/inline-asm-x-i128.ll index 552ce96a53a51..7aee1d175494e 100644 --- a/llvm/test/CodeGen/X86/inline-asm-x-i128.ll +++ b/llvm/test/CodeGen/X86/inline-asm-x-i128.ll @@ -16,7 +16,7 @@ define { i64, i64 } @foo(i64 %0, i64 %1) { ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: movq %xmm0, %rax -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: movq %xmm0, %rdx ; CHECK-NEXT: retq %3 = zext i64 %1 to i128 diff --git a/llvm/test/CodeGen/X86/known-bits-vector.ll b/llvm/test/CodeGen/X86/known-bits-vector.ll index a1606e93e2e23..3b6912a9d9461 100644 --- a/llvm/test/CodeGen/X86/known-bits-vector.ll +++ b/llvm/test/CodeGen/X86/known-bits-vector.ll @@ -116,14 +116,14 @@ define <4 x i32> @knownbits_mask_shuffle_shuffle_undef_sext(<8 x i16> %a0) nounw ; X32-LABEL: knownbits_mask_shuffle_shuffle_undef_sext: ; X32: # %bb.0: ; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 -; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X32-NEXT: vpmovsxwd %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: knownbits_mask_shuffle_shuffle_undef_sext: ; X64: # %bb.0: ; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NEXT: vpmovsxwd %xmm0, %xmm0 ; X64-NEXT: retq %1 = and <8 x i16> %a0, diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll index b18b8079fd236..6c4d0a919ef0f 100644 --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -252,7 +252,7 @@ define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1) ; X86-LABEL: signbits_sext_shuffle_sitofp: ; X86: # %bb.0: ; X86-NEXT: vpmovsxdq %xmm0, %xmm1 -; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X86-NEXT: vpmovsxdq %xmm0, %xmm0 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] @@ -264,7 +264,7 @@ define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1) ; X64-AVX1-LABEL: signbits_sext_shuffle_sitofp: ; X64-AVX1: # 
%bb.0: ; X64-AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] @@ -478,7 +478,7 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] ; X64-AVX1-NEXT: vpmovsxdq %xmm3, %xmm5 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; X64-AVX1-NEXT: vpmovsxdq %xmm3, %xmm3 ; X64-AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6 ; X64-AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm4 diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll index 7bde3facc735e..93097e2b98fb7 100644 --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -25,7 +25,7 @@ define i32 @_Z10test_shortPsS_i_128(i16* nocapture readonly, i16* nocapture read ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB0_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 @@ -48,7 +48,7 @@ define i32 @_Z10test_shortPsS_i_128(i16* nocapture readonly, i16* nocapture read ; AVX-NEXT: cmpq %rcx, %rax ; AVX-NEXT: jne .LBB0_1 ; AVX-NEXT: # %bb.2: # %middle.block -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -103,7 +103,7 @@ define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture read ; SSE2-NEXT: jne .LBB1_1 ; SSE2-NEXT: # %bb.2: # %middle.block ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -128,7 +128,7 @@ define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture read ; AVX1-NEXT: # %bb.2: # %middle.block ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -153,7 +153,7 @@ define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture read ; AVX256-NEXT: # %bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -218,7 +218,7 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} 
xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -251,7 +251,7 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -278,7 +278,7 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -305,7 +305,7 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -386,7 +386,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea ; SSE2-NEXT: paddd %xmm8, %xmm1 ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -433,7 +433,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea ; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -466,7 +466,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -498,7 +498,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -527,7 +527,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; 
AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -593,7 +593,7 @@ define i32 @_Z9test_charPcS_i_128(i8* nocapture readonly, i8* nocapture readonly ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB4_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 @@ -616,7 +616,7 @@ define i32 @_Z9test_charPcS_i_128(i8* nocapture readonly, i8* nocapture readonly ; AVX-NEXT: cmpq %rcx, %rax ; AVX-NEXT: jne .LBB4_1 ; AVX-NEXT: # %bb.2: # %middle.block -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -675,7 +675,7 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly ; SSE2-NEXT: jne .LBB5_1 ; SSE2-NEXT: # %bb.2: # %middle.block ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -701,7 +701,7 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly ; AVX1-NEXT: # %bb.2: # %middle.block ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -727,7 +727,7 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly ; AVX256-NEXT: # %bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -798,7 +798,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -833,7 +833,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -861,7 +861,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -889,7 +889,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -982,7 +982,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl ; SSE2-NEXT: paddd %xmm8, %xmm1 ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -1033,7 +1033,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl ; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1068,7 +1068,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1102,7 +1102,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1132,7 +1132,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1195,7 +1195,7 @@ define i32 @test_unsigned_short_128(i16* nocapture readonly, i16* nocapture read ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB8_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 @@ -1218,7 +1218,7 @@ define i32 @test_unsigned_short_128(i16* nocapture readonly, i16* nocapture read ; AVX-NEXT: cmpq %rcx, %rax ; AVX-NEXT: jne .LBB8_1 ; AVX-NEXT: # %bb.2: # %middle.block -; AVX-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1279,7 +1279,7 @@ define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture read ; SSE2-NEXT: jne .LBB9_1 ; SSE2-NEXT: # %bb.2: # %middle.block ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 @@ -1310,7 +1310,7 @@ define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture read ; AVX1-NEXT: # %bb.2: # %middle.block ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1336,7 +1336,7 @@ define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture read ; AVX256-NEXT: # %bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1414,7 +1414,7 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read ; SSE2-NEXT: paddd %xmm3, %xmm0 ; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -1459,7 +1459,7 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1491,7 +1491,7 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1519,7 +1519,7 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1627,7 +1627,7 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea ; SSE2-NEXT: paddd %xmm5, %xmm9 ; SSE2-NEXT: paddd %xmm10, %xmm9 ; SSE2-NEXT: paddd 
%xmm8, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] ; SSE2-NEXT: paddd %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -1700,7 +1700,7 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea ; AVX1-NEXT: vpaddd %xmm0, %xmm9, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1744,7 +1744,7 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1778,7 +1778,7 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -2074,7 +2074,7 @@ define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) { ; ; AVX1-LABEL: pmaddwd_negative2: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 @@ -2647,7 +2647,7 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* ; SSE2-NEXT: movdqu (%rcx), %xmm2 ; SSE2-NEXT: pmaddwd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -2661,7 +2661,7 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* ; AVX-NEXT: vmovdqu (%rdx), %xmm1 ; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -2708,7 +2708,7 @@ define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %a ; SSE2-NEXT: pmaddwd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -2730,7 +2730,7 @@ define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %a ; AVX-NEXT: vpmaddwd (%r10), %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -2798,13 +2798,13 @@ define i64 @sum_and_sum_of_squares(i8* %a, i32 %n) { ; SSE2-NEXT: jne .LBB33_1 ; SSE2-NEXT: # %bb.2: # %middle.block ; SSE2-NEXT: paddd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE2-NEXT: paddd %xmm2, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,2,3] ; SSE2-NEXT: paddd %xmm3, %xmm2 ; SSE2-NEXT: movd %xmm2, %ecx ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -2839,14 +2839,14 @@ define i64 @sum_and_sum_of_squares(i8* %a, i32 %n) { ; AVX1-NEXT: # %bb.2: # %middle.block ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovd %xmm1, %ecx ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -2874,14 +2874,14 @@ define i64 @sum_and_sum_of_squares(i8* %a, i32 %n) { ; AVX256-NEXT: # %bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] ; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vmovd %xmm1, %ecx ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -2953,7 +2953,7 @@ define i32 @sum_of_square_differences(i8* %a, i8* %b, i32 %n) { ; SSE2-NEXT: jne .LBB34_1 ; SSE2-NEXT: # %bb.2: # %middle.block ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -2980,7 +2980,7 @@ define i32 @sum_of_square_differences(i8* %a, i8* %b, i32 %n) { ; AVX1-NEXT: # %bb.2: # %middle.block ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -3007,7 +3007,7 @@ define i32 @sum_of_square_differences(i8* %a, i8* %b, i32 %n) { ; AVX256-NEXT: # %bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX256-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll index 24ae0c77af2fe..c93543ddda7ae 100644 --- a/llvm/test/CodeGen/X86/masked_compressstore.ll +++ b/llvm/test/CodeGen/X86/masked_compressstore.ll @@ -2478,7 +2478,7 @@ define void @compressstore_v2i64_v2i1(i64* %base, <2 x i64> %V, <2 x i1> %mask) ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je LBB7_4 ; SSE2-NEXT: LBB7_3: ## %cond.store1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, (%rdi) ; SSE2-NEXT: retq ; @@ -2574,7 +2574,7 @@ define void @compressstore_v4i64_v4i1(i64* %base, <4 x i64> %V, <4 x i1> %mask) ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je LBB8_4 ; SSE2-NEXT: LBB8_3: ## %cond.store1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: testb $4, %al @@ -2585,7 +2585,7 @@ define void @compressstore_v4i64_v4i1(i64* %base, <4 x i64> %V, <4 x i1> %mask) ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je LBB8_8 ; SSE2-NEXT: LBB8_7: ## %cond.store7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm0, (%rdi) ; SSE2-NEXT: retq ; @@ -2762,7 +2762,7 @@ define void @compressstore_v8i64_v8i1(i64* %base, <8 x i64> %V, <8 x i1> %mask) ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je LBB9_4 ; SSE2-NEXT: LBB9_3: ## %cond.store1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: testb $4, %al @@ -2773,7 +2773,7 @@ define void @compressstore_v8i64_v8i1(i64* %base, <8 x i64> %V, <8 x i1> %mask) ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je LBB9_8 ; SSE2-NEXT: LBB9_7: ## %cond.store7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm0, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: testb $16, %al @@ -2784,7 +2784,7 @@ define void @compressstore_v8i64_v8i1(i64* %base, <8 x i64> %V, <8 x i1> %mask) ; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je LBB9_12 ; SSE2-NEXT: LBB9_11: ## %cond.store13 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movq %xmm0, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: testb $64, %al @@ -2795,7 +2795,7 @@ define void @compressstore_v8i64_v8i1(i64* %base, <8 x i64> %V, <8 x i1> %mask) ; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je LBB9_16 ; SSE2-NEXT: LBB9_15: ## %cond.store19 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; SSE2-NEXT: movq %xmm0, (%rdi) ; SSE2-NEXT: retq ; @@ -3068,7 +3068,7 @@ define void @compressstore_v4i32_v4i32(i32* %base, <4 x i32> %V, <4 x i32> %trig ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je LBB10_6 ; SSE2-NEXT: LBB10_5: ## %cond.store4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movd %xmm1, (%rdi) ; SSE2-NEXT: addq $4, %rdi ; SSE2-NEXT: testb $8, %al diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll index dbd95213a60de..c3020b5f467c6 100644 --- a/llvm/test/CodeGen/X86/masked_gather.ll +++ 
b/llvm/test/CodeGen/X86/masked_gather.ll @@ -34,23 +34,23 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger ; SSE-NEXT: retq ; SSE-NEXT: .LBB0_1: # %cond.load ; SSE-NEXT: movq %xmm0, %rcx -; SSE-NEXT: movd (%rcx), %xmm2 # xmm2 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5,6,7] ; SSE-NEXT: testb $2, %al ; SSE-NEXT: je .LBB0_4 ; SSE-NEXT: .LBB0_3: # %cond.load1 ; SSE-NEXT: pextrq $1, %xmm0, %rcx -; SSE-NEXT: insertps $16, (%rcx), %xmm3 # xmm3 = xmm3[0],mem[0],xmm3[2,3] +; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] ; SSE-NEXT: testb $4, %al ; SSE-NEXT: je .LBB0_6 ; SSE-NEXT: .LBB0_5: # %cond.load4 ; SSE-NEXT: movq %xmm1, %rcx -; SSE-NEXT: insertps $32, (%rcx), %xmm3 # xmm3 = xmm3[0,1],mem[0],xmm3[3] +; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] ; SSE-NEXT: testb $8, %al ; SSE-NEXT: je .LBB0_8 ; SSE-NEXT: .LBB0_7: # %cond.load7 ; SSE-NEXT: pextrq $1, %xmm1, %rax -; SSE-NEXT: insertps $48, (%rax), %xmm3 # xmm3 = xmm3[0,1,2],mem[0] +; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: retq ; @@ -63,14 +63,14 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger ; AVX1-NEXT: je .LBB0_2 ; AVX1-NEXT: # %bb.1: # %cond.load ; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: vmovss (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; AVX1-NEXT: .LBB0_2: # %else ; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB0_4 ; AVX1-NEXT: # %bb.3: # %cond.load1 ; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: vinsertps $16, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; AVX1-NEXT: .LBB0_4: # %else2 ; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -84,12 +84,12 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger ; AVX1-NEXT: retq ; AVX1-NEXT: .LBB0_5: # %cond.load4 ; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: vinsertps $32, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] ; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB0_8 ; AVX1-NEXT: .LBB0_7: # %cond.load7 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vinsertps $48, (%rax), %xmm2, %xmm2 # xmm2 = xmm2[0,1,2],mem[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] ; AVX1-NEXT: vmovaps %xmm2, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -103,14 +103,14 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger ; AVX2-NEXT: je .LBB0_2 ; AVX2-NEXT: # %bb.1: # %cond.load ; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: vmovss (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; AVX2-NEXT: .LBB0_2: # %else ; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB0_4 ; AVX2-NEXT: # %bb.3: # %cond.load1 ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: vinsertps $16, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; AVX2-NEXT: .LBB0_4: # %else2 ; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -124,12 +124,12 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger ; AVX2-NEXT: 
retq ; AVX2-NEXT: .LBB0_5: # %cond.load4 ; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: vinsertps $32, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] ; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB0_8 ; AVX2-NEXT: .LBB0_7: # %cond.load7 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vinsertps $48, (%rax), %xmm2, %xmm2 # xmm2 = xmm2[0,1,2],mem[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] ; AVX2-NEXT: vmovaps %xmm2, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -159,7 +159,7 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x ; SSE-NEXT: pmovsxdq %xmm0, %xmm4 ; SSE-NEXT: psllq $2, %xmm4 ; SSE-NEXT: paddq %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: pmovsxdq %xmm0, %xmm0 ; SSE-NEXT: pxor %xmm5, %xmm5 ; SSE-NEXT: pcmpeqd %xmm1, %xmm5 @@ -168,7 +168,7 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x ; SSE-NEXT: je .LBB1_2 ; SSE-NEXT: # %bb.1: # %cond.load ; SSE-NEXT: movq %xmm4, %rcx -; SSE-NEXT: movd (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; SSE-NEXT: .LBB1_2: # %else ; SSE-NEXT: psllq $2, %xmm0 @@ -176,7 +176,7 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x ; SSE-NEXT: je .LBB1_4 ; SSE-NEXT: # %bb.3: # %cond.load1 ; SSE-NEXT: pextrq $1, %xmm4, %rcx -; SSE-NEXT: insertps $16, (%rcx), %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3] +; SSE-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; SSE-NEXT: .LBB1_4: # %else2 ; SSE-NEXT: paddq %xmm0, %xmm3 ; SSE-NEXT: testb $4, %al @@ -189,12 +189,12 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x ; SSE-NEXT: retq ; SSE-NEXT: .LBB1_5: # %cond.load4 ; SSE-NEXT: movq %xmm3, %rcx -; SSE-NEXT: insertps $32, (%rcx), %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3] +; SSE-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] ; SSE-NEXT: testb $8, %al ; SSE-NEXT: je .LBB1_8 ; SSE-NEXT: .LBB1_7: # %cond.load7 ; SSE-NEXT: pextrq $1, %xmm3, %rax -; SSE-NEXT: insertps $48, (%rax), %xmm2 # xmm2 = xmm2[0,1,2],mem[0] +; SSE-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: retq ; @@ -202,7 +202,7 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x ; AVX1: # %bb.0: ; AVX1-NEXT: vmovq %rdi, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm4, %xmm4 ; AVX1-NEXT: vpsllq $2, %xmm4, %xmm4 ; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm4 @@ -217,14 +217,14 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x ; AVX1-NEXT: je .LBB1_2 ; AVX1-NEXT: # %bb.1: # %cond.load ; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: vmovss (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; AVX1-NEXT: .LBB1_2: # %else ; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB1_4 ; AVX1-NEXT: # %bb.3: # %cond.load1 ; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: vinsertps $16, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; AVX1-NEXT: .LBB1_4: # %else2 ; AVX1-NEXT: testb 
$4, %al ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -238,12 +238,12 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x ; AVX1-NEXT: retq ; AVX1-NEXT: .LBB1_5: # %cond.load4 ; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: vinsertps $32, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] ; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB1_8 ; AVX1-NEXT: .LBB1_7: # %cond.load7 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vinsertps $48, (%rax), %xmm2, %xmm2 # xmm2 = xmm2[0,1,2],mem[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] ; AVX1-NEXT: vmovaps %xmm2, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -262,14 +262,14 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x ; AVX2-NEXT: je .LBB1_2 ; AVX2-NEXT: # %bb.1: # %cond.load ; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: vmovss (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; AVX2-NEXT: .LBB1_2: # %else ; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB1_4 ; AVX2-NEXT: # %bb.3: # %cond.load1 ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: vinsertps $16, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; AVX2-NEXT: .LBB1_4: # %else2 ; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -283,12 +283,12 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x ; AVX2-NEXT: retq ; AVX2-NEXT: .LBB1_5: # %cond.load4 ; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: vinsertps $32, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] ; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB1_8 ; AVX2-NEXT: .LBB1_7: # %cond.load7 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vinsertps $48, (%rax), %xmm2, %xmm2 # xmm2 = xmm2[0,1,2],mem[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] ; AVX2-NEXT: vmovaps %xmm2, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -328,7 +328,7 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x ; SSE-NEXT: je .LBB2_2 ; SSE-NEXT: # %bb.1: # %cond.load ; SSE-NEXT: movq %xmm0, %rcx -; SSE-NEXT: movd (%rcx), %xmm2 # xmm2 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5,6,7] ; SSE-NEXT: .LBB2_2: # %else ; SSE-NEXT: psllq $2, %xmm1 @@ -336,7 +336,7 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x ; SSE-NEXT: je .LBB2_4 ; SSE-NEXT: # %bb.3: # %cond.load1 ; SSE-NEXT: pextrq $1, %xmm0, %rcx -; SSE-NEXT: insertps $16, (%rcx), %xmm3 # xmm3 = xmm3[0],mem[0],xmm3[2,3] +; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] ; SSE-NEXT: .LBB2_4: # %else2 ; SSE-NEXT: paddq %xmm1, %xmm4 ; SSE-NEXT: testb $4, %al @@ -349,12 +349,12 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x ; SSE-NEXT: retq ; SSE-NEXT: .LBB2_5: # %cond.load4 ; SSE-NEXT: movq %xmm4, %rcx -; SSE-NEXT: insertps $32, (%rcx), %xmm3 # xmm3 = xmm3[0,1],mem[0],xmm3[3] +; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] ; SSE-NEXT: testb $8, %al ; SSE-NEXT: je .LBB2_8 ; SSE-NEXT: .LBB2_7: # %cond.load7 ; SSE-NEXT: pextrq $1, %xmm4, %rax -; SSE-NEXT: insertps $48, (%rax), %xmm3 # xmm3 = xmm3[0,1,2],mem[0] +; SSE-NEXT: insertps 
{{.*#+}} xmm3 = xmm3[0,1,2],mem[0] ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: retq ; @@ -375,14 +375,14 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x ; AVX1-NEXT: je .LBB2_2 ; AVX1-NEXT: # %bb.1: # %cond.load ; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: vmovss (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; AVX1-NEXT: .LBB2_2: # %else ; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB2_4 ; AVX1-NEXT: # %bb.3: # %cond.load1 ; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: vinsertps $16, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; AVX1-NEXT: .LBB2_4: # %else2 ; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -396,12 +396,12 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x ; AVX1-NEXT: retq ; AVX1-NEXT: .LBB2_5: # %cond.load4 ; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: vinsertps $32, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] ; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je .LBB2_8 ; AVX1-NEXT: .LBB2_7: # %cond.load7 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vinsertps $48, (%rax), %xmm2, %xmm2 # xmm2 = xmm2[0,1,2],mem[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] ; AVX1-NEXT: vmovaps %xmm2, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -419,14 +419,14 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x ; AVX2-NEXT: je .LBB2_2 ; AVX2-NEXT: # %bb.1: # %cond.load ; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: vmovss (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; AVX2-NEXT: .LBB2_2: # %else ; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB2_4 ; AVX2-NEXT: # %bb.3: # %cond.load1 ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: vinsertps $16, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0],xmm2[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; AVX2-NEXT: .LBB2_4: # %else2 ; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -440,12 +440,12 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x ; AVX2-NEXT: retq ; AVX2-NEXT: .LBB2_5: # %cond.load4 ; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: vinsertps $32, (%rcx), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[0],xmm2[3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] ; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je .LBB2_8 ; AVX2-NEXT: .LBB2_7: # %cond.load7 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vinsertps $48, (%rax), %xmm2, %xmm2 # xmm2 = xmm2[0,1,2],mem[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] ; AVX2-NEXT: vmovaps %xmm2, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -480,7 +480,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8 ; SSE: # %bb.0: ; SSE-NEXT: movq %rdi, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] ; SSE-NEXT: pmovsxdq %xmm0, %xmm0 ; SSE-NEXT: paddq %xmm8, %xmm0 ; SSE-NEXT: pxor %xmm6, %xmm6 @@ -513,7 +513,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8 ; SSE-NEXT: pextrq $1, %xmm4, %rcx ; SSE-NEXT: pinsrb $3, (%rcx), %xmm5 ; 
SSE-NEXT: .LBB3_8: # %else8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: paddq %xmm8, %xmm0 ; SSE-NEXT: testb $16, %al ; SSE-NEXT: je .LBB3_10 @@ -542,7 +542,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8 ; SSE-NEXT: pextrq $1, %xmm1, %rcx ; SSE-NEXT: pinsrb $7, (%rcx), %xmm5 ; SSE-NEXT: .LBB3_16: # %else20 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSE-NEXT: paddq %xmm8, %xmm0 ; SSE-NEXT: testl $256, %eax # imm = 0x100 ; SSE-NEXT: je .LBB3_18 @@ -571,7 +571,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8 ; SSE-NEXT: pextrq $1, %xmm1, %rcx ; SSE-NEXT: pinsrb $11, (%rcx), %xmm5 ; SSE-NEXT: .LBB3_24: # %else32 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE-NEXT: paddq %xmm8, %xmm0 ; SSE-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE-NEXT: je .LBB3_26 @@ -611,7 +611,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8 ; AVX1-NEXT: vmovq %rdi, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm6, %xmm6 ; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm6 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 @@ -626,7 +626,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8 ; AVX1-NEXT: vmovq %xmm0, %rcx ; AVX1-NEXT: vpinsrb $0, (%rcx), %xmm3, %xmm3 ; AVX1-NEXT: .LBB3_2: # %else -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm5, %xmm6 ; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB3_4 @@ -657,7 +657,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8 ; AVX1-NEXT: vmovq %xmm0, %rcx ; AVX1-NEXT: vpinsrb $4, (%rcx), %xmm3, %xmm3 ; AVX1-NEXT: .LBB3_10: # %else11 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm1, %xmm6 ; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB3_12 @@ -689,7 +689,7 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8 ; AVX1-NEXT: vmovq %xmm0, %rcx ; AVX1-NEXT: vpinsrb $8, (%rcx), %xmm3, %xmm3 ; AVX1-NEXT: .LBB3_18: # %else23 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 ; AVX1-NEXT: testl $512, %eax # imm = 0x200 ; AVX1-NEXT: je .LBB3_20 @@ -1040,7 +1040,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; SSE-NEXT: je .LBB4_1 ; SSE-NEXT: # %bb.2: # %cond.load ; SSE-NEXT: movq %xmm5, %rcx -; SSE-NEXT: movd (%rcx), %xmm0 # xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: testb $2, %al ; SSE-NEXT: jne .LBB4_4 ; SSE-NEXT: jmp .LBB4_5 @@ -1105,7 +1105,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; SSE-NEXT: je .LBB4_19 ; SSE-NEXT: # %bb.20: # %cond.load23 ; SSE-NEXT: movq %xmm4, %rcx -; SSE-NEXT: movd (%rcx), %xmm5 # xmm5 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero ; SSE-NEXT: testb $2, %al ; SSE-NEXT: jne .LBB4_22 ; SSE-NEXT: jmp .LBB4_23 @@ -1174,7 +1174,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; SSE-NEXT: je .LBB4_37 ; SSE-NEXT: # %bb.38: # %cond.load72 ; 
SSE-NEXT: movq %xmm4, %rcx -; SSE-NEXT: movd (%rcx), %xmm2 # xmm2 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE-NEXT: testb $2, %al ; SSE-NEXT: jne .LBB4_40 ; SSE-NEXT: jmp .LBB4_41 @@ -1260,7 +1260,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX1-NEXT: je .LBB4_2 ; AVX1-NEXT: # %bb.1: # %cond.load ; AVX1-NEXT: vmovq %xmm3, %rdx -; AVX1-NEXT: vmovd (%rdx), %xmm1 # xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX1-NEXT: .LBB4_2: # %else ; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB4_4 @@ -1334,7 +1334,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX1-NEXT: je .LBB4_18 ; AVX1-NEXT: # %bb.17: # %cond.load23 ; AVX1-NEXT: vmovq %xmm7, %rcx -; AVX1-NEXT: vmovd (%rcx), %xmm4 # xmm4 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; AVX1-NEXT: .LBB4_18: # %else27 ; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB4_20 @@ -1405,7 +1405,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX1-NEXT: je .LBB4_34 ; AVX1-NEXT: # %bb.33: # %cond.load72 ; AVX1-NEXT: vmovq %xmm7, %rcx -; AVX1-NEXT: vmovd (%rcx), %xmm0 # xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX1-NEXT: .LBB4_34: # %else76 ; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB4_36 @@ -1491,7 +1491,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX2-NEXT: je .LBB4_2 ; AVX2-NEXT: # %bb.1: # %cond.load ; AVX2-NEXT: vmovq %xmm3, %rcx -; AVX2-NEXT: vmovd (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX2-NEXT: .LBB4_2: # %else ; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB4_4 @@ -1534,7 +1534,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX2-NEXT: je .LBB4_18 ; AVX2-NEXT: # %bb.17: # %cond.load23 ; AVX2-NEXT: vmovq %xmm3, %rcx -; AVX2-NEXT: vmovd (%rcx), %xmm2 # xmm2 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX2-NEXT: .LBB4_18: # %else27 ; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB4_20 @@ -1678,7 +1678,7 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX2-NEXT: jmp .LBB4_32 ; AVX2-NEXT: .LBB4_33: # %cond.load72 ; AVX2-NEXT: vmovq %xmm3, %rcx -; AVX2-NEXT: vmovd (%rcx), %xmm0 # xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB4_36 ; AVX2-NEXT: .LBB4_35: # %cond.load78 diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll index a14d537bc25da..e8dc7412eef8b 100644 --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -153,7 +153,7 @@ define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, <4 x double>* %addr, < ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 @@ -233,7 +233,7 @@ define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, <4 x double>* %ad ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: 
vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0 @@ -458,12 +458,12 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, < ; ; AVX1-LABEL: load_v8f64_v8i16: ; AVX1: ## %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3 ; AVX1-NEXT: vpmovsxdq %xmm3, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 @@ -479,7 +479,7 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, < ; ; AVX2-LABEL: load_v8f64_v8i16: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpmovsxwd %xmm3, %xmm3 @@ -1778,12 +1778,12 @@ define <8 x i64> @load_v8i64_v8i16(<8 x i16> %trigger, <8 x i64>* %addr, <8 x i6 ; ; AVX1-LABEL: load_v8i64_v8i16: ; AVX1: ## %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3 ; AVX1-NEXT: vpmovsxdq %xmm3, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 @@ -1799,7 +1799,7 @@ define <8 x i64> @load_v8i64_v8i16(<8 x i16> %trigger, <8 x i64>* %addr, <8 x i6 ; ; AVX2-LABEL: load_v8i64_v8i16: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpmovsxwd %xmm3, %xmm3 diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll index 29678e8495c22..389281726d273 100644 --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -838,7 +838,7 @@ define void @store_v2i64_v2i64(<2 x i64> %trigger, <2 x i64>* %addr, <2 x i64> % ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je LBB7_4 ; SSE2-NEXT: LBB7_3: ## %cond.store1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm0, 8(%rdi) ; SSE2-NEXT: retq ; @@ -922,7 +922,7 @@ define void @store_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i64> % ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je LBB8_4 ; SSE2-NEXT: LBB8_3: ## %cond.store1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movq %xmm0, 8(%rdi) ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je LBB8_6 @@ -931,7 +931,7 @@ define void @store_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i64> % ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je LBB8_8 ; SSE2-NEXT: LBB8_7: ## %cond.store5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; SSE2-NEXT: movq %xmm0, 24(%rdi) ; SSE2-NEXT: retq ; @@ -1158,7 +1158,7 @@ define void @store_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> % ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je LBB11_6 
; SSE2-NEXT: LBB11_5: ## %cond.store3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movd %xmm0, 8(%rdi) ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je LBB11_8 @@ -1280,7 +1280,7 @@ define void @store_v8i32_v8i32(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> % ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je LBB12_6 ; SSE2-NEXT: LBB12_5: ## %cond.store3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movd %xmm0, 8(%rdi) ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je LBB12_8 @@ -1299,7 +1299,7 @@ define void @store_v8i32_v8i32(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> % ; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je LBB12_14 ; SSE2-NEXT: LBB12_13: ## %cond.store11 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; SSE2-NEXT: movd %xmm0, 24(%rdi) ; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je LBB12_16 @@ -4674,7 +4674,7 @@ define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, <4 x doub ; AVX1: ## %bb.0: ; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 ; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vmaskmovpd %ymm0, %ymm1, (%rdi) @@ -4853,7 +4853,7 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) { ; SSE2-NEXT: testb $4, %cl ; SSE2-NEXT: je LBB25_6 ; SSE2-NEXT: LBB25_5: ## %cond.store3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movd %xmm0, 8(%rdi) ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll index 341a349911180..640e145c20231 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -60,7 +60,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB0_6 ; SSE2-NEXT: .LBB0_5: # %cond.store3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movd %xmm1, 8(%rdi) ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: jne .LBB0_7 @@ -75,7 +75,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask ; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je .LBB0_14 ; SSE2-NEXT: .LBB0_13: # %cond.store11 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movd %xmm0, 24(%rdi) ; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB0_16 @@ -1030,7 +1030,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB3_6 ; SSE2-NEXT: .LBB3_5: # %cond.store3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movd %xmm1, 8(%rdi) ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB3_8 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll index c120684c42610..17c113f098eb4 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -141,7 +141,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je 
.LBB0_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; SSE2-NEXT: movd %xmm4, 8(%rdi) ; SSE2-NEXT: .LBB0_6: # %else4 ; SSE2-NEXT: por %xmm3, %xmm2 @@ -176,7 +176,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask ; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je .LBB0_14 ; SSE2-NEXT: .LBB0_13: # %cond.store11 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] ; SSE2-NEXT: movd %xmm0, 24(%rdi) ; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB0_16 @@ -1579,7 +1579,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB3_6 ; SSE2-NEXT: .LBB3_5: # %cond.store3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movd %xmm1, 8(%rdi) ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB3_8 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll index 0160733732d70..2ca9ebb0d5c6d 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -87,7 +87,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB0_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] ; SSE2-NEXT: movd %xmm1, 8(%rdi) ; SSE2-NEXT: .LBB0_6: # %else4 ; SSE2-NEXT: por %xmm0, %xmm3 @@ -122,7 +122,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask ; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je .LBB0_14 ; SSE2-NEXT: .LBB0_13: # %cond.store11 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movd %xmm0, 24(%rdi) ; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB0_16 @@ -1351,7 +1351,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB3_6 ; SSE2-NEXT: .LBB3_5: # %cond.store3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movd %xmm0, 8(%rdi) ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB3_8 diff --git a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll index 8b3756a1fa241..6dcc47b9a65c4 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll @@ -309,7 +309,7 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] ; X86-SSE2-NEXT: movd %xmm2, %ecx ; X86-SSE2-NEXT: movntil %ecx, 12(%eax) -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movd %xmm2, %ecx ; X86-SSE2-NEXT: movntil %ecx, 8(%eax) ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] @@ -320,7 +320,7 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] ; X86-SSE2-NEXT: movd %xmm0, %ecx ; X86-SSE2-NEXT: movntil %ecx, 28(%eax) -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X86-SSE2-NEXT: movd %xmm0, %ecx ; X86-SSE2-NEXT: 
movntil %ecx, 24(%eax) ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] @@ -348,12 +348,12 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 ; X64-SSE2-NEXT: movq %xmm0, %rax ; X64-SSE2-NEXT: movntiq %rax, (%rsi) -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movq %xmm0, %rax ; X64-SSE2-NEXT: movntiq %rax, 8(%rsi) ; X64-SSE2-NEXT: movq %xmm1, %rax ; X64-SSE2-NEXT: movntiq %rax, 16(%rsi) -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X64-SSE2-NEXT: movq %xmm0, %rax ; X64-SSE2-NEXT: movntiq %rax, 24(%rsi) ; X64-SSE2-NEXT: retq @@ -422,7 +422,7 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind { ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] ; X86-SSE2-NEXT: movd %xmm2, %ecx ; X86-SSE2-NEXT: movntil %ecx, 12(%eax) -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movd %xmm2, %ecx ; X86-SSE2-NEXT: movntil %ecx, 8(%eax) ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] @@ -433,7 +433,7 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind { ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] ; X86-SSE2-NEXT: movd %xmm0, %ecx ; X86-SSE2-NEXT: movntil %ecx, 28(%eax) -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X86-SSE2-NEXT: movd %xmm0, %ecx ; X86-SSE2-NEXT: movntil %ecx, 24(%eax) ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] @@ -461,12 +461,12 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind { ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 ; X64-SSE2-NEXT: movq %xmm0, %rax ; X64-SSE2-NEXT: movntiq %rax, (%rsi) -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: movq %xmm0, %rax ; X64-SSE2-NEXT: movntiq %rax, 8(%rsi) ; X64-SSE2-NEXT: movq %xmm1, %rax ; X64-SSE2-NEXT: movntiq %rax, 16(%rsi) -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X64-SSE2-NEXT: movq %xmm0, %rax ; X64-SSE2-NEXT: movntiq %rax, 24(%rsi) ; X64-SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index 2980a1665db14..666df4a1b960d 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -197,7 +197,7 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -263,7 +263,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, 
%xmm0 @@ -327,7 +327,7 @@ define i32 @sad_16i8_256() "min-legal-vector-width"="256" { ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -388,7 +388,7 @@ define i32 @sad_16i8_512() "min-legal-vector-width"="512" { ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -955,10 +955,10 @@ define void @zext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal ; CHECK-LABEL: zext_v16i8_v16i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero @@ -977,10 +977,10 @@ define void @sext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal ; CHECK-LABEL: sext_v16i8_v16i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; CHECK-NEXT: vpmovsxwq %xmm1, %ymm1 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; CHECK-NEXT: vpmovsxwq %xmm3, %ymm3 ; CHECK-NEXT: vpmovsxwq %xmm0, %ymm0 ; CHECK-NEXT: vpmovsxwq %xmm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/nontemporal-2.ll b/llvm/test/CodeGen/X86/nontemporal-2.ll index c83c675e78890..4e83a7ce7231b 100644 --- a/llvm/test/CodeGen/X86/nontemporal-2.ll +++ b/llvm/test/CodeGen/X86/nontemporal-2.ll @@ -595,14 +595,14 @@ define void @test_extract_f64(<2 x double> %arg, double* %dst) { define void @test_extract_i64(<2 x i64> %arg, i64* %dst) { ; SSE2-LABEL: test_extract_i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: movntiq %rax, (%rdi) ; SSE2-NEXT: retq ; ; SSE4A-LABEL: test_extract_i64: ; SSE4A: # %bb.0: -; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE4A-NEXT: movq %xmm0, 
%rax ; SSE4A-NEXT: movntiq %rax, (%rdi) ; SSE4A-NEXT: retq diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index 2f74c830221b0..d24fd3f024d49 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -9,7 +9,7 @@ define void @v3i64(<2 x i64> %a, <2 x i64> %b, <3 x i64>* %p) nounwind { ; SSE2-LABEL: v3i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: movq %xmm2, 16(%rdi) ; SSE2-NEXT: movdqa %xmm0, (%rdi) @@ -285,7 +285,7 @@ define void @v7i16(<4 x i16> %a, <4 x i16> %b, <7 x i16>* %p) nounwind { ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,6,4,7] ; SSE2-NEXT: movw %ax, 12(%rdi) ; SSE2-NEXT: movq %xmm0, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movd %xmm0, 8(%rdi) ; SSE2-NEXT: retq ; @@ -391,7 +391,7 @@ define void @v12i8(<8 x i8> %a, <8 x i8> %b, <12 x i8>* %p) nounwind { ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm0, %xmm2 ; SSE2-NEXT: movq %xmm2, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movd %xmm0, 8(%rdi) ; SSE2-NEXT: retq ; @@ -539,7 +539,7 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, <12 x i32>* %p) nounwind { ; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,2,2] ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5],xmm4[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] @@ -637,7 +637,7 @@ define void @pr29025(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) nounw ; SSE2-NEXT: pandn %xmm2, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: movq %xmm1, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movd %xmm0, 8(%rdi) ; SSE2-NEXT: retq ; @@ -1202,7 +1202,7 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2 ; SSE2-NEXT: movups 32(%rdi), %xmm10 ; SSE2-NEXT: movups 48(%rdi), %xmm12 ; SSE2-NEXT: movdqa %xmm0, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0] ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[3,3] @@ -1215,7 +1215,7 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[3,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm8[1,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[2,3,2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm4[0,2] ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[0,3] @@ -1243,12 +1243,12 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2 ; SSE42-NEXT: movdqu 48(%rdi), %xmm5 ; SSE42-NEXT: movdqa %xmm2, %xmm6 ; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm7 
= xmm4[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] ; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[2,3] ; SSE42-NEXT: insertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[1] ; SSE42-NEXT: movdqa %xmm9, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm9[2,3] ; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[1] ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,1,2,2] @@ -1476,7 +1476,7 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3],xmm7[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5],xmm7[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,3,3] @@ -1489,7 +1489,7 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,1,0,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5],xmm4[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,2,3] ; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] diff --git a/llvm/test/CodeGen/X86/phaddsub-extract.ll b/llvm/test/CodeGen/X86/phaddsub-extract.ll index dd258c5f424a4..7f3eb0898cc86 100644 --- a/llvm/test/CodeGen/X86/phaddsub-extract.ll +++ b/llvm/test/CodeGen/X86/phaddsub-extract.ll @@ -46,7 +46,7 @@ define i32 @extract_extract01_v4i32_add_i32(<4 x i32> %x) { define i32 @extract_extract23_v4i32_add_i32(<4 x i32> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v4i32_add_i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: movd %xmm1, %ecx ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %eax @@ -114,7 +114,7 @@ define i32 @extract_extract01_v4i32_add_i32_commute(<4 x i32> %x) { define i32 @extract_extract23_v4i32_add_i32_commute(<4 x i32> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v4i32_add_i32_commute: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: movd %xmm1, %ecx ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %eax @@ -326,7 +326,7 @@ define i32 @extract_extract01_v4i32_sub_i32(<4 x i32> %x) { define i32 @extract_extract23_v4i32_sub_i32(<4 x i32> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v4i32_sub_i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: movd %xmm1, %eax ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %ecx @@ -382,7 +382,7 @@ define i32 @extract_extract01_v4i32_sub_i32_commute(<4 x i32> %x) { define i32 @extract_extract23_v4i32_sub_i32_commute(<4 x i32> %x) { ; SSE3-LABEL: extract_extract23_v4i32_sub_i32_commute: ; SSE3: # %bb.0: 
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-NEXT: movd %xmm1, %ecx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE3-NEXT: movd %xmm0, %eax @@ -557,7 +557,7 @@ define i32 @extract_extract01_v8i32_add_i32(<8 x i32> %x) { define i32 @extract_extract23_v8i32_add_i32(<8 x i32> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v8i32_add_i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: movd %xmm1, %ecx ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %eax @@ -594,7 +594,7 @@ define i32 @extract_extract23_v8i32_add_i32(<8 x i32> %x) { define i32 @extract_extract67_v8i32_add_i32(<8 x i32> %x) { ; SSE3-SLOW-LABEL: extract_extract67_v8i32_add_i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %ecx ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %eax @@ -684,7 +684,7 @@ define i32 @extract_extract01_v8i32_add_i32_commute(<8 x i32> %x) { define i32 @extract_extract23_v8i32_add_i32_commute(<8 x i32> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v8i32_add_i32_commute: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: movd %xmm1, %ecx ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %eax @@ -721,7 +721,7 @@ define i32 @extract_extract23_v8i32_add_i32_commute(<8 x i32> %x) { define i32 @extract_extract67_v8i32_add_i32_commute(<8 x i32> %x) { ; SSE3-SLOW-LABEL: extract_extract67_v8i32_add_i32_commute: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %ecx ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %eax @@ -1119,7 +1119,7 @@ define i32 @extract_extract01_v8i32_sub_i32(<8 x i32> %x) { define i32 @extract_extract23_v8i32_sub_i32(<8 x i32> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v8i32_sub_i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: movd %xmm1, %eax ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %ecx @@ -1156,7 +1156,7 @@ define i32 @extract_extract23_v8i32_sub_i32(<8 x i32> %x) { define i32 @extract_extract67_v8i32_sub_i32(<8 x i32> %x) { ; SSE3-SLOW-LABEL: extract_extract67_v8i32_sub_i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %eax ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] ; SSE3-SLOW-NEXT: movd %xmm0, %ecx @@ -1672,7 +1672,7 @@ define i32 @extract_extract01_v4i32_add_i32_uses3(<4 x i32> %x, i32* %p1, i32* % define i32 @partial_reduction_add_v8i32(<8 x i32> %x) { ; SSE3-SLOW-LABEL: partial_reduction_add_v8i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0 @@ -1681,7 +1681,7 @@ define i32 @partial_reduction_add_v8i32(<8 x i32> %x) { ; ; SSE3-FAST-LABEL: 
partial_reduction_add_v8i32: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-FAST-NEXT: paddd %xmm0, %xmm1 ; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1 ; SSE3-FAST-NEXT: movd %xmm1, %eax @@ -1689,7 +1689,7 @@ define i32 @partial_reduction_add_v8i32(<8 x i32> %x) { ; ; AVX-SLOW-LABEL: partial_reduction_add_v8i32: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1715,7 +1715,7 @@ define i32 @partial_reduction_add_v8i32(<8 x i32> %x) { define i32 @partial_reduction_add_v16i32(<16 x i32> %x) { ; SSE3-SLOW-LABEL: partial_reduction_add_v16i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0 @@ -1724,7 +1724,7 @@ define i32 @partial_reduction_add_v16i32(<16 x i32> %x) { ; ; SSE3-FAST-LABEL: partial_reduction_add_v16i32: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-FAST-NEXT: paddd %xmm0, %xmm1 ; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1 ; SSE3-FAST-NEXT: movd %xmm1, %eax @@ -1732,7 +1732,7 @@ define i32 @partial_reduction_add_v16i32(<16 x i32> %x) { ; ; AVX-SLOW-LABEL: partial_reduction_add_v16i32: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1758,7 +1758,7 @@ define i32 @partial_reduction_add_v16i32(<16 x i32> %x) { define i32 @partial_reduction_sub_v8i32(<8 x i32> %x) { ; SSE3-SLOW-LABEL: partial_reduction_sub_v8i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0 @@ -1767,7 +1767,7 @@ define i32 @partial_reduction_sub_v8i32(<8 x i32> %x) { ; ; SSE3-FAST-LABEL: partial_reduction_sub_v8i32: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-FAST-NEXT: psubd %xmm1, %xmm0 ; SSE3-FAST-NEXT: phsubd %xmm0, %xmm0 ; SSE3-FAST-NEXT: movd %xmm0, %eax @@ -1775,7 +1775,7 @@ define i32 @partial_reduction_sub_v8i32(<8 x i32> %x) { ; ; AVX-SLOW-LABEL: partial_reduction_sub_v8i32: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 @@ -1785,7 +1785,7 @@ define i32 @partial_reduction_sub_v8i32(<8 x i32> %x) { ; ; AVX-FAST-LABEL: partial_reduction_sub_v8i32: ; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vmovd %xmm0, %eax @@ -1802,7 +1802,7 @@ define i32 
@partial_reduction_sub_v8i32(<8 x i32> %x) { define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) { ; SSE3-SLOW-LABEL: partial_reduction_sub_v16i32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0 @@ -1811,7 +1811,7 @@ define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) { ; ; SSE3-FAST-LABEL: partial_reduction_sub_v16i32: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-FAST-NEXT: psubd %xmm1, %xmm0 ; SSE3-FAST-NEXT: phsubd %xmm0, %xmm0 ; SSE3-FAST-NEXT: movd %xmm0, %eax @@ -1819,7 +1819,7 @@ define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) { ; ; AVX-SLOW-LABEL: partial_reduction_sub_v16i32: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 @@ -1829,7 +1829,7 @@ define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) { ; ; AVX1-FAST-LABEL: partial_reduction_sub_v16i32: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax @@ -1838,7 +1838,7 @@ define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) { ; ; AVX2-FAST-LABEL: partial_reduction_sub_v16i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 @@ -1848,7 +1848,7 @@ define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) { ; ; AVX512-FAST-LABEL: partial_reduction_sub_v16i32: ; AVX512-FAST: # %bb.0: -; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 @@ -1868,7 +1868,7 @@ define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) { define i16 @hadd16_8(<8 x i16> %x223) { ; SSE3-SLOW-LABEL: hadd16_8: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: paddw %xmm0, %xmm1 ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE3-SLOW-NEXT: paddw %xmm1, %xmm0 @@ -1890,7 +1890,7 @@ define i16 @hadd16_8(<8 x i16> %x223) { ; ; AVX-SLOW-LABEL: hadd16_8: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -1921,7 +1921,7 @@ define i16 @hadd16_8(<8 x i16> %x223) { define i32 @hadd32_4(<4 x i32> %x225) { ; SSE3-SLOW-LABEL: hadd32_4: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = 
xmm1[1,1,2,3] ; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0 @@ -1937,7 +1937,7 @@ define i32 @hadd32_4(<4 x i32> %x225) { ; ; AVX-SLOW-LABEL: hadd32_4: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1961,7 +1961,7 @@ define i32 @hadd32_4(<4 x i32> %x225) { define i32 @hadd32_8(<8 x i32> %x225) { ; SSE3-SLOW-LABEL: hadd32_8: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0 @@ -1970,7 +1970,7 @@ define i32 @hadd32_8(<8 x i32> %x225) { ; ; SSE3-FAST-LABEL: hadd32_8: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-FAST-NEXT: paddd %xmm0, %xmm1 ; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1 ; SSE3-FAST-NEXT: movd %xmm1, %eax @@ -1978,7 +1978,7 @@ define i32 @hadd32_8(<8 x i32> %x225) { ; ; AVX-SLOW-LABEL: hadd32_8: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -2004,7 +2004,7 @@ define i32 @hadd32_8(<8 x i32> %x225) { define i32 @hadd32_16(<16 x i32> %x225) { ; SSE3-SLOW-LABEL: hadd32_16: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0 @@ -2013,7 +2013,7 @@ define i32 @hadd32_16(<16 x i32> %x225) { ; ; SSE3-FAST-LABEL: hadd32_16: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-FAST-NEXT: paddd %xmm0, %xmm1 ; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1 ; SSE3-FAST-NEXT: movd %xmm1, %eax @@ -2021,7 +2021,7 @@ define i32 @hadd32_16(<16 x i32> %x225) { ; ; AVX-SLOW-LABEL: hadd32_16: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -2119,7 +2119,7 @@ define i32 @hadd32_4_pgso(<4 x i32> %x225) !prof !14 { define i32 @hadd32_8_optsize(<8 x i32> %x225) optsize { ; SSE3-LABEL: hadd32_8_optsize: ; SSE3: # %bb.0: -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-NEXT: paddd %xmm0, %xmm1 ; SSE3-NEXT: phaddd %xmm1, %xmm1 ; SSE3-NEXT: movd %xmm1, %eax @@ -2143,7 +2143,7 @@ define i32 @hadd32_8_optsize(<8 x i32> %x225) optsize { define i32 @hadd32_16_optsize(<16 x i32> %x225) optsize { ; SSE3-LABEL: hadd32_16_optsize: ; SSE3: # %bb.0: -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE3-NEXT: paddd %xmm0, %xmm1 ; SSE3-NEXT: phaddd %xmm1, %xmm1 ; SSE3-NEXT: movd %xmm1, %eax diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll index 30f87e3d9b27b..4285e7b603f80 100644 --- a/llvm/test/CodeGen/X86/pmul.ll +++ 
b/llvm/test/CodeGen/X86/pmul.ll @@ -1083,7 +1083,7 @@ define <4 x i32> @mul_v4i64_zero_upper_left(<4 x i32> %val1, <4 x i64> %val2) { ; ; SSE41-LABEL: mul_v4i64_zero_upper_left: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: movdqa %xmm4, %xmm0 @@ -1252,14 +1252,14 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE2-NEXT: movdqa %xmm0, %xmm11 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE2-NEXT: pxor %xmm14, %xmm14 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm14 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] ; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] @@ -1306,7 +1306,7 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] ; SSE41-NEXT: pmovsxwq %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxwq %xmm3, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovsxwq %xmm3, %xmm6 @@ -1324,7 +1324,7 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; ; AVX2-LABEL: mul_v8i64_sext: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2 ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll index 14fc182a334a8..31e113f9a003f 100644 --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -434,7 +434,7 @@ define <8 x i32> @mulhuw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmulhuw %xmm1, %xmm0 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -467,7 +467,7 @@ define <8 x i32> @mulhsw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmulhw %xmm1, %xmm0 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -498,7 +498,7 @@ define <8 x i32> @mulhsw_v8i16_ashr(<8 x i16> %a, <8 x i16> %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmulhw %xmm1, %xmm0 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} 
xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -538,9 +538,9 @@ define <16 x i32> @mulhuw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) { ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: pmulhuw %xmm3, %xmm1 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: movdqa %xmm5, %xmm1 @@ -589,9 +589,9 @@ define <16 x i32> @mulhsw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) { ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: pmulhw %xmm3, %xmm1 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: movdqa %xmm5, %xmm1 @@ -639,9 +639,9 @@ define <16 x i32> @mulhsw_v16i16_ashr(<16 x i16> %a, <16 x i16> %b) { ; SSE41-NEXT: pmovsxwd %xmm0, %xmm4 ; SSE41-NEXT: pmulhw %xmm3, %xmm1 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: movdqa %xmm5, %xmm1 @@ -702,16 +702,16 @@ define <32 x i32> @mulhuw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pmulhuw %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero ; SSE41-NEXT: pmulhuw %xmm5, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero ; SSE41-NEXT: pmulhuw %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero ; SSE41-NEXT: pmulhuw %xmm7, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -800,16 +800,16 @@ define <32 x i32> @mulhsw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: movq 
%rdi, %rax ; SSE41-NEXT: pmulhw %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero ; SSE41-NEXT: pmulhw %xmm5, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero ; SSE41-NEXT: pmulhw %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero ; SSE41-NEXT: pmulhw %xmm7, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -901,16 +901,16 @@ define <32 x i32> @mulhsw_v32i16_ashr(<32 x i16> %a, <32 x i16> %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pmulhw %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm4, %xmm4 ; SSE41-NEXT: pmulhw %xmm5, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm5, %xmm5 ; SSE41-NEXT: pmulhw %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm6, %xmm6 ; SSE41-NEXT: pmulhw %xmm7, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm7, %xmm7 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm1 @@ -1026,28 +1026,28 @@ define <64 x i32> @mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm11 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,2,3] ; SSE41-NEXT: 
pmovzxwd {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -1204,28 +1204,28 @@ define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm11 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -1386,28 +1386,28 @@ define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm8, %xmm8 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm9, 
%xmm9 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm10, %xmm10 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm11, %xmm11 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm12, %xmm12 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm13, %xmm13 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm14, %xmm14 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm15, %xmm15 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm1 @@ -1541,7 +1541,7 @@ define <8 x i64> @mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) { ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero @@ -1552,7 +1552,7 @@ define <8 x i64> @mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; AVX2-NEXT: retq ; @@ -1648,7 +1648,7 @@ define <8 x i64> @mulhsw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) { ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero @@ -1659,7 +1659,7 @@ define <8 x i64> @mulhsw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; AVX2-NEXT: 
retq ; @@ -1775,7 +1775,7 @@ define <8 x i64> @mulhsw_v8i16_ashr_i64(<8 x i16> %a, <8 x i16> %b) { ; SSE41-NEXT: pmovsxwq %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovsxwq %xmm1, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxwq %xmm2, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE41-NEXT: pmovsxwq %xmm0, %xmm3 @@ -1786,7 +1786,7 @@ define <8 x i64> @mulhsw_v8i16_ashr_i64(<8 x i16> %a, <8 x i16> %b) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpmovsxwq %xmm1, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/pr15267.ll b/llvm/test/CodeGen/X86/pr15267.ll index 73acb76ce55fc..b8ecadba81c0e 100644 --- a/llvm/test/CodeGen/X86/pr15267.ll +++ b/llvm/test/CodeGen/X86/pr15267.ll @@ -75,7 +75,7 @@ define <4 x i64> @test3(<4 x i1>* %in) nounwind { ; CHECK-NEXT: negl %eax ; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 ; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr39733.ll b/llvm/test/CodeGen/X86/pr39733.ll index 75f9dc51b85eb..31bd5b71d0a6e 100644 --- a/llvm/test/CodeGen/X86/pr39733.ll +++ b/llvm/test/CodeGen/X86/pr39733.ll @@ -21,7 +21,7 @@ define void @test55() { ; CHECK-NEXT: vpmovsxwd %xmm0, %xmm1 ; CHECK-NEXT: # implicit-def: $ymm2 ; CHECK-NEXT: vmovaps %xmm1, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; CHECK-NEXT: vmovdqa %ymm0, (%rsp) diff --git a/llvm/test/CodeGen/X86/pr42452.ll b/llvm/test/CodeGen/X86/pr42452.ll index f2f0cd2d3ce6b..d3a1dad42bd39 100644 --- a/llvm/test/CodeGen/X86/pr42452.ll +++ b/llvm/test/CodeGen/X86/pr42452.ll @@ -8,7 +8,7 @@ define void @foo(i1 %c, <2 x i64> %x) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: # kill: def $dil killed $dil killed $edi ; CHECK-NEXT: movq %xmm0, %rax -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: movq %xmm0, %rcx ; CHECK-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill diff --git a/llvm/test/CodeGen/X86/pr42905.ll b/llvm/test/CodeGen/X86/pr42905.ll index 310a173f824e9..6ebe5be45a4f8 100644 --- a/llvm/test/CodeGen/X86/pr42905.ll +++ b/llvm/test/CodeGen/X86/pr42905.ll @@ -7,7 +7,7 @@ define <4 x double> @autogen_SD30452(i1 %L230) { ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [151829,151829] ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: cvtsi2sd %rax, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; CHECK-NEXT: movq %xmm2, %rax ; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: cvtsi2sd %rax, %xmm2 diff --git a/llvm/test/CodeGen/X86/pr44976.ll b/llvm/test/CodeGen/X86/pr44976.ll index 871937d295508..72ced2b1fa0f4 100644 --- a/llvm/test/CodeGen/X86/pr44976.ll +++ b/llvm/test/CodeGen/X86/pr44976.ll @@ -57,7 +57,7 @@ define <3 x i32> @f_29(<12 x i16> %a, <12 x i16> %b) { ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; CHECK-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[2,3],xmm2[0,1] ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,2,3] ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; CHECK-NEXT: paddd %xmm3, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] diff --git a/llvm/test/CodeGen/X86/pr45378.ll b/llvm/test/CodeGen/X86/pr45378.ll index 681e06ada65d2..36b2de07bcc83 100644 --- a/llvm/test/CodeGen/X86/pr45378.ll +++ b/llvm/test/CodeGen/X86/pr45378.ll @@ -77,7 +77,7 @@ define i1 @parseHeaders2_scalar_and(i64 * %ptr) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqu (%rdi), %xmm0 ; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: testq %rcx, %rax ; SSE2-NEXT: sete %al diff --git a/llvm/test/CodeGen/X86/pr46189.ll b/llvm/test/CodeGen/X86/pr46189.ll index 558483754c680..97190c10ef7c9 100644 --- a/llvm/test/CodeGen/X86/pr46189.ll +++ b/llvm/test/CodeGen/X86/pr46189.ll @@ -21,7 +21,7 @@ define { i64, i64 } @PR46189(double %0, double %1, double %2, double %3, double ; SSE-NEXT: cvttpd2dq %xmm3, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movq %xmm0, %rdx ; SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/pr46455.ll b/llvm/test/CodeGen/X86/pr46455.ll index e5ed94aa54934..c232d548faefd 100644 --- a/llvm/test/CodeGen/X86/pr46455.ll +++ b/llvm/test/CodeGen/X86/pr46455.ll @@ -10,7 +10,7 @@ define void @EntryModule(i8** %buffer_table) { ; CHECK-NEXT: vcmpneqps (%rax), %ymm0, %ymm0 ; CHECK-NEXT: vpsrld $31, %xmm0, %xmm1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] -; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; CHECK-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; CHECK-NEXT: vpsubd %xmm0, %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll index 0706254f4e5c0..bdbc563816864 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll @@ -135,7 +135,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX256VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX256VL-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX256VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX256VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX256VL-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k2 ; AVX256VL-NEXT: vpmovsxbd %xmm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index 28ab2e1968dbe..d785b10f9c327 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -522,7 +522,7 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind { ; ; SSE41-LABEL: test13: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm5, %xmm3 @@ -697,7 +697,7 @@ define <16 x i8> @test14(<16 x i8> 
%x, <16 x i32> %y) nounwind { ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,1,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero @@ -735,9 +735,9 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 ; AVX1-NEXT: vpmaxud %xmm6, %xmm7, %xmm6 @@ -772,7 +772,7 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { ; ; AVX2-LABEL: test14: ; AVX2: # %bb.0: # %vector.ph -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpmaxud %ymm4, %ymm1, %ymm4 @@ -873,7 +873,7 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind { ; ; SSE41-LABEL: test15: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm4, %xmm5 @@ -1005,7 +1005,7 @@ define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind { ; ; SSE41-LABEL: test16: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: pmaxud %xmm1, %xmm4 @@ -1871,10 +1871,10 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind { ; ; SSE41-LABEL: psubus_16i32_max: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: pshufd 
{{.*#+}} xmm6 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: pmaxud %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll index f55a58048e227..a197e795754ad 100644 --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -27,7 +27,7 @@ define i32 @sad_16i8() nounwind { ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 @@ -54,7 +54,7 @@ define i32 @sad_16i8() nounwind { ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -79,7 +79,7 @@ define i32 @sad_16i8() nounwind { ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -104,7 +104,7 @@ define i32 @sad_16i8() nounwind { ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -172,7 +172,7 @@ define i32 @sad_32i8() nounwind { ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -207,7 +207,7 @@ define i32 @sad_32i8() nounwind { ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -234,7 +234,7 @@ define i32 @sad_32i8() nounwind { ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, 
%xmm0, %xmm0 @@ -261,7 +261,7 @@ define i32 @sad_32i8() nounwind { ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -346,7 +346,7 @@ define i32 @sad_avx64i8() nounwind { ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -397,7 +397,7 @@ define i32 @sad_avx64i8() nounwind { ; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -431,7 +431,7 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -463,7 +463,7 @@ define i32 @sad_avx64i8() nounwind { ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -492,7 +492,7 @@ define i32 @sad_avx64i8() nounwind { ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -628,7 +628,7 @@ define i32 @sad_4i8() nounwind { ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB4_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 @@ -649,7 +649,7 @@ define i32 @sad_4i8() nounwind { ; AVX-NEXT: addq $4, %rax ; AVX-NEXT: jne .LBB4_1 ; AVX-NEXT: # %bb.2: # %middle.block -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -760,7 +760,7 @@ define i32 @sad_nonloop_16i8(<16 x i8>* nocapture readonly %p, i64, <16 x i8>* n ; SSE2-NEXT: movdqu (%rdi), %xmm0 ; SSE2-NEXT: movdqu (%rdx), %xmm1 ; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm1[2,3,2,3] ; SSE2-NEXT: paddq %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq @@ -769,7 +769,7 @@ define i32 @sad_nonloop_16i8(<16 x i8>* nocapture readonly %p, i64, <16 x i8>* n ; AVX: # %bb.0: ; AVX-NEXT: vmovdqu (%rdi), %xmm0 ; AVX-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq @@ -803,7 +803,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; SSE2-NEXT: psadbw %xmm1, %xmm0 ; SSE2-NEXT: paddq %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddq %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq @@ -815,7 +815,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: retq @@ -826,7 +826,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; AVX2-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper @@ -838,7 +838,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; AVX512-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper @@ -883,7 +883,7 @@ define i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <64 x i8>* n ; SSE2-NEXT: paddq %xmm0, %xmm2 ; SSE2-NEXT: paddq %xmm1, %xmm2 ; SSE2-NEXT: paddq %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: paddq %xmm2, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq @@ -901,7 +901,7 @@ define i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <64 x i8>* n ; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: retq @@ -915,7 +915,7 @@ define i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <64 x i8>* n ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper @@ -931,7 +931,7 @@ define i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <64 x i8>* n ; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; 
AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: vzeroupper @@ -945,7 +945,7 @@ define i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <64 x i8>* n ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: vzeroupper @@ -987,7 +987,7 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x ; SSE2-NEXT: psadbw %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm1, %xmm2 ; SSE2-NEXT: paddd {{.*}}(%rip), %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -1002,7 +1002,7 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x ; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1051,7 +1051,7 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* % ; SSE2-NEXT: movdqu (%rcx), %xmm2 ; SSE2-NEXT: psadbw %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -1065,7 +1065,7 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* % ; AVX-NEXT: vmovdqu (%rdx), %xmm1 ; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll index 02400d28c0e2c..3b3a1b57ecd02 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix.ll @@ -443,13 +443,13 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64: # %bb.0: ; X64-NEXT: pxor %xmm2, %xmm2 ; X64-NEXT: pcmpgtd %xmm1, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; X64-NEXT: movdqa %xmm1, %xmm4 ; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; X64-NEXT: movq %xmm4, %rcx ; X64-NEXT: pxor %xmm2, %xmm2 ; X64-NEXT: pcmpgtd %xmm0, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X64-NEXT: psllq $31, %xmm0 ; X64-NEXT: movq %xmm0, %rax @@ -457,9 +457,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: idivq %rcx ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1] +; 
X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] ; X64-NEXT: movq %xmm2, %rcx -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X64-NEXT: movq %xmm2, %rax ; X64-NEXT: cqto ; X64-NEXT: idivq %rcx @@ -478,9 +478,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: idivq %rdi ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; X64-NEXT: movq %xmm2, %rsi -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X64-NEXT: movq %xmm2, %rax ; X64-NEXT: cqto ; X64-NEXT: idivq %rsi diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll index e5348b9febb4d..512488e8f8725 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -651,14 +651,14 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: cmoveq %rax, %r13 ; X64-NEXT: movq %r13, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; X64-NEXT: # xmm0 = mem[2,3,0,1] +; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rbx ; X64-NEXT: movq %rbx, %r13 ; X64-NEXT: sarq $63, %r13 ; X64-NEXT: shldq $31, %rbx, %r13 -; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; X64-NEXT: # xmm0 = mem[2,3,0,1] +; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %rbp @@ -709,8 +709,8 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; X64-NEXT: psrlq $1, %xmm1 ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; X64-NEXT: # xmm1 = mem[2,3,0,1] +; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-NEXT: # xmm1 = mem[2,3,2,3] ; X64-NEXT: pxor %xmm0, %xmm0 ; X64-NEXT: pcmpgtd %xmm1, %xmm0 ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -720,8 +720,8 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: movq %rbx, %r12 ; X64-NEXT: sarq $63, %r12 ; X64-NEXT: shldq $31, %rbx, %r12 -; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; X64-NEXT: # xmm1 = mem[2,3,0,1] +; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-NEXT: # xmm1 = mem[2,3,2,3] ; X64-NEXT: pxor %xmm0, %xmm0 ; X64-NEXT: pcmpgtd %xmm1, %xmm0 ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -773,14 +773,14 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: cmoveq %rax, %r13 ; X64-NEXT: movq %r13, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; X64-NEXT: # xmm0 = mem[2,3,0,1] +; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rbx ; X64-NEXT: movq %rbx, %r13 ; X64-NEXT: sarq $63, %r13 ; X64-NEXT: shldq $31, %rbx, %r13 -; X64-NEXT: pshufd $78, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; X64-NEXT: # xmm0 = mem[2,3,0,1] +; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %rbp diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll index 74a6ae58f9f9f..8a788f41d5cc2 100644 --- a/llvm/test/CodeGen/X86/setcc-wide-types.ll +++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll @@ -75,16 +75,16 @@ define i32 @eq_i128(<2 x i64> %x, <2 x i64> %y) { define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-LABEL: ne_i256: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm4, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm4, %rcx ; SSE2-NEXT: movq %xmm0, %rdx ; SSE2-NEXT: movq %xmm1, %r8 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdi ; SSE2-NEXT: xorq %rax, %rdi -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rsi ; SSE2-NEXT: xorq %rcx, %rsi ; SSE2-NEXT: orq %rdi, %rsi @@ -155,16 +155,16 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) { define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-LABEL: eq_i256: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm4, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm4, %rcx ; SSE2-NEXT: movq %xmm0, %rdx ; SSE2-NEXT: movq %xmm1, %r8 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdi ; SSE2-NEXT: xorq %rax, %rdi -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rsi ; SSE2-NEXT: xorq %rcx, %rsi ; SSE2-NEXT: orq %rdi, %rsi @@ -235,28 +235,28 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) { define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) { ; SSE2-LABEL: ne_i512: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm8, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] ; SSE2-NEXT: movq %xmm8, %rcx -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm8, %rdx -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3] ; SSE2-NEXT: movq %xmm8, %rsi ; SSE2-NEXT: movq %xmm0, %r11 ; SSE2-NEXT: movq %xmm2, %r8 ; SSE2-NEXT: movq %xmm1, %r9 ; SSE2-NEXT: movq %xmm3, %r10 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdi ; SSE2-NEXT: xorq %rax, %rdi -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: xorq %rcx, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: xorq %rdx, %rcx -; SSE2-NEXT: pshufd {{.*#+}} 
xmm0 = xmm7[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdx ; SSE2-NEXT: xorq %rsi, %rdx ; SSE2-NEXT: orq %rcx, %rdx @@ -426,28 +426,28 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) { define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) { ; SSE2-LABEL: eq_i512: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm8, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] ; SSE2-NEXT: movq %xmm8, %rcx -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm8, %rdx -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3] ; SSE2-NEXT: movq %xmm8, %rsi ; SSE2-NEXT: movq %xmm0, %r11 ; SSE2-NEXT: movq %xmm2, %r8 ; SSE2-NEXT: movq %xmm1, %r9 ; SSE2-NEXT: movq %xmm3, %r10 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdi ; SSE2-NEXT: xorq %rax, %rdi -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: xorq %rcx, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: xorq %rdx, %rcx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdx ; SSE2-NEXT: xorq %rsi, %rdx ; SSE2-NEXT: orq %rcx, %rdx diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll index 6a5fab8469fa7..17cfc53266909 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -2094,9 +2094,9 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %esi ; X86-SSE-NEXT: movd %edx, %xmm3 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,0,1] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] ; X86-SSE-NEXT: movd %xmm7, %eax -; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,0,1] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] ; X86-SSE-NEXT: movd %xmm7, %esi ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %esi @@ -2137,9 +2137,9 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %esi ; X86-SSE-NEXT: movd %edx, %xmm4 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; X86-SSE-NEXT: movd %xmm2, %eax -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X86-SSE-NEXT: movd %xmm1, %esi ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %esi @@ -2336,9 +2336,9 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %ecx ; X64-SSE-NEXT: movd %edx, %xmm3 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,0,1] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] ; X64-SSE-NEXT: movd %xmm7, %eax -; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,0,1] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] ; X64-SSE-NEXT: movd %xmm7, %ecx ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %ecx @@ -2379,9 +2379,9 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %ecx ; 
X64-SSE-NEXT: movd %edx, %xmm4 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; X64-SSE-NEXT: movd %xmm2, %eax -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X64-SSE-NEXT: movd %xmm1, %ecx ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %ecx diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll index 0b79b62f84a11..53deafc9a4b43 100644 --- a/llvm/test/CodeGen/X86/slow-pmulld.ll +++ b/llvm/test/CodeGen/X86/slow-pmulld.ll @@ -224,7 +224,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SLM32: # %bb.0: ; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778] ; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLM32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SLM32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SLM32-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SLM32-NEXT: movdqa %xmm1, %xmm4 ; SLM32-NEXT: movdqa %xmm3, %xmm5 @@ -244,7 +244,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SLM64: # %bb.0: ; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778] ; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLM64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SLM64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SLM64-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SLM64-NEXT: movdqa %xmm1, %xmm4 ; SLM64-NEXT: movdqa %xmm3, %xmm5 @@ -270,7 +270,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SLOW32-NEXT: movdqa %xmm1, %xmm4 ; SLOW32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SLOW32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SLOW32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SLOW32-NEXT: movdqa %xmm3, %xmm0 ; SLOW32-NEXT: pmulhw %xmm2, %xmm0 @@ -291,7 +291,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SLOW64-NEXT: movdqa %xmm1, %xmm4 ; SLOW64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SLOW64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SLOW64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SLOW64-NEXT: movdqa %xmm3, %xmm0 ; SLOW64-NEXT: pmulhw %xmm2, %xmm0 @@ -306,7 +306,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SSE4-32: # %bb.0: ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; 
SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -322,7 +322,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SSE4-64: # %bb.0: ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -336,7 +336,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; ; AVX2-32-LABEL: test_mul_v16i32_v16i8: ; AVX2-32: # %bb.0: -; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] @@ -346,7 +346,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; ; AVX2-64-LABEL: test_mul_v16i32_v16i8: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero ; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] @@ -494,7 +494,7 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) { ; ; SSE4-32-LABEL: test_mul_v8i32_v8i16: ; SSE4-32: # %bb.0: -; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] @@ -504,7 +504,7 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) { ; ; SSE4-64-LABEL: test_mul_v8i32_v8i16: ; SSE4-64: # %bb.0: -; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; 
SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] @@ -609,9 +609,9 @@ define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) { ; ; SSE4-32-LABEL: test_mul_v16i32_v16i16: ; SSE4-32: # %bb.0: -; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero @@ -625,9 +625,9 @@ define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) { ; ; SSE4-64-LABEL: test_mul_v16i32_v16i16: ; SSE4-64: # %bb.0: -; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero @@ -880,7 +880,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { ; SLM32-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778] ; SLM32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3] ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero @@ -896,7 +896,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { ; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778] ; SLM64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3] ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero @@ -910,7 +910,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { ; SLOW32: # %bb.0: ; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; 
SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -926,7 +926,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { ; SLOW64: # %bb.0: ; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -942,7 +942,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { ; SSE4-32: # %bb.0: ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -958,7 +958,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { ; SSE4-64: # %bb.0: ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -972,7 +972,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { ; ; AVX2-32-LABEL: test_mul_v16i32_v16i8_minsize: ; AVX2-32: # %bb.0: -; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] @@ -982,7 +982,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { ; ; AVX2-64-LABEL: test_mul_v16i32_v16i8_minsize: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm0[2,3,0,1] +; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero ; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] @@ -1077,7 +1077,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize { ; SLM32-LABEL: test_mul_v8i32_v8i16_minsize: ; SLM32: # %bb.0: ; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] -; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SLM32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SLM32-NEXT: pmulld %xmm2, %xmm0 @@ -1087,7 +1087,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize { ; SLM64-LABEL: test_mul_v8i32_v8i16_minsize: ; SLM64: # %bb.0: ; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] -; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SLM64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SLM64-NEXT: pmulld %xmm2, %xmm0 @@ -1096,7 +1096,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize { ; ; SLOW32-LABEL: test_mul_v8i32_v8i16_minsize: ; SLOW32: # %bb.0: -; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] @@ -1106,7 +1106,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize { ; ; SLOW64-LABEL: test_mul_v8i32_v8i16_minsize: ; SLOW64: # %bb.0: -; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] @@ -1116,7 +1116,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize { ; ; SSE4-32-LABEL: test_mul_v8i32_v8i16_minsize: ; SSE4-32: # %bb.0: -; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] @@ -1126,7 +1126,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize { ; ; SSE4-64-LABEL: test_mul_v8i32_v8i16_minsize: ; SSE4-64: # %bb.0: -; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-64-NEXT: pmovzxwd {{.*#+}} 
xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] @@ -1155,9 +1155,9 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize { define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize { ; SLM32-LABEL: test_mul_v16i32_v16i16_minsize: ; SLM32: # %bb.0: -; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SLM32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SLM32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; SLM32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -1171,9 +1171,9 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize { ; ; SLM64-LABEL: test_mul_v16i32_v16i16_minsize: ; SLM64: # %bb.0: -; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SLM64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SLM64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; SLM64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -1187,9 +1187,9 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize { ; ; SLOW32-LABEL: test_mul_v16i32_v16i16_minsize: ; SLOW32: # %bb.0: -; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero @@ -1203,9 +1203,9 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize { ; ; SLOW64-LABEL: test_mul_v16i32_v16i16_minsize: ; SLOW64: # %bb.0: -; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero @@ -1219,9 +1219,9 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize { ; ; SSE4-32-LABEL: test_mul_v16i32_v16i16_minsize: ; SSE4-32: # %bb.0: -; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE4-32-NEXT: pmovzxwd 
{{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero @@ -1235,9 +1235,9 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize { ; ; SSE4-64-LABEL: test_mul_v16i32_v16i16_minsize: ; SSE4-64: # %bb.0: -; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll index 284f51d7422e6..e0c1b762c1507 100644 --- a/llvm/test/CodeGen/X86/smul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll @@ -206,10 +206,10 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 ; X64-NEXT: cmovll %ecx, %edx ; X64-NEXT: movd %edx, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; X64-NEXT: movd %xmm3, %edx ; X64-NEXT: movslq %edx, %rdx -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X64-NEXT: movd %xmm3, %esi ; X64-NEXT: movslq %esi, %rsi ; X64-NEXT: imulq %rdx, %rsi @@ -476,9 +476,9 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF ; X64-NEXT: imull %edx, %ecx ; X64-NEXT: cmovol %edi, %ecx -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X64-NEXT: movd %xmm2, %edx -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X64-NEXT: movd %xmm2, %esi ; X64-NEXT: movl %esi, %edi ; X64-NEXT: imull %edx, %edi diff --git a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll index 7f0e2cfa44b02..0abac92099055 100644 --- a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll +++ b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll @@ -5,7 +5,7 @@ define <4 x i64> @autogen_SD88863() { ; CHECK-LABEL: autogen_SD88863: ; CHECK: # %bb.0: # %BB -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] diff --git a/llvm/test/CodeGen/X86/split-vector-rem.ll b/llvm/test/CodeGen/X86/split-vector-rem.ll index ef03075ac65d6..959c9bd0ff769 100644 --- a/llvm/test/CodeGen/X86/split-vector-rem.ll +++ b/llvm/test/CodeGen/X86/split-vector-rem.ll @@ -12,9 +12,9 @@ define <8 x i32> @foo(<8 x i32> %t, <8 x i32> %u) { ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl %ecx ; CHECK-NEXT: movd %edx, %xmm0 -; CHECK-NEXT: pshufd 
{{.*#+}} xmm5 = xmm4[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] ; CHECK-NEXT: movd %xmm5, %eax -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] ; CHECK-NEXT: movd %xmm5, %ecx ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl %ecx @@ -41,9 +41,9 @@ define <8 x i32> @foo(<8 x i32> %t, <8 x i32> %u) { ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl %ecx ; CHECK-NEXT: movd %edx, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; CHECK-NEXT: movd %xmm4, %eax -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] ; CHECK-NEXT: movd %xmm4, %ecx ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl %ecx @@ -79,9 +79,9 @@ define <8 x i32> @bar(<8 x i32> %t, <8 x i32> %u) { ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ecx ; CHECK-NEXT: movd %edx, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] ; CHECK-NEXT: movd %xmm5, %eax -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] ; CHECK-NEXT: movd %xmm5, %ecx ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ecx @@ -108,9 +108,9 @@ define <8 x i32> @bar(<8 x i32> %t, <8 x i32> %u) { ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ecx ; CHECK-NEXT: movd %edx, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; CHECK-NEXT: movd %xmm4, %eax -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] ; CHECK-NEXT: movd %xmm4, %ecx ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ecx diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll index 85400656e2e54..206814cfcf1c7 100644 --- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -2791,8 +2791,8 @@ define void @test_mm_storeh_pi(x86_mmx *%a0, <4 x float> %a1) nounwind { ; ; X64-SSE2-LABEL: test_mm_storeh_pi: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pshufd $78, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x4e] -; X64-SSE2-NEXT: # xmm0 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: punpckhqdq %xmm0, %xmm0 # encoding: [0x66,0x0f,0x6d,0xc0] +; X64-SSE2-NEXT: # xmm0 = xmm0[1,1] ; X64-SSE2-NEXT: movq %xmm0, %rax # encoding: [0x66,0x48,0x0f,0x7e,0xc0] ; X64-SSE2-NEXT: movq %rax, (%rdi) # encoding: [0x48,0x89,0x07] ; X64-SSE2-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll index 460987cd74dfe..e2973ebbab89e 100644 --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -692,24 +692,24 @@ entry: define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) { ; SSE-LABEL: insertps_from_shufflevector_i32_2: ; SSE: ## %bb.0: ## %entry -; SSE-NEXT: pshufd $78, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x4e] -; SSE-NEXT: ## xmm1 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee] +; SSE-NEXT: ## xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: pblendw $12, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x0c] ; SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX1-LABEL: insertps_from_shufflevector_i32_2: ; AVX1: ## %bb.0: ## %entry -; AVX1-NEXT: vpermilps $78, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e] -; AVX1-NEXT: ## xmm1 = xmm1[2,3,0,1] +; 
AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] ; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: insertps_from_shufflevector_i32_2: ; AVX512: ## %bb.0: ## %entry -; AVX512-NEXT: vpermilps $78, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e] -; AVX512-NEXT: ## xmm1 = xmm1[2,3,0,1] +; AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3] ; AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] ; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] @@ -1875,8 +1875,8 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* ; X86-SSE-LABEL: insertps_pr20411: ; X86-SSE: ## %bb.0: ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-SSE-NEXT: pshufd $78, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x4e] -; X86-SSE-NEXT: ## xmm1 = xmm1[2,3,0,1] +; X86-SSE-NEXT: pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee] +; X86-SSE-NEXT: ## xmm1 = xmm1[2,3,2,3] ; X86-SSE-NEXT: pblendw $243, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc8,0xf3] ; X86-SSE-NEXT: ## xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; X86-SSE-NEXT: movdqu %xmm1, (%eax) ## encoding: [0xf3,0x0f,0x7f,0x08] @@ -1885,8 +1885,8 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* ; X86-AVX1-LABEL: insertps_pr20411: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vpermilps $78, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e] -; X86-AVX1-NEXT: ## xmm1 = xmm1[2,3,0,1] +; X86-AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; X86-AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3] ; X86-AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] ; X86-AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; X86-AVX1-NEXT: vmovups %xmm0, (%eax) ## encoding: [0xc5,0xf8,0x11,0x00] @@ -1895,8 +1895,8 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* ; X86-AVX512-LABEL: insertps_pr20411: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vpermilps $78, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e] -; X86-AVX512-NEXT: ## xmm1 = xmm1[2,3,0,1] +; X86-AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; X86-AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3] ; X86-AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] ; X86-AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; X86-AVX512-NEXT: vmovups %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] @@ -1904,8 +1904,8 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* ; ; X64-SSE-LABEL: insertps_pr20411: ; X64-SSE: ## %bb.0: -; X64-SSE-NEXT: pshufd $78, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x4e] -; X64-SSE-NEXT: ## xmm1 = xmm1[2,3,0,1] +; X64-SSE-NEXT: pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee] +; X64-SSE-NEXT: ## xmm1 = 
xmm1[2,3,2,3] ; X64-SSE-NEXT: pblendw $243, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc8,0xf3] ; X64-SSE-NEXT: ## xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; X64-SSE-NEXT: movdqu %xmm1, (%rdi) ## encoding: [0xf3,0x0f,0x7f,0x0f] @@ -1913,8 +1913,8 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* ; ; X64-AVX1-LABEL: insertps_pr20411: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vpermilps $78, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e] -; X64-AVX1-NEXT: ## xmm1 = xmm1[2,3,0,1] +; X64-AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; X64-AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3] ; X64-AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] ; X64-AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; X64-AVX1-NEXT: vmovups %xmm0, (%rdi) ## encoding: [0xc5,0xf8,0x11,0x07] @@ -1922,8 +1922,8 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* ; ; X64-AVX512-LABEL: insertps_pr20411: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpermilps $78, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e] -; X64-AVX512-NEXT: ## xmm1 = xmm1[2,3,0,1] +; X64-AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; X64-AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3] ; X64-AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] ; X64-AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] diff --git a/llvm/test/CodeGen/X86/trunc-subvector.ll b/llvm/test/CodeGen/X86/trunc-subvector.ll index 05b2b6608addd..4c7acf60d3080 100644 --- a/llvm/test/CodeGen/X86/trunc-subvector.ll +++ b/llvm/test/CodeGen/X86/trunc-subvector.ll @@ -73,7 +73,7 @@ define <2 x i32> @test4(<8 x i32> %v) { define <2 x i32> @test5(<8 x i32> %v) { ; SSE2-LABEL: test5: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; @@ -175,7 +175,7 @@ define <2 x i32> @test9(<8 x i32> %v) { define <2 x i32> @test10(<8 x i32> %v) { ; SSE2-LABEL: test10: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/udiv_fix.ll b/llvm/test/CodeGen/X86/udiv_fix.ll index 7540b394babdf..e42e527553f14 100644 --- a/llvm/test/CodeGen/X86/udiv_fix.ll +++ b/llvm/test/CodeGen/X86/udiv_fix.ll @@ -248,9 +248,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; X64-NEXT: movq %xmm4, %rcx -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] ; X64-NEXT: movq %xmm4, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx @@ -264,9 +264,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X64-NEXT: movq %xmm1, %rcx -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll index 493515c418caa..f6c8baf2e9238 100644 --- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll @@ -335,9 +335,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm7 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; X64-NEXT: movq %xmm2, %rcx -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] ; X64-NEXT: movq %xmm2, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx @@ -369,9 +369,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X64-NEXT: movq %xmm1, %rcx -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx diff --git a/llvm/test/CodeGen/X86/uint_to_fp-3.ll b/llvm/test/CodeGen/X86/uint_to_fp-3.ll index 9efd9a5bef5f2..ca46b48b7731f 100644 --- a/llvm/test/CodeGen/X86/uint_to_fp-3.ll +++ b/llvm/test/CodeGen/X86/uint_to_fp-3.ll @@ -40,7 +40,7 @@ define <4 x double> @mask_ucvt_4i32_4f64(<4 x i32> %a) { ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: cvtdq2pd %xmm0, %xmm2 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X32-SSE-NEXT: cvtdq2pd %xmm0, %xmm1 ; X32-SSE-NEXT: movaps %xmm2, %xmm0 ; X32-SSE-NEXT: retl @@ -55,7 +55,7 @@ define <4 x double> @mask_ucvt_4i32_4f64(<4 x i32> %a) { ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: cvtdq2pd %xmm0, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-SSE-NEXT: cvtdq2pd %xmm0, %xmm1 ; X64-SSE-NEXT: movaps %xmm2, %xmm0 ; X64-SSE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll index 18b769f32ec94..ce744f93cdfe8 100644 --- a/llvm/test/CodeGen/X86/umul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll @@ -151,9 +151,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: movl $-1, %eax ; X64-NEXT: cmoval %eax, %ecx ; X64-NEXT: movd %ecx, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; X64-NEXT: movd %xmm3, %ecx -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X64-NEXT: movd %xmm3, %edx ; X64-NEXT: imulq %rcx, %rdx ; X64-NEXT: movq %rdx, %rcx @@ -361,9 +361,9 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: movl $-1, %ecx ; X64-NEXT: cmovol %ecx, %eax ; X64-NEXT: movd %eax, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X64-NEXT: movd %xmm3, %eax -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; X64-NEXT: movd %xmm3, %edx ; X64-NEXT: mull %edx ; X64-NEXT: cmovol %ecx, %eax diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll index 
9fdb08ba6d4e8..923aaa34f04d0 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -2411,7 +2411,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[2,1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[2,3] ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,1] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] @@ -2516,7 +2516,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[2,1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[2,3] ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [14,4294967295,16,1] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll index 8bc971e79f507..0a057852613ae 100644 --- a/llvm/test/CodeGen/X86/var-permute-128.ll +++ b/llvm/test/CodeGen/X86/var-permute-128.ll @@ -16,7 +16,7 @@ define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind { ; SSE3: # %bb.0: ; SSE3-NEXT: movq %xmm1, %rax ; SSE3-NEXT: andl $1, %eax -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE3-NEXT: movq %xmm1, %rcx ; SSE3-NEXT: andl $1, %ecx ; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) @@ -29,7 +29,7 @@ define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind { ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %xmm1, %rax ; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSSE3-NEXT: movq %xmm1, %rcx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) @@ -69,7 +69,7 @@ define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind { ; SSE3-NEXT: movd %xmm1, %eax ; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] ; SSE3-NEXT: movd %xmm2, %ecx -; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE3-NEXT: movd %xmm2, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE3-NEXT: movd %xmm1, %esi @@ -379,7 +379,7 @@ define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) noun ; SSE3: # %bb.0: ; SSE3-NEXT: movq %xmm1, %rax ; SSE3-NEXT: andl $1, %eax -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE3-NEXT: movq %xmm1, %rcx ; SSE3-NEXT: andl $1, %ecx ; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) @@ -391,7 +391,7 @@ define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) noun ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %xmm1, %rax ; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSSE3-NEXT: movq %xmm1, %rcx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) @@ -430,7 +430,7 @@ define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwi ; SSE3-NEXT: movd %xmm1, %eax ; SSE3-NEXT: pshufd 
{{.*#+}} xmm2 = xmm1[1,1,2,3] ; SSE3-NEXT: movd %xmm2, %ecx -; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE3-NEXT: movd %xmm2, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE3-NEXT: movd %xmm1, %esi diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll index 04692995b7e95..94f7d7eeaf39d 100644 --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll @@ -135,7 +135,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 { ; SSE-32-NEXT: andl $-8, %esp ; SSE-32-NEXT: subl $24, %esp ; SSE-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp) -; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp) ; SSE-32-NEXT: fildll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fstps (%esp) @@ -154,7 +154,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 { ; SSE-64: # %bb.0: ; SSE-64-NEXT: movq %xmm0, %rax ; SSE-64-NEXT: cvtsi2ss %rax, %xmm1 -; SSE-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-64-NEXT: movq %xmm0, %rax ; SSE-64-NEXT: xorps %xmm0, %xmm0 ; SSE-64-NEXT: cvtsi2ss %rax, %xmm0 @@ -172,7 +172,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 { ; SSE41-32-NEXT: andl $-8, %esp ; SSE41-32-NEXT: subl $24, %esp ; SSE41-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp) -; SSE41-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp) ; SSE41-32-NEXT: fildll {{[0-9]+}}(%esp) ; SSE41-32-NEXT: fstps (%esp) @@ -191,7 +191,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 { ; SSE41-64: # %bb.0: ; SSE41-64-NEXT: movq %xmm0, %rax ; SSE41-64-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-64-NEXT: movq %xmm0, %rax ; SSE41-64-NEXT: xorps %xmm0, %xmm0 ; SSE41-64-NEXT: cvtsi2ss %rax, %xmm0 @@ -209,7 +209,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 { ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $24, %esp ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) ; AVX-32-NEXT: fstps (%esp) @@ -236,7 +236,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 { ; AVX512DQ-32: # %bb.0: ; AVX512DQ-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512DQ-32-NEXT: vcvtqq2ps %zmm0, %ymm1 -; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512DQ-32-NEXT: vcvtqq2ps %zmm0, %ymm0 ; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero ; AVX512DQ-32-NEXT: vzeroupper @@ -271,7 +271,7 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 { ; SSE-32-NEXT: .cfi_def_cfa_register %ebp ; SSE-32-NEXT: andl $-8, %esp ; SSE-32-NEXT: subl $24, %esp -; SSE-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-32-NEXT: movq %xmm1, {{[0-9]+}}(%esp) ; SSE-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp) ; SSE-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] @@ -313,7 +313,7 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 { ; SSE-64-NEXT: # %bb.1: ; SSE-64-NEXT: addss %xmm0, 
%xmm0 ; SSE-64-NEXT: .LBB3_2: -; SSE-64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE-64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-64-NEXT: movq %xmm1, %rax ; SSE-64-NEXT: movq %rax, %rcx ; SSE-64-NEXT: shrq %rcx @@ -340,7 +340,7 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 { ; SSE41-32-NEXT: .cfi_def_cfa_register %ebp ; SSE41-32-NEXT: andl $-8, %esp ; SSE41-32-NEXT: subl $24, %esp -; SSE41-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-32-NEXT: movq %xmm1, {{[0-9]+}}(%esp) ; SSE41-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp) ; SSE41-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] @@ -382,7 +382,7 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 { ; SSE41-64-NEXT: # %bb.1: ; SSE41-64-NEXT: addss %xmm0, %xmm0 ; SSE41-64-NEXT: .LBB3_2: -; SSE41-64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-64-NEXT: movq %xmm1, %rax ; SSE41-64-NEXT: movq %rax, %rcx ; SSE41-64-NEXT: shrq %rcx @@ -410,7 +410,7 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 { ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $24, %esp ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; AVX-32-NEXT: vextractps $1, %xmm0, %eax ; AVX-32-NEXT: shrl $31, %eax @@ -471,7 +471,7 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 { ; AVX512DQ-32: # %bb.0: ; AVX512DQ-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512DQ-32-NEXT: vcvtuqq2ps %zmm0, %ymm1 -; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512DQ-32-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero ; AVX512DQ-32-NEXT: vzeroupper @@ -1146,7 +1146,7 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 { ; SSE-32-NEXT: andl $-8, %esp ; SSE-32-NEXT: subl $32, %esp ; SSE-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp) -; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp) ; SSE-32-NEXT: fildll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fstpl {{[0-9]+}}(%esp) @@ -1164,7 +1164,7 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 { ; SSE-64: # %bb.0: ; SSE-64-NEXT: movq %xmm0, %rax ; SSE-64-NEXT: cvtsi2sd %rax, %xmm1 -; SSE-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-64-NEXT: movq %xmm0, %rax ; SSE-64-NEXT: xorps %xmm0, %xmm0 ; SSE-64-NEXT: cvtsi2sd %rax, %xmm0 @@ -1182,7 +1182,7 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 { ; SSE41-32-NEXT: andl $-8, %esp ; SSE41-32-NEXT: subl $32, %esp ; SSE41-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp) -; SSE41-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp) ; SSE41-32-NEXT: fildll {{[0-9]+}}(%esp) ; SSE41-32-NEXT: fstpl {{[0-9]+}}(%esp) @@ -1200,7 +1200,7 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 { ; SSE41-64: # %bb.0: ; SSE41-64-NEXT: movq %xmm0, %rax ; SSE41-64-NEXT: cvtsi2sd %rax, %xmm1 -; SSE41-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-64-NEXT: movq %xmm0, %rax ; SSE41-64-NEXT: xorps %xmm0, %xmm0 ; SSE41-64-NEXT: cvtsi2sd %rax, %xmm0 @@ -1218,7 +1218,7 
@@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 { ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $32, %esp ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) ; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll index d97787e36cd85..a5519e68f73a3 100644 --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll @@ -281,7 +281,7 @@ define <8 x float> @sitofp_v8i16_v8f32(<8 x i16> %x) #0 { ; AVX1-LABEL: sitofp_v8i16_v8f32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vec_cast2.ll b/llvm/test/CodeGen/X86/vec_cast2.ll index ed703f1708478..c979050fc1809 100644 --- a/llvm/test/CodeGen/X86/vec_cast2.ll +++ b/llvm/test/CodeGen/X86/vec_cast2.ll @@ -18,7 +18,7 @@ define <8 x float> @cvt_v8i16_v8f32(<8 x i16> %src) { ; CHECK-LABEL: cvt_v8i16_v8f32: ; CHECK: ## %bb.0: ; CHECK-NEXT: vpmovsxwd %xmm0, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 6cb352c3f348f..a413752993b5a 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -95,7 +95,7 @@ define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) { ; SSE2: # %bb.0: ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: cvtsi2sd %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2sd %rax, %xmm0 @@ -305,14 +305,14 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { ; SSE2: # %bb.0: ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: cvtsi2sd %rax, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2sd %rax, %xmm0 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: cvtsi2sd %rax, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2sd %rax, %xmm0 @@ -421,7 +421,7 @@ define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) { ; SSE-LABEL: sitofp_4i32_to_4f64: ; SSE: # %bb.0: ; SSE-NEXT: cvtdq2pd %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: cvtdq2pd %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: retq @@ -440,7 +440,7 @@ define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) { ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; 
SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE2-NEXT: retq ; @@ -448,7 +448,7 @@ define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE41-NEXT: retq ; @@ -468,7 +468,7 @@ define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) { ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE2-NEXT: retq ; @@ -476,7 +476,7 @@ define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE41-NEXT: retq ; @@ -504,7 +504,7 @@ define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) { ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: psrad $24, %xmm1 ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE2-NEXT: retq ; @@ -512,7 +512,7 @@ define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE41-NEXT: retq ; @@ -533,7 +533,7 @@ define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) { ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: psrad $24, %xmm1 ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE2-NEXT: retq ; @@ -541,7 +541,7 @@ define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE41-NEXT: retq ; @@ -1015,7 +1015,7 @@ define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) { ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4.503599627370496E+15,4.503599627370496E+15] ; SSE41-NEXT: por %xmm3, %xmm2 ; SSE41-NEXT: subpd %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: subpd %xmm3, %xmm1 @@ -1074,7 +1074,7 @@ define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -1083,7 +1083,7 @@ define <4 x 
double> @uitofp_4i16_to_4f64(<8 x i16> %a) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE41-NEXT: retq ; @@ -1103,7 +1103,7 @@ define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -1112,7 +1112,7 @@ define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE41-NEXT: retq ; @@ -1140,7 +1140,7 @@ define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) { ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -1149,7 +1149,7 @@ define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE41-NEXT: retq ; @@ -1170,7 +1170,7 @@ define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) { ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -1179,7 +1179,7 @@ define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE41-NEXT: retq ; @@ -1209,7 +1209,7 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) { ; SSE2: # %bb.0: ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 @@ -1274,7 +1274,7 @@ define <4 x 
float> @sitofp_2i64_to_4f32(<2 x i64> %a) { define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) { ; SSE2-LABEL: sitofp_2i64_to_4f32_zero: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 @@ -1345,7 +1345,7 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; SSE2: # %bb.0: ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 @@ -1464,7 +1464,7 @@ define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) { ; AVX1-LABEL: sitofp_8i16_to_4f32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 @@ -1568,7 +1568,7 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { ; SSE2: # %bb.0: ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 @@ -1576,7 +1576,7 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 @@ -1719,7 +1719,7 @@ define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 ; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 ; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm2, %xmm0 @@ -1728,7 +1728,7 @@ define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) { ; AVX1-LABEL: sitofp_8i16_to_8f32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 @@ -1868,7 +1868,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 ; SSE2-NEXT: addss %xmm0, %xmm0 ; SSE2-NEXT: .LBB41_3: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB41_4 @@ -1969,7 +1969,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) { ; SSE2-LABEL: uitofp_2i64_to_2f32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB42_1 @@ -2102,7 +2102,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: addss %xmm1, %xmm1 ; SSE2-NEXT: .LBB43_3: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB43_4 @@ -2462,7 +2462,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; SSE2-NEXT: cvtsi2ss %rax, %xmm2 ; SSE2-NEXT: addss %xmm2, %xmm2 ; SSE2-NEXT: .LBB49_3: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB49_4 @@ -2494,7 +2494,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; SSE2-NEXT: addss %xmm1, %xmm1 ; SSE2-NEXT: .LBB49_9: ; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB49_10 @@ -2769,7 +2769,7 @@ define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm2, %xmm0 @@ -2906,7 +2906,7 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) { ; SSE2-NEXT: movdqa (%rdi), %xmm1 ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: cvtsi2sd %rax, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2sd %rax, %xmm1 @@ -3103,7 +3103,7 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { ; SSE2-NEXT: movdqa 16(%rdi), %xmm2 ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: cvtsi2sd %rax, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2sd %rax, %xmm1 @@ -3111,7 +3111,7 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { ; SSE2-NEXT: movq %xmm2, %rax ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2sd %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE2-NEXT: movq %xmm2, %rax ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: cvtsi2sd %rax, %xmm2 @@ -3209,7 +3209,7 @@ define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) { ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE-NEXT: retq ; @@ -3229,7 +3229,7 @@ define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) { ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE2-NEXT: retq ; @@ -3237,7 +3237,7 @@ define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmovsxwd (%rdi), %xmm1 ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: 
cvtdq2pd %xmm1, %xmm1 ; SSE41-NEXT: retq ; @@ -3259,7 +3259,7 @@ define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) { ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: psrad $24, %xmm1 ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE2-NEXT: retq ; @@ -3267,7 +3267,7 @@ define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmovsxbd (%rdi), %xmm1 ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE41-NEXT: retq ; @@ -3770,7 +3770,7 @@ define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) { ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] ; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: subpd %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: por %xmm2, %xmm1 ; SSE41-NEXT: subpd %xmm2, %xmm1 @@ -3831,7 +3831,7 @@ define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) { ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE2-NEXT: retq ; @@ -3839,7 +3839,7 @@ define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE41-NEXT: retq ; @@ -3861,7 +3861,7 @@ define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) { ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE2-NEXT: retq ; @@ -3869,7 +3869,7 @@ define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE41-NEXT: retq ; @@ -3894,7 +3894,7 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE2-NEXT: movdqa 16(%rdi), %xmm0 ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 @@ -3902,7 +3902,7 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: xorps %xmm0, %xmm0 ; 
SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 @@ -4073,7 +4073,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE2-NEXT: movdqa 48(%rdi), %xmm3 ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: cvtsi2ss %rax, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 @@ -4081,7 +4081,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 @@ -4090,7 +4090,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE2-NEXT: movq %xmm3, %rax ; SSE2-NEXT: xorps %xmm4, %xmm4 ; SSE2-NEXT: cvtsi2ss %rax, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 @@ -4098,7 +4098,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE2-NEXT: movq %xmm2, %rax ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE2-NEXT: movq %xmm2, %rax ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: cvtsi2ss %rax, %xmm2 @@ -4378,7 +4378,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: addss %xmm1, %xmm1 ; SSE2-NEXT: .LBB83_3: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB83_4 @@ -4410,7 +4410,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE2-NEXT: addss %xmm0, %xmm0 ; SSE2-NEXT: .LBB83_9: ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE2-NEXT: movq %xmm2, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB83_10 @@ -4729,7 +4729,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE2-NEXT: cvtsi2ss %rax, %xmm3 ; SSE2-NEXT: addss %xmm3, %xmm3 ; SSE2-NEXT: .LBB87_3: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_4 @@ -4760,7 +4760,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 ; SSE2-NEXT: addss %xmm0, %xmm0 ; SSE2-NEXT: .LBB87_9: -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] ; SSE2-NEXT: movq %xmm5, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_10 @@ -4791,7 +4791,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE2-NEXT: cvtsi2ss %rax, %xmm5 ; SSE2-NEXT: addss %xmm5, %xmm5 ; SSE2-NEXT: .LBB87_15: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: testq %rax, 
%rax
 ; SSE2-NEXT: js .LBB87_16
@@ -4826,7 +4826,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; SSE2-NEXT: .LBB87_21:
 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 ; SSE2-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; SSE2-NEXT: movq %xmm2, %rax
 ; SSE2-NEXT: testq %rax, %rax
 ; SSE2-NEXT: js .LBB87_22
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index dd3a733ab2178..4f071c064e5c8 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -107,7 +107,7 @@ define <3 x i32> @saddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
 ; SSE2-NEXT: pxor %xmm2, %xmm0
 ; SSE2-NEXT: movq %xmm1, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: movd %xmm1, 8(%rdi)
 ; SSE2-NEXT: retq
 ;
@@ -119,7 +119,7 @@ define <3 x i32> @saddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT: pxor %xmm2, %xmm0
 ; SSSE3-NEXT: movq %xmm1, (%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSSE3-NEXT: movd %xmm1, 8(%rdi)
 ; SSSE3-NEXT: retq
 ;
@@ -512,13 +512,13 @@ define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2)
 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpacksswb %xmm1, %xmm4, %xmm0
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5
+; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
-; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm4
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
@@ -644,7 +644,7 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm1
 ; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm2
 ; SSE41-NEXT: psrad $31, %xmm2
@@ -667,9 +667,9 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT: vmovdqa %xmm3, (%rdi)
@@ -683,7 +683,7 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
 ; AVX2-NEXT: vmovdqa %xmm3, (%rdi)
 ; AVX2-NEXT: retq
@@ -769,7 +769,7 @@ define <8 x i32> @saddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
@@ -889,7 +889,7 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
 ; SSE2-NEXT: movd %xmm1, %ecx
 ; SSE2-NEXT: movw %cx, 9(%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
 ; SSE2-NEXT: movd %xmm1, %edx
 ; SSE2-NEXT: movw %dx, 6(%rdi)
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
@@ -924,7 +924,7 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
 ; SSSE3-NEXT: movd %xmm1, %ecx
 ; SSSE3-NEXT: movw %cx, 9(%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
 ; SSSE3-NEXT: movd %xmm1, %edx
 ; SSSE3-NEXT: movw %dx, 6(%rdi)
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 5fde07d1269df..b5fefe296d77e 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -1193,13 +1193,13 @@ define <16 x i32> @smulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2)
 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm0
 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm6, %xmm0
 ; AVX1-NEXT: vpxor %xmm0, %xmm9, %xmm0
-; AVX1-NEXT: vpackssdw %xmm5, %xmm0, %xmm5
-; AVX1-NEXT: vpacksswb %xmm1, %xmm5, %xmm0
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm6
+; AVX1-NEXT: vpackssdw %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
-; AVX1-NEXT: vpacksswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
@@ -1376,11 +1376,11 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; SSE41-NEXT: pmovsxbw %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE41-NEXT: pmullw %xmm1, %xmm0
 ; SSE41-NEXT: pmullw %xmm2, %xmm4
@@ -1407,7 +1407,7 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm1
 ; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm2
 ; SSE41-NEXT: psrad $31, %xmm2
@@ -1436,9 +1436,9 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm5
 ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
@@ -1450,9 +1450,9 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
 ; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
@@ -1478,7 +1478,7 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
 ; AVX2-NEXT: vmovdqa %xmm3, (%rdi)
 ; AVX2-NEXT: retq
@@ -1730,11 +1730,11 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
 ; SSE41-NEXT: movq %rdi, %rax
 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 ; SSE41-NEXT: pmovsxbw %xmm3, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; SSE41-NEXT: pmovsxbw %xmm1, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE41-NEXT: pmullw %xmm3, %xmm1
 ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
@@ -1754,12 +1754,12 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
 ; SSE41-NEXT: pcmpeqb %xmm1, %xmm4
 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; SSE41-NEXT: pmovsxbw %xmm2, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
 ; SSE41-NEXT: movdqa %xmm2, %xmm7
 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE41-NEXT: pmullw %xmm7, %xmm0
 ; SSE41-NEXT: pmullw %xmm10, %xmm1
@@ -1783,7 +1783,7 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm8
 ; SSE41-NEXT: psrad $31, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm5
 ; SSE41-NEXT: psrad $31, %xmm5
@@ -1795,7 +1795,7 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm7
 ; SSE41-NEXT: psrad $31, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3]
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm3
 ; SSE41-NEXT: psrad $31, %xmm3
@@ -1841,9 +1841,9 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
 ; AVX1-NEXT: vpmovsxbw %xmm5, %xmm4
 ; AVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4
 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbw %xmm5, %xmm5
 ; AVX1-NEXT: vpmullw %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
@@ -1865,9 +1865,9 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm7
 ; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6
 ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
@@ -1882,14 +1882,14 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
 ; AVX1-NEXT: vmovdqa %xmm9, 16(%rdi)
@@ -1929,9 +1929,9 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
 ; AVX2-NEXT: vpmovsxbd %xmm3, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
 ; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3
 ; AVX2-NEXT: vmovdqa %ymm4, (%rdi)
 ; AVX2-NEXT: retq
@@ -2411,8 +2411,8 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; SSE41-NEXT: pand %xmm10, %xmm9
 ; SSE41-NEXT: packuswb %xmm11, %xmm9
 ; SSE41-NEXT: pmovsxbw %xmm3, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm7[2,3,0,1]
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm7[2,3,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm3[2,3,2,3]
 ; SSE41-NEXT: pmullw %xmm12, %xmm8
 ; SSE41-NEXT: pxor %xmm7, %xmm7
 ; SSE41-NEXT: pcmpgtb %xmm9, %xmm7
@@ -2436,8 +2436,8 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; SSE41-NEXT: pand %xmm10, %xmm12
 ; SSE41-NEXT: packuswb %xmm3, %xmm12
 ; SSE41-NEXT: pmovsxbw %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; SSE41-NEXT: pmullw %xmm7, %xmm3
 ; SSE41-NEXT: pxor %xmm7, %xmm7
 ; SSE41-NEXT: pcmpgtb %xmm12, %xmm7
@@ -2461,8 +2461,8 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; SSE41-NEXT: pand %xmm10, %xmm11
 ; SSE41-NEXT: packuswb %xmm7, %xmm11
 ; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE41-NEXT: pmullw %xmm6, %xmm2
 ; SSE41-NEXT: pxor %xmm6, %xmm6
 ; SSE41-NEXT: pcmpgtb %xmm11, %xmm6
@@ -2487,8 +2487,8 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; SSE41-NEXT: packuswb %xmm6, %xmm5
 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
 ; SSE41-NEXT: pmullw %xmm7, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE41-NEXT: pmovsxbw %xmm4, %xmm4
 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
 ; SSE41-NEXT: pmullw %xmm4, %xmm0
@@ -2523,7 +2523,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; SSE41-NEXT: pslld $31, %xmm0
 ; SSE41-NEXT: psrad $31, %xmm0
 ; SSE41-NEXT: movdqa %xmm0, (%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3]
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm0
 ; SSE41-NEXT: psrad $31, %xmm0
@@ -2538,7 +2538,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; SSE41-NEXT: pslld $31, %xmm0
 ; SSE41-NEXT: psrad $31, %xmm0
 ; SSE41-NEXT: movdqa %xmm0, 208(%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm0
 ; SSE41-NEXT: psrad $31, %xmm0
@@ -2553,7 +2553,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; SSE41-NEXT: pslld $31, %xmm0
 ; SSE41-NEXT: psrad $31, %xmm0
 ; SSE41-NEXT: movdqa %xmm0, 144(%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm0
 ; SSE41-NEXT: psrad $31, %xmm0
@@ -2568,7 +2568,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; SSE41-NEXT: pslld $31, %xmm0
 ; SSE41-NEXT: psrad $31, %xmm0
 ; SSE41-NEXT: movdqa %xmm0, 80(%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm0
 ; SSE41-NEXT: psrad $31, %xmm0
@@ -2606,9 +2606,9 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; AVX1-NEXT: vpmovsxbw %xmm6, %xmm7
 ; AVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm4
 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbw %xmm5, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbw %xmm6, %xmm6
 ; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5
 ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
@@ -2628,9 +2628,9 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; AVX1-NEXT: vpmovsxbw %xmm1, %xmm7
 ; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6
 ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
@@ -2652,9 +2652,9 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; AVX1-NEXT: vpmullw %xmm5, %xmm7, %xmm5
 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm11, %xmm7
 ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbw %xmm6, %xmm6
 ; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4
 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
@@ -2672,9 +2672,9 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; AVX1-NEXT: vpmovsxbw %xmm2, %xmm5
 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm6
 ; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm2
@@ -2699,37 +2699,37 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; AVX1-NEXT: vmovdqa %xmm1, 64(%rdi)
 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vmovdqa %xmm1, 224(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vmovdqa %xmm1, 240(%rdi)
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vmovdqa %xmm1, 208(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vmovdqa %xmm1, 160(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vmovdqa %xmm1, 176(%rdi)
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vmovdqa %xmm1, 144(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vmovdqa %xmm1, 96(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vmovdqa %xmm1, 112(%rdi)
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vmovdqa %xmm1, 80(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vmovdqa %xmm1, 32(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vmovdqa %xmm1, 48(%rdi)
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
@@ -2794,15 +2794,15 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; AVX2-NEXT: vpackuswb %ymm6, %ymm0, %ymm0
 ; AVX2-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm0
 ; AVX2-NEXT: vpxor %ymm7, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpmovsxbd %xmm2, %ymm8
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
 ; AVX2-NEXT: vpmovsxbd %xmm6, %ymm6
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
 ; AVX2-NEXT: vpmovsxbd %xmm7, %ymm7
 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
 ; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2
 ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
 ; AVX2-NEXT: vpmovsxbd %xmm5, %ymm5
@@ -2932,7 +2932,7 @@ define <8 x i32> @smulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
@@ -2971,9 +2971,9 @@ define <8 x i32> @smulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
 define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
 ; SSE2-LABEL: smulo_v2i64:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; SSE2-NEXT: movq %xmm2, %r8
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; SSE2-NEXT: movq %xmm2, %rcx
 ; SSE2-NEXT: movq %xmm1, %rdx
 ; SSE2-NEXT: movq %xmm0, %rsi
@@ -2996,9 +2996,9 @@ define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun
 ;
 ; SSSE3-LABEL: smulo_v2i64:
 ; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; SSSE3-NEXT: movq %xmm2, %r8
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; SSSE3-NEXT: movq %xmm2, %rcx
 ; SSSE3-NEXT: movq %xmm1, %rdx
 ; SSSE3-NEXT: movq %xmm0, %rsi
@@ -3158,7 +3158,7 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
 ; SSE2-NEXT: por %xmm3, %xmm1
 ; SSE2-NEXT: movd %xmm0, %eax
 ; SSE2-NEXT: movw %ax, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE2-NEXT: movd %xmm0, %ecx
 ; SSE2-NEXT: movw %cx, 6(%rdi)
 ; SSE2-NEXT: movd %xmm2, %edx
@@ -3213,7 +3213,7 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
 ; SSSE3-NEXT: por %xmm3, %xmm1
 ; SSSE3-NEXT: movd %xmm0, %eax
 ; SSSE3-NEXT: movw %ax, (%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSSE3-NEXT: movd %xmm0, %ecx
 ; SSSE3-NEXT: movw %cx, 6(%rdi)
 ; SSSE3-NEXT: movd %xmm1, %edx
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index 8ab9367c32f86..eb12f0dbcbff2 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -109,7 +109,7 @@ define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
 ; SSE2-NEXT: pxor %xmm1, %xmm0
 ; SSE2-NEXT: movq %xmm3, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
 ; SSE2-NEXT: movd %xmm1, 8(%rdi)
 ; SSE2-NEXT: retq
 ;
@@ -122,7 +122,7 @@ define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0
 ; SSSE3-NEXT: pxor %xmm1, %xmm0
 ; SSSE3-NEXT: movq %xmm3, (%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
 ; SSSE3-NEXT: movd %xmm1, 8(%rdi)
 ; SSSE3-NEXT: retq
 ;
@@ -517,13 +517,13 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2)
 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpacksswb %xmm1, %xmm4, %xmm0
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5
+; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
-; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm4
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
@@ -649,7 +649,7 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm1
 ; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm2
 ; SSE41-NEXT: psrad $31, %xmm2
@@ -672,9 +672,9 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT: vmovdqa %xmm3, (%rdi)
@@ -688,7 +688,7 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
 ; AVX2-NEXT: vmovdqa %xmm3, (%rdi)
 ; AVX2-NEXT: retq
@@ -774,7 +774,7 @@ define <8 x i32> @ssubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
@@ -899,7 +899,7 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
 ; SSE2-NEXT: movd %xmm1, %ecx
 ; SSE2-NEXT: movw %cx, 9(%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
 ; SSE2-NEXT: movd %xmm1, %edx
 ; SSE2-NEXT: movw %dx, 6(%rdi)
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
@@ -934,7 +934,7 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
 ; SSSE3-NEXT: movd %xmm1, %ecx
 ; SSSE3-NEXT: movw %cx, 9(%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
 ; SSSE3-NEXT: movd %xmm1, %edx
 ; SSSE3-NEXT: movw %dx, 6(%rdi)
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index 864f0b59f453f..9ff793b6b677f 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -120,7 +120,7 @@ define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
 ; SSE2-NEXT: pxor %xmm1, %xmm2
 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
 ; SSE2-NEXT: movq %xmm1, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: movd %xmm1, 8(%rdi)
 ; SSE2-NEXT: retq
 ;
@@ -132,7 +132,7 @@ define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
 ; SSSE3-NEXT: pxor %xmm1, %xmm2
 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
 ; SSSE3-NEXT: movq %xmm1, (%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSSE3-NEXT: movd %xmm1, 8(%rdi)
 ; SSSE3-NEXT: retq
 ;
@@ -601,13 +601,13 @@ define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2)
 ; AVX1-NEXT: vpmaxud %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm6
-; AVX1-NEXT: vpacksswb %xmm1, %xmm6, %xmm0
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm7
+; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm6
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
-; AVX1-NEXT: vpacksswb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
@@ -727,7 +727,7 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm4
 ; SSE41-NEXT: psrad $31, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm2
 ; SSE41-NEXT: psrad $31, %xmm2
@@ -750,9 +750,9 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
 ; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
@@ -766,7 +766,7 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
 ; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
 ; AVX2-NEXT: retq
@@ -850,7 +850,7 @@ define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
@@ -956,7 +956,7 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
 ; SSE2-NEXT: movd %xmm1, %ecx
 ; SSE2-NEXT: movw %cx, 9(%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
 ; SSE2-NEXT: movd %xmm1, %edx
 ; SSE2-NEXT: movw %dx, 6(%rdi)
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
@@ -988,7 +988,7 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
 ; SSSE3-NEXT: movd %xmm1, %ecx
 ; SSSE3-NEXT: movw %cx, 9(%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
 ; SSSE3-NEXT: movd %xmm1, %edx
 ; SSSE3-NEXT: movw %dx, 6(%rdi)
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index e08bbc363721c..87fe4922dfcb0 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -1008,16 +1008,16 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2)
 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5
 ; AVX1-NEXT: vpxor %xmm5, %xmm9, %xmm5
 ; AVX1-NEXT: vpackssdw %xmm13, %xmm5, %xmm5
-; AVX1-NEXT: vpacksswb %xmm11, %xmm5, %xmm7
+; AVX1-NEXT: vpacksswb %xmm11, %xmm5, %xmm5
 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT: vpmulld %xmm6, %xmm4, %xmm4
 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3
 ; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm6
-; AVX1-NEXT: vpmovsxbd %xmm7, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm5, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vpacksswb %xmm5, %xmm11, %xmm1
+; AVX1-NEXT: vpacksswb %xmm11, %xmm11, %xmm1
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
@@ -1217,7 +1217,7 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm1
 ; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm2
 ; SSE41-NEXT: psrad $31, %xmm2
@@ -1254,9 +1254,9 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
 ; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
@@ -1278,7 +1278,7 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
 ; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
 ; AVX2-NEXT: retq
@@ -1560,7 +1560,7 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm0
 ; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3]
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm1
 ; SSE41-NEXT: psrad $31, %xmm1
@@ -1572,7 +1572,7 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm3
 ; SSE41-NEXT: psrad $31, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,2,3]
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm4
 ; SSE41-NEXT: psrad $31, %xmm4
@@ -1647,14 +1647,14 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
 ; AVX1-NEXT: vmovdqa %xmm5, 16(%rdi)
@@ -1689,9 +1689,9 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou
 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
 ; AVX2-NEXT: vpmovsxbd %xmm3, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
 ; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3
 ; AVX2-NEXT: vmovdqa %ymm4, (%rdi)
 ; AVX2-NEXT: retq
@@ -2230,7 +2230,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; SSE41-NEXT: pslld $31, %xmm0
 ; SSE41-NEXT: psrad $31, %xmm0
 ; SSE41-NEXT: movdqa %xmm0, (%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3]
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm0
 ; SSE41-NEXT: psrad $31, %xmm0
@@ -2245,7 +2245,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; SSE41-NEXT: pslld $31, %xmm0
 ; SSE41-NEXT: psrad $31, %xmm0
 ; SSE41-NEXT: movdqa %xmm0, 208(%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3]
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm0
 ; SSE41-NEXT: psrad $31, %xmm0
@@ -2260,7 +2260,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; SSE41-NEXT: pslld $31, %xmm0
 ; SSE41-NEXT: psrad $31, %xmm0
 ; SSE41-NEXT: movdqa %xmm0, 144(%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3]
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm0
 ; SSE41-NEXT: psrad $31, %xmm0
@@ -2275,7 +2275,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; SSE41-NEXT: pslld $31, %xmm0
 ; SSE41-NEXT: psrad $31, %xmm0
 ; SSE41-NEXT: movdqa %xmm0, 80(%rdi)
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3]
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm0
 ; SSE41-NEXT: psrad $31, %xmm0
@@ -2390,37 +2390,37 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; AVX1-NEXT: vmovdqa %xmm4, 64(%rdi)
 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4
 ; AVX1-NEXT: vmovdqa %xmm4, (%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
 ; AVX1-NEXT: vmovdqa %xmm4, 224(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
 ; AVX1-NEXT: vmovdqa %xmm4, 240(%rdi)
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
 ; AVX1-NEXT: vmovdqa %xmm3, 208(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
 ; AVX1-NEXT: vmovdqa %xmm3, 160(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
 ; AVX1-NEXT: vmovdqa %xmm3, 176(%rdi)
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
 ; AVX1-NEXT: vmovdqa %xmm2, 144(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
 ; AVX1-NEXT: vmovdqa %xmm2, 96(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
 ; AVX1-NEXT: vmovdqa %xmm2, 112(%rdi)
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vmovdqa %xmm1, 80(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vmovdqa %xmm1, 32(%rdi)
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vmovdqa %xmm1, 48(%rdi)
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
@@ -2475,15 +2475,15 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou
 ; AVX2-NEXT: vpackuswb %ymm7, %ymm0, %ymm0
 ; AVX2-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0
 ; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
 ; AVX2-NEXT: vpmovsxbd %xmm6, %ymm6
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
 ; AVX2-NEXT: vpmovsxbd %xmm7, %ymm7
 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
 ; AVX2-NEXT: vpmovsxbd %xmm5, %ymm5
 ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
 ; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3
@@ -2608,7 +2608,7 @@ define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
@@ -2646,9 +2646,9 @@ define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
 define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
 ; SSE2-LABEL: umulo_v2i64:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; SSE2-NEXT: movq %xmm2, %r8
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; SSE2-NEXT: movq %xmm2, %r10
 ; SSE2-NEXT: movq %xmm0, %rax
 ; SSE2-NEXT: movq %xmm1, %rdx
@@ -2672,9 +2672,9 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun
 ;
 ; SSSE3-LABEL: umulo_v2i64:
 ; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; SSSE3-NEXT: movq %xmm2, %r8
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; SSSE3-NEXT: movq %xmm2, %r10
 ; SSSE3-NEXT: movq %xmm0, %rax
 ; SSSE3-NEXT: movq %xmm1, %rdx
@@ -2829,7 +2829,7 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
 ; SSE2-NEXT: por %xmm3, %xmm0
 ; SSE2-NEXT: movd %xmm2, %eax
 ; SSE2-NEXT: movw %ax, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; SSE2-NEXT: movd %xmm2, %ecx
 ; SSE2-NEXT: movw %cx, 6(%rdi)
 ; SSE2-NEXT: movd %xmm1, %edx
@@ -2873,7 +2873,7 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
 ; SSSE3-NEXT: por %xmm3, %xmm0
 ; SSSE3-NEXT: movd %xmm2, %eax
 ; SSSE3-NEXT: movw %ax, (%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; SSSE3-NEXT: movd %xmm2, %ecx
 ; SSSE3-NEXT: movw %cx, 6(%rdi)
 ; SSSE3-NEXT: movd %xmm1, %edx
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index 5302b3f8913df..155c5591ce11a 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -126,7 +126,7 @@ define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
 ; SSE2-NEXT: pxor %xmm0, %xmm2
 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
 ; SSE2-NEXT: movq %xmm0, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE2-NEXT: movd %xmm0, 8(%rdi)
 ; SSE2-NEXT: movdqa %xmm2, %xmm0
 ; SSE2-NEXT: retq
@@ -140,7 +140,7 @@ define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
 ; SSSE3-NEXT: pxor %xmm0, %xmm2
 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
 ; SSSE3-NEXT: movq %xmm0, (%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSSE3-NEXT: movd %xmm0, 8(%rdi)
 ; SSSE3-NEXT: movdqa %xmm2, %xmm0
 ; SSSE3-NEXT: retq
@@ -644,13 +644,13 @@ define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2)
 ; AVX1-NEXT: vpminud %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm6
-; AVX1-NEXT: vpacksswb %xmm1, %xmm6, %xmm0
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm7
+; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm6
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
-; AVX1-NEXT: vpacksswb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
@@ -771,7 +771,7 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm1
 ; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
 ; SSE41-NEXT: pslld $31, %xmm2
 ; SSE41-NEXT: psrad $31, %xmm2
@@ -793,9 +793,9 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
 ; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
@@ -809,7 +809,7 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
 ; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
 ; AVX2-NEXT: retq
@@ -895,7 +895,7 @@ define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
@@ -1003,7 +1003,7 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
 ; SSE2-NEXT: movd %xmm1, %ecx
 ; SSE2-NEXT: movw %cx, 9(%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
 ; SSE2-NEXT: movd %xmm1, %edx
 ; SSE2-NEXT: movw %dx, 6(%rdi)
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
@@ -1035,7 +1035,7 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
 ; SSSE3-NEXT: movd %xmm1, %ecx
 ; SSSE3-NEXT: movw %cx, 9(%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
 ; SSSE3-NEXT: movd %xmm1, %edx
 ; SSSE3-NEXT: movw %dx, 6(%rdi)
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index 22e97e7407533..62bc377b9cec3 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -6296,7 +6296,7 @@ define <2 x double> @constrained_vector_sitofp_v2f64_v2i64(<2 x i64> %x) #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: movq %xmm0, %rax
 ; CHECK-NEXT: cvtsi2sd %rax, %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; CHECK-NEXT: movq %xmm0, %rax
 ; CHECK-NEXT: xorps %xmm0, %xmm0
 ; CHECK-NEXT: cvtsi2sd %rax, %xmm0
@@ -6342,7 +6342,7 @@ define <2 x float> @constrained_vector_sitofp_v2f32_v2i64(<2 x i64> %x) #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: movq %xmm0, %rax
 ; CHECK-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; CHECK-NEXT: movq %xmm0, %rax
 ; CHECK-NEXT: xorps %xmm0, %xmm0
 ; CHECK-NEXT: cvtsi2ss %rax, %xmm0
@@ -6375,7 +6375,7 @@ define <3 x double> @constrained_vector_sitofp_v3f64_v3i32(<3 x i32> %x) #0 {
 ; CHECK-NEXT: movd %xmm1, %eax
 ; CHECK-NEXT: xorps %xmm1, %xmm1
 ; CHECK-NEXT: cvtsi2sd %eax, %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; CHECK-NEXT: movd %xmm0, %eax
 ; CHECK-NEXT: xorps %xmm0, %xmm0
 ; CHECK-NEXT: cvtsi2sd %eax, %xmm0
@@ -6414,7 +6414,7 @@ define <3 x float> @constrained_vector_sitofp_v3f32_v3i32(<3 x i32> %x) #0 {
 ; CHECK-NEXT: xorps %xmm2, %xmm2
 ; CHECK-NEXT: cvtsi2ss %eax, %xmm2
 ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; CHECK-NEXT: movd %xmm0, %eax
 ; CHECK-NEXT: xorps %xmm0, %xmm0
 ; CHECK-NEXT: cvtsi2ss %eax, %xmm0
@@ -6535,7 +6535,7 @@ define <4 x double> @constrained_vector_sitofp_v4f64_v4i32(<4 x i32> %x) #0 {
 ; CHECK-LABEL: constrained_vector_sitofp_v4f64_v4i32:
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: cvtdq2pd %xmm0, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; CHECK-NEXT: cvtdq2pd %xmm0, %xmm1
 ; CHECK-NEXT: movaps %xmm2, %xmm0
 ; CHECK-NEXT: retq
@@ -6575,14 +6575,14 @@ define <4 x double> @constrained_vector_sitofp_v4f64_v4i64(<4 x i64> %x) #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: movq %xmm0, %rax
 ; CHECK-NEXT: cvtsi2sd %rax, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; CHECK-NEXT: movq %xmm0, %rax
 ; CHECK-NEXT: xorps %xmm0, %xmm0
 ; CHECK-NEXT: cvtsi2sd %rax, %xmm0
 ; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
 ; CHECK-NEXT: movq %xmm1, %rax
 ; CHECK-NEXT: cvtsi2sd %rax, %xmm3
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; CHECK-NEXT: movq %xmm0, %rax
 ; CHECK-NEXT: xorps %xmm0, %xmm0
 ; CHECK-NEXT: cvtsi2sd %rax, %xmm0
@@ -6642,7 +6642,7 @@ define <4 x float> @constrained_vector_sitofp_v4f32_v4i64(<4 x i64> %x) #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: movq %xmm1, %rax
 ; CHECK-NEXT: cvtsi2ss %rax, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; CHECK-NEXT: movq %xmm1, %rax
 ; CHECK-NEXT: xorps %xmm1, %xmm1
 ; CHECK-NEXT: cvtsi2ss %rax, %xmm1
@@ -6650,7 +6650,7 @@ define <4 x float> @constrained_vector_sitofp_v4f32_v4i64(<4 x i64> %x) #0 {
 ; CHECK-NEXT: movq %xmm0, %rax
 ; CHECK-NEXT: xorps %xmm1, %xmm1
 ; CHECK-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; CHECK-NEXT: movq %xmm0, %rax
 ; CHECK-NEXT: xorps %xmm0, %xmm0
 ; CHECK-NEXT: cvtsi2ss %rax, %xmm0
@@ -6970,7 +6970,7 @@ define <2 x float> @constrained_vector_uitofp_v2f32_v2i64(<2 x i64> %x) #0 {
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: addss %xmm0, %xmm0
 ; CHECK-NEXT: .LBB174_2: # %entry
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; CHECK-NEXT: movq %xmm1, %rax
 ; CHECK-NEXT: movq %rax, %rcx
 ; CHECK-NEXT: shrq %rcx
@@ -7031,7 +7031,7 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i32(<3 x i32> %x) #0 {
 ; CHECK-NEXT: movd %xmm1, %eax
 ; CHECK-NEXT: xorps %xmm1, %xmm1
 ; CHECK-NEXT: cvtsi2sd %rax, %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; CHECK-NEXT: movd %xmm0, %eax
 ; CHECK-NEXT: xorps %xmm0, %xmm0
 ; CHECK-NEXT: cvtsi2sd %rax, %xmm0
@@ -7082,7 +7082,7 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i32(<3 x i32> %x) #0 {
 ; CHECK-NEXT: xorps %xmm2, %xmm2
 ; CHECK-NEXT: cvtsi2ss %rax, %xmm2
 ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; CHECK-NEXT: movd %xmm0, %eax
 ; CHECK-NEXT: xorps %xmm0, %xmm0
 ; CHECK-NEXT: cvtsi2ss %rax, %xmm0
@@ -7157,7 +7157,7 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 {
 ; AVX1-NEXT: vsubpd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
 ; AVX1-NEXT: vaddpd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
 ; AVX1-NEXT: vsubpd %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm4[1,0]
@@ -7458,7 +7458,7 @@ define <4 x float> @constrained_vector_uitofp_v4f32_v4i64(<4 x i64> %x) #0 {
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: addss %xmm2, %xmm2
 ; CHECK-NEXT: .LBB182_2: # %entry
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; CHECK-NEXT: movq %xmm1, %rax
 ; CHECK-NEXT: movq %rax, %rcx
 ; CHECK-NEXT: shrq %rcx
@@ -7487,7 +7487,7 @@ define <4 x float> @constrained_vector_uitofp_v4f32_v4i64(<4 x i64> %x) #0 {
 ; CHECK-NEXT: addss %xmm1, %xmm1
 ; CHECK-NEXT: .LBB182_6: # %entry
 ; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; CHECK-NEXT: movq %xmm0, %rax
 ; CHECK-NEXT: movq %rax, %rcx
 ; CHECK-NEXT: shrq %rcx
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index d8442048f65ec..0192d1e8137c1 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -30,7 +30,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
 ; SSE2-NEXT: movdqa %xmm0, %xmm3
 ; SSE2-NEXT: psllq %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
 ; SSE2-NEXT: movdqa %xmm0, %xmm5
 ; SSE2-NEXT: psllq %xmm4, %xmm5
 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
@@ -38,7 +38,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ; SSE2-NEXT: psubq %xmm2, %xmm3
 ; SSE2-NEXT: movdqa %xmm1, %xmm4
 ; SSE2-NEXT: psrlq %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
 ; SSE2-NEXT: psrlq %xmm3, %xmm1
 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
 ; SSE2-NEXT: orpd %xmm5, %xmm1
@@ -56,7 +56,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ; SSE41-NEXT: movdqa %xmm0, %xmm3
 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
 ; SSE41-NEXT: psllq %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
 ; SSE41-NEXT: movdqa %xmm3, %xmm4
 ; SSE41-NEXT: psllq %xmm5, %xmm4
 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm4[4,5,6,7]
@@ -64,7 +64,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ; SSE41-NEXT: psubq %xmm2, %xmm0
 ; SSE41-NEXT: movdqa %xmm1, %xmm5
 ; SSE41-NEXT: psrlq %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE41-NEXT: psrlq %xmm0, %xmm1
 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT: por %xmm1, %xmm4
@@ -78,13 +78,13 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
 ; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm4
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
@@ -212,7 +212,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
 ; X32-SSE-NEXT: psllq %xmm2, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm5
 ; X32-SSE-NEXT: psllq %xmm4, %xmm5
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
@@ -220,7 +220,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ; X32-SSE-NEXT: psubq %xmm2, %xmm3
 ; X32-SSE-NEXT: movdqa %xmm1, %xmm4
 ; X32-SSE-NEXT: psrlq %xmm3, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
 ; X32-SSE-NEXT: psrlq %xmm3, %xmm1
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
 ; X32-SSE-NEXT: orpd %xmm5, %xmm1
@@ -249,7 +249,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
 ; SSE2-NEXT: movdqa %xmm1, %xmm3
 ; SSE2-NEXT: psrld %xmm6, %xmm3
 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
 ; SSE2-NEXT: movdqa %xmm1, %xmm6
 ; SSE2-NEXT: psrld %xmm5, %xmm6
@@ -285,7 +285,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,3,3,3,4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm1, %xmm5
 ; SSE41-NEXT: psrld %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
 ; SSE41-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[2,3,3,3,4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm1, %xmm7
 ; SSE41-NEXT: psrld %xmm6, %xmm7
@@ -465,7 +465,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
 ; X32-SSE-NEXT: psrld %xmm6, %xmm3
 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
 ; X32-SSE-NEXT: movdqa %xmm1, %xmm6
 ; X32-SSE-NEXT: psrld %xmm5, %xmm6
@@ -1366,7 +1366,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ; X32-SSE-NEXT: psubq %xmm3, %xmm4
 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm1
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 0688107ed5c0b..0cf4c172412a6 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -26,11 +26,11 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
 ; AVX1-NEXT: vpsllq %xmm4, %xmm3, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3]
 ; AVX1-NEXT: vpsllq %xmm6, %xmm3, %xmm3
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
 ; AVX1-NEXT: vpsllq %xmm6, %xmm0, %xmm6
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
@@ -38,12 +38,12 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
 ; AVX1-NEXT: vpsubq %xmm4, %xmm8, %xmm6
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
 ; AVX1-NEXT: vpsrlq %xmm6, %xmm7, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
 ; AVX1-NEXT: vpsrlq %xmm6, %xmm7, %xmm6
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
 ; AVX1-NEXT: vpsubq %xmm2, %xmm8, %xmm6
 ; AVX1-NEXT: vpsrlq %xmm6, %xmm1, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
 ; AVX1-NEXT: vpsrlq %xmm6, %xmm1, %xmm1
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index c560f99916bed..59bef3a97b1fa 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -31,14 +31,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
 ; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: movdqa %xmm0, %xmm4
 ; SSE2-NEXT: psllq %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT: movdqa %xmm0, %xmm5
 ; SSE2-NEXT: psllq %xmm1, %xmm5
 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
 ; SSE2-NEXT: pand %xmm2, %xmm3
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psrlq %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
 ; SSE2-NEXT: psrlq %xmm2, %xmm0
 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE2-NEXT: orpd %xmm5, %xmm0
@@ -52,14 +52,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
 ; SSE41-NEXT: pand %xmm2, %xmm1
 ; SSE41-NEXT: movdqa %xmm0, %xmm4
 ; SSE41-NEXT: psllq %xmm1, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE41-NEXT: movdqa %xmm0, %xmm5
 ; SSE41-NEXT: psllq %xmm1, %xmm5
 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5,6,7]
 ; SSE41-NEXT: pand %xmm2, %xmm3
 ; SSE41-NEXT: movdqa %xmm0, %xmm1
 ; SSE41-NEXT: psrlq %xmm3, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
 ; SSE41-NEXT: psrlq %xmm2, %xmm0
 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; SSE41-NEXT: por %xmm5, %xmm0
@@ -70,14 +70,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1
 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
@@ -136,14 +136,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
 ; X32-SSE-NEXT: pand %xmm2, %xmm1
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: psllq %xmm1, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm5
 ; X32-SSE-NEXT: psllq %xmm1, %xmm5
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
 ; X32-SSE-NEXT: pand %xmm2, %xmm3
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psrlq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
 ; X32-SSE-NEXT: psrlq %xmm2, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; X32-SSE-NEXT: orpd %xmm5, %xmm0
@@ -745,14 +745,14 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind
 ; X32-SSE-NEXT: pand %xmm2, %xmm1
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: psllq %xmm1, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm5
 ; X32-SSE-NEXT: psllq %xmm1, %xmm5
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
 ; X32-SSE-NEXT: pand %xmm2, %xmm3
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psrlq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
 ; X32-SSE-NEXT: psrlq %xmm2, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; X32-SSE-NEXT: orpd %xmm5, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index 655b6e4c25048..fd0e1c7e2f3ab 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -24,11 +24,11 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm3
 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
 ; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
 ; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm4
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
@@ -38,13 +38,13 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
 ; AVX1-NEXT: vpsubq %xmm3, %xmm5, %xmm4
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63]
 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
 ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
 ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1
 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -520,7 +520,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
 ; AVX1-NEXT: vpsrlq %xmm2, %xmm4, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
 ; AVX1-NEXT: vpsrlq %xmm6, %xmm4, %xmm7
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6,7]
 ; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index b7cc39a32d718..817bca051e0ae 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -30,7 +30,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
 ; SSE2-NEXT: movdqa %xmm1, %xmm3
 ; SSE2-NEXT: psrlq %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
 ; SSE2-NEXT: movdqa %xmm1, %xmm5
 ; SSE2-NEXT: psrlq %xmm4, %xmm5
 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
@@ -38,7 +38,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ; SSE2-NEXT: psubq %xmm2, %xmm3
 ; SSE2-NEXT: movdqa %xmm0, %xmm4
 ; SSE2-NEXT: psllq %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE2-NEXT:
pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; SSE2-NEXT: psllq %xmm3, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE2-NEXT: orpd %xmm5, %xmm0 @@ -58,7 +58,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: psrlq %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: psrlq %xmm4, %xmm5 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm0[0,1,2,3],xmm5[4,5,6,7] @@ -66,7 +66,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; SSE41-NEXT: psubq %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm3, %xmm4 ; SSE41-NEXT: psllq %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: psllq %xmm0, %xmm3 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; SSE41-NEXT: por %xmm5, %xmm3 @@ -80,13 +80,13 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; AVX1: # %bb.0: ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 @@ -215,7 +215,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3 ; X32-SSE-NEXT: psrlq %xmm2, %xmm3 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] ; X32-SSE-NEXT: movdqa %xmm1, %xmm5 ; X32-SSE-NEXT: psrlq %xmm4, %xmm5 ; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] @@ -223,7 +223,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; X32-SSE-NEXT: psubq %xmm2, %xmm3 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psllq %xmm3, %xmm4 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; X32-SSE-NEXT: psllq %xmm3, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; X32-SSE-NEXT: orpd %xmm5, %xmm0 @@ -251,7 +251,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: psrld %xmm5, %xmm3 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: psrld %xmm5, %xmm6 @@ -287,7 +287,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: psrld %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa 
%xmm1, %xmm6 ; SSE41-NEXT: psrld %xmm5, %xmm6 @@ -469,7 +469,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; X32-SSE-NEXT: movdqa %xmm1, %xmm3 ; X32-SSE-NEXT: psrld %xmm5, %xmm3 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] ; X32-SSE-NEXT: movdqa %xmm1, %xmm6 ; X32-SSE-NEXT: psrld %xmm5, %xmm6 @@ -1380,7 +1380,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; X32-SSE-NEXT: psubq %xmm4, %xmm5 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psllq %xmm5, %xmm4 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] ; X32-SSE-NEXT: psllq %xmm5, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; X32-SSE-NEXT: movdqa %xmm1, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index e9cb0a0586f09..f0848cfd2e49a 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -26,11 +26,11 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpsrlq %xmm4, %xmm3, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm6, %xmm1, %xmm6 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 @@ -38,12 +38,12 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; AVX1-NEXT: vpsubq %xmm4, %xmm8, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 ; AVX1-NEXT: vpsllq %xmm6, %xmm7, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm6, %xmm7, %xmm6 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7] ; AVX1-NEXT: vpsubq %xmm2, %xmm8, %xmm6 ; AVX1-NEXT: vpsllq %xmm6, %xmm0, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll index eadc8544f8afb..8fe7ba9e471a9 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -31,14 +31,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrlq %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: psrlq %xmm1, %xmm5 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psllq %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = 
xmm3[2,3,2,3] ; SSE2-NEXT: psllq %xmm2, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: orpd %xmm5, %xmm0 @@ -52,14 +52,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; SSE41-NEXT: pand %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: psrlq %xmm1, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrlq %xmm1, %xmm5 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psllq %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSE41-NEXT: psllq %xmm2, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: por %xmm5, %xmm0 @@ -70,14 +70,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 @@ -138,14 +138,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; X32-SSE-NEXT: pand %xmm2, %xmm1 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psrlq %xmm1, %xmm4 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X32-SSE-NEXT: movdqa %xmm0, %xmm5 ; X32-SSE-NEXT: psrlq %xmm1, %xmm5 ; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; X32-SSE-NEXT: pand %xmm2, %xmm3 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: psllq %xmm3, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; X32-SSE-NEXT: psllq %xmm2, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X32-SSE-NEXT: orpd %xmm5, %xmm0 @@ -789,14 +789,14 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind ; X32-SSE-NEXT: pand %xmm2, %xmm1 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psrlq %xmm1, %xmm4 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X32-SSE-NEXT: movdqa %xmm0, %xmm5 ; X32-SSE-NEXT: psrlq %xmm1, %xmm5 ; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; X32-SSE-NEXT: pand %xmm2, %xmm3 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: psllq %xmm3, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; X32-SSE-NEXT: psllq %xmm2, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X32-SSE-NEXT: orpd %xmm5, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll index 61c45a118e47a..4e92bfc4f9136 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -24,11 +24,11 @@ 
define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind { ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 @@ -38,13 +38,13 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63] ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -566,7 +566,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsllq %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm6, %xmm4, %xmm7 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6,7] ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll index ac8e1998ceb88..22a3895687356 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -20,7 +20,7 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind { ; SSE2-NEXT: sarq %rdx ; SSE2-NEXT: addq %rax, %rdx ; SSE2-NEXT: movq %rdx, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: imulq %rcx ; SSE2-NEXT: movq %rdx, %rax @@ -199,7 +199,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427] ; SSE41-NEXT: pmullw %xmm2, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxbw %xmm3, %xmm3 ; SSE41-NEXT: pmullw %xmm2, %xmm3 ; SSE41-NEXT: psrlw $8, %xmm3 @@ -223,7 +223,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427] ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 @@ -327,7 +327,7 @@ define <16 x i8> @test_divconstant_16i8(<16 x 
i8> %a) nounwind { ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] @@ -371,7 +371,7 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -457,7 +457,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-NEXT: subq %rax, %rdx ; SSE2-NEXT: addq %rcx, %rdx ; SSE2-NEXT: movq %rdx, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: imulq %rsi @@ -674,7 +674,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427] ; SSE41-NEXT: pmullw %xmm2, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxbw %xmm3, %xmm3 ; SSE41-NEXT: pmullw %xmm2, %xmm3 ; SSE41-NEXT: psrlw $8, %xmm3 @@ -702,7 +702,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427] ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 @@ -836,7 +836,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: pmovsxbw %xmm0, %xmm2 ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxbw %xmm4, %xmm4 ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm4 ; SSE41-NEXT: psrlw $8, %xmm4 @@ -879,7 +879,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll index a9cf1aa80af8d..e06140f988d54 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -163,7 +163,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427] ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; 
AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4 ; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 @@ -182,7 +182,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7 ; AVX1-NEXT: vpmullw %xmm3, %xmm7, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 @@ -260,7 +260,7 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpmovsxbw %xmm2, %xmm4 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 @@ -289,7 +289,7 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -574,7 +574,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427] ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4 ; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 @@ -598,7 +598,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4 ; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 @@ -688,7 +688,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpmovsxbw %xmm2, %xmm4 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm5, %xmm5 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm5, %xmm5 ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 @@ -725,7 +725,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm5 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm5, %xmm5 ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm6, %xmm6 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm6, %xmm6 ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll index 16be83c84fb3c..65131c0e3cb73 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -20,7 +20,7 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind { ; 
SSE2-NEXT: shrq %rcx ; SSE2-NEXT: addq %rdx, %rcx ; SSE2-NEXT: movq %rcx, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: mulq %rsi @@ -445,7 +445,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-NEXT: subq %rdx, %rax ; SSE2-NEXT: addq %rcx, %rax ; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: mulq %rsi diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll index 2900ce2c8661e..724f6007623db 100644 --- a/llvm/test/CodeGen/X86/vector-pcmp.ll +++ b/llvm/test/CodeGen/X86/vector-pcmp.ll @@ -452,7 +452,7 @@ define <8 x i32> @cmpne_knownzeros_zext_v8i16_v8i32(<8 x i16> %x) { ; SSE42: # %bb.0: ; SSE42-NEXT: psrlw $15, %xmm0 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE42-NEXT: movdqa %xmm2, %xmm0 ; SSE42-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll index e4f785dca2b1a..a00e74fa1cacc 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll @@ -14,21 +14,21 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE-LABEL: test_v2i64: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddq %xmm0, %xmm1 ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: retq @@ -40,7 +40,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE-LABEL: test_v4i64: ; SSE: # %bb.0: ; SSE-NEXT: paddq %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddq %xmm0, %xmm1 ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq @@ -49,7 +49,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -59,7 +59,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -69,7 +69,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; 
AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper @@ -84,7 +84,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE-NEXT: paddq %xmm3, %xmm1 ; SSE-NEXT: paddq %xmm2, %xmm1 ; SSE-NEXT: paddq %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: paddq %xmm1, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq @@ -96,7 +96,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -107,7 +107,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -119,7 +119,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper @@ -138,7 +138,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE-NEXT: paddq %xmm4, %xmm2 ; SSE-NEXT: paddq %xmm3, %xmm2 ; SSE-NEXT: paddq %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: paddq %xmm2, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq @@ -156,7 +156,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -169,7 +169,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -182,7 +182,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper @@ -236,7 +236,7 @@ define i32 @test_v2i32(<2 x i32> %a0) { define i32 @test_v4i32(<4 x i32> %a0) { ; SSE-LABEL: test_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddd %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: paddd %xmm1, %xmm0 @@ -245,7 +245,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; 
AVX1-SLOW-LABEL: test_v4i32: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -261,7 +261,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX2-LABEL: test_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -270,7 +270,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX512-LABEL: test_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -284,7 +284,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE-LABEL: test_v8i32: ; SSE: # %bb.0: ; SSE-NEXT: paddd %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddd %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: paddd %xmm1, %xmm0 @@ -295,7 +295,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -317,7 +317,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -329,7 +329,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -346,7 +346,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE-NEXT: paddd %xmm3, %xmm1 ; SSE-NEXT: paddd %xmm2, %xmm1 ; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: paddd %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: paddd %xmm0, %xmm1 @@ -360,7 +360,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -375,7 +375,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 ; AVX1-FAST-NEXT: vpaddd %xmm2, 
%xmm1, %xmm1 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax @@ -387,7 +387,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -401,7 +401,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -422,7 +422,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE-NEXT: paddd %xmm4, %xmm2 ; SSE-NEXT: paddd %xmm3, %xmm2 ; SSE-NEXT: paddd %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: paddd %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: paddd %xmm0, %xmm1 @@ -442,7 +442,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -463,7 +463,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm2, %xmm2 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax @@ -477,7 +477,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -492,7 +492,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -608,7 +608,7 @@ define i16 @test_v4i16(<4 x i16> %a0) { define i16 @test_v8i16(<8 x i16> %a0) { ; SSE-LABEL: test_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddw %xmm0, %xmm1 ; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: paddw %xmm1, %xmm0 @@ -621,7 +621,7 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; ; AVX1-SLOW-LABEL: test_v8i16: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -642,7 +642,7 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; ; AVX2-LABEL: test_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -654,7 +654,7 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; ; AVX512-LABEL: test_v8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -671,7 +671,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; SSE-LABEL: test_v16i16: ; SSE: # %bb.0: ; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddw %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: paddw %xmm1, %xmm0 @@ -686,7 +686,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -713,7 +713,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -728,7 +728,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -748,7 +748,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; SSE-NEXT: paddw %xmm3, %xmm1 ; SSE-NEXT: paddw %xmm2, %xmm1 ; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: paddw %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: paddw %xmm0, %xmm1 @@ -766,7 +766,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm3, %xmm2 ; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, 
%xmm0, %xmm0 @@ -784,7 +784,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm3, %xmm2 ; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -799,7 +799,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -816,7 +816,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -840,7 +840,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; SSE-NEXT: paddw %xmm4, %xmm2 ; SSE-NEXT: paddw %xmm3, %xmm2 ; SSE-NEXT: paddw %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: paddw %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: paddw %xmm0, %xmm1 @@ -864,7 +864,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -888,7 +888,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm2, %xmm2 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -905,7 +905,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -923,7 +923,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -1142,7 +1142,7 @@ define i8 @test_v8i8_load(<8 x i8>* %p) { define i8 @test_v16i8(<16 x 
i8> %a0) { ; SSE-LABEL: test_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddb %xmm0, %xmm1 ; SSE-NEXT: pxor %xmm0, %xmm0 ; SSE-NEXT: psadbw %xmm1, %xmm0 @@ -1152,7 +1152,7 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; ; AVX-LABEL: test_v16i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 @@ -1162,7 +1162,7 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; ; AVX512-LABEL: test_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 @@ -1177,7 +1177,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; SSE-LABEL: test_v32i8: ; SSE: # %bb.0: ; SSE-NEXT: paddb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddb %xmm0, %xmm1 ; SSE-NEXT: pxor %xmm0, %xmm0 ; SSE-NEXT: psadbw %xmm1, %xmm0 @@ -1189,7 +1189,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 @@ -1202,7 +1202,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 @@ -1215,7 +1215,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 @@ -1233,7 +1233,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; SSE-NEXT: paddb %xmm3, %xmm1 ; SSE-NEXT: paddb %xmm2, %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: paddb %xmm1, %xmm0 ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: psadbw %xmm0, %xmm1 @@ -1248,7 +1248,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 @@ -1262,7 +1262,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw 
%xmm1, %xmm0, %xmm0 @@ -1277,7 +1277,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 @@ -1299,7 +1299,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE-NEXT: paddb %xmm4, %xmm2 ; SSE-NEXT: paddb %xmm3, %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: paddb %xmm2, %xmm0 ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: psadbw %xmm0, %xmm1 @@ -1320,7 +1320,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 @@ -1336,7 +1336,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 @@ -1352,7 +1352,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll index c94f96958f5b0..17a3d6f46e98c 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll @@ -13,7 +13,7 @@ define i1 @test_v2i64(<2 x i64> %a0) { ; SSE-LABEL: test_v2i64: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: testq %rax, %rax @@ -22,7 +22,7 @@ define i1 @test_v2i64(<2 x i64> %a0) { ; ; AVX-LABEL: test_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: testq %rax, %rax @@ -37,7 +37,7 @@ define i1 @test_v4i64(<4 x i64> %a0) { ; SSE-LABEL: test_v4i64: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: testq %rax, %rax @@ -48,7 +48,7 @@ define i1 @test_v4i64(<4 x i64> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: testq %rax, %rax 
@@ -60,7 +60,7 @@ define i1 @test_v4i64(<4 x i64> %a0) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: testq %rax, %rax
@@ -72,7 +72,7 @@ define i1 @test_v4i64(<4 x i64> %a0) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vmovq %xmm0, %rax
 ; AVX512-NEXT: testq %rax, %rax
@@ -90,7 +90,7 @@ define i1 @test_v8i64(<8 x i64> %a0) {
 ; SSE-NEXT: pand %xmm3, %xmm1
 ; SSE-NEXT: pand %xmm2, %xmm1
 ; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
 ; SSE-NEXT: movq %xmm0, %rax
 ; SSE-NEXT: testq %rax, %rax
@@ -102,7 +102,7 @@ define i1 @test_v8i64(<8 x i64> %a0) {
 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: testq %rax, %rax
@@ -115,7 +115,7 @@ define i1 @test_v8i64(<8 x i64> %a0) {
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: testq %rax, %rax
@@ -129,7 +129,7 @@ define i1 @test_v8i64(<8 x i64> %a0) {
 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vmovq %xmm0, %rax
 ; AVX512-NEXT: testq %rax, %rax
@@ -151,7 +151,7 @@ define i1 @test_v16i64(<16 x i64> %a0) {
 ; SSE-NEXT: pand %xmm4, %xmm2
 ; SSE-NEXT: pand %xmm3, %xmm2
 ; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE-NEXT: pand %xmm2, %xmm0
 ; SSE-NEXT: movq %xmm0, %rax
 ; SSE-NEXT: testq %rax, %rax
@@ -165,7 +165,7 @@ define i1 @test_v16i64(<16 x i64> %a0) {
 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: testq %rax, %rax
@@ -180,7 +180,7 @@ define i1 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: testq %rax, %rax
@@ -195,7 +195,7 @@ define i1 @test_v16i64(<16 x i64> %a0) {
 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vmovq %xmm0, %rax
 ; AVX512-NEXT: testq %rax, %rax
@@ -237,7 +237,7 @@ define i1 @test_v2i32(<2 x i32> %a0) {
 define i1 @test_v4i32(<4 x i32> %a0) {
 ; SSE-LABEL: test_v4i32:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
@@ -248,7 +248,7 @@ define i1 @test_v4i32(<4 x i32> %a0) {
 ;
 ; AVX-LABEL: test_v4i32:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -265,7 +265,7 @@ define i1 @test_v8i32(<8 x i32> %a0) {
 ; SSE-LABEL: test_v8i32:
 ; SSE: # %bb.0:
 ; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
@@ -278,7 +278,7 @@ define i1 @test_v8i32(<8 x i32> %a0) {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -292,7 +292,7 @@ define i1 @test_v8i32(<8 x i32> %a0) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -306,7 +306,7 @@ define i1 @test_v8i32(<8 x i32> %a0) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -326,7 +326,7 @@ define i1 @test_v16i32(<16 x i32> %a0) {
 ; SSE-NEXT: pand %xmm3, %xmm1
 ; SSE-NEXT: pand %xmm2, %xmm1
 ; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
@@ -340,7 +340,7 @@ define i1 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -355,7 +355,7 @@ define i1 @test_v16i32(<16 x i32> %a0) {
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -371,7 +371,7 @@ define i1 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -395,7 +395,7 @@ define i1 @test_v32i32(<32 x i32> %a0) {
 ; SSE-NEXT: pand %xmm4, %xmm2
 ; SSE-NEXT: pand %xmm3, %xmm2
 ; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE-NEXT: pand %xmm2, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
@@ -411,7 +411,7 @@ define i1 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -428,7 +428,7 @@ define i1 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -445,7 +445,7 @@ define i1 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -518,7 +518,7 @@ define i1 @test_v4i16(<4 x i16> %a0) {
 define i1 @test_v8i16(<8 x i16> %a0) {
 ; SSE-LABEL: test_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
@@ -532,7 +532,7 @@ define i1 @test_v8i16(<8 x i16> %a0) {
 ;
 ; AVX-LABEL: test_v8i16:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -551,7 +551,7 @@ define i1 @test_v16i16(<16 x i16> %a0) {
 ; SSE-LABEL: test_v16i16:
 ; SSE: # %bb.0:
 ; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
@@ -567,7 +567,7 @@ define i1 @test_v16i16(<16 x i16> %a0) {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -583,7 +583,7 @@ define i1 @test_v16i16(<16 x i16> %a0) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -599,7 +599,7 @@ define i1 @test_v16i16(<16 x i16> %a0) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -621,7 +621,7 @@ define i1 @test_v32i16(<32 x i16> %a0) {
 ; SSE-NEXT: pand %xmm3, %xmm1
 ; SSE-NEXT: pand %xmm2, %xmm1
 ; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
@@ -638,7 +638,7 @@ define i1 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -655,7 +655,7 @@ define i1 @test_v32i16(<32 x i16> %a0) {
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -673,7 +673,7 @@ define i1 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -699,7 +699,7 @@ define i1 @test_v64i16(<64 x i16> %a0) {
 ; SSE-NEXT: pand %xmm4, %xmm2
 ; SSE-NEXT: pand %xmm3, %xmm2
 ; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE-NEXT: pand %xmm2, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
@@ -718,7 +718,7 @@ define i1 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -737,7 +737,7 @@ define i1 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -756,7 +756,7 @@ define i1 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -865,7 +865,7 @@ define i1 @test_v8i8(<8 x i8> %a0) {
 define i1 @test_v16i8(<16 x i8> %a0) {
 ; SSE-LABEL: test_v16i8:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
@@ -882,7 +882,7 @@ define i1 @test_v16i8(<16 x i8> %a0) {
 ;
 ; AVX-LABEL: test_v16i8:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -903,7 +903,7 @@ define i1 @test_v32i8(<32 x i8> %a0) {
 ; SSE-LABEL: test_v32i8:
 ; SSE: # %bb.0:
 ; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
@@ -922,7 +922,7 @@ define i1 @test_v32i8(<32 x i8> %a0) {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -940,7 +940,7 @@ define i1 @test_v32i8(<32 x i8> %a0) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -958,7 +958,7 @@ define i1 @test_v32i8(<32 x i8> %a0) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -982,7 +982,7 @@ define i1 @test_v64i8(<64 x i8> %a0) {
 ; SSE-NEXT: pand %xmm3, %xmm1
 ; SSE-NEXT: pand %xmm2, %xmm1
 ; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
@@ -1002,7 +1002,7 @@ define i1 @test_v64i8(<64 x i8> %a0) {
 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -1021,7 +1021,7 @@ define i1 @test_v64i8(<64 x i8> %a0) {
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -1041,7 +1041,7 @@ define i1 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -1069,7 +1069,7 @@ define i1 @test_v128i8(<128 x i8> %a0) {
 ; SSE-NEXT: pand %xmm4, %xmm2
 ; SSE-NEXT: pand %xmm3, %xmm2
 ; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE-NEXT: pand %xmm2, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
@@ -1091,7 +1091,7 @@ define i1 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -1112,7 +1112,7 @@ define i1 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -1133,7 +1133,7 @@ define i1 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and.ll b/llvm/test/CodeGen/X86/vector-reduce-and.ll
index 2fc924b1b1257..4a00c22a26701 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and.ll
@@ -13,14 +13,14 @@
 define i64 @test_v2i64(<2 x i64> %a0) {
 ; SSE-LABEL: test_v2i64:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
 ; SSE-NEXT: movq %xmm1, %rax
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_v2i64:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vmovq %xmm0, %rax
 ; AVX-NEXT: retq
@@ -32,7 +32,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; SSE-LABEL: test_v4i64:
 ; SSE: # %bb.0:
 ; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
 ; SSE-NEXT: movq %xmm1, %rax
 ; SSE-NEXT: retq
@@ -41,7 +41,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vzeroupper
@@ -51,7 +51,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: vzeroupper
@@ -61,7 +61,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vmovq %xmm0, %rax
 ; AVX512-NEXT: vzeroupper
@@ -76,7 +76,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE-NEXT: pand %xmm3, %xmm1
 ; SSE-NEXT: pand %xmm2, %xmm1
 ; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
 ; SSE-NEXT: movq %xmm0, %rax
 ; SSE-NEXT: retq
@@ -86,7 +86,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vzeroupper
@@ -97,7 +97,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: vzeroupper
@@ -109,7 +109,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vmovq %xmm0, %rax
 ; AVX512-NEXT: vzeroupper
@@ -128,7 +128,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE-NEXT: pand %xmm4, %xmm2
 ; SSE-NEXT: pand %xmm3, %xmm2
 ; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE-NEXT: pand %xmm2, %xmm0
 ; SSE-NEXT: movq %xmm0, %rax
 ; SSE-NEXT: retq
@@ -140,7 +140,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vzeroupper
@@ -153,7 +153,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: vzeroupper
@@ -166,7 +166,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vmovq %xmm0, %rax
 ; AVX512-NEXT: vzeroupper
@@ -200,7 +200,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
 define i32 @test_v4i32(<4 x i32> %a0) {
 ; SSE-LABEL: test_v4i32:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
@@ -209,7 +209,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
 ;
 ; AVX-LABEL: test_v4i32:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -223,7 +223,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; SSE-LABEL: test_v8i32:
 ; SSE: # %bb.0:
 ; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
@@ -234,7 +234,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -246,7 +246,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -258,7 +258,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -275,7 +275,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; SSE-NEXT: pand %xmm3, %xmm1
 ; SSE-NEXT: pand %xmm2, %xmm1
 ; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
@@ -287,7 +287,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -300,7 +300,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -314,7 +314,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -335,7 +335,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; SSE-NEXT: pand %xmm4, %xmm2
 ; SSE-NEXT: pand %xmm3, %xmm2
 ; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE-NEXT: pand %xmm2, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
@@ -349,7 +349,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -364,7 +364,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -379,7 +379,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -443,7 +443,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
 define i16 @test_v8i16(<8 x i16> %a0) {
 ; SSE-LABEL: test_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
@@ -456,7 +456,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ;
 ; AVX-LABEL: test_v8i16:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -473,7 +473,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; SSE-LABEL: test_v16i16:
 ; SSE: # %bb.0:
 ; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
@@ -488,7 +488,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -503,7 +503,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -518,7 +518,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -538,7 +538,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; SSE-NEXT: pand %xmm3, %xmm1
 ; SSE-NEXT: pand %xmm2, %xmm1
 ; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
@@ -554,7 +554,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -570,7 +570,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -587,7 +587,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -611,7 +611,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; SSE-NEXT: pand %xmm4, %xmm2
 ; SSE-NEXT: pand %xmm3, %xmm2
 ; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE-NEXT: pand %xmm2, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
@@ -629,7 +629,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -647,7 +647,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -665,7 +665,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -763,7 +763,7 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 define i8 @test_v16i8(<16 x i8> %a0) {
 ; SSE-LABEL: test_v16i8:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
@@ -779,7 +779,7 @@ define i8 @test_v16i8(<16 x i8> %a0) {
 ;
 ; AVX-LABEL: test_v16i8:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -798,7 +798,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; SSE-LABEL: test_v32i8:
 ; SSE: # %bb.0:
 ; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
@@ -816,7 +816,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -833,7 +833,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -850,7 +850,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -872,7 +872,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; SSE-NEXT: pand %xmm3, %xmm1
 ; SSE-NEXT: pand %xmm2, %xmm1
 ; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE-NEXT: pand %xmm1, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
@@ -891,7 +891,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -909,7 +909,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -928,7 +928,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -954,7 +954,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; SSE-NEXT: pand %xmm4, %xmm2
 ; SSE-NEXT: pand %xmm3, %xmm2
 ; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE-NEXT: pand %xmm2, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pand %xmm0, %xmm1
@@ -975,7 +975,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -995,7 +995,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -1015,7 +1015,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
index bafe112c5dfc8..f7e1a72f9a91f 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
@@ -15,7 +15,7 @@
 define i64 @test_v2i64(<2 x i64> %a0) {
 ; SSE-LABEL: test_v2i64:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: movdqa %xmm0, %xmm2
 ; SSE-NEXT: psrlq $32, %xmm2
 ; SSE-NEXT: pmuludq %xmm1, %xmm2
@@ -31,7 +31,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ;
 ; AVX-LABEL: test_v2i64:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
 ; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -45,7 +45,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ;
 ; AVX512BW-LABEL: test_v2i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
 ; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -59,7 +59,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ;
 ; AVX512BWVL-LABEL: test_v2i64:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
 ; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -74,7 +74,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ; AVX512DQ-LABEL: test_v2i64:
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vmovq %xmm0, %rax
 ; AVX512DQ-NEXT: vzeroupper
@@ -82,7 +82,7 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ;
 ; AVX512DQVL-LABEL: test_v2i64:
 ; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax
 ; AVX512DQVL-NEXT: retq
@@ -103,7 +103,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; SSE-NEXT: psllq $32, %xmm3
 ; SSE-NEXT: pmuludq %xmm1, %xmm0
 ; SSE-NEXT: paddq %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: movdqa %xmm0, %xmm2
 ; SSE-NEXT: psrlq $32, %xmm2
 ; SSE-NEXT: pmuludq %xmm1, %xmm2
@@ -128,7 +128,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -152,7 +152,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
 ; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -176,7 +176,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
 ; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -200,7 +200,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
 ; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -218,7 +218,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vmovq %xmm0, %rax
 ; AVX512DQ-NEXT: vzeroupper
@@ -228,7 +228,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512DQVL: # %bb.0:
 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax
 ; AVX512DQVL-NEXT: vzeroupper
@@ -270,7 +270,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE-NEXT: psllq $32, %xmm3
 ; SSE-NEXT: pmuludq %xmm1, %xmm0
 ; SSE-NEXT: paddq %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: movdqa %xmm0, %xmm2
 ; SSE-NEXT: psrlq $32, %xmm2
 ; SSE-NEXT: pmuludq %xmm1, %xmm2
@@ -312,7 +312,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -344,7 +344,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
 ; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -377,7 +377,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
 ; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -410,7 +410,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
 ; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -429,7 +429,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vmovq %xmm0, %rax
 ; AVX512DQ-NEXT: vzeroupper
@@ -441,7 +441,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax
 ; AVX512DQVL-NEXT: vzeroupper
@@ -523,7 +523,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE-NEXT: psllq $32, %xmm3
 ; SSE-NEXT: pmuludq %xmm1, %xmm0
 ; SSE-NEXT: paddq %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: movdqa %xmm0, %xmm2
 ; SSE-NEXT: psrlq $32, %xmm2
 ; SSE-NEXT: pmuludq %xmm1, %xmm2
@@ -599,7 +599,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
 ; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -647,7 +647,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
 ; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -688,7 +688,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
 ; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -729,7 +729,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
 ; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -749,7 +749,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vmovq %xmm0, %rax
 ; AVX512DQ-NEXT: vzeroupper
@@ -762,7 +762,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax
 ; AVX512DQVL-NEXT: vzeroupper
@@ -810,7 +810,7 @@ define i32 @test_v2i32(<2 x i32> %a0) {
 define i32 @test_v4i32(<4 x i32> %a0) {
 ; SSE2-LABEL: test_v4i32:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; SSE2-NEXT: pmuludq %xmm2, %xmm3
@@ -821,7 +821,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
 ;
 ; SSE41-LABEL: test_v4i32:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE41-NEXT: pmulld %xmm0, %xmm1
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE41-NEXT: pmulld %xmm1, %xmm0
@@ -830,7 +830,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
 ;
 ; AVX-LABEL: test_v4i32:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -839,7 +839,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
 ;
 ; AVX512-LABEL: test_v4i32:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -856,7 +856,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; SSE2-NEXT: pmuludq %xmm2, %xmm3
 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,0,2,2]
 ; SSE2-NEXT: pmuludq %xmm3, %xmm0
@@ -867,7 +867,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; SSE41-LABEL: test_v8i32:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: pmulld %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE41-NEXT: pmulld %xmm0, %xmm1
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE41-NEXT: pmulld %xmm1, %xmm0
@@ -878,7 +878,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -890,7 +890,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -902,7 +902,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -926,7 +926,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT: pmuludq %xmm5, %xmm2
 ; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,0,2,2]
 ; SSE2-NEXT: pmuludq %xmm2, %xmm0
@@ -939,7 +939,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; SSE41-NEXT: pmulld %xmm3, %xmm1
 ; SSE41-NEXT: pmulld %xmm2, %xmm1
 ; SSE41-NEXT: pmulld %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE41-NEXT: pmulld %xmm1, %xmm0
 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE41-NEXT: pmulld %xmm0, %xmm1
@@ -953,7 +953,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -966,7 +966,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -980,7 +980,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -1016,7 +1016,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; SSE2-NEXT: pmuludq %xmm5, %xmm1
 ; SSE2-NEXT: pmuludq %xmm3, %xmm1
 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,0,2,2]
 ; SSE2-NEXT: pmuludq %xmm11, %xmm1
@@ -1033,7 +1033,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; SSE41-NEXT: pmulld %xmm4, %xmm2
 ; SSE41-NEXT: pmulld %xmm3, %xmm2
 ; SSE41-NEXT: pmulld %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE41-NEXT: pmulld %xmm2, %xmm0
 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE41-NEXT: pmulld %xmm0, %xmm1
@@ -1053,7 +1053,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -1068,7 +1068,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -1083,7 +1083,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -1165,7 +1165,7 @@ define i16 @test_v4i16(<4 x i16> %a0) {
 define i16 @test_v8i16(<8 x i16> %a0) {
 ; SSE-LABEL: test_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pmullw %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pmullw %xmm1, %xmm0
@@ -1178,7 +1178,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ;
 ; AVX-LABEL: test_v8i16:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1190,7 +1190,7 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ;
 ; AVX512-LABEL: test_v8i16:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1207,7 +1207,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; SSE-LABEL: test_v16i16:
 ; SSE: # %bb.0:
 ; SSE-NEXT: pmullw %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: pmullw %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
 ; SSE-NEXT: pmullw %xmm1, %xmm0
@@ -1222,7 +1222,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1237,7 +1237,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1252,7 +1252,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1272,7 +1272,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; SSE-NEXT: pmullw %xmm3, %xmm1
 ; SSE-NEXT: pmullw %xmm2, %xmm1
 ; SSE-NEXT: pmullw %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE-NEXT: pmullw %xmm1, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pmullw %xmm0, %xmm1
@@ -1290,7 +1290,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1306,7 +1306,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1323,7 +1323,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1340,7 +1340,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1357,7 +1357,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1374,7 +1374,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1398,7 +1398,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; SSE-NEXT: pmullw %xmm4, %xmm2
 ; SSE-NEXT: pmullw %xmm3, %xmm2
 ; SSE-NEXT: pmullw %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE-NEXT: pmullw %xmm2, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE-NEXT: pmullw %xmm0, %xmm1
@@ -1422,7 +1422,7 @@
define i16 @test_v64i16(<64 x i16> %a0) { ; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1440,7 +1440,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1458,7 +1458,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1476,7 +1476,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1496,7 +1496,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1516,7 +1516,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1634,7 +1634,7 @@ define i8 @test_v8i8(<8 x i8> %a0) { ; SSE41-LABEL: test_v8i8: ; SSE41: # %bb.0: ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: pmullw %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] @@ -1650,7 +1650,7 @@ define i8 @test_v8i8(<8 x i8> %a0) { ; AVX-LABEL: test_v8i8: ; AVX: # %bb.0: ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; 
AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1663,7 +1663,7 @@ define i8 @test_v8i8(<8 x i8> %a0) { ; AVX512-LABEL: test_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1711,7 +1711,7 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw %xmm1, %xmm0 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: packuswb %xmm3, %xmm0 @@ -1735,7 +1735,7 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1764,7 +1764,7 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; ; AVX512BW-LABEL: test_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 @@ -1786,7 +1786,7 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; ; AVX512BWVL-LABEL: test_v16i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 @@ -1808,7 +1808,7 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; ; AVX512DQ-LABEL: test_v16i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 @@ -1833,7 +1833,7 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; ; AVX512DQVL-LABEL: test_v16i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 @@ -1908,7 +1908,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; SSE41-NEXT: packuswb %xmm4, %xmm3 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; SSE41-NEXT: pmullw %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: packuswb %xmm3, %xmm2 @@ -1937,7 +1937,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1982,7 +1982,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512BW-NEXT: 
vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 @@ -2010,7 +2010,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1 @@ -2158,7 +2158,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; SSE41-NEXT: packuswb %xmm3, %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: pand %xmm2, %xmm1 ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: packuswb %xmm3, %xmm1 @@ -2196,7 +2196,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -2288,10 +2288,8 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vpmovwb %ymm2, %xmm2 +; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero @@ -2495,7 +2493,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE41-NEXT: packuswb %xmm0, %xmm2 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; SSE41-NEXT: pmullw %xmm1, 
%xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: packuswb %xmm2, %xmm0 @@ -2551,7 +2549,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -2657,10 +2655,8 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BWVL-NEXT: vpmullw %xmm0, %xmm1, %xmm0 ; AVX512BWVL-NEXT: vpmullw %xmm0, %xmm2, %xmm0 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll index 8223e6bd23804..e1253975d5a6a 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -823,7 +823,7 @@ define i1 @test_v128i8(<128 x i8> %a0) { define i1 @trunc_v2i64(<2 x i64> %a0) { ; SSE2-LABEL: trunc_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: testw %ax, %ax diff --git a/llvm/test/CodeGen/X86/vector-reduce-or.ll b/llvm/test/CodeGen/X86/vector-reduce-or.ll index 14eb3d27d8dff..95bff8e03afae 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or.ll @@ -13,14 +13,14 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE-LABEL: test_v2i64: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: retq @@ -32,7 +32,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE-LABEL: test_v4i64: ; SSE: # %bb.0: ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq @@ -41,7 +41,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -51,7 +51,7 @@ define i64 
@test_v4i64(<4 x i64> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -61,7 +61,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper @@ -76,7 +76,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq @@ -86,7 +86,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -97,7 +97,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -109,7 +109,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper @@ -128,7 +128,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq @@ -140,7 +140,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -153,7 +153,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -166,7 +166,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; 
AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper @@ -200,7 +200,7 @@ define i32 @test_v2i32(<2 x i32> %a0) { define i32 @test_v4i32(<4 x i32> %a0) { ; SSE-LABEL: test_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: por %xmm1, %xmm0 @@ -209,7 +209,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX-LABEL: test_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -223,7 +223,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE-LABEL: test_v8i32: ; SSE: # %bb.0: ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: por %xmm1, %xmm0 @@ -234,7 +234,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -246,7 +246,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -258,7 +258,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -275,7 +275,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: por %xmm0, %xmm1 @@ -287,7 +287,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 @@ -300,7 +300,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -314,7 +314,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; 
AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -335,7 +335,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: por %xmm0, %xmm1 @@ -349,7 +349,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 @@ -364,7 +364,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -379,7 +379,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -443,7 +443,7 @@ define i16 @test_v4i16(<4 x i16> %a0) { define i16 @test_v8i16(<8 x i16> %a0) { ; SSE-LABEL: test_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: por %xmm1, %xmm0 @@ -456,7 +456,7 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; ; AVX-LABEL: test_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -473,7 +473,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; SSE-LABEL: test_v16i16: ; SSE: # %bb.0: ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: por %xmm1, %xmm0 @@ -488,7 +488,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -503,7 +503,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; 
AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -518,7 +518,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -538,7 +538,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: por %xmm0, %xmm1 @@ -554,7 +554,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 @@ -570,7 +570,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -587,7 +587,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -611,7 +611,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: por %xmm0, %xmm1 @@ -629,7 +629,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 @@ -647,7 +647,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -665,7 +665,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; 
AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -763,7 +763,7 @@ define i8 @test_v8i8(<8 x i8> %a0) { define i8 @test_v16i8(<16 x i8> %a0) { ; SSE-LABEL: test_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: por %xmm1, %xmm0 @@ -779,7 +779,7 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; ; AVX-LABEL: test_v16i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -798,7 +798,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; SSE-LABEL: test_v32i8: ; SSE: # %bb.0: ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: por %xmm1, %xmm0 @@ -816,7 +816,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -833,7 +833,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -850,7 +850,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -872,7 +872,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: por %xmm0, %xmm1 @@ -891,7 +891,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 @@ -909,7 +909,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -928,7 +928,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -954,7 +954,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: por %xmm0, %xmm1 @@ -975,7 +975,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 @@ -995,7 +995,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -1015,7 +1015,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll index c0705ab96e039..26bbfed521968 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smax.ll @@ -14,7 +14,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE2-LABEL: test_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 @@ -36,7 +36,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE41-LABEL: test_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: pxor %xmm3, %xmm4 @@ -54,7 +54,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE42-LABEL: test_v2i64: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa %xmm0, %xmm1 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE42-NEXT: movq %xmm2, %rax @@ -62,7 +62,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; ; AVX-LABEL: 
test_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovq %xmm0, %rax @@ -71,7 +71,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; AVX512BW-LABEL: test_v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -79,7 +79,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; ; AVX512VL-LABEL: test_v2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: retq @@ -106,7 +106,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm2 @@ -139,7 +139,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: pxor %xmm3, %xmm4 ; SSE41-NEXT: pxor %xmm2, %xmm3 @@ -158,7 +158,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE42-NEXT: movdqa %xmm0, %xmm2 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 @@ -170,7 +170,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -182,7 +182,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -194,7 +194,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -204,7 +204,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper @@ -262,7 +262,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE2-NEXT: pand %xmm1, %xmm5 ; SSE2-NEXT: pandn %xmm2, %xmm1 ; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm4 @@ -317,7 +317,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE41-NEXT: movdqa %xmm3, %xmm2 ; SSE41-NEXT: pxor %xmm5, %xmm2 ; SSE41-NEXT: pxor %xmm1, %xmm5 @@ -343,7 +343,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE42-NEXT: movapd %xmm2, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm3, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE42-NEXT: movdqa %xmm3, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 @@ -360,7 +360,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -374,7 +374,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -387,7 +387,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -399,7 +399,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper @@ -517,7 +517,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm8 @@ -616,7 +616,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = 
xmm7[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE41-NEXT: movdqa %xmm7, %xmm2 ; SSE41-NEXT: pxor %xmm9, %xmm2 ; SSE41-NEXT: pxor %xmm1, %xmm9 @@ -654,7 +654,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE42-NEXT: movapd %xmm6, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm7, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE42-NEXT: movdqa %xmm7, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm1 @@ -681,7 +681,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -699,7 +699,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -713,7 +713,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -726,7 +726,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper @@ -778,7 +778,7 @@ define i32 @test_v2i32(<2 x i32> %a0) { define i32 @test_v4i32(<4 x i32> %a0) { ; SSE2-LABEL: test_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 @@ -795,7 +795,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; SSE4-LABEL: test_v4i32: ; SSE4: # %bb.0: -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-NEXT: pmaxsd %xmm0, %xmm1 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE4-NEXT: pmaxsd %xmm1, %xmm0 @@ -804,7 +804,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX-LABEL: test_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -813,7 +813,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX512-LABEL: test_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[1,1,2,3] ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -831,7 +831,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm2 @@ -849,7 +849,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE4-LABEL: test_v8i32: ; SSE4: # %bb.0: ; SSE4-NEXT: pmaxsd %xmm1, %xmm0 -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-NEXT: pmaxsd %xmm0, %xmm1 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE4-NEXT: pmaxsd %xmm1, %xmm0 @@ -860,7 +860,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -872,7 +872,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -884,7 +884,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -913,7 +913,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pandn %xmm4, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 @@ -933,7 +933,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE4-NEXT: pmaxsd %xmm3, %xmm1 ; SSE4-NEXT: pmaxsd %xmm2, %xmm1 ; SSE4-NEXT: pmaxsd %xmm0, %xmm1 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE4-NEXT: pmaxsd %xmm1, %xmm0 ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-NEXT: pmaxsd %xmm0, %xmm1 @@ -947,7 +947,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -960,7 +960,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 
@@ -974,7 +974,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -1023,7 +1023,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm2 @@ -1047,7 +1047,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE4-NEXT: pmaxsd %xmm4, %xmm2 ; SSE4-NEXT: pmaxsd %xmm3, %xmm2 ; SSE4-NEXT: pmaxsd %xmm0, %xmm2 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE4-NEXT: pmaxsd %xmm2, %xmm0 ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-NEXT: pmaxsd %xmm0, %xmm1 @@ -1067,7 +1067,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX1-NEXT: vpmaxsd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -1082,7 +1082,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -1097,7 +1097,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -1179,7 +1179,7 @@ define i16 @test_v4i16(<4 x i16> %a0) { define i16 @test_v8i16(<8 x i16> %a0) { ; SSE2-LABEL: test_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -1224,7 +1224,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; SSE2-LABEL: test_v16i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -1290,7 +1290,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; SSE2-NEXT: pmaxsw %xmm3, %xmm1 ; SSE2-NEXT: pmaxsw %xmm2, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: 
pmaxsw %xmm0, %xmm1 @@ -1368,7 +1368,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; SSE2-NEXT: pmaxsw %xmm4, %xmm2 ; SSE2-NEXT: pmaxsw %xmm3, %xmm2 ; SSE2-NEXT: pmaxsw %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: pmaxsw %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 @@ -1622,7 +1622,7 @@ define i8 @test_v8i8(<8 x i8> %a0) { define i8 @test_v16i8(<16 x i8> %a0) { ; SSE2-LABEL: test_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 @@ -1697,7 +1697,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm2 @@ -1803,7 +1803,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pandn %xmm4, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 @@ -1937,7 +1937,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin.ll b/llvm/test/CodeGen/X86/vector-reduce-smin.ll index e8259e147133c..6f561c27d71dc 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll @@ -14,7 +14,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE2-LABEL: test_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 @@ -36,7 +36,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE41-LABEL: test_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm3 @@ -53,7 +53,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE42-LABEL: test_v2i64: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa %xmm0, %xmm1 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE42-NEXT: movdqa %xmm2, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 @@ -62,7 +62,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; ; AVX-LABEL: test_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovq %xmm0, %rax @@ -71,7 +71,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; AVX512BW-LABEL: test_v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: 
def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -79,7 +79,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; ; AVX512VL-LABEL: test_v2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: retq @@ -106,7 +106,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm2 @@ -138,7 +138,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm3 @@ -158,7 +158,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE42-NEXT: movdqa %xmm2, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 @@ -170,7 +170,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -182,7 +182,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -194,7 +194,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -204,7 +204,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper @@ -262,7 +262,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn %xmm5, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm4 @@ -317,7 +317,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm5 @@ -343,7 +343,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE42-NEXT: movapd %xmm3, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm3, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 @@ -360,7 +360,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm3, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -374,7 +374,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -387,7 +387,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -399,7 +399,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper @@ -517,7 +517,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm8 @@ -616,7 +616,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm9 @@ -654,7 +654,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE42-NEXT: movapd %xmm7, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = 
xmm7[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm7, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm1 @@ -681,7 +681,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -699,7 +699,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -713,7 +713,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -726,7 +726,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper @@ -778,7 +778,7 @@ define i32 @test_v2i32(<2 x i32> %a0) { define i32 @test_v4i32(<4 x i32> %a0) { ; SSE2-LABEL: test_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 @@ -795,7 +795,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; SSE4-LABEL: test_v4i32: ; SSE4: # %bb.0: -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-NEXT: pminsd %xmm0, %xmm1 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE4-NEXT: pminsd %xmm1, %xmm0 @@ -804,7 +804,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX-LABEL: test_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -813,7 +813,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX512-LABEL: test_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -831,7 +831,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm2, 
%xmm1 ; SSE2-NEXT: pand %xmm1, %xmm2 @@ -849,7 +849,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE4-LABEL: test_v8i32: ; SSE4: # %bb.0: ; SSE4-NEXT: pminsd %xmm1, %xmm0 -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-NEXT: pminsd %xmm0, %xmm1 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE4-NEXT: pminsd %xmm1, %xmm0 @@ -860,7 +860,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -872,7 +872,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -884,7 +884,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -913,7 +913,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE2-NEXT: pand %xmm1, %xmm4 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 @@ -933,7 +933,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE4-NEXT: pminsd %xmm3, %xmm1 ; SSE4-NEXT: pminsd %xmm2, %xmm1 ; SSE4-NEXT: pminsd %xmm0, %xmm1 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE4-NEXT: pminsd %xmm1, %xmm0 ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-NEXT: pminsd %xmm0, %xmm1 @@ -947,7 +947,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -960,7 +960,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -974,7 +974,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -1023,7 +1023,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 @@ -1047,7 +1047,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE4-NEXT: pminsd %xmm4, %xmm2 ; SSE4-NEXT: pminsd %xmm3, %xmm2 ; SSE4-NEXT: pminsd %xmm0, %xmm2 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE4-NEXT: pminsd %xmm2, %xmm0 ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-NEXT: pminsd %xmm0, %xmm1 @@ -1067,7 +1067,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX1-NEXT: vpminsd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpminsd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -1082,7 +1082,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -1097,7 +1097,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -1179,7 +1179,7 @@ define i16 @test_v4i16(<4 x i16> %a0) { define i16 @test_v8i16(<8 x i16> %a0) { ; SSE2-LABEL: test_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -1224,7 +1224,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; SSE2-LABEL: test_v16i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -1290,7 +1290,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; SSE2-NEXT: pminsw %xmm3, %xmm1 ; SSE2-NEXT: pminsw %xmm2, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pminsw %xmm0, %xmm1 @@ -1368,7 +1368,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; SSE2-NEXT: pminsw %xmm4, %xmm2 ; SSE2-NEXT: pminsw %xmm3, %xmm2 ; SSE2-NEXT: pminsw %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: pminsw %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 
; SSE2-NEXT: pminsw %xmm0, %xmm1 @@ -1622,7 +1622,7 @@ define i8 @test_v8i8(<8 x i8> %a0) { define i8 @test_v16i8(<16 x i8> %a0) { ; SSE2-LABEL: test_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 @@ -1697,7 +1697,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm2 @@ -1803,7 +1803,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; SSE2-NEXT: pand %xmm1, %xmm4 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 @@ -1937,7 +1937,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll index 75e8133581338..a11fff0a5b5c8 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -14,7 +14,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE2-LABEL: test_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 @@ -36,7 +36,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE41-LABEL: test_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: pxor %xmm3, %xmm4 @@ -54,7 +54,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE42-LABEL: test_v2i64: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa %xmm0, %xmm1 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] ; SSE42-NEXT: pxor %xmm3, %xmm0 ; SSE42-NEXT: pxor %xmm2, %xmm3 @@ -65,7 +65,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; ; AVX-LABEL: test_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2 @@ -77,7 +77,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; AVX512BW-LABEL: test_v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -85,7 
+85,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; ; AVX512VL-LABEL: test_v2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: retq @@ -112,7 +112,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm2 @@ -145,7 +145,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: pxor %xmm3, %xmm4 ; SSE41-NEXT: pxor %xmm2, %xmm3 @@ -168,7 +168,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE42-NEXT: pxor %xmm3, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm4, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: pxor %xmm3, %xmm0 ; SSE42-NEXT: pxor %xmm2, %xmm3 @@ -185,7 +185,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -202,7 +202,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -216,7 +216,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -226,7 +226,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper @@ -284,7 +284,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE2-NEXT: pand %xmm1, %xmm5 ; SSE2-NEXT: pandn %xmm2, %xmm1 ; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm4 @@ -339,7 +339,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; 
SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE41-NEXT: movdqa %xmm3, %xmm2 ; SSE41-NEXT: pxor %xmm5, %xmm2 ; SSE41-NEXT: pxor %xmm1, %xmm5 @@ -374,7 +374,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE42-NEXT: xorpd %xmm5, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE42-NEXT: movdqa %xmm3, %xmm0 ; SSE42-NEXT: pxor %xmm5, %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm5 @@ -400,7 +400,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -421,7 +421,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -436,7 +436,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -448,7 +448,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper @@ -566,7 +566,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm8 @@ -665,7 +665,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE41-NEXT: movdqa %xmm7, %xmm2 ; SSE41-NEXT: pxor %xmm9, %xmm2 ; SSE41-NEXT: pxor %xmm1, %xmm9 @@ -725,7 +725,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE42-NEXT: xorpd %xmm9, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE42-NEXT: movdqa %xmm7, %xmm0 ; SSE42-NEXT: pxor %xmm9, %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm9 @@ -769,7 +769,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: 
vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -798,7 +798,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm3 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -814,7 +814,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -827,7 +827,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper @@ -882,7 +882,7 @@ define i32 @test_v2i32(<2 x i32> %a0) { define i32 @test_v4i32(<4 x i32> %a0) { ; SSE2-LABEL: test_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 @@ -905,7 +905,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; SSE4-LABEL: test_v4i32: ; SSE4: # %bb.0: -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-NEXT: pmaxud %xmm0, %xmm1 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE4-NEXT: pmaxud %xmm1, %xmm0 @@ -914,7 +914,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX-LABEL: test_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -923,7 +923,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX512-LABEL: test_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -945,7 +945,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -968,7 +968,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE4-LABEL: test_v8i32: ; SSE4: # %bb.0: ; SSE4-NEXT: pmaxud %xmm1, %xmm0 -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-NEXT: pmaxud %xmm0, 
%xmm1 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE4-NEXT: pmaxud %xmm1, %xmm0 @@ -979,7 +979,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -991,7 +991,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1003,7 +1003,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1042,7 +1042,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE2-NEXT: pand %xmm1, %xmm5 ; SSE2-NEXT: pandn %xmm2, %xmm1 ; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -1067,7 +1067,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE4-NEXT: pmaxud %xmm3, %xmm1 ; SSE4-NEXT: pmaxud %xmm2, %xmm1 ; SSE4-NEXT: pmaxud %xmm0, %xmm1 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE4-NEXT: pmaxud %xmm1, %xmm0 ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-NEXT: pmaxud %xmm0, %xmm1 @@ -1081,7 +1081,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1094,7 +1094,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1108,7 +1108,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1179,7 +1179,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: 
pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -1208,7 +1208,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE4-NEXT: pmaxud %xmm4, %xmm2 ; SSE4-NEXT: pmaxud %xmm3, %xmm2 ; SSE4-NEXT: pmaxud %xmm0, %xmm2 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE4-NEXT: pmaxud %xmm2, %xmm0 ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-NEXT: pmaxud %xmm0, %xmm1 @@ -1228,7 +1228,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX1-NEXT: vpmaxud %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpmaxud %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1243,7 +1243,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1258,7 +1258,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1368,7 +1368,7 @@ define i16 @test_v4i16(<4 x i16> %a0) { define i16 @test_v8i16(<8 x i16> %a0) { ; SSE2-LABEL: test_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm1 @@ -1433,7 +1433,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -1520,7 +1520,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; SSE2-NEXT: pmaxsw %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: pmaxsw %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -1625,7 +1625,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; SSE2-NEXT: pmaxsw %xmm5, %xmm1 ; SSE2-NEXT: pmaxsw %xmm4, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 @@ -1839,7 +1839,7 @@ define i8 @test_v8i8(<8 x i8> %a0) { define i8 @test_v16i8(<16 x i8> %a0) { ; SSE2-LABEL: test_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd 
{{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmaxub %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pmaxub %xmm1, %xmm0 @@ -1909,7 +1909,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; SSE2-LABEL: test_v32i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pmaxub %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmaxub %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pmaxub %xmm1, %xmm0 @@ -2004,7 +2004,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; SSE2-NEXT: pmaxub %xmm3, %xmm1 ; SSE2-NEXT: pmaxub %xmm2, %xmm1 ; SSE2-NEXT: pmaxub %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: pmaxub %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pmaxub %xmm0, %xmm1 @@ -2113,7 +2113,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE2-NEXT: pmaxub %xmm4, %xmm2 ; SSE2-NEXT: pmaxub %xmm3, %xmm2 ; SSE2-NEXT: pmaxub %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: pmaxub %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pmaxub %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll index 8f6a7266d97b9..9da8d61223efd 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -14,7 +14,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE2-LABEL: test_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 @@ -36,7 +36,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE41-LABEL: test_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm3 @@ -53,7 +53,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE42-LABEL: test_v2i64: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa %xmm0, %xmm1 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] ; SSE42-NEXT: movdqa %xmm1, %xmm3 ; SSE42-NEXT: pxor %xmm0, %xmm3 @@ -65,7 +65,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; ; AVX-LABEL: test_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2 @@ -77,7 +77,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; AVX512BW-LABEL: test_v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -85,7 +85,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; ; AVX512VL-LABEL: test_v2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: retq @@ -112,7 +112,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm2 @@ -144,7 +144,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm3 @@ -168,7 +168,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE42-NEXT: pxor %xmm3, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm4, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: pxor %xmm3, %xmm0 ; SSE42-NEXT: pxor %xmm2, %xmm3 @@ -186,7 +186,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 @@ -203,7 +203,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -217,7 +217,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -227,7 +227,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper @@ -285,7 +285,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn %xmm5, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm4 @@ -340,7 +340,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; 
SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm5 @@ -376,7 +376,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE42-NEXT: xorpd %xmm4, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE42-NEXT: movdqa %xmm3, %xmm0 ; SSE42-NEXT: pxor %xmm4, %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm4 @@ -403,7 +403,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 ; AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -424,7 +424,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -439,7 +439,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -451,7 +451,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper @@ -569,7 +569,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm8 @@ -668,7 +668,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm9 @@ -728,7 +728,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE42-NEXT: xorpd %xmm8, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE42-NEXT: movdqa %xmm7, %xmm0 ; SSE42-NEXT: pxor %xmm8, %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm8 @@ -773,7 +773,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX1-NEXT: vxorpd %xmm4, %xmm2, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; 
AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -802,7 +802,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -818,7 +818,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper @@ -831,7 +831,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper @@ -886,7 +886,7 @@ define i32 @test_v2i32(<2 x i32> %a0) { define i32 @test_v4i32(<4 x i32> %a0) { ; SSE2-LABEL: test_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 @@ -909,7 +909,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; SSE4-LABEL: test_v4i32: ; SSE4: # %bb.0: -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-NEXT: pminud %xmm0, %xmm1 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE4-NEXT: pminud %xmm1, %xmm0 @@ -918,7 +918,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX-LABEL: test_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -927,7 +927,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX512-LABEL: test_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -949,7 +949,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -972,7 +972,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE4-LABEL: test_v8i32: ; SSE4: # %bb.0: ; SSE4-NEXT: pminud %xmm1, %xmm0 -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-NEXT: pminud %xmm0, %xmm1 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE4-NEXT: pminud %xmm1, %xmm0 @@ -983,7 +983,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: 
vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -995,7 +995,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1007,7 +1007,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1046,7 +1046,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn %xmm6, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -1071,7 +1071,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE4-NEXT: pminud %xmm3, %xmm1 ; SSE4-NEXT: pminud %xmm2, %xmm1 ; SSE4-NEXT: pminud %xmm0, %xmm1 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE4-NEXT: pminud %xmm1, %xmm0 ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-NEXT: pminud %xmm0, %xmm1 @@ -1085,7 +1085,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1098,7 +1098,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1112,7 +1112,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1183,7 +1183,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -1212,7 +1212,7 @@ define i32 
@test_v32i32(<32 x i32> %a0) { ; SSE4-NEXT: pminud %xmm4, %xmm2 ; SSE4-NEXT: pminud %xmm3, %xmm2 ; SSE4-NEXT: pminud %xmm0, %xmm2 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE4-NEXT: pminud %xmm2, %xmm0 ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE4-NEXT: pminud %xmm0, %xmm1 @@ -1232,7 +1232,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpminud %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1247,7 +1247,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1262,7 +1262,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1372,7 +1372,7 @@ define i16 @test_v4i16(<4 x i16> %a0) { define i16 @test_v8i16(<8 x i16> %a0) { ; SSE2-LABEL: test_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm1 @@ -1418,7 +1418,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -1482,7 +1482,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; SSE2-NEXT: pminsw %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: pminsw %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -1562,7 +1562,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; SSE2-NEXT: pminsw %xmm5, %xmm1 ; SSE2-NEXT: pminsw %xmm4, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pminsw %xmm0, %xmm1 @@ -1750,7 +1750,7 @@ define i8 @test_v8i8(<8 x i8> %a0) { define i8 @test_v16i8(<16 x i8> %a0) { ; SSE2-LABEL: test_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pminub %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pminub %xmm1, %xmm0 @@ -1799,7 +1799,7 @@ 
define i8 @test_v32i8(<32 x i8> %a0) { ; SSE2-LABEL: test_v32i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pminub %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pminub %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pminub %xmm1, %xmm0 @@ -1869,7 +1869,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; SSE2-NEXT: pminub %xmm3, %xmm1 ; SSE2-NEXT: pminub %xmm2, %xmm1 ; SSE2-NEXT: pminub %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: pminub %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pminub %xmm0, %xmm1 @@ -1951,7 +1951,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE2-NEXT: pminub %xmm4, %xmm2 ; SSE2-NEXT: pminub %xmm3, %xmm2 ; SSE2-NEXT: pminub %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: pminub %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pminub %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor.ll b/llvm/test/CodeGen/X86/vector-reduce-xor.ll index 35e6db38a584f..2d69190d9d18a 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-xor.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor.ll @@ -13,14 +13,14 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE-LABEL: test_v2i64: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: retq @@ -32,7 +32,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE-LABEL: test_v4i64: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq @@ -41,7 +41,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -51,7 +51,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -61,7 +61,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper @@ -76,7 +76,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE-NEXT: pxor %xmm3, %xmm1 ; SSE-NEXT: pxor %xmm2, %xmm1 ; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq @@ 
-86,7 +86,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -97,7 +97,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -109,7 +109,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper @@ -128,7 +128,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: pxor %xmm3, %xmm2 ; SSE-NEXT: pxor %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq @@ -140,7 +140,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -153,7 +153,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper @@ -166,7 +166,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper @@ -200,7 +200,7 @@ define i32 @test_v2i32(<2 x i32> %a0) { define i32 @test_v4i32(<4 x i32> %a0) { ; SSE-LABEL: test_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: pxor %xmm1, %xmm0 @@ -209,7 +209,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX-LABEL: test_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -223,7 +223,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; SSE-LABEL: test_v8i32: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: pxor %xmm1, %xmm0 @@ -234,7 +234,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -246,7 +246,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -258,7 +258,7 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -275,7 +275,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; SSE-NEXT: pxor %xmm3, %xmm1 ; SSE-NEXT: pxor %xmm2, %xmm1 ; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 @@ -287,7 +287,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 @@ -300,7 +300,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -314,7 +314,7 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -335,7 +335,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: pxor %xmm3, %xmm2 ; SSE-NEXT: pxor %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 @@ -349,7 +349,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; 
AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 @@ -364,7 +364,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -379,7 +379,7 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -443,7 +443,7 @@ define i16 @test_v4i16(<4 x i16> %a0) { define i16 @test_v8i16(<8 x i16> %a0) { ; SSE-LABEL: test_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: pxor %xmm1, %xmm0 @@ -456,7 +456,7 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; ; AVX-LABEL: test_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -473,7 +473,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; SSE-LABEL: test_v16i16: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: pxor %xmm1, %xmm0 @@ -488,7 +488,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -503,7 +503,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -518,7 +518,7 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -538,7 +538,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; SSE-NEXT: pxor %xmm3, %xmm1 ; SSE-NEXT: pxor %xmm2, %xmm1 ; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: 
pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 @@ -554,7 +554,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 @@ -570,7 +570,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -587,7 +587,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -611,7 +611,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: pxor %xmm3, %xmm2 ; SSE-NEXT: pxor %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 @@ -629,7 +629,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 @@ -647,7 +647,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -665,7 +665,7 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -763,7 +763,7 @@ define i8 @test_v8i8(<8 x i8> %a0) { define i8 @test_v16i8(<16 x i8> %a0) { ; SSE-LABEL: test_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: pxor %xmm1, %xmm0 @@ -779,7 +779,7 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; ; AVX-LABEL: test_v16i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpxor %xmm1, 
%xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -798,7 +798,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; SSE-LABEL: test_v32i8: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE-NEXT: pxor %xmm1, %xmm0 @@ -816,7 +816,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -833,7 +833,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -850,7 +850,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -872,7 +872,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; SSE-NEXT: pxor %xmm3, %xmm1 ; SSE-NEXT: pxor %xmm2, %xmm1 ; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 @@ -891,7 +891,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 @@ -909,7 +909,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -928,7 +928,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -954,7 +954,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: pxor %xmm3, %xmm2 ; SSE-NEXT: pxor %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 @@ -975,7 +975,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 @@ -995,7 +995,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -1015,7 +1015,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-rem.ll b/llvm/test/CodeGen/X86/vector-rem.ll index 15e1198696659..deaab1c9161b9 100644 --- a/llvm/test/CodeGen/X86/vector-rem.ll +++ b/llvm/test/CodeGen/X86/vector-rem.ll @@ -11,9 +11,9 @@ define <4 x i32> @foo(<4 x i32> %t, <4 x i32> %u) nounwind { ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl %ecx ; CHECK-NEXT: movd %edx, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; CHECK-NEXT: movd %xmm3, %eax -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; CHECK-NEXT: movd %xmm3, %ecx ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl %ecx @@ -49,9 +49,9 @@ define <4 x i32> @bar(<4 x i32> %t, <4 x i32> %u) nounwind { ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ecx ; CHECK-NEXT: movd %edx, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; CHECK-NEXT: movd %xmm3, %eax -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; CHECK-NEXT: movd %xmm3, %ecx ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ecx diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll index 67dd15ee87abe..d140fb5c09295 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -24,13 +24,13 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE2-NEXT: psubq %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psllq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psllq %xmm1, %xmm4 ; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE2-NEXT: psrlq %xmm2, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: orpd %xmm4, %xmm0 @@ -42,13 +42,13 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE41-NEXT: psubq %xmm1, 
%xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psllq %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: psllq %xmm1, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrlq %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE41-NEXT: psrlq %xmm2, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: por %xmm4, %xmm0 @@ -59,11 +59,11 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] ; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 @@ -117,13 +117,13 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; X32-SSE-NEXT: psubq %xmm1, %xmm2 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3 ; X32-SSE-NEXT: psllq %xmm1, %xmm3 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psllq %xmm1, %xmm4 ; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: psrlq %xmm2, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; X32-SSE-NEXT: psrlq %xmm2, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X32-SSE-NEXT: orpd %xmm4, %xmm0 @@ -719,7 +719,7 @@ define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; X32-SSE-NEXT: psllq %xmm1, %xmm2 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: psrlq %xmm3, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; X32-SSE-NEXT: psrlq %xmm3, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X32-SSE-NEXT: orpd %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll index 31fe575024753..a850ab5ba7822 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -21,20 +21,20 @@ define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = 
xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll index d162f5c4a97ab..2a4efccc07c7f 100644 --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -72,7 +72,7 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ss ; SSE41-LABEL: sext_16i8_to_16i16: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovsxbw %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxbw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -80,7 +80,7 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ss ; AVX1-LABEL: sext_16i8_to_16i16: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -107,7 +107,7 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ss ; X32-SSE41-LABEL: sext_16i8_to_16i16: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm1 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE41-NEXT: retl @@ -149,9 +149,9 @@ define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ss ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovsxbw %xmm0, %xmm5 ; SSE41-NEXT: pmovsxbw %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxbw %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovsxbw %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm1 @@ -160,12 +160,12 @@ define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ss ; AVX1-LABEL: sext_32i8_to_32i16: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX1-NEXT: vmovaps %ymm2, %ymm0 @@ -210,9 +210,9 @@ define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ss ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm5 ; X32-SSE41-NEXT: pmovsxbw %xmm1, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm4 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = 
xmm1[2,3,0,1] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm3 ; X32-SSE41-NEXT: movdqa %xmm5, %xmm0 ; X32-SSE41-NEXT: movdqa %xmm4, %xmm1 @@ -367,7 +367,7 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss ; SSE41-NEXT: pmovsxbd %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovsxbd %xmm1, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxbd %xmm2, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE41-NEXT: pmovsxbd %xmm0, %xmm3 @@ -380,9 +380,9 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX1-NEXT: vmovaps %ymm2, %ymm0 @@ -391,7 +391,7 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss ; AVX2-LABEL: sext_16i8_to_16i32: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm1 ; AVX2-NEXT: vmovdqa %ymm2, %ymm0 ; AVX2-NEXT: retq @@ -421,7 +421,7 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss ; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm4 ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X32-SSE41-NEXT: pmovsxbd %xmm1, %xmm1 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X32-SSE41-NEXT: pmovsxbd %xmm2, %xmm2 ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm3 @@ -736,7 +736,7 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp ; SSE41-LABEL: sext_8i16_to_8i32: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovsxwd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -744,7 +744,7 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp ; AVX1-LABEL: sext_8i16_to_8i32: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -771,7 +771,7 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp ; X32-SSE41-LABEL: sext_8i16_to_8i32: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm1 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE41-NEXT: retl @@ -813,9 +813,9 @@ define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovsxwd %xmm0, %xmm5 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} 
xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm1 @@ -824,12 +824,12 @@ define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ; AVX1-LABEL: sext_16i16_to_16i32: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX1-NEXT: vmovaps %ymm2, %ymm0 @@ -866,9 +866,9 @@ define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm5 ; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm4 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm3 ; X32-SSE41-NEXT: movdqa %xmm5, %xmm0 ; X32-SSE41-NEXT: movdqa %xmm4, %xmm1 @@ -1043,7 +1043,7 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp ; SSE41-NEXT: pmovsxwq %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovsxwq %xmm1, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxwq %xmm2, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE41-NEXT: pmovsxwq %xmm0, %xmm3 @@ -1056,9 +1056,9 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,2,3] ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX1-NEXT: vmovaps %ymm2, %ymm0 @@ -1067,7 +1067,7 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp ; AVX2-LABEL: sext_8i16_to_8i64: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm1 ; AVX2-NEXT: vmovdqa %ymm2, %ymm0 ; AVX2-NEXT: retq @@ -1101,7 +1101,7 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp ; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm4 ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X32-SSE41-NEXT: pmovsxwq %xmm1, %xmm1 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X32-SSE41-NEXT: pmovsxwq %xmm2, %xmm2 ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm3 @@ -1160,7 +1160,7 @@ define 
<4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] @@ -1171,7 +1171,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: pxor %xmm3, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] @@ -1180,7 +1180,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp ; SSE41-LABEL: sext_4i32_to_4i64: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovsxdq %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxdq %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -1188,7 +1188,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp ; AVX1-LABEL: sext_4i32_to_4i64: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -1208,7 +1208,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp ; X32-SSE2-NEXT: pxor %xmm2, %xmm2 ; X32-SSE2-NEXT: pxor %xmm3, %xmm3 ; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] @@ -1217,7 +1217,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp ; X32-SSE41-LABEL: sext_4i32_to_4i64: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE41-NEXT: retl @@ -1235,12 +1235,12 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp ; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] @@ -1254,12 +1254,12 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp ; 
SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm5, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSSE3-NEXT: pxor %xmm3, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] @@ -1269,9 +1269,9 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovsxdq %xmm0, %xmm5 ; SSE41-NEXT: pmovsxdq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxdq %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovsxdq %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm1 @@ -1280,12 +1280,12 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp ; AVX1-LABEL: sext_8i32_to_8i64: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX1-NEXT: vmovaps %ymm2, %ymm0 @@ -1312,12 +1312,12 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp ; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; X32-SSE2-NEXT: pxor %xmm5, %xmm5 ; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; X32-SSE2-NEXT: pxor %xmm3, %xmm3 ; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] @@ -1327,9 +1327,9 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm5 ; X32-SSE41-NEXT: pmovsxdq %xmm1, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm4 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm3 ; X32-SSE41-NEXT: movdqa %xmm5, %xmm0 ; X32-SSE41-NEXT: movdqa %xmm4, %xmm1 @@ -1849,7 +1849,7 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) { ; AVX1-NEXT: negl %eax ; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxdq %xmm0, 
%xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -3046,7 +3046,7 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) { ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] @@ -3058,7 +3058,7 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) { ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: pxor %xmm3, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] @@ -3094,7 +3094,7 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) { ; X32-SSE2-NEXT: pxor %xmm2, %xmm2 ; X32-SSE2-NEXT: pxor %xmm3, %xmm3 ; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] @@ -3166,7 +3166,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] @@ -3179,7 +3179,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: pxor %xmm3, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] @@ -3190,7 +3190,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: pmovsxdq %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxdq %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -3200,7 +3200,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -3226,7 +3226,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; X32-SSE2-NEXT: pxor %xmm2, %xmm2 ; X32-SSE2-NEXT: pxor %xmm3, %xmm3 ; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X32-SSE2-NEXT: pshufd 
{{.*#+}} xmm1 = xmm0[2,3,2,3] ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] @@ -3237,7 +3237,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; X32-SSE41-NEXT: pslld $31, %xmm0 ; X32-SSE41-NEXT: psrad $31, %xmm0 ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE41-NEXT: retl @@ -3737,7 +3737,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; SSE41-NEXT: psrad $26, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; SSE41-NEXT: psllq $58, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm4 @@ -3767,9 +3767,9 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3] ; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: retq @@ -3782,7 +3782,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; AVX2-NEXT: vpsllw $10, %xmm0, %xmm0 ; AVX2-NEXT: vpsraw $10, %xmm0, %xmm1 ; AVX2-NEXT: vpmovsxwq %xmm1, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 ; AVX2-NEXT: retq ; @@ -3861,7 +3861,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; X32-SSE41-NEXT: psrad $26, %xmm1 ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X32-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; X32-SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; X32-SSE41-NEXT: psllq $58, %xmm2 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm4 @@ -3916,7 +3916,7 @@ define <8 x i32> @zext_negate_sext(<8 x i8> %x) { ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: psubw %xmm0, %xmm1 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm1, %xmm1 ; SSE41-NEXT: retq ; @@ -3962,7 +3962,7 @@ define <8 x i32> @zext_negate_sext(<8 x i8> %x) { ; X32-SSE41-NEXT: pxor %xmm1, %xmm1 ; X32-SSE41-NEXT: psubw %xmm0, %xmm1 ; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm0 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm1 ; X32-SSE41-NEXT: retl %z = zext <8 x i8> %x to <8 x i16> @@ -4002,7 +4002,7 @@ define <8 x i32> @zext_decremenet_sext(<8 x i8> %x) { ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm0 -; SSE41-NEXT: pshufd 
{{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm1, %xmm1 ; SSE41-NEXT: retq ; @@ -4049,7 +4049,7 @@ define <8 x i32> @zext_decremenet_sext(<8 x i8> %x) { ; X32-SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; X32-SSE41-NEXT: paddw %xmm0, %xmm1 ; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm0 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm1 ; X32-SSE41-NEXT: retl %z = zext <8 x i8> %x to <8 x i16> diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll index 5764d19f4c7f7..9b1fb29cb0297 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -23,7 +23,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: psrlq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; SSE2-NEXT: psrlq %xmm4, %xmm2 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -39,7 +39,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrlq %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; SSE41-NEXT: psrlq %xmm4, %xmm2 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm3 @@ -54,7 +54,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1 @@ -99,7 +99,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] ; X32-SSE-NEXT: movdqa %xmm2, %xmm3 ; X32-SSE-NEXT: psrlq %xmm1, %xmm3 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; X32-SSE-NEXT: psrlq %xmm4, %xmm2 ; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] ; X32-SSE-NEXT: movdqa %xmm0, %xmm3 @@ -123,7 +123,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrad %xmm4, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrad %xmm3, %xmm4 @@ -139,7 +139,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psrad %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrad %xmm4, %xmm5 
@@ -205,7 +205,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 ; X32-SSE-NEXT: psrad %xmm4, %xmm2 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psrad %xmm3, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll index 358f9b8cc4dec..ae9c375eec254 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -22,7 +22,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 @@ -32,7 +32,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1 @@ -92,7 +92,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; X32-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] ; X32-AVX1-NEXT: # xmm3 = mem[0,0] ; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 -; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] ; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7] ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 @@ -102,7 +102,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; X32-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 ; X32-AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 ; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4 -; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] ; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll index 11d118bf31c3d..a994d6610d73d 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -27,7 +27,7 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrad %xmm4, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrad %xmm3, %xmm4 @@ -43,7 +43,7 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = 
xmm1[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psrad %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrad %xmm4, %xmm5 @@ -109,7 +109,7 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 ; X32-SSE-NEXT: psrad %xmm4, %xmm2 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psrad %xmm3, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll index 1a2889ab861e2..2e19f753722d3 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -22,7 +22,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrlq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: psrlq %xmm1, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE2-NEXT: retq @@ -31,7 +31,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psrlq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: psrlq %xmm1, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq @@ -39,7 +39,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX1-LABEL: var_shift_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: retq @@ -75,7 +75,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 ; X32-SSE-NEXT: psrlq %xmm1, %xmm2 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X32-SSE-NEXT: psrlq %xmm1, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; X32-SSE-NEXT: retl @@ -93,7 +93,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrld %xmm4, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrld %xmm3, %xmm4 @@ -109,7 +109,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psrld %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 @@ -175,7 +175,7 @@ define <4 x i32> 
@var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 ; X32-SSE-NEXT: psrld %xmm4, %xmm2 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psrld %xmm3, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll index 154c35b51db9f..9119e32bda375 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -22,11 +22,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -69,11 +69,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 -; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 -; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] ; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll index b540421fb3de9..8a843ef652e7f 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll @@ -27,7 +27,7 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrld %xmm4, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrld %xmm3, %xmm4 @@ -43,7 +43,7 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psrld %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 @@ -109,7 +109,7 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 ; X32-SSE-NEXT: psrld %xmm4, %xmm2 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = 
xmm2[0],xmm3[0] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psrld %xmm3, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll index eee4561504764..5f1325aacb4d1 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -22,7 +22,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psllq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: psllq %xmm1, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE2-NEXT: retq @@ -31,7 +31,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psllq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: psllq %xmm1, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq @@ -39,7 +39,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX1-LABEL: var_shift_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: retq @@ -73,7 +73,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 ; X32-SSE-NEXT: psllq %xmm1, %xmm2 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X32-SSE-NEXT: psllq %xmm1, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; X32-SSE-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll index 56ebce709a8d7..1296fcf8bb902 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -22,11 +22,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -66,11 +66,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4 -; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; X32-AVX1-NEXT: vpsllq 
%xmm1, %xmm0, %xmm3 -; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 ; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] ; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll index 04064f1c1970d..a11fc5b7c8ed3 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1013,19 +1013,19 @@ define <8 x i16> @shuffle_v8i16_08192a3b(<8 x i16> %a, <8 x i16> %b) { define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) { ; SSE-LABEL: shuffle_v8i16_0c1d2e3f: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: shuffle_v8i16_0c1d2e3f: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i16_0c1d2e3f: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512VL-SLOW-NEXT: retq ; @@ -1055,19 +1055,19 @@ define <8 x i16> @shuffle_v8i16_4c5d6e7f(<8 x i16> %a, <8 x i16> %b) { define <8 x i16> @shuffle_v8i16_48596a7b(<8 x i16> %a, <8 x i16> %b) { ; SSE-LABEL: shuffle_v8i16_48596a7b: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: shuffle_v8i16_48596a7b: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i16_48596a7b: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512VL-SLOW-NEXT: retq ; @@ -1513,7 +1513,7 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: shuffle_v8i16_cde3XXXX: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 @@ -1529,25 +1529,25 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) { ; ; SSE41-LABEL: shuffle_v8i16_cde3XXXX: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1OR2-LABEL: shuffle_v8i16_cde3XXXX: ; AVX1OR2: # %bb.0: -; 
AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i16_cde3XXXX: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i16_cde3XXXX: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,11,0,1,2,3] +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,11,4,5,6,7] ; AVX512VL-FAST-NEXT: vpermi2w %xmm0, %xmm1, %xmm2 ; AVX512VL-FAST-NEXT: vmovdqa %xmm2, %xmm0 ; AVX512VL-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index 635d94b9e7239..131c621162a50 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -3167,7 +3167,7 @@ define <16 x i16> @shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_z ; AVX1-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero @@ -3191,7 +3191,7 @@ define <16 x i16> @shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_z ; XOPAVX1-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero @@ -5064,7 +5064,7 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_3 ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,u,u,u,u,u,u,u,u] @@ -5090,7 +5090,7 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_3 ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] ; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13],xmm3[14,15],xmm1[u,u,u,u,u,u,u,u] ; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -5161,7 +5161,7 @@ define <16 x i16> 
@shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_2 ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,14,15] @@ -5187,7 +5187,7 @@ define <16 x i16> @shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_2 ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,2,3,4,5],xmm2[6,7] ; XOPAVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -5752,9 +5752,9 @@ define <16 x i16> @shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_u ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -5776,9 +5776,9 @@ define <16 x i16> @shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_u ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index 04759bf104cb3..0508c6bac2bb1 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -4053,7 +4053,7 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz ; AVX1-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero @@ -4083,7 +4083,7 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz ; XOPAVX1-LABEL: 
shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; XOPAVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; XOPAVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll index 973045696fbd5..4798b4b1d38a2 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -3216,7 +3216,7 @@ entry: define <8 x float> @broadcast_concat_crash(<4 x float> %x, <4 x float> %y, float %z) { ; AVX1-LABEL: broadcast_concat_crash: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,3,1,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,3,3,3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll index b2d4dc76a10e0..4c8073614d6dd 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -434,7 +434,7 @@ define <32 x i16> @shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_z ; KNL-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz: ; KNL: ## %bb.0: ; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; KNL-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll index 3199cc0fa9b9b..4237c4107d477 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -397,7 +397,7 @@ define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_ ; AVX512F-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq @@ -410,7 +410,7 @@ define <64 x i8> 
@shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_
; AVX512DQ-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 72c1aefb5da7d..622eb0881052e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -3228,7 +3228,7 @@ define void @PR45604(<32 x i16>* %dst, <8 x i16>* %src) {
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [11,11,11,0,11,11,11,0]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
index 25c44964a5511..6adb6b0c2c0b8 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
@@ -372,14 +372,14 @@ define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) {
; BTVER1-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
; BTVER1: # %bb.0:
; BTVER1-NEXT: psrld $16, %xmm1
-; BTVER1-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; BTVER1-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; BTVER1-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsrld $16, %xmm1, %xmm1
-; BTVER2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; BTVER2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; BTVER2-NEXT: retq
%1 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
index 42bf2d9eac284..5d065213f925b 100644
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -57,7 +57,7 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) {
; SSE41-LABEL: zext_16i8_to_16i16:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT:
pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -113,9 +113,9 @@ define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) { ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm1 @@ -277,7 +277,7 @@ define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero @@ -290,9 +290,9 @@ define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX1-NEXT: vmovaps %ymm2, %ymm0 @@ -301,7 +301,7 @@ define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss ; AVX2-LABEL: zext_16i8_to_16i32: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vmovdqa %ymm2, %ymm0 ; AVX2-NEXT: retq @@ -515,7 +515,7 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp ; SSE41-LABEL: zext_8i16_to_8i32: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -571,9 +571,9 @@ define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm1 @@ -727,7 +727,7 @@ define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero @@ -740,9 +740,9 @@ define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX1-NEXT: vmovaps %ymm2, %ymm0 @@ -751,7 +751,7 @@ define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp ; AVX2-LABEL: zext_8i16_to_8i64: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-NEXT: vmovdqa %ymm2, %ymm0 ; AVX2-NEXT: retq @@ -813,7 +813,7 @@ define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind 
uwtable readnone ssp ; SSE41-LABEL: zext_4i32_to_4i64: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -869,9 +869,9 @@ define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm1 @@ -1885,7 +1885,7 @@ define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -1894,7 +1894,7 @@ define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -2213,14 +2213,14 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) { ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -2245,14 +2245,14 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-NEXT: vmovaps %ymm4, %ymm0 @@ -2263,9 +2263,9 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) { ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vmovdqa %ymm4, %ymm0 ; AVX2-NEXT: retq @@ -2499,7 +2499,7 @@ define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; SSE41-NEXT: pand %xmm4, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] @@ -2518,9 +2518,9 @@ define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm2, 
%ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: retq @@ -2532,7 +2532,7 @@ define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vsel-cmp-load.ll b/llvm/test/CodeGen/X86/vsel-cmp-load.ll index c809090c8955c..89d1b549182ab 100644 --- a/llvm/test/CodeGen/X86/vsel-cmp-load.ll +++ b/llvm/test/CodeGen/X86/vsel-cmp-load.ll @@ -80,7 +80,7 @@ define <16 x i16> @sgt_zero(<16 x i8>* %p, <16 x i16> %x, <16 x i16> %y) { ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovsxbw %xmm2, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll index 3cd5654771c5a..24849c2b850cd 100644 --- a/llvm/test/CodeGen/X86/vselect-avx.ll +++ b/llvm/test/CodeGen/X86/vselect-avx.ll @@ -42,7 +42,7 @@ define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) { ; AVX1: ## %bb.0: ## %bb ; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: movq (%rdi,%rsi,8), %rax diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll index 4c56c654defae..42fbdb186357f 100644 --- a/llvm/test/CodeGen/X86/vselect-pcmp.ll +++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll @@ -364,7 +364,7 @@ define <4 x double> @signbit_sel_v4f64_small_mask(<4 x double> %x, <4 x double> ; AVX1-LABEL: signbit_sel_v4f64_small_mask: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovsxdq %xmm2, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 @@ -397,7 +397,7 @@ define <4 x double> @signbit_sel_v4f64_small_mask(<4 x double> %x, <4 x double> ; XOP-LABEL: signbit_sel_v4f64_small_mask: ; XOP: # %bb.0: ; XOP-NEXT: vpmovsxdq %xmm2, %xmm3 -; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; XOP-NEXT: vpmovsxdq %xmm2, %xmm2 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; XOP-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/vshift-4.ll b/llvm/test/CodeGen/X86/vshift-4.ll index 9362c41a3dc8a..1131d07b15d35 100644 --- 
a/llvm/test/CodeGen/X86/vshift-4.ll +++ b/llvm/test/CodeGen/X86/vshift-4.ll @@ -32,7 +32,7 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind { ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movdqa %xmm0, %xmm2 ; X32-NEXT: psllq %xmm1, %xmm2 -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X32-NEXT: psllq %xmm1, %xmm0 ; X32-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; X32-NEXT: movapd %xmm0, (%eax) @@ -42,7 +42,7 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind { ; X64: # %bb.0: # %entry ; X64-NEXT: movdqa %xmm0, %xmm2 ; X64-NEXT: psllq %xmm1, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X64-NEXT: psllq %xmm1, %xmm0 ; X64-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; X64-NEXT: movapd %xmm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/widen_conv-4.ll b/llvm/test/CodeGen/X86/widen_conv-4.ll index 9c567fc384157..d5022b24eabe1 100644 --- a/llvm/test/CodeGen/X86/widen_conv-4.ll +++ b/llvm/test/CodeGen/X86/widen_conv-4.ll @@ -28,7 +28,7 @@ define void @convert_v7i16_v7f32(<7 x float>* %dst.addr, <7 x i16> %src) nounwin ; X86-SSE42-LABEL: convert_v7i16_v7f32: ; X86-SSE42: # %bb.0: # %entry ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; X86-SSE42-NEXT: cvtdq2ps %xmm1, %xmm1 ; X86-SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero @@ -55,7 +55,7 @@ define void @convert_v7i16_v7f32(<7 x float>* %dst.addr, <7 x i16> %src) nounwin ; ; X64-SSE42-LABEL: convert_v7i16_v7f32: ; X64-SSE42: # %bb.0: # %entry -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; X64-SSE42-NEXT: cvtdq2ps %xmm1, %xmm1 ; X64-SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero diff --git a/llvm/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll b/llvm/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll index 6f335c00b589f..e258cdf0035a0 100644 --- a/llvm/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll +++ b/llvm/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll @@ -30,7 +30,7 @@ define void @foo1(<4 x float> %val, <4 x float> %test, <4 x double>* %p) nounwin ; CHECK: ## %bb.0: ; CHECK-NEXT: cmpeqps %xmm1, %xmm0 ; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; CHECK-NEXT: cvtdq2pd %xmm1, %xmm1 ; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0 ; CHECK-NEXT: movaps %xmm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll index e836b00bb6206..e1d6929503f15 100644 --- a/llvm/test/CodeGen/X86/xor.ll +++ b/llvm/test/CodeGen/X86/xor.ll @@ -407,7 +407,7 @@ define i32 @PR17487(i1 %tobool) { ; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X32-NEXT: pandn {{\.LCPI.*}}, %xmm0 -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X32-NEXT: movd %xmm0, %ecx ; X32-NEXT: xorl %eax, %eax ; X32-NEXT: cmpl $1, %ecx From ca9bfc20f48c82a9f223ec814697714a16d1a22d Mon Sep 17 00:00:00 2001 From: Bruno Ricci Date: Sun, 26 Jul 2020 17:10:59 
+0100 Subject: [PATCH 0141/1035] [clang][NFC] Remove spurious +x flag on SemaConcept.cpp --- clang/lib/Sema/SemaConcept.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 clang/lib/Sema/SemaConcept.cpp diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp old mode 100755 new mode 100644 From a2f83d5a07daf7a9b717fff8916c44cd7cc1c678 Mon Sep 17 00:00:00 2001 From: Bruno Ricci Date: Sun, 26 Jul 2020 17:20:56 +0100 Subject: [PATCH 0142/1035] [clang][NFC] Add tests for the use of NamedDecl::getDeclName in the unused/unneeded diagnostics. --- clang/test/SemaCXX/warn-func-not-needed.cpp | 22 +++-- clang/test/SemaCXX/warn-member-not-needed.cpp | 12 ++- clang/test/SemaCXX/warn-unused-filescoped.cpp | 98 +++++++++++-------- .../test/SemaCXX/warn-variable-not-needed.cpp | 7 +- 4 files changed, 89 insertions(+), 50 deletions(-) diff --git a/clang/test/SemaCXX/warn-func-not-needed.cpp b/clang/test/SemaCXX/warn-func-not-needed.cpp index 65721f44f5707..5040aaad94601 100644 --- a/clang/test/SemaCXX/warn-func-not-needed.cpp +++ b/clang/test/SemaCXX/warn-func-not-needed.cpp @@ -1,13 +1,23 @@ // RUN: %clang_cc1 -fsyntax-only -verify -Wall %s namespace test1 { - static void f() {} // expected-warning {{is not needed and will not be emitted}} - static void f(); - template - void foo() { - f(); - } +static void f() {} // expected-warning {{function 'f' is not needed and will not be emitted}} +static void f(); +template +void foo() { + f(); +} +} + +namespace test1_template { +template static void f() {} +template <> void f() {} // expected-warning {{function 'f' is not needed and will not be emitted}} +template +void foo() { + f(); + f(); } +} // namespace test1_template namespace test2 { static void f() {} diff --git a/clang/test/SemaCXX/warn-member-not-needed.cpp b/clang/test/SemaCXX/warn-member-not-needed.cpp index 61bb3488c6116..95241f4f7fee0 100644 --- a/clang/test/SemaCXX/warn-member-not-needed.cpp +++ b/clang/test/SemaCXX/warn-member-not-needed.cpp @@ -1,11 +1,19 @@ -// RUN: %clang_cc1 -fsyntax-only -verify -Wunneeded-member-function %s +// RUN: %clang_cc1 -fsyntax-only -verify -Wunneeded-member-function -Wno-unused-template %s namespace { class A { - void g() {} // expected-warning {{is not needed and will not be emitted}} + void g() {} // expected-warning {{member function 'g' is not needed and will not be emitted}} + template void gt(T) {} + template <> void gt(int) {} // expected-warning {{member function 'gt' is not needed and will not be emitted}} + template <> void gt(float) {} // expected-warning {{member function 'gt' is not needed and will not be emitted}} + template void foo() { g(); + gt(0); + gt(0.0f); + gt(0.0); } }; + template void A::gt(double); // no-warning } diff --git a/clang/test/SemaCXX/warn-unused-filescoped.cpp b/clang/test/SemaCXX/warn-unused-filescoped.cpp index 7ea398feb2b1e..056543d5eeb08 100644 --- a/clang/test/SemaCXX/warn-unused-filescoped.cpp +++ b/clang/test/SemaCXX/warn-unused-filescoped.cpp @@ -1,14 +1,15 @@ -// RUN: %clang_cc1 -fsyntax-only -verify -Wunused -Wunused-template -Wunused-member-function -Wno-unused-local-typedefs -Wno-c++11-extensions -std=c++98 %s +// RUN: %clang_cc1 -fsyntax-only -verify -Wunused -Wunused-template -Wunused-member-function -Wno-unused-local-typedefs \ +// RUN: -Wno-c++11-extensions -Wno-c++14-extensions -std=c++98 %s // RUN: %clang_cc1 -fsyntax-only -verify -Wunused -Wunused-template -Wunused-member-function -Wno-unused-local-typedefs -std=c++14 %s #ifdef HEADER -static void 
headerstatic() {} // expected-warning{{unused}} +static void headerstatic() {} // expected-warning{{unused function 'headerstatic'}} static inline void headerstaticinline() {} namespace { - void headeranon() {} // expected-warning{{unused}} - inline void headerinlineanon() {} +void headeranon() {} // expected-warning{{unused function 'headeranon'}} +inline void headerinlineanon() {} } namespace test7 @@ -43,31 +44,31 @@ namespace pr19713 { #define HEADER #include "warn-unused-filescoped.cpp" -static void f1(); // expected-warning{{unused}} +static void f1(); // expected-warning{{unused function 'f1'}} namespace { - void f2(); // expected-warning{{unused}} +void f2(); // expected-warning{{unused function 'f2'}} - void f3() { } // expected-warning{{unused}} +void f3() {} // expected-warning{{unused function 'f3'}} - struct S { - void m1() { } // expected-warning{{unused}} - void m2(); // expected-warning{{unused}} - void m3(); - S(const S&); - void operator=(const S&); - }; +struct S { + void m1() {} // expected-warning{{unused member function 'm1'}} + void m2(); // expected-warning{{unused member function 'm2'}} + void m3(); + S(const S &); + void operator=(const S &); +}; template struct TS { void m(); }; - template <> void TS::m() { } // expected-warning{{unused}} + template <> void TS::m() {} // expected-warning{{unused member function 'm'}} template - void tf() { } // expected-warning{{unused}} - template <> void tf() { } // expected-warning{{unused}} - + void tf() {} // expected-warning{{unused function template 'tf'}} + template <> void tf() {} // expected-warning{{unused function 'tf'}} + struct VS { virtual void vm() { } }; @@ -77,27 +78,31 @@ namespace { }; } -void S::m3() { } // expected-warning{{unused}} +void S::m3() {} // expected-warning{{unused member function 'm3'}} -static inline void f4() { } // expected-warning{{unused}} -const unsigned int cx = 0; // expected-warning{{unused}} +static inline void f4() {} // expected-warning{{unused function 'f4'}} +const unsigned int cx = 0; // expected-warning{{unused variable 'cx'}} const unsigned int cy = 0; int f5() { return cy; } -static int x1; // expected-warning{{unused}} +static int x1; // expected-warning{{unused variable 'x1'}} namespace { - int x2; // expected-warning{{unused}} - - struct S2 { - static int x; // expected-warning{{unused}} - }; +int x2; // expected-warning{{unused variable 'x2'}} + +struct S2 { + static int x; // expected-warning{{unused variable 'x'}} +}; template struct TS2 { static int x; }; - template <> int TS2::x; // expected-warning{{unused}} + template <> int TS2::x; // expected-warning{{unused variable 'x'}} + + template int vt = 0; // expected-warning {{unused variable template 'vt'}} + template int vt = 0; + template <> int vt = 0; // expected-warning {{unused variable 'vt'}} } namespace PR8841 { @@ -120,17 +125,21 @@ namespace PR8841 { namespace test4 { namespace { struct A {}; } - void test(A a); // expected-warning {{unused function}} + void test(A a); // expected-warning {{unused function 'test'}} extern "C" void test4(A a); } namespace rdar8733476 { - static void foo() { } // expected-warning {{not needed and will not be emitted}} +static void foo() {} // expected-warning {{function 'foo' is not needed and will not be emitted}} +template static void foo_t() {} // expected-warning {{unused function template 'foo_t'}} +template <> void foo_t() {} // expected-warning {{function 'foo_t' is not needed and will not be emitted}} - template - void bar() { - foo(); - } +template +void bar() { + foo(); + 
foo_t(); + foo_t(); +} } namespace test5 { @@ -142,8 +151,15 @@ namespace test5 { // FIXME: We should produce warnings for both of these. static const int m = n; int x = sizeof(m); - static const double d = 0.0; // expected-warning{{not needed and will not be emitted}} + static const double d = 0.0; // expected-warning{{variable 'd' is not needed and will not be emitted}} int y = sizeof(d); + + namespace { + // FIXME: Should be "unused variable template 'var_t'" instead. + template const double var_t = 0; // expected-warning {{unused variable 'var_t'}} + template <> const double var_t = 0; // expected-warning {{variable 'var_t' is not needed and will not be emitted}} + int z = sizeof(var_t); // expected-warning {{unused variable 'z'}} + } // namespace } namespace unused_nested { @@ -158,9 +174,9 @@ namespace unused_nested { namespace unused { struct { - void func() { // expected-warning {{unused member function}} + void func() { // expected-warning {{unused member function 'func'}} } - } x; // expected-warning {{unused variable}} + } x; // expected-warning {{unused variable 'x'}} } namespace test6 { @@ -201,8 +217,8 @@ static void func() {} } namespace test9 { -template -static void completeRedeclChainForTemplateSpecialization() { } // expected-warning {{unused}} +template +static void completeRedeclChainForTemplateSpecialization() {} // expected-warning {{unused function template 'completeRedeclChainForTemplateSpecialization'}} } namespace test10 { @@ -216,8 +232,8 @@ constexpr T pi = T(3.14); namespace pr19713 { #if __cplusplus >= 201103L // FIXME: We should warn on both of these. - static constexpr int constexpr3() { return 1; } // expected-warning {{unused}} - constexpr int constexpr4() { return 2; } +static constexpr int constexpr3() { return 1; } // expected-warning {{unused function 'constexpr3'}} +constexpr int constexpr4() { return 2; } #endif } diff --git a/clang/test/SemaCXX/warn-variable-not-needed.cpp b/clang/test/SemaCXX/warn-variable-not-needed.cpp index 0fb0f8151b449..139c2923f4bae 100644 --- a/clang/test/SemaCXX/warn-variable-not-needed.cpp +++ b/clang/test/SemaCXX/warn-variable-not-needed.cpp @@ -2,9 +2,14 @@ namespace test1 { static int abc = 42; // expected-warning {{variable 'abc' is not needed and will not be emitted}} + + namespace { + template int abc_template = 0; + template <> int abc_template = 0; // expected-warning {{variable 'abc_template' is not needed and will not be emitted}} + } // namespace template int foo(void) { - return abc; + return abc + abc_template + abc_template; } } From b0512eed1e9dc03dba4ef8cccee73c13d3487565 Mon Sep 17 00:00:00 2001 From: Bruno Ricci Date: Sun, 26 Jul 2020 17:24:43 +0100 Subject: [PATCH 0143/1035] [clang][NFC] Add a test for __attribute__((flag_enum)) with an unnamed enumeration. 
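For context, a hedged sketch of how this diagnostic behaves (my paraphrase, not wording from the patch): Clang folds the single-bit enumerator values of a flag_enum into a flag mask, then warns on any enumerator whose value has bits outside that mask. The unnamed enumeration in the new test has no single-bit enumerators at all, so 'g = 0x7' is out of range. Roughly:

  enum __attribute__((flag_enum)) ok {
    a = 0x1,
    b = 0x4,
    both = 0x5, /* accepted: 0x1 | 0x4, fully covered by the flag mask */
  };
  enum __attribute__((flag_enum)) {
    g = 0x7, /* warns: no single-bit enumerators cover 0x1, 0x2, 0x4 */
  };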
--- clang/test/Sema/attr-flag-enum.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang/test/Sema/attr-flag-enum.c b/clang/test/Sema/attr-flag-enum.c index ae3e3ad5ab99f..467afd950973d 100644 --- a/clang/test/Sema/attr-flag-enum.c +++ b/clang/test/Sema/attr-flag-enum.c @@ -6,6 +6,10 @@ enum __attribute__((flag_enum)) flag { ec = 0x8, }; +enum __attribute__((flag_enum)) { + g = 0x7, // expected-warning {{enumeration value 'g' is out of range of flags in enumeration type ''}} +}; + enum __attribute__((flag_enum)) flag2 { ga = 0x1, gb = 0x4, From ab4e1be7ab3f1a1e7b78316b5b1929963e0e3bce Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Mon, 27 Jul 2020 02:23:51 +0900 Subject: [PATCH 0144/1035] [InstCombine] Add a test for folding freeze into phi; NFC --- .../test/Transforms/InstCombine/freeze-phi.ll | 171 ++++++++++++++++++ 1 file changed, 171 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/freeze-phi.ll diff --git a/llvm/test/Transforms/InstCombine/freeze-phi.ll b/llvm/test/Transforms/InstCombine/freeze-phi.ll new file mode 100644 index 0000000000000..fcda502a0d7f8 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/freeze-phi.ll @@ -0,0 +1,171 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s + +define i32 @const(i1 %cond) { +; CHECK-LABEL: @const( +; CHECK-NEXT: br i1 [[COND:%.*]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: br label [[C:%.*]] +; CHECK: B: +; CHECK-NEXT: br label [[C]] +; CHECK: C: +; CHECK-NEXT: [[Y:%.*]] = phi i32 [ 0, [[A]] ], [ 1, [[B]] ] +; CHECK-NEXT: ret i32 [[Y]] +; + br i1 %cond, label %A, label %B +A: + br label %C +B: + br label %C +C: + %y = phi i32 [0, %A], [1, %B] + %y.fr = freeze i32 %y + ret i32 %y +} + +define i32 @one(i1 %cond, i32 %x) { +; CHECK-LABEL: @one( +; CHECK-NEXT: br i1 [[COND:%.*]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: br label [[C:%.*]] +; CHECK: B: +; CHECK-NEXT: br label [[C]] +; CHECK: C: +; CHECK-NEXT: [[Y:%.*]] = phi i32 [ 0, [[A]] ], [ [[X:%.*]], [[B]] ] +; CHECK-NEXT: [[Y_FR:%.*]] = freeze i32 [[Y]] +; CHECK-NEXT: ret i32 [[Y_FR]] +; + br i1 %cond, label %A, label %B +A: + br label %C +B: + br label %C +C: + %y = phi i32 [0, %A], [%x, %B] + %y.fr = freeze i32 %y + ret i32 %y.fr +} + +define i32 @two(i1 %cond, i32 %x, i32 %x2) { +; CHECK-LABEL: @two( +; CHECK-NEXT: br i1 [[COND:%.*]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: br label [[C:%.*]] +; CHECK: B: +; CHECK-NEXT: br label [[C]] +; CHECK: C: +; CHECK-NEXT: [[Y:%.*]] = phi i32 [ [[X:%.*]], [[A]] ], [ [[X2:%.*]], [[B]] ] +; CHECK-NEXT: [[Y_FR:%.*]] = freeze i32 [[Y]] +; CHECK-NEXT: ret i32 [[Y_FR]] +; + br i1 %cond, label %A, label %B +A: + br label %C +B: + br label %C +C: + %y = phi i32 [%x, %A], [%x2, %B] + %y.fr = freeze i32 %y + ret i32 %y.fr +} + +define i32 @two_undef(i8 %cond, i32 %x) { +; CHECK-LABEL: @two_undef( +; CHECK-NEXT: switch i8 [[COND:%.*]], label [[A:%.*]] [ +; CHECK-NEXT: i8 0, label [[B:%.*]] +; CHECK-NEXT: i8 1, label [[C:%.*]] +; CHECK-NEXT: ] +; CHECK: A: +; CHECK-NEXT: br label [[D:%.*]] +; CHECK: B: +; CHECK-NEXT: br label [[D]] +; CHECK: C: +; CHECK-NEXT: br label [[D]] +; CHECK: D: +; CHECK-NEXT: [[Y:%.*]] = phi i32 [ undef, [[A]] ], [ [[X:%.*]], [[B]] ], [ 0, [[C]] ] +; CHECK-NEXT: [[Y_FR:%.*]] = freeze i32 [[Y]] +; CHECK-NEXT: ret i32 [[Y_FR]] +; + switch i8 %cond, label %A [ + i8 0, label %B + i8 1, label %C + ] +A: + br label %D +B: + br label %D +C: + br label %D +D: + %y = phi i32 
[undef, %A], [%x, %B], [0, %C] + %y.fr = freeze i32 %y + ret i32 %y.fr +} + +define i32 @one_undef(i8 %cond) { +; CHECK-LABEL: @one_undef( +; CHECK-NEXT: switch i8 [[COND:%.*]], label [[A:%.*]] [ +; CHECK-NEXT: i8 0, label [[B:%.*]] +; CHECK-NEXT: i8 1, label [[C:%.*]] +; CHECK-NEXT: ] +; CHECK: A: +; CHECK-NEXT: br label [[D:%.*]] +; CHECK: B: +; CHECK-NEXT: br label [[D]] +; CHECK: C: +; CHECK-NEXT: br label [[D]] +; CHECK: D: +; CHECK-NEXT: [[Y:%.*]] = phi i32 [ undef, [[A]] ], [ 32, [[B]] ], [ 0, [[C]] ] +; CHECK-NEXT: [[Y_FR:%.*]] = freeze i32 [[Y]] +; CHECK-NEXT: ret i32 [[Y_FR]] +; + switch i8 %cond, label %A [ + i8 0, label %B + i8 1, label %C + ] +A: + br label %D +B: + br label %D +C: + br label %D +D: + %y = phi i32 [undef, %A], [32, %B], [0, %C] + %y.fr = freeze i32 %y + ret i32 %y.fr +} + +@glb = global i8 0 + +define i32 @one_constexpr(i8 %cond, i32 %x) { +; CHECK-LABEL: @one_constexpr( +; CHECK-NEXT: switch i8 [[COND:%.*]], label [[A:%.*]] [ +; CHECK-NEXT: i8 0, label [[B:%.*]] +; CHECK-NEXT: i8 1, label [[C:%.*]] +; CHECK-NEXT: ] +; CHECK: A: +; CHECK-NEXT: br label [[D:%.*]] +; CHECK: B: +; CHECK-NEXT: br label [[D]] +; CHECK: C: +; CHECK-NEXT: br label [[D]] +; CHECK: D: +; CHECK-NEXT: [[Y:%.*]] = phi i32 [ ptrtoint (i8* getelementptr inbounds (i8, i8* @glb, i64 2) to i32), [[A]] ], [ 32, [[B]] ], [ 0, [[C]] ] +; CHECK-NEXT: [[Y_FR:%.*]] = freeze i32 [[Y]] +; CHECK-NEXT: ret i32 [[Y_FR]] +; + switch i8 %cond, label %A [ + i8 0, label %B + i8 1, label %C + ] +A: + br label %D +B: + br label %D +C: + br label %D +D: + %y = phi i32 [ptrtoint (i8* getelementptr inbounds (i8, i8* @glb, i64 2) to i32), %A], [32, %B], [0, %C] + %y.fr = freeze i32 %y + ret i32 %y.fr +} From 1a75d88b3eb4e0cc5494281f24f362ece3d4aa4a Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 26 Jul 2020 10:38:34 -0700 Subject: [PATCH 0145/1035] [X86] Move getGatherOverhead/getScatterOverhead into X86TargetTransformInfo. These cost methods don't make much sense in X86Subtarget. Make them methods in X86's TTI and move the feature checks from the X86Subtarget constructor into these methods. Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D84594 --- llvm/lib/Target/X86/X86Subtarget.cpp | 11 --------- llvm/lib/Target/X86/X86Subtarget.h | 6 ----- .../lib/Target/X86/X86TargetTransformInfo.cpp | 24 +++++++++++++++++-- llvm/lib/Target/X86/X86TargetTransformInfo.h | 3 +++ 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index 07e913e139111..095aea3a10636 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -262,17 +262,6 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { isTargetKFreeBSD() || In64BitMode) stackAlignment = Align(16); - // Some CPUs have more overhead for gather. The specified overhead is relative - // to the Load operation. "2" is the number provided by Intel architects. This - // parameter is used for cost estimation of Gather Op and comparison with - // other alternatives. - // TODO: Remove the explicit hasAVX512()?, That would mean we would only - // enable gather with a -march. - if (hasAVX512() || (hasAVX2() && hasFastGather())) - GatherOverhead = 2; - if (hasAVX512()) - ScatterOverhead = 2; - // Consume the vector width attribute or apply any target specific limit. 
if (PreferVectorWidthOverride) PreferVectorWidth = PreferVectorWidthOverride; diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 5b5ab4b969aac..e555dfdd638ac 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -506,10 +506,6 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// True if compiling for 16-bit, false for 32-bit or 64-bit. bool In16BitMode = false; - /// Contains the Overhead of gather\scatter instructions - int GatherOverhead = 1024; - int ScatterOverhead = 1024; - X86SelectionDAGInfo TSInfo; // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which // X86TargetLowering needs. @@ -678,8 +674,6 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool isPMADDWDSlow() const { return IsPMADDWDSlow; } bool isUnalignedMem16Slow() const { return IsUAMem16Slow; } bool isUnalignedMem32Slow() const { return IsUAMem32Slow; } - int getGatherOverhead() const { return GatherOverhead; } - int getScatterOverhead() const { return ScatterOverhead; } bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } bool hasCmpxchg16b() const { return HasCmpxchg16b && is64Bit(); } bool useLeaForSP() const { return UseLeaForSP; } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index cc18e55656ef5..491078fd0542e 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3848,6 +3848,26 @@ X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) { return CostKind == TTI::TCK_RecipThroughput ? 0 : 1; } +int X86TTIImpl::getGatherOverhead() const { + // Some CPUs have more overhead for gather. The specified overhead is relative + // to the Load operation. "2" is the number provided by Intel architects. This + // parameter is used for cost estimation of Gather Op and comparison with + // other alternatives. + // TODO: Remove the explicit hasAVX512()?, That would mean we would only + // enable gather with a -march. + if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather())) + return 2; + + return 1024; +} + +int X86TTIImpl::getScatterOverhead() const { + if (ST->hasAVX512()) + return 2; + + return 1024; +} + // Return an average cost of Gather / Scatter instruction, maybe improved later int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr, Align Alignment, unsigned AddressSpace) { @@ -3906,8 +3926,8 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr, // The gather / scatter cost is given by Intel architects. It is a rough // number since we are looking at one instruction in a time. const int GSOverhead = (Opcode == Instruction::Load) - ? ST->getGatherOverhead() - : ST->getScatterOverhead(); + ? 
getGatherOverhead() + : getScatterOverhead(); return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), MaybeAlign(Alignment), AddressSpace, TTI::TCK_RecipThroughput); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index ca875fa68523c..5ccaf409790db 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -244,6 +244,9 @@ class X86TTIImpl : public BasicTTIImplBase { int getGSVectorCost(unsigned Opcode, Type *DataTy, const Value *Ptr, Align Alignment, unsigned AddressSpace); + int getGatherOverhead() const; + int getScatterOverhead() const; + /// @} }; From 3bbf3e026d3c692966583075ae6d12c4575e9d72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hannes=20K=C3=A4ufler?= Date: Sun, 26 Jul 2020 13:59:45 -0400 Subject: [PATCH 0146/1035] Replace comment by private method; NFC. --- clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp b/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp index 68bb987c1275f..1cae618dfd093 100644 --- a/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp +++ b/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp @@ -123,12 +123,7 @@ class HeaderGuardPPCallbacks : public PPCallbacks { // Emit warnings for headers that are missing guards. checkGuardlessHeaders(); - - // Clear all state. - Macros.clear(); - Files.clear(); - Ifndefs.clear(); - EndIfs.clear(); + clearAllState(); } bool wouldFixEndifComment(StringRef FileName, SourceLocation EndIf, @@ -255,6 +250,13 @@ class HeaderGuardPPCallbacks : public PPCallbacks { } private: + void clearAllState() { + Macros.clear(); + Files.clear(); + Ifndefs.clear(); + EndIfs.clear(); + } + std::vector> Macros; llvm::StringMap Files; std::map> From 7454acdf3b7d064ebbf6b8027296f42f504b285a Mon Sep 17 00:00:00 2001 From: Tim Keith Date: Sun, 26 Jul 2020 12:13:36 -0700 Subject: [PATCH 0147/1035] [flang] Fix implicit declarations in statement functions If a symbol (that is not a dummy argument) is implicitly declared inside a statement function, don't create it in the statement function's scope. Instead, treat statement functions like blocks when finding the inclusive scope and create the symbol there. Add a new flag, StmtFunction, to symbols that represent statement functions. 
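A minimal illustration of the fix, mirroring the new /p2 case in symbol16.f90 below (the names come from that test; the comments are mine):

  program p2
    f(x) = y   ! statement function: x is a dummy argument, y is not
    y = 1.0    ! must be the same 'y' that f references
  end program

The dummy argument x still gets a symbol in the statement function's scope (/p2/f/x), but the implicitly typed y now resolves in the inclusive host scope as /p2/y, instead of a spurious second 'y' being created inside f's scope.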
Differential Revision: https://reviews.llvm.org/D84588 --- flang/include/flang/Semantics/scope.h | 1 + flang/include/flang/Semantics/symbol.h | 1 + flang/lib/Semantics/resolve-names.cpp | 4 +++- flang/lib/Semantics/scope.cpp | 4 ++++ flang/test/Semantics/symbol16.f90 | 12 +++++++++++- 5 files changed, 20 insertions(+), 2 deletions(-) diff --git a/flang/include/flang/Semantics/scope.h b/flang/include/flang/Semantics/scope.h index a67a008889276..5ebe5f32eb677 100644 --- a/flang/include/flang/Semantics/scope.h +++ b/flang/include/flang/Semantics/scope.h @@ -87,6 +87,7 @@ class Scope { bool IsModule() const; // only module, not submodule bool IsSubmodule() const; bool IsDerivedType() const { return kind_ == Kind::DerivedType; } + bool IsStmtFunction() const; bool IsParameterizedDerivedType() const; Symbol *symbol() { return symbol_; } const Symbol *symbol() const { return symbol_; } diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h index 227f951aef5f0..a1fd1baef78d3 100644 --- a/flang/include/flang/Semantics/symbol.h +++ b/flang/include/flang/Semantics/symbol.h @@ -482,6 +482,7 @@ class Symbol { Error, // an error has been reported on this symbol Function, // symbol is a function Subroutine, // symbol is a subroutine + StmtFunction, // symbol is a statement function (Function is set too) Implicit, // symbol is implicitly typed ModFile, // symbol came from .mod file ParentComp, // symbol is the "parent component" of an extended type diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 7189b48482654..8ea2e8ed9a60a 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -2015,7 +2015,8 @@ void ScopeHandler::Say2(const parser::Name &name, MessageFixedText &&msg1, Scope &ScopeHandler::InclusiveScope() { for (auto *scope{&currScope()};; scope = &scope->parent()) { - if (scope->kind() != Scope::Kind::Block && !scope->IsDerivedType()) { + if (scope->kind() != Scope::Kind::Block && !scope->IsDerivedType() && + !scope->IsStmtFunction()) { return *scope; } } @@ -2692,6 +2693,7 @@ bool SubprogramVisitor::HandleStmtFunction(const parser::StmtFunctionStmt &x) { return true; } auto &symbol{PushSubprogramScope(name, Symbol::Flag::Function)}; + symbol.set(Symbol::Flag::StmtFunction); EraseSymbol(symbol); // removes symbol added by PushSubprogramScope auto &details{symbol.get()}; for (const auto &dummyName : std::get>(x.t)) { diff --git a/flang/lib/Semantics/scope.cpp b/flang/lib/Semantics/scope.cpp index 02637ba2add7f..a2a9e1dbe9e73 100644 --- a/flang/lib/Semantics/scope.cpp +++ b/flang/lib/Semantics/scope.cpp @@ -333,6 +333,10 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const Scope &scope) { return os; } +bool Scope::IsStmtFunction() const { + return symbol_ && symbol_->test(Symbol::Flag::StmtFunction); +} + bool Scope::IsParameterizedDerivedType() const { if (!IsDerivedType()) { return false; diff --git a/flang/test/Semantics/symbol16.f90 b/flang/test/Semantics/symbol16.f90 index ce47134fc3377..4fa6f2b9c0eab 100644 --- a/flang/test/Semantics/symbol16.f90 +++ b/flang/test/Semantics/symbol16.f90 @@ -3,7 +3,7 @@ !DEF: /p1 MainProgram program p1 - !DEF: /p1/f (Function) Subprogram INTEGER(4) + !DEF: /p1/f (Function, StmtFunction) Subprogram INTEGER(4) !DEF: /p1/i ObjectEntity INTEGER(4) !DEF: /p1/j ObjectEntity INTEGER(4) integer f, i, j @@ -15,3 +15,13 @@ program p1 !REF: /p1/f j = f(2) end program + +!DEF: /p2 MainProgram +program p2 + !DEF: /p2/f (Function, StmtFunction) 
Subprogram REAL(4) + !DEF: /p2/f/x (Implicit) ObjectEntity REAL(4) + !DEF: /p2/y (Implicit) ObjectEntity REAL(4) + f(x) = y + !REF: /p2/y + y = 1.0 +end program From df12524e6ba02d3eda975de4541f55e151074b07 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 26 Jul 2020 10:57:59 -0700 Subject: [PATCH 0148/1035] [X86] Turn X86DAGToDAGISel::tryVPTERNLOG into a fully custom instruction selector that can handle bitcasts between logic ops Previously we just matched the logic ops and replaced with an X86ISD::VPTERNLOG node that we would send through the normal pattern match. But that approach couldn't handle a bitcast between the logic ops. Extending that approach would require us to peek through the bitcasts and emit new bitcasts to match the types. Those new bitcasts would then have to be properly topologically sorted. This patch instead switches to directly emitting the MachineSDNode and skips the normal tablegen pattern matching. We do have to handle load folding and broadcast load folding ourselves now. Which also means commuting the immediate control. Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D83630 --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 151 +++++++++++++++++++++--- llvm/test/CodeGen/X86/avx512-logic.ll | 30 ++--- llvm/test/CodeGen/X86/avx512vl-logic.ll | 12 +- 3 files changed, 145 insertions(+), 48 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 3cd80cb04ab84..4098911dee3b1 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -3940,30 +3940,39 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { if (!(Subtarget->hasVLX() || NVT.is512BitVector())) return false; - unsigned Opc1 = N->getOpcode(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - auto isLogicOp = [](unsigned Opc) { - return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR || - Opc == X86ISD::ANDNP; + auto getFoldableLogicOp = [](SDValue Op) { + // Peek through single use bitcast. + if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse()) + Op = Op.getOperand(0); + + if (!Op.hasOneUse()) + return SDValue(); + + unsigned Opc = Op.getOpcode(); + if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR || + Opc == X86ISD::ANDNP) + return Op; + + return SDValue(); }; - SDValue A, B, C; - unsigned Opc2; - if (isLogicOp(N1.getOpcode()) && N1.hasOneUse()) { - Opc2 = N1.getOpcode(); + SDValue A, FoldableOp; + if ((FoldableOp = getFoldableLogicOp(N1))) { A = N0; - B = N1.getOperand(0); - C = N1.getOperand(1); - } else if (isLogicOp(N0.getOpcode()) && N0.hasOneUse()) { - Opc2 = N0.getOpcode(); + } else if ((FoldableOp = getFoldableLogicOp(N0))) { A = N1; - B = N0.getOperand(0); - C = N0.getOperand(1); } else return false; + SDValue B = FoldableOp.getOperand(0); + SDValue C = FoldableOp.getOperand(1); + + unsigned Opc1 = N->getOpcode(); + unsigned Opc2 = FoldableOp.getOpcode(); + uint64_t Imm; switch (Opc1) { default: llvm_unreachable("Unexpected opcode!"); @@ -3996,11 +4005,117 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { break; } + auto tryFoldLoadOrBCast = + [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, SDValue &Segment) { + if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment)) + return true; + + // Not a load, check for broadcast which may be behind a bitcast. 
+ if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { + P = L.getNode(); + L = L.getOperand(0); + } + + if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) + return false; + + // Only 32 and 64 bit broadcasts are supported. + auto *MemIntr = cast(L); + unsigned Size = MemIntr->getMemoryVT().getSizeInBits(); + if (Size != 32 && Size != 64) + return false; + + return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment); + }; + + bool FoldedLoad = false; + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (tryFoldLoadOrBCast(N, FoldableOp.getNode(), C, Tmp0, Tmp1, Tmp2, Tmp3, + Tmp4)) { + FoldedLoad = true; + } else if (tryFoldLoadOrBCast(N, N, A, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { + FoldedLoad = true; + std::swap(A, C); + // Swap bits 1/4 and 3/6. + uint8_t OldImm = Imm; + Imm = OldImm & 0xa5; + if (OldImm & 0x02) Imm |= 0x10; + if (OldImm & 0x10) Imm |= 0x02; + if (OldImm & 0x08) Imm |= 0x40; + if (OldImm & 0x40) Imm |= 0x08; + } else if (tryFoldLoadOrBCast(N, FoldableOp.getNode(), B, Tmp0, Tmp1, Tmp2, + Tmp3, Tmp4)) { + FoldedLoad = true; + std::swap(B, C); + // Swap bits 1/2 and 5/6. + uint8_t OldImm = Imm; + Imm = OldImm & 0x99; + if (OldImm & 0x02) Imm |= 0x04; + if (OldImm & 0x04) Imm |= 0x02; + if (OldImm & 0x20) Imm |= 0x40; + if (OldImm & 0x40) Imm |= 0x20; + } + SDLoc DL(N); - SDValue New = CurDAG->getNode(X86ISD::VPTERNLOG, DL, NVT, A, B, C, - CurDAG->getTargetConstant(Imm, DL, MVT::i8)); - ReplaceNode(N, New.getNode()); - SelectCode(New.getNode()); + + SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8); + + MachineSDNode *MNode; + if (FoldedLoad) { + SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); + + unsigned Opc; + if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) { + auto *MemIntr = cast(C); + unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits(); + assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!"); + + bool UseD = EltSize == 32; + if (NVT.is128BitVector()) + Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi; + else if (NVT.is256BitVector()) + Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi; + else if (NVT.is512BitVector()) + Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi; + else + llvm_unreachable("Unexpected vector size!"); + } else { + bool UseD = NVT.getVectorElementType() == MVT::i32; + if (NVT.is128BitVector()) + Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi; + else if (NVT.is256BitVector()) + Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi; + else if (NVT.is512BitVector()) + Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi; + else + llvm_unreachable("Unexpected vector size!"); + } + + SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)}; + MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops); + + // Update the chain. + ReplaceUses(C.getValue(1), SDValue(MNode, 1)); + // Record the mem-refs + CurDAG->setNodeMemRefs(MNode, {cast(C)->getMemOperand()}); + } else { + bool UseD = NVT.getVectorElementType() == MVT::i32; + unsigned Opc; + if (NVT.is128BitVector()) + Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri; + else if (NVT.is256BitVector()) + Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri; + else if (NVT.is512BitVector()) + Opc = UseD ? 
X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri; + else + llvm_unreachable("Unexpected vector size!"); + + MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm}); + } + + ReplaceUses(SDValue(N, 0), SDValue(MNode, 0)); + CurDAG->RemoveDeadNode(N); return true; } diff --git a/llvm/test/CodeGen/X86/avx512-logic.ll b/llvm/test/CodeGen/X86/avx512-logic.ll index 30607214f56d5..24e58149eb4c2 100644 --- a/llvm/test/CodeGen/X86/avx512-logic.ll +++ b/llvm/test/CodeGen/X86/avx512-logic.ll @@ -887,34 +887,20 @@ define <16 x i32> @ternlog_xor_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z) } define <16 x i32> @ternlog_or_and_mask(<16 x i32> %x, <16 x i32> %y) { -; KNL-LABEL: ternlog_or_and_mask: -; KNL: ## %bb.0: -; KNL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 -; KNL-NEXT: vpord %zmm1, %zmm0, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: ternlog_or_and_mask: -; SKX: ## %bb.0: -; SKX-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0 -; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: retq +; ALL-LABEL: ternlog_or_and_mask: +; ALL: ## %bb.0: +; ALL-NEXT: vpternlogd $236, {{.*}}(%rip), %zmm1, %zmm0 +; ALL-NEXT: retq %a = and <16 x i32> %x, %b = or <16 x i32> %a, %y ret <16 x i32> %b } define <8 x i64> @ternlog_xor_and_mask(<8 x i64> %x, <8 x i64> %y) { -; KNL-LABEL: ternlog_xor_and_mask: -; KNL: ## %bb.0: -; KNL-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 -; KNL-NEXT: vpxorq %zmm1, %zmm0, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: ternlog_xor_and_mask: -; SKX: ## %bb.0: -; SKX-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0 -; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: retq +; ALL-LABEL: ternlog_xor_and_mask: +; ALL: ## %bb.0: +; ALL-NEXT: vpternlogq $108, {{.*}}(%rip), %zmm1, %zmm0 +; ALL-NEXT: retq %a = and <8 x i64> %x, %b = xor <8 x i64> %a, %y ret <8 x i64> %b diff --git a/llvm/test/CodeGen/X86/avx512vl-logic.ll b/llvm/test/CodeGen/X86/avx512vl-logic.ll index 3f0ce30928478..13c4c8afb9a8f 100644 --- a/llvm/test/CodeGen/X86/avx512vl-logic.ll +++ b/llvm/test/CodeGen/X86/avx512vl-logic.ll @@ -991,8 +991,7 @@ define <4 x i32> @ternlog_xor_andn(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { define <4 x i32> @ternlog_or_and_mask(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: ternlog_or_and_mask: ; CHECK: ## %bb.0: -; CHECK-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpternlogd $236, {{.*}}(%rip), %xmm1, %xmm0 ; CHECK-NEXT: retq %a = and <4 x i32> %x, %b = or <4 x i32> %a, %y @@ -1002,8 +1001,7 @@ define <4 x i32> @ternlog_or_and_mask(<4 x i32> %x, <4 x i32> %y) { define <8 x i32> @ternlog_or_and_mask_ymm(<8 x i32> %x, <8 x i32> %y) { ; CHECK-LABEL: ternlog_or_and_mask_ymm: ; CHECK: ## %bb.0: -; CHECK-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpternlogd $236, {{.*}}(%rip), %ymm1, %ymm0 ; CHECK-NEXT: retq %a = and <8 x i32> %x, %b = or <8 x i32> %a, %y @@ -1013,8 +1011,7 @@ define <8 x i32> @ternlog_or_and_mask_ymm(<8 x i32> %x, <8 x i32> %y) { define <2 x i64> @ternlog_xor_and_mask(<2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: ternlog_xor_and_mask: ; CHECK: ## %bb.0: -; CHECK-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0 ; CHECK-NEXT: retq %a = and <2 x i64> %x, %b = xor <2 x i64> %a, %y @@ -1024,8 +1021,7 @@ define <2 x i64> @ternlog_xor_and_mask(<2 x i64> %x, <2 x i64> %y) { define <4 x i64> @ternlog_xor_and_mask_ymm(<4 x i64> %x, <4 x i64> %y) { ; CHECK-LABEL: ternlog_xor_and_mask_ymm: ; CHECK: ## %bb.0: -; CHECK-NEXT: vandps 
{{.*}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpternlogq $108, {{.*}}(%rip), %ymm1, %ymm0 ; CHECK-NEXT: retq %a = and <4 x i64> %x, %b = xor <4 x i64> %a, %y From 9282d04e041c4dd21d3af8463e2cb30964a9272c Mon Sep 17 00:00:00 2001 From: Jez Ng Date: Sun, 26 Jul 2020 12:46:46 -0700 Subject: [PATCH 0149/1035] [lld-macho] Support lookup of dylibs in frameworks Needed for testing Objective-C programs (since e.g. Core Foundation is a framework) Reviewed By: #lld-macho, compnerd Differential Revision: https://reviews.llvm.org/D83925 --- lld/MachO/Config.h | 1 - lld/MachO/Driver.cpp | 38 ++++++++++++++++++++++++++++++++++++++ lld/test/MachO/framework.s | 29 +++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 lld/test/MachO/framework.s diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h index 79812a4335638..c66991b581fc0 100644 --- a/lld/MachO/Config.h +++ b/lld/MachO/Config.h @@ -30,7 +30,6 @@ struct Configuration { llvm::MachO::Architecture arch; llvm::MachO::HeaderFileType outputType; std::vector librarySearchPaths; - // TODO: use the framework search paths std::vector frameworkSearchPaths; llvm::DenseMap priorities; }; diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index 4dfb387e4e622..ee794129e1fc6 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -30,6 +30,7 @@ #include "llvm/Object/Archive.h" #include "llvm/Option/ArgList.h" #include "llvm/Option/Option.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/Host.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" @@ -98,6 +99,32 @@ static Optional findLibrary(StringRef name) { return {}; } +static Optional findFramework(StringRef name) { + // TODO: support .tbd files + llvm::SmallString<260> symlink; + llvm::SmallString<260> location; + StringRef suffix; + std::tie(name, suffix) = name.split(","); + for (StringRef dir : config->frameworkSearchPaths) { + symlink = dir; + path::append(symlink, name + ".framework", name); + // If the symlink fails to resolve, skip to the next search path. + // NOTE: we must resolve the symlink before trying the suffixes, because + // there are no symlinks for the suffixed paths. + if (fs::real_path(symlink, location)) + continue; + if (!suffix.empty()) { + llvm::Twine suffixed = location + suffix; + if (fs::exists(suffixed)) + return suffixed.str(); + // Suffix lookup failed, fall through to the no-suffix case. 
+ } + if (fs::exists(location)) + return location.str().str(); + } + return {}; +} + static TargetInfo *createTargetInfo(opt::InputArgList &args) { StringRef arch = args.getLastArgValue(OPT_arch, "x86_64"); config->arch = llvm::MachO::getArchitectureFromName( @@ -393,13 +420,24 @@ bool macho::link(llvm::ArrayRef argsArr, bool canExitEarly, error("library not found for -l" + name); break; } + case OPT_framework: { + StringRef name = arg->getValue(); + if (Optional path = findFramework(name)) { + addFile(*path); + break; + } + error("framework not found for -framework " + name); + break; + } case OPT_platform_version: handlePlatformVersion(arg); break; case OPT_o: case OPT_dylib: case OPT_e: + case OPT_F: case OPT_L: + case OPT_install_name: case OPT_Z: case OPT_arch: // handled elsewhere diff --git a/lld/test/MachO/framework.s b/lld/test/MachO/framework.s new file mode 100644 index 0000000000000..a527970ed6bc9 --- /dev/null +++ b/lld/test/MachO/framework.s @@ -0,0 +1,29 @@ +# REQUIRES: x86, shell +# RUN: mkdir -p %t +# RUN: echo ".globl _foo; _foo: ret" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/foo.o +# RUN: mkdir -p %t/Foo.framework/Versions/A +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -dylib -install_name %t/Foo.framework/Versions/A/Foo %t/foo.o -o %t/Foo.framework/Versions/A/Foo +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -dylib -install_name %t/Foo.framework/Versions/A/Foobar %t/foo.o -o %t/Foo.framework/Versions/A/Foobar +# RUN: ln -sf %t/Foo.framework/Versions/A %t/Foo.framework/Versions/Current +# RUN: ln -sf %t/Foo.framework/Versions/Current/Foo %t/Foo.framework/Foo + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/test.o %s +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -F%t -framework Foo %t/test.o -o %t/test +# RUN: llvm-objdump --macho --lazy-bind %t/test | FileCheck %s --check-prefix=NOSUFFIX +# NOSUFFIX: __DATA __la_symbol_ptr 0x{{[0-9a-f]*}} {{.*}}Foo _foo + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -F%t -framework Foo,baz %t/test.o -o %t/test-wrong-suffix +# RUN: llvm-objdump --macho --lazy-bind %t/test-wrong-suffix | FileCheck %s --check-prefix=NOSUFFIX + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -F%t -framework Foo,bar %t/test.o -o %t/test-suffix +# RUN: llvm-objdump --macho --lazy-bind %t/test-suffix | FileCheck %s --check-prefix=SUFFIX +# SUFFIX: __DATA __la_symbol_ptr 0x{{[0-9a-f]*}} {{.*}}Foobar _foo + +.globl _main +.text +_main: + sub $8, %rsp # 16-byte-align the stack; dyld checks for this + callq _foo + mov $0, %rax + add $8, %rsp + ret From 8dc820393219c7ee440b4ec86c9a201301943276 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 25 Jul 2020 15:01:48 +0300 Subject: [PATCH 0150/1035] [LLD] [COFF] Fix test to properly test all aspects of c3b1d730d6. NFC. Previously, the test could pass with one part of c3b1d730d6 removed.
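The detail behind the fix (a hedged summary): i386 COFF decorates C-level symbols with a leading underscore, and the test defines _foo, so the COMDAT symbol named in the .section directive must be the decorated name as well. The corrected directive, taken from the diff below:

  .section .text$foo,"xr",discard,_foo   # leader symbol is '_foo' on i386, not 'foo'

With the undecorated 'foo' the directive did not name the symbol that the association logic actually looks up, which is presumably why part of c3b1d730d6 could be removed without the test failing.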
--- lld/test/COFF/associative-comdat-mingw-i386.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/test/COFF/associative-comdat-mingw-i386.s b/lld/test/COFF/associative-comdat-mingw-i386.s index 3f5e02330d505..8d89478d4eb03 100644 --- a/lld/test/COFF/associative-comdat-mingw-i386.s +++ b/lld/test/COFF/associative-comdat-mingw-i386.s @@ -30,7 +30,7 @@ _main: .scl 2; .type 32; .endef - .section .text$foo,"xr",discard,foo + .section .text$foo,"xr",discard,_foo .globl _foo .p2align 4 _foo: From ff25b2da2ab9049e154cc8b9af06a24f79a74209 Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Sun, 26 Jul 2020 20:00:49 -0400 Subject: [PATCH 0151/1035] [flang][openacc] Basic name resolution infrastructure for OpenACC construct Reviewed By: tskeith, klausler, ichoyjx Differential Revision: https://reviews.llvm.org/D83998 --- flang/include/flang/Semantics/symbol.h | 7 + flang/lib/Semantics/resolve-names.cpp | 749 +++++++++++++++---- flang/lib/Semantics/unparse-with-symbols.cpp | 5 + flang/test/Semantics/acc-resolve01.f90 | 22 + flang/test/Semantics/acc-resolve02.f90 | 17 + flang/test/Semantics/acc-symbols01.f90 | 26 + flang/test/Semantics/test_symbols.sh | 5 +- 7 files changed, 689 insertions(+), 142 deletions(-) create mode 100644 flang/test/Semantics/acc-resolve01.f90 create mode 100644 flang/test/Semantics/acc-resolve02.f90 create mode 100644 flang/test/Semantics/acc-symbols01.f90 diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h index a1fd1baef78d3..3000a39c3b589 100644 --- a/flang/include/flang/Semantics/symbol.h +++ b/flang/include/flang/Semantics/symbol.h @@ -492,6 +492,13 @@ class Symbol { LocalityShared, // named in SHARED locality-spec InDataStmt, // initialized in a DATA statement + // OpenACC data-sharing attribute + AccPrivate, AccFirstPrivate, AccShared, + // OpenACC data-mapping attribute + AccCopyIn, AccCopyOut, AccCreate, AccDelete, AccPresent, + // OpenACC miscellaneous flags + AccCommonBlock, AccThreadPrivate, AccReduction, AccNone, AccPreDetermined, + // OpenMP data-sharing attribute OmpShared, OmpPrivate, OmpLinear, OmpFirstPrivate, OmpLastPrivate, // OpenMP data-mapping attribute diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 8ea2e8ed9a60a..8d3e97d20521d 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -7,6 +7,7 @@ #include "resolve-names.h" #include "assignment.h" +#include "check-acc-structure.h" #include "check-omp-structure.h" #include "mod-file.h" #include "pointer-assignment.h" @@ -1081,6 +1082,305 @@ class ConstructVisitor : public virtual DeclarationVisitor { void PopAssociation(); }; +template class DirectiveAttributeVisitor { +public: + explicit DirectiveAttributeVisitor( + SemanticsContext &context, ResolveNamesVisitor &resolver) + : context_{context}, resolver_{resolver} {} + +protected: + struct DirContext { + DirContext(const parser::CharBlock &source, T d, Scope &s) + : directiveSource{source}, directive{d}, scope{s} {} + parser::CharBlock directiveSource; + T directive; + Scope &scope; + Symbol::Flag defaultDSA{Symbol::Flag::AccShared}; // TODOACC + std::map objectWithDSA; + bool withinConstruct{false}; + int64_t associatedLoopLevel{0}; + }; + + DirContext &GetContext() { + CHECK(!dirContext_.empty()); + return dirContext_.back(); + } + void PushContext(const parser::CharBlock &source, T dir) { + dirContext_.emplace_back(source, dir, context_.FindScope(source)); + } + void PopContext() { 
dirContext_.pop_back(); } + void SetContextDirectiveSource(parser::CharBlock &dir) { + GetContext().directiveSource = dir; + } + void SetContextDirectiveEnum(T dir) { GetContext().directive = dir; } + Scope &currScope() { return GetContext().scope; } + void SetContextDefaultDSA(Symbol::Flag flag) { + GetContext().defaultDSA = flag; + } + void AddToContextObjectWithDSA( + const Symbol &symbol, Symbol::Flag flag, DirContext &context) { + context.objectWithDSA.emplace(&symbol, flag); + } + void AddToContextObjectWithDSA(const Symbol &symbol, Symbol::Flag flag) { + AddToContextObjectWithDSA(symbol, flag, GetContext()); + } + bool IsObjectWithDSA(const Symbol &symbol) { + auto it{GetContext().objectWithDSA.find(&symbol)}; + return it != GetContext().objectWithDSA.end(); + } + void SetContextAssociatedLoopLevel(int64_t level) { + GetContext().associatedLoopLevel = level; + } + Symbol &MakeAssocSymbol(const SourceName &name, Symbol &prev, Scope &scope) { + const auto pair{scope.try_emplace(name, Attrs{}, HostAssocDetails{prev})}; + return *pair.first->second; + } + Symbol &MakeAssocSymbol(const SourceName &name, Symbol &prev) { + return MakeAssocSymbol(name, prev, currScope()); + } + static const parser::Name *GetDesignatorNameIfDataRef( + const parser::Designator &designator) { + const auto *dataRef{std::get_if(&designator.u)}; + return dataRef ? std::get_if(&dataRef->u) : nullptr; + } + void AddDataSharingAttributeObject(SymbolRef object) { + dataSharingAttributeObjects_.insert(object); + } + void ClearDataSharingAttributeObjects() { + dataSharingAttributeObjects_.clear(); + } + bool HasDataSharingAttributeObject(const Symbol &); + const parser::Name &GetLoopIndex(const parser::DoConstruct &); + const parser::DoConstruct *GetDoConstructIf( + const parser::ExecutionPartConstruct &); + Symbol *DeclarePrivateAccessEntity( + const parser::Name &, Symbol::Flag, Scope &); + Symbol *DeclarePrivateAccessEntity(Symbol &, Symbol::Flag, Scope &); + Symbol *DeclareOrMarkOtherAccessEntity(const parser::Name &, Symbol::Flag); + + SymbolSet dataSharingAttributeObjects_; // on one directive + SemanticsContext &context_; + ResolveNamesVisitor &resolver_; + std::vector dirContext_; // used as a stack +}; + +template +bool DirectiveAttributeVisitor::HasDataSharingAttributeObject( + const Symbol &object) { + auto it{dataSharingAttributeObjects_.find(object)}; + return it != dataSharingAttributeObjects_.end(); +} + +template +const parser::Name &DirectiveAttributeVisitor::GetLoopIndex( + const parser::DoConstruct &x) { + auto &loopControl{x.GetLoopControl().value()}; + using Bounds = parser::LoopControl::Bounds; + const Bounds &bounds{std::get(loopControl.u)}; + return bounds.name.thing; +} + +template +const parser::DoConstruct *DirectiveAttributeVisitor::GetDoConstructIf( + const parser::ExecutionPartConstruct &x) { + return parser::Unwrap(x); +} + +template +Symbol *DirectiveAttributeVisitor::DeclarePrivateAccessEntity( + const parser::Name &name, Symbol::Flag flag, Scope &scope) { + if (!name.symbol) { + return nullptr; // not resolved by Name Resolution step, do nothing + } + name.symbol = DeclarePrivateAccessEntity(*name.symbol, flag, scope); + return name.symbol; +} + +template +Symbol *DirectiveAttributeVisitor::DeclarePrivateAccessEntity( + Symbol &object, Symbol::Flag flag, Scope &scope) { + if (object.owner() != currScope()) { + auto &symbol{MakeAssocSymbol(object.name(), object, scope)}; + symbol.set(flag); + return &symbol; + } else { + object.set(flag); + return &object; + } +} + +// Create scopes for 
OpenACC constructs +class AccVisitor : public virtual DeclarationVisitor { +public: + void AddAccSourceRange(const parser::CharBlock &); + + static bool NeedsScope(const parser::OpenACCBlockConstruct &); + + bool Pre(const parser::OpenACCBlockConstruct &); + void Post(const parser::OpenACCBlockConstruct &); + bool Pre(const parser::AccBeginBlockDirective &x) { + AddAccSourceRange(x.source); + return true; + } + void Post(const parser::AccBeginBlockDirective &) { + messageHandler().set_currStmtSource(std::nullopt); + } + bool Pre(const parser::AccEndBlockDirective &x) { + AddAccSourceRange(x.source); + return true; + } + void Post(const parser::AccEndBlockDirective &) { + messageHandler().set_currStmtSource(std::nullopt); + } + bool Pre(const parser::AccBeginLoopDirective &x) { + AddAccSourceRange(x.source); + return true; + } + void Post(const parser::AccBeginLoopDirective &x) { + messageHandler().set_currStmtSource(std::nullopt); + } +}; + +bool AccVisitor::NeedsScope(const parser::OpenACCBlockConstruct &x) { + const auto &beginBlockDir{std::get(x.t)}; + const auto &beginDir{std::get(beginBlockDir.t)}; + switch (beginDir.v) { + case llvm::acc::Directive::ACCD_data: + case llvm::acc::Directive::ACCD_host_data: + case llvm::acc::Directive::ACCD_kernels: + case llvm::acc::Directive::ACCD_parallel: + case llvm::acc::Directive::ACCD_serial: + return true; + default: + return false; + } +} + +void AccVisitor::AddAccSourceRange(const parser::CharBlock &source) { + messageHandler().set_currStmtSource(source); + currScope().AddSourceRange(source); +} + +bool AccVisitor::Pre(const parser::OpenACCBlockConstruct &x) { + if (NeedsScope(x)) { + PushScope(Scope::Kind::Block, nullptr); + } + return true; +} + +void AccVisitor::Post(const parser::OpenACCBlockConstruct &x) { + if (NeedsScope(x)) { + PopScope(); + } +} + +class AccAttributeVisitor : DirectiveAttributeVisitor { +public: + explicit AccAttributeVisitor( + SemanticsContext &context, ResolveNamesVisitor &resolver) + : DirectiveAttributeVisitor(context, resolver) {} + + template void Walk(const A &x) { parser::Walk(x, *this); } + template bool Pre(const A &) { return true; } + template void Post(const A &) {} + + bool Pre(const parser::SpecificationPart &x) { + Walk(std::get>(x.t)); + return false; + } + + bool Pre(const parser::OpenACCBlockConstruct &); + void Post(const parser::OpenACCBlockConstruct &) { PopContext(); } + bool Pre(const parser::OpenACCCombinedConstruct &); + void Post(const parser::OpenACCCombinedConstruct &) { PopContext(); } + + void Post(const parser::AccBeginBlockDirective &) { + GetContext().withinConstruct = true; + } + + bool Pre(const parser::OpenACCLoopConstruct &); + void Post(const parser::OpenACCLoopConstruct &) { PopContext(); } + void Post(const parser::AccLoopDirective &) { + GetContext().withinConstruct = true; + } + + bool Pre(const parser::OpenACCStandaloneConstruct &); + void Post(const parser::OpenACCStandaloneConstruct &) { PopContext(); } + void Post(const parser::AccStandaloneDirective &) { + GetContext().withinConstruct = true; + } + + void Post(const parser::AccDefaultClause &); + + bool Pre(const parser::AccClause::Copy &x) { + ResolveAccObjectList(x.v, Symbol::Flag::AccCopyIn); + ResolveAccObjectList(x.v, Symbol::Flag::AccCopyOut); + return false; + } + + bool Pre(const parser::AccClause::Create &x) { + const auto &objectList{std::get(x.v.t)}; + ResolveAccObjectList(objectList, Symbol::Flag::AccCreate); + return false; + } + + bool Pre(const parser::AccClause::Copyin &x) { + const auto 
&objectList{std::get(x.v.t)}; + ResolveAccObjectList(objectList, Symbol::Flag::AccCopyIn); + return false; + } + + bool Pre(const parser::AccClause::Copyout &x) { + const auto &objectList{std::get(x.v.t)}; + ResolveAccObjectList(objectList, Symbol::Flag::AccCopyOut); + return false; + } + + bool Pre(const parser::AccClause::Present &x) { + ResolveAccObjectList(x.v, Symbol::Flag::AccPresent); + return false; + } + bool Pre(const parser::AccClause::Private &x) { + ResolveAccObjectList(x.v, Symbol::Flag::AccPrivate); + return false; + } + bool Pre(const parser::AccClause::FirstPrivate &x) { + ResolveAccObjectList(x.v, Symbol::Flag::AccFirstPrivate); + return false; + } + + void Post(const parser::Name &); + +private: + int64_t GetAssociatedLoopLevelFromClauses(const parser::AccClauseList &); + + static constexpr Symbol::Flags dataSharingAttributeFlags{ + Symbol::Flag::AccShared, Symbol::Flag::AccPrivate, + Symbol::Flag::AccPresent, Symbol::Flag::AccFirstPrivate, + Symbol::Flag::AccReduction}; + + static constexpr Symbol::Flags dataMappingAttributeFlags{ + Symbol::Flag::AccCreate, Symbol::Flag::AccCopyIn, + Symbol::Flag::AccCopyOut, Symbol::Flag::AccDelete}; + + static constexpr Symbol::Flags accFlagsRequireNewSymbol{ + Symbol::Flag::AccPrivate, Symbol::Flag::AccFirstPrivate, + Symbol::Flag::AccReduction}; + + static constexpr Symbol::Flags accFlagsRequireMark{}; + + void PrivatizeAssociatedLoopIndex(const parser::OpenACCLoopConstruct &); + void ResolveAccObjectList(const parser::AccObjectList &, Symbol::Flag); + void ResolveAccObject(const parser::AccObject &, Symbol::Flag); + Symbol *ResolveAcc(const parser::Name &, Symbol::Flag, Scope &); + Symbol *ResolveAcc(Symbol &, Symbol::Flag, Scope &); + Symbol *ResolveAccCommonBlockName(const parser::Name *); + Symbol *DeclareOrMarkOtherAccessEntity(const parser::Name &, Symbol::Flag); + Symbol *DeclareOrMarkOtherAccessEntity(Symbol &, Symbol::Flag); + void CheckMultipleAppearances( + const parser::Name &, const Symbol &, Symbol::Flag); +}; + // Create scopes for OpenMP constructs class OmpVisitor : public virtual DeclarationVisitor { public: @@ -1178,11 +1478,11 @@ void OmpVisitor::Post(const parser::OpenMPBlockConstruct &x) { } // Data-sharing and Data-mapping attributes for data-refs in OpenMP construct -class OmpAttributeVisitor { +class OmpAttributeVisitor : DirectiveAttributeVisitor { public: explicit OmpAttributeVisitor( SemanticsContext &context, ResolveNamesVisitor &resolver) - : context_{context}, resolver_{resolver} {} + : DirectiveAttributeVisitor(context, resolver) {} template void Walk(const A &x) { parser::Walk(x, *this); } @@ -1235,70 +1535,8 @@ class OmpAttributeVisitor { void Post(const parser::Name &); private: - struct OmpContext { - OmpContext( - const parser::CharBlock &source, llvm::omp::Directive d, Scope &s) - : directiveSource{source}, directive{d}, scope{s} {} - parser::CharBlock directiveSource; - llvm::omp::Directive directive; - Scope &scope; - // TODO: default DSA is implicitly determined in different ways - Symbol::Flag defaultDSA{Symbol::Flag::OmpShared}; - // variables on Data-sharing attribute clauses - std::map objectWithDSA; - bool withinConstruct{false}; - std::int64_t associatedLoopLevel{0}; - }; - // back() is the top of the stack - OmpContext &GetContext() { - CHECK(!ompContext_.empty()); - return ompContext_.back(); - } - void PushContext(const parser::CharBlock &source, llvm::omp::Directive dir) { - ompContext_.emplace_back(source, dir, context_.FindScope(source)); - } - void PopContext() { 
ompContext_.pop_back(); } - void SetContextDirectiveSource(parser::CharBlock &dir) { - GetContext().directiveSource = dir; - } - void SetContextDirectiveEnum(llvm::omp::Directive dir) { - GetContext().directive = dir; - } - Scope &currScope() { return GetContext().scope; } - void SetContextDefaultDSA(Symbol::Flag flag) { - GetContext().defaultDSA = flag; - } - void AddToContextObjectWithDSA( - const Symbol &symbol, Symbol::Flag flag, OmpContext &context) { - context.objectWithDSA.emplace(&symbol, flag); - } - void AddToContextObjectWithDSA(const Symbol &symbol, Symbol::Flag flag) { - AddToContextObjectWithDSA(symbol, flag, GetContext()); - } - bool IsObjectWithDSA(const Symbol &symbol) { - auto it{GetContext().objectWithDSA.find(&symbol)}; - return it != GetContext().objectWithDSA.end(); - } - - void SetContextAssociatedLoopLevel(std::int64_t level) { - GetContext().associatedLoopLevel = level; - } std::int64_t GetAssociatedLoopLevelFromClauses(const parser::OmpClauseList &); - Symbol &MakeAssocSymbol(const SourceName &name, Symbol &prev, Scope &scope) { - const auto pair{scope.try_emplace(name, Attrs{}, HostAssocDetails{prev})}; - return *pair.first->second; - } - Symbol &MakeAssocSymbol(const SourceName &name, Symbol &prev) { - return MakeAssocSymbol(name, prev, currScope()); - } - - static const parser::Name *GetDesignatorNameIfDataRef( - const parser::Designator &designator) { - const auto *dataRef{std::get_if(&designator.u)}; - return dataRef ? std::get_if(&dataRef->u) : nullptr; - } - static constexpr Symbol::Flags dataSharingAttributeFlags{ Symbol::Flag::OmpShared, Symbol::Flag::OmpPrivate, Symbol::Flag::OmpFirstPrivate, Symbol::Flag::OmpLastPrivate, @@ -1312,19 +1550,8 @@ class OmpAttributeVisitor { static constexpr Symbol::Flags ompFlagsRequireMark{ Symbol::Flag::OmpThreadprivate}; - void AddDataSharingAttributeObject(SymbolRef object) { - dataSharingAttributeObjects_.insert(object); - } - void ClearDataSharingAttributeObjects() { - dataSharingAttributeObjects_.clear(); - } - bool HasDataSharingAttributeObject(const Symbol &); - - const parser::DoConstruct *GetDoConstructIf( - const parser::ExecutionPartConstruct &); // Predetermined DSA rules void PrivatizeAssociatedLoopIndex(const parser::OpenMPLoopConstruct &); - const parser::Name &GetLoopIndex(const parser::DoConstruct &); void ResolveSeqLoopIndexInParallelOrTaskConstruct(const parser::Name &); void ResolveOmpObjectList(const parser::OmpObjectList &, Symbol::Flag); @@ -1332,18 +1559,10 @@ class OmpAttributeVisitor { Symbol *ResolveOmp(const parser::Name &, Symbol::Flag, Scope &); Symbol *ResolveOmp(Symbol &, Symbol::Flag, Scope &); Symbol *ResolveOmpCommonBlockName(const parser::Name *); - Symbol *DeclarePrivateAccessEntity( - const parser::Name &, Symbol::Flag, Scope &); - Symbol *DeclarePrivateAccessEntity(Symbol &, Symbol::Flag, Scope &); Symbol *DeclareOrMarkOtherAccessEntity(const parser::Name &, Symbol::Flag); Symbol *DeclareOrMarkOtherAccessEntity(Symbol &, Symbol::Flag); void CheckMultipleAppearances( const parser::Name &, const Symbol &, Symbol::Flag); - SymbolSet dataSharingAttributeObjects_; // on one directive - - SemanticsContext &context_; - ResolveNamesVisitor &resolver_; - std::vector ompContext_; // used as a stack }; // Walk the parse tree and resolve names to symbols. 
@@ -1351,8 +1570,11 @@ class ResolveNamesVisitor : public virtual ScopeHandler, public ModuleVisitor, public SubprogramVisitor, public ConstructVisitor, - public OmpVisitor { + public OmpVisitor, + public AccVisitor { public: + using AccVisitor::Post; + using AccVisitor::Pre; using ArraySpecVisitor::Post; using ConstructVisitor::Post; using ConstructVisitor::Pre; @@ -1450,6 +1672,7 @@ class ResolveNamesVisitor : public virtual ScopeHandler, void FinishSpecificationParts(const ProgramTree &); void FinishDerivedTypeInstantiation(Scope &); void ResolveExecutionParts(const ProgramTree &); + void ResolveAccParts(const parser::ProgramUnit &); void ResolveOmpParts(const parser::ProgramUnit &); }; @@ -6275,7 +6498,12 @@ bool ResolveNamesVisitor::Pre(const parser::ProgramUnit &x) { inExecutionPart_ = true; ResolveExecutionParts(root); inExecutionPart_ = false; - ResolveOmpParts(x); + if (context().IsEnabled(common::LanguageFeature::OpenACC)) { + ResolveAccParts(x); + } + if (context().IsEnabled(common::LanguageFeature::OpenMP)) { + ResolveOmpParts(x); + } return false; } @@ -6468,6 +6696,287 @@ class DeferredCheckVisitor { bool pushedScope_{false}; }; +bool AccAttributeVisitor::Pre(const parser::OpenACCBlockConstruct &x) { + const auto &beginBlockDir{std::get(x.t)}; + const auto &blockDir{std::get(beginBlockDir.t)}; + switch (blockDir.v) { + case llvm::acc::Directive::ACCD_data: + case llvm::acc::Directive::ACCD_host_data: + case llvm::acc::Directive::ACCD_kernels: + case llvm::acc::Directive::ACCD_parallel: + case llvm::acc::Directive::ACCD_serial: + PushContext(blockDir.source, blockDir.v); + break; + default: + break; + } + ClearDataSharingAttributeObjects(); + return true; +} + +bool AccAttributeVisitor::Pre(const parser::OpenACCLoopConstruct &x) { + const auto &beginDir{std::get(x.t)}; + const auto &loopDir{std::get(beginDir.t)}; + const auto &clauseList{std::get(beginDir.t)}; + if (loopDir.v == llvm::acc::Directive::ACCD_loop) { + PushContext(loopDir.source, loopDir.v); + } + ClearDataSharingAttributeObjects(); + SetContextAssociatedLoopLevel(GetAssociatedLoopLevelFromClauses(clauseList)); + PrivatizeAssociatedLoopIndex(x); + return true; +} + +bool AccAttributeVisitor::Pre(const parser::OpenACCStandaloneConstruct &x) { + const auto &standaloneDir{std::get(x.t)}; + switch (standaloneDir.v) { + case llvm::acc::Directive::ACCD_cache: + case llvm::acc::Directive::ACCD_enter_data: + case llvm::acc::Directive::ACCD_exit_data: + case llvm::acc::Directive::ACCD_init: + case llvm::acc::Directive::ACCD_set: + case llvm::acc::Directive::ACCD_shutdown: + case llvm::acc::Directive::ACCD_update: + PushContext(standaloneDir.source, standaloneDir.v); + break; + default: + break; + } + ClearDataSharingAttributeObjects(); + return true; +} + +bool AccAttributeVisitor::Pre(const parser::OpenACCCombinedConstruct &x) { + const auto &beginBlockDir{std::get(x.t)}; + const auto &combinedDir{ + std::get(beginBlockDir.t)}; + switch (combinedDir.v) { + case llvm::acc::Directive::ACCD_kernels_loop: + case llvm::acc::Directive::ACCD_parallel_loop: + case llvm::acc::Directive::ACCD_serial_loop: + PushContext(combinedDir.source, combinedDir.v); + break; + default: + break; + } + ClearDataSharingAttributeObjects(); + return true; +} + +int64_t AccAttributeVisitor::GetAssociatedLoopLevelFromClauses( + const parser::AccClauseList &x) { + int64_t collapseLevel{0}; + for (const auto &clause : x.v) { + if (const auto *collapseClause{ + std::get_if(&clause.u)}) { + if (const auto v{evaluate::ToInt64( + 
resolver_.EvaluateIntExpr(collapseClause->v))}) { + collapseLevel = *v; + } + } + } + + if (collapseLevel) { + return collapseLevel; + } + return 1; // default is outermost loop +} + +void AccAttributeVisitor::PrivatizeAssociatedLoopIndex( + const parser::OpenACCLoopConstruct &x) { + int64_t level{GetContext().associatedLoopLevel}; + if (level <= 0) { // collpase value was negative or 0 + return; + } + Symbol::Flag ivDSA{Symbol::Flag::AccPrivate}; + + const auto &outer{std::get>(x.t)}; + for (const parser::DoConstruct *loop{&*outer}; loop && level > 0; --level) { + // go through all the nested do-loops and resolve index variables + const parser::Name &iv{GetLoopIndex(*loop)}; + if (auto *symbol{ResolveAcc(iv, ivDSA, currScope())}) { + symbol->set(Symbol::Flag::AccPreDetermined); + iv.symbol = symbol; // adjust the symbol within region + AddToContextObjectWithDSA(*symbol, ivDSA); + } + + const auto &block{std::get(loop->t)}; + const auto it{block.begin()}; + loop = it != block.end() ? GetDoConstructIf(*it) : nullptr; + } + CHECK(level == 0); +} + +void AccAttributeVisitor::Post(const parser::AccDefaultClause &x) { + if (!dirContext_.empty()) { + switch (x.v) { + case parser::AccDefaultClause::Arg::Present: + SetContextDefaultDSA(Symbol::Flag::AccPresent); + break; + case parser::AccDefaultClause::Arg::None: + SetContextDefaultDSA(Symbol::Flag::AccNone); + break; + } + } +} + +// For OpenACC constructs, check all the data-refs within the constructs +// and adjust the symbol for each Name if necessary +void AccAttributeVisitor::Post(const parser::Name &name) { + auto *symbol{name.symbol}; + if (symbol && !dirContext_.empty() && GetContext().withinConstruct) { + if (!symbol->owner().IsDerivedType() && !symbol->has() && + !IsObjectWithDSA(*symbol)) { + if (Symbol * found{currScope().FindSymbol(name.source)}) { + if (symbol != found) { + name.symbol = found; // adjust the symbol within region + } else if (GetContext().defaultDSA == Symbol::Flag::AccNone) { + // 2.5.14. 
+ context_.Say(name.source, + "The DEFAULT(NONE) clause requires that '%s' must be listed in " + "a data-mapping clause"_err_en_US, + symbol->name()); + } + } + } + } // within OpenACC construct +} + +Symbol *AccAttributeVisitor::ResolveAccCommonBlockName( + const parser::Name *name) { + if (!name) { + return nullptr; + } else if (auto *prev{ + GetContext().scope.parent().FindCommonBlock(name->source)}) { + name->symbol = prev; + return prev; + } else { + return nullptr; + } +} + +void AccAttributeVisitor::ResolveAccObjectList( + const parser::AccObjectList &accObjectList, Symbol::Flag accFlag) { + for (const auto &accObject : accObjectList.v) { + ResolveAccObject(accObject, accFlag); + } +} + +void AccAttributeVisitor::ResolveAccObject( + const parser::AccObject &accObject, Symbol::Flag accFlag) { + std::visit( + common::visitors{ + [&](const parser::Designator &designator) { + if (const auto *name{GetDesignatorNameIfDataRef(designator)}) { + if (auto *symbol{ResolveAcc(*name, accFlag, currScope())}) { + AddToContextObjectWithDSA(*symbol, accFlag); + if (dataSharingAttributeFlags.test(accFlag)) { + CheckMultipleAppearances(*name, *symbol, accFlag); + } + } + } else if (const auto *designatorName{ + resolver_.ResolveDesignator(designator)}; + designatorName->symbol) { + // Array sections to be changed to substrings as needed + if (AnalyzeExpr(context_, designator)) { + if (std::holds_alternative(designator.u)) { + context_.Say(designator.source, + "Substrings are not allowed on OpenACC " + "directives or clauses"_err_en_US); + } + } + // other checks, more TBD + if (const auto *details{designatorName->symbol + ->detailsIf()}) { + if (details->IsArray()) { + // TODO: check Array Sections + } else if (designatorName->symbol->owner().IsDerivedType()) { + // TODO: check Structure Component + } + } + } + }, + [&](const parser::Name &name) { // common block + if (auto *symbol{ResolveAccCommonBlockName(&name)}) { + CheckMultipleAppearances( + name, *symbol, Symbol::Flag::AccCommonBlock); + for (auto &object : symbol->get().objects()) { + if (auto *resolvedObject{ + ResolveAcc(*object, accFlag, currScope())}) { + AddToContextObjectWithDSA(*resolvedObject, accFlag); + } + } + } else { + context_.Say(name.source, + "COMMON block must be declared in the same scoping unit " + "in which the OpenACC directive or clause appears"_err_en_US); + } + }, + }, + accObject.u); +} + +Symbol *AccAttributeVisitor::ResolveAcc( + const parser::Name &name, Symbol::Flag accFlag, Scope &scope) { + if (accFlagsRequireNewSymbol.test(accFlag)) { + return DeclarePrivateAccessEntity(name, accFlag, scope); + } else { + return DeclareOrMarkOtherAccessEntity(name, accFlag); + } +} + +Symbol *AccAttributeVisitor::ResolveAcc( + Symbol &symbol, Symbol::Flag accFlag, Scope &scope) { + if (accFlagsRequireNewSymbol.test(accFlag)) { + return DeclarePrivateAccessEntity(symbol, accFlag, scope); + } else { + return DeclareOrMarkOtherAccessEntity(symbol, accFlag); + } +} + +Symbol *AccAttributeVisitor::DeclareOrMarkOtherAccessEntity( + const parser::Name &name, Symbol::Flag accFlag) { + Symbol *prev{currScope().FindSymbol(name.source)}; + if (!name.symbol || !prev) { + return nullptr; + } else if (prev != name.symbol) { + name.symbol = prev; + } + return DeclareOrMarkOtherAccessEntity(*prev, accFlag); +} + +Symbol *AccAttributeVisitor::DeclareOrMarkOtherAccessEntity( + Symbol &object, Symbol::Flag accFlag) { + if (accFlagsRequireMark.test(accFlag)) { + object.set(accFlag); + } + return &object; +} + +static bool 
WithMultipleAppearancesAccException( + const Symbol &symbol, Symbol::Flag flag) { + return false; // Place holder +} + +void AccAttributeVisitor::CheckMultipleAppearances( + const parser::Name &name, const Symbol &symbol, Symbol::Flag accFlag) { + const auto *target{&symbol}; + if (accFlagsRequireNewSymbol.test(accFlag)) { + if (const auto *details{symbol.detailsIf()}) { + target = &details->symbol(); + } + } + if (HasDataSharingAttributeObject(*target) && + !WithMultipleAppearancesAccException(symbol, accFlag)) { + context_.Say(name.source, + "'%s' appears in more than one data-sharing clause " + "on the same OpenACC directive"_err_en_US, + name.ToString()); + } else { + AddDataSharingAttributeObject(*target); + } +} + bool OmpAttributeVisitor::Pre(const parser::OpenMPBlockConstruct &x) { const auto &beginBlockDir{std::get(x.t)}; const auto &beginDir{std::get(beginBlockDir.t)}; @@ -6532,19 +7041,11 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPLoopConstruct &x) { return true; } -const parser::Name &OmpAttributeVisitor::GetLoopIndex( - const parser::DoConstruct &x) { - auto &loopControl{x.GetLoopControl().value()}; - using Bounds = parser::LoopControl::Bounds; - const Bounds &bounds{std::get(loopControl.u)}; - return bounds.name.thing; -} - void OmpAttributeVisitor::ResolveSeqLoopIndexInParallelOrTaskConstruct( const parser::Name &iv) { - auto targetIt{ompContext_.rbegin()}; + auto targetIt{dirContext_.rbegin()}; for (;; ++targetIt) { - if (targetIt == ompContext_.rend()) { + if (targetIt == dirContext_.rend()) { return; } if (llvm::omp::parallelSet.test(targetIt->directive) || @@ -6556,7 +7057,7 @@ void OmpAttributeVisitor::ResolveSeqLoopIndexInParallelOrTaskConstruct( targetIt++; symbol->set(Symbol::Flag::OmpPreDetermined); iv.symbol = symbol; // adjust the symbol within region - for (auto it{ompContext_.rbegin()}; it != targetIt; ++it) { + for (auto it{dirContext_.rbegin()}; it != targetIt; ++it) { AddToContextObjectWithDSA(*symbol, Symbol::Flag::OmpPrivate, *it); } } @@ -6567,7 +7068,7 @@ void OmpAttributeVisitor::ResolveSeqLoopIndexInParallelOrTaskConstruct( // or task generating construct is private in the innermost such // construct that encloses the loop bool OmpAttributeVisitor::Pre(const parser::DoConstruct &x) { - if (!ompContext_.empty() && GetContext().withinConstruct) { + if (!dirContext_.empty() && GetContext().withinConstruct) { if (const auto &iv{GetLoopIndex(x)}; iv.symbol) { if (!iv.symbol->test(Symbol::Flag::OmpPreDetermined)) { ResolveSeqLoopIndexInParallelOrTaskConstruct(iv); @@ -6579,16 +7080,6 @@ bool OmpAttributeVisitor::Pre(const parser::DoConstruct &x) { return true; } -const parser::DoConstruct *OmpAttributeVisitor::GetDoConstructIf( - const parser::ExecutionPartConstruct &x) { - if (auto *y{std::get_if(&x.u)}) { - if (auto *z{std::get_if>(&y->u)}) { - return &z->value(); - } - } - return nullptr; -} - std::int64_t OmpAttributeVisitor::GetAssociatedLoopLevelFromClauses( const parser::OmpClauseList &x) { std::int64_t orderedLevel{0}; @@ -6685,7 +7176,7 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPThreadprivate &x) { } void OmpAttributeVisitor::Post(const parser::OmpDefaultClause &x) { - if (!ompContext_.empty()) { + if (!dirContext_.empty()) { switch (x.v) { case parser::OmpDefaultClause::Type::Private: SetContextDefaultDSA(Symbol::Flag::OmpPrivate); @@ -6707,7 +7198,7 @@ void OmpAttributeVisitor::Post(const parser::OmpDefaultClause &x) { // and adjust the symbol for each Name if necessary void OmpAttributeVisitor::Post(const parser::Name &name) { 
auto *symbol{name.symbol}; - if (symbol && !ompContext_.empty() && GetContext().withinConstruct) { + if (symbol && !dirContext_.empty() && GetContext().withinConstruct) { if (!symbol->owner().IsDerivedType() && !symbol->has() && !IsObjectWithDSA(*symbol)) { // TODO: create a separate function to go through the rules for @@ -6727,11 +7218,6 @@ void OmpAttributeVisitor::Post(const parser::Name &name) { } // within OpenMP construct } -bool OmpAttributeVisitor::HasDataSharingAttributeObject(const Symbol &object) { - auto it{dataSharingAttributeObjects_.find(object)}; - return it != dataSharingAttributeObjects_.end(); -} - Symbol *OmpAttributeVisitor::ResolveOmpCommonBlockName( const parser::Name *name) { if (auto *prev{name @@ -6826,27 +7312,6 @@ Symbol *OmpAttributeVisitor::ResolveOmp( } } -Symbol *OmpAttributeVisitor::DeclarePrivateAccessEntity( - const parser::Name &name, Symbol::Flag ompFlag, Scope &scope) { - if (!name.symbol) { - return nullptr; // not resolved by Name Resolution step, do nothing - } - name.symbol = DeclarePrivateAccessEntity(*name.symbol, ompFlag, scope); - return name.symbol; -} - -Symbol *OmpAttributeVisitor::DeclarePrivateAccessEntity( - Symbol &object, Symbol::Flag ompFlag, Scope &scope) { - if (object.owner() != currScope()) { - auto &symbol{MakeAssocSymbol(object.name(), object, scope)}; - symbol.set(ompFlag); - return &symbol; - } else { - object.set(ompFlag); - return &object; - } -} - Symbol *OmpAttributeVisitor::DeclareOrMarkOtherAccessEntity( const parser::Name &name, Symbol::Flag ompFlag) { Symbol *prev{currScope().FindSymbol(name.source)}; @@ -6866,11 +7331,11 @@ Symbol *OmpAttributeVisitor::DeclareOrMarkOtherAccessEntity( return &object; } -static bool WithMultipleAppearancesException( - const Symbol &symbol, Symbol::Flag ompFlag) { - return (ompFlag == Symbol::Flag::OmpFirstPrivate && +static bool WithMultipleAppearancesOmpException( + const Symbol &symbol, Symbol::Flag flag) { + return (flag == Symbol::Flag::OmpFirstPrivate && symbol.test(Symbol::Flag::OmpLastPrivate)) || - (ompFlag == Symbol::Flag::OmpLastPrivate && + (flag == Symbol::Flag::OmpLastPrivate && symbol.test(Symbol::Flag::OmpFirstPrivate)); } @@ -6883,7 +7348,7 @@ void OmpAttributeVisitor::CheckMultipleAppearances( } } if (HasDataSharingAttributeObject(*target) && - !WithMultipleAppearancesException(symbol, ompFlag)) { + !WithMultipleAppearancesOmpException(symbol, ompFlag)) { context_.Say(name.source, "'%s' appears in more than one data-sharing clause " "on the same OpenMP directive"_err_en_US, @@ -6962,6 +7427,10 @@ void ResolveNamesVisitor::ResolveExecutionParts(const ProgramTree &node) { } } +void ResolveNamesVisitor::ResolveAccParts(const parser::ProgramUnit &node) { + AccAttributeVisitor{context(), *this}.Walk(node); +} + void ResolveNamesVisitor::ResolveOmpParts(const parser::ProgramUnit &node) { OmpAttributeVisitor{context(), *this}.Walk(node); if (!context().AnyFatalError()) { diff --git a/flang/lib/Semantics/unparse-with-symbols.cpp b/flang/lib/Semantics/unparse-with-symbols.cpp index 44ceb9fe08bb6..67016e85777c7 100644 --- a/flang/lib/Semantics/unparse-with-symbols.cpp +++ b/flang/lib/Semantics/unparse-with-symbols.cpp @@ -35,6 +35,11 @@ class SymbolDumpVisitor { template void Post(const parser::Statement &) { currStmt_ = std::nullopt; } + bool Pre(const parser::AccClause &clause) { + currStmt_ = clause.source; + return true; + } + void Post(const parser::AccClause &) { currStmt_ = std::nullopt; } bool Pre(const parser::OmpClause &clause) { currStmt_ = clause.source; return true; 
diff --git a/flang/test/Semantics/acc-resolve01.f90 b/flang/test/Semantics/acc-resolve01.f90 new file mode 100644 index 0000000000000..7e904b525926b --- /dev/null +++ b/flang/test/Semantics/acc-resolve01.f90 @@ -0,0 +1,22 @@ +! RUN: %S/test_errors.sh %s %t %f18 -fopenacc + +! Data-Mapping Attribute Clauses +! 2.15.14 default Clause + +subroutine default_none() + integer a(3) + + A = 1 + B = 2 + !$acc parallel default(none) private(c) + !ERROR: The DEFAULT(NONE) clause requires that 'a' must be listed in a data-mapping clause + A(1:2) = 3 + !ERROR: The DEFAULT(NONE) clause requires that 'b' must be listed in a data-mapping clause + B = 4 + C = 5 + !$acc end parallel +end subroutine default_none + +program mm + call default_none() +end \ No newline at end of file diff --git a/flang/test/Semantics/acc-resolve02.f90 b/flang/test/Semantics/acc-resolve02.f90 new file mode 100644 index 0000000000000..da1a417bba52b --- /dev/null +++ b/flang/test/Semantics/acc-resolve02.f90 @@ -0,0 +1,17 @@ +! RUN: %S/test_errors.sh %s %t %f18 -fopenacc + +subroutine compute() + integer :: a(3), c, i + + a = 1 + !ERROR: 'c' appears in more than one data-sharing clause on the same OpenACC directive + !$acc parallel firstprivate(c) private(c) + do i = 1, 3 + a(i) = c + end do + !$acc end parallel +end subroutine compute + +program mm + call compute() +end diff --git a/flang/test/Semantics/acc-symbols01.f90 b/flang/test/Semantics/acc-symbols01.f90 new file mode 100644 index 0000000000000..23d54eb93fbef --- /dev/null +++ b/flang/test/Semantics/acc-symbols01.f90 @@ -0,0 +1,26 @@ +! RUN: %S/test_symbols.sh %s %t %f18 -fopenacc + +!DEF: /mm MainProgram +program mm + !DEF: /mm/x ObjectEntity REAL(4) + !DEF: /mm/y ObjectEntity REAL(4) + real x, y + !DEF: /mm/a ObjectEntity INTEGER(4) + !DEF: /mm/b ObjectEntity INTEGER(4) + !DEF: /mm/c ObjectEntity INTEGER(4) + !DEF: /mm/i ObjectEntity INTEGER(4) + integer a(10), b(10), c(10), i + !REF: /mm/b + b = 2 + !$acc parallel present(c) firstprivate(b) private(a) + !$acc loop + !DEF: /mm/Block1/i (AccPrivate, AccPreDetermined) HostAssoc INTEGER(4) + do i=1,10 + !DEF: /mm/Block1/a (AccPrivate) HostAssoc INTEGER(4) + !REF: /mm/Block1/i + !DEF: /mm/Block1/b (AccFirstPrivate) HostAssoc INTEGER(4) + a(i) = b(i) + end do + !$acc end parallel + end program + diff --git a/flang/test/Semantics/test_symbols.sh b/flang/test/Semantics/test_symbols.sh index d2b3d688a39b8..61ff7fdb1e7bf 100755 --- a/flang/test/Semantics/test_symbols.sh +++ b/flang/test/Semantics/test_symbols.sh @@ -16,8 +16,9 @@ diffs=$temp/diffs # Strip out blank lines and all comments except "!DEF:", "!REF:", and "!$omp" sed -e 's/!\([DR]EF:\)/KEEP \1/' -e 's/!\($omp\)/KEEP \1/' \ - -e 's/!.*//' -e 's/ *$//' -e '/^$/d' -e 's/KEEP \([DR]EF:\)/!\1/' \ - -e 's/KEEP \($omp\)/!\1/' \ + -e 's/!\($acc\)/KEEP \1/' -e 's/!.*//' -e 's/ *$//' -e '/^$/d' \ + -e 's/KEEP \([DR]EF:\)/!\1/' -e 's/KEEP \($omp\)/!\1/' \ + -e 's/KEEP \($acc\)/!\1/' \ $src > $src1 egrep -v '![DR]EF:' $src1 > $src2 # strip out DEF and REF comments # compile, inserting comments for symbols: From 47a40eda178ef808090314e0664dbd3850db83eb Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Sun, 26 Jul 2020 17:33:07 -0700 Subject: [PATCH 0152/1035] [ORC] Remove a redundant call to getTargetMemory. 
---
 llvm/lib/ExecutionEngine/Orc/TPCIndirectionUtils.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/ExecutionEngine/Orc/TPCIndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/TPCIndirectionUtils.cpp
index b4c21c32310ce..85dc4bec9a122 100644
--- a/llvm/lib/ExecutionEngine/Orc/TPCIndirectionUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TPCIndirectionUtils.cpp
@@ -297,15 +297,14 @@ TPCIndirectionUtils::writeResolverBlock(JITTargetAddress ReentryFnAddr,
     return Alloc.takeError();
 
   auto WorkingMemory = (*Alloc)->getWorkingMemory(ResolverBlockPermissions);
-  auto TargetAddress = (*Alloc)->getTargetMemory(ResolverBlockPermissions);
-  ABI->writeResolverCode(WorkingMemory.data(), TargetAddress, ReentryFnAddr,
+  ResolverBlockAddr = (*Alloc)->getTargetMemory(ResolverBlockPermissions);
+  ABI->writeResolverCode(WorkingMemory.data(), ResolverBlockAddr, ReentryFnAddr,
                          ReentryCtxAddr);
 
   if (auto Err = (*Alloc)->finalize())
     return std::move(Err);
 
   ResolverBlock = std::move(*Alloc);
-  ResolverBlockAddr = ResolverBlock->getTargetMemory(ResolverBlockPermissions);
   return ResolverBlockAddr;
 }
 
From 194a4beedd1e09ff0de3710ea431d8d3facd59e1 Mon Sep 17 00:00:00 2001
From: Juneyoung Lee
Date: Mon, 27 Jul 2020 09:43:00 +0900
Subject: [PATCH 0153/1035] [InstCombine] Add more tests to freeze-phi.ll; NFC

---
 .../test/Transforms/InstCombine/freeze-phi.ll | 47 ++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/llvm/test/Transforms/InstCombine/freeze-phi.ll b/llvm/test/Transforms/InstCombine/freeze-phi.ll
index fcda502a0d7f8..430c2d2e8fe66 100644
--- a/llvm/test/Transforms/InstCombine/freeze-phi.ll
+++ b/llvm/test/Transforms/InstCombine/freeze-phi.ll
@@ -20,7 +20,52 @@ B:
 C:
   %y = phi i32 [0, %A], [1, %B]
   %y.fr = freeze i32 %y
-  ret i32 %y
+  ret i32 %y.fr
 }
+
+define <2 x i32> @vec(i1 %cond) {
+; CHECK-LABEL: @vec(
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[A:%.*]], label [[B:%.*]]
+; CHECK:       A:
+; CHECK-NEXT:    br label [[C:%.*]]
+; CHECK:       B:
+; CHECK-NEXT:    br label [[C]]
+; CHECK:       C:
+; CHECK-NEXT:    [[Y:%.*]] = phi <2 x i32> [ <i32 0, i32 0>, [[A]] ], [ <i32 1, i32 1>, [[B]] ]
+; CHECK-NEXT:    ret <2 x i32> [[Y]]
+;
+  br i1 %cond, label %A, label %B
+A:
+  br label %C
+B:
+  br label %C
+C:
+  %y = phi <2 x i32> [<i32 0, i32 0>, %A], [<i32 1, i32 1>, %B]
+  %y.fr = freeze <2 x i32> %y
+  ret <2 x i32> %y.fr
+}
+
+define <2 x i32> @vec_undef(i1 %cond) {
+; CHECK-LABEL: @vec_undef(
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[A:%.*]], label [[B:%.*]]
+; CHECK:       A:
+; CHECK-NEXT:    br label [[C:%.*]]
+; CHECK:       B:
+; CHECK-NEXT:    br label [[C]]
+; CHECK:       C:
+; CHECK-NEXT:    [[Y:%.*]] = phi <2 x i32> [ <i32 0, i32 undef>, [[A]] ], [ <i32 1, i32 1>, [[B]] ]
+; CHECK-NEXT:    [[Y_FR:%.*]] = freeze <2 x i32> [[Y]]
+; CHECK-NEXT:    ret <2 x i32> [[Y_FR]]
+;
+  br i1 %cond, label %A, label %B
+A:
+  br label %C
+B:
+  br label %C
+C:
+  %y = phi <2 x i32> [<i32 0, i32 undef>, %A], [<i32 1, i32 1>, %B]
+  %y.fr = freeze <2 x i32> %y
+  ret <2 x i32> %y.fr
+}
 
 define i32 @one(i1 %cond, i32 %x) {

From a6e9f5264c855dfa5dda3c5940ffdaaf22d7e693 Mon Sep 17 00:00:00 2001
From: QingShan Zhang
Date: Mon, 27 Jul 2020 02:02:40 +0000
Subject: [PATCH 0154/1035] [Scheduling] Improve group algorithm for store
 cluster

Store Addr and Store Addr+8 are a clusterable pair; they have memory (ctrl)
dependencies on different loads. The current implementation puts these two
stores into different groups and so misses clustering them.
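For clarity, the new grouping condition is the one in the MachineScheduler.cpp
hunk below, reproduced here with added explanatory comments (only the comments
are new):

    // A ctrl (chain) predecessor picks the cluster group for a mem op. With
    // this change, a store no longer lets a non-store ctrl predecessor (e.g.
    // a load it merely chains on) define its group, so two stores hanging off
    // different loads fall into the same group and stay cluster candidates.
    if ((Pred.isCtrl() &&
         (IsLoad ||
          (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
        !Pred.isArtificial()) {
      ChainPredID = Pred.getSUnit()->NodeNum;
      break;
    }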
Reviewed By: evandro Differential Revision: https://reviews.llvm.org/D84139 --- llvm/lib/CodeGen/MachineScheduler.cpp | 8 +++- .../CodeGen/AArch64/aarch64-stp-cluster.ll | 47 +++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index d9d0a783f8a22..cec7a0c031eb5 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -1653,7 +1653,13 @@ void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) { unsigned ChainPredID = DAG->SUnits.size(); for (const SDep &Pred : SU.Preds) { - if (Pred.isCtrl() && !Pred.isArtificial()) { + // We only want to cluster the mem ops that have the same ctrl(non-data) + // pred so that they didn't have ctrl dependency for each other. But for + // store instrs, we can still cluster them if the pred is load instr. + if ((Pred.isCtrl() && + (IsLoad || + (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) && + !Pred.isArtificial()) { ChainPredID = Pred.getSUnit()->NodeNum; break; } diff --git a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll index 5f75b4ef944bb..bfe7e4941da8b 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll @@ -147,3 +147,50 @@ entry: ret i64 %v } +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: stp_i64_with_ld:%bb.0 +; CHECK:Cluster ld/st SU(5) - SU(10) +; CHECK:Cluster ld/st SU(15) - SU(20) +; CHECK:SU(5): STRXui %7:gpr64, %0:gpr64common, 0 :: +; CHECK:SU(10): STRXui %12:gpr64, %0:gpr64common, 1 :: +; CHECK:SU(15): STRXui %17:gpr64, %0:gpr64common, 2 :: +; CHECK:SU(20): STRXui %22:gpr64, %0:gpr64common, 3 :: +define void @stp_i64_with_ld(i64* noalias nocapture %a, i64* noalias nocapture readnone %b, i64* noalias nocapture readnone %c) { +entry: + %arrayidx = getelementptr inbounds i64, i64* %a, i64 8 + %0 = load i64, i64* %arrayidx, align 8 + %arrayidx3 = getelementptr inbounds i64, i64* %a, i64 16 + %1 = load i64, i64* %arrayidx3, align 8 + %mul = mul nsw i64 %1, %0 + %2 = load i64, i64* %a, align 8 + %add6 = add nsw i64 %2, %mul + store i64 %add6, i64* %a, align 8 + %arrayidx.1 = getelementptr inbounds i64, i64* %a, i64 9 + %3 = load i64, i64* %arrayidx.1, align 8 + %arrayidx3.1 = getelementptr inbounds i64, i64* %a, i64 17 + %4 = load i64, i64* %arrayidx3.1, align 8 + %mul.1 = mul nsw i64 %4, %3 + %arrayidx5.1 = getelementptr inbounds i64, i64* %a, i64 1 + %5 = load i64, i64* %arrayidx5.1, align 8 + %add6.1 = add nsw i64 %5, %mul.1 + store i64 %add6.1, i64* %arrayidx5.1, align 8 + %arrayidx.2 = getelementptr inbounds i64, i64* %a, i64 10 + %6 = load i64, i64* %arrayidx.2, align 8 + %arrayidx3.2 = getelementptr inbounds i64, i64* %a, i64 18 + %7 = load i64, i64* %arrayidx3.2, align 8 + %mul.2 = mul nsw i64 %7, %6 + %arrayidx5.2 = getelementptr inbounds i64, i64* %a, i64 2 + %8 = load i64, i64* %arrayidx5.2, align 8 + %add6.2 = add nsw i64 %8, %mul.2 + store i64 %add6.2, i64* %arrayidx5.2, align 8 + %arrayidx.3 = getelementptr inbounds i64, i64* %a, i64 11 + %9 = load i64, i64* %arrayidx.3, align 8 + %arrayidx3.3 = getelementptr inbounds i64, i64* %a, i64 19 + %10 = load i64, i64* %arrayidx3.3, align 8 + %mul.3 = mul nsw i64 %10, %9 + %arrayidx5.3 = getelementptr inbounds i64, i64* %a, i64 3 + %11 = load i64, i64* %arrayidx5.3, align 8 + %add6.3 = add nsw i64 %11, %mul.3 + store i64 %add6.3, i64* %arrayidx5.3, align 8 + ret void +} From 
0eff8b3865ede487bacd605f628891dd028c74bd Mon Sep 17 00:00:00 2001 From: biplmish Date: Sun, 26 Jul 2020 21:23:00 -0500 Subject: [PATCH 0155/1035] [PowerPC] Cleanup p10vector clang test Remove the duplicate LE test, correct the labels and remove common tests for vec_splat builtin. Differential Revision: https://reviews.llvm.org/D84382 --- clang/test/CodeGen/builtins-ppc-p10vector.c | 53 +++++++++------------ 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/clang/test/CodeGen/builtins-ppc-p10vector.c b/clang/test/CodeGen/builtins-ppc-p10vector.c index 2182a19f2452d..e67018b062141 100644 --- a/clang/test/CodeGen/builtins-ppc-p10vector.c +++ b/clang/test/CodeGen/builtins-ppc-p10vector.c @@ -1,15 +1,11 @@ // REQUIRES: powerpc-registered-target -// RUN: %clang_cc1 -target-feature +vsx -target-feature +altivec \ -// RUN: -target-cpu pwr10 -triple powerpc64le-unknown-unknown -emit-llvm %s \ -// RUN: -o - | FileCheck %s - -// RUN: %clang_cc1 -target-feature +vsx -target-feature +altivec \ +// RUN: %clang_cc1 -target-feature +vsx \ // RUN: -target-cpu pwr10 -triple powerpc64-unknown-unknown -emit-llvm %s \ -// RUN: -o - | FileCheck %s -check-prefix=CHECK-BE +// RUN: -o - | FileCheck %s -check-prefixes=CHECK-BE,CHECK -// RUN: %clang_cc1 -target-feature +vsx -target-feature +altivec \ +// RUN: %clang_cc1 -target-feature +vsx \ // RUN: -target-cpu pwr10 -triple powerpc64le-unknown-unknown -emit-llvm %s \ -// RUN: -o - | FileCheck %s -check-prefix=CHECK-LE +// RUN: -o - | FileCheck %s -check-prefixes=CHECK-LE,CHECK #include @@ -514,19 +510,16 @@ vector unsigned int test_vec_inserth_uiv(void) { } vector signed int test_vec_vec_splati_si(void) { - // CHECK-BE: ret <4 x i32> // CHECK: ret <4 x i32> return vec_splati(-17); } vector unsigned int test_vec_vec_splati_ui(void) { - // CHECK-BE: ret <4 x i32> // CHECK: ret <4 x i32> return vec_splati(16U); } vector float test_vec_vec_splati_f(void) { - // CHECK-BE: ret <4 x float> // CHECK: ret <4 x float> return vec_splati(1.0f); } @@ -536,10 +529,10 @@ vector double test_vec_vec_splatid(void) { // CHECK-BE-NEXT: [[T2:%.+]] = insertelement <2 x double> undef, double [[T1:%.+]], i32 0 // CHECK-BE-NEXT: [[T3:%.+]] = shufflevector <2 x double> [[T2:%.+]], <2 x double> undef, <2 x i32> zeroinitialize // CHECK-BE-NEXT: ret <2 x double> [[T3:%.+]] - // CHECK: [[T1:%.+]] = fpext float %{{.+}} to double - // CHECK-NEXT: [[T2:%.+]] = insertelement <2 x double> undef, double [[T1:%.+]], i32 0 - // CHECK-NEXT: [[T3:%.+]] = shufflevector <2 x double> [[T2:%.+]], <2 x double> undef, <2 x i32> zeroinitialize - // CHECK-NEXT: ret <2 x double> [[T3:%.+]] + // CHECK-LE: [[T1:%.+]] = fpext float %{{.+}} to double + // CHECK-LE-NEXT: [[T2:%.+]] = insertelement <2 x double> undef, double [[T1:%.+]], i32 0 + // CHECK-LE-NEXT: [[T3:%.+]] = shufflevector <2 x double> [[T2:%.+]], <2 x double> undef, <2 x i32> zeroinitialize + // CHECK-LE-NEXT: ret <2 x double> [[T3:%.+]] return vec_splatid(1.0); } @@ -548,11 +541,11 @@ vector signed int test_vec_vec_splati_ins_si(void) { // CHECK-BE: [[T1:%.+]] = add i32 2, %{{.+}} // CHECK-BE: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 [[T1]] // CHECK-BE: ret <4 x i32> - // CHECK: [[T1:%.+]] = sub i32 1, %{{.+}} - // CHECK: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 [[T1]] - // CHECK: [[T2:%.+]] = sub i32 3, %{{.+}} - // CHECK: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 [[T2]] - // CHECK: ret <4 x i32> + // CHECK-LE: [[T1:%.+]] = sub i32 1, %{{.+}} + // CHECK-LE: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 
[[T1]] + // CHECK-LE: [[T2:%.+]] = sub i32 3, %{{.+}} + // CHECK-LE: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 [[T2]] + // CHECK-LE: ret <4 x i32> return vec_splati_ins(vsia, 0, -17); } @@ -561,11 +554,11 @@ vector unsigned int test_vec_vec_splati_ins_ui(void) { // CHECK-BE: [[T1:%.+]] = add i32 2, %{{.+}} // CHECK-BE: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 [[T1]] // CHECK-BE: ret <4 x i32> - // CHECK: [[T1:%.+]] = sub i32 1, %{{.+}} - // CHECK: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 [[T1]] - // CHECK: [[T2:%.+]] = sub i32 3, %{{.+}} - // CHECK: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 [[T2]] - // CHECK: ret <4 x i32> + // CHECK-LE: [[T1:%.+]] = sub i32 1, %{{.+}} + // CHECK-LE: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 [[T1]] + // CHECK-LE: [[T2:%.+]] = sub i32 3, %{{.+}} + // CHECK-LE: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 [[T2]] + // CHECK-LE: ret <4 x i32> return vec_splati_ins(vuia, 1, 16U); } @@ -574,11 +567,11 @@ vector float test_vec_vec_splati_ins_f(void) { // CHECK-BE: [[T1:%.+]] = add i32 2, %{{.+}} // CHECK-BE: insertelement <4 x float> %{{.+}}, float %{{.+}}, i32 [[T1]] // CHECK-BE: ret <4 x float> - // CHECK: [[T1:%.+]] = sub i32 1, %{{.+}} - // CHECK: insertelement <4 x float> %{{.+}}, float %{{.+}}, i32 [[T1]] - // CHECK: [[T2:%.+]] = sub i32 3, %{{.+}} - // CHECK: insertelement <4 x float> %{{.+}}, float %{{.+}}, i32 [[T2]] - // CHECK: ret <4 x float> + // CHECK-LE: [[T1:%.+]] = sub i32 1, %{{.+}} + // CHECK-LE: insertelement <4 x float> %{{.+}}, float %{{.+}}, i32 [[T1]] + // CHECK-LE: [[T2:%.+]] = sub i32 3, %{{.+}} + // CHECK-LE: insertelement <4 x float> %{{.+}}, float %{{.+}}, i32 [[T2]] + // CHECK-LE: ret <4 x float> return vec_splati_ins(vfa, 0, 1.0f); } From e97aa5609fa53827c8c09a0fc79075dc834f292d Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sun, 26 Jul 2020 23:01:28 -0400 Subject: [PATCH 0156/1035] AMDGPU/GlobalISel: Don't assert in LegalizerInfo constructor We don't really need these asserts. The LegalizerInfo is also overly-aggressivly constructed, even when not in use. It needs to not assert on dummy targets that have manually specified, unrelated features. --- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 3 - .../CodeGen/AMDGPU/GlobalISel/dummy-target.ll | 91 +++++++++++++++++++ 2 files changed, 91 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 673c5fc1e840c..f1962db35bc01 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -423,7 +423,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); if (ST.hasVOP3PInsts()) { - assert(ST.hasIntClamp() && "all targets with VOP3P should support clamp"); getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) .legalFor({S32, S16, V2S16}) .clampScalar(0, S16, S32) @@ -445,8 +444,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 - assert(ST.hasIntClamp() && "all targets with 16-bit should support clamp"); - // Technically the saturating operations require clamp bit support, but this // was introduced at the same time as 16-bit operations. 
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll new file mode 100644 index 0000000000000..81f6b8e71254c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -stop-after=legalizer -o - %s | FileCheck %s + +; Make sure legalizer info doesn't assert on dummy targets + +define i16 @vop3p_add_i16(i16 %arg0) #0 { + ; CHECK-LABEL: name: vop3p_add_i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[TRUNC]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ADD]](s16) + ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %add = add i16 %arg0, %arg0 + ret i16 %add +} + +define <2 x i16> @vop3p_add_v2i16(<2 x i16> %arg0) #0 { + ; CHECK-LABEL: name: vop3p_add_v2i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; CHECK: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[TRUNC2]] + ; CHECK: [[ADD1:%[0-9]+]]:_(s16) = G_ADD [[TRUNC1]], [[TRUNC3]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ADD]](s16) + ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ADD1]](s16) + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] + ; CHECK: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; CHECK: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %add = add <2 x i16> %arg0, %arg0 + ret <2 x i16> %add +} + +define i16 @halfinsts_add_i16(i16 %arg0) #1 { + ; CHECK-LABEL: name: halfinsts_add_i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) + ; CHECK: $vgpr0 = COPY [[COPY3]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 + %add = add i16 %arg0, %arg0 + ret i16 %add +} + +define <2 x i16> @halfinsts_add_v2i16(<2 x i16> %arg0) #1 { + ; CHECK-LABEL: name: halfinsts_add_v2i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, 
$vgpr1, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY3]], [[COPY4]]
+  ; CHECK:   [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+  ; CHECK:   [[COPY6:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+  ; CHECK:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY5]], [[COPY6]]
+  ; CHECK:   [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
+  ; CHECK:   [[COPY8:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32)
+  ; CHECK:   $vgpr0 = COPY [[COPY7]](s32)
+  ; CHECK:   $vgpr1 = COPY [[COPY8]](s32)
+  ; CHECK:   [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK:   S_SETPC_B64_return [[COPY9]], implicit $vgpr0, implicit $vgpr1
+  %add = add <2 x i16> %arg0, %arg0
+  ret <2 x i16> %add
+}
+
+attributes #0 = { "target-features"="+vop3p" }
+attributes #1 = { "target-features"="+16-bit-insts" }

From fae221e7ad5a94ed7697c13d169db18d253f5f15 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sun, 26 Jul 2020 21:14:20 -0700
Subject: [PATCH 0157/1035] [gcov] Simplify/speed up CFG hash calculation

---
 .../Instrumentation/GCOVProfiling.cpp         | 30 ++++++++-----------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index cd2ea8d5e4edd..53a89f7348de4 100644
--- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -32,6 +32,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/CRC.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
@@ -300,15 +301,16 @@ namespace {
       assert(OutEdges.empty());
     }
 
+    uint32_t Number;
+    SmallVector<GCOVBlock *, 4> OutEdges;
+
    private:
     friend class GCOVFunction;
 
     GCOVBlock(GCOVProfiler *P, uint32_t Number)
         : GCOVRecord(P), Number(Number) {}
 
-    uint32_t Number;
     StringMap<GCOVLines> LinesByFile;
-    SmallVector<GCOVBlock *, 4> OutEdges;
   };
 
   // A function has a unique identifier, a checksum (we leave as zero) and a
@@ -347,18 +349,6 @@ namespace {
       return ReturnBlock;
     }
 
-    std::string getEdgeDestinations() {
-      std::string EdgeDestinations;
-      raw_string_ostream EDOS(EdgeDestinations);
-      Function *F = Blocks.begin()->first->getParent();
-      for (BasicBlock &I : *F) {
-        GCOVBlock &Block = getBlock(&I);
-        for (int i = 0, e = Block.OutEdges.size(); i != e; ++i)
-          EDOS << Block.OutEdges[i]->Number;
-      }
-      return EdgeDestinations;
-    }
-
     uint32_t getFuncChecksum() const {
       return FuncChecksum;
     }
@@ -729,7 +719,7 @@ void GCOVProfiler::emitProfileNotes() {
       continue;
     }
 
-    std::string EdgeDestinations;
+    std::vector<uint8_t> EdgeDestinations;
 
     Endian = M->getDataLayout().isLittleEndian() ? support::endianness::little
support::endianness::little : support::endianness::big; @@ -774,6 +764,11 @@ void GCOVProfiler::emitProfileNotes() { } else if (isa(TI)) { Block.addEdge(Func.getReturnBlock()); } + for (GCOVBlock *Succ : Block.OutEdges) { + uint32_t Idx = Succ->Number; + do EdgeDestinations.push_back(Idx & 255); + while ((Idx >>= 8) > 0); + } for (auto &I : BB) { // Debug intrinsic locations correspond to the location of the @@ -798,12 +793,13 @@ void GCOVProfiler::emitProfileNotes() { } Line = 0; } - EdgeDestinations += Func.getEdgeDestinations(); } char Tmp[4]; + JamCRC JC; + JC.update(EdgeDestinations); os = &out; - auto Stamp = static_cast(hash_value(EdgeDestinations)); + uint32_t Stamp = JC.getCRC(); FileChecksums.push_back(Stamp); if (Endian == support::endianness::big) { out.write("gcno", 4); From 1c93f09bf34c2f7a9e6b1753cc2edb41609cd544 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Mon, 27 Jul 2020 04:50:08 +0000 Subject: [PATCH 0158/1035] Remove declaration of constexpr member kDynamicSize in MemRefType This member is already publicly declared on the base class. The redundant declaration is mangled differently though and in some unoptimized build it requires a definition to also exist. However we have a definition for the base ShapedType class, removing the declaration here will redirect every use to the base class member instead. Differential Revision: https://reviews.llvm.org/D84615 --- mlir/include/mlir/IR/StandardTypes.h | 1 - 1 file changed, 1 deletion(-) diff --git a/mlir/include/mlir/IR/StandardTypes.h b/mlir/include/mlir/IR/StandardTypes.h index 5380668b59013..1ac24359cbb62 100644 --- a/mlir/include/mlir/IR/StandardTypes.h +++ b/mlir/include/mlir/IR/StandardTypes.h @@ -564,7 +564,6 @@ class MemRefType : public Type::TypeBase Date: Sun, 26 Jul 2020 23:56:19 -0500 Subject: [PATCH 0159/1035] [PowerPC] Add Vector Extract Double Instruction Definitions and MC tests. 
This patch adds the td definitions and asm/disasm tests for the following instructions: Vector Extract Double Left Index - vextdubvlx, vextduhvlx, vextduwvlx, vextddvlx Vector Extract Double Right Index - vextdubvrx, vextduhvrx, vextduwvrx, vextddvrx Differential Revision: https://reviews.llvm.org/D84384 --- llvm/lib/Target/PowerPC/PPCInstrPrefix.td | 32 +++++++++++++++++++ .../PowerPC/ppc64-encoding-ISA31.txt | 24 ++++++++++++++ llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s | 24 ++++++++++++++ 3 files changed, 80 insertions(+) diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td index 22839e697381b..418ef3b377282 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -905,6 +905,38 @@ let Predicates = [IsISA3_1] in { [(set v2i64:$vD, (int_ppc_altivec_vinsdrx v2i64:$vDi, i64:$rA, i64:$rB))]>, RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; + def VEXTDUBVLX : VAForm_1a<24, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, g8rc:$rC), + "vextdubvlx $vD, $vA, $vB, $rC", + IIC_VecGeneral, []>; + def VEXTDUBVRX : VAForm_1a<25, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, g8rc:$rC), + "vextdubvrx $vD, $vA, $vB, $rC", + IIC_VecGeneral, []>; + def VEXTDUHVLX : VAForm_1a<26, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, g8rc:$rC), + "vextduhvlx $vD, $vA, $vB, $rC", + IIC_VecGeneral, []>; + def VEXTDUHVRX : VAForm_1a<27, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, g8rc:$rC), + "vextduhvrx $vD, $vA, $vB, $rC", + IIC_VecGeneral, []>; + def VEXTDUWVLX : VAForm_1a<28, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, g8rc:$rC), + "vextduwvlx $vD, $vA, $vB, $rC", + IIC_VecGeneral, []>; + def VEXTDUWVRX : VAForm_1a<29, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, g8rc:$rC), + "vextduwvrx $vD, $vA, $vB, $rC", + IIC_VecGeneral, []>; + def VEXTDDVLX : VAForm_1a<30, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, g8rc:$rC), + "vextddvlx $vD, $vA, $vB, $rC", + IIC_VecGeneral, []>; + def VEXTDDVRX : VAForm_1a<31, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, g8rc:$rC), + "vextddvrx $vD, $vA, $vB, $rC", + IIC_VecGeneral, []>; def VPDEPD : VXForm_1<1485, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vpdepd $vD, $vA, $vB", IIC_VecGeneral, [(set v2i64:$vD, diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt index 038a7f1a84fe2..c8dae6a160a5b 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt @@ -330,6 +330,30 @@ # CHECK: vinsdrx 1, 2, 3 0x10 0x22 0x1b 0xcf +# CHECK: vextdubvlx 1, 2, 3, 3 +0x10 0x22 0x18 0xd8 + +# CHECK: vextdubvrx 1, 2, 3, 3 +0x10 0x22 0x18 0xd9 + +# CHECK: vextduhvlx 1, 2, 3, 3 +0x10 0x22 0x18 0xda + +# CHECK: vextduhvrx 1, 2, 3, 3 +0x10 0x22 0x18 0xdb + +# CHECK: vextduwvlx 1, 2, 3, 3 +0x10 0x22 0x18 0xdc + +# CHECK: vextduwvrx 1, 2, 3, 3 +0x10 0x22 0x18 0xdd + +# CHECK: vextddvlx 1, 2, 3, 3 +0x10 0x22 0x18 0xde + +# CHECK: vextddvrx 1, 2, 3, 3 +0x10 0x22 0x18 0xdf + # CHECK: lxvrbx 32, 1, 2 0x7c 0x01 0x10 0x1b diff --git a/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s b/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s index c9f5547dd4c86..bd1187f18ed8f 100644 --- a/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s +++ b/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s @@ -456,6 +456,30 @@ # CHECK-BE: vinsdrx 1, 2, 3 # encoding: [0x10,0x22,0x1b,0xcf] # CHECK-LE: vinsdrx 1, 2, 3 # encoding: [0xcf,0x1b,0x22,0x10] vinsdrx 1, 2, 3 +# CHECK-BE: vextdubvlx 1, 2, 3, 3 # encoding: 
[0x10,0x22,0x18,0xd8] +# CHECK-LE: vextdubvlx 1, 2, 3, 3 # encoding: [0xd8,0x18,0x22,0x10] + vextdubvlx 1, 2, 3, 3 +# CHECK-BE: vextdubvrx 1, 2, 3, 3 # encoding: [0x10,0x22,0x18,0xd9] +# CHECK-LE: vextdubvrx 1, 2, 3, 3 # encoding: [0xd9,0x18,0x22,0x10] + vextdubvrx 1, 2, 3, 3 +# CHECK-BE: vextduhvlx 1, 2, 3, 3 # encoding: [0x10,0x22,0x18,0xda] +# CHECK-LE: vextduhvlx 1, 2, 3, 3 # encoding: [0xda,0x18,0x22,0x10] + vextduhvlx 1, 2, 3, 3 +# CHECK-BE: vextduhvrx 1, 2, 3, 3 # encoding: [0x10,0x22,0x18,0xdb] +# CHECK-LE: vextduhvrx 1, 2, 3, 3 # encoding: [0xdb,0x18,0x22,0x10] + vextduhvrx 1, 2, 3, 3 +# CHECK-BE: vextduwvlx 1, 2, 3, 3 # encoding: [0x10,0x22,0x18,0xdc] +# CHECK-LE: vextduwvlx 1, 2, 3, 3 # encoding: [0xdc,0x18,0x22,0x10] + vextduwvlx 1, 2, 3, 3 +# CHECK-BE: vextduwvrx 1, 2, 3, 3 # encoding: [0x10,0x22,0x18,0xdd] +# CHECK-LE: vextduwvrx 1, 2, 3, 3 # encoding: [0xdd,0x18,0x22,0x10] + vextduwvrx 1, 2, 3, 3 +# CHECK-BE: vextddvlx 1, 2, 3, 3 # encoding: [0x10,0x22,0x18,0xde] +# CHECK-LE: vextddvlx 1, 2, 3, 3 # encoding: [0xde,0x18,0x22,0x10] + vextddvlx 1, 2, 3, 3 +# CHECK-BE: vextddvrx 1, 2, 3, 3 # encoding: [0x10,0x22,0x18,0xdf] +# CHECK-LE: vextddvrx 1, 2, 3, 3 # encoding: [0xdf,0x18,0x22,0x10] + vextddvrx 1, 2, 3, 3 # CHECK-BE: lxvrbx 32, 1, 2 # encoding: [0x7c,0x01,0x10,0x1b] # CHECK-LE: lxvrbx 32, 1, 2 # encoding: [0x1b,0x10,0x01,0x7c] lxvrbx 32, 1, 2 From eed333149d178b69fdaf39b9419b7ca032520182 Mon Sep 17 00:00:00 2001 From: Yuanfang Chen Date: Sun, 26 Jul 2020 22:32:24 -0700 Subject: [PATCH 0160/1035] [NewPM] NFC. remove obsolete TODO comment The deleted TODO was implemented in D82344. --- llvm/include/llvm/IR/PassInstrumentation.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llvm/include/llvm/IR/PassInstrumentation.h b/llvm/include/llvm/IR/PassInstrumentation.h index 37390e4e682ba..94833bb0d2c63 100644 --- a/llvm/include/llvm/IR/PassInstrumentation.h +++ b/llvm/include/llvm/IR/PassInstrumentation.h @@ -44,10 +44,6 @@ /// of a pass. For those callbacks returning false means pass will not be /// executed. /// -/// TODO: currently there is no way for a pass to opt-out of execution control -/// (e.g. become unskippable). PassManager is the only entity that determines -/// how pass instrumentation affects pass execution. -/// //===----------------------------------------------------------------------===// #ifndef LLVM_IR_PASSINSTRUMENTATION_H From f2ab2134c7c4949eea3f6ddd35bcea236fe12592 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Mon, 27 Jul 2020 10:15:44 +0300 Subject: [PATCH 0161/1035] [XRay] Account: recursion detection Summary: Recursion detection can be non-trivial. Currently, the state-of-the-art for LLVM, as far as i'm concerned, is D72362 `[clang-tidy] misc-no-recursion: a new check`. However, it is quite limited: * It does very basic call-graph based analysis, in the sense it will report even dynamically-unreachable recursion. * It is inherently limited to a single TU * It is hard to gauge how problematic each recursion is in practice. Some of that can be addressed by adding clang analyzer-based check, then it would at least support multiple TU's. However, we can approach this problem from another angle - dynamic run-time analysis. We already have means to capture a run-time callgraph (XRay, duh), and there are already means to reconstruct it within `llvm-xray` tool. This proposes to add a `-recursive-calls-only` switch to the `account` tool. 
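The bookkeeping described in the next paragraph amounts to the following
sketch (illustrative only: the tracker name, member names, and the use of two
plain maps are assumptions for exposition; the committed code instead packs
the depth and a "recursive" bit into one 32-bit bitfield per function id):

    #include "llvm/ADT/DenseMap.h"

    // Hypothetical per-thread tracker for -recursive-calls-only.
    struct RecursionTracker {
      llvm::DenseMap<int32_t, unsigned> Depth; // func id -> live activations
      llvm::DenseMap<int32_t, bool> Recursive; // func id -> currently recursive

      void onEnter(int32_t FuncId) {
        if (++Depth[FuncId] >= 2) // re-entered itself, maybe indirectly
          Recursive[FuncId] = true;
      }
      void onExit(int32_t FuncId) {
        if (--Depth[FuncId] == 0) // left the outermost activation
          Recursive[FuncId] = false;
      }
      // A function's latency is accounted at its exit record only while
      // Recursive[FuncId] is set.
    };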
When the switch is on, when re-constructing callgraph for latency reconstruction, each time we enter/leave some function, we increment/decrement an entry for the function in a "recursion depth" map. If, when we leave the function, said entry was at `1`, then that means the function didn't call itself, however if it is at `2` or more, then that means the function (possibly indirectly) called itself. If the depth is 1, we don't account the time spent there, unless within this call stack the function already recursed into itself. Note that we don't pay for recursion depth tracking when `recursive-calls-only` is not on, and the perf impact is insignificant (+0.3% regression) The overhead of the option is actually negative, around -5.26% user time on a medium-sized (3.5G) XRay log. As a practical example, that 3.5G log is a capture of the entire middle-end opt pipeline at `-O3` for RawSpeed unity build. There are total of `5500` functions in the log, however `-recursive-calls-only` says that `269`, or 5%, are recursive. Having this functionality could be helpful for recursion eradication. Reviewers: dberris, mboerger Reviewed By: dberris Subscribers: llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D84582 --- ...ursive-calls-only-tail-call-deduction.yaml | 46 +++++++++++ .../X86/account-recursive-calls-only.yaml | 34 +++++++++ llvm/tools/llvm-xray/xray-account.cpp | 76 +++++++++++++++---- llvm/tools/llvm-xray/xray-account.h | 21 ++++- 4 files changed, 158 insertions(+), 19 deletions(-) create mode 100644 llvm/test/tools/llvm-xray/X86/account-recursive-calls-only-tail-call-deduction.yaml create mode 100644 llvm/test/tools/llvm-xray/X86/account-recursive-calls-only.yaml diff --git a/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only-tail-call-deduction.yaml b/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only-tail-call-deduction.yaml new file mode 100644 index 0000000000000..4f5d01b2cf9b1 --- /dev/null +++ b/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only-tail-call-deduction.yaml @@ -0,0 +1,46 @@ +# RUN: llvm-xray account -d %s -o - -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=ALL %s +# RUN: llvm-xray account -d -recursive-calls-only %s -o - -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=RECURSIVE %s + +--- +header: + version: 1 + type: 0 + constant-tsc: true + nonstop-tsc: true + cycle-frequency: 0 +records: +# Here we reconstruct the following call trace: +# +# f1() +# f2() +# f3() +# f2() +# +# But we find that we're missing an exit record for f2() because it's +# tail-called f3(). We make sure that if we see a trace like this that we can +# deduce tail calls, and account the time (potentially wrongly) to f2() when +# f1() exits. That is because we don't go back to f3()'s entry record to +# properly do the math on the timing of f2(). +# +# As a result, we can deduce that f2() is not recursive here. +# +# Note that by default, tail/sibling call deduction is disabled, and is enabled +# with a flag "-d" or "-deduce-sibling-calls". 
+# + - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 10000 } + - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 10001 } + - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-enter, tsc: 10002 } + - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-exit, tsc: 10003 } + - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, tsc: 10004 } + - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 10005 } + - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-exit, tsc: 10006 } +... + +# ALL: Functions with latencies: 3 +# ALL-NEXT: funcid count [ min, med, 90p, 99p, max] sum function +# ALL-NEXT: 1 1 [ 4.000000, 4.000000, 4.000000, 4.000000, 4.000000] 4.000000 :0:0: @(1) +# ALL-NEXT: 2 2 [ 1.000000, 3.000000, 3.000000, 3.000000, 3.000000] 4.000000 :0:0: @(2) +# ALL-NEXT: 3 1 [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000] 1.000000 :0:0: @(3) + +# RECURSIVE: Functions with latencies: 0 +# RECURSIVE-NEXT: funcid count [ min, med, 90p, 99p, max] sum function diff --git a/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only.yaml b/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only.yaml new file mode 100644 index 0000000000000..d7b36200d10d3 --- /dev/null +++ b/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only.yaml @@ -0,0 +1,34 @@ +# RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=ALL %s +# RUN: llvm-xray account -recursive-calls-only %s -o - -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=RECURSIVE %s + +--- +header: + version: 1 + type: 0 + constant-tsc: true + nonstop-tsc: true + cycle-frequency: 2601000000 +records: + - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 0 } + - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, tsc: 100000000 } + - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 200000000 } + - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-exit, tsc: 300000000 } + - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 400000000 } + - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-enter, tsc: 500000000 } + - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-enter, tsc: 600000000 } + - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-exit, tsc: 700000000 } + - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-exit, tsc: 800000000 } + - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-exit, tsc: 900000000 } + - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-enter, tsc: 1000000000 } + - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-exit, tsc: 1100000000 } +... 
+
+# ALL: Functions with latencies: 3
+# ALL-NEXT: funcid count [ min, med, 90p, 99p, max] sum function
+# ALL-NEXT: 1 1 [ 0.038447, 0.038447, 0.038447, 0.038447, 0.038447] 0.038447 <invalid>:0:0: @(1)
+# ALL-NEXT: 2 2 [ 0.038447, 0.192234, 0.192234, 0.192234, 0.192234] 0.230681 <invalid>:0:0: @(2)
+# ALL-NEXT: 3 3 [ 0.038447, 0.038447, 0.115340, 0.115340, 0.115340] 0.192234 <invalid>:0:0: @(3)
+
+# RECURSIVE: Functions with latencies: 1
+# RECURSIVE-NEXT: funcid count [ min, med, 90p, 99p, max] sum function
+# RECURSIVE-NEXT: 3 2 [ 0.038447, 0.115340, 0.115340, 0.115340, 0.115340] 0.153787 <invalid>:0:0: @(3)
diff --git a/llvm/tools/llvm-xray/xray-account.cpp b/llvm/tools/llvm-xray/xray-account.cpp
index 21da53f362b09..bde028a432fed 100644
--- a/llvm/tools/llvm-xray/xray-account.cpp
+++ b/llvm/tools/llvm-xray/xray-account.cpp
@@ -35,6 +35,9 @@ static cl::opt<bool>
     cl::sub(Account), cl::init(false));
 static cl::alias AccountKeepGoing2("k", cl::aliasopt(AccountKeepGoing),
                                    cl::desc("Alias for -keep_going"));
+static cl::opt<bool> AccountRecursiveCallsOnly(
+    "recursive-calls-only", cl::desc("Only count the calls that are recursive"),
+    cl::sub(Account), cl::init(false));
 static cl::opt<bool> AccountDeduceSiblingCalls(
     "deduce-sibling-calls",
     cl::desc("Deduce sibling calls when unrolling function call stacks"),
@@ -126,6 +129,32 @@ template <typename T> T diff(T L, T R) {
   return std::max(L, R) - std::min(L, R);
 }
 } // namespace
 
+using RecursionStatus = LatencyAccountant::FunctionStack::RecursionStatus;
+RecursionStatus &RecursionStatus::operator++() {
+  auto Depth = Bitfield::get<RecursionStatus::Depth>(Storage);
+  assert(Depth >= 0 && Depth < std::numeric_limits<decltype(Depth)>::max());
+  ++Depth;
+  Bitfield::set<RecursionStatus::Depth>(Storage, Depth); // ++Storage
+  // Did this function just (maybe indirectly) call itself the first time?
+  if (!isRecursive() && Depth == 2) // Storage == 2 / Storage s> 1
+    Bitfield::set<RecursionStatus::IsRecursive>(Storage,
+                                                true); // Storage |= INT_MIN
+  return *this;
+}
+RecursionStatus &RecursionStatus::operator--() {
+  auto Depth = Bitfield::get<RecursionStatus::Depth>(Storage);
+  assert(Depth > 0);
+  --Depth;
+  Bitfield::set<RecursionStatus::Depth>(Storage, Depth); // --Storage
+  // Did we leave a function that previously (maybe indirectly) called itself?
+ if (isRecursive() && Depth == 0) // Storage == INT_MIN + Bitfield::set(Storage, false); // Storage = 0 + return *this; +} +bool RecursionStatus::isRecursive() const { + return Bitfield::get(Storage); // Storage s< 0 +} + bool LatencyAccountant::accountRecord(const XRayRecord &Record) { setMinMax(PerThreadMinMaxTSC[Record.TId], Record.TSC); setMinMax(PerCPUMinMaxTSC[Record.CPU], Record.TSC); @@ -137,6 +166,8 @@ bool LatencyAccountant::accountRecord(const XRayRecord &Record) { return false; auto &ThreadStack = PerThreadFunctionStack[Record.TId]; + if (RecursiveCallsOnly && !ThreadStack.RecursionDepth) + ThreadStack.RecursionDepth.emplace(); switch (Record.Type) { case RecordTypes::CUSTOM_EVENT: case RecordTypes::TYPED_EVENT: @@ -144,18 +175,24 @@ bool LatencyAccountant::accountRecord(const XRayRecord &Record) { return true; case RecordTypes::ENTER: case RecordTypes::ENTER_ARG: { - ThreadStack.emplace_back(Record.FuncId, Record.TSC); + ThreadStack.Stack.emplace_back(Record.FuncId, Record.TSC); + if (ThreadStack.RecursionDepth) + ++(*ThreadStack.RecursionDepth)[Record.FuncId]; break; } case RecordTypes::EXIT: case RecordTypes::TAIL_EXIT: { - if (ThreadStack.empty()) + if (ThreadStack.Stack.empty()) return false; - if (ThreadStack.back().first == Record.FuncId) { - const auto &Top = ThreadStack.back(); - recordLatency(Top.first, diff(Top.second, Record.TSC)); - ThreadStack.pop_back(); + if (ThreadStack.Stack.back().first == Record.FuncId) { + const auto &Top = ThreadStack.Stack.back(); + if (!ThreadStack.RecursionDepth || + (*ThreadStack.RecursionDepth)[Top.first].isRecursive()) + recordLatency(Top.first, diff(Top.second, Record.TSC)); + if (ThreadStack.RecursionDepth) + --(*ThreadStack.RecursionDepth)[Top.first]; + ThreadStack.Stack.pop_back(); break; } @@ -164,11 +201,11 @@ bool LatencyAccountant::accountRecord(const XRayRecord &Record) { // Look for the parent up the stack. auto Parent = - std::find_if(ThreadStack.rbegin(), ThreadStack.rend(), + std::find_if(ThreadStack.Stack.rbegin(), ThreadStack.Stack.rend(), [&](const std::pair &E) { return E.first == Record.FuncId; }); - if (Parent == ThreadStack.rend()) + if (Parent == ThreadStack.Stack.rend()) return false; // Account time for this apparently sibling call exit up the stack. @@ -199,11 +236,17 @@ bool LatencyAccountant::accountRecord(const XRayRecord &Record) { // complexity to do correctly (need to backtrack, etc.). // // FIXME: Potentially implement the more complex deduction algorithm? 
- auto I = std::next(Parent).base(); - for (auto &E : make_range(I, ThreadStack.end())) { - recordLatency(E.first, diff(E.second, Record.TSC)); + auto R = make_range(std::next(Parent).base(), ThreadStack.Stack.end()); + for (auto &E : R) { + if (!ThreadStack.RecursionDepth || + (*ThreadStack.RecursionDepth)[E.first].isRecursive()) + recordLatency(E.first, diff(E.second, Record.TSC)); + } + for (auto &Top : reverse(R)) { + if (ThreadStack.RecursionDepth) + --(*ThreadStack.RecursionDepth)[Top.first]; + ThreadStack.Stack.pop_back(); } - ThreadStack.erase(I, ThreadStack.end()); break; } } @@ -425,7 +468,8 @@ static CommandRegistration Unused(&Account, []() -> Error { symbolize::LLVMSymbolizer Symbolizer; llvm::xray::FuncIdConversionHelper FuncIdHelper(AccountInstrMap, Symbolizer, FunctionAddresses); - xray::LatencyAccountant FCA(FuncIdHelper, AccountDeduceSiblingCalls); + xray::LatencyAccountant FCA(FuncIdHelper, AccountRecursiveCallsOnly, + AccountDeduceSiblingCalls); auto TraceOrErr = loadTraceFile(AccountInput); if (!TraceOrErr) return joinErrors( @@ -447,12 +491,12 @@ static CommandRegistration Unused(&Account, []() -> Error { << '\n'; for (const auto &ThreadStack : FCA.getPerThreadFunctionStack()) { errs() << "Thread ID: " << ThreadStack.first << "\n"; - if (ThreadStack.second.empty()) { + if (ThreadStack.second.Stack.empty()) { errs() << " (empty stack)\n"; continue; } - auto Level = ThreadStack.second.size(); - for (const auto &Entry : llvm::reverse(ThreadStack.second)) + auto Level = ThreadStack.second.Stack.size(); + for (const auto &Entry : llvm::reverse(ThreadStack.second.Stack)) errs() << " #" << Level-- << "\t" << FuncIdHelper.SymbolOrNumber(Entry.first) << '\n'; } diff --git a/llvm/tools/llvm-xray/xray-account.h b/llvm/tools/llvm-xray/xray-account.h index 575114d6096a2..371a9cc708e9a 100644 --- a/llvm/tools/llvm-xray/xray-account.h +++ b/llvm/tools/llvm-xray/xray-account.h @@ -18,6 +18,7 @@ #include #include "func-id-helper.h" +#include "llvm/ADT/Bitfields.h" #include "llvm/Support/Program.h" #include "llvm/Support/raw_ostream.h" #include "llvm/XRay/XRayRecord.h" @@ -33,7 +34,19 @@ class LatencyAccountant { PerThreadMinMaxTSCMap; typedef llvm::DenseMap> PerCPUMinMaxTSCMap; - typedef llvm::SmallVector, 32> FunctionStack; + struct FunctionStack { + llvm::SmallVector, 32> Stack; + class RecursionStatus { + uint32_t Storage = 0; + using Depth = Bitfield::Element; // Low 31 bits. + using IsRecursive = Bitfield::Element; // Sign bit. 
+ public: + RecursionStatus &operator++(); + RecursionStatus &operator--(); + bool isRecursive() const; + }; + Optional> RecursionDepth; + }; typedef llvm::DenseMap PerThreadFunctionStackMap; private: @@ -43,6 +56,7 @@ class LatencyAccountant { PerCPUMinMaxTSCMap PerCPUMinMaxTSC; FuncIdConversionHelper &FuncIdHelper; + bool RecursiveCallsOnly = false; bool DeduceSiblingCalls = false; uint64_t CurrentMaxTSC = 0; @@ -52,8 +66,9 @@ class LatencyAccountant { public: explicit LatencyAccountant(FuncIdConversionHelper &FuncIdHelper, - bool DeduceSiblingCalls) - : FuncIdHelper(FuncIdHelper), DeduceSiblingCalls(DeduceSiblingCalls) {} + bool RecursiveCallsOnly, bool DeduceSiblingCalls) + : FuncIdHelper(FuncIdHelper), RecursiveCallsOnly(RecursiveCallsOnly), + DeduceSiblingCalls(DeduceSiblingCalls) {} const FunctionLatencyMap &getFunctionLatencies() const { return FunctionLatencies; From 14bc85e0ebb6c00c1672158ab6a692bfbb11e1cc Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Tue, 14 Jul 2020 16:20:00 +0100 Subject: [PATCH 0162/1035] [SVE] Don't use LocalStackAllocation for SVE objects I have introduced a new TargetFrameLowering query function: isStackIdSafeForLocalArea that queries whether or not it is safe for objects of a given stack id to be bundled into the local area. The default behaviour is to always bundle regardless of the stack id, however for AArch64 this is overriden so that it's only safe for fixed-size stack objects. There is future work here to extend this algorithm for multiple local areas so that SVE stack objects can be bundled together and accessed from their own virtual base-pointer. Differential Revision: https://reviews.llvm.org/D83859 --- .../llvm/CodeGen/TargetFrameLowering.h | 6 ++ llvm/lib/CodeGen/LocalStackSlotAllocation.cpp | 4 ++ .../lib/Target/AArch64/AArch64FrameLowering.h | 6 ++ .../CodeGen/AArch64/sve-localstackalloc.mir | 61 +++++++++++++++++++ 4 files changed, 77 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/sve-localstackalloc.mir diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h index c3a11b1996759..d6580430daf73 100644 --- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -134,6 +134,12 @@ class TargetFrameLowering { /// was called). virtual unsigned getStackAlignmentSkew(const MachineFunction &MF) const; + /// This method returns whether or not it is safe for an object with the + /// given stack id to be bundled into the local area. + virtual bool isStackIdSafeForLocalArea(unsigned StackId) const { + return true; + } + /// getOffsetOfLocalArea - This method returns the offset of the local area /// from the stack pointer on entrance to a function. 
/// diff --git a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp index 6c5ef0255a082..204fb556d8105 100644 --- a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -220,6 +220,8 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) { continue; if (StackProtectorFI == (int)i) continue; + if (!TFI.isStackIdSafeForLocalArea(MFI.getStackID(i))) + continue; switch (MFI.getObjectSSPLayout(i)) { case MachineFrameInfo::SSPLK_None: @@ -254,6 +256,8 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) { continue; if (ProtectedObjs.count(i)) continue; + if (!TFI.isStackIdSafeForLocalArea(MFI.getStackID(i))) + continue; AdjustStackOffset(MFI, i, Offset, StackGrowsDown, MaxAlign); } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 9d0a6d9eaf255..444740cb50ab9 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -105,6 +105,12 @@ class AArch64FrameLowering : public TargetFrameLowering { } } + bool isStackIdSafeForLocalArea(unsigned StackId) const override { + // We don't support putting SVE objects into the pre-allocated local + // frame block at the moment. + return StackId != TargetStackID::SVEVector; + } + private: bool shouldCombineCSRLocalStackBump(MachineFunction &MF, uint64_t StackBumpBytes) const; diff --git a/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir b/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir new file mode 100644 index 0000000000000..c20846c54b6a2 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir @@ -0,0 +1,61 @@ +# RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -run-pass=localstackalloc -o - %s | FileCheck %s + +--- | + ; ModuleID = '' + source_filename = "" + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64-unknown-linux-gnu" + + define @insert_32i8_idx( %a, i8 %elt, i64 %idx) #0 { + %ins = insertelement %a, i8 %elt, i64 %idx + ret %ins + } + + attributes #0 = { "target-features"="+sve" } + +... 
+--- +name: insert_32i8_idx +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: zpr, preferred-register: '' } + - { id: 1, class: zpr, preferred-register: '' } + - { id: 2, class: gpr32, preferred-register: '' } + - { id: 3, class: gpr64, preferred-register: '' } + - { id: 5, class: ppr_3b, preferred-register: '' } + - { id: 6, class: gpr64sp, preferred-register: '' } + - { id: 7, class: zpr, preferred-register: '' } + - { id: 8, class: zpr, preferred-register: '' } +liveins: + - { reg: '$z0', virtual-reg: '%0' } + - { reg: '$z1', virtual-reg: '%1' } + - { reg: '$w0', virtual-reg: '%2' } +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +# CHECK-LABEL: name: insert_32i8_idx +# CHECK: localFrameSize: 0 +stack: + - { id: 0, name: '', type: default, offset: 0, size: 32, alignment: 16, + stack-id: sve-vec, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: {} +body: | + bb.0 (%ir-block.0): + liveins: $z0, $z1, $w0 + + %2:gpr32 = COPY $w0 + %1:zpr = COPY $z1 + %0:zpr = COPY $z0 + %5:ppr_3b = PTRUE_B 31 + %6:gpr64sp = ADDXri %stack.0, 0, 0 + ST1B_IMM %1, %5, %6, 1 :: (store unknown-size, align 16) + ST1B_IMM %0, %5, %stack.0, 0 :: (store unknown-size into %stack.0, align 16) + %7:zpr = LD1B_IMM %5, %6, 1 :: (load unknown-size from %stack.0 + 16, align 16) + %8:zpr = LD1B_IMM %5, %stack.0, 0 :: (load unknown-size from %stack.0, align 16) + $z0 = COPY %8 + $z1 = COPY %7 + RET_ReallyLR implicit $z0, implicit $z1 + +... From 590dd73c6ebdc9fe1314dfa5bda5c2367d866574 Mon Sep 17 00:00:00 2001 From: Piotr Sobczak Date: Thu, 23 Jul 2020 19:26:49 +0200 Subject: [PATCH 0163/1035] [AMDGPU] Make generating cache invalidating instructions optional Summary: D78800 skipped generating cache invalidating instrucions altogether on AMDPAL. However, this is sometimes too restrictive - we want a more flexible option to be able to toggle this behaviour on and off while we work towards developing a correct implementation of the alternative memory model. Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, dexonsmith, kerbowa, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D84448 --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 8 +- .../CodeGen/AMDGPU/memory-legalizer-amdpal.ll | 86 +++++++++++++++---- 2 files changed, 75 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 4e6c72ca20e28..21419aab1a43f 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -47,6 +47,10 @@ using namespace llvm::AMDGPU; #define DEBUG_TYPE "si-memory-legalizer" #define PASS_NAME "SI Memory Legalizer" +static cl::opt AmdgcnSkipCacheInvalidations( + "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden, + cl::desc("Use this to skip inserting cache invalidating instructions.")); + namespace { LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); @@ -254,7 +258,7 @@ class SICacheControl { IsaVersion IV; - /// Whether to insert cache invalidation instructions. + /// Whether to insert cache invalidating instructions. 
bool InsertCacheInv; SICacheControl(const GCNSubtarget &ST); @@ -653,7 +657,7 @@ Optional SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( SICacheControl::SICacheControl(const GCNSubtarget &ST) { TII = ST.getInstrInfo(); IV = getIsaVersion(ST.getCPU()); - InsertCacheInv = !ST.isAmdPalOS(); + InsertCacheInv = !AmdgcnSkipCacheInvalidations; } /* static */ diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll index 6fe24c1dfb948..b414c83374b89 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll @@ -1,15 +1,23 @@ -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9 %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9 %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9 %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9 %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN10 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,CACHE_INV %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,CACHE_INV %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,CACHE_INV %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,CACHE_INV %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN10,CACHE_INV10 %s + +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,SKIP_CACHE_INV %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN10,SKIP_CACHE_INV %s + ; FUNC-LABEL: {{^}}system_acquire: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @system_acquire() { entry: @@ -34,7 +42,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @system_acq_rel() { entry: @@ -47,7 +59,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @system_seq_cst() { entry: @@ -60,7 +76,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt 
vmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @system_one_as_acquire() { entry: @@ -85,7 +105,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @system_one_as_acq_rel() { entry: @@ -98,7 +122,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @system_one_as_seq_cst() { entry: @@ -191,7 +219,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @agent_acquire() { entry: @@ -216,7 +248,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @agent_acq_rel() { entry: @@ -229,7 +265,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @agent_seq_cst() { entry: @@ -242,7 +282,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @agent_one_as_acquire() { entry: @@ -267,7 +311,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @agent_one_as_acq_rel() { entry: @@ -280,7 +328,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @agent_one_as_seq_cst() { entry: From 
36618274f3e2cdea98cd8202204b8ad2913aae46 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Mon, 27 Jul 2020 10:19:48 +0300 Subject: [PATCH 0164/1035] [MLIR][LLVMDialect] Added volatile and nontemporal attributes to load/store This patch introduces 2 new optional attributes to `llvm.load` and `llvm.store` ops: `volatile` and `nontemporal`. These attributes are translated into proper LLVM as a `volatile` marker and a metadata node respectively. They are also helpful with SPIR-V to LLVM dialect conversion since they are the mappings for `Volatile` and `NonTemporal` Memory Operands. Reviewed By: ftynse Differential Revision: https://reviews.llvm.org/D84396 --- mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 62 ++++++++++++--------- mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 58 ++++++++++++++++--- mlir/test/Target/llvmir.mlir | 29 ++++++++++ 3 files changed, 115 insertions(+), 34 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index 5322e243427a9..29d7fd930030a 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -235,6 +235,19 @@ class MemoryOpWithAlignmentBase { }]; } +// Code definition that is used for nontemporal metadata creation. +class MemoryOpWithAlignmentAndAttributes : MemoryOpWithAlignmentBase { + code setNonTemporalMetadataCode = [{ + if ($nontemporal) { + llvm::Module *module = builder.GetInsertBlock()->getModule(); + llvm::MDNode *metadata = llvm::MDNode::get( + inst->getContext(), llvm::ConstantAsMetadata::get( + builder.getInt32(1))); + inst->setMetadata(module->getMDKindID("nontemporal"), metadata); + } + }]; +} + // Memory-related operations. def LLVM_AllocaOp : MemoryOpWithAlignmentBase, @@ -266,52 +279,49 @@ def LLVM_GEPOp : LLVM_OneResultOp<"getelementptr", [NoSideEffect]>, }]; } def LLVM_LoadOp : - MemoryOpWithAlignmentBase, + MemoryOpWithAlignmentAndAttributes, LLVM_OneResultOp<"load">, - Arguments<(ins LLVM_Type:$addr, OptionalAttr:$alignment)> { + Arguments<(ins LLVM_Type:$addr, + OptionalAttr:$alignment, + UnitAttr:$volatile_, + UnitAttr:$nontemporal)> { string llvmBuilder = [{ - auto *inst = builder.CreateLoad($addr); - }] # setAlignmentCode # [{ + auto *inst = builder.CreateLoad($addr, $volatile_); + }] # setAlignmentCode # setNonTemporalMetadataCode # [{ $res = inst; }]; let builders = [OpBuilder< - "OpBuilder &b, OperationState &result, Value addr, unsigned alignment = 0", + "OpBuilder &b, OperationState &result, Value addr, " + "unsigned alignment = 0, bool isVolatile = false, " + "bool isNonTemporal = false", [{ auto type = addr.getType().cast().getPointerElementTy(); - build(b, result, type, addr, alignment); + build(b, result, type, addr, alignment, isVolatile, isNonTemporal); }]>, OpBuilder< "OpBuilder &b, OperationState &result, Type t, Value addr, " - "unsigned alignment = 0", - [{ - if (alignment == 0) - return build(b, result, t, addr, IntegerAttr()); - build(b, result, t, addr, b.getI64IntegerAttr(alignment)); - }]>]; + "unsigned alignment = 0, bool isVolatile = false, " + "bool isNonTemporal = false">]; let parser = [{ return parseLoadOp(parser, result); }]; let printer = [{ printLoadOp(p, *this); }]; let verifier = alignmentVerifierCode; } def LLVM_StoreOp : - MemoryOpWithAlignmentBase, + MemoryOpWithAlignmentAndAttributes, LLVM_ZeroResultOp<"store">, Arguments<(ins LLVM_Type:$value, LLVM_Type:$addr, - OptionalAttr:$alignment)> { + OptionalAttr:$alignment, + UnitAttr:$volatile_, + UnitAttr:$nontemporal)> { string llvmBuilder = 
[{ - auto *inst = builder.CreateStore($value, $addr); - }] # setAlignmentCode; - let builders = [ - OpBuilder< + auto *inst = builder.CreateStore($value, $addr, $volatile_); + }] # setAlignmentCode # setNonTemporalMetadataCode; + let builders = [OpBuilder< "OpBuilder &b, OperationState &result, Value value, Value addr, " - "unsigned alignment = 0", - [{ - if (alignment == 0) - return build(b, result, ArrayRef{}, value, addr, IntegerAttr()); - build(b, result, ArrayRef{}, value, addr, - b.getI64IntegerAttr(alignment)); - }] - >]; + "unsigned alignment = 0, bool isVolatile = false, " + "bool isNonTemporal = false"> + ]; let parser = [{ return parseStoreOp(parser, result); }]; let printer = [{ printStoreOp(p, *this); }]; let verifier = alignmentVerifierCode; diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index f448427099a44..cf7a5d926528f 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -31,6 +31,9 @@ using namespace mlir; using namespace mlir::LLVM; +static constexpr const char kVolatileAttrName[] = "volatile_"; +static constexpr const char kNonTemporalAttrName[] = "nontemporal"; + #include "mlir/Dialect/LLVMIR/LLVMOpsEnums.cpp.inc" //===----------------------------------------------------------------------===// @@ -178,12 +181,28 @@ CondBrOp::getMutableSuccessorOperands(unsigned index) { } //===----------------------------------------------------------------------===// -// Printing/parsing for LLVM::LoadOp. +// Builder, printer and parser for for LLVM::LoadOp. //===----------------------------------------------------------------------===// +void LoadOp::build(OpBuilder &builder, OperationState &result, Type t, + Value addr, unsigned alignment, bool isVolatile, + bool isNonTemporal) { + result.addOperands(addr); + result.addTypes(t); + if (isVolatile) + result.addAttribute(kVolatileAttrName, builder.getUnitAttr()); + if (isNonTemporal) + result.addAttribute(kNonTemporalAttrName, builder.getUnitAttr()); + if (alignment != 0) + result.addAttribute("alignment", builder.getI64IntegerAttr(alignment)); +} + static void printLoadOp(OpAsmPrinter &p, LoadOp &op) { - p << op.getOperationName() << ' ' << op.addr(); - p.printOptionalAttrDict(op.getAttrs()); + p << op.getOperationName() << ' '; + if (op.volatile_()) + p << "volatile "; + p << op.addr(); + p.printOptionalAttrDict(op.getAttrs(), {kVolatileAttrName}); p << " : " << op.addr().getType(); } @@ -201,12 +220,15 @@ static Type getLoadStoreElementType(OpAsmParser &parser, Type type, return llvmTy.getPointerElementTy(); } -// ::= `llvm.load` ssa-use attribute-dict? `:` type +// ::= `llvm.load` `volatile` ssa-use attribute-dict? `:` type static ParseResult parseLoadOp(OpAsmParser &parser, OperationState &result) { OpAsmParser::OperandType addr; Type type; llvm::SMLoc trailingTypeLoc; + if (succeeded(parser.parseOptionalKeyword("volatile"))) + result.addAttribute(kVolatileAttrName, parser.getBuilder().getUnitAttr()); + if (parser.parseOperand(addr) || parser.parseOptionalAttrDict(result.attributes) || parser.parseColon() || parser.getCurrentLocation(&trailingTypeLoc) || parser.parseType(type) || @@ -220,21 +242,41 @@ static ParseResult parseLoadOp(OpAsmParser &parser, OperationState &result) { } //===----------------------------------------------------------------------===// -// Printing/parsing for LLVM::StoreOp. +// Builder, printer and parser for LLVM::StoreOp. 
//===----------------------------------------------------------------------===// +void StoreOp::build(OpBuilder &builder, OperationState &result, Value value, + Value addr, unsigned alignment, bool isVolatile, + bool isNonTemporal) { + result.addOperands({value, addr}); + result.addTypes(ArrayRef{}); + if (isVolatile) + result.addAttribute(kVolatileAttrName, builder.getUnitAttr()); + if (isNonTemporal) + result.addAttribute(kNonTemporalAttrName, builder.getUnitAttr()); + if (alignment != 0) + result.addAttribute("alignment", builder.getI64IntegerAttr(alignment)); +} + static void printStoreOp(OpAsmPrinter &p, StoreOp &op) { - p << op.getOperationName() << ' ' << op.value() << ", " << op.addr(); - p.printOptionalAttrDict(op.getAttrs()); + p << op.getOperationName() << ' '; + if (op.volatile_()) + p << "volatile "; + p << op.value() << ", " << op.addr(); + p.printOptionalAttrDict(op.getAttrs(), {kVolatileAttrName}); p << " : " << op.addr().getType(); } -// ::= `llvm.store` ssa-use `,` ssa-use attribute-dict? `:` type +// ::= `llvm.store` `volatile` ssa-use `,` ssa-use +// attribute-dict? `:` type static ParseResult parseStoreOp(OpAsmParser &parser, OperationState &result) { OpAsmParser::OperandType addr, value; Type type; llvm::SMLoc trailingTypeLoc; + if (succeeded(parser.parseOptionalKeyword("volatile"))) + result.addAttribute(kVolatileAttrName, parser.getBuilder().getUnitAttr()); + if (parser.parseOperand(value) || parser.parseComma() || parser.parseOperand(addr) || parser.parseOptionalAttrDict(result.attributes) || parser.parseColon() || diff --git a/mlir/test/Target/llvmir.mlir b/mlir/test/Target/llvmir.mlir index 954b5b1345412..d6180cbf18494 100644 --- a/mlir/test/Target/llvmir.mlir +++ b/mlir/test/Target/llvmir.mlir @@ -1266,3 +1266,32 @@ llvm.func @cond_br_weights(%cond : !llvm.i1, %arg0 : !llvm.i32, %arg1 : !llvm.i } // CHECK: ![[NODE]] = !{!"branch_weights", i32 5, i32 10} + +// ----- + +llvm.func @volatile_store_and_load() { + %val = llvm.mlir.constant(5 : i32) : !llvm.i32 + %size = llvm.mlir.constant(1 : i64) : !llvm.i64 + %0 = llvm.alloca %size x !llvm.i32 : (!llvm.i64) -> (!llvm<"i32*">) + // CHECK: store volatile i32 5, i32* %{{.*}} + llvm.store volatile %val, %0 : !llvm<"i32*"> + // CHECK: %{{.*}} = load volatile i32, i32* %{{.*}} + %1 = llvm.load volatile %0: !llvm<"i32*"> + llvm.return +} + +// ----- + +// Check that nontemporal attribute is exported as metadata node. +llvm.func @nontemoral_store_and_load() { + %val = llvm.mlir.constant(5 : i32) : !llvm.i32 + %size = llvm.mlir.constant(1 : i64) : !llvm.i64 + %0 = llvm.alloca %size x !llvm.i32 : (!llvm.i64) -> (!llvm<"i32*">) + // CHECK: !nontemporal ![[NODE:[0-9]+]] + llvm.store %val, %0 {nontemporal} : !llvm<"i32*"> + // CHECK: !nontemporal ![[NODE]] + %1 = llvm.load %0 {nontemporal} : !llvm<"i32*"> + llvm.return +} + +// CHECK: ![[NODE]] = !{i32 1} From 81d7ebaf5c369d42b77f9e3e47e2ac22c306ec04 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Fri, 24 Jul 2020 14:49:17 +0200 Subject: [PATCH 0165/1035] [lldb/Utility] Fix a bug in RangeMap::CombineConsecutiveRanges The function didn't combine a large entry which overlapped several other entries, if those other entries were not overlapping among each other. E.g., (0,20),(5,6),(10,11) produced (0,20),(10,11) Now it just produced (0,20). 
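To make the failure mode concrete, here is a minimal standalone sketch of the
corrected merge strategy. It deliberately uses a toy std::pair-based range
type rather than the real RangeVector::Entry API (names here are illustrative
only); the essential fix is that each entry is merged against the back of the
combined output, not against its immediate raw predecessor.

  #include <algorithm>
  #include <cstdint>
  #include <utility>
  #include <vector>

  // Toy stand-in for an entry: {base, size}, covering [base, base + size).
  using ToyRange = std::pair<uint64_t, uint64_t>;

  // Assumes the input is already sorted by base, as
  // CombineConsecutiveRanges requires of m_entries.
  static bool adjoinsOrIntersects(const ToyRange &a, const ToyRange &b) {
    return b.first <= a.first + a.second;
  }

  static std::vector<ToyRange> combine(const std::vector<ToyRange> &sorted) {
    std::vector<ToyRange> out;
    for (const ToyRange &r : sorted) {
      if (!out.empty() && adjoinsOrIntersects(out.back(), r)) {
        // Merge into the last *combined* range. Comparing r only against
        // its raw predecessor is exactly what the old implementation got
        // wrong.
        uint64_t end = std::max(out.back().first + out.back().second,
                                r.first + r.second);
        out.back().second = end - out.back().first;
      } else {
        out.push_back(r);
      }
    }
    return out;
  }

  // combine({{0, 20}, {5, 1}, {10, 1}}) yields {{0, 20}}; the neighbour-only
  // comparison would have kept {10, 1} as a separate range, because {5, 1}
  // neither adjoins nor intersects it.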
--- lldb/include/lldb/Utility/RangeMap.h | 54 ++++++++++--------------- lldb/unittests/Utility/RangeMapTest.cpp | 26 ++++++++++++ 2 files changed, 47 insertions(+), 33 deletions(-) diff --git a/lldb/include/lldb/Utility/RangeMap.h b/lldb/include/lldb/Utility/RangeMap.h index fb24c5a434792..118fdfd85fa9f 100644 --- a/lldb/include/lldb/Utility/RangeMap.h +++ b/lldb/include/lldb/Utility/RangeMap.h @@ -194,41 +194,25 @@ template class RangeVector { #ifdef ASSERT_RANGEMAP_ARE_SORTED assert(IsSorted()); #endif - // Can't combine if ranges if we have zero or one range - if (m_entries.size() > 1) { - // The list should be sorted prior to calling this function - typename Collection::iterator pos; - typename Collection::iterator end; - typename Collection::iterator prev; - bool can_combine = false; - // First we determine if we can combine any of the Entry objects so we - // don't end up allocating and making a new collection for no reason - for (pos = m_entries.begin(), end = m_entries.end(), prev = end; - pos != end; prev = pos++) { - if (prev != end && prev->DoesAdjoinOrIntersect(*pos)) { - can_combine = true; - break; - } - } + auto first_intersect = std::adjacent_find( + m_entries.begin(), m_entries.end(), [](const Entry &a, const Entry &b) { + return a.DoesAdjoinOrIntersect(b); + }); + if (first_intersect == m_entries.end()) + return; - // We we can combine at least one entry, then we make a new collection - // and populate it accordingly, and then swap it into place. - if (can_combine) { - Collection minimal_ranges; - for (pos = m_entries.begin(), end = m_entries.end(), prev = end; - pos != end; prev = pos++) { - if (prev != end && prev->DoesAdjoinOrIntersect(*pos)) - minimal_ranges.back().SetRangeEnd( - std::max(prev->GetRangeEnd(), pos->GetRangeEnd())); - else - minimal_ranges.push_back(*pos); - } - // Use the swap technique in case our new vector is much smaller. We - // must swap when using the STL because std::vector objects never - // release or reduce the memory once it has been allocated/reserved. - m_entries.swap(minimal_ranges); - } + // We we can combine at least one entry, then we make a new collection and + // populate it accordingly, and then swap it into place. 
+ auto pos = std::next(first_intersect); + Collection minimal_ranges(m_entries.begin(), pos); + for (; pos != m_entries.end(); ++pos) { + Entry &back = minimal_ranges.back(); + if (back.DoesAdjoinOrIntersect(*pos)) + back.SetRangeEnd(std::max(back.GetRangeEnd(), pos->GetRangeEnd())); + else + minimal_ranges.push_back(*pos); } + m_entries.swap(minimal_ranges); } BaseType GetMinRangeBase(BaseType fail_value) const { @@ -353,6 +337,10 @@ template class RangeVector { return nullptr; } + using const_iterator = typename Collection::const_iterator; + const_iterator begin() const { return m_entries.begin(); } + const_iterator end() const { return m_entries.end(); } + protected: void CombinePrevAndNext(typename Collection::iterator pos) { // Check if the prev or next entries in case they need to be unioned with diff --git a/lldb/unittests/Utility/RangeMapTest.cpp b/lldb/unittests/Utility/RangeMapTest.cpp index 8a243b6562181..97432dca983d3 100644 --- a/lldb/unittests/Utility/RangeMapTest.cpp +++ b/lldb/unittests/Utility/RangeMapTest.cpp @@ -12,6 +12,32 @@ using namespace lldb_private; +TEST(RangeVector, CombineConsecutiveRanges) { + using RangeVector = RangeVector; + using Entry = RangeVector::Entry; + + RangeVector V; + V.Append(0, 1); + V.Append(5, 1); + V.Append(6, 1); + V.Append(10, 9); + V.Append(15, 1); + V.Append(20, 9); + V.Append(21, 9); + V.Sort(); + V.CombineConsecutiveRanges(); + EXPECT_THAT(V, testing::ElementsAre(Entry(0, 1), Entry(5, 2), Entry(10, 9), + Entry(20, 10))); + + V.Clear(); + V.Append(0, 20); + V.Append(5, 1); + V.Append(10, 1); + V.Sort(); + V.CombineConsecutiveRanges(); + EXPECT_THAT(V, testing::ElementsAre(Entry(0, 20))); +} + using RangeDataVectorT = RangeDataVector; using EntryT = RangeDataVectorT::Entry; From e89414f4060d3ff2afcd1c89fc028d61270c4d22 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Mon, 20 Jul 2020 16:42:01 +0200 Subject: [PATCH 0166/1035] [lldb/Utility] Clean up Scalar constructors - move initialization to initializer lists - make desctructor non-virtual (nothing else is) - fix long double constructor so that it actually works --- lldb/include/lldb/Utility/Scalar.h | 53 +++++++++++---------------- lldb/source/Utility/Scalar.cpp | 4 -- lldb/unittests/Utility/ScalarTest.cpp | 1 + 3 files changed, 22 insertions(+), 36 deletions(-) diff --git a/lldb/include/lldb/Utility/Scalar.h b/lldb/include/lldb/Utility/Scalar.h index 524b715230744..1dbcf80bfd89c 100644 --- a/lldb/include/lldb/Utility/Scalar.h +++ b/lldb/include/lldb/Utility/Scalar.h @@ -60,41 +60,30 @@ class Scalar { }; // Constructors and Destructors - Scalar(); - Scalar(int v) : m_type(e_sint), m_float(static_cast(0)) { - m_integer = llvm::APInt(sizeof(int) * 8, v, true); - } - Scalar(unsigned int v) : m_type(e_uint), m_float(static_cast(0)) { - m_integer = llvm::APInt(sizeof(int) * 8, v); - } - Scalar(long v) : m_type(e_slong), m_float(static_cast(0)) { - m_integer = llvm::APInt(sizeof(long) * 8, v, true); - } - Scalar(unsigned long v) : m_type(e_ulong), m_float(static_cast(0)) { - m_integer = llvm::APInt(sizeof(long) * 8, v); - } - Scalar(long long v) : m_type(e_slonglong), m_float(static_cast(0)) { - m_integer = llvm::APInt(sizeof(long long) * 8, v, true); - } + Scalar() : m_type(e_void), m_float(0.0f) {} + Scalar(int v) + : m_type(e_sint), m_integer(sizeof(v) * 8, v, true), m_float(0.0f) {} + Scalar(unsigned int v) + : m_type(e_uint), m_integer(sizeof(v) * 8, v, false), m_float(0.0f) {} + Scalar(long v) + : m_type(e_slong), m_integer(sizeof(v) * 8, v, true), m_float(0.0f) {} + Scalar(unsigned 
long v) + : m_type(e_ulong), m_integer(sizeof(v) * 8, v, false), m_float(0.0f) {} + Scalar(long long v) + : m_type(e_slonglong), m_integer(sizeof(v) * 8, v, true), m_float(0.0f) {} Scalar(unsigned long long v) - : m_type(e_ulonglong), m_float(static_cast(0)) { - m_integer = llvm::APInt(sizeof(long long) * 8, v); - } - Scalar(float v) : m_type(e_float), m_float(v) { m_float = llvm::APFloat(v); } - Scalar(double v) : m_type(e_double), m_float(v) { - m_float = llvm::APFloat(v); + : m_type(e_ulonglong), m_integer(sizeof(v) * 8, v, false), m_float(0.0f) { } - Scalar(long double v) - : m_type(e_long_double), - m_float(llvm::APFloat::x87DoubleExtended(), - llvm::APInt(BITWIDTH_INT128, NUM_OF_WORDS_INT128, - (reinterpret_cast(&v))->x)) {} - Scalar(llvm::APInt v) : m_type(), m_float(static_cast(0)) { - m_integer = llvm::APInt(std::move(v)); - m_type = GetBestTypeForBitSize(m_integer.getBitWidth(), true); + Scalar(float v) : m_type(e_float), m_float(v) {} + Scalar(double v) : m_type(e_double), m_float(v) {} + Scalar(long double v) : m_type(e_long_double), m_float(double(v)) { + bool ignore; + m_float.convert(llvm::APFloat::x87DoubleExtended(), + llvm::APFloat::rmNearestTiesToEven, &ignore); } - // Scalar(const RegisterValue& reg_value); - virtual ~Scalar(); + Scalar(llvm::APInt v) + : m_type(GetBestTypeForBitSize(v.getBitWidth(), true)), + m_integer(std::move(v)), m_float(0.0f) {} /// Return the most efficient Scalar::Type for the requested bit size. static Type GetBestTypeForBitSize(size_t bit_size, bool sign); diff --git a/lldb/source/Utility/Scalar.cpp b/lldb/source/Utility/Scalar.cpp index 27d5b3b88d338..9309f8d662da8 100644 --- a/lldb/source/Utility/Scalar.cpp +++ b/lldb/source/Utility/Scalar.cpp @@ -126,8 +126,6 @@ static Scalar::Type PromoteToMaxType( return Scalar::e_void; } -Scalar::Scalar() : m_type(e_void), m_float(static_cast(0)) {} - bool Scalar::GetData(DataExtractor &data, size_t limit_byte_size) const { size_t byte_size = GetByteSize(); if (byte_size == 0) { @@ -232,8 +230,6 @@ void Scalar::GetValue(Stream *s, bool show_type) const { } } -Scalar::~Scalar() = default; - Scalar::Type Scalar::GetBestTypeForBitSize(size_t bit_size, bool sign) { // Scalar types are always host types, hence the sizeof(). if (sign) { diff --git a/lldb/unittests/Utility/ScalarTest.cpp b/lldb/unittests/Utility/ScalarTest.cpp index f6bc6a404c159..70ce0a81627d2 100644 --- a/lldb/unittests/Utility/ScalarTest.cpp +++ b/lldb/unittests/Utility/ScalarTest.cpp @@ -92,6 +92,7 @@ TEST(ScalarTest, Getters) { CheckConversion(0x8765432112345678ull); CheckConversion(42.25f); CheckConversion(42.25); + CheckConversion(42.25L); EXPECT_EQ(APInt(128, 1) << 70, Scalar(std::pow(2.0f, 70.0f)).SInt128(APInt())); EXPECT_EQ(APInt(128, -1, true) << 70, From e1eacf27c6f4ba82b8da34e62f62b44b81ffa316 Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Mon, 27 Jul 2020 17:07:27 +0900 Subject: [PATCH 0167/1035] [InstCombine] Fold freeze into phi if one operand is not undef This patch adds folding freeze into phi if it has only one operand to target. 
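In other words, the fold is performed when at most one of the phi's incoming
values might be undef or poison; the freeze is then pushed onto just that
incoming value in its predecessor block. A condensed sketch of the guard is
below. The helper name canPushFreezeIntoPhi is made up for illustration, and
the real check in foldOpIntoPhi additionally rejects ConstantExpr and phi
operands and takes care of where the new freeze is inserted:

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Sketch only: unlike the other opcodes foldOpIntoPhi handles, a literal
  // 'undef' incoming value must not be treated as a foldable constant for
  // freeze, so it counts towards the maybe-undef operands here.
  static bool canPushFreezeIntoPhi(PHINode &PN) {
    unsigned MaybeUndefOperands = 0;
    for (Value *Inc : PN.incoming_values())
      if (!isa<Constant>(Inc) || !isGuaranteedNotToBeUndefOrPoison(Inc))
        ++MaybeUndefOperands;
    return MaybeUndefOperands <= 1;
  }

With that guard satisfied, freeze (phi C, %x) becomes phi C, (freeze %x), as
the freeze-phi.ll test updates below show.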
Reviewed By: nikic Differential Revision: https://reviews.llvm.org/D84601 --- .../InstCombine/InstructionCombining.cpp | 19 ++++++++++++++- .../test/Transforms/InstCombine/freeze-phi.ll | 24 +++++++++---------- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index fdf0aaf9b176d..711be57a0bafa 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1048,7 +1048,9 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) { BasicBlock *NonConstBB = nullptr; for (unsigned i = 0; i != NumPHIValues; ++i) { Value *InVal = PN->getIncomingValue(i); - if (isa(InVal) && !isa(InVal)) + // If I is a freeze instruction, count undef as a non-constant. + if (isa(InVal) && !isa(InVal) && + (!isa(I) || isGuaranteedNotToBeUndefOrPoison(InVal))) continue; if (isa(InVal)) return nullptr; // Itself a phi. @@ -1141,6 +1143,15 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) { Builder); NewPN->addIncoming(InV, PN->getIncomingBlock(i)); } + } else if (auto *FI = dyn_cast(&I)) { + for (unsigned i = 0; i != NumPHIValues; ++i) { + Value *InV; + if (NonConstBB == PN->getIncomingBlock(i)) + InV = Builder.CreateFreeze(PN->getIncomingValue(i), "phi.fr"); + else + InV = PN->getIncomingValue(i); + NewPN->addIncoming(InV, PN->getIncomingBlock(i)); + } } else { CastInst *CI = cast(&I); Type *RetTy = CI->getType(); @@ -3370,6 +3381,12 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) { if (Value *V = SimplifyFreezeInst(Op0, SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); + // freeze (phi const, x) --> phi const, (freeze x) + if (auto *PN = dyn_cast(Op0)) { + if (Instruction *NV = foldOpIntoPhi(I, PN)) + return NV; + } + return nullptr; } diff --git a/llvm/test/Transforms/InstCombine/freeze-phi.ll b/llvm/test/Transforms/InstCombine/freeze-phi.ll index 430c2d2e8fe66..b8dde691a11c1 100644 --- a/llvm/test/Transforms/InstCombine/freeze-phi.ll +++ b/llvm/test/Transforms/InstCombine/freeze-phi.ll @@ -51,11 +51,11 @@ define <2 x i32> @vec_undef(i1 %cond) { ; CHECK: A: ; CHECK-NEXT: br label [[C:%.*]] ; CHECK: B: +; CHECK-NEXT: [[PHI_FR:%.*]] = freeze <2 x i32> ; CHECK-NEXT: br label [[C]] ; CHECK: C: -; CHECK-NEXT: [[Y:%.*]] = phi <2 x i32> [ , [[A]] ], [ , [[B]] ] -; CHECK-NEXT: [[Y_FR:%.*]] = freeze <2 x i32> [[Y]] -; CHECK-NEXT: ret <2 x i32> [[Y_FR]] +; CHECK-NEXT: [[Y:%.*]] = phi <2 x i32> [ , [[A]] ], [ [[PHI_FR]], [[B]] ] +; CHECK-NEXT: ret <2 x i32> [[Y]] ; br i1 %cond, label %A, label %B A: @@ -74,11 +74,11 @@ define i32 @one(i1 %cond, i32 %x) { ; CHECK: A: ; CHECK-NEXT: br label [[C:%.*]] ; CHECK: B: +; CHECK-NEXT: [[PHI_FR:%.*]] = freeze i32 [[X:%.*]] ; CHECK-NEXT: br label [[C]] ; CHECK: C: -; CHECK-NEXT: [[Y:%.*]] = phi i32 [ 0, [[A]] ], [ [[X:%.*]], [[B]] ] -; CHECK-NEXT: [[Y_FR:%.*]] = freeze i32 [[Y]] -; CHECK-NEXT: ret i32 [[Y_FR]] +; CHECK-NEXT: [[Y:%.*]] = phi i32 [ 0, [[A]] ], [ [[PHI_FR]], [[B]] ] +; CHECK-NEXT: ret i32 [[Y]] ; br i1 %cond, label %A, label %B A: @@ -154,15 +154,15 @@ define i32 @one_undef(i8 %cond) { ; CHECK-NEXT: i8 1, label [[C:%.*]] ; CHECK-NEXT: ] ; CHECK: A: +; CHECK-NEXT: [[PHI_FR:%.*]] = freeze i32 undef ; CHECK-NEXT: br label [[D:%.*]] ; CHECK: B: ; CHECK-NEXT: br label [[D]] ; CHECK: C: ; CHECK-NEXT: br label [[D]] ; CHECK: D: -; CHECK-NEXT: [[Y:%.*]] = phi i32 [ undef, [[A]] ], [ 32, [[B]] ], [ 0, [[C]] ] 
-; CHECK-NEXT: [[Y_FR:%.*]] = freeze i32 [[Y]] -; CHECK-NEXT: ret i32 [[Y_FR]] +; CHECK-NEXT: [[Y:%.*]] = phi i32 [ [[PHI_FR]], [[A]] ], [ 32, [[B]] ], [ 0, [[C]] ] +; CHECK-NEXT: ret i32 [[Y]] ; switch i8 %cond, label %A [ i8 0, label %B @@ -189,15 +189,15 @@ define i32 @one_constexpr(i8 %cond, i32 %x) { ; CHECK-NEXT: i8 1, label [[C:%.*]] ; CHECK-NEXT: ] ; CHECK: A: +; CHECK-NEXT: [[PHI_FR:%.*]] = freeze i32 ptrtoint (i8* getelementptr inbounds (i8, i8* @glb, i64 2) to i32) ; CHECK-NEXT: br label [[D:%.*]] ; CHECK: B: ; CHECK-NEXT: br label [[D]] ; CHECK: C: ; CHECK-NEXT: br label [[D]] ; CHECK: D: -; CHECK-NEXT: [[Y:%.*]] = phi i32 [ ptrtoint (i8* getelementptr inbounds (i8, i8* @glb, i64 2) to i32), [[A]] ], [ 32, [[B]] ], [ 0, [[C]] ] -; CHECK-NEXT: [[Y_FR:%.*]] = freeze i32 [[Y]] -; CHECK-NEXT: ret i32 [[Y_FR]] +; CHECK-NEXT: [[Y:%.*]] = phi i32 [ [[PHI_FR]], [[A]] ], [ 32, [[B]] ], [ 0, [[C]] ] +; CHECK-NEXT: ret i32 [[Y]] ; switch i8 %cond, label %A [ i8 0, label %B From d9bbe85943f6322e8fa1e85f72e53dd579c14a2f Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Mon, 27 Jul 2020 08:16:28 +0000 Subject: [PATCH 0168/1035] [Alignment][NFC] Update Bitcodewriter to use Align Differential Revision: https://reviews.llvm.org/D83533 --- llvm/include/llvm/Bitstream/BitcodeCommon.h | 30 ++++++++++++++++ llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 20 +++++------ llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 40 ++++++++++++--------- 3 files changed, 62 insertions(+), 28 deletions(-) create mode 100644 llvm/include/llvm/Bitstream/BitcodeCommon.h diff --git a/llvm/include/llvm/Bitstream/BitcodeCommon.h b/llvm/include/llvm/Bitstream/BitcodeCommon.h new file mode 100644 index 0000000000000..84b35987c4a90 --- /dev/null +++ b/llvm/include/llvm/Bitstream/BitcodeCommon.h @@ -0,0 +1,30 @@ +//===- BitcodeCommon.h - Common code for encode/decode --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header defines common code to be used by BitcodeWriter and +// BitcodeReader. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_BITSTREAM_BITCODECOMMON_H +#define LLVM_BITSTREAM_BITCODECOMMON_H + +#include "llvm/ADT/Bitfields.h" + +namespace llvm { + +struct AllocaPackedValues { + using Align = Bitfield::Element; + using UsedWithInAlloca = Bitfield::Element; + using ExplicitType = Bitfield::Element; + using SwiftError = Bitfield::Element; +}; + +} // namespace llvm + +#endif // LLVM_BITSTREAM_BITCODECOMMON_H diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 9632b5700e8af..908e70b2789da 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -20,8 +20,9 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" -#include "llvm/Bitstream/BitstreamReader.h" #include "llvm/Bitcode/LLVMBitCodes.h" +#include "llvm/Bitstream/BitcodeCommon.h" +#include "llvm/Bitstream/BitstreamReader.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" @@ -4813,17 +4814,13 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_ALLOCA: { // ALLOCA: [instty, opty, op, align] if (Record.size() != 4) return error("Invalid record"); - uint64_t AlignRecord = Record[3]; - const uint64_t InAllocaMask = uint64_t(1) << 5; - const uint64_t ExplicitTypeMask = uint64_t(1) << 6; - const uint64_t SwiftErrorMask = uint64_t(1) << 7; - const uint64_t FlagMask = InAllocaMask | ExplicitTypeMask | - SwiftErrorMask; - bool InAlloca = AlignRecord & InAllocaMask; - bool SwiftError = AlignRecord & SwiftErrorMask; + using APV = AllocaPackedValues; + const uint64_t Rec = Record[3]; + const bool InAlloca = Bitfield::get(Rec); + const bool SwiftError = Bitfield::get(Rec); FullTy = getFullyStructuredTypeByID(Record[0]); Type *Ty = flattenPointerTypes(FullTy); - if ((AlignRecord & ExplicitTypeMask) == 0) { + if (!Bitfield::get(Rec)) { auto *PTy = dyn_cast_or_null(Ty); if (!PTy) return error("Old-style alloca with a non-pointer type"); @@ -4832,7 +4829,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { Type *OpTy = getTypeByID(Record[1]); Value *Size = getFnValueByID(Record[2], OpTy); MaybeAlign Align; - if (Error Err = parseAlignmentValue(AlignRecord & ~FlagMask, Align)) { + if (Error Err = + parseAlignmentValue(Bitfield::get(Rec), Align)) { return Err; } if (!Ty || !Size) diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 7d410b24573e7..f566ddf0864c6 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -27,6 +27,7 @@ #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/LLVMBitCodes.h" #include "llvm/Bitstream/BitCodes.h" +#include "llvm/Bitstream/BitcodeCommon.h" #include "llvm/Bitstream/BitstreamWriter.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/Attributes.h" @@ -394,6 +395,8 @@ class ModuleBitcodeWriter : public ModuleBitcodeWriterBase { unsigned getEncodedSyncScopeID(SyncScope::ID SSID) { return unsigned(SSID); } + + unsigned getEncodedAlign(MaybeAlign Alignment) { return encode(Alignment); } }; /// Class to manage the bitcode writing for a combined index. @@ -1181,10 +1184,14 @@ void ModuleBitcodeWriter::writeModuleInfo() { // compute the maximum alignment value. 
std::map SectionMap; std::map GCMap; - unsigned MaxAlignment = 0; + MaybeAlign MaxAlignment; unsigned MaxGlobalType = 0; + const auto UpdateMaxAlignment = [&MaxAlignment](const MaybeAlign A) { + if (A) + MaxAlignment = !MaxAlignment ? *A : std::max(*MaxAlignment, *A); + }; for (const GlobalVariable &GV : M.globals()) { - MaxAlignment = std::max(MaxAlignment, GV.getAlignment()); + UpdateMaxAlignment(GV.getAlign()); MaxGlobalType = std::max(MaxGlobalType, VE.getTypeID(GV.getValueType())); if (GV.hasSection()) { // Give section names unique ID's. @@ -1197,7 +1204,7 @@ void ModuleBitcodeWriter::writeModuleInfo() { } } for (const Function &F : M) { - MaxAlignment = std::max(MaxAlignment, F.getAlignment()); + UpdateMaxAlignment(F.getAlign()); if (F.hasSection()) { // Give section names unique ID's. unsigned &Entry = SectionMap[std::string(F.getSection())]; @@ -1233,10 +1240,10 @@ void ModuleBitcodeWriter::writeModuleInfo() { //| constant Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Initializer. Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 5)); // Linkage. - if (MaxAlignment == 0) // Alignment. + if (!MaxAlignment) // Alignment. Abbv->Add(BitCodeAbbrevOp(0)); else { - unsigned MaxEncAlignment = Log2_32(MaxAlignment)+1; + unsigned MaxEncAlignment = getEncodedAlign(MaxAlignment); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, Log2_32_Ceil(MaxEncAlignment+1))); } @@ -1289,7 +1296,7 @@ void ModuleBitcodeWriter::writeModuleInfo() { Vals.push_back(GV.isDeclaration() ? 0 : (VE.getValueID(GV.getInitializer()) + 1)); Vals.push_back(getEncodedLinkage(GV)); - Vals.push_back(Log2_32(GV.getAlignment())+1); + Vals.push_back(getEncodedAlign(GV.getAlign())); Vals.push_back(GV.hasSection() ? SectionMap[std::string(GV.getSection())] : 0); if (GV.isThreadLocal() || @@ -1335,7 +1342,7 @@ void ModuleBitcodeWriter::writeModuleInfo() { Vals.push_back(F.isDeclaration()); Vals.push_back(getEncodedLinkage(F)); Vals.push_back(VE.getAttributeListID(F.getAttributes())); - Vals.push_back(Log2_32(F.getAlignment())+1); + Vals.push_back(getEncodedAlign(F.getAlign())); Vals.push_back(F.hasSection() ? SectionMap[std::string(F.getSection())] : 0); Vals.push_back(getEncodedVisibility(F)); @@ -2945,14 +2952,13 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I, Vals.push_back(VE.getTypeID(AI.getAllocatedType())); Vals.push_back(VE.getTypeID(I.getOperand(0)->getType())); Vals.push_back(VE.getValueID(I.getOperand(0))); // size. 
- unsigned AlignRecord = Log2_32(AI.getAlignment()) + 1; - assert(Log2_32(Value::MaximumAlignment) + 1 < 1 << 5 && - "not enough bits for maximum alignment"); - assert(AlignRecord < 1 << 5 && "alignment greater than 1 << 64"); - AlignRecord |= AI.isUsedWithInAlloca() << 5; - AlignRecord |= 1 << 6; - AlignRecord |= AI.isSwiftError() << 7; - Vals.push_back(AlignRecord); + using APV = AllocaPackedValues; + unsigned Record = 0; + Bitfield::set(Record, getEncodedAlign(AI.getAlign())); + Bitfield::set(Record, AI.isUsedWithInAlloca()); + Bitfield::set(Record, true); + Bitfield::set(Record, AI.isSwiftError()); + Vals.push_back(Record); break; } @@ -2966,7 +2972,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I, AbbrevToUse = FUNCTION_INST_LOAD_ABBREV; } Vals.push_back(VE.getTypeID(I.getType())); - Vals.push_back(Log2_32(cast(I).getAlignment())+1); + Vals.push_back(getEncodedAlign(cast(I).getAlign())); Vals.push_back(cast(I).isVolatile()); if (cast(I).isAtomic()) { Vals.push_back(getEncodedOrdering(cast(I).getOrdering())); @@ -2980,7 +2986,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I, Code = bitc::FUNC_CODE_INST_STORE; pushValueAndType(I.getOperand(1), InstID, Vals); // ptrty + ptr pushValueAndType(I.getOperand(0), InstID, Vals); // valty + val - Vals.push_back(Log2_32(cast(I).getAlignment())+1); + Vals.push_back(getEncodedAlign(cast(I).getAlign())); Vals.push_back(cast(I).isVolatile()); if (cast(I).isAtomic()) { Vals.push_back(getEncodedOrdering(cast(I).getOrdering())); From 974ffee9ccd70703c6edb880ac4934a5dc12e56d Mon Sep 17 00:00:00 2001 From: Kirill Bobyrev Date: Mon, 27 Jul 2020 10:43:38 +0200 Subject: [PATCH 0169/1035] [clangd] Switch from EXPECT_TRUE to ASSERT_TRUE in remote marshalling tests Summary: When dereferencing Optional's it makes sense to use ASSERT_TRUE for better test failures readability. Switch from EXPECT_TRUE to ASSERT_TRUE where it is appropriate. Reviewers: kadircet Reviewed By: kadircet Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, usaxena95, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D84535 Signed-off-by: Kirill Bobyrev --- .../unittests/remote/MarshallingTests.cpp | 55 ++++++++----------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/clang-tools-extra/clangd/unittests/remote/MarshallingTests.cpp b/clang-tools-extra/clangd/unittests/remote/MarshallingTests.cpp index 7db7c03d61c9b..147601b665c4c 100644 --- a/clang-tools-extra/clangd/unittests/remote/MarshallingTests.cpp +++ b/clang-tools-extra/clangd/unittests/remote/MarshallingTests.cpp @@ -49,11 +49,11 @@ TEST(RemoteMarshallingTest, URITranslation) { "clangd/unittests/remote/MarshallingTests.cpp", Strings); auto Serialized = ProtobufMarshaller.toProtobuf(Original); - EXPECT_TRUE(Serialized); + ASSERT_TRUE(Serialized); EXPECT_EQ(Serialized->location().file_path(), "clang-tools-extra/clangd/unittests/remote/MarshallingTests.cpp"); auto Deserialized = ProtobufMarshaller.fromProtobuf(*Serialized); - EXPECT_TRUE(Deserialized); + ASSERT_TRUE(Deserialized); EXPECT_STREQ(Deserialized->Location.FileURI, testPathURI("home/my-projects/llvm-project/clang-tools-extra/" "clangd/unittests/remote/MarshallingTests.cpp", @@ -61,38 +61,34 @@ TEST(RemoteMarshallingTest, URITranslation) { // Can't have empty paths. 
*Serialized->mutable_location()->mutable_file_path() = std::string(); - Deserialized = ProtobufMarshaller.fromProtobuf(*Serialized); - EXPECT_FALSE(Deserialized); + EXPECT_FALSE(ProtobufMarshaller.fromProtobuf(*Serialized)); clangd::Ref WithInvalidURI; // Invalid URI results in serialization failure. WithInvalidURI.Location.FileURI = "This is not a URI"; - Serialized = ProtobufMarshaller.toProtobuf(WithInvalidURI); - EXPECT_FALSE(Serialized); + EXPECT_FALSE(ProtobufMarshaller.toProtobuf(WithInvalidURI)); // Can not use URIs with scheme different from "file". auto UnittestURI = URI::create(testPath("project/lib/HelloWorld.cpp"), "unittest"); - EXPECT_TRUE(bool(UnittestURI)); + ASSERT_TRUE(bool(UnittestURI)); WithInvalidURI.Location.FileURI = Strings.save(UnittestURI->toString()).begin(); - Serialized = ProtobufMarshaller.toProtobuf(WithInvalidURI); - EXPECT_FALSE(Serialized); + EXPECT_FALSE(ProtobufMarshaller.toProtobuf(WithInvalidURI)); // Paths transmitted over the wire can not be absolute, they have to be // relative. Ref WithAbsolutePath; *WithAbsolutePath.mutable_location()->mutable_file_path() = "/usr/local/user/home/HelloWorld.cpp"; - Deserialized = ProtobufMarshaller.fromProtobuf(WithAbsolutePath); - EXPECT_FALSE(Deserialized); + EXPECT_FALSE(ProtobufMarshaller.fromProtobuf(WithAbsolutePath)); } TEST(RemoteMarshallingTest, SymbolSerialization) { clangd::Symbol Sym; auto ID = SymbolID::fromStr("057557CEBF6E6B2D"); - EXPECT_TRUE(bool(ID)); + ASSERT_TRUE(bool(ID)); Sym.ID = *ID; index::SymbolInfo Info; @@ -140,9 +136,9 @@ TEST(RemoteMarshallingTest, SymbolSerialization) { // Check that symbols are exactly the same if the path to indexed project is // the same on indexing machine and the client. auto Serialized = ProtobufMarshaller.toProtobuf(Sym); - EXPECT_TRUE(Serialized); + ASSERT_TRUE(Serialized); auto Deserialized = ProtobufMarshaller.fromProtobuf(*Serialized); - EXPECT_TRUE(Deserialized); + ASSERT_TRUE(Deserialized); EXPECT_EQ(toYAML(Sym), toYAML(*Deserialized)); // Serialized paths are relative and have UNIX slashes. EXPECT_EQ(convert_to_slash(Serialized->definition().file_path(), @@ -154,44 +150,39 @@ TEST(RemoteMarshallingTest, SymbolSerialization) { // Missing definition is OK. Sym.Definition = clangd::SymbolLocation(); Serialized = ProtobufMarshaller.toProtobuf(Sym); - EXPECT_TRUE(Serialized); - Deserialized = ProtobufMarshaller.fromProtobuf(*Serialized); - EXPECT_TRUE(Deserialized); + ASSERT_TRUE(Serialized); + EXPECT_TRUE(ProtobufMarshaller.fromProtobuf(*Serialized)); // Relative path is absolute. *Serialized->mutable_canonical_declaration()->mutable_file_path() = convert_to_slash("/path/to/Declaration.h"); - Deserialized = ProtobufMarshaller.fromProtobuf(*Serialized); - EXPECT_FALSE(Deserialized); + EXPECT_FALSE(ProtobufMarshaller.fromProtobuf(*Serialized)); // Fail with an invalid URI. Location.FileURI = "Not A URI"; Sym.Definition = Location; - Serialized = ProtobufMarshaller.toProtobuf(Sym); - EXPECT_FALSE(Serialized); + EXPECT_FALSE(ProtobufMarshaller.toProtobuf(Sym)); // Schemes other than "file" can not be used. auto UnittestURI = URI::create(testPath("home/SomePath.h"), "unittest"); - EXPECT_TRUE(bool(UnittestURI)); + ASSERT_TRUE(bool(UnittestURI)); Location.FileURI = Strings.save(UnittestURI->toString()).begin(); Sym.Definition = Location; - Serialized = ProtobufMarshaller.toProtobuf(Sym); - EXPECT_FALSE(Serialized); + EXPECT_FALSE(ProtobufMarshaller.toProtobuf(Sym)); // Passing root that is not prefix of the original file path. 
Location.FileURI = testPathURI("home/File.h", Strings); Sym.Definition = Location; // Check that the symbol is valid and passing the correct path works. Serialized = ProtobufMarshaller.toProtobuf(Sym); - EXPECT_TRUE(Serialized); + ASSERT_TRUE(Serialized); Deserialized = ProtobufMarshaller.fromProtobuf(*Serialized); - EXPECT_TRUE(Deserialized); + ASSERT_TRUE(Deserialized); EXPECT_STREQ(Deserialized->Definition.FileURI, testPathURI("home/File.h", Strings)); // Fail with a wrong root. Marshaller WrongMarshaller(testPath("nothome/"), testPath("home/")); - Serialized = WrongMarshaller.toProtobuf(Sym); - EXPECT_FALSE(Serialized); + EXPECT_FALSE(WrongMarshaller.toProtobuf(Sym)); } TEST(RemoteMarshallingTest, RefSerialization) { @@ -214,9 +205,9 @@ TEST(RemoteMarshallingTest, RefSerialization) { testPath("llvm-project/")); auto Serialized = ProtobufMarshaller.toProtobuf(Ref); - EXPECT_TRUE(Serialized); + ASSERT_TRUE(Serialized); auto Deserialized = ProtobufMarshaller.fromProtobuf(*Serialized); - EXPECT_TRUE(Deserialized); + ASSERT_TRUE(Deserialized); EXPECT_EQ(toYAML(Ref), toYAML(*Deserialized)); } @@ -270,11 +261,11 @@ TEST(RemoteMarshallingTest, IncludeHeaderURIs) { Marshaller ProtobufMarshaller(convert_to_slash("/"), convert_to_slash("/")); auto Serialized = ProtobufMarshaller.toProtobuf(Sym); + ASSERT_TRUE(Serialized); EXPECT_EQ(static_cast(Serialized->headers_size()), ValidHeaders.size()); - EXPECT_TRUE(Serialized); auto Deserialized = ProtobufMarshaller.fromProtobuf(*Serialized); - EXPECT_TRUE(Deserialized); + ASSERT_TRUE(Deserialized); Sym.IncludeHeaders = ValidHeaders; EXPECT_EQ(toYAML(Sym), toYAML(*Deserialized)); From 19e472fd84ec75f5323a147bedfffabbb23d0978 Mon Sep 17 00:00:00 2001 From: Alex Richardson Date: Mon, 27 Jul 2020 10:15:17 +0100 Subject: [PATCH 0170/1035] [libcxx][lit] Fix running testsuite with python2.7 after 9020d28688492c437abb648b6ab69baeba523219 Python 2.7 fails with TypeError: unsupported operand type(s) for +: 'NoneType' and 'str' if you pass None as the prefix argument to NamedTemporaryFile. Reviewed By: ldionne, bjope, #libc Differential Revision: https://reviews.llvm.org/D84595 --- libcxx/utils/libcxx/test/dsl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcxx/utils/libcxx/test/dsl.py b/libcxx/utils/libcxx/test/dsl.py index cd500e132946e..95d23df7ece13 100644 --- a/libcxx/utils/libcxx/test/dsl.py +++ b/libcxx/utils/libcxx/test/dsl.py @@ -52,7 +52,7 @@ def _executeScriptInternal(test, commands): res = ('', '', 127, None) return res -def _makeConfigTest(config, testPrefix=None): +def _makeConfigTest(config, testPrefix=''): sourceRoot = os.path.join(config.test_exec_root, '__config_src__') execRoot = os.path.join(config.test_exec_root, '__config_exec__') suite = lit.Test.TestSuite('__config__', sourceRoot, execRoot, config) @@ -83,7 +83,7 @@ def sourceBuilds(config, source): _executeScriptInternal(test, ['rm %t.exe']) return exitCode == 0 -def programOutput(config, program, args=[], testPrefix=None): +def programOutput(config, program, args=[], testPrefix=''): """ Compiles a program for the test target, run it on the test target and return the output. 
From fa1145a8d2f1bd00a60d0ed4572901d2b1403157 Mon Sep 17 00:00:00 2001 From: Isaac Richter Date: Mon, 27 Jul 2020 11:49:24 +0300 Subject: [PATCH 0171/1035] [lld][ELF] Add LOG2CEIL builtin ldscript function This patch adds support for the LOG2CEIL builtin function in linker scripts: https://sourceware.org/binutils/docs/ld/Builtin-Functions.html#index-LOG2CEIL_0028exp_0029 As documented for LD, and to keep compatibility, LOG2CEIL(0) returns 0 (not -inf). The test vectors are somewhat arbitrary. We check minimum values (0-4); middle values (2^32, and 2^32+1); and the maximum value (2^64-1). The checks for LOG2CEIL explicitly use full 64-bit values (16 hex digits). This is needed to properly verify that -inf and other interesting results aren't returned. (For some reason, all other tests in operators.test use only 14 digits.) Differential revision: https://reviews.llvm.org/D84054 --- lld/ELF/ScriptParser.cpp | 10 ++++++++++ lld/test/ELF/linkerscript/operators.test | 16 ++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index fea6b7a274e77..17ac7ff6d5f4a 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/Path.h" #include "llvm/Support/ScopedPrinter.h" #include @@ -1329,6 +1330,15 @@ Expr ScriptParser::readPrimary() { return cmd->getLMA(); }; } + if (tok == "LOG2CEIL") { + expect("("); + Expr a = readExpr(); + expect(")"); + return [=] { + // LOG2CEIL(0) is defined to be 0. + return llvm::Log2_64_Ceil(std::max(a().getValue(), UINT64_C(1))); + }; + } if (tok == "MAX" || tok == "MIN") { expect("("); Expr a = readExpr(); diff --git a/lld/test/ELF/linkerscript/operators.test b/lld/test/ELF/linkerscript/operators.test index 1d40c81321d96..8ba8ee7ea41dc 100644 --- a/lld/test/ELF/linkerscript/operators.test +++ b/lld/test/ELF/linkerscript/operators.test @@ -38,6 +38,14 @@ SECTIONS { minus_abs = _end - _start; max = MAX(11, 22); min = MIN(11, 22); + log2ceil0 = LOG2CEIL(0); + log2ceil1 = LOG2CEIL(1); + log2ceil2 = LOG2CEIL(2); + log2ceil3 = LOG2CEIL(3); + log2ceil4 = LOG2CEIL(4); + log2ceil100000000 = LOG2CEIL(0x100000000); + log2ceil100000001 = LOG2CEIL(0x100000001); + log2ceilmax = LOG2CEIL(0xffffffffffffffff); logicaland1 = 0 && 0; logicaland2 = 0 && 1; logicaland3 = 1 && 0; @@ -78,6 +86,14 @@ SECTIONS { # CHECK-NEXT: 0000000000fff0 A minus_abs # CHECK-NEXT: 00000000000016 A max # CHECK-NEXT: 0000000000000b A min +# CHECK-NEXT: 0000000000000000 A log2ceil0 +# CHECK-NEXT: 0000000000000000 A log2ceil1 +# CHECK-NEXT: 0000000000000001 A log2ceil2 +# CHECK-NEXT: 0000000000000002 A log2ceil3 +# CHECK-NEXT: 0000000000000002 A log2ceil4 +# CHECK-NEXT: 0000000000000020 A log2ceil100000000 +# CHECK-NEXT: 0000000000000021 A log2ceil100000001 +# CHECK-NEXT: 0000000000000040 A log2ceilmax # CHECK-NEXT: 00000000000000 A logicaland1 # CHECK-NEXT: 00000000000000 A logicaland2 # CHECK-NEXT: 00000000000000 A logicaland3 From 8b74596b7e7c82396efdf95858fcec615a656caf Mon Sep 17 00:00:00 2001 From: Afanasyev Ivan Date: Mon, 27 Jul 2020 10:19:55 +0100 Subject: [PATCH 0172/1035] [Docs] remove unused arguments in documentation examples on vectorization passes Reviewers: nadav, tyler.nowicki Reviewed By: nadav Differential Revision: https://reviews.llvm.org/D83851 --- llvm/docs/Vectorizers.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) 
diff --git a/llvm/docs/Vectorizers.rst b/llvm/docs/Vectorizers.rst index c322797025fb6..702090447c692 100644 --- a/llvm/docs/Vectorizers.rst +++ b/llvm/docs/Vectorizers.rst @@ -193,7 +193,7 @@ reduction operations, such as addition, multiplication, XOR, AND and OR. .. code-block:: c++ - int foo(int *A, int *B, int n) { + int foo(int *A, int n) { unsigned sum = 0; for (int i = 0; i < n; ++i) sum += A[i] + 5; @@ -210,7 +210,7 @@ array. The Loop Vectorizer knows to vectorize induction variables. .. code-block:: c++ - void bar(float *A, float* B, float K, int n) { + void bar(float *A, int n) { for (int i = 0; i < n; ++i) A[i] = i; } @@ -254,7 +254,7 @@ The Loop Vectorizer can vectorize loops that count backwards. .. code-block:: c++ - int foo(int *A, int *B, int n) { + int foo(int *A, int n) { for (int i = n; i > 0; --i) A[i] +=1; } @@ -284,7 +284,7 @@ vectorization is profitable. .. code-block:: c++ - int foo(int *A, char *B, int n, int k) { + int foo(int *A, char *B, int n) { for (int i = 0; i < n; ++i) A[i] += 4 * B[i]; } @@ -360,7 +360,7 @@ to be used simultaneously. .. code-block:: c++ - int foo(int *A, int *B, int n) { + int foo(int *A, int n) { unsigned sum = 0; for (int i = 0; i < n; ++i) sum += A[i]; From 37ac559fccd46dcec246ceb3907c8d3910728c69 Mon Sep 17 00:00:00 2001 From: Kirill Bobyrev Date: Mon, 27 Jul 2020 11:21:55 +0200 Subject: [PATCH 0173/1035] [clangd] Add option to use remote index as static index Reviewers: hokein Reviewed By: hokein Subscribers: usaxena95, mgorny, ilya-biryukov, MaskRay, jkorous, arphaman, kadircet, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D83817 --- clang-tools-extra/clangd/Features.inc.in | 1 + clang-tools-extra/clangd/tool/CMakeLists.txt | 1 + clang-tools-extra/clangd/tool/ClangdMain.cpp | 33 ++++++++++++++++++++ 3 files changed, 35 insertions(+) diff --git a/clang-tools-extra/clangd/Features.inc.in b/clang-tools-extra/clangd/Features.inc.in index da75aa67a65b8..8584b87c62051 100644 --- a/clang-tools-extra/clangd/Features.inc.in +++ b/clang-tools-extra/clangd/Features.inc.in @@ -1 +1,2 @@ #define CLANGD_BUILD_XPC @CLANGD_BUILD_XPC@ +#define CLANGD_ENABLE_REMOTE @CLANGD_ENABLE_REMTE@ diff --git a/clang-tools-extra/clangd/tool/CMakeLists.txt b/clang-tools-extra/clangd/tool/CMakeLists.txt index 3368013f50798..670e5a17013ab 100644 --- a/clang-tools-extra/clangd/tool/CMakeLists.txt +++ b/clang-tools-extra/clangd/tool/CMakeLists.txt @@ -27,6 +27,7 @@ clang_target_link_libraries(clangd clangToolingCore clangToolingRefactoring clangToolingSyntax + clangdRemoteIndex ) target_link_libraries(clangd PRIVATE diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp index 7bce1c062e817..8d1bf5c422605 100644 --- a/clang-tools-extra/clangd/tool/ClangdMain.cpp +++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp @@ -14,6 +14,7 @@ #include "Transport.h" #include "index/Background.h" #include "index/Serialization.h" +#include "index/remote/Client.h" #include "refactor/Rename.h" #include "support/Path.h" #include "support/Shutdown.h" @@ -449,6 +450,21 @@ opt EnableConfig{ init(true), }; +#ifdef CLANGD_ENABLE_REMOTE +opt RemoteIndexAddress{ + "remote-index-address", + cat(Features), + desc("Address of the remote index server"), +}; + +// FIXME(kirillbobyrev): Should this be the location of compile_commands.json? +opt ProjectRoot{ + "project-root", + cat(Features), + desc("Path to the project root. 
Requires remote-index-address to be set."), }; #endif + /// Supports a test URI scheme with relaxed constraints for lit tests. /// The path in a test URI will be combined with a platform-specific fake /// directory to form an absolute path. For example, test:///a.cpp is resolved @@ -680,6 +696,23 @@ clangd accepts flags on the commandline, and in the CLANGD_FLAGS environment var if (Sync) AsyncIndexLoad.wait(); } +#ifdef CLANGD_ENABLE_REMOTE + if (RemoteIndexAddress.empty() != ProjectRoot.empty()) { + llvm::errs() << "remote-index-address and project-root have to be " + "specified at the same time."; + return 1; + } + if (!RemoteIndexAddress.empty()) { + if (IndexFile.empty()) { + log("Connecting to remote index at {0}", RemoteIndexAddress); + StaticIdx = remote::getClient(RemoteIndexAddress, ProjectRoot); + EnableBackgroundIndex = false; + } else { + elog("When enabling remote index, IndexFile should not be specified. " + "Only one can be used at a time. Remote index will be ignored."); + } + } +#endif Opts.StaticIndex = StaticIdx.get(); Opts.AsyncThreadsCount = WorkerThreadsCount; Opts.BuildRecoveryAST = RecoveryAST; From 2e828e7579928e8cc1c5e53c84ab99ffb5afca03 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Mon, 27 Jul 2020 11:46:51 +0200 Subject: [PATCH 0174/1035] [lldb] Fix e89414f406 for msvc MSVC finds the APInt construction ambiguous. Use a cast to help it choose the right constructor. --- lldb/include/lldb/Utility/Scalar.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/lldb/include/lldb/Utility/Scalar.h b/lldb/include/lldb/Utility/Scalar.h index 1dbcf80bfd89c..45ba7c012229b 100644 --- a/lldb/include/lldb/Utility/Scalar.h +++ b/lldb/include/lldb/Utility/Scalar.h @@ -62,18 +62,23 @@ class Scalar { // Constructors and Destructors Scalar() : m_type(e_void), m_float(0.0f) {} Scalar(int v) - : m_type(e_sint), m_integer(sizeof(v) * 8, v, true), m_float(0.0f) {} + : m_type(e_sint), m_integer(sizeof(v) * 8, uint64_t(v), true), + m_float(0.0f) {} Scalar(unsigned int v) - : m_type(e_uint), m_integer(sizeof(v) * 8, v, false), m_float(0.0f) {} + : m_type(e_uint), m_integer(sizeof(v) * 8, uint64_t(v), false), + m_float(0.0f) {} Scalar(long v) - : m_type(e_slong), m_integer(sizeof(v) * 8, v, true), m_float(0.0f) {} + : m_type(e_slong), m_integer(sizeof(v) * 8, uint64_t(v), true), + m_float(0.0f) {} Scalar(unsigned long v) - : m_type(e_ulong), m_integer(sizeof(v) * 8, v, false), m_float(0.0f) {} + : m_type(e_ulong), m_integer(sizeof(v) * 8, uint64_t(v), false), + m_float(0.0f) {} Scalar(long long v) - : m_type(e_slonglong), m_integer(sizeof(v) * 8, v, true), m_float(0.0f) {} + : m_type(e_slonglong), m_integer(sizeof(v) * 8, uint64_t(v), true), + m_float(0.0f) {} Scalar(unsigned long long v) - : m_type(e_ulonglong), m_integer(sizeof(v) * 8, v, false), m_float(0.0f) { - } + : m_type(e_ulonglong), m_integer(sizeof(v) * 8, uint64_t(v), false), + m_float(0.0f) {} Scalar(float v) : m_type(e_float), m_float(v) {} Scalar(double v) : m_type(e_double), m_float(v) {} Scalar(long double v) : m_type(e_long_double), m_float(double(v)) { From c891f519e17e38ec466deab1730c48e9d7a2eca8 Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Mon, 27 Jul 2020 19:04:50 +0900 Subject: [PATCH 0175/1035] [JumpThreading] Add a test that threads jumps with frozen branch conditions --- .../freeze-lvi-edgevaluelocal.ll | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 llvm/test/Transforms/JumpThreading/freeze-lvi-edgevaluelocal.ll diff --git
a/llvm/test/Transforms/JumpThreading/freeze-lvi-edgevaluelocal.ll b/llvm/test/Transforms/JumpThreading/freeze-lvi-edgevaluelocal.ll new file mode 100644 index 0000000000000..07dd956777f68 --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/freeze-lvi-edgevaluelocal.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -jump-threading -S < %s | FileCheck %s + +declare void @f() +declare void @f2() +declare void @f3() + +; br on cond.fr should be removed & ENTRY -> A -> B should be threaded +define i32 @simple(i1 %cond) { +; CHECK-LABEL: @simple( +; CHECK-NEXT: ENTRY: +; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[COND:%.*]] +; CHECK-NEXT: br i1 [[COND]], label [[A:%.*]], label [[EXIT:%.*]] +; CHECK: A: +; CHECK-NEXT: br i1 [[COND_FR]], label [[B:%.*]], label [[EXIT]] +; CHECK: B: +; CHECK-NEXT: call void @f() +; CHECK-NEXT: ret i32 1 +; CHECK: EXIT: +; CHECK-NEXT: ret i32 0 +; +ENTRY: + %cond.fr = freeze i1 %cond + br i1 %cond, label %A, label %EXIT +A: + br i1 %cond.fr, label %B, label %EXIT +B: + call void @f() + ret i32 1 +EXIT: + ret i32 0 +} + +define void @switch(i32 %cond) { +; CHECK-LABEL: @switch( +; CHECK-NEXT: ENTRY: +; CHECK-NEXT: [[COND_FR:%.*]] = freeze i32 [[COND:%.*]] +; CHECK-NEXT: switch i32 [[COND]], label [[DEFAULT:%.*]] [ +; CHECK-NEXT: i32 0, label [[A:%.*]] +; CHECK-NEXT: i32 1, label [[B:%.*]] +; CHECK-NEXT: ] +; CHECK: DEFAULT: +; CHECK-NEXT: switch i32 [[COND_FR]], label [[PRESERVED1:%.*]] [ +; CHECK-NEXT: i32 0, label [[PRESERVED2:%.*]] +; CHECK-NEXT: ] +; CHECK: PRESERVED1: +; CHECK-NEXT: call void @f() +; CHECK-NEXT: ret void +; CHECK: PRESERVED2: +; CHECK-NEXT: call void @f2() +; CHECK-NEXT: ret void +; CHECK: A: +; CHECK-NEXT: switch i32 [[COND_FR]], label [[A_NOTTAKEN:%.*]] [ +; CHECK-NEXT: i32 0, label [[A_TAKEN:%.*]] +; CHECK-NEXT: ] +; CHECK: A_TAKEN: +; CHECK-NEXT: call void @f() +; CHECK-NEXT: ret void +; CHECK: A_NOTTAKEN: +; CHECK-NEXT: call void @f2() +; CHECK-NEXT: ret void +; CHECK: B: +; CHECK-NEXT: switch i32 [[COND_FR]], label [[B_TAKEN:%.*]] [ +; CHECK-NEXT: i32 0, label [[B_NOTTAKEN:%.*]] +; CHECK-NEXT: ] +; CHECK: B_TAKEN: +; CHECK-NEXT: call void @f() +; CHECK-NEXT: ret void +; CHECK: B_NOTTAKEN: +; CHECK-NEXT: call void @f2() +; CHECK-NEXT: ret void +; +ENTRY: + %cond.fr = freeze i32 %cond + switch i32 %cond, label %DEFAULT [ + i32 0, label %A + i32 1, label %B + ] +DEFAULT: + switch i32 %cond.fr, label %PRESERVED1 [ + i32 0, label %PRESERVED2 + ] +PRESERVED1: + call void @f() + ret void +PRESERVED2: + call void @f2() + ret void + +A: + switch i32 %cond.fr, label %A_NOTTAKEN [ + i32 0, label %A_TAKEN + ] +A_TAKEN: + call void @f() + ret void +A_NOTTAKEN: + call void @f2() + ret void + +B: + switch i32 %cond.fr, label %B_TAKEN [ + i32 0, label %B_NOTTAKEN + ] +B_TAKEN: + call void @f() + ret void +B_NOTTAKEN: + call void @f2() + ret void +} From 6701c0bf730157775736c1314758e89bfdccb14a Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Mon, 27 Jul 2020 19:08:45 +0900 Subject: [PATCH 0176/1035] [JumpThreading] Add a test case that has a phi with undef; NFC --- llvm/test/Transforms/JumpThreading/freeze.ll | 44 ++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/llvm/test/Transforms/JumpThreading/freeze.ll b/llvm/test/Transforms/JumpThreading/freeze.ll index 37d2ad672f789..3c6aa98c32b84 100644 --- a/llvm/test/Transforms/JumpThreading/freeze.ll +++ b/llvm/test/Transforms/JumpThreading/freeze.ll @@ -49,6 +49,50 @@ F2: ret i32 %B } +define i32 @test1_undef(i1 %cond) { +; 
CHECK-LABEL: @test1_undef( +; CHECK-NEXT: br i1 [[COND:%.*]], label [[T1:%.*]], label [[F1:%.*]] +; CHECK: T1: +; CHECK-NEXT: [[V1:%.*]] = call i32 @f1() +; CHECK-NEXT: br label [[MERGE:%.*]] +; CHECK: F1: +; CHECK-NEXT: [[V2:%.*]] = call i32 @f2() +; CHECK-NEXT: br label [[MERGE]] +; CHECK: Merge: +; CHECK-NEXT: [[A:%.*]] = phi i1 [ true, [[T1]] ], [ undef, [[F1]] ] +; CHECK-NEXT: [[B:%.*]] = phi i32 [ [[V1]], [[T1]] ], [ [[V2]], [[F1]] ] +; CHECK-NEXT: [[A_FR:%.*]] = freeze i1 [[A]] +; CHECK-NEXT: br i1 [[A_FR]], label [[T2:%.*]], label [[F2:%.*]] +; CHECK: T2: +; CHECK-NEXT: call void @f3() +; CHECK-NEXT: ret i32 [[B]] +; CHECK: F2: +; CHECK-NEXT: ret i32 [[B]] +; + br i1 %cond, label %T1, label %F1 + +T1: + %v1 = call i32 @f1() + br label %Merge + +F1: + %v2 = call i32 @f2() + br label %Merge + +Merge: + %A = phi i1 [true, %T1], [undef, %F1] + %B = phi i32 [%v1, %T1], [%v2, %F1] + %A.fr = freeze i1 %A + br i1 %A.fr, label %T2, label %F2 + +T2: + call void @f3() + ret i32 %B + +F2: + ret i32 %B +} + define i32 @test2(i1 %cond, i1 %cond2) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: br i1 [[COND:%.*]], label [[T1:%.*]], label [[F1:%.*]] From 1956cf1042d3c406d9e9cefe47d3b43adf2bdbe1 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Tue, 7 Jul 2020 15:28:24 +0200 Subject: [PATCH 0177/1035] [lldb/DWARF] Don't treat class declarations with children as definitions Summary: This effectively reverts r188124, which added code to handle (DW_AT_)declarations of structures with some kinds of children as definitions. The commit message claims this is a workaround for some kind of debug info produced by gcc. However, it does not go into specifics, so it's hard to reproduce or verify that this is indeed still a problem. Having this code is definitely a problem though, because it mistakenly declares incomplete DWARF declarations to be complete. Both clang (with -flimit-debug-info) and gcc (by default) generate DW_AT_declarations of structs with children. This happens when full debug info for a class is not emitted in a given compile unit (e.g. because of vtable homing), but the class has inline methods which are used in the given compile unit. In that case, the compilers emit a DW_AT_declaration of a class, but add a DW_TAG_subprogram child to it to describe the inlined instance of the method. Even though the class tag has some children, it definitely does not contain enough information to construct a full class definition (most notably, it lacks any members). Keeping the class as incomplete allows us to search for a real definition in other modules, helping the -flimit-debug-info flow. And in case the definition is not found we can display an error message saying that, instead of just showing an empty struct.
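To make the problematic input concrete, the following is a sketch (illustrative only, not taken from the patch) of the kind of source that makes clang with -flimit-debug-info, or gcc by default, emit a DW_AT_declaration that nevertheless has a DW_TAG_subprogram child:

```c++
// widget.h -- hypothetical header; the virtual destructor keys the vtable
// (and with it the full class debug info) to the defining TU only.
struct Widget {
  int member;
  int value() const { return member; } // inline method used below
  virtual ~Widget();
};

// user.cpp -- built with e.g.: clang++ -g -flimit-debug-info -c user.cpp
// This TU emits only a DW_AT_declaration DIE for Widget, but because value()
// is inlined here, that declaration gains a DW_TAG_subprogram child:
// children, yet no DW_TAG_member entries to build a definition from.
#include "widget.h"
int use(const Widget &w) { return w.value(); }
```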
Reviewers: clayborg, aprantl, JDevlieghere, shafik Subscribers: lldb-commits Tags: #lldb Differential Revision: https://reviews.llvm.org/D83302 --- .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 27 --- .../limit-debug-info/TestLimitDebugInfo.py | 25 ++- .../functionalities/limit-debug-info/main.cpp | 5 +- .../functionalities/limit-debug-info/one.cpp | 3 + .../functionalities/limit-debug-info/onetwo.h | 15 ++ .../functionalities/limit-debug-info/two.cpp | 4 + .../DWARF/DW_AT_declaration-with-children.s | 160 ++++++++++++++++++ 7 files changed, 208 insertions(+), 31 deletions(-) create mode 100644 lldb/test/Shell/SymbolFile/DWARF/DW_AT_declaration-with-children.s diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index 35e7c34734e2c..7e3628504727a 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -1641,33 +1641,6 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, dwarf->GetUniqueDWARFASTTypeMap().Insert(unique_typename, *unique_ast_entry_up); - if (attrs.is_forward_declaration && die.HasChildren()) { - // Check to see if the DIE actually has a definition, some version of - // GCC will - // emit DIEs with DW_AT_declaration set to true, but yet still have - // subprogram, members, or inheritance, so we can't trust it - DWARFDIE child_die = die.GetFirstChild(); - while (child_die) { - switch (child_die.Tag()) { - case DW_TAG_inheritance: - case DW_TAG_subprogram: - case DW_TAG_member: - case DW_TAG_APPLE_property: - case DW_TAG_class_type: - case DW_TAG_structure_type: - case DW_TAG_enumeration_type: - case DW_TAG_typedef: - case DW_TAG_union_type: - child_die.Clear(); - attrs.is_forward_declaration = false; - break; - default: - child_die = child_die.GetSibling(); - break; - } - } - } - if (!attrs.is_forward_declaration) { // Always start the definition for a class type so that if the class // has child classes or types that require the class to be created diff --git a/lldb/test/API/functionalities/limit-debug-info/TestLimitDebugInfo.py b/lldb/test/API/functionalities/limit-debug-info/TestLimitDebugInfo.py index 9408ad6eee1d1..aa383d0005e41 100644 --- a/lldb/test/API/functionalities/limit-debug-info/TestLimitDebugInfo.py +++ b/lldb/test/API/functionalities/limit-debug-info/TestLimitDebugInfo.py @@ -38,7 +38,8 @@ def test_one_and_two_debug(self): self._check_debug_info_is_limited(target) - self.registerSharedLibrariesWithTarget(target, ["one", "two"]) + lldbutil.run_to_name_breakpoint(self, "main", + extra_images=["one", "two"]) # But when other shared libraries are loaded, we should be able to see # all members. 
@@ -58,6 +59,10 @@ def test_one_and_two_debug(self): self.expect_expr("array_of_two[2].one[2].member", result_value="174") self.expect_expr("array_of_two[2].member", result_value="274") + self.expect_expr("get_one().member", result_value="124") + self.expect_expr("get_two().one().member", result_value="124") + self.expect_expr("get_two().member", result_value="224") + @skipIf(bugnumber="pr46284", debug_info="gmodules") @skipIfWindows # Clang emits type info even with -flimit-debug-info def test_two_debug(self): @@ -66,7 +71,8 @@ def test_two_debug(self): self._check_debug_info_is_limited(target) - self.registerSharedLibrariesWithTarget(target, ["one", "two"]) + lldbutil.run_to_name_breakpoint(self, "main", + extra_images=["one", "two"]) # This time, we should only see the members from the second library. self.expect_expr("inherits_from_one.member", result_value="47") @@ -91,6 +97,12 @@ def test_two_debug(self): substrs=["no member named 'member' in 'array::One'"]) self.expect_expr("array_of_two[2].member", result_value="274") + self.expect("expr get_one().member", error=True, + substrs=["calling 'get_one' with incomplete return type 'result::One'"]) + self.expect("expr get_two().one().member", error=True, + substrs=["calling 'one' with incomplete return type 'result::One'"]) + self.expect_expr("get_two().member", result_value="224") + @skipIf(bugnumber="pr46284", debug_info="gmodules") @skipIfWindows # Clang emits type info even with -flimit-debug-info def test_one_debug(self): @@ -99,7 +111,8 @@ def test_one_debug(self): self._check_debug_info_is_limited(target) - self.registerSharedLibrariesWithTarget(target, ["one", "two"]) + lldbutil.run_to_name_breakpoint(self, "main", + extra_images=["one", "two"]) # In this case we should only see the members from the second library. 
# Note that we cannot see inherits_from_two.one because without debug @@ -126,3 +139,9 @@ def test_one_debug(self): substrs=["no member named 'one' in 'array::Two'"]) self.expect("expr array_of_two[2].member", error=True, substrs=["no member named 'member' in 'array::Two'"]) + + self.expect_expr("get_one().member", result_value="124") + self.expect("expr get_two().one().member", error=True, + substrs=["calling 'get_two' with incomplete return type 'result::Two'"]) + self.expect("expr get_two().member", error=True, + substrs=["calling 'get_two' with incomplete return type 'result::Two'"]) diff --git a/lldb/test/API/functionalities/limit-debug-info/main.cpp b/lldb/test/API/functionalities/limit-debug-info/main.cpp index 0a25de13d9fb2..1aad7e6f1e610 100644 --- a/lldb/test/API/functionalities/limit-debug-info/main.cpp +++ b/lldb/test/API/functionalities/limit-debug-info/main.cpp @@ -25,4 +25,7 @@ struct TwoAsMember { array::One array_of_one[3]; array::Two array_of_two[3]; -int main() { return 0; } +result::One get_one() { return result::One(124); } +result::Two get_two() { return result::Two(224); } + +int main() { return get_one().member; } diff --git a/lldb/test/API/functionalities/limit-debug-info/one.cpp b/lldb/test/API/functionalities/limit-debug-info/one.cpp index c1eb6310dd679..70353a084edc5 100644 --- a/lldb/test/API/functionalities/limit-debug-info/one.cpp +++ b/lldb/test/API/functionalities/limit-debug-info/one.cpp @@ -3,3 +3,6 @@ One::~One() = default; member::One::~One() = default; array::One::~One() = default; + +result::One::One(int member) : member(member) {} +result::One::~One() = default; diff --git a/lldb/test/API/functionalities/limit-debug-info/onetwo.h b/lldb/test/API/functionalities/limit-debug-info/onetwo.h index 67609dd7ff61c..24a18f6a5dcc3 100644 --- a/lldb/test/API/functionalities/limit-debug-info/onetwo.h +++ b/lldb/test/API/functionalities/limit-debug-info/onetwo.h @@ -39,3 +39,18 @@ struct Two { virtual ~Two(); }; } // namespace array + +namespace result { +struct One { + int member; + One(int member); + virtual ~One(); +}; + +struct Two { + int member; + Two(int member); + One one() const; + virtual ~Two(); +}; +} // namespace result diff --git a/lldb/test/API/functionalities/limit-debug-info/two.cpp b/lldb/test/API/functionalities/limit-debug-info/two.cpp index 04683da6e9cce..468cb91f6a5c3 100644 --- a/lldb/test/API/functionalities/limit-debug-info/two.cpp +++ b/lldb/test/API/functionalities/limit-debug-info/two.cpp @@ -3,3 +3,7 @@ Two::~Two() = default; member::Two::~Two() = default; array::Two::~Two() = default; + +result::Two::Two(int member) : member(member) {} +result::Two::~Two() = default; +result::One result::Two::one() const { return One(member - 100); } diff --git a/lldb/test/Shell/SymbolFile/DWARF/DW_AT_declaration-with-children.s b/lldb/test/Shell/SymbolFile/DWARF/DW_AT_declaration-with-children.s new file mode 100644 index 0000000000000..7ed33ce50297b --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/DW_AT_declaration-with-children.s @@ -0,0 +1,160 @@ +# Test that a forward-declared (DW_AT_declaration) structure is treated as a +# forward-declaration even if it has children. These types can be produced due +# to vtable-based type homing, or other -flimit-debug-info optimizations. 
+ +# REQUIRES: x86 + +# RUN: llvm-mc --triple x86_64-pc-linux %s --filetype=obj > %t +# RUN: %lldb %t -o "expr a" -o exit 2>&1 | FileCheck %s --check-prefix=EXPR +# RUN: %lldb %t -o "target var a" -o exit 2>&1 | FileCheck %s --check-prefix=VAR + +# EXPR: incomplete type 'A' where a complete type is required + +# FIXME: This should also produce some kind of an error. +# VAR: (A) a = {} + + .text +_ZN1AC2Ev: + retq +.LZN1AC2Ev_end: + + .data +a: + .quad $_ZTV1A+16 + .quad $0xdeadbeef + + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 8 # DW_FORM_string + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 1 # DW_CHILDREN_yes + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 60 # DW_AT_declaration + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 60 # DW_AT_declaration + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 5 # DW_TAG_formal_parameter + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 52 # DW_AT_artificial + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 8 # Abbreviation Code + .byte 15 # DW_TAG_pointer_type + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 10 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 100 # DW_AT_object_pointer + .byte 19 # DW_FORM_ref4 + .byte 71 # DW_AT_specification + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 11 # Abbreviation Code + .byte 5 # DW_TAG_formal_parameter + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 52 # DW_AT_artificial + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] DW_TAG_compile_unit + .asciz "Hand-written DWARF" # DW_AT_producer + .quad _ZN1AC2Ev # DW_AT_low_pc + .long .LZN1AC2Ev_end-_ZN1AC2Ev # DW_AT_high_pc + .byte 2 # Abbrev [2] DW_TAG_variable + .asciz "a" # DW_AT_name + .long .LA-.Lcu_begin0 # DW_AT_type + .byte 9 # DW_AT_location + .byte 3 + .quad a +.LA: + .byte 3 # Abbrev [3] DW_TAG_structure_type + .asciz "A" # DW_AT_name + # DW_AT_declaration + .byte 4 # Abbrev [4] DW_TAG_subprogram + .asciz "A" # DW_AT_name + # DW_AT_declaration + .byte 5 # Abbrev [5] DW_TAG_formal_parameter + .long .LAptr-.Lcu_begin0 # DW_AT_type + # DW_AT_artificial + .byte 0 # End Of Children Mark + .byte 0 # End Of Children Mark +.LAptr: + .byte 8 # Abbrev [8] DW_TAG_pointer_type + .long .LA-.Lcu_begin0 # DW_AT_type + .byte 10 # Abbrev [10] DW_TAG_subprogram + .quad _ZN1AC2Ev # DW_AT_low_pc + .long .LZN1AC2Ev_end-_ZN1AC2Ev # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .long 147 # DW_AT_object_pointer + .long 68 # DW_AT_specification + .byte 11 # Abbrev [11] DW_TAG_formal_parameter + .byte 2 # DW_AT_location + .byte 145 + .byte 120 + .asciz "this" # DW_AT_name + .long .LAptr-.Lcu_begin0 # DW_AT_type + # DW_AT_artificial + .byte 0 # End Of Children Mark + .byte 0 # End Of Children Mark +.Ldebug_info_end0: From a51829913dba28dae603fdcdddd242c7e20192a1 Mon Sep 17 00:00:00 2001 From: Alex Zinenko Date: Wed, 22 Jul 2020 13:03:24 +0200 Subject: [PATCH 0178/1035] [mlir] Support for mutable types Introduce support for mutable storage in the StorageUniquer infrastructure. This makes MLIR have key-value storage instead of just uniqued key storage. A storage instance now contains a unique immutable key and a mutable value, both stored in the arena allocator that belongs to the context. This is a preconditio for supporting recursive types that require delayed initialization, in particular LLVM structure types. The functionality is exercised in the test pass with trivial self-recursive type. So far, recursive types can only be printed in parsed in a closed type system. Removing this restriction is left for future work. Differential Revision: https://reviews.llvm.org/D84171 --- .../Tutorials/DefiningAttributesAndTypes.md | 131 +++++++++++++++++- mlir/include/mlir/IR/AttributeSupport.h | 7 + mlir/include/mlir/IR/Attributes.h | 8 +- mlir/include/mlir/IR/StorageUniquerSupport.h | 8 ++ mlir/include/mlir/IR/TypeSupport.h | 9 ++ mlir/include/mlir/IR/Types.h | 16 ++- mlir/include/mlir/Support/StorageUniquer.h | 30 ++++ mlir/lib/Support/StorageUniquer.cpp | 16 +++ mlir/test/IR/recursive-type.mlir | 16 +++ mlir/test/lib/Dialect/Test/TestDialect.cpp | 67 ++++++++- mlir/test/lib/Dialect/Test/TestTypes.h | 54 ++++++++ mlir/test/lib/IR/CMakeLists.txt | 1 + mlir/test/lib/IR/TestTypes.cpp | 78 +++++++++++ mlir/tools/mlir-opt/mlir-opt.cpp | 2 + 14 files changed, 425 insertions(+), 18 deletions(-) create mode 100644 mlir/test/IR/recursive-type.mlir create mode 100644 mlir/test/lib/IR/TestTypes.cpp diff --git a/mlir/docs/Tutorials/DefiningAttributesAndTypes.md b/mlir/docs/Tutorials/DefiningAttributesAndTypes.md index cab2441b33200..45756e1a31eab 100644 --- a/mlir/docs/Tutorials/DefiningAttributesAndTypes.md +++ b/mlir/docs/Tutorials/DefiningAttributesAndTypes.md @@ -47,7 +47,8 @@ namespace MyTypes { enum Kinds { // These kinds will be used in the examples below. 
Simple = Type::Kind::FIRST_PRIVATE_EXPERIMENTAL_0_TYPE, - Complex + Complex, + Recursive }; } ``` @@ -58,13 +59,17 @@ As described above, `Type` objects in MLIR are value-typed and rely on having an implicitly internal storage object that holds the actual data for the type. When defining a new `Type` it isn't always necessary to define a new storage class. So before defining the derived `Type`, it's important to know which of the two -classes of `Type` we are defining. Some types are `primitives` meaning they do +classes of `Type` we are defining. Some types are _primitives_ meaning they do not have any parameters and are singletons uniqued by kind, like the [`index` type](LangRef.md#index-type). Parametric types on the other hand, have additional information that differentiates different instances of the same `Type` kind. For example the [`integer` type](LangRef.md#integer-type) has a bitwidth, making `i8` and `i16` be different instances of -[`integer` type](LangRef.md#integer-type). +[`integer` type](LangRef.md#integer-type). Types can also have a mutable +component, which can be used, for example, to construct self-referring recursive +types. The mutable component _cannot_ be used to differentiate types within the +same kind, so usually such types are also parametric where the parameters serve +to identify them. #### Simple non-parametric types @@ -240,6 +245,126 @@ public: }; ``` +#### Types with a mutable component + +Types with a mutable component require defining a type storage class regardless +of being parametric. The storage contains both the parameters and the mutable +component and is accessed in a thread-safe way by the type support +infrastructure. + +##### Defining a type storage + +In addition to the requirements for the type storage class for parametric types, +the storage class for types with a mutable component must additionally obey the +following. + +* The mutable component must not participate in the storage key. +* Provide a mutation method that is used to modify an existing instance of the + storage. This method modifies the mutable component based on arguments, + using `allocator` for any new dynamically-allocated storage, and indicates + whether the modification was successful. + - `LogicalResult mutate(StorageAllocator &allocator, Args ...&& args)` + +Let's define a simple storage for recursive types, where a type is identified by +its name and can contain another type including itself. + +```c++ +/// Here we define a storage class for a RecursiveType that is identified by its +/// name and contains another type. +struct RecursiveTypeStorage : public TypeStorage { + /// The type is uniquely identified by its name. Note that the contained type + /// is _not_ a part of the key. + using KeyTy = StringRef; + + /// Construct the storage from the type name. Explicitly initialize the + /// containedType to nullptr, which is used as marker for the mutable + /// component being not yet initialized. + RecursiveTypeStorage(StringRef name) : name(name), containedType(nullptr) {} + + /// Define the comparison function. + bool operator==(const KeyTy &key) const { return key == name; } + + /// Define a construction method for creating a new instance of the storage. + static RecursiveTypeStorage *construct(StorageAllocator &allocator, + const KeyTy &key) { + // Note that the key string is copied into the allocator to ensure it + // remains live as long as the storage itself. 
+ return new (allocator.allocate<RecursiveTypeStorage>()) + RecursiveTypeStorage(allocator.copyInto(key)); + } + + /// Define a mutation method for changing the type after it is created. In + /// many cases, we only want to set the mutable component once and reject + /// any further modification, which can be achieved by returning failure from + /// this function. + LogicalResult mutate(StorageAllocator &, Type body) { + // If the contained type has been initialized already, and the call tries + // to change it, reject the change. + if (containedType && containedType != body) + return failure(); + + // Change the body successfully. + containedType = body; + return success(); + } + + StringRef name; + Type containedType; +}; +``` + +##### Type class definition + +Having defined the storage class, we can define the type class itself. This is +similar to parametric types. `Type::TypeBase` provides a `mutate` method that +forwards its arguments to the `mutate` method of the storage and ensures the +modification happens under lock. + +```c++ +class RecursiveType : public Type::TypeBase<RecursiveType, Type, RecursiveTypeStorage> { +public: + /// Inherit parent constructors. + using Base::Base; + + /// This static method is used to support type inquiry through isa, cast, + /// and dyn_cast. + static bool kindof(unsigned kind) { return kind == MyTypes::Recursive; } + + /// Creates an instance of the Recursive type. This only takes the type name + /// and returns the type with uninitialized body. + static RecursiveType get(MLIRContext *ctx, StringRef name) { + // Call into the base to get a uniqued instance of this type. The parameter + // (name) is passed after the kind. + return Base::get(ctx, MyTypes::Recursive, name); + } + + /// Now we can change the mutable component of the type. This is an instance + /// method callable on an already existing RecursiveType. + void setBody(Type body) { + // Call into the base to mutate the type. + LogicalResult result = Base::mutate(body); + // Most types expect mutation to always succeed, but types can implement + // custom logic for handling mutation failures. + assert(succeeded(result) && + "attempting to change the body of an already-initialized type"); + // Avoid unused-variable warning when building without assertions. + (void) result; + } + + /// Returns the contained type, which may be null if it has not been + /// initialized yet. + Type getBody() { + return getImpl()->containedType; + } + + /// Returns the name. + StringRef getName() { + return getImpl()->name; + } +}; +``` + ### Registering types with a Dialect Once the dialect types have been defined, they must then be registered with a diff --git a/mlir/include/mlir/IR/AttributeSupport.h b/mlir/include/mlir/IR/AttributeSupport.h index cd722bb0f2c5b..72a89be438673 100644 --- a/mlir/include/mlir/IR/AttributeSupport.h +++ b/mlir/include/mlir/IR/AttributeSupport.h @@ -139,6 +139,13 @@ class AttributeUniquer { kind, std::forward<Args>(args)...); } + template <typename ImplType, typename... Args> + static LogicalResult mutate(MLIRContext *ctx, ImplType *impl, + Args &&...args) { + assert(impl && "cannot mutate null attribute"); + return ctx->getAttributeUniquer().mutate(impl, std::forward<Args>(args)...); + } + private: /// Initialize the given attribute storage instance.
static void initializeAttributeStorage(AttributeStorage *storage, diff --git a/mlir/include/mlir/IR/Attributes.h b/mlir/include/mlir/IR/Attributes.h index 89dad2ec40cf7..5ecf5763ecd47 100644 --- a/mlir/include/mlir/IR/Attributes.h +++ b/mlir/include/mlir/IR/Attributes.h @@ -48,10 +48,10 @@ struct SparseElementsAttributeStorage; /// Attributes are known-constant values of operations and functions. /// -/// Instances of the Attribute class are references to immutable, uniqued, -/// and immortal values owned by MLIRContext. As such, an Attribute is a thin -/// wrapper around an underlying storage pointer. Attributes are usually passed -/// by value. +/// Instances of the Attribute class are references to immortal key-value pairs +/// with immutable, uniqued key owned by MLIRContext. As such, an Attribute is a +/// thin wrapper around an underlying storage pointer. Attributes are usually +/// passed by value. class Attribute { public: /// Integer identifier for all the concrete attribute kinds. diff --git a/mlir/include/mlir/IR/StorageUniquerSupport.h b/mlir/include/mlir/IR/StorageUniquerSupport.h index c2250e8547163..4c7693c28d2fd 100644 --- a/mlir/include/mlir/IR/StorageUniquerSupport.h +++ b/mlir/include/mlir/IR/StorageUniquerSupport.h @@ -105,6 +105,14 @@ class StorageUserBase : public BaseT, public Traits... { return UniquerT::template get(loc.getContext(), kind, args...); } + /// Mutate the current storage instance. This will not change the unique key. + /// The arguments are forwarded to 'ConcreteT::mutate'. + template + LogicalResult mutate(Args &&...args) { + return UniquerT::mutate(this->getContext(), getImpl(), + std::forward(args)...); + } + /// Default implementation that just returns success. template static LogicalResult verifyConstructionInvariants(Args... args) { diff --git a/mlir/include/mlir/IR/TypeSupport.h b/mlir/include/mlir/IR/TypeSupport.h index 7961dd22d47dd..ddb91e09dc89a 100644 --- a/mlir/include/mlir/IR/TypeSupport.h +++ b/mlir/include/mlir/IR/TypeSupport.h @@ -132,6 +132,15 @@ struct TypeUniquer { }, kind, std::forward(args)...); } + + /// Change the mutable component of the given type instance in the provided + /// context. + template + static LogicalResult mutate(MLIRContext *ctx, ImplType *impl, + Args &&...args) { + assert(impl && "cannot mutate null type"); + return ctx->getTypeUniquer().mutate(impl, std::forward(args)...); + } }; } // namespace detail diff --git a/mlir/include/mlir/IR/Types.h b/mlir/include/mlir/IR/Types.h index c14f8558d8503..83636585c499f 100644 --- a/mlir/include/mlir/IR/Types.h +++ b/mlir/include/mlir/IR/Types.h @@ -27,15 +27,17 @@ struct FunctionTypeStorage; struct OpaqueTypeStorage; } // namespace detail -/// Instances of the Type class are immutable and uniqued. They wrap a pointer -/// to the storage object owned by MLIRContext. Therefore, instances of Type -/// are passed around by value. +/// Instances of the Type class are uniqued, have an immutable identifier and an +/// optional mutable component. They wrap a pointer to the storage object owned +/// by MLIRContext. Therefore, instances of Type are passed around by value. /// /// Some types are "primitives" meaning they do not have any parameters, for /// example the Index type. Parametric types have additional information that /// differentiates the types of the same kind between them, for example the /// Integer type has bitwidth, making i8 and i16 belong to the same kind by be -/// different instances of the IntegerType. +/// different instances of the IntegerType. 
Type parameters are part of the +/// unique immutable key. The mutable component of the type can be modified +/// after the type is created, but cannot affect the identity of the type. /// /// Types are constructed and uniqued via the 'detail::TypeUniquer' class. /// @@ -62,6 +64,7 @@ struct OpaqueTypeStorage; /// - The type kind (for LLVM-style RTTI). /// - The dialect that defined the type. /// - Any parameters of the type. +/// - An optional mutable component. /// For non-parametric types, a convenience DefaultTypeStorage is provided. /// Parametric storage types must derive TypeStorage and respect the following: /// - Define a type alias, KeyTy, to a type that uniquely identifies the @@ -75,11 +78,14 @@ struct OpaqueTypeStorage; /// - Provide a method, 'bool operator==(const KeyTy &) const', to /// compare the storage instance against an instance of the key type. /// -/// - Provide a construction method: +/// - Provide a static construction method: /// 'DerivedStorage *construct(TypeStorageAllocator &, const KeyTy &key)' /// that builds a unique instance of the derived storage. The arguments to /// this function are an allocator to store any uniqued data within the /// context and the key type for this storage. +/// +/// - If they have a mutable component, this component must not be a part of +// the key. class Type { public: /// Integer identifier for all the concrete type kinds. diff --git a/mlir/include/mlir/Support/StorageUniquer.h b/mlir/include/mlir/Support/StorageUniquer.h index f13a2fef9d50e..3100b4454197b 100644 --- a/mlir/include/mlir/Support/StorageUniquer.h +++ b/mlir/include/mlir/Support/StorageUniquer.h @@ -10,6 +10,7 @@ #define MLIR_SUPPORT_STORAGEUNIQUER_H #include "mlir/Support/LLVM.h" +#include "mlir/Support/LogicalResult.h" #include "llvm/ADT/DenseSet.h" #include "llvm/Support/Allocator.h" @@ -60,6 +61,20 @@ using has_impltype_hash_t = decltype(ImplTy::hashKey(std::declval())); /// that is called when erasing a storage instance. This should cleanup any /// fields of the storage as necessary and not attempt to free the memory /// of the storage itself. +/// +/// Storage classes may have an optional mutable component, which must not take +/// part in the unique immutable key. In this case, storage classes may be +/// mutated with `mutate` and must additionally respect the following: +/// - Provide a mutation method: +/// 'LogicalResult mutate(StorageAllocator &, <...>)' +/// that is called when mutating a storage instance. The first argument is +/// an allocator to store any mutable data, and the remaining arguments are +/// forwarded from the call site. The storage can be mutated at any time +/// after creation. Care must be taken to avoid excessive mutation since +/// the allocated storage can keep containing previous states. The return +/// value of the function is used to indicate whether the mutation was +/// successful, e.g., to limit the number of mutations or enable deferred +/// one-time assignment of the mutable component. class StorageUniquer { public: StorageUniquer(); @@ -166,6 +181,17 @@ class StorageUniquer { return static_cast(getImpl(kind, ctorFn)); } + /// Changes the mutable component of 'storage' by forwarding the trailing + /// arguments to the 'mutate' function of the derived class. 
+ template + LogicalResult mutate(Storage *storage, Args &&...args) { + auto mutationFn = [&](StorageAllocator &allocator) -> LogicalResult { + return static_cast(*storage).mutate( + allocator, std::forward(args)...); + }; + return mutateImpl(mutationFn); + } + /// Erases a uniqued instance of 'Storage'. This function is used for derived /// types that have complex storage or uniquing constraints. template @@ -206,6 +232,10 @@ class StorageUniquer { function_ref isEqual, function_ref cleanupFn); + /// Implementation for mutating an instance of a derived storage. + LogicalResult + mutateImpl(function_ref mutationFn); + /// The internal implementation class. std::unique_ptr impl; diff --git a/mlir/lib/Support/StorageUniquer.cpp b/mlir/lib/Support/StorageUniquer.cpp index 40304a544c4fb..f7c953e98140f 100644 --- a/mlir/lib/Support/StorageUniquer.cpp +++ b/mlir/lib/Support/StorageUniquer.cpp @@ -124,6 +124,16 @@ struct StorageUniquerImpl { storageTypes.erase(existing); } + /// Mutates an instance of a derived storage in a thread-safe way. + LogicalResult + mutate(function_ref mutationFn) { + if (!threadingIsEnabled) + return mutationFn(allocator); + + llvm::sys::SmartScopedWriter lock(mutex); + return mutationFn(allocator); + } + //===--------------------------------------------------------------------===// // Instance Storage //===--------------------------------------------------------------------===// @@ -214,3 +224,9 @@ void StorageUniquer::eraseImpl(unsigned kind, unsigned hashValue, function_ref cleanupFn) { impl->erase(kind, hashValue, isEqual, cleanupFn); } + +/// Implementation for mutating an instance of a derived storage. +LogicalResult StorageUniquer::mutateImpl( + function_ref mutationFn) { + return impl->mutate(mutationFn); +} diff --git a/mlir/test/IR/recursive-type.mlir b/mlir/test/IR/recursive-type.mlir new file mode 100644 index 0000000000000..6f90c3c780518 --- /dev/null +++ b/mlir/test/IR/recursive-type.mlir @@ -0,0 +1,16 @@ +// RUN: mlir-opt %s -test-recursive-types | FileCheck %s + +// CHECK-LABEL: @roundtrip +func @roundtrip() { + // CHECK: !test.test_rec> + "test.dummy_op_for_roundtrip"() : () -> !test.test_rec> + // CHECK: !test.test_rec> + "test.dummy_op_for_roundtrip"() : () -> !test.test_rec> + return +} + +// CHECK-LABEL: @create +func @create() { + // CHECK: !test.test_rec> + return +} diff --git a/mlir/test/lib/Dialect/Test/TestDialect.cpp b/mlir/test/lib/Dialect/Test/TestDialect.cpp index 47aa86c45cc64..cdbf974679bde 100644 --- a/mlir/test/lib/Dialect/Test/TestDialect.cpp +++ b/mlir/test/lib/Dialect/Test/TestDialect.cpp @@ -16,6 +16,7 @@ #include "mlir/IR/TypeUtilities.h" #include "mlir/Transforms/FoldUtils.h" #include "mlir/Transforms/InliningUtils.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringSwitch.h" using namespace mlir; @@ -137,19 +138,73 @@ TestDialect::TestDialect(MLIRContext *context) >(); addInterfaces(); - addTypes(); + addTypes(); allowUnknownOperations(); } -Type TestDialect::parseType(DialectAsmParser &parser) const { - if (failed(parser.parseKeyword("test_type"))) +static Type parseTestType(DialectAsmParser &parser, + llvm::SetVector &stack) { + StringRef typeTag; + if (failed(parser.parseKeyword(&typeTag))) + return Type(); + + if (typeTag == "test_type") + return TestType::get(parser.getBuilder().getContext()); + + if (typeTag != "test_rec") + return Type(); + + StringRef name; + if (parser.parseLess() || parser.parseKeyword(&name)) + return Type(); + auto rec = TestRecursiveType::create(parser.getBuilder().getContext(), name); + + 
// If this type already has been parsed above in the stack, expect just the + // name. + if (stack.contains(rec)) { + if (failed(parser.parseGreater())) + return Type(); + return rec; + } + + // Otherwise, parse the body and update the type. + if (failed(parser.parseComma())) + return Type(); + stack.insert(rec); + Type subtype = parseTestType(parser, stack); + stack.pop_back(); + if (!subtype || failed(parser.parseGreater()) || failed(rec.setBody(subtype))) return Type(); - return TestType::get(getContext()); + + return rec; +} + +Type TestDialect::parseType(DialectAsmParser &parser) const { + llvm::SetVector stack; + return parseTestType(parser, stack); +} + +static void printTestType(Type type, DialectAsmPrinter &printer, + llvm::SetVector &stack) { + if (type.isa()) { + printer << "test_type"; + return; + } + + auto rec = type.cast(); + printer << "test_rec<" << rec.getName(); + if (!stack.contains(rec)) { + printer << ", "; + stack.insert(rec); + printTestType(rec.getBody(), printer, stack); + stack.pop_back(); + } + printer << ">"; } void TestDialect::printType(Type type, DialectAsmPrinter &printer) const { - assert(type.isa() && "unexpected type"); - printer << "test_type"; + llvm::SetVector stack; + printTestType(type, printer, stack); } LogicalResult TestDialect::verifyOperationAttribute(Operation *op, diff --git a/mlir/test/lib/Dialect/Test/TestTypes.h b/mlir/test/lib/Dialect/Test/TestTypes.h index 0596f61c1fa1c..9e2c297c6a892 100644 --- a/mlir/test/lib/Dialect/Test/TestTypes.h +++ b/mlir/test/lib/Dialect/Test/TestTypes.h @@ -39,6 +39,60 @@ struct TestType : public Type::TypeBase()) + TestRecursiveTypeStorage(allocator.copyInto(key)); + } + + LogicalResult mutate(TypeStorageAllocator &allocator, Type newBody) { + // Cannot set a different body than before. + if (body && body != newBody) + return failure(); + + body = newBody; + return success(); + } + + StringRef name; + Type body; +}; + +/// Simple recursive type identified by its name and pointing to another named +/// type, potentially itself. This requires the body to be mutated separately +/// from type creation. +class TestRecursiveType + : public Type::TypeBase { +public: + using Base::Base; + + static bool kindof(unsigned kind) { + return kind == Type::Kind::FIRST_PRIVATE_EXPERIMENTAL_9_TYPE + 1; + } + + static TestRecursiveType create(MLIRContext *ctx, StringRef name) { + return Base::get(ctx, Type::Kind::FIRST_PRIVATE_EXPERIMENTAL_9_TYPE + 1, + name); + } + + /// Body getter and setter. + LogicalResult setBody(Type body) { return Base::mutate(body); } + Type getBody() { return getImpl()->body; } + + /// Name/key getter. + StringRef getName() { return getImpl()->name; } +}; + } // end namespace mlir #endif // MLIR_TESTTYPES_H diff --git a/mlir/test/lib/IR/CMakeLists.txt b/mlir/test/lib/IR/CMakeLists.txt index 5456dc9e8816e..f77b26e5ca184 100644 --- a/mlir/test/lib/IR/CMakeLists.txt +++ b/mlir/test/lib/IR/CMakeLists.txt @@ -5,6 +5,7 @@ add_mlir_library(MLIRTestIR TestMatchers.cpp TestSideEffects.cpp TestSymbolUses.cpp + TestTypes.cpp EXCLUDE_FROM_LIBMLIR diff --git a/mlir/test/lib/IR/TestTypes.cpp b/mlir/test/lib/IR/TestTypes.cpp new file mode 100644 index 0000000000000..f62c06eededfb --- /dev/null +++ b/mlir/test/lib/IR/TestTypes.cpp @@ -0,0 +1,78 @@ +//===- TestTypes.cpp - Test passes for MLIR types -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "TestTypes.h" +#include "TestDialect.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; + +namespace { +struct TestRecursiveTypesPass + : public PassWrapper<TestRecursiveTypesPass, FunctionPass> { + LogicalResult createIRWithTypes(); + + void runOnFunction() override { + FuncOp func = getFunction(); + + // Just make sure recursive types are printed and parsed. + if (func.getName() == "roundtrip") + return; + + // Create a recursive type and print it as a part of a dummy op. + if (func.getName() == "create") { + if (failed(createIRWithTypes())) + signalPassFailure(); + return; + } + + // Unknown key. + func.emitOpError() << "unexpected function name"; + signalPassFailure(); + } +}; +} // end namespace + +LogicalResult TestRecursiveTypesPass::createIRWithTypes() { + MLIRContext *ctx = &getContext(); + FuncOp func = getFunction(); + auto type = TestRecursiveType::create(ctx, "some_long_and_unique_name"); + if (failed(type.setBody(type))) + return func.emitError("expected to be able to set the type body"); + + // Setting the same body is fine. + if (failed(type.setBody(type))) + return func.emitError( + "expected to be able to set the type body to the same value"); + + // Setting a different body is not. + if (succeeded(type.setBody(IndexType::get(ctx)))) + return func.emitError( + "not expected to be able to change the type body more than once"); + + // Expecting to get the same type for the same name. + auto other = TestRecursiveType::create(ctx, "some_long_and_unique_name"); + if (type != other) + return func.emitError("expected type name to be the uniquing key"); + + // Create the op to check how the type is printed. + OperationState state(func.getLoc(), "test.dummy_type_test_op"); + state.addTypes(type); + func.getBody().front().push_front(Operation::create(state)); + + return success(); +} + +namespace mlir { + +void registerTestRecursiveTypesPass() { + PassRegistration<TestRecursiveTypesPass> reg( + "test-recursive-types", "Test support for recursive types"); +} + +} // end namespace mlir diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index f749c7ad98adf..f60864a6a371b 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -63,6 +63,7 @@ void registerTestMemRefDependenceCheck(); void registerTestMemRefStrideCalculation(); void registerTestOpaqueLoc(); void registerTestPreparationPassWithAllowedMemrefResults(); +void registerTestRecursiveTypesPass(); void registerTestReducer(); void registerTestGpuParallelLoopMappingPass(); void registerTestSCFUtilsPass(); @@ -138,6 +139,7 @@ void registerTestPasses() { registerTestMemRefStrideCalculation(); registerTestOpaqueLoc(); registerTestPreparationPassWithAllowedMemrefResults(); + registerTestRecursiveTypesPass(); registerTestReducer(); registerTestGpuParallelLoopMappingPass(); registerTestSCFUtilsPass(); From 102997cd0edb127050553a33a902bd6621c2c184 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Mon, 27 Jul 2020 13:35:35 +0200 Subject: [PATCH 0179/1035] Speculative build fix for scudo/standalone/tests/combined_test.cpp --- compiler-rt/lib/scudo/standalone/tests/combined_test.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp index b5035d05aabb0..c144ad0ae32a3 100644 --- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp +++
b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp @@ -12,6 +12,7 @@ #include "combined.h" #include +#include #include #include #include From 47a0254229ca425aa4e169c2db14e92b8db86784 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Mon, 27 Jul 2020 13:39:54 +0200 Subject: [PATCH 0180/1035] Speculative build fix for clangd/Features.inc.in --- clang-tools-extra/clangd/Features.inc.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/clangd/Features.inc.in b/clang-tools-extra/clangd/Features.inc.in index 8584b87c62051..6797232ddac7c 100644 --- a/clang-tools-extra/clangd/Features.inc.in +++ b/clang-tools-extra/clangd/Features.inc.in @@ -1,2 +1,2 @@ #define CLANGD_BUILD_XPC @CLANGD_BUILD_XPC@ -#define CLANGD_ENABLE_REMOTE @CLANGD_ENABLE_REMTE@ +#define CLANGD_ENABLE_REMOTE @CLANGD_ENABLE_REMOTE@ From 13c9bbc28ef9cf9976a0962e6c930a7dfc52c877 Mon Sep 17 00:00:00 2001 From: Nathan James Date: Mon, 27 Jul 2020 12:48:53 +0100 Subject: [PATCH 0181/1035] [clang-tidy] Refactor IncludeInserter Simplified how `IncludeInserter` is used in Checks by abstracting away the SourceManager and PPCallbacks inside the method `registerPreprocessor`. Changed checks that use `IncludeInserter` to no longer use a `std::unique_ptr`; instead, the IncludeInserter is just a member of the check class that is initialized with an `IncludeStyle`, saving an unnecessary allocation. This results in the removal of the field `IncludeSorter::IncludeStyle` from the checks, as it is wrapped in the `IncludeInserter`. There is no longer a need to create an instance of the `IncludeInserter` in registerPPCallbacks; that method now only needs to contain: ``` Inserter.registerPreprocessor(PP); ``` Also added a helper method to `IncludeInserter` called `createMainFileIncludeInsertion`, purely sugar, but it expresses the intent better.
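For reference, the resulting usage pattern in a check looks roughly like this (a condensed sketch; `MyCheck`, the diagnostic text, and the header name are placeholders, while the `IncludeInserter` calls follow the per-check diffs below):

```c++
// A condensed sketch (not a complete check) of the new pattern.
#include "../ClangTidyCheck.h"
#include "../utils/IncludeInserter.h"

namespace clang {
namespace tidy {

class MyCheck : public ClangTidyCheck {
  utils::IncludeInserter Inserter; // plain member instead of std::unique_ptr

public:
  MyCheck(StringRef Name, ClangTidyContext *Context)
      : ClangTidyCheck(Name, Context),
        Inserter(Options.getLocalOrGlobal("IncludeStyle",
                                          utils::IncludeSorter::IS_LLVM)) {}

  void registerPPCallbacks(const SourceManager &SM, Preprocessor *PP,
                           Preprocessor *ModuleExpanderPP) override {
    // One line now; the SourceManager and PPCallbacks are handled internally.
    Inserter.registerPreprocessor(PP);
  }

  void check(const ast_matchers::MatchFinder::MatchResult &Result) override {
    // On a match, attach the include FixIt to a diagnostic, e.g.:
    // diag(Loc, "...") << Inserter.createMainFileIncludeInsertion(
    //     "util/helper.h", /*IsAngled=*/false);
  }

  void storeOptions(ClangTidyOptions::OptionMap &Opts) override {
    Options.store(Opts, "IncludeStyle", Inserter.getStyle());
  }
};

} // namespace tidy
} // namespace clang
```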
Reviewed By: aaron.ballman Differential Revision: https://reviews.llvm.org/D83680 --- .../abseil/StringFindStartswithCheck.cpp | 12 ++-- .../abseil/StringFindStartswithCheck.h | 3 +- .../cppcoreguidelines/InitVariablesCheck.cpp | 12 ++-- .../cppcoreguidelines/InitVariablesCheck.h | 3 +- .../ProBoundsConstantArrayIndexCheck.cpp | 15 ++--- .../ProBoundsConstantArrayIndexCheck.h | 3 +- .../modernize/MakeSmartPtrCheck.cpp | 12 ++-- .../clang-tidy/modernize/MakeSmartPtrCheck.h | 3 +- .../clang-tidy/modernize/PassByValueCheck.cpp | 12 ++-- .../clang-tidy/modernize/PassByValueCheck.h | 3 +- .../modernize/ReplaceAutoPtrCheck.cpp | 21 +++---- .../modernize/ReplaceAutoPtrCheck.h | 3 +- .../modernize/ReplaceRandomShuffleCheck.cpp | 13 ++-- .../modernize/ReplaceRandomShuffleCheck.h | 3 +- .../performance/MoveConstructorInitCheck.cpp | 10 ++-- .../performance/MoveConstructorInitCheck.h | 3 +- .../TypePromotionInMathFnCheck.cpp | 13 ++-- .../performance/TypePromotionInMathFnCheck.h | 3 +- .../UnnecessaryValueParamCheck.cpp | 12 ++-- .../performance/UnnecessaryValueParamCheck.h | 3 +- .../clang-tidy/utils/IncludeInserter.cpp | 46 ++++++++++----- .../clang-tidy/utils/IncludeInserter.h | 59 +++++++++++-------- .../utils/TransformerClangTidyCheck.cpp | 23 ++++---- .../utils/TransformerClangTidyCheck.h | 3 +- .../clang-tidy/IncludeInserterTest.cpp | 10 ++-- 25 files changed, 149 insertions(+), 154 deletions(-) diff --git a/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp b/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp index 11bbcbcb527f5..e775fc21d2d0f 100644 --- a/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp @@ -26,8 +26,8 @@ StringFindStartswithCheck::StringFindStartswithCheck(StringRef Name, : ClangTidyCheck(Name, Context), StringLikeClasses(utils::options::parseStringList( Options.get("StringLikeClasses", "::std::basic_string"))), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)), + IncludeInserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)), AbseilStringsMatchHeader( Options.get("AbseilStringsMatchHeader", "absl/strings/match.h")) {} @@ -105,23 +105,21 @@ void StringFindStartswithCheck::check(const MatchFinder::MatchResult &Result) { // Create a preprocessor #include FixIt hint (CreateIncludeInsertion checks // whether this already exists). 
- Diagnostic << IncludeInserter->CreateIncludeInsertion( + Diagnostic << IncludeInserter.createIncludeInsertion( Source.getFileID(ComparisonExpr->getBeginLoc()), AbseilStringsMatchHeader, false); } void StringFindStartswithCheck::registerPPCallbacks( const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - IncludeInserter = std::make_unique(SM, getLangOpts(), - IncludeStyle); - PP->addPPCallbacks(IncludeInserter->CreatePPCallbacks()); + IncludeInserter.registerPreprocessor(PP); } void StringFindStartswithCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "StringLikeClasses", utils::options::serializeStringList(StringLikeClasses)); - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", IncludeInserter.getStyle()); Options.store(Opts, "AbseilStringsMatchHeader", AbseilStringsMatchHeader); } diff --git a/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.h b/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.h index d232d3b3efb61..2bb20f78b68ce 100644 --- a/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.h @@ -35,9 +35,8 @@ class StringFindStartswithCheck : public ClangTidyCheck { void storeOptions(ClangTidyOptions::OptionMap &Opts) override; private: - std::unique_ptr IncludeInserter; const std::vector StringLikeClasses; - const utils::IncludeSorter::IncludeStyle IncludeStyle; + utils::IncludeInserter IncludeInserter; const std::string AbseilStringsMatchHeader; }; diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp index f1755d3f9b855..3f51ef595b463 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp @@ -26,12 +26,12 @@ AST_MATCHER(VarDecl, isLocalVarDecl) { return Node.isLocalVarDecl(); } InitVariablesCheck::InitVariablesCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)), + IncludeInserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)), MathHeader(Options.get("MathHeader", "math.h")) {} void InitVariablesCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", IncludeInserter.getStyle()); Options.store(Opts, "MathHeader", MathHeader); } @@ -51,9 +51,7 @@ void InitVariablesCheck::registerMatchers(MatchFinder *Finder) { void InitVariablesCheck::registerPPCallbacks(const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - IncludeInserter = - std::make_unique(SM, getLangOpts(), IncludeStyle); - PP->addPPCallbacks(IncludeInserter->CreatePPCallbacks()); + IncludeInserter.registerPreprocessor(PP); } void InitVariablesCheck::check(const MatchFinder::MatchResult &Result) { @@ -104,7 +102,7 @@ void InitVariablesCheck::check(const MatchFinder::MatchResult &Result) { MatchedDecl->getName().size()), InitializationString); if (AddMathInclude) { - Diagnostic << IncludeInserter->CreateIncludeInsertion( + Diagnostic << IncludeInserter.createIncludeInsertion( Source.getFileID(MatchedDecl->getBeginLoc()), MathHeader, false); } } diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.h 
b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.h index 61521b118a99e..0f778104ce398 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.h @@ -31,8 +31,7 @@ class InitVariablesCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; private: - std::unique_ptr IncludeInserter; - const utils::IncludeSorter::IncludeStyle IncludeStyle; + utils::IncludeInserter IncludeInserter; const std::string MathHeader; }; diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp index 96b0bb0f9b02d..f45801f1ea723 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp @@ -21,20 +21,18 @@ namespace cppcoreguidelines { ProBoundsConstantArrayIndexCheck::ProBoundsConstantArrayIndexCheck( StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), GslHeader(Options.get("GslHeader", "")), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)) {} + Inserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)) {} void ProBoundsConstantArrayIndexCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "GslHeader", GslHeader); - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", Inserter.getStyle()); } void ProBoundsConstantArrayIndexCheck::registerPPCallbacks( const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - Inserter = std::make_unique(SM, getLangOpts(), - IncludeStyle); - PP->addPPCallbacks(Inserter->CreatePPCallbacks()); + Inserter.registerPreprocessor(PP); } void ProBoundsConstantArrayIndexCheck::registerMatchers(MatchFinder *Finder) { @@ -87,9 +85,8 @@ void ProBoundsConstantArrayIndexCheck::check( IndexRange.getBegin().getLocWithOffset(-1)), ", ") << FixItHint::CreateReplacement(Matched->getEndLoc(), ")") - << Inserter->CreateIncludeInsertion( - Result.SourceManager->getMainFileID(), GslHeader, - /*IsAngled=*/false); + << Inserter.createMainFileIncludeInsertion(GslHeader, + /*IsAngled=*/false); } return; } diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h index ac7475b4372db..04a51b93a04c5 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h @@ -23,8 +23,7 @@ namespace cppcoreguidelines { /// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines-pro-bounds-constant-array-index.html class ProBoundsConstantArrayIndexCheck : public ClangTidyCheck { const std::string GslHeader; - const utils::IncludeSorter::IncludeStyle IncludeStyle; - std::unique_ptr Inserter; + utils::IncludeInserter Inserter; public: ProBoundsConstantArrayIndexCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp index c677043946f7f..5818b8cd06b5c 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp +++ 
b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp @@ -44,8 +44,8 @@ const char MakeSmartPtrCheck::PointerType[] = "pointerType"; MakeSmartPtrCheck::MakeSmartPtrCheck(StringRef Name, ClangTidyContext *Context, StringRef MakeSmartPtrFunctionName) : ClangTidyCheck(Name, Context), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)), + Inserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)), MakeSmartPtrFunctionHeader( Options.get("MakeSmartPtrFunctionHeader", StdMemoryHeader)), MakeSmartPtrFunctionName( @@ -53,7 +53,7 @@ MakeSmartPtrCheck::MakeSmartPtrCheck(StringRef Name, ClangTidyContext *Context, IgnoreMacros(Options.getLocalOrGlobal("IgnoreMacros", true)) {} void MakeSmartPtrCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", Inserter.getStyle()); Options.store(Opts, "MakeSmartPtrFunctionHeader", MakeSmartPtrFunctionHeader); Options.store(Opts, "MakeSmartPtrFunction", MakeSmartPtrFunctionName); Options.store(Opts, "IgnoreMacros", IgnoreMacros); @@ -67,9 +67,7 @@ bool MakeSmartPtrCheck::isLanguageVersionSupported( void MakeSmartPtrCheck::registerPPCallbacks(const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - Inserter = std::make_unique(SM, getLangOpts(), - IncludeStyle); - PP->addPPCallbacks(Inserter->CreatePPCallbacks()); + Inserter.registerPreprocessor(PP); } void MakeSmartPtrCheck::registerMatchers(ast_matchers::MatchFinder *Finder) { @@ -432,7 +430,7 @@ void MakeSmartPtrCheck::insertHeader(DiagnosticBuilder &Diag, FileID FD) { if (MakeSmartPtrFunctionHeader.empty()) { return; } - Diag << Inserter->CreateIncludeInsertion( + Diag << Inserter.createIncludeInsertion( FD, MakeSmartPtrFunctionHeader, /*IsAngled=*/MakeSmartPtrFunctionHeader == StdMemoryHeader); } diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h index 1f738737ab264..7a1bba624c539 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h @@ -46,8 +46,7 @@ class MakeSmartPtrCheck : public ClangTidyCheck { static const char PointerType[]; private: - std::unique_ptr Inserter; - const utils::IncludeSorter::IncludeStyle IncludeStyle; + utils::IncludeInserter Inserter; const std::string MakeSmartPtrFunctionHeader; const std::string MakeSmartPtrFunctionName; const bool IgnoreMacros; diff --git a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp index b6dedfbc2b6eb..b955ea7f7572b 100644 --- a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp @@ -120,12 +120,12 @@ collectParamDecls(const CXXConstructorDecl *Ctor, PassByValueCheck::PassByValueCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)), + Inserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)), ValuesOnly(Options.get("ValuesOnly", false)) {} void PassByValueCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", Inserter.getStyle()); Options.store(Opts, "ValuesOnly", ValuesOnly); } @@ -167,9 +167,7 @@ void 
PassByValueCheck::registerMatchers(MatchFinder *Finder) { void PassByValueCheck::registerPPCallbacks(const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - Inserter = std::make_unique(SM, getLangOpts(), - IncludeStyle); - PP->addPPCallbacks(Inserter->CreatePPCallbacks()); + Inserter.registerPreprocessor(PP); } void PassByValueCheck::check(const MatchFinder::MatchResult &Result) { @@ -216,7 +214,7 @@ void PassByValueCheck::check(const MatchFinder::MatchResult &Result) { Diag << FixItHint::CreateInsertion(Initializer->getRParenLoc(), ")") << FixItHint::CreateInsertion( Initializer->getLParenLoc().getLocWithOffset(1), "std::move(") - << Inserter->CreateIncludeInsertion( + << Inserter.createIncludeInsertion( Result.SourceManager->getFileID(Initializer->getSourceLocation()), "utility", /*IsAngled=*/true); diff --git a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h index 7abda91ac5632..82cd9d44c5e3a 100644 --- a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h @@ -31,8 +31,7 @@ class PassByValueCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; private: - std::unique_ptr Inserter; - const utils::IncludeSorter::IncludeStyle IncludeStyle; + utils::IncludeInserter Inserter; const bool ValuesOnly; }; diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp index f98254dbf7c83..25ffbe2b8738d 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp @@ -74,11 +74,11 @@ AST_MATCHER(Decl, isFromStdNamespace) { ReplaceAutoPtrCheck::ReplaceAutoPtrCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)) {} + Inserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)) {} void ReplaceAutoPtrCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", Inserter.getStyle()); } void ReplaceAutoPtrCheck::registerMatchers(MatchFinder *Finder) { @@ -131,9 +131,7 @@ void ReplaceAutoPtrCheck::registerMatchers(MatchFinder *Finder) { void ReplaceAutoPtrCheck::registerPPCallbacks(const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - Inserter = std::make_unique(SM, getLangOpts(), - IncludeStyle); - PP->addPPCallbacks(Inserter->CreatePPCallbacks()); + Inserter.registerPreprocessor(PP); } void ReplaceAutoPtrCheck::check(const MatchFinder::MatchResult &Result) { @@ -146,12 +144,11 @@ void ReplaceAutoPtrCheck::check(const MatchFinder::MatchResult &Result) { if (Range.isInvalid()) return; - auto Diag = - diag(Range.getBegin(), "use std::move to transfer ownership") - << FixItHint::CreateInsertion(Range.getBegin(), "std::move(") - << FixItHint::CreateInsertion(Range.getEnd(), ")") - << Inserter->CreateIncludeInsertion(SM.getMainFileID(), "utility", - /*IsAngled=*/true); + auto Diag = diag(Range.getBegin(), "use std::move to transfer ownership") + << FixItHint::CreateInsertion(Range.getBegin(), "std::move(") + << FixItHint::CreateInsertion(Range.getEnd(), ")") + << Inserter.createMainFileIncludeInsertion("utility", + /*IsAngled=*/true); return; } diff --git 
a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h index e2b04073a65f4..8288c7e47d35e 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h @@ -53,8 +53,7 @@ class ReplaceAutoPtrCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; private: - std::unique_ptr Inserter; - const utils::IncludeSorter::IncludeStyle IncludeStyle; + utils::IncludeInserter Inserter; }; } // namespace modernize diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp index 66917df3e91d2..0191f5d5c5deb 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp @@ -23,8 +23,9 @@ namespace modernize { ReplaceRandomShuffleCheck::ReplaceRandomShuffleCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)) {} + IncludeInserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)) { +} void ReplaceRandomShuffleCheck::registerMatchers(MatchFinder *Finder) { const auto Begin = hasArgument(0, expr()); @@ -44,14 +45,12 @@ void ReplaceRandomShuffleCheck::registerMatchers(MatchFinder *Finder) { void ReplaceRandomShuffleCheck::registerPPCallbacks( const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - IncludeInserter = std::make_unique(SM, getLangOpts(), - IncludeStyle); - PP->addPPCallbacks(IncludeInserter->CreatePPCallbacks()); + IncludeInserter.registerPreprocessor(PP); } void ReplaceRandomShuffleCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", IncludeInserter.getStyle()); } void ReplaceRandomShuffleCheck::check(const MatchFinder::MatchResult &Result) { @@ -92,7 +91,7 @@ void ReplaceRandomShuffleCheck::check(const MatchFinder::MatchResult &Result) { Diag << FixItHint::CreateRemoval(MatchedDecl->getSourceRange()); Diag << FixItHint::CreateInsertion(MatchedDecl->getBeginLoc(), NewName); - Diag << IncludeInserter->CreateIncludeInsertion( + Diag << IncludeInserter.createIncludeInsertion( Result.Context->getSourceManager().getFileID( MatchedCallExpr->getBeginLoc()), "random", /*IsAngled=*/true); diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h index c4ac74d6e662a..990dcffc79a6a 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h @@ -34,8 +34,7 @@ class ReplaceRandomShuffleCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; private: - std::unique_ptr IncludeInserter; - const utils::IncludeSorter::IncludeStyle IncludeStyle; + utils::IncludeInserter IncludeInserter; }; } // namespace modernize diff --git a/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp b/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp index 4cbb014867c4d..6b42cd3180d73 100644 --- a/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp +++ 
b/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp @@ -23,8 +23,8 @@ namespace performance { MoveConstructorInitCheck::MoveConstructorInitCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)) {} + Inserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)) {} void MoveConstructorInitCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher( @@ -90,13 +90,11 @@ void MoveConstructorInitCheck::check(const MatchFinder::MatchResult &Result) { void MoveConstructorInitCheck::registerPPCallbacks( const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - Inserter = std::make_unique(SM, getLangOpts(), - IncludeStyle); - PP->addPPCallbacks(Inserter->CreatePPCallbacks()); + Inserter.registerPreprocessor(PP); } void MoveConstructorInitCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", Inserter.getStyle()); } } // namespace performance diff --git a/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.h b/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.h index 0473978f29db4..0b637b617782e 100644 --- a/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.h +++ b/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.h @@ -36,8 +36,7 @@ class MoveConstructorInitCheck : public ClangTidyCheck { void storeOptions(ClangTidyOptions::OptionMap &Opts) override; private: - std::unique_ptr Inserter; - const utils::IncludeSorter::IncludeStyle IncludeStyle; + utils::IncludeInserter Inserter; }; } // namespace performance diff --git a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp index 597445d0fc266..2105aa9947bb0 100644 --- a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp @@ -31,19 +31,18 @@ AST_MATCHER_P(Type, isBuiltinType, BuiltinType::Kind, Kind) { TypePromotionInMathFnCheck::TypePromotionInMathFnCheck( StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)) {} + IncludeInserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)) { +} void TypePromotionInMathFnCheck::registerPPCallbacks( const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - IncludeInserter = std::make_unique(SM, getLangOpts(), - IncludeStyle); - PP->addPPCallbacks(IncludeInserter->CreatePPCallbacks()); + IncludeInserter.registerPreprocessor(PP); } void TypePromotionInMathFnCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", IncludeInserter.getStyle()); } void TypePromotionInMathFnCheck::registerMatchers(MatchFinder *Finder) { @@ -191,7 +190,7 @@ void TypePromotionInMathFnCheck::check(const MatchFinder::MatchResult &Result) { // , because the functions we're suggesting moving away from are all // declared in . 
if (FnInCmath) - Diag << IncludeInserter->CreateIncludeInsertion( + Diag << IncludeInserter.createIncludeInsertion( Result.Context->getSourceManager().getFileID(Call->getBeginLoc()), "cmath", /*IsAngled=*/true); } diff --git a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h index d1cc042ca6a35..dd7c1c090bad3 100644 --- a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h +++ b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h @@ -36,8 +36,7 @@ class TypePromotionInMathFnCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; private: - std::unique_ptr IncludeInserter; - const utils::IncludeSorter::IncludeStyle IncludeStyle; + utils::IncludeInserter IncludeInserter; }; } // namespace performance diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp index 5de53b1840f12..9aef5a8681694 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp @@ -68,8 +68,8 @@ bool isExplicitTemplateSpecialization(const FunctionDecl &Function) { UnnecessaryValueParamCheck::UnnecessaryValueParamCheck( StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)), + Inserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)), AllowedTypes( utils::options::parseStringList(Options.get("AllowedTypes", ""))) {} @@ -173,14 +173,12 @@ void UnnecessaryValueParamCheck::check(const MatchFinder::MatchResult &Result) { void UnnecessaryValueParamCheck::registerPPCallbacks( const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - Inserter = std::make_unique(SM, getLangOpts(), - IncludeStyle); - PP->addPPCallbacks(Inserter->CreatePPCallbacks()); + Inserter.registerPreprocessor(PP); } void UnnecessaryValueParamCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", Inserter.getStyle()); Options.store(Opts, "AllowedTypes", utils::options::serializeStringList(AllowedTypes)); } @@ -204,7 +202,7 @@ void UnnecessaryValueParamCheck::handleMoveFix(const ParmVarDecl &Var, Context.getLangOpts()); Diag << FixItHint::CreateInsertion(CopyArgument.getBeginLoc(), "std::move(") << FixItHint::CreateInsertion(EndLoc, ")") - << Inserter->CreateIncludeInsertion( + << Inserter.createIncludeInsertion( SM.getFileID(CopyArgument.getBeginLoc()), "utility", /*IsAngled=*/true); } diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h index 1d2367148ef82..a84079e7da229 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h @@ -41,8 +41,7 @@ class UnnecessaryValueParamCheck : public ClangTidyCheck { llvm::DenseMap MutationAnalyzers; - std::unique_ptr Inserter; - const utils::IncludeSorter::IncludeStyle IncludeStyle; + utils::IncludeInserter Inserter; const std::vector AllowedTypes; }; diff --git a/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp b/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp index 
df87dbe49cff8..268692c3ba42e 100644 --- a/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp +++ b/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "IncludeInserter.h" +#include "clang/Lex/PPCallbacks.h" +#include "clang/Lex/Preprocessor.h" #include "clang/Lex/Token.h" namespace clang { @@ -26,7 +28,7 @@ class IncludeInserterCallback : public PPCallbacks { StringRef /*SearchPath*/, StringRef /*RelativePath*/, const Module * /*ImportedModule*/, SrcMgr::CharacteristicKind /*FileType*/) override { - Inserter->AddInclude(FileNameRef, IsAngled, HashLocation, + Inserter->addInclude(FileNameRef, IsAngled, HashLocation, IncludeToken.getEndLoc()); } @@ -34,45 +36,61 @@ class IncludeInserterCallback : public PPCallbacks { IncludeInserter *Inserter; }; -IncludeInserter::IncludeInserter(const SourceManager &SourceMgr, - const LangOptions &LangOpts, - IncludeSorter::IncludeStyle Style) - : SourceMgr(SourceMgr), Style(Style) {} +IncludeInserter::IncludeInserter(IncludeSorter::IncludeStyle Style) + : Style(Style) {} -IncludeInserter::~IncludeInserter() {} +void IncludeInserter::registerPreprocessor(Preprocessor *PP) { + assert(PP && "PP shouldn't be null"); + SourceMgr = &PP->getSourceManager(); -std::unique_ptr IncludeInserter::CreatePPCallbacks() { - return std::make_unique(this); + // If this gets registered multiple times, clear the maps + if (!IncludeSorterByFile.empty()) + IncludeSorterByFile.clear(); + if (!InsertedHeaders.empty()) + InsertedHeaders.clear(); + PP->addPPCallbacks(std::make_unique(this)); } IncludeSorter &IncludeInserter::getOrCreate(FileID FileID) { + assert(SourceMgr && "SourceMgr shouldn't be null; did you remember to call " + "registerPreprocessor()?"); // std::unique_ptr is cheap to construct, so force a construction now to save // the lookup needed if we were to insert into the map. std::unique_ptr &Entry = IncludeSorterByFile[FileID]; if (!Entry) { // If it wasn't found, Entry will be default constructed to nullptr. Entry = std::make_unique( - &SourceMgr, FileID, - SourceMgr.getFilename(SourceMgr.getLocForStartOfFile(FileID)), Style); + SourceMgr, FileID, + SourceMgr->getFilename(SourceMgr->getLocForStartOfFile(FileID)), Style); } return *Entry; } llvm::Optional -IncludeInserter::CreateIncludeInsertion(FileID FileID, StringRef Header, +IncludeInserter::createIncludeInsertion(FileID FileID, StringRef Header, bool IsAngled) { // We assume the same Header will never be included both angled and not // angled. 
- if (!InsertedHeaders[FileID].insert(std::string(Header)).second) + if (!InsertedHeaders[FileID].insert(Header).second) return llvm::None; return getOrCreate(FileID).CreateIncludeInsertion(Header, IsAngled); } -void IncludeInserter::AddInclude(StringRef FileName, bool IsAngled, +llvm::Optional +IncludeInserter::createMainFileIncludeInsertion(StringRef Header, + bool IsAngled) { + assert(SourceMgr && "SourceMgr shouldn't be null; did you remember to call " + "registerPreprocessor()?"); + return createIncludeInsertion(SourceMgr->getMainFileID(), Header, IsAngled); +} + +void IncludeInserter::addInclude(StringRef FileName, bool IsAngled, SourceLocation HashLocation, SourceLocation EndLocation) { - FileID FileID = SourceMgr.getFileID(HashLocation); + assert(SourceMgr && "SourceMgr shouldn't be null; did you remember to call " + "registerPreprocessor()?"); + FileID FileID = SourceMgr->getFileID(HashLocation); getOrCreate(FileID).AddInclude(FileName, IsAngled, HashLocation, EndLocation); } diff --git a/clang-tools-extra/clang-tidy/utils/IncludeInserter.h b/clang-tools-extra/clang-tidy/utils/IncludeInserter.h index 0d4b951beb1ff..70c36ce8895c4 100644 --- a/clang-tools-extra/clang-tidy/utils/IncludeInserter.h +++ b/clang-tools-extra/clang-tidy/utils/IncludeInserter.h @@ -11,13 +11,11 @@ #include "IncludeSorter.h" #include "clang/Basic/Diagnostic.h" -#include "clang/Basic/LangOptions.h" -#include "clang/Basic/SourceManager.h" -#include "clang/Lex/PPCallbacks.h" +#include "llvm/ADT/StringSet.h" #include -#include namespace clang { +class Preprocessor; namespace tidy { namespace utils { @@ -26,16 +24,17 @@ namespace utils { /// /// ``IncludeInserter`` can be used in clang-tidy checks in the following way: /// \code +/// #include "../ClangTidyCheck.h" /// #include "../utils/IncludeInserter.h" -/// #include "clang/Frontend/CompilerInstance.h" +/// +/// namespace clang { +/// namespace tidy { /// /// class MyCheck : public ClangTidyCheck { /// public: /// void registerPPCallbacks(const SourceManager &SM, Preprocessor *PP, /// Preprocessor *ModuleExpanderPP) override { -/// Inserter = std::make_unique( -/// SM, getLangOpts(), utils::IncludeSorter::IS_Google); -/// PP->addPPCallbacks(Inserter->CreatePPCallbacks()); +/// Inserter.registerPreprocessor(); /// } /// /// void registerMatchers(ast_matchers::MatchFinder* Finder) override { ... } @@ -43,39 +42,53 @@ namespace utils { /// void check( /// const ast_matchers::MatchFinder::MatchResult& Result) override { /// ... -/// Inserter->CreateIncludeInsertion( -/// Result.SourceManager->getMainFileID(), "path/to/Header.h", -/// /*IsAngled=*/false); +/// Inserter.createMainFileIncludeInsertion("path/to/Header.h", +/// /*IsAngled=*/false); /// ... /// } /// /// private: -/// std::unique_ptr Inserter; +/// utils::IncludeInserter Inserter{utils::IncludeSorter::IS_Google}; /// }; +/// } // namespace tidy +/// } // namespace clang /// \endcode class IncludeInserter { public: - IncludeInserter(const SourceManager &SourceMgr, const LangOptions &LangOpts, - IncludeSorter::IncludeStyle Style); - ~IncludeInserter(); + /// Initializes the IncludeInserter using the IncludeStyle \p Style. + /// In most cases the \p Style will be retrieved from the ClangTidyOptions + /// using \code + /// Options.getLocalOrGlobal("IncludeStyle", ) + /// \endcode + explicit IncludeInserter(IncludeSorter::IncludeStyle Style); + + /// Registers this with the Preprocessor \p PP, must be called before this + /// class is used. 
+ void registerPreprocessor(Preprocessor *PP); - /// Create ``PPCallbacks`` for registration with the compiler's preprocessor. - std::unique_ptr CreatePPCallbacks(); + /// Creates a \p Header inclusion directive fixit in the File \p FileID. + /// Returns ``llvm::None`` on error or if the inclusion directive already + /// exists. + llvm::Optional + createIncludeInsertion(FileID FileID, llvm::StringRef Header, bool IsAngled); - /// Creates a \p Header inclusion directive fixit. Returns ``llvm::None`` on - /// error or if inclusion directive already exists. + /// Creates a \p Header inclusion directive fixit in the main file. + /// Returns``llvm::None`` on error or if the inclusion directive already + /// exists. llvm::Optional - CreateIncludeInsertion(FileID FileID, llvm::StringRef Header, bool IsAngled); + createMainFileIncludeInsertion(llvm::StringRef Header, bool IsAngled); + + IncludeSorter::IncludeStyle getStyle() const { return Style; } private: - void AddInclude(StringRef FileName, bool IsAngled, + void addInclude(StringRef FileName, bool IsAngled, SourceLocation HashLocation, SourceLocation EndLocation); IncludeSorter &getOrCreate(FileID FileID); llvm::DenseMap> IncludeSorterByFile; - llvm::DenseMap> InsertedHeaders; - const SourceManager &SourceMgr; + llvm::DenseMap> InsertedHeaders; + const SourceManager *SourceMgr{nullptr}; const IncludeSorter::IncludeStyle Style; friend class IncludeInserterCallback; }; diff --git a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp index 03af5dd1565f8..2c116b210d050 100644 --- a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp @@ -32,8 +32,8 @@ TransformerClangTidyCheck::TransformerClangTidyCheck( MakeRule, StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), Rule(MakeRule(getLangOpts(), Options)), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - IncludeSorter::IS_LLVM)) { + Inserter( + Options.getLocalOrGlobal("IncludeStyle", IncludeSorter::IS_LLVM)) { if (Rule) assert(llvm::all_of(Rule->Cases, hasExplanation) && "clang-tidy checks must have an explanation by default;" @@ -44,8 +44,8 @@ TransformerClangTidyCheck::TransformerClangTidyCheck(RewriteRule R, StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), Rule(std::move(R)), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - IncludeSorter::IS_LLVM)) { + Inserter( + Options.getLocalOrGlobal("IncludeStyle", IncludeSorter::IS_LLVM)) { assert(llvm::all_of(Rule->Cases, hasExplanation) && "clang-tidy checks must have an explanation by default;" " explicitly provide an empty explanation if none is desired"); @@ -53,15 +53,12 @@ TransformerClangTidyCheck::TransformerClangTidyCheck(RewriteRule R, void TransformerClangTidyCheck::registerPPCallbacks( const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - // Only allocate and register the IncludeInsert when some `Case` will add + // Only register the IncludeInsert when some `Case` will add // includes. 
if (Rule && llvm::any_of(Rule->Cases, [](const RewriteRule::Case &C) { return !C.AddedIncludes.empty(); - })) { - Inserter = - std::make_unique(SM, getLangOpts(), IncludeStyle); - PP->addPPCallbacks(Inserter->CreatePPCallbacks()); - } + })) + Inserter.registerPreprocessor(PP); } void TransformerClangTidyCheck::registerMatchers( @@ -102,15 +99,15 @@ void TransformerClangTidyCheck::check( Diag << FixItHint::CreateReplacement(T.Range, T.Replacement); for (const auto &I : Case.AddedIncludes) { - Diag << Inserter->CreateIncludeInsertion( - Result.SourceManager->getMainFileID(), I.first, + Diag << Inserter.createMainFileIncludeInsertion( + I.first, /*IsAngled=*/I.second == transformer::IncludeFormat::Angled); } } void TransformerClangTidyCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", Inserter.getStyle()); } } // namespace utils diff --git a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h index 829a22fe8e2cc..404f474a24cae 100644 --- a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h +++ b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h @@ -70,8 +70,7 @@ class TransformerClangTidyCheck : public ClangTidyCheck { private: Optional Rule; - const IncludeSorter::IncludeStyle IncludeStyle; - std::unique_ptr Inserter; + IncludeInserter Inserter; }; } // namespace utils diff --git a/clang-tools-extra/unittests/clang-tidy/IncludeInserterTest.cpp b/clang-tools-extra/unittests/clang-tidy/IncludeInserterTest.cpp index ed5f02576f04a..e70d3fb91bf25 100644 --- a/clang-tools-extra/unittests/clang-tidy/IncludeInserterTest.cpp +++ b/clang-tools-extra/unittests/clang-tidy/IncludeInserterTest.cpp @@ -33,9 +33,7 @@ class IncludeInserterCheckBase : public ClangTidyCheck { void registerPPCallbacks(const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) override { - Inserter = std::make_unique( - SM, getLangOpts(), utils::IncludeSorter::IS_Google); - PP->addPPCallbacks(Inserter->CreatePPCallbacks()); + Inserter.registerPreprocessor(PP); } void registerMatchers(ast_matchers::MatchFinder *Finder) override { @@ -46,15 +44,15 @@ class IncludeInserterCheckBase : public ClangTidyCheck { auto Diag = diag(Result.Nodes.getNodeAs("stmt")->getBeginLoc(), "foo, bar"); for (StringRef Header : HeadersToInclude()) { - Diag << Inserter->CreateIncludeInsertion( - Result.SourceManager->getMainFileID(), Header, IsAngledInclude()); + Diag << Inserter.createMainFileIncludeInsertion(Header, + IsAngledInclude()); } } virtual std::vector HeadersToInclude() const = 0; virtual bool IsAngledInclude() const = 0; - std::unique_ptr Inserter; + utils::IncludeInserter Inserter{utils::IncludeSorter::IS_Google}; }; class NonSystemHeaderInserterCheck : public IncludeInserterCheckBase { From 216b67e2023315ff30c2802c911a8ae0c7640c30 Mon Sep 17 00:00:00 2001 From: Tim Northover Date: Wed, 22 Jul 2020 14:53:26 +0100 Subject: [PATCH 0182/1035] AArch64: diagnose out of range relocation addends on MachO. MachO only has 24-bit addends for most relocations, small enough that it can overflow in semi-reasonable functions and cause insidious bugs if compiled without assertions enabled. Switch it to an actual error instead. The condition isn't quite identical because ld64 treats the addend as a signed number. 
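To make the accepted range concrete: a minimal standalone sketch, not part of the patch. The old assert's unsigned mask check accepted [0, 0xffffff], while ld64 reads the addend as signed, so the legal range is [-0x800000, 0x7fffff], which is exactly what llvm::isInt<24> (from llvm/Support/MathExtras.h, used in the fix below) tests. The helper name fitsSigned24 is hypothetical.

    #include <cstdint>

    // Illustrative restatement of llvm::isInt<24>(Value): the addend must
    // fit in a signed 24-bit immediate, i.e. -0x800000 <= Value <= 0x7fffff.
    constexpr bool fitsSigned24(int64_t Value) {
      return Value >= -0x800000 && Value <= 0x7fffff;
    }

    static_assert(fitsSigned24(0x7fffff), "largest accepted positive addend");
    static_assert(fitsSigned24(-0x800000), "smallest accepted negative addend");
    static_assert(!fitsSigned24(0x800000), "rejected: one past the top");
    static_assert(!fitsSigned24(-0x800001), "rejected: one past the bottom");

The snippet compiles as-is and the checks fire at compile time; the same four boundary values appear as the accepted and rejected cases in the new macho-addend-range.s test below.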
--- .../MCTargetDesc/AArch64MachObjectWriter.cpp | 6 +++++- llvm/test/MC/AArch64/macho-addend-range.s | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 llvm/test/MC/AArch64/macho-addend-range.s diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index b0f414bd27edd..012661edbbfda 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -373,7 +373,11 @@ void AArch64MachObjectWriter::recordRelocation( Type == MachO::ARM64_RELOC_PAGE21 || Type == MachO::ARM64_RELOC_PAGEOFF12) && Value) { - assert((Value & 0xff000000) == 0 && "Added relocation out of range!"); + if (!isInt<24>(Value)) { + Asm.getContext().reportError(Fixup.getLoc(), + "addend too big for relocation"); + return; + } MachO::any_relocation_info MRE; MRE.r_word0 = FixupOffset; diff --git a/llvm/test/MC/AArch64/macho-addend-range.s b/llvm/test/MC/AArch64/macho-addend-range.s new file mode 100644 index 0000000000000..fdcb9123c7684 --- /dev/null +++ b/llvm/test/MC/AArch64/macho-addend-range.s @@ -0,0 +1,14 @@ +// RUN: not llvm-mc -triple arm64-apple-ios -filetype=obj -o /dev/null %s 2>&1 | FileCheck %s + .global _foo + adrp x0, (_foo + 1)@PAGE + adrp x0, (_foo - 1)@PAGE + adrp x0, (_foo + 0x7fffff)@PAGE + adrp x0, (_foo - 0x800000)@PAGE + + // CHECK-NOT: error: + // CHECK: error: addend too big for relocation + // CHECK: adrp x0, (_foo + 0x800000)@PAGE + // CHECK: error: addend too big for relocation + // CHECK: adrp x0, (_foo - 0x800001)@PAGE + adrp x0, (_foo + 0x800000)@PAGE + adrp x0, (_foo - 0x800001)@PAGE From 40d11a878044711708fb6738e4b78a4c9ac3de7b Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Mon, 27 Jul 2020 14:03:28 +0200 Subject: [PATCH 0183/1035] ClangdMain.cpp: this #ifdef should be an #if CLANGD_ENABLE_REMOTE is always defined; to 0 or 1. --- clang-tools-extra/clangd/tool/ClangdMain.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp index 8d1bf5c422605..f04dad7186df9 100644 --- a/clang-tools-extra/clangd/tool/ClangdMain.cpp +++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp @@ -696,7 +696,7 @@ clangd accepts flags on the commandline, and in the CLANGD_FLAGS environment var if (Sync) AsyncIndexLoad.wait(); } -#ifdef CLANGD_ENABLE_REMOTE +#if CLANGD_ENABLE_REMOTE if (RemoteIndexAddress.empty() != ProjectRoot.empty()) { llvm::errs() << "remote-index-address and project-path have to be " "specified at the same time."; From 0de629d7ae53e821274dfbe0fb3676ea08893a35 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Mon, 27 Jul 2020 14:03:55 +0200 Subject: [PATCH 0184/1035] [gn] Set CLANGD_ENABLE_REMOTE=0 To fix the build after 37ac559fccd4. 
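The ClangdMain.cpp fix above and this gn change work together: the macro must always be defined (here to 0) so that an #if guard, rather than #ifdef, can distinguish the feature being on or off. A minimal standalone illustration of the difference, assumed code rather than anything from the clangd sources:

    #include <cstdio>

    // Hypothetical stand-in for the generated feature header; the build
    // system always defines the macro, to 0 or 1.
    #define CLANGD_ENABLE_REMOTE 0

    int main() {
    #ifdef CLANGD_ENABLE_REMOTE
      // Reached even though the feature is off: #ifdef only asks whether
      // the macro is defined at all, and it is (to 0).
      std::printf("#ifdef branch taken\n");
    #endif
    #if CLANGD_ENABLE_REMOTE
      // Correct guard: reached only when the macro expands to nonzero.
      std::printf("#if branch taken\n");
    #endif
      return 0;
    }

Run as written, this prints only "#ifdef branch taken", which is precisely the bug class the two preceding commits remove.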
--- llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn index dade4cc73bf9f..84d3f14bb2f27 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn @@ -16,6 +16,7 @@ write_cmake_config("features") { } else { values += [ "CLANGD_BUILD_XPC=0" ] } + values += [ "CLANGD_ENABLE_REMOTE=0" ] public_configs = [ ":features_config" ] } From 529441e88e81a2e7dae6108e3d95e043c670e1a6 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Mon, 27 Jul 2020 14:11:29 +0200 Subject: [PATCH 0185/1035] Fix another #ifdef CLANGD_ENABLE_REMOTE --- clang-tools-extra/clangd/tool/ClangdMain.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp index f04dad7186df9..0d4267774c92f 100644 --- a/clang-tools-extra/clangd/tool/ClangdMain.cpp +++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp @@ -450,7 +450,7 @@ opt EnableConfig{ init(true), }; -#ifdef CLANGD_ENABLE_REMOTE +#if CLANGD_ENABLE_REMOTE opt RemoteIndexAddress{ "remote-index-address", cat(Features), From f49a7ad8c0854a01b945c27de2fd313b9013ae0d Mon Sep 17 00:00:00 2001 From: Kirill Bobyrev Date: Mon, 27 Jul 2020 14:15:59 +0200 Subject: [PATCH 0186/1035] [clangd] Add marshalling code for all request types Summary: Only FuzzyFindRequest is implemented via Marshaller even though other requests also follow a similar pattern. Unify them under the marshalling umbrella and make the server requests even more uniform to complement D84499. Reviewers: kadircet Reviewed By: kadircet Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, usaxena95, cfe-commits, sammccall Tags: #clang Differential Revision: https://reviews.llvm.org/D84525 --- .../index/remote/marshalling/Marshalling.cpp | 63 +++++++++++++++---- .../index/remote/marshalling/Marshalling.h | 7 ++- .../clangd/index/remote/server/Server.cpp | 34 +++++----- .../unittests/remote/MarshallingTests.cpp | 59 ++++++++++++++++- 4 files changed, 133 insertions(+), 30 deletions(-) diff --git a/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.cpp b/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.cpp index b6c83c9740727..b2085bc21f486 100644 --- a/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.cpp +++ b/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.cpp @@ -17,6 +17,7 @@ #include "index/SymbolOrigin.h" #include "support/Logger.h" #include "clang/Index/IndexSymbol.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallString.h" @@ -30,6 +31,22 @@ namespace clang { namespace clangd { namespace remote { +namespace { + +template +llvm::Expected> getIDs(MessageT *Message) { + llvm::DenseSet Result; + for (const auto &ID : Message->ids()) { + auto SID = SymbolID::fromStr(StringRef(ID)); + if (!SID) + return SID.takeError(); + Result.insert(*SID); + } + return Result; +} + +} // namespace + Marshaller::Marshaller(llvm::StringRef RemoteIndexRoot, llvm::StringRef LocalIndexRoot) : Strings(Arena) { @@ -49,27 +66,50 @@ Marshaller::Marshaller(llvm::StringRef RemoteIndexRoot, assert(!RemoteIndexRoot.empty() || !LocalIndexRoot.empty()); } -clangd::FuzzyFindRequest -Marshaller::fromProtobuf(const FuzzyFindRequest *Request) { +llvm::Expected +Marshaller::fromProtobuf(const 
LookupRequest *Message) { + clangd::LookupRequest Req; + auto IDs = getIDs(Message); + if (!IDs) + return IDs.takeError(); + Req.IDs = std::move(*IDs); + return Req; +} + +llvm::Expected +Marshaller::fromProtobuf(const FuzzyFindRequest *Message) { assert(RemoteIndexRoot); clangd::FuzzyFindRequest Result; - Result.Query = Request->query(); - for (const auto &Scope : Request->scopes()) + Result.Query = Message->query(); + for (const auto &Scope : Message->scopes()) Result.Scopes.push_back(Scope); - Result.AnyScope = Request->any_scope(); - if (Request->limit()) - Result.Limit = Request->limit(); - Result.RestrictForCodeCompletion = Request->restricted_for_code_completion(); - for (const auto &Path : Request->proximity_paths()) { + Result.AnyScope = Message->any_scope(); + if (Message->limit()) + Result.Limit = Message->limit(); + Result.RestrictForCodeCompletion = Message->restricted_for_code_completion(); + for (const auto &Path : Message->proximity_paths()) { llvm::SmallString<256> LocalPath = llvm::StringRef(*RemoteIndexRoot); llvm::sys::path::append(LocalPath, Path); Result.ProximityPaths.push_back(std::string(LocalPath)); } - for (const auto &Type : Request->preferred_types()) + for (const auto &Type : Message->preferred_types()) Result.ProximityPaths.push_back(Type); return Result; } +llvm::Expected +Marshaller::fromProtobuf(const RefsRequest *Message) { + clangd::RefsRequest Req; + auto IDs = getIDs(Message); + if (!IDs) + return IDs.takeError(); + Req.IDs = std::move(*IDs); + Req.Filter = static_cast(Message->filter()); + if (Message->limit()) + Req.Limit = Message->limit(); + return Req; +} + llvm::Optional Marshaller::fromProtobuf(const Symbol &Message) { if (!Message.has_info() || !Message.has_canonical_declaration()) { elog("Cannot convert Symbol from protobuf (missing info, definition or " @@ -157,8 +197,7 @@ FuzzyFindRequest Marshaller::toProtobuf(const clangd::FuzzyFindRequest &From) { RPCRequest.set_restricted_for_code_completion(From.RestrictForCodeCompletion); for (const auto &Path : From.ProximityPaths) { llvm::SmallString<256> RelativePath = llvm::StringRef(Path); - if (llvm::sys::path::replace_path_prefix(RelativePath, *LocalIndexRoot, - "")) + if (llvm::sys::path::replace_path_prefix(RelativePath, *LocalIndexRoot, "")) RPCRequest.add_proximity_paths(llvm::sys::path::convert_to_slash( RelativePath, llvm::sys::path::Style::posix)); } diff --git a/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.h b/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.h index 9129cff24db57..5d82cdb7e7650 100644 --- a/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.h +++ b/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.h @@ -38,10 +38,15 @@ class Marshaller { Marshaller() = delete; Marshaller(llvm::StringRef RemoteIndexRoot, llvm::StringRef LocalIndexRoot); - clangd::FuzzyFindRequest fromProtobuf(const FuzzyFindRequest *Request); llvm::Optional fromProtobuf(const Symbol &Message); llvm::Optional fromProtobuf(const Ref &Message); + llvm::Expected + fromProtobuf(const LookupRequest *Message); + llvm::Expected + fromProtobuf(const FuzzyFindRequest *Message); + llvm::Expected fromProtobuf(const RefsRequest *Message); + /// toProtobuf() functions serialize native clangd types and strip IndexRoot /// from the file paths specific to indexing machine. 
fromProtobuf() functions /// deserialize clangd types and translate relative paths into machine-native diff --git a/clang-tools-extra/clangd/index/remote/server/Server.cpp b/clang-tools-extra/clangd/index/remote/server/Server.cpp index 07b1c736b6725..7bf47a288e79c 100644 --- a/clang-tools-extra/clangd/index/remote/server/Server.cpp +++ b/clang-tools-extra/clangd/index/remote/server/Server.cpp @@ -9,6 +9,7 @@ #include "index/Index.h" #include "index/Serialization.h" #include "index/remote/marshalling/Marshalling.h" +#include "support/Logger.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Path.h" @@ -59,14 +60,12 @@ class RemoteIndexServer final : public SymbolIndex::Service { grpc::Status Lookup(grpc::ServerContext *Context, const LookupRequest *Request, grpc::ServerWriter *Reply) override { - clangd::LookupRequest Req; - for (const auto &ID : Request->ids()) { - auto SID = SymbolID::fromStr(StringRef(ID)); - if (!SID) - return grpc::Status::CANCELLED; - Req.IDs.insert(*SID); + auto Req = ProtobufMarshaller->fromProtobuf(Request); + if (!Req) { + elog("Can not parse LookupRequest from protobuf: {0}", Req.takeError()); + return grpc::Status::CANCELLED; } - Index->lookup(Req, [&](const clangd::Symbol &Sym) { + Index->lookup(*Req, [&](const clangd::Symbol &Sym) { auto SerializedSymbol = ProtobufMarshaller->toProtobuf(Sym); if (!SerializedSymbol) return; @@ -83,8 +82,13 @@ class RemoteIndexServer final : public SymbolIndex::Service { grpc::Status FuzzyFind(grpc::ServerContext *Context, const FuzzyFindRequest *Request, grpc::ServerWriter *Reply) override { - const auto Req = ProtobufMarshaller->fromProtobuf(Request); - bool HasMore = Index->fuzzyFind(Req, [&](const clangd::Symbol &Sym) { + auto Req = ProtobufMarshaller->fromProtobuf(Request); + if (!Req) { + elog("Can not parse FuzzyFindRequest from protobuf: {0}", + Req.takeError()); + return grpc::Status::CANCELLED; + } + bool HasMore = Index->fuzzyFind(*Req, [&](const clangd::Symbol &Sym) { auto SerializedSymbol = ProtobufMarshaller->toProtobuf(Sym); if (!SerializedSymbol) return; @@ -100,14 +104,12 @@ class RemoteIndexServer final : public SymbolIndex::Service { grpc::Status Refs(grpc::ServerContext *Context, const RefsRequest *Request, grpc::ServerWriter *Reply) override { - clangd::RefsRequest Req; - for (const auto &ID : Request->ids()) { - auto SID = SymbolID::fromStr(StringRef(ID)); - if (!SID) - return grpc::Status::CANCELLED; - Req.IDs.insert(*SID); + auto Req = ProtobufMarshaller->fromProtobuf(Request); + if (!Req) { + elog("Can not parse RefsRequest from protobuf: {0}", Req.takeError()); + return grpc::Status::CANCELLED; } - bool HasMore = Index->refs(Req, [&](const clangd::Ref &Reference) { + bool HasMore = Index->refs(*Req, [&](const clangd::Ref &Reference) { auto SerializedRef = ProtobufMarshaller->toProtobuf(Reference); if (!SerializedRef) return; diff --git a/clang-tools-extra/clangd/unittests/remote/MarshallingTests.cpp b/clang-tools-extra/clangd/unittests/remote/MarshallingTests.cpp index 147601b665c4c..f975d1c35e1ed 100644 --- a/clang-tools-extra/clangd/unittests/remote/MarshallingTests.cpp +++ b/clang-tools-extra/clangd/unittests/remote/MarshallingTests.cpp @@ -18,6 +18,7 @@ #include "clang/Index/IndexSymbol.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" #include "llvm/Support/Path.h" #include "llvm/Support/StringSaver.h" #include "gmock/gmock.h" @@ -271,6 +272,30 @@ TEST(RemoteMarshallingTest, IncludeHeaderURIs) { 
EXPECT_EQ(toYAML(Sym), toYAML(*Deserialized)); } +TEST(RemoteMarshallingTest, LookupRequestSerialization) { + clangd::LookupRequest Request; + Request.IDs.insert(llvm::cantFail(SymbolID::fromStr("0000000000000001"))); + Request.IDs.insert(llvm::cantFail(SymbolID::fromStr("0000000000000002"))); + + Marshaller ProtobufMarshaller(testPath("remote/"), testPath("local/")); + + auto Serialized = ProtobufMarshaller.toProtobuf(Request); + EXPECT_EQ(static_cast(Serialized.ids_size()), Request.IDs.size()); + auto Deserialized = ProtobufMarshaller.fromProtobuf(&Serialized); + ASSERT_TRUE(bool(Deserialized)); + EXPECT_EQ(Deserialized->IDs, Request.IDs); +} + +TEST(RemoteMarshallingTest, LookupRequestFailingSerialization) { + clangd::LookupRequest Request; + Marshaller ProtobufMarshaller(testPath("remote/"), testPath("local/")); + auto Serialized = ProtobufMarshaller.toProtobuf(Request); + Serialized.add_ids("Invalid Symbol ID"); + auto Deserialized = ProtobufMarshaller.fromProtobuf(&Serialized); + EXPECT_FALSE(Deserialized); + llvm::consumeError(Deserialized.takeError()); +} + TEST(RemoteMarshallingTest, FuzzyFindRequestSerialization) { clangd::FuzzyFindRequest Request; Request.ProximityPaths = {testPath("local/Header.h"), @@ -280,11 +305,43 @@ TEST(RemoteMarshallingTest, FuzzyFindRequestSerialization) { auto Serialized = ProtobufMarshaller.toProtobuf(Request); EXPECT_EQ(Serialized.proximity_paths_size(), 2); auto Deserialized = ProtobufMarshaller.fromProtobuf(&Serialized); - EXPECT_THAT(Deserialized.ProximityPaths, + ASSERT_TRUE(bool(Deserialized)); + EXPECT_THAT(Deserialized->ProximityPaths, testing::ElementsAre(testPath("remote/Header.h"), testPath("remote/subdir/OtherHeader.h"))); } +TEST(RemoteMarshallingTest, RefsRequestSerialization) { + clangd::RefsRequest Request; + Request.IDs.insert(llvm::cantFail(SymbolID::fromStr("0000000000000001"))); + Request.IDs.insert(llvm::cantFail(SymbolID::fromStr("0000000000000002"))); + + Request.Limit = 9000; + Request.Filter = RefKind::Spelled | RefKind::Declaration; + + Marshaller ProtobufMarshaller(testPath("remote/"), testPath("local/")); + + auto Serialized = ProtobufMarshaller.toProtobuf(Request); + EXPECT_EQ(static_cast(Serialized.ids_size()), Request.IDs.size()); + EXPECT_EQ(Serialized.limit(), Request.Limit); + auto Deserialized = ProtobufMarshaller.fromProtobuf(&Serialized); + ASSERT_TRUE(bool(Deserialized)); + EXPECT_EQ(Deserialized->IDs, Request.IDs); + ASSERT_TRUE(Deserialized->Limit); + EXPECT_EQ(*Deserialized->Limit, Request.Limit); + EXPECT_EQ(Deserialized->Filter, Request.Filter); +} + +TEST(RemoteMarshallingTest, RefsRequestFailingSerialization) { + clangd::RefsRequest Request; + Marshaller ProtobufMarshaller(testPath("remote/"), testPath("local/")); + auto Serialized = ProtobufMarshaller.toProtobuf(Request); + Serialized.add_ids("Invalid Symbol ID"); + auto Deserialized = ProtobufMarshaller.fromProtobuf(&Serialized); + EXPECT_FALSE(Deserialized); + llvm::consumeError(Deserialized.takeError()); +} + TEST(RemoteMarshallingTest, RelativePathToURITranslation) { Marshaller ProtobufMarshaller(/*RemoteIndexRoot=*/"", /*LocalIndexRoot=*/testPath("home/project/")); From 1ebcf03551c3136b8153d44968204c022f121ae6 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Sun, 26 Jul 2020 15:19:09 -0400 Subject: [PATCH 0187/1035] [InstSimplify] add tests for min/max intrinsics; NFC --- .../InstSimplify/maxmin_intrinsics.ll | 218 ++++++++++++++++++ 1 file changed, 218 insertions(+) diff --git a/llvm/test/Transforms/InstSimplify/maxmin_intrinsics.ll 
b/llvm/test/Transforms/InstSimplify/maxmin_intrinsics.ll index 99a8656b34aa6..13c9f0ebd4abb 100644 --- a/llvm/test/Transforms/InstSimplify/maxmin_intrinsics.ll +++ b/llvm/test/Transforms/InstSimplify/maxmin_intrinsics.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instsimplify -S | FileCheck %s +declare i81 @llvm.smax.i81(i81, i81) declare i8 @llvm.smax.i8(i8, i8) declare <2 x i8> @llvm.smax.v2i8(<2 x i8>, <2 x i8>) +declare i3 @llvm.smin.i3(i3, i3) declare i8 @llvm.smin.i8(i8, i8) declare <2 x i8> @llvm.smin.v2i8(<2 x i8>, <2 x i8>) declare i8 @llvm.umax.i8(i8, i8) @@ -10,6 +12,78 @@ declare <2 x i8> @llvm.umax.v2i8(<2 x i8>, <2 x i8>) declare i8 @llvm.umin.i8(i8, i8) declare <2 x i8> @llvm.umin.v2i8(<2 x i8>, <2 x i8>) +define i81 @smax_sameval(i81 %x) { +; CHECK-LABEL: @smax_sameval( +; CHECK-NEXT: [[R:%.*]] = call i81 @llvm.smax.i81(i81 [[X:%.*]], i81 [[X]]) +; CHECK-NEXT: ret i81 [[R]] +; + %r = call i81 @llvm.smax.i81(i81 %x, i81 %x) + ret i81 %r +} + +define i3 @smin_sameval(i3 %x) { +; CHECK-LABEL: @smin_sameval( +; CHECK-NEXT: [[R:%.*]] = call i3 @llvm.smin.i3(i3 [[X:%.*]], i3 [[X]]) +; CHECK-NEXT: ret i3 [[R]] +; + %r = call i3 @llvm.smin.i3(i3 %x, i3 %x) + ret i3 %r +} + +define <2 x i8> @umax_sameval(<2 x i8> %x) { +; CHECK-LABEL: @umax_sameval( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.umax.v2i8(<2 x i8> [[X:%.*]], <2 x i8> [[X]]) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> @llvm.umax.v2i8(<2 x i8> %x, <2 x i8> %x) + ret <2 x i8> %r +} + +define <2 x i8> @umin_sameval(<2 x i8> %x) { +; CHECK-LABEL: @umin_sameval( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.umin.v2i8(<2 x i8> [[X:%.*]], <2 x i8> [[X]]) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> @llvm.umin.v2i8(<2 x i8> %x, <2 x i8> %x) + ret <2 x i8> %r +} + +define i81 @smax_undef(i81 %x) { +; CHECK-LABEL: @smax_undef( +; CHECK-NEXT: [[R:%.*]] = call i81 @llvm.smax.i81(i81 undef, i81 [[X:%.*]]) +; CHECK-NEXT: ret i81 [[R]] +; + %r = call i81 @llvm.smax.i81(i81 undef, i81 %x) + ret i81 %r +} + +define i3 @smin_undef(i3 %x) { +; CHECK-LABEL: @smin_undef( +; CHECK-NEXT: [[R:%.*]] = call i3 @llvm.smin.i3(i3 [[X:%.*]], i3 undef) +; CHECK-NEXT: ret i3 [[R]] +; + %r = call i3 @llvm.smin.i3(i3 %x, i3 undef) + ret i3 %r +} + +define <2 x i8> @umax_undef(<2 x i8> %x) { +; CHECK-LABEL: @umax_undef( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> [[X:%.*]]) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> %x) + ret <2 x i8> %r +} + +define <2 x i8> @umin_undef(<2 x i8> %x) { +; CHECK-LABEL: @umin_undef( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.umin.v2i8(<2 x i8> [[X:%.*]], <2 x i8> undef) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> @llvm.umin.v2i8(<2 x i8> %x, <2 x i8> undef) + ret <2 x i8> %r +} + define i8 @smax_maxval(i8 %x) { ; CHECK-LABEL: @smax_maxval( ; CHECK-NEXT: ret i8 127 @@ -73,3 +147,147 @@ define <2 x i8> @umin_minval_commute(<2 x i8> %x) { %r = call <2 x i8> @llvm.umin.v2i8(<2 x i8> %x, <2 x i8> zeroinitializer) ret <2 x i8> %r } + +define i8 @smax_minval(i8 %x) { +; CHECK-LABEL: @smax_minval( +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.smax.i8(i8 [[X:%.*]], i8 -128) +; CHECK-NEXT: ret i8 [[R]] +; + %r = call i8 @llvm.smax.i8(i8 %x, i8 -128) + ret i8 %r +} + +define <2 x i8> @smax_minval_commute(<2 x i8> %x) { +; CHECK-LABEL: @smax_minval_commute( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.smax.v2i8(<2 x i8> , <2 x i8> 
[[X:%.*]]) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> @llvm.smax.v2i8(<2 x i8> , <2 x i8> %x) + ret <2 x i8> %r +} + +define i8 @smin_maxval(i8 %x) { +; CHECK-LABEL: @smin_maxval( +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.smin.i8(i8 127, i8 [[X:%.*]]) +; CHECK-NEXT: ret i8 [[R]] +; + %r = call i8 @llvm.smin.i8(i8 127, i8 %x) + ret i8 %r +} + +define <2 x i8> @smin_maxval_commute(<2 x i8> %x) { +; CHECK-LABEL: @smin_maxval_commute( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.smin.v2i8(<2 x i8> [[X:%.*]], <2 x i8> ) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> @llvm.smin.v2i8(<2 x i8> %x, <2 x i8> ) + ret <2 x i8> %r +} + +define i8 @umax_minval(i8 %x) { +; CHECK-LABEL: @umax_minval( +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 0) +; CHECK-NEXT: ret i8 [[R]] +; + %r = call i8 @llvm.umax.i8(i8 %x, i8 0) + ret i8 %r +} + +define <2 x i8> @umax_minval_commute(<2 x i8> %x) { +; CHECK-LABEL: @umax_minval_commute( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.umax.v2i8(<2 x i8> zeroinitializer, <2 x i8> [[X:%.*]]) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> @llvm.umax.v2i8(<2 x i8> zeroinitializer, <2 x i8> %x) + ret <2 x i8> %r +} + +define i8 @umin_maxval(i8 %x) { +; CHECK-LABEL: @umin_maxval( +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.umin.i8(i8 -1, i8 [[X:%.*]]) +; CHECK-NEXT: ret i8 [[R]] +; + %r = call i8 @llvm.umin.i8(i8 255, i8 %x) + ret i8 %r +} + +define <2 x i8> @umin_maxval_commute(<2 x i8> %x) { +; CHECK-LABEL: @umin_maxval_commute( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.umin.v2i8(<2 x i8> [[X:%.*]], <2 x i8> ) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> @llvm.umin.v2i8(<2 x i8> %x, <2 x i8> ) + ret <2 x i8> %r +} + +define <2 x i8> @smax_maxval_partial_undef(<2 x i8> %x) { +; CHECK-LABEL: @smax_maxval_partial_undef( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.smax.v2i8(<2 x i8> , <2 x i8> [[X:%.*]]) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> @llvm.smax.v2i8(<2 x i8> , <2 x i8> %x) + ret <2 x i8> %r +} + +define <2 x i8> @smin_minval_partial_undef(<2 x i8> %x) { +; CHECK-LABEL: @smin_minval_partial_undef( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.smin.v2i8(<2 x i8> [[X:%.*]], <2 x i8> ) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> @llvm.smin.v2i8(<2 x i8> %x, <2 x i8> ) + ret <2 x i8> %r +} + +define <2 x i8> @umax_maxval_partial_undef(<2 x i8> %x) { +; CHECK-LABEL: @umax_maxval_partial_undef( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.umax.v2i8(<2 x i8> , <2 x i8> [[X:%.*]]) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> @llvm.umax.v2i8(<2 x i8> , <2 x i8> %x) + ret <2 x i8> %r +} + +define <2 x i8> @umin_minval_partial_undef(<2 x i8> %x) { +; CHECK-LABEL: @umin_minval_partial_undef( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.umin.v2i8(<2 x i8> [[X:%.*]], <2 x i8> ) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> @llvm.umin.v2i8(<2 x i8> %x, <2 x i8> ) + ret <2 x i8> %r +} + +define <2 x i8> @smax_minval_partial_undef(<2 x i8> %x) { +; CHECK-LABEL: @smax_minval_partial_undef( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.smax.v2i8(<2 x i8> , <2 x i8> [[X:%.*]]) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> @llvm.smax.v2i8(<2 x i8> , <2 x i8> %x) + ret <2 x i8> %r +} + +define <2 x i8> @smin_maxval_partial_undef(<2 x i8> %x) { +; CHECK-LABEL: @smin_maxval_partial_undef( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.smin.v2i8(<2 x i8> [[X:%.*]], <2 x i8> ) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> 
@llvm.smin.v2i8(<2 x i8> %x, <2 x i8> ) + ret <2 x i8> %r +} + +define <2 x i8> @umax_minval_partial_undef(<2 x i8> %x) { +; CHECK-LABEL: @umax_minval_partial_undef( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.umax.v2i8(<2 x i8> , <2 x i8> [[X:%.*]]) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> @llvm.umax.v2i8(<2 x i8> , <2 x i8> %x) + ret <2 x i8> %r +} + +define <2 x i8> @umin_maxval_partial_undef(<2 x i8> %x) { +; CHECK-LABEL: @umin_maxval_partial_undef( +; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.umin.v2i8(<2 x i8> [[X:%.*]], <2 x i8> ) +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %r = call <2 x i8> @llvm.umin.v2i8(<2 x i8> %x, <2 x i8> ) + ret <2 x i8> %r +} From db203e0268479d16d36a318c726cce5a4a5f75a6 Mon Sep 17 00:00:00 2001 From: Raphael Isemann Date: Mon, 27 Jul 2020 14:05:23 +0200 Subject: [PATCH 0188/1035] [lldb] Modernize away some snprintf calls Reviewers: #lldb, JDevlieghere Reviewed By: #lldb, JDevlieghere Subscribers: JDevlieghere Differential Revision: https://reviews.llvm.org/D84530 --- lldb/source/Commands/CommandObjectProcess.cpp | 20 +++++++++---------- lldb/source/Core/Communication.cpp | 5 ++--- lldb/source/Core/Debugger.cpp | 4 +--- lldb/source/Core/SourceManager.cpp | 9 +++++---- lldb/source/Core/ValueObject.cpp | 10 +++------- lldb/source/Core/ValueObjectChild.cpp | 11 ++-------- .../source/Interpreter/CommandInterpreter.cpp | 10 ++-------- 7 files changed, 25 insertions(+), 44 deletions(-) diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp index f86779d85b5fa..fd8d38e856372 100644 --- a/lldb/source/Commands/CommandObjectProcess.cpp +++ b/lldb/source/Commands/CommandObjectProcess.cpp @@ -48,19 +48,19 @@ class CommandObjectProcessLaunchOrAttach : public CommandObjectParsed { state = process->GetState(); if (process->IsAlive() && state != eStateConnected) { - char message[1024]; + std::string message; if (process->GetState() == eStateAttaching) - ::snprintf(message, sizeof(message), - "There is a pending attach, abort it and %s?", - m_new_process_action.c_str()); + message = + llvm::formatv("There is a pending attach, abort it and {0}?", + m_new_process_action); else if (process->GetShouldDetach()) - ::snprintf(message, sizeof(message), - "There is a running process, detach from it and %s?", - m_new_process_action.c_str()); + message = llvm::formatv( + "There is a running process, detach from it and {0}?", + m_new_process_action); else - ::snprintf(message, sizeof(message), - "There is a running process, kill it and %s?", - m_new_process_action.c_str()); + message = + llvm::formatv("There is a running process, kill it and {0}?", + m_new_process_action); if (!m_interpreter.Confirm(message, true)) { result.SetStatus(eReturnStatusFailed); diff --git a/lldb/source/Core/Communication.cpp b/lldb/source/Core/Communication.cpp index b358e70b1a91e..859f5be74b439 100644 --- a/lldb/source/Core/Communication.cpp +++ b/lldb/source/Core/Communication.cpp @@ -199,9 +199,8 @@ bool Communication::StartReadThread(Status *error_ptr) { LLDB_LOG(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_COMMUNICATION), "{0} Communication::StartReadThread ()", this); - char thread_name[1024]; - snprintf(thread_name, sizeof(thread_name), "", - GetBroadcasterName().AsCString()); + const std::string thread_name = + llvm::formatv("", GetBroadcasterName()); m_read_thread_enabled = true; m_read_thread_did_exit = false; diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index 5f4f1e266d81c..05cfac19915e1 
100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -666,9 +666,7 @@ Debugger::Debugger(lldb::LogOutputCallback log_callback, void *baton) m_event_handler_thread(), m_io_handler_thread(), m_sync_broadcaster(nullptr, "lldb.debugger.sync"), m_forward_listener_sp(), m_clear_once() { - char instance_cstr[256]; - snprintf(instance_cstr, sizeof(instance_cstr), "debugger_%d", (int)GetID()); - m_instance_name.SetCString(instance_cstr); + m_instance_name.SetString(llvm::formatv("debugger_{0}", GetID()).str()); if (log_callback) m_log_callback_stream_sp = std::make_shared(log_callback, baton); diff --git a/lldb/source/Core/SourceManager.cpp b/lldb/source/Core/SourceManager.cpp index 7414dd281d43a..e79fcb48742d8 100644 --- a/lldb/source/Core/SourceManager.cpp +++ b/lldb/source/Core/SourceManager.cpp @@ -183,14 +183,14 @@ size_t SourceManager::DisplaySourceLinesWithLineNumbersUsingLastFile( break; } - char prefix[32] = ""; + std::string prefix; if (bp_locs) { uint32_t bp_count = bp_locs->NumLineEntriesWithLine(line); if (bp_count > 0) - ::snprintf(prefix, sizeof(prefix), "[%u] ", bp_count); + prefix = llvm::formatv("[{0}]", bp_count); else - ::snprintf(prefix, sizeof(prefix), " "); + prefix = " "; } char buffer[3]; @@ -206,7 +206,8 @@ size_t SourceManager::DisplaySourceLinesWithLineNumbersUsingLastFile( .str()); } - s->Printf("%s%s %-4u\t", prefix, current_line_highlight.c_str(), line); + s->Printf("%s%s %-4u\t", prefix.c_str(), current_line_highlight.c_str(), + line); // So far we treated column 0 as a special 'no column value', but // DisplaySourceLines starts counting columns from 0 (and no column is diff --git a/lldb/source/Core/ValueObject.cpp b/lldb/source/Core/ValueObject.cpp index 3a775b07e5e1f..d3a1971235ca0 100644 --- a/lldb/source/Core/ValueObject.cpp +++ b/lldb/source/Core/ValueObject.cpp @@ -1702,8 +1702,7 @@ ValueObjectSP ValueObject::GetSyntheticArrayMember(size_t index, bool can_create) { ValueObjectSP synthetic_child_sp; if (IsPointerType() || IsArrayType()) { - char index_str[64]; - snprintf(index_str, sizeof(index_str), "[%" PRIu64 "]", (uint64_t)index); + std::string index_str = llvm::formatv("[{0}]", index); ConstString index_const_str(index_str); // Check if we have already created a synthetic array member in this valid // object. If we have we will re-use it. @@ -1730,8 +1729,7 @@ ValueObjectSP ValueObject::GetSyntheticBitFieldChild(uint32_t from, uint32_t to, bool can_create) { ValueObjectSP synthetic_child_sp; if (IsScalarType()) { - char index_str[64]; - snprintf(index_str, sizeof(index_str), "[%i-%i]", from, to); + std::string index_str = llvm::formatv("[{0}-{1}]", from, to); ConstString index_const_str(index_str); // Check if we have already created a synthetic array member in this valid // object. If we have we will re-use it. 
@@ -1768,9 +1766,7 @@ ValueObjectSP ValueObject::GetSyntheticChildAtOffset(
   ValueObjectSP synthetic_child_sp;
 
   if (name_const_str.IsEmpty()) {
-    char name_str[64];
-    snprintf(name_str, sizeof(name_str), "@%i", offset);
-    name_const_str.SetCString(name_str);
+    name_const_str.SetString("@" + std::to_string(offset));
   }
 
   // Check if we have already created a synthetic array member in this valid
diff --git a/lldb/source/Core/ValueObjectChild.cpp b/lldb/source/Core/ValueObjectChild.cpp
index 6205ed32c615a..28cb49328f34e 100644
--- a/lldb/source/Core/ValueObjectChild.cpp
+++ b/lldb/source/Core/ValueObjectChild.cpp
@@ -57,15 +57,8 @@ size_t ValueObjectChild::CalculateNumChildren(uint32_t max) {
 
 static void AdjustForBitfieldness(ConstString &name,
                                   uint8_t bitfield_bit_size) {
-  if (name && bitfield_bit_size) {
-    const char *compiler_type_name = name.AsCString();
-    if (compiler_type_name) {
-      std::vector<char> bitfield_type_name(strlen(compiler_type_name) + 32, 0);
-      ::snprintf(&bitfield_type_name.front(), bitfield_type_name.size(),
-                 "%s:%u", compiler_type_name, bitfield_bit_size);
-      name.SetCString(&bitfield_type_name.front());
-    }
-  }
+  if (name && bitfield_bit_size)
+    name.SetString(llvm::formatv("{0}:{1}", name, bitfield_bit_size).str());
 }
 
 ConstString ValueObjectChild::GetTypeName() {
diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp
index 50a69b29260ca..aca3654b03097 100644
--- a/lldb/source/Interpreter/CommandInterpreter.cpp
+++ b/lldb/source/Interpreter/CommandInterpreter.cpp
@@ -1987,10 +1987,7 @@ void CommandInterpreter::BuildAliasCommandArgs(CommandObject *alias_cmd_obj,
           if (value_type != OptionParser::eOptionalArgument)
             new_args.AppendArgument(value);
           else {
-            char buffer[255];
-            ::snprintf(buffer, sizeof(buffer), "%s%s", option.c_str(),
-                       value.c_str());
-            new_args.AppendArgument(llvm::StringRef(buffer));
+            new_args.AppendArgument(option + value);
           }
         } else if (static_cast<size_t>(index) >= cmd_args.GetArgumentCount()) {
@@ -2012,10 +2009,7 @@ void CommandInterpreter::BuildAliasCommandArgs(CommandObject *alias_cmd_obj,
           if (value_type != OptionParser::eOptionalArgument)
             new_args.AppendArgument(cmd_args.GetArgumentAtIndex(index));
           else {
-            char buffer[255];
-            ::snprintf(buffer, sizeof(buffer), "%s%s", option.c_str(),
-                       cmd_args.GetArgumentAtIndex(index));
-            new_args.AppendArgument(buffer);
+            new_args.AppendArgument(option + cmd_args.GetArgumentAtIndex(index));
           }
           used[index] = true;
         }

From 90684d1545167ee4e0c93d8eaf6ba4a3c7ab710e Mon Sep 17 00:00:00 2001
From: Aleksandr Platonov
Date: Mon, 27 Jul 2020 14:39:31 +0300
Subject: [PATCH 0189/1035] [clangd] Collect references for externally visible
 main-file symbols

Summary:
Without this patch clangd does not collect references for main-file symbols if there is no public declaration in the preamble.

Example:
`test1.c`
```
void f1() {}
```
`test2.c`
```
extern void f1();
void f2() { f^1(); }
```
`Find all references` does not show the definition of f1() in the result, but GTD works OK.
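For illustration, the relaxed filter in `SymbolCollector::handleDeclOccurrence` boils down to the following predicate (a simplified sketch of the condition in the diff below, not the exact clangd code; `shouldCollectRef` is a made-up helper name):

```cpp
#include "clang/AST/Decl.h"

// Sketch: a symbol declared only in the main file now keeps its references
// as long as it is externally visible, because another TU may refer to it
// through an extern declaration, as f1() is referenced from test2.c above.
static bool shouldCollectRef(const clang::NamedDecl &ND, bool CollectRef,
                             bool IsMainFileOnly) {
  return CollectRef && (!IsMainFileOnly || ND.isExternallyVisible());
}
```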
Reviewers: sammccall, kadircet Reviewed By: kadircet Subscribers: ilya-golovenko, ilya-biryukov, MaskRay, jkorous, arphaman, kadircet, usaxena95, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D84513 --- .../clangd/index/SymbolCollector.cpp | 3 ++- .../clangd/unittests/BackgroundIndexTests.cpp | 2 +- .../clangd/unittests/SymbolCollectorTests.cpp | 18 ++++++++++++------ 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/clang-tools-extra/clangd/index/SymbolCollector.cpp b/clang-tools-extra/clangd/index/SymbolCollector.cpp index 6c11399c87b68..c163951aff9ba 100644 --- a/clang-tools-extra/clangd/index/SymbolCollector.cpp +++ b/clang-tools-extra/clangd/index/SymbolCollector.cpp @@ -314,7 +314,8 @@ bool SymbolCollector::handleDeclOccurrence( // file locations for references (as it aligns the behavior of clangd's // AST-based xref). // FIXME: we should try to use the file locations for other fields. - if (CollectRef && !IsMainFileOnly && !isa(ND) && + if (CollectRef && (!IsMainFileOnly || ND->isExternallyVisible()) && + !isa(ND) && (Opts.RefsInHeaders || SM.getFileID(SM.getFileLoc(Loc)) == SM.getMainFileID())) DeclRefs[ND].emplace_back(SM.getFileLoc(Loc), Roles); diff --git a/clang-tools-extra/clangd/unittests/BackgroundIndexTests.cpp b/clang-tools-extra/clangd/unittests/BackgroundIndexTests.cpp index 70d5156b10723..f1c582ef1abe5 100644 --- a/clang-tools-extra/clangd/unittests/BackgroundIndexTests.cpp +++ b/clang-tools-extra/clangd/unittests/BackgroundIndexTests.cpp @@ -171,7 +171,7 @@ TEST_F(BackgroundIndexTest, IndexTwoFiles) { #endif )cpp"; FS.Files[testPath("root/A.cc")] = - "#include \"A.h\"\nvoid g() { (void)common; }"; + "#include \"A.h\"\nstatic void g() { (void)common; }"; FS.Files[testPath("root/B.cc")] = R"cpp( #define A 0 diff --git a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp index 9e4f75b5cca3f..3614ab2c5cb9e 100644 --- a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp +++ b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp @@ -624,11 +624,13 @@ TEST_F(SymbolCollectorTest, Refs) { EXPECT_THAT(Refs, Not(Contains(Pair(findSymbol(Symbols, "NS").ID, _)))); EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "MACRO").ID, HaveRanges(Main.ranges("macro"))))); - // Symbols *only* in the main file (a, b, c, FUNC) had no refs collected. + // Symbols *only* in the main file: + // - (a, b) externally visible and should have refs. + // - (c, FUNC) externally invisible and had no refs collected. auto MainSymbols = TestTU::withHeaderCode(SymbolsOnlyInMainCode.code()).headerSymbols(); - EXPECT_THAT(Refs, Not(Contains(Pair(findSymbol(MainSymbols, "a").ID, _)))); - EXPECT_THAT(Refs, Not(Contains(Pair(findSymbol(MainSymbols, "b").ID, _)))); + EXPECT_THAT(Refs, Contains(Pair(findSymbol(MainSymbols, "a").ID, _))); + EXPECT_THAT(Refs, Contains(Pair(findSymbol(MainSymbols, "b").ID, _))); EXPECT_THAT(Refs, Not(Contains(Pair(findSymbol(MainSymbols, "c").ID, _)))); EXPECT_THAT(Refs, Not(Contains(Pair(findSymbol(MainSymbols, "FUNC").ID, _)))); } @@ -816,11 +818,15 @@ TEST_F(SymbolCollectorTest, HeaderAsMainFile) { $Foo[[Foo]] fo; } )"); - // The main file is normal .cpp file, we shouldn't collect any refs of symbols - // which are not declared in the preamble. + // The main file is normal .cpp file, we should collect the refs + // for externally visible symbols. 
TestFileName = testPath("foo.cpp"); runSymbolCollector("", Header.code()); - EXPECT_THAT(Refs, UnorderedElementsAre()); + EXPECT_THAT(Refs, + UnorderedElementsAre(Pair(findSymbol(Symbols, "Foo").ID, + HaveRanges(Header.ranges("Foo"))), + Pair(findSymbol(Symbols, "Func").ID, + HaveRanges(Header.ranges("Func"))))); // Run the .h file as main file, we should collect the refs. TestFileName = testPath("foo.h"); From 432241955e032fba3d8b584ee6388212909bee9b Mon Sep 17 00:00:00 2001 From: Raphael Isemann Date: Mon, 27 Jul 2020 14:28:30 +0200 Subject: [PATCH 0190/1035] [lldb][NFC] Use a StringRef for AddRegexCommand::AddRegexCommand parameters Summary: This way we can get rid of this 1024 char buffer workaround. Reviewers: #lldb, labath Reviewed By: labath Subscribers: JDevlieghere Differential Revision: https://reviews.llvm.org/D84528 --- .../lldb/Interpreter/CommandObjectRegexCommand.h | 2 +- lldb/source/Commands/CommandObjectCommands.cpp | 2 +- lldb/source/Interpreter/CommandInterpreter.cpp | 11 +++-------- lldb/source/Interpreter/CommandObjectRegexCommand.cpp | 9 ++++----- 4 files changed, 9 insertions(+), 15 deletions(-) diff --git a/lldb/include/lldb/Interpreter/CommandObjectRegexCommand.h b/lldb/include/lldb/Interpreter/CommandObjectRegexCommand.h index 01d7c6d118d46..cbd50511c483c 100644 --- a/lldb/include/lldb/Interpreter/CommandObjectRegexCommand.h +++ b/lldb/include/lldb/Interpreter/CommandObjectRegexCommand.h @@ -30,7 +30,7 @@ class CommandObjectRegexCommand : public CommandObjectRaw { bool IsRemovable() const override { return m_is_removable; } - bool AddRegexCommand(const char *re_cstr, const char *command_cstr); + bool AddRegexCommand(llvm::StringRef re_cstr, llvm::StringRef command_cstr); bool HasRegexEntries() const { return !m_entries.empty(); } diff --git a/lldb/source/Commands/CommandObjectCommands.cpp b/lldb/source/Commands/CommandObjectCommands.cpp index 255fbe53fb2ea..eaf22344fafa2 100644 --- a/lldb/source/Commands/CommandObjectCommands.cpp +++ b/lldb/source/Commands/CommandObjectCommands.cpp @@ -970,7 +970,7 @@ a number follows 'f':" std::string subst(std::string(regex_sed.substr( second_separator_char_pos + 1, third_separator_char_pos - second_separator_char_pos - 1))); - m_regex_cmd_up->AddRegexCommand(regex.c_str(), subst.c_str()); + m_regex_cmd_up->AddRegexCommand(regex, subst); } return error; } diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index aca3654b03097..4786e4602e4b4 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -631,15 +631,10 @@ void CommandInterpreter::LoadCommandDictionary() { if (tbreak_regex_cmd_up) { bool success = true; for (size_t i = 0; i < num_regexes; i++) { - // If you add a resultant command string longer than 1024 characters be - // sure to increase the size of this buffer. 
-      char buffer[1024];
-      int num_printed =
-          snprintf(buffer, 1024, "%s %s", break_regexes[i][1], "-o 1");
-      lldbassert(num_printed < 1024);
-      UNUSED_IF_ASSERT_DISABLED(num_printed);
+      std::string command = break_regexes[i][1];
+      command += " -o 1";
       success =
-          tbreak_regex_cmd_up->AddRegexCommand(break_regexes[i][0], buffer);
+          tbreak_regex_cmd_up->AddRegexCommand(break_regexes[i][0], command);
       if (!success)
         break;
     }
diff --git a/lldb/source/Interpreter/CommandObjectRegexCommand.cpp b/lldb/source/Interpreter/CommandObjectRegexCommand.cpp
index 5a0265e58c5c6..7485fd76cc25f 100644
--- a/lldb/source/Interpreter/CommandObjectRegexCommand.cpp
+++ b/lldb/source/Interpreter/CommandObjectRegexCommand.cpp
@@ -69,14 +69,13 @@ bool CommandObjectRegexCommand::DoExecute(llvm::StringRef command,
   return false;
 }
 
-bool CommandObjectRegexCommand::AddRegexCommand(const char *re_cstr,
-                                                const char *command_cstr) {
+bool CommandObjectRegexCommand::AddRegexCommand(llvm::StringRef re_cstr,
+                                                llvm::StringRef command_cstr) {
   m_entries.resize(m_entries.size() + 1);
   // Only add the regular expression if it compiles
-  m_entries.back().regex =
-      RegularExpression(llvm::StringRef::withNullAsEmpty(re_cstr));
+  m_entries.back().regex = RegularExpression(re_cstr);
   if (m_entries.back().regex.IsValid()) {
-    m_entries.back().command.assign(command_cstr);
+    m_entries.back().command = command_cstr.str();
     return true;
   }
   // The regex didn't compile...

From d1271127240b0920e8758519d95a948d03a832ad Mon Sep 17 00:00:00 2001
From: Nathan James
Date: Mon, 27 Jul 2020 13:37:21 +0100
Subject: [PATCH 0191/1035] [llvm][NFC] Silence unused variable warning by
 using isa over dyn_cast

---
 llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 711be57a0bafa..965a72d3a6500 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1143,7 +1143,7 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
                                Builder);
       NewPN->addIncoming(InV, PN->getIncomingBlock(i));
     }
-  } else if (auto *FI = dyn_cast<FreezeInst>(&I)) {
+  } else if (isa<FreezeInst>(&I)) {
     for (unsigned i = 0; i != NumPHIValues; ++i) {
       Value *InV;
       if (NonConstBB == PN->getIncomingBlock(i))

From 1bac5101cdaabfbc755a6d28936962d11240f932 Mon Sep 17 00:00:00 2001
From: Roman Lebedev
Date: Mon, 27 Jul 2020 13:30:39 +0300
Subject: [PATCH 0192/1035] [Reduce] Function reduction: replace all users of
 function with undef

A function may have users other than CallInsts, and, more importantly, we
can't actually replace a function pointer with undef: for constants, that
would not preserve the type, and RAUW would assert.

In particular, this affects blockaddress. However, it proves prohibitively
complex to come up with a good test involving blockaddress: we'd need to
ensure both that the function body survives until this pass and that it is
not interesting in this pass.
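To make the reasoning concrete, the intended behavior can be condensed into a standalone helper (my own sketch mirroring the loop in the diff below; it is not code from the patch):

```cpp
#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

// Detach every user of F, whether it is a call, a store of F's address, or
// a constant expression: each user is RAUW'd with undef of the *user's* own
// type, which sidesteps replacing F itself (a typed constant) with undef.
// Users that are instructions are then deleted; constant users become dead.
static void dropUsersOf(Function &F) {
  SetVector<Instruction *> InstrsToRemove;
  for (User *U : F.users()) {
    U->replaceAllUsesWith(UndefValue::get(U->getType()));
    if (auto *I = dyn_cast<Instruction>(U))
      InstrsToRemove.insert(I);
  }
  for (Instruction *I : InstrsToRemove)
    I->eraseFromParent();
}
```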
---
 .../llvm-reduce/deltas/ReduceFunctions.cpp    | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp b/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp
index b29df88261d91..4dd7b98fca3b6 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp
@@ -32,22 +32,21 @@ static void extractFunctionsFromModule(const std::vector<Chunk> &ChunksToKeep,
     if (O.shouldKeep())
       FuncsToKeep.insert(&F);
 
-  // Delete out-of-chunk functions, and replace their calls with undef
+  // Delete out-of-chunk functions, and replace their users with undef
   std::vector<Function *> FuncsToRemove;
-  SetVector<CallInst *> CallsToRemove;
+  SetVector<Instruction *> InstrsToRemove;
   for (auto &F : *Program)
     if (!FuncsToKeep.count(&F)) {
-      for (auto U : F.users())
-        if (auto *Call = dyn_cast<CallInst>(U)) {
-          Call->replaceAllUsesWith(UndefValue::get(Call->getType()));
-          CallsToRemove.insert(Call);
-        }
-      F.replaceAllUsesWith(UndefValue::get(F.getType()));
+      for (auto U : F.users()) {
+        U->replaceAllUsesWith(UndefValue::get(U->getType()));
+        if (auto *I = dyn_cast<Instruction>(U))
+          InstrsToRemove.insert(I);
+      }
       FuncsToRemove.push_back(&F);
     }
 
-  for (auto *C : CallsToRemove)
-    C->eraseFromParent();
+  for (auto *I : InstrsToRemove)
+    I->eraseFromParent();
 
   for (auto *F : FuncsToRemove)
     F->eraseFromParent();

From 61480db6019d01a7a97de6ec64991664bf9b4996 Mon Sep 17 00:00:00 2001
From: Roman Lebedev
Date: Mon, 27 Jul 2020 15:36:07 +0300
Subject: [PATCH 0193/1035] [Reduce] Argument reduction: shoe-horn new function
 into remaining uses of old function

Much like with function reduction, there may be remaining unhandled uses
of the function, in particular in blockaddress. And in constants we can't
RAUW it with undef, because undef is not a function. Instead, let's try to
pretend that in the remaining cases the new signature didn't change, by
bitcasting it.

A new (previously crashing) test case added.
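In code, the shoe-horning is a single substitution; a minimal sketch with hypothetical `OldF`/`NewF` names (the patch itself spells these `F` and `ClonedFunc`):

```cpp
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
using namespace llvm;

// Sketch: make remaining use sites, including blockaddress and other
// constant expressions, keep seeing a value of the old function type. The
// bitcast is itself a Constant, so unlike undef it is a legal stand-in for
// the old function inside constant users.
static void shoeHornReplacement(Function *OldF, Function *NewF) {
  OldF->replaceAllUsesWith(ConstantExpr::getBitCast(NewF, OldF->getType()));
  OldF->eraseFromParent();
}
```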
--- ...arguments-of-funcs-used-in-blockaddress.ll | 30 +++++++++++++++++++ llvm/test/Reduce/remove-invoked-functions.ll | 2 +- .../llvm-reduce/deltas/ReduceArguments.cpp | 2 +- 3 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Reduce/remove-function-arguments-of-funcs-used-in-blockaddress.ll diff --git a/llvm/test/Reduce/remove-function-arguments-of-funcs-used-in-blockaddress.ll b/llvm/test/Reduce/remove-function-arguments-of-funcs-used-in-blockaddress.ll new file mode 100644 index 0000000000000..f1ad5db49bfb6 --- /dev/null +++ b/llvm/test/Reduce/remove-function-arguments-of-funcs-used-in-blockaddress.ll @@ -0,0 +1,30 @@ +; RUN: llvm-reduce --test FileCheck --test-arg --check-prefixes=CHECK-ALL,CHECK-INTERESTINGNESS --test-arg %s --test-arg --input-file %s -o %t +; RUN: cat %t | FileCheck --check-prefixes=CHECK-ALL,CHECK-FINAL %s + +; CHECK-INTERESTINGNESS: define void @func( +; CHECK-FINAL: define void @func() +define void @func(i1 %arg) { +; CHECK-ALL: bb: +; CHECK-ALL: br label %bb4 +bb: + br label %bb4 + +; CHECK-ALL: bb4 +bb4: +; CHECK-INTERESTINGNESS; callbr void asm +; CHECK-INTERESTINGNESS-SAME; blockaddress +; CHECK-FINAL: callbr void asm sideeffect "", "X"(i8* blockaddress(@func, %bb11)) +; CHECK-ALL: to label %bb5 [label %bb11] + callbr void asm sideeffect "", "X"(i8* blockaddress(@func, %bb11)) + to label %bb5 [label %bb11] + +; CHECK-ALL: bb5: +; CHECK-ALL: br label %bb11 +bb5: + br label %bb11 + +; CHECK-ALL: bb11: +; CHECK-ALL: ret void +bb11: + ret void +} diff --git a/llvm/test/Reduce/remove-invoked-functions.ll b/llvm/test/Reduce/remove-invoked-functions.ll index e4458e662fee3..d291859478033 100644 --- a/llvm/test/Reduce/remove-invoked-functions.ll +++ b/llvm/test/Reduce/remove-invoked-functions.ll @@ -23,7 +23,7 @@ define void @caller(i32 %arg) personality i8* bitcast (i32 (...)* @__gxx_persona ; CHECK-ALL: bb: bb: ; CHECK-INTERESTINGNESS: %i0 = invoke i32 -; CHECK-FINAL: %i0 = invoke i32 undef(i32 %arg) +; CHECK-FINAL: %i0 = invoke i32 bitcast (i32 ()* @maybe_throwing_callee to i32 (i32)*)(i32 %arg) ; CHECK-ALL: to label %bb3 unwind label %bb1 %i0 = invoke i32 @maybe_throwing_callee(i32 %arg) to label %bb3 unwind label %bb1 diff --git a/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp b/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp index 9488d71b71c35..c3c7dee83db10 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp @@ -94,7 +94,7 @@ static void extractArgumentsFromModule(std::vector ChunksToKeep, replaceFunctionCalls(*F, *ClonedFunc, ArgIndexesToKeep); // Rename Cloned Function to Old's name std::string FName = std::string(F->getName()); - F->replaceAllUsesWith(UndefValue::get(F->getType())); + F->replaceAllUsesWith(ConstantExpr::getBitCast(ClonedFunc, F->getType())); F->eraseFromParent(); ClonedFunc->setName(FName); } From 1da9834557cd4302a5183b8228ce063e69f82602 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Mon, 27 Jul 2020 15:07:51 +0300 Subject: [PATCH 0194/1035] [JumpThreading] ProcessBranchOnXOR(): bailout if any pred ends in indirect branch (PR46857) SplitBlockPredecessors() can not split blocks that have such terminators, and in two other places we already ensure that we don't end up calling SplitBlockPredecessors() on such blocks. Do so in one more place. 
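The bail-out itself is small; here is a self-contained sketch of the added check with the template arguments written out (assume `BlocksToFoldInto` is the predecessor set the pass has already gathered):

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// An indirectbr terminator (and likewise a callbr) does not allow its
// outgoing edges to be retargeted, so SplitBlockPredecessors() cannot split
// such a predecessor; ProcessBranchOnXOR() must give up instead.
static bool hasUnsplittablePred(ArrayRef<BasicBlock *> BlocksToFoldInto) {
  return any_of(BlocksToFoldInto, [](BasicBlock *Pred) {
    return isa<IndirectBrInst>(Pred->getTerminator()) ||
           isa<CallBrInst>(Pred->getTerminator());
  });
}
```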
Fixes https://bugs.llvm.org/show_bug.cgi?id=46857 --- llvm/lib/Transforms/Scalar/JumpThreading.cpp | 8 +++ .../JumpThreading/pr46857-callbr.ll | 52 +++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 llvm/test/Transforms/JumpThreading/pr46857-callbr.ll diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 9d0500419a7f5..2f379b7f61608 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1859,6 +1859,14 @@ bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) { return true; } + // If any of predecessors end with an indirect goto, we can't change its + // destination. Same for CallBr. + if (any_of(BlocksToFoldInto, [](BasicBlock *Pred) { + return isa(Pred->getTerminator()) || + isa(Pred->getTerminator()); + })) + return false; + // Try to duplicate BB into PredBB. return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto); } diff --git a/llvm/test/Transforms/JumpThreading/pr46857-callbr.ll b/llvm/test/Transforms/JumpThreading/pr46857-callbr.ll new file mode 100644 index 0000000000000..3de7d6265136d --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/pr46857-callbr.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -jump-threading -S | FileCheck %s + +; CHECK-ALL-LABEL: @func( + +define i1 @func(i1 %arg, i32 %arg1, i1 %arg2) { +; CHECK-LABEL: @func( +; CHECK-NEXT: bb: +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[BB3:%.*]], label [[BB4:%.*]] +; CHECK: bb3: +; CHECK-NEXT: [[I:%.*]] = icmp eq i32 [[ARG1:%.*]], 0 +; CHECK-NEXT: br label [[BB7:%.*]] +; CHECK: bb4: +; CHECK-NEXT: callbr void asm sideeffect "", "X"(i8* blockaddress(@func, [[BB7]])) +; CHECK-NEXT: to label [[BB5:%.*]] [label %bb7] +; CHECK: bb5: +; CHECK-NEXT: br label [[BB7]] +; CHECK: bb7: +; CHECK-NEXT: [[I8:%.*]] = phi i1 [ [[I]], [[BB3]] ], [ [[ARG2:%.*]], [[BB5]] ], [ [[ARG2]], [[BB4]] ] +; CHECK-NEXT: [[I9:%.*]] = xor i1 [[I8]], [[ARG]] +; CHECK-NEXT: br i1 [[I9]], label [[BB11:%.*]], label [[BB11]] +; CHECK: bb11: +; CHECK-NEXT: ret i1 [[I9]] +; +bb: + br i1 %arg, label %bb3, label %bb4 + +bb3: + %i = icmp eq i32 %arg1, 0 + br label %bb7 + +bb4: + callbr void asm sideeffect "", "X"(i8* blockaddress(@func, %bb6)) + to label %bb5 [label %bb6] + +bb5: + br label %bb6 + +bb6: + br label %bb7 + +bb7: + %i8 = phi i1 [ %i, %bb3 ], [ %arg2, %bb6 ] + %i9 = xor i1 %i8, %arg + br i1 %i9, label %bb11, label %bb10 + +bb10: + br label %bb11 + +bb11: + ret i1 %i9 +} From 08e9556d5d77fb424b8cb99fe16ffe2bc77f555e Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Mon, 27 Jul 2020 14:41:29 +0200 Subject: [PATCH 0195/1035] llvm_canonicalize_cmake_booleans(CLANGD_ENABLE_REMOTE) Otherwise it got defined as e.g. OFF in Features.inc. 
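The failure mode is easy to see in preprocessor terms (illustrative values only; the actual generated Features.inc contents are not shown in this patch):

```cpp
// Without llvm_canonicalize_cmake_booleans(), configure_file() pastes the
// raw CMake value into the generated header:
#define CLANGD_ENABLE_REMOTE OFF

// In '#if CLANGD_ENABLE_REMOTE' the unknown identifier OFF evaluates to 0,
// and a raw ON would evaluate to 0 just the same, so the feature guard can
// never fire. After canonicalization the macro expands to a literal 0 or 1.
#if CLANGD_ENABLE_REMOTE
// remote-index support would be compiled here
#endif
```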
--- clang-tools-extra/clangd/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/clang-tools-extra/clangd/CMakeLists.txt b/clang-tools-extra/clangd/CMakeLists.txt index 8db6656e5291a..639441e8130ab 100644 --- a/clang-tools-extra/clangd/CMakeLists.txt +++ b/clang-tools-extra/clangd/CMakeLists.txt @@ -15,6 +15,7 @@ if (NOT DEFINED CLANGD_BUILD_XPC) endif () llvm_canonicalize_cmake_booleans(CLANGD_BUILD_XPC) +llvm_canonicalize_cmake_booleans(CLANGD_ENABLE_REMOTE) configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/Features.inc.in From 0f1494be43f0916516533fea9d99e9211bb4c581 Mon Sep 17 00:00:00 2001 From: Tim Northover Date: Mon, 27 Jul 2020 13:29:49 +0100 Subject: [PATCH 0196/1035] AArch64: avoid UB shift of negative value Left shifting a negative value is undefined behaviour, so this just moves the negation afterwards to avoid it. --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 08f80c9aa361b..323ac76e903fd 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -6847,10 +6847,9 @@ Optional AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() || !MI.getOperand(2).isImm()) return None; - Offset = MI.getOperand(2).getImm() * Sign; int Shift = MI.getOperand(3).getImm(); assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12"); - Offset = Offset << Shift; + Offset = Sign * (MI.getOperand(2).getImm() << Shift); } } return RegImmPair{MI.getOperand(1).getReg(), Offset}; From 005eee8b3ef7f244daf12f574a191079bfc1918b Mon Sep 17 00:00:00 2001 From: Mitch Phillips <31459023+hctim@users.noreply.github.com> Date: Mon, 27 Jul 2020 05:46:43 -0700 Subject: [PATCH 0197/1035] [GWP-ASan] Fix uninitialized memory use in sigaction. Fix up a small bug where we used a partially-uninitialized sigaction struct in the optional signal handler. Shouldn't be a user-visible change. --- compiler-rt/lib/gwp_asan/optional/segv_handler_posix.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/gwp_asan/optional/segv_handler_posix.cpp b/compiler-rt/lib/gwp_asan/optional/segv_handler_posix.cpp index 1bd7a606c2136..9a80436efb311 100644 --- a/compiler-rt/lib/gwp_asan/optional/segv_handler_posix.cpp +++ b/compiler-rt/lib/gwp_asan/optional/segv_handler_posix.cpp @@ -144,7 +144,7 @@ void installSignalHandlers(gwp_asan::GuardedPoolAllocator *GPA, Printf_t Printf, PrintBacktraceForSignalHandler = PrintBacktrace; BacktraceForSignalHandler = SegvBacktrace; - struct sigaction Action; + struct sigaction Action = {}; Action.sa_sigaction = sigSegvHandler; Action.sa_flags = SA_SIGINFO; sigaction(SIGSEGV, &Action, &PreviousHandler); From bec77ece14890d2aa40c76eedc6a7a406d84f1fc Mon Sep 17 00:00:00 2001 From: Sergey Dmitriev Date: Mon, 27 Jul 2020 06:02:06 -0700 Subject: [PATCH 0198/1035] [CallGraph] Preserve call records vector when replacing call edge Summary: Try not to resize vector of call records in a call graph node when replacing call edge. That would prevent invalidation of iterators stored in the CG SCC pass manager's scc_iterator. 
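The hazard being avoided is ordinary vector reallocation; a minimal standalone illustration (plain `std::vector`, not the actual CallGraph types):

```cpp
#include <vector>

// An iterator into a vector survives writes through it, but not a change in
// element count that forces reallocation. Updating a call record in place
// therefore keeps the iterators stored by scc_iterator valid, while
// erase-plus-append of records may not.
void updateRecord(std::vector<int> &records,
                  std::vector<int>::iterator live_it) {
  *live_it = 42;        // fine: same storage, live_it stays valid
  records.push_back(7); // may reallocate: live_it is now dangling
}
```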
Reviewers: jdoerfert Reviewed By: jdoerfert Subscribers: hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D84295 --- llvm/lib/Analysis/CallGraph.cpp | 34 ++++++-- llvm/unittests/IR/LegacyPassManagerTest.cpp | 86 +++++++++++++++++++++ 2 files changed, 115 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Analysis/CallGraph.cpp b/llvm/lib/Analysis/CallGraph.cpp index 55adb454b7338..19c128b6633c0 100644 --- a/llvm/lib/Analysis/CallGraph.cpp +++ b/llvm/lib/Analysis/CallGraph.cpp @@ -281,13 +281,37 @@ void CallGraphNode::replaceCallEdge(CallBase &Call, CallBase &NewCall, I->second = NewNode; NewNode->AddRef(); - // Refresh callback references. - forEachCallbackFunction(Call, [=](Function *CB) { - removeOneAbstractEdgeTo(CG->getOrInsertFunction(CB)); + // Refresh callback references. Do not resize CalledFunctions if the + // number of callbacks is the same for new and old call sites. + SmallVector OldCBs; + SmallVector NewCBs; + forEachCallbackFunction(Call, [this, &OldCBs](Function *CB) { + OldCBs.push_back(CG->getOrInsertFunction(CB)); }); - forEachCallbackFunction(NewCall, [=](Function *CB) { - addCalledFunction(nullptr, CG->getOrInsertFunction(CB)); + forEachCallbackFunction(NewCall, [this, &NewCBs](Function *CB) { + NewCBs.push_back(CG->getOrInsertFunction(CB)); }); + if (OldCBs.size() == NewCBs.size()) { + for (unsigned N = 0; N < OldCBs.size(); ++N) { + CallGraphNode *OldNode = OldCBs[N]; + CallGraphNode *NewNode = NewCBs[N]; + for (auto J = CalledFunctions.begin();; ++J) { + assert(J != CalledFunctions.end() && + "Cannot find callsite to update!"); + if (!J->first && J->second == OldNode) { + J->second = NewNode; + OldNode->DropRef(); + NewNode->AddRef(); + break; + } + } + } + } else { + for (auto *CGN : OldCBs) + removeOneAbstractEdgeTo(CGN); + for (auto *CGN : NewCBs) + addCalledFunction(nullptr, CGN); + } return; } } diff --git a/llvm/unittests/IR/LegacyPassManagerTest.cpp b/llvm/unittests/IR/LegacyPassManagerTest.cpp index 72ac4be229974..f461bcc8c7761 100644 --- a/llvm/unittests/IR/LegacyPassManagerTest.cpp +++ b/llvm/unittests/IR/LegacyPassManagerTest.cpp @@ -16,6 +16,8 @@ #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/IR/AbstractCallSite.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" @@ -28,6 +30,7 @@ #include "llvm/IR/OptBisect.h" #include "llvm/InitializePasses.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" #include "gtest/gtest.h" @@ -694,6 +697,89 @@ namespace llvm { ASSERT_EQ(P->NumExtCalledBefore, /* test1, 2a, 2b, 3, 4 */ 5U); ASSERT_EQ(P->NumExtCalledAfter, /* test1, 3repl, 4 */ 3U); } + + // Test for call graph SCC pass that replaces all callback call instructions + // with clones and updates CallGraph by calling CallGraph::replaceCallEdge() + // method. Test is expected to complete successfully after running pass on + // all SCCs in the test module. 
+ struct CallbackCallsModifierPass : public CGPass { + bool runOnSCC(CallGraphSCC &SCC) override { + CGPass::run(); + + CallGraph &CG = const_cast(SCC.getCallGraph()); + + bool Changed = false; + for (CallGraphNode *CGN : SCC) { + Function *F = CGN->getFunction(); + if (!F || F->isDeclaration()) + continue; + + SmallVector Calls; + for (Use &U : F->uses()) { + AbstractCallSite ACS(&U); + if (!ACS || !ACS.isCallbackCall() || !ACS.isCallee(&U)) + continue; + Calls.push_back(cast(ACS.getInstruction())); + } + if (Calls.empty()) + continue; + + for (CallBase *OldCB : Calls) { + CallGraphNode *CallerCGN = CG[OldCB->getParent()->getParent()]; + assert(any_of(*CallerCGN, + [CGN](const CallGraphNode::CallRecord &CallRecord) { + return CallRecord.second == CGN; + }) && + "function is not a callee"); + + CallBase *NewCB = cast(OldCB->clone()); + + NewCB->insertBefore(OldCB); + NewCB->takeName(OldCB); + + CallerCGN->replaceCallEdge(*OldCB, *NewCB, CG[F]); + + OldCB->replaceAllUsesWith(NewCB); + OldCB->eraseFromParent(); + } + Changed = true; + } + return Changed; + } + }; + + TEST(PassManager, CallbackCallsModifier0) { + LLVMContext Context; + + const char *IR = "define void @foo() {\n" + " call void @broker(void (i8*)* @callback0, i8* null)\n" + " call void @broker(void (i8*)* @callback1, i8* null)\n" + " ret void\n" + "}\n" + "\n" + "declare !callback !0 void @broker(void (i8*)*, i8*)\n" + "\n" + "define internal void @callback0(i8* %arg) {\n" + " ret void\n" + "}\n" + "\n" + "define internal void @callback1(i8* %arg) {\n" + " ret void\n" + "}\n" + "\n" + "!0 = !{!1}\n" + "!1 = !{i64 0, i64 1, i1 false}"; + + SMDiagnostic Err; + std::unique_ptr M = parseAssemblyString(IR, Err, Context); + if (!M) + Err.print("LegacyPassManagerTest", errs()); + + CallbackCallsModifierPass *P = new CallbackCallsModifierPass(); + legacy::PassManager Passes; + Passes.add(P); + Passes.run(*M); + } } } From 2dd7a9cc2d0572c3d1e5b9ce554a0800079863c9 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Sat, 25 Jul 2020 14:56:35 +0200 Subject: [PATCH 0199/1035] [MLIR] NFC: Rename mcuMemHostRegister* to mgpuMemHostRegister* to make it consistent with the other cuda-runner functions and ROCm. Summary: Rename mcuMemHostRegister* to mgpuMemHostRegister*. 
Reviewers: herhut Reviewed By: herhut Subscribers: yaxunl, mehdi_amini, rriddle, jpienaar, shauheen, antiagainst, nicolasvasilache, arpith-jacob, mgester, lucyrfox, aartbik, liufengdb, stephenneuendorffer, Joonsoo, grosul1, Kayjukh, jurahul, msifontes Tags: #mlir Differential Revision: https://reviews.llvm.org/D84583 --- mlir/test/mlir-cuda-runner/all-reduce-and.mlir | 6 +++--- mlir/test/mlir-cuda-runner/all-reduce-max.mlir | 6 +++--- mlir/test/mlir-cuda-runner/all-reduce-min.mlir | 6 +++--- mlir/test/mlir-cuda-runner/all-reduce-op.mlir | 4 ++-- mlir/test/mlir-cuda-runner/all-reduce-or.mlir | 6 +++--- mlir/test/mlir-cuda-runner/all-reduce-region.mlir | 4 ++-- mlir/test/mlir-cuda-runner/all-reduce-xor.mlir | 6 +++--- mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir | 4 ++-- mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir | 8 ++++---- mlir/test/mlir-cuda-runner/shuffle.mlir | 4 ++-- mlir/test/mlir-cuda-runner/two-modules.mlir | 4 ++-- mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp | 10 +++++----- 12 files changed, 34 insertions(+), 34 deletions(-) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-and.mlir b/mlir/test/mlir-cuda-runner/all-reduce-and.mlir index d3ad7a802537b..f89f914157248 100644 --- a/mlir/test/mlir-cuda-runner/all-reduce-and.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-and.mlir @@ -25,9 +25,9 @@ func @main() { %c6 = constant 6 : index %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> @@ -58,6 +58,6 @@ func @main() { return } -func @mcuMemHostRegisterInt32(%ptr : memref<*xi32>) +func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>) func @print_memref_i32(memref<*xi32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-max.mlir b/mlir/test/mlir-cuda-runner/all-reduce-max.mlir index ae2f6c3d6b3e5..4adf8a73d924c 100644 --- a/mlir/test/mlir-cuda-runner/all-reduce-max.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-max.mlir @@ -25,9 +25,9 @@ func @main() { %c6 = constant 6 : index %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> @@ -58,6 +58,6 @@ func @main() { return } -func @mcuMemHostRegisterInt32(%ptr : memref<*xi32>) +func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>) func @print_memref_i32(memref<*xi32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-min.mlir b/mlir/test/mlir-cuda-runner/all-reduce-min.mlir index 0cd4f11daf105..8cb3116e9d0dd 100644 --- a/mlir/test/mlir-cuda-runner/all-reduce-min.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-min.mlir @@ -25,9 +25,9 @@ func @main() { %c6 = constant 6 : index %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_data) 
: (memref<*xi32>) -> () %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> @@ -58,6 +58,6 @@ func @main() { return } -func @mcuMemHostRegisterInt32(%ptr : memref<*xi32>) +func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>) func @print_memref_i32(memref<*xi32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir index 67c4f96d36f47..72306674c3ff9 100644 --- a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir @@ -11,7 +11,7 @@ func @main() { %sy = dim %dst, %c1 : memref %sz = dim %dst, %c0 : memref %cast_dst = memref_cast %dst : memref to memref<*xf32> - call @mcuMemHostRegisterFloat(%cast_dst) : (memref<*xf32>) -> () + call @mgpuMemHostRegisterFloat(%cast_dst) : (memref<*xf32>) -> () gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1) threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %sy, %block_z = %sz) { %t0 = muli %tz, %block_y : index @@ -28,5 +28,5 @@ func @main() { return } -func @mcuMemHostRegisterFloat(%ptr : memref<*xf32>) +func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>) func @print_memref_f32(%ptr : memref<*xf32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-or.mlir b/mlir/test/mlir-cuda-runner/all-reduce-or.mlir index cc9eae9e8b660..7d0ed929322e8 100644 --- a/mlir/test/mlir-cuda-runner/all-reduce-or.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-or.mlir @@ -25,9 +25,9 @@ func @main() { %c6 = constant 6 : index %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> @@ -58,6 +58,6 @@ func @main() { return } -func @mcuMemHostRegisterInt32(%ptr : memref<*xi32>) +func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>) func @print_memref_i32(memref<*xi32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir index afd3d7cb038a6..a9426c6589787 100644 --- a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir @@ -8,7 +8,7 @@ func @main() { %c0 = constant 0 : index %sx = dim %dst, %c0 : memref %cast_dst = memref_cast %dst : memref to memref<*xf32> - call @mcuMemHostRegisterFloat(%cast_dst) : (memref<*xf32>) -> () + call @mgpuMemHostRegisterFloat(%cast_dst) : (memref<*xf32>) -> () gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one) threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) { %val = index_cast %tx : index to i32 @@ -25,5 +25,5 @@ func @main() { return } -func @mcuMemHostRegisterFloat(%ptr : memref<*xf32>) +func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>) func @print_memref_f32(memref<*xf32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir b/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir index a32c4d3eb93e5..67461783b2570 100644 --- a/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir +++ 
b/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir @@ -25,9 +25,9 @@ func @main() { %c6 = constant 6 : index %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_data) : (memref<*xi32>) -> () %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_sum) : (memref<*xi32>) -> () store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> @@ -58,6 +58,6 @@ func @main() { return } -func @mcuMemHostRegisterInt32(%ptr : memref<*xi32>) +func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>) func @print_memref_i32(memref<*xi32>) diff --git a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir index 0ef33ea6112a2..80339c36fb384 100644 --- a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir +++ b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir @@ -18,7 +18,7 @@ func @main() { %21 = constant 5 : i32 %22 = memref_cast %arg0 : memref<5xf32> to memref %23 = memref_cast %22 : memref to memref<*xf32> - call @mcuMemHostRegisterFloat(%23) : (memref<*xf32>) -> () + call @mgpuMemHostRegisterFloat(%23) : (memref<*xf32>) -> () call @print_memref_f32(%23) : (memref<*xf32>) -> () %24 = constant 1.0 : f32 call @other_func(%24, %22) : (f32, memref) -> () @@ -26,5 +26,5 @@ func @main() { return } -func @mcuMemHostRegisterFloat(%ptr : memref<*xf32>) +func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>) func @print_memref_f32(%ptr : memref<*xf32>) diff --git a/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir b/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir index a7b143f760a75..b88d8e1b8ba1a 100644 --- a/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir +++ b/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir @@ -26,11 +26,11 @@ func @main() { %c6 = constant 6 : index %cast_data = memref_cast %data : memref<2x6xf32> to memref<*xf32> - call @mcuMemHostRegisterFloat(%cast_data) : (memref<*xf32>) -> () + call @mgpuMemHostRegisterFloat(%cast_data) : (memref<*xf32>) -> () %cast_sum = memref_cast %sum : memref<2xf32> to memref<*xf32> - call @mcuMemHostRegisterFloat(%cast_sum) : (memref<*xf32>) -> () + call @mgpuMemHostRegisterFloat(%cast_sum) : (memref<*xf32>) -> () %cast_mul = memref_cast %mul : memref<2xf32> to memref<*xf32> - call @mcuMemHostRegisterFloat(%cast_mul) : (memref<*xf32>) -> () + call @mgpuMemHostRegisterFloat(%cast_mul) : (memref<*xf32>) -> () store %cst0, %data[%c0, %c0] : memref<2x6xf32> store %cst1, %data[%c0, %c1] : memref<2x6xf32> @@ -66,5 +66,5 @@ func @main() { return } -func @mcuMemHostRegisterFloat(%ptr : memref<*xf32>) +func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>) func @print_memref_f32(memref<*xf32>) diff --git a/mlir/test/mlir-cuda-runner/shuffle.mlir b/mlir/test/mlir-cuda-runner/shuffle.mlir index 0f8cdca3a8eb3..a4563cc0c381d 100644 --- a/mlir/test/mlir-cuda-runner/shuffle.mlir +++ b/mlir/test/mlir-cuda-runner/shuffle.mlir @@ -8,7 +8,7 @@ func @main() { %c0 = constant 0 : index %sx = dim %dst, %c0 : memref %cast_dest = memref_cast %dst : memref to memref<*xf32> - call @mcuMemHostRegisterFloat(%cast_dest) : (memref<*xf32>) -> () + call @mgpuMemHostRegisterFloat(%cast_dest) : (memref<*xf32>) -> () gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one) threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) { %t0 = index_cast %tx : index to 
i32 @@ -28,5 +28,5 @@ func @main() { return } -func @mcuMemHostRegisterFloat(%ptr : memref<*xf32>) +func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>) func @print_memref_f32(%ptr : memref<*xf32>) diff --git a/mlir/test/mlir-cuda-runner/two-modules.mlir b/mlir/test/mlir-cuda-runner/two-modules.mlir index 3229879d2fb50..ef4dd0c48b8df 100644 --- a/mlir/test/mlir-cuda-runner/two-modules.mlir +++ b/mlir/test/mlir-cuda-runner/two-modules.mlir @@ -8,7 +8,7 @@ func @main() { %c0 = constant 0 : index %sx = dim %dst, %c0 : memref %cast_dst = memref_cast %dst : memref to memref<*xi32> - call @mcuMemHostRegisterInt32(%cast_dst) : (memref<*xi32>) -> () + call @mgpuMemHostRegisterInt32(%cast_dst) : (memref<*xi32>) -> () gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one) threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) { %t0 = index_cast %tx : index to i32 @@ -25,5 +25,5 @@ func @main() { return } -func @mcuMemHostRegisterInt32(%memref : memref<*xi32>) +func @mgpuMemHostRegisterInt32(%memref : memref<*xi32>) func @print_memref_i32(%memref : memref<*xi32>) diff --git a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp index 705fa9f00930a..2b71eb34703bd 100644 --- a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp +++ b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp @@ -83,7 +83,7 @@ extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) { // Allows to register a MemRef with the CUDA runtime. Initializes array with // value. Helpful until we have transfer functions implemented. template -void mcuMemHostRegisterMemRef(const DynamicMemRefType &mem_ref, T value) { +void mgpuMemHostRegisterMemRef(const DynamicMemRefType &mem_ref, T value) { llvm::SmallVector denseStrides(mem_ref.rank); llvm::ArrayRef sizes(mem_ref.sizes, mem_ref.rank); llvm::ArrayRef strides(mem_ref.strides, mem_ref.rank); @@ -103,12 +103,12 @@ void mcuMemHostRegisterMemRef(const DynamicMemRefType &mem_ref, T value) { mgpuMemHostRegister(pointer, count * sizeof(T)); } -extern "C" void mcuMemHostRegisterFloat(int64_t rank, void *ptr) { +extern "C" void mgpuMemHostRegisterFloat(int64_t rank, void *ptr) { UnrankedMemRefType mem_ref = {rank, ptr}; - mcuMemHostRegisterMemRef(DynamicMemRefType(mem_ref), 1.23f); + mgpuMemHostRegisterMemRef(DynamicMemRefType(mem_ref), 1.23f); } -extern "C" void mcuMemHostRegisterInt32(int64_t rank, void *ptr) { +extern "C" void mgpuMemHostRegisterInt32(int64_t rank, void *ptr) { UnrankedMemRefType mem_ref = {rank, ptr}; - mcuMemHostRegisterMemRef(DynamicMemRefType(mem_ref), 123); + mgpuMemHostRegisterMemRef(DynamicMemRefType(mem_ref), 123); } From 5b5b3ce0ad6d308471ed09c644eee4b5c337cd1b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 25 Jul 2020 15:52:05 +0100 Subject: [PATCH 0200/1035] IRPrintingPasses.h - simplify unnecessary header with forward declarations. NFC. 
Remove duplicate PassManager.h include in IRPrintingPasses.cpp --- llvm/include/llvm/IR/IRPrintingPasses.h | 3 ++- llvm/lib/IR/IRPrintingPasses.cpp | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/IR/IRPrintingPasses.h b/llvm/include/llvm/IR/IRPrintingPasses.h index 3a1c489ee09f2..ed7082a31a423 100644 --- a/llvm/include/llvm/IR/IRPrintingPasses.h +++ b/llvm/include/llvm/IR/IRPrintingPasses.h @@ -18,11 +18,12 @@ #ifndef LLVM_IR_IRPRINTINGPASSES_H #define LLVM_IR_IRPRINTINGPASSES_H -#include "llvm/ADT/StringRef.h" #include "llvm/IR/PassManager.h" #include namespace llvm { +class raw_ostream; +class StringRef; /// Create and return a pass that writes the module to the specified /// \c raw_ostream. diff --git a/llvm/lib/IR/IRPrintingPasses.cpp b/llvm/lib/IR/IRPrintingPasses.cpp index 03657ff8d9d43..7c73d2ab98711 100644 --- a/llvm/lib/IR/IRPrintingPasses.cpp +++ b/llvm/lib/IR/IRPrintingPasses.cpp @@ -11,13 +11,14 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/IRPrintingPasses.h" +#include "llvm/ADT/StringRef.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" -#include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" + using namespace llvm; PrintModulePass::PrintModulePass() : OS(dbgs()) {} From f720c9c68c70003ba56d17fff1549f30f509778f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 27 Jul 2020 13:14:13 +0100 Subject: [PATCH 0201/1035] [X86] combineExtractSubvector - pull out repeated getSizeInBits() calls. NFCI. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 7c134a8c7cb92..a807c5117631b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48278,12 +48278,14 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, unsigned IdxVal = N->getConstantOperandVal(1); SDValue InVecBC = peekThroughBitcasts(InVec); EVT InVecVT = InVec.getValueType(); + unsigned SizeInBits = VT.getSizeInBits(); + unsigned InSizeInBits = InVecVT.getSizeInBits(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) && - InVecVT.getSizeInBits() == 256 && InVecBC.getOpcode() == ISD::AND) { - auto isConcatenatedNot = [] (SDValue V) { + InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) { + auto isConcatenatedNot = [](SDValue V) { V = peekThroughBitcasts(V); if (!isBitwiseNot(V)) return false; @@ -48326,7 +48328,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 && InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) && ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) && - InVec.getOperand(1).getValueSizeInBits() <= VT.getSizeInBits()) { + InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) { SDLoc DL(N); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, getZeroVector(VT, Subtarget, DAG, DL), @@ -48337,14 +48339,14 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, // broadcasting to the smaller type directly, assuming this is the only use. // As its a broadcast we don't care about the extraction index. 
if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() && - InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits()) + InVec.getOperand(0).getValueSizeInBits() <= SizeInBits) return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0)); if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) { auto *MemIntr = cast(InVec); - if (MemIntr->getMemoryVT().getSizeInBits() <= VT.getSizeInBits()) { + if (MemIntr->getMemoryVT().getSizeInBits() <= SizeInBits) { SDVTList Tys = DAG.getVTList(VT, MVT::Other); - SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() }; + SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()}; SDValue BcastLd = DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops, MemIntr->getMemoryVT(), @@ -48359,7 +48361,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, // SimplifyDemandedVectorElts do more simplifications. if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST || InVec.getOpcode() == X86ISD::VBROADCAST_LOAD)) - return extractSubVector(InVec, 0, DAG, SDLoc(N), VT.getSizeInBits()); + return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits); // If we're extracting a broadcasted subvector, just use the source. if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST && @@ -48367,12 +48369,12 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, return InVec.getOperand(0); // Attempt to extract from the source of a shuffle vector. - if ((InVecVT.getSizeInBits() % VT.getSizeInBits()) == 0 && + if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % VT.getVectorNumElements()) == 0) { SmallVector ShuffleMask; SmallVector ScaledMask; SmallVector ShuffleInputs; - unsigned NumSubVecs = InVecVT.getSizeInBits() / VT.getSizeInBits(); + unsigned NumSubVecs = InSizeInBits / SizeInBits; // Decode the shuffle mask and scale it so its shuffling subvectors. if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) && scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) { @@ -48382,11 +48384,11 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, if (ScaledMask[SubVecIdx] == SM_SentinelZero) return getZeroVector(VT, Subtarget, DAG, SDLoc(N)); SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs]; - if (Src.getValueSizeInBits() == InVecVT.getSizeInBits()) { + if (Src.getValueSizeInBits() == InSizeInBits) { unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs; unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements(); return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG, - SDLoc(N), VT.getSizeInBits()); + SDLoc(N), SizeInBits); } } } From ab4ffa52f0a62447c2e1be872adb0aa3e357f071 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 27 Jul 2020 14:32:32 +0100 Subject: [PATCH 0202/1035] [X86][AVX] Fold extract_subvector(truncate(x),0) -> truncate(extract_subvector(x),0) This is currently only supported for VLX targets where the op should be legal. 
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 8 ++++++++ llvm/test/CodeGen/X86/vector-reduce-mul.ll | 20 ++++++++++---------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a807c5117631b..6abe6c6b83156 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48435,6 +48435,14 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128); return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2); } + if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() && + (VT.is128BitVector() || VT.is256BitVector())) { + SDLoc DL(N); + SDValue InVecSrc = InVec.getOperand(0); + unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits; + SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits); + return DAG.getNode(InOpcode, DL, VT, Ext); + } } return SDValue(); diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll index f7e1a72f9a91f..6e9bfbe5a79fd 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -2008,24 +2008,24 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 ; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax ; AVX512BWVL-NEXT: vzeroupper From 1b4d24912a1f8730475d4b01e7da89dfae90ae9c Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Mon, 27 Jul 2020 13:54:46 +0000 Subject: [PATCH 0203/1035] [NFC] Replace ".size() < 1" with ".empty()" --- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 908e70b2789da..f0377df8648a4 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -1716,7 +1716,7 @@ Error BitcodeReader::parseTypeTableBody() { case bitc::TYPE_CODE_NUMENTRY: // TYPE_CODE_NUMENTRY: [numentries] // TYPE_CODE_NUMENTRY contains a count of the number of types in the // type list. This allows us to reserve space. 
- if (Record.size() < 1) + if (Record.empty()) return error("Invalid record"); TypeList.resize(Record[0]); continue; @@ -1757,7 +1757,7 @@ Error BitcodeReader::parseTypeTableBody() { ResultTy = Type::getTokenTy(Context); break; case bitc::TYPE_CODE_INTEGER: { // INTEGER: [width] - if (Record.size() < 1) + if (Record.empty()) return error("Invalid record"); uint64_t NumBits = Record[0]; @@ -1769,7 +1769,7 @@ Error BitcodeReader::parseTypeTableBody() { } case bitc::TYPE_CODE_POINTER: { // POINTER: [pointee type] or // [pointee type, address space] - if (Record.size() < 1) + if (Record.empty()) return error("Invalid record"); unsigned AddressSpace = 0; if (Record.size() == 2) @@ -1824,7 +1824,7 @@ Error BitcodeReader::parseTypeTableBody() { break; } case bitc::TYPE_CODE_STRUCT_ANON: { // STRUCT: [ispacked, eltty x N] - if (Record.size() < 1) + if (Record.empty()) return error("Invalid record"); SmallVector EltTys; for (unsigned i = 1, e = Record.size(); i != e; ++i) { @@ -1844,7 +1844,7 @@ Error BitcodeReader::parseTypeTableBody() { continue; case bitc::TYPE_CODE_STRUCT_NAMED: { // STRUCT: [ispacked, eltty x N] - if (Record.size() < 1) + if (Record.empty()) return error("Invalid record"); if (NumRecords >= TypeList.size()) @@ -3716,7 +3716,7 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit, break; /// MODULE_CODE_VSTOFFSET: [offset] case bitc::MODULE_CODE_VSTOFFSET: - if (Record.size() < 1) + if (Record.empty()) return error("Invalid record"); // Note that we subtract 1 here because the offset is relative to one word // before the start of the identification or module block, which was @@ -3870,7 +3870,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { default: // Default behavior: reject return error("Invalid value"); case bitc::FUNC_CODE_DECLAREBLOCKS: { // DECLAREBLOCKS: [nblocks] - if (Record.size() < 1 || Record[0] == 0) + if (Record.empty() || Record[0] == 0) return error("Invalid record"); // Create all the basic blocks for the function. FunctionBBs.resize(Record[0]); @@ -4712,7 +4712,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { InstructionList.push_back(I); break; case bitc::FUNC_CODE_INST_PHI: { // PHI: [ty, val0,bb0, ...] - if (Record.size() < 1) + if (Record.empty()) return error("Invalid record"); // The first record specifies the type. FullTy = getFullyStructuredTypeByID(Record[0]); @@ -5205,7 +5205,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // number of operand bundle blocks. These blocks are read into // OperandBundles and consumed at the next call or invoke instruction. - if (Record.size() < 1 || Record[0] >= BundleTags.size()) + if (Record.empty() || Record[0] >= BundleTags.size()) return error("Invalid record"); std::vector Inputs; @@ -5738,7 +5738,7 @@ Error ModuleSummaryIndexBitcodeReader::parseModule() { } /// MODULE_CODE_VSTOFFSET: [offset] case bitc::MODULE_CODE_VSTOFFSET: - if (Record.size() < 1) + if (Record.empty()) return error("Invalid record"); // Note that we subtract 1 here because the offset is relative to one // word before the start of the identification or module block, which From 92fa91bb402921a5705507c38f583e9b8e9d84e4 Mon Sep 17 00:00:00 2001 From: Anastasia Stulova Date: Mon, 27 Jul 2020 13:27:21 +0100 Subject: [PATCH 0204/1035] [OpenCL] Fixed missing address space for templated copy constructor. Added missing address space for the parameter of copy ctor created for templated constructor with an R-value reference. Patch by Ole Strohm (olestrohm)! 
Tags: #clang Differential Revision: https://reviews.llvm.org/D83665 --- clang/lib/Sema/SemaTemplateDeduction.cpp | 5 ++++- .../SemaOpenCLCXX/address-space-templates.cl | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 52062e9a5039c..8e7b4e1655ea9 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -3815,8 +3815,11 @@ static bool AdjustFunctionParmAndArgTypesForDeduction( // If P is a forwarding reference and the argument is an lvalue, the type // "lvalue reference to A" is used in place of A for type deduction. if (isForwardingReference(QualType(ParamRefType, 0), FirstInnerIndex) && - Arg->isLValue()) + Arg->isLValue()) { + if (S.getLangOpts().OpenCL) + ArgType = S.Context.getAddrSpaceQualType(ArgType, LangAS::opencl_generic); ArgType = S.Context.getLValueReferenceType(ArgType); + } } else { // C++ [temp.deduct.call]p2: // If P is not a reference type: diff --git a/clang/test/SemaOpenCLCXX/address-space-templates.cl b/clang/test/SemaOpenCLCXX/address-space-templates.cl index 6b304d2fdda45..be187de5684b0 100644 --- a/clang/test/SemaOpenCLCXX/address-space-templates.cl +++ b/clang/test/SemaOpenCLCXX/address-space-templates.cl @@ -22,10 +22,28 @@ void foo3() { __private T ii; // expected-error{{conflicting address space qualifiers are provided between types '__private T' and '__global int'}} } +template struct remove_reference { typedef _Tp type; }; +template struct remove_reference<_Tp &> { typedef _Tp type; }; +template struct as_pointer { + typedef typename remove_reference<_Tp>::type* type; +}; + +struct rep { + // CHECK |-CXXConstructorDecl {{.*}} rep 'void (const __generic rep &__private) __generic' + template::type> + rep(U&& v) {} +}; + +struct rep_outer : private rep { + rep_outer() + : rep(0) {} +}; + void bar() { S sintgl; // expected-note{{in instantiation of template class 'S' requested here}} foo1<__local int>(1); // expected-error{{no matching function for call to 'foo1'}} foo2<__global int>(0); foo3<__global int>(); // expected-note{{in instantiation of function template specialization 'foo3<__global int>' requested here}} + rep_outer r; } From a7044edde71be0a4c70e79c503c39ce6bdd930e5 Mon Sep 17 00:00:00 2001 From: Luofan Chen Date: Mon, 27 Jul 2020 22:28:39 +0800 Subject: [PATCH 0205/1035] [Attributor] Fix qualifier warning of the unittest Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D84532 --- llvm/unittests/Transforms/IPO/AttributorTest.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/unittests/Transforms/IPO/AttributorTest.cpp b/llvm/unittests/Transforms/IPO/AttributorTest.cpp index 8c1682c01f307..7d496ed0d14fb 100644 --- a/llvm/unittests/Transforms/IPO/AttributorTest.cpp +++ b/llvm/unittests/Transforms/IPO/AttributorTest.cpp @@ -44,8 +44,8 @@ TEST_F(AttributorTestBase, TestCast) { Function *F = M.getFunction("foo"); - AbstractAttribute *AA = (AbstractAttribute *)&( - A.getOrCreateAAFor(IRPosition::function(*F))); + const AbstractAttribute *AA = + &A.getOrCreateAAFor(IRPosition::function(*F)); EXPECT_TRUE(AA); From 343ffa70fc4c55f4dc0d717cf8c168865beaa9c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 25 Jul 2020 12:25:19 +0300 Subject: [PATCH 0206/1035] [LLD] [COFF] Fix mingw comdat associativity for leader symbols with a different name For a weak symbol func in a comdat, the actual leader symbol ends up named 
like .weak.func.default*. Likewise, for stdcall on i386, the symbol may be
named _func@4, while the section suffix is only "func", which the previous
implementation didn't handle.

This fixes unwinding through weak functions when using -ffunction-sections
in mingw environments.

Differential Revision: https://reviews.llvm.org/D84607
---
 lld/COFF/InputFiles.cpp                       |  8 +--
 lld/test/COFF/associative-comdat-mingw-i386.s | 21 ++++++-
 lld/test/COFF/associative-comdat-mingw-weak.s | 63 +++++++++++++++++++
 3 files changed, 87 insertions(+), 5 deletions(-)
 create mode 100644 lld/test/COFF/associative-comdat-mingw-weak.s

diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp
index 0adc2b91bd999..4346b3a2ffa7b 100644
--- a/lld/COFF/InputFiles.cpp
+++ b/lld/COFF/InputFiles.cpp
@@ -348,13 +348,13 @@ void ObjFile::recordPrevailingSymbolForMingw(
   // of the section chunk we actually include instead of discarding it,
   // add the symbol to a map to allow using it for implicitly
   // associating .[px]data$ sections to it.
+  // Use the suffix from the .text$ instead of the leader symbol
+  // name, for cases where the names differ (i386 mangling/decorations,
+  // cases where the leader is a weak symbol named .weak.func.default*).
   int32_t sectionNumber = sym.getSectionNumber();
   SectionChunk *sc = sparseChunks[sectionNumber];
   if (sc && sc->getOutputCharacteristics() & IMAGE_SCN_MEM_EXECUTE) {
-    StringRef name;
-    name = check(coffObj->getSymbolName(sym));
-    if (getMachineType() == I386)
-      name.consume_front("_");
+    StringRef name = sc->getSectionName().split('$').second;
     prevailingSectionMap[name] = sectionNumber;
   }
 }
diff --git a/lld/test/COFF/associative-comdat-mingw-i386.s b/lld/test/COFF/associative-comdat-mingw-i386.s
index 8d89478d4eb03..3ba8c1cd9a75b 100644
--- a/lld/test/COFF/associative-comdat-mingw-i386.s
+++ b/lld/test/COFF/associative-comdat-mingw-i386.s
@@ -1,10 +1,14 @@
 # REQUIRES: x86
 
-# RUN: llvm-mc -triple=i686-windows-gnu %s -filetype=obj -o %t.obj
+# RUN: llvm-mc -triple=i686-windows-gnu %s -defsym stdcall=0 -filetype=obj -o %t.obj
 # RUN: lld-link -lldmingw -entry:main %t.obj -out:%t.exe
 # RUN: llvm-objdump -s %t.exe | FileCheck %s
 
+# RUN: llvm-mc -triple=i686-windows-gnu %s -defsym stdcall=1 -filetype=obj -o %t.stdcall.obj
+# RUN: lld-link -lldmingw -entry:main %t.stdcall.obj -out:%t.stdcall.exe
+# RUN: llvm-objdump -s %t.stdcall.exe | FileCheck %s
+
 # Check that the .eh_frame comdat was included, even if it had no symbols,
 # due to associativity with the symbol _foo.
@@ -19,19 +23,34 @@ .globl _main .p2align 4, 0x90 _main: +.if stdcall==0 call _foo +.else + call _foo@0 +.endif ret .section .eh_frame$foo,"dr" .linkonce discard .byte 0x42 +.if stdcall==0 .def _foo; +.else + .def _foo@0; +.endif .scl 2; .type 32; .endef +.if stdcall==0 .section .text$foo,"xr",discard,_foo .globl _foo .p2align 4 _foo: +.else + .section .text$foo,"xr",discard,_foo@0 + .globl _foo@0 + .p2align 4 +_foo@0: +.endif ret diff --git a/lld/test/COFF/associative-comdat-mingw-weak.s b/lld/test/COFF/associative-comdat-mingw-weak.s new file mode 100644 index 0000000000000..80c738b436be4 --- /dev/null +++ b/lld/test/COFF/associative-comdat-mingw-weak.s @@ -0,0 +1,63 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -triple=x86_64-windows-gnu %s -filetype=obj -o %t.obj +# RUN: llvm-readobj --symbols %t.obj | FileCheck %s --check-prefix=SYMBOL + +# RUN: lld-link -lldmingw -entry:main %t.obj -out:%t.exe -lldmap:%t.map -verbose +# RUN: llvm-readobj --sections %t.exe | FileCheck %s + +# CHECK: Sections [ +# CHECK: Section { +# CHECK: Number: 2 +# CHECK-LABEL: Name: .rdata (2E 72 64 61 74 61 00 00) +# This is the critical check to show that .xdata$foo was +# retained, while .xdata$bar wasn't. This *must* be 0x24 +# (0x4 for the .xdata section and 0x20 for the +# .ctors/.dtors headers/ends). +# CHECK-NEXT: VirtualSize: 0x24 + +# Check that the weak symbols still are emitted as it was when the test was +# written, to make sure the test still actually tests what was intended. + +# SYMBOL: Symbol { +# SYMBOL: Name: foo +# SYMBOL-NEXT: Value: 0 +# SYMBOL-NEXT: Section: IMAGE_SYM_UNDEFINED (0) +# SYMBOL-NEXT: BaseType: Null (0x0) +# SYMBOL-NEXT: ComplexType: Null (0x0) +# SYMBOL-NEXT: StorageClass: WeakExternal (0x69) +# SYMBOL-NEXT: AuxSymbolCount: 1 +# SYMBOL-NEXT: AuxWeakExternal { +# SYMBOL-NEXT: Linked: .weak.foo.default.main (19) +# SYMBOL-NEXT: Search: Alias (0x3) +# SYMBOL-NEXT: } +# SYMBOL-NEXT: } + + .text + .globl main +main: + call foo + retq + +# See associative-comdat-mingw.s for the general setup. Here, the leader +# symbols are weak, which causes the functions foo and bar to be undefined +# weak externals, while the actual leader symbols are named like +# .weak.foo.default.main. + + .section .xdata$foo,"dr" + .linkonce discard + .long 42 + + .section .xdata$bar,"dr" + .linkonce discard + .long 43 + + .section .text$foo,"xr",discard,foo + .weak foo +foo: + ret + + .section .text$bar,"xr",discard,bar + .weak bar +bar: + ret From 7c182663a857fc87552fa2861c7f94046d55845e Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Mon, 27 Jul 2020 09:36:38 -0500 Subject: [PATCH 0207/1035] Revert "Re-apply:" Emit DW_OP_implicit_value for Floating point constants"" This patch reverts commit `59a76d957a26` as it has caused failure on the big endian PowerPC buildbots (as well as the SystemZ buildbots). 
--- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 21 +----- .../CodeGen/AsmPrinter/DwarfExpression.cpp | 32 --------- llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h | 3 - .../test/DebugInfo/X86/float_const_loclist.ll | 4 +- .../DebugInfo/X86/implicit_value-double.ll | 66 ----------------- .../DebugInfo/X86/implicit_value-float.ll | 65 ----------------- llvm/test/DebugInfo/X86/implicit_value-ld.ll | 71 ------------------- 7 files changed, 6 insertions(+), 256 deletions(-) delete mode 100644 llvm/test/DebugInfo/X86/implicit_value-double.ll delete mode 100644 llvm/test/DebugInfo/X86/implicit_value-float.ll delete mode 100644 llvm/test/DebugInfo/X86/implicit_value-ld.ll diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 5a4c4dfb90a2a..1169adaaf470c 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -2488,7 +2488,6 @@ void DwarfDebug::emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT, DwarfExpr.addSignedConstant(Value.getInt()); else DwarfExpr.addUnsignedConstant(Value.getInt()); - DwarfExpr.addExpression(std::move(ExprCursor)); } else if (Value.isLocation()) { MachineLocation Location = Value.getLoc(); DwarfExpr.setLocation(Location, DIExpr); @@ -2509,24 +2508,10 @@ void DwarfDebug::emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT, DwarfExpr.addExpression(std::move(ExprCursor)); return; } else if (Value.isConstantFP()) { - if (AP.getDwarfVersion() >= 4 && AP.getDwarfDebug()->tuneForGDB()) { - DwarfExpr.addConstantFP(Value.getConstantFP()->getValueAPF()); - return; - } else if (Value.getConstantFP() - ->getValueAPF() - .bitcastToAPInt() - .getBitWidth() <= 64 /*bits*/) { - DwarfExpr.addUnsignedConstant( - Value.getConstantFP()->getValueAPF().bitcastToAPInt()); - DwarfExpr.addExpression(std::move(ExprCursor)); - return; - } - LLVM_DEBUG( - dbgs() - << "Skipped DwarfExpression creation for ConstantFP of size: " - << Value.getConstantFP()->getValueAPF().bitcastToAPInt().getBitWidth() - << " bits\n"); + APInt RawBytes = Value.getConstantFP()->getValueAPF().bitcastToAPInt(); + DwarfExpr.addUnsignedConstant(RawBytes); } + DwarfExpr.addExpression(std::move(ExprCursor)); } void DebugLocEntry::finalize(const AsmPrinter &AP, diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index 7e6f4a0425d87..d4762121d1050 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -25,8 +25,6 @@ using namespace llvm; -#define DEBUG_TYPE "dwarfdebug" - void DwarfExpression::emitConstu(uint64_t Value) { if (Value < 32) emitOp(dwarf::DW_OP_lit0 + Value); @@ -221,36 +219,6 @@ void DwarfExpression::addUnsignedConstant(const APInt &Value) { } } -void DwarfExpression::addConstantFP(const APFloat &Value) { - assert(isImplicitLocation() || isUnknownLocation()); - APInt RawBytes = Value.bitcastToAPInt(); - int NumBytes = RawBytes.getBitWidth() / 8; - const char *Data = (const char *)RawBytes.getRawData(); - emitOp(dwarf::DW_OP_implicit_value); - if (NumBytes == 4 /*float*/ || NumBytes == 8 /*double*/) { - emitUnsigned(NumBytes /*Size of the block in bytes*/); - for (int i = 0; i < NumBytes; ++i) - emitData1(Data[i]); - return; - } - if (NumBytes == 10 /*long double*/) { - // long double IEEE representation uses 80 bits(10 bytes). - // 6 bytes are padded to make it 128 bits(16 bytes) due to - // addressing restrictions. 
- emitUnsigned(16 /*Size of the block in bytes*/); - // Emit the block of bytes. - for (int i = 0; i < NumBytes; ++i) - emitData1(Data[i]); - // Emit the rest as padding bytes. - for (int i = 0; i < 16 - NumBytes; ++i) - emitData1(0); - return; - } - LLVM_DEBUG( - dbgs() << "Skipped DW_OP_implicit_value creation for ConstantFP of size: " - << RawBytes.getBitWidth() << " bits\n"); -} - bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI, DIExpressionCursor &ExprCursor, unsigned MachineReg, diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h index 097208f1cfade..757b175114535 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h @@ -299,9 +299,6 @@ class DwarfExpression { /// Emit an unsigned constant. void addUnsignedConstant(const APInt &Value); - /// Emit floating point constant. - void addConstantFP(const APFloat &Value); - /// Lock this down to become a memory location description. void setMemoryLocationKind() { assert(isUnknownLocation()); diff --git a/llvm/test/DebugInfo/X86/float_const_loclist.ll b/llvm/test/DebugInfo/X86/float_const_loclist.ll index 24ee16444836a..f9008209e5dfb 100644 --- a/llvm/test/DebugInfo/X86/float_const_loclist.ll +++ b/llvm/test/DebugInfo/X86/float_const_loclist.ll @@ -20,10 +20,12 @@ ; ; CHECK: .debug_info contents: ; CHECK: DW_TAG_variable +; CHECK-NEXT: DW_AT_location {{.*}} ( +; CHECK-NEXT: [0x[[START:.*]], 0x[[END:.*]]): DW_OP_constu 0xc8f5c28f5c28f800, DW_OP_piece 0x8, DW_OP_constu 0x4000, DW_OP_bit_piece 0x10 0x40) ; CHECK-NEXT: DW_AT_name {{.*}}"ld" ; CHECK: DW_TAG_variable ; CHECK-NEXT: DW_AT_location {{.*}} ( -; CHECK-NEXT: [0x{{.*}}, 0x{{.*}}): DW_OP_constu 0x4048f5c3) +; CHECK-NEXT: [0x[[START]], 0x[[END]]): DW_OP_constu 0x4048f5c3) ; CHECK-NEXT: DW_AT_name {{.*}}"f" source_filename = "test.c" diff --git a/llvm/test/DebugInfo/X86/implicit_value-double.ll b/llvm/test/DebugInfo/X86/implicit_value-double.ll deleted file mode 100644 index 3c14c7dfefce4..0000000000000 --- a/llvm/test/DebugInfo/X86/implicit_value-double.ll +++ /dev/null @@ -1,66 +0,0 @@ -;; This test checks for emission of DW_OP_implicit_value operation -;; for double type. 
- -; RUN: llc -debugger-tune=gdb -filetype=obj %s -o - | llvm-dwarfdump - | FileCheck %s - -; CHECK: .debug_info contents: -; CHECK: DW_TAG_variable -; CHECK-NEXT: DW_AT_location ({{.*}} -; CHECK-NEXT: [{{.*}}): DW_OP_implicit_value 0x8 0x1f 0x85 0xeb 0x51 0xb8 0x1e 0x09 0x40) -; CHECK-NEXT: DW_AT_name ("d") - -;; Generated from: clang -ggdb -O1 -;;int main() { -;; double d = 3.14; -;; printf("dummy\n"); -;; d *= d; -;; return 0; -;;} - -; ModuleID = 'implicit_value-double.c' -source_filename = "implicit_value-double.c" -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@str = private unnamed_addr constant [6 x i8] c"dummy\00", align 1 - -; Function Attrs: nofree nounwind uwtable -define dso_local i32 @main() local_unnamed_addr #0 !dbg !7 { -entry: - call void @llvm.dbg.value(metadata double 3.140000e+00, metadata !12, metadata !DIExpression()), !dbg !14 - %puts = call i32 @puts(i8* nonnull dereferenceable(1) getelementptr inbounds ([6 x i8], [6 x i8]* @str, i64 0, i64 0)), !dbg !15 - call void @llvm.dbg.value(metadata double undef, metadata !12, metadata !DIExpression()), !dbg !14 - ret i32 0, !dbg !16 -} - -; Function Attrs: nounwind readnone speculatable willreturn -declare void @llvm.dbg.value(metadata, metadata, metadata) #1 - -; Function Attrs: nofree nounwind -declare i32 @puts(i8* nocapture readonly) local_unnamed_addr #2 - -attributes #0 = { nofree nounwind uwtable } -attributes #1 = { nounwind readnone speculatable willreturn } -attributes #2 = { nofree nounwind } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) -!1 = !DIFile(filename: "implicit_value-double.c", directory: "/home/") -!2 = !{} -!3 = !{i32 7, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 11.0.0"} -!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11) -!8 = !DISubroutineType(types: !9) -!9 = !{!10} -!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!11 = !{!12} -!12 = !DILocalVariable(name: "d", scope: !7, file: !1, line: 2, type: !13) -!13 = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float) -!14 = !DILocation(line: 0, scope: !7) -!15 = !DILocation(line: 3, column: 2, scope: !7) -!16 = !DILocation(line: 5, column: 2, scope: !7) diff --git a/llvm/test/DebugInfo/X86/implicit_value-float.ll b/llvm/test/DebugInfo/X86/implicit_value-float.ll deleted file mode 100644 index 8c51b49481773..0000000000000 --- a/llvm/test/DebugInfo/X86/implicit_value-float.ll +++ /dev/null @@ -1,65 +0,0 @@ -;; This test checks for emission of DW_OP_implicit_value operation -;; for float type. 
- -; RUN: llc -debugger-tune=gdb -filetype=obj %s -o - | llvm-dwarfdump - | FileCheck %s - -; CHECK: .debug_info contents: -; CHECK: DW_TAG_variable -; CHECK-NEXT: DW_AT_location ({{.*}} -; CHECK-NEXT: [{{.*}}): DW_OP_implicit_value 0x4 0xc3 0xf5 0x48 0x40) -; CHECK-NEXT: DW_AT_name ("f") - -;; Generated from: clang -ggdb -O1 -;;int main() { -;; float f = 3.14f; -;; printf("dummy\n"); -;; f *= f; -;; return 0; -;;} -; ModuleID = 'implicit_value-float.c' -source_filename = "implicit_value-float.c" -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@str = private unnamed_addr constant [6 x i8] c"dummy\00", align 1 - -; Function Attrs: nofree nounwind uwtable -define dso_local i32 @main() local_unnamed_addr #0 !dbg !7 { -entry: - call void @llvm.dbg.value(metadata float 0x40091EB860000000, metadata !12, metadata !DIExpression()), !dbg !14 - %puts = call i32 @puts(i8* nonnull dereferenceable(1) getelementptr inbounds ([6 x i8], [6 x i8]* @str, i64 0, i64 0)), !dbg !15 - call void @llvm.dbg.value(metadata float undef, metadata !12, metadata !DIExpression()), !dbg !14 - ret i32 0, !dbg !16 -} - -; Function Attrs: nounwind readnone speculatable willreturn -declare void @llvm.dbg.value(metadata, metadata, metadata) #1 - -; Function Attrs: nofree nounwind -declare i32 @puts(i8* nocapture readonly) local_unnamed_addr #2 - -attributes #0 = { nofree nounwind uwtable } -attributes #1 = { nounwind readnone speculatable willreturn } -attributes #2 = { nofree nounwind } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) -!1 = !DIFile(filename: "implicit_value-float.c", directory: "/home/") -!2 = !{} -!3 = !{i32 7, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 11.0.0"} -!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11) -!8 = !DISubroutineType(types: !9) -!9 = !{!10} -!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!11 = !{!12} -!12 = !DILocalVariable(name: "f", scope: !7, file: !1, line: 2, type: !13) -!13 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float) -!14 = !DILocation(line: 0, scope: !7) -!15 = !DILocation(line: 3, column: 2, scope: !7) -!16 = !DILocation(line: 5, column: 2, scope: !7) diff --git a/llvm/test/DebugInfo/X86/implicit_value-ld.ll b/llvm/test/DebugInfo/X86/implicit_value-ld.ll deleted file mode 100644 index 5fdaf396c5347..0000000000000 --- a/llvm/test/DebugInfo/X86/implicit_value-ld.ll +++ /dev/null @@ -1,71 +0,0 @@ -;; This test checks for emission of DW_OP_implicit_value operation -;; for long double type. 
- -; RUN: llc -debugger-tune=gdb -filetype=obj %s -o - | llvm-dwarfdump - | FileCheck %s - -; CHECK: .debug_info contents: -; CHECK: DW_TAG_variable -; CHECK-NEXT: DW_AT_location ({{.*}} -; CHECK-NEXT: [{{.*}}): DW_OP_implicit_value 0x10 0x00 0xf8 0x28 0x5c 0x8f 0xc2 0xf5 0xc8 0x00 0x40 0x00 0x00 0x00 0x00 0x00 0x00) -; CHECK-NEXT: DW_AT_name ("ld") - -;; Generated from: clang -ggdb -O1 -;;int main() { -;; long double ld = 3.14; -;; printf("dummy\n"); -;; ld *= ld; -;; return 0; -;;} - -; ModuleID = 'implicit_value-ld.c' -source_filename = "implicit_value-ld.c" -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@str = private unnamed_addr constant [6 x i8] c"dummy\00", align 1 - -; Function Attrs: nofree nounwind uwtable -define dso_local i32 @main() local_unnamed_addr #0 !dbg !7 { -entry: - call void @llvm.dbg.declare(metadata [6 x i8]* undef, metadata !12, metadata !DIExpression(DW_OP_LLVM_fragment, 80, 48)), !dbg !14 - call void @llvm.dbg.value(metadata x86_fp80 0xK4000C8F5C28F5C28F800, metadata !12, metadata !DIExpression()), !dbg !15 - %puts = call i32 @puts(i8* nonnull dereferenceable(1) getelementptr inbounds ([6 x i8], [6 x i8]* @str, i64 0, i64 0)), !dbg !16 - call void @llvm.dbg.value(metadata x86_fp80 undef, metadata !12, metadata !DIExpression()), !dbg !15 - ret i32 0, !dbg !17 -} - -; Function Attrs: nounwind readnone speculatable willreturn -declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 - -; Function Attrs: nounwind readnone speculatable willreturn -declare void @llvm.dbg.value(metadata, metadata, metadata) #1 - -; Function Attrs: nofree nounwind -declare i32 @puts(i8* nocapture readonly) local_unnamed_addr #2 - -attributes #0 = { nofree nounwind uwtable } -attributes #1 = { nounwind readnone speculatable willreturn } -attributes #2 = { nofree nounwind } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) -!1 = !DIFile(filename: "implicit_value-ld.c", directory: "/home/") -!2 = !{} -!3 = !{i32 7, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 11.0.0"} -!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11) -!8 = !DISubroutineType(types: !9) -!9 = !{!10} -!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!11 = !{!12} -!12 = !DILocalVariable(name: "ld", scope: !7, file: !1, line: 2, type: !13) -!13 = !DIBasicType(name: "long double", size: 128, encoding: DW_ATE_float) -!14 = !DILocation(line: 2, column: 14, scope: !7) -!15 = !DILocation(line: 0, scope: !7) -!16 = !DILocation(line: 3, column: 2, scope: !7) -!17 = !DILocation(line: 5, column: 2, scope: !7) From 4dd5c2bee366514cbc3fc4e6da46462bc11a0a3d Mon Sep 17 00:00:00 2001 From: Sergej Jaskiewicz Date: Wed, 15 Jul 2020 18:55:37 +0300 Subject: [PATCH 0208/1035] [lit] Don't expand escapes until all substitutions have been applied Otherwise, if a Lit script contains escaped substitutions (like %%p in this test 
https://github.com/llvm/llvm-project/blob/master/compiler-rt/test/asan/TestCases/Darwin/asan-symbolize-partial-report-with-module-map.cpp#L10), they are unescaped during recursive application of substitutions, and the results are unexpected. We solve it using the fact that double percent signs are first replaced with #_MARKER_#, and only after all the other substitutions have been applied, #_MARKER_# is replaced with a single percent sign. The only change is that instead of replacing #_MARKER_# at each recursion step, we replace it once after the last recursion step. Differential Revision: https://reviews.llvm.org/D83894 --- llvm/utils/lit/lit/TestRunner.py | 20 +++++++++++-------- .../escaping/lit.cfg | 10 ++++++++++ .../escaping/test.py | 1 + .../tests/shtest-recursive-substitution.py | 4 ++++ 4 files changed, 27 insertions(+), 8 deletions(-) create mode 100644 llvm/utils/lit/tests/Inputs/shtest-recursive-substitution/escaping/lit.cfg create mode 100644 llvm/utils/lit/tests/Inputs/shtest-recursive-substitution/escaping/test.py diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index d80f0aeee8ce9..643b03fc279a1 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -1081,9 +1081,7 @@ def getDefaultSubstitutions(test, tmpDir, tmpBase, normalize_slashes=False): tmpDir = tmpDir.replace('\\', '/') tmpBase = tmpBase.replace('\\', '/') - # We use #_MARKER_# to hide %% while we do the other substitutions. substitutions = [] - substitutions.extend([('%%', '#_MARKER_#')]) substitutions.extend(test.config.substitutions) tmpName = tmpBase + '.tmp' baseName = os.path.basename(tmpBase) @@ -1093,8 +1091,7 @@ def getDefaultSubstitutions(test, tmpDir, tmpBase, normalize_slashes=False): ('%{pathsep}', os.pathsep), ('%t', tmpName), ('%basename_t', baseName), - ('%T', tmpDir), - ('#_MARKER_#', '%')]) + ('%T', tmpDir)]) # "%/[STpst]" should be normalized. substitutions.extend([ @@ -1159,6 +1156,14 @@ def applySubstitutions(script, substitutions, recursion_limit=None): `recursion_limit` times, it is an error. If the `recursion_limit` is `None` (the default), no recursive substitution is performed at all. """ + + # We use #_MARKER_# to hide %% while we do the other substitutions. + def escape(ln): + return _caching_re_compile('%%').sub('#_MARKER_#', ln) + + def unescape(ln): + return _caching_re_compile('#_MARKER_#').sub('%', ln) + def processLine(ln): # Apply substitutions for a,b in substitutions: @@ -1171,7 +1176,7 @@ def processLine(ln): # short-lived, since the set of substitutions is fairly small, and # since thrashing has such bad consequences, not bounding the cache # seems reasonable. - ln = _caching_re_compile(a).sub(str(b), ln) + ln = _caching_re_compile(a).sub(str(b), escape(ln)) # Strip the trailing newline and any extra whitespace. return ln.strip() @@ -1193,10 +1198,9 @@ def processLineToFixedPoint(ln): return processed - # Note Python 3 map() gives an iterator rather than a list so explicitly - # convert to list before returning. 
process = processLine if recursion_limit is None else processLineToFixedPoint - return list(map(process, script)) + + return [unescape(process(ln)) for ln in script] class ParserKind(object): diff --git a/llvm/utils/lit/tests/Inputs/shtest-recursive-substitution/escaping/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-recursive-substitution/escaping/lit.cfg new file mode 100644 index 0000000000000..97a4faac23870 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-recursive-substitution/escaping/lit.cfg @@ -0,0 +1,10 @@ +import lit.formats +config.name = 'escaping' +config.suffixes = ['.py'] +config.test_format = lit.formats.ShTest() +config.test_source_root = None +config.test_exec_root = None + +config.substitutions = [("%rec1", "%%s"), ("%rec2", "%rec1")] + +config.recursiveExpansionLimit = 5 diff --git a/llvm/utils/lit/tests/Inputs/shtest-recursive-substitution/escaping/test.py b/llvm/utils/lit/tests/Inputs/shtest-recursive-substitution/escaping/test.py new file mode 100644 index 0000000000000..74056e71caf47 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-recursive-substitution/escaping/test.py @@ -0,0 +1 @@ +# RUN: echo %rec2 %%s %%%%s diff --git a/llvm/utils/lit/tests/shtest-recursive-substitution.py b/llvm/utils/lit/tests/shtest-recursive-substitution.py index 1b3d5f4c44519..d446c422139bc 100644 --- a/llvm/utils/lit/tests/shtest-recursive-substitution.py +++ b/llvm/utils/lit/tests/shtest-recursive-substitution.py @@ -21,3 +21,7 @@ # RUN: %{lit} -j 1 %{inputs}/shtest-recursive-substitution/set-to-none --show-all | FileCheck --check-prefix=CHECK-TEST6 %s # CHECK-TEST6: PASS: set-to-none :: test.py + +# RUN: %{lit} -j 1 %{inputs}/shtest-recursive-substitution/escaping --show-all | FileCheck --check-prefix=CHECK-TEST7 %s +# CHECK-TEST7: PASS: escaping :: test.py +# CHECK-TEST7: $ "echo" "%s" "%s" "%%s" From f5e1ec8c5804ab7bd36f9acd43124b2029fbabc4 Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Tue, 14 Jul 2020 15:45:05 -0600 Subject: [PATCH 0209/1035] [AArch64] fjcvtzs,rmif,cfinv,setf* all clobber nzcv Differential Revision: https://reviews.llvm.org/D83818 --- llvm/lib/Target/AArch64/AArch64InstrFormats.td | 1 + llvm/lib/Target/AArch64/AArch64InstrInfo.td | 4 ++-- llvm/test/CodeGen/AArch64/cfinv-def-nzcv.mir | 17 +++++++++++++++++ llvm/test/CodeGen/AArch64/cfinv-use-nzcv.mir | 17 +++++++++++++++++ llvm/test/CodeGen/AArch64/fjcvtzs.mir | 17 +++++++++++++++++ llvm/test/CodeGen/AArch64/rmif-def-nzcv.mir | 16 ++++++++++++++++ llvm/test/CodeGen/AArch64/rmif-use-nzcv.mir | 16 ++++++++++++++++ llvm/test/CodeGen/AArch64/setf16-def-nzcv.mir | 16 ++++++++++++++++ llvm/test/CodeGen/AArch64/setf16-use-nzcv.mir | 16 ++++++++++++++++ llvm/test/CodeGen/AArch64/setf8-def-nzcv.mir | 16 ++++++++++++++++ llvm/test/CodeGen/AArch64/setf8-use-nzcv.mir | 16 ++++++++++++++++ 11 files changed, 150 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/cfinv-def-nzcv.mir create mode 100644 llvm/test/CodeGen/AArch64/cfinv-use-nzcv.mir create mode 100644 llvm/test/CodeGen/AArch64/fjcvtzs.mir create mode 100644 llvm/test/CodeGen/AArch64/rmif-def-nzcv.mir create mode 100644 llvm/test/CodeGen/AArch64/rmif-use-nzcv.mir create mode 100644 llvm/test/CodeGen/AArch64/setf16-def-nzcv.mir create mode 100644 llvm/test/CodeGen/AArch64/setf16-use-nzcv.mir create mode 100644 llvm/test/CodeGen/AArch64/setf8-def-nzcv.mir create mode 100644 llvm/test/CodeGen/AArch64/setf8-use-nzcv.mir diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 
4f4ba692c2db4..e2403c5f6347f 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -1942,6 +1942,7 @@ class BaseFlagManipulation : I<(outs), iops, asm, ops, "", []>, Sched<[WriteI, ReadI, ReadI]> { let Uses = [NZCV]; + let Defs = [NZCV]; bits<5> Rn; let Inst{31} = sf; let Inst{30-15} = 0b0111010000000000; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index d4e984754d0e9..9ea497838b634 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1030,7 +1030,7 @@ let Predicates = [HasPA] in { } // v8.3a floating point conversion for javascript -let Predicates = [HasJS, HasFPARMv8] in +let Predicates = [HasJS, HasFPARMv8], Defs = [NZCV] in def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32, "fjcvtzs", [(set GPR32:$Rd, @@ -1039,7 +1039,7 @@ def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32, } // HasJS, HasFPARMv8 // v8.4 Flag manipulation instructions -let Predicates = [HasFMI] in { +let Predicates = [HasFMI], Defs = [NZCV], Uses = [NZCV] in { def CFINV : SimpleSystemI<0, (ins), "cfinv", "">, Sched<[WriteSys]> { let Inst{20-5} = 0b0000001000000000; } diff --git a/llvm/test/CodeGen/AArch64/cfinv-def-nzcv.mir b/llvm/test/CodeGen/AArch64/cfinv-def-nzcv.mir new file mode 100644 index 0000000000000..cfcda7de52c52 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/cfinv-def-nzcv.mir @@ -0,0 +1,17 @@ +# RUN: not llc -o - %s -mtriple=arm64-eabi -run-pass=legalizer 2>&1 | FileCheck %s + +# CHECK: [[@LINE+11]]:29: missing implicit register operand 'implicit $nzcv' +... +--- +name: test_flags +liveins: + - { reg: '$w0' } + - { reg: '$x0' } +body: | + bb.0: + liveins: $w0, $x0 + + CFINV implicit-def $nzcv + RET undef $lr, implicit killed $w0 + + diff --git a/llvm/test/CodeGen/AArch64/cfinv-use-nzcv.mir b/llvm/test/CodeGen/AArch64/cfinv-use-nzcv.mir new file mode 100644 index 0000000000000..4e3b3ead77155 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/cfinv-use-nzcv.mir @@ -0,0 +1,17 @@ +# RUN: not llc -o - %s -mtriple=arm64-eabi -run-pass=legalizer 2>&1 | FileCheck %s + +# CHECK: [[@LINE+11]]:25: missing implicit register operand 'implicit-def $nzcv' +... +--- +name: test_flags +liveins: + - { reg: '$w0' } + - { reg: '$x0' } +body: | + bb.0: + liveins: $w0, $x0 + + CFINV implicit $nzcv + RET undef $lr, implicit killed $w0 + + diff --git a/llvm/test/CodeGen/AArch64/fjcvtzs.mir b/llvm/test/CodeGen/AArch64/fjcvtzs.mir new file mode 100644 index 0000000000000..efdee63669e29 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/fjcvtzs.mir @@ -0,0 +1,17 @@ +# RUN: not llc -o - %s -mtriple=arm64-eabi -mattr=+jsconv -run-pass=legalizer 2>&1 | FileCheck %s + +# CHECK: [[@LINE+11]]:49: missing implicit register operand 'implicit-def $nzcv' + +... +--- +name: test_jcvt +liveins: + - { reg: '$d0' } +body: | + bb.0: + liveins: $d0 + + renamable $w0 = FJCVTZS killed renamable $d0 + RET undef $lr, implicit killed $w0 + +... diff --git a/llvm/test/CodeGen/AArch64/rmif-def-nzcv.mir b/llvm/test/CodeGen/AArch64/rmif-def-nzcv.mir new file mode 100644 index 0000000000000..1d0903d1af8a7 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/rmif-def-nzcv.mir @@ -0,0 +1,16 @@ +# RUN: not llc -o - %s -mtriple=arm64-eabi -run-pass=legalizer 2>&1 | FileCheck %s + +# CHECK: [[@LINE+10]]:49: missing implicit register operand 'implicit $nzcv' +... 
+--- +name: test_flags +liveins: + - { reg: '$x0' } +body: | + bb.0: + liveins: $x0 + + RMIF renamable $x0, 0, 0, implicit-def $nzcv + RET undef $lr, implicit killed $w0 + + diff --git a/llvm/test/CodeGen/AArch64/rmif-use-nzcv.mir b/llvm/test/CodeGen/AArch64/rmif-use-nzcv.mir new file mode 100644 index 0000000000000..74274a2acb2e6 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/rmif-use-nzcv.mir @@ -0,0 +1,16 @@ +# RUN: not llc -o - %s -mtriple=arm64-eabi -run-pass=legalizer 2>&1 | FileCheck %s + +# CHECK: [[@LINE+10]]:45: missing implicit register operand 'implicit-def $nzcv' +... +--- +name: test_flags +liveins: + - { reg: '$x0' } +body: | + bb.0: + liveins: $x0 + + RMIF renamable $x0, 0, 0, implicit $nzcv + RET undef $lr, implicit killed $w0 + + diff --git a/llvm/test/CodeGen/AArch64/setf16-def-nzcv.mir b/llvm/test/CodeGen/AArch64/setf16-def-nzcv.mir new file mode 100644 index 0000000000000..e828ddad68e19 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/setf16-def-nzcv.mir @@ -0,0 +1,16 @@ +# RUN: not llc -o - %s -mtriple=arm64-eabi -run-pass=legalizer 2>&1 | FileCheck %s + +# CHECK: [[@LINE+10]]:45: missing implicit register operand 'implicit $nzcv' +... +--- +name: test_flags +liveins: + - { reg: '$w0' } +body: | + bb.0: + liveins: $w0 + + SETF16 renamable $w0, implicit-def $nzcv + RET undef $lr, implicit killed $w0 + + diff --git a/llvm/test/CodeGen/AArch64/setf16-use-nzcv.mir b/llvm/test/CodeGen/AArch64/setf16-use-nzcv.mir new file mode 100644 index 0000000000000..7f2c5606f2e8f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/setf16-use-nzcv.mir @@ -0,0 +1,16 @@ +# RUN: not llc -o - %s -mtriple=arm64-eabi -run-pass=legalizer 2>&1 | FileCheck %s + +# CHECK: [[@LINE+10]]:41: missing implicit register operand 'implicit-def $nzcv' +... +--- +name: test_flags +liveins: + - { reg: '$w0' } +body: | + bb.0: + liveins: $w0 + + SETF16 renamable $w0, implicit $nzcv + RET undef $lr, implicit killed $w0 + + diff --git a/llvm/test/CodeGen/AArch64/setf8-def-nzcv.mir b/llvm/test/CodeGen/AArch64/setf8-def-nzcv.mir new file mode 100644 index 0000000000000..d9a9ef00efd59 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/setf8-def-nzcv.mir @@ -0,0 +1,16 @@ +# RUN: not llc -o - %s -mtriple=arm64-eabi -run-pass=legalizer 2>&1 | FileCheck %s + +# CHECK: [[@LINE+10]]:44: missing implicit register operand 'implicit $nzcv' +... +--- +name: test_flags +liveins: + - { reg: '$w0' } +body: | + bb.0: + liveins: $w0 + + SETF8 renamable $w0, implicit-def $nzcv + RET undef $lr, implicit killed $w0 + + diff --git a/llvm/test/CodeGen/AArch64/setf8-use-nzcv.mir b/llvm/test/CodeGen/AArch64/setf8-use-nzcv.mir new file mode 100644 index 0000000000000..05d803f7b7b61 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/setf8-use-nzcv.mir @@ -0,0 +1,16 @@ +# RUN: not llc -o - %s -mtriple=arm64-eabi -run-pass=legalizer 2>&1 | FileCheck %s + +# CHECK: [[@LINE+10]]:40: missing implicit register operand 'implicit-def $nzcv' +... +--- +name: test_flags +liveins: + - { reg: '$w0' } +body: | + bb.0: + liveins: $w0 + + SETF8 renamable $w0, implicit $nzcv + RET undef $lr, implicit killed $w0 + + From c25f61cf6a61bc323af118d351a27603fdd0158d Mon Sep 17 00:00:00 2001 From: jasonliu Date: Mon, 27 Jul 2020 14:04:59 +0000 Subject: [PATCH 0210/1035] [XCOFF][AIX] Handle llvm.used and llvm.compiler.used global array For now, just return and do nothing when we see llvm.used and llvm.compiler.used global array. Hopefully, we could come up with a good solution later to prevent linker from eliminating symbols in llvm.used array. 
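For reference, the arrays in question look like this (a minimal hand-written IR
sketch; @keep is an illustrative name, while the reserved @llvm.used and
@llvm.compiler.used globals and the "llvm.metadata" section follow the usual
convention, and the test added by this patch uses the same shape):

  @keep = internal global i32 1, align 4
  ; Appending-linkage arrays in the "llvm.metadata" section; the AIX asm
  ; printer now skips these instead of emitting them as ordinary globals.
  @llvm.used = appending global [1 x i8*] [i8* bitcast (i32* @keep to i8*)], section "llvm.metadata"
  @llvm.compiler.used = appending global [1 x i8*] [i8* bitcast (i32* @keep to i8*)], section "llvm.metadata"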
Reviewed By: DiggerLin, daltenty

Differential Revision: https://reviews.llvm.org/D84363
---
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp   | 36 ++++++++++++++-------
 llvm/test/CodeGen/PowerPC/aix-xcoff-used.ll | 26 +++++++++++++++
 2 files changed, 50 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/aix-xcoff-used.ll

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index bc869c39e3934..540e620a845bc 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1678,22 +1678,31 @@ void PPCAIXAsmPrinter::ValidateGV(const GlobalVariable *GV) {
     report_fatal_error("COMDAT not yet supported by AIX.");
 }
 
-static bool isSpecialLLVMGlobalArrayForStaticInit(const GlobalVariable *GV) {
-  return StringSwitch<bool>(GV->getName())
-      .Cases("llvm.global_ctors", "llvm.global_dtors", true)
-      .Default(false);
+static bool isSpecialLLVMGlobalArrayToSkip(const GlobalVariable *GV) {
+  return GV->hasAppendingLinkage() &&
+         StringSwitch<bool>(GV->getName())
+             // TODO: Update the handling of global arrays for static init when
+             // we support the ".ref" directive.
+             // Otherwise, we can skip these arrays, because the AIX linker
+             // collects static init functions simply based on their name.
+             .Cases("llvm.global_ctors", "llvm.global_dtors", true)
+             // TODO: Linker could still eliminate the GV if we just skip
+             // handling llvm.used array. Skipping them for now until we or the
+             // AIX OS team come up with a good solution.
+             .Case("llvm.used", true)
+             // It's correct to just skip llvm.compiler.used array here.
+             .Case("llvm.compiler.used", true)
+             .Default(false);
 }
 
 void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
-  ValidateGV(GV);
-
-  // TODO: Update the handling of global arrays for static init when we support
-  // the ".ref" directive.
-  // Otherwise, we can skip these arrays, because the AIX linker collects
-  // static init functions simply based on their name.
-  if (isSpecialLLVMGlobalArrayForStaticInit(GV))
+  if (isSpecialLLVMGlobalArrayToSkip(GV))
     return;
 
+  assert(!GV->getName().startswith("llvm.") &&
+         "Unhandled intrinsic global variable.");
+  ValidateGV(GV);
+
   // Create the symbol, set its storage class.
   MCSymbolXCOFF *GVSym = cast<MCSymbolXCOFF>(getSymbol(GV));
   GVSym->setStorageClass(
@@ -1836,8 +1845,11 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) {
   // We need to know, up front, the alignment of csects for the assembly path,
   // because once a .csect directive gets emitted, we could not change the
   // alignment value on it.
-  for (const auto &G : M.globals())
+  for (const auto &G : M.globals()) {
+    if (isSpecialLLVMGlobalArrayToSkip(&G))
+      continue;
     setCsectAlignment(&G);
+  }
 
   for (const auto &F : M)
     setCsectAlignment(&F);
diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-used.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-used.ll
new file mode 100644
index 0000000000000..dd0812f3d8c78
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-used.ll
@@ -0,0 +1,26 @@
+;; This test verifies llc on AIX would not crash when llvm.used and
+;; llvm.compiler.used is presented in the IR.
+ +; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mtriple powerpc-ibm-aix-xcoff < %s | \ +; RUN: FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mtriple powerpc64-ibm-aix-xcoff < %s | \ +; RUN: FileCheck %s + +@keep_this = internal global i32 2, align 4 +@keep_this2 = internal global i32 3, align 4 +@llvm.used = appending global [1 x i8*] [i8* bitcast (i32* @keep_this to i8*)], section "llvm.metadata" +@llvm.compiler.used = appending global [1 x i8*] [i8* bitcast (i32* @keep_this2 to i8*)], section "llvm.metadata" + +; CHECK-NOT: llvm.metadata +; CHECK-NOT: llvm.used +; CHECK-NOT: llvm.compiler.used + +; CHECK: .lglobl keep_this +; CHECK: keep_this: +; CHECK: .lglobl keep_this2 +; CHECK: keep_this2: + +; CHECK-NOT: llvm.metadata +; CHECK-NOT: llvm.used +; CHECK-NOT: llvm.compiler.used From 48c948abeb7dd8a2d022749d1cc1561ddc45d8dc Mon Sep 17 00:00:00 2001 From: Vy Nguyen Date: Fri, 17 Jul 2020 19:08:48 -0400 Subject: [PATCH 0211/1035] [llvm-exegesis] Check perf_branch_entry for field cycles Summary: Follow up to breakages reported in D77422 Reviewers: ondrasej, gchatelet Tags: #llvm Differential Revision: https://reviews.llvm.org/D84076 --- llvm/cmake/modules/FindLibpfm.cmake | 14 ++++++++++++++ llvm/include/llvm/Config/config.h.cmake | 3 +++ 2 files changed, 17 insertions(+) diff --git a/llvm/cmake/modules/FindLibpfm.cmake b/llvm/cmake/modules/FindLibpfm.cmake index 202bb030e3803..38cf3af3953d9 100644 --- a/llvm/cmake/modules/FindLibpfm.cmake +++ b/llvm/cmake/modules/FindLibpfm.cmake @@ -7,6 +7,7 @@ include(CheckIncludeFile) include(CheckLibraryExists) +include(CheckCXXSourceCompiles) if (LLVM_ENABLE_LIBPFM) check_library_exists(pfm pfm_initialize "" HAVE_LIBPFM_INITIALIZE) @@ -16,6 +17,19 @@ if (LLVM_ENABLE_LIBPFM) check_include_file(perfmon/pfmlib_perf_event.h HAVE_PERFMON_PFMLIB_PERF_EVENT_H) if(HAVE_PERFMON_PERF_EVENT_H AND HAVE_PERFMON_PFMLIB_H AND HAVE_PERFMON_PFMLIB_PERF_EVENT_H) set(HAVE_LIBPFM 1) + # Check to see if perf_branch_entry has the field 'cycles'. + # We couldn't use CheckStructHasMember here because 'cycles' is a bit field which is not + # supported by CheckStructHasMember. + CHECK_CXX_SOURCE_COMPILES(" + #include + int main() { + perf_branch_entry entry; + entry.cycles = 2; + return 0; + }" COMPILE_WITH_CYCLES) + if(COMPILE_WITH_CYCLES) + set(LIBPFM_HAS_FIELD_CYCLES 1) + endif() endif() endif() endif() diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake index 298fab318c10d..4d76b27df6b61 100644 --- a/llvm/include/llvm/Config/config.h.cmake +++ b/llvm/include/llvm/Config/config.h.cmake @@ -97,6 +97,9 @@ /* Define to 1 if you have the `pfm' library (-lpfm). */ #cmakedefine HAVE_LIBPFM ${HAVE_LIBPFM} +/* Define to 1 if the `perf_branch_entry' struct has field cycles. */ +#cmakedefine LIBPFM_HAS_FIELD_CYCLES ${LIBPFM_HAS_FIELD_CYCLES} + /* Define to 1 if you have the `psapi' library (-lpsapi). */ #cmakedefine HAVE_LIBPSAPI ${HAVE_LIBPSAPI} From a52aea0ba624fcf46602bff8463b7b831e87ba55 Mon Sep 17 00:00:00 2001 From: Logan Smith Date: Mon, 27 Jul 2020 08:37:01 -0700 Subject: [PATCH 0212/1035] Use INTERFACE_COMPILE_OPTIONS to disable -Wsuggest-override for any target that links to gtest This cleans up several CMakeLists.txt's where -Wno-suggest-override was manually specified. These test targets now inherit this flag from the gtest target. Some unittests CMakeLists.txt's, in particular Flang and LLDB, are not touched by this patch. 
Flang manually adds the gtest sources itself in some configurations, rather than linking to LLVM's gtest target, so this fix would be insufficient to cover those cases. Similarly, LLDB has subdirectories that manually add the gtest headers to their include path without linking to the gtest target, so those subdirectories still need -Wno-suggest-override to be manually specified to compile without warnings. Differential Revision: https://reviews.llvm.org/D84554 --- clang-tools-extra/clangd/unittests/CMakeLists.txt | 4 ---- clang-tools-extra/unittests/CMakeLists.txt | 4 ---- clang/unittests/CMakeLists.txt | 4 ---- lld/unittests/CMakeLists.txt | 4 ---- llvm/lib/Testing/Support/CMakeLists.txt | 4 ---- llvm/unittests/CMakeLists.txt | 4 ---- llvm/utils/unittest/CMakeLists.txt | 11 ++++++++--- mlir/unittests/CMakeLists.txt | 4 ---- polly/unittests/CMakeLists.txt | 4 ---- 9 files changed, 8 insertions(+), 35 deletions(-) diff --git a/clang-tools-extra/clangd/unittests/CMakeLists.txt b/clang-tools-extra/clangd/unittests/CMakeLists.txt index 8ede92c16f7ad..c25e2b7f81037 100644 --- a/clang-tools-extra/clangd/unittests/CMakeLists.txt +++ b/clang-tools-extra/clangd/unittests/CMakeLists.txt @@ -13,10 +13,6 @@ include_directories( ${CLANGD_BINARY_DIR} ) -if (CXX_SUPPORTS_SUGGEST_OVERRIDE_FLAG) - add_compile_options("-Wno-suggest-override") -endif() - if(CLANG_BUILT_STANDALONE) # LLVMTestingSupport library is needed for clangd tests. if (EXISTS ${LLVM_MAIN_SRC_DIR}/lib/Testing/Support diff --git a/clang-tools-extra/unittests/CMakeLists.txt b/clang-tools-extra/unittests/CMakeLists.txt index 72abe0fa6d0c1..086a68e638307 100644 --- a/clang-tools-extra/unittests/CMakeLists.txt +++ b/clang-tools-extra/unittests/CMakeLists.txt @@ -5,10 +5,6 @@ function(add_extra_unittest test_dirname) add_unittest(ExtraToolsUnitTests ${test_dirname} ${ARGN}) endfunction() -if (CXX_SUPPORTS_SUGGEST_OVERRIDE_FLAG) - add_compile_options("-Wno-suggest-override") -endif() - add_subdirectory(clang-apply-replacements) add_subdirectory(clang-change-namespace) add_subdirectory(clang-doc) diff --git a/clang/unittests/CMakeLists.txt b/clang/unittests/CMakeLists.txt index 9a52b9fb02620..4c222e24599f0 100644 --- a/clang/unittests/CMakeLists.txt +++ b/clang/unittests/CMakeLists.txt @@ -1,10 +1,6 @@ add_custom_target(ClangUnitTests) set_target_properties(ClangUnitTests PROPERTIES FOLDER "Clang tests") -if (CXX_SUPPORTS_SUGGEST_OVERRIDE_FLAG) - add_compile_options("-Wno-suggest-override") -endif() - if(CLANG_BUILT_STANDALONE) # LLVMTestingSupport library is needed for some of the unittests. 
if (EXISTS ${LLVM_MAIN_SRC_DIR}/lib/Testing/Support diff --git a/lld/unittests/CMakeLists.txt b/lld/unittests/CMakeLists.txt index 88cb85a084017..84d35d43f4e87 100644 --- a/lld/unittests/CMakeLists.txt +++ b/lld/unittests/CMakeLists.txt @@ -12,9 +12,5 @@ function(add_lld_unittest test_dirname) target_link_libraries(${test_dirname} ${LLVM_COMMON_LIBS}) endfunction() -if (CXX_SUPPORTS_SUGGEST_OVERRIDE_FLAG) - add_compile_options("-Wno-suggest-override") -endif() - add_subdirectory(DriverTests) add_subdirectory(MachOTests) diff --git a/llvm/lib/Testing/Support/CMakeLists.txt b/llvm/lib/Testing/Support/CMakeLists.txt index 595221a105cd3..fe460aeefc91f 100644 --- a/llvm/lib/Testing/Support/CMakeLists.txt +++ b/llvm/lib/Testing/Support/CMakeLists.txt @@ -1,10 +1,6 @@ add_definitions(-DGTEST_LANG_CXX11=1) add_definitions(-DGTEST_HAS_TR1_TUPLE=0) -if (CXX_SUPPORTS_SUGGEST_OVERRIDE_FLAG) - add_compile_options("-Wno-suggest-override") -endif() - add_llvm_library(LLVMTestingSupport Annotations.cpp Error.cpp diff --git a/llvm/unittests/CMakeLists.txt b/llvm/unittests/CMakeLists.txt index e90f07448c100..d7dbaeaa32fe8 100644 --- a/llvm/unittests/CMakeLists.txt +++ b/llvm/unittests/CMakeLists.txt @@ -14,10 +14,6 @@ function(add_llvm_target_unittest test_dir_name) add_llvm_unittest(${test_dir_name} DISABLE_LLVM_LINK_LLVM_DYLIB ${ARGN}) endfunction() -if (CXX_SUPPORTS_SUGGEST_OVERRIDE_FLAG) - add_compile_options("-Wno-suggest-override") -endif() - add_subdirectory(ADT) add_subdirectory(Analysis) add_subdirectory(AsmParser) diff --git a/llvm/utils/unittest/CMakeLists.txt b/llvm/utils/unittest/CMakeLists.txt index 36761a60d9f72..bcae36fa150d1 100644 --- a/llvm/utils/unittest/CMakeLists.txt +++ b/llvm/utils/unittest/CMakeLists.txt @@ -43,9 +43,6 @@ endif() if(CXX_SUPPORTS_COVERED_SWITCH_DEFAULT_FLAG) add_definitions("-Wno-covered-switch-default") endif() -if(CXX_SUPPORTS_SUGGEST_OVERRIDE_FLAG) - add_definitions("-Wno-suggest-override") -endif() set(LLVM_REQUIRES_RTTI 1) add_definitions( -DGTEST_HAS_RTTI=0 ) @@ -73,6 +70,14 @@ add_llvm_library(gtest BUILDTREE_ONLY ) +# The googletest and googlemock sources don't presently use the 'override' +# keyword, which leads to lots of warnings from -Wsuggest-override. Disable +# that warning here for any targets that link to gtest. 
+if(CXX_SUPPORTS_SUGGEST_OVERRIDE_FLAG)
+  add_definitions("-Wno-suggest-override")
+  set_target_properties(gtest PROPERTIES INTERFACE_COMPILE_OPTIONS "-Wno-suggest-override")
+endif()
+
 add_subdirectory(UnitTestMain)
 
 # When LLVM_LINK_LLVM_DYLIB is enabled, libLLVM.so is added to the interface
diff --git a/mlir/unittests/CMakeLists.txt b/mlir/unittests/CMakeLists.txt
index 1dc07413a8850..851092c5b56a4 100644
--- a/mlir/unittests/CMakeLists.txt
+++ b/mlir/unittests/CMakeLists.txt
@@ -5,10 +5,6 @@ function(add_mlir_unittest test_dirname)
   add_unittest(MLIRUnitTests ${test_dirname} ${ARGN})
 endfunction()
 
-if (CXX_SUPPORTS_SUGGEST_OVERRIDE_FLAG)
-  add_compile_options("-Wno-suggest-override")
-endif()
-
 add_subdirectory(Analysis)
 add_subdirectory(Dialect)
 add_subdirectory(IR)
diff --git a/polly/unittests/CMakeLists.txt b/polly/unittests/CMakeLists.txt
index 1a6881fde3fd0..fac70383de948 100644
--- a/polly/unittests/CMakeLists.txt
+++ b/polly/unittests/CMakeLists.txt
@@ -19,10 +19,6 @@ function(add_polly_unittest test_name)
   target_link_libraries(${test_name} PRIVATE Polly)
 endfunction()
 
-if (CXX_SUPPORTS_SUGGEST_OVERRIDE_FLAG)
-  add_compile_options("-Wno-suggest-override")
-endif()
-
 add_subdirectory(Isl)
 add_subdirectory(Flatten)
 add_subdirectory(DeLICM)

From 88ce9f9b441ecbe2798f20c33c67abbfc4863b08 Mon Sep 17 00:00:00 2001
From: Jon Roelofs
Date: Wed, 22 Jul 2020 16:13:20 -0600
Subject: [PATCH 0213/1035] [TableGen][CGS] Print better errors on overlapping
 InstRW

Differential Revision: https://reviews.llvm.org/D83588
---
 llvm/include/llvm/TableGen/Error.h            |  2 +
 llvm/lib/TableGen/Error.cpp                   |  7 ++++
 .../CodeGenSchedule-duplicate-instrw.td       | 21 ++++++++++
 llvm/utils/TableGen/CodeGenSchedule.cpp       | 38 +++++++++----------
 4 files changed, 48 insertions(+), 20 deletions(-)
 create mode 100644 llvm/test/TableGen/CodeGenSchedule-duplicate-instrw.td

diff --git a/llvm/include/llvm/TableGen/Error.h b/llvm/include/llvm/TableGen/Error.h
index cf990427f5770..1eed622ab393f 100644
--- a/llvm/include/llvm/TableGen/Error.h
+++ b/llvm/include/llvm/TableGen/Error.h
@@ -20,6 +20,8 @@ namespace llvm {
 
 void PrintNote(const Twine &Msg);
 void PrintNote(ArrayRef<SMLoc> NoteLoc, const Twine &Msg);
+LLVM_ATTRIBUTE_NORETURN void PrintFatalNote(ArrayRef<SMLoc> ErrorLoc,
+                                            const Twine &Msg);
 
 void PrintWarning(ArrayRef<SMLoc> WarningLoc, const Twine &Msg);
 void PrintWarning(const char *Loc, const Twine &Msg);
diff --git a/llvm/lib/TableGen/Error.cpp b/llvm/lib/TableGen/Error.cpp
index 54b063cb4f8d2..1dfba9fb6b5dc 100644
--- a/llvm/lib/TableGen/Error.cpp
+++ b/llvm/lib/TableGen/Error.cpp
@@ -45,6 +45,13 @@ void PrintNote(ArrayRef<SMLoc> NoteLoc, const Twine &Msg) {
   PrintMessage(NoteLoc, SourceMgr::DK_Note, Msg);
 }
 
+void PrintFatalNote(ArrayRef<SMLoc> NoteLoc, const Twine &Msg) {
+  PrintNote(NoteLoc, Msg);
+  // The following call runs the file cleanup handlers.
+ sys::RunInterruptHandlers(); + std::exit(1); +} + void PrintWarning(ArrayRef WarningLoc, const Twine &Msg) { PrintMessage(WarningLoc, SourceMgr::DK_Warning, Msg); } diff --git a/llvm/test/TableGen/CodeGenSchedule-duplicate-instrw.td b/llvm/test/TableGen/CodeGenSchedule-duplicate-instrw.td new file mode 100644 index 0000000000000..e8b13d473ec8c --- /dev/null +++ b/llvm/test/TableGen/CodeGenSchedule-duplicate-instrw.td @@ -0,0 +1,21 @@ +// RUN: not llvm-tblgen --gen-subtarget -I %p/../../include -I %p/Common %s -o - 2>&1 | FileCheck %s + +include "llvm/Target/Target.td" + +def FakeTarget : Target { } + +def FakeModel : SchedMachineModel { } + +def WriteA : SchedWrite; +def WriteB : SchedWrite; + +let SchedModel = NoSchedModel in { + def : InstRW<[WriteA], (instrs COPY)>; + + def : InstRW<[WriteB], (instrs COPY)>; +// CHECK: [[@LINE-1]]:3: error: Overlapping InstRW definition for "COPY" also matches previous "(instrs COPY)". +// CHECK-NEXT: def : InstRW<[WriteB], (instrs COPY)>; + +// CHECK: [[@LINE-6]]:3: note: Previous match was here. +// CHECK-NEXT: def : InstRW<[WriteA], (instrs COPY)>; +} \ No newline at end of file diff --git a/llvm/utils/TableGen/CodeGenSchedule.cpp b/llvm/utils/TableGen/CodeGenSchedule.cpp index 67583c736cd2f..31ef38c33b56e 100644 --- a/llvm/utils/TableGen/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/CodeGenSchedule.cpp @@ -248,8 +248,7 @@ void CodeGenSchedModels::checkSTIPredicates() const { } PrintError(R->getLoc(), "STIPredicate " + Name + " multiply declared."); - PrintNote(It->second->getLoc(), "Previous declaration was here."); - PrintFatalError(R->getLoc(), "Invalid STIPredicateDecl found."); + PrintFatalNote(It->second->getLoc(), "Previous declaration was here."); } // Disallow InstructionEquivalenceClasses with an empty instruction list. @@ -454,10 +453,8 @@ void CodeGenSchedModels::checkMCInstPredicates() const { PrintError(TIIPred->getLoc(), "TIIPredicate " + Name + " is multiply defined."); - PrintNote(It->second->getLoc(), - " Previous definition of " + Name + " was here."); - PrintFatalError(TIIPred->getLoc(), - "Found conflicting definitions of TIIPredicate."); + PrintFatalNote(It->second->getLoc(), + " Previous definition of " + Name + " was here."); } } @@ -1083,13 +1080,14 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) { if (RWD->getValueAsDef("SchedModel") == RWModelDef && RWModelDef->getValueAsBit("FullInstRWOverlapCheck")) { assert(!InstDefs.empty()); // Checked at function start. - PrintFatalError - (InstRWDef->getLoc(), - "Overlapping InstRW definition for \"" + - InstDefs.front()->getName() + - "\" also matches previous \"" + - RWD->getValue("Instrs")->getValue()->getAsString() + - "\"."); + PrintError( + InstRWDef->getLoc(), + "Overlapping InstRW definition for \"" + + InstDefs.front()->getName() + + "\" also matches previous \"" + + RWD->getValue("Instrs")->getValue()->getAsString() + + "\"."); + PrintFatalNote(RWD->getLoc(), "Previous match was here."); } } LLVM_DEBUG(dbgs() << "InstRW: Reuse SC " << OldSCIdx << ":" @@ -1118,13 +1116,13 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) { for (Record *OldRWDef : SchedClasses[OldSCIdx].InstRWs) { if (OldRWDef->getValueAsDef("SchedModel") == RWModelDef) { assert(!InstDefs.empty()); // Checked at function start. 
- PrintFatalError - (InstRWDef->getLoc(), - "Overlapping InstRW definition for \"" + - InstDefs.front()->getName() + - "\" also matches previous \"" + - OldRWDef->getValue("Instrs")->getValue()->getAsString() + - "\"."); + PrintError( + InstRWDef->getLoc(), + "Overlapping InstRW definition for \"" + + InstDefs.front()->getName() + "\" also matches previous \"" + + OldRWDef->getValue("Instrs")->getValue()->getAsString() + + "\"."); + PrintFatalNote(OldRWDef->getLoc(), "Previous match was here."); } assert(OldRWDef != InstRWDef && "SchedClass has duplicate InstRW def"); From 52dd18ab1d3848f0156e4033bba2b65c3ae18a9e Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 27 Jul 2020 12:30:09 -0400 Subject: [PATCH 0214/1035] [gn build] (manually) merge 48c948abeb7 --- llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index 72b5796cd9aaa..94f0a66ecb182 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -113,6 +113,7 @@ write_cmake_config("config") { "HAVE____CHKSTK=", "HAVE____CHKSTK_MS=", "HOST_LINK_VERSION=", + "LIBPFM_HAS_FIELD_CYCLES=", "LLVM_TARGET_TRIPLE_ENV=", "LLVM_VERSION_INFO=", "LLVM_VERSION_PRINTER_SHOW_HOST_TARGET_INFO=1", From ee7caa75939afb75547c00744c5df4d04d45e517 Mon Sep 17 00:00:00 2001 From: Vy Nguyen Date: Mon, 27 Jul 2020 12:38:05 -0400 Subject: [PATCH 0215/1035] Reland [llvm-exegesis] Add benchmark latency option on X86 that uses LBR for more precise measurements. Starting with Skylake, the LBR contains the precise number of cycles between the two consecutive branches. Making use of this will hopefully make the measurements more precise than the existing methods of using RDTSC. Differential Revision: https://reviews.llvm.org/D77422 New change: check for existence of field `cycles` in perf_branch_entry before enabling this mode. This should prevent compilation errors when building for older kernel whose headers don't support it. --- llvm/docs/CommandGuide/llvm-exegesis.rst | 20 +- .../llvm-exegesis/X86/lbr/Inputs/mov_add.att | 4 + .../tools/llvm-exegesis/X86/lbr/lit.local.cfg | 31 +++ .../tools/llvm-exegesis/X86/lbr/mov-add.s | 18 ++ .../llvm-exegesis/lib/BenchmarkRunner.cpp | 5 +- llvm/tools/llvm-exegesis/lib/PerfHelper.cpp | 6 +- llvm/tools/llvm-exegesis/lib/PerfHelper.h | 15 +- .../llvm-exegesis/lib/X86/CMakeLists.txt | 1 + llvm/tools/llvm-exegesis/lib/X86/Target.cpp | 47 ++++ .../llvm-exegesis/lib/X86/X86Counter.cpp | 212 ++++++++++++++++++ llvm/tools/llvm-exegesis/lib/X86/X86Counter.h | 55 +++++ llvm/tools/llvm-exegesis/llvm-exegesis.cpp | 13 ++ 12 files changed, 415 insertions(+), 12 deletions(-) create mode 100644 llvm/test/tools/llvm-exegesis/X86/lbr/Inputs/mov_add.att create mode 100644 llvm/test/tools/llvm-exegesis/X86/lbr/lit.local.cfg create mode 100644 llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s create mode 100644 llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp create mode 100644 llvm/tools/llvm-exegesis/lib/X86/X86Counter.h diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst index 321cdf5a6dab1..8cc1a237e9969 100644 --- a/llvm/docs/CommandGuide/llvm-exegesis.rst +++ b/llvm/docs/CommandGuide/llvm-exegesis.rst @@ -192,10 +192,24 @@ OPTIONS .. option:: -mode=[latency|uops|inverse_throughput|analysis] - Specify the run mode. 
Note that if you pick `analysis` mode, you also need
-  to specify at least one of the `-analysis-clusters-output-file=` and
-  `-analysis-inconsistencies-output-file=`.
+  Specify the run mode. Note that some modes have additional requirements and options.
+  `latency` mode can make use of either RDTSC or LBR.
+  `latency[LBR]` is only available on X86 (at least `Skylake`).
+  To run in this mode, a positive value must be specified for `x86-lbr-sample-period`,
+  and `--repetition-mode=loop` must be used.
+
+  In `analysis` mode, you also need to specify at least one of the
+  `-analysis-clusters-output-file=` and `-analysis-inconsistencies-output-file=`.
+
+.. option:: -x86-lbr-sample-period=<nBranches/sample>
+
+  Specify the LBR sampling period - how many branches before we take a sample.
+  When a positive value is specified for this option and when the mode is `latency`,
+  we will use LBRs for measuring.
+  On choosing the "right" sampling period, a small value is preferred, but throttling
+  could occur if the sampling is too frequent. A prime number should be used to
+  avoid consistently skipping certain blocks.
+
 .. option:: -repetition-mode=[duplicate|loop|min]

   Specify the repetition mode. `duplicate` will create a large, straight line
diff --git a/llvm/test/tools/llvm-exegesis/X86/lbr/Inputs/mov_add.att b/llvm/test/tools/llvm-exegesis/X86/lbr/Inputs/mov_add.att
new file mode 100644
index 0000000000000..8f85b395e7319
--- /dev/null
+++ b/llvm/test/tools/llvm-exegesis/X86/lbr/Inputs/mov_add.att
@@ -0,0 +1,4 @@
+# LLVM-EXEGESIS-LIVEIN RDI
+# LLVM-EXEGESIS-DEFREG XMM1 42
+movq $2, %rdi
+addq $0x10, %rdi
\ No newline at end of file
diff --git a/llvm/test/tools/llvm-exegesis/X86/lbr/lit.local.cfg b/llvm/test/tools/llvm-exegesis/X86/lbr/lit.local.cfg
new file mode 100644
index 0000000000000..431967c1ec9b0
--- /dev/null
+++ b/llvm/test/tools/llvm-exegesis/X86/lbr/lit.local.cfg
@@ -0,0 +1,31 @@
+import subprocess
+import lit.util
+
+if not ('X86' in config.root.targets):
+    # We need support for X86.
+    config.unsupported = True
+
+elif not ('x86_64' in config.root.host_triple):
+    # We need to be running on an X86 host.
+    config.unsupported = True
+
+else:
+    # We need libpfm to be installed and the host to be at least skylake.
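+    # Rather than query the hardware directly, probe by running llvm-exegesis
+    # itself: if either trial invocation below fails, the tests are marked
+    # unsupported.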
+ llvm_exegesis_exe = lit.util.which('llvm-exegesis', config.llvm_tools_dir) + if not llvm_exegesis_exe: + print('llvm-exegesis not found') + config.unsupported = True + else: + try: + with open(os.devnull, 'w') as quiet: + check_llvm_exegesis_uops_result = subprocess.call( + [llvm_exegesis_exe, '-allowed-host-cpu', 'skylake', '-allowed-host-cpu', 'skylake-avx512', '-mode', 'uops', '-snippets-file', '/dev/null'], stdout=quiet, stderr=quiet) + check_llvm_exegesis_latency_result = subprocess.call( + [llvm_exegesis_exe, '-allowed-host-cpu', 'skylake', '-allowed-host-cpu', 'skylake-avx512', '-mode', 'latency', '-snippets-file', '/dev/null'], stdout=quiet, stderr=quiet) + except OSError: + print('could not exec llvm-exegesis') + config.unsupported = True + if not check_llvm_exegesis_uops_result == 0: + config.unsupported = True + if not check_llvm_exegesis_latency_result == 0: + config.unsupported = True diff --git a/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s b/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s new file mode 100644 index 0000000000000..5f72e8f99b30d --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s @@ -0,0 +1,18 @@ +# RUN: llvm-exegesis -mode=latency --repetition-mode=loop --x86-lbr-sample-period=521 --snippets-file=%p/Inputs/mov_add.att + + +CHECK: --- +CHECK-NEXT: mode: latency +CHECK-NEXT: key: +CHECK-NEXT: instructions: +CHECK-NEXT: 'MOV64ri32 RDI i_0x2' +CHECK-NEXT: 'ADD64ri8 RDI RDI i_0x10' +CHECK-NEXT: config: '' +CHECK-NEXT: {{.*}} +CHECK-NEXT: {{.*}} +CHECK-NEXT: {{.*}} +CHECK-NEXT: {{.*}} +CHECK-NEXT: num_repetitions: 10000 +CHECK-NEXT: measurements: +CHECK-NEXT: {{.*}} value: 0.0001, per_snippet_value: 0.0002 {{.*}} +CHECK-LAST: ... diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp index bdef8f8a89189..f015147b0fc2f 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp @@ -55,7 +55,6 @@ class FunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor { static void accumulateCounterValues(const llvm::SmallVector &NewValues, llvm::SmallVector *Result) { - const size_t NumValues = std::max(NewValues.size(), Result->size()); if (NumValues > Result->size()) Result->resize(NumValues, 0); @@ -106,10 +105,10 @@ class FunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor { if (Crashed) return make_error("snippet crashed while running"); } - auto ValueOrError = Counter->readOrError(); + + auto ValueOrError = Counter->readOrError(Function.getFunctionBytes()); if (!ValueOrError) return ValueOrError.takeError(); - accumulateCounterValues(ValueOrError.get(), &CounterValues); } return CounterValues; diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp index cba4846709e80..58e1f4dc2a2b2 100644 --- a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp +++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp @@ -128,7 +128,8 @@ int64_t Counter::read() const { return -1; } -llvm::Expected> Counter::readOrError() const { +llvm::Expected> +Counter::readOrError(StringRef /*unused*/) const { int64_t Count = 0; ssize_t ReadSize = ::read(FileDescriptor, &Count, sizeof(Count)); if (ReadSize != sizeof(Count)) @@ -152,7 +153,8 @@ void Counter::stop() {} int64_t Counter::read() const { return 42; } -llvm::Expected> Counter::readOrError() const { +llvm::Expected> +Counter::readOrError(StringRef /*unused*/) const { return llvm::make_error("Not implemented", llvm::errc::io_error); } diff --git 
a/llvm/tools/llvm-exegesis/lib/PerfHelper.h b/llvm/tools/llvm-exegesis/lib/PerfHelper.h index d41b090e85f17..19a35595c9af7 100644 --- a/llvm/tools/llvm-exegesis/lib/PerfHelper.h +++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.h @@ -59,8 +59,9 @@ class PerfEvent { // e.g. "snb_ep::INSTRUCTION_RETIRED:e=0:i=0:c=0:t=0:u=1:k=0:mg=0:mh=1" StringRef getPfmEventString() const; -private: - const std::string EventString; +protected: + PerfEvent() = default; + std::string EventString; std::string FullQualifiedEventString; perf_event_attr *Attr; }; @@ -87,11 +88,17 @@ class Counter { int64_t read() const; /// Returns the current value of the counter or error if it cannot be read. - virtual llvm::Expected> readOrError() const; + /// FunctionBytes: The benchmark function being executed. + /// This is used to filter out the measurements to ensure they are only + /// within the benchmarked code. + /// If empty (or not specified), then no filtering will be done. + /// Not all counters choose to use this. + virtual llvm::Expected> + readOrError(StringRef FunctionBytes = StringRef()) const; virtual int numValues() const; -private: +protected: PerfEvent Event; #ifdef HAVE_LIBPFM int FileDescriptor = -1; diff --git a/llvm/tools/llvm-exegesis/lib/X86/CMakeLists.txt b/llvm/tools/llvm-exegesis/lib/X86/CMakeLists.txt index 912877dd6ed1f..ce3bbd5908a83 100644 --- a/llvm/tools/llvm-exegesis/lib/X86/CMakeLists.txt +++ b/llvm/tools/llvm-exegesis/lib/X86/CMakeLists.txt @@ -6,6 +6,7 @@ include_directories( add_library(LLVMExegesisX86 STATIC Target.cpp + X86Counter.cpp ) llvm_update_compile_flags(LLVMExegesisX86) diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp index 7a84f936e0d0e..9f045fa11aa24 100644 --- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp @@ -14,15 +14,40 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86MCTargetDesc.h" #include "X86.h" +#include "X86Counter.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "llvm/ADT/Sequence.h" #include "llvm/MC/MCInstBuilder.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" +#include +#include +#include + namespace llvm { namespace exegesis { +static cl::OptionCategory + BenchmarkOptions("llvm-exegesis benchmark x86-options"); + +// If a positive value is specified, we are going to use the LBR in +// latency-mode. +// +// Note: +// - A small value is preferred, but too low a value could result in +// throttling. +// - A prime number is preferred to avoid always skipping certain blocks. +// +static cl::opt LbrSamplingPeriod( + "x86-lbr-sample-period", + cl::desc("The sample period (nbranches/sample), used for LBR sampling"), + cl::cat(BenchmarkOptions), cl::init(0)); + +// FIXME: Validates that repetition-mode is loop if LBR is requested. + // Returns a non-null reason if we cannot handle the memory references in this // instruction. static const char *isInvalidMemoryInstr(const Instruction &Instr) { @@ -568,10 +593,32 @@ void ConstantInliner::initStack(unsigned Bytes) { #include "X86GenExegesis.inc" namespace { + class ExegesisX86Target : public ExegesisTarget { public: ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {} + Expected> + createCounter(StringRef CounterName, const LLVMState &State) const override { + // If LbrSamplingPeriod was provided, then ignore the + // CounterName because we only have one for LBR. 
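+    // (X86LbrPerfEvent hard-codes the BR_INST_RETIRED.NEAR_TAKEN event;
+    // see X86Counter.cpp.)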
+ if (LbrSamplingPeriod > 0) { + // Can't use LBR without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, or without + // __linux__ (for now) +#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) && \ + defined(__linux__) + return std::make_unique( + X86LbrPerfEvent(LbrSamplingPeriod)); +#else + return llvm::make_error( + "LBR counter requested without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, " + "or running on Linux.", + llvm::errc::invalid_argument); +#endif + } + return ExegesisTarget::createCounter(CounterName, State); + } + private: void addTargetSpecificPasses(PassManagerBase &PM) const override; diff --git a/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp new file mode 100644 index 0000000000000..57b493818aaad --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp @@ -0,0 +1,212 @@ +//===-- X86Counter.cpp ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "X86Counter.h" + +// FIXME: Use appropriate wrappers for poll.h and mman.h +// to support Windows and remove this linux-only guard. +#ifdef __linux__ +#include "llvm/Support/Endian.h" +#include "llvm/Support/Errc.h" + +#ifdef HAVE_LIBPFM +#include "perfmon/perf_event.h" +#include "perfmon/pfmlib.h" +#include "perfmon/pfmlib_perf_event.h" +#endif // HAVE_LIBPFM + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) +namespace llvm { +namespace exegesis { + +static constexpr size_t kBufferPages = 8; +static const size_t kDataBufferSize = kBufferPages * getpagesize(); + +// Waits for the LBR perf events. +static int pollLbrPerfEvent(const int FileDescriptor) { + struct pollfd PollFd; + PollFd.fd = FileDescriptor; + PollFd.events = POLLIN; + PollFd.revents = 0; + return poll(&PollFd, 1 /* num of fds */, 10000 /* timeout in ms */); +} + +// Copies the data-buffer into Buf, given the pointer to MMapped. +static void copyDataBuffer(void *MMappedBuffer, char *Buf, uint64_t Tail, + size_t DataSize) { + // First page is reserved for perf_event_mmap_page. Data buffer starts on + // the next page. + char *Start = reinterpret_cast(MMappedBuffer) + getpagesize(); + // The LBR buffer is a cyclic buffer, we copy data to another buffer. + uint64_t Offset = Tail % kDataBufferSize; + size_t CopySize = kDataBufferSize - Offset; + memcpy(Buf, Start + Offset, CopySize); + if (CopySize >= DataSize) + return; + + memcpy(Buf + CopySize, Start, Offset); + return; +} + +// Parses the given data-buffer for stats and fill the CycleArray. +// If data has been extracted successfully, also modifies the code to jump +// out the benchmark loop. +static llvm::Error parseDataBuffer(const char *DataBuf, size_t DataSize, + const void *From, const void *To, + llvm::SmallVector *CycleArray) { + assert(From != nullptr && To != nullptr); + const char *DataPtr = DataBuf; + while (DataPtr < DataBuf + DataSize) { + struct perf_event_header Header; + memcpy(&Header, DataPtr, sizeof(struct perf_event_header)); + if (Header.type != PERF_RECORD_SAMPLE) { + // Ignores non-sample records. 
+ DataPtr += Header.size; + continue; + } + DataPtr += sizeof(Header); + uint64_t Count = llvm::support::endian::read64(DataPtr, support::native); + DataPtr += sizeof(Count); + + struct perf_branch_entry Entry; + memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry)); + + // Read the perf_branch_entry array. + for (uint64_t i = 0; i < Count; ++i) { + const uint64_t BlockStart = From == nullptr + ? std::numeric_limits::min() + : reinterpret_cast(From); + const uint64_t BlockEnd = To == nullptr + ? std::numeric_limits::max() + : reinterpret_cast(To); + + if (BlockStart <= Entry.from && BlockEnd >= Entry.to) + CycleArray->push_back(Entry.cycles); + + if (i == Count - 1) + // We've reached the last entry. + return llvm::Error::success(); + + // Advance to next entry + DataPtr += sizeof(Entry); + memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry)); + } + } + return llvm::make_error("Unable to parse databuffer.", + llvm::errc::io_error); +} + +X86LbrPerfEvent::X86LbrPerfEvent(unsigned SamplingPeriod) { + assert(SamplingPeriod > 0 && "SamplingPeriod must be positive"); + EventString = "BR_INST_RETIRED.NEAR_TAKEN"; + Attr = new perf_event_attr(); + Attr->size = sizeof(*Attr); + Attr->type = PERF_TYPE_RAW; + // FIXME This is SKL's encoding. Not sure if it'll change. + Attr->config = 0x20c4; // BR_INST_RETIRED.NEAR_TAKEN + Attr->sample_type = PERF_SAMPLE_BRANCH_STACK; + // Don't need to specify "USER" because we've already excluded HV and Kernel. + Attr->branch_sample_type = PERF_SAMPLE_BRANCH_ANY; + Attr->sample_period = SamplingPeriod; + Attr->wakeup_events = 1; // We need this even when using ioctl REFRESH. + Attr->disabled = 1; + Attr->exclude_kernel = 1; + Attr->exclude_hv = 1; + Attr->read_format = PERF_FORMAT_GROUP; + + FullQualifiedEventString = EventString; +} + +X86LbrCounter::X86LbrCounter(pfm::PerfEvent &&NewEvent) + : Counter(std::move(NewEvent)) { + // First page is reserved for perf_event_mmap_page. Data buffer starts on + // the next page, so we allocate one more page. + MMappedBuffer = mmap(nullptr, (kBufferPages + 1) * getpagesize(), + PROT_READ | PROT_WRITE, MAP_SHARED, FileDescriptor, 0); + if (MMappedBuffer == MAP_FAILED) + llvm::errs() << "Failed to mmap buffer."; +} + +X86LbrCounter::~X86LbrCounter() { close(FileDescriptor); } + +void X86LbrCounter::start() { + ioctl(FileDescriptor, PERF_EVENT_IOC_REFRESH, 1024 /* kMaxPollsPerFd */); +} + +llvm::Expected> +X86LbrCounter::readOrError(StringRef FunctionBytes) const { + // The max number of time-outs/retries before we give up. + static constexpr int kMaxTimeouts = 160; + + // Disable the event before reading + ioctl(FileDescriptor, PERF_EVENT_IOC_DISABLE, 0); + + // Parses the LBR buffer and fills CycleArray with the sequence of cycle + // counts from the buffer. + llvm::SmallVector CycleArray; + std::unique_ptr DataBuf(new char[kDataBufferSize]); + int NumTimeouts = 0; + int PollResult = 0; + + // Find the boundary of the function so that we could filter the LBRs + // to keep only the relevant records. 
+ if (FunctionBytes.empty()) + return llvm::make_error("Empty function bytes", + llvm::errc::invalid_argument); + const void *From = reinterpret_cast(FunctionBytes.data()); + const void *To = reinterpret_cast(FunctionBytes.data() + + FunctionBytes.size()); + while (PollResult <= 0) { + PollResult = pollLbrPerfEvent(FileDescriptor); + if (PollResult > 0) + break; + if (PollResult == -1) + return llvm::make_error("Cannot poll LBR perf event.", + llvm::errc::io_error); + if (NumTimeouts++ >= kMaxTimeouts) + return llvm::make_error( + "LBR polling still timed out after max number of attempts.", + llvm::errc::device_or_resource_busy); + } + + struct perf_event_mmap_page Page; + memcpy(&Page, MMappedBuffer, sizeof(struct perf_event_mmap_page)); + + const uint64_t DataTail = Page.data_tail; + const uint64_t DataHead = Page.data_head; + // We're supposed to use a barrier after reading data_head. + std::atomic_thread_fence(std::memory_order_acq_rel); + const size_t DataSize = DataHead - DataTail; + if (DataSize > kDataBufferSize) + return llvm::make_error( + "DataSize larger than buffer size.", llvm::errc::invalid_argument); + + copyDataBuffer(MMappedBuffer, DataBuf.get(), DataTail, DataSize); + llvm::Error error = + parseDataBuffer(DataBuf.get(), DataSize, From, To, &CycleArray); + if (!error) + return CycleArray; + return std::move(error); +} + +} // namespace exegesis +} // namespace llvm + +#endif // defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) +#endif // __linux__ diff --git a/llvm/tools/llvm-exegesis/lib/X86/X86Counter.h b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.h new file mode 100644 index 0000000000000..94062012917df --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.h @@ -0,0 +1,55 @@ +//===-- X86Counter.h --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Perf counter that reads the LBRs for measuring the benchmarked block's +/// throughput. +/// +/// More info at: https://lwn.net/Articles/680985 +//===----------------------------------------------------------------------===// +#ifndef LLVM_TOOLS_LLVM_EXEGESIS_LIB_X86_X86COUNTER_H +#define LLVM_TOOLS_LLVM_EXEGESIS_LIB_X86_X86COUNTER_H + +#include "../PerfHelper.h" +#include "llvm/Support/Error.h" + +// FIXME: Use appropriate wrappers for poll.h and mman.h +// to support Windows and remove this linux-only guard. 
+#if defined(__linux__) && defined(HAVE_LIBPFM) && \ + defined(LIBPFM_HAS_FIELD_CYCLES) + +namespace llvm { +namespace exegesis { + +class X86LbrPerfEvent : public pfm::PerfEvent { +public: + X86LbrPerfEvent(unsigned SamplingPeriod); +}; + +class X86LbrCounter : public pfm::Counter { +public: + explicit X86LbrCounter(pfm::PerfEvent &&Event); + + virtual ~X86LbrCounter(); + + void start() override; + + llvm::Expected> + readOrError(StringRef FunctionBytes) const override; + +private: + void *MMappedBuffer = nullptr; +}; + +} // namespace exegesis +} // namespace llvm + +#endif // defined(__linux__) && defined(HAVE_LIBPFM) && + // defined(LIBPFM_HAS_FIELD_CYCLES) + +#endif // LLVM_TOOLS_LLVM_EXEGESIS_LIB_X86_X86COUNTER_H diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp index 507015b97472b..8eeda48823859 100644 --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -160,6 +160,12 @@ static cl::opt cl::desc(""), cl::cat(AnalysisOptions), cl::init("")); +static cl::list + AllowedHostCpus("allowed-host-cpu", + cl::desc("If specified, only run the benchmark if the host " + "CPU matches the names"), + cl::cat(Options), cl::ZeroOrMore); + static cl::opt AnalysisDisplayUnstableOpcodes( "analysis-display-unstable-clusters", cl::desc("if there is more than one benchmark for an opcode, said " @@ -296,6 +302,13 @@ void benchmarkMain() { const LLVMState State(CpuName); + llvm::StringRef ActualCpu = State.getTargetMachine().getTargetCPU(); + for (auto Begin = AllowedHostCpus.begin(); Begin != AllowedHostCpus.end(); + ++Begin) { + if (ActualCpu != *Begin) + ExitWithError(llvm::Twine("Unexpected host CPU ").concat(ActualCpu)); + } + const std::unique_ptr Runner = ExitOnErr(State.getExegesisTarget().createBenchmarkRunner( BenchmarkMode, State, ResultAggMode)); From b6902d977ac07ad093655230dcee0a7d920e47b7 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 27 Jul 2020 12:44:47 -0400 Subject: [PATCH 0216/1035] [gn build] Make syncing to single-line source files work after 1afd889d0b43 --- llvm/utils/gn/build/sync_source_lists_from_cmake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/build/sync_source_lists_from_cmake.py b/llvm/utils/gn/build/sync_source_lists_from_cmake.py index b88b3b8007712..e0c550ed7085b 100755 --- a/llvm/utils/gn/build/sync_source_lists_from_cmake.py +++ b/llvm/utils/gn/build/sync_source_lists_from_cmake.py @@ -27,7 +27,7 @@ def patch_gn_file(gn_file, add, remove): with open(gn_file) as f: gn_contents = f.read() - srcs_tok = 'sources = [\n' + srcs_tok = 'sources = [' tokloc = gn_contents.find(srcs_tok) if tokloc == -1: raise ValueError(gn_file + ': Failed to find source list') From e77ff4abb38778904e72a38942cc8f160f661583 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 27 Jul 2020 16:45:49 +0000 Subject: [PATCH 0217/1035] [gn build] Port ee7caa75939 --- .../gn/secondary/llvm/tools/llvm-exegesis/lib/X86/BUILD.gn | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/X86/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/X86/BUILD.gn index 42bce295c9a7e..a4e028e653186 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/X86/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/X86/BUILD.gn @@ -14,6 +14,9 @@ static_library("X86") { # depend on this Target/X86-internal build target. 
"//llvm/lib/Target/X86/MCTargetDesc", ] - sources = [ "Target.cpp" ] + sources = [ + "Target.cpp", + "X86Counter.cpp", + ] include_dirs = [ "//llvm/lib/Target/X86" ] } From 4d84d94969d6a72f8cf189c5fba1e2366ed88ee3 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 27 Jul 2020 17:32:53 +0100 Subject: [PATCH 0218/1035] [X86][SSE] Relax 128-bit restriction on extract_subvector(ext_vector_inreg(X),0) -> ext_vector_inreg(extract_subvector(X,0)) fold We only need to ensure that the source is larger than the subvector result type --- llvm/lib/Target/X86/X86ISelLowering.cpp | 10 +++++++--- llvm/test/CodeGen/X86/vector-reduce-mul.ll | 12 ++++++------ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6abe6c6b83156..b986c42b9563f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48420,10 +48420,14 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG || InOpcode == ISD::SIGN_EXTEND || InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) && - VT.is128BitVector() && - InVec.getOperand(0).getSimpleValueType().is128BitVector()) { + (SizeInBits == 128 || SizeInBits == 256) && + InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) { + SDLoc DL(N); + SDValue Ext = InVec.getOperand(0); + if (Ext.getValueSizeInBits() > SizeInBits) + Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits); unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode); - return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0)); + return DAG.getNode(ExtOp, DL, VT, Ext); } if (InOpcode == ISD::VSELECT && InVec.getOperand(0).getValueType().is256BitVector() && diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll index 6e9bfbe5a79fd..23bf09c7ec9c7 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -2006,24 +2006,24 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX512BWVL-LABEL: test_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 ; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax From 8f6e84ba7b590bb6fdcd6b2b3ba32782b7802bd3 Mon Sep 17 00:00:00 2001 From: MaheshRavishankar Date: Mon, 27 Jul 2020 09:48:36 -0700 Subject: [PATCH 0219/1035] [mlir][Linalg] Enable fusion of std.constant (producer) with linalg.indexed_generic (consumer) with tensor arguments. The implementation of fusing std.constant producer with a linalg.indexed_generic consumer was already in place. It is exposed with this change. Also cleaning up some of the patterns that implement the fusion to not be templated, thereby avoiding lot of conditional checks for calling the right instantiation. Differential Revision: https://reviews.llvm.org/D84566 --- mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp | 184 +++++++++--------- mlir/test/Dialect/Linalg/fusion-tensor.mlir | 64 ++++++ 2 files changed, 161 insertions(+), 87 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp index 82dfa75fc1f4b..9080a202a824d 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp @@ -440,6 +440,10 @@ namespace { struct FuseGenericOpsOnTensors { static bool isFusible(LinalgOp producer, LinalgOp consumer, unsigned consumerIdx) { + // Producer and consumer must have tensor semantics. + if (!producer.hasTensorSemantics() || !consumer.hasTensorSemantics()) + return false; + // Verify that // - the producer has all "parallel" iterator type. if (producer.getNumParallelLoops() != producer.getNumLoops()) @@ -457,9 +461,9 @@ struct FuseGenericOpsOnTensors { return producerResultIndexMap.isPermutation(); } - static Operation *fuse(LinalgOp producer, LinalgOp consumer, - unsigned consumerIdx, PatternRewriter &rewriter, - OperationFolder *folder = nullptr) { + static LinalgOp fuse(LinalgOp producer, LinalgOp consumer, + unsigned consumerIdx, PatternRewriter &rewriter, + OperationFolder *folder = nullptr) { if (!isFusible(producer, consumer, consumerIdx)) return nullptr; @@ -736,24 +740,45 @@ static bool isTensorReshapeOpFusible(TensorReshapeOp reshapeOp, return useIndexMap.isIdentity(); } +/// Based on the type of `op` create a linalg op of the same type, i.e. if `op` +/// is a linalg.generic operation, the create a `linalg.generic` operation with +/// the given `args`. Expects `op` to be `linalg.generic` or +/// `linalg.indexed_generic`. +template +static LinalgOp createLinalgOpOfSameType(LinalgOp op, PatternRewriter &rewriter, + Args... args) { + if (isa(op.getOperation())) + return cast(rewriter.create(args...).getOperation()); + if (isa(op.getOperation())) + return cast( + rewriter.create(args...).getOperation()); + llvm_unreachable( + "expected only linalg.generic or linalg.indexed_generic ops"); + return nullptr; +} + namespace { + /// Implementation of fusion on tensor ops when producer is a TensorReshapeOp. 
-template struct FuseTensorReshapeOpAsProducer { - static bool isFusible(TensorReshapeOp producer, LinalgOpTy consumer, +struct FuseTensorReshapeOpAsProducer { + static bool isFusible(TensorReshapeOp producer, LinalgOp consumer, unsigned consumerIdx) { - return isTensorReshapeOpFusible( - producer, consumer.getInputIndexingMap(consumerIdx), true); + return isa(consumer.getOperation()) && + consumer.hasTensorSemantics() && + isTensorReshapeOpFusible(producer, + consumer.getInputIndexingMap(consumerIdx), + /*asProducer=*/true); } - static Operation *fuse(TensorReshapeOp producer, LinalgOpTy consumer, - unsigned consumerIdx, PatternRewriter &rewriter, - OperationFolder *folder = nullptr) { + static LinalgOp fuse(TensorReshapeOp producer, LinalgOp consumer, + unsigned consumerIdx, PatternRewriter &rewriter, + OperationFolder *folder = nullptr) { if (!isFusible(producer, consumer, consumerIdx)) return nullptr; // Compute the fused operands list, - SmallVector fusedOperands(consumer.operand_begin(), - consumer.operand_end()); + Operation *consumerOp = consumer.getOperation(); + SmallVector fusedOperands(consumerOp->getOperands()); fusedOperands[consumerIdx] = producer.src(); // Compute indexing_maps for the fused operation. The indexing_maps for the @@ -783,32 +808,35 @@ template struct FuseTensorReshapeOpAsProducer { llvm::map_range(fusedIndexMaps, [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); })); - auto fusedOp = rewriter.create( - rewriter.getUnknownLoc(), consumer.getResultTypes(), fusedOperands, + LinalgOp fusedOp = createLinalgOpOfSameType( + consumer, rewriter, rewriter.getUnknownLoc(), + consumerOp->getResultTypes(), fusedOperands, rewriter.getI64IntegerAttr(fusedOperands.size()), - rewriter.getI64IntegerAttr(consumer.getNumResults()), + rewriter.getI64IntegerAttr(consumerOp->getNumResults()), rewriter.getArrayAttr(indexMapAttrs), consumer.iterator_types(), /*doc=*/nullptr, /*library_call=*/nullptr, /*symbol_source=*/nullptr); - auto &fusedRegion = fusedOp.region(); - rewriter.cloneRegionBefore(consumer.region(), fusedRegion, + auto &fusedRegion = fusedOp.getOperation()->getRegion(0); + rewriter.cloneRegionBefore(consumerOp->getRegion(0), fusedRegion, fusedRegion.begin()); return fusedOp; } }; /// Implementation of fusion on tensor ops when consumer is a TensorReshapeOp. 
-template struct FuseTensorReshapeOpAsConsumer { - static bool isFusible(LinalgOpTy producer, TensorReshapeOp consumer, +struct FuseTensorReshapeOpAsConsumer { + static bool isFusible(LinalgOp producer, TensorReshapeOp consumer, unsigned consumerIdx) { - return isTensorReshapeOpFusible(consumer, producer.getOutputIndexingMap(0), - false); + return isa(producer.getOperation()) && + producer.hasTensorSemantics() && + isTensorReshapeOpFusible(consumer, producer.getOutputIndexingMap(0), + /*asProducer=*/false); } - static Operation *fuse(LinalgOpTy producer, TensorReshapeOp consumer, - unsigned consumerIdx, PatternRewriter &rewriter, - OperationFolder *folder = nullptr) { + static LinalgOp fuse(LinalgOp producer, TensorReshapeOp consumer, + unsigned consumerIdx, PatternRewriter &rewriter, + OperationFolder *folder = nullptr) { if (!isFusible(producer, consumer, consumerIdx)) return nullptr; @@ -839,33 +867,36 @@ template struct FuseTensorReshapeOpAsConsumer { return AffineMapAttr::get(map); })); - auto fusedOp = rewriter.create( - rewriter.getUnknownLoc(), consumer.getResultType(), - producer.getOperands(), - rewriter.getI64IntegerAttr(producer.getNumOperands()), + Operation *producerOp = producer.getOperation(); + LinalgOp fusedOp = createLinalgOpOfSameType( + producer, rewriter, rewriter.getUnknownLoc(), consumer.getResultType(), + producerOp->getOperands(), + rewriter.getI64IntegerAttr(producerOp->getNumOperands()), rewriter.getI64IntegerAttr(1), rewriter.getArrayAttr(indexMapAttrs), producer.iterator_types(), /*doc=*/nullptr, /*library_call=*/nullptr, /*symbol_source=*/nullptr); - auto &fusedRegion = fusedOp.region(); - rewriter.cloneRegionBefore(producer.region(), fusedRegion, + auto &fusedRegion = fusedOp.getOperation()->getRegion(0); + rewriter.cloneRegionBefore(producerOp->getRegion(0), fusedRegion, fusedRegion.begin()); return fusedOp; } }; /// Implementation of fusion on tensor ops when producer is a splat constant. -template struct FuseConstantOpAsProducer { - static bool isFusible(ConstantOp producer, LinalgOpTy consumer, +struct FuseConstantOpAsProducer { + static bool isFusible(ConstantOp producer, LinalgOp consumer, unsigned consumerIdx) { - return producer.getResult().getType().isa() && - producer.value().template cast().isSplat(); + return isa(consumer.getOperation()) && + consumer.hasTensorSemantics() && + producer.getResult().getType().isa() && + producer.value().cast().isSplat(); } - static Operation *fuse(ConstantOp producer, LinalgOpTy consumer, - unsigned consumerIdx, PatternRewriter &rewriter, - OperationFolder *folder = nullptr) { + static LinalgOp fuse(ConstantOp producer, LinalgOp consumer, + unsigned consumerIdx, PatternRewriter &rewriter, + OperationFolder *folder = nullptr) { if (!isFusible(producer, consumer, consumerIdx)) return nullptr; @@ -881,19 +912,20 @@ template struct FuseConstantOpAsProducer { // The operands list is same as the consumer with the argument for constant // index dropped. - SmallVector fusedOperands(consumer.operand_begin(), - consumer.operand_end()); + Operation *consumerOp = consumer.getOperation(); + SmallVector fusedOperands(consumerOp->getOperands()); fusedOperands.erase(std::next(fusedOperands.begin(), consumerIdx)); // Create a constant scalar value from the splat constant. 
Value scalarConstant = rewriter.create( producer.getLoc(), - producer.value().template cast().getSplatValue()); + producer.value().cast().getSplatValue()); - auto fusedOp = rewriter.create( - rewriter.getUnknownLoc(), consumer.getResultTypes(), fusedOperands, - rewriter.getI64IntegerAttr(consumer.getNumOperands() - 1), - rewriter.getI64IntegerAttr(consumer.getNumResults()), + LinalgOp fusedOp = createLinalgOpOfSameType( + consumer, rewriter, rewriter.getUnknownLoc(), + consumerOp->getResultTypes(), fusedOperands, + rewriter.getI64IntegerAttr(consumerOp->getNumOperands() - 1), + rewriter.getI64IntegerAttr(consumerOp->getNumResults()), rewriter.getAffineMapArrayAttr(fusedIndexMaps), consumer.iterator_types(), /*doc=*/nullptr, @@ -902,19 +934,18 @@ template struct FuseConstantOpAsProducer { // Map the block argument corresponding to the replaced argument with the // scalar constant. - Region &consumerRegion = consumer.region(); + Region &consumerRegion = consumerOp->getRegion(0); Block &entryBlock = *consumerRegion.begin(); - unsigned argIndex = - entryBlock.getNumArguments() - consumer.getNumOperands() + consumerIdx; + unsigned argIndex = entryBlock.getNumArguments() - + consumerOp->getNumOperands() + consumerIdx; BlockAndValueMapping mapping; mapping.map(entryBlock.getArgument(argIndex), scalarConstant); - Region &fusedRegion = fusedOp.region(); + Region &fusedRegion = fusedOp.getOperation()->getRegion(0); rewriter.cloneRegionBefore(consumerRegion, fusedRegion, fusedRegion.begin(), mapping); return fusedOp; } }; - } // namespace Operation *mlir::linalg::fuseTensorOps(PatternRewriter &rewriter, @@ -929,48 +960,27 @@ Operation *mlir::linalg::fuseTensorOps(PatternRewriter &rewriter, // Fuse when consumer is GenericOp or IndexedGenericOp. if (isa(consumer)) { - auto linalgOpConsumer = cast(consumer); - if (!linalgOpConsumer.hasTensorSemantics()) - return nullptr; - if (isa(producer)) { - auto linalgOpProducer = cast(producer); - if (linalgOpProducer.hasTensorSemantics()) - return FuseGenericOpsOnTensors::fuse(linalgOpProducer, linalgOpConsumer, - consumerIdx, rewriter, folder); - } else if (auto reshapeOpProducer = dyn_cast(producer)) { - if (auto genericOpConsumer = dyn_cast(consumer)) { - return FuseTensorReshapeOpAsProducer::fuse( - reshapeOpProducer, genericOpConsumer, consumerIdx, rewriter, - folder); - } else if (auto indexedGenericOpConsumer = - dyn_cast(consumer)) { - return FuseTensorReshapeOpAsProducer::fuse( - reshapeOpProducer, indexedGenericOpConsumer, consumerIdx, rewriter, - folder); - } - } else if (auto constantOpProducer = dyn_cast(producer)) { - if (auto genericOpConsumer = dyn_cast(consumer)) { - return FuseConstantOpAsProducer::fuse( - constantOpProducer, genericOpConsumer, consumerIdx, rewriter, - folder); - } - } + if (isa(producer)) + return FuseGenericOpsOnTensors::fuse(cast(producer), + cast(consumer), + consumerIdx, rewriter, folder); + if (auto reshapeOpProducer = dyn_cast(producer)) + return FuseTensorReshapeOpAsProducer::fuse(reshapeOpProducer, + cast(consumer), + consumerIdx, rewriter, folder); + if (auto constantOpProducer = dyn_cast(producer)) + return FuseConstantOpAsProducer::fuse(constantOpProducer, + cast(consumer), + consumerIdx, rewriter, folder); return nullptr; } - // Fuse when consumer is a TensorReshapeOp. 
- if (TensorReshapeOp reshapeOp = dyn_cast(consumer)) { - if (auto genericOpProducer = dyn_cast(producer)) { - if (genericOpProducer.hasTensorSemantics()) - return FuseTensorReshapeOpAsConsumer::fuse( - genericOpProducer, reshapeOp, consumerIdx, rewriter, folder); - } else if (auto indexedGenericOpProducer = - dyn_cast(producer)) { - if (indexedGenericOpProducer.hasTensorSemantics()) - return FuseTensorReshapeOpAsConsumer::fuse( - indexedGenericOpProducer, reshapeOp, consumerIdx, rewriter, folder); + if (isa(producer)) { + // Fuse when consumer is a TensorReshapeOp. + if (TensorReshapeOp reshapeOp = dyn_cast(consumer)) { + return FuseTensorReshapeOpAsConsumer::fuse( + cast(producer), reshapeOp, consumerIdx, rewriter, folder); } - return nullptr; } return nullptr; diff --git a/mlir/test/Dialect/Linalg/fusion-tensor.mlir b/mlir/test/Dialect/Linalg/fusion-tensor.mlir index 5c49a59227c14..4e7f1f6152b1a 100644 --- a/mlir/test/Dialect/Linalg/fusion-tensor.mlir +++ b/mlir/test/Dialect/Linalg/fusion-tensor.mlir @@ -249,6 +249,38 @@ func @generic_op_constant_fusion(%arg0 : tensor<5x?x?xf32>) -> tensor<5x?x?xf32> // ----- +#map0 = affine_map<(d0, d1, d2) -> (d0)> +#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +func @indexed_generic_op_constant_fusion(%arg0 : tensor<5x?x?xf32>) + -> tensor<5x?x?xf32> +{ + %0 = constant dense<42.0> : tensor<5xf32> + %1 = linalg.indexed_generic + {args_in = 2 : i64, args_out = 1 : i64, + indexing_maps = [#map0, #map1, #map1], + iterator_types = ["parallel", "parallel", "parallel"]} + %0, %arg0 { + ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: f32, %arg5 : f32): + %2 = mulf %arg4, %arg5 : f32 + linalg.yield %2 : f32 + }: tensor<5xf32>, tensor<5x?x?xf32> -> tensor<5x?x?xf32> + return %1 : tensor<5x?x?xf32> +} +// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +// CHECK-LABEL: func @indexed_generic_op_constant_fusion +// CHECK: %[[CST:.*]] = constant {{.*}} : f32 +// CHECK: linalg.indexed_generic +// CHECK-SAME: args_in = 1 : i64 +// CHECK-SAME: args_out = 1 : i64 +// CHECK: ^{{[a-zA-Z0-9_]*}} +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]*]]: index +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]*]]: index +// CHECK-SAME: %[[ARG3:[a-zA-Z0-9]*]]: index +// CHECK-SAME: %[[ARG4:.*]]: f32) +// CHECK: mulf %[[CST]], %[[ARG4]] + +// ----- + #map0 = affine_map<(d0, d1, d2) -> ()> #map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> func @generic_op_zero_dim_constant_fusion(%arg0 : tensor<5x?x?xf32>) @@ -277,6 +309,38 @@ func @generic_op_zero_dim_constant_fusion(%arg0 : tensor<5x?x?xf32>) // ----- +#map0 = affine_map<(d0, d1, d2) -> ()> +#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +func @indexed_generic_op_zero_dim_constant_fusion + (%arg0 : tensor<5x?x?xf32>) -> tensor<5x?x?xf32> +{ + %0 = constant dense<42.0> : tensor + %1 = linalg.indexed_generic + {args_in = 2 : i64, args_out = 1 : i64, + indexing_maps = [#map0, #map1, #map1], + iterator_types = ["parallel", "parallel", "parallel"]} + %0, %arg0 { + ^bb0(%arg1 : index, %arg2 : index, %arg3 : index, %arg4: f32, %arg5: f32): + %2 = mulf %arg4, %arg5 : f32 + linalg.yield %2 : f32 + }: tensor, tensor<5x?x?xf32> -> tensor<5x?x?xf32> + return %1 : tensor<5x?x?xf32> +} +// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +// CHECK-LABEL: func @indexed_generic_op_zero_dim_constant_fusion +// CHECK: %[[CST:.*]] = constant {{.*}} : f32 +// CHECK: linalg.indexed_generic +// CHECK-SAME: args_in = 1 : i64 +// CHECK-SAME: args_out = 1 : i64 +// CHECK: ^{{[a-zA-Z0-9_]*}} +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]*]]: 
index +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]*]]: index +// CHECK-SAME: %[[ARG3:[a-zA-Z0-9]*]]: index +// CHECK-SAME: %[[ARG4:.*]]: f32) +// CHECK: mulf %[[CST]], %[[ARG4]] + +// ----- + #map0 = affine_map<(d0, d1) -> (d0, d1)> func @generic_op_indexed_generic_op_fusion(%arg0: tensor, %arg1: tensor) { From dbeb184b7f54db2d3ef20ac153b1c77f81cf0b99 Mon Sep 17 00:00:00 2001 From: Francesco Petrogalli Date: Mon, 27 Jul 2020 16:24:18 +0000 Subject: [PATCH 0220/1035] [NFC][AArch64] Replace some template methods/invocations... ...with the non-template version, as the template version might increase the size of the compiler build. Methods affected: 1.`findAddrModeSVELoadStore` 2. `SelectPredicatedStore` Also, remove the `const` qualifier from the `unsigned` parameters of the methods to conform with other similar methods in the class. --- .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 67 +++++++------------ 1 file changed, 26 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 10c4778533533..dbd7db7ee8e6f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -262,14 +262,12 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); - template - void SelectPredicatedStore(SDNode *N, unsigned NumVecs, const unsigned Opc_rr, - const unsigned Opc_ri); - template + void SelectPredicatedStore(SDNode *N, unsigned NumVecs, unsigned Scale, + unsigned Opc_rr, unsigned Opc_ri); std::tuple - findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr, - const unsigned Opc_ri, const SDValue &OldBase, - const SDValue &OldOffset); + findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr, unsigned Opc_ri, + const SDValue &OldBase, const SDValue &OldOffset, + unsigned Scale); bool tryBitfieldExtractOp(SDNode *N); bool tryBitfieldExtractOpFromSExt(SDNode *N); @@ -1414,12 +1412,12 @@ void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, /// Optimize \param OldBase and \param OldOffset selecting the best addressing /// mode. Returns a tuple consisting of an Opcode, an SDValue representing the /// new Base and an SDValue representing the new offset. -template std::tuple -AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr, - const unsigned Opc_ri, +AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr, + unsigned Opc_ri, const SDValue &OldBase, - const SDValue &OldOffset) { + const SDValue &OldOffset, + unsigned Scale) { SDValue NewBase = OldBase; SDValue NewOffset = OldOffset; // Detect a possible Reg+Imm addressing mode. @@ -1429,7 +1427,7 @@ AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr, // Detect a possible reg+reg addressing mode, but only if we haven't already // detected a Reg+Imm one. const bool IsRegReg = - !IsRegImm && SelectSVERegRegAddrMode(OldBase, NewBase, NewOffset); + !IsRegImm && SelectSVERegRegAddrMode(OldBase, Scale, NewBase, NewOffset); // Select the instruction. return std::make_tuple(IsRegReg ? 
Opc_rr : Opc_ri, NewBase, NewOffset); @@ -1479,10 +1477,9 @@ void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, ReplaceNode(N, St); } -template void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs, - const unsigned Opc_rr, - const unsigned Opc_ri) { + unsigned Scale, unsigned Opc_rr, + unsigned Opc_ri) { SDLoc dl(N); // Form a REG_SEQUENCE to force register allocation. @@ -1492,9 +1489,9 @@ void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs, // Optimize addressing mode. unsigned Opc; SDValue Offset, Base; - std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore( + std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore( N, Opc_rr, Opc_ri, N->getOperand(NumVecs + 3), - CurDAG->getTargetConstant(0, dl, MVT::i64)); + CurDAG->getTargetConstant(0, dl, MVT::i64), Scale); SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), // predicate Base, // address @@ -4085,63 +4082,51 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_st2: { if (VT == MVT::nxv16i8) { - SelectPredicatedStore(Node, 2, AArch64::ST2B, - AArch64::ST2B_IMM); + SelectPredicatedStore(Node, 2, 0, AArch64::ST2B, AArch64::ST2B_IMM); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedStore(Node, 2, AArch64::ST2H, - AArch64::ST2H_IMM); + SelectPredicatedStore(Node, 2, 1, AArch64::ST2H, AArch64::ST2H_IMM); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedStore(Node, 2, AArch64::ST2W, - AArch64::ST2W_IMM); + SelectPredicatedStore(Node, 2, 2, AArch64::ST2W, AArch64::ST2W_IMM); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedStore(Node, 2, AArch64::ST2D, - AArch64::ST2D_IMM); + SelectPredicatedStore(Node, 2, 3, AArch64::ST2D, AArch64::ST2D_IMM); return; } break; } case Intrinsic::aarch64_sve_st3: { if (VT == MVT::nxv16i8) { - SelectPredicatedStore(Node, 3, AArch64::ST3B, - AArch64::ST3B_IMM); + SelectPredicatedStore(Node, 3, 0, AArch64::ST3B, AArch64::ST3B_IMM); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedStore(Node, 3, AArch64::ST3H, - AArch64::ST3H_IMM); + SelectPredicatedStore(Node, 3, 1, AArch64::ST3H, AArch64::ST3H_IMM); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedStore(Node, 3, AArch64::ST3W, - AArch64::ST3W_IMM); + SelectPredicatedStore(Node, 3, 2, AArch64::ST3W, AArch64::ST3W_IMM); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedStore(Node, 3, AArch64::ST3D, - AArch64::ST3D_IMM); + SelectPredicatedStore(Node, 3, 3, AArch64::ST3D, AArch64::ST3D_IMM); return; } break; } case Intrinsic::aarch64_sve_st4: { if (VT == MVT::nxv16i8) { - SelectPredicatedStore(Node, 4, AArch64::ST4B, - AArch64::ST4B_IMM); + SelectPredicatedStore(Node, 4, 0, AArch64::ST4B, AArch64::ST4B_IMM); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedStore(Node, 4, AArch64::ST4H, - AArch64::ST4H_IMM); + SelectPredicatedStore(Node, 4, 1, AArch64::ST4H, AArch64::ST4H_IMM); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedStore(Node, 4, AArch64::ST4W, - AArch64::ST4W_IMM); + SelectPredicatedStore(Node, 4, 2, AArch64::ST4W, AArch64::ST4W_IMM); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedStore(Node, 4, AArch64::ST4D, - AArch64::ST4D_IMM); + 
SelectPredicatedStore(Node, 4, 3, AArch64::ST4D, AArch64::ST4D_IMM); return; } break; From 4a4cafabc9067fced5890a245b03ef5897ad988b Mon Sep 17 00:00:00 2001 From: AlexisPerry Date: Mon, 27 Jul 2020 09:57:31 -0700 Subject: [PATCH 0221/1035] [flang] Temp Driver - pass the flag to change the default integer kind through to F18_FC fixes BUG 46307 Differential Revision: https://reviews.llvm.org/D84266 --- flang/tools/f18/f18.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/flang/tools/f18/f18.cpp b/flang/tools/f18/f18.cpp index bcafb0d53cc7b..2b2eacc2e6f11 100644 --- a/flang/tools/f18/f18.cpp +++ b/flang/tools/f18/f18.cpp @@ -545,6 +545,11 @@ int main(int argc, char *const argv[]) { defaultKinds.set_defaultIntegerKind(8); defaultKinds.set_subscriptIntegerKind(8); defaultKinds.set_sizeIntegerKind(8); + if (isPGF90) { + driver.F18_FCArgs.push_back("-i8"); + } else { + driver.F18_FCArgs.push_back("-fdefault-integer-8"); + } } else if (arg == "-Mlargearray") { } else if (arg == "-Mnolargearray") { } else if (arg == "-flarge-sizes") { From 51e1c028d4021eba64a56e684d34bb164c244d43 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 27 Jul 2020 09:40:26 -0700 Subject: [PATCH 0222/1035] [X86] Add back comment inadvertently lost in 1a1448e6568d9b11f198e510fa9c4cb6b1f4216a. --- llvm/lib/Target/X86/X86MCInstLower.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 8f3e32727371b..af86c1fc7206d 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1083,6 +1083,9 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, /// bytes. Return the size of nop emitted. static unsigned emitNop(MCStreamer &OS, unsigned NumBytes, const X86Subtarget *Subtarget) { + // Determine the longest nop which can be efficiently decoded for the given + // target cpu. 15-bytes is the longest single NOP instruction, but some + // platforms can't decode the longest forms efficiently. unsigned MaxNopLength = 1; if (Subtarget->is64Bit()) { // FIXME: We can use NOOPL on 32-bit targets with FeatureNOPL, but the From df880b77302d2e12d988e620eba242defdd6d4a7 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Mon, 27 Jul 2020 10:06:36 -0700 Subject: [PATCH 0223/1035] [StackProtector] Speed up RequiresStackProtector Speed up the method RequiresStackProtector by checking the intrinsic value of the call. The original code calls getName() that returns an allocating std::string on each check. This change removes about 96072 std::string instances when compiling sqlite3.c; The function was discovered with a Facebook-internal performance tool. 
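In sketch form, the change amounts to replacing the declaration-based
comparison with a direct intrinsic-ID check (hypothetical helper name,
but real LLVM APIs; the actual patch is in the diff below):

    #include "llvm/IR/IntrinsicInst.h"

    using namespace llvm;

    // An IntrinsicInst already carries its intrinsic ID, so the check is a
    // plain integer comparison: no per-call-site name lookup and no
    // materialization of an intrinsic declaration.
    static bool isStackProtectorCall(const Instruction &I) {
      if (const auto *II = dyn_cast<IntrinsicInst>(&I))
        return II->getIntrinsicID() == Intrinsic::stackprotector;
      return false;
    }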
Differential Revision: https://reviews.llvm.org/D84620
---
 llvm/lib/CodeGen/StackProtector.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp
index a343791807e64..e246c2e5f55cb 100644
--- a/llvm/lib/CodeGen/StackProtector.cpp
+++ b/llvm/lib/CodeGen/StackProtector.cpp
@@ -251,10 +251,9 @@ bool StackProtector::HasAddressTaken(const Instruction *AI,
 static const CallInst *findStackProtectorIntrinsic(Function &F) {
   for (const BasicBlock &BB : F)
     for (const Instruction &I : BB)
-      if (const CallInst *CI = dyn_cast<CallInst>(&I))
-        if (CI->getCalledFunction() ==
-            Intrinsic::getDeclaration(F.getParent(), Intrinsic::stackprotector))
-          return CI;
+      if (const auto *II = dyn_cast<IntrinsicInst>(&I))
+        if (II->getIntrinsicID() == Intrinsic::stackprotector)
+          return II;
   return nullptr;
 }

From 902cbcd59e22ccd853f6b0c22acc772fd955dc46 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Thu, 23 Jul 2020 23:13:44 -0700
Subject: [PATCH 0224/1035] Use llvm::is_contained where appropriate (NFC)

Summary:
This patch replaces std::find with llvm::is_contained where
appropriate.

Reviewers: efriedma, nhaehnle

Reviewed By: nhaehnle

Subscribers: arsenm, jvesely, nhaehnle, hiraditya, rogfer01, kerbowa,
llvm-commits, vkmr

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D84489
---
 llvm/lib/Analysis/AssumptionCache.cpp | 2 +-
 llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp | 4 ++--
 llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 3 +--
 llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp | 2 +-
 llvm/lib/Target/BPF/BPFMIChecking.cpp | 6 ++----
 llvm/lib/Transforms/IPO/GlobalOpt.cpp | 3 +--
 llvm/lib/Transforms/IPO/PartialInlining.cpp | 3 +--
 llvm/lib/Transforms/Scalar/GVNSink.cpp | 6 ++----
 llvm/lib/Transforms/Vectorize/VPlanSLP.cpp | 2 +-
 llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 8 ++------
 llvm/tools/bugpoint/ExtractFunction.cpp | 2 +-
 llvm/tools/llvm-exegesis/lib/Analysis.cpp | 2 +-
 12 files changed, 16 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Analysis/AssumptionCache.cpp b/llvm/lib/Analysis/AssumptionCache.cpp
index 16bfd5c75902f..bc0cdc4c4c77a 100644
--- a/llvm/lib/Analysis/AssumptionCache.cpp
+++ b/llvm/lib/Analysis/AssumptionCache.cpp
@@ -175,7 +175,7 @@ void AssumptionCache::transferAffectedValuesInCache(Value *OV, Value *NV) {
     return;

   for (auto &A : AVI->second)
-    if (std::find(NAVV.begin(), NAVV.end(), A) == NAVV.end())
+    if (!llvm::is_contained(NAVV, A))
       NAVV.push_back(A);
   AffectedValues.erase(OV);
 }
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
index a83742f2138fc..17bce517814de 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
@@ -24,7 +24,7 @@ LegalityPredicates::typeInSet(unsigned TypeIdx,
                               std::initializer_list<LLT> TypesInit) {
   SmallVector<LLT, 4> Types = TypesInit;
   return [=](const LegalityQuery &Query) {
-    return std::find(Types.begin(), Types.end(), Query.Types[TypeIdx]) != Types.end();
+    return llvm::is_contained(Types, Query.Types[TypeIdx]);
   };
 }

@@ -34,7 +34,7 @@ LegalityPredicate LegalityPredicates::typePairInSet(
   SmallVector<std::pair<LLT, LLT>, 4> Types = TypesInit;
   return [=](const LegalityQuery &Query) {
     std::pair<LLT, LLT> Match = {Query.Types[TypeIdx0], Query.Types[TypeIdx1]};
-    return std::find(Types.begin(), Types.end(), Match) != Types.end();
+    return llvm::is_contained(Types, Match);
   };
 }

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index
4a14259f1bdb1..538a22df514fe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -495,8 +495,7 @@ bool AMDGPULibCalls::isUnsafeMath(const CallInst *CI) const { } bool AMDGPULibCalls::useNativeFunc(const StringRef F) const { - return AllNative || - std::find(UseNative.begin(), UseNative.end(), F) != UseNative.end(); + return AllNative || llvm::is_contained(UseNative, F); } void AMDGPULibCalls::initNativeFuncs() { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp index 281ae6d646e9e..f898456203a10 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp @@ -96,7 +96,7 @@ namespace { SmallVector All; for (auto MD : NamedMD->operands()) for (const auto &Op : MD->operands()) - if (std::find(All.begin(), All.end(), Op.get()) == All.end()) + if (!llvm::is_contained(All, Op.get())) All.push_back(Op.get()); NamedMD->eraseFromParent(); diff --git a/llvm/lib/Target/BPF/BPFMIChecking.cpp b/llvm/lib/Target/BPF/BPFMIChecking.cpp index f82f166eda4d0..ac40b24596a26 100644 --- a/llvm/lib/Target/BPF/BPFMIChecking.cpp +++ b/llvm/lib/Target/BPF/BPFMIChecking.cpp @@ -143,12 +143,10 @@ static bool hasLiveDefs(const MachineInstr &MI, const TargetRegisterInfo *TRI) { return true; // Otherwise, return true if any aliased SuperReg of GPR32 is not dead. - std::vector::iterator search_begin = GPR64DeadDefs.begin(); - std::vector::iterator search_end = GPR64DeadDefs.end(); for (auto I : GPR32LiveDefs) for (MCSuperRegIterator SR(I, TRI); SR.isValid(); ++SR) - if (std::find(search_begin, search_end, *SR) == search_end) - return true; + if (!llvm::is_contained(GPR64DeadDefs, *SR)) + return true; return false; } diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index d9fb820f7cb53..0b650b3b58f43 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2221,8 +2221,7 @@ isValidCandidateForColdCC(Function &F, BlockFrequencyInfo &CallerBFI = GetBFI(*CallerFunc); if (!isColdCallSite(CB, CallerBFI)) return false; - auto It = std::find(AllCallsCold.begin(), AllCallsCold.end(), CallerFunc); - if (It == AllCallsCold.end()) + if (!llvm::is_contained(AllCallsCold, CallerFunc)) return false; } return true; diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp index 5d863f1330a44..e1dc036ae413c 100644 --- a/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -1181,8 +1181,7 @@ PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() { // (i.e. 
not to be extracted to the out of line function) auto ToBeInlined = [&, this](BasicBlock *BB) { return BB == ClonedOI->ReturnBlock || - (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) != - ClonedOI->Entries.end()); + llvm::is_contained(ClonedOI->Entries, BB); }; assert(ClonedOI && "Expecting OutlineInfo for single region outline"); diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index dfb4b7e038ba5..5a34ad18158b4 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -158,8 +158,7 @@ class LockstepReverseIterator { void restrictToBlocks(SmallSetVector &Blocks) { for (auto II = Insts.begin(); II != Insts.end();) { - if (std::find(Blocks.begin(), Blocks.end(), (*II)->getParent()) == - Blocks.end()) { + if (!llvm::is_contained(Blocks, (*II)->getParent())) { ActiveBlocks.remove((*II)->getParent()); II = Insts.erase(II); } else { @@ -277,8 +276,7 @@ class ModelledPHI { auto VI = Values.begin(); while (BI != Blocks.end()) { assert(VI != Values.end()); - if (std::find(NewBlocks.begin(), NewBlocks.end(), *BI) == - NewBlocks.end()) { + if (!llvm::is_contained(NewBlocks, *BI)) { BI = Blocks.erase(BI); VI = Values.erase(VI); } else { diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp index 9019ed15ec5ff..b072ca9c39206 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -124,7 +124,7 @@ bool VPlanSlp::areVectorizable(ArrayRef Operands) const { for (auto &I : *Parent) { auto *VPI = cast(&I); if (VPI->getOpcode() == Instruction::Load && - std::find(Operands.begin(), Operands.end(), VPI) != Operands.end()) + llvm::is_contained(Operands, VPI)) LoadsSeen++; if (LoadsSeen == Operands.size()) diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index b384c94121e9b..6eec8d14de4af 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -65,9 +65,7 @@ static void verifyBlocksInRegion(const VPRegionBlock *Region) { for (const VPBlockBase *Succ : Successors) { // There must be a bi-directional link between block and successor. const auto &SuccPreds = Succ->getPredecessors(); - assert(std::find(SuccPreds.begin(), SuccPreds.end(), VPB) != - SuccPreds.end() && - "Missing predecessor link."); + assert(llvm::is_contained(SuccPreds, VPB) && "Missing predecessor link."); (void)SuccPreds; } @@ -86,9 +84,7 @@ static void verifyBlocksInRegion(const VPRegionBlock *Region) { // There must be a bi-directional link between block and predecessor. const auto &PredSuccs = Pred->getSuccessors(); - assert(std::find(PredSuccs.begin(), PredSuccs.end(), VPB) != - PredSuccs.end() && - "Missing successor link."); + assert(llvm::is_contained(PredSuccs, VPB) && "Missing successor link."); (void)PredSuccs; } } diff --git a/llvm/tools/bugpoint/ExtractFunction.cpp b/llvm/tools/bugpoint/ExtractFunction.cpp index d9047acd30e11..7a75cb90edc53 100644 --- a/llvm/tools/bugpoint/ExtractFunction.cpp +++ b/llvm/tools/bugpoint/ExtractFunction.cpp @@ -386,7 +386,7 @@ BugDriver::extractMappedBlocksFromModule(const std::vector &BBs, for (Function &F : *M) for (BasicBlock &BB : F) // Check if this block is going to be extracted. 
-      if (std::find(BBs.begin(), BBs.end(), &BB) == BBs.end())
+      if (!llvm::is_contained(BBs, &BB))
         BlocksToExtract.push_back(&BB);

   raw_fd_ostream OS(Temp->FD, /*shouldClose*/ false);
diff --git a/llvm/tools/llvm-exegesis/lib/Analysis.cpp b/llvm/tools/llvm-exegesis/lib/Analysis.cpp
index 5e9023b8127ac..077acf9aff6b9 100644
--- a/llvm/tools/llvm-exegesis/lib/Analysis.cpp
+++ b/llvm/tools/llvm-exegesis/lib/Analysis.cpp
@@ -28,7 +28,7 @@ enum EscapeTag { kEscapeCsv, kEscapeHtml, kEscapeHtmlString };
 template <EscapeTag Tag> void writeEscaped(raw_ostream &OS, const StringRef S);

 template <> void writeEscaped<kEscapeCsv>(raw_ostream &OS, const StringRef S) {
-  if (std::find(S.begin(), S.end(), kCsvSep) == S.end()) {
+  if (!llvm::is_contained(S, kCsvSep)) {
     OS << S;
   } else {
     // Needs escaping.

From d19af2f2476b5e13a65d5283cce9859e2c1ef763 Mon Sep 17 00:00:00 2001
From: Mitch Phillips <31459023+hctim@users.noreply.github.com>
Date: Mon, 27 Jul 2020 10:51:53 -0700
Subject: [PATCH 0225/1035] [GWP-ASan] Crash handler API returns
 sizeof(collected trace)

Summary:
Fix a slight bug in the crash handler API: when the return buffer is too
small and the trace is truncated, the API is documented to return the size
of the collected trace, but it actually returned the size of the truncated
trace written to the buffer. It now returns the collected size, as
documented. Also, as a result, patch up a small uninitialized memory bug.

Reviewers: morehouse, eugenis

Reviewed By: eugenis

Subscribers: #sanitizers

Tags: #sanitizers

Differential Revision: https://reviews.llvm.org/D84652
---
 compiler-rt/lib/gwp_asan/common.cpp | 3 ++
 compiler-rt/lib/gwp_asan/crash_handler.cpp | 21 ++++++++--
 compiler-rt/lib/gwp_asan/tests/backtrace.cpp | 44 ++++++++++++++++++--
 3 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/compiler-rt/lib/gwp_asan/common.cpp b/compiler-rt/lib/gwp_asan/common.cpp
index 3438c4b91893b..483694d57b7e6 100644
--- a/compiler-rt/lib/gwp_asan/common.cpp
+++ b/compiler-rt/lib/gwp_asan/common.cpp
@@ -34,6 +34,9 @@ const char *ErrorToString(const Error &E) {
   __builtin_trap();
 }

+constexpr size_t AllocationMetadata::kStackFrameStorageBytes;
+constexpr size_t AllocationMetadata::kMaxTraceLengthToCollect;
+
 void AllocationMetadata::RecordAllocation(uintptr_t AllocAddr,
                                           size_t AllocSize) {
   Addr = AllocAddr;
diff --git a/compiler-rt/lib/gwp_asan/crash_handler.cpp b/compiler-rt/lib/gwp_asan/crash_handler.cpp
index c3b9e1467bd9f..3c640256706f5 100644
--- a/compiler-rt/lib/gwp_asan/crash_handler.cpp
+++ b/compiler-rt/lib/gwp_asan/crash_handler.cpp
@@ -10,6 +10,7 @@
 #include "gwp_asan/stack_trace_compressor.h"

 #include <assert.h>
+#include <string.h>

 using AllocationMetadata = gwp_asan::AllocationMetadata;
 using Error = gwp_asan::Error;
@@ -112,9 +113,15 @@ uint64_t __gwp_asan_get_allocation_thread_id(
 size_t __gwp_asan_get_allocation_trace(
     const gwp_asan::AllocationMetadata *AllocationMeta, uintptr_t *Buffer,
     size_t BufferLen) {
-  return gwp_asan::compression::unpack(
+  uintptr_t UncompressedBuffer[AllocationMetadata::kMaxTraceLengthToCollect];
+  size_t UnpackedLength = gwp_asan::compression::unpack(
       AllocationMeta->AllocationTrace.CompressedTrace,
-      AllocationMeta->AllocationTrace.TraceSize, Buffer, BufferLen);
+      AllocationMeta->AllocationTrace.TraceSize, UncompressedBuffer,
+      AllocationMetadata::kMaxTraceLengthToCollect);
+  if (UnpackedLength < BufferLen)
+    BufferLen = UnpackedLength;
+  memcpy(Buffer, UncompressedBuffer, BufferLen * sizeof(*Buffer));
+  return UnpackedLength;
 }

 bool __gwp_asan_is_deallocated(
@@ -130,9 +137,15 @@ uint64_t __gwp_asan_get_deallocation_thread_id(
 size_t __gwp_asan_get_deallocation_trace(
     const gwp_asan::AllocationMetadata *AllocationMeta, uintptr_t *Buffer,
     size_t BufferLen) {
-  return gwp_asan::compression::unpack(
+  uintptr_t UncompressedBuffer[AllocationMetadata::kMaxTraceLengthToCollect];
+  size_t UnpackedLength = gwp_asan::compression::unpack(
       AllocationMeta->DeallocationTrace.CompressedTrace,
-      AllocationMeta->DeallocationTrace.TraceSize, Buffer, BufferLen);
+      AllocationMeta->DeallocationTrace.TraceSize, UncompressedBuffer,
+      AllocationMetadata::kMaxTraceLengthToCollect);
+  if (UnpackedLength < BufferLen)
+    BufferLen = UnpackedLength;
+  memcpy(Buffer, UncompressedBuffer, BufferLen * sizeof(*Buffer));
+  return UnpackedLength;
 }

 #ifdef __cplusplus
diff --git a/compiler-rt/lib/gwp_asan/tests/backtrace.cpp b/compiler-rt/lib/gwp_asan/tests/backtrace.cpp
index b3d44270bb2a1..95150653ff61a 100644
--- a/compiler-rt/lib/gwp_asan/tests/backtrace.cpp
+++ b/compiler-rt/lib/gwp_asan/tests/backtrace.cpp
@@ -8,6 +8,7 @@

 #include <string.h>

+#include "gwp_asan/common.h"
 #include "gwp_asan/crash_handler.h"
 #include "gwp_asan/tests/harness.h"

@@ -76,9 +77,46 @@ TEST(Backtrace, Short) {
 TEST(Backtrace, ExceedsStorableLength) {
   gwp_asan::AllocationMetadata Meta;
   Meta.AllocationTrace.RecordBacktrace(
-      [](uintptr_t * /* TraceBuffer */, size_t /* Size */) -> size_t {
-        return SIZE_MAX; // Wow, that's big!
+      [](uintptr_t *TraceBuffer, size_t Size) -> size_t {
+        // Need to initialise the elements that will be packed.
+        memset(TraceBuffer, 0u, Size * sizeof(*TraceBuffer));
+
+        // Indicate that there were more frames, and we just didn't have enough
+        // room to store them.
+        return Size * 2;
+      });
+  // Retrieve a frame from the collected backtrace, make sure it works E2E.
+  uintptr_t TraceOutput;
+  EXPECT_EQ(gwp_asan::AllocationMetadata::kMaxTraceLengthToCollect,
+            __gwp_asan_get_allocation_trace(&Meta, &TraceOutput, 1));
+}
+
+TEST(Backtrace, ExceedsRetrievableAllocLength) {
+  gwp_asan::AllocationMetadata Meta;
+  constexpr size_t kNumFramesToStore = 3u;
+  Meta.AllocationTrace.RecordBacktrace(
+      [](uintptr_t *TraceBuffer, size_t /* Size */) -> size_t {
+        memset(TraceBuffer, kNumFramesToStore,
+               kNumFramesToStore * sizeof(*TraceBuffer));
+        return kNumFramesToStore;
+      });
+  uintptr_t TraceOutput;
+  // Ask for one element, get told that there's `kNumFramesToStore` available.
+  EXPECT_EQ(kNumFramesToStore,
+            __gwp_asan_get_allocation_trace(&Meta, &TraceOutput, 1));
+}
+
+TEST(Backtrace, ExceedsRetrievableDeallocLength) {
+  gwp_asan::AllocationMetadata Meta;
+  constexpr size_t kNumFramesToStore = 3u;
+  Meta.DeallocationTrace.RecordBacktrace(
+      [](uintptr_t *TraceBuffer, size_t /* Size */) -> size_t {
+        memset(TraceBuffer, kNumFramesToStore,
+               kNumFramesToStore * sizeof(*TraceBuffer));
+        return kNumFramesToStore;
       });
   uintptr_t TraceOutput;
-  EXPECT_EQ(1u, __gwp_asan_get_allocation_trace(&Meta, &TraceOutput, 1));
+  // Ask for one element, get told that there's `kNumFramesToStore` available.
+  EXPECT_EQ(kNumFramesToStore,
+            __gwp_asan_get_deallocation_trace(&Meta, &TraceOutput, 1));
 }

From 2a672767ccca4525baa323e9911f7a946cc1693d Mon Sep 17 00:00:00 2001
From: Arthur Eubanks
Date: Mon, 27 Jul 2020 10:49:16 -0700
Subject: [PATCH 0226/1035] Prefix some AArch64/ARM passes with
 "aarch64-"/"arm-"

For consistency with other target-specific passes.
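For example, opt invocations change as follows (an illustrative sketch
using the renamed flags; input.ll stands in for any test file, and the
exact spellings appear in the updated RUN lines below):

    # Before: pass names did not name their owning target.
    opt -S -stack-tagging input.ll -o -
    opt -S -sve-intrinsic-opts -mtriple=aarch64-linux-gnu input.ll -o -

    # After: the target prefix is explicit.
    opt -S -aarch64-stack-tagging input.ll -o -
    opt -S -aarch64-sve-intrinsic-opts -mtriple=aarch64-linux-gnu input.ll -o -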
Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D84560 --- llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp | 6 +++--- llvm/lib/Target/AArch64/AArch64StackTagging.cpp | 2 +- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp | 8 ++++---- llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp | 2 +- llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp | 2 +- llvm/test/CodeGen/AArch64/falkor-hwpf-fix.mir | 2 +- llvm/test/CodeGen/AArch64/falkor-hwpf.ll | 4 ++-- llvm/test/CodeGen/AArch64/stack-tagging-dbg.ll | 2 +- llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll | 2 +- llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll | 2 +- .../CodeGen/AArch64/stack-tagging-initializer-merge.ll | 2 +- .../test/CodeGen/AArch64/stack-tagging-untag-placement.ll | 2 +- llvm/test/CodeGen/AArch64/stack-tagging.ll | 4 ++-- llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll | 2 +- .../CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll | 2 +- llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll | 2 +- 16 files changed, 23 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp index 538863ebe95af..5d421cfbc2da8 100644 --- a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp +++ b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -54,7 +54,7 @@ using namespace llvm; -#define DEBUG_TYPE "falkor-hwpf-fix" +#define DEBUG_TYPE "aarch64-falkor-hwpf-fix" STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked"); STATISTIC(NumCollisionsAvoided, @@ -224,10 +224,10 @@ struct LoadInfo { char FalkorHWPFFix::ID = 0; -INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "falkor-hwpf-fix-late", +INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late", "Falkor HW Prefetch Fix Late Phase", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(FalkorHWPFFix, "falkor-hwpf-fix-late", +INITIALIZE_PASS_END(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late", "Falkor HW Prefetch Fix Late Phase", false, false) static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) { diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index 61f27cbc3b29d..0ed576fbc75f9 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -59,7 +59,7 @@ using namespace llvm; -#define DEBUG_TYPE "stack-tagging" +#define DEBUG_TYPE "aarch64-stack-tagging" static cl::opt ClMergeInit( "stack-tagging-merge-init", cl::Hidden, cl::init(true), cl::ZeroOrMore, diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index b0cef9b66e017..40d71def6d09a 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -148,10 +148,10 @@ static cl::opt EnableGlobalISelAtO( cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"), cl::init(0)); -static cl::opt EnableSVEIntrinsicOpts( - "aarch64-sve-intrinsic-opts", cl::Hidden, - cl::desc("Enable SVE intrinsic opts"), - cl::init(true)); +static cl::opt + EnableSVEIntrinsicOpts("aarch64-enable-sve-intrinsic-opts", cl::Hidden, + cl::desc("Enable SVE intrinsic opts"), + cl::init(true)); static cl::opt EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix", cl::init(true), cl::Hidden); diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp index 74fe0cdd1ea7f..542d2c9645150 100644 --- 
a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp +++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp @@ -37,7 +37,7 @@ using namespace llvm; using namespace llvm::PatternMatch; -#define DEBUG_TYPE "sve-intrinsic-opts" +#define DEBUG_TYPE "aarch64-sve-intrinsic-opts" namespace llvm { void initializeSVEIntrinsicOptsPass(PassRegistry &); diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp index 4d7ad6cd60cb9..e0ba7f88db951 100644 --- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp +++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp @@ -44,7 +44,7 @@ using namespace llvm; -#define DEBUG_TYPE "mve-gather-scatter-lowering" +#define DEBUG_TYPE "arm-mve-gather-scatter-lowering" cl::opt EnableMaskedGatherScatters( "enable-arm-maskedgatscat", cl::Hidden, cl::init(false), diff --git a/llvm/test/CodeGen/AArch64/falkor-hwpf-fix.mir b/llvm/test/CodeGen/AArch64/falkor-hwpf-fix.mir index e37d8be349480..4f144e1ef8bff 100644 --- a/llvm/test/CodeGen/AArch64/falkor-hwpf-fix.mir +++ b/llvm/test/CodeGen/AArch64/falkor-hwpf-fix.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=aarch64-linux-gnu -mcpu=falkor -run-pass falkor-hwpf-fix-late -o - %s | FileCheck %s +# RUN: llc -mtriple=aarch64-linux-gnu -mcpu=falkor -run-pass aarch64-falkor-hwpf-fix-late -o - %s | FileCheck %s --- # Verify that the tag collision between the loads is resolved for various load opcodes. diff --git a/llvm/test/CodeGen/AArch64/falkor-hwpf.ll b/llvm/test/CodeGen/AArch64/falkor-hwpf.ll index b9c7291313fb9..aa4a43f2430ba 100644 --- a/llvm/test/CodeGen/AArch64/falkor-hwpf.ll +++ b/llvm/test/CodeGen/AArch64/falkor-hwpf.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=falkor | FileCheck %s -; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s --check-prefix=NOHWPF +; RUN: opt < %s -S -aarch64-falkor-hwpf-fix -mtriple aarch64 -mcpu=falkor | FileCheck %s +; RUN: opt < %s -S -aarch64-falkor-hwpf-fix -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s --check-prefix=NOHWPF ; Check that strided access metadata is added to loads in inner loops when compiling for Falkor. 
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-dbg.ll b/llvm/test/CodeGen/AArch64/stack-tagging-dbg.ll index 9ca4ff59b24f2..1525b2d3e3531 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-dbg.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-dbg.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -stack-tagging -S -o - | FileCheck %s +; RUN: opt < %s -aarch64-stack-tagging -S -o - | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-android" diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll b/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll index 258586824530c..2099e722fe523 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -stack-tagging %s -o - | FileCheck %s +; RUN: opt -S -aarch64-stack-tagging %s -o - | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-arm-unknown-eabi" diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll b/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll index 820041e061a3c..11389d5d5ba7c 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll @@ -24,7 +24,7 @@ ; return 0; ; } -; RUN: opt -S -stack-tagging %s -o - | FileCheck %s +; RUN: opt -S -aarch64-stack-tagging %s -o - | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-unknown-eabi" diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll b/llvm/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll index 5c4be4354ea7b..9dc08c192a01b 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -stack-tagging -S -o - | FileCheck %s +; RUN: opt < %s -aarch64-stack-tagging -S -o - | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-android" diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll b/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll index 3c0dc3df98dbd..a73c79d6cc985 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll @@ -1,4 +1,4 @@ -;; RUN: opt -S -stack-tagging %s -o - | FileCheck %s +;; RUN: opt -S -aarch64-stack-tagging %s -o - | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-arm-unknown-eabi" diff --git a/llvm/test/CodeGen/AArch64/stack-tagging.ll b/llvm/test/CodeGen/AArch64/stack-tagging.ll index 20d3c6bebdec4..275b8a7dbad7e 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -stack-tagging -S -o - | FileCheck %s --check-prefixes=CHECK,SSI -; RUN: opt < %s -stack-tagging -stack-tagging-use-stack-safety=0 -S -o - | FileCheck %s --check-prefixes=CHECK,NOSSI +; RUN: opt < %s -aarch64-stack-tagging -S -o - | FileCheck %s --check-prefixes=CHECK,SSI +; RUN: opt < %s -aarch64-stack-tagging -stack-tagging-use-stack-safety=0 -S -o - | FileCheck %s --check-prefixes=CHECK,NOSSI target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-android" diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll 
b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll index 191fddacffd1d..96c754778d52e 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck --check-prefix OPT %s +; RUN: opt -S -aarch64-sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck --check-prefix OPT %s ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t ; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll index 723ffd8c17330..47e0ff8f19c7f 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck --check-prefix OPT %s +; RUN: opt -S -aarch64-sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck --check-prefix OPT %s ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t ; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll b/llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll index 22f6cd6bc3d05..e594fee53f9dc 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt --mve-gather-scatter-lowering -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -S -o 2>/dev/null - | FileCheck %s +; RUN: opt --arm-mve-gather-scatter-lowering -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -S -o 2>/dev/null - | FileCheck %s define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) { ; CHECK-LABEL: @push_out_add_sub_block( From 7832d0f63d3210b4a4f0e8bfc968ebe44dba0da5 Mon Sep 17 00:00:00 2001 From: aartbik Date: Mon, 27 Jul 2020 11:22:26 -0700 Subject: [PATCH 0227/1035] [mlir] [VectorOps] [integration_test] Sparse matrix times vector (DOT version) Integration test that illustrates the gather operation with a real-world operation expressed in mostly the Vector dialect. Uses jagged diagonal storage. 
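The inner loop of the kernel pairs a masked gather (an indexed load from
the dense vector) with a dot-product contraction. Schematically, per
stored row of four values (excerpted from the test added below, where
%aidx holds one row of column indices and %aval the matching values):

    %0 = vector.gather %X, %aidx, %mask
        : (memref<?xf32>, vector<4xi32>, vector<4xi1>) -> vector<4xf32>
    %1 = vector.contract #dot_trait %aval, %0, %f0
        : vector<4xf32>, vector<4xf32> into f32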
Reviewed By: bondhugula Differential Revision: https://reviews.llvm.org/D84571 --- .../Vector/CPU/test-sparse-dot-matvec.mlir | 270 ++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100644 mlir/integration_test/Dialect/Vector/CPU/test-sparse-dot-matvec.mlir diff --git a/mlir/integration_test/Dialect/Vector/CPU/test-sparse-dot-matvec.mlir b/mlir/integration_test/Dialect/Vector/CPU/test-sparse-dot-matvec.mlir new file mode 100644 index 0000000000000..088e68e0e507c --- /dev/null +++ b/mlir/integration_test/Dialect/Vector/CPU/test-sparse-dot-matvec.mlir @@ -0,0 +1,270 @@ +// RUN: mlir-opt %s -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +// Illustrates an 8x8 Sparse Matrix x Vector implemented with only operations +// of the vector dialect (and some std/scf). Essentially, this example performs +// the following multiplication: +// +// 0 1 2 3 4 5 6 7 +// +------------------------+ +// 0 | 1 0 2 0 0 1 0 1 | | 1 | | 21 | +// 1 | 1 8 0 0 3 0 1 0 | | 2 | | 39 | +// 2 | 0 0 1 0 0 2 6 2 | | 3 | | 73 | +// 3 | 0 3 0 1 0 1 0 1 | x | 4 | = | 24 | +// 4 | 5 0 0 1 1 1 0 0 | | 5 | | 20 | +// 5 | 0 3 0 0 2 1 2 0 | | 6 | | 36 | +// 6 | 4 0 7 0 1 0 1 0 | | 7 | | 37 | +// 7 | 0 3 0 2 0 0 1 1 | | 8 | | 29 | +// +------------------------+ +// +// The sparse storage scheme used is an extended column scheme (also referred +// to as jagged diagonal, which is essentially a vector friendly variant of +// the general sparse row-wise scheme (also called compressed row storage), +// using fixed length vectors and no explicit pointer indexing into the +// value array to find the rows. +// +// The extended column storage for the matrix shown above is as follows. +// +// VALUE INDEX +// +---------+ +---------+ +// 0 | 1 2 1 1 | | 0 2 5 7 | +// 1 | 1 8 3 1 | | 0 1 4 6 | +// 2 | 1 2 6 2 | | 2 5 6 7 | +// 3 | 3 1 1 1 | | 1 3 5 7 | +// 4 | 5 1 1 1 | | 0 3 4 5 | +// 5 | 3 2 1 2 | | 1 4 5 6 | +// 6 | 4 7 1 1 | | 0 2 4 6 | +// 7 | 3 2 1 1 | | 1 3 6 7 | +// +---------+ +---------+ +// +// This example illustrates a DOT version for the operation. Another example +// in this directory illustrates an effective SAXPY version that operates on the +// transposed jagged diagonal storage to obtain higher vector lengths. 
+
+#contraction_accesses = [
+  affine_map<(i) -> (i)>,
+  affine_map<(i) -> (i)>,
+  affine_map<(i) -> ()>
+]
+#dot_trait = {
+  indexing_maps = #contraction_accesses,
+  iterator_types = ["reduction"]
+}
+
+func @spmv8x8(%AVAL: memref<8xvector<4xf32>>,
+              %AIDX: memref<8xvector<4xi32>>, %X: memref<?xf32>, %B: memref<?xf32>) {
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %cn = constant 8 : index
+  %f0 = constant 0.0 : f32
+  %mask = vector.constant_mask [4] : vector<4xi1>
+  scf.for %i = %c0 to %cn step %c1 {
+    %aval = load %AVAL[%i] : memref<8xvector<4xf32>>
+    %aidx = load %AIDX[%i] : memref<8xvector<4xi32>>
+    %0 = vector.gather %X, %aidx, %mask
+        : (memref<?xf32>, vector<4xi32>, vector<4xi1>) -> vector<4xf32>
+    %1 = vector.contract #dot_trait %aval, %0, %f0 : vector<4xf32>, vector<4xf32> into f32
+    store %1, %B[%i] : memref<?xf32>
+  }
+  return
+}
+
+func @entry() {
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %c2 = constant 2 : index
+  %c3 = constant 3 : index
+  %c4 = constant 4 : index
+  %c5 = constant 5 : index
+  %c6 = constant 6 : index
+  %c7 = constant 7 : index
+  %c8 = constant 8 : index
+
+  %f0 = constant 0.0 : f32
+  %f1 = constant 1.0 : f32
+  %f2 = constant 2.0 : f32
+  %f3 = constant 3.0 : f32
+  %f4 = constant 4.0 : f32
+  %f5 = constant 5.0 : f32
+  %f6 = constant 6.0 : f32
+  %f7 = constant 7.0 : f32
+  %f8 = constant 8.0 : f32
+
+  %i0 = constant 0 : i32
+  %i1 = constant 1 : i32
+  %i2 = constant 2 : i32
+  %i3 = constant 3 : i32
+  %i4 = constant 4 : i32
+  %i5 = constant 5 : i32
+  %i6 = constant 6 : i32
+  %i7 = constant 7 : i32
+
+  //
+  // Allocate.
+  //
+
+  %AVAL = alloc() {alignment = 64} : memref<8xvector<4xf32>>
+  %AIDX = alloc() {alignment = 64} : memref<8xvector<4xi32>>
+  %X = alloc(%c8) {alignment = 64} : memref<?xf32>
+  %B = alloc(%c8) {alignment = 64} : memref<?xf32>
+
+  //
+  // Initialize.
+ // + + %vf1 = vector.broadcast %f1 : f32 to vector<4xf32> + + %0 = vector.insert %f2, %vf1[1] : f32 into vector<4xf32> + store %0, %AVAL[%c0] : memref<8xvector<4xf32>> + + %1 = vector.insert %f8, %vf1[1] : f32 into vector<4xf32> + %2 = vector.insert %f3, %1[2] : f32 into vector<4xf32> + store %2, %AVAL[%c1] : memref<8xvector<4xf32>> + + %3 = vector.insert %f2, %vf1[1] : f32 into vector<4xf32> + %4 = vector.insert %f6, %3[2] : f32 into vector<4xf32> + %5 = vector.insert %f2, %4[3] : f32 into vector<4xf32> + store %5, %AVAL[%c2] : memref<8xvector<4xf32>> + + %6 = vector.insert %f3, %vf1[0] : f32 into vector<4xf32> + store %6, %AVAL[%c3] : memref<8xvector<4xf32>> + + %7 = vector.insert %f5, %vf1[0] : f32 into vector<4xf32> + store %7, %AVAL[%c4] : memref<8xvector<4xf32>> + + %8 = vector.insert %f3, %vf1[0] : f32 into vector<4xf32> + %9 = vector.insert %f2, %8[1] : f32 into vector<4xf32> + %10 = vector.insert %f2, %9[3] : f32 into vector<4xf32> + store %10, %AVAL[%c5] : memref<8xvector<4xf32>> + + %11 = vector.insert %f4, %vf1[0] : f32 into vector<4xf32> + %12 = vector.insert %f7, %11[1] : f32 into vector<4xf32> + store %12, %AVAL[%c6] : memref<8xvector<4xf32>> + + %13 = vector.insert %f3, %vf1[0] : f32 into vector<4xf32> + %14 = vector.insert %f2, %13[1] : f32 into vector<4xf32> + store %14, %AVAL[%c7] : memref<8xvector<4xf32>> + + %vi0 = vector.broadcast %i0 : i32 to vector<4xi32> + + %20 = vector.insert %i2, %vi0[1] : i32 into vector<4xi32> + %21 = vector.insert %i5, %20[2] : i32 into vector<4xi32> + %22 = vector.insert %i7, %21[3] : i32 into vector<4xi32> + store %22, %AIDX[%c0] : memref<8xvector<4xi32>> + + %23 = vector.insert %i1, %vi0[1] : i32 into vector<4xi32> + %24 = vector.insert %i4, %23[2] : i32 into vector<4xi32> + %25 = vector.insert %i6, %24[3] : i32 into vector<4xi32> + store %25, %AIDX[%c1] : memref<8xvector<4xi32>> + + %26 = vector.insert %i2, %vi0[0] : i32 into vector<4xi32> + %27 = vector.insert %i5, %26[1] : i32 into vector<4xi32> + %28 = vector.insert %i6, %27[2] : i32 into vector<4xi32> + %29 = vector.insert %i7, %28[3] : i32 into vector<4xi32> + store %29, %AIDX[%c2] : memref<8xvector<4xi32>> + + %30 = vector.insert %i1, %vi0[0] : i32 into vector<4xi32> + %31 = vector.insert %i3, %30[1] : i32 into vector<4xi32> + %32 = vector.insert %i5, %31[2] : i32 into vector<4xi32> + %33 = vector.insert %i7, %32[3] : i32 into vector<4xi32> + store %33, %AIDX[%c3] : memref<8xvector<4xi32>> + + %34 = vector.insert %i3, %vi0[1] : i32 into vector<4xi32> + %35 = vector.insert %i4, %34[2] : i32 into vector<4xi32> + %36 = vector.insert %i5, %35[3] : i32 into vector<4xi32> + store %36, %AIDX[%c4] : memref<8xvector<4xi32>> + + %37 = vector.insert %i1, %vi0[0] : i32 into vector<4xi32> + %38 = vector.insert %i4, %37[1] : i32 into vector<4xi32> + %39 = vector.insert %i5, %38[2] : i32 into vector<4xi32> + %40 = vector.insert %i6, %39[3] : i32 into vector<4xi32> + store %40, %AIDX[%c5] : memref<8xvector<4xi32>> + + %41 = vector.insert %i2, %vi0[1] : i32 into vector<4xi32> + %42 = vector.insert %i4, %41[2] : i32 into vector<4xi32> + %43 = vector.insert %i6, %42[3] : i32 into vector<4xi32> + store %43, %AIDX[%c6] : memref<8xvector<4xi32>> + + %44 = vector.insert %i1, %vi0[0] : i32 into vector<4xi32> + %45 = vector.insert %i3, %44[1] : i32 into vector<4xi32> + %46 = vector.insert %i6, %45[2] : i32 into vector<4xi32> + %47 = vector.insert %i7, %46[3] : i32 into vector<4xi32> + store %47, %AIDX[%c7] : memref<8xvector<4xi32>> + + scf.for %i = %c0 to %c8 step %c1 { + %ix = addi %i, %c1 : index + %kx 
= index_cast %ix : index to i32
+    %fx = sitofp %kx : i32 to f32
+    store %fx, %X[%i] : memref<?xf32>
+    store %f0, %B[%i] : memref<?xf32>
+  }
+
+  //
+  // Multiply.
+  //
+
+  call @spmv8x8(%AVAL, %AIDX, %X, %B) : (memref<8xvector<4xf32>>,
+                                         memref<8xvector<4xi32>>,
+                                         memref<?xf32>, memref<?xf32>) -> ()
+
+  //
+  // Print and verify.
+  //
+
+  scf.for %i = %c0 to %c8 step %c1 {
+    %aval = load %AVAL[%i] : memref<8xvector<4xf32>>
+    vector.print %aval : vector<4xf32>
+  }
+
+  scf.for %i = %c0 to %c8 step %c1 {
+    %aidx = load %AIDX[%i] : memref<8xvector<4xi32>>
+    vector.print %aidx : vector<4xi32>
+  }
+
+  scf.for %i = %c0 to %c8 step %c1 {
+    %ldb = load %B[%i] : memref<?xf32>
+    vector.print %ldb : f32
+  }
+
+  //
+  // CHECK: ( 1, 2, 1, 1 )
+  // CHECK-NEXT: ( 1, 8, 3, 1 )
+  // CHECK-NEXT: ( 1, 2, 6, 2 )
+  // CHECK-NEXT: ( 3, 1, 1, 1 )
+  // CHECK-NEXT: ( 5, 1, 1, 1 )
+  // CHECK-NEXT: ( 3, 2, 1, 2 )
+  // CHECK-NEXT: ( 4, 7, 1, 1 )
+  // CHECK-NEXT: ( 3, 2, 1, 1 )
+  //
+  // CHECK-NEXT: ( 0, 2, 5, 7 )
+  // CHECK-NEXT: ( 0, 1, 4, 6 )
+  // CHECK-NEXT: ( 2, 5, 6, 7 )
+  // CHECK-NEXT: ( 1, 3, 5, 7 )
+  // CHECK-NEXT: ( 0, 3, 4, 5 )
+  // CHECK-NEXT: ( 1, 4, 5, 6 )
+  // CHECK-NEXT: ( 0, 2, 4, 6 )
+  // CHECK-NEXT: ( 1, 3, 6, 7 )
+  //
+  // CHECK-NEXT: 21
+  // CHECK-NEXT: 39
+  // CHECK-NEXT: 73
+  // CHECK-NEXT: 24
+  // CHECK-NEXT: 20
+  // CHECK-NEXT: 36
+  // CHECK-NEXT: 37
+  // CHECK-NEXT: 29
+  //
+
+  //
+  // Free.
+  //
+
+  dealloc %AVAL : memref<8xvector<4xf32>>
+  dealloc %AIDX : memref<8xvector<4xi32>>
+  dealloc %X : memref<?xf32>
+  dealloc %B : memref<?xf32>
+
+  return
+}

From b52b2e1c188072e3cbc91500cfd503fb26d50ffc Mon Sep 17 00:00:00 2001
From: Dokyung Song
Date: Thu, 16 Jul 2020 21:24:06 +0000
Subject: [PATCH 0228/1035] Recommit "[libFuzzer] Disable implicit builtin
 knowledge about memcmp-like functions when -fsanitize=fuzzer-no-link is
 given."

Summary:
This patch disables implicit builtin knowledge about memcmp-like functions
when compiling the program for fuzzing, i.e., when -fsanitize=fuzzer(-no-link)
is given. This allows libFuzzer to always intercept memcmp-like functions as
it effectively disables optimizing calls to such functions into different
forms. This is done by adding a set of flags (-fno-builtin-memcmp and others)
in the clang driver.

Individual -fno-builtin-* flags previously used in several libFuzzer tests are
now removed, as it is now done automatically in the clang driver.

The patch was once reverted in 8ef9e2bf355d05bc81d8b0fe1e5333eec59a0a91, as
this patch was dependent on a reverted commit
f78d9fceea736d431e9e3cbca291e3909e3aa46d. This reverted commit was recommitted
in 831ae45e3dc609e43ba561af07670a8fe47461ef, so relanding this dependent patch
too.
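As an illustrative sketch (file name hypothetical), building a fuzz
target now only needs

    clang++ -fsanitize=fuzzer MyTarget.cpp

and the driver injects the -fno-builtin-* set itself, so memcmp and
friends stay out-of-line calls that libFuzzer's interceptors can observe.
Previously each target had to pass flags like -fno-builtin-memcmp by
hand, as the updated RUN lines below show.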
Reviewers: morehouse, hctim Subscribers: cfe-commits, #sanitizers Tags: #clang, #sanitizers Differential Revision: https://reviews.llvm.org/D83987 --- clang/lib/Driver/SanitizerArgs.cpp | 17 +++++++++++++++++ compiler-rt/test/fuzzer/noasan-bcmp.test | 4 ++++ compiler-rt/test/fuzzer/noasan-memcmp.test | 4 ++-- compiler-rt/test/fuzzer/noasan-memcmp64.test | 2 +- compiler-rt/test/fuzzer/noasan-strcmp.test | 4 ++-- compiler-rt/test/fuzzer/noasan-strncmp.test | 4 ++-- compiler-rt/test/fuzzer/noasan-strstr.test | 4 ++-- 7 files changed, 30 insertions(+), 9 deletions(-) create mode 100644 compiler-rt/test/fuzzer/noasan-bcmp.test diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index e4fda752c041d..8c49e92b2c0f9 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -1088,6 +1088,23 @@ void SanitizerArgs::addArgs(const ToolChain &TC, const llvm::opt::ArgList &Args, Sanitizers.has(SanitizerKind::Address)) CmdArgs.push_back("-fno-assume-sane-operator-new"); + // libFuzzer wants to intercept calls to certain library functions, so the + // following -fno-builtin-* flags force the compiler to emit interposable + // libcalls to these functions. Other sanitizers effectively do the same thing + // by marking all library call sites with NoBuiltin attribute in their LLVM + // pass. (see llvm::maybeMarkSanitizerLibraryCallNoBuiltin) + if (Sanitizers.has(SanitizerKind::FuzzerNoLink)) { + CmdArgs.push_back("-fno-builtin-bcmp"); + CmdArgs.push_back("-fno-builtin-memcmp"); + CmdArgs.push_back("-fno-builtin-strncmp"); + CmdArgs.push_back("-fno-builtin-strcmp"); + CmdArgs.push_back("-fno-builtin-strncasecmp"); + CmdArgs.push_back("-fno-builtin-strcasecmp"); + CmdArgs.push_back("-fno-builtin-strstr"); + CmdArgs.push_back("-fno-builtin-strcasestr"); + CmdArgs.push_back("-fno-builtin-memmem"); + } + // Require -fvisibility= flag on non-Windows when compiling if vptr CFI is // enabled. 
if (Sanitizers.hasOneOf(CFIClasses) && !TC.getTriple().isOSWindows() && diff --git a/compiler-rt/test/fuzzer/noasan-bcmp.test b/compiler-rt/test/fuzzer/noasan-bcmp.test new file mode 100644 index 0000000000000..a3dd17bf6e2b8 --- /dev/null +++ b/compiler-rt/test/fuzzer/noasan-bcmp.test @@ -0,0 +1,4 @@ +UNSUPPORTED: darwin, freebsd, windows +RUN: %cpp_compiler -fno-sanitize=address -DMEMCMP=bcmp %S/MemcmpTest.cpp -o %t +RUN: not %run %t -seed=1 -runs=10000000 2>&1 | FileCheck %s +CHECK: BINGO diff --git a/compiler-rt/test/fuzzer/noasan-memcmp.test b/compiler-rt/test/fuzzer/noasan-memcmp.test index c90755c53a900..c5ce2fff8c9fa 100644 --- a/compiler-rt/test/fuzzer/noasan-memcmp.test +++ b/compiler-rt/test/fuzzer/noasan-memcmp.test @@ -1,9 +1,9 @@ UNSUPPORTED: darwin, freebsd, windows -RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-memcmp %S/MemcmpTest.cpp -o %t-NoAsanMemcmpTest +RUN: %cpp_compiler -fno-sanitize=address %S/MemcmpTest.cpp -o %t-NoAsanMemcmpTest RUN: not %run %t-NoAsanMemcmpTest -seed=1 -runs=10000000 2>&1 | FileCheck %s -RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-calloc -fno-builtin-memcmp %S/CustomAllocator.cpp %S/MemcmpTest.cpp -o %t-NoAsanCustomAllocatorMemcmpTest +RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-calloc %S/CustomAllocator.cpp %S/MemcmpTest.cpp -o %t-NoAsanCustomAllocatorMemcmpTest RUN: not %run %t-NoAsanCustomAllocatorMemcmpTest -seed=1 -runs=10000000 2>&1 | FileCheck %s CHECK: BINGO diff --git a/compiler-rt/test/fuzzer/noasan-memcmp64.test b/compiler-rt/test/fuzzer/noasan-memcmp64.test index a6b8f88594d03..496ee386193e9 100644 --- a/compiler-rt/test/fuzzer/noasan-memcmp64.test +++ b/compiler-rt/test/fuzzer/noasan-memcmp64.test @@ -1,6 +1,6 @@ UNSUPPORTED: darwin, freebsd, windows -RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-memcmp %S/Memcmp64BytesTest.cpp -o %t-NoAsanMemcmp64BytesTest +RUN: %cpp_compiler -fno-sanitize=address %S/Memcmp64BytesTest.cpp -o %t-NoAsanMemcmp64BytesTest RUN: not %run %t-NoAsanMemcmp64BytesTest -seed=1 -runs=1000000 2>&1 | FileCheck %s CHECK: BINGO diff --git a/compiler-rt/test/fuzzer/noasan-strcmp.test b/compiler-rt/test/fuzzer/noasan-strcmp.test index 76b7c5de7c7bb..c264dec6bea2f 100644 --- a/compiler-rt/test/fuzzer/noasan-strcmp.test +++ b/compiler-rt/test/fuzzer/noasan-strcmp.test @@ -1,9 +1,9 @@ UNSUPPORTED: darwin, freebsd, windows -RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-strcmp %S/StrcmpTest.cpp -o %t-NoAsanStrcmpTest +RUN: %cpp_compiler -fno-sanitize=address %S/StrcmpTest.cpp -o %t-NoAsanStrcmpTest RUN: not %run %t-NoAsanStrcmpTest -seed=1 -runs=2000000 2>&1 | FileCheck %s -RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-calloc -fno-builtin-strcmp %S/CustomAllocator.cpp %S/StrcmpTest.cpp -o %t-NoAsanCustomAllocatorStrcmpTest +RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-calloc %S/CustomAllocator.cpp %S/StrcmpTest.cpp -o %t-NoAsanCustomAllocatorStrcmpTest RUN: not %run %t-NoAsanCustomAllocatorStrcmpTest -seed=1 -runs=2000000 2>&1 | FileCheck %s CHECK: BINGO diff --git a/compiler-rt/test/fuzzer/noasan-strncmp.test b/compiler-rt/test/fuzzer/noasan-strncmp.test index 705781ec39586..dd0f254609221 100644 --- a/compiler-rt/test/fuzzer/noasan-strncmp.test +++ b/compiler-rt/test/fuzzer/noasan-strncmp.test @@ -1,9 +1,9 @@ UNSUPPORTED: darwin, freebsd, windows -RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-strncmp %S/StrncmpTest.cpp -o %t-NoAsanStrncmpTest +RUN: %cpp_compiler -fno-sanitize=address %S/StrncmpTest.cpp -o %t-NoAsanStrncmpTest RUN: not %run 
%t-NoAsanStrncmpTest -seed=2 -runs=10000000 2>&1 | FileCheck %s -RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-calloc -fno-builtin-strncmp %S/CustomAllocator.cpp %S/StrncmpTest.cpp -o %t-NoAsanCustomAllocatorStrncmpTest +RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-calloc %S/CustomAllocator.cpp %S/StrncmpTest.cpp -o %t-NoAsanCustomAllocatorStrncmpTest RUN: not %run %t-NoAsanCustomAllocatorStrncmpTest -seed=2 -runs=10000000 2>&1 | FileCheck %s CHECK: BINGO diff --git a/compiler-rt/test/fuzzer/noasan-strstr.test b/compiler-rt/test/fuzzer/noasan-strstr.test index f06e903149bd9..e969170bfac98 100644 --- a/compiler-rt/test/fuzzer/noasan-strstr.test +++ b/compiler-rt/test/fuzzer/noasan-strstr.test @@ -1,9 +1,9 @@ UNSUPPORTED: darwin, freebsd, windows -RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-strstr %S/StrstrTest.cpp -o %t-NoAsanStrstrTest +RUN: %cpp_compiler -fno-sanitize=address %S/StrstrTest.cpp -o %t-NoAsanStrstrTest RUN: not %run %t-NoAsanStrstrTest -seed=1 -runs=2000000 2>&1 | FileCheck %s -RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-calloc -fno-builtin-strstr %S/CustomAllocator.cpp %S/StrstrTest.cpp -o %t-NoAsanCustomAllocatorStrstrTest +RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-calloc %S/CustomAllocator.cpp %S/StrstrTest.cpp -o %t-NoAsanCustomAllocatorStrstrTest RUN: not %run %t-NoAsanCustomAllocatorStrstrTest -seed=1 -runs=2000000 2>&1 | FileCheck %s CHECK: BINGO From 34ddf0b2b040918a6c946f589eeaf1d4fef95e7a Mon Sep 17 00:00:00 2001 From: Matt Morehouse Date: Mon, 27 Jul 2020 18:15:35 +0000 Subject: [PATCH 0229/1035] Replace fuzzer::FuzzerDriver's INTERFACE marking with new LLVMRunFuzzerDriver. This adds a new extern "C" function that serves the same purpose. This removes the need for external users to depend on internal headers in order to use this feature. It also standardizes the interface in a way that other fuzzing engines will be able to match. Patch By: IanPudney Reviewed By: kcc Differential Revision: https://reviews.llvm.org/D84561 --- compiler-rt/lib/fuzzer/FuzzerDriver.cpp | 6 +++++ llvm/docs/LibFuzzer.rst | 29 +++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/compiler-rt/lib/fuzzer/FuzzerDriver.cpp b/compiler-rt/lib/fuzzer/FuzzerDriver.cpp index 00a33a413d2f3..8339697396c21 100644 --- a/compiler-rt/lib/fuzzer/FuzzerDriver.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerDriver.cpp @@ -858,6 +858,12 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) { exit(0); // Don't let F destroy itself. } +extern "C" ATTRIBUTE_INTERFACE int +LLVMFuzzerRunDriver(int *argc, char ***argv, + int (*UserCb)(const uint8_t *Data, size_t Size)) { + return FuzzerDriver(argc, argv, UserCb); +} + // Storage for global ExternalFunctions object. ExternalFunctions *EF = nullptr; diff --git a/llvm/docs/LibFuzzer.rst b/llvm/docs/LibFuzzer.rst index 4e83955a05460..70a3f029c6f3e 100644 --- a/llvm/docs/LibFuzzer.rst +++ b/llvm/docs/LibFuzzer.rst @@ -617,6 +617,35 @@ really need to access ``argv``/``argc``. return 0; } +Using libFuzzer as a library +---------------------------- +If the code being fuzzed must provide its own `main`, it's possible to +invoke libFuzzer as a library. Be sure to pass ``-fsanitize=fuzzer-no-link`` +during compilation, and link your binary against the no-main version of +libFuzzer. On Linux installations, this is typically located at: + +.. 
code-block:: bash + + /usr/lib//lib/clang//lib/linux/libclang_rt.fuzzer_no_main-.a + +If building libFuzzer from source, this is located at the following path +in the build output directory: + +.. code-block:: bash + + lib/linux/libclang_rt.fuzzer_no_main-.a + +From here, the code can do whatever setup it requires, and when it's ready +to start fuzzing, it can call `LLVMFuzzerRunDriver`, passing in the program +arguments and a callback. This callback is invoked just like +`LLVMFuzzerTestOneInput`, and has the same signature. + +.. code-block:: c++ + + extern "C" int LLVMFuzzerRunDriver(int *argc, char ***argv, + int (*UserCb)(const uint8_t *Data, size_t Size)); + + Leaks ----- From 731043c0c494672efe1eeea9ee0f0c7788813dea Mon Sep 17 00:00:00 2001 From: Kirill Bobyrev Date: Mon, 27 Jul 2020 20:45:05 +0200 Subject: [PATCH 0230/1035] [clangd] Add more logs and attach tracers to remote index server routines Reviewers: kadircet Reviewed By: kadircet Subscribers: sammccall, ilya-biryukov, MaskRay, jkorous, arphaman, usaxena95, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D84499 --- .../clangd/index/remote/Client.cpp | 9 +- .../clangd/index/remote/server/Server.cpp | 89 ++++++++++++++++--- 2 files changed, 83 insertions(+), 15 deletions(-) diff --git a/clang-tools-extra/clangd/index/remote/Client.cpp b/clang-tools-extra/clangd/index/remote/Client.cpp index 35ce84068f406..5a33fd2eaf143 100644 --- a/clang-tools-extra/clangd/index/remote/Client.cpp +++ b/clang-tools-extra/clangd/index/remote/Client.cpp @@ -37,12 +37,15 @@ class IndexClient : public clangd::SymbolIndex { bool FinalResult = false; trace::Span Tracer(RequestT::descriptor()->name()); const auto RPCRequest = ProtobufMarshaller->toProtobuf(Request); + SPAN_ATTACH(Tracer, "Request", RPCRequest.DebugString()); grpc::ClientContext Context; std::chrono::system_clock::time_point Deadline = std::chrono::system_clock::now() + DeadlineWaitingTime; Context.set_deadline(Deadline); auto Reader = (Stub.get()->*RPCCall)(&Context, RPCRequest); ReplyT Reply; + unsigned Successful = 0; + unsigned FailedToParse = 0; while (Reader->Read(&Reply)) { if (!Reply.has_stream_result()) { FinalResult = Reply.final_result(); @@ -51,11 +54,15 @@ class IndexClient : public clangd::SymbolIndex { auto Response = ProtobufMarshaller->fromProtobuf(Reply.stream_result()); if (!Response) { elog("Received invalid {0}", ReplyT::descriptor()->name()); + ++FailedToParse; continue; } Callback(*Response); + ++Successful; } - SPAN_ATTACH(Tracer, "status", Reader->Finish().ok()); + SPAN_ATTACH(Tracer, "Status", Reader->Finish().ok()); + SPAN_ATTACH(Tracer, "Successful", Successful); + SPAN_ATTACH(Tracer, "Failed to parse", FailedToParse); return FinalResult; } diff --git a/clang-tools-extra/clangd/index/remote/server/Server.cpp b/clang-tools-extra/clangd/index/remote/server/Server.cpp index 7bf47a288e79c..364e4db6503c2 100644 --- a/clang-tools-extra/clangd/index/remote/server/Server.cpp +++ b/clang-tools-extra/clangd/index/remote/server/Server.cpp @@ -6,10 +6,13 @@ // //===----------------------------------------------------------------------===// +#include "Index.pb.h" #include "index/Index.h" #include "index/Serialization.h" +#include "index/Symbol.h" #include "index/remote/marshalling/Marshalling.h" #include "support/Logger.h" +#include "support/Trace.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Path.h" @@ -36,6 +39,16 @@ llvm::cl::opt IndexPath(llvm::cl::desc(""), llvm::cl::opt 
IndexRoot(llvm::cl::desc("<PROJECT ROOT>"), llvm::cl::Positional,
                                     llvm::cl::Required);

+llvm::cl::opt<std::string> TraceFile(
+    "trace-file",
+    llvm::cl::desc("Path to the file where tracer logs will be stored"));
+
+llvm::cl::opt<bool> PrettyPrint{
+    "pretty",
+    llvm::cl::desc("Pretty-print JSON output in the trace"),
+    llvm::cl::init(false),
+};
+
 llvm::cl::opt<std::string> ServerAddress(
     "server-address", llvm::cl::init("0.0.0.0:50051"),
     llvm::cl::desc("Address of the invoked server. Defaults to 0.0.0.0:50051"));
@@ -60,66 +73,90 @@ class RemoteIndexServer final : public SymbolIndex::Service {

   grpc::Status Lookup(grpc::ServerContext *Context,
                       const LookupRequest *Request,
                       grpc::ServerWriter<LookupReply> *Reply) override {
+    trace::Span Tracer(LookupRequest::descriptor()->name());
     auto Req = ProtobufMarshaller->fromProtobuf(Request);
     if (!Req) {
       elog("Can not parse LookupRequest from protobuf: {0}", Req.takeError());
       return grpc::Status::CANCELLED;
     }
-    Index->lookup(*Req, [&](const clangd::Symbol &Sym) {
-      auto SerializedSymbol = ProtobufMarshaller->toProtobuf(Sym);
-      if (!SerializedSymbol)
+    unsigned Sent = 0;
+    unsigned FailedToSend = 0;
+    Index->lookup(*Req, [&](const auto &Item) {
+      auto SerializedItem = ProtobufMarshaller->toProtobuf(Item);
+      if (!SerializedItem) {
+        ++FailedToSend;
         return;
+      }
       LookupReply NextMessage;
-      *NextMessage.mutable_stream_result() = *SerializedSymbol;
+      *NextMessage.mutable_stream_result() = *SerializedItem;
       Reply->Write(NextMessage);
+      ++Sent;
     });
     LookupReply LastMessage;
     LastMessage.set_final_result(true);
     Reply->Write(LastMessage);
+    SPAN_ATTACH(Tracer, "Sent", Sent);
+    SPAN_ATTACH(Tracer, "Failed to send", FailedToSend);
     return grpc::Status::OK;
   }

   grpc::Status FuzzyFind(grpc::ServerContext *Context,
                          const FuzzyFindRequest *Request,
                          grpc::ServerWriter<FuzzyFindReply> *Reply) override {
+    trace::Span Tracer(FuzzyFindRequest::descriptor()->name());
     auto Req = ProtobufMarshaller->fromProtobuf(Request);
     if (!Req) {
       elog("Can not parse FuzzyFindRequest from protobuf: {0}",
            Req.takeError());
       return grpc::Status::CANCELLED;
     }
-    bool HasMore = Index->fuzzyFind(*Req, [&](const clangd::Symbol &Sym) {
-      auto SerializedSymbol = ProtobufMarshaller->toProtobuf(Sym);
-      if (!SerializedSymbol)
+    unsigned Sent = 0;
+    unsigned FailedToSend = 0;
+    bool HasMore = Index->fuzzyFind(*Req, [&](const auto &Item) {
+      auto SerializedItem = ProtobufMarshaller->toProtobuf(Item);
+      if (!SerializedItem) {
+        ++FailedToSend;
         return;
+      }
       FuzzyFindReply NextMessage;
-      *NextMessage.mutable_stream_result() = *SerializedSymbol;
+      *NextMessage.mutable_stream_result() = *SerializedItem;
       Reply->Write(NextMessage);
+      ++Sent;
     });
     FuzzyFindReply LastMessage;
     LastMessage.set_final_result(HasMore);
     Reply->Write(LastMessage);
+    SPAN_ATTACH(Tracer, "Sent", Sent);
+    SPAN_ATTACH(Tracer, "Failed to send", FailedToSend);
     return grpc::Status::OK;
   }

   grpc::Status Refs(grpc::ServerContext *Context, const RefsRequest *Request,
                     grpc::ServerWriter<RefsReply> *Reply) override {
+    trace::Span Tracer(RefsRequest::descriptor()->name());
     auto Req = ProtobufMarshaller->fromProtobuf(Request);
     if (!Req) {
       elog("Can not parse RefsRequest from protobuf: {0}", Req.takeError());
       return grpc::Status::CANCELLED;
     }
-    bool HasMore = Index->refs(*Req, [&](const clangd::Ref &Reference) {
-      auto SerializedRef = ProtobufMarshaller->toProtobuf(Reference);
-      if (!SerializedRef)
+    unsigned Sent = 0;
+    unsigned FailedToSend = 0;
+    bool HasMore = Index->refs(*Req, [&](const auto &Item) {
+      auto SerializedItem = ProtobufMarshaller->toProtobuf(Item);
+      if (!SerializedItem) {
+        ++FailedToSend;
         return;
+      }
       RefsReply
-      *NextMessage.mutable_stream_result() = *SerializedRef;
+      *NextMessage.mutable_stream_result() = *SerializedItem;
      Reply->Write(NextMessage);
+      ++Sent;
    });
    RefsReply LastMessage;
    LastMessage.set_final_result(HasMore);
    Reply->Write(LastMessage);
+    SPAN_ATTACH(Tracer, "Sent", Sent);
+    SPAN_ATTACH(Tracer, "Failed to send", FailedToSend);
    return grpc::Status::OK;
  }
@@ -146,20 +183,44 @@ void runServer(std::unique_ptr Index,
} // namespace clangd
} // namespace clang

+using clang::clangd::elog;
+
int main(int argc, char *argv[]) {
  using namespace clang::clangd::remote;
  llvm::cl::ParseCommandLineOptions(argc, argv, Overview);
  llvm::sys::PrintStackTraceOnErrorSignal(argv[0]);

  if (!llvm::sys::path::is_absolute(IndexRoot)) {
-    llvm::errs() << "Index root should be an absolute path.\n";
+    elog("Index root should be an absolute path.");
    return -1;
  }

+  llvm::Optional TracerStream;
+  std::unique_ptr Tracer;
+  if (!TraceFile.empty()) {
+    std::error_code EC;
+    TracerStream.emplace(TraceFile, EC,
+                         llvm::sys::fs::FA_Read | llvm::sys::fs::FA_Write);
+    if (EC) {
+      TracerStream.reset();
+      elog("Error while opening trace file {0}: {1}", TraceFile, EC.message());
+    } else {
+      // FIXME(kirillbobyrev): Also create metrics tracer to track latency and
+      // accumulate other request statistics.
+      Tracer = clang::clangd::trace::createJSONTracer(*TracerStream,
+                                                      PrettyPrint);
+      clang::clangd::vlog("Successfully created a tracer.");
+    }
+  }
+
+  llvm::Optional TracingSession;
+  if (Tracer)
+    TracingSession.emplace(*Tracer);
+
  std::unique_ptr Index = openIndex(IndexPath);

  if (!Index) {
-    llvm::errs() << "Failed to open the index.\n";
+    elog("Failed to open the index.");
    return -1;
  }

From bef19abcf7e2807f8b48b00445b221ff20dd3923 Mon Sep 17 00:00:00 2001
From: Shinji Okumura
Date: Tue, 28 Jul 2020 03:50:43 +0900
Subject: [PATCH 0231/1035] [Attributor][NFC] Add tests to noalias.ll

Summary: Add tests to `noalias.ll` to make changes in D84665 clear

Reviewers: jdoerfert, sstefan1, uenoku, homerdin, baziotis

Subscribers: uenoku, kuter, bbn, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D84688
---
 llvm/test/Transforms/Attributor/noalias.ll | 131 +++++++++++++++++++++
 1 file changed, 131 insertions(+)

diff --git a/llvm/test/Transforms/Attributor/noalias.ll b/llvm/test/Transforms/Attributor/noalias.ll
index 71cf2263867cb..d17a98685d72c 100644
--- a/llvm/test/Transforms/Attributor/noalias.ll
+++ b/llvm/test/Transforms/Attributor/noalias.ll
@@ -786,3 +786,134 @@ define void @test16_caller(i32* %p, i32 %c) {
  tail call void @test16_sub(i32* %p, i32 %c, i32 %c)
  ret void
}
+
+; test 17
+;
+; only_store is not called after make_alias is called.
+; +; void test17_caller(int* p, int c) { +; if(c) { +; make_alias(p); +; if(0 == 0) { +; goto l3; +; } else { +; goto l2; +; } +; } +; l2: +; only_store(p); +; l3: +; return; +; } + +define void @test17_caller(i32* noalias %p, i32 %c) { +; NOT_CGSCC_NPM: Function Attrs: nofree nosync nounwind willreturn writeonly +; NOT_CGSCC_NPM-LABEL: define {{[^@]+}}@test17_caller +; NOT_CGSCC_NPM-SAME: (i32* noalias nofree writeonly [[P:%.*]], i32 [[C:%.*]]) +; NOT_CGSCC_NPM-NEXT: entry: +; NOT_CGSCC_NPM-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[C]], 0 +; NOT_CGSCC_NPM-NEXT: br i1 [[TOBOOL]], label [[L1:%.*]], label [[L2:%.*]] +; NOT_CGSCC_NPM: l1: +; NOT_CGSCC_NPM-NEXT: tail call void @make_alias(i32* nofree writeonly [[P]]) +; NOT_CGSCC_NPM-NEXT: br label [[L3:%.*]] +; NOT_CGSCC_NPM: l2: +; NOT_CGSCC_NPM-NEXT: tail call void @only_store(i32* nocapture nofree writeonly align 4 [[P]]) +; NOT_CGSCC_NPM-NEXT: br label [[L3]] +; NOT_CGSCC_NPM: l3: +; NOT_CGSCC_NPM-NEXT: ret void +; +; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind willreturn writeonly +; IS__CGSCC____-LABEL: define {{[^@]+}}@test17_caller +; IS__CGSCC____-SAME: (i32* noalias nofree writeonly [[P:%.*]], i32 [[C:%.*]]) +; IS__CGSCC____-NEXT: entry: +; IS__CGSCC____-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[C]], 0 +; IS__CGSCC____-NEXT: br i1 [[TOBOOL]], label [[L1:%.*]], label [[L2:%.*]] +; IS__CGSCC____: l1: +; IS__CGSCC____-NEXT: tail call void @make_alias(i32* nofree writeonly [[P]]) +; IS__CGSCC____-NEXT: br label [[L3:%.*]] +; IS__CGSCC____: l2: +; IS__CGSCC____-NEXT: tail call void @only_store(i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[P]]) +; IS__CGSCC____-NEXT: br label [[L3]] +; IS__CGSCC____: l3: +; IS__CGSCC____-NEXT: ret void +; +entry: + %tobool = icmp eq i32 %c, 0 + br i1 %tobool, label %l1, label %l2 + +l1: + tail call void @make_alias(i32* %p) + %tobool2 = icmp eq i32 0, 0 + br i1 %tobool2, label %l3, label %l2 + +l2: + tail call void @only_store(i32* %p) + br label %l3 + +l3: + ret void +} + +; test 18 +; void test18_caller(int* p, int c) { +; if(c) { +; make_alias(p); +; noreturn(); +; } +; only_store(p); +; return; +; } + +define void @noreturn() { +; NOT_CGSCC_NPM: Function Attrs: nofree noreturn nosync nounwind readnone willreturn +; NOT_CGSCC_NPM-LABEL: define {{[^@]+}}@noreturn() +; NOT_CGSCC_NPM-NEXT: unreachable +; +; IS__CGSCC____: Function Attrs: nofree norecurse noreturn nosync nounwind readnone willreturn +; IS__CGSCC____-LABEL: define {{[^@]+}}@noreturn() +; IS__CGSCC____-NEXT: unreachable +; + call void @noreturn() + ret void +} + +define void @test18_caller(i32* noalias %p, i32 %c) { +; NOT_CGSCC_NPM: Function Attrs: nofree nosync nounwind willreturn writeonly +; NOT_CGSCC_NPM-LABEL: define {{[^@]+}}@test18_caller +; NOT_CGSCC_NPM-SAME: (i32* noalias nofree writeonly [[P:%.*]], i32 [[C:%.*]]) +; NOT_CGSCC_NPM-NEXT: entry: +; NOT_CGSCC_NPM-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[C]], 0 +; NOT_CGSCC_NPM-NEXT: br i1 [[TOBOOL]], label [[L1:%.*]], label [[L2:%.*]] +; NOT_CGSCC_NPM: l1: +; NOT_CGSCC_NPM-NEXT: tail call void @make_alias(i32* nofree writeonly [[P]]) +; NOT_CGSCC_NPM-NEXT: unreachable +; NOT_CGSCC_NPM: l2: +; NOT_CGSCC_NPM-NEXT: tail call void @only_store(i32* nocapture nofree writeonly align 4 [[P]]) +; NOT_CGSCC_NPM-NEXT: ret void +; +; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind willreturn writeonly +; IS__CGSCC____-LABEL: define {{[^@]+}}@test18_caller +; IS__CGSCC____-SAME: (i32* noalias nofree nonnull writeonly align 4 dereferenceable(4) 
[[P:%.*]], i32 [[C:%.*]])
+; IS__CGSCC____-NEXT:  entry:
+; IS__CGSCC____-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[C]], 0
+; IS__CGSCC____-NEXT:    br i1 [[TOBOOL]], label [[L1:%.*]], label [[L2:%.*]]
+; IS__CGSCC____:       l1:
+; IS__CGSCC____-NEXT:    tail call void @make_alias(i32* nofree nonnull writeonly align 4 dereferenceable(4) [[P]])
+; IS__CGSCC____-NEXT:    unreachable
+; IS__CGSCC____:       l2:
+; IS__CGSCC____-NEXT:    tail call void @only_store(i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[P]])
+; IS__CGSCC____-NEXT:    ret void
+;
+entry:
+  %tobool = icmp eq i32 %c, 0
+  br i1 %tobool, label %l1, label %l2
+
+l1:
+  tail call void @make_alias(i32* %p)
+  tail call void @noreturn()
+  br label %l2
+
+l2:
+  tail call void @only_store(i32* %p)
+  ret void
+}

From 05ad8e942996f36cc694478542ccd84aa5bbb80f Mon Sep 17 00:00:00 2001
From: Xiangling Liao
Date: Tue, 2 Jun 2020 11:51:58 -0400
Subject: [PATCH 0232/1035] [AIX] Implement AIX special alignment rule about
 double/long double

Implement the AIX default `power` alignment rule by adding
`PreferredAlignment` and `PreferredNVAlignment` to the ASTRecordLayout class.
The patch aims to return the correct values for `__alignof(x)` and
`alignof(x)` under the `power` alignment rules.

Differential Revision: https://reviews.llvm.org/D79719
---
 clang/include/clang/AST/RecordLayout.h        |  61 ++-
 clang/include/clang/Basic/TargetInfo.h        |   3 +
 clang/lib/AST/ASTContext.cpp                  |  25 +-
 clang/lib/AST/RecordLayout.cpp                |  41 +-
 clang/lib/AST/RecordLayoutBuilder.cpp         | 308 +++++++++----
 clang/lib/Basic/Targets/OSTargets.h           |   2 +
 clang/lib/Basic/Targets/PPC.h                 |  11 +-
 .../aix-Wpacked-expecting-diagnostics.cpp     |  30 ++
 .../Layout/aix-Wpacked-no-diagnostics.cpp     |  31 ++
 .../test/Layout/aix-double-struct-member.cpp  | 428 ++++++++++++++++++
 .../aix-no-unique-address-with-double.cpp     | 158 +++++++
 clang/test/Layout/aix-pack-attr-on-base.cpp   |  20 +
 .../Layout/aix-power-alignment-typedef-2.cpp  |  15 +
 .../Layout/aix-power-alignment-typedef.cpp    |  39 ++
 ...-virtual-function-and-base-with-double.cpp | 112 +++++
 15 files changed, 1148 insertions(+), 136 deletions(-)
 create mode 100644 clang/test/Layout/aix-Wpacked-expecting-diagnostics.cpp
 create mode 100644 clang/test/Layout/aix-Wpacked-no-diagnostics.cpp
 create mode 100644 clang/test/Layout/aix-double-struct-member.cpp
 create mode 100644 clang/test/Layout/aix-no-unique-address-with-double.cpp
 create mode 100644 clang/test/Layout/aix-pack-attr-on-base.cpp
 create mode 100644 clang/test/Layout/aix-power-alignment-typedef-2.cpp
 create mode 100644 clang/test/Layout/aix-power-alignment-typedef.cpp
 create mode 100644 clang/test/Layout/aix-virtual-function-and-base-with-double.cpp

diff --git a/clang/include/clang/AST/RecordLayout.h b/clang/include/clang/AST/RecordLayout.h
index b259791af509d..946fbd8f4ce24 100644
--- a/clang/include/clang/AST/RecordLayout.h
+++ b/clang/include/clang/AST/RecordLayout.h
@@ -70,6 +70,11 @@ class ASTRecordLayout {
  // Alignment - Alignment of record in characters.
  CharUnits Alignment;

+  // PreferredAlignment - Preferred alignment of record in characters. This
+  // can be different from Alignment in cases where it is beneficial for
+  // performance or for preserving backwards compatibility (e.g. the AIX ABI).
+  CharUnits PreferredAlignment;
+
  // UnadjustedAlignment - Maximum of the alignments of the record members in
  // characters.
  CharUnits UnadjustedAlignment;
@@ -91,6 +96,11 @@ class ASTRecordLayout {
  /// which is the alignment of the object without virtual bases.
  CharUnits NonVirtualAlignment;

+  /// PreferredNVAlignment - The preferred non-virtual alignment (in chars) of
+  /// an object, which is the preferred alignment of the object without
+  /// virtual bases.
+  CharUnits PreferredNVAlignment;
+
  /// SizeOfLargestEmptySubobject - The size of the largest empty subobject
  /// (either a base or a member). Will be zero if the class doesn't contain
  /// any empty subobjects.
@@ -139,30 +149,26 @@ class ASTRecordLayout {
  CXXRecordLayoutInfo *CXXInfo = nullptr;

  ASTRecordLayout(const ASTContext &Ctx, CharUnits size, CharUnits alignment,
-                  CharUnits unadjustedAlignment,
+                  CharUnits preferredAlignment, CharUnits unadjustedAlignment,
                  CharUnits requiredAlignment, CharUnits datasize,
                  ArrayRef fieldoffsets);

  using BaseOffsetsMapTy = CXXRecordLayoutInfo::BaseOffsetsMapTy;

  // Constructor for C++ records.
-  ASTRecordLayout(const ASTContext &Ctx,
-                  CharUnits size, CharUnits alignment,
-                  CharUnits unadjustedAlignment,
-                  CharUnits requiredAlignment,
-                  bool hasOwnVFPtr, bool hasExtendableVFPtr,
-                  CharUnits vbptroffset,
-                  CharUnits datasize,
-                  ArrayRef fieldoffsets,
+  ASTRecordLayout(const ASTContext &Ctx, CharUnits size, CharUnits alignment,
+                  CharUnits preferredAlignment, CharUnits unadjustedAlignment,
+                  CharUnits requiredAlignment, bool hasOwnVFPtr,
+                  bool hasExtendableVFPtr, CharUnits vbptroffset,
+                  CharUnits datasize, ArrayRef fieldoffsets,
                  CharUnits nonvirtualsize, CharUnits nonvirtualalignment,
+                  CharUnits preferrednvalignment,
                  CharUnits SizeOfLargestEmptySubobject,
-                  const CXXRecordDecl *PrimaryBase,
-                  bool IsPrimaryBaseVirtual,
+                  const CXXRecordDecl *PrimaryBase, bool IsPrimaryBaseVirtual,
                  const CXXRecordDecl *BaseSharingVBPtr,
-                  bool EndsWithZeroSizedObject,
-                  bool LeadsWithZeroSizedBase,
-                  const BaseOffsetsMapTy& BaseOffsets,
-                  const VBaseOffsetsMapTy& VBaseOffsets);
+                  bool EndsWithZeroSizedObject, bool LeadsWithZeroSizedBase,
+                  const BaseOffsetsMapTy &BaseOffsets,
+                  const VBaseOffsetsMapTy &VBaseOffsets);

  ~ASTRecordLayout() = default;
@@ -175,6 +181,10 @@ class ASTRecordLayout {
  /// getAlignment - Get the record alignment in characters.
  CharUnits getAlignment() const { return Alignment; }

+  /// getPreferredAlignment - Get the record's preferred alignment in
+  /// characters.
+  CharUnits getPreferredAlignment() const { return PreferredAlignment; }
+
  /// getUnadjustedAlignment - Get the record alignment in characters, before
  /// alignment adjustment.
  CharUnits getUnadjustedAlignment() const { return UnadjustedAlignment; }
@@ -193,9 +203,7 @@ class ASTRecordLayout {
  /// getDataSize() - Get the record data size, which is the record size
  /// without tail padding, in characters.
-  CharUnits getDataSize() const {
-    return DataSize;
-  }
+  CharUnits getDataSize() const { return DataSize; }

  /// getNonVirtualSize - Get the non-virtual size (in chars) of an object,
  /// which is the size of the object without virtual bases.
@@ -205,14 +213,23 @@ class ASTRecordLayout {
    return CXXInfo->NonVirtualSize;
  }

-  /// getNonVirtualSize - Get the non-virtual alignment (in chars) of an object,
-  /// which is the alignment of the object without virtual bases.
+  /// getNonVirtualAlignment - Get the non-virtual alignment (in chars) of an
+  /// object, which is the alignment of the object without virtual bases.
  CharUnits getNonVirtualAlignment() const {
    assert(CXXInfo && "Record layout does not have C++ specific info!");

    return CXXInfo->NonVirtualAlignment;
  }

+  /// getPreferredNVAlignment - Get the preferred non-virtual alignment (in
+  /// chars) of an object, which is the preferred alignment of the object
+  /// without virtual bases.
+  CharUnits getPreferredNVAlignment() const {
+    assert(CXXInfo && "Record layout does not have C++ specific info!");
+
+    return CXXInfo->PreferredNVAlignment;
+  }
+
  /// getPrimaryBase - Get the primary base for this record.
  const CXXRecordDecl *getPrimaryBase() const {
    assert(CXXInfo && "Record layout does not have C++ specific info!");
@@ -287,9 +304,7 @@ class ASTRecordLayout {
    return !CXXInfo->VBPtrOffset.isNegative();
  }

-  CharUnits getRequiredAlignment() const {
-    return RequiredAlignment;
-  }
+  CharUnits getRequiredAlignment() const { return RequiredAlignment; }

  bool endsWithZeroSizedObject() const {
    return CXXInfo && CXXInfo->EndsWithZeroSizedObject;

diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 2ee3b16596302..004990ee31528 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -1401,6 +1401,9 @@ class TargetInfo : public virtual TransferrableTargetInfo,
  /// Whether target allows to overalign ABI-specified preferred alignment
  virtual bool allowsLargerPreferedTypeAlignment() const { return true; }

+  /// Whether target defaults to the `power` alignment rules of AIX.
+  virtual bool defaultsToAIXPowerAlignment() const { return false; }
+
  /// Set supported OpenCL extensions and optional core features.
  virtual void setSupportedOpenCLOpts() {}

diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index fc7631712c3cf..e7518a538fe67 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -2439,8 +2439,8 @@ CharUnits ASTContext::getTypeUnadjustedAlignInChars(const Type *T) const {
 /// getPreferredTypeAlign - Return the "preferred" alignment of the specified
 /// type for the current target in bits. This can be different than the ABI
-/// alignment in cases where it is beneficial for performance to overalign
-/// a data type.
+/// alignment in cases where it is beneficial for performance, or for
+/// preserving backwards compatibility, to overalign a data type.
 unsigned ASTContext::getPreferredTypeAlign(const Type *T) const {
  TypeInfo TI = getTypeInfo(T);
  unsigned ABIAlign = TI.Align;
@@ -2450,18 +2450,33 @@ unsigned ASTContext::getPreferredTypeAlign(const Type *T) const {
  // The preferred alignment of member pointers is that of a pointer.
  if (T->isMemberPointerType())
    return getPreferredTypeAlign(getPointerDiffType().getTypePtr());
-
+
  if (!Target->allowsLargerPreferedTypeAlignment())
    return ABIAlign;

-  // Double and long long should be naturally aligned if possible.
+  if (const auto *RT = T->getAs()) {
+    if (TI.AlignIsRequired)
+      return ABIAlign;
+
+    unsigned PreferredAlign = static_cast(
+        toBits(getASTRecordLayout(RT->getDecl()).PreferredAlignment));
+    assert(PreferredAlign >= ABIAlign &&
+           "PreferredAlign should be at least as large as ABIAlign.");
+    return PreferredAlign;
+  }
+
+  // Double (and, for targets supporting AIX `power` alignment, long double) and
+  // long long should be naturally aligned (despite requiring less alignment) if
+  // possible.
if (const auto *CT = T->getAs()) T = CT->getElementType().getTypePtr(); if (const auto *ET = T->getAs()) T = ET->getDecl()->getIntegerType().getTypePtr(); if (T->isSpecificBuiltinType(BuiltinType::Double) || T->isSpecificBuiltinType(BuiltinType::LongLong) || - T->isSpecificBuiltinType(BuiltinType::ULongLong)) + T->isSpecificBuiltinType(BuiltinType::ULongLong) || + (T->isSpecificBuiltinType(BuiltinType::LongDouble) && + Target->defaultsToAIXPowerAlignment())) // Don't increase the alignment if an alignment attribute was specified on a // typedef declaration. if (!TI.AlignIsRequired) diff --git a/clang/lib/AST/RecordLayout.cpp b/clang/lib/AST/RecordLayout.cpp index e7b500e1902d7..8f70a20729262 100644 --- a/clang/lib/AST/RecordLayout.cpp +++ b/clang/lib/AST/RecordLayout.cpp @@ -29,45 +29,42 @@ void ASTRecordLayout::Destroy(ASTContext &Ctx) { ASTRecordLayout::ASTRecordLayout(const ASTContext &Ctx, CharUnits size, CharUnits alignment, + CharUnits preferredAlignment, CharUnits unadjustedAlignment, CharUnits requiredAlignment, CharUnits datasize, ArrayRef fieldoffsets) : Size(size), DataSize(datasize), Alignment(alignment), + PreferredAlignment(preferredAlignment), UnadjustedAlignment(unadjustedAlignment), RequiredAlignment(requiredAlignment) { FieldOffsets.append(Ctx, fieldoffsets.begin(), fieldoffsets.end()); } // Constructor for C++ records. -ASTRecordLayout::ASTRecordLayout(const ASTContext &Ctx, - CharUnits size, CharUnits alignment, - CharUnits unadjustedAlignment, - CharUnits requiredAlignment, - bool hasOwnVFPtr, bool hasExtendableVFPtr, - CharUnits vbptroffset, - CharUnits datasize, - ArrayRef fieldoffsets, - CharUnits nonvirtualsize, - CharUnits nonvirtualalignment, - CharUnits SizeOfLargestEmptySubobject, - const CXXRecordDecl *PrimaryBase, - bool IsPrimaryBaseVirtual, - const CXXRecordDecl *BaseSharingVBPtr, - bool EndsWithZeroSizedObject, - bool LeadsWithZeroSizedBase, - const BaseOffsetsMapTy& BaseOffsets, - const VBaseOffsetsMapTy& VBaseOffsets) - : Size(size), DataSize(datasize), Alignment(alignment), - UnadjustedAlignment(unadjustedAlignment), - RequiredAlignment(requiredAlignment), CXXInfo(new (Ctx) CXXRecordLayoutInfo) -{ +ASTRecordLayout::ASTRecordLayout( + const ASTContext &Ctx, CharUnits size, CharUnits alignment, + CharUnits preferredAlignment, CharUnits unadjustedAlignment, + CharUnits requiredAlignment, bool hasOwnVFPtr, bool hasExtendableVFPtr, + CharUnits vbptroffset, CharUnits datasize, ArrayRef fieldoffsets, + CharUnits nonvirtualsize, CharUnits nonvirtualalignment, + CharUnits preferrednvalignment, CharUnits SizeOfLargestEmptySubobject, + const CXXRecordDecl *PrimaryBase, bool IsPrimaryBaseVirtual, + const CXXRecordDecl *BaseSharingVBPtr, bool EndsWithZeroSizedObject, + bool LeadsWithZeroSizedBase, const BaseOffsetsMapTy &BaseOffsets, + const VBaseOffsetsMapTy &VBaseOffsets) + : Size(size), DataSize(datasize), Alignment(alignment), + PreferredAlignment(preferredAlignment), + UnadjustedAlignment(unadjustedAlignment), + RequiredAlignment(requiredAlignment), + CXXInfo(new (Ctx) CXXRecordLayoutInfo) { FieldOffsets.append(Ctx, fieldoffsets.begin(), fieldoffsets.end()); CXXInfo->PrimaryBase.setPointer(PrimaryBase); CXXInfo->PrimaryBase.setInt(IsPrimaryBaseVirtual); CXXInfo->NonVirtualSize = nonvirtualsize; CXXInfo->NonVirtualAlignment = nonvirtualalignment; + CXXInfo->PreferredNVAlignment = preferrednvalignment; CXXInfo->SizeOfLargestEmptySubobject = SizeOfLargestEmptySubobject; CXXInfo->BaseOffsets = BaseOffsets; CXXInfo->VBaseOffsets = VBaseOffsets; diff --git 
a/clang/lib/AST/RecordLayoutBuilder.cpp b/clang/lib/AST/RecordLayoutBuilder.cpp index d56c7e2ab8c0e..0afe91b446ee4 100644 --- a/clang/lib/AST/RecordLayoutBuilder.cpp +++ b/clang/lib/AST/RecordLayoutBuilder.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -#include "clang/AST/RecordLayout.h" #include "clang/AST/ASTContext.h" #include "clang/AST/ASTDiagnostic.h" #include "clang/AST/Attr.h" @@ -16,6 +15,7 @@ #include "clang/AST/DeclObjC.h" #include "clang/AST/Expr.h" #include "clang/AST/VTableBuilder.h" +#include "clang/AST/RecordLayout.h" #include "clang/Basic/TargetInfo.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Support/Format.h" @@ -589,6 +589,9 @@ class ItaniumRecordLayoutBuilder { /// Alignment - The current alignment of the record layout. CharUnits Alignment; + /// PreferredAlignment - The preferred alignment of the record layout. + CharUnits PreferredAlignment; + /// The alignment if attribute packed is not used. CharUnits UnpackedAlignment; @@ -632,6 +635,7 @@ class ItaniumRecordLayoutBuilder { CharUnits NonVirtualSize; CharUnits NonVirtualAlignment; + CharUnits PreferredNVAlignment; /// If we've laid out a field but not included its tail padding in Size yet, /// this is the size up to the end of that field. @@ -652,6 +656,12 @@ class ItaniumRecordLayoutBuilder { /// the flag of field offset changing due to packed attribute. bool HasPackedField; + /// HandledFirstNonOverlappingEmptyField - An auxiliary field used for AIX. + /// When there are OverlappingEmptyFields existing in the aggregate, the + /// flag shows if the following first non-empty or empty-but-non-overlapping + /// field has been handled, if any. + bool HandledFirstNonOverlappingEmptyField; + typedef llvm::DenseMap BaseOffsetsMapTy; /// Bases - base classes and their offsets in the record. @@ -678,17 +688,19 @@ class ItaniumRecordLayoutBuilder { ItaniumRecordLayoutBuilder(const ASTContext &Context, EmptySubobjectMap *EmptySubobjects) : Context(Context), EmptySubobjects(EmptySubobjects), Size(0), - Alignment(CharUnits::One()), UnpackedAlignment(CharUnits::One()), - UnadjustedAlignment(CharUnits::One()), - UseExternalLayout(false), InferAlignment(false), Packed(false), - IsUnion(false), IsMac68kAlign(false), IsMsStruct(false), - UnfilledBitsInLastUnit(0), LastBitfieldTypeSize(0), - MaxFieldAlignment(CharUnits::Zero()), DataSize(0), - NonVirtualSize(CharUnits::Zero()), + Alignment(CharUnits::One()), PreferredAlignment(CharUnits::One()), + UnpackedAlignment(CharUnits::One()), + UnadjustedAlignment(CharUnits::One()), UseExternalLayout(false), + InferAlignment(false), Packed(false), IsUnion(false), + IsMac68kAlign(false), IsMsStruct(false), UnfilledBitsInLastUnit(0), + LastBitfieldTypeSize(0), MaxFieldAlignment(CharUnits::Zero()), + DataSize(0), NonVirtualSize(CharUnits::Zero()), NonVirtualAlignment(CharUnits::One()), + PreferredNVAlignment(CharUnits::One()), PaddedFieldSize(CharUnits::Zero()), PrimaryBase(nullptr), - PrimaryBaseIsVirtual(false), HasOwnVFPtr(false), - HasPackedField(false), FirstNearlyEmptyVBase(nullptr) {} + PrimaryBaseIsVirtual(false), HasOwnVFPtr(false), HasPackedField(false), + HandledFirstNonOverlappingEmptyField(false), + FirstNearlyEmptyVBase(nullptr) {} void Layout(const RecordDecl *D); void Layout(const CXXRecordDecl *D); @@ -763,9 +775,13 @@ class ItaniumRecordLayoutBuilder { /// alignment. 
void FinishLayout(const NamedDecl *D); - void UpdateAlignment(CharUnits NewAlignment, CharUnits UnpackedNewAlignment); + void UpdateAlignment(CharUnits NewAlignment, CharUnits UnpackedNewAlignment, + CharUnits PreferredAlignment); + void UpdateAlignment(CharUnits NewAlignment, CharUnits UnpackedNewAlignment) { + UpdateAlignment(NewAlignment, UnpackedNewAlignment, NewAlignment); + } void UpdateAlignment(CharUnits NewAlignment) { - UpdateAlignment(NewAlignment, NewAlignment); + UpdateAlignment(NewAlignment, NewAlignment, NewAlignment); } /// Retrieve the externally-supplied field offset for the given @@ -998,7 +1014,7 @@ void ItaniumRecordLayoutBuilder::EnsureVTablePointerAlignment( setSize(getSize().alignTo(BaseAlign)); // Update the alignment. - UpdateAlignment(BaseAlign, UnpackedBaseAlign); + UpdateAlignment(BaseAlign, UnpackedBaseAlign, BaseAlign); } void ItaniumRecordLayoutBuilder::LayoutNonVirtualBases( @@ -1044,6 +1060,10 @@ void ItaniumRecordLayoutBuilder::LayoutNonVirtualBases( Context.toCharUnitsFromBits(Context.getTargetInfo().getPointerAlign(0)); EnsureVTablePointerAlignment(PtrAlign); HasOwnVFPtr = true; + + assert(!IsUnion && "Unions cannot be dynamic classes."); + HandledFirstNonOverlappingEmptyField = true; + setSize(getSize() + PtrWidth); setDataSize(getSize()); } @@ -1179,9 +1199,9 @@ void ItaniumRecordLayoutBuilder::LayoutVirtualBase( CharUnits ItaniumRecordLayoutBuilder::LayoutBase(const BaseSubobjectInfo *Base) { - const ASTRecordLayout &Layout = Context.getASTRecordLayout(Base->Class); - + assert(!IsUnion && "Unions cannot have base classes."); + const ASTRecordLayout &Layout = Context.getASTRecordLayout(Base->Class); CharUnits Offset; // Query the external layout to see if it provides an offset. @@ -1193,45 +1213,77 @@ ItaniumRecordLayoutBuilder::LayoutBase(const BaseSubobjectInfo *Base) { HasExternalLayout = External.getExternalNVBaseOffset(Base->Class, Offset); } - // Clang <= 6 incorrectly applied the 'packed' attribute to base classes. - // Per GCC's documentation, it only applies to non-static data members. + auto getBaseOrPreferredBaseAlignFromUnpacked = [&](CharUnits UnpackedAlign) { + // Clang <= 6 incorrectly applied the 'packed' attribute to base classes. + // Per GCC's documentation, it only applies to non-static data members. + return (Packed && ((Context.getLangOpts().getClangABICompat() <= + LangOptions::ClangABI::Ver6) || + Context.getTargetInfo().getTriple().isPS4() || + Context.getTargetInfo().getTriple().isOSAIX())) + ? CharUnits::One() + : UnpackedAlign; + }; + CharUnits UnpackedBaseAlign = Layout.getNonVirtualAlignment(); + CharUnits UnpackedPreferredBaseAlign = Layout.getPreferredNVAlignment(); CharUnits BaseAlign = - (Packed && ((Context.getLangOpts().getClangABICompat() <= - LangOptions::ClangABI::Ver6) || - Context.getTargetInfo().getTriple().isPS4())) - ? CharUnits::One() - : UnpackedBaseAlign; + getBaseOrPreferredBaseAlignFromUnpacked(UnpackedBaseAlign); + CharUnits PreferredBaseAlign = + getBaseOrPreferredBaseAlignFromUnpacked(UnpackedPreferredBaseAlign); + + const bool DefaultsToAIXPowerAlignment = + Context.getTargetInfo().defaultsToAIXPowerAlignment(); + if (DefaultsToAIXPowerAlignment) { + // AIX `power` alignment does not apply the preferred alignment for + // non-union classes if the source of the alignment (the current base in + // this context) follows introduction of the first subobject with + // exclusively allocated space or zero-extent array. 
+ if (!Base->Class->isEmpty() && !HandledFirstNonOverlappingEmptyField) { + // By handling a base class that is not empty, we're handling the + // "first (inherited) member". + HandledFirstNonOverlappingEmptyField = true; + } else { + UnpackedPreferredBaseAlign = UnpackedBaseAlign; + PreferredBaseAlign = BaseAlign; + } + } + CharUnits UnpackedAlignTo = !DefaultsToAIXPowerAlignment + ? UnpackedBaseAlign + : UnpackedPreferredBaseAlign; // If we have an empty base class, try to place it at offset 0. if (Base->Class->isEmpty() && (!HasExternalLayout || Offset == CharUnits::Zero()) && EmptySubobjects->CanPlaceBaseAtOffset(Base, CharUnits::Zero())) { setSize(std::max(getSize(), Layout.getSize())); - UpdateAlignment(BaseAlign, UnpackedBaseAlign); + UpdateAlignment(BaseAlign, UnpackedAlignTo, PreferredBaseAlign); return CharUnits::Zero(); } - // The maximum field alignment overrides base align. + // The maximum field alignment overrides the base align/(AIX-only) preferred + // base align. if (!MaxFieldAlignment.isZero()) { BaseAlign = std::min(BaseAlign, MaxFieldAlignment); - UnpackedBaseAlign = std::min(UnpackedBaseAlign, MaxFieldAlignment); + PreferredBaseAlign = std::min(PreferredBaseAlign, MaxFieldAlignment); + UnpackedAlignTo = std::min(UnpackedAlignTo, MaxFieldAlignment); } + CharUnits AlignTo = + !DefaultsToAIXPowerAlignment ? BaseAlign : PreferredBaseAlign; if (!HasExternalLayout) { // Round up the current record size to the base's alignment boundary. - Offset = getDataSize().alignTo(BaseAlign); + Offset = getDataSize().alignTo(AlignTo); // Try to place the base. while (!EmptySubobjects->CanPlaceBaseAtOffset(Base, Offset)) - Offset += BaseAlign; + Offset += AlignTo; } else { bool Allowed = EmptySubobjects->CanPlaceBaseAtOffset(Base, Offset); (void)Allowed; assert(Allowed && "Base subobject externally placed at overlapping offset"); - if (InferAlignment && Offset < getDataSize().alignTo(BaseAlign)) { + if (InferAlignment && Offset < getDataSize().alignTo(AlignTo)) { // The externally-supplied base offset is before the base offset we // computed. Assume that the structure is packed. Alignment = CharUnits::One(); @@ -1248,7 +1300,7 @@ ItaniumRecordLayoutBuilder::LayoutBase(const BaseSubobjectInfo *Base) { setSize(std::max(getSize(), Offset + Layout.getSize())); // Remember max struct/class alignment. - UpdateAlignment(BaseAlign, UnpackedBaseAlign); + UpdateAlignment(BaseAlign, UnpackedAlignTo, PreferredBaseAlign); return Offset; } @@ -1260,6 +1312,8 @@ void ItaniumRecordLayoutBuilder::InitializeLayout(const Decl *D) { } Packed = D->hasAttr(); + HandledFirstNonOverlappingEmptyField = + !Context.getTargetInfo().defaultsToAIXPowerAlignment(); // Honor the default struct packing maximum alignment flag. if (unsigned DefaultMaxFieldAlignment = Context.getLangOpts().PackStruct) { @@ -1274,6 +1328,7 @@ void ItaniumRecordLayoutBuilder::InitializeLayout(const Decl *D) { IsMac68kAlign = true; MaxFieldAlignment = CharUnits::fromQuantity(2); Alignment = CharUnits::fromQuantity(2); + PreferredAlignment = CharUnits::fromQuantity(2); } else { if (const MaxFieldAlignmentAttr *MFAA = D->getAttr()) MaxFieldAlignment = Context.toCharUnitsFromBits(MFAA->getAlignment()); @@ -1293,6 +1348,7 @@ void ItaniumRecordLayoutBuilder::InitializeLayout(const Decl *D) { if (UseExternalLayout) { if (External.Align > 0) { Alignment = Context.toCharUnitsFromBits(External.Align); + PreferredAlignment = Context.toCharUnitsFromBits(External.Align); } else { // The external source didn't have alignment information; infer it. 
InferAlignment = true; @@ -1321,6 +1377,7 @@ void ItaniumRecordLayoutBuilder::Layout(const CXXRecordDecl *RD) { NonVirtualSize = Context.toCharUnitsFromBits( llvm::alignTo(getSizeInBits(), Context.getTargetInfo().getCharAlign())); NonVirtualAlignment = Alignment; + PreferredNVAlignment = PreferredAlignment; // Lay out the virtual bases and add the primary virtual base offsets. LayoutVirtualBases(RD, RD); @@ -1733,25 +1790,46 @@ void ItaniumRecordLayoutBuilder::LayoutBitField(const FieldDecl *D) { void ItaniumRecordLayoutBuilder::LayoutField(const FieldDecl *D, bool InsertExtraPadding) { + auto *FieldClass = D->getType()->getAsCXXRecordDecl(); + bool PotentiallyOverlapping = D->hasAttr() && FieldClass; + bool IsOverlappingEmptyField = + PotentiallyOverlapping && FieldClass->isEmpty(); + + CharUnits FieldOffset = + (IsUnion || IsOverlappingEmptyField) ? CharUnits::Zero() : getDataSize(); + + const bool DefaultsToAIXPowerAlignment = + Context.getTargetInfo().defaultsToAIXPowerAlignment(); + bool FoundFirstNonOverlappingEmptyFieldForAIX = false; + if (DefaultsToAIXPowerAlignment && !HandledFirstNonOverlappingEmptyField) { + assert(FieldOffset == CharUnits::Zero() && + "The first non-overlapping empty field should have been handled."); + + if (!IsOverlappingEmptyField) { + FoundFirstNonOverlappingEmptyFieldForAIX = true; + + // We're going to handle the "first member" based on + // `FoundFirstNonOverlappingEmptyFieldForAIX` during the current + // invocation of this function; record it as handled for future + // invocations (except for unions, because the current field does not + // represent all "firsts"). + HandledFirstNonOverlappingEmptyField = !IsUnion; + } + } + if (D->isBitField()) { LayoutBitField(D); return; } uint64_t UnpaddedFieldOffset = getDataSizeInBits() - UnfilledBitsInLastUnit; - // Reset the unfilled bits. UnfilledBitsInLastUnit = 0; LastBitfieldTypeSize = 0; - auto *FieldClass = D->getType()->getAsCXXRecordDecl(); - bool PotentiallyOverlapping = D->hasAttr() && FieldClass; - bool IsOverlappingEmptyField = PotentiallyOverlapping && FieldClass->isEmpty(); bool FieldPacked = Packed || D->hasAttr(); - CharUnits FieldOffset = (IsUnion || IsOverlappingEmptyField) - ? CharUnits::Zero() - : getDataSize(); + bool AlignIsRequired = false; CharUnits FieldSize; CharUnits FieldAlign; // The amount of this class's dsize occupied by the field. @@ -1759,25 +1837,27 @@ void ItaniumRecordLayoutBuilder::LayoutField(const FieldDecl *D, // into the field's tail padding. CharUnits EffectiveFieldSize; + auto setDeclInfo = [&](bool IsIncompleteArrayType) { + TypeInfo TI = Context.getTypeInfo(D->getType()); + FieldAlign = Context.toCharUnitsFromBits(TI.Align); + // Flexible array members don't have any size, but they have to be + // aligned appropriately for their element type. + EffectiveFieldSize = FieldSize = + IsIncompleteArrayType ? CharUnits::Zero() + : Context.toCharUnitsFromBits(TI.Width); + AlignIsRequired = TI.AlignIsRequired; + }; + if (D->getType()->isIncompleteArrayType()) { - // This is a flexible array member; we can't directly - // query getTypeInfo about these, so we figure it out here. - // Flexible array members don't have any size, but they - // have to be aligned appropriately for their element type. 
- EffectiveFieldSize = FieldSize = CharUnits::Zero(); - const ArrayType* ATy = Context.getAsArrayType(D->getType()); - FieldAlign = Context.getTypeAlignInChars(ATy->getElementType()); + setDeclInfo(true /* IsIncompleteArrayType */); } else if (const ReferenceType *RT = D->getType()->getAs()) { unsigned AS = Context.getTargetAddressSpace(RT->getPointeeType()); - EffectiveFieldSize = FieldSize = - Context.toCharUnitsFromBits(Context.getTargetInfo().getPointerWidth(AS)); - FieldAlign = - Context.toCharUnitsFromBits(Context.getTargetInfo().getPointerAlign(AS)); + EffectiveFieldSize = FieldSize = Context.toCharUnitsFromBits( + Context.getTargetInfo().getPointerWidth(AS)); + FieldAlign = Context.toCharUnitsFromBits( + Context.getTargetInfo().getPointerAlign(AS)); } else { - std::pair FieldInfo = - Context.getTypeInfoInChars(D->getType()); - EffectiveFieldSize = FieldSize = FieldInfo.first; - FieldAlign = FieldInfo.second; + setDeclInfo(false /* IsIncompleteArrayType */); // A potentially-overlapping field occupies its dsize or nvsize, whichever // is larger. @@ -1829,31 +1909,72 @@ void ItaniumRecordLayoutBuilder::LayoutField(const FieldDecl *D, } } + // The AIX `power` alignment rules apply the natural alignment of the + // "first member" if it is of a floating-point data type (or is an aggregate + // whose recursively "first" member or element is such a type). The alignment + // associated with these types for subsequent members use an alignment value + // where the floating-point data type is considered to have 4-byte alignment. + // + // For the purposes of the foregoing: vtable pointers, non-empty base classes, + // and zero-width bit-fields count as prior members; members of empty class + // types marked `no_unique_address` are not considered to be prior members. + CharUnits PreferredAlign = FieldAlign; + if (DefaultsToAIXPowerAlignment && !AlignIsRequired && + FoundFirstNonOverlappingEmptyFieldForAIX) { + auto performBuiltinTypeAlignmentUpgrade = [&](const BuiltinType *BTy) { + if (BTy->getKind() == BuiltinType::Double || + BTy->getKind() == BuiltinType::LongDouble) { + assert(PreferredAlign == CharUnits::fromQuantity(4) && + "No need to upgrade the alignment value."); + PreferredAlign = CharUnits::fromQuantity(8); + } + }; + + const Type *Ty = D->getType()->getBaseElementTypeUnsafe(); + if (const ComplexType *CTy = Ty->getAs()) { + performBuiltinTypeAlignmentUpgrade(CTy->getElementType()->castAs()); + } else if (const BuiltinType *BTy = Ty->getAs()) { + performBuiltinTypeAlignmentUpgrade(BTy); + } else if (const RecordType *RT = Ty->getAs()) { + const RecordDecl *RD = RT->getDecl(); + assert(RD && "Expected non-null RecordDecl."); + const ASTRecordLayout &FieldRecord = Context.getASTRecordLayout(RD); + PreferredAlign = FieldRecord.getPreferredAlignment(); + } + } + // The align if the field is not packed. This is to check if the attribute // was unnecessary (-Wpacked). - CharUnits UnpackedFieldAlign = FieldAlign; + CharUnits UnpackedFieldAlign = + !DefaultsToAIXPowerAlignment ? 
FieldAlign : PreferredAlign; CharUnits UnpackedFieldOffset = FieldOffset; - if (FieldPacked) + if (FieldPacked) { FieldAlign = CharUnits::One(); + PreferredAlign = CharUnits::One(); + } CharUnits MaxAlignmentInChars = - Context.toCharUnitsFromBits(D->getMaxAlignment()); + Context.toCharUnitsFromBits(D->getMaxAlignment()); FieldAlign = std::max(FieldAlign, MaxAlignmentInChars); + PreferredAlign = std::max(PreferredAlign, MaxAlignmentInChars); UnpackedFieldAlign = std::max(UnpackedFieldAlign, MaxAlignmentInChars); // The maximum field alignment overrides the aligned attribute. if (!MaxFieldAlignment.isZero()) { FieldAlign = std::min(FieldAlign, MaxFieldAlignment); + PreferredAlign = std::min(PreferredAlign, MaxFieldAlignment); UnpackedFieldAlign = std::min(UnpackedFieldAlign, MaxFieldAlignment); } + CharUnits AlignTo = + !DefaultsToAIXPowerAlignment ? FieldAlign : PreferredAlign; // Round up the current record size to the field's alignment boundary. - FieldOffset = FieldOffset.alignTo(FieldAlign); + FieldOffset = FieldOffset.alignTo(AlignTo); UnpackedFieldOffset = UnpackedFieldOffset.alignTo(UnpackedFieldAlign); if (UseExternalLayout) { FieldOffset = Context.toCharUnitsFromBits( - updateExternalFieldOffset(D, Context.toBits(FieldOffset))); + updateExternalFieldOffset(D, Context.toBits(FieldOffset))); if (!IsUnion && EmptySubobjects) { // Record the fact that we're placing a field at this offset. @@ -1869,9 +1990,9 @@ void ItaniumRecordLayoutBuilder::LayoutField(const FieldDecl *D, // We try offset 0 (for an empty field) and then dsize(C) onwards. if (FieldOffset == CharUnits::Zero() && getDataSize() != CharUnits::Zero()) - FieldOffset = getDataSize().alignTo(FieldAlign); + FieldOffset = getDataSize().alignTo(AlignTo); else - FieldOffset += FieldAlign; + FieldOffset += AlignTo; } } } @@ -1908,9 +2029,9 @@ void ItaniumRecordLayoutBuilder::LayoutField(const FieldDecl *D, (uint64_t)Context.toBits(FieldOffset + FieldSize))); } - // Remember max struct/class alignment. + // Remember max struct/class ABI-specified alignment. UnadjustedAlignment = std::max(UnadjustedAlignment, FieldAlign); - UpdateAlignment(FieldAlign, UnpackedFieldAlign); + UpdateAlignment(FieldAlign, UnpackedFieldAlign, PreferredAlign); } void ItaniumRecordLayoutBuilder::FinishLayout(const NamedDecl *D) { @@ -1936,8 +2057,12 @@ void ItaniumRecordLayoutBuilder::FinishLayout(const NamedDecl *D) { uint64_t UnpaddedSize = getSizeInBits() - UnfilledBitsInLastUnit; uint64_t UnpackedSizeInBits = llvm::alignTo(getSizeInBits(), Context.toBits(UnpackedAlignment)); - uint64_t RoundedSize = - llvm::alignTo(getSizeInBits(), Context.toBits(Alignment)); + + uint64_t RoundedSize = llvm::alignTo( + getSizeInBits(), + Context.toBits(!Context.getTargetInfo().defaultsToAIXPowerAlignment() + ? Alignment + : PreferredAlignment)); if (UseExternalLayout) { // If we're inferring alignment, and the external size is smaller than @@ -1945,6 +2070,7 @@ void ItaniumRecordLayoutBuilder::FinishLayout(const NamedDecl *D) { // alignment to 1. 
if (InferAlignment && External.Size < RoundedSize) { Alignment = CharUnits::One(); + PreferredAlignment = CharUnits::One(); InferAlignment = false; } setSize(External.Size); @@ -1981,7 +2107,8 @@ void ItaniumRecordLayoutBuilder::FinishLayout(const NamedDecl *D) { } void ItaniumRecordLayoutBuilder::UpdateAlignment( - CharUnits NewAlignment, CharUnits UnpackedNewAlignment) { + CharUnits NewAlignment, CharUnits UnpackedNewAlignment, + CharUnits PreferredNewAlignment) { // The alignment is not modified when using 'mac68k' alignment or when // we have an externally-supplied layout that also provides overall alignment. if (IsMac68kAlign || (UseExternalLayout && !InferAlignment)) @@ -1998,6 +2125,12 @@ void ItaniumRecordLayoutBuilder::UpdateAlignment( "Alignment not a power of 2"); UnpackedAlignment = UnpackedNewAlignment; } + + if (PreferredNewAlignment > PreferredAlignment) { + assert(llvm::isPowerOf2_64(PreferredNewAlignment.getQuantity()) && + "Alignment not a power of 2"); + PreferredAlignment = PreferredNewAlignment; + } } uint64_t @@ -2009,6 +2142,7 @@ ItaniumRecordLayoutBuilder::updateExternalFieldOffset(const FieldDecl *Field, // The externally-supplied field offset is before the field offset we // computed. Assume that the structure is packed. Alignment = CharUnits::One(); + PreferredAlignment = CharUnits::One(); InferAlignment = false; } @@ -3063,10 +3197,10 @@ ASTContext::getASTRecordLayout(const RecordDecl *D) const { Builder.cxxLayout(RD); NewEntry = new (*this) ASTRecordLayout( *this, Builder.Size, Builder.Alignment, Builder.Alignment, - Builder.RequiredAlignment, - Builder.HasOwnVFPtr, Builder.HasOwnVFPtr || Builder.PrimaryBase, - Builder.VBPtrOffset, Builder.DataSize, Builder.FieldOffsets, - Builder.NonVirtualSize, Builder.Alignment, CharUnits::Zero(), + Builder.Alignment, Builder.RequiredAlignment, Builder.HasOwnVFPtr, + Builder.HasOwnVFPtr || Builder.PrimaryBase, Builder.VBPtrOffset, + Builder.DataSize, Builder.FieldOffsets, Builder.NonVirtualSize, + Builder.Alignment, Builder.Alignment, CharUnits::Zero(), Builder.PrimaryBase, false, Builder.SharedVBPtrBase, Builder.EndsWithZeroSizedObject, Builder.LeadsWithZeroSizedBase, Builder.Bases, Builder.VBases); @@ -3074,8 +3208,8 @@ ASTContext::getASTRecordLayout(const RecordDecl *D) const { Builder.layout(D); NewEntry = new (*this) ASTRecordLayout( *this, Builder.Size, Builder.Alignment, Builder.Alignment, - Builder.RequiredAlignment, - Builder.Size, Builder.FieldOffsets); + Builder.Alignment, Builder.RequiredAlignment, Builder.Size, + Builder.FieldOffsets); } } else { if (const auto *RD = dyn_cast(D)) { @@ -3095,11 +3229,13 @@ ASTContext::getASTRecordLayout(const RecordDecl *D) const { CharUnits NonVirtualSize = skipTailPadding ? 
DataSize : Builder.NonVirtualSize; NewEntry = new (*this) ASTRecordLayout( - *this, Builder.getSize(), Builder.Alignment, Builder.UnadjustedAlignment, + *this, Builder.getSize(), Builder.Alignment, + Builder.PreferredAlignment, Builder.UnadjustedAlignment, /*RequiredAlignment : used by MS-ABI)*/ Builder.Alignment, Builder.HasOwnVFPtr, RD->isDynamicClass(), CharUnits::fromQuantity(-1), DataSize, Builder.FieldOffsets, NonVirtualSize, Builder.NonVirtualAlignment, + Builder.PreferredNVAlignment, EmptySubobjects.SizeOfLargestEmptySubobject, Builder.PrimaryBase, Builder.PrimaryBaseIsVirtual, nullptr, false, false, Builder.Bases, Builder.VBases); @@ -3108,7 +3244,8 @@ ASTContext::getASTRecordLayout(const RecordDecl *D) const { Builder.Layout(D); NewEntry = new (*this) ASTRecordLayout( - *this, Builder.getSize(), Builder.Alignment, Builder.UnadjustedAlignment, + *this, Builder.getSize(), Builder.Alignment, + Builder.PreferredAlignment, Builder.UnadjustedAlignment, /*RequiredAlignment : used by MS-ABI)*/ Builder.Alignment, Builder.getSize(), Builder.FieldOffsets); } @@ -3260,14 +3397,11 @@ ASTContext::getObjCLayout(const ObjCInterfaceDecl *D, ItaniumRecordLayoutBuilder Builder(*this, /*EmptySubobjects=*/nullptr); Builder.Layout(D); - const ASTRecordLayout *NewEntry = - new (*this) ASTRecordLayout(*this, Builder.getSize(), - Builder.Alignment, - Builder.UnadjustedAlignment, - /*RequiredAlignment : used by MS-ABI)*/ - Builder.Alignment, - Builder.getDataSize(), - Builder.FieldOffsets); + const ASTRecordLayout *NewEntry = new (*this) ASTRecordLayout( + *this, Builder.getSize(), Builder.Alignment, Builder.PreferredAlignment, + Builder.UnadjustedAlignment, + /*RequiredAlignment : used by MS-ABI)*/ + Builder.Alignment, Builder.getDataSize(), Builder.FieldOffsets); ObjCLayouts[Key] = NewEntry; @@ -3430,22 +3564,26 @@ static void DumpRecordLayout(raw_ostream &OS, const RecordDecl *RD, if (CXXRD && !isMsLayout(C)) OS << ", dsize=" << Layout.getDataSize().getQuantity(); OS << ", align=" << Layout.getAlignment().getQuantity(); + if (C.getTargetInfo().defaultsToAIXPowerAlignment()) + OS << ", preferredalign=" << Layout.getPreferredAlignment().getQuantity(); if (CXXRD) { OS << ",\n"; PrintIndentNoOffset(OS, IndentLevel - 1); OS << " nvsize=" << Layout.getNonVirtualSize().getQuantity(); OS << ", nvalign=" << Layout.getNonVirtualAlignment().getQuantity(); + if (C.getTargetInfo().defaultsToAIXPowerAlignment()) + OS << ", preferrednvalign=" + << Layout.getPreferredNVAlignment().getQuantity(); } OS << "]\n"; } -void ASTContext::DumpRecordLayout(const RecordDecl *RD, - raw_ostream &OS, +void ASTContext::DumpRecordLayout(const RecordDecl *RD, raw_ostream &OS, bool Simple) const { if (!Simple) { ::DumpRecordLayout(OS, RD, *this, CharUnits(), 0, nullptr, - /*PrintSizeInfo*/true, + /*PrintSizeInfo*/ true, /*IncludeVirtualBases=*/true); return; } @@ -3465,9 +3603,13 @@ void ASTContext::DumpRecordLayout(const RecordDecl *RD, if (!isMsLayout(*this)) OS << " DataSize:" << toBits(Info.getDataSize()) << "\n"; OS << " Alignment:" << toBits(Info.getAlignment()) << "\n"; + if (Target->defaultsToAIXPowerAlignment()) + OS << " PreferredAlignment:" << toBits(Info.getPreferredAlignment()) + << "\n"; OS << " FieldOffsets: ["; for (unsigned i = 0, e = Info.getFieldCount(); i != e; ++i) { - if (i) OS << ", "; + if (i) + OS << ", "; OS << Info.getFieldOffset(i); } OS << "]>\n"; diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h index cfa362bef1b1c..7b3acc335a352 100644 --- 
a/clang/lib/Basic/Targets/OSTargets.h +++ b/clang/lib/Basic/Targets/OSTargets.h @@ -719,6 +719,8 @@ class AIXTargetInfo : public OSTargetInfo { // AIX sets FLT_EVAL_METHOD to be 1. unsigned getFloatEvalMethod() const override { return 1; } bool hasInt128Type() const override { return false; } + + bool defaultsToAIXPowerAlignment() const override { return true; } }; void addWindowsDefines(const llvm::Triple &Triple, const LangOptions &Opts, diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index 858059bacb86b..ff8579b6c3cf4 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -371,13 +371,16 @@ class LLVM_LIBRARY_VISIBILITY PPC32TargetInfo : public PPCTargetInfo { PtrDiffType = SignedLong; IntPtrType = SignedLong; SuitableAlign = 64; + LongDoubleWidth = 64; + LongDoubleAlign = DoubleAlign = 32; + LongDoubleFormat = &llvm::APFloat::IEEEdouble(); break; default: break; } if (Triple.isOSFreeBSD() || Triple.isOSNetBSD() || Triple.isOSOpenBSD() || - Triple.getOS() == llvm::Triple::AIX || Triple.isMusl()) { + Triple.isMusl()) { LongDoubleWidth = LongDoubleAlign = 64; LongDoubleFormat = &llvm::APFloat::IEEEdouble(); } @@ -406,6 +409,9 @@ class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo { // TODO: Set appropriate ABI for AIX platform. resetDataLayout("E-m:a-i64:64-n32:64"); SuitableAlign = 64; + LongDoubleWidth = 64; + LongDoubleAlign = DoubleAlign = 32; + LongDoubleFormat = &llvm::APFloat::IEEEdouble(); } else if ((Triple.getArch() == llvm::Triple::ppc64le)) { resetDataLayout("e-m:e-i64:64-n32:64"); ABI = "elfv2"; @@ -414,8 +420,7 @@ class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo { ABI = "elfv1"; } - if (Triple.isOSFreeBSD() || Triple.getOS() == llvm::Triple::AIX || - Triple.isMusl()) { + if (Triple.isOSFreeBSD() || Triple.isMusl()) { LongDoubleWidth = LongDoubleAlign = 64; LongDoubleFormat = &llvm::APFloat::IEEEdouble(); } diff --git a/clang/test/Layout/aix-Wpacked-expecting-diagnostics.cpp b/clang/test/Layout/aix-Wpacked-expecting-diagnostics.cpp new file mode 100644 index 0000000000000..1980c04877c51 --- /dev/null +++ b/clang/test/Layout/aix-Wpacked-expecting-diagnostics.cpp @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -Wpacked \ +// RUN: -fdump-record-layouts -fsyntax-only -verify -x c++ < %s | \ +// RUN: FileCheck %s + +// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -Wpacked \ +// RUN: -fdump-record-layouts -fsyntax-only -verify -x c++ < %s | \ +// RUN: FileCheck %s + +struct A { + double d; +}; + +struct B { + char x[8]; +}; + +struct [[gnu::packed]] C : B, A { // expected-warning{{packed attribute is unnecessary for 'C'}} + char x alignas(4)[8]; +}; + +int b = sizeof(C); + +// CHECK: 0 | struct C +// CHECK-NEXT: 0 | struct B (base) +// CHECK-NEXT: 0 | char [8] x +// CHECK-NEXT: 8 | struct A (base) +// CHECK-NEXT: 8 | double d +// CHECK-NEXT: 16 | char [8] x +// CHECK-NEXT: | [sizeof=24, dsize=24, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=24, nvalign=4, preferrednvalign=4] diff --git a/clang/test/Layout/aix-Wpacked-no-diagnostics.cpp b/clang/test/Layout/aix-Wpacked-no-diagnostics.cpp new file mode 100644 index 0000000000000..ed5362b3e8cd9 --- /dev/null +++ b/clang/test/Layout/aix-Wpacked-no-diagnostics.cpp @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -Wpacked \ +// RUN: -fdump-record-layouts -fsyntax-only -verify -x c++ < %s | \ +// RUN: FileCheck %s + +// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -Wpacked \ +// RUN: 
-fdump-record-layouts -fsyntax-only -verify -x c++ < %s | \ +// RUN: FileCheck %s + +// expected-no-diagnostics + +struct [[gnu::packed]] Q { + double x [[gnu::aligned(4)]]; +}; + +struct QQ : Q { char x; }; + +int a = sizeof(QQ); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct Q +// CHECK-NEXT: 0 | double x +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=4] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct QQ +// CHECK-NEXT: 0 | struct Q (base) +// CHECK-NEXT: 0 | double x +// CHECK-NEXT: 8 | char x +// CHECK-NEXT: | [sizeof=12, dsize=9, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=9, nvalign=4, preferrednvalign=4] diff --git a/clang/test/Layout/aix-double-struct-member.cpp b/clang/test/Layout/aix-double-struct-member.cpp new file mode 100644 index 0000000000000..b51d10467bceb --- /dev/null +++ b/clang/test/Layout/aix-double-struct-member.cpp @@ -0,0 +1,428 @@ +// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -fdump-record-layouts \ +// RUN: -fsyntax-only %s | \ +// RUN: FileCheck %s + +// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -fdump-record-layouts \ +// RUN: -fsyntax-only %s | \ +// RUN: FileCheck %s + +namespace test1 { +// Test the class layout when having a double which is/is not the first struct +// member. +struct D { + double d1; + int i1; +}; + +struct DoubleFirst { + struct D d2; + int i2; +}; + +struct IntFirst { + int i3; + struct D d3; +}; + +int a = sizeof(DoubleFirst); +int b = sizeof(IntFirst); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test1::D +// CHECK-NEXT: 0 | double d1 +// CHECK-NEXT: 8 | int i1 +// CHECK-NEXT: | [sizeof=16, dsize=16, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=16, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test1::DoubleFirst +// CHECK-NEXT: 0 | struct test1::D d2 +// CHECK-NEXT: 0 | double d1 +// CHECK-NEXT: 8 | int i1 +// CHECK-NEXT: 16 | int i2 +// CHECK-NEXT: | [sizeof=24, dsize=24, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=24, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test1::IntFirst +// CHECK-NEXT: 0 | int i3 +// CHECK-NEXT: 4 | struct test1::D d3 +// CHECK-NEXT: 4 | double d1 +// CHECK-NEXT: 12 | int i1 +// CHECK-NEXT: | [sizeof=20, dsize=20, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=20, nvalign=4, preferrednvalign=4] +} // namespace test1 + +namespace test2 { +// Test the class layout when having a zero-sized bitfield followed by double. +struct Double { + int : 0; + double d; +}; + +int a = sizeof(Double); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test2::Double +// CHECK-NEXT: 0:- | int +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=4] +} // namespace test2 + +namespace test3 { +// Test the class layout when having a double member in union. 
+union A { + int b; + double d; +}; + +struct UnionStruct { + union A a; + int i; +}; + +int a = sizeof(UnionStruct); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | union test3::A +// CHECK-NEXT: 0 | int b +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test3::UnionStruct +// CHECK-NEXT: 0 | union test3::A a +// CHECK-NEXT: 0 | int b +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: 8 | int i +// CHECK-NEXT: | [sizeof=16, dsize=16, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=16, nvalign=4, preferrednvalign=8] + +} // namespace test3 + +namespace test4 { +// Test the class layout when having multiple base classes. +struct A { + int a; +}; + +struct B { + double d; +}; + +class S : A, B { +}; + +int a = sizeof(S); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test4::A +// CHECK-NEXT: 0 | int a +// CHECK-NEXT: | [sizeof=4, dsize=4, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=4, nvalign=4, preferrednvalign=4] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test4::B +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | class test4::S +// CHECK-NEXT: 0 | struct test4::A (base) +// CHECK-NEXT: 0 | int a +// CHECK-NEXT: 4 | struct test4::B (base) +// CHECK-NEXT: 4 | double d +// CHECK-NEXT: | [sizeof=12, dsize=12, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=12, nvalign=4, preferrednvalign=4] +} // namespace test4 + +namespace test5 { +struct Empty { +}; + +struct EmptyDer : Empty { + double d; +}; + +struct NonEmpty { + int i; +}; + +struct NonEmptyDer : NonEmpty { + double d; +}; + +int a = sizeof(EmptyDer); +int b = sizeof(NonEmptyDer); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test5::Empty (empty) +// CHECK-NEXT: | [sizeof=1, dsize=1, align=1, preferredalign=1, +// CHECK-NEXT: | nvsize=1, nvalign=1, preferrednvalign=1] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test5::EmptyDer +// CHECK-NEXT: 0 | struct test5::Empty (base) (empty) +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test5::NonEmpty +// CHECK-NEXT: 0 | int i +// CHECK-NEXT: | [sizeof=4, dsize=4, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=4, nvalign=4, preferrednvalign=4] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test5::NonEmptyDer +// CHECK-NEXT: 0 | struct test5::NonEmpty (base) +// CHECK-NEXT: 0 | int i +// CHECK-NEXT: 4 | double d +// CHECK-NEXT: | [sizeof=12, dsize=12, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=12, nvalign=4, preferrednvalign=4] +} // namespace test5 + +namespace test6 { +struct A { + struct B { + double d[3]; + } b; +}; + +int a = sizeof(A); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test6::A::B +// CHECK-NEXT: 0 | double [3] d +// CHECK-NEXT: | [sizeof=24, dsize=24, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=24, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test6::A +// CHECK-NEXT: 0 | struct test6::A::B b +// CHECK-NEXT: 0 | double [3] d 
+// CHECK-NEXT: | [sizeof=24, dsize=24, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=24, nvalign=4, preferrednvalign=8] + +} // namespace test6 + +namespace test7 { +struct A { + struct B { + long double _Complex d[3]; + } b; +}; + +int a = sizeof(A); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test7::A::B +// CHECK-NEXT: 0 | _Complex long double [3] d +// CHECK-NEXT: | [sizeof=48, dsize=48, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=48, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test7::A +// CHECK-NEXT: 0 | struct test7::A::B b +// CHECK-NEXT: 0 | _Complex long double [3] d +// CHECK-NEXT: | [sizeof=48, dsize=48, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=48, nvalign=4, preferrednvalign=8] + +} // namespace test7 + +namespace test8 { +struct Emp {}; + +struct Y : Emp { + double d; +}; + +struct Z : Emp { + Y y; +}; + +int a = sizeof(Z); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test8::Emp (empty) +// CHECK-NEXT: | [sizeof=1, dsize=1, align=1, preferredalign=1, +// CHECK-NEXT: | nvsize=1, nvalign=1, preferrednvalign=1] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test8::Y +// CHECK-NEXT: 0 | struct test8::Emp (base) (empty) +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test8::Z +// CHECK-NEXT: 0 | struct test8::Emp (base) (empty) +// CHECK-NEXT: 8 | struct test8::Y y +// CHECK-NEXT: 8 | struct test8::Emp (base) (empty) +// CHECK-NEXT: 8 | double d +// CHECK-NEXT: | [sizeof=16, dsize=16, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=16, nvalign=4, preferrednvalign=8] + +} // namespace test8 + +namespace test9 { +// Test the class layout when having a zero-extent array in a base class, which +// renders the base class not empty. +struct A { char zea[0]; }; + +struct B : A { double d; }; + +struct C { double d; }; +struct D : A, C { char x; }; + +int a = sizeof(B); +int b = sizeof(D); + +// CHECK: 0 | struct test9::B +// CHECK-NEXT: 0 | struct test9::A (base) +// CHECK-NEXT: 0 | char [0] zea +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=4] + +// CHECK: 0 | struct test9::D +// CHECK-NEXT: 0 | struct test9::A (base) +// CHECK-NEXT: 0 | char [0] zea +// CHECK-NEXT: 0 | struct test9::C (base) +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: 8 | char x +// CHECK-NEXT: | [sizeof=12, dsize=9, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=9, nvalign=4, preferrednvalign=4] + +} // namespace test9 + +namespace test10 { +struct A { double x; }; +struct B : A {}; + +int a = sizeof(B); + +// CHECK: 0 | struct test10::B +// CHECK-NEXT: 0 | struct test10::A (base) +// CHECK-NEXT: 0 | double x +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=8] + +} // namespace test10 + +namespace test11 { +// Test how #pragma pack and align attribute interacts with AIX `power` +// alignment rules. 
+struct A { + char a; + double __attribute__((aligned(16))) d; + int i; +}; + +struct B { + double __attribute__((aligned(4))) d1; + char a; + double d2; +}; + +#pragma pack(2) +struct C { + int i; + short j; + double k; +}; + +#pragma pack(2) +struct D { + double d; + short j; + int i; +}; + +#pragma pack(8) +struct E { + double __attribute__((aligned(4))) d; + short s; +}; + +#pragma pack(4) +struct F : public D { + double d; +}; + +#pragma pack(2) +struct G : public E { + int i; +}; + +int a = sizeof(A); +int b = sizeof(B); +int c = sizeof(C); +int d = sizeof(D); +int e = sizeof(E); +int f = sizeof(F); +int g = sizeof(G); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test11::A +// CHECK-NEXT: 0 | char a +// CHECK-NEXT: 16 | double d +// CHECK-NEXT: 24 | int i +// CHECK-NEXT: | [sizeof=32, dsize=32, align=16, preferredalign=16, +// CHECK-NEXT: | nvsize=32, nvalign=16, preferrednvalign=16] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test11::B +// CHECK-NEXT: 0 | double d1 +// CHECK-NEXT: 8 | char a +// CHECK-NEXT: 12 | double d2 +// CHECK-NEXT: | [sizeof=24, dsize=24, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=24, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test11::C +// CHECK-NEXT: 0 | int i +// CHECK-NEXT: 4 | short j +// CHECK-NEXT: 6 | double k +// CHECK-NEXT: | [sizeof=14, dsize=14, align=2, preferredalign=2, +// CHECK-NEXT: | nvsize=14, nvalign=2, preferrednvalign=2] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test11::D +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: 8 | short j +// CHECK-NEXT: 10 | int i +// CHECK-NEXT: | [sizeof=14, dsize=14, align=2, preferredalign=2, +// CHECK-NEXT: | nvsize=14, nvalign=2, preferrednvalign=2] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test11::E +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: 8 | short s +// CHECK-NEXT: | [sizeof=16, dsize=16, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=16, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test11::F +// CHECK-NEXT: 0 | struct test11::D (base) +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: 8 | short j +// CHECK-NEXT: 10 | int i +// CHECK-NEXT: 16 | double d +// CHECK-NEXT: | [sizeof=24, dsize=24, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=24, nvalign=4, preferrednvalign=4] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test11::G +// CHECK-NEXT: 0 | struct test11::E (base) +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: 8 | short s +// CHECK-NEXT: 16 | int i +// CHECK-NEXT: | [sizeof=20, dsize=20, align=2, preferredalign=2, +// CHECK-NEXT: | nvsize=20, nvalign=2, preferrednvalign=2] + +} // namespace test11 diff --git a/clang/test/Layout/aix-no-unique-address-with-double.cpp b/clang/test/Layout/aix-no-unique-address-with-double.cpp new file mode 100644 index 0000000000000..5188bf128bc50 --- /dev/null +++ b/clang/test/Layout/aix-no-unique-address-with-double.cpp @@ -0,0 +1,158 @@ +// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -fdump-record-layouts \ +// RUN: -fsyntax-only %s | \ +// RUN: FileCheck %s + +// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -fdump-record-layouts \ +// RUN: -fsyntax-only %s | \ +// RUN: FileCheck %s + +struct Empty {}; + +struct A { + double d; +}; + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct Empty (empty) +// CHECK-NEXT: | [sizeof=1, dsize=1, align=1, preferredalign=1, +// CHECK-NEXT: | nvsize=1, 
nvalign=1, preferrednvalign=1] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct A +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=8] + +struct B { + ~B(); + + Empty emp; + A a; + char c; +}; + +struct B1 { + [[no_unique_address]] B b; + char ext[7]; +}; + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct B +// CHECK-NEXT: 0 | struct Empty emp (empty) +// CHECK-NEXT: 4 | struct A a +// CHECK-NEXT: 4 | double d +// CHECK-NEXT: 12 | char c +// CHECK-NEXT: | [sizeof=16, dsize=13, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=13, nvalign=4, preferrednvalign=4] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct B1 +// CHECK-NEXT: 0 | struct B b +// CHECK-NEXT: 0 | struct Empty emp (empty) +// CHECK-NEXT: 4 | struct A a +// CHECK-NEXT: 4 | double d +// CHECK-NEXT: 12 | char c +// CHECK-NEXT: 13 | char [7] ext +// CHECK-NEXT: | [sizeof=20, dsize=20, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=20, nvalign=4, preferrednvalign=4] + +struct C { + ~C(); + + [[no_unique_address]] Empty emp; + A a; + char c; +}; + +struct C1 { + [[no_unique_address]] C c; + char ext[7]; +}; + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct C +// CHECK-NEXT: 0 | struct Empty emp (empty) +// CHECK-NEXT: 0 | struct A a +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: 8 | char c +// CHECK-NEXT: | [sizeof=16, dsize=9, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=9, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct C1 +// CHECK-NEXT: 0 | struct C c +// CHECK-NEXT: 0 | struct Empty emp (empty) +// CHECK-NEXT: 0 | struct A a +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: 8 | char c +// CHECK-NEXT: 9 | char [7] ext +// CHECK-NEXT: | [sizeof=16, dsize=16, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=16, nvalign=4, preferrednvalign=8] + +struct D { + ~D(); + + [[no_unique_address]] char notEmp; + A a; + char c; +}; + +struct D1 { + [[no_unique_address]] D d; + char ext[7]; +}; + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct D +// CHECK-NEXT: 0 | char notEmp +// CHECK-NEXT: 4 | struct A a +// CHECK-NEXT: 4 | double d +// CHECK-NEXT: 12 | char c +// CHECK-NEXT: | [sizeof=16, dsize=13, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=13, nvalign=4, preferrednvalign=4] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct D1 +// CHECK-NEXT: 0 | struct D d +// CHECK-NEXT: 0 | char notEmp +// CHECK-NEXT: 4 | struct A a +// CHECK-NEXT: 4 | double d +// CHECK-NEXT: 12 | char c +// CHECK-NEXT: 13 | char [7] ext +// CHECK-NEXT: | [sizeof=20, dsize=20, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=20, nvalign=4, preferrednvalign=4] + +struct E { + [[no_unique_address]] Empty emp; + int : 0; + double d; +}; + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct E +// CHECK-NEXT: 0 | struct Empty emp (empty) +// CHECK-NEXT: 0:- | int +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=4] + +struct F { + [[no_unique_address]] Empty emp, emp2; + double d; +}; + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct F +// CHECK-NEXT: 0 | struct Empty emp (empty) +// CHECK-NEXT: 1 | struct Empty emp2 (empty) +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=8, 
nvalign=4, preferrednvalign=8] + +int a = sizeof(Empty); +int b = sizeof(A); +int c = sizeof(B1); +int d = sizeof(C1); +int e = sizeof(D1); +int f = sizeof(E); +int g = sizeof(F); diff --git a/clang/test/Layout/aix-pack-attr-on-base.cpp b/clang/test/Layout/aix-pack-attr-on-base.cpp new file mode 100644 index 0000000000000..3d0ebabf79a82 --- /dev/null +++ b/clang/test/Layout/aix-pack-attr-on-base.cpp @@ -0,0 +1,20 @@ +// RUN: %clang_cc1 -emit-llvm -triple powerpc-ibm-aix-xcoff -x c++ < %s | \ +// RUN: FileCheck %s + +// RUN: %clang_cc1 -emit-llvm -triple powerpc64-ibm-aix-xcoff -x c++ < %s | \ +// RUN: FileCheck %s + +struct A { + char x; +}; + +struct B { + int x; +}; + +struct __attribute__((__packed__)) C : A, B {} c; + +int s = sizeof(c); + +// CHECK: @c = global %struct.C zeroinitializer, align 1 +// CHECK: @s = global i32 5 diff --git a/clang/test/Layout/aix-power-alignment-typedef-2.cpp b/clang/test/Layout/aix-power-alignment-typedef-2.cpp new file mode 100644 index 0000000000000..8e7e3db47c602 --- /dev/null +++ b/clang/test/Layout/aix-power-alignment-typedef-2.cpp @@ -0,0 +1,15 @@ +// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -S -emit-llvm -x c++ < %s | \ +// RUN: FileCheck %s + +// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -S -emit-llvm -x c++ < %s | \ +// RUN: FileCheck %s + +struct C { + double x; +}; + +typedef struct C __attribute__((__aligned__(2))) CC; + +CC cc; + +// CHECK: @cc = global %struct.C zeroinitializer, align 2 diff --git a/clang/test/Layout/aix-power-alignment-typedef.cpp b/clang/test/Layout/aix-power-alignment-typedef.cpp new file mode 100644 index 0000000000000..fc973a1fdfd81 --- /dev/null +++ b/clang/test/Layout/aix-power-alignment-typedef.cpp @@ -0,0 +1,39 @@ +// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -fdump-record-layouts %s | \ +// RUN: FileCheck %s + +// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -fdump-record-layouts %s | \ +// RUN: FileCheck %s + +namespace test1 { +typedef double __attribute__((__aligned__(2))) Dbl; +struct A { + Dbl x; +}; + +int b = sizeof(A); + +// CHECK: 0 | struct test1::A +// CHECK-NEXT: 0 | test1::Dbl x +// CHECK-NEXT: | [sizeof=8, dsize=8, align=2, preferredalign=2, +// CHECK-NEXT: | nvsize=8, nvalign=2, preferrednvalign=2] + +} // namespace test1 + +namespace test2 { +typedef double Dbl __attribute__((__aligned__(2))); +typedef Dbl DblArr[]; + +union U { + DblArr da; + char x; +}; + +int x = sizeof(U); + +// CHECK: 0 | union test2::U +// CHECK-NEXT: 0 | test2::DblArr da +// CHECK-NEXT: 0 | char x +// CHECK-NEXT: | [sizeof=2, dsize=2, align=2, preferredalign=2, +// CHECK-NEXT: | nvsize=2, nvalign=2, preferrednvalign=2] + +} // namespace test2 diff --git a/clang/test/Layout/aix-virtual-function-and-base-with-double.cpp b/clang/test/Layout/aix-virtual-function-and-base-with-double.cpp new file mode 100644 index 0000000000000..d3bc4418db1fe --- /dev/null +++ b/clang/test/Layout/aix-virtual-function-and-base-with-double.cpp @@ -0,0 +1,112 @@ +// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -fdump-record-layouts \ +// RUN: -fsyntax-only %s | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK32 %s + +// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -fdump-record-layouts \ +// RUN: -fsyntax-only %s | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK64 %s + +namespace test1 { +struct A { + double d1; + virtual void boo() {} +}; + +struct B { + double d2; + A a; +}; + +struct C : public A { + double d3; +}; + +int i = sizeof(B); +int j = sizeof(C); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | 
struct test1::A
+// CHECK-NEXT: 0 | (A vtable pointer)
+// CHECK32-NEXT: 4 | double d1
+// CHECK32-NEXT: | [sizeof=12, dsize=12, align=4, preferredalign=4,
+// CHECK32-NEXT: | nvsize=12, nvalign=4, preferrednvalign=4]
+// CHECK64-NEXT: 8 | double d1
+// CHECK64-NEXT: | [sizeof=16, dsize=16, align=8, preferredalign=8,
+// CHECK64-NEXT: | nvsize=16, nvalign=8, preferrednvalign=8]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT: 0 | struct test1::B
+// CHECK-NEXT: 0 | double d2
+// CHECK-NEXT: 8 | struct test1::A a
+// CHECK-NEXT: 8 | (A vtable pointer)
+// CHECK32-NEXT: 12 | double d1
+// CHECK32-NEXT: | [sizeof=24, dsize=20, align=4, preferredalign=8,
+// CHECK32-NEXT: | nvsize=20, nvalign=4, preferrednvalign=8]
+// CHECK64-NEXT: 16 | double d1
+// CHECK64-NEXT: | [sizeof=24, dsize=24, align=8, preferredalign=8,
+// CHECK64-NEXT: | nvsize=24, nvalign=8, preferrednvalign=8]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT: 0 | struct test1::C
+// CHECK-NEXT: 0 | struct test1::A (primary base)
+// CHECK-NEXT: 0 | (A vtable pointer)
+// CHECK32-NEXT: 4 | double d1
+// CHECK32-NEXT: 12 | double d3
+// CHECK32-NEXT: | [sizeof=20, dsize=20, align=4, preferredalign=4,
+// CHECK32-NEXT: | nvsize=20, nvalign=4, preferrednvalign=4]
+// CHECK64-NEXT: 8 | double d1
+// CHECK64-NEXT: 16 | double d3
+// CHECK64-NEXT: | [sizeof=24, dsize=24, align=8, preferredalign=8,
+// CHECK64-NEXT: | nvsize=24, nvalign=8, preferrednvalign=8]
+
+} // namespace test1
+
+namespace test2 {
+struct A {
+  long long l1;
+};
+
+struct B : public virtual A {
+  double d2;
+};
+
+#pragma pack(2)
+struct C : public virtual A {
+  double __attribute__((aligned(4))) d3;
+};
+
+int i = sizeof(B);
+int j = sizeof(C);
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT: 0 | struct test2::A
+// CHECK-NEXT: 0 | long long l1
+// CHECK-NEXT: | [sizeof=8, dsize=8, align=8, preferredalign=8,
+// CHECK-NEXT: | nvsize=8, nvalign=8, preferrednvalign=8]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT: 0 | struct test2::B
+// CHECK-NEXT: 0 | (B vtable pointer)
+// CHECK32-NEXT: 4 | double d2
+// CHECK64-NEXT: 8 | double d2
+// CHECK-NEXT: 16 | struct test2::A (virtual base)
+// CHECK-NEXT: 16 | long long l1
+// CHECK-NEXT: | [sizeof=24, dsize=24, align=8, preferredalign=8,
+// CHECK32-NEXT: | nvsize=12, nvalign=4, preferrednvalign=4]
+// CHECK64-NEXT: | nvsize=16, nvalign=8, preferrednvalign=8]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT: 0 | struct test2::C
+// CHECK-NEXT: 0 | (C vtable pointer)
+// CHECK32-NEXT: 4 | double d3
+// CHECK32-NEXT: 12 | struct test2::A (virtual base)
+// CHECK32-NEXT: 12 | long long l1
+// CHECK32-NEXT: | [sizeof=20, dsize=20, align=2, preferredalign=2,
+// CHECK32-NEXT: | nvsize=12, nvalign=2, preferrednvalign=2]
+// CHECK64-NEXT: 8 | double d3
+// CHECK64-NEXT: 16 | struct test2::A (virtual base)
+// CHECK64-NEXT: 16 | long long l1
+// CHECK64-NEXT: | [sizeof=24, dsize=24, align=2, preferredalign=2,
+// CHECK64-NEXT: | nvsize=16, nvalign=2, preferrednvalign=2]
+
+} // namespace test2

From fbe911ee750fe62061eb15c5c8f71270fdc2fe98 Mon Sep 17 00:00:00 2001
From: MaheshRavishankar
Date: Mon, 27 Jul 2020 12:13:08 -0700
Subject: [PATCH 0233/1035] [mlir][AffineToStandard] Make LowerAffine pass
 Op-agnostic.

The LowerAffine pass was a FunctionPass only for legacy reasons. Making
it Op-agnostic allows it to be used from the command line when affine
expressions are within operations other than `std.func`.
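For example, the pass can now be scheduled on a `gpu.module` directly from
the command line; a minimal sketch mirroring the RUN line of the test added
below, where `input.mlir` is a placeholder input file name:

  mlir-opt -pass-pipeline="gpu.module(lower-affine)" input.mlir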
Differential Revision: https://reviews.llvm.org/D84590 --- mlir/include/mlir/Conversion/Passes.td | 4 ++-- mlir/include/mlir/Transforms/Passes.h | 2 +- .../AffineToStandard/AffineToStandard.cpp | 6 +++--- .../AffineToStandard/lower-affine-gpu.mlir | 15 +++++++++++++++ 4 files changed, 21 insertions(+), 6 deletions(-) create mode 100644 mlir/test/Conversion/AffineToStandard/lower-affine-gpu.mlir diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index 1c3b776e97808..f4c790655b1f1 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -15,12 +15,12 @@ include "mlir/Pass/PassBase.td" // AffineToStandard //===----------------------------------------------------------------------===// -def ConvertAffineToStandard : FunctionPass<"lower-affine"> { +def ConvertAffineToStandard : Pass<"lower-affine"> { let summary = "Lower Affine operations to a combination of Standard and SCF " "operations"; let description = [{ - Convert operations from the affine dialect into operations from the loop and + Convert operations from the affine dialect into operations from the SCF and standard dialects. `affine.for` operations are converted to `scf.for` operations that are free diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h index d528723af8ec6..955b0e99a1d17 100644 --- a/mlir/include/mlir/Transforms/Passes.h +++ b/mlir/include/mlir/Transforms/Passes.h @@ -57,7 +57,7 @@ std::unique_ptr> createPipelineDataTransferPass(); /// Lowers affine control flow operations (ForStmt, IfStmt and AffineApplyOp) /// to equivalent lower-level constructs (flow of basic blocks and arithmetic /// primitives). -std::unique_ptr> createLowerAffinePass(); +std::unique_ptr createLowerAffinePass(); /// Creates a pass that transforms perfectly nested loops with independent /// bounds into a single loop. diff --git a/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp b/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp index bc48ef35fcd10..a0cfdb8e6fa7b 100644 --- a/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp +++ b/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp @@ -672,14 +672,14 @@ void mlir::populateAffineToVectorConversionPatterns( namespace { class LowerAffinePass : public ConvertAffineToStandardBase { - void runOnFunction() override { + void runOnOperation() override { OwningRewritePatternList patterns; populateAffineToStdConversionPatterns(patterns, &getContext()); populateAffineToVectorConversionPatterns(patterns, &getContext()); ConversionTarget target(getContext()); target .addLegalDialect(); - if (failed(applyPartialConversion(getFunction(), target, patterns))) + if (failed(applyPartialConversion(getOperation(), target, patterns))) signalPassFailure(); } }; @@ -687,6 +687,6 @@ class LowerAffinePass : public ConvertAffineToStandardBase { /// Lowers If and For operations within a function into their lower level CFG /// equivalent blocks. 
-std::unique_ptr> mlir::createLowerAffinePass() {
+std::unique_ptr mlir::createLowerAffinePass() {
   return std::make_unique();
 }
diff --git a/mlir/test/Conversion/AffineToStandard/lower-affine-gpu.mlir b/mlir/test/Conversion/AffineToStandard/lower-affine-gpu.mlir
new file mode 100644
index 0000000000000..8e0d4f4fe8a4e
--- /dev/null
+++ b/mlir/test/Conversion/AffineToStandard/lower-affine-gpu.mlir
@@ -0,0 +1,15 @@
+// RUN: mlir-opt -pass-pipeline="gpu.module(lower-affine)" %s | FileCheck %s
+
+#map0gpufunc = affine_map<(d0) -> (d0)>
+gpu.module @kernels {
+  gpu.func @foo(%arg0 : index, %arg1 : memref) -> f32 {
+    %0 = affine.apply #map0gpufunc(%arg0)
+    %1 = load %arg1[%0] : memref
+    gpu.return %1 : f32
+  }
+
+// CHECK: gpu.func
+// CHECK-SAME: %[[ARG0:.*]]: index
+// CHECK-NOT: affine.apply
+// CHECK: load %{{.*}}[%[[ARG0]]]
+}

From adffce71538e219aab4eeb024819baa7687262ff Mon Sep 17 00:00:00 2001
From: Jinsong Ji
Date: Mon, 27 Jul 2020 18:01:40 +0000
Subject: [PATCH 0234/1035] [PowerPC] Remove QPX/A2Q BGQ/BGP CNK support

Per RFC http://lists.llvm.org/pipermail/llvm-dev/2020-April/141295.html,
no one is making use of QPX/A2Q/BGQ/BGP CNK anymore.

This patch removes support for QPX/A2Q in llvm, BGQ/BGP in clang, and
CNK in openmp/polly.

Reviewed By: hfinkel

Differential Revision: https://reviews.llvm.org/D83915
---
 clang/lib/Basic/Targets/PPC.cpp | 39 +-
 clang/lib/Basic/Targets/PPC.h | 3 -
 clang/lib/Driver/ToolChains/Arch/PPC.cpp | 1 -
 clang/lib/Driver/ToolChains/Clang.cpp | 12 -
 clang/test/Driver/clang-translation.c | 6 -
 clang/test/Driver/ppc-abi.c | 20 -
 clang/test/Misc/target-invalid-cpu-note.c | 2 +-
 clang/test/Preprocessor/init-ppc64.c | 16 -
 llvm/docs/LangRef.rst | 11 +-
 llvm/include/llvm/ADT/Triple.h | 3 -
 llvm/include/llvm/IR/IntrinsicsPowerPC.td | 176 ---
 llvm/lib/Support/Triple.cpp | 6 -
 .../Target/PowerPC/AsmParser/PPCAsmParser.cpp | 18 -
 llvm/lib/Target/PowerPC/CMakeLists.txt | 1 -
 .../PowerPC/Disassembler/PPCDisassembler.cpp | 15 +-
 .../PowerPC/MCTargetDesc/PPCInstPrinter.cpp | 12 -
 .../PowerPC/MCTargetDesc/PPCMCTargetDesc.h | 1 -
 llvm/lib/Target/PowerPC/PPC.h | 2 -
 llvm/lib/Target/PowerPC/PPC.td | 14 +-
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 3 -
 llvm/lib/Target/PowerPC/PPCCallingConv.td | 16 -
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 16 +-
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 1025 +-------------
 llvm/lib/Target/PowerPC/PPCISelLowering.h | 20 -
 llvm/lib/Target/PowerPC/PPCInstrFormats.td | 52 -
 llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 27 +-
 llvm/lib/Target/PowerPC/PPCInstrInfo.h | 23 +-
 llvm/lib/Target/PowerPC/PPCInstrInfo.td | 11 -
 llvm/lib/Target/PowerPC/PPCInstrQPX.td | 1212 -----------------
 llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp | 161 ---
 llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 3 -
 llvm/lib/Target/PowerPC/PPCRegisterInfo.h | 1 -
 llvm/lib/Target/PowerPC/PPCRegisterInfo.td | 23 -
 llvm/lib/Target/PowerPC/PPCScheduleP9.td | 9 +-
 llvm/lib/Target/PowerPC/PPCSubtarget.cpp | 12 +-
 llvm/lib/Target/PowerPC/PPCSubtarget.h | 14 -
 llvm/lib/Target/PowerPC/PPCTargetMachine.cpp | 23 +-
 .../Target/PowerPC/PPCTargetTransformInfo.cpp | 76 +-
 .../Instrumentation/MemorySanitizer.cpp | 7 +-
 llvm/test/Analysis/BasicAA/phi-spec-order.ll | 2 +-
 .../CostModel/PowerPC/unal-vec-ldst.ll | 73 -
 .../CodeGen/PowerPC/2012-11-16-mischedcall.ll | 2 +-
 ...leHoistingDueToBlockHotnessProfileData.mir | 2 +-
 .../NoCRFieldRedefWhenSpillingCRBIT.mir | 2 +-
 llvm/test/CodeGen/PowerPC/a2q-stackalign.ll | 23 -
 llvm/test/CodeGen/PowerPC/a2q.ll | 10 -
.../PowerPC/aantidep-inline-asm-use.ll | 2 +- llvm/test/CodeGen/PowerPC/asm-Zy.ll | 3 +- llvm/test/CodeGen/PowerPC/asm-constraints.ll | 2 +- ...rt-rr-to-ri-instrs-R0-special-handling.mir | 4 +- .../convert-rr-to-ri-instrs-out-of-range.mir | 2 +- .../PowerPC/convert-rr-to-ri-instrs.mir | 8 +- llvm/test/CodeGen/PowerPC/ctr-minmaxnum.ll | 11 - .../CodeGen/PowerPC/ctrloop-shortLoops.ll | 7 - llvm/test/CodeGen/PowerPC/ec-input.ll | 2 +- .../CodeGen/PowerPC/extra-toc-reg-deps.ll | 8 +- .../CodeGen/PowerPC/fast-isel-icmp-split.ll | 2 +- .../PowerPC/fma-mutate-duplicate-vreg.ll | 2 +- .../CodeGen/PowerPC/fp2int2fp-ppcfp128.ll | 3 +- .../CodeGen/PowerPC/glob-comp-aa-crash.ll | 4 +- .../PowerPC/ifcvt-forked-bug-2016-08-08.ll | 2 +- .../test/CodeGen/PowerPC/inlineasm-i64-reg.ll | 4 +- llvm/test/CodeGen/PowerPC/load-two-flts.ll | 3 +- .../PowerPC/loop-data-prefetch-inner.ll | 4 +- .../CodeGen/PowerPC/loop-data-prefetch.ll | 4 +- llvm/test/CodeGen/PowerPC/loop-prep-all.ll | 10 +- .../PowerPC/lxv-aligned-stack-slots.ll | 2 +- llvm/test/CodeGen/PowerPC/machine-combiner.ll | 24 - llvm/test/CodeGen/PowerPC/mc-instrlat.ll | 4 +- llvm/test/CodeGen/PowerPC/mcount-insertion.ll | 3 +- llvm/test/CodeGen/PowerPC/memcpy-vec.ll | 23 - llvm/test/CodeGen/PowerPC/memset-nc.ll | 48 - .../PowerPC/misched-inorder-latency.ll | 3 +- llvm/test/CodeGen/PowerPC/misched.ll | 1 - .../CodeGen/PowerPC/optnone-crbits-i1-ret.ll | 3 +- .../CodeGen/PowerPC/pcrel-local-caller-toc.ll | 6 +- llvm/test/CodeGen/PowerPC/popcnt.ll | 2 - llvm/test/CodeGen/PowerPC/ppc-passname.ll | 11 - llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll | 21 +- llvm/test/CodeGen/PowerPC/pr24546.ll | 4 +- llvm/test/CodeGen/PowerPC/pr27350.ll | 2 +- llvm/test/CodeGen/PowerPC/pr28130.ll | 2 +- .../CodeGen/PowerPC/preinc-ld-sel-crash.ll | 2 +- llvm/test/CodeGen/PowerPC/qpx-bv-sint.ll | 33 - llvm/test/CodeGen/PowerPC/qpx-bv.ll | 37 - llvm/test/CodeGen/PowerPC/qpx-func-clobber.ll | 22 - llvm/test/CodeGen/PowerPC/qpx-load-splat.ll | 80 -- llvm/test/CodeGen/PowerPC/qpx-load.ll | 26 - llvm/test/CodeGen/PowerPC/qpx-qvfmadd.ll | 79 -- llvm/test/CodeGen/PowerPC/qpx-recipest.ll | 473 ------- llvm/test/CodeGen/PowerPC/qpx-rounding-ops.ll | 109 -- llvm/test/CodeGen/PowerPC/qpx-s-load.ll | 26 - llvm/test/CodeGen/PowerPC/qpx-s-sel.ll | 143 -- llvm/test/CodeGen/PowerPC/qpx-s-store.ll | 25 - llvm/test/CodeGen/PowerPC/qpx-sel.ll | 151 -- llvm/test/CodeGen/PowerPC/qpx-split-vsetcc.ll | 31 - llvm/test/CodeGen/PowerPC/qpx-store.ll | 25 - .../test/CodeGen/PowerPC/qpx-unal-cons-lds.ll | 217 --- llvm/test/CodeGen/PowerPC/qpx-unalperm.ll | 64 - llvm/test/CodeGen/PowerPC/rlwimi-and.ll | 4 +- .../CodeGen/PowerPC/rlwinm_rldicl_to_andi.mir | 2 +- .../CodeGen/PowerPC/s000-alias-misched.ll | 5 +- llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll | 571 -------- .../selectiondag-extload-computeknownbits.ll | 2 +- llvm/test/CodeGen/PowerPC/setcr_bc.mir | 4 +- llvm/test/CodeGen/PowerPC/setcr_bc2.mir | 4 +- llvm/test/CodeGen/PowerPC/stwu-sched.ll | 2 +- llvm/test/CodeGen/PowerPC/unal-vec-ldst.ll | 149 -- llvm/test/CodeGen/PowerPC/uwtables.ll | 2 +- .../MemorySanitizer/PowerPC/vararg-ppc64.ll | 15 - llvm/test/MC/Disassembler/PowerPC/qpx.txt | 371 ----- llvm/test/MC/PowerPC/qpx.s | 252 ---- .../IPConstantProp/fp-bc-icmp-const-fold.ll | 2 +- .../MSSA/combined-partial-overwrites.ll | 2 +- .../combined-partial-overwrites.ll | 2 +- .../EntryExitInstrumenter/mcount.ll | 2 +- .../InstCombine/PowerPC/aligned-qpx.ll | 165 --- .../LoopDataPrefetch/PowerPC/basic.ll | 5 +- .../test/Transforms/LoopSimplify/dup-preds.ll | 2 
+- llvm/test/Transforms/LoopUnroll/pr14167.ll | 2 +- .../PowerPC/agg-interleave-a2.ll | 40 - .../PowerPC/vectorize-only-for-real.ll | 2 +- llvm/test/Transforms/NewGVN/pr31483.ll | 4 +- .../Transforms/SCCP/fp-bc-icmp-const-fold.ll | 2 +- llvm/unittests/ADT/TripleTest.cpp | 35 - .../llvm/lib/Target/PowerPC/BUILD.gn | 1 - openmp/runtime/src/kmp.h | 3 - openmp/runtime/src/kmp_csupport.cpp | 11 - openmp/runtime/src/kmp_lock.h | 2 +- openmp/runtime/src/kmp_os.h | 2 +- openmp/runtime/src/kmp_platform.h | 6 - openmp/runtime/src/z_Linux_asm.S | 7 +- openmp/runtime/src/z_Linux_util.cpp | 2 +- polly/lib/External/isl/config.sub | 11 +- polly/lib/External/ppcg/config.sub | 11 +- 135 files changed, 174 insertions(+), 6525 deletions(-) delete mode 100644 llvm/lib/Target/PowerPC/PPCInstrQPX.td delete mode 100644 llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp delete mode 100644 llvm/test/CodeGen/PowerPC/a2q-stackalign.ll delete mode 100644 llvm/test/CodeGen/PowerPC/a2q.ll delete mode 100644 llvm/test/CodeGen/PowerPC/memset-nc.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-bv-sint.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-bv.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-func-clobber.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-load-splat.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-load.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-qvfmadd.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-recipest.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-rounding-ops.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-s-load.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-s-sel.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-s-store.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-sel.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-split-vsetcc.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-store.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-unal-cons-lds.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-unalperm.ll delete mode 100644 llvm/test/MC/Disassembler/PowerPC/qpx.txt delete mode 100644 llvm/test/MC/PowerPC/qpx.s delete mode 100644 llvm/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll delete mode 100644 llvm/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp index f0de2bf070ea4..5f716a541ae92 100644 --- a/clang/lib/Basic/Targets/PPC.cpp +++ b/clang/lib/Basic/Targets/PPC.cpp @@ -46,8 +46,6 @@ bool PPCTargetInfo::handleTargetFeatures(std::vector &Features, HasP8Crypto = true; } else if (Feature == "+direct-move") { HasDirectMove = true; - } else if (Feature == "+qpx") { - HasQPX = true; } else if (Feature == "+htm") { HasHTM = true; } else if (Feature == "+float128") { @@ -99,7 +97,7 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts, } // ABI options. 
- if (ABI == "elfv1" || ABI == "elfv1-qpx") + if (ABI == "elfv1") Builder.defineMacro("_CALL_ELF", "1"); if (ABI == "elfv2") Builder.defineMacro("_CALL_ELF", "2"); @@ -159,22 +157,11 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("_ARCH_PWR10"); if (ArchDefs & ArchDefineA2) Builder.defineMacro("_ARCH_A2"); - if (ArchDefs & ArchDefineA2q) { - Builder.defineMacro("_ARCH_A2Q"); - Builder.defineMacro("_ARCH_QP"); - } if (ArchDefs & ArchDefineE500) Builder.defineMacro("__NO_LWSYNC__"); if (ArchDefs & ArchDefineFuture) Builder.defineMacro("_ARCH_PWR_FUTURE"); - if (getTriple().getVendor() == llvm::Triple::BGQ) { - Builder.defineMacro("__bg__"); - Builder.defineMacro("__THW_BLUEGENE__"); - Builder.defineMacro("__bgq__"); - Builder.defineMacro("__TOS_BGQ__"); - } - if (HasAltivec) { Builder.defineMacro("__VEC__", "10206"); Builder.defineMacro("__ALTIVEC__"); @@ -277,7 +264,6 @@ bool PPCTargetInfo::initFeatureMap( .Case("ppc64le", true) .Default(false); - Features["qpx"] = (CPU == "a2q"); Features["power9-vector"] = (CPU == "pwr9"); Features["crypto"] = llvm::StringSwitch(CPU) .Case("ppc64le", true) @@ -373,7 +359,6 @@ bool PPCTargetInfo::hasFeature(StringRef Feature) const { .Case("power8-vector", HasP8Vector) .Case("crypto", HasP8Crypto) .Case("direct-move", HasDirectMove) - .Case("qpx", HasQPX) .Case("htm", HasHTM) .Case("bpermd", HasBPERMD) .Case("extdiv", HasExtDiv) @@ -503,17 +488,17 @@ ArrayRef PPCTargetInfo::getGCCAddlRegNames() const { } static constexpr llvm::StringLiteral ValidCPUNames[] = { - {"generic"}, {"440"}, {"450"}, {"601"}, {"602"}, - {"603"}, {"603e"}, {"603ev"}, {"604"}, {"604e"}, - {"620"}, {"630"}, {"g3"}, {"7400"}, {"g4"}, - {"7450"}, {"g4+"}, {"750"}, {"8548"}, {"970"}, - {"g5"}, {"a2"}, {"a2q"}, {"e500"}, {"e500mc"}, - {"e5500"}, {"power3"}, {"pwr3"}, {"power4"}, {"pwr4"}, - {"power5"}, {"pwr5"}, {"power5x"}, {"pwr5x"}, {"power6"}, - {"pwr6"}, {"power6x"}, {"pwr6x"}, {"power7"}, {"pwr7"}, - {"power8"}, {"pwr8"}, {"power9"}, {"pwr9"}, {"power10"}, - {"pwr10"}, {"powerpc"}, {"ppc"}, {"powerpc64"}, {"ppc64"}, - {"powerpc64le"}, {"ppc64le"}, {"future"}}; + {"generic"}, {"440"}, {"450"}, {"601"}, {"602"}, + {"603"}, {"603e"}, {"603ev"}, {"604"}, {"604e"}, + {"620"}, {"630"}, {"g3"}, {"7400"}, {"g4"}, + {"7450"}, {"g4+"}, {"750"}, {"8548"}, {"970"}, + {"g5"}, {"a2"}, {"e500"}, {"e500mc"}, {"e5500"}, + {"power3"}, {"pwr3"}, {"power4"}, {"pwr4"}, {"power5"}, + {"pwr5"}, {"power5x"}, {"pwr5x"}, {"power6"}, {"pwr6"}, + {"power6x"}, {"pwr6x"}, {"power7"}, {"pwr7"}, {"power8"}, + {"pwr8"}, {"power9"}, {"pwr9"}, {"power10"}, {"pwr10"}, + {"powerpc"}, {"ppc"}, {"powerpc64"}, {"ppc64"}, {"powerpc64le"}, + {"ppc64le"}, {"future"}}; bool PPCTargetInfo::isValidCPUName(StringRef Name) const { return llvm::find(ValidCPUNames, Name) != std::end(ValidCPUNames); diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index ff8579b6c3cf4..c2048b2145918 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -46,7 +46,6 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { ArchDefinePwr10 = 1 << 14, ArchDefineFuture = 1 << 15, ArchDefineA2 = 1 << 16, - ArchDefineA2q = 1 << 17, ArchDefineE500 = 1 << 18 } ArchDefineTypes; @@ -63,7 +62,6 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { bool HasP8Vector = false; bool HasP8Crypto = false; bool HasDirectMove = false; - bool HasQPX = false; bool HasHTM = false; bool HasBPERMD = false; bool HasExtDiv = false; @@ -118,7 +116,6 @@ class 
LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { .Case("970", ArchDefineName | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) .Case("a2", ArchDefineA2) - .Case("a2q", ArchDefineName | ArchDefineA2 | ArchDefineA2q) .Cases("power3", "pwr3", ArchDefinePpcgr) .Cases("power4", "pwr4", ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.cpp b/clang/lib/Driver/ToolChains/Arch/PPC.cpp index 144e276a6bd87..bcaecf4b2d980 100644 --- a/clang/lib/Driver/ToolChains/Arch/PPC.cpp +++ b/clang/lib/Driver/ToolChains/Arch/PPC.cpp @@ -57,7 +57,6 @@ std::string ppc::getPPCTargetCPU(const ArgList &Args) { .Case("970", "970") .Case("G5", "g5") .Case("a2", "a2") - .Case("a2q", "a2q") .Case("e500", "e500") .Case("e500mc", "e500mc") .Case("e5500", "e5500") diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 7a73eea013bdf..b0de225f8abf5 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1883,18 +1883,6 @@ void Clang::AddPPCTargetArgs(const ArgList &Args, if (T.isOSBinFormatELF()) { switch (getToolChain().getArch()) { case llvm::Triple::ppc64: { - // When targeting a processor that supports QPX, or if QPX is - // specifically enabled, default to using the ABI that supports QPX (so - // long as it is not specifically disabled). - bool HasQPX = false; - if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) - HasQPX = A->getValue() == StringRef("a2q"); - HasQPX = Args.hasFlag(options::OPT_mqpx, options::OPT_mno_qpx, HasQPX); - if (HasQPX) { - ABIName = "elfv1-qpx"; - break; - } - if (T.isMusl() || (T.isOSFreeBSD() && T.getOSMajorVersion() >= 13)) ABIName = "elfv2"; else diff --git a/clang/test/Driver/clang-translation.c b/clang/test/Driver/clang-translation.c index 2f02970a2a8ee..d1daeb80004b7 100644 --- a/clang/test/Driver/clang-translation.c +++ b/clang/test/Driver/clang-translation.c @@ -167,12 +167,6 @@ // PPCPWR8: "-cc1" // PPCPWR8: "-target-cpu" "pwr8" -// RUN: %clang -target powerpc64-unknown-linux-gnu \ -// RUN: -### -S %s -mcpu=a2q 2>&1 | FileCheck -check-prefix=PPCA2Q %s -// PPCA2Q: clang -// PPCA2Q: "-cc1" -// PPCA2Q: "-target-cpu" "a2q" - // RUN: %clang -target powerpc64-unknown-linux-gnu \ // RUN: -### -S %s -mcpu=630 2>&1 | FileCheck -check-prefix=PPC630 %s // PPC630: clang diff --git a/clang/test/Driver/ppc-abi.c b/clang/test/Driver/ppc-abi.c index acc4981a2eee6..2b5cc463e7c3d 100644 --- a/clang/test/Driver/ppc-abi.c +++ b/clang/test/Driver/ppc-abi.c @@ -5,14 +5,6 @@ // RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ // RUN: -mabi=elfv1 | FileCheck -check-prefix=CHECK-ELFv1 %s // RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mabi=elfv1-qpx | FileCheck -check-prefix=CHECK-ELFv1-QPX %s -// RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mcpu=a2q | FileCheck -check-prefix=CHECK-ELFv1-QPX %s -// RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mcpu=a2 -mqpx | FileCheck -check-prefix=CHECK-ELFv1-QPX %s -// RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mcpu=a2q -mno-qpx | FileCheck -check-prefix=CHECK-ELFv1 %s -// RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ // RUN: -mabi=elfv2 | FileCheck -check-prefix=CHECK-ELFv2-BE %s // RUN: %clang -target powerpc64le-unknown-linux-gnu %s -### -o %t.o 2>&1 \ @@ -34,8 +26,6 @@ // CHECK-ELFv1: "-target-abi" "elfv1" // CHECK-ELFv1-LE: 
"-mrelocation-model" "static" // CHECK-ELFv1-LE: "-target-abi" "elfv1" -// CHECK-ELFv1-QPX: "-mrelocation-model" "static" -// CHECK-ELFv1-QPX: "-target-abi" "elfv1-qpx" // CHECK-ELFv2: "-mrelocation-model" "static" // CHECK-ELFv2: "-target-abi" "elfv2" // CHECK-ELFv2-BE: "-mrelocation-model" "static" @@ -48,14 +38,6 @@ // RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ // RUN: -mabi=elfv1 | FileCheck -check-prefix=CHECK-ELFv1-PIC %s // RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mabi=elfv1-qpx | FileCheck -check-prefix=CHECK-ELFv1-QPX-PIC %s -// RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mcpu=a2q | FileCheck -check-prefix=CHECK-ELFv1-QPX-PIC %s -// RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mcpu=a2 -mqpx | FileCheck -check-prefix=CHECK-ELFv1-QPX-PIC %s -// RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mcpu=a2q -mno-qpx | FileCheck -check-prefix=CHECK-ELFv1-PIC %s -// RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ // RUN: -mabi=elfv2 | FileCheck -check-prefix=CHECK-ELFv2-PIC %s // RUN: %clang -fPIC -target powerpc64le-unknown-linux-gnu %s -### -o %t.o 2>&1 \ @@ -69,8 +51,6 @@ // CHECK-ELFv1-PIC: "-mrelocation-model" "pic" "-pic-level" "2" // CHECK-ELFv1-PIC: "-target-abi" "elfv1" -// CHECK-ELFv1-QPX-PIC: "-mrelocation-model" "pic" "-pic-level" "2" -// CHECK-ELFv1-QPX-PIC: "-target-abi" "elfv1-qpx" // CHECK-ELFv2-PIC: "-mrelocation-model" "pic" "-pic-level" "2" // CHECK-ELFv2-PIC: "-target-abi" "elfv2" diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c index 3a376a7caab46..bf6eaefe0b3ca 100644 --- a/clang/test/Misc/target-invalid-cpu-note.c +++ b/clang/test/Misc/target-invalid-cpu-note.c @@ -79,7 +79,7 @@ // PPC: error: unknown target CPU 'not-a-cpu' // PPC: note: valid target CPU values are: generic, 440, 450, 601, 602, 603, // PPC-SAME: 603e, 603ev, 604, 604e, 620, 630, g3, 7400, g4, 7450, g4+, 750, -// PPC-SAME: 8548, 970, g5, a2, a2q, e500, e500mc, e5500, power3, pwr3, power4, +// PPC-SAME: 8548, 970, g5, a2, e500, e500mc, e5500, power3, pwr3, power4, // PPC-SAME: pwr4, power5, pwr5, power5x, pwr5x, power6, pwr6, power6x, pwr6x, // PPC-SAME: power7, pwr7, power8, pwr8, power9, pwr9, power10, pwr10, powerpc, ppc, powerpc64, // PPC-SAME: ppc64, powerpc64le, ppc64le, future diff --git a/clang/test/Preprocessor/init-ppc64.c b/clang/test/Preprocessor/init-ppc64.c index ed8601636554e..48d35c95aa570 100644 --- a/clang/test/Preprocessor/init-ppc64.c +++ b/clang/test/Preprocessor/init-ppc64.c @@ -408,21 +408,6 @@ // PPC64LE:#define __ppc64__ 1 // PPC64LE:#define __ppc__ 1 // -// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu a2q -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCA2Q %s -// -// PPCA2Q:#define _ARCH_A2 1 -// PPCA2Q:#define _ARCH_A2Q 1 -// PPCA2Q:#define _ARCH_PPC 1 -// PPCA2Q:#define _ARCH_PPC64 1 -// PPCA2Q:#define _ARCH_QP 1 -// -// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-bgq-linux -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCBGQ %s -// -// PPCBGQ:#define __THW_BLUEGENE__ 1 -// PPCBGQ:#define __TOS_BGQ__ 1 -// PPCBGQ:#define __bg__ 1 -// PPCBGQ:#define __bgq__ 1 -// // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu 630 -fno-signed-char < /dev/null | FileCheck 
-match-full-lines -check-prefix PPC630 %s // // PPC630:#define _ARCH_630 1 @@ -1069,7 +1054,6 @@ // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64-unknown-linux-gnu < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64-unknown-linux-gnu -target-abi elfv1 < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s -// RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64-unknown-linux-gnu -target-abi elfv1-qpx < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64-unknown-linux-gnu -target-abi elfv2 < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv2 %s // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64le-unknown-linux-gnu < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv2 %s // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64le-unknown-linux-gnu -target-abi elfv1 < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 6b9c5c6899819..af93a6ed5c56e 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -4310,14 +4310,9 @@ PowerPC: - ``r``: A 32 or 64-bit integer register. - ``b``: A 32 or 64-bit integer register, excluding ``R0`` (that is: ``R1-R31``). -- ``f``: A 32 or 64-bit float register (``F0-F31``), or when QPX is enabled, a - 128 or 256-bit QPX register (``Q0-Q31``; aliases the ``F`` registers). -- ``v``: For ``4 x f32`` or ``4 x f64`` types, when QPX is enabled, a - 128 or 256-bit QPX register (``Q0-Q31``), otherwise a 128-bit - altivec vector register (``V0-V31``). - - .. FIXME: is this a bug that v accepts QPX registers? I think this - is supposed to only use the altivec vector registers? +- ``f``: A 32 or 64-bit float register (``F0-F31``), +- ``v``: For ``4 x f32`` or ``4 x f64`` types, a 128-bit altivec vector + register (``V0-V31``). - ``y``: Condition register (``CR0-CR7``). - ``wc``: An individual CR bit in a CR register. diff --git a/llvm/include/llvm/ADT/Triple.h b/llvm/include/llvm/ADT/Triple.h index 6bad18f19244e..c578c097c6f64 100644 --- a/llvm/include/llvm/ADT/Triple.h +++ b/llvm/include/llvm/ADT/Triple.h @@ -142,8 +142,6 @@ class Triple { Apple, PC, SCEI, - BGP, - BGQ, Freescale, IBM, ImaginationTechnologies, @@ -179,7 +177,6 @@ class Triple { Minix, RTEMS, NaCl, // Native Client - CNK, // BG/P Compute-Node Kernel AIX, CUDA, // NVIDIA CUDA NVCL, // NVIDIA OpenCL diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td index 23bcf3ce1959c..853d26c67ee3d 100644 --- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td +++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td @@ -1109,182 +1109,6 @@ def int_ppc_vsx_xxblendvd: GCCBuiltin<"__builtin_vsx_xxblendvd">, [IntrNoMem]>; } -//===----------------------------------------------------------------------===// -// PowerPC QPX Intrinsics. -// - -let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.". - /// PowerPC_QPX_Intrinsic - Base class for all QPX intrinsics. - class PowerPC_QPX_Intrinsic ret_types, - list param_types, - list properties> - : GCCBuiltin, - Intrinsic; -} - -//===----------------------------------------------------------------------===// -// PowerPC QPX Intrinsic Class Definitions. 
-// - -/// PowerPC_QPX_FF_Intrinsic - A PowerPC intrinsic that takes one v4f64 -/// vector and returns one. These intrinsics have no side effects. -class PowerPC_QPX_FF_Intrinsic - : PowerPC_QPX_Intrinsic; - -/// PowerPC_QPX_FFF_Intrinsic - A PowerPC intrinsic that takes two v4f64 -/// vectors and returns one. These intrinsics have no side effects. -class PowerPC_QPX_FFF_Intrinsic - : PowerPC_QPX_Intrinsic; - -/// PowerPC_QPX_FFFF_Intrinsic - A PowerPC intrinsic that takes three v4f64 -/// vectors and returns one. These intrinsics have no side effects. -class PowerPC_QPX_FFFF_Intrinsic - : PowerPC_QPX_Intrinsic; - -/// PowerPC_QPX_Load_Intrinsic - A PowerPC intrinsic that takes a pointer -/// and returns a v4f64. -class PowerPC_QPX_Load_Intrinsic - : PowerPC_QPX_Intrinsic; - -/// PowerPC_QPX_LoadPerm_Intrinsic - A PowerPC intrinsic that takes a pointer -/// and returns a v4f64 permutation. -class PowerPC_QPX_LoadPerm_Intrinsic - : PowerPC_QPX_Intrinsic; - -/// PowerPC_QPX_Store_Intrinsic - A PowerPC intrinsic that takes a pointer -/// and stores a v4f64. -class PowerPC_QPX_Store_Intrinsic - : PowerPC_QPX_Intrinsic; - -//===----------------------------------------------------------------------===// -// PowerPC QPX Intrinsic Definitions. - -let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.". - // Add Instructions - def int_ppc_qpx_qvfadd : PowerPC_QPX_FFF_Intrinsic<"qvfadd">; - def int_ppc_qpx_qvfadds : PowerPC_QPX_FFF_Intrinsic<"qvfadds">; - def int_ppc_qpx_qvfsub : PowerPC_QPX_FFF_Intrinsic<"qvfsub">; - def int_ppc_qpx_qvfsubs : PowerPC_QPX_FFF_Intrinsic<"qvfsubs">; - - // Estimate Instructions - def int_ppc_qpx_qvfre : PowerPC_QPX_FF_Intrinsic<"qvfre">; - def int_ppc_qpx_qvfres : PowerPC_QPX_FF_Intrinsic<"qvfres">; - def int_ppc_qpx_qvfrsqrte : PowerPC_QPX_FF_Intrinsic<"qvfrsqrte">; - def int_ppc_qpx_qvfrsqrtes : PowerPC_QPX_FF_Intrinsic<"qvfrsqrtes">; - - // Multiply Instructions - def int_ppc_qpx_qvfmul : PowerPC_QPX_FFF_Intrinsic<"qvfmul">; - def int_ppc_qpx_qvfmuls : PowerPC_QPX_FFF_Intrinsic<"qvfmuls">; - def int_ppc_qpx_qvfxmul : PowerPC_QPX_FFF_Intrinsic<"qvfxmul">; - def int_ppc_qpx_qvfxmuls : PowerPC_QPX_FFF_Intrinsic<"qvfxmuls">; - - // Multiply-add instructions - def int_ppc_qpx_qvfmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfmadd">; - def int_ppc_qpx_qvfmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfmadds">; - def int_ppc_qpx_qvfnmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfnmadd">; - def int_ppc_qpx_qvfnmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfnmadds">; - def int_ppc_qpx_qvfmsub : PowerPC_QPX_FFFF_Intrinsic<"qvfmsub">; - def int_ppc_qpx_qvfmsubs : PowerPC_QPX_FFFF_Intrinsic<"qvfmsubs">; - def int_ppc_qpx_qvfnmsub : PowerPC_QPX_FFFF_Intrinsic<"qvfnmsub">; - def int_ppc_qpx_qvfnmsubs : PowerPC_QPX_FFFF_Intrinsic<"qvfnmsubs">; - def int_ppc_qpx_qvfxmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxmadd">; - def int_ppc_qpx_qvfxmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxmadds">; - def int_ppc_qpx_qvfxxnpmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxxnpmadd">; - def int_ppc_qpx_qvfxxnpmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxxnpmadds">; - def int_ppc_qpx_qvfxxcpnmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxxcpnmadd">; - def int_ppc_qpx_qvfxxcpnmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxxcpnmadds">; - def int_ppc_qpx_qvfxxmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxxmadd">; - def int_ppc_qpx_qvfxxmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxxmadds">; - - // Select Instruction - def int_ppc_qpx_qvfsel : PowerPC_QPX_FFFF_Intrinsic<"qvfsel">; - - // Permute Instruction - def int_ppc_qpx_qvfperm : 
PowerPC_QPX_FFFF_Intrinsic<"qvfperm">; - - // Convert and Round Instructions - def int_ppc_qpx_qvfctid : PowerPC_QPX_FF_Intrinsic<"qvfctid">; - def int_ppc_qpx_qvfctidu : PowerPC_QPX_FF_Intrinsic<"qvfctidu">; - def int_ppc_qpx_qvfctidz : PowerPC_QPX_FF_Intrinsic<"qvfctidz">; - def int_ppc_qpx_qvfctiduz : PowerPC_QPX_FF_Intrinsic<"qvfctiduz">; - def int_ppc_qpx_qvfctiw : PowerPC_QPX_FF_Intrinsic<"qvfctiw">; - def int_ppc_qpx_qvfctiwu : PowerPC_QPX_FF_Intrinsic<"qvfctiwu">; - def int_ppc_qpx_qvfctiwz : PowerPC_QPX_FF_Intrinsic<"qvfctiwz">; - def int_ppc_qpx_qvfctiwuz : PowerPC_QPX_FF_Intrinsic<"qvfctiwuz">; - def int_ppc_qpx_qvfcfid : PowerPC_QPX_FF_Intrinsic<"qvfcfid">; - def int_ppc_qpx_qvfcfidu : PowerPC_QPX_FF_Intrinsic<"qvfcfidu">; - def int_ppc_qpx_qvfcfids : PowerPC_QPX_FF_Intrinsic<"qvfcfids">; - def int_ppc_qpx_qvfcfidus : PowerPC_QPX_FF_Intrinsic<"qvfcfidus">; - def int_ppc_qpx_qvfrsp : PowerPC_QPX_FF_Intrinsic<"qvfrsp">; - def int_ppc_qpx_qvfriz : PowerPC_QPX_FF_Intrinsic<"qvfriz">; - def int_ppc_qpx_qvfrin : PowerPC_QPX_FF_Intrinsic<"qvfrin">; - def int_ppc_qpx_qvfrip : PowerPC_QPX_FF_Intrinsic<"qvfrip">; - def int_ppc_qpx_qvfrim : PowerPC_QPX_FF_Intrinsic<"qvfrim">; - - // Move Instructions - def int_ppc_qpx_qvfneg : PowerPC_QPX_FF_Intrinsic<"qvfneg">; - def int_ppc_qpx_qvfabs : PowerPC_QPX_FF_Intrinsic<"qvfabs">; - def int_ppc_qpx_qvfnabs : PowerPC_QPX_FF_Intrinsic<"qvfnabs">; - def int_ppc_qpx_qvfcpsgn : PowerPC_QPX_FFF_Intrinsic<"qvfcpsgn">; - - // Compare Instructions - def int_ppc_qpx_qvftstnan : PowerPC_QPX_FFF_Intrinsic<"qvftstnan">; - def int_ppc_qpx_qvfcmplt : PowerPC_QPX_FFF_Intrinsic<"qvfcmplt">; - def int_ppc_qpx_qvfcmpgt : PowerPC_QPX_FFF_Intrinsic<"qvfcmpgt">; - def int_ppc_qpx_qvfcmpeq : PowerPC_QPX_FFF_Intrinsic<"qvfcmpeq">; - - // Load instructions - def int_ppc_qpx_qvlfd : PowerPC_QPX_Load_Intrinsic<"qvlfd">; - def int_ppc_qpx_qvlfda : PowerPC_QPX_Load_Intrinsic<"qvlfda">; - def int_ppc_qpx_qvlfs : PowerPC_QPX_Load_Intrinsic<"qvlfs">; - def int_ppc_qpx_qvlfsa : PowerPC_QPX_Load_Intrinsic<"qvlfsa">; - - def int_ppc_qpx_qvlfcda : PowerPC_QPX_Load_Intrinsic<"qvlfcda">; - def int_ppc_qpx_qvlfcd : PowerPC_QPX_Load_Intrinsic<"qvlfcd">; - def int_ppc_qpx_qvlfcsa : PowerPC_QPX_Load_Intrinsic<"qvlfcsa">; - def int_ppc_qpx_qvlfcs : PowerPC_QPX_Load_Intrinsic<"qvlfcs">; - def int_ppc_qpx_qvlfiwaa : PowerPC_QPX_Load_Intrinsic<"qvlfiwaa">; - def int_ppc_qpx_qvlfiwa : PowerPC_QPX_Load_Intrinsic<"qvlfiwa">; - def int_ppc_qpx_qvlfiwza : PowerPC_QPX_Load_Intrinsic<"qvlfiwza">; - def int_ppc_qpx_qvlfiwz : PowerPC_QPX_Load_Intrinsic<"qvlfiwz">; - - def int_ppc_qpx_qvlpcld : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcld">; - def int_ppc_qpx_qvlpcls : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcls">; - def int_ppc_qpx_qvlpcrd : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcrd">; - def int_ppc_qpx_qvlpcrs : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcrs">; - - // Store instructions - def int_ppc_qpx_qvstfd : PowerPC_QPX_Store_Intrinsic<"qvstfd">; - def int_ppc_qpx_qvstfda : PowerPC_QPX_Store_Intrinsic<"qvstfda">; - def int_ppc_qpx_qvstfs : PowerPC_QPX_Store_Intrinsic<"qvstfs">; - def int_ppc_qpx_qvstfsa : PowerPC_QPX_Store_Intrinsic<"qvstfsa">; - - def int_ppc_qpx_qvstfcda : PowerPC_QPX_Store_Intrinsic<"qvstfcda">; - def int_ppc_qpx_qvstfcd : PowerPC_QPX_Store_Intrinsic<"qvstfcd">; - def int_ppc_qpx_qvstfcsa : PowerPC_QPX_Store_Intrinsic<"qvstfcsa">; - def int_ppc_qpx_qvstfcs : PowerPC_QPX_Store_Intrinsic<"qvstfcs">; - def int_ppc_qpx_qvstfiwa : PowerPC_QPX_Store_Intrinsic<"qvstfiwa">; - def int_ppc_qpx_qvstfiw 
: PowerPC_QPX_Store_Intrinsic<"qvstfiw">; - - // Logical and permutation formation - def int_ppc_qpx_qvflogical : PowerPC_QPX_Intrinsic<"qvflogical", - [llvm_v4f64_ty], - [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i32_ty], - [IntrNoMem]>; - def int_ppc_qpx_qvgpci : PowerPC_QPX_Intrinsic<"qvgpci", - [llvm_v4f64_ty], [llvm_i32_ty], [IntrNoMem]>; -} - //===----------------------------------------------------------------------===// // PowerPC HTM Intrinsic Definitions. diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp index fec1985ccacae..72648273b4cd5 100644 --- a/llvm/lib/Support/Triple.cpp +++ b/llvm/lib/Support/Triple.cpp @@ -160,8 +160,6 @@ StringRef Triple::getVendorTypeName(VendorType Kind) { case AMD: return "amd"; case Apple: return "apple"; - case BGP: return "bgp"; - case BGQ: return "bgq"; case CSR: return "csr"; case Freescale: return "fsl"; case IBM: return "ibm"; @@ -187,7 +185,6 @@ StringRef Triple::getOSTypeName(OSType Kind) { case AMDHSA: return "amdhsa"; case AMDPAL: return "amdpal"; case Ananas: return "ananas"; - case CNK: return "cnk"; case CUDA: return "cuda"; case CloudABI: return "cloudabi"; case Contiki: return "contiki"; @@ -470,8 +467,6 @@ static Triple::VendorType parseVendor(StringRef VendorName) { .Case("apple", Triple::Apple) .Case("pc", Triple::PC) .Case("scei", Triple::SCEI) - .Case("bgp", Triple::BGP) - .Case("bgq", Triple::BGQ) .Case("fsl", Triple::Freescale) .Case("ibm", Triple::IBM) .Case("img", Triple::ImaginationTechnologies) @@ -508,7 +503,6 @@ static Triple::OSType parseOS(StringRef OSName) { .StartsWith("minix", Triple::Minix) .StartsWith("rtems", Triple::RTEMS) .StartsWith("nacl", Triple::NaCl) - .StartsWith("cnk", Triple::CNK) .StartsWith("aix", Triple::AIX) .StartsWith("cuda", Triple::CUDA) .StartsWith("nvcl", Triple::NVCL) diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 13fd7d05ab9f4..81008d3ea5662 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -492,21 +492,6 @@ struct PPCOperand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createReg(VSSRegs[getVSReg()])); } - void addRegQFRCOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createReg(QFRegs[getReg()])); - } - - void addRegQSRCOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createReg(QFRegs[getReg()])); - } - - void addRegQBRCOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createReg(QFRegs[getReg()])); - } - void addRegSPE4RCOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(RRegs[getReg()])); @@ -1207,9 +1192,6 @@ bool PPCAsmParser::MatchRegisterName(unsigned &RegNo, int64_t &IntVal) { } else if (Name.startswith_lower("v") && !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { RegNo = VRegs[IntVal]; - } else if (Name.startswith_lower("q") && - !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { - RegNo = QFRegs[IntVal]; } else if (Name.startswith_lower("cr") && !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 8) { RegNo = CRRegs[IntVal]; diff --git a/llvm/lib/Target/PowerPC/CMakeLists.txt b/llvm/lib/Target/PowerPC/CMakeLists.txt index 91021d4e584e1..5a06faa16be19 100644 --- 
a/llvm/lib/Target/PowerPC/CMakeLists.txt +++ b/llvm/lib/Target/PowerPC/CMakeLists.txt @@ -36,7 +36,6 @@ add_llvm_target(PowerPCCodeGen PPCMacroFusion.cpp PPCMIPeephole.cpp PPCRegisterInfo.cpp - PPCQPXLoadSplat.cpp PPCSubtarget.cpp PPCTargetMachine.cpp PPCTargetObjectFile.cpp diff --git a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 74c6fd3733f03..362ddf7204557 100644 --- a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -167,12 +167,6 @@ static DecodeStatus DecodeG8RC_NOX0RegisterClass(MCInst &Inst, uint64_t RegNo, #define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass #define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass -static DecodeStatus DecodeQFRCRegisterClass(MCInst &Inst, uint64_t RegNo, - uint64_t Address, - const void *Decoder) { - return decodeRegisterClass(Inst, RegNo, QFRegs); -} - static DecodeStatus DecodeSPERCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { @@ -401,14 +395,9 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // Read the instruction in the proper endianness. uint64_t Inst = ReadFunc(Bytes.data()); - if (STI.getFeatureBits()[PPC::FeatureQPX]) { - DecodeStatus result = - decodeInstruction(DecoderTableQPX32, MI, Inst, Address, this, STI); - if (result != MCDisassembler::Fail) - return result; - } else if (STI.getFeatureBits()[PPC::FeatureSPE]) { + if (STI.getFeatureBits()[PPC::FeatureSPE]) { DecodeStatus result = - decodeInstruction(DecoderTableSPE32, MI, Inst, Address, this, STI); + decodeInstruction(DecoderTableSPE32, MI, Inst, Address, this, STI); if (result != MCDisassembler::Fail) return result; } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp index 222bf2fa82836..ce1a43a0c25b2 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp @@ -49,18 +49,6 @@ FullRegNamesWithPercent("ppc-reg-with-percent-prefix", cl::Hidden, void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { const char *RegName = getRegisterName(RegNo); - if (RegName[0] == 'q' /* QPX */) { - // The system toolchain on the BG/Q does not understand QPX register names - // in .cfi_* directives, so print the name of the floating-point - // subregister instead. 
- std::string RN(RegName); - - RN[0] = 'f'; - OS << RN; - - return; - } - OS << RegName; } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index 719e005d98135..325ede0fc17ac 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -159,7 +159,6 @@ using llvm::MCPhysReg; static const MCPhysReg SPERegs[32] = PPC_REGS0_31(PPC::S); \ static const MCPhysReg VFRegs[32] = PPC_REGS0_31(PPC::VF); \ static const MCPhysReg VRegs[32] = PPC_REGS0_31(PPC::V); \ - static const MCPhysReg QFRegs[32] = PPC_REGS0_31(PPC::QF); \ static const MCPhysReg RRegsNoR0[32] = \ PPC_REGS_NO0_31(PPC::ZERO, PPC::R); \ static const MCPhysReg XRegsNoX0[32] = \ diff --git a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h index 3106290442afa..24a9d419d3ea5 100644 --- a/llvm/lib/Target/PowerPC/PPC.h +++ b/llvm/lib/Target/PowerPC/PPC.h @@ -44,7 +44,6 @@ namespace llvm { FunctionPass *createPPCMIPeepholePass(); FunctionPass *createPPCBranchSelectionPass(); FunctionPass *createPPCBranchCoalescingPass(); - FunctionPass *createPPCQPXLoadSplatPass(); FunctionPass *createPPCISelDag(PPCTargetMachine &TM, CodeGenOpt::Level OL); FunctionPass *createPPCTLSDynamicCallPass(); FunctionPass *createPPCBoolRetToIntPass(); @@ -68,7 +67,6 @@ namespace llvm { void initializePPCReduceCRLogicalsPass(PassRegistry&); void initializePPCBSelPass(PassRegistry&); void initializePPCBranchCoalescingPass(PassRegistry&); - void initializePPCQPXLoadSplatPass(PassRegistry&); void initializePPCBoolRetToIntPass(PassRegistry&); void initializePPCExpandISELPass(PassRegistry &); void initializePPCPreEmitPeepholePass(PassRegistry &); diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index 9ad78bf67fe6c..adb9366217d51 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -132,9 +132,6 @@ def FeaturePPC4xx : SubtargetFeature<"ppc4xx", "IsPPC4xx", "true", "Enable PPC 4xx instructions">; def FeaturePPC6xx : SubtargetFeature<"ppc6xx", "IsPPC6xx", "true", "Enable PPC 6xx instructions">; -def FeatureQPX : SubtargetFeature<"qpx","HasQPX", "true", - "Enable QPX instructions", - [FeatureFPU]>; def FeatureVSX : SubtargetFeature<"vsx","HasVSX", "true", "Enable VSX instructions", [FeatureAltivec]>; @@ -193,7 +190,7 @@ def FeatureFloat128 : def FeaturePOPCNTD : SubtargetFeature<"popcntd","HasPOPCNTD", "POPCNTD_Fast", "Enable the popcnt[dw] instructions">; -// Note that for the a2/a2q processor models we should not use popcnt[dw] by +// Note that for the a2 processor models we should not use popcnt[dw] by // default. These processors do support the instructions, but they're // microcoded, and the software emulation is about twice as fast. 
def FeatureSlowPOPCNTD : SubtargetFeature<"slow-popcntd","HasPOPCNTD", @@ -514,15 +511,6 @@ def : ProcessorModel<"a2", PPCA2Model, FeatureFPRND, FeatureFPCVT, FeatureISEL, FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX, Feature64Bit /*, Feature64BitRegs */, FeatureMFTB]>; -def : ProcessorModel<"a2q", PPCA2Model, - [DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF, - FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES, - FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec, - FeatureSTFIWX, FeatureLFIWAX, - FeatureFPRND, FeatureFPCVT, FeatureISEL, - FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX, - Feature64Bit /*, Feature64BitRegs */, FeatureQPX, - FeatureMFTB]>; def : ProcessorModel<"pwr3", G5Model, [DirectivePwr3, FeatureAltivec, FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF, diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 540e620a845bc..5affddd8d147a 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -549,9 +549,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { if (Subtarget->hasSPE()) { if (PPC::F4RCRegClass.contains(Reg) || PPC::F8RCRegClass.contains(Reg) || - PPC::QBRCRegClass.contains(Reg) || - PPC::QFRCRegClass.contains(Reg) || - PPC::QSRCRegClass.contains(Reg) || PPC::VFRCRegClass.contains(Reg) || PPC::VRRCRegClass.contains(Reg) || PPC::VSFRCRegClass.contains(Reg) || diff --git a/llvm/lib/Target/PowerPC/PPCCallingConv.td b/llvm/lib/Target/PowerPC/PPCCallingConv.td index 1eaa7f7a44b39..9a15490f1fb0d 100644 --- a/llvm/lib/Target/PowerPC/PPCCallingConv.td +++ b/llvm/lib/Target/PowerPC/PPCCallingConv.td @@ -61,9 +61,6 @@ def RetCC_PPC_Cold : CallingConv<[ CCIfType<[f64], CCAssignToReg<[F1]>>, CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2]>>>, - CCIfType<[v4f64, v4f32, v4i1], - CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1]>>>, - CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2]>>> @@ -98,10 +95,6 @@ def RetCC_PPC : CallingConv<[ CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>, - // QPX vectors are returned in QF1 and QF2. - CCIfType<[v4f64, v4f32, v4i1], - CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>, - // Vector types returned as "direct" go into V2 .. V9; note that only the // ELFv2 ABI fully utilizes all these registers. CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], @@ -158,8 +151,6 @@ def RetCC_PPC64_ELF_FIS : CallingConv<[ CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>, - CCIfType<[v4f64, v4f32, v4i1], - CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>, CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>> @@ -223,9 +214,6 @@ def CC_PPC32_SVR4_Common : CallingConv<[ CCIfType<[f32], CCIfSubtarget<"hasSPE()", CCAssignToStack<4, 4>>>, CCIfType<[f64], CCIfSubtarget<"hasSPE()", CCAssignToStack<8, 8>>>, - // QPX vectors that are stored in double precision need 32-byte alignment. - CCIfType<[v4f64, v4i1], CCAssignToStack<32, 32>>, - // Vectors and float128 get 16-byte stack slots that are 16-byte aligned. 
CCIfType<[v16i8, v8i16, v4i32, v4f32, v2f64, v2i64], CCAssignToStack<16, 16>>, CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToStack<16, 16>>> @@ -243,10 +231,6 @@ def CC_PPC32_SVR4_VarArg : CallingConv<[ // put vector arguments in vector registers before putting them on the stack. let Entry = 1 in def CC_PPC32_SVR4 : CallingConv<[ - // QPX vectors mirror the scalar FP convention. - CCIfType<[v4f64, v4f32, v4i1], CCIfSubtarget<"hasQPX()", - CCAssignToReg<[QF1, QF2, QF3, QF4, QF5, QF6, QF7, QF8]>>>, - // The first 12 Vector arguments are passed in AltiVec registers. CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 8ffd89ef5ccd2..3e218e14d8d44 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -4142,7 +4142,7 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) { // Altivec Vector compare instructions do not set any CR register by default and // vector compare operations return the same type as the operands. if (LHS.getValueType().isVector()) { - if (Subtarget->hasQPX() || Subtarget->hasSPE()) + if (Subtarget->hasSPE()) return false; EVT VecVT = LHS.getValueType(); @@ -4813,8 +4813,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) { assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load"); switch (LoadedVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Invalid PPC load type!"); - case MVT::v4f64: Opcode = PPC::QVLFDUX; break; // QPX - case MVT::v4f32: Opcode = PPC::QVLFSUX; break; // QPX case MVT::f64: Opcode = PPC::LFDUX; break; case MVT::f32: Opcode = PPC::LFSUX; break; case MVT::i32: Opcode = PPC::LWZUX; break; @@ -5095,12 +5093,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) { SelectCCOp = PPC::SELECT_CC_F16; else if (Subtarget->hasSPE()) SelectCCOp = PPC::SELECT_CC_SPE; - else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4f64) - SelectCCOp = PPC::SELECT_CC_QFRC; - else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4f32) - SelectCCOp = PPC::SELECT_CC_QSRC; - else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4i1) - SelectCCOp = PPC::SELECT_CC_QBRC; else if (N->getValueType(0) == MVT::v2f64 || N->getValueType(0) == MVT::v2i64) SelectCCOp = PPC::SELECT_CC_VSRC; @@ -5856,9 +5848,6 @@ void PPCDAGToDAGISel::PeepholeCROps() { case PPC::SELECT_I8: case PPC::SELECT_F4: case PPC::SELECT_F8: - case PPC::SELECT_QFRC: - case PPC::SELECT_QSRC: - case PPC::SELECT_QBRC: case PPC::SELECT_SPE: case PPC::SELECT_SPE4: case PPC::SELECT_VRRC: @@ -6177,9 +6166,6 @@ void PPCDAGToDAGISel::PeepholeCROps() { case PPC::SELECT_I8: case PPC::SELECT_F4: case PPC::SELECT_F8: - case PPC::SELECT_QFRC: - case PPC::SELECT_QSRC: - case PPC::SELECT_QBRC: case PPC::SELECT_SPE: case PPC::SELECT_SPE4: case PPC::SELECT_VRRC: diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index ae840a9fa37de..db3833d595797 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1095,161 +1095,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, } } - if (Subtarget.hasQPX()) { - setOperationAction(ISD::FADD, MVT::v4f64, Legal); - setOperationAction(ISD::FSUB, MVT::v4f64, Legal); - setOperationAction(ISD::FMUL, MVT::v4f64, Legal); - setOperationAction(ISD::FREM, MVT::v4f64, Expand); - - setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, 
Legal); - setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand); - - setOperationAction(ISD::LOAD , MVT::v4f64, Custom); - setOperationAction(ISD::STORE , MVT::v4f64, Custom); - - setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom); - - if (!Subtarget.useCRBits()) - setOperationAction(ISD::SELECT, MVT::v4f64, Expand); - setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal); - setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand); - setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand); - setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom); - - setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal); - setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand); - - setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal); - setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal); - - setOperationAction(ISD::FNEG , MVT::v4f64, Legal); - setOperationAction(ISD::FABS , MVT::v4f64, Legal); - setOperationAction(ISD::FSIN , MVT::v4f64, Expand); - setOperationAction(ISD::FCOS , MVT::v4f64, Expand); - setOperationAction(ISD::FPOW , MVT::v4f64, Expand); - setOperationAction(ISD::FLOG , MVT::v4f64, Expand); - setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand); - setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand); - setOperationAction(ISD::FEXP , MVT::v4f64, Expand); - setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand); - - setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal); - setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal); - - setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal); - setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal); - - addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass); - - setOperationAction(ISD::FADD, MVT::v4f32, Legal); - setOperationAction(ISD::FSUB, MVT::v4f32, Legal); - setOperationAction(ISD::FMUL, MVT::v4f32, Legal); - setOperationAction(ISD::FREM, MVT::v4f32, Expand); - - setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal); - setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand); - - setOperationAction(ISD::LOAD , MVT::v4f32, Custom); - setOperationAction(ISD::STORE , MVT::v4f32, Custom); - - if (!Subtarget.useCRBits()) - setOperationAction(ISD::SELECT, MVT::v4f32, Expand); - setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal); - setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand); - setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand); - setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); - - setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal); - setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand); - - setOperationAction(ISD::FNEG , MVT::v4f32, Legal); - setOperationAction(ISD::FABS , MVT::v4f32, Legal); - setOperationAction(ISD::FSIN , MVT::v4f32, Expand); - setOperationAction(ISD::FCOS , MVT::v4f32, Expand); - setOperationAction(ISD::FPOW , MVT::v4f32, Expand); - setOperationAction(ISD::FLOG , MVT::v4f32, Expand); - setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand); - setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand); - 
setOperationAction(ISD::FEXP , MVT::v4f32, Expand); - setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand); - - setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); - - setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal); - setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal); - - addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass); - - setOperationAction(ISD::AND , MVT::v4i1, Legal); - setOperationAction(ISD::OR , MVT::v4i1, Legal); - setOperationAction(ISD::XOR , MVT::v4i1, Legal); - - if (!Subtarget.useCRBits()) - setOperationAction(ISD::SELECT, MVT::v4i1, Expand); - setOperationAction(ISD::VSELECT, MVT::v4i1, Legal); - - setOperationAction(ISD::LOAD , MVT::v4i1, Custom); - setOperationAction(ISD::STORE , MVT::v4i1, Custom); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand); - setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand); - setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); - - setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom); - - addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass); - - setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); - setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); - setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); - setOperationAction(ISD::FROUND, MVT::v4f64, Legal); - - setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); - setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); - setOperationAction(ISD::FROUND, MVT::v4f32, Legal); - - setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand); - setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); - - // These need to set FE_INEXACT, and so cannot be vectorized here. - setOperationAction(ISD::FRINT, MVT::v4f64, Expand); - setOperationAction(ISD::FRINT, MVT::v4f32, Expand); - - if (TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FDIV, MVT::v4f64, Legal); - setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); - - setOperationAction(ISD::FDIV, MVT::v4f32, Legal); - setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); - } else { - setOperationAction(ISD::FDIV, MVT::v4f64, Expand); - setOperationAction(ISD::FSQRT, MVT::v4f64, Expand); - - setOperationAction(ISD::FDIV, MVT::v4f32, Expand); - setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); - } - - // TODO: Handle constrained floating-point operations of v4f64 - } - if (Subtarget.has64BitSupport()) setOperationAction(ISD::PREFETCH, MVT::Other, Legal); @@ -1438,8 +1283,8 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty, // 16byte and wider vectors are passed on 16byte boundary. // The rest is 8 on PPC64 and 4 on PPC32 boundary. Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4); - if (Subtarget.hasAltivec() || Subtarget.hasQPX()) - getMaxByValAlign(Ty, Alignment, Subtarget.hasQPX() ? 
Align(32) : Align(16)); + if (Subtarget.hasAltivec()) + getMaxByValAlign(Ty, Alignment, Align(16)); return Alignment.value(); } @@ -1577,12 +1422,6 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; case PPCISD::VABSD: return "PPCISD::VABSD"; - case PPCISD::QVFPERM: return "PPCISD::QVFPERM"; - case PPCISD::QVGPCI: return "PPCISD::QVGPCI"; - case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI"; - case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI"; - case PPCISD::QBFLT: return "PPCISD::QBFLT"; - case PPCISD::QVLFSb: return "PPCISD::QVLFSb"; case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64"; case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE"; @@ -1601,9 +1440,6 @@ EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C, if (!VT.isVector()) return Subtarget.useCRBits() ? MVT::i1 : MVT::i32; - if (Subtarget.hasQPX()) - return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements()); - return VT.changeVectorElementTypeToInteger(); } @@ -2777,16 +2613,9 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, return false; } - // PowerPC doesn't have preinc load/store instructions for vectors (except - // for QPX, which does have preinc r+r forms). - if (VT.isVector()) { - if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) { - return false; - } else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) { - AM = ISD::PRE_INC; - return true; - } - } + // PowerPC doesn't have preinc load/store instructions for vectors + if (VT.isVector()) + return false; if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) { // Common code will reject creating a pre-inc form if the base pointer @@ -3508,11 +3337,6 @@ static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13}; -/// QFPR - The set of QPX registers that should be allocated for arguments. -static const MCPhysReg QFPR[] = { - PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7, - PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13}; - /// CalculateStackSlotSize - Calculates the size reserved for this argument on /// the stack. static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags, @@ -3542,10 +3366,6 @@ static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 || ArgVT == MVT::v1i128 || ArgVT == MVT::f128) Alignment = Align(16); - // QPX vector types stored in double-precision are padded to a 32 byte - // boundary. - else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1) - Alignment = Align(32); // ByVal parameters are aligned as requested. if (Flags.isByVal()) { @@ -3577,14 +3397,11 @@ static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, /// stack slot (instead of being passed in registers). ArgOffset, /// AvailableFPRs, and AvailableVRs must hold the current argument /// position, and will be updated to account for this argument. 
-static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, - ISD::ArgFlagsTy Flags, - unsigned PtrByteSize, - unsigned LinkageSize, - unsigned ParamAreaSize, - unsigned &ArgOffset, +static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, ISD::ArgFlagsTy Flags, + unsigned PtrByteSize, unsigned LinkageSize, + unsigned ParamAreaSize, unsigned &ArgOffset, unsigned &AvailableFPRs, - unsigned &AvailableVRs, bool HasQPX) { + unsigned &AvailableVRs) { bool UseMemory = false; // Respect alignment of argument on the stack. @@ -3608,11 +3425,7 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, // However, if the argument is actually passed in an FPR or a VR, // we don't use memory after all. if (!Flags.isByVal()) { - if (ArgVT == MVT::f32 || ArgVT == MVT::f64 || - // QPX registers overlap with the scalar FP registers. - (HasQPX && (ArgVT == MVT::v4f32 || - ArgVT == MVT::v4f64 || - ArgVT == MVT::v4i1))) + if (ArgVT == MVT::f32 || ArgVT == MVT::f64) if (AvailableFPRs > 0) { --AvailableFPRs; return false; @@ -3751,18 +3564,12 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( RC = &PPC::VRRCRegClass; break; case MVT::v4f32: - RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass; + RC = &PPC::VRRCRegClass; break; case MVT::v2f64: case MVT::v2i64: RC = &PPC::VRRCRegClass; break; - case MVT::v4f64: - RC = &PPC::QFRCRegClass; - break; - case MVT::v4i1: - RC = &PPC::QBRCRegClass; - break; } SDValue ArgValue; @@ -3961,7 +3768,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( const unsigned Num_GPR_Regs = array_lengthof(GPR); const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13; const unsigned Num_VR_Regs = array_lengthof(VR); - const unsigned Num_QFPR_Regs = Num_FPR_Regs; // Do a first pass over the arguments to determine whether the ABI // guarantees that our caller has allocated the parameter save area @@ -3980,8 +3786,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags, PtrByteSize, LinkageSize, ParamAreaSize, - NumBytes, AvailableFPRs, AvailableVRs, - Subtarget.hasQPX())) + NumBytes, AvailableFPRs, AvailableVRs)) HasParameterArea = true; } @@ -3991,7 +3796,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( unsigned ArgOffset = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; - unsigned &QFPR_idx = FPR_idx; SmallVector MemOps; Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); unsigned CurArgIdx = 0; @@ -4234,51 +4038,20 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::v2i64: case MVT::v1i128: case MVT::f128: - if (!Subtarget.hasQPX()) { - // These can be scalar arguments or elements of a vector array type - // passed directly. The latter are used to implement ELFv2 homogenous - // vector aggregates. - if (VR_idx != Num_VR_Regs) { - unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); - ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); - ++VR_idx; - } else { - if (CallConv == CallingConv::Fast) - ComputeArgOffset(); - needsLoad = true; - } - if (CallConv != CallingConv::Fast || needsLoad) - ArgOffset += 16; - break; - } // not QPX - - assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 && - "Invalid QPX parameter type"); - LLVM_FALLTHROUGH; - - case MVT::v4f64: - case MVT::v4i1: - // QPX vectors are treated like their scalar floating-point subregisters - // (except that they're larger). - unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 
16 : 32; - if (QFPR_idx != Num_QFPR_Regs) { - const TargetRegisterClass *RC; - switch (ObjectVT.getSimpleVT().SimpleTy) { - case MVT::v4f64: RC = &PPC::QFRCRegClass; break; - case MVT::v4f32: RC = &PPC::QSRCRegClass; break; - default: RC = &PPC::QBRCRegClass; break; - } - - unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC); + // These can be scalar arguments or elements of a vector array type + // passed directly. The latter are used to implement ELFv2 homogenous + // vector aggregates. + if (VR_idx != Num_VR_Regs) { + unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); - ++QFPR_idx; + ++VR_idx; } else { if (CallConv == CallingConv::Fast) ComputeArgOffset(); needsLoad = true; } if (CallConv != CallingConv::Fast || needsLoad) - ArgOffset += Sz; + ArgOffset += 16; break; } @@ -4831,10 +4604,9 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget, for (const ISD::OutputArg& Param : Outs) { if (Param.Flags.isNest()) continue; - if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, - PtrByteSize, LinkageSize, ParamAreaSize, - NumBytes, AvailableFPRs, AvailableVRs, - Subtarget.hasQPX())) + if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, PtrByteSize, + LinkageSize, ParamAreaSize, NumBytes, + AvailableFPRs, AvailableVRs)) return true; } return false; @@ -6064,7 +5836,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); unsigned NumBytes = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; - unsigned &QFPR_idx = FPR_idx; static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, @@ -6078,7 +5849,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( const unsigned NumGPRs = array_lengthof(GPR); const unsigned NumFPRs = useSoftFloat() ? 0 : 13; const unsigned NumVRs = array_lengthof(VR); - const unsigned NumQFPRs = NumFPRs; // On ELFv2, we can avoid allocating the parameter area if all the arguments // can be passed to the callee in registers. @@ -6093,9 +5863,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( for (unsigned i = 0; i != NumOps; ++i) { if (Outs[i].Flags.isNest()) continue; if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags, - PtrByteSize, LinkageSize, ParamAreaSize, - NumBytesTmp, AvailableFPRs, AvailableVRs, - Subtarget.hasQPX())) + PtrByteSize, LinkageSize, ParamAreaSize, + NumBytesTmp, AvailableFPRs, AvailableVRs)) HasParameterArea = true; } } @@ -6143,20 +5912,11 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( continue; break; case MVT::v4f32: - // When using QPX, this is handled like a FP register, otherwise, it - // is an Altivec register. - if (Subtarget.hasQPX()) { - if (++NumFPRsUsed <= NumFPRs) - continue; - } else { - if (++NumVRsUsed <= NumVRs) - continue; - } + if (++NumVRsUsed <= NumVRs) + continue; break; case MVT::f32: case MVT::f64: - case MVT::v4f64: // QPX - case MVT::v4i1: // QPX if (++NumFPRsUsed <= NumFPRs) continue; break; @@ -6518,7 +6278,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( case MVT::v2i64: case MVT::v1i128: case MVT::f128: - if (!Subtarget.hasQPX()) { // These can be scalar arguments or elements of a vector array type // passed directly. The latter are used to implement ELFv2 homogenous // vector aggregates. 
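A note for readers following the calling-convention change: the hunks above are the callee and caller halves of the same ABI simplification. With QPX removed, v4f32 always consumes an Altivec VR and only f32/f64 draw down the 13-entry FPR pool, so CalculateStackSlotUsed no longer needs a HasQPX parameter. Below is a minimal standalone model of that register-versus-memory decision; the helper name and ArgKind enum are hypothetical, and the in-tree routine additionally handles byval arguments, alignment, and the running ArgOffset.

    // Sketch: post-QPX argument classification for the 64-bit ELF ABIs.
    // Hypothetical names; the real logic is CalculateStackSlotUsed above.
    enum class ArgKind { F32, F64, V4F32, V2F64 };

    // Returns true when the argument needs a parameter-save-area slot.
    bool needsMemorySlot(ArgKind K, unsigned &FPRsLeft, unsigned &VRsLeft) {
      switch (K) {
      case ArgKind::F32:
      case ArgKind::F64:
        // Only scalar FP draws on the FPR pool now; QPX vectors used to
        // share it because the QF registers overlapped the scalar FPRs.
        if (FPRsLeft > 0) { --FPRsLeft; return false; }
        return true;
      case ArgKind::V4F32: // Always an Altivec/VSX class after this patch.
      case ArgKind::V2F64:
        if (VRsLeft > 0) { --VRsLeft; return false; }
        return true;
      }
      return true;
    }

Seeded with FPRsLeft = 13 and VRsLeft = 12 (V2 through V13), this reproduces the counting that LowerCall_64SVR4 is left with once the QPX branches are gone.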
@@ -6574,63 +6333,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( if (!IsFastCall) ArgOffset += 16; break; - } // not QPX - - assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 && - "Invalid QPX parameter type"); - - LLVM_FALLTHROUGH; - case MVT::v4f64: - case MVT::v4i1: { - bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32; - if (CFlags.IsVarArg) { - assert(HasParameterArea && - "Parameter area must exist if we have a varargs call."); - // We could elide this store in the case where the object fits - // entirely in R registers. Maybe later. - SDValue Store = - DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); - MemOpChains.push_back(Store); - if (QFPR_idx != NumQFPRs) { - SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store, - PtrOff, MachinePointerInfo()); - MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load)); - } - ArgOffset += (IsF32 ? 16 : 32); - for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) { - if (GPR_idx == NumGPRs) - break; - SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, - DAG.getConstant(i, dl, PtrVT)); - SDValue Load = - DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); - MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); - } - break; - } - - // Non-varargs QPX params go into registers or on the stack. - if (QFPR_idx != NumQFPRs) { - RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg)); - } else { - if (IsFastCall) - ComputePtrOff(); - - assert(HasParameterArea && - "Parameter area must exist to pass an argument in memory."); - LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, - true, CFlags.IsTailCall, true, MemOpChains, - TailCallArguments, dl); - if (IsFastCall) - ArgOffset += (IsF32 ? 16 : 32); - } - - if (!IsFastCall) - ArgOffset += (IsF32 ? 16 : 32); - break; - } } } @@ -7301,8 +7003,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( const PPCSubtarget &Subtarget = static_cast(DAG.getSubtarget()); - if (Subtarget.hasQPX()) - report_fatal_error("QPX support is not supported on AIX."); const bool IsPPC64 = Subtarget.isPPC64(); const unsigned PtrByteSize = IsPPC64 ? 
8 : 4; @@ -7522,8 +7222,6 @@ SDValue PPCTargetLowering::LowerCall_AIX( const PPCSubtarget& Subtarget = static_cast(DAG.getSubtarget()); - if (Subtarget.hasQPX()) - report_fatal_error("QPX is not supported on AIX."); if (Subtarget.hasAltivec()) report_fatal_error("Altivec support is unimplemented on AIX."); @@ -7991,8 +7689,6 @@ SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, } SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType().isVector()) - return LowerVectorLoad(Op, DAG); assert(Op.getValueType() == MVT::i1 && "Custom lowering only for i1 loads"); @@ -8016,9 +7712,6 @@ SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { } SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { - if (Op.getOperand(1).getValueType().isVector()) - return LowerVectorStore(Op, DAG); - assert(Op.getOperand(1).getValueType() == MVT::i1 && "Custom lowering only for i1 stores"); @@ -8595,27 +8288,6 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, if (Op.getValueType() == MVT::f128) return Op; - if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) { - if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64) - return SDValue(); - - SDValue Value = Op.getOperand(0); - // The values are now known to be -1 (false) or 1 (true). To convert this - // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). - // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 - Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); - - SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); - - Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); - - if (Op.getValueType() != MVT::v4f64) - Value = DAG.getNode(ISD::FP_ROUND, dl, - Op.getValueType(), Value, - DAG.getIntPtrConstant(1, dl)); - return Value; - } - // Don't handle ppc_fp128 here; let it be lowered to a libcall. if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) return SDValue(); @@ -9184,110 +8856,6 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR"); - if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) { - // We first build an i32 vector, load it into a QPX register, - // then convert it to a floating-point vector and compare it - // to a zero vector to get the boolean result. 
- MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - int FrameIdx = MFI.CreateStackObject(16, Align(16), false); - MachinePointerInfo PtrInfo = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - - assert(BVN->getNumOperands() == 4 && - "BUILD_VECTOR for v4i1 does not have 4 operands"); - - bool IsConst = true; - for (unsigned i = 0; i < 4; ++i) { - if (BVN->getOperand(i).isUndef()) continue; - if (!isa(BVN->getOperand(i))) { - IsConst = false; - break; - } - } - - if (IsConst) { - Constant *One = - ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0); - Constant *NegOne = - ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0); - - Constant *CV[4]; - for (unsigned i = 0; i < 4; ++i) { - if (BVN->getOperand(i).isUndef()) - CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext())); - else if (isNullConstant(BVN->getOperand(i))) - CV[i] = NegOne; - else - CV[i] = One; - } - - Constant *CP = ConstantVector::get(CV); - SDValue CPIdx = - DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), Align(16)); - - SDValue Ops[] = {DAG.getEntryNode(), CPIdx}; - SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other}); - return DAG.getMemIntrinsicNode( - PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); - } - - SmallVector Stores; - for (unsigned i = 0; i < 4; ++i) { - if (BVN->getOperand(i).isUndef()) continue; - - unsigned Offset = 4*i; - SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); - Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); - - unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize(); - if (StoreSize > 4) { - Stores.push_back( - DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx, - PtrInfo.getWithOffset(Offset), MVT::i32)); - } else { - SDValue StoreValue = BVN->getOperand(i); - if (StoreSize < 4) - StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue); - - Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx, - PtrInfo.getWithOffset(Offset))); - } - } - - SDValue StoreChain; - if (!Stores.empty()) - StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); - else - StoreChain = DAG.getEntryNode(); - - // Now load from v4i32 into the QPX register; this will extend it to - // v4i64 but not yet convert it to a floating point. Nevertheless, this - // is typed as v4f64 because the QPX register integer states are not - // explicitly represented. - - SDValue Ops[] = {StoreChain, - DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32), - FIdx}; - SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other}); - - SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, - dl, VTs, Ops, MVT::v4i32, PtrInfo); - LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, - DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32), - LoadedVect); - - SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64); - - return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ); - } - - // All other QPX vectors are handled by generic code. - if (Subtarget.hasQPX()) - return SDValue(); - // Check if this is a splat of a constant value. 
APInt APSplatBits, APSplatUndef; unsigned SplatBitSize; @@ -10080,42 +9648,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, } } - if (Subtarget.hasQPX()) { - if (VT.getVectorNumElements() != 4) - return SDValue(); - - if (V2.isUndef()) V2 = V1; - - int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp); - if (AlignIdx != -1) { - return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2, - DAG.getConstant(AlignIdx, dl, MVT::i32)); - } else if (SVOp->isSplat()) { - int SplatIdx = SVOp->getSplatIndex(); - if (SplatIdx >= 4) { - std::swap(V1, V2); - SplatIdx -= 4; - } - - return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1, - DAG.getConstant(SplatIdx, dl, MVT::i32)); - } - - // Lower this into a qvgpci/qvfperm pair. - - // Compute the qvgpci literal - unsigned idx = 0; - for (unsigned i = 0; i < 4; ++i) { - int m = SVOp->getMaskElt(i); - unsigned mm = m >= 0 ? (unsigned) m : i; - idx |= mm << (3-i)*3; - } - - SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64, - DAG.getConstant(idx, dl, MVT::i32)); - return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3); - } - // Cases that are handled by instructions that take permute immediates // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be // selected by the instruction selector. @@ -10703,279 +10235,6 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, return Op; } -SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - SDNode *N = Op.getNode(); - - assert(N->getOperand(0).getValueType() == MVT::v4i1 && - "Unknown extract_vector_elt type"); - - SDValue Value = N->getOperand(0); - - // The first part of this is like the store lowering except that we don't - // need to track the chain. - - // The values are now known to be -1 (false) or 1 (true). To convert this - // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). - // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 - Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); - - // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to - // understand how to form the extending load. - SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); - - Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); - - // Now convert to an integer and store. - Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, - DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), - Value); - - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - int FrameIdx = MFI.CreateStackObject(16, Align(16), false); - MachinePointerInfo PtrInfo = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - - SDValue StoreChain = DAG.getEntryNode(); - SDValue Ops[] = {StoreChain, - DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), - Value, FIdx}; - SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); - - StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, - dl, VTs, Ops, MVT::v4i32, PtrInfo); - - // Extract the value requested. 
- unsigned Offset = 4*cast(N->getOperand(1))->getZExtValue(); - SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); - Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); - - SDValue IntVal = - DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset)); - - if (!Subtarget.useCRBits()) - return IntVal; - - return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal); -} - -/// Lowering for QPX v4i1 loads -SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - LoadSDNode *LN = cast(Op.getNode()); - SDValue LoadChain = LN->getChain(); - SDValue BasePtr = LN->getBasePtr(); - - if (Op.getValueType() == MVT::v4f64 || - Op.getValueType() == MVT::v4f32) { - EVT MemVT = LN->getMemoryVT(); - unsigned Alignment = LN->getAlignment(); - - // If this load is properly aligned, then it is legal. - if (Alignment >= MemVT.getStoreSize()) - return Op; - - EVT ScalarVT = Op.getValueType().getScalarType(), - ScalarMemVT = MemVT.getScalarType(); - unsigned Stride = ScalarMemVT.getStoreSize(); - - SDValue Vals[4], LoadChains[4]; - for (unsigned Idx = 0; Idx < 4; ++Idx) { - SDValue Load; - if (ScalarVT != ScalarMemVT) - Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain, - BasePtr, - LN->getPointerInfo().getWithOffset(Idx * Stride), - ScalarMemVT, MinAlign(Alignment, Idx * Stride), - LN->getMemOperand()->getFlags(), LN->getAAInfo()); - else - Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr, - LN->getPointerInfo().getWithOffset(Idx * Stride), - MinAlign(Alignment, Idx * Stride), - LN->getMemOperand()->getFlags(), LN->getAAInfo()); - - if (Idx == 0 && LN->isIndexed()) { - assert(LN->getAddressingMode() == ISD::PRE_INC && - "Unknown addressing mode on vector load"); - Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(), - LN->getAddressingMode()); - } - - Vals[Idx] = Load; - LoadChains[Idx] = Load.getValue(1); - - BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, - DAG.getConstant(Stride, dl, - BasePtr.getValueType())); - } - - SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); - SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals); - - if (LN->isIndexed()) { - SDValue RetOps[] = { Value, Vals[0].getValue(1), TF }; - return DAG.getMergeValues(RetOps, dl); - } - - SDValue RetOps[] = { Value, TF }; - return DAG.getMergeValues(RetOps, dl); - } - - assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower"); - assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported"); - - // To lower v4i1 from a byte array, we load the byte elements of the - // vector and then reuse the BUILD_VECTOR logic. 
- - SDValue VectElmts[4], VectElmtChains[4]; - for (unsigned i = 0; i < 4; ++i) { - SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); - Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); - - VectElmts[i] = DAG.getExtLoad( - ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx, - LN->getPointerInfo().getWithOffset(i), MVT::i8, - /* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo()); - VectElmtChains[i] = VectElmts[i].getValue(1); - } - - LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains); - SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts); - - SDValue RVals[] = { Value, LoadChain }; - return DAG.getMergeValues(RVals, dl); -} - -/// Lowering for QPX v4i1 stores -SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - StoreSDNode *SN = cast(Op.getNode()); - SDValue StoreChain = SN->getChain(); - SDValue BasePtr = SN->getBasePtr(); - SDValue Value = SN->getValue(); - - if (Value.getValueType() == MVT::v4f64 || - Value.getValueType() == MVT::v4f32) { - EVT MemVT = SN->getMemoryVT(); - unsigned Alignment = SN->getAlignment(); - - // If this store is properly aligned, then it is legal. - if (Alignment >= MemVT.getStoreSize()) - return Op; - - EVT ScalarVT = Value.getValueType().getScalarType(), - ScalarMemVT = MemVT.getScalarType(); - unsigned Stride = ScalarMemVT.getStoreSize(); - - SDValue Stores[4]; - for (unsigned Idx = 0; Idx < 4; ++Idx) { - SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value, - DAG.getVectorIdxConstant(Idx, dl)); - SDValue Store; - if (ScalarVT != ScalarMemVT) - Store = - DAG.getTruncStore(StoreChain, dl, Ex, BasePtr, - SN->getPointerInfo().getWithOffset(Idx * Stride), - ScalarMemVT, MinAlign(Alignment, Idx * Stride), - SN->getMemOperand()->getFlags(), SN->getAAInfo()); - else - Store = DAG.getStore(StoreChain, dl, Ex, BasePtr, - SN->getPointerInfo().getWithOffset(Idx * Stride), - MinAlign(Alignment, Idx * Stride), - SN->getMemOperand()->getFlags(), SN->getAAInfo()); - - if (Idx == 0 && SN->isIndexed()) { - assert(SN->getAddressingMode() == ISD::PRE_INC && - "Unknown addressing mode on vector store"); - Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(), - SN->getAddressingMode()); - } - - BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, - DAG.getConstant(Stride, dl, - BasePtr.getValueType())); - Stores[Idx] = Store; - } - - SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); - - if (SN->isIndexed()) { - SDValue RetOps[] = { TF, Stores[0].getValue(1) }; - return DAG.getMergeValues(RetOps, dl); - } - - return TF; - } - - assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported"); - assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower"); - - // The values are now known to be -1 (false) or 1 (true). To convert this - // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). - // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 - Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); - - // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to - // understand how to form the extending load. - SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); - - Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); - - // Now convert to an integer and store. 
- Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, - DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), - Value); - - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - int FrameIdx = MFI.CreateStackObject(16, Align(16), false); - MachinePointerInfo PtrInfo = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - - SDValue Ops[] = {StoreChain, - DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), - Value, FIdx}; - SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); - - StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, - dl, VTs, Ops, MVT::v4i32, PtrInfo); - - // Move data into the byte array. - SDValue Loads[4], LoadChains[4]; - for (unsigned i = 0; i < 4; ++i) { - unsigned Offset = 4*i; - SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); - Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); - - Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx, - PtrInfo.getWithOffset(Offset)); - LoadChains[i] = Loads[i].getValue(1); - } - - StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); - - SDValue Stores[4]; - for (unsigned i = 0; i < 4; ++i) { - SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); - Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); - - Stores[i] = DAG.getTruncStore( - StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i), - MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(), - SN->getAAInfo()); - } - - StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); - - return StoreChain; -} - SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); if (Op.getValueType() == MVT::v4i32) { @@ -11204,7 +10463,6 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); - case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::ABS: return LowerABS(Op, DAG); @@ -12148,9 +11406,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } else if (MI.getOpcode() == PPC::SELECT_CC_F4 || MI.getOpcode() == PPC::SELECT_CC_F8 || MI.getOpcode() == PPC::SELECT_CC_F16 || - MI.getOpcode() == PPC::SELECT_CC_QFRC || - MI.getOpcode() == PPC::SELECT_CC_QSRC || - MI.getOpcode() == PPC::SELECT_CC_QBRC || MI.getOpcode() == PPC::SELECT_CC_VRRC || MI.getOpcode() == PPC::SELECT_CC_VSFRC || MI.getOpcode() == PPC::SELECT_CC_VSSRC || @@ -12160,9 +11415,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 || MI.getOpcode() == PPC::SELECT_F16 || - MI.getOpcode() == PPC::SELECT_QFRC || - MI.getOpcode() == PPC::SELECT_QSRC || - MI.getOpcode() == PPC::SELECT_QBRC || MI.getOpcode() == PPC::SELECT_SPE || MI.getOpcode() == PPC::SELECT_SPE4 || MI.getOpcode() == PPC::SELECT_VRRC || @@ -12200,9 +11452,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.getOpcode() == PPC::SELECT_F16 || MI.getOpcode() == PPC::SELECT_SPE4 || MI.getOpcode() == PPC::SELECT_SPE || - MI.getOpcode() == PPC::SELECT_QFRC || - MI.getOpcode() == PPC::SELECT_QSRC || - 
MI.getOpcode() == PPC::SELECT_QBRC || MI.getOpcode() == PPC::SELECT_VRRC || MI.getOpcode() == PPC::SELECT_VSFRC || MI.getOpcode() == PPC::SELECT_VSSRC || @@ -12895,9 +12144,7 @@ SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) || (VT == MVT::f64 && Subtarget.hasFRSQRTE()) || (VT == MVT::v4f32 && Subtarget.hasAltivec()) || - (VT == MVT::v2f64 && Subtarget.hasVSX()) || - (VT == MVT::v4f32 && Subtarget.hasQPX()) || - (VT == MVT::v4f64 && Subtarget.hasQPX())) { + (VT == MVT::v2f64 && Subtarget.hasVSX())) { if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); @@ -12916,9 +12163,7 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG, if ((VT == MVT::f32 && Subtarget.hasFRES()) || (VT == MVT::f64 && Subtarget.hasFRE()) || (VT == MVT::v4f32 && Subtarget.hasAltivec()) || - (VT == MVT::v2f64 && Subtarget.hasVSX()) || - (VT == MVT::v4f32 && Subtarget.hasQPX()) || - (VT == MVT::v4f64 && Subtarget.hasQPX())) { + (VT == MVT::v2f64 && Subtarget.hasVSX())) { if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand); @@ -13016,24 +12261,6 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, EVT VT; switch (cast(N->getOperand(1))->getZExtValue()) { default: return false; - case Intrinsic::ppc_qpx_qvlfd: - case Intrinsic::ppc_qpx_qvlfda: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvlfs: - case Intrinsic::ppc_qpx_qvlfsa: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvlfcd: - case Intrinsic::ppc_qpx_qvlfcda: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvlfcs: - case Intrinsic::ppc_qpx_qvlfcsa: - VT = MVT::v2f32; - break; - case Intrinsic::ppc_qpx_qvlfiwa: - case Intrinsic::ppc_qpx_qvlfiwz: case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: case Intrinsic::ppc_vsx_lxvw4x: @@ -13062,24 +12289,6 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, EVT VT; switch (cast(N->getOperand(1))->getZExtValue()) { default: return false; - case Intrinsic::ppc_qpx_qvstfd: - case Intrinsic::ppc_qpx_qvstfda: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvstfs: - case Intrinsic::ppc_qpx_qvstfsa: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvstfcd: - case Intrinsic::ppc_qpx_qvstfcda: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvstfcs: - case Intrinsic::ppc_qpx_qvstfcsa: - VT = MVT::v2f32; - break; - case Intrinsic::ppc_qpx_qvstfiw: - case Intrinsic::ppc_qpx_qvstfiwa: case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: case Intrinsic::ppc_vsx_stxvw4x: @@ -15077,18 +14286,14 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, EVT MemVT = LD->getMemoryVT(); Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); Align ABIAlignment = DAG.getDataLayout().getABITypeAlign(Ty); - Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext()); - Align ScalarABIAlignment = DAG.getDataLayout().getABITypeAlign(STy); if (LD->isUnindexed() && VT.isVector() && ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) && // P8 and later hardware should just use LOAD. 
!Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || - VT == MVT::v4f32)) || - (Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) && - LD->getAlign() >= ScalarABIAlignment)) && + VT == MVT::v4f32))) && LD->getAlign() < ABIAlignment) { - // This is a type-legal unaligned Altivec or QPX load. + // This is a type-legal unaligned Altivec load. SDValue Chain = LD->getChain(); SDValue Ptr = LD->getBasePtr(); bool isLittleEndian = Subtarget.isLittleEndian(); @@ -15119,24 +14324,13 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // optimization later. Intrinsic::ID Intr, IntrLD, IntrPerm; MVT PermCntlTy, PermTy, LDTy; - if (Subtarget.hasAltivec()) { - Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr : - Intrinsic::ppc_altivec_lvsl; - IntrLD = Intrinsic::ppc_altivec_lvx; - IntrPerm = Intrinsic::ppc_altivec_vperm; - PermCntlTy = MVT::v16i8; - PermTy = MVT::v4i32; - LDTy = MVT::v4i32; - } else { - Intr = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld : - Intrinsic::ppc_qpx_qvlpcls; - IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd : - Intrinsic::ppc_qpx_qvlfs; - IntrPerm = Intrinsic::ppc_qpx_qvfperm; - PermCntlTy = MVT::v4f64; - PermTy = MVT::v4f64; - LDTy = MemVT.getSimpleVT(); - } + Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr + : Intrinsic::ppc_altivec_lvsl; + IntrLD = Intrinsic::ppc_altivec_lvx; + IntrPerm = Intrinsic::ppc_altivec_vperm; + PermCntlTy = MVT::v16i8; + PermTy = MVT::v4i32; + LDTy = MVT::v4i32; SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy); @@ -15207,10 +14401,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, BaseLoad, ExtraLoad, PermCntl, DAG, dl); if (VT != PermTy) - Perm = Subtarget.hasAltivec() ? - DAG.getNode(ISD::BITCAST, dl, VT, Perm) : - DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX - DAG.getTargetConstant(1, dl, MVT::i64)); + Perm = Subtarget.hasAltivec() + ? DAG.getNode(ISD::BITCAST, dl, VT, Perm) + : DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, + DAG.getTargetConstant(1, dl, MVT::i64)); // second argument is 1 because this rounding // is always exact. @@ -15226,14 +14420,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, unsigned IID = cast(N->getOperand(0))->getZExtValue(); Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr : Intrinsic::ppc_altivec_lvsl); - if ((IID == Intr || - IID == Intrinsic::ppc_qpx_qvlpcld || - IID == Intrinsic::ppc_qpx_qvlpcls) && - N->getOperand(1)->getOpcode() == ISD::ADD) { + if (IID == Intr && N->getOperand(1)->getOpcode() == ISD::ADD) { SDValue Add = N->getOperand(1); - int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ? - 5 /* 32 byte alignment */ : 4 /* 16 byte alignment */; + int Bits = 4 /* 16 byte alignment */; if (DAG.MaskedValueIsZero(Add->getOperand(1), APInt::getAllOnesValue(Bits /* alignment */) @@ -15243,7 +14433,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, UE = BasePtr->use_end(); UI != UE; ++UI) { if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && - cast(UI->getOperand(0))->getZExtValue() == IID) { + cast(UI->getOperand(0))->getZExtValue() == + IID) { // We've found another LVSL/LVSR, and this address is an aligned // multiple of that one. The results will be the same, so use the // one we've just found instead. 
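The final DAG-combine hunk above keeps only the Altivec recipe for type-legal unaligned vector loads: a permute-control mask from lvsl (lvsr on little-endian), two aligned lvx loads bracketing the address, and a vperm to splice the pieces. The same idiom can be written directly with <altivec.h> intrinsics; the sketch below shows the classic big-endian lvsl form, which is the pre-POWER8 sequence this combine stops emitting once hasP8Vector() is true.

    #include <altivec.h>

    // Unaligned 16-byte load built from two aligned loads plus a permute.
    vector float load_unaligned(const float *P) {
      vector float Lo = vec_ld(0, P);          // aligned load at or below P
      vector float Hi = vec_ld(15, P);         // aligned load covering the tail
      vector unsigned char M = vec_lvsl(0, P); // shift mask from P's low bits
      return vec_perm(Lo, Hi, M);              // splice the two halves
    }

The 15-byte offset on the second load is deliberate: lvx ignores the low address bits, so when P is already 16-byte aligned this re-reads the first quadword rather than touching the next page, the same trick the combine relies on for its second load.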
@@ -15792,17 +14983,9 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &PPC::F4RCRegClass); if (VT == MVT::f64 || VT == MVT::i64) return std::make_pair(0U, &PPC::F8RCRegClass); - if (VT == MVT::v4f64 && Subtarget.hasQPX()) - return std::make_pair(0U, &PPC::QFRCRegClass); - if (VT == MVT::v4f32 && Subtarget.hasQPX()) - return std::make_pair(0U, &PPC::QSRCRegClass); } break; case 'v': - if (VT == MVT::v4f64 && Subtarget.hasQPX()) - return std::make_pair(0U, &PPC::QFRCRegClass); - if (VT == MVT::v4f32 && Subtarget.hasQPX()) - return std::make_pair(0U, &PPC::QSRCRegClass); if (Subtarget.hasAltivec()) return std::make_pair(0U, &PPC::VRRCRegClass); break; @@ -16094,12 +15277,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineFunction &MF, unsigned Intrinsic) const { switch (Intrinsic) { - case Intrinsic::ppc_qpx_qvlfd: - case Intrinsic::ppc_qpx_qvlfs: - case Intrinsic::ppc_qpx_qvlfcd: - case Intrinsic::ppc_qpx_qvlfcs: - case Intrinsic::ppc_qpx_qvlfiwa: - case Intrinsic::ppc_qpx_qvlfiwz: case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: case Intrinsic::ppc_altivec_lvebx: @@ -16121,18 +15298,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::ppc_vsx_lxvd2x: VT = MVT::v2f64; break; - case Intrinsic::ppc_qpx_qvlfd: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvlfs: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvlfcd: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvlfcs: - VT = MVT::v2f32; - break; default: VT = MVT::v4i32; break; @@ -16147,45 +15312,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOLoad; return true; } - case Intrinsic::ppc_qpx_qvlfda: - case Intrinsic::ppc_qpx_qvlfsa: - case Intrinsic::ppc_qpx_qvlfcda: - case Intrinsic::ppc_qpx_qvlfcsa: - case Intrinsic::ppc_qpx_qvlfiwaa: - case Intrinsic::ppc_qpx_qvlfiwza: { - EVT VT; - switch (Intrinsic) { - case Intrinsic::ppc_qpx_qvlfda: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvlfsa: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvlfcda: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvlfcsa: - VT = MVT::v2f32; - break; - default: - VT = MVT::v4i32; - break; - } - - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = VT; - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.size = VT.getStoreSize(); - Info.align = Align(1); - Info.flags = MachineMemOperand::MOLoad; - return true; - } - case Intrinsic::ppc_qpx_qvstfd: - case Intrinsic::ppc_qpx_qvstfs: - case Intrinsic::ppc_qpx_qvstfcd: - case Intrinsic::ppc_qpx_qvstfcs: - case Intrinsic::ppc_qpx_qvstfiw: case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: case Intrinsic::ppc_altivec_stvebx: @@ -16207,18 +15333,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::ppc_vsx_stxvd2x: VT = MVT::v2f64; break; - case Intrinsic::ppc_qpx_qvstfd: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvstfs: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvstfcd: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvstfcs: - VT = MVT::v2f32; - break; default: VT = MVT::v4i32; break; @@ -16233,39 +15347,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOStore; return true; } - case Intrinsic::ppc_qpx_qvstfda: - case Intrinsic::ppc_qpx_qvstfsa: - case Intrinsic::ppc_qpx_qvstfcda: - case Intrinsic::ppc_qpx_qvstfcsa: - case Intrinsic::ppc_qpx_qvstfiwa: { - 
EVT VT; - switch (Intrinsic) { - case Intrinsic::ppc_qpx_qvstfda: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvstfsa: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvstfcda: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvstfcsa: - VT = MVT::v2f32; - break; - default: - VT = MVT::v4i32; - break; - } - - Info.opc = ISD::INTRINSIC_VOID; - Info.memVT = VT; - Info.ptrVal = I.getArgOperand(1); - Info.offset = 0; - Info.size = VT.getStoreSize(); - Info.align = Align(1); - Info.flags = MachineMemOperand::MOStore; - return true; - } default: break; } @@ -16278,14 +15359,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, EVT PPCTargetLowering::getOptimalMemOpType( const MemOp &Op, const AttributeList &FuncAttributes) const { if (getTargetMachine().getOptLevel() != CodeGenOpt::None) { - // When expanding a memset, require at least two QPX instructions to cover - // the cost of loading the value to be stored from the constant pool. - if (Subtarget.hasQPX() && Op.size() >= 32 && - (Op.isMemcpy() || Op.size() >= 64) && Op.isAligned(Align(32)) && - !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { - return MVT::v4f64; - } - // We should use Altivec/VSX loads and stores when available. For unaligned // addresses, unaligned VSX loads are only fast starting with the P8. if (Subtarget.hasAltivec() && Op.size() >= 16 && @@ -16504,7 +15577,7 @@ PPCTargetLowering::shouldExpandBuildVectorWithShuffles( if (VT == MVT::v2i64) return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves - if (Subtarget.hasVSX() || Subtarget.hasQPX()) + if (Subtarget.hasVSX()) return true; return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues); @@ -16550,8 +15623,7 @@ SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, switch (Opc) { case PPCISD::FNMSUB: - // TODO: QPX subtarget is deprecated. No transformation here. - if (!Op.hasOneUse() || !isTypeLegal(VT) || Subtarget.hasQPX()) + if (!Op.hasOneUse() || !isTypeLegal(VT)) break; const TargetOptions &Options = getTargetMachine().Options; @@ -17032,8 +16104,7 @@ SDValue PPCTargetLowering::combineFMALike(SDNode *N, bool LegalOps = !DCI.isBeforeLegalizeOps(); SDLoc Loc(N); - // TODO: QPX subtarget is deprecated. No transformation here. - if (Subtarget.hasQPX() || !isOperationLegal(ISD::FMA, VT)) + if (!isOperationLegal(ISD::FMA, VT)) return SDValue(); // Allowing transformation to FNMSUB may change sign of zeroes when ab-c=0 diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 768eaa43e0135..8cc42226d7f0b 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -427,22 +427,6 @@ namespace llvm { /// => VABSDUW((XVNEGSP a), (XVNEGSP b)) VABSD, - /// QVFPERM = This corresponds to the QPX qvfperm instruction. - QVFPERM, - - /// QVGPCI = This corresponds to the QPX qvgpci instruction. - QVGPCI, - - /// QVALIGNI = This corresponds to the QPX qvaligni instruction. - QVALIGNI, - - /// QVESPLATI = This corresponds to the QPX qvesplati instruction. - QVESPLATI, - - /// QBFLT = Access the underlying QPX floating-point boolean - /// representation. - QBFLT, - /// FP_EXTEND_HALF(VECTOR, IDX) - Custom extend upper (IDX=0) half or /// lower (IDX=1) half of v4f32 to v2f64. FP_EXTEND_HALF, @@ -519,10 +503,6 @@ namespace llvm { /// Store scalar integers from VSR. ST_VSR_SCAL_INT, - /// QBRC, CHAIN = QVLFSb CHAIN, Ptr - /// The 4xf32 load used for v4i1 constants. 
- QVLFSb, - /// ATOMIC_CMP_SWAP - the exact same as the target-independent nodes /// except they ensure that the compare input is zero-extended for /// sub-word versions because the atomic loads zero-extend. diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td index 632d4d9deb8a2..5ff5fc78326ba 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td @@ -642,7 +642,6 @@ class XForm_17a opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, let FRA = 0; } -// Used for QPX class XForm_18 opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list pattern> : I { @@ -1781,14 +1780,6 @@ class AForm_4 opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, let Inst{31} = 0; } -// Used for QPX -class AForm_4a opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, - InstrItinClass itin, list pattern> - : AForm_1 { - let FRA = 0; - let FRC = 0; -} - // 1.7.13 M-Form class MForm_1 opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list pattern> @@ -2099,49 +2090,6 @@ class VX_RD5_RSp5_PS1_XO9 xo, dag OOL, dag IOL, string asmstr, let Inst{23-31} = xo; } -// Z23-Form (used by QPX) -class Z23Form_1 opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, - InstrItinClass itin, list pattern> - : I { - bits<5> FRT; - bits<5> FRA; - bits<5> FRB; - bits<2> idx; - - let Pattern = pattern; - - bit RC = 0; // set by isRecordForm - - let Inst{6-10} = FRT; - let Inst{11-15} = FRA; - let Inst{16-20} = FRB; - let Inst{21-22} = idx; - let Inst{23-30} = xo; - let Inst{31} = RC; -} - -class Z23Form_2 opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, - InstrItinClass itin, list pattern> - : Z23Form_1 { - let FRB = 0; -} - -class Z23Form_3 opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, - InstrItinClass itin, list pattern> - : I { - bits<5> FRT; - bits<12> idx; - - let Pattern = pattern; - - bit RC = 0; // set by isRecordForm - - let Inst{6-10} = FRT; - let Inst{11-22} = idx; - let Inst{23-30} = xo; - let Inst{31} = RC; -} - class Z23Form_8 opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list pattern> : I { diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 99e25bb130ce4..d4c3c5f5504c7 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -259,16 +259,6 @@ bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { case PPC::XVMULDP: case PPC::XVMULSP: case PPC::XSMULSP: - // QPX Add: - case PPC::QVFADD: - case PPC::QVFADDS: - case PPC::QVFADDSs: - // QPX Multiply: - case PPC::QVFMUL: - case PPC::QVFMULS: - case PPC::QVFMULSs: - return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && - Inst.getFlag(MachineInstr::MIFlag::FmNsz); // Fixed point: // Multiply: case PPC::MULHD: @@ -300,9 +290,7 @@ static const uint16_t FMAOpIdxInfo[][5] = { {PPC::XVMADDADP, PPC::XVADDDP, PPC::XVMULDP, 1, 2}, {PPC::XVMADDASP, PPC::XVADDSP, PPC::XVMULSP, 1, 2}, {PPC::FMADD, PPC::FADD, PPC::FMUL, 3, 1}, - {PPC::FMADDS, PPC::FADDS, PPC::FMULS, 3, 1}, - {PPC::QVFMADDSs, PPC::QVFADDSs, PPC::QVFMULSs, 3, 1}, - {PPC::QVFMADD, PPC::QVFADD, PPC::QVFMUL, 3, 1}}; + {PPC::FMADDS, PPC::FADDS, PPC::FMULS, 3, 1}}; // Check if an opcode is a FMA instruction. If it is, return the index in array // FMAOpIdxInfo. Otherwise, return -1. 
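The table above is keyed on column 0: each row pairs an FMA opcode with the add and multiply opcodes it decomposes into, plus the operand indices used when reassociating, so deleting the two QVF rows is the entire QPX change needed here. The lookup the comment describes is straightforward; a sketch of its assumed shape (only the behavior, not the body, is documented in the source):

    // Return the row index in FMAOpIdxInfo for an FMA opcode, or -1 if the
    // opcode is not an FMA instruction.
    static int getFMAOpIdxInfo(unsigned Opcode) {
      const unsigned NumRows = sizeof(FMAOpIdxInfo) / sizeof(FMAOpIdxInfo[0]);
      for (unsigned I = 0; I < NumRows; ++I)
        if (FMAOpIdxInfo[I][0] == Opcode) // column 0 holds the FMA opcode
          return I;
      return -1;
    }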
@@ -666,7 +654,6 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case PPC::LI8: case PPC::LIS: case PPC::LIS8: - case PPC::QVGPCI: case PPC::ADDIStocHA: case PPC::ADDIStocHA8: case PPC::ADDItocL: @@ -1343,12 +1330,6 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, else if (PPC::VSFRCRegClass.contains(DestReg, SrcReg) || PPC::VSSRCRegClass.contains(DestReg, SrcReg)) Opc = (Subtarget.hasP9Vector()) ? PPC::XSCPSGNDP : PPC::XXLORf; - else if (PPC::QFRCRegClass.contains(DestReg, SrcReg)) - Opc = PPC::QVFMR; - else if (PPC::QSRCRegClass.contains(DestReg, SrcReg)) - Opc = PPC::QVFMRs; - else if (PPC::QBRCRegClass.contains(DestReg, SrcReg)) - Opc = PPC::QVFMRb; else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg)) Opc = PPC::CROR; else if (PPC::SPERCRegClass.contains(DestReg, SrcReg)) @@ -1393,12 +1374,6 @@ static unsigned getSpillIndex(const TargetRegisterClass *RC) { OpcodeIndex = SOK_VectorFloat4Spill; } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_VRSaveSpill; - } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_QuadFloat8Spill; - } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_QuadFloat4Spill; - } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_QuadBitSpill; } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_SpillToVSR; } else { diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index 43973c627fcf1..bdcfa76505daf 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -123,9 +123,6 @@ enum SpillOpcodeKey { SOK_VectorFloat8Spill, SOK_VectorFloat4Spill, SOK_VRSaveSpill, - SOK_QuadFloat8Spill, - SOK_QuadFloat4Spill, - SOK_QuadBitSpill, SOK_SpillToVSR, SOK_SPESpill, SOK_LastOpcodeSpill // This must be last on the enum. @@ -136,32 +133,28 @@ enum SpillOpcodeKey { { \ PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXVD2X, PPC::LXSDX, PPC::LXSSPX, \ - PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, PPC::QVLFDXb, \ - PPC::SPILLTOVSR_LD, PPC::EVLDD \ + PPC::RESTORE_VRSAVE, PPC::SPILLTOVSR_LD, PPC::EVLDD \ } #define Pwr9LoadOpcodes \ { \ PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, \ - PPC::DFLOADf32, PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, \ - PPC::QVLFDXb, PPC::SPILLTOVSR_LD \ + PPC::DFLOADf32, PPC::RESTORE_VRSAVE, PPC::SPILLTOVSR_LD \ } #define Pwr8StoreOpcodes \ { \ PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \ PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX, PPC::SPILL_VRSAVE, \ - PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb, PPC::SPILLTOVSR_ST, \ - PPC::EVSTDD \ + PPC::SPILLTOVSR_ST, PPC::EVSTDD \ } #define Pwr9StoreOpcodes \ { \ PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \ PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, \ - PPC::SPILL_VRSAVE, PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb, \ - PPC::SPILLTOVSR_ST \ + PPC::SPILL_VRSAVE, PPC::SPILLTOVSR_ST \ } // Initialize arrays for load and store spill opcodes on supported subtargets. 
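The load/store opcode lists and the SpillOpcodeKey enum above must stay in lockstep, because each list is indexed by a SOK_* value; that is why removing the three SOK_Quad* keys has to be paired with removing exactly the QVLFDX/QVLFSXs/QVLFDXb (and matching QVSTF*) slots from every list. A sketch of how the tables are consumed, with an assumed array name (the real initialization lives outside this hunk):

    // Each macro expands to a braced initializer whose element order matches
    // SpillOpcodeKey; Pwr8LoadArray is an assumed name for the array built
    // from it.
    static const uint16_t Pwr8LoadArray[SOK_LastOpcodeSpill] = Pwr8LoadOpcodes;

    static unsigned getLoadOpcodeForSpill(const TargetRegisterClass *RC) {
      unsigned Index = getSpillIndex(RC); // yields a SpillOpcodeKey value
      return Pwr8LoadArray[Index];        // list order must match the enum
    }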
@@ -273,10 +266,10 @@ class PPCInstrInfo : public PPCGenInstrInfo { } static bool isSameClassPhysRegCopy(unsigned Opcode) { - unsigned CopyOpcodes[] = - { PPC::OR, PPC::OR8, PPC::FMR, PPC::VOR, PPC::XXLOR, PPC::XXLORf, - PPC::XSCPSGNDP, PPC::MCRF, PPC::QVFMR, PPC::QVFMRs, PPC::QVFMRb, - PPC::CROR, PPC::EVOR, -1U }; + unsigned CopyOpcodes[] = {PPC::OR, PPC::OR8, PPC::FMR, + PPC::VOR, PPC::XXLOR, PPC::XXLORf, + PPC::XSCPSGNDP, PPC::MCRF, PPC::CROR, + PPC::EVOR, -1U}; for (int i = 0; CopyOpcodes[i] != -1U; i++) if (Opcode == CopyOpcodes[i]) return true; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index c565758973bf5..83a434f5e793a 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -203,16 +203,6 @@ def PPCvecinsert : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsert, []>; def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>; def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>; -def PPCqvfperm : SDNode<"PPCISD::QVFPERM", SDT_PPCqvfperm, []>; -def PPCqvgpci : SDNode<"PPCISD::QVGPCI", SDT_PPCqvgpci, []>; -def PPCqvaligni : SDNode<"PPCISD::QVALIGNI", SDT_PPCqvaligni, []>; -def PPCqvesplati : SDNode<"PPCISD::QVESPLATI", SDT_PPCqvesplati, []>; - -def PPCqbflt : SDNode<"PPCISD::QBFLT", SDT_PPCqbflt, []>; - -def PPCqvlfsb : SDNode<"PPCISD::QVLFSb", SDT_PPCqvlfsb, - [SDNPHasChain, SDNPMayLoad]>; - def PPCcmpb : SDNode<"PPCISD::CMPB", SDTIntBinOp, []>; // These nodes represent the 32-bit PPC shifts that operate on 6-bit shift @@ -3461,7 +3451,6 @@ include "PPCInstrAltivec.td" include "PPCInstrSPE.td" include "PPCInstr64Bit.td" include "PPCInstrVSX.td" -include "PPCInstrQPX.td" include "PPCInstrHTM.td" def crnot : OutPatFrag<(ops node:$in), diff --git a/llvm/lib/Target/PowerPC/PPCInstrQPX.td b/llvm/lib/Target/PowerPC/PPCInstrQPX.td deleted file mode 100644 index 2265af2815cb5..0000000000000 --- a/llvm/lib/Target/PowerPC/PPCInstrQPX.td +++ /dev/null @@ -1,1212 +0,0 @@ -//===- PPCInstrQPX.td - The PowerPC QPX Extension --*- tablegen -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file describes the QPX extension to the PowerPC instruction set. -// Reference: -// Book Q: QPX Architecture Definition. IBM (as updated in) 2011. -// -//===----------------------------------------------------------------------===// - -def PPCRegQFRCAsmOperand : AsmOperandClass { - let Name = "RegQFRC"; let PredicateMethod = "isRegNumber"; -} -def qfrc : RegisterOperand { - let ParserMatchClass = PPCRegQFRCAsmOperand; -} -def PPCRegQSRCAsmOperand : AsmOperandClass { - let Name = "RegQSRC"; let PredicateMethod = "isRegNumber"; -} -def qsrc : RegisterOperand { - let ParserMatchClass = PPCRegQSRCAsmOperand; -} -def PPCRegQBRCAsmOperand : AsmOperandClass { - let Name = "RegQBRC"; let PredicateMethod = "isRegNumber"; -} -def qbrc : RegisterOperand { - let ParserMatchClass = PPCRegQBRCAsmOperand; -} - -//===----------------------------------------------------------------------===// -// Helpers for defining instructions that directly correspond to intrinsics. - -// QPXA1_Int - A AForm_1 intrinsic definition. -class QPXA1_Int opcode, bits<5> xo, string opc, Intrinsic IntID> - : AForm_1; -// QPXA1s_Int - A AForm_1 intrinsic definition (simple instructions). 
-class QPXA1s_Int opcode, bits<5> xo, string opc, Intrinsic IntID> - : AForm_1; -// QPXA2_Int - A AForm_2 intrinsic definition. -class QPXA2_Int opcode, bits<5> xo, string opc, Intrinsic IntID> - : AForm_2; -// QPXA3_Int - A AForm_3 intrinsic definition. -class QPXA3_Int opcode, bits<5> xo, string opc, Intrinsic IntID> - : AForm_3; -// QPXA4_Int - A AForm_4a intrinsic definition. -class QPXA4_Int opcode, bits<5> xo, string opc, Intrinsic IntID> - : AForm_4a; -// QPXX18_Int - A XForm_18 intrinsic definition. -class QPXX18_Int opcode, bits<10> xo, string opc, Intrinsic IntID> - : XForm_18; -// QPXX19_Int - A XForm_19 intrinsic definition. -class QPXX19_Int opcode, bits<10> xo, string opc, Intrinsic IntID> - : XForm_19; - -//===----------------------------------------------------------------------===// -// Pattern Frags. - -def extloadv4f32 : PatFrag<(ops node:$ptr), (extload node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::v4f32; -}]>; - -def truncstorev4f32 : PatFrag<(ops node:$val, node:$ptr), - (truncstore node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::v4f32; -}]>; -def pre_truncstv4f32 : PatFrag<(ops node:$val, node:$base, node:$offset), - (pre_truncst node:$val, - node:$base, node:$offset), [{ - return cast(N)->getMemoryVT() == MVT::v4f32; -}]>; - -def fround_inexact : PatFrag<(ops node:$val), (fpround node:$val), [{ - return cast(N->getOperand(1))->getZExtValue() == 0; -}]>; - -def fround_exact : PatFrag<(ops node:$val), (fpround node:$val), [{ - return cast(N->getOperand(1))->getZExtValue() == 1; -}]>; - -let FastIselShouldIgnore = 1 in // FastIsel should ignore all u12 instrs. - def u12 : ImmLeaf; - -//===----------------------------------------------------------------------===// -// Instruction Definitions. - -def HasQPX : Predicate<"Subtarget->hasQPX()">; -let Predicates = [HasQPX] in { -let DecoderNamespace = "QPX" in { -let hasSideEffects = 0 in { // QPX instructions don't have side effects. 
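The PatFrag definitions a few lines up attach C++ predicates to generic DAG nodes: extloadv4f32 matches only an extending load whose in-memory type is v4f32, which is what lets the qvlfsx pattern further down select a single-precision load that widens to v4f64. The inline predicate casts the node to LoadSDNode; written out standalone, it is roughly this (a sketch, assuming the usual LLVM casting and SelectionDAG headers):

    // Standalone equivalent of the extloadv4f32 predicate: true only for
    // loads whose memory VT is v4f32.
    static bool isMemVTv4f32Load(const llvm::SDNode *N) {
      if (const auto *LD = llvm::dyn_cast<llvm::LoadSDNode>(N))
        return LD->getMemoryVT() == llvm::MVT::v4f32;
      return false;
    }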
-let Uses = [RM] in { - // Add Instructions - let isCommutable = 1 in { - def QVFADD : AForm_2<4, 21, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfadd $FRT, $FRA, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (fadd v4f64:$FRA, v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFADDS : QPXA2_Int<0, 21, "qvfadds", int_ppc_qpx_qvfadds>; - def QVFADDSs : AForm_2<0, 21, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfadds $FRT, $FRA, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fadd v4f32:$FRA, v4f32:$FRB))]>; - } - def QVFSUB : AForm_2<4, 20, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfsub $FRT, $FRA, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (fsub v4f64:$FRA, v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFSUBS : QPXA2_Int<0, 20, "qvfsubs", int_ppc_qpx_qvfsubs>; - def QVFSUBSs : AForm_2<0, 20, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfsubs $FRT, $FRA, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fsub v4f32:$FRA, v4f32:$FRB))]>; - - // Estimate Instructions - def QVFRE : AForm_4a<4, 24, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfre $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (PPCfre v4f64:$FRB))]>; - def QVFRES : QPXA4_Int<0, 24, "qvfres", int_ppc_qpx_qvfres>; - let isCodeGenOnly = 1 in - def QVFRESs : AForm_4a<0, 24, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfres $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (PPCfre v4f32:$FRB))]>; - - def QVFRSQRTE : AForm_4a<4, 26, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfrsqrte $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (PPCfrsqrte v4f64:$FRB))]>; - def QVFRSQRTES : QPXA4_Int<0, 26, "qvfrsqrtes", int_ppc_qpx_qvfrsqrtes>; - let isCodeGenOnly = 1 in - def QVFRSQRTESs : AForm_4a<0, 26, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfrsqrtes $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (PPCfrsqrte v4f32:$FRB))]>; - - // Multiply Instructions - let isCommutable = 1 in { - def QVFMUL : AForm_3<4, 25, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC), - "qvfmul $FRT, $FRA, $FRC", IIC_FPGeneral, - [(set v4f64:$FRT, (fmul v4f64:$FRA, v4f64:$FRC))]>; - let isCodeGenOnly = 1 in - def QVFMULS : QPXA3_Int<0, 25, "qvfmuls", int_ppc_qpx_qvfmuls>; - def QVFMULSs : AForm_3<0, 25, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC), - "qvfmuls $FRT, $FRA, $FRC", IIC_FPGeneral, - [(set v4f32:$FRT, (fmul v4f32:$FRA, v4f32:$FRC))]>; - } - def QVFXMUL : QPXA3_Int<4, 17, "qvfxmul", int_ppc_qpx_qvfxmul>; - def QVFXMULS : QPXA3_Int<0, 17, "qvfxmuls", int_ppc_qpx_qvfxmuls>; - - // Multiply-add instructions - def QVFMADD : AForm_1<4, 29, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), - "qvfmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC, v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFMADDS : QPXA1_Int<0, 29, "qvfmadds", int_ppc_qpx_qvfmadds>; - def QVFMADDSs : AForm_1<0, 29, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), - "qvfmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC, v4f32:$FRB))]>; - def QVFNMADD : AForm_1<4, 31, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), - "qvfnmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC, - v4f64:$FRB)))]>; - let isCodeGenOnly = 1 in - def QVFNMADDS : QPXA1_Int<0, 31, "qvfnmadds", int_ppc_qpx_qvfnmadds>; - def QVFNMADDSs : AForm_1<0, 31, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), - "qvfnmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC, - v4f32:$FRB)))]>; - def QVFMSUB : AForm_1<4, 
28, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), - "qvfmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC, - (fneg v4f64:$FRB)))]>; - let isCodeGenOnly = 1 in - def QVFMSUBS : QPXA1_Int<0, 28, "qvfmsubs", int_ppc_qpx_qvfmsubs>; - def QVFMSUBSs : AForm_1<0, 28, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), - "qvfmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC, - (fneg v4f32:$FRB)))]>; - def QVFNMSUB : AForm_1<4, 30, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), - "qvfnmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC, - (fneg v4f64:$FRB))))]>; - let isCodeGenOnly = 1 in - def QVFNMSUBS : QPXA1_Int<0, 30, "qvfnmsubs", int_ppc_qpx_qvfnmsubs>; - def QVFNMSUBSs : AForm_1<0, 30, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), - "qvfnmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC, - (fneg v4f32:$FRB))))]>; - def QVFXMADD : QPXA1_Int<4, 9, "qvfxmadd", int_ppc_qpx_qvfxmadd>; - def QVFXMADDS : QPXA1_Int<0, 9, "qvfxmadds", int_ppc_qpx_qvfxmadds>; - def QVFXXNPMADD : QPXA1_Int<4, 11, "qvfxxnpmadd", int_ppc_qpx_qvfxxnpmadd>; - def QVFXXNPMADDS : QPXA1_Int<0, 11, "qvfxxnpmadds", int_ppc_qpx_qvfxxnpmadds>; - def QVFXXCPNMADD : QPXA1_Int<4, 3, "qvfxxcpnmadd", int_ppc_qpx_qvfxxcpnmadd>; - def QVFXXCPNMADDS : QPXA1_Int<0, 3, "qvfxxcpnmadds", int_ppc_qpx_qvfxxcpnmadds>; - def QVFXXMADD : QPXA1_Int<4, 1, "qvfxxmadd", int_ppc_qpx_qvfxxmadd>; - def QVFXXMADDS : QPXA1_Int<0, 1, "qvfxxmadds", int_ppc_qpx_qvfxxmadds>; - - // Select Instruction - let isCodeGenOnly = 1 in - def QVFSEL : QPXA1s_Int<4, 23, "qvfsel", int_ppc_qpx_qvfsel>; - def QVFSELb : AForm_1<4, 23, (outs qfrc:$FRT), - (ins qbrc:$FRA, qfrc:$FRB, qfrc:$FRC), - "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (vselect v4i1:$FRA, - v4f64:$FRC, v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFSELbs : AForm_1<4, 23, (outs qsrc:$FRT), - (ins qbrc:$FRA, qsrc:$FRB, qsrc:$FRC), - "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (vselect v4i1:$FRA, - v4f32:$FRC, v4f32:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFSELbb: AForm_1<4, 23, (outs qbrc:$FRT), - (ins qbrc:$FRA, qbrc:$FRB, qbrc:$FRC), - "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm, - [(set v4i1:$FRT, (vselect v4i1:$FRA, - v4i1:$FRC, v4i1:$FRB))]>; - - // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after - // instruction selection into a branch sequence. - def SELECT_CC_QFRC: PPCCustomInserterPseudo<(outs qfrc:$dst), (ins crrc:$cond, qfrc:$T, qfrc:$F, - i32imm:$BROPC), "#SELECT_CC_QFRC", - []>; - def SELECT_CC_QSRC: PPCCustomInserterPseudo<(outs qsrc:$dst), (ins crrc:$cond, qsrc:$T, qsrc:$F, - i32imm:$BROPC), "#SELECT_CC_QSRC", - []>; - def SELECT_CC_QBRC: PPCCustomInserterPseudo<(outs qbrc:$dst), (ins crrc:$cond, qbrc:$T, qbrc:$F, - i32imm:$BROPC), "#SELECT_CC_QBRC", - []>; - - // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition - // register bit directly. 
- def SELECT_QFRC: PPCCustomInserterPseudo<(outs qfrc:$dst), (ins crbitrc:$cond, - qfrc:$T, qfrc:$F), "#SELECT_QFRC", - [(set v4f64:$dst, - (select i1:$cond, v4f64:$T, v4f64:$F))]>; - def SELECT_QSRC: PPCCustomInserterPseudo<(outs qsrc:$dst), (ins crbitrc:$cond, - qsrc:$T, qsrc:$F), "#SELECT_QSRC", - [(set v4f32:$dst, - (select i1:$cond, v4f32:$T, v4f32:$F))]>; - def SELECT_QBRC: PPCCustomInserterPseudo<(outs qbrc:$dst), (ins crbitrc:$cond, - qbrc:$T, qbrc:$F), "#SELECT_QBRC", - [(set v4i1:$dst, - (select i1:$cond, v4i1:$T, v4i1:$F))]>; - - // Convert and Round Instructions - def QVFCTID : QPXX19_Int<4, 814, "qvfctid", int_ppc_qpx_qvfctid>; - let isCodeGenOnly = 1 in - def QVFCTIDb : XForm_19<4, 814, (outs qbrc:$FRT), (ins qbrc:$FRB), - "qvfctid $FRT, $FRB", IIC_FPGeneral, []>; - - def QVFCTIDU : QPXX19_Int<4, 942, "qvfctidu", int_ppc_qpx_qvfctidu>; - def QVFCTIDZ : QPXX19_Int<4, 815, "qvfctidz", int_ppc_qpx_qvfctidz>; - def QVFCTIDUZ : QPXX19_Int<4, 943, "qvfctiduz", int_ppc_qpx_qvfctiduz>; - def QVFCTIW : QPXX19_Int<4, 14, "qvfctiw", int_ppc_qpx_qvfctiw>; - def QVFCTIWU : QPXX19_Int<4, 142, "qvfctiwu", int_ppc_qpx_qvfctiwu>; - def QVFCTIWZ : QPXX19_Int<4, 15, "qvfctiwz", int_ppc_qpx_qvfctiwz>; - def QVFCTIWUZ : QPXX19_Int<4, 143, "qvfctiwuz", int_ppc_qpx_qvfctiwuz>; - def QVFCFID : QPXX19_Int<4, 846, "qvfcfid", int_ppc_qpx_qvfcfid>; - let isCodeGenOnly = 1 in - def QVFCFIDb : XForm_19<4, 846, (outs qbrc:$FRT), (ins qbrc:$FRB), - "qvfcfid $FRT, $FRB", IIC_FPGeneral, []>; - - def QVFCFIDU : QPXX19_Int<4, 974, "qvfcfidu", int_ppc_qpx_qvfcfidu>; - def QVFCFIDS : QPXX19_Int<0, 846, "qvfcfids", int_ppc_qpx_qvfcfids>; - def QVFCFIDUS : QPXX19_Int<0, 974, "qvfcfidus", int_ppc_qpx_qvfcfidus>; - - let isCodeGenOnly = 1 in - def QVFRSP : QPXX19_Int<4, 12, "qvfrsp", int_ppc_qpx_qvfrsp>; - def QVFRSPs : XForm_19<4, 12, - (outs qsrc:$FRT), (ins qfrc:$FRB), - "qvfrsp $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fround_inexact v4f64:$FRB))]>; - - def QVFRIZ : XForm_19<4, 424, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfriz $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (ftrunc v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFRIZs : XForm_19<4, 424, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfriz $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (ftrunc v4f32:$FRB))]>; - - def QVFRIN : XForm_19<4, 392, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfrin $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (fround v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFRINs : XForm_19<4, 392, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfrin $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fround v4f32:$FRB))]>; - - def QVFRIP : XForm_19<4, 456, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfrip $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (fceil v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFRIPs : XForm_19<4, 456, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfrip $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fceil v4f32:$FRB))]>; - - def QVFRIM : XForm_19<4, 488, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfrim $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (ffloor v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFRIMs : XForm_19<4, 488, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfrim $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (ffloor v4f32:$FRB))]>; - - // Move Instructions - def QVFMR : XForm_19<4, 72, - (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfmr $FRT, $FRB", IIC_VecPerm, - [/* (set v4f64:$FRT, v4f64:$FRB) */]>; - let isCodeGenOnly = 1 in { - def QVFMRs : XForm_19<4, 72, - (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfmr 
$FRT, $FRB", IIC_VecPerm, - [/* (set v4f32:$FRT, v4f32:$FRB) */]>; - def QVFMRb : XForm_19<4, 72, - (outs qbrc:$FRT), (ins qbrc:$FRB), - "qvfmr $FRT, $FRB", IIC_VecPerm, - [/* (set v4i1:$FRT, v4i1:$FRB) */]>; - } - def QVFNEG : XForm_19<4, 40, - (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfneg $FRT, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (fneg v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFNEGs : XForm_19<4, 40, - (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfneg $FRT, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (fneg v4f32:$FRB))]>; - def QVFABS : XForm_19<4, 264, - (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfabs $FRT, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (fabs v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFABSs : XForm_19<4, 264, - (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfabs $FRT, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (fabs v4f32:$FRB))]>; - def QVFNABS : XForm_19<4, 136, - (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfnabs $FRT, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (fneg (fabs v4f64:$FRB)))]>; - let isCodeGenOnly = 1 in - def QVFNABSs : XForm_19<4, 136, - (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfnabs $FRT, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (fneg (fabs v4f32:$FRB)))]>; - def QVFCPSGN : XForm_18<4, 8, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (fcopysign v4f64:$FRB, v4f64:$FRA))]>; - let isCodeGenOnly = 1 in - def QVFCPSGNs : XForm_18<4, 8, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (fcopysign v4f32:$FRB, v4f32:$FRA))]>; - - def QVALIGNI : Z23Form_1<4, 5, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u2imm:$idx), - "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm, - [(set v4f64:$FRT, - (PPCqvaligni v4f64:$FRA, v4f64:$FRB, - (i32 imm:$idx)))]>; - let isCodeGenOnly = 1 in - def QVALIGNIs : Z23Form_1<4, 5, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, u2imm:$idx), - "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm, - [(set v4f32:$FRT, - (PPCqvaligni v4f32:$FRA, v4f32:$FRB, - (i32 imm:$idx)))]>; - let isCodeGenOnly = 1 in - def QVALIGNIb : Z23Form_1<4, 5, - (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u2imm:$idx), - "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm, - [(set v4i1:$FRT, - (PPCqvaligni v4i1:$FRA, v4i1:$FRB, - (i32 imm:$idx)))]>; - - def QVESPLATI : Z23Form_2<4, 37, - (outs qfrc:$FRT), (ins qfrc:$FRA, u2imm:$idx), - "qvesplati $FRT, $FRA, $idx", IIC_VecPerm, - [(set v4f64:$FRT, - (PPCqvesplati v4f64:$FRA, (i32 imm:$idx)))]>; - let isCodeGenOnly = 1 in - def QVESPLATIs : Z23Form_2<4, 37, - (outs qsrc:$FRT), (ins qsrc:$FRA, u2imm:$idx), - "qvesplati $FRT, $FRA, $idx", IIC_VecPerm, - [(set v4f32:$FRT, - (PPCqvesplati v4f32:$FRA, (i32 imm:$idx)))]>; - let isCodeGenOnly = 1 in - def QVESPLATIb : Z23Form_2<4, 37, - (outs qbrc:$FRT), (ins qbrc:$FRA, u2imm:$idx), - "qvesplati $FRT, $FRA, $idx", IIC_VecPerm, - [(set v4i1:$FRT, - (PPCqvesplati v4i1:$FRA, (i32 imm:$idx)))]>; - - def QVFPERM : AForm_1<4, 6, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC), - "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm, - [(set v4f64:$FRT, - (PPCqvfperm v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>; - let isCodeGenOnly = 1 in - def QVFPERMs : AForm_1<4, 6, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qfrc:$FRC), - "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm, - [(set v4f32:$FRT, - (PPCqvfperm v4f32:$FRA, v4f32:$FRB, v4f64:$FRC))]>; - - let isReMaterializable = 1, isAsCheapAsAMove = 1 in - def QVGPCI : Z23Form_3<4, 133, - (outs qfrc:$FRT), (ins u12imm:$idx), - "qvgpci 
$FRT, $idx", IIC_VecPerm, - [(set v4f64:$FRT, (PPCqvgpci (u12:$idx)))]>; - - // Compare Instruction - let isCodeGenOnly = 1 in - def QVFTSTNAN : QPXX18_Int<4, 64, "qvftstnan", int_ppc_qpx_qvftstnan>; - def QVFTSTNANb : XForm_18<4, 64, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f64:$FRA, v4f64:$FRB, SETUO))]>; - let isCodeGenOnly = 1 in - def QVFTSTNANbs : XForm_18<4, 64, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f32:$FRA, v4f32:$FRB, SETUO))]>; - let isCodeGenOnly = 1 in - def QVFCMPLT : QPXX18_Int<4, 96, "qvfcmplt", int_ppc_qpx_qvfcmplt>; - def QVFCMPLTb : XForm_18<4, 96, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f64:$FRA, v4f64:$FRB, SETOLT))]>; - let isCodeGenOnly = 1 in - def QVFCMPLTbs : XForm_18<4, 96, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f32:$FRA, v4f32:$FRB, SETOLT))]>; - let isCodeGenOnly = 1 in - def QVFCMPGT : QPXX18_Int<4, 32, "qvfcmpgt", int_ppc_qpx_qvfcmpgt>; - def QVFCMPGTb : XForm_18<4, 32, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f64:$FRA, v4f64:$FRB, SETOGT))]>; - let isCodeGenOnly = 1 in - def QVFCMPGTbs : XForm_18<4, 32, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f32:$FRA, v4f32:$FRB, SETOGT))]>; - let isCodeGenOnly = 1 in - def QVFCMPEQ : QPXX18_Int<4, 0, "qvfcmpeq", int_ppc_qpx_qvfcmpeq>; - def QVFCMPEQb : XForm_18<4, 0, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f64:$FRA, v4f64:$FRB, SETOEQ))]>; - let isCodeGenOnly = 1 in - def QVFCMPEQbs : XForm_18<4, 0, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f32:$FRA, v4f32:$FRB, SETOEQ))]>; - - let isCodeGenOnly = 1 in - def QVFLOGICAL : XForm_20<4, 4, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u12imm:$tttt), - "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>; - def QVFLOGICALb : XForm_20<4, 4, - (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt), - "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>; - let isCodeGenOnly = 1 in - def QVFLOGICALs : XForm_20<4, 4, - (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt), - "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>; - - // Load indexed instructions - let mayLoad = 1 in { - def QVLFDX : XForm_1_memOp<31, 583, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfdx $FRT, $src", IIC_LdStLFD, - [(set v4f64:$FRT, (load xoaddr:$src))]>; - let isCodeGenOnly = 1 in - def QVLFDXb : XForm_1_memOp<31, 583, - (outs qbrc:$FRT), (ins memrr:$src), - "qvlfdx $FRT, $src", IIC_LdStLFD, []>; - - let RC = 1 in - def QVLFDXA : XForm_1<31, 583, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfdxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFDUX : XForm_1<31, 615, - (outs qfrc:$FRT, ptr_rc_nor0:$ea_result), - (ins memrr:$src), - "qvlfdux $FRT, $src", IIC_LdStLFDU, []>, - RegConstraint<"$src.ptrreg = $ea_result">, - NoEncode<"$ea_result">; - let RC = 1 in - def QVLFDUXA : XForm_1<31, 615, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfduxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFSX : XForm_1_memOp<31, 519, - (outs qfrc:$FRT), (ins memrr:$src), - 
"qvlfsx $FRT, $src", IIC_LdStLFD, - [(set v4f64:$FRT, (extloadv4f32 xoaddr:$src))]>; - - let isCodeGenOnly = 1 in - def QVLFSXb : XForm_1<31, 519, - (outs qbrc:$FRT), (ins memrr:$src), - "qvlfsx $FRT, $src", IIC_LdStLFD, - [(set v4i1:$FRT, (PPCqvlfsb xoaddr:$src))]>; - let isCodeGenOnly = 1 in - def QVLFSXs : XForm_1_memOp<31, 519, - (outs qsrc:$FRT), (ins memrr:$src), - "qvlfsx $FRT, $src", IIC_LdStLFD, - [(set v4f32:$FRT, (load xoaddr:$src))]>; - - let RC = 1 in - def QVLFSXA : XForm_1<31, 519, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfsxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFSUX : XForm_1<31, 551, - (outs qsrc:$FRT, ptr_rc_nor0:$ea_result), - (ins memrr:$src), - "qvlfsux $FRT, $src", IIC_LdStLFDU, []>, - RegConstraint<"$src.ptrreg = $ea_result">, - NoEncode<"$ea_result">; - - let RC = 1 in - def QVLFSUXA : XForm_1<31, 551, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfsuxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFCDX : XForm_1<31, 71, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcdx $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFCDXA : XForm_1<31, 71, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcdxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFCDUX : XForm_1<31, 103, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcdux $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFCDUXA : XForm_1<31, 103, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcduxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFCSX : XForm_1<31, 7, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcsx $FRT, $src", IIC_LdStLFD, []>; - let isCodeGenOnly = 1 in - def QVLFCSXs : XForm_1<31, 7, - (outs qsrc:$FRT), (ins memrr:$src), - "qvlfcsx $FRT, $src", IIC_LdStLFD, []>; - - let RC = 1 in - def QVLFCSXA : XForm_1<31, 7, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcsxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFCSUX : XForm_1<31, 39, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcsux $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFCSUXA : XForm_1<31, 39, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcsuxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFIWAX : XForm_1<31, 871, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfiwax $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFIWAXA : XForm_1<31, 871, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfiwaxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFIWZX : XForm_1<31, 839, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfiwzx $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFIWZXA : XForm_1<31, 839, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfiwzxa $FRT, $src", IIC_LdStLFD, []>; - } - - - def QVLPCLDX : XForm_1<31, 582, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlpcldx $FRT, $src", IIC_LdStLFD, []>; - def QVLPCLSX : XForm_1<31, 518, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlpclsx $FRT, $src", IIC_LdStLFD, []>; - let isCodeGenOnly = 1 in - def QVLPCLSXint : XForm_11<31, 518, - (outs qfrc:$FRT), (ins G8RC:$src), - "qvlpclsx $FRT, 0, $src", IIC_LdStLFD, []>; - def QVLPCRDX : XForm_1<31, 70, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlpcrdx $FRT, $src", IIC_LdStLFD, []>; - def QVLPCRSX : XForm_1<31, 6, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlpcrsx $FRT, $src", IIC_LdStLFD, []>; - - // Store indexed instructions - let mayStore = 1 in { - def QVSTFDX : XForm_8_memOp<31, 711, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfdx $FRT, $dst", IIC_LdStSTFD, - [(store qfrc:$FRT, xoaddr:$dst)]>; - let isCodeGenOnly = 1 in - def QVSTFDXb : XForm_8_memOp<31, 711, - (outs), (ins qbrc:$FRT, memrr:$dst), - "qvstfdx $FRT, $dst", IIC_LdStSTFD, []>; - - let RC = 1 in - def 
QVSTFDXA : XForm_8<31, 711, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfdxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFDUX : XForm_8<31, 743, (outs ptr_rc_nor0:$ea_res), - (ins qfrc:$FRT, memrr:$dst), - "qvstfdux $FRT, $dst", IIC_LdStSTFDU, []>, - RegConstraint<"$dst.ptrreg = $ea_res">, - NoEncode<"$ea_res">; - - let RC = 1 in - def QVSTFDUXA : XForm_8<31, 743, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfduxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFDXI : XForm_8<31, 709, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfdxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFDXIA : XForm_8<31, 709, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfdxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFDUXI : XForm_8<31, 741, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfduxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFDUXIA : XForm_8<31, 741, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfduxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFSX : XForm_8_memOp<31, 647, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsx $FRT, $dst", IIC_LdStSTFD, - [(truncstorev4f32 qfrc:$FRT, xoaddr:$dst)]>; - let isCodeGenOnly = 1 in - def QVSTFSXs : XForm_8_memOp<31, 647, - (outs), (ins qsrc:$FRT, memrr:$dst), - "qvstfsx $FRT, $dst", IIC_LdStSTFD, - [(store qsrc:$FRT, xoaddr:$dst)]>; - - let RC = 1 in - def QVSTFSXA : XForm_8<31, 647, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFSUX : XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res), - (ins qsrc:$FRT, memrr:$dst), - "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>, - RegConstraint<"$dst.ptrreg = $ea_res">, - NoEncode<"$ea_res">; - let isCodeGenOnly = 1 in - def QVSTFSUXs: XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res), - (ins qfrc:$FRT, memrr:$dst), - "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>, - RegConstraint<"$dst.ptrreg = $ea_res">, - NoEncode<"$ea_res">; - - let RC = 1 in - def QVSTFSUXA : XForm_8<31, 679, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsuxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFSXI : XForm_8<31, 645, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFSXIA : XForm_8<31, 645, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFSUXI : XForm_8<31, 677, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsuxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFSUXIA : XForm_8<31, 677, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsuxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCDX : XForm_8<31, 199, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdx $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCDXA : XForm_8<31, 199, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCSX : XForm_8<31, 135, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>; - let isCodeGenOnly = 1 in - def QVSTFCSXs : XForm_8<31, 135, - (outs), (ins qsrc:$FRT, memrr:$dst), - "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>; - - let RC = 1 in - def QVSTFCSXA : XForm_8<31, 135, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCDUX : XForm_8<31, 231, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdux $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCDUXA : XForm_8<31, 231, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcduxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCSUX : XForm_8<31, 167, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsux $FRT, $dst", 
IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCSUXA : XForm_8<31, 167, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsuxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCDXI : XForm_8<31, 197, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCDXIA : XForm_8<31, 197, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCSXI : XForm_8<31, 133, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCSXIA : XForm_8<31, 133, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCDUXI : XForm_8<31, 229, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcduxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCDUXIA : XForm_8<31, 229, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcduxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCSUXI : XForm_8<31, 165, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsuxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCSUXIA : XForm_8<31, 165, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsuxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFIWX : XForm_8<31, 967, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfiwx $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFIWXA : XForm_8<31, 967, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfiwxa $FRT, $dst", IIC_LdStSTFD, []>; - } -} - -} // neverHasSideEffects -} - -def : InstAlias<"qvfclr $FRT", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 0)>; -def : InstAlias<"qvfand $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 1)>; -def : InstAlias<"qvfandc $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 4)>; -def : InstAlias<"qvfctfb $FRT, $FRA", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 5)>; -def : InstAlias<"qvfxor $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 6)>; -def : InstAlias<"qvfor $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 7)>; -def : InstAlias<"qvfnor $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 8)>; -def : InstAlias<"qvfequ $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 9)>; -def : InstAlias<"qvfnot $FRT, $FRA", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 10)>; -def : InstAlias<"qvforc $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 13)>; -def : InstAlias<"qvfnand $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 14)>; -def : InstAlias<"qvfset $FRT", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 15)>; - -//===----------------------------------------------------------------------===// -// Additional QPX Patterns -// - -def : Pat<(v4f64 (scalar_to_vector f64:$A)), - (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), $A, sub_64)>; -def : Pat<(v4f32 (scalar_to_vector f32:$A)), - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $A, sub_64)>; - -def : Pat<(f64 (extractelt v4f64:$S, 0)), - (EXTRACT_SUBREG $S, sub_64)>; -def : Pat<(f32 (extractelt v4f32:$S, 0)), - (EXTRACT_SUBREG $S, sub_64)>; - -def : Pat<(f64 (extractelt v4f64:$S, 1)), - (EXTRACT_SUBREG (QVESPLATI $S, 1), sub_64)>; -def : Pat<(f64 (extractelt v4f64:$S, 2)), - (EXTRACT_SUBREG (QVESPLATI $S, 2), sub_64)>; -def : Pat<(f64 (extractelt v4f64:$S, 3)), - (EXTRACT_SUBREG (QVESPLATI $S, 3), sub_64)>; - -def : Pat<(f32 (extractelt v4f32:$S, 1)), - (EXTRACT_SUBREG (QVESPLATIs $S, 1), sub_64)>; -def : Pat<(f32 (extractelt v4f32:$S, 2)), - (EXTRACT_SUBREG (QVESPLATIs $S, 2), sub_64)>; 
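The alias block above is the key to the setcc patterns further down: the last operand of qvflogical is a 4-bit truth table over its two boolean inputs, in the style of the CR logical ops, so imm 1 is AND, 6 XOR, 7 OR, 8 NOR, 9 EQV, 13 ORC, and 10 with a repeated operand is NOT. The primitive compares (qvfcmpeq/lt/gt) are false whenever either input is NaN, so the remaining predicates are built by combining them with qvftstnan; for example SETOLE is NOR(cmpgt, tstnan) and SETUGE is ORC(tstnan, cmplt). A scalar model of those two compositions, lane-wise semantics assumed:

    // Each comparison lane is in exactly one of four states: lt, eq, gt, or
    // unordered. Check two of the truth-table compositions used below.
    #include <cassert>
    int main() {
      const bool states[4][4] = {
          {1, 0, 0, 0}, {0, 1, 0, 0}, {0, 0, 1, 0}, {0, 0, 0, 1}};
      for (const bool *s : states) {
        bool lt = s[0], eq = s[1], gt = s[2], unord = s[3];
        // SETOLE -> QVFLOGICALb(cmpgt, tstnan, 8), i.e. NOR:
        assert((lt || eq) == !(gt || unord));
        // SETUGE -> QVFLOGICALb(tstnan, cmplt, 13), i.e. ORC:
        assert((gt || eq || unord) == (unord || !lt));
      }
      return 0;
    }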
-def : Pat<(f32 (extractelt v4f32:$S, 3)), - (EXTRACT_SUBREG (QVESPLATIs $S, 3), sub_64)>; - -def : Pat<(f64 (extractelt v4f64:$S, i64:$F)), - (EXTRACT_SUBREG (QVFPERM $S, $S, - (QVLPCLSXint (RLDICR $F, 2, - /* 63-2 = */ 61))), - sub_64)>; -def : Pat<(f32 (extractelt v4f32:$S, i64:$F)), - (EXTRACT_SUBREG (QVFPERMs $S, $S, - (QVLPCLSXint (RLDICR $F, 2, - /* 63-2 = */ 61))), - sub_64)>; - -def : Pat<(int_ppc_qpx_qvfperm v4f64:$A, v4f64:$B, v4f64:$C), - (QVFPERM $A, $B, $C)>; - -def : Pat<(int_ppc_qpx_qvfcpsgn v4f64:$A, v4f64:$B), - (QVFCPSGN $A, $B)>; - -// FCOPYSIGN's operand types need not agree. -def : Pat<(fcopysign v4f64:$frB, v4f32:$frA), - (QVFCPSGN (COPY_TO_REGCLASS $frA, QFRC), $frB)>; -def : Pat<(fcopysign QSRC:$frB, QFRC:$frA), - (QVFCPSGNs (COPY_TO_REGCLASS $frA, QSRC), $frB)>; - -def : Pat<(int_ppc_qpx_qvfneg v4f64:$A), (QVFNEG $A)>; -def : Pat<(int_ppc_qpx_qvfabs v4f64:$A), (QVFABS $A)>; -def : Pat<(int_ppc_qpx_qvfnabs v4f64:$A), (QVFNABS $A)>; - -def : Pat<(int_ppc_qpx_qvfriz v4f64:$A), (QVFRIZ $A)>; -def : Pat<(int_ppc_qpx_qvfrin v4f64:$A), (QVFRIN $A)>; -def : Pat<(int_ppc_qpx_qvfrip v4f64:$A), (QVFRIP $A)>; -def : Pat<(int_ppc_qpx_qvfrim v4f64:$A), (QVFRIM $A)>; - -def : Pat<(int_ppc_qpx_qvfre v4f64:$A), (QVFRE $A)>; -def : Pat<(int_ppc_qpx_qvfrsqrte v4f64:$A), (QVFRSQRTE $A)>; - -def : Pat<(int_ppc_qpx_qvfadd v4f64:$A, v4f64:$B), - (QVFADD $A, $B)>; -def : Pat<(int_ppc_qpx_qvfsub v4f64:$A, v4f64:$B), - (QVFSUB $A, $B)>; -def : Pat<(int_ppc_qpx_qvfmul v4f64:$A, v4f64:$B), - (QVFMUL $A, $B)>; - -// Additional QVFNMSUB patterns: -a*c + b == -(a*c - b) -def : Pat<(fma (fneg v4f64:$A), v4f64:$C, v4f64:$B), - (QVFNMSUB $A, $C, $B)>; -def : Pat<(fma v4f64:$A, (fneg v4f64:$C), v4f64:$B), - (QVFNMSUB $A, $C, $B)>; -def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B), - (QVFNMSUBSs $A, $C, $B)>; -def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B), - (QVFNMSUBSs $A, $C, $B)>; - -def : Pat<(int_ppc_qpx_qvfmadd v4f64:$A, v4f64:$B, v4f64:$C), - (QVFMADD $A, $B, $C)>; -def : Pat<(int_ppc_qpx_qvfnmadd v4f64:$A, v4f64:$B, v4f64:$C), - (QVFNMADD $A, $B, $C)>; -def : Pat<(int_ppc_qpx_qvfmsub v4f64:$A, v4f64:$B, v4f64:$C), - (QVFMSUB $A, $B, $C)>; -def : Pat<(int_ppc_qpx_qvfnmsub v4f64:$A, v4f64:$B, v4f64:$C), - (QVFNMSUB $A, $B, $C)>; - -def : Pat<(int_ppc_qpx_qvlfd xoaddr:$src), - (QVLFDX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfda xoaddr:$src), - (QVLFDXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfs xoaddr:$src), - (QVLFSX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfsa xoaddr:$src), - (QVLFSXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfcda xoaddr:$src), - (QVLFCDXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfcd xoaddr:$src), - (QVLFCDX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfcsa xoaddr:$src), - (QVLFCSXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfcs xoaddr:$src), - (QVLFCSX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfda xoaddr:$src), - (QVLFDXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfiwaa xoaddr:$src), - (QVLFIWAXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfiwa xoaddr:$src), - (QVLFIWAX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfiwza xoaddr:$src), - (QVLFIWZXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfiwz xoaddr:$src), - (QVLFIWZX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfsa xoaddr:$src), - (QVLFSXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlpcld xoaddr:$src), - (QVLPCLDX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlpcls xoaddr:$src), - (QVLPCLSX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlpcrd xoaddr:$src), - (QVLPCRDX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlpcrs xoaddr:$src), - 
(QVLPCRSX xoaddr:$src)>; - -def : Pat<(int_ppc_qpx_qvstfd v4f64:$T, xoaddr:$dst), - (QVSTFDX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfs v4f64:$T, xoaddr:$dst), - (QVSTFSX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfcda v4f64:$T, xoaddr:$dst), - (QVSTFCDXA $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfcd v4f64:$T, xoaddr:$dst), - (QVSTFCDX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfcsa v4f64:$T, xoaddr:$dst), - (QVSTFCSXA $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfcs v4f64:$T, xoaddr:$dst), - (QVSTFCSX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfda v4f64:$T, xoaddr:$dst), - (QVSTFDXA $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfiwa v4f64:$T, xoaddr:$dst), - (QVSTFIWXA $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfiw v4f64:$T, xoaddr:$dst), - (QVSTFIWX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfsa v4f64:$T, xoaddr:$dst), - (QVSTFSXA $T, xoaddr:$dst)>; - -def : Pat<(pre_store v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff), - (QVSTFDUX $rS, $ptrreg, $ptroff)>; -def : Pat<(pre_store v4f32:$rS, iPTR:$ptrreg, iPTR:$ptroff), - (QVSTFSUX $rS, $ptrreg, $ptroff)>; -def : Pat<(pre_truncstv4f32 v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff), - (QVSTFSUXs $rS, $ptrreg, $ptroff)>; - -def : Pat<(int_ppc_qpx_qvflogical v4f64:$A, v4f64:$B, (i32 imm:$idx)), - (QVFLOGICAL $A, $B, imm:$idx)>; -def : Pat<(int_ppc_qpx_qvgpci (u12:$idx)), - (QVGPCI imm:$idx)>; - -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOGE), - (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOLE), - (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETONE), - (QVFLOGICALb (QVFCMPEQb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETO), - (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUEQ), - (QVFLOGICALb (QVFCMPEQb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGT), - (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGE), - (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), - (QVFCMPLTb $FRA, $FRB), (i32 13))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETULT), - (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETULE), - (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), - (QVFCMPGTb $FRA, $FRB), (i32 13))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUNE), - (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), - (QVFCMPEQb $FRA, $FRB), (i32 13))>; - -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETEQ), - (QVFCMPEQb $FRA, $FRB)>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGT), - (QVFCMPGTb $FRA, $FRB)>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGE), - (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), - (QVFCMPLTb $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLT), - (QVFCMPLTb $FRA, $FRB)>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLE), - (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), - (QVFCMPGTb $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETNE), - (QVFLOGICALb (QVFCMPEQb $FRA, $FRB), - (QVFCMPEQb $FRA, $FRB), (i32 10))>; - -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOGE), - (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOLE), - (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 8))>; -def : 
Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETONE), - (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETO), - (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUEQ), - (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGT), - (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGE), - (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), - (QVFCMPLTbs $FRA, $FRB), (i32 13))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULT), - (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULE), - (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), - (QVFCMPGTbs $FRA, $FRB), (i32 13))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUNE), - (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), - (QVFCMPEQbs $FRA, $FRB), (i32 13))>; - -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETEQ), - (QVFCMPEQbs $FRA, $FRB)>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGT), - (QVFCMPGTbs $FRA, $FRB)>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGE), - (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), - (QVFCMPLTbs $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLT), - (QVFCMPLTbs $FRA, $FRB)>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLE), - (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), - (QVFCMPGTbs $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETNE), - (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB), - (QVFCMPEQbs $FRA, $FRB), (i32 10))>; - -def : Pat<(and v4i1:$FRA, (not v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 4))>; -def : Pat<(not (or v4i1:$FRA, v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 8))>; -def : Pat<(not (xor v4i1:$FRA, v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 9))>; -def : Pat<(or v4i1:$FRA, (not v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 13))>; -def : Pat<(not (and v4i1:$FRA, v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 14))>; - -def : Pat<(and v4i1:$FRA, v4i1:$FRB), - (QVFLOGICALb $FRA, $FRB, (i32 1))>; -def : Pat<(or v4i1:$FRA, v4i1:$FRB), - (QVFLOGICALb $FRA, $FRB, (i32 7))>; -def : Pat<(xor v4i1:$FRA, v4i1:$FRB), - (QVFLOGICALb $FRA, $FRB, (i32 6))>; -def : Pat<(not v4i1:$FRA), - (QVFLOGICALb $FRA, $FRA, (i32 10))>; - -def : Pat<(v4f64 (fpextend v4f32:$src)), - (COPY_TO_REGCLASS $src, QFRC)>; - -def : Pat<(v4f32 (fround_exact v4f64:$src)), - (COPY_TO_REGCLASS $src, QSRC)>; - -// Extract the underlying floating-point values from the -// QPX (-1.0, 1.0) boolean representation. 
-def : Pat<(v4f64 (PPCqbflt v4i1:$src)), - (COPY_TO_REGCLASS $src, QFRC)>; - -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLT)), - (SELECT_QFRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETULT)), - (SELECT_QFRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLE)), - (SELECT_QFRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETULE)), - (SELECT_QFRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETEQ)), - (SELECT_QFRC (CREQV $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGE)), - (SELECT_QFRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETUGE)), - (SELECT_QFRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGT)), - (SELECT_QFRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETUGT)), - (SELECT_QFRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETNE)), - (SELECT_QFRC (CRXOR $lhs, $rhs), $tval, $fval)>; - -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLT)), - (SELECT_QSRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETULT)), - (SELECT_QSRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLE)), - (SELECT_QSRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETULE)), - (SELECT_QSRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETEQ)), - (SELECT_QSRC (CREQV $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGE)), - (SELECT_QSRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETUGE)), - (SELECT_QSRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGT)), - (SELECT_QSRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETUGT)), - (SELECT_QSRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETNE)), - (SELECT_QSRC (CRXOR $lhs, $rhs), $tval, $fval)>; - -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLT)), - (SELECT_QBRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETULT)), - (SELECT_QBRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLE)), - (SELECT_QBRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETULE)), - (SELECT_QBRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETEQ)), - (SELECT_QBRC (CREQV $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGE)), - (SELECT_QBRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, 
i1:$rhs, v4i1:$tval, v4i1:$fval, SETUGE)), - (SELECT_QBRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGT)), - (SELECT_QBRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETUGT)), - (SELECT_QBRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETNE)), - (SELECT_QBRC (CRXOR $lhs, $rhs), $tval, $fval)>; - -} // end HasQPX - -let Predicates = [HasQPX, NoNaNsFPMath] in { -def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB), - (QVFSELb (QVFCMPLTb $FRA, $FRB), $FRB, $FRA)>; -def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB), - (QVFSELb (QVFCMPGTb $FRA, $FRB), $FRB, $FRA)>; - -def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB), - (QVFSELbs (QVFCMPLTbs $FRA, $FRB), $FRB, $FRA)>; -def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB), - (QVFSELbs (QVFCMPGTbs $FRA, $FRB), $FRB, $FRA)>; -} - -let Predicates = [HasQPX, NaNsFPMath] in { -// When either of these operands is NaN, we should return the other operand. -// QVFCMPLT/QVFCMPGT return false is either operand is NaN, which means we need -// to explicitly or with a NaN test on the second operand. -def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB), - (QVFSELb (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), - (QVFTSTNANb $FRB, $FRB), (i32 7)), - $FRB, $FRA)>; -def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB), - (QVFSELb (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), - (QVFTSTNANb $FRB, $FRB), (i32 7)), - $FRB, $FRA)>; - -def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB), - (QVFSELbs (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), - (QVFTSTNANbs $FRB, $FRB), (i32 7)), - $FRB, $FRA)>; -def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB), - (QVFSELbs (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), - (QVFTSTNANbs $FRB, $FRB), (i32 7)), - $FRB, $FRA)>; -} diff --git a/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp b/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp deleted file mode 100644 index 6e90426438208..0000000000000 --- a/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp +++ /dev/null @@ -1,161 +0,0 @@ -//===----- PPCQPXLoadSplat.cpp - QPX Load Splat Simplification ------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// The QPX vector registers overlay the scalar floating-point registers, and -// any scalar floating-point loads splat their value across all vector lanes. -// Thus, if we have a scalar load followed by a splat, we can remove the splat -// (i.e. replace the load with a load-and-splat pseudo instruction). -// -// This pass must run after anything that might do store-to-load forwarding. 
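The rewrite this pass performed is easiest to see as a before/after on the sequence its body quotes; since the scalar FPRs are subregisters of the QPX registers and a scalar load already replicates its value into every lane, the splat is redundant once the load writes the right register (register numbers illustrative):

    // Before: scalar load feeding a lane-0 splat of the covering register.
    //   %f0  = LFD 0, killed %x3, implicit-def %qf0
    //   %qf1 = QVESPLATI killed %qf0, 0, implicit %rm
    // After: the load is retargeted at the scalar subregister of the splat's
    // destination and the QVESPLATI is erased.
    //   %f1  = LFD 0, killed %x3, implicit-def %qf1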
-// -//===----------------------------------------------------------------------===// - -#include "PPC.h" -#include "PPCInstrBuilder.h" -#include "PPCInstrInfo.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetMachine.h" -using namespace llvm; - -#define DEBUG_TYPE "ppc-qpx-load-splat" - -STATISTIC(NumSimplified, "Number of QPX load splats simplified"); - -namespace { - struct PPCQPXLoadSplat : public MachineFunctionPass { - static char ID; - PPCQPXLoadSplat() : MachineFunctionPass(ID) { - initializePPCQPXLoadSplatPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &Fn) override; - - StringRef getPassName() const override { - return "PowerPC QPX Load Splat Simplification"; - } - }; - char PPCQPXLoadSplat::ID = 0; -} - -INITIALIZE_PASS(PPCQPXLoadSplat, "ppc-qpx-load-splat", - "PowerPC QPX Load Splat Simplification", - false, false) - -FunctionPass *llvm::createPPCQPXLoadSplatPass() { - return new PPCQPXLoadSplat(); -} - -bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(MF.getFunction())) - return false; - - bool MadeChange = false; - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - - for (auto MFI = MF.begin(), MFIE = MF.end(); MFI != MFIE; ++MFI) { - MachineBasicBlock *MBB = &*MFI; - SmallVector Splats; - - for (auto MBBI = MBB->rbegin(); MBBI != MBB->rend(); ++MBBI) { - MachineInstr *MI = &*MBBI; - - if (MI->hasUnmodeledSideEffects() || MI->isCall()) { - Splats.clear(); - continue; - } - - // We're looking for a sequence like this: - // %f0 = LFD 0, killed %x3, implicit-def %qf0; mem:LD8[%a](tbaa=!2) - // %qf1 = QVESPLATI killed %qf0, 0, implicit %rm - - for (auto SI = Splats.begin(); SI != Splats.end();) { - MachineInstr *SMI = *SI; - Register SplatReg = SMI->getOperand(0).getReg(); - Register SrcReg = SMI->getOperand(1).getReg(); - - if (MI->modifiesRegister(SrcReg, TRI)) { - switch (MI->getOpcode()) { - default: - SI = Splats.erase(SI); - continue; - case PPC::LFS: - case PPC::LFD: - case PPC::LFSU: - case PPC::LFDU: - case PPC::LFSUX: - case PPC::LFDUX: - case PPC::LFSX: - case PPC::LFDX: - case PPC::LFIWAX: - case PPC::LFIWZX: - if (SplatReg != SrcReg) { - // We need to change the load to define the scalar subregister of - // the QPX splat source register. - unsigned SubRegIndex = - TRI->getSubRegIndex(SrcReg, MI->getOperand(0).getReg()); - Register SplatSubReg = TRI->getSubReg(SplatReg, SubRegIndex); - - // Substitute both the explicit defined register, and also the - // implicit def of the containing QPX register. - MI->getOperand(0).setReg(SplatSubReg); - MI->substituteRegister(SrcReg, SplatReg, 0, *TRI); - } - - SI = Splats.erase(SI); - - // If SMI is directly after MI, then MBBI's base iterator is - // pointing at SMI. Adjust MBBI around the call to erase SMI to - // avoid invalidating MBBI. - ++MBBI; - SMI->eraseFromParent(); - --MBBI; - - ++NumSimplified; - MadeChange = true; - continue; - } - } - - // If this instruction defines the splat register, then we cannot move - // the previous definition above it. If it reads from the splat - // register, then it must already be alive from some previous - // definition, and if the splat register is different from the source - // register, then this definition must not be the load for which we're - // searching. 
- if (MI->modifiesRegister(SplatReg, TRI) || - (SrcReg != SplatReg && - MI->readsRegister(SplatReg, TRI))) { - SI = Splats.erase(SI); - continue; - } - - ++SI; - } - - if (MI->getOpcode() != PPC::QVESPLATI && - MI->getOpcode() != PPC::QVESPLATIs && - MI->getOpcode() != PPC::QVESPLATIb) - continue; - if (MI->getOperand(2).getImm() != 0) - continue; - - // If there are other uses of the scalar value after this, replacing - // those uses might be non-trivial. - if (!MI->getOperand(1).isKill()) - continue; - - Splats.push_back(MI); - } - } - - return MadeChange; -} diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index ed8948a639728..96666ad58dfe5 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -404,9 +404,6 @@ unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, } case PPC::F8RCRegClassID: case PPC::F4RCRegClassID: - case PPC::QFRCRegClassID: - case PPC::QSRCRegClassID: - case PPC::QBRCRegClassID: case PPC::VRRCRegClassID: case PPC::VFRCRegClassID: case PPC::VSLRCRegClassID: diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h index 61acd955e1cba..a931967862c7b 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -153,7 +153,6 @@ class PPCRegisterInfo : public PPCGenRegisterInfo { switch (RegName[0]) { case 'r': case 'f': - case 'q': // for QPX case 'v': if (RegName[1] == 's') return RegName + 2; diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td index b45757c1acc5e..e07b960ae305b 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td @@ -54,13 +54,6 @@ class FPR<bits<5> num, string n> : PPCReg<n> { let HWEncoding{4-0} = num; } -// QFPR - One of the 32 256-bit floating-point vector registers (used for QPX) -class QFPR<FPR SubReg, string n> : PPCReg<n> { - let HWEncoding = SubReg.HWEncoding; - let SubRegs = [SubReg]; - let SubRegIndices = [sub_64]; -} - // VF - One of the 32 64-bit floating-point subregisters of the vector // registers (used by VSX). class VF<bits<5> num, string n> : PPCReg<n> { @@ -132,12 +125,6 @@ foreach Index = 0-31 in { DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>; } -// QPX Floating-point registers -foreach Index = 0-31 in { - def QF#Index : QFPR<!cast<FPR>("F"#Index), "q"#Index>, - DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>; } - // Vector registers foreach Index = 0-31 in { def V#Index : VR<!cast<VF>("VF"#Index), "v"#Index>, @@ -343,16 +330,6 @@ def SPILLTOVSRRC : RegisterClass<"PPC", [i64, f64], 64, (add G8RC, (sub VSFRC, // Register class for single precision scalars in VSX registers def VSSRC : RegisterClass<"PPC", [f32], 32, (add VSFRC)>; -// For QPX -def QFRC : RegisterClass<"PPC", [v4f64], 256, (add (sequence "QF%u", 0, 13), - (sequence "QF%u", 31, 14))>; -def QSRC : RegisterClass<"PPC", [v4f32], 128, (add QFRC)>; -def QBRC : RegisterClass<"PPC", [v4i1], 256, (add QFRC)> { - // These are actually stored as floating-point values where a positive - // number is true and anything else (including NaN) is false.
- let Size = 256; -} - def CRBITRC : RegisterClass<"PPC", [i1], 32, (add CR2LT, CR2GT, CR2EQ, CR2UN, CR3LT, CR3GT, CR3EQ, CR3UN, diff --git a/llvm/lib/Target/PowerPC/PPCScheduleP9.td b/llvm/lib/Target/PowerPC/PPCScheduleP9.td index 0a1ae7e55b3c2..311d5cf165f63 100644 --- a/llvm/lib/Target/PowerPC/PPCScheduleP9.td +++ b/llvm/lib/Target/PowerPC/PPCScheduleP9.td @@ -40,12 +40,9 @@ def P9Model : SchedMachineModel { let CompleteModel = 1; - // Do not support QPX (Quad Processing eXtension), SPE (Signal Processing - // Engine), prefixed instructions on Power 9, PC relative mem ops, or - // instructions introduced in ISA 3.1. - let UnsupportedFeatures = [HasQPX, HasSPE, PrefixInstrs, PCRelativeMemops, - IsISA3_1]; - + // Do not support SPE (Signal Processing Engine), prefixed instructions on + // Power 9, PC relative mem ops, or instructions introduced in ISA 3.1. + let UnsupportedFeatures = [HasSPE, PrefixInstrs, PCRelativeMemops, IsISA3_1]; } let SchedModel = P9Model in { diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index 3836cc960394f..85d2966654970 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -35,10 +35,6 @@ using namespace llvm; static cl::opt<bool> UseSubRegLiveness("ppc-track-subreg-liveness", cl::desc("Enable subregister liveness tracking for PPC"), cl::Hidden); -static cl::opt<bool> QPXStackUnaligned("qpx-stack-unaligned", - cl::desc("Even when QPX is enabled the stack is not 32-byte aligned"), - cl::Hidden); - static cl::opt<bool> EnableMachinePipeliner("ppc-enable-pipeliner", cl::desc("Enable Machine Pipeliner for PPC"), @@ -70,7 +66,6 @@ void PPCSubtarget::initializeEnvironment() { HasAltivec = false; HasSPE = false; HasFPU = false; - HasQPX = false; HasVSX = false; NeedsTwoConstNR = false; HasP8Vector = false; @@ -109,7 +104,6 @@ void PPCSubtarget::initializeEnvironment() { HasInvariantFunctionDescriptors = false; HasPartwordAtomics = false; HasDirectMove = false; - IsQPXStackUnaligned = false; HasHTM = false; HasFloat128 = false; HasFusion = false; @@ -158,7 +152,7 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (HasSPE && IsPPC64) report_fatal_error( "SPE is only supported for 32-bit targets.\n", false); - if (HasSPE && (HasAltivec || HasQPX || HasVSX || HasFPU)) + if (HasSPE && (HasAltivec || HasVSX || HasFPU)) report_fatal_error( "SPE and traditional floating point cannot both be enabled.\n", false); @@ -166,10 +160,6 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (!HasSPE) HasFPU = true; - // QPX requires a 32-byte aligned stack. Note that we need to do this if - // we're compiling for a BG/Q system regardless of whether or not QPX - // is enabled because external functions will assume this alignment. - IsQPXStackUnaligned = QPXStackUnaligned; StackAlignment = getPlatformStackAlignment(); // Determine endianness.
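A hypothetical, much-simplified model of what initSubtargetFeatures is left checking once HasQPX is gone (struct and function names are invented, and the real code calls report_fatal_error rather than throwing):

#include <iostream>
#include <stdexcept>

struct FeatureFlags {
  bool IsPPC64 = false;
  bool HasSPE = false;
  bool HasAltivec = false;
  bool HasVSX = false;
  bool HasFPU = false;
};

// SPE excludes 64-bit targets and every traditional FP feature; any
// remaining non-SPE configuration is assumed to have a classic FPU.
void validateFeatures(FeatureFlags &F) {
  if (F.HasSPE && F.IsPPC64)
    throw std::runtime_error("SPE is only supported for 32-bit targets.");
  if (F.HasSPE && (F.HasAltivec || F.HasVSX || F.HasFPU))
    throw std::runtime_error(
        "SPE and traditional floating point cannot both be enabled.");
  if (!F.HasSPE)
    F.HasFPU = true;
}

int main() {
  FeatureFlags F;
  F.HasSPE = F.HasVSX = true;
  try {
    validateFeatures(F);
  } catch (const std::exception &E) {
    std::cout << E.what() << '\n';
  }
  return 0;
}

With QPX removed, getPlatformStackAlignment (next hunk) also collapses to an unconditional Align(16), since the 32-byte BG/Q stack requirement no longer applies.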
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index ec329022c4572..8a4041518e3c2 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -97,7 +97,6 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool HasAltivec; bool HasFPU; bool HasSPE; - bool HasQPX; bool HasVSX; bool NeedsTwoConstNR; bool HasP8Vector; @@ -150,11 +149,6 @@ class PPCSubtarget : public PPCGenSubtargetInfo { POPCNTDKind HasPOPCNTD; - /// When targeting QPX running a stock PPC64 Linux kernel where the stack - /// alignment has not been changed, we need to keep the 16-byte alignment - /// of the stack. - bool IsQPXStackUnaligned; - const PPCTargetMachine &TM; PPCFrameLowering FrameLowering; PPCInstrInfo InstrInfo; @@ -255,7 +249,6 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool hasAltivec() const { return HasAltivec; } bool hasSPE() const { return HasSPE; } bool hasFPU() const { return HasFPU; } - bool hasQPX() const { return HasQPX; } bool hasVSX() const { return HasVSX; } bool needsTwoConstNR() const { return NeedsTwoConstNR; } bool hasP8Vector() const { return HasP8Vector; } @@ -291,11 +284,7 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool hasPartwordAtomics() const { return HasPartwordAtomics; } bool hasDirectMove() const { return HasDirectMove; } - bool isQPXStackUnaligned() const { return IsQPXStackUnaligned; } Align getPlatformStackAlignment() const { - if ((hasQPX() || isBGQ()) && !isQPXStackUnaligned()) - return Align(32); - return Align(16); } @@ -325,9 +314,6 @@ class PPCSubtarget : public PPCGenSubtargetInfo { const Triple &getTargetTriple() const { return TargetTriple; } - /// isBGQ - True if this is a BG/Q platform. - bool isBGQ() const { return TargetTriple.getVendor() == Triple::BGQ; } - bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index f15f9c7f49429..27de5b29cd341 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -63,10 +63,6 @@ static cl:: opt<bool> DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden, cl::desc("Disable VSX Swap Removal for PPC")); -static cl:: -opt<bool> DisableQPXLoadSplat("disable-ppc-qpx-load-splat", cl::Hidden, - cl::desc("Disable QPX load splat simplification")); - static cl:: opt<bool> DisableMIPeephole("disable-ppc-peephole", cl::Hidden, cl::desc("Disable machine peepholes for PPC")); @@ -114,7 +110,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() { initializePPCReduceCRLogicalsPass(PR); initializePPCBSelPass(PR); initializePPCBranchCoalescingPass(PR); - initializePPCQPXLoadSplatPass(PR); initializePPCBoolRetToIntPass(PR); initializePPCExpandISELPass(PR); initializePPCPreEmitPeepholePass(PR); @@ -411,14 +406,9 @@ void PPCPassConfig::addIRPasses() { // Lower generic MASSV routines to PowerPC subtarget-specific entries. addPass(createPPCLowerMASSVEntriesPass()); - - // For the BG/Q (or if explicitly requested), add explicit data prefetch - // intrinsics. - bool UsePrefetching = TM->getTargetTriple().getVendor() == Triple::BGQ && - getOptLevel() != CodeGenOpt::None; + + // If explicitly requested, add explicit data prefetch intrinsics.
if (EnablePrefetch.getNumOccurrences() > 0) - UsePrefetching = EnablePrefetch; - if (UsePrefetching) addPass(createLoopDataPrefetchPass()); if (TM->getOptLevel() >= CodeGenOpt::Default && EnableGEPOpt) { @@ -515,15 +505,8 @@ void PPCPassConfig::addPreRegAlloc() { } void PPCPassConfig::addPreSched2() { - if (getOptLevel() != CodeGenOpt::None) { + if (getOptLevel() != CodeGenOpt::None) addPass(&IfConverterID); - - // This optimization must happen after anything that might do store-to-load - // forwarding. Here we're after RA (and, thus, when spills are inserted) - // but before post-RA scheduling. - if (!DisableQPXLoadSplat) - addPass(createPPCQPXLoadSplatPass()); - } } void PPCPassConfig::addPreEmitPass() { diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index bbb4239d36da5..ee8842f4d8663 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -25,8 +25,7 @@ using namespace llvm; static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting", cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden); -// This is currently only used for the data prefetch pass which is only enabled -// for BG/Q by default. +// This is currently only used for the data prefetch pass. static cl::opt<unsigned> CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64), cl::desc("The loop prefetch cache line size")); @@ -104,55 +103,6 @@ PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); return new StoreInst(II.getArgOperand(0), Ptr, false, Align(1)); } - case Intrinsic::ppc_qpx_qvlfs: - // Turn PPC QPX qvlfs -> load if the pointer is known aligned. - if (getOrEnforceKnownAlignment( - II.getArgOperand(0), Align(16), IC.getDataLayout(), &II, - &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) { - Type *VTy = - VectorType::get(IC.Builder.getFloatTy(), - cast<VectorType>(II.getType())->getElementCount()); - Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(0), - PointerType::getUnqual(VTy)); - Value *Load = IC.Builder.CreateLoad(VTy, Ptr); - return new FPExtInst(Load, II.getType()); - } - break; - case Intrinsic::ppc_qpx_qvlfd: - // Turn PPC QPX qvlfd -> load if the pointer is known aligned. - if (getOrEnforceKnownAlignment( - II.getArgOperand(0), Align(32), IC.getDataLayout(), &II, - &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 32) { - Value *Ptr = IC.Builder.CreateBitCast( - II.getArgOperand(0), PointerType::getUnqual(II.getType())); - return new LoadInst(II.getType(), Ptr, "", false, Align(32)); - } - break; - case Intrinsic::ppc_qpx_qvstfs: - // Turn PPC QPX qvstfs -> store if the pointer is known aligned. - if (getOrEnforceKnownAlignment( - II.getArgOperand(1), Align(16), IC.getDataLayout(), &II, - &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) { - Type *VTy = VectorType::get( - IC.Builder.getFloatTy(), - cast<VectorType>(II.getArgOperand(0)->getType())->getElementCount()); - Value *TOp = IC.Builder.CreateFPTrunc(II.getArgOperand(0), VTy); - Type *OpPtrTy = PointerType::getUnqual(VTy); - Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); - return new StoreInst(TOp, Ptr, false, Align(16)); - } - break; - case Intrinsic::ppc_qpx_qvstfd: - // Turn PPC QPX qvstfd -> store if the pointer is known aligned.
- if (getOrEnforceKnownAlignment( - II.getArgOperand(1), Align(32), IC.getDataLayout(), &II, - &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 32) { - Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType()); - Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); - return new StoreInst(II.getArgOperand(0), Ptr, false, Align(32)); - } - break; - case Intrinsic::ppc_altivec_vperm: // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. // Note that ppc_altivec_vperm has a big-endian bias, so when creating @@ -736,10 +686,7 @@ bool PPCTTIImpl::useColdCCForColdCall(Function &F) { } bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { - // On the A2, always unroll aggressively. For QPX unaligned loads, we depend - // on combining the loads generated for consecutive accesses, and failure to - // do so is particularly expensive. This makes it much more likely (compared - // to only using concatenation unrolling). + // On the A2, always unroll aggressively. if (ST->getCPUDirective() == PPC::DIR_A2) return true; @@ -799,7 +746,6 @@ const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const { unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const { if (Vector) { - if (ST->hasQPX()) return 256; if (ST->hasAltivec()) return 128; return 0; } @@ -828,8 +774,6 @@ unsigned PPCTTIImpl::getCacheLineSize() const { } unsigned PPCTTIImpl::getPrefetchDistance() const { - // This seems like a reasonable default for the BG/Q (this pass is enabled, by - // default, only on the BG/Q). return 300; } @@ -918,7 +862,7 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations + // PPC, for both Altivec and VSX, supports cheap arbitrary permutations // (at least in the sense that there need only be one non-loop-invariant // instruction). We need one such shuffle instruction for each actual // register (this is not true for arbitrary shuffles, but is true for the @@ -974,13 +918,6 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { return Cost; - } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) { - // Floating point scalars are already located in index #0. - if (Index == 0) - return 0; - - return Cost; - } else if (Val->getScalarType()->isIntegerTy() && Index != -1U) { if (ST->hasP9Altivec()) { if (ISD == ISD::INSERT_VECTOR_ELT) @@ -1055,8 +992,6 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, LT.second == MVT::v4i32 || LT.second == MVT::v4f32); bool IsVSXType = ST->hasVSX() && (LT.second == MVT::v2f64 || LT.second == MVT::v2i64); - bool IsQPXType = ST->hasQPX() && - (LT.second == MVT::v4f64 || LT.second == MVT::v4f32); // VSX has 32b/64b load instructions. Legalization can handle loading of // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and @@ -1079,8 +1014,7 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, // for Altivec types using the VSX instructions, but that's more expensive // than using the permutation-based load sequence. On the P8, that's no // longer true. - if (Opcode == Instruction::Load && - ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) && + if (Opcode == Instruction::Load && (!ST->hasP8Vector() && IsAltivecType) && *Alignment >= LT.second.getScalarType().getStoreSize()) return Cost + LT.first; // Add the cost of the permutations.
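A back-of-the-envelope model of the getMemoryOpCost logic that remains after dropping IsQPXType (all names invented; the real base cost comes from BaseT and type legalization, simplified here to one unit per legalized register):

#include <cassert>

struct SubtargetModel { bool HasP8Vector; };

// Pre-P8 Altivec-type loads that are at least scalar-aligned are costed
// as the permutation-based lvx sequence: one extra permutation per
// legalized register, mirroring the "Cost + LT.first" return above.
int memoryLoadCost(const SubtargetModel &ST, int NumLegalizedRegs,
                   bool IsAltivecType, unsigned AlignBytes,
                   unsigned ScalarStoreSize) {
  int Cost = NumLegalizedRegs;
  if (!ST.HasP8Vector && IsAltivecType && AlignBytes >= ScalarStoreSize)
    return Cost + NumLegalizedRegs; // add the cost of the permutations
  return Cost;
}

int main() {
  SubtargetModel P7{false}, P8{true};
  assert(memoryLoadCost(P7, 1, true, 16, 4) == 2); // lvx + permute path
  assert(memoryLoadCost(P8, 1, true, 16, 4) == 1); // direct VSX load
  return 0;
}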
@@ -1133,7 +1067,7 @@ int PPCTTIImpl::getInterleavedMemoryOpCost( getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind); - // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations + // PPC, for both Altivec and VSX, supports cheap arbitrary permutations // (at least in the sense that there need only be one non-loop-invariant // instruction). For each result vector, we need one shuffle per incoming // vector (except that the first shuffle can take two incoming vectors diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 427abde4277d4..aa06e8144f634 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4751,15 +4751,14 @@ struct VarArgPowerPC64Helper : public VarArgHelper { // For PowerPC, we need to deal with alignment of stack arguments - // they are mostly aligned to 8 bytes, but vectors and i128 arrays // are aligned to 16 bytes, byvals can be aligned to 8 or 16 bytes, - // and QPX vectors are aligned to 32 bytes. For that reason, we - // compute current offset from stack pointer (which is always properly - // aligned), and offset for the first vararg, then subtract them. + // For that reason, we compute current offset from stack pointer (which is + // always properly aligned), and offset for the first vararg, then subtract + // them. unsigned VAArgBase; Triple TargetTriple(F.getParent()->getTargetTriple()); // Parameter save area starts at 48 bytes from frame pointer for ABIv1, // and 32 bytes for ABIv2. This is usually determined by target // endianness, but in theory could be overridden by function attribute. - // For simplicity, we ignore it here (it'd only matter for QPX vectors).
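The VAArgBase seeding just below follows the retained comment: ABIv1's stack header occupies six doublewords (48 bytes) ahead of the parameter save area on big-endian powerpc64, while ABIv2 (powerpc64le) trims it to four (32 bytes). A tiny sketch of that dispatch, with an invented helper name:

#include <cassert>
#include <string>

// ELFv1 reserves 48 bytes before the parameter save area; ELFv2 only 32.
unsigned paramSaveAreaBase(const std::string &Arch) {
  return Arch == "powerpc64" ? 48 : 32;
}

int main() {
  assert(paramSaveAreaBase("powerpc64") == 48);   // ABIv1
  assert(paramSaveAreaBase("powerpc64le") == 32); // ABIv2
  return 0;
}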
if (TargetTriple.getArch() == Triple::ppc64) VAArgBase = 48; else diff --git a/llvm/test/Analysis/BasicAA/phi-spec-order.ll b/llvm/test/Analysis/BasicAA/phi-spec-order.ll index f8586f094c2ce..e5d435c09ccc7 100644 --- a/llvm/test/Analysis/BasicAA/phi-spec-order.ll +++ b/llvm/test/Analysis/BasicAA/phi-spec-order.ll @@ -1,5 +1,5 @@ target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" ; RUN: opt < %s -basic-aa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s @X = external global [16000 x double], align 32 diff --git a/llvm/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll b/llvm/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll index 3b1bc3b3fdbc0..e5fbf070cf32a 100644 --- a/llvm/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll +++ b/llvm/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll @@ -218,42 +218,6 @@ entry: ; CHECK: cost of 2 for instruction: %r = load <4 x double>, <4 x double>* %p, align 8 } -define <4 x float> @test_l_qv4float(<4 x float>* %p) #1 { -entry: - %r = load <4 x float>, <4 x float>* %p, align 4 - ret <4 x float> %r - -; CHECK-LABEL: test_l_qv4float -; CHECK: cost of 2 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4 -} - -define <8 x float> @test_l_qv8float(<8 x float>* %p) #1 { -entry: - %r = load <8 x float>, <8 x float>* %p, align 4 - ret <8 x float> %r - -; CHECK-LABEL: test_l_qv8float -; CHECK: cost of 4 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4 -} - -define <4 x double> @test_l_qv4double(<4 x double>* %p) #1 { -entry: - %r = load <4 x double>, <4 x double>* %p, align 8 - ret <4 x double> %r - -; CHECK-LABEL: test_l_qv4double -; CHECK: cost of 2 for instruction: %r = load <4 x double>, <4 x double>* %p, align 8 -} - -define <8 x double> @test_l_qv8double(<8 x double>* %p) #1 { -entry: - %r = load <8 x double>, <8 x double>* %p, align 8 - ret <8 x double> %r - -; CHECK-LABEL: test_l_qv8double -; CHECK: cost of 4 for instruction: %r = load <8 x double>, <8 x double>* %p, align 8 -} - define void @test_s_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 { entry: store <16 x i8> %v, <16 x i8>* %p, align 1 @@ -362,43 +326,6 @@ entry: ; CHECK: cost of 2 for instruction: store <4 x double> %v, <4 x double>* %p, align 8 } -define void @test_s_qv4float(<4 x float>* %p, <4 x float> %v) #1 { -entry: - store <4 x float> %v, <4 x float>* %p, align 4 - ret void - -; CHECK-LABEL: test_s_qv4float -; CHECK: cost of 7 for instruction: store <4 x float> %v, <4 x float>* %p, align 4 -} - -define void @test_s_qv8float(<8 x float>* %p, <8 x float> %v) #1 { -entry: - store <8 x float> %v, <8 x float>* %p, align 4 - ret void - -; CHECK-LABEL: test_s_qv8float -; CHECK: cost of 15 for instruction: store <8 x float> %v, <8 x float>* %p, align 4 -} - -define void @test_s_qv4double(<4 x double>* %p, <4 x double> %v) #1 { -entry: - store <4 x double> %v, <4 x double>* %p, align 8 - ret void - -; CHECK-LABEL: test_s_qv4double -; CHECK: cost of 7 for instruction: store <4 x double> %v, <4 x double>* %p, align 8 -} - -define void @test_s_qv8double(<8 x double>* %p, <8 x double> %v) #1 { -entry: - store <8 x double> %v, <8 x double>* %p, align 8 - ret void - -; CHECK-LABEL: test_s_qv8double -; CHECK: cost of 15 for instruction: store <8 x double> %v, <8 x double>* %p, align 8 -} - attributes #0 = { nounwind "target-cpu"="pwr7" } -attributes #1 = { nounwind "target-cpu"="a2q" } attributes #2 = 
{ nounwind "target-cpu"="pwr8" } diff --git a/llvm/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll b/llvm/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll index 69f9cff5c525f..d93f192b1274d 100644 --- a/llvm/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll +++ b/llvm/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux -enable-misched < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux -enable-misched < %s | FileCheck %s ; ; PR14315: misched should not move the physreg copy of %t below the calls. diff --git a/llvm/test/CodeGen/PowerPC/DisableHoistingDueToBlockHotnessProfileData.mir b/llvm/test/CodeGen/PowerPC/DisableHoistingDueToBlockHotnessProfileData.mir index 738aa1df5dd9d..a0139879f8c91 100644 --- a/llvm/test/CodeGen/PowerPC/DisableHoistingDueToBlockHotnessProfileData.mir +++ b/llvm/test/CodeGen/PowerPC/DisableHoistingDueToBlockHotnessProfileData.mir @@ -55,7 +55,7 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(i8*, i8**) #1 - attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind } !llvm.module.flags = !{!0, !1} diff --git a/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir b/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir index bcd51d31c6cfd..01ce79995512a 100644 --- a/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir +++ b/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir @@ -30,7 +30,7 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(i8*, i8**) #1 - attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" 
"target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind } !llvm.ident = !{!0} diff --git a/llvm/test/CodeGen/PowerPC/a2q-stackalign.ll b/llvm/test/CodeGen/PowerPC/a2q-stackalign.ll deleted file mode 100644 index 17e3df6d58ccc..0000000000000 --- a/llvm/test/CodeGen/PowerPC/a2q-stackalign.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64-- -mcpu=a2 | FileCheck -check-prefix=CHECK-A2 %s -; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64-- -mcpu=a2q | FileCheck -check-prefix=CHECK-A2Q %s -; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-bgq-linux -mcpu=a2 | FileCheck -check-prefix=CHECK-BGQ %s -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - -declare i32 @bar(i8* %a) nounwind; -define i32 @foo() nounwind { - %p = alloca i8, i8 115 - store i8 0, i8* %p - %r = call i32 @bar(i8* %p) - ret i32 %r -} - -; Without QPX, the allocated stack frame is 240 bytes, but with QPX -; (because we require 32-byte alignment), it is 256 bytes. -; CHECK-A2: @foo -; CHECK-A2: stdu 1, -240(1) -; CHECK-A2Q: @foo -; CHECK-A2Q: stdu 1, -256(1) -; CHECK-BGQ: @foo -; CHECK-BGQ: stdu 1, -256(1) - diff --git a/llvm/test/CodeGen/PowerPC/a2q.ll b/llvm/test/CodeGen/PowerPC/a2q.ll deleted file mode 100644 index 84e2dfa991d78..0000000000000 --- a/llvm/test/CodeGen/PowerPC/a2q.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64-- -mcpu=a2q | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64-- -mcpu=a2 -mattr=+qpx | FileCheck %s - -define void @foo() { -entry: - ret void -} - -; CHECK: @foo - diff --git a/llvm/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll b/llvm/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll index 1b0ea26f1fdea..d629148535aa7 100644 --- a/llvm/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll +++ b/llvm/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll @@ -298,7 +298,7 @@ _ZN10SubProcess12SafeSyscalls5fcntlEiil.exit: ; preds = %_ZN10SubProcess12Sa ; Function Attrs: nounwind argmemonly declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) #1 -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind argmemonly } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/asm-Zy.ll b/llvm/test/CodeGen/PowerPC/asm-Zy.ll index 78bb0f4c73eca..c8b5e9f1aa1d1 100644 --- a/llvm/test/CodeGen/PowerPC/asm-Zy.ll +++ b/llvm/test/CodeGen/PowerPC/asm-Zy.ll @@ -1,6 +1,5 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2 | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mcpu=a2 -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s target datalayout = 
"E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" define i32 @zytest(i32 %a) nounwind { entry: diff --git a/llvm/test/CodeGen/PowerPC/asm-constraints.ll b/llvm/test/CodeGen/PowerPC/asm-constraints.ll index a3e573d8935e9..da77d1a169792 100644 --- a/llvm/test/CodeGen/PowerPC/asm-constraints.ll +++ b/llvm/test/CodeGen/PowerPC/asm-constraints.ll @@ -65,7 +65,7 @@ entry: } -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-R0-special-handling.mir b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-R0-special-handling.mir index 2081e6fd02f51..904210ee13477 100644 --- a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-R0-special-handling.mir +++ b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-R0-special-handling.mir @@ -63,8 +63,8 @@ ret i64 %2 } - attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" 
"stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir index b52e0a4103add..f46d4fc0a42a4 100644 --- a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir +++ b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir @@ -187,7 +187,7 @@ ret i64 %cond } - attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir index 4d2595e1abdcb..ba950dc3d3ae9 100644 --- a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir +++ b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir @@ -983,10 +983,10 @@ ret i64 %xor } - attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #2 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" 
"target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,-vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #3 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #2 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #3 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/CodeGen/PowerPC/ctr-minmaxnum.ll b/llvm/test/CodeGen/PowerPC/ctr-minmaxnum.ll index ed3c9f07c1a85..75640d1d26072 100644 --- a/llvm/test/CodeGen/PowerPC/ctr-minmaxnum.ll +++ b/llvm/test/CodeGen/PowerPC/ctr-minmaxnum.ll @@ -1,5 +1,4 @@ ; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s -; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s --check-prefix=QPX declare float @fabsf(float) @@ -64,11 +63,6 @@ loop_exit: ; CHECK-NOT: xsmindp ; CHECK: blr -; QPX-LABEL: test1v: -; QPX: mtctr -; QPX-NOT: bl fminf -; QPX: blr - define void @test1a(float %f, float* %fp) { entry: br label %loop_body @@ 
-139,11 +133,6 @@ loop_exit: ; CHECK-NOT: xsmaxdp ; CHECK: blr -; QPX-LABEL: test2v: -; QPX: mtctr -; QPX-NOT: bl fmax -; QPX: blr - define void @test2a(float %f, float* %fp) { entry: br label %loop_body diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-shortLoops.ll b/llvm/test/CodeGen/PowerPC/ctrloop-shortLoops.ll index 44acfcdd6e66a..636c86b815c8c 100644 --- a/llvm/test/CodeGen/PowerPC/ctrloop-shortLoops.ll +++ b/llvm/test/CodeGen/PowerPC/ctrloop-shortLoops.ll @@ -1,5 +1,4 @@ ; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=pwr8 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR8 -; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=a2q | FileCheck %s --check-prefixes=CHECK,CHECK-A2Q ; Verify that we do NOT generate the mtctr instruction for loop trip counts < 4 ; The latency of the mtctr is only justified if there are more than 4 comparisons that are removed as a result. @@ -86,11 +85,8 @@ for.body: ; preds = %entry, %for.body } ; Function Attrs: norecurse nounwind -; On core a2q, IssueWidth is 1. On core pwr8, IssueWidth is 8. -; a2q should use mtctr, but pwr8 should not use mtctr. define signext i32 @testTripCount2NonSmallLoop() { ; CHECK-LABEL: testTripCount2NonSmallLoop: -; CHECK-A2Q: mtctr ; CHECK-PWR8-NOT: mtctr ; CHECK: blr @@ -121,12 +117,9 @@ for.end: ; preds = %if.end ret i32 %conv } -; On core a2q, IssueWidth is 1. On core pwr8, IssueWidth is 8. -; a2q should use mtctr, but pwr8 should not use mtctr. define signext i32 @testTripCount5() { ; CHECK-LABEL: testTripCount5: ; CHECK-PWR8-NOT: mtctr -; CHECK-A2Q: mtctr entry: %.prea = load i32, i32* @a, align 4 diff --git a/llvm/test/CodeGen/PowerPC/ec-input.ll b/llvm/test/CodeGen/PowerPC/ec-input.ll index 9a1c121699a69..425bc1985d419 100644 --- a/llvm/test/CodeGen/PowerPC/ec-input.ll +++ b/llvm/test/CodeGen/PowerPC/ec-input.ll @@ -5,7 +5,7 @@ ; that were both inputs to the inline asm and also early-clobber outputs). 
target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" %struct._IO_FILE.119.8249.32639.195239.200117.211499.218003.221255.222881.224507.226133.240767.244019.245645.248897.260279.271661.281417.283043.302555.304181.325319.326945.344713 = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker.118.8248.32638.195238.200116.211498.218002.221254.222880.224506.226132.240766.244018.245644.248896.260278.271660.281416.283042.302554.304180.325318.326944.344712*, %struct._IO_FILE.119.8249.32639.195239.200117.211499.218003.221255.222881.224507.226133.240767.244019.245645.248897.260279.271661.281417.283043.302555.304181.325319.326945.344713*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } %struct._IO_marker.118.8248.32638.195238.200116.211498.218002.221254.222880.224506.226132.240766.244018.245644.248896.260278.271660.281416.283042.302554.304180.325318.326944.344712 = type { %struct._IO_marker.118.8248.32638.195238.200116.211498.218002.221254.222880.224506.226132.240766.244018.245644.248896.260278.271660.281416.283042.302554.304180.325318.326944.344712*, %struct._IO_FILE.119.8249.32639.195239.200117.211499.218003.221255.222881.224507.226133.240767.244019.245645.248897.260279.271661.281417.283043.302555.304181.325319.326945.344713*, i32 } diff --git a/llvm/test/CodeGen/PowerPC/extra-toc-reg-deps.ll b/llvm/test/CodeGen/PowerPC/extra-toc-reg-deps.ll index e066b45d3ca4b..023928bcb5896 100644 --- a/llvm/test/CodeGen/PowerPC/extra-toc-reg-deps.ll +++ b/llvm/test/CodeGen/PowerPC/extra-toc-reg-deps.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64-unknown-linux" %"class.Foam::messageStream.6" = type <{ %"class.Foam::string.5", i32, i32, i32, [4 x i8] }> %"class.Foam::string.5" = type { %"class.std::basic_string.4" } @@ -419,8 +419,8 @@ declare void @_ZN4Foam11regIOobjectD2Ev() #0 declare void @_ZN4Foam6reduceIiNS_5sumOpIiEEEEvRKNS_4ListINS_8UPstream11commsStructEEERT_RKT0_ii() #0 -attributes #0 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="a2q" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { inlinehint "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="a2q" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { inlinehint "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0} diff --git a/llvm/test/CodeGen/PowerPC/fast-isel-icmp-split.ll b/llvm/test/CodeGen/PowerPC/fast-isel-icmp-split.ll index fdd0fc2767803..b08b050f2c2fd 100644 --- a/llvm/test/CodeGen/PowerPC/fast-isel-icmp-split.ll +++ b/llvm/test/CodeGen/PowerPC/fast-isel-icmp-split.ll @@ -1,6 +1,6 @@ ; RUN: llc -verify-machineinstrs -O0 -relocation-model=pic < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" 
-target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" %"class.std::__1::__tree_node.130.151" = type { %"class.std::__1::__tree_node_base.base.128.149", %"class.boost::serialization::extended_type_info.129.150"* } %"class.std::__1::__tree_node_base.base.128.149" = type <{ %"class.std::__1::__tree_end_node.127.148", %"class.std::__1::__tree_node_base.126.147"*, %"class.std::__1::__tree_node_base.126.147"*, i8 }> diff --git a/llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll b/llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll index eef6e0ccac02b..a336fc796ca52 100644 --- a/llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll +++ b/llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll @@ -33,4 +33,4 @@ define float @f(float %xf) #0 { ret float %25 } -attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll b/llvm/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll index 2feb4556dfab7..3b555cf898f57 100644 --- a/llvm/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll +++ b/llvm/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll @@ -1,6 +1,5 @@ -; RUN: llc -verify-machineinstrs -mcpu=a2 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=a2 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" define linkonce_odr double @test1(ppc_fp128 %input) { entry: diff --git a/llvm/test/CodeGen/PowerPC/glob-comp-aa-crash.ll b/llvm/test/CodeGen/PowerPC/glob-comp-aa-crash.ll index 54c3e11528b7b..2aa5239f25eb8 100644 --- a/llvm/test/CodeGen/PowerPC/glob-comp-aa-crash.ll +++ b/llvm/test/CodeGen/PowerPC/glob-comp-aa-crash.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux -mcpu=a2 < %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux -mcpu=a2 < %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" %"class.std::__1::__assoc_sub_state" = type { %"class.std::__1::__shared_count", %"class.std::__exception_ptr::exception_ptr", %"class.std::__1::mutex", %"class.std::__1::condition_variable", i32 } %"class.std::__1::__shared_count" = type { i32 (...)**, i64 } diff --git a/llvm/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll b/llvm/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll index 74bfa75e5e313..a2d0eb599f91d 100644 --- a/llvm/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll +++ b/llvm/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll @@ -33,5 +33,5 @@ declare i8* @_ZN11__sanitizer21internal_start_threadEPFvPvES0_(void (i8*)*, i8*) declare hidden void @_ZN11__sanitizer16BackgroundThreadEPv(i8* nocapture readnone) #5 -attributes #0 = { nounwind 
uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #7 = { nobuiltin nounwind } diff --git a/llvm/test/CodeGen/PowerPC/inlineasm-i64-reg.ll b/llvm/test/CodeGen/PowerPC/inlineasm-i64-reg.ll index e4dfd6c58f0e8..6f1bc76d816ae 100644 --- a/llvm/test/CodeGen/PowerPC/inlineasm-i64-reg.ll +++ b/llvm/test/CodeGen/PowerPC/inlineasm-i64-reg.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux -mcpu=a2 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux -mcpu=a2 < %s | FileCheck %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" %struct.BG_CoordinateMapping_t = type { [4 x i8] } diff --git a/llvm/test/CodeGen/PowerPC/load-two-flts.ll b/llvm/test/CodeGen/PowerPC/load-two-flts.ll index 1cfcff5e01601..19e21faf47232 100644 --- a/llvm/test/CodeGen/PowerPC/load-two-flts.ll +++ b/llvm/test/CodeGen/PowerPC/load-two-flts.ll @@ -1,6 +1,5 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" define void @_Z4testSt7complexIfE(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) { entry: diff --git a/llvm/test/CodeGen/PowerPC/loop-data-prefetch-inner.ll b/llvm/test/CodeGen/PowerPC/loop-data-prefetch-inner.ll index f4664788930d4..2cbb70bb14cb5 100644 --- a/llvm/test/CodeGen/PowerPC/loop-data-prefetch-inner.ll +++ b/llvm/test/CodeGen/PowerPC/loop-data-prefetch-inner.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -enable-ppc-prefetching=true -verify-machineinstrs < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" ; Function Attrs: nounwind define void @foo(double* %x, double* nocapture readonly %y) #0 { diff --git a/llvm/test/CodeGen/PowerPC/loop-data-prefetch.ll b/llvm/test/CodeGen/PowerPC/loop-data-prefetch.ll index f4821564c202b..defc52eec8e0d 100644 --- a/llvm/test/CodeGen/PowerPC/loop-data-prefetch.ll +++ b/llvm/test/CodeGen/PowerPC/loop-data-prefetch.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -mcpu=a2 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -enable-ppc-prefetching=true -mcpu=a2 < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" ; Function Attrs: nounwind define void @foo(double* nocapture %a, double* nocapture readonly %b) #0 { diff --git 
a/llvm/test/CodeGen/PowerPC/loop-prep-all.ll b/llvm/test/CodeGen/PowerPC/loop-prep-all.ll index a13192d3e6586..7fdabcd4be210 100644 --- a/llvm/test/CodeGen/PowerPC/loop-prep-all.ll +++ b/llvm/test/CodeGen/PowerPC/loop-prep-all.ll @@ -1,5 +1,4 @@ ; RUN: llc -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BGQ target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -21,7 +20,6 @@ for.body: ; preds = %for.body, %entry ; CHECK-LABEL: @foo -; CHECK-BGQ-DAG: dcbt 4, 5 ; CHECK-DAG: lfdu [[REG1:[0-9]+]], 8({{[0-9]+}}) ; CHECK-DAG: fadd [[REG2:[0-9]+]], [[REG1]], 0 ; CHECK-DAG: stfdu [[REG2]], 8({{[0-9]+}}) @@ -34,15 +32,13 @@ for.cond.cleanup6: ; preds = %for.body7 for.body7: ; preds = %for.body, %for.body7 %i3.017 = phi i32 [ %inc9, %for.body7 ], [ 0, %for.body ] - tail call void bitcast (void (...)* @bar to void ()*)() #2 + tail call void bitcast (void (...)* @bar to void ()*)() #0 %inc9 = add nuw nsw i32 %i3.017, 1 %exitcond = icmp eq i32 %inc9, 1024 br i1 %exitcond, label %for.cond.cleanup6, label %for.body7 } -declare void @bar(...) #1 +declare void @bar(...) -attributes #0 = { nounwind "target-cpu"="a2q" } -attributes #1 = { "target-cpu"="a2q" } -attributes #2 = { nounwind } +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/lxv-aligned-stack-slots.ll b/llvm/test/CodeGen/PowerPC/lxv-aligned-stack-slots.ll index 93868007d0d36..aa618d2b732c7 100644 --- a/llvm/test/CodeGen/PowerPC/lxv-aligned-stack-slots.ll +++ b/llvm/test/CodeGen/PowerPC/lxv-aligned-stack-slots.ll @@ -41,6 +41,6 @@ define void @aligned_slot() #0 { ; Function Attrs: argmemonly nounwind declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1 -attributes #0 = { nounwind "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/machine-combiner.ll b/llvm/test/CodeGen/PowerPC/machine-combiner.ll index 2e834b1fe788c..16fc3ee3e5202 100644 --- a/llvm/test/CodeGen/PowerPC/machine-combiner.ll +++ b/llvm/test/CodeGen/PowerPC/machine-combiner.ll @@ -1,5 +1,4 @@ ; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr7 < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-PWR -; RUN: llc -verify-machineinstrs -O3 -mcpu=a2q < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-QPX ; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 < %s | FileCheck %s -check-prefix=FIXPOINT target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -93,9 +92,6 @@ define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, floa define <4 x float> @vector_reassociate_adds1(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds1: ; CHECK: # %bb.0: -; CHECK-QPX: qvfadds [[REG0:[0-9]+]], 1, 2 -; CHECK-QPX: qvfadds [[REG1:[0-9]+]], 3, 4 -; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] @@ -110,9 
+106,6 @@ define <4 x float> @vector_reassociate_adds1(<4 x float> %x0, <4 x float> %x1, < define <4 x float> @vector_reassociate_adds2(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds2: ; CHECK: # %bb.0: -; CHECK-QPX: qvfadds [[REG0:[0-9]+]], 1, 2 -; CHECK-QPX: qvfadds [[REG1:[0-9]+]], 3, 4 -; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] @@ -127,9 +120,6 @@ define <4 x float> @vector_reassociate_adds2(<4 x float> %x0, <4 x float> %x1, < define <4 x float> @vector_reassociate_adds3(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds3: ; CHECK: # %bb.0: -; CHECK-QPX: qvfadds [[REG0:[0-9]+]], 1, 2 -; CHECK-QPX: qvfadds [[REG1:[0-9]+]], 3, 4 -; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] @@ -144,9 +134,6 @@ define <4 x float> @vector_reassociate_adds3(<4 x float> %x0, <4 x float> %x1, < define <4 x float> @vector_reassociate_adds4(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds4: ; CHECK: # %bb.0: -; CHECK-QPX: qvfadds [[REG0:[0-9]+]], 1, 2 -; CHECK-QPX: qvfadds [[REG1:[0-9]+]], 3, 4 -; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] @@ -217,9 +204,6 @@ define i64 @reassociate_mulld(i64 %x0, i64 %x1, i64 %x2, i64 %x3) { define double @reassociate_mamaa_double(double %0, double %1, double %2, double %3, double %4, double %5) { ; CHECK-LABEL: reassociate_mamaa_double: ; CHECK: # %bb.0: -; CHECK-QPX-DAG: fmadd [[REG0:[0-9]+]], 4, 3, 2 -; CHECK-QPX-DAG: fmadd [[REG1:[0-9]+]], 6, 5, 1 -; CHECK-QPX: fadd 1, [[REG0]], [[REG1]] ; CHECK-PWR-DAG: xsmaddadp 1, 6, 5 ; CHECK-PWR-DAG: xsmaddadp 2, 4, 3 ; CHECK-PWR: xsadddp 1, 2, 1 @@ -250,9 +234,6 @@ define float @reassociate_mamaa_float(float %0, float %1, float %2, float %3, fl define <4 x float> @reassociate_mamaa_vec(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5) { ; CHECK-LABEL: reassociate_mamaa_vec: ; CHECK: # %bb.0: -; CHECK-QPX-DAG: qvfmadds [[REG0:[0-9]+]], 4, 3, 2 -; CHECK-QPX-DAG: qvfmadds [[REG1:[0-9]+]], 6, 5, 1 -; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR-DAG: xvmaddasp [[REG0:[0-9]+]], 39, 38 ; CHECK-PWR-DAG: xvmaddasp [[REG1:[0-9]+]], 37, 36 ; CHECK-PWR: xvaddsp 34, [[REG1]], [[REG0]] @@ -268,11 +249,6 @@ define <4 x float> @reassociate_mamaa_vec(<4 x float> %0, <4 x float> %1, <4 x f define double @reassociate_mamama_double(double %0, double %1, double %2, double %3, double %4, double %5, double %6, double %7, double %8) { ; CHECK-LABEL: reassociate_mamama_double: ; CHECK: # %bb.0: -; CHECK-QPX: fmadd [[REG0:[0-9]+]], 2, 1, 7 -; CHECK-QPX-DAG: fmul [[REG1:[0-9]+]], 4, 3 -; CHECK-QPX-DAG: fmadd [[REG2:[0-9]+]], 6, 5, [[REG0]] -; CHECK-QPX-DAG: fmadd [[REG3:[0-9]+]], 9, 8, [[REG1]] -; CHECK-QPX: fadd 1, [[REG2]], [[REG3]] ; CHECK-PWR: xsmaddadp 7, 2, 1 ; CHECK-PWR-DAG: xsmuldp [[REG0:[0-9]+]], 4, 3 ; CHECK-PWR-DAG: xsmaddadp 7, 6, 5 diff --git a/llvm/test/CodeGen/PowerPC/mc-instrlat.ll b/llvm/test/CodeGen/PowerPC/mc-instrlat.ll index e135986a2894c..f807f4fa20d25 100644 --- a/llvm/test/CodeGen/PowerPC/mc-instrlat.ll +++ 
b/llvm/test/CodeGen/PowerPC/mc-instrlat.ll @@ -19,7 +19,7 @@ entry: declare void @bar(double) #1 -attributes #0 = { nounwind "no-infs-fp-math"="true" "no-nans-fp-math"="true" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-qpx,-vsx" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-qpx,-vsx" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #0 = { nounwind "no-infs-fp-math"="true" "no-nans-fp-math"="true" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #1 = { "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/mcount-insertion.ll b/llvm/test/CodeGen/PowerPC/mcount-insertion.ll index cbb7947be2198..502347a3af198 100644 --- a/llvm/test/CodeGen/PowerPC/mcount-insertion.ll +++ b/llvm/test/CodeGen/PowerPC/mcount-insertion.ll @@ -1,9 +1,8 @@ -; RUN: opt -ee-instrument < %s | opt -inline | llc | FileCheck %s +; RUN: opt -ee-instrument < %s | opt -inline | llc -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s ; The run-line mimics how Clang might run the instrumentation passes. target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" define void @leaf_function() #0 { diff --git a/llvm/test/CodeGen/PowerPC/memcpy-vec.ll b/llvm/test/CodeGen/PowerPC/memcpy-vec.ll index cd0abd6149bde..c4e60f8c4b1f5 100644 --- a/llvm/test/CodeGen/PowerPC/memcpy-vec.ll +++ b/llvm/test/CodeGen/PowerPC/memcpy-vec.ll @@ -1,6 +1,5 @@ ; RUN: llc -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s -check-prefix=PWR7 ; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s -check-prefix=PWR8 -; RUN: llc -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s -check-prefix=A2Q target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -25,12 +24,6 @@ entry: ; PWR8: lxvw4x ; PWR8: stxvw4x ; PWR8: blr - -; A2Q-LABEL: @foo1 -; A2Q-NOT: bl memcpy -; A2Q: ld {{[0-9]+}}, {{[0-9]+}}(4) -; A2Q: std {{[0-9]+}}, {{[0-9]+}}(3) -; A2Q: blr } ; Function Attrs: nounwind @@ -52,12 +45,6 @@ entry: ; PWR8: lxvw4x ; PWR8: stxvw4x ; PWR8: blr - -; A2Q-LABEL: @foo2 -; A2Q-NOT: bl memcpy -; A2Q: ld {{[0-9]+}}, {{[0-9]+}}(4) -; A2Q: std {{[0-9]+}}, {{[0-9]+}}(3) -; A2Q: blr } ; Function Attrs: nounwind @@ -76,11 +63,6 @@ entry: ; PWR8-NOT: bl memset ; PWR8: stxvw4x ; PWR8: blr - -; A2Q-LABEL: @bar1 -; A2Q-NOT: bl memset -; A2Q: std {{[0-9]+}}, {{[0-9]+}}(3) -; A2Q: blr } ; Function Attrs: nounwind @@ -99,11 +81,6 @@ entry: ; PWR8-NOT: bl memset ; PWR8: stxvw4x ; PWR8: blr - -; A2Q-LABEL: @bar2 -; A2Q-NOT: bl memset -; A2Q: qvstfdx -; A2Q: blr } ; Function Attrs: nounwind diff --git a/llvm/test/CodeGen/PowerPC/memset-nc.ll b/llvm/test/CodeGen/PowerPC/memset-nc.ll deleted file mode 100644 index 663d0cb1d6785..0000000000000 --- a/llvm/test/CodeGen/PowerPC/memset-nc.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -O0 < %s | FileCheck %s -check-prefix=CHECK-O0 -target datalayout = 
"E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" - -; Function Attrs: nounwind -define void @test_qpx() unnamed_addr #0 align 2 { -entry: - %0 = load i32, i32* undef, align 4 - %1 = trunc i32 %0 to i8 - call void @llvm.memset.p0i8.i64(i8* align 32 null, i8 %1, i64 64, i1 false) - ret void - -; CHECK-LABEL: @test_qpx -; CHECK: qvstfdx -; CHECK: qvstfdx -; CHECK: blr - -; CHECK-O0-LABEL: @test_qpx -; CHECK-O0-NOT: qvstfdx -; CHECK-O0: blr -} - -; Function Attrs: nounwind -declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) #1 - -; Function Attrs: nounwind -define void @test_vsx() unnamed_addr #2 align 2 { -entry: - %0 = load i32, i32* undef, align 4 - %1 = trunc i32 %0 to i8 - call void @llvm.memset.p0i8.i64(i8* null, i8 %1, i64 32, i1 false) - ret void - -; CHECK-LABEL: @test_vsx -; CHECK: stxvw4x -; CHECK: stxvw4x -; CHECK: blr - -; CHECK-O0-LABEL: @test_vsx -; CHECK-O0-NOT: stxvw4x -; CHECK-O0: blr -} - -attributes #0 = { nounwind "target-cpu"="a2q" } -attributes #1 = { nounwind } -attributes #2 = { nounwind "target-cpu"="pwr7" } - diff --git a/llvm/test/CodeGen/PowerPC/misched-inorder-latency.ll b/llvm/test/CodeGen/PowerPC/misched-inorder-latency.ll index 26663d81f3575..089c947713b9d 100644 --- a/llvm/test/CodeGen/PowerPC/misched-inorder-latency.ll +++ b/llvm/test/CodeGen/PowerPC/misched-inorder-latency.ll @@ -1,8 +1,7 @@ ; RUN: llc -verify-machineinstrs < %s -enable-misched -pre-RA-sched=source -scheditins=false \ -; RUN: -disable-ifcvt-triangle-false -disable-post-ra | FileCheck %s +; RUN: -disable-ifcvt-triangle-false -disable-post-ra -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s ; target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" ; %val1 is a load live out of %entry. It should be hoisted ; above the add. 
diff --git a/llvm/test/CodeGen/PowerPC/misched.ll b/llvm/test/CodeGen/PowerPC/misched.ll index 1c868b3f171c9..9a75fe44b7176 100644 --- a/llvm/test/CodeGen/PowerPC/misched.ll +++ b/llvm/test/CodeGen/PowerPC/misched.ll @@ -1,7 +1,6 @@ ; RUN: llc < %s -enable-misched -verify-machineinstrs ; PR14302 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" @b = external global [16000 x double], align 32 diff --git a/llvm/test/CodeGen/PowerPC/optnone-crbits-i1-ret.ll b/llvm/test/CodeGen/PowerPC/optnone-crbits-i1-ret.ll index f59df4291c48f..ad5976318fe3a 100644 --- a/llvm/test/CodeGen/PowerPC/optnone-crbits-i1-ret.ll +++ b/llvm/test/CodeGen/PowerPC/optnone-crbits-i1-ret.ll @@ -1,6 +1,5 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" declare zeroext i1 @ri1() declare void @se1() diff --git a/llvm/test/CodeGen/PowerPC/pcrel-local-caller-toc.ll b/llvm/test/CodeGen/PowerPC/pcrel-local-caller-toc.ll index 2e248506c7b7b..2871e077df565 100644 --- a/llvm/test/CodeGen/PowerPC/pcrel-local-caller-toc.ll +++ b/llvm/test/CodeGen/PowerPC/pcrel-local-caller-toc.ll @@ -92,7 +92,7 @@ entry: ; Left the target features in this test because it is important that caller has ; -pcrelative-memops while callee has +pcrelative-memops -attributes #0 = { nounwind "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+pcrelative-memops,+power8-vector,+power9-vector,+vsx,-htm,-qpx,-spe" } -attributes #1 = { "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+pcrelative-memops,+power8-vector,+power9-vector,+vsx,-htm,-qpx,-spe" } -attributes #2 = { nounwind "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+power9-vector,+vsx,-htm,-pcrelative-memops,-qpx,-spe" } +attributes #0 = { nounwind "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+pcrelative-memops,+power8-vector,+power9-vector,+vsx,-htm,-spe" } +attributes #1 = { "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+pcrelative-memops,+power8-vector,+power9-vector,+vsx,-htm,-spe" } +attributes #2 = { nounwind "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+power9-vector,+vsx,-htm,-pcrelative-memops,-spe" } attributes #3 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/popcnt.ll b/llvm/test/CodeGen/PowerPC/popcnt.ll index a06c59d4b945a..695863d87f16e 100644 --- a/llvm/test/CodeGen/PowerPC/popcnt.ll +++ b/llvm/test/CodeGen/PowerPC/popcnt.ll @@ -1,8 +1,6 @@ ; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mattr=+popcntd < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mattr=+slow-popcntd < %s | FileCheck %s --check-prefix=SLOWPC ; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mcpu=pwr7 < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mcpu=a2q < %s | FileCheck %s --check-prefix=SLOWPC -; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mcpu=a2q -mattr=+popcntd < %s | FileCheck %s define i64 @_cntb64(i64 %x) nounwind readnone { %cnt = tail call i64 @llvm.ppc.popcntb(i64 %x) diff --git a/llvm/test/CodeGen/PowerPC/ppc-passname.ll b/llvm/test/CodeGen/PowerPC/ppc-passname.ll index 98343bdb535c2..06f13278d84cd 100644 --- a/llvm/test/CodeGen/PowerPC/ppc-passname.ll +++ b/llvm/test/CodeGen/PowerPC/ppc-passname.ll @@ -105,14 
+105,3 @@ ; STOP-AFTER-BRANCH-COALESCING-NOT: "ppc-branch-coalescing" pass is not registered. ; STOP-AFTER-BRANCH-COALESCING: Branch Coalescing - -; Test pass name: ppc-qpx-load-splat. -; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-qpx-load-splat -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-QPX-LOAD-SPLAT -; STOP-BEFORE-QPX-LOAD-SPLAT-NOT: -ppc-qpx-load-splat -; STOP-BEFORE-QPX-LOAD-SPLAT-NOT: "ppc-qpx-load-splat" pass is not registered. -; STOP-BEFORE-QPX-LOAD-SPLAT-NOT: PowerPC QPX Load Splat Simplification - -; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-after=ppc-qpx-load-splat -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-AFTER-QPX-LOAD-SPLAT -; STOP-AFTER-QPX-LOAD-SPLAT: -ppc-qpx-load-splat -; STOP-AFTER-QPX-LOAD-SPLAT-NOT: "ppc-qpx-load-splat" pass is not registered. -; STOP-AFTER-QPX-LOAD-SPLAT: PowerPC QPX Load Splat Simplification diff --git a/llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll b/llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll index fc0e71f878cab..357f28e88b184 100644 --- a/llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll +++ b/llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-SCO -; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO-HASQPX -; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO-HASQPX +; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO +; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO ; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -code-model=small | FileCheck %s -check-prefix=SCM ; No combination of "powerpc64le-unknown-linux-gnu" + "CHECK-SCO", because @@ -117,23 +117,6 @@ define void @caller_local_sret_32(%S_32* %a) #1 { attributes #0 = { noinline nounwind } attributes #1 = { nounwind } -; vector <4 x i1> test - -define void @callee_v4i1(i8 %a, <4 x i1> %b, <4 x i1> %c) { ret void } -define void @caller_v4i1_reorder(i8 %a, <4 x i1> %b, <4 x i1> %c) { - tail call void @callee_v4i1(i8 %a, <4 x i1> %c, <4 x i1> %b) - ret void - -; <4 x i1> is 32 bytes aligned, if subtarget doesn't support qpx, then we can't -; place b, c to qpx register, so we can't do sco on caller_v4i1_reorder - -; CHECK-SCO-LABEL: caller_v4i1_reorder: -; CHECK-SCO: bl callee_v4i1 - -; CHECK-SCO-HASQPX-LABEL: caller_v4i1_reorder: -; CHECK-SCO-HASQPX: b callee_v4i1 -} - define void @f128_callee(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) { ret void } define void @f128_caller(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) { tail call void @f128_callee(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) diff --git a/llvm/test/CodeGen/PowerPC/pr24546.ll b/llvm/test/CodeGen/PowerPC/pr24546.ll index 28c03293680e5..028fd2d8f0064 100644 --- a/llvm/test/CodeGen/PowerPC/pr24546.ll +++ b/llvm/test/CodeGen/PowerPC/pr24546.ll @@ -47,8 +47,8 @@ declare double @pow(double, double) #0 
; Function Attrs: nounwind readnone declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2 -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #2 = { nounwind readnone } attributes #3 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/pr27350.ll b/llvm/test/CodeGen/PowerPC/pr27350.ll index 982023a1fcdc8..93dbd10fecdeb 100644 --- a/llvm/test/CodeGen/PowerPC/pr27350.ll +++ b/llvm/test/CodeGen/PowerPC/pr27350.ll @@ -18,7 +18,7 @@ entry: declare fastcc void @bar([2 x i64], [2 x i64]) unnamed_addr #1 align 2 attributes #0 = { argmemonly nounwind } -attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #2 = { nounwind } !llvm.ident = !{!0} diff --git a/llvm/test/CodeGen/PowerPC/pr28130.ll b/llvm/test/CodeGen/PowerPC/pr28130.ll index cb703dfda8a59..4da415bd29269 100644 --- a/llvm/test/CodeGen/PowerPC/pr28130.ll +++ b/llvm/test/CodeGen/PowerPC/pr28130.ll @@ -67,4 +67,4 @@ bb: ret void } -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" 
"no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/PowerPC/preinc-ld-sel-crash.ll b/llvm/test/CodeGen/PowerPC/preinc-ld-sel-crash.ll index 04dee1ee182bb..35aec57ec2640 100644 --- a/llvm/test/CodeGen/PowerPC/preinc-ld-sel-crash.ll +++ b/llvm/test/CodeGen/PowerPC/preinc-ld-sel-crash.ll @@ -1,6 +1,6 @@ ; RUN: llc -verify-machineinstrs < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" %t1 = type { %t2*, %t3* } %t2 = type <{ %t3*, i32, [4 x i8] }> diff --git a/llvm/test/CodeGen/PowerPC/qpx-bv-sint.ll b/llvm/test/CodeGen/PowerPC/qpx-bv-sint.ll deleted file mode 100644 index 4e0aef4c3df71..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-bv-sint.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" - -define void @s452(i32 %inp1) nounwind { -entry: - br label %for.body4 - -for.body4: ; preds = %for.body4, %entry - %conv.4 = sitofp i32 %inp1 to double - %conv.5 = sitofp i32 %inp1 to double - %mul.4.v.i0.1 = insertelement <2 x double> undef, double %conv.4, i32 0 - %v = insertelement <2 x double> %mul.4.v.i0.1, double %conv.5, i32 1 - %vv = fmul <2 x double> %v, %v - %add7.4 = fadd <2 x double> %vv, %vv - store <2 x double> %add7.4, <2 x double>* undef, align 16 - br i1 undef, label %for.end, label %for.body4 - -for.end: ; preds = %for.body4 - unreachable -; CHECK-LABEL: @s452 -; CHECK: lfiwax [[REG1:[0-9]+]], -; CHECK: fcfid [[REG2:[0-9]+]], [[REG1]] -; FIXME: We could 'promote' this to a vector earlier and remove this splat. -; CHECK: qvesplati {{[0-9]+}}, [[REG2]], 0 -; CHECK: qvfmul -; CHECK: qvfadd -; CHECK: qvesplati {{[0-9]+}}, -; FIXME: We can use qvstfcdx here instead of two stores. 
-; CHECK: stfd -; CHECK: stfd -} - diff --git a/llvm/test/CodeGen/PowerPC/qpx-bv.ll b/llvm/test/CodeGen/PowerPC/qpx-bv.ll deleted file mode 100644 index 93a739b864c1d..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-bv.ll +++ /dev/null @@ -1,37 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s - -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" - -define <4 x double> @foo(double %f1, double %f2, double %f3, double %f4) { - %v1 = insertelement <4 x double> undef, double %f1, i32 0 - %v2 = insertelement <4 x double> %v1, double %f2, i32 1 - %v3 = insertelement <4 x double> %v2, double %f3, i32 2 - %v4 = insertelement <4 x double> %v3, double %f4, i32 3 - ret <4 x double> %v4 - -; CHECK-LABEL: @foo -; CHECK: qvgpci [[REG1:[0-9]+]], 275 -; CHECK-DAG: qvgpci [[REG2:[0-9]+]], 101 -; CHECK-DAG: qvfperm [[REG3:[0-9]+]], 3, 4, [[REG1]] -; CHECK-DAG: qvfperm [[REG4:[0-9]+]], 1, 2, [[REG1]] -; CHECK-DAG: qvfperm 1, [[REG4]], [[REG3]], [[REG2]] -; CHECK: blr -} - -define <4 x float> @goo(float %f1, float %f2, float %f3, float %f4) { - %v1 = insertelement <4 x float> undef, float %f1, i32 0 - %v2 = insertelement <4 x float> %v1, float %f2, i32 1 - %v3 = insertelement <4 x float> %v2, float %f3, i32 2 - %v4 = insertelement <4 x float> %v3, float %f4, i32 3 - ret <4 x float> %v4 - -; CHECK-LABEL: @goo -; CHECK: qvgpci [[REG1:[0-9]+]], 275 -; CHECK-DAG: qvgpci [[REG2:[0-9]+]], 101 -; CHECK-DAG: qvfperm [[REG3:[0-9]+]], 3, 4, [[REG1]] -; CHECK-DAG: qvfperm [[REG4:[0-9]+]], 1, 2, [[REG1]] -; CHECK-DAG: qvfperm 1, [[REG4]], [[REG3]], [[REG2]] -; CHECK: blr -} - diff --git a/llvm/test/CodeGen/PowerPC/qpx-func-clobber.ll b/llvm/test/CodeGen/PowerPC/qpx-func-clobber.ll deleted file mode 100644 index ccbbd162a0cdb..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-func-clobber.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -declare <4 x double> @foo(<4 x double> %p) - -define <4 x double> @bar(<4 x double> %p, <4 x double> %q) { -entry: - %v = call <4 x double> @foo(<4 x double> %p) - %w = call <4 x double> @foo(<4 x double> %q) - %x = fadd <4 x double> %v, %w - ret <4 x double> %x - -; CHECK-LABEL: @bar -; CHECK: qvstfdx 2, -; CHECK: bl foo -; CHECK: qvstfdx 1, -; CHECK: qvlfdx 1, -; CHECK: bl foo -; CHECK: qvlfdx [[REG:[0-9]+]], -; CHECK: qvfadd 1, [[REG]], 1 -} - diff --git a/llvm/test/CodeGen/PowerPC/qpx-load-splat.ll b/llvm/test/CodeGen/PowerPC/qpx-load-splat.ll deleted file mode 100644 index 50b864980d985..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-load-splat.ll +++ /dev/null @@ -1,80 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -ppc-vsr-nums-as-vr \ -; RUN: -ppc-asm-full-reg-names -verify-machineinstrs < %s | FileCheck %s - -; Function Attrs: norecurse nounwind readonly -define <4 x double> @foo(double* nocapture readonly %a) #0 { -; CHECK-LABEL: foo: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lxvdsx v2, 0, r3 -; CHECK-NEXT: vmr v3, v2 -; CHECK-NEXT: blr -entry: - %0 = load double, double* %a, align 8 - %vecinit.i = insertelement <4 x double> undef, double %0, i32 0 - %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer - ret <4 x double> %shuffle.i -} - -define <4 x double> @foox(double* nocapture readonly %a, i64 %idx) #0 { -; 
CHECK-LABEL: foox: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: sldi r4, r4, 3 -; CHECK-NEXT: lxvdsx v2, r3, r4 -; CHECK-NEXT: vmr v3, v2 -; CHECK-NEXT: blr -entry: - %p = getelementptr double, double* %a, i64 %idx - %0 = load double, double* %p, align 8 - %vecinit.i = insertelement <4 x double> undef, double %0, i32 0 - %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer - ret <4 x double> %shuffle.i -} - -define <4 x double> @fooxu(double* nocapture readonly %a, i64 %idx, double** %pptr) #0 { -; CHECK-LABEL: fooxu: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: sldi r4, r4, 3 -; CHECK-NEXT: add r6, r3, r4 -; CHECK-NEXT: std r6, 0(r5) -; CHECK-NEXT: lxvdsx v2, r3, r4 -; CHECK-NEXT: vmr v3, v2 -; CHECK-NEXT: blr -entry: - %p = getelementptr double, double* %a, i64 %idx - %0 = load double, double* %p, align 8 - %vecinit.i = insertelement <4 x double> undef, double %0, i32 0 - %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer - store double* %p, double** %pptr, align 8 - ret <4 x double> %shuffle.i -} - -define <4 x float> @foof(float* nocapture readonly %a) #0 { -; CHECK-LABEL: foof: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lfiwzx f0, 0, r3 -; CHECK-NEXT: xxspltw v2, vs0, 1 -; CHECK-NEXT: blr -entry: - %0 = load float, float* %a, align 4 - %vecinit.i = insertelement <4 x float> undef, float %0, i32 0 - %shuffle.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer - ret <4 x float> %shuffle.i -} - -define <4 x float> @foofx(float* nocapture readonly %a, i64 %idx) #0 { -; CHECK-LABEL: foofx: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: sldi r4, r4, 2 -; CHECK-NEXT: lfiwzx f0, r3, r4 -; CHECK-NEXT: xxspltw v2, vs0, 1 -; CHECK-NEXT: blr -entry: - %p = getelementptr float, float* %a, i64 %idx - %0 = load float, float* %p, align 4 - %vecinit.i = insertelement <4 x float> undef, float %0, i32 0 - %shuffle.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer - ret <4 x float> %shuffle.i -} - - diff --git a/llvm/test/CodeGen/PowerPC/qpx-load.ll b/llvm/test/CodeGen/PowerPC/qpx-load.ll deleted file mode 100644 index 514f0934b6cfc..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-load.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -define <4 x double> @foo(<4 x double>* %p) { -entry: - %v = load <4 x double>, <4 x double>* %p, align 8 - ret <4 x double> %v -} - -; CHECK: @foo -; CHECK-DAG: li [[REG1:[0-9]+]], 31 -; CHECK-DAG: qvlfdx [[REG4:[0-9]+]], 0, 3 -; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], 3, [[REG1]] -; CHECK-DAG: qvlpcldx [[REG3:[0-9]+]], 0, 3 -; CHECK-DAG: qvfperm 1, [[REG4]], [[REG2]], [[REG3]] -; CHECK: blr - -define <4 x double> @bar(<4 x double>* %p) { -entry: - %v = load <4 x double>, <4 x double>* %p, align 32 - ret <4 x double> %v -} - -; CHECK: @bar -; CHECK: qvlfdx - diff --git a/llvm/test/CodeGen/PowerPC/qpx-qvfmadd.ll b/llvm/test/CodeGen/PowerPC/qpx-qvfmadd.ll deleted file mode 100644 index eab4d6af7e9fc..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-qvfmadd.ll +++ /dev/null @@ -1,79 +0,0 @@ -; RUN: llc -verify-machineinstrs -stop-after=finalize-isel < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -define <2 x double> @test_qvfmadd(<2 x double> %0, <2 x double> %1, <2 x double> %2) { -; CHECK: test_qvfmadd -; CHECK: QVFMADD %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <2 x double> %2, %1 - %5 = fadd reassoc 
nsz <2 x double> %4, %0 - ret <2 x double> %5 -} - -define <4 x float> @test_qvfmadds(<4 x float> %0, <4 x float> %1, <4 x float> %2) { -; CHECK: test_qvfmadds -; CHECK: QVFMADDSs %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <4 x float> %2, %1 - %5 = fadd reassoc nsz <4 x float> %4, %0 - ret <4 x float> %5 -} - -define <2 x double> @test_qvfnmadd(<2 x double> %0, <2 x double> %1, <2 x double> %2) { -; CHECK: test_qvfnmadd -; CHECK: QVFNMADD %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <2 x double> %2, %1 - %5 = fadd reassoc nsz <2 x double> %4, %0 - %6 = fneg reassoc nsz <2 x double> %5 - ret <2 x double> %6 -} - -define <4 x float> @test_qvfnmadds(<4 x float> %0, <4 x float> %1, <4 x float> %2) { -; CHECK: test_qvfnmadds -; CHECK: QVFNMADDSs %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <4 x float> %2, %1 - %5 = fadd reassoc nsz <4 x float> %4, %0 - %6 = fneg reassoc nsz <4 x float> %5 - ret <4 x float> %6 -} - -define <2 x double> @test_qvfmsub(<2 x double> %0, <2 x double> %1, <2 x double> %2) { -; CHECK: test_qvfmsub -; CHECK: QVFMSUB %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <2 x double> %2, %1 - %5 = fsub reassoc nsz <2 x double> %4, %0 - ret <2 x double> %5 -} - -define <4 x float> @test_qvfmsubs(<4 x float> %0, <4 x float> %1, <4 x float> %2) { -; CHECK: test_qvfmsubs -; CHECK: QVFMSUBSs %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <4 x float> %2, %1 - %5 = fsub reassoc nsz <4 x float> %4, %0 - ret <4 x float> %5 -} - -define <2 x double> @test_qvfnmsub(<2 x double> %0, <2 x double> %1, <2 x double> %2) { -; CHECK: test_qvfnmsub -; CHECK: QVFNMSUB %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <2 x double> %2, %1 - %5 = fsub reassoc nsz <2 x double> %4, %0 - %6 = fneg reassoc nsz <2 x double> %5 - ret <2 x double> %6 -} - -define <4 x float> @test_qvfnmsubs(<4 x float> %0, <4 x float> %1, <4 x float> %2) { -; CHECK: test_qvfnmsubs -; CHECK: QVFNMSUBSs %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <4 x float> %2, %1 - %5 = fsub reassoc nsz <4 x float> %4, %0 - %6 = fneg reassoc nsz <4 x float> %5 - ret <4 x float> %6 -} - diff --git a/llvm/test/CodeGen/PowerPC/qpx-recipest.ll b/llvm/test/CodeGen/PowerPC/qpx-recipest.ll deleted file mode 100644 index 498ab62819ced..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-recipest.ll +++ /dev/null @@ -1,473 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2q | FileCheck %s -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - -declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) -declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) - -define <4 x double> @foo_fmf(<4 x double> %a, <4 x double> %b) nounwind { -; CHECK-LABEL: foo_fmf: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha -; CHECK-NEXT: qvfrsqrte 3, 2 -; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l -; CHECK-NEXT: qvlfdx 0, 0, 3 -; CHECK-NEXT: qvfmul 4, 3, 3 -; CHECK-NEXT: qvfmsub 2, 2, 0, 2 -; CHECK-NEXT: qvfnmsub 4, 2, 4, 0 -; CHECK-NEXT: qvfmul 3, 3, 4 -; CHECK-NEXT: qvfmul 4, 3, 3 -; CHECK-NEXT: qvfnmsub 0, 2, 4, 0 -; CHECK-NEXT: qvfmul 0, 3, 0 -; CHECK-NEXT: qvfmul 1, 1, 0 -; CHECK-NEXT: blr -entry: - %x = call ninf afn reassoc <4 x double> @llvm.sqrt.v4f64(<4 x double> %b) - %r = fdiv arcp reassoc <4 x double> %a, %x - ret <4 x double> %r -} - -define <4 x double> @foo_safe(<4 
x double> %a, <4 x double> %b) nounwind { -; CHECK-LABEL: foo_safe: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 5, 2, 3 -; CHECK-NEXT: qvesplati 3, 2, 1 -; CHECK-NEXT: qvesplati 4, 2, 2 -; CHECK-NEXT: fsqrt 2, 2 -; CHECK-NEXT: fsqrt 5, 5 -; CHECK-NEXT: fsqrt 4, 4 -; CHECK-NEXT: fsqrt 3, 3 -; CHECK-NEXT: qvesplati 6, 1, 3 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: fdiv 2, 1, 2 -; CHECK-NEXT: fdiv 5, 6, 5 -; CHECK-NEXT: qvesplati 6, 1, 2 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fdiv 4, 6, 4 -; CHECK-NEXT: fdiv 1, 1, 3 -; CHECK-NEXT: qvfperm 3, 4, 5, 0 -; CHECK-NEXT: qvfperm 0, 2, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 3, 1 -; CHECK-NEXT: blr -entry: - %x = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %b) - %r = fdiv <4 x double> %a, %x - ret <4 x double> %r -} - -define <4 x double> @foof_fmf(<4 x double> %a, <4 x float> %b) nounwind { -; CHECK-LABEL: foof_fmf: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI2_0@toc@ha -; CHECK-NEXT: qvfrsqrtes 3, 2 -; CHECK-NEXT: addi 3, 3, .LCPI2_0@toc@l -; CHECK-NEXT: qvlfsx 0, 0, 3 -; CHECK-NEXT: qvfmuls 4, 3, 3 -; CHECK-NEXT: qvfmsubs 2, 2, 0, 2 -; CHECK-NEXT: qvfnmsubs 0, 2, 4, 0 -; CHECK-NEXT: qvfmuls 0, 3, 0 -; CHECK-NEXT: qvfmul 1, 1, 0 -; CHECK-NEXT: blr -entry: - %x = call afn ninf reassoc <4 x float> @llvm.sqrt.v4f32(<4 x float> %b) - %y = fpext <4 x float> %x to <4 x double> - %r = fdiv arcp reassoc nsz <4 x double> %a, %y - ret <4 x double> %r -} - -define <4 x double> @foof_safe(<4 x double> %a, <4 x float> %b) nounwind { -; CHECK-LABEL: foof_safe: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 0, 2, 3 -; CHECK-NEXT: qvesplati 3, 2, 2 -; CHECK-NEXT: fsqrts 4, 2 -; CHECK-NEXT: qvesplati 2, 2, 1 -; CHECK-NEXT: fsqrts 0, 0 -; CHECK-NEXT: fsqrts 3, 3 -; CHECK-NEXT: fsqrts 2, 2 -; CHECK-NEXT: qvgpci 5, 275 -; CHECK-NEXT: qvgpci 6, 101 -; CHECK-NEXT: qvfperm 0, 3, 0, 5 -; CHECK-NEXT: qvesplati 3, 1, 2 -; CHECK-NEXT: qvfperm 2, 4, 2, 5 -; CHECK-NEXT: qvfperm 0, 2, 0, 6 -; CHECK-NEXT: qvesplati 2, 1, 3 -; CHECK-NEXT: qvesplati 4, 0, 3 -; CHECK-NEXT: fdiv 2, 2, 4 -; CHECK-NEXT: qvesplati 4, 0, 2 -; CHECK-NEXT: fdiv 3, 3, 4 -; CHECK-NEXT: qvesplati 4, 1, 1 -; CHECK-NEXT: fdiv 1, 1, 0 -; CHECK-NEXT: qvesplati 0, 0, 1 -; CHECK-NEXT: fdiv 0, 4, 0 -; CHECK-NEXT: qvfperm 2, 3, 2, 5 -; CHECK-NEXT: qvfperm 0, 1, 0, 5 -; CHECK-NEXT: qvfperm 1, 0, 2, 6 -; CHECK-NEXT: blr -entry: - %x = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %b) - %y = fpext <4 x float> %x to <4 x double> - %r = fdiv <4 x double> %a, %y - ret <4 x double> %r -} - -define <4 x float> @food_fmf(<4 x float> %a, <4 x double> %b) nounwind { -; CHECK-LABEL: food_fmf: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI4_0@toc@ha -; CHECK-NEXT: qvfrsqrte 3, 2 -; CHECK-NEXT: addi 3, 3, .LCPI4_0@toc@l -; CHECK-NEXT: qvlfdx 0, 0, 3 -; CHECK-NEXT: qvfmul 4, 3, 3 -; CHECK-NEXT: qvfmsub 2, 2, 0, 2 -; CHECK-NEXT: qvfnmsub 4, 2, 4, 0 -; CHECK-NEXT: qvfmul 3, 3, 4 -; CHECK-NEXT: qvfmul 4, 3, 3 -; CHECK-NEXT: qvfnmsub 0, 2, 4, 0 -; CHECK-NEXT: qvfmul 0, 3, 0 -; CHECK-NEXT: qvfrsp 0, 0 -; CHECK-NEXT: qvfmuls 1, 1, 0 -; CHECK-NEXT: blr -entry: - %x = call afn ninf reassoc <4 x double> @llvm.sqrt.v4f64(<4 x double> %b) - %y = fptrunc <4 x double> %x to <4 x float> - %r = fdiv arcp reassoc <4 x float> %a, %y - ret <4 x float> %r -} - -define <4 x float> @food_safe(<4 x float> %a, <4 x double> %b) nounwind { -; CHECK-LABEL: food_safe: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 0, 2, 3 -; CHECK-NEXT: qvesplati 3, 2, 2 -; CHECK-NEXT: 
fsqrt 4, 2 -; CHECK-NEXT: qvesplati 2, 2, 1 -; CHECK-NEXT: fsqrt 0, 0 -; CHECK-NEXT: fsqrt 3, 3 -; CHECK-NEXT: fsqrt 2, 2 -; CHECK-NEXT: qvgpci 5, 275 -; CHECK-NEXT: qvgpci 6, 101 -; CHECK-NEXT: qvfperm 0, 3, 0, 5 -; CHECK-NEXT: qvesplati 3, 1, 2 -; CHECK-NEXT: qvfperm 2, 4, 2, 5 -; CHECK-NEXT: qvfperm 0, 2, 0, 6 -; CHECK-NEXT: qvesplati 2, 1, 3 -; CHECK-NEXT: qvfrsp 0, 0 -; CHECK-NEXT: qvesplati 4, 0, 3 -; CHECK-NEXT: fdivs 2, 2, 4 -; CHECK-NEXT: qvesplati 4, 0, 2 -; CHECK-NEXT: fdivs 3, 3, 4 -; CHECK-NEXT: qvesplati 4, 1, 1 -; CHECK-NEXT: fdivs 1, 1, 0 -; CHECK-NEXT: qvesplati 0, 0, 1 -; CHECK-NEXT: fdivs 0, 4, 0 -; CHECK-NEXT: qvfperm 2, 3, 2, 5 -; CHECK-NEXT: qvfperm 0, 1, 0, 5 -; CHECK-NEXT: qvfperm 1, 0, 2, 6 -; CHECK-NEXT: blr -entry: - %x = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %b) - %y = fptrunc <4 x double> %x to <4 x float> - %r = fdiv <4 x float> %a, %y - ret <4 x float> %r -} - -define <4 x float> @goo_fmf(<4 x float> %a, <4 x float> %b) nounwind { -; CHECK-LABEL: goo_fmf: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI6_0@toc@ha -; CHECK-NEXT: qvfrsqrtes 3, 2 -; CHECK-NEXT: addi 3, 3, .LCPI6_0@toc@l -; CHECK-NEXT: qvlfsx 0, 0, 3 -; CHECK-NEXT: qvfmuls 4, 3, 3 -; CHECK-NEXT: qvfmsubs 2, 2, 0, 2 -; CHECK-NEXT: qvfnmsubs 0, 2, 4, 0 -; CHECK-NEXT: qvfmuls 0, 3, 0 -; CHECK-NEXT: qvfmuls 1, 1, 0 -; CHECK-NEXT: blr -entry: - %x = call afn ninf reassoc <4 x float> @llvm.sqrt.v4f32(<4 x float> %b) - %r = fdiv arcp reassoc nsz <4 x float> %a, %x - ret <4 x float> %r -} - -define <4 x float> @goo_safe(<4 x float> %a, <4 x float> %b) nounwind { -; CHECK-LABEL: goo_safe: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 5, 2, 3 -; CHECK-NEXT: qvesplati 3, 2, 1 -; CHECK-NEXT: qvesplati 4, 2, 2 -; CHECK-NEXT: fsqrts 2, 2 -; CHECK-NEXT: fsqrts 5, 5 -; CHECK-NEXT: fsqrts 4, 4 -; CHECK-NEXT: fsqrts 3, 3 -; CHECK-NEXT: qvesplati 6, 1, 3 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: fdivs 2, 1, 2 -; CHECK-NEXT: fdivs 5, 6, 5 -; CHECK-NEXT: qvesplati 6, 1, 2 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fdivs 4, 6, 4 -; CHECK-NEXT: fdivs 1, 1, 3 -; CHECK-NEXT: qvfperm 3, 4, 5, 0 -; CHECK-NEXT: qvfperm 0, 2, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 3, 1 -; CHECK-NEXT: blr -entry: - %x = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %b) - %r = fdiv <4 x float> %a, %x - ret <4 x float> %r -} - -define <4 x double> @foo2_fmf(<4 x double> %a, <4 x double> %b) nounwind { -; CHECK-LABEL: foo2_fmf: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI8_0@toc@ha -; CHECK-NEXT: qvfre 3, 2 -; CHECK-NEXT: addi 3, 3, .LCPI8_0@toc@l -; CHECK-NEXT: qvlfdx 0, 0, 3 -; CHECK-NEXT: qvfmadd 0, 2, 3, 0 -; CHECK-NEXT: qvfnmsub 0, 3, 0, 3 -; CHECK-NEXT: qvfmul 3, 1, 0 -; CHECK-NEXT: qvfnmsub 1, 2, 3, 1 -; CHECK-NEXT: qvfmadd 1, 0, 1, 3 -; CHECK-NEXT: blr -entry: - %r = fdiv arcp reassoc nsz ninf <4 x double> %a, %b - ret <4 x double> %r -} - -define <4 x double> @foo2_safe(<4 x double> %a, <4 x double> %b) nounwind { -; CHECK-LABEL: foo2_safe: -; CHECK: # %bb.0: -; CHECK-NEXT: qvesplati 3, 2, 3 -; CHECK-NEXT: qvesplati 4, 1, 3 -; CHECK-NEXT: qvesplati 5, 2, 2 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: fdiv 3, 4, 3 -; CHECK-NEXT: qvesplati 4, 1, 2 -; CHECK-NEXT: fdiv 4, 4, 5 -; CHECK-NEXT: fdiv 5, 1, 2 -; CHECK-NEXT: qvesplati 2, 2, 1 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fdiv 1, 1, 2 -; CHECK-NEXT: qvfperm 2, 4, 3, 0 -; CHECK-NEXT: qvfperm 0, 5, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 2, 1 -; CHECK-NEXT: blr - %r = fdiv <4 x 
double> %a, %b - ret <4 x double> %r -} - -define <4 x float> @goo2_fmf(<4 x float> %a, <4 x float> %b) nounwind { -; CHECK-LABEL: goo2_fmf: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvfres 0, 2 -; CHECK-NEXT: qvfmuls 3, 1, 0 -; CHECK-NEXT: qvfnmsubs 1, 2, 3, 1 -; CHECK-NEXT: qvfmadds 1, 0, 1, 3 -; CHECK-NEXT: blr -entry: - %r = fdiv arcp reassoc ninf <4 x float> %a, %b - ret <4 x float> %r -} - -define <4 x float> @goo2_safe(<4 x float> %a, <4 x float> %b) nounwind { -; CHECK-LABEL: goo2_safe: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 3, 2, 3 -; CHECK-NEXT: qvesplati 4, 1, 3 -; CHECK-NEXT: qvesplati 5, 2, 2 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: fdivs 3, 4, 3 -; CHECK-NEXT: qvesplati 4, 1, 2 -; CHECK-NEXT: fdivs 4, 4, 5 -; CHECK-NEXT: fdivs 5, 1, 2 -; CHECK-NEXT: qvesplati 2, 2, 1 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fdivs 1, 1, 2 -; CHECK-NEXT: qvfperm 2, 4, 3, 0 -; CHECK-NEXT: qvfperm 0, 5, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 2, 1 -; CHECK-NEXT: blr -entry: - %r = fdiv <4 x float> %a, %b - ret <4 x float> %r -} - -define <4 x double> @foo3_fmf_denorm_on(<4 x double> %a) #0 { -; CHECK-LABEL: foo3_fmf_denorm_on: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI12_0@toc@ha -; CHECK-NEXT: qvfrsqrte 0, 1 -; CHECK-NEXT: addi 3, 3, .LCPI12_0@toc@l -; CHECK-NEXT: qvlfdx 2, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI12_1@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI12_1@toc@l -; CHECK-NEXT: qvfmul 3, 0, 0 -; CHECK-NEXT: qvfmsub 4, 1, 2, 1 -; CHECK-NEXT: qvfnmsub 3, 4, 3, 2 -; CHECK-NEXT: qvfmul 0, 0, 3 -; CHECK-NEXT: qvfmul 3, 0, 0 -; CHECK-NEXT: qvfnmsub 2, 4, 3, 2 -; CHECK-NEXT: qvfmul 0, 0, 2 -; CHECK-NEXT: qvlfdx 2, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI12_2@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI12_2@toc@l -; CHECK-NEXT: qvlfdx 3, 0, 3 -; CHECK-NEXT: qvfmul 0, 0, 1 -; CHECK-NEXT: qvfabs 1, 1 -; CHECK-NEXT: qvfcmplt 1, 1, 2 -; CHECK-NEXT: qvfsel 1, 1, 3, 0 -; CHECK-NEXT: blr -entry: - %r = call reassoc ninf afn <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) - ret <4 x double> %r -} - -define <4 x double> @foo3_fmf_denorm_off(<4 x double> %a) #1 { -; CHECK-LABEL: foo3_fmf_denorm_off: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI13_0@toc@ha -; CHECK-NEXT: qvfrsqrte 0, 1 -; CHECK-NEXT: addi 3, 3, .LCPI13_0@toc@l -; CHECK-NEXT: qvlfdx 2, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI13_1@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI13_1@toc@l -; CHECK-NEXT: qvfmul 3, 0, 0 -; CHECK-NEXT: qvfmsub 4, 1, 2, 1 -; CHECK-NEXT: qvfnmsub 3, 4, 3, 2 -; CHECK-NEXT: qvfmul 0, 0, 3 -; CHECK-NEXT: qvfmul 3, 0, 0 -; CHECK-NEXT: qvfnmsub 2, 4, 3, 2 -; CHECK-NEXT: qvfmul 0, 0, 2 -; CHECK-NEXT: qvlfdx 2, 0, 3 -; CHECK-NEXT: qvfmul 0, 0, 1 -; CHECK-NEXT: qvfcmpeq 1, 1, 2 -; CHECK-NEXT: qvfsel 1, 1, 2, 0 -; CHECK-NEXT: blr -entry: - %r = call afn reassoc ninf <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) - ret <4 x double> %r -} - -define <4 x double> @foo3_safe_denorm_on(<4 x double> %a) #0 { -; CHECK-LABEL: foo3_safe_denorm_on: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 2, 1, 3 -; CHECK-NEXT: qvesplati 3, 1, 2 -; CHECK-NEXT: fsqrt 4, 1 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fsqrt 2, 2 -; CHECK-NEXT: fsqrt 3, 3 -; CHECK-NEXT: fsqrt 1, 1 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: qvfperm 2, 3, 2, 0 -; CHECK-NEXT: qvfperm 0, 4, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 2, 1 -; CHECK-NEXT: blr -entry: - %r = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) - ret <4 x double> %r -} - -define <4 x double> 
@foo3_safe_denorm_off(<4 x double> %a) #1 { -; CHECK-LABEL: foo3_safe_denorm_off: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 2, 1, 3 -; CHECK-NEXT: qvesplati 3, 1, 2 -; CHECK-NEXT: fsqrt 4, 1 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fsqrt 2, 2 -; CHECK-NEXT: fsqrt 3, 3 -; CHECK-NEXT: fsqrt 1, 1 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: qvfperm 2, 3, 2, 0 -; CHECK-NEXT: qvfperm 0, 4, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 2, 1 -; CHECK-NEXT: blr -entry: - %r = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) - ret <4 x double> %r -} - -define <4 x float> @goo3_fmf_denorm_on(<4 x float> %a) #0 { -; CHECK-LABEL: goo3_fmf_denorm_on: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI16_1@toc@ha -; CHECK-NEXT: qvfrsqrtes 2, 1 -; CHECK-NEXT: addi 3, 3, .LCPI16_1@toc@l -; CHECK-NEXT: qvlfsx 0, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI16_0@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI16_0@toc@l -; CHECK-NEXT: qvfmuls 4, 2, 2 -; CHECK-NEXT: qvfmsubs 3, 1, 0, 1 -; CHECK-NEXT: qvfnmsubs 0, 3, 4, 0 -; CHECK-NEXT: qvlfsx 3, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI16_2@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI16_2@toc@l -; CHECK-NEXT: qvlfsx 4, 0, 3 -; CHECK-NEXT: qvfmuls 0, 2, 0 -; CHECK-NEXT: qvfabs 2, 1 -; CHECK-NEXT: qvfmuls 0, 0, 1 -; CHECK-NEXT: qvfcmplt 1, 2, 3 -; CHECK-NEXT: qvfsel 1, 1, 4, 0 -; CHECK-NEXT: blr -entry: - %r = call reassoc afn ninf nsz <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) - ret <4 x float> %r -} - -define <4 x float> @goo3_fmf_denorm_off(<4 x float> %a) #1 { -; CHECK-LABEL: goo3_fmf_denorm_off: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI17_1@toc@ha -; CHECK-NEXT: qvfrsqrtes 2, 1 -; CHECK-NEXT: addi 3, 3, .LCPI17_1@toc@l -; CHECK-NEXT: qvlfsx 0, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI17_0@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI17_0@toc@l -; CHECK-NEXT: qvfmuls 4, 2, 2 -; CHECK-NEXT: qvfmsubs 3, 1, 0, 1 -; CHECK-NEXT: qvfnmsubs 0, 3, 4, 0 -; CHECK-NEXT: qvlfsx 3, 0, 3 -; CHECK-NEXT: qvfmuls 0, 2, 0 -; CHECK-NEXT: qvfmuls 0, 0, 1 -; CHECK-NEXT: qvfcmpeq 1, 1, 3 -; CHECK-NEXT: qvfsel 1, 1, 3, 0 -; CHECK-NEXT: blr -entry: - %r = call reassoc ninf afn nsz <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) - ret <4 x float> %r -} - -define <4 x float> @goo3_safe(<4 x float> %a) nounwind { -; CHECK-LABEL: goo3_safe: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 2, 1, 3 -; CHECK-NEXT: qvesplati 3, 1, 2 -; CHECK-NEXT: fsqrts 4, 1 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fsqrts 2, 2 -; CHECK-NEXT: fsqrts 3, 3 -; CHECK-NEXT: fsqrts 1, 1 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: qvfperm 2, 3, 2, 0 -; CHECK-NEXT: qvfperm 0, 4, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 2, 1 -; CHECK-NEXT: blr -entry: - %r = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) - ret <4 x float> %r -} - -attributes #0 = { nounwind "denormal-fp-math"="ieee,ieee" } -attributes #1 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } diff --git a/llvm/test/CodeGen/PowerPC/qpx-rounding-ops.ll b/llvm/test/CodeGen/PowerPC/qpx-rounding-ops.ll deleted file mode 100644 index ee3357156a6c0..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-rounding-ops.ll +++ /dev/null @@ -1,109 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2q | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2q -enable-unsafe-fp-math | FileCheck -check-prefix=CHECK-FM %s -target datalayout = 
"E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - -define <4 x float> @test1(<4 x float> %x) nounwind { - %call = tail call <4 x float> @llvm.floor.v4f32(<4 x float> %x) nounwind readnone - ret <4 x float> %call - -; CHECK: test1: -; CHECK: qvfrim 1, 1 - -; CHECK-FM: test1: -; CHECK-FM: qvfrim 1, 1 -} - -declare <4 x float> @llvm.floor.v4f32(<4 x float>) nounwind readnone - -define <4 x double> @test2(<4 x double> %x) nounwind { - %call = tail call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone - ret <4 x double> %call - -; CHECK: test2: -; CHECK: qvfrim 1, 1 - -; CHECK-FM: test2: -; CHECK-FM: qvfrim 1, 1 -} - -declare <4 x double> @llvm.floor.v4f64(<4 x double>) nounwind readnone - -define <4 x float> @test3(<4 x float> %x) nounwind { - %call = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %x) nounwind readnone - ret <4 x float> %call - -; CHECK: test3: -; CHECK-NOT: qvfrin - -; CHECK-FM: test3: -; CHECK-FM-NOT: qvfrin -} - -declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) nounwind readnone - -define <4 x double> @test4(<4 x double> %x) nounwind { - %call = tail call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %x) nounwind readnone - ret <4 x double> %call - -; CHECK: test4: -; CHECK-NOT: qvfrin - -; CHECK-FM: test4: -; CHECK-FM-NOT: qvfrin -} - -declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) nounwind readnone - -define <4 x float> @test5(<4 x float> %x) nounwind { - %call = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone - ret <4 x float> %call - -; CHECK: test5: -; CHECK: qvfrip 1, 1 - -; CHECK-FM: test5: -; CHECK-FM: qvfrip 1, 1 -} - -declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone - -define <4 x double> @test6(<4 x double> %x) nounwind { - %call = tail call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone - ret <4 x double> %call - -; CHECK: test6: -; CHECK: qvfrip 1, 1 - -; CHECK-FM: test6: -; CHECK-FM: qvfrip 1, 1 -} - -declare <4 x double> @llvm.ceil.v4f64(<4 x double>) nounwind readnone - -define <4 x float> @test9(<4 x float> %x) nounwind { - %call = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone - ret <4 x float> %call - -; CHECK: test9: -; CHECK: qvfriz 1, 1 - -; CHECK-FM: test9: -; CHECK-FM: qvfriz 1, 1 -} - -declare <4 x float> @llvm.trunc.v4f32(<4 x float>) nounwind readnone - -define <4 x double> @test10(<4 x double> %x) nounwind { - %call = tail call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone - ret <4 x double> %call - -; CHECK: test10: -; CHECK: qvfriz 1, 1 - -; CHECK-FM: test10: -; CHECK-FM: qvfriz 1, 1 -} - -declare <4 x double> @llvm.trunc.v4f64(<4 x double>) nounwind readnone - diff --git a/llvm/test/CodeGen/PowerPC/qpx-s-load.ll b/llvm/test/CodeGen/PowerPC/qpx-s-load.ll deleted file mode 100644 index 57d7e3b0ded3c..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-s-load.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -define <4 x float> @foo(<4 x float>* %p) { -entry: - %v = load <4 x float>, <4 x float>* %p, align 4 - ret <4 x float> %v -} - -; CHECK: @foo -; CHECK-DAG: li [[REG1:[0-9]+]], 15 -; CHECK-DAG: qvlfsx [[REG4:[0-9]+]], 0, 3 -; CHECK-DAG: qvlfsx [[REG2:[0-9]+]], 3, [[REG1]] -; CHECK-DAG: qvlpclsx [[REG3:[0-9]+]], 0, 3 -; CHECK-DAG: qvfperm 1, [[REG4]], [[REG2]], [[REG3]] -; CHECK: blr - -define <4 x 
float> @bar(<4 x float>* %p) { -entry: - %v = load <4 x float>, <4 x float>* %p, align 16 - ret <4 x float> %v -} - -; CHECK: @bar -; CHECK: qvlfsx - diff --git a/llvm/test/CodeGen/PowerPC/qpx-s-sel.ll b/llvm/test/CodeGen/PowerPC/qpx-s-sel.ll deleted file mode 100644 index 5d42b9a529953..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-s-sel.ll +++ /dev/null @@ -1,143 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -@R = global <4 x i1> <i1 0, i1 0, i1 0, i1 0>, align 16 - -define <4 x float> @test1(<4 x float> %a, <4 x float> %b, <4 x i1> %c) nounwind readnone { -entry: - %r = select <4 x i1> %c, <4 x float> %a, <4 x float> %b - ret <4 x float> %r - -; CHECK-LABEL: @test1 -; CHECK: qvfsel 1, 3, 1, 2 -; CHECK: blr -} - -define <4 x float> @test2(<4 x float> %a, <4 x float> %b, i1 %c1, i1 %c2, i1 %c3, i1 %c4) nounwind readnone { -entry: - %v = insertelement <4 x i1> undef, i1 %c1, i32 0 - %v2 = insertelement <4 x i1> %v, i1 %c2, i32 1 - %v3 = insertelement <4 x i1> %v2, i1 %c3, i32 2 - %v4 = insertelement <4 x i1> %v3, i1 %c4, i32 3 - %r = select <4 x i1> %v4, <4 x float> %a, <4 x float> %b - ret <4 x float> %r - -; CHECK-LABEL: @test2 -; CHECK: stw -; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]], -; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], -; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]] -; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]] -; CHECK: qvfsel 1, [[REG4]], 1, 2 -; CHECK: blr -} - -define <4 x i1> @test3(<4 x i1> %a) nounwind readnone { -entry: - %v = and <4 x i1> %a, <i1 0, i1 undef, i1 1, i1 1> - ret <4 x i1> %v - -; CHECK-LABEL: @test3 -; CHECK: qvlfsx [[REG:[0-9]+]], -; qvflogical 1, 1, [[REG]], 1 -; blr -} - -define <4 x i1> @test4(<4 x i1> %a, <4 x i1>* %t) nounwind { -entry: - %q = load <4 x i1>, <4 x i1>* %t, align 16 - %v = and <4 x i1> %a, %q - ret <4 x i1> %v - -; CHECK-LABEL: @test4 -; CHECK-DAG: lbz -; CHECK-DAG: qvlfdx [[REG1:[0-9]+]], -; CHECK-DAG: stw -; CHECK-DAG: qvlfiwzx [[REG2:[0-9]+]], -; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG1]] -; CHECK: qvfand 1, 1, [[REG4]] -; CHECK: blr -} - -define void @test5(<4 x i1> %a) nounwind { -entry: - store <4 x i1> %a, <4 x i1>* @R - ret void - -; CHECK-LABEL: @test5 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK: lwz -; CHECK: stb -; CHECK: blr -} - -define i1 @test6(<4 x i1> %a) nounwind { -entry: - %r = extractelement <4 x i1> %a, i32 2 - ret i1 %r - -; CHECK-LABEL: @test6 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK: lwz -; CHECK: blr -} - -define i1 @test7(<4 x i1> %a) nounwind { -entry: - %r = extractelement <4 x i1> %a, i32 2 - %s = extractelement <4 x i1> %a, i32 3 - %q = and i1 %r, %s - ret i1 %q - -; CHECK-LABEL: @test7 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK-DAG: lwz [[REG4:[0-9]+]], -; FIXME: We're storing the vector twice, and that's silly.
-; CHECK-DAG: qvstfiwx [[REG3]], -; CHECK: lwz [[REG5:[0-9]+]], -; CHECK: and 3, -; CHECK: blr -} - -define i1 @test8(<3 x i1> %a) nounwind { -entry: - %r = extractelement <3 x i1> %a, i32 2 - ret i1 %r - -; CHECK-LABEL: @test8 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK: lwz -; CHECK: blr -} - -define <3 x float> @test9(<3 x float> %a, <3 x float> %b, i1 %c1, i1 %c2, i1 %c3) nounwind readnone { -entry: - %v = insertelement <3 x i1> undef, i1 %c1, i32 0 - %v2 = insertelement <3 x i1> %v, i1 %c2, i32 1 - %v3 = insertelement <3 x i1> %v2, i1 %c3, i32 2 - %r = select <3 x i1> %v3, <3 x float> %a, <3 x float> %b - ret <3 x float> %r - -; CHECK-LABEL: @test9 -; CHECK: stw -; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]], -; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], -; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]] -; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]] -; CHECK: qvfsel 1, [[REG4]], 1, 2 -; CHECK: blr -} - diff --git a/llvm/test/CodeGen/PowerPC/qpx-s-store.ll b/llvm/test/CodeGen/PowerPC/qpx-s-store.ll deleted file mode 100644 index 81cff7b6457f1..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-s-store.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -define void @foo(<4 x float> %v, <4 x float>* %p) { -entry: - store <4 x float> %v, <4 x float>* %p, align 4 - ret void -} - -; CHECK: @foo -; CHECK: stfs -; CHECK: stfs -; CHECK: stfs -; CHECK: stfs -; CHECK: blr - -define void @bar(<4 x float> %v, <4 x float>* %p) { -entry: - store <4 x float> %v, <4 x float>* %p, align 16 - ret void -} - -; CHECK: @bar -; CHECK: qvstfsx - diff --git a/llvm/test/CodeGen/PowerPC/qpx-sel.ll b/llvm/test/CodeGen/PowerPC/qpx-sel.ll deleted file mode 100644 index abc92c9e98b13..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-sel.ll +++ /dev/null @@ -1,151 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -@R = global <4 x i1> <i1 0, i1 0, i1 0, i1 0>, align 16 - -define <4 x double> @test1(<4 x double> %a, <4 x double> %b, <4 x i1> %c) nounwind readnone { -entry: - %r = select <4 x i1> %c, <4 x double> %a, <4 x double> %b - ret <4 x double> %r - -; CHECK-LABEL: @test1 -; CHECK: qvfsel 1, 3, 1, 2 -; CHECK: blr -} - -define <4 x double> @test2(<4 x double> %a, <4 x double> %b, i1 %c1, i1 %c2, i1 %c3, i1 %c4) nounwind readnone { -entry: - %v = insertelement <4 x i1> undef, i1 %c1, i32 0 - %v2 = insertelement <4 x i1> %v, i1 %c2, i32 1 - %v3 = insertelement <4 x i1> %v2, i1 %c3, i32 2 - %v4 = insertelement <4 x i1> %v3, i1 %c4, i32 3 - %r = select <4 x i1> %v4, <4 x double> %a, <4 x double> %b - ret <4 x double> %r - -; CHECK-LABEL: @test2 - -; FIXME: This load/store sequence is unnecessary.
-; CHECK-DAG: lbz -; CHECK-DAG: stw - -; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]], -; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], -; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]] -; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]] -; CHECK: qvfsel 1, [[REG4]], 1, 2 -; CHECK: blr -} - -define <4 x i1> @test3(<4 x i1> %a) nounwind readnone { -entry: - %v = and <4 x i1> %a, <i1 0, i1 undef, i1 1, i1 1> - ret <4 x i1> %v - -; CHECK-LABEL: @test3 -; CHECK: qvlfsx [[REG:[0-9]+]], -; qvflogical 1, 1, [[REG]], 1 -; blr -} - -define <4 x i1> @test4(<4 x i1> %a, <4 x i1>* %t) nounwind { -entry: - %q = load <4 x i1>, <4 x i1>* %t, align 16 - %v = and <4 x i1> %a, %q - ret <4 x i1> %v - -; CHECK-LABEL: @test4 -; CHECK-DAG: lbz -; CHECK-DAG: qvlfdx [[REG1:[0-9]+]], -; CHECK-DAG: stw -; CHECK-DAG: qvlfiwzx [[REG2:[0-9]+]], -; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG1]] -; CHECK: qvfand 1, 1, [[REG4]] -; CHECK: blr -} - -define void @test5(<4 x i1> %a) nounwind { -entry: - store <4 x i1> %a, <4 x i1>* @R - ret void - -; CHECK-LABEL: @test5 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK: lwz -; CHECK: stb -; CHECK: blr -} - -define i1 @test6(<4 x i1> %a) nounwind { -entry: - %r = extractelement <4 x i1> %a, i32 2 - ret i1 %r - -; CHECK-LABEL: @test6 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK: lwz -; CHECK: blr -} - -define i1 @test7(<4 x i1> %a) nounwind { -entry: - %r = extractelement <4 x i1> %a, i32 2 - %s = extractelement <4 x i1> %a, i32 3 - %q = and i1 %r, %s - ret i1 %q - -; CHECK-LABEL: @test7 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK-DAG: lwz [[REG4:[0-9]+]], -; FIXME: We're storing the vector twice, and that's silly. -; CHECK-DAG: qvstfiwx [[REG3]], -; CHECK-DAG: lwz [[REG5:[0-9]+]], -; CHECK: and 3, -; CHECK: blr -} - -define i1 @test8(<3 x i1> %a) nounwind { -entry: - %r = extractelement <3 x i1> %a, i32 2 - ret i1 %r - -; CHECK-LABEL: @test8 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK: lwz -; CHECK: blr -} - -define <3 x double> @test9(<3 x double> %a, <3 x double> %b, i1 %c1, i1 %c2, i1 %c3) nounwind readnone { -entry: - %v = insertelement <3 x i1> undef, i1 %c1, i32 0 - %v2 = insertelement <3 x i1> %v, i1 %c2, i32 1 - %v3 = insertelement <3 x i1> %v2, i1 %c3, i32 2 - %r = select <3 x i1> %v3, <3 x double> %a, <3 x double> %b - ret <3 x double> %r - -; CHECK-LABEL: @test9 - -; FIXME: This load/store sequence is unnecessary.
-; CHECK-DAG: lbz -; CHECK-DAG: stw - -; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]], -; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], -; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]] -; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]] -; CHECK: qvfsel 1, [[REG4]], 1, 2 -; CHECK: blr -} - diff --git a/llvm/test/CodeGen/PowerPC/qpx-split-vsetcc.ll b/llvm/test/CodeGen/PowerPC/qpx-split-vsetcc.ll deleted file mode 100644 index df3e0befaef8a..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-split-vsetcc.ll +++ /dev/null @@ -1,31 +0,0 @@ -; RUN: llc -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s -target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" - -; Function Attrs: nounwind -define void @gsl_sf_legendre_Pl_deriv_array(<4 x i32> %inp1, <4 x double> %inp2) #0 { -entry: - br label %vector.body198 - -vector.body198: ; preds = %vector.body198, %for.body46.lr.ph - %0 = icmp ne <4 x i32> %inp1, zeroinitializer - %1 = select <4 x i1> %0, <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, <4 x double> <double -1.000000e+00, double -1.000000e+00, double -1.000000e+00, double -1.000000e+00> - %2 = fmul <4 x double> %inp2, %1 - %3 = fmul <4 x double> %inp2, %2 - %4 = fmul <4 x double> %3, %inp2 - store <4 x double> %4, <4 x double>* undef, align 8 - br label %return - -; CHECK-LABEL: @gsl_sf_legendre_Pl_deriv_array -; CHECK: qvlfiwzx -; CHECK: qvfcfidu -; CHECK: qvfcmpeq -; CHECK: qvfsel -; CHECK: qvfmul - -return: ; preds = %if.else.i - ret void -} - -attributes #0 = { nounwind } - diff --git a/llvm/test/CodeGen/PowerPC/qpx-store.ll b/llvm/test/CodeGen/PowerPC/qpx-store.ll deleted file mode 100644 index 2b96576ce4493..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-store.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -define void @foo(<4 x double> %v, <4 x double>* %p) { -entry: - store <4 x double> %v, <4 x double>* %p, align 8 - ret void -} - -; CHECK: @foo -; CHECK: stfd -; CHECK: stfd -; CHECK: stfd -; CHECK: stfd -; CHECK: blr - -define void @bar(<4 x double> %v, <4 x double>* %p) { -entry: - store <4 x double> %v, <4 x double>* %p, align 32 - ret void -} - -; CHECK: @bar -; CHECK: qvstfdx - diff --git a/llvm/test/CodeGen/PowerPC/qpx-unal-cons-lds.ll b/llvm/test/CodeGen/PowerPC/qpx-unal-cons-lds.ll deleted file mode 100644 index e7ab92db6efc9..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-unal-cons-lds.ll +++ /dev/null @@ -1,217 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s -target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - -; Function Attrs: nounwind -define void @foo(double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 { -entry: - br label %vector.body - -; CHECK-LABEL: @foo -; Make sure that the offset constants we use are all even (only the last should be odd).
-; CHECK-DAG: li {{[0-9]+}}, 1056
-; CHECK-DAG: li {{[0-9]+}}, 1088
-; CHECK-DAG: li {{[0-9]+}}, 1152
-; CHECK-DAG: li {{[0-9]+}}, 1216
-; CHECK-DAG: li {{[0-9]+}}, 1280
-; CHECK-DAG: li {{[0-9]+}}, 1344
-; CHECK-DAG: li {{[0-9]+}}, 1408
-; CHECK-DAG: li {{[0-9]+}}, 1472
-; CHECK-DAG: li {{[0-9]+}}, 1536
-; CHECK-DAG: li {{[0-9]+}}, 1600
-; CHECK-DAG: li {{[0-9]+}}, 1568
-; CHECK-DAG: li {{[0-9]+}}, 1664
-; CHECK-DAG: li {{[0-9]+}}, 1632
-; CHECK-DAG: li {{[0-9]+}}, 1728
-; CHECK-DAG: li {{[0-9]+}}, 1696
-; CHECK-DAG: li {{[0-9]+}}, 1792
-; CHECK-DAG: li {{[0-9]+}}, 1760
-; CHECK-DAG: li {{[0-9]+}}, 1856
-; CHECK-DAG: li {{[0-9]+}}, 1824
-; CHECK-DAG: li {{[0-9]+}}, 1920
-; CHECK-DAG: li {{[0-9]+}}, 1888
-; CHECK-DAG: li {{[0-9]+}}, 1984
-; CHECK-DAG: li {{[0-9]+}}, 1952
-; CHECK-DAG: li {{[0-9]+}}, 2016
-; CHECK-DAG: li {{[0-9]+}}, 1024
-; CHECK-DAG: li {{[0-9]+}}, 1120
-; CHECK-DAG: li {{[0-9]+}}, 1184
-; CHECK-DAG: li {{[0-9]+}}, 1248
-; CHECK-DAG: li {{[0-9]+}}, 1312
-; CHECK-DAG: li {{[0-9]+}}, 1376
-; CHECK-DAG: li {{[0-9]+}}, 1440
-; CHECK-DAG: li {{[0-9]+}}, 1504
-; CHECK-DAG: li {{[0-9]+}}, 2047
-; CHECK: blr
-
-vector.body: ; preds = %vector.body, %entry
- %index = phi i64 [ 0, %entry ], [ %index.next.15, %vector.body ]
- %0 = shl i64 %index, 1
- %1 = getelementptr inbounds double, double* %b, i64 %0
- %2 = bitcast double* %1 to <8 x double>*
- %wide.vec = load <8 x double>, <8 x double>* %2, align 8
- %strided.vec = shufflevector <8 x double> %wide.vec, <8 x double> undef, <4 x i32>
- %3 = fadd <4 x double> %strided.vec,
- %4 = getelementptr inbounds double, double* %a, i64 %index
- %5 = bitcast double* %4 to <4 x double>*
- store <4 x double> %3, <4 x double>* %5, align 8
- %index.next = or i64 %index, 4
- %6 = shl i64 %index.next, 1
- %7 = getelementptr inbounds double, double* %b, i64 %6
- %8 = bitcast double* %7 to <8 x double>*
- %wide.vec.1 = load <8 x double>, <8 x double>* %8, align 8
- %strided.vec.1 = shufflevector <8 x double> %wide.vec.1, <8 x double> undef, <4 x i32>
- %9 = fadd <4 x double> %strided.vec.1,
- %10 = getelementptr inbounds double, double* %a, i64 %index.next
- %11 = bitcast double* %10 to <4 x double>*
- store <4 x double> %9, <4 x double>* %11, align 8
- %index.next.1 = or i64 %index, 8
- %12 = shl i64 %index.next.1, 1
- %13 = getelementptr inbounds double, double* %b, i64 %12
- %14 = bitcast double* %13 to <8 x double>*
- %wide.vec.2 = load <8 x double>, <8 x double>* %14, align 8
- %strided.vec.2 = shufflevector <8 x double> %wide.vec.2, <8 x double> undef, <4 x i32>
- %15 = fadd <4 x double> %strided.vec.2,
- %16 = getelementptr inbounds double, double* %a, i64 %index.next.1
- %17 = bitcast double* %16 to <4 x double>*
- store <4 x double> %15, <4 x double>* %17, align 8
- %index.next.2 = or i64 %index, 12
- %18 = shl i64 %index.next.2, 1
- %19 = getelementptr inbounds double, double* %b, i64 %18
- %20 = bitcast double* %19 to <8 x double>*
- %wide.vec.3 = load <8 x double>, <8 x double>* %20, align 8
- %strided.vec.3 = shufflevector <8 x double> %wide.vec.3, <8 x double> undef, <4 x i32>
- %21 = fadd <4 x double> %strided.vec.3,
- %22 = getelementptr inbounds double, double* %a, i64 %index.next.2
- %23 = bitcast double* %22 to <4 x double>*
- store <4 x double> %21, <4 x double>* %23, align 8
- %index.next.3 = or i64 %index, 16
- %24 = shl i64 %index.next.3, 1
- %25 = getelementptr inbounds double, double* %b, i64 %24
- %26 = bitcast double* %25 to <8 x double>*
- %wide.vec.4 = load <8 x double>, <8 x double>* %26, align 8
- %strided.vec.4 = shufflevector <8 x double> %wide.vec.4, <8 x double> undef, <4 x i32>
- %27 = fadd <4 x double> %strided.vec.4,
- %28 = getelementptr inbounds double, double* %a, i64 %index.next.3
- %29 = bitcast double* %28 to <4 x double>*
- store <4 x double> %27, <4 x double>* %29, align 8
- %index.next.4 = or i64 %index, 20
- %30 = shl i64 %index.next.4, 1
- %31 = getelementptr inbounds double, double* %b, i64 %30
- %32 = bitcast double* %31 to <8 x double>*
- %wide.vec.5 = load <8 x double>, <8 x double>* %32, align 8
- %strided.vec.5 = shufflevector <8 x double> %wide.vec.5, <8 x double> undef, <4 x i32>
- %33 = fadd <4 x double> %strided.vec.5,
- %34 = getelementptr inbounds double, double* %a, i64 %index.next.4
- %35 = bitcast double* %34 to <4 x double>*
- store <4 x double> %33, <4 x double>* %35, align 8
- %index.next.5 = or i64 %index, 24
- %36 = shl i64 %index.next.5, 1
- %37 = getelementptr inbounds double, double* %b, i64 %36
- %38 = bitcast double* %37 to <8 x double>*
- %wide.vec.6 = load <8 x double>, <8 x double>* %38, align 8
- %strided.vec.6 = shufflevector <8 x double> %wide.vec.6, <8 x double> undef, <4 x i32>
- %39 = fadd <4 x double> %strided.vec.6,
- %40 = getelementptr inbounds double, double* %a, i64 %index.next.5
- %41 = bitcast double* %40 to <4 x double>*
- store <4 x double> %39, <4 x double>* %41, align 8
- %index.next.6 = or i64 %index, 28
- %42 = shl i64 %index.next.6, 1
- %43 = getelementptr inbounds double, double* %b, i64 %42
- %44 = bitcast double* %43 to <8 x double>*
- %wide.vec.7 = load <8 x double>, <8 x double>* %44, align 8
- %strided.vec.7 = shufflevector <8 x double> %wide.vec.7, <8 x double> undef, <4 x i32>
- %45 = fadd <4 x double> %strided.vec.7,
- %46 = getelementptr inbounds double, double* %a, i64 %index.next.6
- %47 = bitcast double* %46 to <4 x double>*
- store <4 x double> %45, <4 x double>* %47, align 8
- %index.next.7 = or i64 %index, 32
- %48 = shl i64 %index.next.7, 1
- %49 = getelementptr inbounds double, double* %b, i64 %48
- %50 = bitcast double* %49 to <8 x double>*
- %wide.vec.8 = load <8 x double>, <8 x double>* %50, align 8
- %strided.vec.8 = shufflevector <8 x double> %wide.vec.8, <8 x double> undef, <4 x i32>
- %51 = fadd <4 x double> %strided.vec.8,
- %52 = getelementptr inbounds double, double* %a, i64 %index.next.7
- %53 = bitcast double* %52 to <4 x double>*
- store <4 x double> %51, <4 x double>* %53, align 8
- %index.next.8 = or i64 %index, 36
- %54 = shl i64 %index.next.8, 1
- %55 = getelementptr inbounds double, double* %b, i64 %54
- %56 = bitcast double* %55 to <8 x double>*
- %wide.vec.9 = load <8 x double>, <8 x double>* %56, align 8
- %strided.vec.9 = shufflevector <8 x double> %wide.vec.9, <8 x double> undef, <4 x i32>
- %57 = fadd <4 x double> %strided.vec.9,
- %58 = getelementptr inbounds double, double* %a, i64 %index.next.8
- %59 = bitcast double* %58 to <4 x double>*
- store <4 x double> %57, <4 x double>* %59, align 8
- %index.next.9 = or i64 %index, 40
- %60 = shl i64 %index.next.9, 1
- %61 = getelementptr inbounds double, double* %b, i64 %60
- %62 = bitcast double* %61 to <8 x double>*
- %wide.vec.10 = load <8 x double>, <8 x double>* %62, align 8
- %strided.vec.10 = shufflevector <8 x double> %wide.vec.10, <8 x double> undef, <4 x i32>
- %63 = fadd <4 x double> %strided.vec.10,
- %64 = getelementptr inbounds double, double* %a, i64 %index.next.9
- %65 = bitcast double* %64 to <4 x double>*
- store <4 x double> %63, <4 x double>* %65, align 8
- %index.next.10 = or i64 %index, 44
- %66 = shl i64 %index.next.10, 1
- %67 = getelementptr inbounds double, double* %b, i64 %66
- %68 = bitcast double* %67 to <8 x double>*
- %wide.vec.11 = load <8 x double>, <8 x double>* %68, align 8
- %strided.vec.11 = shufflevector <8 x double> %wide.vec.11, <8 x double> undef, <4 x i32>
- %69 = fadd <4 x double> %strided.vec.11,
- %70 = getelementptr inbounds double, double* %a, i64 %index.next.10
- %71 = bitcast double* %70 to <4 x double>*
- store <4 x double> %69, <4 x double>* %71, align 8
- %index.next.11 = or i64 %index, 48
- %72 = shl i64 %index.next.11, 1
- %73 = getelementptr inbounds double, double* %b, i64 %72
- %74 = bitcast double* %73 to <8 x double>*
- %wide.vec.12 = load <8 x double>, <8 x double>* %74, align 8
- %strided.vec.12 = shufflevector <8 x double> %wide.vec.12, <8 x double> undef, <4 x i32>
- %75 = fadd <4 x double> %strided.vec.12,
- %76 = getelementptr inbounds double, double* %a, i64 %index.next.11
- %77 = bitcast double* %76 to <4 x double>*
- store <4 x double> %75, <4 x double>* %77, align 8
- %index.next.12 = or i64 %index, 52
- %78 = shl i64 %index.next.12, 1
- %79 = getelementptr inbounds double, double* %b, i64 %78
- %80 = bitcast double* %79 to <8 x double>*
- %wide.vec.13 = load <8 x double>, <8 x double>* %80, align 8
- %strided.vec.13 = shufflevector <8 x double> %wide.vec.13, <8 x double> undef, <4 x i32>
- %81 = fadd <4 x double> %strided.vec.13,
- %82 = getelementptr inbounds double, double* %a, i64 %index.next.12
- %83 = bitcast double* %82 to <4 x double>*
- store <4 x double> %81, <4 x double>* %83, align 8
- %index.next.13 = or i64 %index, 56
- %84 = shl i64 %index.next.13, 1
- %85 = getelementptr inbounds double, double* %b, i64 %84
- %86 = bitcast double* %85 to <8 x double>*
- %wide.vec.14 = load <8 x double>, <8 x double>* %86, align 8
- %strided.vec.14 = shufflevector <8 x double> %wide.vec.14, <8 x double> undef, <4 x i32>
- %87 = fadd <4 x double> %strided.vec.14,
- %88 = getelementptr inbounds double, double* %a, i64 %index.next.13
- %89 = bitcast double* %88 to <4 x double>*
- store <4 x double> %87, <4 x double>* %89, align 8
- %index.next.14 = or i64 %index, 60
- %90 = shl i64 %index.next.14, 1
- %91 = getelementptr inbounds double, double* %b, i64 %90
- %92 = bitcast double* %91 to <8 x double>*
- %wide.vec.15 = load <8 x double>, <8 x double>* %92, align 8
- %strided.vec.15 = shufflevector <8 x double> %wide.vec.15, <8 x double> undef, <4 x i32>
- %93 = fadd <4 x double> %strided.vec.15,
- %94 = getelementptr inbounds double, double* %a, i64 %index.next.14
- %95 = bitcast double* %94 to <4 x double>*
- store <4 x double> %93, <4 x double>* %95, align 8
- %index.next.15 = add nsw i64 %index, 64
- %96 = icmp eq i64 %index.next.15, 1600
- br i1 %96, label %for.cond.cleanup, label %vector.body
-
-for.cond.cleanup: ; preds = %vector.body
- ret void
-}
-
-attributes #0 = { nounwind "target-cpu"="a2q" }
-
diff --git a/llvm/test/CodeGen/PowerPC/qpx-unalperm.ll b/llvm/test/CodeGen/PowerPC/qpx-unalperm.ll
deleted file mode 100644
index fdee919fdfc32..0000000000000
--- a/llvm/test/CodeGen/PowerPC/qpx-unalperm.ll
+++ /dev/null
@@ -1,64 +0,0 @@
-; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
-target triple = "powerpc64-bgq-linux"
-
-define <4 x double> @foo(<4 x double>* %a) {
-entry:
- %r = load <4 x double>, <4 x double>* %a, align 32
- ret <4 x double> %r
-; CHECK: qvlfdx
-; CHECK: blr
-}
-
-define <4 x double> @bar(<4 x double>* %a) {
-entry:
- %r = load <4 x double>, <4 x double>* %a, align 8
- %b = getelementptr <4 x double>, <4 x double>* %a, i32 16
- %s = load <4 x double>, <4 x double>* %b, align 32
- %t = fadd <4 x double> %r, %s
- ret <4 x double> %t
-; CHECK: qvlpcldx
-; CHECK: qvlfdx
-; CHECK: qvfperm
-; CHECK: blr
-}
-
-define <4 x double> @bar1(<4 x double>* %a) {
-entry:
- %r = load <4 x double>, <4 x double>* %a, align 8
- %b = getelementptr <4 x double>, <4 x double>* %a, i32 16
- %s = load <4 x double>, <4 x double>* %b, align 8
- %t = fadd <4 x double> %r, %s
- ret <4 x double> %t
-}
-
-define <4 x double> @bar2(<4 x double>* %a) {
-entry:
- %r = load <4 x double>, <4 x double>* %a, align 8
- %b = getelementptr <4 x double>, <4 x double>* %a, i32 1
- %s = load <4 x double>, <4 x double>* %b, align 32
- %t = fadd <4 x double> %r, %s
- ret <4 x double> %t
-}
-
-define <4 x double> @bar3(<4 x double>* %a) {
-entry:
- %r = load <4 x double>, <4 x double>* %a, align 8
- %b = getelementptr <4 x double>, <4 x double>* %a, i32 1
- %s = load <4 x double>, <4 x double>* %b, align 8
- %t = fadd <4 x double> %r, %s
- ret <4 x double> %t
-}
-
-define <4 x double> @bar4(<4 x double>* %a) {
-entry:
- %r = load <4 x double>, <4 x double>* %a, align 8
- %b = getelementptr <4 x double>, <4 x double>* %a, i32 1
- %s = load <4 x double>, <4 x double>* %b, align 8
- %c = getelementptr <4 x double>, <4 x double>* %b, i32 1
- %t = load <4 x double>, <4 x double>* %c, align 8
- %u = fadd <4 x double> %r, %s
- %v = fadd <4 x double> %u, %t
- ret <4 x double> %v
-}
-
diff --git a/llvm/test/CodeGen/PowerPC/rlwimi-and.ll b/llvm/test/CodeGen/PowerPC/rlwimi-and.ll
index e8fc409527588..d512f51a76e7a 100644
--- a/llvm/test/CodeGen/PowerPC/rlwimi-and.ll
+++ b/llvm/test/CodeGen/PowerPC/rlwimi-and.ll
@@ -1,6 +1,4 @@
-; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-crbits < %s | FileCheck %s
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
-target triple = "powerpc64-bgq-linux"
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-crbits -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
 
 define void @test() align 2 {
 entry:
diff --git a/llvm/test/CodeGen/PowerPC/rlwinm_rldicl_to_andi.mir b/llvm/test/CodeGen/PowerPC/rlwinm_rldicl_to_andi.mir
index e3aeb5605b42c..dbe314b5251fe 100644
--- a/llvm/test/CodeGen/PowerPC/rlwinm_rldicl_to_andi.mir
+++ b/llvm/test/CodeGen/PowerPC/rlwinm_rldicl_to_andi.mir
@@ -60,7 +60,7 @@
 ret i64 %cond
 }
 
- attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 !llvm.module.flags = !{!0, !1}
 !llvm.ident = !{!2}
diff --git a/llvm/test/CodeGen/PowerPC/s000-alias-misched.ll b/llvm/test/CodeGen/PowerPC/s000-alias-misched.ll
index 5c15145af2378..20071ea1710c5 100644
--- a/llvm/test/CodeGen/PowerPC/s000-alias-misched.ll
+++ b/llvm/test/CodeGen/PowerPC/s000-alias-misched.ll
@@ -1,7 +1,6 @@
-; RUN: llc -verify-machineinstrs < %s -enable-misched -mcpu=a2 | FileCheck %s
-; RUN: llc -verify-machineinstrs < %s -enable-misched -enable-aa-sched-mi -mcpu=a2 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -enable-misched -mcpu=a2 -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -enable-misched -enable-aa-sched-mi -mcpu=a2 -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
-target triple = "powerpc64-bgq-linux"
 
 @aa = external global [256 x [256 x double]], align 32
 @bb = external global [256 x [256 x double]], align 32
diff --git a/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll b/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll
index 80ac733156197..9f458ebcf0a6e 100644
--- a/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll
+++ b/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll
@@ -1225,576 +1225,5 @@ entry:
 ; CHECK: blr
 }
 
-define <4 x double> @testqv4doubleslt(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp slt i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2
- ret <4 x double> %cond
-
-; CHECK-LABEL: @testqv4doubleslt
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]]
-; CHECK: .LBB[[BB1]]:
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB2]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x double> @testqv4doubleult(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp ult i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2
- ret <4 x double> %cond
-
-; CHECK-LABEL: @testqv4doubleult
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]]
-; CHECK: .LBB[[BB1]]:
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB2]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x double> @testqv4doublesle(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp sle i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2
- ret <4 x double> %cond
-
-; CHECK-LABEL: @testqv4doublesle
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 12, 2, .LBB[[BB]]
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x double> @testqv4doubleule(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp ule i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2
- ret <4 x double> %cond
-
-; CHECK-LABEL: @testqv4doubleule
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 4, 2, .LBB[[BB]]
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x double> @testqv4doubleeq(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp eq i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2
- ret <4 x double> %cond
-
-; CHECK-LABEL: @testqv4doubleeq
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: creqv [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK: bclr 12, [[REG1]], 0
-; CHECK: qvfmr 1, 6
-; CHECK: blr
-}
-
-define <4 x double> @testqv4doublesge(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp sge i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2
- ret <4 x double> %cond
-
-; CHECK-LABEL: @testqv4doublesge
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 4, 2, .LBB[[BB]]
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x double> @testqv4doubleuge(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp uge i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2
- ret <4 x double> %cond
-
-; CHECK-LABEL: @testqv4doubleuge
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 12, 2, .LBB[[BB]]
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x double> @testqv4doublesgt(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp sgt i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2
- ret <4 x double> %cond
-
-; CHECK-LABEL: @testqv4doublesgt
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]]
-; CHECK: .LBB[[BB1]]:
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB2]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x double> @testqv4doubleugt(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp ugt i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2
- ret <4 x double> %cond
-
-; CHECK-LABEL: @testqv4doubleugt
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]]
-; CHECK: .LBB[[BB1]]:
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB2]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x double> @testqv4doublene(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp ne i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2
- ret <4 x double> %cond
-
-; CHECK-LABEL: @testqv4doublene
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK: bclr 12, [[REG1]], 0
-; CHECK: qvfmr 1, 6
-; CHECK: blr
-}
-
-define <4 x float> @testqv4floatslt(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp slt i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2
- ret <4 x float> %cond
-
-; CHECK-LABEL: @testqv4floatslt
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]]
-; CHECK: .LBB[[BB1]]:
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB2]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x float> @testqv4floatult(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp ult i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2
- ret <4 x float> %cond
-
-; CHECK-LABEL: @testqv4floatult
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]]
-; CHECK: .LBB[[BB1]]:
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB2]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x float> @testqv4floatsle(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp sle i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2
- ret <4 x float> %cond
-
-; CHECK-LABEL: @testqv4floatsle
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 12, 2, .LBB[[BB]]
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x float> @testqv4floatule(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp ule i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2
- ret <4 x float> %cond
-
-; CHECK-LABEL: @testqv4floatule
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 4, 2, .LBB[[BB]]
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x float> @testqv4floateq(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp eq i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2
- ret <4 x float> %cond
-
-; CHECK-LABEL: @testqv4floateq
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: creqv [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK: bclr 12, [[REG1]], 0
-; CHECK: qvfmr 1, 6
-; CHECK: blr
-}
-
-define <4 x float> @testqv4floatsge(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp sge i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2
- ret <4 x float> %cond
-
-; CHECK-LABEL: @testqv4floatsge
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 4, 2, .LBB[[BB]]
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x float> @testqv4floatuge(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp uge i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2
- ret <4 x float> %cond
-
-; CHECK-LABEL: @testqv4floatuge
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 12, 2, .LBB[[BB]]
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x float> @testqv4floatsgt(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp sgt i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2
- ret <4 x float> %cond
-
-; CHECK-LABEL: @testqv4floatsgt
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]]
-; CHECK: .LBB[[BB1]]:
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB2]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x float> @testqv4floatugt(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp ugt i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2
- ret <4 x float> %cond
-
-; CHECK-LABEL: @testqv4floatugt
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]]
-; CHECK: .LBB[[BB1]]:
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB2]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x float> @testqv4floatne(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp ne i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2
- ret <4 x float> %cond
-
-; CHECK-LABEL: @testqv4floatne
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK: bclr 12, [[REG1]], 0
-; CHECK: qvfmr 1, 6
-; CHECK: blr
-}
-
-define <4 x i1> @testqv4i1slt(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp slt i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2
- ret <4 x i1> %cond
-
-; CHECK-LABEL: @testqv4i1slt
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]]
-; CHECK: .LBB[[BB1]]:
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB2]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x i1> @testqv4i1ult(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp ult i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2
- ret <4 x i1> %cond
-
-; CHECK-LABEL: @testqv4i1ult
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]]
-; CHECK: .LBB[[BB1]]:
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB2]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x i1> @testqv4i1sle(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp sle i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2
- ret <4 x i1> %cond
-
-; CHECK-LABEL: @testqv4i1sle
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 12, 2, .LBB[[BB]]
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x i1> @testqv4i1ule(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp ule i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2
- ret <4 x i1> %cond
-
-; CHECK-LABEL: @testqv4i1ule
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 4, 2, .LBB[[BB]]
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x i1> @testqv4i1eq(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp eq i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2
- ret <4 x i1> %cond
-
-; CHECK-LABEL: @testqv4i1eq
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: creqv [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK: bclr 12, [[REG1]], 0
-; CHECK: qvfmr 1, 6
-; CHECK: blr
-}
-
-define <4 x i1> @testqv4i1sge(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp sge i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2
- ret <4 x i1> %cond
-
-; CHECK-LABEL: @testqv4i1sge
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 4, 2, .LBB[[BB]]
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x i1> @testqv4i1uge(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp uge i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2
- ret <4 x i1> %cond
-
-; CHECK-LABEL: @testqv4i1uge
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 12, 2, .LBB[[BB]]
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x i1> @testqv4i1sgt(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp sgt i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2
- ret <4 x i1> %cond
-
-; CHECK-LABEL: @testqv4i1sgt
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]]
-; CHECK: .LBB[[BB1]]:
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB2]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x i1> @testqv4i1ugt(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp ugt i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2
- ret <4 x i1> %cond
-
-; CHECK-LABEL: @testqv4i1ugt
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]]
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]]
-; CHECK: .LBB[[BB1]]:
-; CHECK: qvfmr 5, 6
-; CHECK: .LBB[[BB2]]:
-; CHECK: qvfmr 1, 5
-; CHECK: blr
-}
-
-define <4 x i1> @testqv4i1ne(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 {
-entry:
- %cmp1 = fcmp oeq float %c3, %c4
- %cmp3tmp = fcmp oeq float %c1, %c2
- %cmp3 = icmp ne i1 %cmp3tmp, %cmp1
- %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2
- ret <4 x i1> %cond
-
-; CHECK-LABEL: @testqv4i1ne
-; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
-; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK: bclr 12, [[REG1]], 0
-; CHECK: qvfmr 1, 6
-; CHECK: blr
-}
-
 attributes #0 = { nounwind readnone "target-cpu"="pwr7" }
-attributes #1 = { nounwind readnone "target-cpu"="a2q" }
diff --git a/llvm/test/CodeGen/PowerPC/selectiondag-extload-computeknownbits.ll b/llvm/test/CodeGen/PowerPC/selectiondag-extload-computeknownbits.ll
index 53d17d8668270..73fce78c33aa7 100644
--- a/llvm/test/CodeGen/PowerPC/selectiondag-extload-computeknownbits.ll
+++ b/llvm/test/CodeGen/PowerPC/selectiondag-extload-computeknownbits.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux < %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux < %s
 
 ; Check that llc does not crash due to an illegal APInt operation
 
diff --git a/llvm/test/CodeGen/PowerPC/setcr_bc.mir b/llvm/test/CodeGen/PowerPC/setcr_bc.mir
index e9d81da681fcc..564ee7d45957b 100644
--- a/llvm/test/CodeGen/PowerPC/setcr_bc.mir
+++ b/llvm/test/CodeGen/PowerPC/setcr_bc.mir
@@ -32,8 +32,8 @@
 ret i32 %call2.i.sink
 }
 
- attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" }
 ...
 ---
diff --git a/llvm/test/CodeGen/PowerPC/setcr_bc2.mir b/llvm/test/CodeGen/PowerPC/setcr_bc2.mir
index 582284d6d0a59..513cb85e1580a 100644
--- a/llvm/test/CodeGen/PowerPC/setcr_bc2.mir
+++ b/llvm/test/CodeGen/PowerPC/setcr_bc2.mir
@@ -32,8 +32,8 @@
 ret i32 %call2.i.sink
 }
 
- attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" }
 ...
 ---
diff --git a/llvm/test/CodeGen/PowerPC/stwu-sched.ll b/llvm/test/CodeGen/PowerPC/stwu-sched.ll
index 0afd2ee406894..36afaf84a296b 100644
--- a/llvm/test/CodeGen/PowerPC/stwu-sched.ll
+++ b/llvm/test/CodeGen/PowerPC/stwu-sched.ll
@@ -58,7 +58,7 @@ define void @initCombList(%0* nocapture, i32 signext) local_unnamed_addr #0 {
 ret void
 }
 
-attributes #0 = { norecurse nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 !llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/PowerPC/unal-vec-ldst.ll b/llvm/test/CodeGen/PowerPC/unal-vec-ldst.ll
index 497add38e0444..79a368dd095ac 100644
--- a/llvm/test/CodeGen/PowerPC/unal-vec-ldst.ll
+++ b/llvm/test/CodeGen/PowerPC/unal-vec-ldst.ll
@@ -327,72 +327,6 @@ entry:
 }
 
-
-define <4 x float> @test_l_qv4float(<4 x float>* %p) #1 {
-; CHECK-LABEL: test_l_qv4float:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: li 4, 15
-; CHECK-NEXT: qvlpclsx 0, 0, 3
-; CHECK-NEXT: qvlfsx 1, 3, 4
-; CHECK-NEXT: qvlfsx 2, 0, 3
-; CHECK-NEXT: qvfperm 1, 2, 1, 0
-; CHECK-NEXT: blr
-entry:
- %r = load <4 x float>, <4 x float>* %p, align 4
- ret <4 x float> %r
-
-}
-
-define <8 x float> @test_l_qv8float(<8 x float>* %p) #1 {
-; CHECK-LABEL: test_l_qv8float:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: li 4, 31
-; CHECK-NEXT: qvlpclsx 1, 0, 3
-; CHECK-NEXT: qvlfsx 0, 3, 4
-; CHECK-NEXT: li 4, 16
-; CHECK-NEXT: qvlfsx 3, 3, 4
-; CHECK-NEXT: qvlfsx 4, 0, 3
-; CHECK-NEXT: qvfperm 2, 3, 0, 1
-; CHECK-NEXT: qvfperm 1, 4, 3, 1
-; CHECK-NEXT: blr
-entry:
- %r = load <8 x float>, <8 x float>* %p, align 4
- ret <8 x float> %r
-
-}
-
-define <4 x double> @test_l_qv4double(<4 x double>* %p) #1 {
-; CHECK-LABEL: test_l_qv4double:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: li 4, 31
-; CHECK-NEXT: qvlpcldx 0, 0, 3
-; CHECK-NEXT: qvlfdx 1, 3, 4
-; CHECK-NEXT: qvlfdx 2, 0, 3
-; CHECK-NEXT: qvfperm 1, 2, 1, 0
-; CHECK-NEXT: blr
-entry:
- %r = load <4 x double>, <4 x double>* %p, align 8
- ret <4 x double> %r
-
-}
-
-define <8 x double> @test_l_qv8double(<8 x double>* %p) #1 {
-; CHECK-LABEL: test_l_qv8double:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: li 4, 63
-; CHECK-NEXT: qvlpcldx 1, 0, 3
-; CHECK-NEXT: qvlfdx 0, 3, 4
-; CHECK-NEXT: li 4, 32
-; CHECK-NEXT: qvlfdx 3, 3, 4
-; CHECK-NEXT: qvlfdx 4, 0, 3
-; CHECK-NEXT: qvfperm 2, 3, 0, 1
-; CHECK-NEXT: qvfperm 1, 4, 3, 1
-; CHECK-NEXT: blr
-entry:
- %r = load <8 x double>, <8 x double>* %p, align 8
- ret <8 x double> %r
-
-}
-
 define void @test_s_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 {
 ; CHECK-LABEL: test_s_v16i8:
 ; CHECK: # %bb.0: # %entry
@@ -537,89 +471,6 @@ entry:
 }
 
-
-define void @test_s_qv4float(<4 x float>* %p, <4 x float> %v) #1 {
-; CHECK-LABEL: test_s_qv4float:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: qvesplati 0, 1, 3
-; CHECK-NEXT: stfs 1, 0(3)
-; CHECK-NEXT: stfs 0, 12(3)
-; CHECK-NEXT: qvesplati 0, 1, 2
-; CHECK-NEXT: qvesplati 1, 1, 1
-; CHECK-NEXT: stfs 0, 8(3)
-; CHECK-NEXT: stfs 1, 4(3)
-; CHECK-NEXT: blr
-entry:
- store <4 x float> %v, <4 x float>* %p, align 4
- ret void
-
-}
-
-define void @test_s_qv8float(<8 x float>* %p, <8 x float> %v) #1 {
-; CHECK-LABEL: test_s_qv8float:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: qvesplati 0, 2, 3
-; CHECK-NEXT: stfs 2, 16(3)
-; CHECK-NEXT: stfs 0, 28(3)
-; CHECK-NEXT: qvesplati 0, 2, 2
-; CHECK-NEXT: qvesplati 2, 2, 1
-; CHECK-NEXT: stfs 1, 0(3)
-; CHECK-NEXT: stfs 0, 24(3)
-; CHECK-NEXT: qvesplati 0, 1, 3
-; CHECK-NEXT: stfs 2, 20(3)
-; CHECK-NEXT: qvesplati 2, 1, 2
-; CHECK-NEXT: qvesplati 1, 1, 1
-; CHECK-NEXT: stfs 0, 12(3)
-; CHECK-NEXT: stfs 2, 8(3)
-; CHECK-NEXT: stfs 1, 4(3)
-; CHECK-NEXT: blr
-entry:
- store <8 x float> %v, <8 x float>* %p, align 4
- ret void
-
-}
-
-define void @test_s_qv4double(<4 x double>* %p, <4 x double> %v) #1 {
-; CHECK-LABEL: test_s_qv4double:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: qvesplati 0, 1, 3
-; CHECK-NEXT: stfd 1, 0(3)
-; CHECK-NEXT: stfd 0, 24(3)
-; CHECK-NEXT: qvesplati 0, 1, 2
-; CHECK-NEXT: qvesplati 1, 1, 1
-; CHECK-NEXT: stfd 0, 16(3)
-; CHECK-NEXT: stfd 1, 8(3)
-; CHECK-NEXT: blr
-entry:
- store <4 x double> %v, <4 x double>* %p, align 8
- ret void
-
-}
-
-define void @test_s_qv8double(<8 x double>* %p, <8 x double> %v) #1 {
-; CHECK-LABEL: test_s_qv8double:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: qvesplati 0, 2, 3
-; CHECK-NEXT: stfd 2, 32(3)
-; CHECK-NEXT: stfd 0, 56(3)
-; CHECK-NEXT: qvesplati 0, 2, 2
-; CHECK-NEXT: qvesplati 2, 2, 1
-; CHECK-NEXT: stfd 1, 0(3)
-; CHECK-NEXT: stfd 0, 48(3)
-; CHECK-NEXT: qvesplati 0, 1, 3
-; CHECK-NEXT: stfd 2, 40(3)
-; CHECK-NEXT: qvesplati 2, 1, 2
-; CHECK-NEXT: qvesplati 1, 1, 1
-; CHECK-NEXT: stfd 0, 24(3)
-; CHECK-NEXT: stfd 2, 16(3)
-; CHECK-NEXT: stfd 1, 8(3)
-; CHECK-NEXT: blr
-entry:
- store <8 x double> %v, <8 x double>* %p, align 8
- ret void
-
-}
-
 attributes #0 = { nounwind "target-cpu"="pwr7" }
-attributes #1 = { nounwind "target-cpu"="a2q" }
 
 attributes #2 = { nounwind "target-cpu"="pwr8" }
 
diff --git a/llvm/test/CodeGen/PowerPC/uwtables.ll b/llvm/test/CodeGen/PowerPC/uwtables.ll
index 7523d04d73d38..e302934ab8d6b 100644
--- a/llvm/test/CodeGen/PowerPC/uwtables.ll
+++ b/llvm/test/CodeGen/PowerPC/uwtables.ll
@@ -47,5 +47,5 @@ declare i32 @__gxx_personality_v0(...)
 declare void @__cxa_call_unexpected(i8*) local_unnamed_addr
 
-attributes #0 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll b/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll
index 36da7add88015..33f3d82c3683d 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll
@@ -55,21 +55,6 @@ define i32 @bar2() {
 ; CHECK: store <2 x i64> zeroinitializer, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_va_arg_tls to i64), i64 8) to <2 x i64>*), align 8
 ; CHECK: store {{.*}} 24, {{.*}} @__msan_va_arg_overflow_size_tls
 
-; Check QPX vector argument.
-define i32 @bar3() "target-features"="+qpx" {
- %1 = call i32 (i32, ...) @foo(i32 0, i32 1, i32 2, <4 x double> )
- ret i32 %1
-}
-
-; That one is even stranger: the parameter save area starts at offset 48 from
-; (32-byte aligned) stack pointer, the vector parameter is at 96 bytes from
-; the stack pointer, so its offset from parameter save area is misaligned.
-; CHECK-LABEL: @bar3
-; CHECK: store i32 0, i32* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_va_arg_tls to i64), i64 4) to i32*), align 8
-; CHECK: store i32 0, i32* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_va_arg_tls to i64), i64 12) to i32*), align 8
-; CHECK: store <4 x i64> zeroinitializer, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_va_arg_tls to i64), i64 40) to <4 x i64>*), align 8
-; CHECK: store {{.*}} 72, {{.*}} @__msan_va_arg_overflow_size_tls
-
 ; Check i64 array.
 define i32 @bar4() {
 %1 = call i32 (i32, ...) @foo(i32 0, [2 x i64] [i64 1, i64 2])
diff --git a/llvm/test/MC/Disassembler/PowerPC/qpx.txt b/llvm/test/MC/Disassembler/PowerPC/qpx.txt
deleted file mode 100644
index 00e598bd4356e..0000000000000
--- a/llvm/test/MC/Disassembler/PowerPC/qpx.txt
+++ /dev/null
@@ -1,371 +0,0 @@
-# RUN: llvm-mc --disassemble %s -triple powerpc64-bgq-linux -mcpu=a2q | FileCheck %s
-
-# CHECK: qvfabs 3, 5
-0x10 0x60 0x2a 0x10
-
-# CHECK: qvfadd 3, 4, 5
-0x10 0x64 0x28 0x2a
-
-# CHECK: qvfadds 3, 4, 5
-0x00 0x64 0x28 0x2a
-
-# CHECK: qvfandc 3, 4, 5
-0x10 0x64 0x2a 0x08
-
-# CHECK: qvfand 3, 4, 5
-0x10 0x64 0x28 0x88
-
-# CHECK: qvfcfid 3, 5
-0x10 0x60 0x2e 0x9c
-
-# CHECK: qvfcfids 3, 5
-0x00 0x60 0x2e 0x9c
-
-# CHECK: qvfcfidu 3, 5
-0x10 0x60 0x2f 0x9c
-
-# CHECK: qvfcfidus 3, 5
-0x00 0x60 0x2f 0x9c
-
-# CHECK: qvfclr 3
-0x10 0x63 0x18 0x08
-
-# CHECK: qvfcpsgn 3, 4, 5
-0x10 0x64 0x28 0x10
-
-# CHECK: qvfctfb 3, 4
-0x10 0x64 0x22 0x88
-
-# CHECK: qvfctid 3, 5
-0x10 0x60 0x2e 0x5c
-
-# CHECK: qvfctidu 3, 5
-0x10 0x60 0x2f 0x5c
-
-# CHECK: qvfctiduz 3, 5
-0x10 0x60 0x2f 0x5e
-
-# CHECK: qvfctidz 3, 5
-0x10 0x60 0x2e 0x5e
-
-# CHECK: qvfctiw 3, 5
-0x10 0x60 0x28 0x1c
-
-# CHECK: qvfctiwu 3, 5
-0x10 0x60 0x29 0x1c
-
-# CHECK: qvfctiwuz 3, 5
-0x10 0x60 0x29 0x1e
-
-# CHECK: qvfctiwz 3, 5
-0x10 0x60 0x28 0x1e
-
-# CHECK: qvfequ 3, 4, 5
-0x10 0x64 0x2c 0x88
-
-# CHECK: qvflogical 3, 4, 5, 12
-0x10 0x64 0x2e 0x08
-
-# CHECK: qvfmadd 3, 4, 6, 5
-0x10 0x64 0x29 0xba
-
-# CHECK: qvfmadds 3, 4, 6, 5
-0x00 0x64 0x29 0xba
-
-# CHECK: qvfmr 3, 5
-0x10 0x60 0x28 0x90
-
-# CHECK: qvfmsub 3, 4, 6, 5
-0x10 0x64 0x29 0xb8
-
-# CHECK: qvfmsubs 3, 4, 6, 5
-0x00 0x64 0x29 0xb8
-
-# CHECK: qvfmul 3, 4, 6
-0x10 0x64 0x01 0xb2
-
-# CHECK: qvfmuls 3, 4, 6
-0x00 0x64 0x01 0xb2
-
-# CHECK: qvfnabs 3, 5
-0x10 0x60 0x29 0x10
-
-# CHECK: qvfnand 3, 4, 5
-0x10 0x64 0x2f 0x08
-
-# CHECK: qvfneg 3, 5
-0x10 0x60 0x28 0x50
-
-# CHECK: qvfnmadd 3, 4, 6, 5
-0x10 0x64 0x29 0xbe
-
-# CHECK: qvfnmadds 3, 4, 6, 5
-0x00 0x64 0x29 0xbe
-
-# CHECK: qvfnmsub 3, 4, 6, 5
-0x10 0x64 0x29 0xbc
-
-# CHECK: qvfnmsubs 3, 4, 6, 5
-0x00 0x64 0x29 0xbc
-
-# CHECK: qvfnor 3, 4, 5
-0x10 0x64 0x2c 0x08
-
-# CHECK: qvfnot 3, 4
-0x10 0x64 0x25 0x08
-
-# CHECK: qvforc 3, 4, 5
-0x10 0x64 0x2e 0x88
-
-# CHECK: qvfor 3, 4, 5
-0x10 0x64 0x2b 0x88
-
-# CHECK: qvfperm 3, 4, 5, 6
-0x10 0x64 0x29 0x8c
-
-# CHECK: qvfre 3, 5
-0x10 0x60 0x28 0x30
-
-# CHECK: qvfres 3, 5
-0x00 0x60 0x28 0x30
-
-# CHECK: qvfrim 3, 5
-0x10 0x60 0x2b 0xd0
-
-# CHECK: qvfrin 3, 5
-0x10 0x60 0x2b 0x10
-
-# CHECK: qvfrip 3, 5
-0x10 0x60 0x2b 0x90
-
-# CHECK: qvfriz 3, 5
-0x10 0x60 0x2b 0x50
-
-# CHECK: qvfrsp 3, 5
-0x10 0x60 0x28 0x18
-
-# CHECK: qvfrsqrte 3, 5
-0x10 0x60 0x28 0x34
-
-# CHECK: qvfrsqrtes 3, 5
-0x00 0x60 0x28 0x34
-
-# CHECK: qvfsel 3, 4, 6, 5
-0x10 0x64 0x29 0xae
-
-# CHECK: qvfset 3
-0x10 0x63 0x1f 0x88
-
-# CHECK: qvfsub 3, 4, 5
-0x10 0x64 0x28 0x28
-
-# CHECK: qvfsubs 3, 4, 5
-0x00 0x64 0x28 0x28
-
-# CHECK: qvfxmadd 3, 4, 6, 5
-0x10 0x64 0x29 0x92
-
-# CHECK: qvfxmadds 3, 4, 6, 5
-0x00 0x64 0x29 0x92
-
-# CHECK: qvfxmul 3, 4, 6
-0x10 0x64 0x01 0xa2
-
-# CHECK: qvfxmuls 3, 4, 6
-0x00 0x64 0x01 0xa2
-
-# CHECK: qvfxor 3, 4, 5
-0x10 0x64 0x2b 0x08
-
-# CHECK: qvfxxcpnmadd 3, 4, 6, 5
-0x10 0x64 0x29 0x86
-
-# CHECK: qvfxxcpnmadds 3, 4, 6, 5
-0x00 0x64 0x29 0x86
-
-# CHECK: qvfxxmadd 3, 4, 6, 5
-0x10 0x64 0x29 0x82
-
-# CHECK: qvfxxmadds 3, 4, 6, 5
-0x00 0x64 0x29 0x82
-
-# CHECK: qvfxxnpmadd 3, 4, 6, 5
-0x10 0x64 0x29 0x96
-
-# CHECK: qvfxxnpmadds 3, 4, 6, 5
-0x00 0x64 0x29 0x96
-
-# CHECK: qvlfcduxa 3, 9, 11
-0x7c 0x69 0x58 0xcf
-
-# CHECK: qvlfcdux 3, 9, 11
-0x7c 0x69 0x58 0xce
-
-# CHECK: qvlfcdxa 3, 10, 11
-0x7c 0x6a 0x58 0x8f
-
-# CHECK: qvlfcdx 3, 10, 11
-0x7c 0x6a 0x58 0x8e
-
-# CHECK: qvlfcsuxa 3, 9, 11
-0x7c 0x69 0x58 0x4f
-
-# CHECK: qvlfcsux 3, 9, 11
-0x7c 0x69 0x58 0x4e
-
-# CHECK: qvlfcsxa 3, 10, 11
-0x7c 0x6a 0x58 0x0f
-
-# CHECK: qvlfcsx 3, 10, 11
-0x7c 0x6a 0x58 0x0e
-
-# CHECK: qvlfduxa 3, 9, 11
-0x7c 0x69 0x5c 0xcf
-
-# CHECK: qvlfdux 3, 9, 11
-0x7c 0x69 0x5c 0xce
-
-# CHECK: qvlfdxa 3, 10, 11
-0x7c 0x6a 0x5c 0x8f
-
-# CHECK: qvlfdx 3, 10, 11
-0x7c 0x6a 0x5c 0x8e
-
-# CHECK: qvlfiwaxa 3, 10, 11
-0x7c 0x6a 0x5e 0xcf
-
-# CHECK: qvlfiwax 3, 10, 11
-0x7c 0x6a 0x5e 0xce
-
-# CHECK: qvlfiwzxa 3, 10, 11
-0x7c 0x6a 0x5e 0x8f
-
-# CHECK: qvlfiwzx 3, 10, 11
-0x7c 0x6a 0x5e 0x8e
-
-# CHECK: qvlfsuxa 3, 9, 11
-0x7c 0x69 0x5c 0x4f
-
-# CHECK: qvlfsux 3, 9, 11
-0x7c 0x69 0x5c 0x4e
-
-# CHECK: qvlfsxa 3, 10, 11
-0x7c 0x6a 0x5c 0x0f
-
-# CHECK: qvlfsx 3, 10, 11
-0x7c 0x6a 0x5c 0x0e
-
-# CHECK: qvlpcldx 3, 10, 11
-0x7c 0x6a 0x5c 0x8c
-
-# CHECK: qvlpclsx 3, 10, 11
-0x7c 0x6a 0x5c 0x0c
-
-# CHECK: qvlpcrdx 3, 10, 11
-0x7c 0x6a 0x58 0x8c
-
-# CHECK: qvlpcrsx 3, 10, 11
-0x7c 0x6a 0x58 0x0c
-
-# CHECK: qvstfcduxa 2, 9, 11
-0x7c 0x49 0x59 0xcf
-
-# CHECK: qvstfcduxia 2, 9, 11
-0x7c 0x49 0x59 0xcb
-
-# CHECK: qvstfcduxi 2, 9, 11
-0x7c 0x49 0x59 0xca
-
-# CHECK: qvstfcdux 2, 9, 11
-0x7c 0x49 0x59 0xce
-
-# CHECK: qvstfcdxa 2, 10, 11
-0x7c 0x4a 0x59 0x8f
-
-# CHECK: qvstfcdxia 2, 10, 11
-0x7c 0x4a 0x59 0x8b
-
-# CHECK: qvstfcdxi 2, 10, 11
-0x7c 0x4a 0x59 0x8a
-
-# CHECK: qvstfcdx 2, 10, 11
-0x7c 0x4a 0x59 0x8e
-
-# CHECK: qvstfcsuxa 2, 9, 11
-0x7c 0x49 0x59 0x4f
-
-# CHECK: qvstfcsuxia 2, 9, 11
-0x7c 0x49 0x59 0x4b
-
-# CHECK: qvstfcsuxi 2, 9, 11
-0x7c 0x49 0x59 0x4a
-
-# CHECK: qvstfcsux 2, 9, 11
-0x7c 0x49 0x59 0x4e
-
-# CHECK: qvstfcsxa 2, 10, 11
-0x7c 0x4a 0x59 0x0f
-
-# CHECK: qvstfcsxia 2, 10, 11
-0x7c 0x4a 0x59 0x0b
-
-# CHECK: qvstfcsxi 2, 10, 11
-0x7c 0x4a 0x59 0x0a
-
-# CHECK: qvstfcsx 2, 10, 11
-0x7c 0x4a 0x59 0x0e
-
-# CHECK: qvstfduxa 2, 9, 11
-0x7c 0x49 0x5d 0xcf
-
-# CHECK: qvstfduxia 2, 9, 11
-0x7c 0x49 0x5d 0xcb
-
-# CHECK: qvstfduxi 2, 9, 11
-0x7c 0x49 0x5d 0xca
-
-# CHECK: qvstfdux 2, 9, 11
-0x7c 0x49 0x5d 0xce
-
-# CHECK: qvstfdxa 2, 10, 11
-0x7c 0x4a 0x5d 0x8f
-
-# CHECK: qvstfdxia 2, 10, 11
-0x7c 0x4a 0x5d 0x8b
-
-# CHECK: qvstfdxi 2, 10, 11
-0x7c 0x4a 0x5d 0x8a
-
-# CHECK: qvstfdx 2, 10, 11
-0x7c 0x4a 0x5d 0x8e
-
-# CHECK: qvstfiwxa 2, 10, 11
-0x7c 0x4a 0x5f 0x8f
-
-# CHECK: qvstfiwx 2, 10, 11
-0x7c 0x4a 0x5f 0x8e
-
-# CHECK: qvstfsuxa 2, 9, 11
-0x7c 0x49 0x5d 0x4f
-
-# CHECK: qvstfsuxia 2, 9, 11
-0x7c 0x49 0x5d 0x4b
-
-# CHECK: qvstfsuxi 2, 9, 11
-0x7c 0x49 0x5d 0x4a
-
-# CHECK: qvstfsux 2, 9, 11
-0x7c 0x49 0x5d 0x4e
-
-# CHECK: qvstfsxa 2, 10, 11
-0x7c 0x4a 0x5d 0x0f
-
-# CHECK: qvstfsxia 2, 10, 11
-0x7c 0x4a 0x5d 0x0b
-
-# CHECK: qvstfsxi 2, 10, 11
-0x7c 0x4a 0x5d 0x0a
-
-# CHECK: qvstfsx 2, 10, 11
-0x7c 0x4a 0x5d 0x0e
-
diff --git a/llvm/test/MC/PowerPC/qpx.s b/llvm/test/MC/PowerPC/qpx.s
deleted file mode 100644
index a1fb2090f8fff..0000000000000
--- a/llvm/test/MC/PowerPC/qpx.s
+++ /dev/null
@@ -1,252 +0,0 @@
-# RUN: llvm-mc -triple powerpc64-bgq-linux --show-encoding %s | FileCheck %s
-
-# CHECK: qvfabs 3, 5 # encoding: [0x10,0x60,0x2a,0x10]
- qvfabs %q3, %q5
-
-# CHECK: qvfabs 3, 5 # encoding: [0x10,0x60,0x2a,0x10]
- qvfabs 3, 5
-# CHECK: qvfadd 3, 4, 5 # encoding: [0x10,0x64,0x28,0x2a]
- qvfadd 3, 4, 5
-# CHECK: qvfadds 3, 4, 5 # encoding: [0x00,0x64,0x28,0x2a]
- qvfadds 3, 4, 5
-# CHECK: qvfandc 3, 4, 5 # encoding: [0x10,0x64,0x2a,0x08]
- qvfandc 3, 4, 5
-# CHECK: qvfand 3, 4, 5 # encoding: [0x10,0x64,0x28,0x88]
- qvfand 3, 4, 5
-# CHECK: qvfcfid 3, 5 # encoding: [0x10,0x60,0x2e,0x9c]
- qvfcfid 3, 5
-# CHECK: qvfcfids 3, 5 # encoding: [0x00,0x60,0x2e,0x9c]
- qvfcfids 3, 5
-# CHECK: qvfcfidu 3, 5 # encoding: [0x10,0x60,0x2f,0x9c]
- qvfcfidu 3, 5
-# CHECK: qvfcfidus 3, 5 # encoding: [0x00,0x60,0x2f,0x9c]
- qvfcfidus 3, 5
-# CHECK: qvfclr 3 # encoding: [0x10,0x63,0x18,0x08]
- qvfclr 3
-# CHECK: qvfcpsgn 3, 4, 5 # encoding: [0x10,0x64,0x28,0x10]
- qvfcpsgn 3, 4, 5
-# CHECK: qvfctfb 3, 4 # encoding: [0x10,0x64,0x22,0x88]
- qvfctfb 3, 4
-# CHECK: qvfctid 3, 5 # encoding: [0x10,0x60,0x2e,0x5c]
- qvfctid 3, 5
-# CHECK: qvfctidu 3, 5 # encoding: [0x10,0x60,0x2f,0x5c]
- qvfctidu 3, 5
-# CHECK: qvfctiduz 3, 5 # encoding: [0x10,0x60,0x2f,0x5e]
- qvfctiduz 3, 5
-# CHECK: qvfctidz 3, 5 # encoding: [0x10,0x60,0x2e,0x5e]
- qvfctidz 3, 5
-# CHECK: qvfctiw 3, 5 # encoding: [0x10,0x60,0x28,0x1c]
- qvfctiw 3, 5
-# CHECK: qvfctiwu 3, 5 # encoding: [0x10,0x60,0x29,0x1c]
- qvfctiwu 3, 5
-# CHECK: qvfctiwuz 3, 5 # encoding: [0x10,0x60,0x29,0x1e]
- qvfctiwuz 3, 5
-# CHECK: qvfctiwz 3, 5 # encoding: [0x10,0x60,0x28,0x1e]
- qvfctiwz 3, 5
-# CHECK: qvfequ 3, 4, 5 # encoding: [0x10,0x64,0x2c,0x88]
- qvfequ 3, 4, 5
-# CHECK: qvflogical 3, 4, 5, 12 # encoding: [0x10,0x64,0x2e,0x08]
- qvflogical 3, 4, 5, 12
-# CHECK: qvfmadd 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0xba]
- qvfmadd 3, 4, 6, 5
-# CHECK: qvfmadds 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0xba]
- qvfmadds 3, 4, 6, 5
-# CHECK: qvfmr 3, 5 # encoding: [0x10,0x60,0x28,0x90]
- qvfmr 3, 5
-# CHECK: qvfmsub 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0xb8]
- qvfmsub 3, 4, 6, 5
-# CHECK: qvfmsubs 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0xb8]
- qvfmsubs 3, 4, 6, 5
-# CHECK: qvfmul 3, 4, 6 # encoding: [0x10,0x64,0x01,0xb2]
- qvfmul 3, 4, 6
-# CHECK: qvfmuls 3, 4, 6 # encoding: [0x00,0x64,0x01,0xb2]
- qvfmuls 3, 4, 6
-# CHECK: qvfnabs 3, 5 # encoding: [0x10,0x60,0x29,0x10]
- qvfnabs 3, 5
-# CHECK: qvfnand 3, 4, 5 # encoding: [0x10,0x64,0x2f,0x08]
- qvfnand 3, 4, 5
-# CHECK: qvfneg 3, 5 # encoding: [0x10,0x60,0x28,0x50]
- qvfneg 3, 5
-# CHECK: qvfnmadd 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0xbe]
- qvfnmadd 3, 4, 6, 5
-# CHECK: qvfnmadds 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0xbe]
- qvfnmadds 3, 4, 6, 5
-# CHECK: qvfnmsub 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0xbc]
- qvfnmsub 3, 4, 6, 5
-# CHECK: qvfnmsubs 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0xbc]
- qvfnmsubs 3, 4, 6, 5
-# CHECK: qvfnor 3, 4, 5 # encoding: [0x10,0x64,0x2c,0x08]
- qvfnor 3, 4, 5
-# CHECK: qvfnot 3, 4 # encoding: [0x10,0x64,0x25,0x08]
- qvfnot 3, 4
-# CHECK: qvforc 3, 4, 5 # encoding: [0x10,0x64,0x2e,0x88]
- qvforc 3, 4, 5
-# CHECK: qvfor 3, 4, 5 # encoding: [0x10,0x64,0x2b,0x88]
- qvfor 3, 4, 5
-# CHECK: qvfperm 3, 4, 5, 6 # encoding: [0x10,0x64,0x29,0x8c]
- qvfperm 3, 4, 5, 6
-# CHECK: qvfre 3, 5 # encoding: [0x10,0x60,0x28,0x30]
- qvfre 3, 5
-# CHECK: qvfres 3, 5 # encoding: [0x00,0x60,0x28,0x30]
- qvfres 3, 5
-# CHECK: qvfrim 3, 5 # encoding: [0x10,0x60,0x2b,0xd0]
- qvfrim 3, 5
-# CHECK: qvfrin 3, 5 # encoding: [0x10,0x60,0x2b,0x10]
- qvfrin 3, 5
-# CHECK: qvfrip 3, 5 # encoding: [0x10,0x60,0x2b,0x90]
- qvfrip 3, 5
-# CHECK: qvfriz 3, 5 # encoding: [0x10,0x60,0x2b,0x50]
- qvfriz 3, 5
-# CHECK: qvfrsp 3, 5 # encoding: [0x10,0x60,0x28,0x18]
- qvfrsp 3, 5
-# CHECK: qvfrsqrte 3, 5 # encoding: [0x10,0x60,0x28,0x34]
- qvfrsqrte 3, 5
-# CHECK: qvfrsqrtes 3, 5 # encoding: [0x00,0x60,0x28,0x34]
- qvfrsqrtes 3, 5
-# CHECK: qvfsel 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0xae]
- qvfsel 3, 4, 6, 5
-# CHECK: qvfset 3 # encoding: [0x10,0x63,0x1f,0x88]
- qvfset 3
-# CHECK: qvfsub 3, 4, 5 # encoding: [0x10,0x64,0x28,0x28]
- qvfsub 3, 4, 5
-# CHECK: qvfsubs 3, 4, 5 # encoding: [0x00,0x64,0x28,0x28]
- qvfsubs 3, 4, 5
-# CHECK: qvfxmadd 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0x92]
- qvfxmadd 3, 4, 6, 5
-# CHECK: qvfxmadds 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0x92]
- qvfxmadds 3, 4, 6, 5
-# CHECK: qvfxmul 3, 4, 6 # encoding: [0x10,0x64,0x01,0xa2]
- qvfxmul 3, 4, 6
-# CHECK: qvfxmuls 3, 4, 6 # encoding: [0x00,0x64,0x01,0xa2]
- qvfxmuls 3, 4, 6
-# CHECK: qvfxor 3, 4, 5 # encoding: [0x10,0x64,0x2b,0x08]
- qvfxor 3, 4, 5
-# CHECK: qvfxxcpnmadd 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0x86]
- qvfxxcpnmadd 3, 4, 6, 5
-# CHECK: qvfxxcpnmadds 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0x86]
- qvfxxcpnmadds 3, 4, 6, 5
-# CHECK: qvfxxmadd 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0x82]
- qvfxxmadd 3, 4, 6, 5
-# CHECK: qvfxxmadds 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0x82]
- qvfxxmadds 3, 4, 6, 5
-# CHECK: qvfxxnpmadd 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0x96]
- qvfxxnpmadd 3, 4, 6, 5
-# CHECK: qvfxxnpmadds 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0x96]
- qvfxxnpmadds 3, 4, 6, 5
-# CHECK: qvlfcduxa 3, 9, 11 # encoding: [0x7c,0x69,0x58,0xcf]
- qvlfcduxa 3, 9, 11
-# CHECK: qvlfcdux 3, 9, 11 # encoding: [0x7c,0x69,0x58,0xce]
- qvlfcdux 3, 9, 11
-# CHECK: qvlfcdxa 3, 10, 11 # encoding: [0x7c,0x6a,0x58,0x8f]
- qvlfcdxa 3, 10, 11
-# CHECK: qvlfcdx 3, 10, 11 # encoding: [0x7c,0x6a,0x58,0x8e]
- qvlfcdx 3, 10, 11
-# CHECK: qvlfcsuxa 3, 9, 11 # encoding: [0x7c,0x69,0x58,0x4f]
- qvlfcsuxa 3, 9, 11
-# CHECK: qvlfcsux 3, 9, 11 # encoding: [0x7c,0x69,0x58,0x4e]
- qvlfcsux 3, 9, 11
-# CHECK: qvlfcsxa 3, 10, 11 # encoding: [0x7c,0x6a,0x58,0x0f]
- qvlfcsxa 3, 10, 11
-# CHECK: qvlfcsx 3, 10, 11 # encoding: [0x7c,0x6a,0x58,0x0e]
- qvlfcsx 3, 10, 11
-# CHECK: qvlfduxa 3, 9, 11 # encoding: [0x7c,0x69,0x5c,0xcf]
- qvlfduxa 3, 9, 11
-# CHECK: qvlfdux 3, 9, 11 # encoding: [0x7c,0x69,0x5c,0xce]
- qvlfdux 3, 9, 11
-# CHECK: qvlfdxa 3, 10, 11 # encoding: [0x7c,0x6a,0x5c,0x8f]
- qvlfdxa 3, 10, 11
-# CHECK: qvlfdx 3, 10, 11 # encoding: [0x7c,0x6a,0x5c,0x8e]
- qvlfdx 3, 10, 11
-# CHECK: qvlfiwaxa 3, 10, 11 # encoding: [0x7c,0x6a,0x5e,0xcf]
- qvlfiwaxa 3, 10, 11
-# CHECK: qvlfiwax 3, 10, 11 # encoding: [0x7c,0x6a,0x5e,0xce]
- qvlfiwax 3, 10, 11
-# CHECK: qvlfiwzxa 3, 10, 11 # encoding: [0x7c,0x6a,0x5e,0x8f]
- qvlfiwzxa 3, 10, 11
-# CHECK: qvlfiwzx 3, 10, 11 # encoding: [0x7c,0x6a,0x5e,0x8e]
- qvlfiwzx 3, 10, 11
-# CHECK: qvlfsuxa 3, 9, 11 # encoding: [0x7c,0x69,0x5c,0x4f]
- qvlfsuxa 3, 9, 11
-# CHECK: qvlfsux 3, 9, 11 # encoding: [0x7c,0x69,0x5c,0x4e]
- qvlfsux 3, 9, 11
-# CHECK: qvlfsxa 3, 10, 11 # encoding: [0x7c,0x6a,0x5c,0x0f]
- qvlfsxa 3, 10, 11
-# CHECK: qvlfsx 3, 10, 11 # encoding: [0x7c,0x6a,0x5c,0x0e]
- qvlfsx 3, 10, 11
-# CHECK: qvlpcldx 3, 10, 11 # encoding: [0x7c,0x6a,0x5c,0x8c]
- qvlpcldx 3, 10, 11
-# CHECK: qvlpclsx 3, 10, 11 # encoding: [0x7c,0x6a,0x5c,0x0c]
- qvlpclsx 3, 10, 11
-# CHECK: qvlpcrdx 3, 10, 11 # encoding: [0x7c,0x6a,0x58,0x8c]
- qvlpcrdx 3, 10, 11
-# CHECK: qvlpcrsx 3, 10, 11 # encoding: [0x7c,0x6a,0x58,0x0c]
- qvlpcrsx 3, 10, 11
-# CHECK: qvstfcduxa 2, 9, 11 # encoding: [0x7c,0x49,0x59,0xcf]
- qvstfcduxa 2, 9, 11
-# CHECK: qvstfcduxia 2, 9, 11 # encoding: [0x7c,0x49,0x59,0xcb]
- qvstfcduxia 2, 9, 11
-# CHECK: qvstfcduxi 2, 9, 11 # encoding: [0x7c,0x49,0x59,0xca]
- qvstfcduxi 2, 9, 11
-# CHECK: qvstfcdux 2, 9, 11 # encoding: [0x7c,0x49,0x59,0xce]
- qvstfcdux 2, 9, 11
-# CHECK: qvstfcdxa 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x8f]
- qvstfcdxa 2, 10, 11
-# CHECK: qvstfcdxia 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x8b]
- qvstfcdxia 2, 10, 11
-# CHECK: qvstfcdxi 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x8a]
- qvstfcdxi 2, 10, 11
-# CHECK: qvstfcdx 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x8e]
- qvstfcdx 2, 10, 11
-# CHECK: qvstfcsuxa 2, 9, 11 # encoding: [0x7c,0x49,0x59,0x4f]
- qvstfcsuxa 2, 9, 11
-# CHECK: qvstfcsuxia 2, 9, 11 # encoding: [0x7c,0x49,0x59,0x4b]
- qvstfcsuxia 2, 9, 11
-# CHECK: qvstfcsuxi 2, 9, 11 # encoding: [0x7c,0x49,0x59,0x4a]
- qvstfcsuxi 2, 9, 11
-# CHECK: qvstfcsux 2, 9, 11 # encoding: [0x7c,0x49,0x59,0x4e]
- qvstfcsux 2, 9, 11
-# CHECK: qvstfcsxa 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x0f]
- qvstfcsxa 2, 10, 11
-# CHECK: qvstfcsxia 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x0b]
- qvstfcsxia 2, 10, 11
-# CHECK: qvstfcsxi 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x0a]
- qvstfcsxi 2, 10, 11
-# CHECK: qvstfcsx 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x0e]
- qvstfcsx 2, 10, 11
-# CHECK: qvstfduxa 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0xcf]
- qvstfduxa 2, 9, 11
-# CHECK: qvstfduxia 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0xcb]
- qvstfduxia 2, 9, 11
-# CHECK: qvstfduxi 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0xca]
- qvstfduxi 2, 9, 11
-# CHECK: qvstfdux 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0xce]
- qvstfdux 2, 9, 11
-# CHECK: qvstfdxa 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x8f]
- qvstfdxa 2, 10, 11
-# CHECK: qvstfdxia 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x8b]
- qvstfdxia 2, 10, 11
-# CHECK: qvstfdxi 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x8a]
- qvstfdxi 2, 10, 11
-# CHECK: qvstfdx 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x8e]
- qvstfdx 2, 10, 11
-# CHECK: qvstfiwxa 2, 10, 11 # encoding: [0x7c,0x4a,0x5f,0x8f]
- qvstfiwxa 2, 10, 11
-# CHECK: qvstfiwx 2, 10, 11 # encoding: [0x7c,0x4a,0x5f,0x8e]
- qvstfiwx 2, 10, 11
-# CHECK: qvstfsuxa 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0x4f]
- qvstfsuxa 2, 9, 11
-# CHECK: qvstfsuxia 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0x4b]
- qvstfsuxia 2, 9, 11
-# CHECK: qvstfsuxi 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0x4a]
- qvstfsuxi 2, 9, 11
-# CHECK: qvstfsux 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0x4e]
- qvstfsux 2, 9, 11
-# CHECK: qvstfsxa 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x0f]
- qvstfsxa 2, 10, 11
-# CHECK: qvstfsxia 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x0b]
- qvstfsxia 2, 10, 11
-# CHECK: qvstfsxi 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x0a]
- qvstfsxi 2, 10, 11
-# CHECK: qvstfsx 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x0e]
- qvstfsx 2, 10, 11
-
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/fp-bc-icmp-const-fold.ll b/llvm/test/Transforms/Attributor/IPConstantProp/fp-bc-icmp-const-fold.ll
index c45c48d502343..67c22f9470779 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/fp-bc-icmp-const-fold.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/fp-bc-icmp-const-fold.ll
@@ -4,7 +4,7 @@
 ; RUN: opt -attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM
 ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM
 target datalayout =
"E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" define void @test(i32 signext %n) { ; IS__TUNIT____: Function Attrs: nofree noreturn nosync nounwind readnone diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll index 6cd77a59df6b1..faf7041bfc387 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" %"struct.std::complex" = type { { float, float } } diff --git a/llvm/test/Transforms/DeadStoreElimination/combined-partial-overwrites.ll b/llvm/test/Transforms/DeadStoreElimination/combined-partial-overwrites.ll index 2a61fff15ade0..a57693a1da38e 100644 --- a/llvm/test/Transforms/DeadStoreElimination/combined-partial-overwrites.ll +++ b/llvm/test/Transforms/DeadStoreElimination/combined-partial-overwrites.ll @@ -1,6 +1,6 @@ ; RUN: opt -S -dse -enable-dse-partial-store-merging=false < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" %"struct.std::complex" = type { { float, float } } diff --git a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll index 4c5f18a26657c..5a6daa2c9a008 100644 --- a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll +++ b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll @@ -4,7 +4,7 @@ ; RUN: opt -passes="function(ee-instrument),function(ee-instrument),cgscc(inline),function(post-inline-ee-instrument),function(post-inline-ee-instrument)" -S < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" define void @leaf_function() #0 { entry: diff --git a/llvm/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll b/llvm/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll deleted file mode 100644 index e9710df5670cd..0000000000000 --- a/llvm/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll +++ /dev/null @@ -1,165 +0,0 @@ -; RUN: opt -S -instcombine < %s | FileCheck %s -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - -declare <4 x double> @llvm.ppc.qpx.qvlfs(i8*) #1 - -define <4 x double> @test1(<4 x float>* %h) #0 { -entry: - %h1 = getelementptr <4 x float>, <4 x float>* %h, i64 1 - %hv = bitcast <4 x float>* %h1 to i8* - %vl = call <4 x double> @llvm.ppc.qpx.qvlfs(i8* %hv) - -; CHECK-LABEL: @test1 -; CHECK: @llvm.ppc.qpx.qvlfs -; CHECK: ret <4 x double> - - %v0 = load <4 x float>, <4 x float>* %h, align 8 - %v0e = fpext <4 x float> %v0 to <4 x double> - %a = fadd <4 x double> %v0e, %vl - ret <4 x double> %a -} - -define <4 x double> @test1a(<4 x float>* align 16 %h) #0 { -entry: - %h1 = getelementptr <4 x float>, <4 x float>* %h, i64 1 - %hv = bitcast <4 x float>* %h1 to i8* - %vl = call <4 x double> @llvm.ppc.qpx.qvlfs(i8* %hv) - -; CHECK-LABEL: @test1a -; CHECK-NOT: @llvm.ppc.qpx.qvlfs -; CHECK-NOT: 
load <4 x double> -; CHECK: ret <4 x double> - - %v0 = load <4 x float>, <4 x float>* %h, align 8 - %v0e = fpext <4 x float> %v0 to <4 x double> - %a = fadd <4 x double> %v0e, %vl - ret <4 x double> %a -} - -declare void @llvm.ppc.qpx.qvstfs(<4 x double>, i8*) #0 - -define <4 x float> @test2(<4 x float>* %h, <4 x double> %d) #0 { -entry: - %h1 = getelementptr <4 x float>, <4 x float>* %h, i64 1 - %hv = bitcast <4 x float>* %h1 to i8* - call void @llvm.ppc.qpx.qvstfs(<4 x double> %d, i8* %hv) - - %v0 = load <4 x float>, <4 x float>* %h, align 8 - ret <4 x float> %v0 - -; CHECK-LABEL: @test2 -; CHECK: @llvm.ppc.qpx.qvstfs -; CHECK: ret <4 x float> -} - -define <4 x float> @test2a(<4 x float>* align 16 %h, <4 x double> %d) #0 { -entry: - %h1 = getelementptr <4 x float>, <4 x float>* %h, i64 1 - %hv = bitcast <4 x float>* %h1 to i8* - call void @llvm.ppc.qpx.qvstfs(<4 x double> %d, i8* %hv) - - %v0 = load <4 x float>, <4 x float>* %h, align 8 - ret <4 x float> %v0 - -; CHECK-LABEL: @test2 -; CHECK: fptrunc <4 x double> %d to <4 x float> -; CHECK-NOT: @llvm.ppc.qpx.qvstfs -; CHECK-NOT: store <4 x double> -; CHECK: ret <4 x float> -} - -declare <4 x double> @llvm.ppc.qpx.qvlfd(i8*) #1 - -define <4 x double> @test1l(<4 x double>* %h) #0 { -entry: - %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1 - %hv = bitcast <4 x double>* %h1 to i8* - %vl = call <4 x double> @llvm.ppc.qpx.qvlfd(i8* %hv) - -; CHECK-LABEL: @test1l -; CHECK: @llvm.ppc.qpx.qvlfd -; CHECK: ret <4 x double> - - %v0 = load <4 x double>, <4 x double>* %h, align 8 - %a = fadd <4 x double> %v0, %vl - ret <4 x double> %a -} - -define <4 x double> @test1ln(<4 x double>* align 16 %h) #0 { -entry: - %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1 - %hv = bitcast <4 x double>* %h1 to i8* - %vl = call <4 x double> @llvm.ppc.qpx.qvlfd(i8* %hv) - -; CHECK-LABEL: @test1ln -; CHECK: @llvm.ppc.qpx.qvlfd -; CHECK: ret <4 x double> - - %v0 = load <4 x double>, <4 x double>* %h, align 8 - %a = fadd <4 x double> %v0, %vl - ret <4 x double> %a -} - -define <4 x double> @test1la(<4 x double>* align 32 %h) #0 { -entry: - %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1 - %hv = bitcast <4 x double>* %h1 to i8* - %vl = call <4 x double> @llvm.ppc.qpx.qvlfd(i8* %hv) - -; CHECK-LABEL: @test1la -; CHECK-NOT: @llvm.ppc.qpx.qvlfd -; CHECK: ret <4 x double> - - %v0 = load <4 x double>, <4 x double>* %h, align 8 - %a = fadd <4 x double> %v0, %vl - ret <4 x double> %a -} - -declare void @llvm.ppc.qpx.qvstfd(<4 x double>, i8*) #0 - -define <4 x double> @test2l(<4 x double>* %h, <4 x double> %d) #0 { -entry: - %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1 - %hv = bitcast <4 x double>* %h1 to i8* - call void @llvm.ppc.qpx.qvstfd(<4 x double> %d, i8* %hv) - - %v0 = load <4 x double>, <4 x double>* %h, align 8 - ret <4 x double> %v0 - -; CHECK-LABEL: @test2l -; CHECK: @llvm.ppc.qpx.qvstfd -; CHECK: ret <4 x double> -} - -define <4 x double> @test2ln(<4 x double>* align 16 %h, <4 x double> %d) #0 { -entry: - %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1 - %hv = bitcast <4 x double>* %h1 to i8* - call void @llvm.ppc.qpx.qvstfd(<4 x double> %d, i8* %hv) - - %v0 = load <4 x double>, <4 x double>* %h, align 8 - ret <4 x double> %v0 - -; CHECK-LABEL: @test2ln -; CHECK: @llvm.ppc.qpx.qvstfd -; CHECK: ret <4 x double> -} - -define <4 x double> @test2la(<4 x double>* align 32 %h, <4 x double> %d) #0 { -entry: - %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1 - %hv = bitcast <4 x double>* %h1 to i8* - call void 
@llvm.ppc.qpx.qvstfd(<4 x double> %d, i8* %hv) - - %v0 = load <4 x double>, <4 x double>* %h, align 8 - ret <4 x double> %v0 - -; CHECK-LABEL: @test2l -; CHECK-NOT: @llvm.ppc.qpx.qvstfd -; CHECK: ret <4 x double> -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readonly } - diff --git a/llvm/test/Transforms/LoopDataPrefetch/PowerPC/basic.ll b/llvm/test/Transforms/LoopDataPrefetch/PowerPC/basic.ll index ea46fd0d5a8f8..68c75af14f3e9 100644 --- a/llvm/test/Transforms/LoopDataPrefetch/PowerPC/basic.ll +++ b/llvm/test/Transforms/LoopDataPrefetch/PowerPC/basic.ll @@ -1,7 +1,6 @@ -; RUN: opt -mcpu=a2 -loop-data-prefetch -S < %s | FileCheck %s -; RUN: opt -mcpu=a2 -passes=loop-data-prefetch -S < %s | FileCheck %s +; RUN: opt -mcpu=a2 -loop-data-prefetch -mtriple=powerpc64le-unknown-linux -enable-ppc-prefetching -S < %s | FileCheck %s +; RUN: opt -mcpu=a2 -passes=loop-data-prefetch -mtriple=powerpc64le-unknown-linux -enable-ppc-prefetching -S < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" define void @foo(double* nocapture %a, double* nocapture readonly %b) { entry: diff --git a/llvm/test/Transforms/LoopSimplify/dup-preds.ll b/llvm/test/Transforms/LoopSimplify/dup-preds.ll index c9253fa51a65f..362d834686d41 100644 --- a/llvm/test/Transforms/LoopSimplify/dup-preds.ll +++ b/llvm/test/Transforms/LoopSimplify/dup-preds.ll @@ -1,6 +1,6 @@ ; RUN: opt -loop-simplify -S %s | FileCheck %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" define fastcc void @do_update_md([3 x float]* nocapture readonly %x) #0 { entry: diff --git a/llvm/test/Transforms/LoopUnroll/pr14167.ll b/llvm/test/Transforms/LoopUnroll/pr14167.ll index 9aac70115d9ae..3097c234fb933 100644 --- a/llvm/test/Transforms/LoopUnroll/pr14167.ll +++ b/llvm/test/Transforms/LoopUnroll/pr14167.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -S -loop-unroll -unroll-runtime | FileCheck %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" define void @test1() nounwind { ; Ensure that we don't crash when the trip count == -1. 
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll deleted file mode 100644 index 9fdfb6f90e7bf..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll +++ /dev/null @@ -1,40 +0,0 @@ -; RUN: opt -S -basic-aa -loop-vectorize < %s | FileCheck %s -target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - -; Function Attrs: nounwind -define void @foo(double* noalias nocapture %a, double* noalias nocapture readonly %b, double* noalias nocapture readonly %c) #0 { -entry: - br label %for.body - -; CHECK-LABEL: @foo -; CHECK: fmul <4 x double> %{{[^,]+}}, -; CHECK-NEXT: fmul <4 x double> %{{[^,]+}}, - -for.cond.cleanup: ; preds = %for.body - ret void - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv - %0 = load double, double* %arrayidx, align 8 - %mul = fmul double %0, 2.000000e+00 - %mul3 = fmul double %0, %mul - %arrayidx5 = getelementptr inbounds double, double* %c, i64 %indvars.iv - %1 = load double, double* %arrayidx5, align 8 - %mul6 = fmul double %1, 3.000000e+00 - %mul9 = fmul double %1, %mul6 - %add = fadd double %mul3, %mul9 - %mul12 = fmul double %0, 4.000000e+00 - %mul15 = fmul double %mul12, %1 - %add16 = fadd double %mul15, %add - %add17 = fadd double %add16, 1.000000e+00 - %arrayidx19 = getelementptr inbounds double, double* %a, i64 %indvars.iv - store double %add17, double* %arrayidx19, align 8 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 1600 - br i1 %exitcond, label %for.cond.cleanup, label %for.body -} - -attributes #0 = { nounwind "target-cpu"="a2q" } - diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll index 8abc25ece35c6..cddddba579473 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll @@ -1,6 +1,6 @@ ; RUN: opt -S -loop-vectorize < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" ; Function Attrs: nounwind define zeroext i32 @test() #0 { diff --git a/llvm/test/Transforms/NewGVN/pr31483.ll b/llvm/test/Transforms/NewGVN/pr31483.ll index 999ff74ad5881..5bf7e1a695011 100644 --- a/llvm/test/Transforms/NewGVN/pr31483.ll +++ b/llvm/test/Transforms/NewGVN/pr31483.ll @@ -100,7 +100,7 @@ declare signext i32 @zot(i8*, ...) 
#1 ; Function Attrs: nounwind declare void @llvm.va_end(i8*) #2 -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-qpx,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-qpx,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #2 = { nounwind } diff --git a/llvm/test/Transforms/SCCP/fp-bc-icmp-const-fold.ll b/llvm/test/Transforms/SCCP/fp-bc-icmp-const-fold.ll index 8f97225ca446b..20c44384504e2 100644 --- a/llvm/test/Transforms/SCCP/fp-bc-icmp-const-fold.ll +++ b/llvm/test/Transforms/SCCP/fp-bc-icmp-const-fold.ll @@ -1,6 +1,6 @@ ; RUN: opt -S -ipsccp < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" define void @test(i32 signext %n) { diff --git a/llvm/unittests/ADT/TripleTest.cpp b/llvm/unittests/ADT/TripleTest.cpp index dc7a28c72f208..1852d7b6a1b0d 100644 --- a/llvm/unittests/ADT/TripleTest.cpp +++ b/llvm/unittests/ADT/TripleTest.cpp @@ -111,41 +111,6 @@ TEST(TripleTest, ParsedIDs) { EXPECT_EQ(Triple::Linux, T.getOS()); EXPECT_EQ(Triple::Musl, T.getEnvironment()); - T = Triple("powerpc-bgp-linux"); - EXPECT_EQ(Triple::ppc, T.getArch()); - EXPECT_EQ(Triple::BGP, T.getVendor()); - EXPECT_EQ(Triple::Linux, T.getOS()); - EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); - - T = Triple("powerpc-bgp-cnk"); - EXPECT_EQ(Triple::ppc, T.getArch()); - EXPECT_EQ(Triple::BGP, T.getVendor()); - EXPECT_EQ(Triple::CNK, T.getOS()); - EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); - - T = Triple("ppc-bgp-linux"); - EXPECT_EQ(Triple::ppc, T.getArch()); - EXPECT_EQ(Triple::BGP, T.getVendor()); - EXPECT_EQ(Triple::Linux, T.getOS()); - EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); - - T = Triple("ppc32-bgp-linux"); - EXPECT_EQ(Triple::ppc, T.getArch()); - EXPECT_EQ(Triple::BGP, T.getVendor()); - EXPECT_EQ(Triple::Linux, T.getOS()); - EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); - - T = Triple("powerpc64-bgq-linux"); - EXPECT_EQ(Triple::ppc64, T.getArch()); - EXPECT_EQ(Triple::BGQ, T.getVendor()); - EXPECT_EQ(Triple::Linux, T.getOS()); - EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); - - T = Triple("ppc64-bgq-linux"); - EXPECT_EQ(Triple::ppc64, T.getArch()); - EXPECT_EQ(Triple::BGQ, T.getVendor()); - EXPECT_EQ(Triple::Linux, 
T.getOS()); - T = Triple("powerpc-ibm-aix"); EXPECT_EQ(Triple::ppc, T.getArch()); EXPECT_EQ(Triple::IBM, T.getVendor()); diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn index 043a672a76e1e..3a452fc6e0601 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn @@ -61,7 +61,6 @@ static_library("LLVMPowerPCCodeGen") { "PPCMachineScheduler.cpp", "PPCMacroFusion.cpp", "PPCPreEmitPeephole.cpp", - "PPCQPXLoadSplat.cpp", "PPCReduceCRLogicals.cpp", "PPCRegisterInfo.cpp", "PPCSubtarget.cpp", diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index 933573bc810cb..bb6cee740ace7 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -1116,9 +1116,6 @@ extern kmp_uint64 __kmp_now_nsec(); #if KMP_OS_WINDOWS #define KMP_INIT_WAIT 64U /* initial number of spin-tests */ #define KMP_NEXT_WAIT 32U /* susequent number of spin-tests */ -#elif KMP_OS_CNK -#define KMP_INIT_WAIT 16U /* initial number of spin-tests */ -#define KMP_NEXT_WAIT 8U /* susequent number of spin-tests */ #elif KMP_OS_LINUX #define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ #define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp index b5c641cc7273c..f6fb1e602c297 100644 --- a/openmp/runtime/src/kmp_csupport.cpp +++ b/openmp/runtime/src/kmp_csupport.cpp @@ -680,17 +680,6 @@ void __kmpc_flush(ident_t *loc) { // Nothing to see here move along #elif KMP_ARCH_PPC64 // Nothing needed here (we have a real MB above). -#if KMP_OS_CNK - // The flushing thread needs to yield here; this prevents a - // busy-waiting thread from saturating the pipeline. flush is - // often used in loops like this: - // while (!flag) { - // #pragma omp flush(flag) - // } - // and adding the yield here is good for at least a 10x speedup - // when running >2 threads per core (on the NAS LU benchmark). - __kmp_yield(); -#endif #else #error Unknown or unsupported architecture #endif diff --git a/openmp/runtime/src/kmp_lock.h b/openmp/runtime/src/kmp_lock.h index e54f6812b8b34..b80e54777e8c2 100644 --- a/openmp/runtime/src/kmp_lock.h +++ b/openmp/runtime/src/kmp_lock.h @@ -159,7 +159,7 @@ extern void __kmp_destroy_nested_tas_lock(kmp_tas_lock_t *lck); #define KMP_LOCK_ACQUIRED_NEXT 0 #ifndef KMP_USE_FUTEX #define KMP_USE_FUTEX \ - (KMP_OS_LINUX && !KMP_OS_CNK && \ + (KMP_OS_LINUX && \ (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)) #endif #if KMP_USE_FUTEX diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h index bfe7765b2a967..33735cf455c7e 100644 --- a/openmp/runtime/src/kmp_os.h +++ b/openmp/runtime/src/kmp_os.h @@ -69,7 +69,7 @@ #error Unknown compiler #endif -#if (KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_FREEBSD) && !KMP_OS_CNK +#if (KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_FREEBSD) #define KMP_AFFINITY_SUPPORTED 1 #if KMP_OS_WINDOWS && KMP_ARCH_X86_64 #define KMP_GROUP_AFFINITY 1 diff --git a/openmp/runtime/src/kmp_platform.h b/openmp/runtime/src/kmp_platform.h index 779c08e9771d5..4296ca31d67d9 100644 --- a/openmp/runtime/src/kmp_platform.h +++ b/openmp/runtime/src/kmp_platform.h @@ -22,7 +22,6 @@ #define KMP_OS_OPENBSD 0 #define KMP_OS_DARWIN 0 #define KMP_OS_WINDOWS 0 -#define KMP_OS_CNK 0 #define KMP_OS_HURD 0 #define KMP_OS_UNIX 0 /* disjunction of KMP_OS_LINUX, KMP_OS_DARWIN etc. 
*/ @@ -66,11 +65,6 @@ #define KMP_OS_OPENBSD 1 #endif -#if (defined __bgq__) -#undef KMP_OS_CNK -#define KMP_OS_CNK 1 -#endif - #if (defined __GNU__) #undef KMP_OS_HURD #define KMP_OS_HURD 1 diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S index 8090ff759fe1b..16059a3762bf4 100644 --- a/openmp/runtime/src/z_Linux_asm.S +++ b/openmp/runtime/src/z_Linux_asm.S @@ -1433,13 +1433,8 @@ __kmp_invoke_microtask: add 12, 0, 12 neg 12, 12 -// We need to make sure that the stack frame stays aligned (to 16 bytes, except -// under the BG/Q CNK, where it must be to 32 bytes). -# if KMP_OS_CNK - li 0, -32 -# else +// We need to make sure that the stack frame stays aligned (to 16 bytes). li 0, -16 -# endif and 12, 0, 12 // Establish the local stack frame. diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp index 3b5910fc95e89..58cc4d25f6080 100644 --- a/openmp/runtime/src/z_Linux_util.cpp +++ b/openmp/runtime/src/z_Linux_util.cpp @@ -31,7 +31,7 @@ #include #include -#if KMP_OS_LINUX && !KMP_OS_CNK +#if KMP_OS_LINUX #include #if KMP_USE_FUTEX // We should really include , but that causes compatibility problems on diff --git a/polly/lib/External/isl/config.sub b/polly/lib/External/isl/config.sub index 1d8e98bcee23a..bc4db70f82abf 100755 --- a/polly/lib/External/isl/config.sub +++ b/polly/lib/External/isl/config.sub @@ -152,9 +152,6 @@ case $os in os= basic_machine=$1 ;; - -bluegene*) - os=-cnk - ;; -sim | -cisco | -oki | -wec | -winbond) os= basic_machine=$1 @@ -539,10 +536,6 @@ case $basic_machine in basic_machine=bfin-`echo "$basic_machine" | sed 's/^[^-]*-//'` os=-linux ;; - bluegene*) - basic_machine=powerpc-ibm - os=-cnk - ;; c54x-*) basic_machine=tic54x-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; @@ -1364,7 +1357,7 @@ case $os in # Each alternative MUST end in a * to match a version number. # -sysv* is not here because it comes later, after sysvr4. -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ - | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ + | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ | -sym* | -kopensolaris* | -plan9* \ | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ @@ -1728,7 +1721,7 @@ case $basic_machine in -sunos*) vendor=sun ;; - -cnk*|-aix*) + -aix*) vendor=ibm ;; -beos*) diff --git a/polly/lib/External/ppcg/config.sub b/polly/lib/External/ppcg/config.sub index 6205f8423d6aa..d97f3009f9f09 100644 --- a/polly/lib/External/ppcg/config.sub +++ b/polly/lib/External/ppcg/config.sub @@ -160,9 +160,6 @@ case $os in os= basic_machine=$1 ;; - -bluegene*) - os=-cnk - ;; -sim | -cisco | -oki | -wec | -winbond) os= basic_machine=$1 @@ -524,10 +521,6 @@ case $basic_machine in basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'` os=-linux ;; - bluegene*) - basic_machine=powerpc-ibm - os=-cnk - ;; c54x-*) basic_machine=tic54x-`echo $basic_machine | sed 's/^[^-]*-//'` ;; @@ -1344,7 +1337,7 @@ case $os in # Each alternative MUST END IN A *, to match a version number. # -sysv* is not here because it comes later, after sysvr4. 
-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ - | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ + | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ | -sym* | -kopensolaris* \ | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ @@ -1709,7 +1702,7 @@ case $basic_machine in -sunos*) vendor=sun ;; - -cnk*|-aix*) + -aix*) vendor=ibm ;; -beos*) From 536baa11cfe12362ea646ad731a2274a07208cc0 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 27 Jul 2020 12:30:09 -0700 Subject: [PATCH 0235/1035] [lldb] Remove CMAKE_VERSION checks now that the minimum version is 3.13.4 --- .../modules/FindPythonInterpAndLibs.cmake | 38 ++++--------------- lldb/cmake/modules/LLDBConfig.cmake | 5 --- 2 files changed, 7 insertions(+), 36 deletions(-) diff --git a/lldb/cmake/modules/FindPythonInterpAndLibs.cmake b/lldb/cmake/modules/FindPythonInterpAndLibs.cmake index 243e0463f48b6..3a64ebbcf9721 100644 --- a/lldb/cmake/modules/FindPythonInterpAndLibs.cmake +++ b/lldb/cmake/modules/FindPythonInterpAndLibs.cmake @@ -61,46 +61,22 @@ if(PYTHON_LIBRARIES AND PYTHON_INCLUDE_DIRS AND PYTHON_EXECUTABLE AND SWIG_EXECU else() find_package(SWIG 2.0) if (SWIG_FOUND) - if(NOT CMAKE_VERSION VERSION_LESS 3.12) - if (LLDB_PYTHON_VERSION) - if (LLDB_PYTHON_VERSION VERSION_EQUAL "2") - FindPython2() - elseif(LLDB_PYTHON_VERSION VERSION_EQUAL "3") - FindPython3() - endif() - else() + if (LLDB_PYTHON_VERSION) + if (LLDB_PYTHON_VERSION VERSION_EQUAL "2") + FindPython2() + elseif(LLDB_PYTHON_VERSION VERSION_EQUAL "3") FindPython3() - if (NOT PYTHON3_FOUND AND NOT CMAKE_SYSTEM_NAME STREQUAL Windows) - FindPython2() - endif() endif() else() - find_package(PythonInterp) - find_package(PythonLibs) - if(PYTHONINTERP_FOUND AND PYTHONLIBS_FOUND AND SWIG_FOUND) - if (NOT CMAKE_CROSSCOMPILING) - string(REPLACE "." ";" pythonlibs_version_list ${PYTHONLIBS_VERSION_STRING}) - list(GET pythonlibs_version_list 0 pythonlibs_major) - list(GET pythonlibs_version_list 1 pythonlibs_minor) - - # Ignore the patch version. Some versions of macOS report a different - # patch version for the system provided interpreter and libraries. 
-          if (CMAKE_CROSSCOMPILING OR (PYTHON_VERSION_MAJOR VERSION_EQUAL pythonlibs_major AND
-                                       PYTHON_VERSION_MINOR VERSION_EQUAL pythonlibs_minor))
-            mark_as_advanced(
-              PYTHON_LIBRARIES
-              PYTHON_INCLUDE_DIRS
-              PYTHON_EXECUTABLE
-              SWIG_EXECUTABLE)
-          endif()
-        endif()
      endif()
    endif()
  else()
    message(STATUS "SWIG 2 or later is required for Python support in LLDB but could not be found")
  endif()
-include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(PythonInterpAndLibs
                                   FOUND_VAR
diff --git a/lldb/cmake/modules/LLDBConfig.cmake b/lldb/cmake/modules/LLDBConfig.cmake
index 8465cfe3b7b72..7e5848c800f87 100644
--- a/lldb/cmake/modules/LLDBConfig.cmake
+++ b/lldb/cmake/modules/LLDBConfig.cmake
@@ -79,11 +79,6 @@ if(LLDB_BUILD_FRAMEWORK)
  if(NOT APPLE)
    message(FATAL_ERROR "LLDB.framework can only be generated when targeting Apple platforms")
  endif()
-  # CMake 3.6 did not correctly emit POST_BUILD commands for Apple Framework targets
-  # CMake < 3.8 did not have the BUILD_RPATH target property
-  if(CMAKE_VERSION VERSION_LESS 3.8)
-    message(FATAL_ERROR "LLDB_BUILD_FRAMEWORK is not supported on CMake < 3.8")
-  endif()
  set(LLDB_FRAMEWORK_VERSION A CACHE STRING "LLDB.framework version (default is A)")
  set(LLDB_FRAMEWORK_BUILD_DIR bin CACHE STRING "Output directory for LLDB.framework")

From 6dadf7cb654ce131c1d05b3add8e30344b6d22be Mon Sep 17 00:00:00 2001
From: Jon Roelofs
Date: Mon, 27 Jul 2020 13:37:35 -0600
Subject: [PATCH 0236/1035] [llvm][examples][SimplifyCFG] Fix pass's IR changed reporting

... under the EXPENSIVE_CHECKS build, this fails the assert in the LegacyPM
that verifies whether a pass really did leave the IR alone when it reports
no changes back from its return status.

---
 llvm/examples/IRTransforms/SimplifyCFG.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/examples/IRTransforms/SimplifyCFG.cpp b/llvm/examples/IRTransforms/SimplifyCFG.cpp
index 10658c9f09590..82368d7494212 100644
--- a/llvm/examples/IRTransforms/SimplifyCFG.cpp
+++ b/llvm/examples/IRTransforms/SimplifyCFG.cpp
@@ -354,18 +354,18 @@ static bool mergeIntoSinglePredecessor_v2(Function &F, DominatorTree &DT) {
 }

 static bool doSimplify_v1(Function &F) {
-  return eliminateCondBranches_v1(F) & mergeIntoSinglePredecessor_v1(F) &
+  return eliminateCondBranches_v1(F) | mergeIntoSinglePredecessor_v1(F) |
          removeDeadBlocks_v1(F);
 }

 static bool doSimplify_v2(Function &F, DominatorTree &DT) {
-  return eliminateCondBranches_v2(F, DT) &
-         mergeIntoSinglePredecessor_v2(F, DT) & removeDeadBlocks_v2(F, DT);
+  return eliminateCondBranches_v2(F, DT) |
+         mergeIntoSinglePredecessor_v2(F, DT) | removeDeadBlocks_v2(F, DT);
 }

 static bool doSimplify_v3(Function &F, DominatorTree &DT) {
-  return eliminateCondBranches_v3(F, DT) &
-         mergeIntoSinglePredecessor_v2(F, DT) & removeDeadBlocks_v2(F, DT);
+  return eliminateCondBranches_v3(F, DT) |
+         mergeIntoSinglePredecessor_v2(F, DT) | removeDeadBlocks_v2(F, DT);
 }

 namespace {
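To see why the fix above uses bitwise | rather than & (and not the short-circuiting || either), consider a minimal standalone sketch; the transform names below are placeholders, not the pass's real functions. Every transform must still run unconditionally, but the pass has to report "changed" as soon as any one of them modified the IR. With &, a pass that did change the IR can still return false, which is exactly what trips the LegacyPM assert under EXPENSIVE_CHECKS.

#include <cstdio>

static bool transformA() { return true; }  // pretend: modified the IR
static bool transformB() { return false; } // pretend: left the IR alone

int main() {
  // Both operands are evaluated in each expression; only the result differs.
  bool withAnd = transformA() & transformB(); // false -- the change is lost
  bool withOr = transformA() | transformB();  // true -- the change is reported
  std::printf("with &: %d, with |: %d\n", withAnd, withOr);
  return 0;
}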
From 4c6eebf86a0734779cd20473cfcaa9d7c8899298 Mon Sep 17 00:00:00 2001
From: Fred Riss
Date: Fri, 24 Jul 2020 09:24:41 -0700
Subject: [PATCH 0237/1035] [lldb/AppleSimulator] Always provide a -simulator environment

Summary:
This commit is somewhat NFC-ish today as the environment of triples is not
considered when comparing ArchSpecs if one of them is not set (I plan to
change that).

We have made simulator triples unambiguous these days, but the simulator
platforms still advertise triples without the environment. This wasn't an
issue when the sims ran only on a very different architecture than the real
device, but this has changed with Apple Silicon.

This patch simplifies the way GetSupportedArchitectureAtIndex is implemented
for the sim platforms and adds the environment. It also trivially adds
support for Apple Silicon to those platforms.

Reviewers: aprantl

Subscribers: lldb-commits

---
 .../MacOSX/PlatformAppleSimulator.cpp         |  8 ++
 .../Platform/MacOSX/PlatformAppleSimulator.h  |  6 ++
 .../MacOSX/PlatformAppleTVSimulator.cpp       | 36 ++++-----
 .../MacOSX/PlatformAppleTVSimulator.h         |  3 -
 .../MacOSX/PlatformAppleWatchSimulator.cpp    | 40 +++++-----
 .../MacOSX/PlatformAppleWatchSimulator.h      |  3 -
 .../Platform/MacOSX/PlatformiOSSimulator.cpp  | 61 +++++----------
 .../Platform/MacOSX/PlatformiOSSimulator.h    |  3 -
 lldb/unittests/Platform/CMakeLists.txt        |  1 +
 .../Platform/PlatformAppleSimulatorTest.cpp   | 74 +++++++++++++++++++
 10 files changed, 146 insertions(+), 89 deletions(-)
 create mode 100644 lldb/unittests/Platform/PlatformAppleSimulatorTest.cpp

diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp
index bd0a231303bd1..0160fb95c58a9 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp
@@ -253,3 +253,11 @@ CoreSimulatorSupport::Device PlatformAppleSimulator::GetSimulatorDevice() {
    return CoreSimulatorSupport::Device();
 }
 #endif
+
+bool PlatformAppleSimulator::GetSupportedArchitectureAtIndex(uint32_t idx,
+                                                             ArchSpec &arch) {
+  if (idx >= m_supported_triples.size())
+    return false;
+  arch = ArchSpec(m_supported_triples[idx]);
+  return true;
+}
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h
index 8c0174f2946ed..6182acaf229ac 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h
@@ -44,6 +44,9 @@ class PlatformAppleSimulator : public PlatformDarwin {
                       lldb_private::Target *target,
                       lldb_private::Status &error) override;

+  bool GetSupportedArchitectureAtIndex(uint32_t idx,
+                                       lldb_private::ArchSpec &arch) override;
+
 protected:
  std::mutex m_core_sim_path_mutex;
  llvm::Optional<lldb_private::FileSpec> m_core_simulator_framework_path;
@@ -52,6 +55,9 @@
  lldb_private::FileSpec GetCoreSimulatorPath();

+  llvm::Triple::OSType m_os_type = llvm::Triple::UnknownOS;
+  llvm::ArrayRef<llvm::StringRef> m_supported_triples = {};
+
  void LoadCoreSimulator();

 #if defined(__APPLE__)
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.cpp
index 461624a2adaa8..27f798b00ebf4 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.cpp
@@ -77,6 +77,7 @@ PlatformSP PlatformAppleTVSimulator::CreateInstance(bool force,
  bool create = force;
  if (!create && arch && arch->IsValid()) {
    switch (arch->GetMachine()) {
+    case llvm::Triple::aarch64:
    case llvm::Triple::x86_64: {
      const llvm::Triple &triple = arch->GetTriple();
      switch (triple.getVendor()) {
@@ -144,7 +145,24 @@ const char *PlatformAppleTVSimulator::GetDescriptionStatic() {
 /// Default Constructor
PlatformAppleTVSimulator::PlatformAppleTVSimulator() : PlatformAppleSimulator( - CoreSimulatorSupport::DeviceType::ProductFamilyID::appleTV) {} + CoreSimulatorSupport::DeviceType::ProductFamilyID::appleTV) { +#ifdef __APPLE__ +#if __arm64__ + static const llvm::StringRef supported_triples[] = { + "arm64e-apple-tvos-simulator", + "arm64-apple-tvos-simulator", + "x86_64h-apple-tvos-simulator", + "x86_64-apple-tvos-simulator", + }; +#else + static const llvm::StringRef supported_triples[] = { + "x86_64h-apple-tvos-simulator", + "x86_64-apple-tvos-simulator", + }; +#endif + m_supported_triples = supported_triples; +#endif +} /// Destructor. /// @@ -322,19 +340,3 @@ uint32_t PlatformAppleTVSimulator::FindProcesses( } return process_infos.size(); } - -bool PlatformAppleTVSimulator::GetSupportedArchitectureAtIndex(uint32_t idx, - ArchSpec &arch) { - static const ArchSpec platform_arch( - HostInfo::GetArchitecture(HostInfo::eArchKind64)); - - if (idx == 0) { - arch = platform_arch; - if (arch.IsValid()) { - arch.GetTriple().setOS(llvm::Triple::TvOS); - arch.GetTriple().setEnvironment(llvm::Triple::Simulator); - return true; - } - } - return false; -} diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.h b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.h index 5a7b0ee0d7dc9..a94f94f9f57f7 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.h @@ -62,9 +62,6 @@ class PlatformAppleTVSimulator : public PlatformAppleSimulator { FindProcesses(const lldb_private::ProcessInstanceInfoMatch &match_info, lldb_private::ProcessInstanceInfoList &process_infos) override; - bool GetSupportedArchitectureAtIndex(uint32_t idx, - lldb_private::ArchSpec &arch) override; - void AddClangModuleCompilationOptions(lldb_private::Target *target, std::vector &options) override { diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.cpp index 03a8fcd313602..79f254c43a6ae 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.cpp @@ -76,6 +76,7 @@ PlatformSP PlatformAppleWatchSimulator::CreateInstance(bool force, bool create = force; if (!create && arch && arch->IsValid()) { switch (arch->GetMachine()) { + case llvm::Triple::aarch64: case llvm::Triple::x86_64: case llvm::Triple::x86: { const llvm::Triple &triple = arch->GetTriple(); @@ -145,7 +146,23 @@ const char *PlatformAppleWatchSimulator::GetDescriptionStatic() { /// Default Constructor PlatformAppleWatchSimulator::PlatformAppleWatchSimulator() : PlatformAppleSimulator( - CoreSimulatorSupport::DeviceType::ProductFamilyID::appleWatch) {} + CoreSimulatorSupport::DeviceType::ProductFamilyID::appleWatch) { +#ifdef __APPLE__ +#if __arm64__ + static const llvm::StringRef supported_triples[] = { + "arm64e-apple-watchos-simulator", + "arm64-apple-watchos-simulator", + }; +#else + static const llvm::StringRef supported_triples[] = { + "x86_64-apple-watchos-simulator", + "x86_64h-apple-watchos-simulator", + "i386-apple-watchos-simulator", + }; +#endif + m_supported_triples = supported_triples; +#endif +} /// Destructor. 
/// @@ -325,24 +342,3 @@ uint32_t PlatformAppleWatchSimulator::FindProcesses( return process_infos.size(); } -bool PlatformAppleWatchSimulator::GetSupportedArchitectureAtIndex( - uint32_t idx, ArchSpec &arch) { - if (idx == 0) { - arch = HostInfo::GetArchitecture(HostInfo::eArchKind32); - if (arch.IsValid()) { - arch.GetTriple().setOS(llvm::Triple::WatchOS); - arch.GetTriple().setEnvironment(llvm::Triple::Simulator); - return true; - } - } - - if (idx == 1) { - arch = HostInfo::GetArchitecture(HostInfo::eArchKind64); - if (arch.IsValid()) { - arch.GetTriple().setOS(llvm::Triple::WatchOS); - arch.GetTriple().setEnvironment(llvm::Triple::Simulator); - return true; - } - } - return false; -} diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.h b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.h index 96dcd16ffa993..78b936691b0c7 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.h @@ -62,9 +62,6 @@ class PlatformAppleWatchSimulator : public PlatformAppleSimulator { FindProcesses(const lldb_private::ProcessInstanceInfoMatch &match_info, lldb_private::ProcessInstanceInfoList &process_infos) override; - bool GetSupportedArchitectureAtIndex(uint32_t idx, - lldb_private::ArchSpec &arch) override; - void AddClangModuleCompilationOptions(lldb_private::Target *target, std::vector &options) override { diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.cpp index a890d0afdf1e3..b73c06fcdc8b4 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.cpp @@ -76,6 +76,7 @@ PlatformSP PlatformiOSSimulator::CreateInstance(bool force, bool create = force; if (!create && arch && arch->IsValid()) { switch (arch->GetMachine()) { + case llvm::Triple::aarch64: case llvm::Triple::x86_64: case llvm::Triple::x86: { const llvm::Triple &triple = arch->GetTriple(); @@ -148,7 +149,25 @@ const char *PlatformiOSSimulator::GetDescriptionStatic() { /// Default Constructor PlatformiOSSimulator::PlatformiOSSimulator() : PlatformAppleSimulator( - CoreSimulatorSupport::DeviceType::ProductFamilyID::iPhone) {} + CoreSimulatorSupport::DeviceType::ProductFamilyID::iPhone) { +#ifdef __APPLE__ +#if __arm64__ + static const llvm::StringRef supported_triples[] = { + "arm64e-apple-ios-simulator", + "arm64-apple-ios-simulator", + "x86_64-apple-ios-simulator", + "x86_64h-apple-ios-simulator", + }; +#else + static const llvm::StringRef supported_triples[] = { + "x86_64h-apple-ios-simulator", + "x86_64-apple-ios-simulator", + "i386-apple-ios-simulator", + }; +#endif + m_supported_triples = supported_triples; +#endif +} /// Destructor. 
/// @@ -328,43 +347,3 @@ PlatformiOSSimulator::FindProcesses(const ProcessInstanceInfoMatch &match_info, return process_infos.size(); } -bool PlatformiOSSimulator::GetSupportedArchitectureAtIndex(uint32_t idx, - ArchSpec &arch) { - static const ArchSpec platform_arch( - HostInfo::GetArchitecture(HostInfo::eArchKindDefault)); - static const ArchSpec platform_arch64( - HostInfo::GetArchitecture(HostInfo::eArchKind64)); - - if (idx == 0) { - arch = platform_arch; - if (arch.IsValid()) { - arch.GetTriple().setOS(llvm::Triple::IOS); - arch.GetTriple().setEnvironment(llvm::Triple::Simulator); - return true; - } - } else { - if (platform_arch.IsExactMatch(platform_arch64)) { - // This macosx platform supports both 32 and 64 bit. - if (idx == 1) { - // 32/64: return "x86_64-apple-macosx" for architecture 1 - arch = platform_arch64; - return true; - } else if (idx == 2 || idx == 3) { - arch = HostInfo::GetArchitecture(HostInfo::eArchKind32); - if (arch.IsValid()) { - if (idx == 2) - arch.GetTriple().setOS(llvm::Triple::IOS); - // 32/64: return "i386-apple-ios" for architecture 2 32/64: return - // "i386-apple-macosx" for architecture 3 - return true; - } - } - } else if (idx == 1) { - // This macosx platform supports only 32 bit, so return the *-apple- - // macosx version - arch = platform_arch; - return true; - } - } - return false; -} diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.h b/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.h index 4d416d759bd2d..982f8e2de5e7a 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.h @@ -64,9 +64,6 @@ class PlatformiOSSimulator : public PlatformAppleSimulator { FindProcesses(const lldb_private::ProcessInstanceInfoMatch &match_info, lldb_private::ProcessInstanceInfoList &process_infos) override; - bool GetSupportedArchitectureAtIndex(uint32_t idx, - lldb_private::ArchSpec &arch) override; - void AddClangModuleCompilationOptions(lldb_private::Target *target, std::vector &options) override { diff --git a/lldb/unittests/Platform/CMakeLists.txt b/lldb/unittests/Platform/CMakeLists.txt index eb7f0a6ca3c41..ca5031b9b43e0 100644 --- a/lldb/unittests/Platform/CMakeLists.txt +++ b/lldb/unittests/Platform/CMakeLists.txt @@ -1,4 +1,5 @@ add_lldb_unittest(LLDBPlatformTests + PlatformAppleSimulatorTest.cpp PlatformDarwinTest.cpp LINK_LIBS diff --git a/lldb/unittests/Platform/PlatformAppleSimulatorTest.cpp b/lldb/unittests/Platform/PlatformAppleSimulatorTest.cpp new file mode 100644 index 0000000000000..0b90380b797c5 --- /dev/null +++ b/lldb/unittests/Platform/PlatformAppleSimulatorTest.cpp @@ -0,0 +1,74 @@ +//===-- PlatformAppleSimulatorTest.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "gtest/gtest.h"
+
+#include "Plugins/Platform/MacOSX/PlatformAppleTVSimulator.h"
+#include "Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.h"
+#include "Plugins/Platform/MacOSX/PlatformiOSSimulator.h"
+#include "TestingSupport/SubsystemRAII.h"
+#include "lldb/Host/FileSystem.h"
+#include "lldb/Host/HostInfo.h"
+#include "lldb/Target/Platform.h"
+
+using namespace lldb;
+using namespace lldb_private;
+
+class PlatformAppleSimulatorTest : public ::testing::Test {
+  SubsystemRAII subsystems;
+};
+
+#ifdef __APPLE__
+
+static void testSimPlatformArchHasSimEnvironment(llvm::StringRef name) {
+  Status error;
+  auto platform_sp = Platform::Create(ConstString(name), error);
+  ASSERT_TRUE(platform_sp);
+  int num_arches = 0;
+
+  while (true) {
+    ArchSpec arch;
+    if (!platform_sp->GetSupportedArchitectureAtIndex(num_arches, arch))
+      break;
+    EXPECT_EQ(arch.GetTriple().getEnvironment(), llvm::Triple::Simulator);
+    num_arches++;
+  }
+
+  EXPECT_GT(num_arches, 0);
+}
+
+TEST_F(PlatformAppleSimulatorTest, TestSimHasSimEnvironment) {
+  testSimPlatformArchHasSimEnvironment("ios-simulator");
+  testSimPlatformArchHasSimEnvironment("tvos-simulator");
+  testSimPlatformArchHasSimEnvironment("watchos-simulator");
+}
+
+TEST_F(PlatformAppleSimulatorTest, TestHostPlatformToSim) {
+  static const ArchSpec platform_arch(
+      HostInfo::GetArchitecture(HostInfo::eArchKindDefault));
+
+  const llvm::Triple::OSType sim_platforms[] = {
+      llvm::Triple::IOS,
+      llvm::Triple::TvOS,
+      llvm::Triple::WatchOS,
+  };
+
+  for (auto sim : sim_platforms) {
+    ArchSpec arch = platform_arch;
+    arch.GetTriple().setOS(sim);
+    arch.GetTriple().setEnvironment(llvm::Triple::Simulator);
+
+    Status error;
+    auto platform_sp = Platform::Create(arch, nullptr, error);
+    EXPECT_TRUE(platform_sp);
+  }
+}
+
+#endif

From 932316660179c1273e365d9dbbe648478bc5c4f1 Mon Sep 17 00:00:00 2001
From: Ye Luo
Date: Mon, 27 Jul 2020 16:08:19 -0400
Subject: [PATCH 0238/1035] [OpenMP] Add more pass-through functions in DeviceTy

Summary:
1. Add DeviceTy::data_alloc, DeviceTy::data_delete, and DeviceTy::synchronize
pass-through functions. Avoid directly accessing Device.RTL.
2. Fix the type of the first argument of synchronize_ty in rtl.h; the device
id is int32_t, which is consistent with the other functions.
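A minimal sketch of the pass-through idiom point 1 describes, reduced to two members (illustrative only; the real declarations live in device.h and rtl.h in the diff below, with more fields and functions): DeviceTy forwards to the plugin's function table, so callers never touch Device.RTL or Device.RTLDeviceID themselves.

#include <cstdint>

struct RTLInfoTy {
  // Stand-ins for the entry points resolved from the plugin library.
  void *(*data_alloc)(int32_t DeviceID, int64_t Size, void *HstPtr);
  int32_t (*data_delete)(int32_t DeviceID, void *TgtPtrBegin);
};

struct DeviceTy {
  int32_t RTLDeviceID;
  RTLInfoTy *RTL;
  // Pass-throughs: the device id is supplied here, once, not by every caller.
  void *data_alloc(int64_t Size, void *HstPtr = nullptr) {
    return RTL->data_alloc(RTLDeviceID, Size, HstPtr);
  }
  int32_t data_delete(void *TgtPtrBegin) {
    return RTL->data_delete(RTLDeviceID, TgtPtrBegin);
  }
};

With that shape, a caller such as omp_target_alloc shrinks to Devices[device_num].data_alloc(size), as the api.cpp hunk below shows.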
Reviewers: tianshilei1992, jdoerfert Reviewed By: tianshilei1992 Subscribers: yaxunl, guansong, sstefan1, openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D84487 --- openmp/libomptarget/src/api.cpp | 6 ++---- openmp/libomptarget/src/device.cpp | 18 ++++++++++++++++-- openmp/libomptarget/src/device.h | 16 ++++++++++++++++ openmp/libomptarget/src/omptarget.cpp | 10 +++------- openmp/libomptarget/src/rtl.h | 2 +- 5 files changed, 38 insertions(+), 14 deletions(-) diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp index 7baeebeb0a2a9..6a5ad86bd18a4 100644 --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -57,8 +57,7 @@ EXTERN void *omp_target_alloc(size_t size, int device_num) { return NULL; } - DeviceTy &Device = Devices[device_num]; - rc = Device.RTL->data_alloc(Device.RTLDeviceID, size, NULL); + rc = Devices[device_num].data_alloc(size); DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(rc)); return rc; } @@ -83,8 +82,7 @@ EXTERN void omp_target_free(void *device_ptr, int device_num) { return; } - DeviceTy &Device = Devices[device_num]; - Device.RTL->data_delete(Device.RTLDeviceID, (void *)device_ptr); + Devices[device_num].data_delete(device_ptr); DP("omp_target_free deallocated device ptr\n"); } diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp index 867083fde4892..6da8320462eef 100644 --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -217,7 +217,7 @@ void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, } else if (Size) { // If it is not contained and Size > 0, we should create a new entry for it. IsNew = true; - uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin); + uintptr_t tp = (uintptr_t)data_alloc(Size, HstPtrBegin); DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", " "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase), DPxPTR(HstPtrBegin), @@ -299,7 +299,7 @@ int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete, if (HT.decRefCount() == 0) { DP("Deleting tgt data " DPxMOD " of size %ld\n", DPxPTR(HT.TgtPtrBegin), Size); - RTL->data_delete(RTLDeviceID, (void *)HT.TgtPtrBegin); + data_delete((void *)HT.TgtPtrBegin); DP("Removing%s mapping with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", Size=%ld\n", (ForceDelete ? " (forced)" : ""), DPxPTR(HT.HstPtrBegin), DPxPTR(HT.TgtPtrBegin), Size); @@ -351,6 +351,14 @@ __tgt_target_table *DeviceTy::load_binary(void *Img) { return rc; } +void *DeviceTy::data_alloc(int64_t Size, void *HstPtr) { + return RTL->data_alloc(RTLDeviceID, Size, HstPtr); +} + +int32_t DeviceTy::data_delete(void *TgtPtrBegin) { + return RTL->data_delete(RTLDeviceID, TgtPtrBegin); +} + // Submit data to device int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, __tgt_async_info *AsyncInfoPtr) { @@ -423,6 +431,12 @@ bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) { return false; } +int32_t DeviceTy::synchronize(__tgt_async_info *AsyncInfoPtr) { + if (RTL->synchronize) + return RTL->synchronize(RTLDeviceID, AsyncInfoPtr); + return OFFLOAD_SUCCESS; +} + /// Check whether a device has an associated RTL and initialize it if it's not /// already initialized. 
bool device_is_ready(int device_num) { diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h index ebec76c3cf62a..72b919c04df01 100644 --- a/openmp/libomptarget/src/device.h +++ b/openmp/libomptarget/src/device.h @@ -192,6 +192,18 @@ struct DeviceTy { int32_t initOnce(); __tgt_target_table *load_binary(void *Img); + // device memory allocation/deallocation routines + /// Allocates \p Size bytes on the device and returns the address/nullptr when + /// succeeds/fails. \p HstPtr is an address of the host data which the + /// allocated target data will be associated with. If it is unknown, the + /// default value of \p HstPtr is nullptr. Note: this function doesn't do + /// pointer association. Actually, all the __tgt_rtl_data_alloc + /// implementations ignore \p HstPtr. + void *data_alloc(int64_t Size, void *HstPtr = nullptr); + /// Deallocates memory which \p TgtPtrBegin points at and returns + /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails. + int32_t data_delete(void *TgtPtrBegin); + // Data transfer. When AsyncInfoPtr is nullptr, the transfer will be // synchronous. // Copy data from host to device @@ -213,6 +225,10 @@ struct DeviceTy { uint64_t LoopTripCount, __tgt_async_info *AsyncInfoPtr); + /// Synchronize device/queue/event based on \p AsyncInfoPtr and return + /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails. + int32_t synchronize(__tgt_async_info *AsyncInfoPtr); + private: // Call to RTL void init(); // To be called only via DeviceTy::initOnce() diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index 47971b9c0a00a..25722d75f6ea4 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -845,8 +845,7 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num, TgtBaseOffset = 0; } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) { // Allocate memory for (first-)private array - TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID, - arg_sizes[i], HstPtrBegin); + TgtPtrBegin = Device.data_alloc(arg_sizes[i], HstPtrBegin); if (!TgtPtrBegin) { DP ("Data allocation for %sprivate array " DPxMOD " failed, " "abort target.\n", @@ -929,7 +928,7 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num, // Deallocate (first-)private arrays for (auto it : fpArrays) { - int rt = Device.RTL->data_delete(Device.RTLDeviceID, it); + int rt = Device.data_delete(it); if (rt != OFFLOAD_SUCCESS) { DP("Deallocation of (first-)private arrays failed.\n"); return OFFLOAD_FAIL; @@ -944,8 +943,5 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num, return OFFLOAD_FAIL; } - if (Device.RTL->synchronize) - return Device.RTL->synchronize(device_id, &AsyncInfo); - - return OFFLOAD_SUCCESS; + return Device.synchronize(&AsyncInfo); } diff --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/src/rtl.h index 941461f22b5c6..a9695a1022c21 100644 --- a/openmp/libomptarget/src/rtl.h +++ b/openmp/libomptarget/src/rtl.h @@ -52,7 +52,7 @@ struct RTLInfoTy { int32_t, uint64_t, __tgt_async_info *); typedef int64_t(init_requires_ty)(int64_t); - typedef int64_t(synchronize_ty)(int64_t, __tgt_async_info *); + typedef int64_t(synchronize_ty)(int32_t, __tgt_async_info *); int32_t Idx = -1; // RTL index, index is the number of devices // of other RTLs that were registered before, From 145acacaea1d7fb4ffc055a3e92ee8fee7c58634 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Fri, 24 Jul 2020 10:13:53 -0400 Subject: [PATCH 0239/1035] [libunwind] Remove old keymgr related logic keymgr 
was only used on Mac OS X <= 10.6; we no longer build libunwind from scratch for such old systems, so this code is no longer useful. Differential Revision: https://reviews.llvm.org/D84677 --- libunwind/src/AddressSpace.hpp | 11 ----- libunwind/src/Unwind_AppleExtras.cpp | 68 ---------------------------- 2 files changed, 79 deletions(-) diff --git a/libunwind/src/AddressSpace.hpp b/libunwind/src/AddressSpace.hpp index 764aaa3489f26..3d1e810f43c08 100644 --- a/libunwind/src/AddressSpace.hpp +++ b/libunwind/src/AddressSpace.hpp @@ -39,13 +39,6 @@ struct EHABIIndexEntry { }; #endif -#ifdef __APPLE__ -#include -namespace libunwind { - bool checkKeyMgrRegisteredFDEs(uintptr_t targetAddr, void *&fde); -} -#endif - #include "libunwind.h" #include "config.h" #include "dwarf2.h" @@ -651,14 +644,10 @@ inline bool LocalAddressSpace::findUnwindSections(pint_t targetAddr, inline bool LocalAddressSpace::findOtherFDE(pint_t targetAddr, pint_t &fde) { -#ifdef __APPLE__ - return checkKeyMgrRegisteredFDEs(targetAddr, *((void**)&fde)); -#else // TO DO: if OS has way to dynamically register FDEs, check that. (void)targetAddr; (void)fde; return false; -#endif } inline bool LocalAddressSpace::findFunctionName(pint_t addr, char *buf, diff --git a/libunwind/src/Unwind_AppleExtras.cpp b/libunwind/src/Unwind_AppleExtras.cpp index 536303993effb..1d9948aced355 100644 --- a/libunwind/src/Unwind_AppleExtras.cpp +++ b/libunwind/src/Unwind_AppleExtras.cpp @@ -12,33 +12,6 @@ #include "DwarfParser.hpp" -// private keymgr stuff -#define KEYMGR_GCC3_DW2_OBJ_LIST 302 -extern "C" { - extern void _keymgr_set_and_unlock_processwide_ptr(int key, void *ptr); - extern void *_keymgr_get_and_lock_processwide_ptr(int key); -} - -// undocumented libgcc "struct object" -struct libgcc_object { - void *start; - void *unused1; - void *unused2; - void *fde; - unsigned long encoding; - void *fde_end; - libgcc_object *next; -}; - -// undocumented libgcc "struct km_object_info" referenced by -// KEYMGR_GCC3_DW2_OBJ_LIST -struct libgcc_object_info { - libgcc_object *seen_objects; - libgcc_object *unseen_objects; - unsigned spare[2]; -}; - - // static linker symbols to prevent wrong two level namespace for _Unwind symbols #if defined(__arm__) #define NOT_HERE_BEFORE_5_0(sym) \ @@ -140,44 +113,3 @@ NOT_HERE_BEFORE_5_0(_Unwind_SjLj_Resume_or_Rethrow) NOT_HERE_BEFORE_5_0(_Unwind_SjLj_Unregister) #endif // defined(_LIBUNWIND_BUILD_SJLJ_APIS) - - -namespace libunwind { - -_LIBUNWIND_HIDDEN -bool checkKeyMgrRegisteredFDEs(uintptr_t pc, void *&fde) { -#if __MAC_OS_X_VERSION_MIN_REQUIRED - // lastly check for old style keymgr registration of dynamically generated - // FDEs acquire exclusive access to libgcc_object_info - libgcc_object_info *head = (libgcc_object_info *) - _keymgr_get_and_lock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST); - if (head != NULL) { - // look at each FDE in keymgr - for (libgcc_object *ob = head->unseen_objects; ob != NULL; ob = ob->next) { - CFI_Parser::FDE_Info fdeInfo; - CFI_Parser::CIE_Info cieInfo; - const char *msg = CFI_Parser::decodeFDE( - LocalAddressSpace::sThisAddressSpace, - (uintptr_t)ob->fde, &fdeInfo, &cieInfo); - if (msg == NULL) { - // Check if this FDE is for a function that includes the pc - if ((fdeInfo.pcStart <= pc) && (pc < fdeInfo.pcEnd)) { - fde = (void*)fdeInfo.pcStart; - _keymgr_set_and_unlock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST, - head); - return true; - } - } - } - } - // release libgcc_object_info - _keymgr_set_and_unlock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST, head); -#else -
(void)pc; - (void)fde; -#endif - return false; -} - -} - From 113f56fbb80e8d6f705be19f8ae169a3fee2e4f8 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Sat, 25 Jul 2020 08:27:21 -0700 Subject: [PATCH 0240/1035] Unify the return value of GetByteSize to an llvm::Optional<uint64_t> (NFC-ish) This cleanup patch unifies all methods called GetByteSize() in the ValueObject hierarchy to return an optional, like the methods in CompilerType do. This means fewer magic 0 values, which could fix bugs down the road in languages where types can have a size of zero, such as Swift and C (but not C++). Differential Revision: https://reviews.llvm.org/D84285 This re-lands the patch with bogus :m_byte_size(0) initializations removed. --- lldb/include/lldb/Core/ValueObject.h | 2 +- lldb/include/lldb/Core/ValueObjectCast.h | 2 +- lldb/include/lldb/Core/ValueObjectChild.h | 2 +- .../lldb/Core/ValueObjectConstResult.h | 4 ++-- .../lldb/Core/ValueObjectDynamicValue.h | 2 +- lldb/include/lldb/Core/ValueObjectMemory.h | 2 +- lldb/include/lldb/Core/ValueObjectRegister.h | 4 ++-- .../lldb/Core/ValueObjectSyntheticFilter.h | 2 +- lldb/include/lldb/Core/ValueObjectVariable.h | 2 +- .../lldb/Expression/ExpressionVariable.h | 2 +- .../lldb/Target/StackFrameRecognizer.h | 4 +++- lldb/source/API/SBValue.cpp | 2 +- .../Commands/CommandObjectWatchpoint.cpp | 2 +- lldb/source/Core/ValueObject.cpp | 12 +++++----- lldb/source/Core/ValueObjectCast.cpp | 2 +- lldb/source/Core/ValueObjectConstResult.cpp | 21 +++++++---------- lldb/source/Core/ValueObjectDynamicValue.cpp | 2 +- lldb/source/Core/ValueObjectMemory.cpp | 8 +++---- lldb/source/Core/ValueObjectRegister.cpp | 6 +++-- .../Core/ValueObjectSyntheticFilter.cpp | 4 +++- lldb/source/Core/ValueObjectVariable.cpp | 6 ++--- lldb/source/Expression/ExpressionVariable.cpp | 8 +++---- lldb/source/Expression/Materializer.cpp | 23 +++++++++++-------- lldb/source/Target/StackFrame.cpp | 6 ++--- 24 files changed, 66 insertions(+), 64 deletions(-) diff --git a/lldb/include/lldb/Core/ValueObject.h b/lldb/include/lldb/Core/ValueObject.h index 0080368fd9965..a557d69f3ae30 100644 --- a/lldb/include/lldb/Core/ValueObject.h +++ b/lldb/include/lldb/Core/ValueObject.h @@ -358,7 +358,7 @@ class ValueObject : public UserID { virtual bool CanProvideValue(); // Subclasses must implement the functions below.
- virtual uint64_t GetByteSize() = 0; + virtual llvm::Optional GetByteSize() = 0; virtual lldb::ValueType GetValueType() const = 0; diff --git a/lldb/include/lldb/Core/ValueObjectCast.h b/lldb/include/lldb/Core/ValueObjectCast.h index d91ca6a92be8d..342803f8ca63a 100644 --- a/lldb/include/lldb/Core/ValueObjectCast.h +++ b/lldb/include/lldb/Core/ValueObjectCast.h @@ -30,7 +30,7 @@ class ValueObjectCast : public ValueObject { ConstString name, const CompilerType &cast_type); - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; size_t CalculateNumChildren(uint32_t max) override; diff --git a/lldb/include/lldb/Core/ValueObjectChild.h b/lldb/include/lldb/Core/ValueObjectChild.h index c6f44a29b0591..9a9fd9294261a 100644 --- a/lldb/include/lldb/Core/ValueObjectChild.h +++ b/lldb/include/lldb/Core/ValueObjectChild.h @@ -30,7 +30,7 @@ class ValueObjectChild : public ValueObject { public: ~ValueObjectChild() override; - uint64_t GetByteSize() override { return m_byte_size; } + llvm::Optional GetByteSize() override { return m_byte_size; } lldb::offset_t GetByteOffset() override { return m_byte_offset; } diff --git a/lldb/include/lldb/Core/ValueObjectConstResult.h b/lldb/include/lldb/Core/ValueObjectConstResult.h index 0e868c687e931..8d823baa0b7b4 100644 --- a/lldb/include/lldb/Core/ValueObjectConstResult.h +++ b/lldb/include/lldb/Core/ValueObjectConstResult.h @@ -62,7 +62,7 @@ class ValueObjectConstResult : public ValueObject { static lldb::ValueObjectSP Create(ExecutionContextScope *exe_scope, const Status &error); - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; lldb::ValueType GetValueType() const override; @@ -113,7 +113,7 @@ class ValueObjectConstResult : public ValueObject { CompilerType GetCompilerTypeImpl() override; ConstString m_type_name; - uint64_t m_byte_size; + llvm::Optional m_byte_size; ValueObjectConstResultImpl m_impl; diff --git a/lldb/include/lldb/Core/ValueObjectDynamicValue.h b/lldb/include/lldb/Core/ValueObjectDynamicValue.h index 9f5304b55e934..2806857339efb 100644 --- a/lldb/include/lldb/Core/ValueObjectDynamicValue.h +++ b/lldb/include/lldb/Core/ValueObjectDynamicValue.h @@ -34,7 +34,7 @@ class ValueObjectDynamicValue : public ValueObject { public: ~ValueObjectDynamicValue() override; - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; ConstString GetTypeName() override; diff --git a/lldb/include/lldb/Core/ValueObjectMemory.h b/lldb/include/lldb/Core/ValueObjectMemory.h index d1cd6ae41445d..b5d5e6ecf4c0e 100644 --- a/lldb/include/lldb/Core/ValueObjectMemory.h +++ b/lldb/include/lldb/Core/ValueObjectMemory.h @@ -40,7 +40,7 @@ class ValueObjectMemory : public ValueObject { const Address &address, const CompilerType &ast_type); - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; ConstString GetTypeName() override; diff --git a/lldb/include/lldb/Core/ValueObjectRegister.h b/lldb/include/lldb/Core/ValueObjectRegister.h index 41051d93b707e..3968584ad5185 100644 --- a/lldb/include/lldb/Core/ValueObjectRegister.h +++ b/lldb/include/lldb/Core/ValueObjectRegister.h @@ -36,7 +36,7 @@ class ValueObjectRegisterSet : public ValueObject { lldb::RegisterContextSP ®_ctx_sp, uint32_t set_idx); - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; lldb::ValueType GetValueType() const override { return lldb::eValueTypeRegisterSet; @@ -86,7 +86,7 @@ class ValueObjectRegister : public ValueObject { lldb::RegisterContextSP ®_ctx_sp, uint32_t reg_num); - uint64_t GetByteSize() 
override; + llvm::Optional GetByteSize() override; lldb::ValueType GetValueType() const override { return lldb::eValueTypeRegister; diff --git a/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h b/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h index cb471657aec9b..41c461ce13f0d 100644 --- a/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h +++ b/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h @@ -36,7 +36,7 @@ class ValueObjectSynthetic : public ValueObject { public: ~ValueObjectSynthetic() override; - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; ConstString GetTypeName() override; diff --git a/lldb/include/lldb/Core/ValueObjectVariable.h b/lldb/include/lldb/Core/ValueObjectVariable.h index b7e262574a14d..23fdedbf5a4a6 100644 --- a/lldb/include/lldb/Core/ValueObjectVariable.h +++ b/lldb/include/lldb/Core/ValueObjectVariable.h @@ -37,7 +37,7 @@ class ValueObjectVariable : public ValueObject { static lldb::ValueObjectSP Create(ExecutionContextScope *exe_scope, const lldb::VariableSP &var_sp); - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; ConstString GetTypeName() override; diff --git a/lldb/include/lldb/Expression/ExpressionVariable.h b/lldb/include/lldb/Expression/ExpressionVariable.h index 60062d212badf..4259e6395da47 100644 --- a/lldb/include/lldb/Expression/ExpressionVariable.h +++ b/lldb/include/lldb/Expression/ExpressionVariable.h @@ -32,7 +32,7 @@ class ExpressionVariable virtual ~ExpressionVariable(); - size_t GetByteSize() { return m_frozen_sp->GetByteSize(); } + llvm::Optional GetByteSize() { return m_frozen_sp->GetByteSize(); } ConstString GetName() { return m_frozen_sp->GetName(); } diff --git a/lldb/include/lldb/Target/StackFrameRecognizer.h b/lldb/include/lldb/Target/StackFrameRecognizer.h index 302b56bec907b..baffc890bb065 100644 --- a/lldb/include/lldb/Target/StackFrameRecognizer.h +++ b/lldb/include/lldb/Target/StackFrameRecognizer.h @@ -154,7 +154,9 @@ class ValueObjectRecognizerSynthesizedValue : public ValueObject { SetName(parent.GetName()); } - uint64_t GetByteSize() override { return m_parent->GetByteSize(); } + llvm::Optional GetByteSize() override { + return m_parent->GetByteSize(); + } lldb::ValueType GetValueType() const override { return m_type; } bool UpdateValue() override { if (!m_parent->UpdateValueIfNeeded()) return false; diff --git a/lldb/source/API/SBValue.cpp b/lldb/source/API/SBValue.cpp index 7485b0ee1838e..686d1f23a75a8 100644 --- a/lldb/source/API/SBValue.cpp +++ b/lldb/source/API/SBValue.cpp @@ -333,7 +333,7 @@ size_t SBValue::GetByteSize() { ValueLocker locker; lldb::ValueObjectSP value_sp(GetSP(locker)); if (value_sp) { - result = value_sp->GetByteSize(); + result = value_sp->GetByteSize().getValueOr(0); } return result; diff --git a/lldb/source/Commands/CommandObjectWatchpoint.cpp b/lldb/source/Commands/CommandObjectWatchpoint.cpp index ce4662930a7c2..c2a008af79d6f 100644 --- a/lldb/source/Commands/CommandObjectWatchpoint.cpp +++ b/lldb/source/Commands/CommandObjectWatchpoint.cpp @@ -905,7 +905,7 @@ corresponding to the byte size of the data type."); // We're in business. // Find out the size of this variable. size = m_option_watchpoint.watch_size == 0 - ? valobj_sp->GetByteSize() + ? 
valobj_sp->GetByteSize().getValueOr(0) : m_option_watchpoint.watch_size; } compiler_type = valobj_sp->GetCompilerType(); diff --git a/lldb/source/Core/ValueObject.cpp b/lldb/source/Core/ValueObject.cpp index d3a1971235ca0..78711c4e42ce2 100644 --- a/lldb/source/Core/ValueObject.cpp +++ b/lldb/source/Core/ValueObject.cpp @@ -849,7 +849,7 @@ bool ValueObject::SetData(DataExtractor &data, Status &error) { uint64_t count = 0; const Encoding encoding = GetCompilerType().GetEncoding(count); - const size_t byte_size = GetByteSize(); + const size_t byte_size = GetByteSize().getValueOr(0); Value::ValueType value_type = m_value.GetValueType(); @@ -1524,7 +1524,7 @@ bool ValueObject::SetValueFromCString(const char *value_str, Status &error) { uint64_t count = 0; const Encoding encoding = GetCompilerType().GetEncoding(count); - const size_t byte_size = GetByteSize(); + const size_t byte_size = GetByteSize().getValueOr(0); Value::ValueType value_type = m_value.GetValueType(); @@ -1739,13 +1739,13 @@ ValueObjectSP ValueObject::GetSyntheticBitFieldChild(uint32_t from, uint32_t to, uint32_t bit_field_offset = from; if (GetDataExtractor().GetByteOrder() == eByteOrderBig) bit_field_offset = - GetByteSize() * 8 - bit_field_size - bit_field_offset; + GetByteSize().getValueOr(0) * 8 - bit_field_size - bit_field_offset; // We haven't made a synthetic array member for INDEX yet, so lets make // one and cache it for any future reference. ValueObjectChild *synthetic_child = new ValueObjectChild( - *this, GetCompilerType(), index_const_str, GetByteSize(), 0, - bit_field_size, bit_field_offset, false, false, eAddressTypeInvalid, - 0); + *this, GetCompilerType(), index_const_str, + GetByteSize().getValueOr(0), 0, bit_field_size, bit_field_offset, + false, false, eAddressTypeInvalid, 0); // Cache the value if we got one back... if (synthetic_child) { diff --git a/lldb/source/Core/ValueObjectCast.cpp b/lldb/source/Core/ValueObjectCast.cpp index 22e856be539b5..7b6d3591faf44 100644 --- a/lldb/source/Core/ValueObjectCast.cpp +++ b/lldb/source/Core/ValueObjectCast.cpp @@ -47,7 +47,7 @@ size_t ValueObjectCast::CalculateNumChildren(uint32_t max) { return children_count <= max ? 
children_count : max; } -uint64_t ValueObjectCast::GetByteSize() { +llvm::Optional ValueObjectCast::GetByteSize() { ExecutionContext exe_ctx(GetExecutionContextRef()); return m_value.GetValueByteSize(nullptr, &exe_ctx); } diff --git a/lldb/source/Core/ValueObjectConstResult.cpp b/lldb/source/Core/ValueObjectConstResult.cpp index 8d84f8e62ccc5..ceb4491f86663 100644 --- a/lldb/source/Core/ValueObjectConstResult.cpp +++ b/lldb/source/Core/ValueObjectConstResult.cpp @@ -40,8 +40,7 @@ ValueObjectConstResult::ValueObjectConstResult(ExecutionContextScope *exe_scope, ByteOrder byte_order, uint32_t addr_byte_size, lldb::addr_t address) - : ValueObject(exe_scope, manager), m_type_name(), m_byte_size(0), - m_impl(this, address) { + : ValueObject(exe_scope, manager), m_impl(this, address) { SetIsConstant(); SetValueIsValid(true); m_data.SetByteOrder(byte_order); @@ -64,8 +63,7 @@ ValueObjectConstResult::ValueObjectConstResult( ExecutionContextScope *exe_scope, ValueObjectManager &manager, const CompilerType &compiler_type, ConstString name, const DataExtractor &data, lldb::addr_t address) - : ValueObject(exe_scope, manager), m_type_name(), m_byte_size(0), - m_impl(this, address) { + : ValueObject(exe_scope, manager), m_impl(this, address) { m_data = data; if (!m_data.GetSharedDataBuffer()) { @@ -112,8 +110,7 @@ ValueObjectConstResult::ValueObjectConstResult( const CompilerType &compiler_type, ConstString name, const lldb::DataBufferSP &data_sp, lldb::ByteOrder data_byte_order, uint32_t data_addr_size, lldb::addr_t address) - : ValueObject(exe_scope, manager), m_type_name(), m_byte_size(0), - m_impl(this, address) { + : ValueObject(exe_scope, manager), m_impl(this, address) { m_data.SetByteOrder(data_byte_order); m_data.SetAddressByteSize(data_addr_size); m_data.SetData(data_sp); @@ -143,7 +140,7 @@ ValueObjectConstResult::ValueObjectConstResult( ExecutionContextScope *exe_scope, ValueObjectManager &manager, const CompilerType &compiler_type, ConstString name, lldb::addr_t address, AddressType address_type, uint32_t addr_byte_size) - : ValueObject(exe_scope, manager), m_type_name(), m_byte_size(0), + : ValueObject(exe_scope, manager), m_type_name(), m_impl(this, address) { m_value.GetScalar() = address; m_data.SetAddressByteSize(addr_byte_size); @@ -179,8 +176,7 @@ ValueObjectSP ValueObjectConstResult::Create(ExecutionContextScope *exe_scope, ValueObjectConstResult::ValueObjectConstResult(ExecutionContextScope *exe_scope, ValueObjectManager &manager, const Status &error) - : ValueObject(exe_scope, manager), m_type_name(), m_byte_size(0), - m_impl(this) { + : ValueObject(exe_scope, manager), m_impl(this) { m_error = error; SetIsConstant(); } @@ -189,8 +185,7 @@ ValueObjectConstResult::ValueObjectConstResult(ExecutionContextScope *exe_scope, ValueObjectManager &manager, const Value &value, ConstString name, Module *module) - : ValueObject(exe_scope, manager), m_type_name(), m_byte_size(0), - m_impl(this) { + : ValueObject(exe_scope, manager), m_impl(this) { m_value = value; m_name = name; ExecutionContext exe_ctx; @@ -208,9 +203,9 @@ lldb::ValueType ValueObjectConstResult::GetValueType() const { return eValueTypeConstResult; } -uint64_t ValueObjectConstResult::GetByteSize() { +llvm::Optional ValueObjectConstResult::GetByteSize() { ExecutionContext exe_ctx(GetExecutionContextRef()); - if (m_byte_size == 0) { + if (!m_byte_size) { if (auto size = GetCompilerType().GetByteSize(exe_ctx.GetBestExecutionContextScope())) SetByteSize(*size); diff --git a/lldb/source/Core/ValueObjectDynamicValue.cpp 
b/lldb/source/Core/ValueObjectDynamicValue.cpp index ca66740cb55d4..1c25b8c85a059 100644 --- a/lldb/source/Core/ValueObjectDynamicValue.cpp +++ b/lldb/source/Core/ValueObjectDynamicValue.cpp @@ -98,7 +98,7 @@ size_t ValueObjectDynamicValue::CalculateNumChildren(uint32_t max) { return m_parent->GetNumChildren(max); } -uint64_t ValueObjectDynamicValue::GetByteSize() { +llvm::Optional ValueObjectDynamicValue::GetByteSize() { const bool success = UpdateValueIfNeeded(false); if (success && m_dynamic_type_info.HasType()) { ExecutionContext exe_ctx(GetExecutionContextRef()); diff --git a/lldb/source/Core/ValueObjectMemory.cpp b/lldb/source/Core/ValueObjectMemory.cpp index 8e7d3ebc93f69..17fade9e5fdc3 100644 --- a/lldb/source/Core/ValueObjectMemory.cpp +++ b/lldb/source/Core/ValueObjectMemory.cpp @@ -139,13 +139,11 @@ size_t ValueObjectMemory::CalculateNumChildren(uint32_t max) { return child_count <= max ? child_count : max; } -uint64_t ValueObjectMemory::GetByteSize() { +llvm::Optional ValueObjectMemory::GetByteSize() { ExecutionContext exe_ctx(GetExecutionContextRef()); if (m_type_sp) - return m_type_sp->GetByteSize(exe_ctx.GetBestExecutionContextScope()) - .getValueOr(0); - return m_compiler_type.GetByteSize(exe_ctx.GetBestExecutionContextScope()) - .getValueOr(0); + return m_type_sp->GetByteSize(exe_ctx.GetBestExecutionContextScope()); + return m_compiler_type.GetByteSize(exe_ctx.GetBestExecutionContextScope()); } lldb::ValueType ValueObjectMemory::GetValueType() const { diff --git a/lldb/source/Core/ValueObjectRegister.cpp b/lldb/source/Core/ValueObjectRegister.cpp index ec87c38fb3679..27461e9cebc41 100644 --- a/lldb/source/Core/ValueObjectRegister.cpp +++ b/lldb/source/Core/ValueObjectRegister.cpp @@ -81,7 +81,7 @@ size_t ValueObjectRegisterSet::CalculateNumChildren(uint32_t max) { return 0; } -uint64_t ValueObjectRegisterSet::GetByteSize() { return 0; } +llvm::Optional ValueObjectRegisterSet::GetByteSize() { return 0; } bool ValueObjectRegisterSet::UpdateValue() { m_error.Clear(); @@ -229,7 +229,9 @@ size_t ValueObjectRegister::CalculateNumChildren(uint32_t max) { return children_count <= max ? children_count : max; } -uint64_t ValueObjectRegister::GetByteSize() { return m_reg_info.byte_size; } +llvm::Optional ValueObjectRegister::GetByteSize() { + return m_reg_info.byte_size; +} bool ValueObjectRegister::UpdateValue() { m_error.Clear(); diff --git a/lldb/source/Core/ValueObjectSyntheticFilter.cpp b/lldb/source/Core/ValueObjectSyntheticFilter.cpp index 32d1e6ab8368c..fb2d32e602cea 100644 --- a/lldb/source/Core/ValueObjectSyntheticFilter.cpp +++ b/lldb/source/Core/ValueObjectSyntheticFilter.cpp @@ -121,7 +121,9 @@ bool ValueObjectSynthetic::MightHaveChildren() { return (m_might_have_children != eLazyBoolNo); } -uint64_t ValueObjectSynthetic::GetByteSize() { return m_parent->GetByteSize(); } +llvm::Optional ValueObjectSynthetic::GetByteSize() { + return m_parent->GetByteSize(); +} lldb::ValueType ValueObjectSynthetic::GetValueType() const { return m_parent->GetValueType(); diff --git a/lldb/source/Core/ValueObjectVariable.cpp b/lldb/source/Core/ValueObjectVariable.cpp index 0d1e7b047a0ac..ab67e3038cf0a 100644 --- a/lldb/source/Core/ValueObjectVariable.cpp +++ b/lldb/source/Core/ValueObjectVariable.cpp @@ -105,15 +105,15 @@ size_t ValueObjectVariable::CalculateNumChildren(uint32_t max) { return child_count <= max ? 
child_count : max; } -uint64_t ValueObjectVariable::GetByteSize() { +llvm::Optional ValueObjectVariable::GetByteSize() { ExecutionContext exe_ctx(GetExecutionContextRef()); CompilerType type(GetCompilerType()); if (!type.IsValid()) - return 0; + return {}; - return type.GetByteSize(exe_ctx.GetBestExecutionContextScope()).getValueOr(0); + return type.GetByteSize(exe_ctx.GetBestExecutionContextScope()); } lldb::ValueType ValueObjectVariable::GetValueType() const { diff --git a/lldb/source/Expression/ExpressionVariable.cpp b/lldb/source/Expression/ExpressionVariable.cpp index d95f0745cf4ba..8b3dda7b2fe10 100644 --- a/lldb/source/Expression/ExpressionVariable.cpp +++ b/lldb/source/Expression/ExpressionVariable.cpp @@ -16,10 +16,10 @@ using namespace lldb_private; ExpressionVariable::~ExpressionVariable() {} uint8_t *ExpressionVariable::GetValueBytes() { - const size_t byte_size = m_frozen_sp->GetByteSize(); - if (byte_size > 0) { - if (m_frozen_sp->GetDataExtractor().GetByteSize() < byte_size) { - m_frozen_sp->GetValue().ResizeData(byte_size); + llvm::Optional byte_size = m_frozen_sp->GetByteSize(); + if (byte_size && *byte_size) { + if (m_frozen_sp->GetDataExtractor().GetByteSize() < *byte_size) { + m_frozen_sp->GetValue().ResizeData(*byte_size); m_frozen_sp->GetValue().GetData(m_frozen_sp->GetDataExtractor()); } return const_cast( diff --git a/lldb/source/Expression/Materializer.cpp b/lldb/source/Expression/Materializer.cpp index 6f8d9b154570a..327e15a26266f 100644 --- a/lldb/source/Expression/Materializer.cpp +++ b/lldb/source/Expression/Materializer.cpp @@ -67,7 +67,7 @@ class EntityPersistentVariable : public Materializer::Entity { const bool zero_memory = false; lldb::addr_t mem = map.Malloc( - m_persistent_variable_sp->GetByteSize(), 8, + m_persistent_variable_sp->GetByteSize().getValueOr(0), 8, lldb::ePermissionsReadable | lldb::ePermissionsWritable, IRMemoryMap::eAllocationPolicyMirror, zero_memory, allocate_error); @@ -106,7 +106,8 @@ class EntityPersistentVariable : public Materializer::Entity { Status write_error; map.WriteMemory(mem, m_persistent_variable_sp->GetValueBytes(), - m_persistent_variable_sp->GetByteSize(), write_error); + m_persistent_variable_sp->GetByteSize().getValueOr(0), + write_error); if (!write_error.Success()) { err.SetErrorStringWithFormat( @@ -234,7 +235,7 @@ class EntityPersistentVariable : public Materializer::Entity { map.GetBestExecutionContextScope(), m_persistent_variable_sp.get()->GetCompilerType(), m_persistent_variable_sp->GetName(), location, eAddressTypeLoad, - m_persistent_variable_sp->GetByteSize()); + m_persistent_variable_sp->GetByteSize().getValueOr(0)); if (frame_top != LLDB_INVALID_ADDRESS && frame_bottom != LLDB_INVALID_ADDRESS && location >= frame_bottom && @@ -279,7 +280,8 @@ class EntityPersistentVariable : public Materializer::Entity { LLDB_LOGF(log, "Dematerializing %s from 0x%" PRIx64 " (size = %llu)", m_persistent_variable_sp->GetName().GetCString(), (uint64_t)mem, - (unsigned long long)m_persistent_variable_sp->GetByteSize()); + (unsigned long long)m_persistent_variable_sp->GetByteSize() + .getValueOr(0)); // Read the contents of the spare memory area @@ -288,7 +290,7 @@ class EntityPersistentVariable : public Materializer::Entity { Status read_error; map.ReadMemory(m_persistent_variable_sp->GetValueBytes(), mem, - m_persistent_variable_sp->GetByteSize(), read_error); + m_persistent_variable_sp->GetByteSize().getValueOr(0), read_error); if (!read_error.Success()) { err.SetErrorStringWithFormat( @@ -369,10 +371,11 @@ class 
EntityPersistentVariable : public Materializer::Entity { if (!err.Success()) { dump_stream.Printf(" \n"); } else { - DataBufferHeap data(m_persistent_variable_sp->GetByteSize(), 0); + DataBufferHeap data( + m_persistent_variable_sp->GetByteSize().getValueOr(0), 0); map.ReadMemory(data.GetBytes(), target_address, - m_persistent_variable_sp->GetByteSize(), err); + m_persistent_variable_sp->GetByteSize().getValueOr(0), err); if (!err.Success()) { dump_stream.Printf(" \n"); @@ -621,8 +624,8 @@ class EntityVariable : public Materializer::Entity { Status extract_error; - map.GetMemoryData(data, m_temporary_allocation, valobj_sp->GetByteSize(), - extract_error); + map.GetMemoryData(data, m_temporary_allocation, + valobj_sp->GetByteSize().getValueOr(0), extract_error); if (!extract_error.Success()) { err.SetErrorStringWithFormat("couldn't get the data for variable %s", @@ -919,7 +922,7 @@ class EntityResultVariable : public Materializer::Entity { ret->ValueUpdated(); - const size_t pvar_byte_size = ret->GetByteSize(); + const size_t pvar_byte_size = ret->GetByteSize().getValueOr(0); uint8_t *pvar_data = ret->GetValueBytes(); map.ReadMemory(pvar_data, address, pvar_byte_size, read_error); diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp index 098aed9cd8125..22bca52d7f98a 100644 --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -1408,7 +1408,7 @@ ValueObjectSP GetValueForOffset(StackFrame &frame, ValueObjectSP &parent, } int64_t child_offset = child_sp->GetByteOffset(); - int64_t child_size = child_sp->GetByteSize(); + int64_t child_size = child_sp->GetByteSize().getValueOr(0); if (offset >= child_offset && offset < (child_offset + child_size)) { return GetValueForOffset(frame, child_sp, offset - child_offset); @@ -1441,8 +1441,8 @@ ValueObjectSP GetValueForDereferincingOffset(StackFrame &frame, } if (offset >= 0 && uint64_t(offset) >= pointee->GetByteSize()) { - int64_t index = offset / pointee->GetByteSize(); - offset = offset % pointee->GetByteSize(); + int64_t index = offset / pointee->GetByteSize().getValueOr(1); + offset = offset % pointee->GetByteSize().getValueOr(1); const bool can_create = true; pointee = base->GetSyntheticArrayMember(index, can_create); } From 351d234d866ea441d8f8ad1e9b483a98fd51da19 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Mon, 27 Jul 2020 23:35:51 +0300 Subject: [PATCH 0241/1035] [OpenMPOpt] Most SCC's are uninteresting, don't waste time on them (up to 16x faster) Summary: This seems obvious in hindsight, but the result is surprising. I've measured compile-time of `-openmpopt` pass standalone on RawSpeed unity build, and while there is some OpenMP stuff, most is not OpenMP. But nonetheless the pass does a lot of costly preparations before ever trying to look for OpenMP stuff in SCC. 
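The fix is to compute a cheap "is this SCC interesting at all?" gate before any of those preparations run. A minimal standalone sketch of the idea (illustrative only: `Function`, `OpenMPInModuleInfo`, and the container choices below are simplified stand-ins, not the real LLVM interfaces):

```cpp
#include <set>
#include <string>
#include <vector>

// Stand-in for llvm::Function, for illustration purposes only.
struct Function { std::string Name; };

struct OpenMPInModuleInfo {
  // Filled once per module while scanning for OpenMP runtime declarations:
  // every function that contains a call into the OpenMP runtime.
  std::set<const Function *> FuncsWithOMPRuntimeCalls;
  // Whether the module contains GPU kernels (entry points).
  bool HasKernels = false;

  bool containsOMPRuntimeCalls(const Function *F) const {
    return FuncsWithOMPRuntimeCalls.count(F) != 0;
  }
};

// The cheap gate: an SCC is only worth optimizing if the module has kernels
// (then every SCC must be visited) or some function in the SCC calls the
// OpenMP runtime. All of the expensive setup runs after this check.
bool isSCCInteresting(const std::vector<const Function *> &SCC,
                      const OpenMPInModuleInfo &Info) {
  if (Info.HasKernels)
    return true;
  for (const Function *Fn : SCC)
    if (Info.containsOMPRuntimeCalls(Fn))
      return true;
  return false;
}

int main() {
  Function F{"plain"}, G{"uses_omp"};
  OpenMPInModuleInfo Info;
  Info.FuncsWithOMPRuntimeCalls.insert(&G);
  std::vector<const Function *> Uninteresting{&F};
  std::vector<const Function *> Interesting{&F, &G};
  return (!isSCCInteresting(Uninteresting, Info) &&
          isSCCInteresting(Interesting, Info)) ? 0 : 1;
}
```

For an SCC that neither contains kernels nor touches the OpenMP runtime, the loop above is essentially the only work performed, which is where the numbers below come from.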
Numbers (n=25): 0.094624s -> 0.005976s, a 93.68% improvement, or ~16x faster Reviewers: jdoerfert Reviewed By: jdoerfert Subscribers: yaxunl, hiraditya, guansong, llvm-commits, sstefan1 Tags: #llvm Differential Revision: https://reviews.llvm.org/D84689 --- llvm/include/llvm/Transforms/IPO/OpenMPOpt.h | 10 ++++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 48 ++++++++++++++++---- 2 files changed, 48 insertions(+), 10 deletions(-) diff --git a/llvm/include/llvm/Transforms/IPO/OpenMPOpt.h b/llvm/include/llvm/Transforms/IPO/OpenMPOpt.h index d96187b73f9bb..9b72ee0afd284 100644 --- a/llvm/include/llvm/Transforms/IPO/OpenMPOpt.h +++ b/llvm/include/llvm/Transforms/IPO/OpenMPOpt.h @@ -33,6 +33,11 @@ struct OpenMPInModule { bool isKnown() { return Value != OpenMP::UNKNOWN; } operator bool() { return Value != OpenMP::NOT_FOUND; } + /// Does this function \p F contain any OpenMP runtime calls? + bool containsOMPRuntimeCalls(Function *F) const { + return FuncsWithOMPRuntimeCalls.contains(F); + } + /// Return the known kernels (=GPU entry points) in the module. SmallPtrSetImpl &getKernels() { return Kernels; } @@ -42,6 +47,11 @@ struct OpenMPInModule { private: enum class OpenMP { FOUND, NOT_FOUND, UNKNOWN } Value = OpenMP::UNKNOWN; + friend bool containsOpenMP(Module &M, OpenMPInModule &OMPInModule); + + /// In which functions are OpenMP runtime calls present? + SmallPtrSet FuncsWithOMPRuntimeCalls; + /// Collection of known kernels (=GPU entry points) in the module. SmallPtrSet Kernels; }; diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index f664a24173747..93f1e5392eb2c 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1339,10 +1339,21 @@ PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C, return PreservedAnalyses::all(); SmallVector SCC; - for (LazyCallGraph::Node &N : C) - SCC.push_back(&N.getFunction()); + // If there are kernels in the module, we have to run on all SCC's. + bool SCCIsInteresting = !OMPInModule.getKernels().empty(); + for (LazyCallGraph::Node &N : C) { + Function *Fn = &N.getFunction(); + SCC.push_back(Fn); + + // Do we already know that the SCC contains kernels, + // or that OpenMP functions are called from this SCC? + if (SCCIsInteresting) + continue; + // If not, let's check that. + SCCIsInteresting |= OMPInModule.containsOMPRuntimeCalls(Fn); + } - if (SCC.empty()) + if (!SCCIsInteresting || SCC.empty()) return PreservedAnalyses::all(); FunctionAnalysisManager &FAM = @@ -1401,12 +1412,23 @@ struct OpenMPOptLegacyPass : public CallGraphSCCPass { return false; SmallVector SCC; - for (CallGraphNode *CGN : CGSCC) - if (Function *Fn = CGN->getFunction()) - if (!Fn->isDeclaration()) - SCC.push_back(Fn); + // If there are kernels in the module, we have to run on all SCC's. + bool SCCIsInteresting = !OMPInModule.getKernels().empty(); + for (CallGraphNode *CGN : CGSCC) { + Function *Fn = CGN->getFunction(); + if (!Fn || Fn->isDeclaration()) + continue; + SCC.push_back(Fn); - if (SCC.empty()) + // Do we already know that the SCC contains kernels, + // or that OpenMP functions are called from this SCC? + if (SCCIsInteresting) + continue; + // If not, let's check that.
+ SCCIsInteresting |= OMPInModule.containsOMPRuntimeCalls(Fn); + } + + if (!SCCIsInteresting || SCC.empty()) return false; CallGraph &CG = getAnalysis().getCallGraph(); @@ -1468,13 +1490,19 @@ bool llvm::omp::containsOpenMP(Module &M, OpenMPInModule &OMPInModule) { if (OMPInModule.isKnown()) return OMPInModule; + auto RecordFunctionsContainingUsesOf = [&](Function *F) { + for (User *U : F->users()) + if (auto *I = dyn_cast(U)) + OMPInModule.FuncsWithOMPRuntimeCalls.insert(I->getFunction()); + }; + // MSVC doesn't like long if-else chains for some reason and instead just // issues an error. Work around it.. do { #define OMP_RTL(_Enum, _Name, ...) \ - if (M.getFunction(_Name)) { \ + if (Function *F = M.getFunction(_Name)) { \ + RecordFunctionsContainingUsesOf(F); \ OMPInModule = true; \ - break; \ } #include "llvm/Frontend/OpenMP/OMPKinds.def" } while (false); From beb7e3bb702f69be4c81cf344528577806ab7c5b Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Mon, 27 Jul 2020 13:42:13 -0700 Subject: [PATCH 0242/1035] Rename t2-reduce-size -> thumb2-reduce-size For readability and consistency with other thumb2 passes like "thumb2-it". Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D84696 --- llvm/lib/Target/ARM/Thumb2SizeReduction.cpp | 2 +- llvm/test/CodeGen/Thumb2/t2-teq-reduce.mir | 2 +- llvm/test/CodeGen/Thumb2/t2sizereduction.mir | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp index ae661594bdc93..0f7e190386731 100644 --- a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -43,7 +43,7 @@ using namespace llvm; -#define DEBUG_TYPE "t2-reduce-size" +#define DEBUG_TYPE "thumb2-reduce-size" #define THUMB2_SIZE_REDUCE_NAME "Thumb2 instruction size reduce pass" STATISTIC(NumNarrows, "Number of 32-bit instrs reduced to 16-bit ones"); diff --git a/llvm/test/CodeGen/Thumb2/t2-teq-reduce.mir b/llvm/test/CodeGen/Thumb2/t2-teq-reduce.mir index d0bcc666dfacb..1729a59819217 100644 --- a/llvm/test/CodeGen/Thumb2/t2-teq-reduce.mir +++ b/llvm/test/CodeGen/Thumb2/t2-teq-reduce.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -run-pass=t2-reduce-size %s -o - | FileCheck %s +# RUN: llc -run-pass=thumb2-reduce-size %s -o - | FileCheck %s --- | target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/CodeGen/Thumb2/t2sizereduction.mir b/llvm/test/CodeGen/Thumb2/t2sizereduction.mir index 166914d5dedde..48b75ed5e3465 100644 --- a/llvm/test/CodeGen/Thumb2/t2sizereduction.mir +++ b/llvm/test/CodeGen/Thumb2/t2sizereduction.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -run-pass=t2-reduce-size %s -o - | FileCheck %s +# RUN: llc -run-pass=thumb2-reduce-size %s -o - | FileCheck %s --- | target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" From a566e616202b2db8f3aa66c97e2c38f5e734adfb Mon Sep 17 00:00:00 2001 From: Sergej Jaskiewicz Date: Tue, 14 Jul 2020 11:37:27 +0300 Subject: [PATCH 0243/1035] Reland "[compiler-rt] [test] Allow expanding lit substitutions recursively" The commit 8372d505082aceb38417e0b561cd32f2e227597b has been reverted (eafeb8af34946306a7382fa3801cf6e39a1c7226) because it broke asan tests on green dragon buildbots. The underlying issue has been fixed in 4dd5c2bee366514cbc3fc4e6da46462bc11a0a3d. 
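For reference, the relanded `config.recursiveExpansionLimit` setting is what lets a lit substitution be defined in terms of other substitutions: lit re-expands a RUN line until it stops changing, or fails once the limit is exceeded. A toy model of that behaviour in plain Python (not lit's actual implementation; the substitution names below are invented for illustration):

```python
SUBSTITUTIONS = [
    # %clang_asan is itself defined via another substitution, %clang.
    ("%clang_asan", "%clang -fsanitize=address"),
    ("%clang", "clang --target=x86_64-unknown-linux"),
]

def expand(line, limit=10):
    """Re-apply all substitutions until a fixed point is reached, or give
    up after `limit` passes (cf. config.recursiveExpansionLimit = 10)."""
    for _ in range(limit):
        expanded = line
        for key, value in SUBSTITUTIONS:
            expanded = expanded.replace(key, value)
        if expanded == line:  # nothing left to expand
            return expanded
        line = expanded
    raise RuntimeError("recursive substitution limit exceeded")

print(expand("%clang_asan %s -o %t"))
# clang --target=x86_64-unknown-linux -fsanitize=address %s -o %t
```

Without this setting lit makes only a single pass over the substitution list, so whether a nested definition like `%clang_asan` fully expands depends on where it sits in that list.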
--- compiler-rt/test/lit.common.cfg.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index fdc28a4637ff7..d773f3766fd33 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -23,6 +23,9 @@ # bash on Windows is usually very slow. execute_external = (not sys.platform in ['win32']) +# Allow expanding substitutions that are based on other substitutions +config.recursiveExpansionLimit = 10 + # Setup test format. config.test_format = lit.formats.ShTest(execute_external) if execute_external: From 754deffd11c733d709c3ed66d3b9a6b54d081474 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Mon, 27 Jul 2020 20:49:01 +0000 Subject: [PATCH 0244/1035] [NFC] Move BitcodeCommon.h from Bitstream to Bitcode --- llvm/include/llvm/{Bitstream => Bitcode}/BitcodeCommon.h | 6 +++--- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 2 +- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) rename llvm/include/llvm/{Bitstream => Bitcode}/BitcodeCommon.h (88%) diff --git a/llvm/include/llvm/Bitstream/BitcodeCommon.h b/llvm/include/llvm/Bitcode/BitcodeCommon.h similarity index 88% rename from llvm/include/llvm/Bitstream/BitcodeCommon.h rename to llvm/include/llvm/Bitcode/BitcodeCommon.h index 84b35987c4a90..6a3e74550bc48 100644 --- a/llvm/include/llvm/Bitstream/BitcodeCommon.h +++ b/llvm/include/llvm/Bitcode/BitcodeCommon.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_BITSTREAM_BITCODECOMMON_H -#define LLVM_BITSTREAM_BITCODECOMMON_H +#ifndef LLVM_BITCODE_BITCODECOMMON_H +#define LLVM_BITCODE_BITCODECOMMON_H #include "llvm/ADT/Bitfields.h" @@ -27,4 +27,4 @@ struct AllocaPackedValues { } // namespace llvm -#endif // LLVM_BITSTREAM_BITCODECOMMON_H +#endif // LLVM_BITCODE_BITCODECOMMON_H diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index f0377df8648a4..82b6f2078695f 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -20,8 +20,8 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" +#include "llvm/Bitcode/BitcodeCommon.h" #include "llvm/Bitcode/LLVMBitCodes.h" -#include "llvm/Bitstream/BitcodeCommon.h" #include "llvm/Bitstream/BitstreamReader.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/Argument.h" diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index f566ddf0864c6..7afef397e05ed 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -24,10 +24,10 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" +#include "llvm/Bitcode/BitcodeCommon.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/LLVMBitCodes.h" #include "llvm/Bitstream/BitCodes.h" -#include "llvm/Bitstream/BitcodeCommon.h" #include "llvm/Bitstream/BitstreamWriter.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/Attributes.h" From bf544fa1c3cb80f24d85e84559fb11193846259f Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Mon, 27 Jul 2020 20:48:44 +0000 Subject: [PATCH 0245/1035] Revert "[PowerPC] Remove QPX/A2Q BGQ/BGP CNK support" This reverts commit adffce71538e219aab4eeb024819baa7687262ff. This is breaking the test-suite; reverting while investigating.
--- clang/lib/Basic/Targets/PPC.cpp | 39 +- clang/lib/Basic/Targets/PPC.h | 3 + clang/lib/Driver/ToolChains/Arch/PPC.cpp | 1 + clang/lib/Driver/ToolChains/Clang.cpp | 12 + clang/test/Driver/clang-translation.c | 6 + clang/test/Driver/ppc-abi.c | 20 + clang/test/Misc/target-invalid-cpu-note.c | 2 +- clang/test/Preprocessor/init-ppc64.c | 16 + llvm/docs/LangRef.rst | 11 +- llvm/include/llvm/ADT/Triple.h | 3 + llvm/include/llvm/IR/IntrinsicsPowerPC.td | 176 +++ llvm/lib/Support/Triple.cpp | 6 + .../Target/PowerPC/AsmParser/PPCAsmParser.cpp | 18 + llvm/lib/Target/PowerPC/CMakeLists.txt | 1 + .../PowerPC/Disassembler/PPCDisassembler.cpp | 15 +- .../PowerPC/MCTargetDesc/PPCInstPrinter.cpp | 12 + .../PowerPC/MCTargetDesc/PPCMCTargetDesc.h | 1 + llvm/lib/Target/PowerPC/PPC.h | 2 + llvm/lib/Target/PowerPC/PPC.td | 14 +- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 3 + llvm/lib/Target/PowerPC/PPCCallingConv.td | 16 + llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 16 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 1025 +++++++++++++- llvm/lib/Target/PowerPC/PPCISelLowering.h | 20 + llvm/lib/Target/PowerPC/PPCInstrFormats.td | 52 + llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 27 +- llvm/lib/Target/PowerPC/PPCInstrInfo.h | 23 +- llvm/lib/Target/PowerPC/PPCInstrInfo.td | 11 + llvm/lib/Target/PowerPC/PPCInstrQPX.td | 1212 +++++++++++++++++ llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp | 161 +++ llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 3 + llvm/lib/Target/PowerPC/PPCRegisterInfo.h | 1 + llvm/lib/Target/PowerPC/PPCRegisterInfo.td | 23 + llvm/lib/Target/PowerPC/PPCScheduleP9.td | 9 +- llvm/lib/Target/PowerPC/PPCSubtarget.cpp | 12 +- llvm/lib/Target/PowerPC/PPCSubtarget.h | 14 + llvm/lib/Target/PowerPC/PPCTargetMachine.cpp | 23 +- .../Target/PowerPC/PPCTargetTransformInfo.cpp | 76 +- .../Instrumentation/MemorySanitizer.cpp | 7 +- llvm/test/Analysis/BasicAA/phi-spec-order.ll | 2 +- .../CostModel/PowerPC/unal-vec-ldst.ll | 73 + .../CodeGen/PowerPC/2012-11-16-mischedcall.ll | 2 +- ...leHoistingDueToBlockHotnessProfileData.mir | 2 +- .../NoCRFieldRedefWhenSpillingCRBIT.mir | 2 +- llvm/test/CodeGen/PowerPC/a2q-stackalign.ll | 23 + llvm/test/CodeGen/PowerPC/a2q.ll | 10 + .../PowerPC/aantidep-inline-asm-use.ll | 2 +- llvm/test/CodeGen/PowerPC/asm-Zy.ll | 3 +- llvm/test/CodeGen/PowerPC/asm-constraints.ll | 2 +- ...rt-rr-to-ri-instrs-R0-special-handling.mir | 4 +- .../convert-rr-to-ri-instrs-out-of-range.mir | 2 +- .../PowerPC/convert-rr-to-ri-instrs.mir | 8 +- llvm/test/CodeGen/PowerPC/ctr-minmaxnum.ll | 11 + .../CodeGen/PowerPC/ctrloop-shortLoops.ll | 7 + llvm/test/CodeGen/PowerPC/ec-input.ll | 2 +- .../CodeGen/PowerPC/extra-toc-reg-deps.ll | 8 +- .../CodeGen/PowerPC/fast-isel-icmp-split.ll | 2 +- .../PowerPC/fma-mutate-duplicate-vreg.ll | 2 +- .../CodeGen/PowerPC/fp2int2fp-ppcfp128.ll | 3 +- .../CodeGen/PowerPC/glob-comp-aa-crash.ll | 4 +- .../PowerPC/ifcvt-forked-bug-2016-08-08.ll | 2 +- .../test/CodeGen/PowerPC/inlineasm-i64-reg.ll | 4 +- llvm/test/CodeGen/PowerPC/load-two-flts.ll | 3 +- .../PowerPC/loop-data-prefetch-inner.ll | 4 +- .../CodeGen/PowerPC/loop-data-prefetch.ll | 4 +- llvm/test/CodeGen/PowerPC/loop-prep-all.ll | 10 +- .../PowerPC/lxv-aligned-stack-slots.ll | 2 +- llvm/test/CodeGen/PowerPC/machine-combiner.ll | 24 + llvm/test/CodeGen/PowerPC/mc-instrlat.ll | 4 +- llvm/test/CodeGen/PowerPC/mcount-insertion.ll | 3 +- llvm/test/CodeGen/PowerPC/memcpy-vec.ll | 23 + llvm/test/CodeGen/PowerPC/memset-nc.ll | 48 + .../PowerPC/misched-inorder-latency.ll | 3 +- llvm/test/CodeGen/PowerPC/misched.ll | 1 + 
.../CodeGen/PowerPC/optnone-crbits-i1-ret.ll | 3 +- .../CodeGen/PowerPC/pcrel-local-caller-toc.ll | 6 +- llvm/test/CodeGen/PowerPC/popcnt.ll | 2 + llvm/test/CodeGen/PowerPC/ppc-passname.ll | 11 + llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll | 21 +- llvm/test/CodeGen/PowerPC/pr24546.ll | 4 +- llvm/test/CodeGen/PowerPC/pr27350.ll | 2 +- llvm/test/CodeGen/PowerPC/pr28130.ll | 2 +- .../CodeGen/PowerPC/preinc-ld-sel-crash.ll | 2 +- llvm/test/CodeGen/PowerPC/qpx-bv-sint.ll | 33 + llvm/test/CodeGen/PowerPC/qpx-bv.ll | 37 + llvm/test/CodeGen/PowerPC/qpx-func-clobber.ll | 22 + llvm/test/CodeGen/PowerPC/qpx-load-splat.ll | 80 ++ llvm/test/CodeGen/PowerPC/qpx-load.ll | 26 + llvm/test/CodeGen/PowerPC/qpx-qvfmadd.ll | 79 ++ llvm/test/CodeGen/PowerPC/qpx-recipest.ll | 473 +++++++ llvm/test/CodeGen/PowerPC/qpx-rounding-ops.ll | 109 ++ llvm/test/CodeGen/PowerPC/qpx-s-load.ll | 26 + llvm/test/CodeGen/PowerPC/qpx-s-sel.ll | 143 ++ llvm/test/CodeGen/PowerPC/qpx-s-store.ll | 25 + llvm/test/CodeGen/PowerPC/qpx-sel.ll | 151 ++ llvm/test/CodeGen/PowerPC/qpx-split-vsetcc.ll | 31 + llvm/test/CodeGen/PowerPC/qpx-store.ll | 25 + .../test/CodeGen/PowerPC/qpx-unal-cons-lds.ll | 217 +++ llvm/test/CodeGen/PowerPC/qpx-unalperm.ll | 64 + llvm/test/CodeGen/PowerPC/rlwimi-and.ll | 4 +- .../CodeGen/PowerPC/rlwinm_rldicl_to_andi.mir | 2 +- .../CodeGen/PowerPC/s000-alias-misched.ll | 5 +- llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll | 571 ++++++++ .../selectiondag-extload-computeknownbits.ll | 2 +- llvm/test/CodeGen/PowerPC/setcr_bc.mir | 4 +- llvm/test/CodeGen/PowerPC/setcr_bc2.mir | 4 +- llvm/test/CodeGen/PowerPC/stwu-sched.ll | 2 +- llvm/test/CodeGen/PowerPC/unal-vec-ldst.ll | 149 ++ llvm/test/CodeGen/PowerPC/uwtables.ll | 2 +- .../MemorySanitizer/PowerPC/vararg-ppc64.ll | 15 + llvm/test/MC/Disassembler/PowerPC/qpx.txt | 371 +++++ llvm/test/MC/PowerPC/qpx.s | 252 ++++ .../IPConstantProp/fp-bc-icmp-const-fold.ll | 2 +- .../MSSA/combined-partial-overwrites.ll | 2 +- .../combined-partial-overwrites.ll | 2 +- .../EntryExitInstrumenter/mcount.ll | 2 +- .../InstCombine/PowerPC/aligned-qpx.ll | 165 +++ .../LoopDataPrefetch/PowerPC/basic.ll | 5 +- .../test/Transforms/LoopSimplify/dup-preds.ll | 2 +- llvm/test/Transforms/LoopUnroll/pr14167.ll | 2 +- .../PowerPC/agg-interleave-a2.ll | 40 + .../PowerPC/vectorize-only-for-real.ll | 2 +- llvm/test/Transforms/NewGVN/pr31483.ll | 4 +- .../Transforms/SCCP/fp-bc-icmp-const-fold.ll | 2 +- llvm/unittests/ADT/TripleTest.cpp | 35 + .../llvm/lib/Target/PowerPC/BUILD.gn | 1 + openmp/runtime/src/kmp.h | 3 + openmp/runtime/src/kmp_csupport.cpp | 11 + openmp/runtime/src/kmp_lock.h | 2 +- openmp/runtime/src/kmp_os.h | 2 +- openmp/runtime/src/kmp_platform.h | 6 + openmp/runtime/src/z_Linux_asm.S | 7 +- openmp/runtime/src/z_Linux_util.cpp | 2 +- polly/lib/External/isl/config.sub | 11 +- polly/lib/External/ppcg/config.sub | 11 +- 135 files changed, 6525 insertions(+), 174 deletions(-) create mode 100644 llvm/lib/Target/PowerPC/PPCInstrQPX.td create mode 100644 llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp create mode 100644 llvm/test/CodeGen/PowerPC/a2q-stackalign.ll create mode 100644 llvm/test/CodeGen/PowerPC/a2q.ll create mode 100644 llvm/test/CodeGen/PowerPC/memset-nc.ll create mode 100644 llvm/test/CodeGen/PowerPC/qpx-bv-sint.ll create mode 100644 llvm/test/CodeGen/PowerPC/qpx-bv.ll create mode 100644 llvm/test/CodeGen/PowerPC/qpx-func-clobber.ll create mode 100644 llvm/test/CodeGen/PowerPC/qpx-load-splat.ll create mode 100644 llvm/test/CodeGen/PowerPC/qpx-load.ll create mode 100644 
llvm/test/CodeGen/PowerPC/qpx-qvfmadd.ll create mode 100644 llvm/test/CodeGen/PowerPC/qpx-recipest.ll create mode 100644 llvm/test/CodeGen/PowerPC/qpx-rounding-ops.ll create mode 100644 llvm/test/CodeGen/PowerPC/qpx-s-load.ll create mode 100644 llvm/test/CodeGen/PowerPC/qpx-s-sel.ll create mode 100644 llvm/test/CodeGen/PowerPC/qpx-s-store.ll create mode 100644 llvm/test/CodeGen/PowerPC/qpx-sel.ll create mode 100644 llvm/test/CodeGen/PowerPC/qpx-split-vsetcc.ll create mode 100644 llvm/test/CodeGen/PowerPC/qpx-store.ll create mode 100644 llvm/test/CodeGen/PowerPC/qpx-unal-cons-lds.ll create mode 100644 llvm/test/CodeGen/PowerPC/qpx-unalperm.ll create mode 100644 llvm/test/MC/Disassembler/PowerPC/qpx.txt create mode 100644 llvm/test/MC/PowerPC/qpx.s create mode 100644 llvm/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll create mode 100644 llvm/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp index 5f716a541ae92..f0de2bf070ea4 100644 --- a/clang/lib/Basic/Targets/PPC.cpp +++ b/clang/lib/Basic/Targets/PPC.cpp @@ -46,6 +46,8 @@ bool PPCTargetInfo::handleTargetFeatures(std::vector &Features, HasP8Crypto = true; } else if (Feature == "+direct-move") { HasDirectMove = true; + } else if (Feature == "+qpx") { + HasQPX = true; } else if (Feature == "+htm") { HasHTM = true; } else if (Feature == "+float128") { @@ -97,7 +99,7 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts, } // ABI options. - if (ABI == "elfv1") + if (ABI == "elfv1" || ABI == "elfv1-qpx") Builder.defineMacro("_CALL_ELF", "1"); if (ABI == "elfv2") Builder.defineMacro("_CALL_ELF", "2"); @@ -157,11 +159,22 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("_ARCH_PWR10"); if (ArchDefs & ArchDefineA2) Builder.defineMacro("_ARCH_A2"); + if (ArchDefs & ArchDefineA2q) { + Builder.defineMacro("_ARCH_A2Q"); + Builder.defineMacro("_ARCH_QP"); + } if (ArchDefs & ArchDefineE500) Builder.defineMacro("__NO_LWSYNC__"); if (ArchDefs & ArchDefineFuture) Builder.defineMacro("_ARCH_PWR_FUTURE"); + if (getTriple().getVendor() == llvm::Triple::BGQ) { + Builder.defineMacro("__bg__"); + Builder.defineMacro("__THW_BLUEGENE__"); + Builder.defineMacro("__bgq__"); + Builder.defineMacro("__TOS_BGQ__"); + } + if (HasAltivec) { Builder.defineMacro("__VEC__", "10206"); Builder.defineMacro("__ALTIVEC__"); @@ -264,6 +277,7 @@ bool PPCTargetInfo::initFeatureMap( .Case("ppc64le", true) .Default(false); + Features["qpx"] = (CPU == "a2q"); Features["power9-vector"] = (CPU == "pwr9"); Features["crypto"] = llvm::StringSwitch(CPU) .Case("ppc64le", true) @@ -359,6 +373,7 @@ bool PPCTargetInfo::hasFeature(StringRef Feature) const { .Case("power8-vector", HasP8Vector) .Case("crypto", HasP8Crypto) .Case("direct-move", HasDirectMove) + .Case("qpx", HasQPX) .Case("htm", HasHTM) .Case("bpermd", HasBPERMD) .Case("extdiv", HasExtDiv) @@ -488,17 +503,17 @@ ArrayRef PPCTargetInfo::getGCCAddlRegNames() const { } static constexpr llvm::StringLiteral ValidCPUNames[] = { - {"generic"}, {"440"}, {"450"}, {"601"}, {"602"}, - {"603"}, {"603e"}, {"603ev"}, {"604"}, {"604e"}, - {"620"}, {"630"}, {"g3"}, {"7400"}, {"g4"}, - {"7450"}, {"g4+"}, {"750"}, {"8548"}, {"970"}, - {"g5"}, {"a2"}, {"e500"}, {"e500mc"}, {"e5500"}, - {"power3"}, {"pwr3"}, {"power4"}, {"pwr4"}, {"power5"}, - {"pwr5"}, {"power5x"}, {"pwr5x"}, {"power6"}, {"pwr6"}, - {"power6x"}, {"pwr6x"}, {"power7"}, {"pwr7"}, {"power8"}, - {"pwr8"}, {"power9"}, {"pwr9"}, {"power10"}, {"pwr10"}, - 
{"powerpc"}, {"ppc"}, {"powerpc64"}, {"ppc64"}, {"powerpc64le"}, - {"ppc64le"}, {"future"}}; + {"generic"}, {"440"}, {"450"}, {"601"}, {"602"}, + {"603"}, {"603e"}, {"603ev"}, {"604"}, {"604e"}, + {"620"}, {"630"}, {"g3"}, {"7400"}, {"g4"}, + {"7450"}, {"g4+"}, {"750"}, {"8548"}, {"970"}, + {"g5"}, {"a2"}, {"a2q"}, {"e500"}, {"e500mc"}, + {"e5500"}, {"power3"}, {"pwr3"}, {"power4"}, {"pwr4"}, + {"power5"}, {"pwr5"}, {"power5x"}, {"pwr5x"}, {"power6"}, + {"pwr6"}, {"power6x"}, {"pwr6x"}, {"power7"}, {"pwr7"}, + {"power8"}, {"pwr8"}, {"power9"}, {"pwr9"}, {"power10"}, + {"pwr10"}, {"powerpc"}, {"ppc"}, {"powerpc64"}, {"ppc64"}, + {"powerpc64le"}, {"ppc64le"}, {"future"}}; bool PPCTargetInfo::isValidCPUName(StringRef Name) const { return llvm::find(ValidCPUNames, Name) != std::end(ValidCPUNames); diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index c2048b2145918..ff8579b6c3cf4 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -46,6 +46,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { ArchDefinePwr10 = 1 << 14, ArchDefineFuture = 1 << 15, ArchDefineA2 = 1 << 16, + ArchDefineA2q = 1 << 17, ArchDefineE500 = 1 << 18 } ArchDefineTypes; @@ -62,6 +63,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { bool HasP8Vector = false; bool HasP8Crypto = false; bool HasDirectMove = false; + bool HasQPX = false; bool HasHTM = false; bool HasBPERMD = false; bool HasExtDiv = false; @@ -116,6 +118,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { .Case("970", ArchDefineName | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) .Case("a2", ArchDefineA2) + .Case("a2q", ArchDefineName | ArchDefineA2 | ArchDefineA2q) .Cases("power3", "pwr3", ArchDefinePpcgr) .Cases("power4", "pwr4", ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.cpp b/clang/lib/Driver/ToolChains/Arch/PPC.cpp index bcaecf4b2d980..144e276a6bd87 100644 --- a/clang/lib/Driver/ToolChains/Arch/PPC.cpp +++ b/clang/lib/Driver/ToolChains/Arch/PPC.cpp @@ -57,6 +57,7 @@ std::string ppc::getPPCTargetCPU(const ArgList &Args) { .Case("970", "970") .Case("G5", "g5") .Case("a2", "a2") + .Case("a2q", "a2q") .Case("e500", "e500") .Case("e500mc", "e500mc") .Case("e5500", "e5500") diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index b0de225f8abf5..7a73eea013bdf 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1883,6 +1883,18 @@ void Clang::AddPPCTargetArgs(const ArgList &Args, if (T.isOSBinFormatELF()) { switch (getToolChain().getArch()) { case llvm::Triple::ppc64: { + // When targeting a processor that supports QPX, or if QPX is + // specifically enabled, default to using the ABI that supports QPX (so + // long as it is not specifically disabled). 
+ bool HasQPX = false; + if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) + HasQPX = A->getValue() == StringRef("a2q"); + HasQPX = Args.hasFlag(options::OPT_mqpx, options::OPT_mno_qpx, HasQPX); + if (HasQPX) { + ABIName = "elfv1-qpx"; + break; + } + if (T.isMusl() || (T.isOSFreeBSD() && T.getOSMajorVersion() >= 13)) ABIName = "elfv2"; else diff --git a/clang/test/Driver/clang-translation.c b/clang/test/Driver/clang-translation.c index d1daeb80004b7..2f02970a2a8ee 100644 --- a/clang/test/Driver/clang-translation.c +++ b/clang/test/Driver/clang-translation.c @@ -167,6 +167,12 @@ // PPCPWR8: "-cc1" // PPCPWR8: "-target-cpu" "pwr8" +// RUN: %clang -target powerpc64-unknown-linux-gnu \ +// RUN: -### -S %s -mcpu=a2q 2>&1 | FileCheck -check-prefix=PPCA2Q %s +// PPCA2Q: clang +// PPCA2Q: "-cc1" +// PPCA2Q: "-target-cpu" "a2q" + // RUN: %clang -target powerpc64-unknown-linux-gnu \ // RUN: -### -S %s -mcpu=630 2>&1 | FileCheck -check-prefix=PPC630 %s // PPC630: clang diff --git a/clang/test/Driver/ppc-abi.c b/clang/test/Driver/ppc-abi.c index 2b5cc463e7c3d..acc4981a2eee6 100644 --- a/clang/test/Driver/ppc-abi.c +++ b/clang/test/Driver/ppc-abi.c @@ -5,6 +5,14 @@ // RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ // RUN: -mabi=elfv1 | FileCheck -check-prefix=CHECK-ELFv1 %s // RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ +// RUN: -mabi=elfv1-qpx | FileCheck -check-prefix=CHECK-ELFv1-QPX %s +// RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ +// RUN: -mcpu=a2q | FileCheck -check-prefix=CHECK-ELFv1-QPX %s +// RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ +// RUN: -mcpu=a2 -mqpx | FileCheck -check-prefix=CHECK-ELFv1-QPX %s +// RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ +// RUN: -mcpu=a2q -mno-qpx | FileCheck -check-prefix=CHECK-ELFv1 %s +// RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ // RUN: -mabi=elfv2 | FileCheck -check-prefix=CHECK-ELFv2-BE %s // RUN: %clang -target powerpc64le-unknown-linux-gnu %s -### -o %t.o 2>&1 \ @@ -26,6 +34,8 @@ // CHECK-ELFv1: "-target-abi" "elfv1" // CHECK-ELFv1-LE: "-mrelocation-model" "static" // CHECK-ELFv1-LE: "-target-abi" "elfv1" +// CHECK-ELFv1-QPX: "-mrelocation-model" "static" +// CHECK-ELFv1-QPX: "-target-abi" "elfv1-qpx" // CHECK-ELFv2: "-mrelocation-model" "static" // CHECK-ELFv2: "-target-abi" "elfv2" // CHECK-ELFv2-BE: "-mrelocation-model" "static" @@ -38,6 +48,14 @@ // RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ // RUN: -mabi=elfv1 | FileCheck -check-prefix=CHECK-ELFv1-PIC %s // RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ +// RUN: -mabi=elfv1-qpx | FileCheck -check-prefix=CHECK-ELFv1-QPX-PIC %s +// RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ +// RUN: -mcpu=a2q | FileCheck -check-prefix=CHECK-ELFv1-QPX-PIC %s +// RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ +// RUN: -mcpu=a2 -mqpx | FileCheck -check-prefix=CHECK-ELFv1-QPX-PIC %s +// RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ +// RUN: -mcpu=a2q -mno-qpx | FileCheck -check-prefix=CHECK-ELFv1-PIC %s +// RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ // RUN: -mabi=elfv2 | FileCheck -check-prefix=CHECK-ELFv2-PIC %s // RUN: %clang -fPIC -target powerpc64le-unknown-linux-gnu %s -### -o %t.o 2>&1 \ @@ -51,6 +69,8 @@ // CHECK-ELFv1-PIC: "-mrelocation-model" "pic" "-pic-level" 
"2" // CHECK-ELFv1-PIC: "-target-abi" "elfv1" +// CHECK-ELFv1-QPX-PIC: "-mrelocation-model" "pic" "-pic-level" "2" +// CHECK-ELFv1-QPX-PIC: "-target-abi" "elfv1-qpx" // CHECK-ELFv2-PIC: "-mrelocation-model" "pic" "-pic-level" "2" // CHECK-ELFv2-PIC: "-target-abi" "elfv2" diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c index bf6eaefe0b3ca..3a376a7caab46 100644 --- a/clang/test/Misc/target-invalid-cpu-note.c +++ b/clang/test/Misc/target-invalid-cpu-note.c @@ -79,7 +79,7 @@ // PPC: error: unknown target CPU 'not-a-cpu' // PPC: note: valid target CPU values are: generic, 440, 450, 601, 602, 603, // PPC-SAME: 603e, 603ev, 604, 604e, 620, 630, g3, 7400, g4, 7450, g4+, 750, -// PPC-SAME: 8548, 970, g5, a2, e500, e500mc, e5500, power3, pwr3, power4, +// PPC-SAME: 8548, 970, g5, a2, a2q, e500, e500mc, e5500, power3, pwr3, power4, // PPC-SAME: pwr4, power5, pwr5, power5x, pwr5x, power6, pwr6, power6x, pwr6x, // PPC-SAME: power7, pwr7, power8, pwr8, power9, pwr9, power10, pwr10, powerpc, ppc, powerpc64, // PPC-SAME: ppc64, powerpc64le, ppc64le, future diff --git a/clang/test/Preprocessor/init-ppc64.c b/clang/test/Preprocessor/init-ppc64.c index 48d35c95aa570..ed8601636554e 100644 --- a/clang/test/Preprocessor/init-ppc64.c +++ b/clang/test/Preprocessor/init-ppc64.c @@ -408,6 +408,21 @@ // PPC64LE:#define __ppc64__ 1 // PPC64LE:#define __ppc__ 1 // +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu a2q -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCA2Q %s +// +// PPCA2Q:#define _ARCH_A2 1 +// PPCA2Q:#define _ARCH_A2Q 1 +// PPCA2Q:#define _ARCH_PPC 1 +// PPCA2Q:#define _ARCH_PPC64 1 +// PPCA2Q:#define _ARCH_QP 1 +// +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-bgq-linux -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCBGQ %s +// +// PPCBGQ:#define __THW_BLUEGENE__ 1 +// PPCBGQ:#define __TOS_BGQ__ 1 +// PPCBGQ:#define __bg__ 1 +// PPCBGQ:#define __bgq__ 1 +// // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu 630 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPC630 %s // // PPC630:#define _ARCH_630 1 @@ -1054,6 +1069,7 @@ // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64-unknown-linux-gnu < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64-unknown-linux-gnu -target-abi elfv1 < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s +// RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64-unknown-linux-gnu -target-abi elfv1-qpx < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64-unknown-linux-gnu -target-abi elfv2 < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv2 %s // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64le-unknown-linux-gnu < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv2 %s // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64le-unknown-linux-gnu -target-abi elfv1 < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index af93a6ed5c56e..6b9c5c6899819 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -4310,9 +4310,14 @@ PowerPC: - ``r``: A 32 or 
64-bit integer register.
 - ``b``: A 32 or 64-bit integer register, excluding ``R0`` (that is:
   ``R1-R31``).
-- ``f``: A 32 or 64-bit float register (``F0-F31``),
-- ``v``: For ``4 x f32`` or ``4 x f64`` types, a 128-bit altivec vector
-  register (``V0-V31``).
+- ``f``: A 32 or 64-bit float register (``F0-F31``), or when QPX is enabled, a
+  128 or 256-bit QPX register (``Q0-Q31``; aliases the ``F`` registers).
+- ``v``: For ``4 x f32`` or ``4 x f64`` types, when QPX is enabled, a
+  128 or 256-bit QPX register (``Q0-Q31``), otherwise a 128-bit
+  altivec vector register (``V0-V31``).
+
+  .. FIXME: is this a bug that v accepts QPX registers? I think this
+     is supposed to only use the altivec vector registers?
 - ``y``: Condition register (``CR0-CR7``).
 - ``wc``: An individual CR bit in a CR register.
diff --git a/llvm/include/llvm/ADT/Triple.h b/llvm/include/llvm/ADT/Triple.h
index c578c097c6f64..6bad18f19244e 100644
--- a/llvm/include/llvm/ADT/Triple.h
+++ b/llvm/include/llvm/ADT/Triple.h
@@ -142,6 +142,8 @@ class Triple {
     Apple,
     PC,
     SCEI,
+    BGP,
+    BGQ,
     Freescale,
     IBM,
     ImaginationTechnologies,
@@ -177,6 +179,7 @@ class Triple {
     Minix,
     RTEMS,
     NaCl, // Native Client
+    CNK, // BG/P Compute-Node Kernel
     AIX,
     CUDA, // NVIDIA CUDA
     NVCL, // NVIDIA OpenCL
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index 853d26c67ee3d..23bcf3ce1959c 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1109,6 +1109,182 @@ def int_ppc_vsx_xxblendvd: GCCBuiltin<"__builtin_vsx_xxblendvd">,
       Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty],
                 [IntrNoMem]>;
 }
 
+//===----------------------------------------------------------------------===//
+// PowerPC QPX Intrinsics.
+//
+
+let TargetPrefix = "ppc" in {  // All PPC intrinsics start with "llvm.ppc.".
+  /// PowerPC_QPX_Intrinsic - Base class for all QPX intrinsics.
+  class PowerPC_QPX_Intrinsic<string GCCIntSuffix, list<LLVMType> ret_types,
+                              list<LLVMType> param_types,
+                              list<IntrinsicProperty> properties>
+    : GCCBuiltin<!strconcat("__builtin_qpx_", GCCIntSuffix)>,
+      Intrinsic<ret_types, param_types, properties>;
+}
+
+//===----------------------------------------------------------------------===//
+// PowerPC QPX Intrinsic Class Definitions.
+//
+
+/// PowerPC_QPX_FF_Intrinsic - A PowerPC intrinsic that takes one v4f64
+/// vector and returns one. These intrinsics have no side effects.
+class PowerPC_QPX_FF_Intrinsic<string GCCIntSuffix>
+  : PowerPC_QPX_Intrinsic<GCCIntSuffix,
+                          [llvm_v4f64_ty], [llvm_v4f64_ty], [IntrNoMem]>;
+
+/// PowerPC_QPX_FFF_Intrinsic - A PowerPC intrinsic that takes two v4f64
+/// vectors and returns one. These intrinsics have no side effects.
+class PowerPC_QPX_FFF_Intrinsic<string GCCIntSuffix>
+  : PowerPC_QPX_Intrinsic<GCCIntSuffix,
+                          [llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty],
+                          [IntrNoMem]>;
+
+/// PowerPC_QPX_FFFF_Intrinsic - A PowerPC intrinsic that takes three v4f64
+/// vectors and returns one. These intrinsics have no side effects.
+class PowerPC_QPX_FFFF_Intrinsic<string GCCIntSuffix>
+  : PowerPC_QPX_Intrinsic<GCCIntSuffix,
+                          [llvm_v4f64_ty],
+                          [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
+                          [IntrNoMem]>;
+
+/// PowerPC_QPX_Load_Intrinsic - A PowerPC intrinsic that takes a pointer
+/// and returns a v4f64.
+class PowerPC_QPX_Load_Intrinsic<string GCCIntSuffix>
+  : PowerPC_QPX_Intrinsic<GCCIntSuffix,
+                          [llvm_v4f64_ty], [llvm_ptr_ty],
+                          [IntrReadMem, IntrArgMemOnly]>;
+
+/// PowerPC_QPX_LoadPerm_Intrinsic - A PowerPC intrinsic that takes a pointer
+/// and returns a v4f64 permutation.
+class PowerPC_QPX_LoadPerm_Intrinsic<string GCCIntSuffix>
+  : PowerPC_QPX_Intrinsic<GCCIntSuffix,
+                          [llvm_v4f64_ty], [llvm_ptr_ty], [IntrNoMem]>;
+
+/// PowerPC_QPX_Store_Intrinsic - A PowerPC intrinsic that takes a pointer
+/// and stores a v4f64.
+class PowerPC_QPX_Store_Intrinsic<string GCCIntSuffix>
+  : PowerPC_QPX_Intrinsic<GCCIntSuffix,
+                          [], [llvm_v4f64_ty, llvm_ptr_ty],
+                          [IntrWriteMem, IntrArgMemOnly]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC QPX Intrinsic Definitions.
+
+let TargetPrefix = "ppc" in {  // All intrinsics start with "llvm.ppc.".
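(Each def in the block below yields an intrinsic named llvm.ppc.qpx.<name> together with a __builtin_qpx_<name> builtin through the GCCBuiltin base above; the trailing "s" variants are the single-precision counterparts of the corresponding double-precision operations.)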
+ // Add Instructions + def int_ppc_qpx_qvfadd : PowerPC_QPX_FFF_Intrinsic<"qvfadd">; + def int_ppc_qpx_qvfadds : PowerPC_QPX_FFF_Intrinsic<"qvfadds">; + def int_ppc_qpx_qvfsub : PowerPC_QPX_FFF_Intrinsic<"qvfsub">; + def int_ppc_qpx_qvfsubs : PowerPC_QPX_FFF_Intrinsic<"qvfsubs">; + + // Estimate Instructions + def int_ppc_qpx_qvfre : PowerPC_QPX_FF_Intrinsic<"qvfre">; + def int_ppc_qpx_qvfres : PowerPC_QPX_FF_Intrinsic<"qvfres">; + def int_ppc_qpx_qvfrsqrte : PowerPC_QPX_FF_Intrinsic<"qvfrsqrte">; + def int_ppc_qpx_qvfrsqrtes : PowerPC_QPX_FF_Intrinsic<"qvfrsqrtes">; + + // Multiply Instructions + def int_ppc_qpx_qvfmul : PowerPC_QPX_FFF_Intrinsic<"qvfmul">; + def int_ppc_qpx_qvfmuls : PowerPC_QPX_FFF_Intrinsic<"qvfmuls">; + def int_ppc_qpx_qvfxmul : PowerPC_QPX_FFF_Intrinsic<"qvfxmul">; + def int_ppc_qpx_qvfxmuls : PowerPC_QPX_FFF_Intrinsic<"qvfxmuls">; + + // Multiply-add instructions + def int_ppc_qpx_qvfmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfmadd">; + def int_ppc_qpx_qvfmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfmadds">; + def int_ppc_qpx_qvfnmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfnmadd">; + def int_ppc_qpx_qvfnmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfnmadds">; + def int_ppc_qpx_qvfmsub : PowerPC_QPX_FFFF_Intrinsic<"qvfmsub">; + def int_ppc_qpx_qvfmsubs : PowerPC_QPX_FFFF_Intrinsic<"qvfmsubs">; + def int_ppc_qpx_qvfnmsub : PowerPC_QPX_FFFF_Intrinsic<"qvfnmsub">; + def int_ppc_qpx_qvfnmsubs : PowerPC_QPX_FFFF_Intrinsic<"qvfnmsubs">; + def int_ppc_qpx_qvfxmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxmadd">; + def int_ppc_qpx_qvfxmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxmadds">; + def int_ppc_qpx_qvfxxnpmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxxnpmadd">; + def int_ppc_qpx_qvfxxnpmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxxnpmadds">; + def int_ppc_qpx_qvfxxcpnmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxxcpnmadd">; + def int_ppc_qpx_qvfxxcpnmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxxcpnmadds">; + def int_ppc_qpx_qvfxxmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxxmadd">; + def int_ppc_qpx_qvfxxmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxxmadds">; + + // Select Instruction + def int_ppc_qpx_qvfsel : PowerPC_QPX_FFFF_Intrinsic<"qvfsel">; + + // Permute Instruction + def int_ppc_qpx_qvfperm : PowerPC_QPX_FFFF_Intrinsic<"qvfperm">; + + // Convert and Round Instructions + def int_ppc_qpx_qvfctid : PowerPC_QPX_FF_Intrinsic<"qvfctid">; + def int_ppc_qpx_qvfctidu : PowerPC_QPX_FF_Intrinsic<"qvfctidu">; + def int_ppc_qpx_qvfctidz : PowerPC_QPX_FF_Intrinsic<"qvfctidz">; + def int_ppc_qpx_qvfctiduz : PowerPC_QPX_FF_Intrinsic<"qvfctiduz">; + def int_ppc_qpx_qvfctiw : PowerPC_QPX_FF_Intrinsic<"qvfctiw">; + def int_ppc_qpx_qvfctiwu : PowerPC_QPX_FF_Intrinsic<"qvfctiwu">; + def int_ppc_qpx_qvfctiwz : PowerPC_QPX_FF_Intrinsic<"qvfctiwz">; + def int_ppc_qpx_qvfctiwuz : PowerPC_QPX_FF_Intrinsic<"qvfctiwuz">; + def int_ppc_qpx_qvfcfid : PowerPC_QPX_FF_Intrinsic<"qvfcfid">; + def int_ppc_qpx_qvfcfidu : PowerPC_QPX_FF_Intrinsic<"qvfcfidu">; + def int_ppc_qpx_qvfcfids : PowerPC_QPX_FF_Intrinsic<"qvfcfids">; + def int_ppc_qpx_qvfcfidus : PowerPC_QPX_FF_Intrinsic<"qvfcfidus">; + def int_ppc_qpx_qvfrsp : PowerPC_QPX_FF_Intrinsic<"qvfrsp">; + def int_ppc_qpx_qvfriz : PowerPC_QPX_FF_Intrinsic<"qvfriz">; + def int_ppc_qpx_qvfrin : PowerPC_QPX_FF_Intrinsic<"qvfrin">; + def int_ppc_qpx_qvfrip : PowerPC_QPX_FF_Intrinsic<"qvfrip">; + def int_ppc_qpx_qvfrim : PowerPC_QPX_FF_Intrinsic<"qvfrim">; + + // Move Instructions + def int_ppc_qpx_qvfneg : PowerPC_QPX_FF_Intrinsic<"qvfneg">; + def int_ppc_qpx_qvfabs : PowerPC_QPX_FF_Intrinsic<"qvfabs">; + def 
int_ppc_qpx_qvfnabs : PowerPC_QPX_FF_Intrinsic<"qvfnabs">; + def int_ppc_qpx_qvfcpsgn : PowerPC_QPX_FFF_Intrinsic<"qvfcpsgn">; + + // Compare Instructions + def int_ppc_qpx_qvftstnan : PowerPC_QPX_FFF_Intrinsic<"qvftstnan">; + def int_ppc_qpx_qvfcmplt : PowerPC_QPX_FFF_Intrinsic<"qvfcmplt">; + def int_ppc_qpx_qvfcmpgt : PowerPC_QPX_FFF_Intrinsic<"qvfcmpgt">; + def int_ppc_qpx_qvfcmpeq : PowerPC_QPX_FFF_Intrinsic<"qvfcmpeq">; + + // Load instructions + def int_ppc_qpx_qvlfd : PowerPC_QPX_Load_Intrinsic<"qvlfd">; + def int_ppc_qpx_qvlfda : PowerPC_QPX_Load_Intrinsic<"qvlfda">; + def int_ppc_qpx_qvlfs : PowerPC_QPX_Load_Intrinsic<"qvlfs">; + def int_ppc_qpx_qvlfsa : PowerPC_QPX_Load_Intrinsic<"qvlfsa">; + + def int_ppc_qpx_qvlfcda : PowerPC_QPX_Load_Intrinsic<"qvlfcda">; + def int_ppc_qpx_qvlfcd : PowerPC_QPX_Load_Intrinsic<"qvlfcd">; + def int_ppc_qpx_qvlfcsa : PowerPC_QPX_Load_Intrinsic<"qvlfcsa">; + def int_ppc_qpx_qvlfcs : PowerPC_QPX_Load_Intrinsic<"qvlfcs">; + def int_ppc_qpx_qvlfiwaa : PowerPC_QPX_Load_Intrinsic<"qvlfiwaa">; + def int_ppc_qpx_qvlfiwa : PowerPC_QPX_Load_Intrinsic<"qvlfiwa">; + def int_ppc_qpx_qvlfiwza : PowerPC_QPX_Load_Intrinsic<"qvlfiwza">; + def int_ppc_qpx_qvlfiwz : PowerPC_QPX_Load_Intrinsic<"qvlfiwz">; + + def int_ppc_qpx_qvlpcld : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcld">; + def int_ppc_qpx_qvlpcls : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcls">; + def int_ppc_qpx_qvlpcrd : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcrd">; + def int_ppc_qpx_qvlpcrs : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcrs">; + + // Store instructions + def int_ppc_qpx_qvstfd : PowerPC_QPX_Store_Intrinsic<"qvstfd">; + def int_ppc_qpx_qvstfda : PowerPC_QPX_Store_Intrinsic<"qvstfda">; + def int_ppc_qpx_qvstfs : PowerPC_QPX_Store_Intrinsic<"qvstfs">; + def int_ppc_qpx_qvstfsa : PowerPC_QPX_Store_Intrinsic<"qvstfsa">; + + def int_ppc_qpx_qvstfcda : PowerPC_QPX_Store_Intrinsic<"qvstfcda">; + def int_ppc_qpx_qvstfcd : PowerPC_QPX_Store_Intrinsic<"qvstfcd">; + def int_ppc_qpx_qvstfcsa : PowerPC_QPX_Store_Intrinsic<"qvstfcsa">; + def int_ppc_qpx_qvstfcs : PowerPC_QPX_Store_Intrinsic<"qvstfcs">; + def int_ppc_qpx_qvstfiwa : PowerPC_QPX_Store_Intrinsic<"qvstfiwa">; + def int_ppc_qpx_qvstfiw : PowerPC_QPX_Store_Intrinsic<"qvstfiw">; + + // Logical and permutation formation + def int_ppc_qpx_qvflogical : PowerPC_QPX_Intrinsic<"qvflogical", + [llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_ppc_qpx_qvgpci : PowerPC_QPX_Intrinsic<"qvgpci", + [llvm_v4f64_ty], [llvm_i32_ty], [IntrNoMem]>; +} + //===----------------------------------------------------------------------===// // PowerPC HTM Intrinsic Definitions. 
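To make the intrinsic shapes above concrete, here is a minimal scalar C++ model of what an FFFF-shaped intrinsic such as llvm.ppc.qpx.qvfmadd computes: an element-wise fused multiply-add over four double-precision lanes, the QPX register width. The function name and lane ordering here are illustrative only, not the backend's implementation.

#include <array>
#include <cmath>
#include <cstdio>

using V4F64 = std::array<double, 4>;

// Model of qvfmadd: r[i] = a[i] * b[i] + c[i] with a single rounding per lane.
V4F64 qvfmadd_model(const V4F64 &a, const V4F64 &b, const V4F64 &c) {
  V4F64 r;
  for (int i = 0; i < 4; ++i)
    r[i] = std::fma(a[i], b[i], c[i]);
  return r;
}

int main() {
  V4F64 a{1.0, 2.0, 3.0, 4.0}, b{0.5, 0.5, 0.5, 0.5}, c{1.0, 1.0, 1.0, 1.0};
  V4F64 r = qvfmadd_model(a, b, c);
  for (double d : r)
    std::printf("%g ", d); // prints: 1.5 2 2.5 3
  std::printf("\n");
}

The qvfmsub/qvfnmadd/qvfnmsub variants differ only in the signs applied to the product and the addend; the qvfx* forms are the cross-multiply variants used for complex arithmetic.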
diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp index 72648273b4cd5..fec1985ccacae 100644 --- a/llvm/lib/Support/Triple.cpp +++ b/llvm/lib/Support/Triple.cpp @@ -160,6 +160,8 @@ StringRef Triple::getVendorTypeName(VendorType Kind) { case AMD: return "amd"; case Apple: return "apple"; + case BGP: return "bgp"; + case BGQ: return "bgq"; case CSR: return "csr"; case Freescale: return "fsl"; case IBM: return "ibm"; @@ -185,6 +187,7 @@ StringRef Triple::getOSTypeName(OSType Kind) { case AMDHSA: return "amdhsa"; case AMDPAL: return "amdpal"; case Ananas: return "ananas"; + case CNK: return "cnk"; case CUDA: return "cuda"; case CloudABI: return "cloudabi"; case Contiki: return "contiki"; @@ -467,6 +470,8 @@ static Triple::VendorType parseVendor(StringRef VendorName) { .Case("apple", Triple::Apple) .Case("pc", Triple::PC) .Case("scei", Triple::SCEI) + .Case("bgp", Triple::BGP) + .Case("bgq", Triple::BGQ) .Case("fsl", Triple::Freescale) .Case("ibm", Triple::IBM) .Case("img", Triple::ImaginationTechnologies) @@ -503,6 +508,7 @@ static Triple::OSType parseOS(StringRef OSName) { .StartsWith("minix", Triple::Minix) .StartsWith("rtems", Triple::RTEMS) .StartsWith("nacl", Triple::NaCl) + .StartsWith("cnk", Triple::CNK) .StartsWith("aix", Triple::AIX) .StartsWith("cuda", Triple::CUDA) .StartsWith("nvcl", Triple::NVCL) diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 81008d3ea5662..13fd7d05ab9f4 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -492,6 +492,21 @@ struct PPCOperand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createReg(VSSRegs[getVSReg()])); } + void addRegQFRCOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(QFRegs[getReg()])); + } + + void addRegQSRCOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(QFRegs[getReg()])); + } + + void addRegQBRCOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(QFRegs[getReg()])); + } + void addRegSPE4RCOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(RRegs[getReg()])); @@ -1192,6 +1207,9 @@ bool PPCAsmParser::MatchRegisterName(unsigned &RegNo, int64_t &IntVal) { } else if (Name.startswith_lower("v") && !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { RegNo = VRegs[IntVal]; + } else if (Name.startswith_lower("q") && + !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { + RegNo = QFRegs[IntVal]; } else if (Name.startswith_lower("cr") && !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 8) { RegNo = CRRegs[IntVal]; diff --git a/llvm/lib/Target/PowerPC/CMakeLists.txt b/llvm/lib/Target/PowerPC/CMakeLists.txt index 5a06faa16be19..91021d4e584e1 100644 --- a/llvm/lib/Target/PowerPC/CMakeLists.txt +++ b/llvm/lib/Target/PowerPC/CMakeLists.txt @@ -36,6 +36,7 @@ add_llvm_target(PowerPCCodeGen PPCMacroFusion.cpp PPCMIPeephole.cpp PPCRegisterInfo.cpp + PPCQPXLoadSplat.cpp PPCSubtarget.cpp PPCTargetMachine.cpp PPCTargetObjectFile.cpp diff --git a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 362ddf7204557..74c6fd3733f03 100644 --- 
a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -167,6 +167,12 @@ static DecodeStatus DecodeG8RC_NOX0RegisterClass(MCInst &Inst, uint64_t RegNo, #define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass #define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass +static DecodeStatus DecodeQFRCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, QFRegs); +} + static DecodeStatus DecodeSPERCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { @@ -395,9 +401,14 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // Read the instruction in the proper endianness. uint64_t Inst = ReadFunc(Bytes.data()); - if (STI.getFeatureBits()[PPC::FeatureSPE]) { + if (STI.getFeatureBits()[PPC::FeatureQPX]) { + DecodeStatus result = + decodeInstruction(DecoderTableQPX32, MI, Inst, Address, this, STI); + if (result != MCDisassembler::Fail) + return result; + } else if (STI.getFeatureBits()[PPC::FeatureSPE]) { DecodeStatus result = - decodeInstruction(DecoderTableSPE32, MI, Inst, Address, this, STI); + decodeInstruction(DecoderTableSPE32, MI, Inst, Address, this, STI); if (result != MCDisassembler::Fail) return result; } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp index ce1a43a0c25b2..222bf2fa82836 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp @@ -49,6 +49,18 @@ FullRegNamesWithPercent("ppc-reg-with-percent-prefix", cl::Hidden, void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { const char *RegName = getRegisterName(RegNo); + if (RegName[0] == 'q' /* QPX */) { + // The system toolchain on the BG/Q does not understand QPX register names + // in .cfi_* directives, so print the name of the floating-point + // subregister instead. 
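      // For example, "q5" is printed as "f5" here; because each QPX register
      // QFn overlays the scalar FPR Fn, the .cfi_* directive still refers to
      // the same underlying register state.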
+ std::string RN(RegName); + + RN[0] = 'f'; + OS << RN; + + return; + } + OS << RegName; } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index 325ede0fc17ac..719e005d98135 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -159,6 +159,7 @@ using llvm::MCPhysReg; static const MCPhysReg SPERegs[32] = PPC_REGS0_31(PPC::S); \ static const MCPhysReg VFRegs[32] = PPC_REGS0_31(PPC::VF); \ static const MCPhysReg VRegs[32] = PPC_REGS0_31(PPC::V); \ + static const MCPhysReg QFRegs[32] = PPC_REGS0_31(PPC::QF); \ static const MCPhysReg RRegsNoR0[32] = \ PPC_REGS_NO0_31(PPC::ZERO, PPC::R); \ static const MCPhysReg XRegsNoX0[32] = \ diff --git a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h index 24a9d419d3ea5..3106290442afa 100644 --- a/llvm/lib/Target/PowerPC/PPC.h +++ b/llvm/lib/Target/PowerPC/PPC.h @@ -44,6 +44,7 @@ namespace llvm { FunctionPass *createPPCMIPeepholePass(); FunctionPass *createPPCBranchSelectionPass(); FunctionPass *createPPCBranchCoalescingPass(); + FunctionPass *createPPCQPXLoadSplatPass(); FunctionPass *createPPCISelDag(PPCTargetMachine &TM, CodeGenOpt::Level OL); FunctionPass *createPPCTLSDynamicCallPass(); FunctionPass *createPPCBoolRetToIntPass(); @@ -67,6 +68,7 @@ namespace llvm { void initializePPCReduceCRLogicalsPass(PassRegistry&); void initializePPCBSelPass(PassRegistry&); void initializePPCBranchCoalescingPass(PassRegistry&); + void initializePPCQPXLoadSplatPass(PassRegistry&); void initializePPCBoolRetToIntPass(PassRegistry&); void initializePPCExpandISELPass(PassRegistry &); void initializePPCPreEmitPeepholePass(PassRegistry &); diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index adb9366217d51..9ad78bf67fe6c 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -132,6 +132,9 @@ def FeaturePPC4xx : SubtargetFeature<"ppc4xx", "IsPPC4xx", "true", "Enable PPC 4xx instructions">; def FeaturePPC6xx : SubtargetFeature<"ppc6xx", "IsPPC6xx", "true", "Enable PPC 6xx instructions">; +def FeatureQPX : SubtargetFeature<"qpx","HasQPX", "true", + "Enable QPX instructions", + [FeatureFPU]>; def FeatureVSX : SubtargetFeature<"vsx","HasVSX", "true", "Enable VSX instructions", [FeatureAltivec]>; @@ -190,7 +193,7 @@ def FeatureFloat128 : def FeaturePOPCNTD : SubtargetFeature<"popcntd","HasPOPCNTD", "POPCNTD_Fast", "Enable the popcnt[dw] instructions">; -// Note that for the a2 processor models we should not use popcnt[dw] by +// Note that for the a2/a2q processor models we should not use popcnt[dw] by // default. These processors do support the instructions, but they're // microcoded, and the software emulation is about twice as fast. 
def FeatureSlowPOPCNTD : SubtargetFeature<"slow-popcntd","HasPOPCNTD", @@ -511,6 +514,15 @@ def : ProcessorModel<"a2", PPCA2Model, FeatureFPRND, FeatureFPCVT, FeatureISEL, FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX, Feature64Bit /*, Feature64BitRegs */, FeatureMFTB]>; +def : ProcessorModel<"a2q", PPCA2Model, + [DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF, + FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES, + FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec, + FeatureSTFIWX, FeatureLFIWAX, + FeatureFPRND, FeatureFPCVT, FeatureISEL, + FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX, + Feature64Bit /*, Feature64BitRegs */, FeatureQPX, + FeatureMFTB]>; def : ProcessorModel<"pwr3", G5Model, [DirectivePwr3, FeatureAltivec, FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF, diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 5affddd8d147a..540e620a845bc 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -549,6 +549,9 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { if (Subtarget->hasSPE()) { if (PPC::F4RCRegClass.contains(Reg) || PPC::F8RCRegClass.contains(Reg) || + PPC::QBRCRegClass.contains(Reg) || + PPC::QFRCRegClass.contains(Reg) || + PPC::QSRCRegClass.contains(Reg) || PPC::VFRCRegClass.contains(Reg) || PPC::VRRCRegClass.contains(Reg) || PPC::VSFRCRegClass.contains(Reg) || diff --git a/llvm/lib/Target/PowerPC/PPCCallingConv.td b/llvm/lib/Target/PowerPC/PPCCallingConv.td index 9a15490f1fb0d..1eaa7f7a44b39 100644 --- a/llvm/lib/Target/PowerPC/PPCCallingConv.td +++ b/llvm/lib/Target/PowerPC/PPCCallingConv.td @@ -61,6 +61,9 @@ def RetCC_PPC_Cold : CallingConv<[ CCIfType<[f64], CCAssignToReg<[F1]>>, CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2]>>>, + CCIfType<[v4f64, v4f32, v4i1], + CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1]>>>, + CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2]>>> @@ -95,6 +98,10 @@ def RetCC_PPC : CallingConv<[ CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>, + // QPX vectors are returned in QF1 and QF2. + CCIfType<[v4f64, v4f32, v4i1], + CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>, + // Vector types returned as "direct" go into V2 .. V9; note that only the // ELFv2 ABI fully utilizes all these registers. CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], @@ -151,6 +158,8 @@ def RetCC_PPC64_ELF_FIS : CallingConv<[ CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>, + CCIfType<[v4f64, v4f32, v4i1], + CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>, CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>> @@ -214,6 +223,9 @@ def CC_PPC32_SVR4_Common : CallingConv<[ CCIfType<[f32], CCIfSubtarget<"hasSPE()", CCAssignToStack<4, 4>>>, CCIfType<[f64], CCIfSubtarget<"hasSPE()", CCAssignToStack<8, 8>>>, + // QPX vectors that are stored in double precision need 32-byte alignment. + CCIfType<[v4f64, v4i1], CCAssignToStack<32, 32>>, + // Vectors and float128 get 16-byte stack slots that are 16-byte aligned. 
CCIfType<[v16i8, v8i16, v4i32, v4f32, v2f64, v2i64], CCAssignToStack<16, 16>>, CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToStack<16, 16>>> @@ -231,6 +243,10 @@ def CC_PPC32_SVR4_VarArg : CallingConv<[ // put vector arguments in vector registers before putting them on the stack. let Entry = 1 in def CC_PPC32_SVR4 : CallingConv<[ + // QPX vectors mirror the scalar FP convention. + CCIfType<[v4f64, v4f32, v4i1], CCIfSubtarget<"hasQPX()", + CCAssignToReg<[QF1, QF2, QF3, QF4, QF5, QF6, QF7, QF8]>>>, + // The first 12 Vector arguments are passed in AltiVec registers. CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 3e218e14d8d44..8ffd89ef5ccd2 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -4142,7 +4142,7 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) { // Altivec Vector compare instructions do not set any CR register by default and // vector compare operations return the same type as the operands. if (LHS.getValueType().isVector()) { - if (Subtarget->hasSPE()) + if (Subtarget->hasQPX() || Subtarget->hasSPE()) return false; EVT VecVT = LHS.getValueType(); @@ -4813,6 +4813,8 @@ void PPCDAGToDAGISel::Select(SDNode *N) { assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load"); switch (LoadedVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Invalid PPC load type!"); + case MVT::v4f64: Opcode = PPC::QVLFDUX; break; // QPX + case MVT::v4f32: Opcode = PPC::QVLFSUX; break; // QPX case MVT::f64: Opcode = PPC::LFDUX; break; case MVT::f32: Opcode = PPC::LFSUX; break; case MVT::i32: Opcode = PPC::LWZUX; break; @@ -5093,6 +5095,12 @@ void PPCDAGToDAGISel::Select(SDNode *N) { SelectCCOp = PPC::SELECT_CC_F16; else if (Subtarget->hasSPE()) SelectCCOp = PPC::SELECT_CC_SPE; + else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4f64) + SelectCCOp = PPC::SELECT_CC_QFRC; + else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4f32) + SelectCCOp = PPC::SELECT_CC_QSRC; + else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4i1) + SelectCCOp = PPC::SELECT_CC_QBRC; else if (N->getValueType(0) == MVT::v2f64 || N->getValueType(0) == MVT::v2i64) SelectCCOp = PPC::SELECT_CC_VSRC; @@ -5848,6 +5856,9 @@ void PPCDAGToDAGISel::PeepholeCROps() { case PPC::SELECT_I8: case PPC::SELECT_F4: case PPC::SELECT_F8: + case PPC::SELECT_QFRC: + case PPC::SELECT_QSRC: + case PPC::SELECT_QBRC: case PPC::SELECT_SPE: case PPC::SELECT_SPE4: case PPC::SELECT_VRRC: @@ -6166,6 +6177,9 @@ void PPCDAGToDAGISel::PeepholeCROps() { case PPC::SELECT_I8: case PPC::SELECT_F4: case PPC::SELECT_F8: + case PPC::SELECT_QFRC: + case PPC::SELECT_QSRC: + case PPC::SELECT_QBRC: case PPC::SELECT_SPE: case PPC::SELECT_SPE4: case PPC::SELECT_VRRC: diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index db3833d595797..ae840a9fa37de 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1095,6 +1095,161 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, } } + if (Subtarget.hasQPX()) { + setOperationAction(ISD::FADD, MVT::v4f64, Legal); + setOperationAction(ISD::FSUB, MVT::v4f64, Legal); + setOperationAction(ISD::FMUL, MVT::v4f64, Legal); + setOperationAction(ISD::FREM, MVT::v4f64, Expand); + + setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, 
Legal); + setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand); + + setOperationAction(ISD::LOAD , MVT::v4f64, Custom); + setOperationAction(ISD::STORE , MVT::v4f64, Custom); + + setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom); + + if (!Subtarget.useCRBits()) + setOperationAction(ISD::SELECT, MVT::v4f64, Expand); + setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal); + setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand); + setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand); + setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom); + + setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal); + setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand); + + setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal); + + setOperationAction(ISD::FNEG , MVT::v4f64, Legal); + setOperationAction(ISD::FABS , MVT::v4f64, Legal); + setOperationAction(ISD::FSIN , MVT::v4f64, Expand); + setOperationAction(ISD::FCOS , MVT::v4f64, Expand); + setOperationAction(ISD::FPOW , MVT::v4f64, Expand); + setOperationAction(ISD::FLOG , MVT::v4f64, Expand); + setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand); + setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand); + setOperationAction(ISD::FEXP , MVT::v4f64, Expand); + setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand); + + setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal); + + setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal); + setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal); + + addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass); + + setOperationAction(ISD::FADD, MVT::v4f32, Legal); + setOperationAction(ISD::FSUB, MVT::v4f32, Legal); + setOperationAction(ISD::FMUL, MVT::v4f32, Legal); + setOperationAction(ISD::FREM, MVT::v4f32, Expand); + + setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal); + setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand); + + setOperationAction(ISD::LOAD , MVT::v4f32, Custom); + setOperationAction(ISD::STORE , MVT::v4f32, Custom); + + if (!Subtarget.useCRBits()) + setOperationAction(ISD::SELECT, MVT::v4f32, Expand); + setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal); + setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand); + setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand); + setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); + + setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal); + setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand); + + setOperationAction(ISD::FNEG , MVT::v4f32, Legal); + setOperationAction(ISD::FABS , MVT::v4f32, Legal); + setOperationAction(ISD::FSIN , MVT::v4f32, Expand); + setOperationAction(ISD::FCOS , MVT::v4f32, Expand); + setOperationAction(ISD::FPOW , MVT::v4f32, Expand); + setOperationAction(ISD::FLOG , MVT::v4f32, Expand); + setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand); + setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand); + 
setOperationAction(ISD::FEXP , MVT::v4f32, Expand); + setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand); + + setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); + + setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal); + setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal); + + addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass); + + setOperationAction(ISD::AND , MVT::v4i1, Legal); + setOperationAction(ISD::OR , MVT::v4i1, Legal); + setOperationAction(ISD::XOR , MVT::v4i1, Legal); + + if (!Subtarget.useCRBits()) + setOperationAction(ISD::SELECT, MVT::v4i1, Expand); + setOperationAction(ISD::VSELECT, MVT::v4i1, Legal); + + setOperationAction(ISD::LOAD , MVT::v4i1, Custom); + setOperationAction(ISD::STORE , MVT::v4i1, Custom); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand); + setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand); + setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); + + setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom); + + addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass); + + setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); + setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); + setOperationAction(ISD::FROUND, MVT::v4f64, Legal); + + setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); + setOperationAction(ISD::FROUND, MVT::v4f32, Legal); + + setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); + + // These need to set FE_INEXACT, and so cannot be vectorized here. + setOperationAction(ISD::FRINT, MVT::v4f64, Expand); + setOperationAction(ISD::FRINT, MVT::v4f32, Expand); + + if (TM.Options.UnsafeFPMath) { + setOperationAction(ISD::FDIV, MVT::v4f64, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); + + setOperationAction(ISD::FDIV, MVT::v4f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); + } else { + setOperationAction(ISD::FDIV, MVT::v4f64, Expand); + setOperationAction(ISD::FSQRT, MVT::v4f64, Expand); + + setOperationAction(ISD::FDIV, MVT::v4f32, Expand); + setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); + } + + // TODO: Handle constrained floating-point operations of v4f64 + } + if (Subtarget.has64BitSupport()) setOperationAction(ISD::PREFETCH, MVT::Other, Legal); @@ -1283,8 +1438,8 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty, // 16byte and wider vectors are passed on 16byte boundary. // The rest is 8 on PPC64 and 4 on PPC32 boundary. Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4); - if (Subtarget.hasAltivec()) - getMaxByValAlign(Ty, Alignment, Align(16)); + if (Subtarget.hasAltivec() || Subtarget.hasQPX()) + getMaxByValAlign(Ty, Alignment, Subtarget.hasQPX() ? 
Align(32) : Align(16)); return Alignment.value(); } @@ -1422,6 +1577,12 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; case PPCISD::VABSD: return "PPCISD::VABSD"; + case PPCISD::QVFPERM: return "PPCISD::QVFPERM"; + case PPCISD::QVGPCI: return "PPCISD::QVGPCI"; + case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI"; + case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI"; + case PPCISD::QBFLT: return "PPCISD::QBFLT"; + case PPCISD::QVLFSb: return "PPCISD::QVLFSb"; case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64"; case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE"; @@ -1440,6 +1601,9 @@ EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C, if (!VT.isVector()) return Subtarget.useCRBits() ? MVT::i1 : MVT::i32; + if (Subtarget.hasQPX()) + return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements()); + return VT.changeVectorElementTypeToInteger(); } @@ -2613,9 +2777,16 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, return false; } - // PowerPC doesn't have preinc load/store instructions for vectors - if (VT.isVector()) - return false; + // PowerPC doesn't have preinc load/store instructions for vectors (except + // for QPX, which does have preinc r+r forms). + if (VT.isVector()) { + if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) { + return false; + } else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) { + AM = ISD::PRE_INC; + return true; + } + } if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) { // Common code will reject creating a pre-inc form if the base pointer @@ -3337,6 +3508,11 @@ static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13}; +/// QFPR - The set of QPX registers that should be allocated for arguments. +static const MCPhysReg QFPR[] = { + PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7, + PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13}; + /// CalculateStackSlotSize - Calculates the size reserved for this argument on /// the stack. static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags, @@ -3366,6 +3542,10 @@ static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 || ArgVT == MVT::v1i128 || ArgVT == MVT::f128) Alignment = Align(16); + // QPX vector types stored in double-precision are padded to a 32 byte + // boundary. + else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1) + Alignment = Align(32); // ByVal parameters are aligned as requested. if (Flags.isByVal()) { @@ -3397,11 +3577,14 @@ static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, /// stack slot (instead of being passed in registers). ArgOffset, /// AvailableFPRs, and AvailableVRs must hold the current argument /// position, and will be updated to account for this argument. 
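/// (With HasQPX set, the QPX vector types v4f32, v4f64, and v4i1 also draw on
/// the FPR count below, because the QPX registers overlay the scalar FPRs.)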
-static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, ISD::ArgFlagsTy Flags, - unsigned PtrByteSize, unsigned LinkageSize, - unsigned ParamAreaSize, unsigned &ArgOffset, +static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, + ISD::ArgFlagsTy Flags, + unsigned PtrByteSize, + unsigned LinkageSize, + unsigned ParamAreaSize, + unsigned &ArgOffset, unsigned &AvailableFPRs, - unsigned &AvailableVRs) { + unsigned &AvailableVRs, bool HasQPX) { bool UseMemory = false; // Respect alignment of argument on the stack. @@ -3425,7 +3608,11 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, ISD::ArgFlagsTy Flags, // However, if the argument is actually passed in an FPR or a VR, // we don't use memory after all. if (!Flags.isByVal()) { - if (ArgVT == MVT::f32 || ArgVT == MVT::f64) + if (ArgVT == MVT::f32 || ArgVT == MVT::f64 || + // QPX registers overlap with the scalar FP registers. + (HasQPX && (ArgVT == MVT::v4f32 || + ArgVT == MVT::v4f64 || + ArgVT == MVT::v4i1))) if (AvailableFPRs > 0) { --AvailableFPRs; return false; @@ -3564,12 +3751,18 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( RC = &PPC::VRRCRegClass; break; case MVT::v4f32: - RC = &PPC::VRRCRegClass; + RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass; break; case MVT::v2f64: case MVT::v2i64: RC = &PPC::VRRCRegClass; break; + case MVT::v4f64: + RC = &PPC::QFRCRegClass; + break; + case MVT::v4i1: + RC = &PPC::QBRCRegClass; + break; } SDValue ArgValue; @@ -3768,6 +3961,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( const unsigned Num_GPR_Regs = array_lengthof(GPR); const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13; const unsigned Num_VR_Regs = array_lengthof(VR); + const unsigned Num_QFPR_Regs = Num_FPR_Regs; // Do a first pass over the arguments to determine whether the ABI // guarantees that our caller has allocated the parameter save area @@ -3786,7 +3980,8 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags, PtrByteSize, LinkageSize, ParamAreaSize, - NumBytes, AvailableFPRs, AvailableVRs)) + NumBytes, AvailableFPRs, AvailableVRs, + Subtarget.hasQPX())) HasParameterArea = true; } @@ -3796,6 +3991,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( unsigned ArgOffset = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; + unsigned &QFPR_idx = FPR_idx; SmallVector MemOps; Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); unsigned CurArgIdx = 0; @@ -4038,20 +4234,51 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::v2i64: case MVT::v1i128: case MVT::f128: - // These can be scalar arguments or elements of a vector array type - // passed directly. The latter are used to implement ELFv2 homogenous - // vector aggregates. - if (VR_idx != Num_VR_Regs) { - unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); + if (!Subtarget.hasQPX()) { + // These can be scalar arguments or elements of a vector array type + // passed directly. The latter are used to implement ELFv2 homogenous + // vector aggregates. 
+ if (VR_idx != Num_VR_Regs) { + unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); + ++VR_idx; + } else { + if (CallConv == CallingConv::Fast) + ComputeArgOffset(); + needsLoad = true; + } + if (CallConv != CallingConv::Fast || needsLoad) + ArgOffset += 16; + break; + } // not QPX + + assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 && + "Invalid QPX parameter type"); + LLVM_FALLTHROUGH; + + case MVT::v4f64: + case MVT::v4i1: + // QPX vectors are treated like their scalar floating-point subregisters + // (except that they're larger). + unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 16 : 32; + if (QFPR_idx != Num_QFPR_Regs) { + const TargetRegisterClass *RC; + switch (ObjectVT.getSimpleVT().SimpleTy) { + case MVT::v4f64: RC = &PPC::QFRCRegClass; break; + case MVT::v4f32: RC = &PPC::QSRCRegClass; break; + default: RC = &PPC::QBRCRegClass; break; + } + + unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); - ++VR_idx; + ++QFPR_idx; } else { if (CallConv == CallingConv::Fast) ComputeArgOffset(); needsLoad = true; } if (CallConv != CallingConv::Fast || needsLoad) - ArgOffset += 16; + ArgOffset += Sz; break; } @@ -4604,9 +4831,10 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget, for (const ISD::OutputArg& Param : Outs) { if (Param.Flags.isNest()) continue; - if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, PtrByteSize, - LinkageSize, ParamAreaSize, NumBytes, - AvailableFPRs, AvailableVRs)) + if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, + PtrByteSize, LinkageSize, ParamAreaSize, + NumBytes, AvailableFPRs, AvailableVRs, + Subtarget.hasQPX())) return true; } return false; @@ -5836,6 +6064,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); unsigned NumBytes = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; + unsigned &QFPR_idx = FPR_idx; static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, @@ -5849,6 +6078,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( const unsigned NumGPRs = array_lengthof(GPR); const unsigned NumFPRs = useSoftFloat() ? 0 : 13; const unsigned NumVRs = array_lengthof(VR); + const unsigned NumQFPRs = NumFPRs; // On ELFv2, we can avoid allocating the parameter area if all the arguments // can be passed to the callee in registers. @@ -5863,8 +6093,9 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( for (unsigned i = 0; i != NumOps; ++i) { if (Outs[i].Flags.isNest()) continue; if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags, - PtrByteSize, LinkageSize, ParamAreaSize, - NumBytesTmp, AvailableFPRs, AvailableVRs)) + PtrByteSize, LinkageSize, ParamAreaSize, + NumBytesTmp, AvailableFPRs, AvailableVRs, + Subtarget.hasQPX())) HasParameterArea = true; } } @@ -5912,11 +6143,20 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( continue; break; case MVT::v4f32: - if (++NumVRsUsed <= NumVRs) - continue; + // When using QPX, this is handled like a FP register, otherwise, it + // is an Altivec register. 
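          // (A v4f32 fits in a single QPX register, and QPX register usage is
          // tracked through the FPR counters since QFn overlays Fn.)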
+ if (Subtarget.hasQPX()) { + if (++NumFPRsUsed <= NumFPRs) + continue; + } else { + if (++NumVRsUsed <= NumVRs) + continue; + } break; case MVT::f32: case MVT::f64: + case MVT::v4f64: // QPX + case MVT::v4i1: // QPX if (++NumFPRsUsed <= NumFPRs) continue; break; @@ -6278,6 +6518,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( case MVT::v2i64: case MVT::v1i128: case MVT::f128: + if (!Subtarget.hasQPX()) { // These can be scalar arguments or elements of a vector array type // passed directly. The latter are used to implement ELFv2 homogenous // vector aggregates. @@ -6333,6 +6574,63 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( if (!IsFastCall) ArgOffset += 16; break; + } // not QPX + + assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 && + "Invalid QPX parameter type"); + + LLVM_FALLTHROUGH; + case MVT::v4f64: + case MVT::v4i1: { + bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32; + if (CFlags.IsVarArg) { + assert(HasParameterArea && + "Parameter area must exist if we have a varargs call."); + // We could elide this store in the case where the object fits + // entirely in R registers. Maybe later. + SDValue Store = + DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); + MemOpChains.push_back(Store); + if (QFPR_idx != NumQFPRs) { + SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store, + PtrOff, MachinePointerInfo()); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load)); + } + ArgOffset += (IsF32 ? 16 : 32); + for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) { + if (GPR_idx == NumGPRs) + break; + SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, + DAG.getConstant(i, dl, PtrVT)); + SDValue Load = + DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + } + break; + } + + // Non-varargs QPX params go into registers or on the stack. + if (QFPR_idx != NumQFPRs) { + RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg)); + } else { + if (IsFastCall) + ComputePtrOff(); + + assert(HasParameterArea && + "Parameter area must exist to pass an argument in memory."); + LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, + true, CFlags.IsTailCall, true, MemOpChains, + TailCallArguments, dl); + if (IsFastCall) + ArgOffset += (IsF32 ? 16 : 32); + } + + if (!IsFastCall) + ArgOffset += (IsF32 ? 16 : 32); + break; + } } } @@ -7003,6 +7301,8 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( const PPCSubtarget &Subtarget = static_cast(DAG.getSubtarget()); + if (Subtarget.hasQPX()) + report_fatal_error("QPX support is not supported on AIX."); const bool IsPPC64 = Subtarget.isPPC64(); const unsigned PtrByteSize = IsPPC64 ? 
8 : 4; @@ -7222,6 +7522,8 @@ SDValue PPCTargetLowering::LowerCall_AIX( const PPCSubtarget& Subtarget = static_cast(DAG.getSubtarget()); + if (Subtarget.hasQPX()) + report_fatal_error("QPX is not supported on AIX."); if (Subtarget.hasAltivec()) report_fatal_error("Altivec support is unimplemented on AIX."); @@ -7689,6 +7991,8 @@ SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, } SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType().isVector()) + return LowerVectorLoad(Op, DAG); assert(Op.getValueType() == MVT::i1 && "Custom lowering only for i1 loads"); @@ -7712,6 +8016,9 @@ SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { } SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + if (Op.getOperand(1).getValueType().isVector()) + return LowerVectorStore(Op, DAG); + assert(Op.getOperand(1).getValueType() == MVT::i1 && "Custom lowering only for i1 stores"); @@ -8288,6 +8595,27 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, if (Op.getValueType() == MVT::f128) return Op; + if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) { + if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64) + return SDValue(); + + SDValue Value = Op.getOperand(0); + // The values are now known to be -1 (false) or 1 (true). To convert this + // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). + // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 + Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); + + SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); + + Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); + + if (Op.getValueType() != MVT::v4f64) + Value = DAG.getNode(ISD::FP_ROUND, dl, + Op.getValueType(), Value, + DAG.getIntPtrConstant(1, dl)); + return Value; + } + // Don't handle ppc_fp128 here; let it be lowered to a libcall. if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) return SDValue(); @@ -8856,6 +9184,110 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR"); + if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) { + // We first build an i32 vector, load it into a QPX register, + // then convert it to a floating-point vector and compare it + // to a zero vector to get the boolean result. 
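    // Roughly: spill the i32 elements to a stack slot, reload them as a
    // vector via the ppc_qpx_qvlfiwz intrinsic, convert to floating point
    // with ppc_qpx_qvfcfidu, and form the v4i1 mask with a floating-point
    // compare against zero.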
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + int FrameIdx = MFI.CreateStackObject(16, Align(16), false); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); + + assert(BVN->getNumOperands() == 4 && + "BUILD_VECTOR for v4i1 does not have 4 operands"); + + bool IsConst = true; + for (unsigned i = 0; i < 4; ++i) { + if (BVN->getOperand(i).isUndef()) continue; + if (!isa(BVN->getOperand(i))) { + IsConst = false; + break; + } + } + + if (IsConst) { + Constant *One = + ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0); + Constant *NegOne = + ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0); + + Constant *CV[4]; + for (unsigned i = 0; i < 4; ++i) { + if (BVN->getOperand(i).isUndef()) + CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext())); + else if (isNullConstant(BVN->getOperand(i))) + CV[i] = NegOne; + else + CV[i] = One; + } + + Constant *CP = ConstantVector::get(CV); + SDValue CPIdx = + DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), Align(16)); + + SDValue Ops[] = {DAG.getEntryNode(), CPIdx}; + SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other}); + return DAG.getMemIntrinsicNode( + PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); + } + + SmallVector Stores; + for (unsigned i = 0; i < 4; ++i) { + if (BVN->getOperand(i).isUndef()) continue; + + unsigned Offset = 4*i; + SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); + Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); + + unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize(); + if (StoreSize > 4) { + Stores.push_back( + DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx, + PtrInfo.getWithOffset(Offset), MVT::i32)); + } else { + SDValue StoreValue = BVN->getOperand(i); + if (StoreSize < 4) + StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue); + + Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx, + PtrInfo.getWithOffset(Offset))); + } + } + + SDValue StoreChain; + if (!Stores.empty()) + StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); + else + StoreChain = DAG.getEntryNode(); + + // Now load from v4i32 into the QPX register; this will extend it to + // v4i64 but not yet convert it to a floating point. Nevertheless, this + // is typed as v4f64 because the QPX register integer states are not + // explicitly represented. + + SDValue Ops[] = {StoreChain, + DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32), + FIdx}; + SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other}); + + SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, + dl, VTs, Ops, MVT::v4i32, PtrInfo); + LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, + DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32), + LoadedVect); + + SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64); + + return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ); + } + + // All other QPX vectors are handled by generic code. + if (Subtarget.hasQPX()) + return SDValue(); + // Check if this is a splat of a constant value. 
APInt APSplatBits, APSplatUndef; unsigned SplatBitSize; @@ -9648,6 +10080,42 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, } } + if (Subtarget.hasQPX()) { + if (VT.getVectorNumElements() != 4) + return SDValue(); + + if (V2.isUndef()) V2 = V1; + + int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp); + if (AlignIdx != -1) { + return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2, + DAG.getConstant(AlignIdx, dl, MVT::i32)); + } else if (SVOp->isSplat()) { + int SplatIdx = SVOp->getSplatIndex(); + if (SplatIdx >= 4) { + std::swap(V1, V2); + SplatIdx -= 4; + } + + return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1, + DAG.getConstant(SplatIdx, dl, MVT::i32)); + } + + // Lower this into a qvgpci/qvfperm pair. + + // Compute the qvgpci literal + unsigned idx = 0; + for (unsigned i = 0; i < 4; ++i) { + int m = SVOp->getMaskElt(i); + unsigned mm = m >= 0 ? (unsigned) m : i; + idx |= mm << (3-i)*3; + } + + SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64, + DAG.getConstant(idx, dl, MVT::i32)); + return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3); + } + // Cases that are handled by instructions that take permute immediates // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be // selected by the instruction selector. @@ -10235,6 +10703,279 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, return Op; } +SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + SDNode *N = Op.getNode(); + + assert(N->getOperand(0).getValueType() == MVT::v4i1 && + "Unknown extract_vector_elt type"); + + SDValue Value = N->getOperand(0); + + // The first part of this is like the store lowering except that we don't + // need to track the chain. + + // The values are now known to be -1 (false) or 1 (true). To convert this + // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). + // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 + Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); + + // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to + // understand how to form the extending load. + SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); + + Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); + + // Now convert to an integer and store. + Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, + DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), + Value); + + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + int FrameIdx = MFI.CreateStackObject(16, Align(16), false); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); + + SDValue StoreChain = DAG.getEntryNode(); + SDValue Ops[] = {StoreChain, + DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), + Value, FIdx}; + SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); + + StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, + dl, VTs, Ops, MVT::v4i32, PtrInfo); + + // Extract the value requested. 
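    // Each lane was spilled as a 32-bit word by the qvstfiw above, so lane N
    // of the v4i1 source lives at byte offset 4*N of the stack slot.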
+ unsigned Offset = 4*cast(N->getOperand(1))->getZExtValue(); + SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); + Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); + + SDValue IntVal = + DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset)); + + if (!Subtarget.useCRBits()) + return IntVal; + + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal); +} + +/// Lowering for QPX v4i1 loads +SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + LoadSDNode *LN = cast(Op.getNode()); + SDValue LoadChain = LN->getChain(); + SDValue BasePtr = LN->getBasePtr(); + + if (Op.getValueType() == MVT::v4f64 || + Op.getValueType() == MVT::v4f32) { + EVT MemVT = LN->getMemoryVT(); + unsigned Alignment = LN->getAlignment(); + + // If this load is properly aligned, then it is legal. + if (Alignment >= MemVT.getStoreSize()) + return Op; + + EVT ScalarVT = Op.getValueType().getScalarType(), + ScalarMemVT = MemVT.getScalarType(); + unsigned Stride = ScalarMemVT.getStoreSize(); + + SDValue Vals[4], LoadChains[4]; + for (unsigned Idx = 0; Idx < 4; ++Idx) { + SDValue Load; + if (ScalarVT != ScalarMemVT) + Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain, + BasePtr, + LN->getPointerInfo().getWithOffset(Idx * Stride), + ScalarMemVT, MinAlign(Alignment, Idx * Stride), + LN->getMemOperand()->getFlags(), LN->getAAInfo()); + else + Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr, + LN->getPointerInfo().getWithOffset(Idx * Stride), + MinAlign(Alignment, Idx * Stride), + LN->getMemOperand()->getFlags(), LN->getAAInfo()); + + if (Idx == 0 && LN->isIndexed()) { + assert(LN->getAddressingMode() == ISD::PRE_INC && + "Unknown addressing mode on vector load"); + Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(), + LN->getAddressingMode()); + } + + Vals[Idx] = Load; + LoadChains[Idx] = Load.getValue(1); + + BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, + DAG.getConstant(Stride, dl, + BasePtr.getValueType())); + } + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); + SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals); + + if (LN->isIndexed()) { + SDValue RetOps[] = { Value, Vals[0].getValue(1), TF }; + return DAG.getMergeValues(RetOps, dl); + } + + SDValue RetOps[] = { Value, TF }; + return DAG.getMergeValues(RetOps, dl); + } + + assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower"); + assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported"); + + // To lower v4i1 from a byte array, we load the byte elements of the + // vector and then reuse the BUILD_VECTOR logic. 
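  // (In memory a v4i1 occupies four bytes, one per lane; each byte is loaded
  // and extended to i32 so the v4i1 BUILD_VECTOR below can consume it.)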
+ + SDValue VectElmts[4], VectElmtChains[4]; + for (unsigned i = 0; i < 4; ++i) { + SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); + Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); + + VectElmts[i] = DAG.getExtLoad( + ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx, + LN->getPointerInfo().getWithOffset(i), MVT::i8, + /* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo()); + VectElmtChains[i] = VectElmts[i].getValue(1); + } + + LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains); + SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts); + + SDValue RVals[] = { Value, LoadChain }; + return DAG.getMergeValues(RVals, dl); +} + +/// Lowering for QPX v4i1 stores +SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + StoreSDNode *SN = cast(Op.getNode()); + SDValue StoreChain = SN->getChain(); + SDValue BasePtr = SN->getBasePtr(); + SDValue Value = SN->getValue(); + + if (Value.getValueType() == MVT::v4f64 || + Value.getValueType() == MVT::v4f32) { + EVT MemVT = SN->getMemoryVT(); + unsigned Alignment = SN->getAlignment(); + + // If this store is properly aligned, then it is legal. + if (Alignment >= MemVT.getStoreSize()) + return Op; + + EVT ScalarVT = Value.getValueType().getScalarType(), + ScalarMemVT = MemVT.getScalarType(); + unsigned Stride = ScalarMemVT.getStoreSize(); + + SDValue Stores[4]; + for (unsigned Idx = 0; Idx < 4; ++Idx) { + SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value, + DAG.getVectorIdxConstant(Idx, dl)); + SDValue Store; + if (ScalarVT != ScalarMemVT) + Store = + DAG.getTruncStore(StoreChain, dl, Ex, BasePtr, + SN->getPointerInfo().getWithOffset(Idx * Stride), + ScalarMemVT, MinAlign(Alignment, Idx * Stride), + SN->getMemOperand()->getFlags(), SN->getAAInfo()); + else + Store = DAG.getStore(StoreChain, dl, Ex, BasePtr, + SN->getPointerInfo().getWithOffset(Idx * Stride), + MinAlign(Alignment, Idx * Stride), + SN->getMemOperand()->getFlags(), SN->getAAInfo()); + + if (Idx == 0 && SN->isIndexed()) { + assert(SN->getAddressingMode() == ISD::PRE_INC && + "Unknown addressing mode on vector store"); + Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(), + SN->getAddressingMode()); + } + + BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, + DAG.getConstant(Stride, dl, + BasePtr.getValueType())); + Stores[Idx] = Store; + } + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); + + if (SN->isIndexed()) { + SDValue RetOps[] = { TF, Stores[0].getValue(1) }; + return DAG.getMergeValues(RetOps, dl); + } + + return TF; + } + + assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported"); + assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower"); + + // The values are now known to be -1 (false) or 1 (true). To convert this + // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). + // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 + Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); + + // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to + // understand how to form the extending load. + SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); + + Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); + + // Now convert to an integer and store. 
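    // Worked example of the fma above: a mask <true, false, true, false> is
    // represented as <1.0, -1.0, 1.0, -1.0>, and fma(V, 0.5, 0.5) maps it to
    // <1.0, 0.0, 1.0, 0.0>; qvfctiwu below then yields the words 1, 0, 1, 0.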
+ Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, + DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), + Value); + + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + int FrameIdx = MFI.CreateStackObject(16, Align(16), false); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); + + SDValue Ops[] = {StoreChain, + DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), + Value, FIdx}; + SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); + + StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, + dl, VTs, Ops, MVT::v4i32, PtrInfo); + + // Move data into the byte array. + SDValue Loads[4], LoadChains[4]; + for (unsigned i = 0; i < 4; ++i) { + unsigned Offset = 4*i; + SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); + Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); + + Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx, + PtrInfo.getWithOffset(Offset)); + LoadChains[i] = Loads[i].getValue(1); + } + + StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); + + SDValue Stores[4]; + for (unsigned i = 0; i < 4; ++i) { + SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); + Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); + + Stores[i] = DAG.getTruncStore( + StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i), + MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(), + SN->getAAInfo()); + } + + StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); + + return StoreChain; +} + SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); if (Op.getValueType() == MVT::v4i32) { @@ -10463,6 +11204,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::ABS: return LowerABS(Op, DAG); @@ -11406,6 +12148,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } else if (MI.getOpcode() == PPC::SELECT_CC_F4 || MI.getOpcode() == PPC::SELECT_CC_F8 || MI.getOpcode() == PPC::SELECT_CC_F16 || + MI.getOpcode() == PPC::SELECT_CC_QFRC || + MI.getOpcode() == PPC::SELECT_CC_QSRC || + MI.getOpcode() == PPC::SELECT_CC_QBRC || MI.getOpcode() == PPC::SELECT_CC_VRRC || MI.getOpcode() == PPC::SELECT_CC_VSFRC || MI.getOpcode() == PPC::SELECT_CC_VSSRC || @@ -11415,6 +12160,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 || MI.getOpcode() == PPC::SELECT_F16 || + MI.getOpcode() == PPC::SELECT_QFRC || + MI.getOpcode() == PPC::SELECT_QSRC || + MI.getOpcode() == PPC::SELECT_QBRC || MI.getOpcode() == PPC::SELECT_SPE || MI.getOpcode() == PPC::SELECT_SPE4 || MI.getOpcode() == PPC::SELECT_VRRC || @@ -11452,6 +12200,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.getOpcode() == PPC::SELECT_F16 || MI.getOpcode() == PPC::SELECT_SPE4 || MI.getOpcode() == PPC::SELECT_SPE || + MI.getOpcode() == PPC::SELECT_QFRC || + MI.getOpcode() == PPC::SELECT_QSRC || + 
MI.getOpcode() == PPC::SELECT_QBRC ||
              MI.getOpcode() == PPC::SELECT_VRRC ||
              MI.getOpcode() == PPC::SELECT_VSFRC ||
              MI.getOpcode() == PPC::SELECT_VSSRC ||
@@ -12144,7 +12895,9 @@ SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
   if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
       (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
       (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
-      (VT == MVT::v2f64 && Subtarget.hasVSX())) {
+      (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
+      (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
+      (VT == MVT::v4f64 && Subtarget.hasQPX())) {
 
     if (RefinementSteps == ReciprocalEstimate::Unspecified)
       RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
@@ -12163,7 +12916,9 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
   if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
       (VT == MVT::f64 && Subtarget.hasFRE()) ||
       (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
-      (VT == MVT::v2f64 && Subtarget.hasVSX())) {
+      (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
+      (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
+      (VT == MVT::v4f64 && Subtarget.hasQPX())) {
     if (RefinementSteps == ReciprocalEstimate::Unspecified)
       RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
     return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);
@@ -12261,6 +13016,24 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
     EVT VT;
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
     default: return false;
+    case Intrinsic::ppc_qpx_qvlfd:
+    case Intrinsic::ppc_qpx_qvlfda:
+      VT = MVT::v4f64;
+      break;
+    case Intrinsic::ppc_qpx_qvlfs:
+    case Intrinsic::ppc_qpx_qvlfsa:
+      VT = MVT::v4f32;
+      break;
+    case Intrinsic::ppc_qpx_qvlfcd:
+    case Intrinsic::ppc_qpx_qvlfcda:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_qpx_qvlfcs:
+    case Intrinsic::ppc_qpx_qvlfcsa:
+      VT = MVT::v2f32;
+      break;
+    case Intrinsic::ppc_qpx_qvlfiwa:
+    case Intrinsic::ppc_qpx_qvlfiwz:
     case Intrinsic::ppc_altivec_lvx:
     case Intrinsic::ppc_altivec_lvxl:
     case Intrinsic::ppc_vsx_lxvw4x:
@@ -12289,6 +13062,24 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
     EVT VT;
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
     default: return false;
+    case Intrinsic::ppc_qpx_qvstfd:
+    case Intrinsic::ppc_qpx_qvstfda:
+      VT = MVT::v4f64;
+      break;
+    case Intrinsic::ppc_qpx_qvstfs:
+    case Intrinsic::ppc_qpx_qvstfsa:
+      VT = MVT::v4f32;
+      break;
+    case Intrinsic::ppc_qpx_qvstfcd:
+    case Intrinsic::ppc_qpx_qvstfcda:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_qpx_qvstfcs:
+    case Intrinsic::ppc_qpx_qvstfcsa:
+      VT = MVT::v2f32;
+      break;
+    case Intrinsic::ppc_qpx_qvstfiw:
+    case Intrinsic::ppc_qpx_qvstfiwa:
     case Intrinsic::ppc_altivec_stvx:
     case Intrinsic::ppc_altivec_stvxl:
     case Intrinsic::ppc_vsx_stxvw4x:
@@ -14286,14 +15077,18 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     EVT MemVT = LD->getMemoryVT();
     Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
     Align ABIAlignment = DAG.getDataLayout().getABITypeAlign(Ty);
+    Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext());
+    Align ScalarABIAlignment = DAG.getDataLayout().getABITypeAlign(STy);
     if (LD->isUnindexed() && VT.isVector() &&
         ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
           // P8 and later hardware should just use LOAD.
          !Subtarget.hasP8Vector() &&
          (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
-           VT == MVT::v4f32))) &&
+           VT == MVT::v4f32)) ||
+         (Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) &&
+          LD->getAlign() >= ScalarABIAlignment)) &&
         LD->getAlign() < ABIAlignment) {
-      // This is a type-legal unaligned Altivec load.
+      // This is a type-legal unaligned Altivec or QPX load.
       SDValue Chain = LD->getChain();
       SDValue Ptr = LD->getBasePtr();
       bool isLittleEndian = Subtarget.isLittleEndian();
@@ -14324,13 +15119,24 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       // optimization later.
       Intrinsic::ID Intr, IntrLD, IntrPerm;
       MVT PermCntlTy, PermTy, LDTy;
-      Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr
-                            : Intrinsic::ppc_altivec_lvsl;
-      IntrLD = Intrinsic::ppc_altivec_lvx;
-      IntrPerm = Intrinsic::ppc_altivec_vperm;
-      PermCntlTy = MVT::v16i8;
-      PermTy = MVT::v4i32;
-      LDTy = MVT::v4i32;
+      if (Subtarget.hasAltivec()) {
+        Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr :
+                                Intrinsic::ppc_altivec_lvsl;
+        IntrLD = Intrinsic::ppc_altivec_lvx;
+        IntrPerm = Intrinsic::ppc_altivec_vperm;
+        PermCntlTy = MVT::v16i8;
+        PermTy = MVT::v4i32;
+        LDTy = MVT::v4i32;
+      } else {
+        Intr =   MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld :
+                                       Intrinsic::ppc_qpx_qvlpcls;
+        IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd :
+                                       Intrinsic::ppc_qpx_qvlfs;
+        IntrPerm = Intrinsic::ppc_qpx_qvfperm;
+        PermCntlTy = MVT::v4f64;
+        PermTy = MVT::v4f64;
+        LDTy = MemVT.getSimpleVT();
+      }
 
       SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy);
@@ -14401,10 +15207,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                              BaseLoad, ExtraLoad, PermCntl, DAG, dl);
 
       if (VT != PermTy)
-        Perm = Subtarget.hasAltivec()
-                   ? DAG.getNode(ISD::BITCAST, dl, VT, Perm)
-                   : DAG.getNode(ISD::FP_ROUND, dl, VT, Perm,
-                                 DAG.getTargetConstant(1, dl, MVT::i64));
+        Perm = Subtarget.hasAltivec() ?
+                 DAG.getNode(ISD::BITCAST, dl, VT, Perm) :
+                 DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX
+                             DAG.getTargetConstant(1, dl, MVT::i64));
                              // second argument is 1 because this rounding
                              // is always exact.
 
@@ -14420,10 +15226,14 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
     Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr :
                                            Intrinsic::ppc_altivec_lvsl);
-    if (IID == Intr && N->getOperand(1)->getOpcode() == ISD::ADD) {
+    if ((IID == Intr ||
+         IID == Intrinsic::ppc_qpx_qvlpcld ||
+         IID == Intrinsic::ppc_qpx_qvlpcls) &&
+        N->getOperand(1)->getOpcode() == ISD::ADD) {
       SDValue Add = N->getOperand(1);
 
-      int Bits = 4 /* 16 byte alignment */;
+      int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ?
+                 5 /* 32 byte alignment */ : 4 /* 16 byte alignment */;
 
       if (DAG.MaskedValueIsZero(Add->getOperand(1),
                                 APInt::getAllOnesValue(Bits /* alignment */)
@@ -14433,8 +15243,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
              UE = BasePtr->use_end();
            UI != UE; ++UI) {
         if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
-            cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() ==
-                IID) {
+            cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == IID) {
           // We've found another LVSL/LVSR, and this address is an aligned
           // multiple of that one. The results will be the same, so use the
           // one we've just found instead.
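The MaskedValueIsZero test in the combine above encodes "this offset is a
multiple of the permute-control granularity" as "the low Bits bits of the
offset are known zero" (Bits = 4 for 16-byte Altivec alignment, 5 for 32-byte
QPX alignment). A minimal standalone sketch of the same mask test, using a
plain integer in place of SelectionDAG's known-bits query (the helper name
isAlignedOffset and the main driver are illustrative, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // True when Offset is a multiple of 2^Bits: all low `Bits` bits are zero,
    // mirroring the APInt::getAllOnesValue(Bits) mask used in the combine.
    static bool isAlignedOffset(uint64_t Offset, unsigned Bits) {
      uint64_t LowMask = (uint64_t(1) << Bits) - 1; // Bits=4 -> 0xF
      return (Offset & LowMask) == 0;
    }

    int main() {
      assert(isAlignedOffset(32, 4));  // 32 is 16-byte aligned (Altivec case)
      assert(!isAlignedOffset(40, 5)); // 40 is not 32-byte aligned (QPX case)
      assert(isAlignedOffset(64, 5));  // 64 is 32-byte aligned
      return 0;
    }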
@@ -14983,9 +15792,17 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &PPC::F4RCRegClass); if (VT == MVT::f64 || VT == MVT::i64) return std::make_pair(0U, &PPC::F8RCRegClass); + if (VT == MVT::v4f64 && Subtarget.hasQPX()) + return std::make_pair(0U, &PPC::QFRCRegClass); + if (VT == MVT::v4f32 && Subtarget.hasQPX()) + return std::make_pair(0U, &PPC::QSRCRegClass); } break; case 'v': + if (VT == MVT::v4f64 && Subtarget.hasQPX()) + return std::make_pair(0U, &PPC::QFRCRegClass); + if (VT == MVT::v4f32 && Subtarget.hasQPX()) + return std::make_pair(0U, &PPC::QSRCRegClass); if (Subtarget.hasAltivec()) return std::make_pair(0U, &PPC::VRRCRegClass); break; @@ -15277,6 +16094,12 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineFunction &MF, unsigned Intrinsic) const { switch (Intrinsic) { + case Intrinsic::ppc_qpx_qvlfd: + case Intrinsic::ppc_qpx_qvlfs: + case Intrinsic::ppc_qpx_qvlfcd: + case Intrinsic::ppc_qpx_qvlfcs: + case Intrinsic::ppc_qpx_qvlfiwa: + case Intrinsic::ppc_qpx_qvlfiwz: case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: case Intrinsic::ppc_altivec_lvebx: @@ -15298,6 +16121,18 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::ppc_vsx_lxvd2x: VT = MVT::v2f64; break; + case Intrinsic::ppc_qpx_qvlfd: + VT = MVT::v4f64; + break; + case Intrinsic::ppc_qpx_qvlfs: + VT = MVT::v4f32; + break; + case Intrinsic::ppc_qpx_qvlfcd: + VT = MVT::v2f64; + break; + case Intrinsic::ppc_qpx_qvlfcs: + VT = MVT::v2f32; + break; default: VT = MVT::v4i32; break; @@ -15312,6 +16147,45 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOLoad; return true; } + case Intrinsic::ppc_qpx_qvlfda: + case Intrinsic::ppc_qpx_qvlfsa: + case Intrinsic::ppc_qpx_qvlfcda: + case Intrinsic::ppc_qpx_qvlfcsa: + case Intrinsic::ppc_qpx_qvlfiwaa: + case Intrinsic::ppc_qpx_qvlfiwza: { + EVT VT; + switch (Intrinsic) { + case Intrinsic::ppc_qpx_qvlfda: + VT = MVT::v4f64; + break; + case Intrinsic::ppc_qpx_qvlfsa: + VT = MVT::v4f32; + break; + case Intrinsic::ppc_qpx_qvlfcda: + VT = MVT::v2f64; + break; + case Intrinsic::ppc_qpx_qvlfcsa: + VT = MVT::v2f32; + break; + default: + VT = MVT::v4i32; + break; + } + + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = VT; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.size = VT.getStoreSize(); + Info.align = Align(1); + Info.flags = MachineMemOperand::MOLoad; + return true; + } + case Intrinsic::ppc_qpx_qvstfd: + case Intrinsic::ppc_qpx_qvstfs: + case Intrinsic::ppc_qpx_qvstfcd: + case Intrinsic::ppc_qpx_qvstfcs: + case Intrinsic::ppc_qpx_qvstfiw: case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: case Intrinsic::ppc_altivec_stvebx: @@ -15333,6 +16207,18 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::ppc_vsx_stxvd2x: VT = MVT::v2f64; break; + case Intrinsic::ppc_qpx_qvstfd: + VT = MVT::v4f64; + break; + case Intrinsic::ppc_qpx_qvstfs: + VT = MVT::v4f32; + break; + case Intrinsic::ppc_qpx_qvstfcd: + VT = MVT::v2f64; + break; + case Intrinsic::ppc_qpx_qvstfcs: + VT = MVT::v2f32; + break; default: VT = MVT::v4i32; break; @@ -15347,6 +16233,39 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOStore; return true; } + case Intrinsic::ppc_qpx_qvstfda: + case Intrinsic::ppc_qpx_qvstfsa: + case Intrinsic::ppc_qpx_qvstfcda: + case Intrinsic::ppc_qpx_qvstfcsa: + case Intrinsic::ppc_qpx_qvstfiwa: { + 
EVT VT; + switch (Intrinsic) { + case Intrinsic::ppc_qpx_qvstfda: + VT = MVT::v4f64; + break; + case Intrinsic::ppc_qpx_qvstfsa: + VT = MVT::v4f32; + break; + case Intrinsic::ppc_qpx_qvstfcda: + VT = MVT::v2f64; + break; + case Intrinsic::ppc_qpx_qvstfcsa: + VT = MVT::v2f32; + break; + default: + VT = MVT::v4i32; + break; + } + + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = VT; + Info.ptrVal = I.getArgOperand(1); + Info.offset = 0; + Info.size = VT.getStoreSize(); + Info.align = Align(1); + Info.flags = MachineMemOperand::MOStore; + return true; + } default: break; } @@ -15359,6 +16278,14 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, EVT PPCTargetLowering::getOptimalMemOpType( const MemOp &Op, const AttributeList &FuncAttributes) const { if (getTargetMachine().getOptLevel() != CodeGenOpt::None) { + // When expanding a memset, require at least two QPX instructions to cover + // the cost of loading the value to be stored from the constant pool. + if (Subtarget.hasQPX() && Op.size() >= 32 && + (Op.isMemcpy() || Op.size() >= 64) && Op.isAligned(Align(32)) && + !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { + return MVT::v4f64; + } + // We should use Altivec/VSX loads and stores when available. For unaligned // addresses, unaligned VSX loads are only fast starting with the P8. if (Subtarget.hasAltivec() && Op.size() >= 16 && @@ -15577,7 +16504,7 @@ PPCTargetLowering::shouldExpandBuildVectorWithShuffles( if (VT == MVT::v2i64) return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves - if (Subtarget.hasVSX()) + if (Subtarget.hasVSX() || Subtarget.hasQPX()) return true; return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues); @@ -15623,7 +16550,8 @@ SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, switch (Opc) { case PPCISD::FNMSUB: - if (!Op.hasOneUse() || !isTypeLegal(VT)) + // TODO: QPX subtarget is deprecated. No transformation here. + if (!Op.hasOneUse() || !isTypeLegal(VT) || Subtarget.hasQPX()) break; const TargetOptions &Options = getTargetMachine().Options; @@ -16104,7 +17032,8 @@ SDValue PPCTargetLowering::combineFMALike(SDNode *N, bool LegalOps = !DCI.isBeforeLegalizeOps(); SDLoc Loc(N); - if (!isOperationLegal(ISD::FMA, VT)) + // TODO: QPX subtarget is deprecated. No transformation here. + if (Subtarget.hasQPX() || !isOperationLegal(ISD::FMA, VT)) return SDValue(); // Allowing transformation to FNMSUB may change sign of zeroes when ab-c=0 diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 8cc42226d7f0b..768eaa43e0135 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -427,6 +427,22 @@ namespace llvm { /// => VABSDUW((XVNEGSP a), (XVNEGSP b)) VABSD, + /// QVFPERM = This corresponds to the QPX qvfperm instruction. + QVFPERM, + + /// QVGPCI = This corresponds to the QPX qvgpci instruction. + QVGPCI, + + /// QVALIGNI = This corresponds to the QPX qvaligni instruction. + QVALIGNI, + + /// QVESPLATI = This corresponds to the QPX qvesplati instruction. + QVESPLATI, + + /// QBFLT = Access the underlying QPX floating-point boolean + /// representation. + QBFLT, + /// FP_EXTEND_HALF(VECTOR, IDX) - Custom extend upper (IDX=0) half or /// lower (IDX=1) half of v4f32 to v2f64. FP_EXTEND_HALF, @@ -503,6 +519,10 @@ namespace llvm { /// Store scalar integers from VSR. ST_VSR_SCAL_INT, + /// QBRC, CHAIN = QVLFSb CHAIN, Ptr + /// The 4xf32 load used for v4i1 constants. 
+      QVLFSb,
+
       /// ATOMIC_CMP_SWAP - the exact same as the target-independent nodes
       /// except they ensure that the compare input is zero-extended for
       /// sub-word versions because the atomic loads zero-extend.
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
index 5ff5fc78326ba..632d4d9deb8a2 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
@@ -642,6 +642,7 @@ class XForm_17a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   let FRA = 0;
 }
 
+// Used for QPX
 class XForm_18<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
                InstrItinClass itin, list<dag> pattern>
   : I<opcode, OOL, IOL, asmstr, itin> {
@@ -1780,6 +1781,14 @@ class AForm_4<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
   let Inst{31} = 0;
 }
 
+// Used for QPX
+class AForm_4a<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+  : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let FRA = 0;
+  let FRC = 0;
+}
+
 // 1.7.13 M-Form
 class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
               InstrItinClass itin, list<dag> pattern>
@@ -2090,6 +2099,49 @@ class VX_RD5_RSp5_PS1_XO9<bits<9> xo, dag OOL, dag IOL, string asmstr,
   let Inst{23-31} = xo;
 }
 
+// Z23-Form (used by QPX)
+class Z23Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> FRT;
+  bits<5> FRA;
+  bits<5> FRB;
+  bits<2> idx;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isRecordForm
+
+  let Inst{6-10}  = FRT;
+  let Inst{11-15} = FRA;
+  let Inst{16-20} = FRB;
+  let Inst{21-22} = idx;
+  let Inst{23-30} = xo;
+  let Inst{31}    = RC;
+}
+
+class Z23Form_2<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin, list<dag> pattern>
+  : Z23Form_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let FRB = 0;
+}
+
+class Z23Form_3<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> FRT;
+  bits<12> idx;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isRecordForm
+
+  let Inst{6-10}  = FRT;
+  let Inst{11-22} = idx;
+  let Inst{23-30} = xo;
+  let Inst{31}    = RC;
+}
+
 class Z23Form_8<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
                 InstrItinClass itin, list<dag> pattern>
   : I<opcode, OOL, IOL, asmstr, itin> {
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index d4c3c5f5504c7..99e25bb130ce4 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -259,6 +259,16 @@ bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
   case PPC::XVMULDP:
   case PPC::XVMULSP:
   case PPC::XSMULSP:
+  // QPX Add:
+  case PPC::QVFADD:
+  case PPC::QVFADDS:
+  case PPC::QVFADDSs:
+  // QPX Multiply:
+  case PPC::QVFMUL:
+  case PPC::QVFMULS:
+  case PPC::QVFMULSs:
+    return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+           Inst.getFlag(MachineInstr::MIFlag::FmNsz);
   // Fixed point:
   // Multiply:
   case PPC::MULHD:
@@ -290,7 +300,9 @@ static const uint16_t FMAOpIdxInfo[][5] = {
     {PPC::XVMADDADP, PPC::XVADDDP, PPC::XVMULDP, 1, 2},
     {PPC::XVMADDASP, PPC::XVADDSP, PPC::XVMULSP, 1, 2},
     {PPC::FMADD, PPC::FADD, PPC::FMUL, 3, 1},
-    {PPC::FMADDS, PPC::FADDS, PPC::FMULS, 3, 1}};
+    {PPC::FMADDS, PPC::FADDS, PPC::FMULS, 3, 1},
+    {PPC::QVFMADDSs, PPC::QVFADDSs, PPC::QVFMULSs, 3, 1},
+    {PPC::QVFMADD, PPC::QVFADD, PPC::QVFMUL, 3, 1}};
 
 // Check if an opcode is a FMA instruction. If it is, return the index in array
 // FMAOpIdxInfo. Otherwise, return -1.
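Each FMAOpIdxInfo row pairs an FMA opcode with its add and multiply opcodes
plus the operand indices of the addend and first multiplicand, and the lookup
described by the comment above is a linear scan over those rows. A standalone
sketch of that lookup (the opcode values and reduced row set here are
stand-ins for illustration, not PPC's real enumerators):

    #include <cstdint>

    namespace {
    // {FMA opcode, add opcode, mul opcode, addend idx, first-mul idx},
    // mirroring the shape of the patch's FMAOpIdxInfo rows.
    const uint16_t FMAOpIdxInfo[][5] = {
        {40, 10, 20, 3, 1}, // stand-in for {PPC::FMADD, PPC::FADD, PPC::FMUL, 3, 1}
        {41, 11, 21, 1, 2}, // stand-in for an XVMADDADP-style row
    };

    // Return the row index for an FMA opcode, or -1 if it is not in the
    // table, as the comment describes.
    int getFMAOpIdxInfo(unsigned Opcode) {
      const int Rows = sizeof(FMAOpIdxInfo) / sizeof(FMAOpIdxInfo[0]);
      for (int I = 0; I < Rows; ++I)
        if (FMAOpIdxInfo[I][0] == Opcode)
          return I;
      return -1;
    }
    } // namespace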
@@ -654,6 +666,7 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case PPC::LI8: case PPC::LIS: case PPC::LIS8: + case PPC::QVGPCI: case PPC::ADDIStocHA: case PPC::ADDIStocHA8: case PPC::ADDItocL: @@ -1330,6 +1343,12 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, else if (PPC::VSFRCRegClass.contains(DestReg, SrcReg) || PPC::VSSRCRegClass.contains(DestReg, SrcReg)) Opc = (Subtarget.hasP9Vector()) ? PPC::XSCPSGNDP : PPC::XXLORf; + else if (PPC::QFRCRegClass.contains(DestReg, SrcReg)) + Opc = PPC::QVFMR; + else if (PPC::QSRCRegClass.contains(DestReg, SrcReg)) + Opc = PPC::QVFMRs; + else if (PPC::QBRCRegClass.contains(DestReg, SrcReg)) + Opc = PPC::QVFMRb; else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg)) Opc = PPC::CROR; else if (PPC::SPERCRegClass.contains(DestReg, SrcReg)) @@ -1374,6 +1393,12 @@ static unsigned getSpillIndex(const TargetRegisterClass *RC) { OpcodeIndex = SOK_VectorFloat4Spill; } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_VRSaveSpill; + } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) { + OpcodeIndex = SOK_QuadFloat8Spill; + } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) { + OpcodeIndex = SOK_QuadFloat4Spill; + } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) { + OpcodeIndex = SOK_QuadBitSpill; } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_SpillToVSR; } else { diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index bdcfa76505daf..43973c627fcf1 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -123,6 +123,9 @@ enum SpillOpcodeKey { SOK_VectorFloat8Spill, SOK_VectorFloat4Spill, SOK_VRSaveSpill, + SOK_QuadFloat8Spill, + SOK_QuadFloat4Spill, + SOK_QuadBitSpill, SOK_SpillToVSR, SOK_SPESpill, SOK_LastOpcodeSpill // This must be last on the enum. @@ -133,28 +136,32 @@ enum SpillOpcodeKey { { \ PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXVD2X, PPC::LXSDX, PPC::LXSSPX, \ - PPC::RESTORE_VRSAVE, PPC::SPILLTOVSR_LD, PPC::EVLDD \ + PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, PPC::QVLFDXb, \ + PPC::SPILLTOVSR_LD, PPC::EVLDD \ } #define Pwr9LoadOpcodes \ { \ PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, \ - PPC::DFLOADf32, PPC::RESTORE_VRSAVE, PPC::SPILLTOVSR_LD \ + PPC::DFLOADf32, PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, \ + PPC::QVLFDXb, PPC::SPILLTOVSR_LD \ } #define Pwr8StoreOpcodes \ { \ PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \ PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX, PPC::SPILL_VRSAVE, \ - PPC::SPILLTOVSR_ST, PPC::EVSTDD \ + PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb, PPC::SPILLTOVSR_ST, \ + PPC::EVSTDD \ } #define Pwr9StoreOpcodes \ { \ PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \ PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, \ - PPC::SPILL_VRSAVE, PPC::SPILLTOVSR_ST \ + PPC::SPILL_VRSAVE, PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb, \ + PPC::SPILLTOVSR_ST \ } // Initialize arrays for load and store spill opcodes on supported subtargets. 
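The Pwr8/Pwr9 load and store opcode arrays above are indexed by
SpillOpcodeKey, so every enumerator added to the enum must get a matching slot
in each array (which is why QVLFDX/QVSTFDX and friends appear in all four
macros). A small sketch of that indexing scheme with a stand-in enum and table
(the names and opcode values here are illustrative, not the patch's):

    #include <cassert>

    // Stand-in for SpillOpcodeKey: one slot per spillable register class;
    // SOK_LastOpcodeSpill pins the array length.
    enum SpillOpcodeKey {
      SOK_Int4Spill,
      SOK_QuadFloat8Spill,
      SOK_LastOpcodeSpill
    };

    // Stand-in load-opcode table, one entry per key in enum order (the
    // Pwr8LoadOpcodes/Pwr9LoadOpcodes macros play this role in the patch).
    static const unsigned LoadOpcodes[SOK_LastOpcodeSpill] = {
        100 /* e.g. LWZ */, 200 /* e.g. QVLFDX */};

    static unsigned getRestoreOpcode(SpillOpcodeKey Key) {
      assert(Key < SOK_LastOpcodeSpill && "invalid spill opcode key");
      return LoadOpcodes[Key];
    }

    int main() { return getRestoreOpcode(SOK_QuadFloat8Spill) == 200 ? 0 : 1; }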
@@ -266,10 +273,10 @@ class PPCInstrInfo : public PPCGenInstrInfo {
   }
 
   static bool isSameClassPhysRegCopy(unsigned Opcode) {
-    unsigned CopyOpcodes[] = {PPC::OR,        PPC::OR8,   PPC::FMR,
-                              PPC::VOR,       PPC::XXLOR, PPC::XXLORf,
-                              PPC::XSCPSGNDP, PPC::MCRF,  PPC::CROR,
-                              PPC::EVOR,      -1U};
+    unsigned CopyOpcodes[] =
+      { PPC::OR, PPC::OR8, PPC::FMR, PPC::VOR, PPC::XXLOR, PPC::XXLORf,
+        PPC::XSCPSGNDP, PPC::MCRF, PPC::QVFMR, PPC::QVFMRs, PPC::QVFMRb,
+        PPC::CROR, PPC::EVOR, -1U };
     for (int i = 0; CopyOpcodes[i] != -1U; i++)
       if (Opcode == CopyOpcodes[i])
         return true;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 83a434f5e793a..c565758973bf5 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -203,6 +203,16 @@ def PPCvecinsert : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsert, []>;
 def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>;
 def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>;
 
+def PPCqvfperm : SDNode<"PPCISD::QVFPERM", SDT_PPCqvfperm, []>;
+def PPCqvgpci : SDNode<"PPCISD::QVGPCI", SDT_PPCqvgpci, []>;
+def PPCqvaligni : SDNode<"PPCISD::QVALIGNI", SDT_PPCqvaligni, []>;
+def PPCqvesplati : SDNode<"PPCISD::QVESPLATI", SDT_PPCqvesplati, []>;
+
+def PPCqbflt : SDNode<"PPCISD::QBFLT", SDT_PPCqbflt, []>;
+
+def PPCqvlfsb : SDNode<"PPCISD::QVLFSb", SDT_PPCqvlfsb,
+                       [SDNPHasChain, SDNPMayLoad]>;
+
 def PPCcmpb : SDNode<"PPCISD::CMPB", SDTIntBinOp, []>;
 
 // These nodes represent the 32-bit PPC shifts that operate on 6-bit shift
@@ -3451,6 +3461,7 @@ include "PPCInstrAltivec.td"
 include "PPCInstrSPE.td"
 include "PPCInstr64Bit.td"
 include "PPCInstrVSX.td"
+include "PPCInstrQPX.td"
 include "PPCInstrHTM.td"
 
 def crnot : OutPatFrag<(ops node:$in),
diff --git a/llvm/lib/Target/PowerPC/PPCInstrQPX.td b/llvm/lib/Target/PowerPC/PPCInstrQPX.td
new file mode 100644
index 0000000000000..2265af2815cb5
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCInstrQPX.td
@@ -0,0 +1,1212 @@
+//===- PPCInstrQPX.td - The PowerPC QPX Extension --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the QPX extension to the PowerPC instruction set.
+// Reference:
+// Book Q: QPX Architecture Definition. IBM (as updated in) 2011.
+//
+//===----------------------------------------------------------------------===//
+
+def PPCRegQFRCAsmOperand : AsmOperandClass {
+  let Name = "RegQFRC"; let PredicateMethod = "isRegNumber";
+}
+def qfrc : RegisterOperand<QFRC> {
+  let ParserMatchClass = PPCRegQFRCAsmOperand;
+}
+def PPCRegQSRCAsmOperand : AsmOperandClass {
+  let Name = "RegQSRC"; let PredicateMethod = "isRegNumber";
+}
+def qsrc : RegisterOperand<QSRC> {
+  let ParserMatchClass = PPCRegQSRCAsmOperand;
+}
+def PPCRegQBRCAsmOperand : AsmOperandClass {
+  let Name = "RegQBRC"; let PredicateMethod = "isRegNumber";
+}
+def qbrc : RegisterOperand<QBRC> {
+  let ParserMatchClass = PPCRegQBRCAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+// Helpers for defining instructions that directly correspond to intrinsics.
+
+// QPXA1_Int - A AForm_1 intrinsic definition.
+class QPXA1_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+  : AForm_1<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+            !strconcat(opc, " $FRT, $FRA, $FRB, $FRC"), IIC_FPFused,
+            [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>;
+// QPXA1s_Int - A AForm_1 intrinsic definition (simple instructions).
+class QPXA1s_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+  : AForm_1<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+            !strconcat(opc, " $FRT, $FRA, $FRB, $FRC"), IIC_VecPerm,
+            [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>;
+// QPXA2_Int - A AForm_2 intrinsic definition.
+class QPXA2_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+  : AForm_2<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+            !strconcat(opc, " $FRT, $FRA, $FRB"), IIC_FPGeneral,
+            [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB))]>;
+// QPXA3_Int - A AForm_3 intrinsic definition.
+class QPXA3_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+  : AForm_3<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC),
+            !strconcat(opc, " $FRT, $FRA, $FRC"), IIC_FPGeneral,
+            [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRC))]>;
+// QPXA4_Int - A AForm_4a intrinsic definition.
+class QPXA4_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+  : AForm_4a<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRB),
+             !strconcat(opc, " $FRT, $FRB"), IIC_FPGeneral,
+             [(set v4f64:$FRT, (IntID v4f64:$FRB))]>;
+// QPXX18_Int - A XForm_18 intrinsic definition.
+class QPXX18_Int<bits<6> opcode, bits<10> xo, string opc, Intrinsic IntID>
+  : XForm_18<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+             !strconcat(opc, " $FRT, $FRA, $FRB"), IIC_FPCompare,
+             [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB))]>;
+// QPXX19_Int - A XForm_19 intrinsic definition.
+class QPXX19_Int<bits<6> opcode, bits<10> xo, string opc, Intrinsic IntID>
+  : XForm_19<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRB),
+             !strconcat(opc, " $FRT, $FRB"), IIC_FPGeneral,
+             [(set v4f64:$FRT, (IntID v4f64:$FRB))]>;
+
+//===----------------------------------------------------------------------===//
+// Pattern Frags.
+
+def extloadv4f32 : PatFrag<(ops node:$ptr), (extload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::v4f32;
+}]>;
+
+def truncstorev4f32 : PatFrag<(ops node:$val, node:$ptr),
+                              (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4f32;
+}]>;
+def pre_truncstv4f32 : PatFrag<(ops node:$val, node:$base, node:$offset),
+                               (pre_truncst node:$val,
+                                            node:$base, node:$offset), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4f32;
+}]>;
+
+def fround_inexact : PatFrag<(ops node:$val), (fpround node:$val), [{
+  return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() == 0;
+}]>;
+
+def fround_exact : PatFrag<(ops node:$val), (fpround node:$val), [{
+  return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() == 1;
+}]>;
+
+let FastIselShouldIgnore = 1 in // FastIsel should ignore all u12 instrs.
+  def u12 : ImmLeaf<i32, [{ return (Imm & 0xFFF) == Imm; }]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Definitions.
+
+def HasQPX : Predicate<"Subtarget->hasQPX()">;
+let Predicates = [HasQPX] in {
+let DecoderNamespace = "QPX" in {
+let hasSideEffects = 0 in { // QPX instructions don't have side effects.
+let Uses = [RM] in { + // Add Instructions + let isCommutable = 1 in { + def QVFADD : AForm_2<4, 21, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), + "qvfadd $FRT, $FRA, $FRB", IIC_FPGeneral, + [(set v4f64:$FRT, (fadd v4f64:$FRA, v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFADDS : QPXA2_Int<0, 21, "qvfadds", int_ppc_qpx_qvfadds>; + def QVFADDSs : AForm_2<0, 21, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), + "qvfadds $FRT, $FRA, $FRB", IIC_FPGeneral, + [(set v4f32:$FRT, (fadd v4f32:$FRA, v4f32:$FRB))]>; + } + def QVFSUB : AForm_2<4, 20, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), + "qvfsub $FRT, $FRA, $FRB", IIC_FPGeneral, + [(set v4f64:$FRT, (fsub v4f64:$FRA, v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFSUBS : QPXA2_Int<0, 20, "qvfsubs", int_ppc_qpx_qvfsubs>; + def QVFSUBSs : AForm_2<0, 20, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), + "qvfsubs $FRT, $FRA, $FRB", IIC_FPGeneral, + [(set v4f32:$FRT, (fsub v4f32:$FRA, v4f32:$FRB))]>; + + // Estimate Instructions + def QVFRE : AForm_4a<4, 24, (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfre $FRT, $FRB", IIC_FPGeneral, + [(set v4f64:$FRT, (PPCfre v4f64:$FRB))]>; + def QVFRES : QPXA4_Int<0, 24, "qvfres", int_ppc_qpx_qvfres>; + let isCodeGenOnly = 1 in + def QVFRESs : AForm_4a<0, 24, (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfres $FRT, $FRB", IIC_FPGeneral, + [(set v4f32:$FRT, (PPCfre v4f32:$FRB))]>; + + def QVFRSQRTE : AForm_4a<4, 26, (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfrsqrte $FRT, $FRB", IIC_FPGeneral, + [(set v4f64:$FRT, (PPCfrsqrte v4f64:$FRB))]>; + def QVFRSQRTES : QPXA4_Int<0, 26, "qvfrsqrtes", int_ppc_qpx_qvfrsqrtes>; + let isCodeGenOnly = 1 in + def QVFRSQRTESs : AForm_4a<0, 26, (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfrsqrtes $FRT, $FRB", IIC_FPGeneral, + [(set v4f32:$FRT, (PPCfrsqrte v4f32:$FRB))]>; + + // Multiply Instructions + let isCommutable = 1 in { + def QVFMUL : AForm_3<4, 25, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC), + "qvfmul $FRT, $FRA, $FRC", IIC_FPGeneral, + [(set v4f64:$FRT, (fmul v4f64:$FRA, v4f64:$FRC))]>; + let isCodeGenOnly = 1 in + def QVFMULS : QPXA3_Int<0, 25, "qvfmuls", int_ppc_qpx_qvfmuls>; + def QVFMULSs : AForm_3<0, 25, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC), + "qvfmuls $FRT, $FRA, $FRC", IIC_FPGeneral, + [(set v4f32:$FRT, (fmul v4f32:$FRA, v4f32:$FRC))]>; + } + def QVFXMUL : QPXA3_Int<4, 17, "qvfxmul", int_ppc_qpx_qvfxmul>; + def QVFXMULS : QPXA3_Int<0, 17, "qvfxmuls", int_ppc_qpx_qvfxmuls>; + + // Multiply-add instructions + def QVFMADD : AForm_1<4, 29, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), + "qvfmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused, + [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC, v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFMADDS : QPXA1_Int<0, 29, "qvfmadds", int_ppc_qpx_qvfmadds>; + def QVFMADDSs : AForm_1<0, 29, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), + "qvfmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused, + [(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC, v4f32:$FRB))]>; + def QVFNMADD : AForm_1<4, 31, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), + "qvfnmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused, + [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC, + v4f64:$FRB)))]>; + let isCodeGenOnly = 1 in + def QVFNMADDS : QPXA1_Int<0, 31, "qvfnmadds", int_ppc_qpx_qvfnmadds>; + def QVFNMADDSs : AForm_1<0, 31, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), + "qvfnmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused, + [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC, + v4f32:$FRB)))]>; + def QVFMSUB : AForm_1<4, 
28, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), + "qvfmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused, + [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC, + (fneg v4f64:$FRB)))]>; + let isCodeGenOnly = 1 in + def QVFMSUBS : QPXA1_Int<0, 28, "qvfmsubs", int_ppc_qpx_qvfmsubs>; + def QVFMSUBSs : AForm_1<0, 28, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), + "qvfmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused, + [(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC, + (fneg v4f32:$FRB)))]>; + def QVFNMSUB : AForm_1<4, 30, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), + "qvfnmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused, + [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC, + (fneg v4f64:$FRB))))]>; + let isCodeGenOnly = 1 in + def QVFNMSUBS : QPXA1_Int<0, 30, "qvfnmsubs", int_ppc_qpx_qvfnmsubs>; + def QVFNMSUBSs : AForm_1<0, 30, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), + "qvfnmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused, + [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC, + (fneg v4f32:$FRB))))]>; + def QVFXMADD : QPXA1_Int<4, 9, "qvfxmadd", int_ppc_qpx_qvfxmadd>; + def QVFXMADDS : QPXA1_Int<0, 9, "qvfxmadds", int_ppc_qpx_qvfxmadds>; + def QVFXXNPMADD : QPXA1_Int<4, 11, "qvfxxnpmadd", int_ppc_qpx_qvfxxnpmadd>; + def QVFXXNPMADDS : QPXA1_Int<0, 11, "qvfxxnpmadds", int_ppc_qpx_qvfxxnpmadds>; + def QVFXXCPNMADD : QPXA1_Int<4, 3, "qvfxxcpnmadd", int_ppc_qpx_qvfxxcpnmadd>; + def QVFXXCPNMADDS : QPXA1_Int<0, 3, "qvfxxcpnmadds", int_ppc_qpx_qvfxxcpnmadds>; + def QVFXXMADD : QPXA1_Int<4, 1, "qvfxxmadd", int_ppc_qpx_qvfxxmadd>; + def QVFXXMADDS : QPXA1_Int<0, 1, "qvfxxmadds", int_ppc_qpx_qvfxxmadds>; + + // Select Instruction + let isCodeGenOnly = 1 in + def QVFSEL : QPXA1s_Int<4, 23, "qvfsel", int_ppc_qpx_qvfsel>; + def QVFSELb : AForm_1<4, 23, (outs qfrc:$FRT), + (ins qbrc:$FRA, qfrc:$FRB, qfrc:$FRC), + "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm, + [(set v4f64:$FRT, (vselect v4i1:$FRA, + v4f64:$FRC, v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFSELbs : AForm_1<4, 23, (outs qsrc:$FRT), + (ins qbrc:$FRA, qsrc:$FRB, qsrc:$FRC), + "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm, + [(set v4f32:$FRT, (vselect v4i1:$FRA, + v4f32:$FRC, v4f32:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFSELbb: AForm_1<4, 23, (outs qbrc:$FRT), + (ins qbrc:$FRA, qbrc:$FRB, qbrc:$FRC), + "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm, + [(set v4i1:$FRT, (vselect v4i1:$FRA, + v4i1:$FRC, v4i1:$FRB))]>; + + // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after + // instruction selection into a branch sequence. + def SELECT_CC_QFRC: PPCCustomInserterPseudo<(outs qfrc:$dst), (ins crrc:$cond, qfrc:$T, qfrc:$F, + i32imm:$BROPC), "#SELECT_CC_QFRC", + []>; + def SELECT_CC_QSRC: PPCCustomInserterPseudo<(outs qsrc:$dst), (ins crrc:$cond, qsrc:$T, qsrc:$F, + i32imm:$BROPC), "#SELECT_CC_QSRC", + []>; + def SELECT_CC_QBRC: PPCCustomInserterPseudo<(outs qbrc:$dst), (ins crrc:$cond, qbrc:$T, qbrc:$F, + i32imm:$BROPC), "#SELECT_CC_QBRC", + []>; + + // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition + // register bit directly. 
+ def SELECT_QFRC: PPCCustomInserterPseudo<(outs qfrc:$dst), (ins crbitrc:$cond, + qfrc:$T, qfrc:$F), "#SELECT_QFRC", + [(set v4f64:$dst, + (select i1:$cond, v4f64:$T, v4f64:$F))]>; + def SELECT_QSRC: PPCCustomInserterPseudo<(outs qsrc:$dst), (ins crbitrc:$cond, + qsrc:$T, qsrc:$F), "#SELECT_QSRC", + [(set v4f32:$dst, + (select i1:$cond, v4f32:$T, v4f32:$F))]>; + def SELECT_QBRC: PPCCustomInserterPseudo<(outs qbrc:$dst), (ins crbitrc:$cond, + qbrc:$T, qbrc:$F), "#SELECT_QBRC", + [(set v4i1:$dst, + (select i1:$cond, v4i1:$T, v4i1:$F))]>; + + // Convert and Round Instructions + def QVFCTID : QPXX19_Int<4, 814, "qvfctid", int_ppc_qpx_qvfctid>; + let isCodeGenOnly = 1 in + def QVFCTIDb : XForm_19<4, 814, (outs qbrc:$FRT), (ins qbrc:$FRB), + "qvfctid $FRT, $FRB", IIC_FPGeneral, []>; + + def QVFCTIDU : QPXX19_Int<4, 942, "qvfctidu", int_ppc_qpx_qvfctidu>; + def QVFCTIDZ : QPXX19_Int<4, 815, "qvfctidz", int_ppc_qpx_qvfctidz>; + def QVFCTIDUZ : QPXX19_Int<4, 943, "qvfctiduz", int_ppc_qpx_qvfctiduz>; + def QVFCTIW : QPXX19_Int<4, 14, "qvfctiw", int_ppc_qpx_qvfctiw>; + def QVFCTIWU : QPXX19_Int<4, 142, "qvfctiwu", int_ppc_qpx_qvfctiwu>; + def QVFCTIWZ : QPXX19_Int<4, 15, "qvfctiwz", int_ppc_qpx_qvfctiwz>; + def QVFCTIWUZ : QPXX19_Int<4, 143, "qvfctiwuz", int_ppc_qpx_qvfctiwuz>; + def QVFCFID : QPXX19_Int<4, 846, "qvfcfid", int_ppc_qpx_qvfcfid>; + let isCodeGenOnly = 1 in + def QVFCFIDb : XForm_19<4, 846, (outs qbrc:$FRT), (ins qbrc:$FRB), + "qvfcfid $FRT, $FRB", IIC_FPGeneral, []>; + + def QVFCFIDU : QPXX19_Int<4, 974, "qvfcfidu", int_ppc_qpx_qvfcfidu>; + def QVFCFIDS : QPXX19_Int<0, 846, "qvfcfids", int_ppc_qpx_qvfcfids>; + def QVFCFIDUS : QPXX19_Int<0, 974, "qvfcfidus", int_ppc_qpx_qvfcfidus>; + + let isCodeGenOnly = 1 in + def QVFRSP : QPXX19_Int<4, 12, "qvfrsp", int_ppc_qpx_qvfrsp>; + def QVFRSPs : XForm_19<4, 12, + (outs qsrc:$FRT), (ins qfrc:$FRB), + "qvfrsp $FRT, $FRB", IIC_FPGeneral, + [(set v4f32:$FRT, (fround_inexact v4f64:$FRB))]>; + + def QVFRIZ : XForm_19<4, 424, (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfriz $FRT, $FRB", IIC_FPGeneral, + [(set v4f64:$FRT, (ftrunc v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFRIZs : XForm_19<4, 424, (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfriz $FRT, $FRB", IIC_FPGeneral, + [(set v4f32:$FRT, (ftrunc v4f32:$FRB))]>; + + def QVFRIN : XForm_19<4, 392, (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfrin $FRT, $FRB", IIC_FPGeneral, + [(set v4f64:$FRT, (fround v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFRINs : XForm_19<4, 392, (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfrin $FRT, $FRB", IIC_FPGeneral, + [(set v4f32:$FRT, (fround v4f32:$FRB))]>; + + def QVFRIP : XForm_19<4, 456, (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfrip $FRT, $FRB", IIC_FPGeneral, + [(set v4f64:$FRT, (fceil v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFRIPs : XForm_19<4, 456, (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfrip $FRT, $FRB", IIC_FPGeneral, + [(set v4f32:$FRT, (fceil v4f32:$FRB))]>; + + def QVFRIM : XForm_19<4, 488, (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfrim $FRT, $FRB", IIC_FPGeneral, + [(set v4f64:$FRT, (ffloor v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFRIMs : XForm_19<4, 488, (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfrim $FRT, $FRB", IIC_FPGeneral, + [(set v4f32:$FRT, (ffloor v4f32:$FRB))]>; + + // Move Instructions + def QVFMR : XForm_19<4, 72, + (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfmr $FRT, $FRB", IIC_VecPerm, + [/* (set v4f64:$FRT, v4f64:$FRB) */]>; + let isCodeGenOnly = 1 in { + def QVFMRs : XForm_19<4, 72, + (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfmr 
$FRT, $FRB", IIC_VecPerm, + [/* (set v4f32:$FRT, v4f32:$FRB) */]>; + def QVFMRb : XForm_19<4, 72, + (outs qbrc:$FRT), (ins qbrc:$FRB), + "qvfmr $FRT, $FRB", IIC_VecPerm, + [/* (set v4i1:$FRT, v4i1:$FRB) */]>; + } + def QVFNEG : XForm_19<4, 40, + (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfneg $FRT, $FRB", IIC_VecPerm, + [(set v4f64:$FRT, (fneg v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFNEGs : XForm_19<4, 40, + (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfneg $FRT, $FRB", IIC_VecPerm, + [(set v4f32:$FRT, (fneg v4f32:$FRB))]>; + def QVFABS : XForm_19<4, 264, + (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfabs $FRT, $FRB", IIC_VecPerm, + [(set v4f64:$FRT, (fabs v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFABSs : XForm_19<4, 264, + (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfabs $FRT, $FRB", IIC_VecPerm, + [(set v4f32:$FRT, (fabs v4f32:$FRB))]>; + def QVFNABS : XForm_19<4, 136, + (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfnabs $FRT, $FRB", IIC_VecPerm, + [(set v4f64:$FRT, (fneg (fabs v4f64:$FRB)))]>; + let isCodeGenOnly = 1 in + def QVFNABSs : XForm_19<4, 136, + (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfnabs $FRT, $FRB", IIC_VecPerm, + [(set v4f32:$FRT, (fneg (fabs v4f32:$FRB)))]>; + def QVFCPSGN : XForm_18<4, 8, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), + "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm, + [(set v4f64:$FRT, (fcopysign v4f64:$FRB, v4f64:$FRA))]>; + let isCodeGenOnly = 1 in + def QVFCPSGNs : XForm_18<4, 8, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), + "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm, + [(set v4f32:$FRT, (fcopysign v4f32:$FRB, v4f32:$FRA))]>; + + def QVALIGNI : Z23Form_1<4, 5, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u2imm:$idx), + "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm, + [(set v4f64:$FRT, + (PPCqvaligni v4f64:$FRA, v4f64:$FRB, + (i32 imm:$idx)))]>; + let isCodeGenOnly = 1 in + def QVALIGNIs : Z23Form_1<4, 5, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, u2imm:$idx), + "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm, + [(set v4f32:$FRT, + (PPCqvaligni v4f32:$FRA, v4f32:$FRB, + (i32 imm:$idx)))]>; + let isCodeGenOnly = 1 in + def QVALIGNIb : Z23Form_1<4, 5, + (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u2imm:$idx), + "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm, + [(set v4i1:$FRT, + (PPCqvaligni v4i1:$FRA, v4i1:$FRB, + (i32 imm:$idx)))]>; + + def QVESPLATI : Z23Form_2<4, 37, + (outs qfrc:$FRT), (ins qfrc:$FRA, u2imm:$idx), + "qvesplati $FRT, $FRA, $idx", IIC_VecPerm, + [(set v4f64:$FRT, + (PPCqvesplati v4f64:$FRA, (i32 imm:$idx)))]>; + let isCodeGenOnly = 1 in + def QVESPLATIs : Z23Form_2<4, 37, + (outs qsrc:$FRT), (ins qsrc:$FRA, u2imm:$idx), + "qvesplati $FRT, $FRA, $idx", IIC_VecPerm, + [(set v4f32:$FRT, + (PPCqvesplati v4f32:$FRA, (i32 imm:$idx)))]>; + let isCodeGenOnly = 1 in + def QVESPLATIb : Z23Form_2<4, 37, + (outs qbrc:$FRT), (ins qbrc:$FRA, u2imm:$idx), + "qvesplati $FRT, $FRA, $idx", IIC_VecPerm, + [(set v4i1:$FRT, + (PPCqvesplati v4i1:$FRA, (i32 imm:$idx)))]>; + + def QVFPERM : AForm_1<4, 6, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC), + "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm, + [(set v4f64:$FRT, + (PPCqvfperm v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>; + let isCodeGenOnly = 1 in + def QVFPERMs : AForm_1<4, 6, + (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qfrc:$FRC), + "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm, + [(set v4f32:$FRT, + (PPCqvfperm v4f32:$FRA, v4f32:$FRB, v4f64:$FRC))]>; + + let isReMaterializable = 1, isAsCheapAsAMove = 1 in + def QVGPCI : Z23Form_3<4, 133, + (outs qfrc:$FRT), (ins u12imm:$idx), + "qvgpci 
$FRT, $idx", IIC_VecPerm, + [(set v4f64:$FRT, (PPCqvgpci (u12:$idx)))]>; + + // Compare Instruction + let isCodeGenOnly = 1 in + def QVFTSTNAN : QPXX18_Int<4, 64, "qvftstnan", int_ppc_qpx_qvftstnan>; + def QVFTSTNANb : XForm_18<4, 64, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), + "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare, + [(set v4i1:$FRT, + (setcc v4f64:$FRA, v4f64:$FRB, SETUO))]>; + let isCodeGenOnly = 1 in + def QVFTSTNANbs : XForm_18<4, 64, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), + "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare, + [(set v4i1:$FRT, + (setcc v4f32:$FRA, v4f32:$FRB, SETUO))]>; + let isCodeGenOnly = 1 in + def QVFCMPLT : QPXX18_Int<4, 96, "qvfcmplt", int_ppc_qpx_qvfcmplt>; + def QVFCMPLTb : XForm_18<4, 96, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), + "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare, + [(set v4i1:$FRT, + (setcc v4f64:$FRA, v4f64:$FRB, SETOLT))]>; + let isCodeGenOnly = 1 in + def QVFCMPLTbs : XForm_18<4, 96, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), + "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare, + [(set v4i1:$FRT, + (setcc v4f32:$FRA, v4f32:$FRB, SETOLT))]>; + let isCodeGenOnly = 1 in + def QVFCMPGT : QPXX18_Int<4, 32, "qvfcmpgt", int_ppc_qpx_qvfcmpgt>; + def QVFCMPGTb : XForm_18<4, 32, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), + "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare, + [(set v4i1:$FRT, + (setcc v4f64:$FRA, v4f64:$FRB, SETOGT))]>; + let isCodeGenOnly = 1 in + def QVFCMPGTbs : XForm_18<4, 32, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), + "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare, + [(set v4i1:$FRT, + (setcc v4f32:$FRA, v4f32:$FRB, SETOGT))]>; + let isCodeGenOnly = 1 in + def QVFCMPEQ : QPXX18_Int<4, 0, "qvfcmpeq", int_ppc_qpx_qvfcmpeq>; + def QVFCMPEQb : XForm_18<4, 0, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), + "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare, + [(set v4i1:$FRT, + (setcc v4f64:$FRA, v4f64:$FRB, SETOEQ))]>; + let isCodeGenOnly = 1 in + def QVFCMPEQbs : XForm_18<4, 0, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), + "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare, + [(set v4i1:$FRT, + (setcc v4f32:$FRA, v4f32:$FRB, SETOEQ))]>; + + let isCodeGenOnly = 1 in + def QVFLOGICAL : XForm_20<4, 4, + (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u12imm:$tttt), + "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>; + def QVFLOGICALb : XForm_20<4, 4, + (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt), + "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>; + let isCodeGenOnly = 1 in + def QVFLOGICALs : XForm_20<4, 4, + (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt), + "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>; + + // Load indexed instructions + let mayLoad = 1 in { + def QVLFDX : XForm_1_memOp<31, 583, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfdx $FRT, $src", IIC_LdStLFD, + [(set v4f64:$FRT, (load xoaddr:$src))]>; + let isCodeGenOnly = 1 in + def QVLFDXb : XForm_1_memOp<31, 583, + (outs qbrc:$FRT), (ins memrr:$src), + "qvlfdx $FRT, $src", IIC_LdStLFD, []>; + + let RC = 1 in + def QVLFDXA : XForm_1<31, 583, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfdxa $FRT, $src", IIC_LdStLFD, []>; + + def QVLFDUX : XForm_1<31, 615, + (outs qfrc:$FRT, ptr_rc_nor0:$ea_result), + (ins memrr:$src), + "qvlfdux $FRT, $src", IIC_LdStLFDU, []>, + RegConstraint<"$src.ptrreg = $ea_result">, + NoEncode<"$ea_result">; + let RC = 1 in + def QVLFDUXA : XForm_1<31, 615, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfduxa $FRT, $src", IIC_LdStLFD, []>; + + def QVLFSX : XForm_1_memOp<31, 519, + (outs qfrc:$FRT), (ins memrr:$src), + 
"qvlfsx $FRT, $src", IIC_LdStLFD, + [(set v4f64:$FRT, (extloadv4f32 xoaddr:$src))]>; + + let isCodeGenOnly = 1 in + def QVLFSXb : XForm_1<31, 519, + (outs qbrc:$FRT), (ins memrr:$src), + "qvlfsx $FRT, $src", IIC_LdStLFD, + [(set v4i1:$FRT, (PPCqvlfsb xoaddr:$src))]>; + let isCodeGenOnly = 1 in + def QVLFSXs : XForm_1_memOp<31, 519, + (outs qsrc:$FRT), (ins memrr:$src), + "qvlfsx $FRT, $src", IIC_LdStLFD, + [(set v4f32:$FRT, (load xoaddr:$src))]>; + + let RC = 1 in + def QVLFSXA : XForm_1<31, 519, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfsxa $FRT, $src", IIC_LdStLFD, []>; + + def QVLFSUX : XForm_1<31, 551, + (outs qsrc:$FRT, ptr_rc_nor0:$ea_result), + (ins memrr:$src), + "qvlfsux $FRT, $src", IIC_LdStLFDU, []>, + RegConstraint<"$src.ptrreg = $ea_result">, + NoEncode<"$ea_result">; + + let RC = 1 in + def QVLFSUXA : XForm_1<31, 551, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfsuxa $FRT, $src", IIC_LdStLFD, []>; + + def QVLFCDX : XForm_1<31, 71, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfcdx $FRT, $src", IIC_LdStLFD, []>; + let RC = 1 in + def QVLFCDXA : XForm_1<31, 71, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfcdxa $FRT, $src", IIC_LdStLFD, []>; + + def QVLFCDUX : XForm_1<31, 103, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfcdux $FRT, $src", IIC_LdStLFD, []>; + let RC = 1 in + def QVLFCDUXA : XForm_1<31, 103, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfcduxa $FRT, $src", IIC_LdStLFD, []>; + + def QVLFCSX : XForm_1<31, 7, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfcsx $FRT, $src", IIC_LdStLFD, []>; + let isCodeGenOnly = 1 in + def QVLFCSXs : XForm_1<31, 7, + (outs qsrc:$FRT), (ins memrr:$src), + "qvlfcsx $FRT, $src", IIC_LdStLFD, []>; + + let RC = 1 in + def QVLFCSXA : XForm_1<31, 7, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfcsxa $FRT, $src", IIC_LdStLFD, []>; + + def QVLFCSUX : XForm_1<31, 39, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfcsux $FRT, $src", IIC_LdStLFD, []>; + let RC = 1 in + def QVLFCSUXA : XForm_1<31, 39, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfcsuxa $FRT, $src", IIC_LdStLFD, []>; + + def QVLFIWAX : XForm_1<31, 871, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfiwax $FRT, $src", IIC_LdStLFD, []>; + let RC = 1 in + def QVLFIWAXA : XForm_1<31, 871, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfiwaxa $FRT, $src", IIC_LdStLFD, []>; + + def QVLFIWZX : XForm_1<31, 839, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfiwzx $FRT, $src", IIC_LdStLFD, []>; + let RC = 1 in + def QVLFIWZXA : XForm_1<31, 839, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlfiwzxa $FRT, $src", IIC_LdStLFD, []>; + } + + + def QVLPCLDX : XForm_1<31, 582, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlpcldx $FRT, $src", IIC_LdStLFD, []>; + def QVLPCLSX : XForm_1<31, 518, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlpclsx $FRT, $src", IIC_LdStLFD, []>; + let isCodeGenOnly = 1 in + def QVLPCLSXint : XForm_11<31, 518, + (outs qfrc:$FRT), (ins G8RC:$src), + "qvlpclsx $FRT, 0, $src", IIC_LdStLFD, []>; + def QVLPCRDX : XForm_1<31, 70, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlpcrdx $FRT, $src", IIC_LdStLFD, []>; + def QVLPCRSX : XForm_1<31, 6, + (outs qfrc:$FRT), (ins memrr:$src), + "qvlpcrsx $FRT, $src", IIC_LdStLFD, []>; + + // Store indexed instructions + let mayStore = 1 in { + def QVSTFDX : XForm_8_memOp<31, 711, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfdx $FRT, $dst", IIC_LdStSTFD, + [(store qfrc:$FRT, xoaddr:$dst)]>; + let isCodeGenOnly = 1 in + def QVSTFDXb : XForm_8_memOp<31, 711, + (outs), (ins qbrc:$FRT, memrr:$dst), + "qvstfdx $FRT, $dst", IIC_LdStSTFD, []>; + + let RC = 1 in + def 
QVSTFDXA : XForm_8<31, 711, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfdxa $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFDUX : XForm_8<31, 743, (outs ptr_rc_nor0:$ea_res), + (ins qfrc:$FRT, memrr:$dst), + "qvstfdux $FRT, $dst", IIC_LdStSTFDU, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, + NoEncode<"$ea_res">; + + let RC = 1 in + def QVSTFDUXA : XForm_8<31, 743, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfduxa $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFDXI : XForm_8<31, 709, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfdxi $FRT, $dst", IIC_LdStSTFD, []>; + let RC = 1 in + def QVSTFDXIA : XForm_8<31, 709, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfdxia $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFDUXI : XForm_8<31, 741, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfduxi $FRT, $dst", IIC_LdStSTFD, []>; + let RC = 1 in + def QVSTFDUXIA : XForm_8<31, 741, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfduxia $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFSX : XForm_8_memOp<31, 647, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfsx $FRT, $dst", IIC_LdStSTFD, + [(truncstorev4f32 qfrc:$FRT, xoaddr:$dst)]>; + let isCodeGenOnly = 1 in + def QVSTFSXs : XForm_8_memOp<31, 647, + (outs), (ins qsrc:$FRT, memrr:$dst), + "qvstfsx $FRT, $dst", IIC_LdStSTFD, + [(store qsrc:$FRT, xoaddr:$dst)]>; + + let RC = 1 in + def QVSTFSXA : XForm_8<31, 647, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfsxa $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFSUX : XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res), + (ins qsrc:$FRT, memrr:$dst), + "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, + NoEncode<"$ea_res">; + let isCodeGenOnly = 1 in + def QVSTFSUXs: XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res), + (ins qfrc:$FRT, memrr:$dst), + "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, + NoEncode<"$ea_res">; + + let RC = 1 in + def QVSTFSUXA : XForm_8<31, 679, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfsuxa $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFSXI : XForm_8<31, 645, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfsxi $FRT, $dst", IIC_LdStSTFD, []>; + let RC = 1 in + def QVSTFSXIA : XForm_8<31, 645, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfsxia $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFSUXI : XForm_8<31, 677, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfsuxi $FRT, $dst", IIC_LdStSTFD, []>; + let RC = 1 in + def QVSTFSUXIA : XForm_8<31, 677, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfsuxia $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFCDX : XForm_8<31, 199, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcdx $FRT, $dst", IIC_LdStSTFD, []>; + let RC = 1 in + def QVSTFCDXA : XForm_8<31, 199, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcdxa $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFCSX : XForm_8<31, 135, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>; + let isCodeGenOnly = 1 in + def QVSTFCSXs : XForm_8<31, 135, + (outs), (ins qsrc:$FRT, memrr:$dst), + "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>; + + let RC = 1 in + def QVSTFCSXA : XForm_8<31, 135, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcsxa $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFCDUX : XForm_8<31, 231, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcdux $FRT, $dst", IIC_LdStSTFD, []>; + let RC = 1 in + def QVSTFCDUXA : XForm_8<31, 231, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcduxa $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFCSUX : XForm_8<31, 167, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcsux $FRT, $dst", 
IIC_LdStSTFD, []>; + let RC = 1 in + def QVSTFCSUXA : XForm_8<31, 167, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcsuxa $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFCDXI : XForm_8<31, 197, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcdxi $FRT, $dst", IIC_LdStSTFD, []>; + let RC = 1 in + def QVSTFCDXIA : XForm_8<31, 197, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcdxia $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFCSXI : XForm_8<31, 133, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcsxi $FRT, $dst", IIC_LdStSTFD, []>; + let RC = 1 in + def QVSTFCSXIA : XForm_8<31, 133, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcsxia $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFCDUXI : XForm_8<31, 229, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcduxi $FRT, $dst", IIC_LdStSTFD, []>; + let RC = 1 in + def QVSTFCDUXIA : XForm_8<31, 229, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcduxia $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFCSUXI : XForm_8<31, 165, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcsuxi $FRT, $dst", IIC_LdStSTFD, []>; + let RC = 1 in + def QVSTFCSUXIA : XForm_8<31, 165, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfcsuxia $FRT, $dst", IIC_LdStSTFD, []>; + + def QVSTFIWX : XForm_8<31, 967, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfiwx $FRT, $dst", IIC_LdStSTFD, []>; + let RC = 1 in + def QVSTFIWXA : XForm_8<31, 967, + (outs), (ins qfrc:$FRT, memrr:$dst), + "qvstfiwxa $FRT, $dst", IIC_LdStSTFD, []>; + } +} + +} // neverHasSideEffects +} + +def : InstAlias<"qvfclr $FRT", + (QVFLOGICALb qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 0)>; +def : InstAlias<"qvfand $FRT, $FRA, $FRB", + (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 1)>; +def : InstAlias<"qvfandc $FRT, $FRA, $FRB", + (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 4)>; +def : InstAlias<"qvfctfb $FRT, $FRA", + (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 5)>; +def : InstAlias<"qvfxor $FRT, $FRA, $FRB", + (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 6)>; +def : InstAlias<"qvfor $FRT, $FRA, $FRB", + (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 7)>; +def : InstAlias<"qvfnor $FRT, $FRA, $FRB", + (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 8)>; +def : InstAlias<"qvfequ $FRT, $FRA, $FRB", + (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 9)>; +def : InstAlias<"qvfnot $FRT, $FRA", + (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 10)>; +def : InstAlias<"qvforc $FRT, $FRA, $FRB", + (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 13)>; +def : InstAlias<"qvfnand $FRT, $FRA, $FRB", + (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 14)>; +def : InstAlias<"qvfset $FRT", + (QVFLOGICALb qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 15)>; + +//===----------------------------------------------------------------------===// +// Additional QPX Patterns +// + +def : Pat<(v4f64 (scalar_to_vector f64:$A)), + (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), $A, sub_64)>; +def : Pat<(v4f32 (scalar_to_vector f32:$A)), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $A, sub_64)>; + +def : Pat<(f64 (extractelt v4f64:$S, 0)), + (EXTRACT_SUBREG $S, sub_64)>; +def : Pat<(f32 (extractelt v4f32:$S, 0)), + (EXTRACT_SUBREG $S, sub_64)>; + +def : Pat<(f64 (extractelt v4f64:$S, 1)), + (EXTRACT_SUBREG (QVESPLATI $S, 1), sub_64)>; +def : Pat<(f64 (extractelt v4f64:$S, 2)), + (EXTRACT_SUBREG (QVESPLATI $S, 2), sub_64)>; +def : Pat<(f64 (extractelt v4f64:$S, 3)), + (EXTRACT_SUBREG (QVESPLATI $S, 3), sub_64)>; + +def : Pat<(f32 (extractelt v4f32:$S, 1)), + (EXTRACT_SUBREG (QVESPLATIs $S, 1), sub_64)>; +def : Pat<(f32 (extractelt v4f32:$S, 2)), + (EXTRACT_SUBREG (QVESPLATIs $S, 2), sub_64)>; 
+def : Pat<(f32 (extractelt v4f32:$S, 3)), + (EXTRACT_SUBREG (QVESPLATIs $S, 3), sub_64)>; + +def : Pat<(f64 (extractelt v4f64:$S, i64:$F)), + (EXTRACT_SUBREG (QVFPERM $S, $S, + (QVLPCLSXint (RLDICR $F, 2, + /* 63-2 = */ 61))), + sub_64)>; +def : Pat<(f32 (extractelt v4f32:$S, i64:$F)), + (EXTRACT_SUBREG (QVFPERMs $S, $S, + (QVLPCLSXint (RLDICR $F, 2, + /* 63-2 = */ 61))), + sub_64)>; + +def : Pat<(int_ppc_qpx_qvfperm v4f64:$A, v4f64:$B, v4f64:$C), + (QVFPERM $A, $B, $C)>; + +def : Pat<(int_ppc_qpx_qvfcpsgn v4f64:$A, v4f64:$B), + (QVFCPSGN $A, $B)>; + +// FCOPYSIGN's operand types need not agree. +def : Pat<(fcopysign v4f64:$frB, v4f32:$frA), + (QVFCPSGN (COPY_TO_REGCLASS $frA, QFRC), $frB)>; +def : Pat<(fcopysign QSRC:$frB, QFRC:$frA), + (QVFCPSGNs (COPY_TO_REGCLASS $frA, QSRC), $frB)>; + +def : Pat<(int_ppc_qpx_qvfneg v4f64:$A), (QVFNEG $A)>; +def : Pat<(int_ppc_qpx_qvfabs v4f64:$A), (QVFABS $A)>; +def : Pat<(int_ppc_qpx_qvfnabs v4f64:$A), (QVFNABS $A)>; + +def : Pat<(int_ppc_qpx_qvfriz v4f64:$A), (QVFRIZ $A)>; +def : Pat<(int_ppc_qpx_qvfrin v4f64:$A), (QVFRIN $A)>; +def : Pat<(int_ppc_qpx_qvfrip v4f64:$A), (QVFRIP $A)>; +def : Pat<(int_ppc_qpx_qvfrim v4f64:$A), (QVFRIM $A)>; + +def : Pat<(int_ppc_qpx_qvfre v4f64:$A), (QVFRE $A)>; +def : Pat<(int_ppc_qpx_qvfrsqrte v4f64:$A), (QVFRSQRTE $A)>; + +def : Pat<(int_ppc_qpx_qvfadd v4f64:$A, v4f64:$B), + (QVFADD $A, $B)>; +def : Pat<(int_ppc_qpx_qvfsub v4f64:$A, v4f64:$B), + (QVFSUB $A, $B)>; +def : Pat<(int_ppc_qpx_qvfmul v4f64:$A, v4f64:$B), + (QVFMUL $A, $B)>; + +// Additional QVFNMSUB patterns: -a*c + b == -(a*c - b) +def : Pat<(fma (fneg v4f64:$A), v4f64:$C, v4f64:$B), + (QVFNMSUB $A, $C, $B)>; +def : Pat<(fma v4f64:$A, (fneg v4f64:$C), v4f64:$B), + (QVFNMSUB $A, $C, $B)>; +def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B), + (QVFNMSUBSs $A, $C, $B)>; +def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B), + (QVFNMSUBSs $A, $C, $B)>; + +def : Pat<(int_ppc_qpx_qvfmadd v4f64:$A, v4f64:$B, v4f64:$C), + (QVFMADD $A, $B, $C)>; +def : Pat<(int_ppc_qpx_qvfnmadd v4f64:$A, v4f64:$B, v4f64:$C), + (QVFNMADD $A, $B, $C)>; +def : Pat<(int_ppc_qpx_qvfmsub v4f64:$A, v4f64:$B, v4f64:$C), + (QVFMSUB $A, $B, $C)>; +def : Pat<(int_ppc_qpx_qvfnmsub v4f64:$A, v4f64:$B, v4f64:$C), + (QVFNMSUB $A, $B, $C)>; + +def : Pat<(int_ppc_qpx_qvlfd xoaddr:$src), + (QVLFDX xoaddr:$src)>; +def : Pat<(int_ppc_qpx_qvlfda xoaddr:$src), + (QVLFDXA xoaddr:$src)>; +def : Pat<(int_ppc_qpx_qvlfs xoaddr:$src), + (QVLFSX xoaddr:$src)>; +def : Pat<(int_ppc_qpx_qvlfsa xoaddr:$src), + (QVLFSXA xoaddr:$src)>; +def : Pat<(int_ppc_qpx_qvlfcda xoaddr:$src), + (QVLFCDXA xoaddr:$src)>; +def : Pat<(int_ppc_qpx_qvlfcd xoaddr:$src), + (QVLFCDX xoaddr:$src)>; +def : Pat<(int_ppc_qpx_qvlfcsa xoaddr:$src), + (QVLFCSXA xoaddr:$src)>; +def : Pat<(int_ppc_qpx_qvlfcs xoaddr:$src), + (QVLFCSX xoaddr:$src)>; +def : Pat<(int_ppc_qpx_qvlfda xoaddr:$src), + (QVLFDXA xoaddr:$src)>; +def : Pat<(int_ppc_qpx_qvlfiwaa xoaddr:$src), + (QVLFIWAXA xoaddr:$src)>; +def : Pat<(int_ppc_qpx_qvlfiwa xoaddr:$src), + (QVLFIWAX xoaddr:$src)>; +def : Pat<(int_ppc_qpx_qvlfiwza xoaddr:$src), + (QVLFIWZXA xoaddr:$src)>; +def : Pat<(int_ppc_qpx_qvlfiwz xoaddr:$src), + (QVLFIWZX xoaddr:$src)>; +def : Pat<(int_ppc_qpx_qvlfsa xoaddr:$src), + (QVLFSXA xoaddr:$src)>; +def : Pat<(int_ppc_qpx_qvlpcld xoaddr:$src), + (QVLPCLDX xoaddr:$src)>; +def : Pat<(int_ppc_qpx_qvlpcls xoaddr:$src), + (QVLPCLSX xoaddr:$src)>; +def : Pat<(int_ppc_qpx_qvlpcrd xoaddr:$src), + (QVLPCRDX xoaddr:$src)>; +def : Pat<(int_ppc_qpx_qvlpcrs xoaddr:$src), + 
(QVLPCRSX xoaddr:$src)>; + +def : Pat<(int_ppc_qpx_qvstfd v4f64:$T, xoaddr:$dst), + (QVSTFDX $T, xoaddr:$dst)>; +def : Pat<(int_ppc_qpx_qvstfs v4f64:$T, xoaddr:$dst), + (QVSTFSX $T, xoaddr:$dst)>; +def : Pat<(int_ppc_qpx_qvstfcda v4f64:$T, xoaddr:$dst), + (QVSTFCDXA $T, xoaddr:$dst)>; +def : Pat<(int_ppc_qpx_qvstfcd v4f64:$T, xoaddr:$dst), + (QVSTFCDX $T, xoaddr:$dst)>; +def : Pat<(int_ppc_qpx_qvstfcsa v4f64:$T, xoaddr:$dst), + (QVSTFCSXA $T, xoaddr:$dst)>; +def : Pat<(int_ppc_qpx_qvstfcs v4f64:$T, xoaddr:$dst), + (QVSTFCSX $T, xoaddr:$dst)>; +def : Pat<(int_ppc_qpx_qvstfda v4f64:$T, xoaddr:$dst), + (QVSTFDXA $T, xoaddr:$dst)>; +def : Pat<(int_ppc_qpx_qvstfiwa v4f64:$T, xoaddr:$dst), + (QVSTFIWXA $T, xoaddr:$dst)>; +def : Pat<(int_ppc_qpx_qvstfiw v4f64:$T, xoaddr:$dst), + (QVSTFIWX $T, xoaddr:$dst)>; +def : Pat<(int_ppc_qpx_qvstfsa v4f64:$T, xoaddr:$dst), + (QVSTFSXA $T, xoaddr:$dst)>; + +def : Pat<(pre_store v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (QVSTFDUX $rS, $ptrreg, $ptroff)>; +def : Pat<(pre_store v4f32:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (QVSTFSUX $rS, $ptrreg, $ptroff)>; +def : Pat<(pre_truncstv4f32 v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (QVSTFSUXs $rS, $ptrreg, $ptroff)>; + +def : Pat<(int_ppc_qpx_qvflogical v4f64:$A, v4f64:$B, (i32 imm:$idx)), + (QVFLOGICAL $A, $B, imm:$idx)>; +def : Pat<(int_ppc_qpx_qvgpci (u12:$idx)), + (QVGPCI imm:$idx)>; + +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOGE), + (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), + (QVFTSTNANb $FRA, $FRB), (i32 8))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOLE), + (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), + (QVFTSTNANb $FRA, $FRB), (i32 8))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETONE), + (QVFLOGICALb (QVFCMPEQb $FRA, $FRB), + (QVFTSTNANb $FRA, $FRB), (i32 8))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETO), + (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), + (QVFTSTNANb $FRA, $FRB), (i32 10))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUEQ), + (QVFLOGICALb (QVFCMPEQb $FRA, $FRB), + (QVFTSTNANb $FRA, $FRB), (i32 7))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGT), + (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), + (QVFTSTNANb $FRA, $FRB), (i32 7))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGE), + (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), + (QVFCMPLTb $FRA, $FRB), (i32 13))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETULT), + (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), + (QVFTSTNANb $FRA, $FRB), (i32 7))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETULE), + (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), + (QVFCMPGTb $FRA, $FRB), (i32 13))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUNE), + (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), + (QVFCMPEQb $FRA, $FRB), (i32 13))>; + +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETEQ), + (QVFCMPEQb $FRA, $FRB)>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGT), + (QVFCMPGTb $FRA, $FRB)>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGE), + (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), + (QVFCMPLTb $FRA, $FRB), (i32 10))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLT), + (QVFCMPLTb $FRA, $FRB)>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLE), + (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), + (QVFCMPGTb $FRA, $FRB), (i32 10))>; +def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETNE), + (QVFLOGICALb (QVFCMPEQb $FRA, $FRB), + (QVFCMPEQb $FRA, $FRB), (i32 10))>; + +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOGE), + (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), + (QVFTSTNANbs $FRA, $FRB), (i32 8))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOLE), + (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), + (QVFTSTNANbs $FRA, $FRB), (i32 8))>; +def : 
Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETONE), + (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB), + (QVFTSTNANbs $FRA, $FRB), (i32 8))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETO), + (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), + (QVFTSTNANbs $FRA, $FRB), (i32 10))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUEQ), + (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB), + (QVFTSTNANbs $FRA, $FRB), (i32 7))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGT), + (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), + (QVFTSTNANbs $FRA, $FRB), (i32 7))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGE), + (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), + (QVFCMPLTbs $FRA, $FRB), (i32 13))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULT), + (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), + (QVFTSTNANbs $FRA, $FRB), (i32 7))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULE), + (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), + (QVFCMPGTbs $FRA, $FRB), (i32 13))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUNE), + (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), + (QVFCMPEQbs $FRA, $FRB), (i32 13))>; + +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETEQ), + (QVFCMPEQbs $FRA, $FRB)>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGT), + (QVFCMPGTbs $FRA, $FRB)>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGE), + (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), + (QVFCMPLTbs $FRA, $FRB), (i32 10))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLT), + (QVFCMPLTbs $FRA, $FRB)>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLE), + (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), + (QVFCMPGTbs $FRA, $FRB), (i32 10))>; +def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETNE), + (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB), + (QVFCMPEQbs $FRA, $FRB), (i32 10))>; + +def : Pat<(and v4i1:$FRA, (not v4i1:$FRB)), + (QVFLOGICALb $FRA, $FRB, (i32 4))>; +def : Pat<(not (or v4i1:$FRA, v4i1:$FRB)), + (QVFLOGICALb $FRA, $FRB, (i32 8))>; +def : Pat<(not (xor v4i1:$FRA, v4i1:$FRB)), + (QVFLOGICALb $FRA, $FRB, (i32 9))>; +def : Pat<(or v4i1:$FRA, (not v4i1:$FRB)), + (QVFLOGICALb $FRA, $FRB, (i32 13))>; +def : Pat<(not (and v4i1:$FRA, v4i1:$FRB)), + (QVFLOGICALb $FRA, $FRB, (i32 14))>; + +def : Pat<(and v4i1:$FRA, v4i1:$FRB), + (QVFLOGICALb $FRA, $FRB, (i32 1))>; +def : Pat<(or v4i1:$FRA, v4i1:$FRB), + (QVFLOGICALb $FRA, $FRB, (i32 7))>; +def : Pat<(xor v4i1:$FRA, v4i1:$FRB), + (QVFLOGICALb $FRA, $FRB, (i32 6))>; +def : Pat<(not v4i1:$FRA), + (QVFLOGICALb $FRA, $FRA, (i32 10))>; + +def : Pat<(v4f64 (fpextend v4f32:$src)), + (COPY_TO_REGCLASS $src, QFRC)>; + +def : Pat<(v4f32 (fround_exact v4f64:$src)), + (COPY_TO_REGCLASS $src, QSRC)>; + +// Extract the underlying floating-point values from the +// QPX (-1.0, 1.0) boolean representation. 
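+// (A sketch of the encoding, per the QBRC register class comment in +// PPCRegisterInfo.td below: each v4i1 lane is held in a 64-bit floating-point +// slot where a positive value means true and anything else, including NaN, +// means false; PPCqbflt is therefore just a register-class copy that exposes +// the v4f64 payload, as the pattern below shows.)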
+def : Pat<(v4f64 (PPCqbflt v4i1:$src)), + (COPY_TO_REGCLASS $src, QFRC)>; + +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLT)), + (SELECT_QFRC (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETULT)), + (SELECT_QFRC (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLE)), + (SELECT_QFRC (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETULE)), + (SELECT_QFRC (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETEQ)), + (SELECT_QFRC (CREQV $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGE)), + (SELECT_QFRC (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETUGE)), + (SELECT_QFRC (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGT)), + (SELECT_QFRC (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETUGT)), + (SELECT_QFRC (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETNE)), + (SELECT_QFRC (CRXOR $lhs, $rhs), $tval, $fval)>; + +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLT)), + (SELECT_QSRC (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETULT)), + (SELECT_QSRC (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLE)), + (SELECT_QSRC (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETULE)), + (SELECT_QSRC (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETEQ)), + (SELECT_QSRC (CREQV $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGE)), + (SELECT_QSRC (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETUGE)), + (SELECT_QSRC (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGT)), + (SELECT_QSRC (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETUGT)), + (SELECT_QSRC (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETNE)), + (SELECT_QSRC (CRXOR $lhs, $rhs), $tval, $fval)>; + +def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLT)), + (SELECT_QBRC (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETULT)), + (SELECT_QBRC (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLE)), + (SELECT_QBRC (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETULE)), + (SELECT_QBRC (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETEQ)), + (SELECT_QBRC (CREQV $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGE)), + (SELECT_QBRC (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4i1 (selectcc i1:$lhs, 
i1:$rhs, v4i1:$tval, v4i1:$fval, SETUGE)), + (SELECT_QBRC (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGT)), + (SELECT_QBRC (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETUGT)), + (SELECT_QBRC (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETNE)), + (SELECT_QBRC (CRXOR $lhs, $rhs), $tval, $fval)>; + +} // end HasQPX + +let Predicates = [HasQPX, NoNaNsFPMath] in { +def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB), + (QVFSELb (QVFCMPLTb $FRA, $FRB), $FRB, $FRA)>; +def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB), + (QVFSELb (QVFCMPGTb $FRA, $FRB), $FRB, $FRA)>; + +def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB), + (QVFSELbs (QVFCMPLTbs $FRA, $FRB), $FRB, $FRA)>; +def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB), + (QVFSELbs (QVFCMPGTbs $FRA, $FRB), $FRB, $FRA)>; +} + +let Predicates = [HasQPX, NaNsFPMath] in { +// When either of these operands is NaN, we should return the other operand. +// QVFCMPLT/QVFCMPGT return false if either operand is NaN, which means we need +// to explicitly OR with a NaN test on the second operand. +def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB), + (QVFSELb (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), + (QVFTSTNANb $FRB, $FRB), (i32 7)), + $FRB, $FRA)>; +def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB), + (QVFSELb (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), + (QVFTSTNANb $FRB, $FRB), (i32 7)), + $FRB, $FRA)>; + +def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB), + (QVFSELbs (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), + (QVFTSTNANbs $FRB, $FRB), (i32 7)), + $FRB, $FRA)>; +def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB), + (QVFSELbs (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), + (QVFTSTNANbs $FRB, $FRB), (i32 7)), + $FRB, $FRA)>; +} diff --git a/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp b/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp new file mode 100644 index 0000000000000..6e90426438208 --- /dev/null +++ b/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp @@ -0,0 +1,161 @@ +//===----- PPCQPXLoadSplat.cpp - QPX Load Splat Simplification ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The QPX vector registers overlay the scalar floating-point registers, and +// any scalar floating-point loads splat their value across all vector lanes. +// Thus, if we have a scalar load followed by a splat, we can remove the splat +// (i.e. replace the load with a load-and-splat pseudo instruction). +// +// This pass must run after anything that might do store-to-load forwarding.
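+// +// A minimal sketch of the rewrite this pass performs (register names are +// illustrative, taken from the in-code comment below): the input sequence +//   %f0 = LFD 0, killed %x3, implicit-def %qf0 +//   %qf1 = QVESPLATI killed %qf0, 0, implicit %rm +// becomes a single load that directly defines the scalar subregister of %qf1, +// and the now-redundant QVESPLATI is erased.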
+// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "PPCInstrBuilder.h" +#include "PPCInstrInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetMachine.h" +using namespace llvm; + +#define DEBUG_TYPE "ppc-qpx-load-splat" + +STATISTIC(NumSimplified, "Number of QPX load splats simplified"); + +namespace { + struct PPCQPXLoadSplat : public MachineFunctionPass { + static char ID; + PPCQPXLoadSplat() : MachineFunctionPass(ID) { + initializePPCQPXLoadSplatPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &Fn) override; + + StringRef getPassName() const override { + return "PowerPC QPX Load Splat Simplification"; + } + }; + char PPCQPXLoadSplat::ID = 0; +} + +INITIALIZE_PASS(PPCQPXLoadSplat, "ppc-qpx-load-splat", + "PowerPC QPX Load Splat Simplification", + false, false) + +FunctionPass *llvm::createPPCQPXLoadSplatPass() { + return new PPCQPXLoadSplat(); +} + +bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + bool MadeChange = false; + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + + for (auto MFI = MF.begin(), MFIE = MF.end(); MFI != MFIE; ++MFI) { + MachineBasicBlock *MBB = &*MFI; + SmallVector<MachineInstr *, 4> Splats; + + for (auto MBBI = MBB->rbegin(); MBBI != MBB->rend(); ++MBBI) { + MachineInstr *MI = &*MBBI; + + if (MI->hasUnmodeledSideEffects() || MI->isCall()) { + Splats.clear(); + continue; + } + + // We're looking for a sequence like this: + // %f0 = LFD 0, killed %x3, implicit-def %qf0; mem:LD8[%a](tbaa=!2) + // %qf1 = QVESPLATI killed %qf0, 0, implicit %rm + + for (auto SI = Splats.begin(); SI != Splats.end();) { + MachineInstr *SMI = *SI; + Register SplatReg = SMI->getOperand(0).getReg(); + Register SrcReg = SMI->getOperand(1).getReg(); + + if (MI->modifiesRegister(SrcReg, TRI)) { + switch (MI->getOpcode()) { + default: + SI = Splats.erase(SI); + continue; + case PPC::LFS: + case PPC::LFD: + case PPC::LFSU: + case PPC::LFDU: + case PPC::LFSUX: + case PPC::LFDUX: + case PPC::LFSX: + case PPC::LFDX: + case PPC::LFIWAX: + case PPC::LFIWZX: + if (SplatReg != SrcReg) { + // We need to change the load to define the scalar subregister of + // the QPX splat source register. + unsigned SubRegIndex = + TRI->getSubRegIndex(SrcReg, MI->getOperand(0).getReg()); + Register SplatSubReg = TRI->getSubReg(SplatReg, SubRegIndex); + + // Substitute both the explicitly defined register, and also the + // implicit def of the containing QPX register. + MI->getOperand(0).setReg(SplatSubReg); + MI->substituteRegister(SrcReg, SplatReg, 0, *TRI); + } + + SI = Splats.erase(SI); + + // If SMI is directly after MI, then MBBI's base iterator is + // pointing at SMI. Adjust MBBI around the call to erase SMI to + // avoid invalidating MBBI. + ++MBBI; + SMI->eraseFromParent(); + --MBBI; + + ++NumSimplified; + MadeChange = true; + continue; + } + } + + // If this instruction defines the splat register, then we cannot move + // the previous definition above it. If it reads from the splat + // register, then it must already be alive from some previous + // definition, and if the splat register is different from the source + // register, then this definition must not be the load for which we're + // searching.
+ if (MI->modifiesRegister(SplatReg, TRI) || + (SrcReg != SplatReg && + MI->readsRegister(SplatReg, TRI))) { + SI = Splats.erase(SI); + continue; + } + + ++SI; + } + + if (MI->getOpcode() != PPC::QVESPLATI && + MI->getOpcode() != PPC::QVESPLATIs && + MI->getOpcode() != PPC::QVESPLATIb) + continue; + if (MI->getOperand(2).getImm() != 0) + continue; + + // If there are other uses of the scalar value after this, replacing + // those uses might be non-trivial. + if (!MI->getOperand(1).isKill()) + continue; + + Splats.push_back(MI); + } + } + + return MadeChange; +} diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 96666ad58dfe5..ed8948a639728 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -404,6 +404,9 @@ unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, } case PPC::F8RCRegClassID: case PPC::F4RCRegClassID: + case PPC::QFRCRegClassID: + case PPC::QSRCRegClassID: + case PPC::QBRCRegClassID: case PPC::VRRCRegClassID: case PPC::VFRCRegClassID: case PPC::VSLRCRegClassID: diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h index a931967862c7b..61acd955e1cba 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -153,6 +153,7 @@ class PPCRegisterInfo : public PPCGenRegisterInfo { switch (RegName[0]) { case 'r': case 'f': + case 'q': // for QPX case 'v': if (RegName[1] == 's') return RegName + 2; diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td index e07b960ae305b..b45757c1acc5e 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td @@ -54,6 +54,13 @@ class FPR<bits<5> num, string n> : PPCReg<n> { let HWEncoding{4-0} = num; } +// QFPR - One of the 32 256-bit floating-point vector registers (used for QPX) +class QFPR<FPR SubReg, string n> : PPCReg<n> { + let HWEncoding = SubReg.HWEncoding; + let SubRegs = [SubReg]; + let SubRegIndices = [sub_64]; +} + // VF - One of the 32 64-bit floating-point subregisters of the vector // registers (used by VSX). class VF<bits<5> num, string n> : PPCReg<n> { @@ -125,6 +132,12 @@ foreach Index = 0-31 in { DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>; } +// QPX Floating-point registers +foreach Index = 0-31 in { + def QF#Index : QFPR<!cast<FPR>("F"#Index), "q"#Index>, + DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>; +} + // Vector registers foreach Index = 0-31 in { def V#Index : VR<!cast<VF>("VF"#Index), "v"#Index>, @@ -330,6 +343,16 @@ def SPILLTOVSRRC : RegisterClass<"PPC", [i64, f64], 64, (add G8RC, (sub VSFRC, // Register class for single precision scalars in VSX registers def VSSRC : RegisterClass<"PPC", [f32], 32, (add VSFRC)>; +// For QPX +def QFRC : RegisterClass<"PPC", [v4f64], 256, (add (sequence "QF%u", 0, 13), + (sequence "QF%u", 31, 14))>; +def QSRC : RegisterClass<"PPC", [v4f32], 128, (add QFRC)>; +def QBRC : RegisterClass<"PPC", [v4i1], 256, (add QFRC)> { + // These are actually stored as floating-point values where a positive + // number is true and anything else (including NaN) is false.
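+  // Each boolean lane thus occupies a full 64-bit floating-point slot, so a
+  // v4i1 value still takes an entire 256-bit register: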
+ let Size = 256; +} + def CRBITRC : RegisterClass<"PPC", [i1], 32, (add CR2LT, CR2GT, CR2EQ, CR2UN, CR3LT, CR3GT, CR3EQ, CR3UN, diff --git a/llvm/lib/Target/PowerPC/PPCScheduleP9.td b/llvm/lib/Target/PowerPC/PPCScheduleP9.td index 311d5cf165f63..0a1ae7e55b3c2 100644 --- a/llvm/lib/Target/PowerPC/PPCScheduleP9.td +++ b/llvm/lib/Target/PowerPC/PPCScheduleP9.td @@ -40,9 +40,12 @@ def P9Model : SchedMachineModel { let CompleteModel = 1; - // Do not support SPE (Signal Processing Engine), prefixed instructions on - // Power 9, PC relative mem ops, or instructions introduced in ISA 3.1. - let UnsupportedFeatures = [HasSPE, PrefixInstrs, PCRelativeMemops, IsISA3_1]; + // Do not support QPX (Quad Processing eXtension), SPE (Signal Processing + // Engine), prefixed instructions on Power 9, PC relative mem ops, or + // instructions introduced in ISA 3.1. + let UnsupportedFeatures = [HasQPX, HasSPE, PrefixInstrs, PCRelativeMemops, + IsISA3_1]; + } let SchedModel = P9Model in { diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index 85d2966654970..3836cc960394f 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -35,6 +35,10 @@ using namespace llvm; static cl::opt<bool> UseSubRegLiveness("ppc-track-subreg-liveness", cl::desc("Enable subregister liveness tracking for PPC"), cl::Hidden); +static cl::opt<bool> QPXStackUnaligned("qpx-stack-unaligned", + cl::desc("Even when QPX is enabled the stack is not 32-byte aligned"), + cl::Hidden); + static cl::opt<bool> EnableMachinePipeliner("ppc-enable-pipeliner", cl::desc("Enable Machine Pipeliner for PPC"), @@ -66,6 +70,7 @@ void PPCSubtarget::initializeEnvironment() { HasAltivec = false; HasSPE = false; HasFPU = false; + HasQPX = false; HasVSX = false; NeedsTwoConstNR = false; HasP8Vector = false; @@ -104,6 +109,7 @@ void PPCSubtarget::initializeEnvironment() { HasInvariantFunctionDescriptors = false; HasPartwordAtomics = false; HasDirectMove = false; + IsQPXStackUnaligned = false; HasHTM = false; HasFloat128 = false; HasFusion = false; @@ -152,7 +158,7 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (HasSPE && IsPPC64) report_fatal_error( "SPE is only supported for 32-bit targets.\n", false); - if (HasSPE && (HasAltivec || HasVSX || HasFPU)) + if (HasSPE && (HasAltivec || HasQPX || HasVSX || HasFPU)) report_fatal_error( "SPE and traditional floating point cannot both be enabled.\n", false); @@ -160,6 +166,10 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (!HasSPE) HasFPU = true; + // QPX requires a 32-byte aligned stack. Note that we need to do this if + // we're compiling for a BG/Q system regardless of whether or not QPX + // is enabled because external functions will assume this alignment. + IsQPXStackUnaligned = QPXStackUnaligned; StackAlignment = getPlatformStackAlignment(); // Determine endianness.
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index 8a4041518e3c2..ec329022c4572 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -97,6 +97,7 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool HasAltivec; bool HasFPU; bool HasSPE; + bool HasQPX; bool HasVSX; bool NeedsTwoConstNR; bool HasP8Vector; @@ -149,6 +150,11 @@ class PPCSubtarget : public PPCGenSubtargetInfo { POPCNTDKind HasPOPCNTD; + /// When targeting QPX on a stock PPC64 Linux kernel where the stack + /// alignment has not been changed, we need to keep the 16-byte alignment + /// of the stack. + bool IsQPXStackUnaligned; + const PPCTargetMachine &TM; PPCFrameLowering FrameLowering; PPCInstrInfo InstrInfo; @@ -249,6 +255,7 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool hasAltivec() const { return HasAltivec; } bool hasSPE() const { return HasSPE; } bool hasFPU() const { return HasFPU; } + bool hasQPX() const { return HasQPX; } bool hasVSX() const { return HasVSX; } bool needsTwoConstNR() const { return NeedsTwoConstNR; } bool hasP8Vector() const { return HasP8Vector; } @@ -284,7 +291,11 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool hasPartwordAtomics() const { return HasPartwordAtomics; } bool hasDirectMove() const { return HasDirectMove; } + bool isQPXStackUnaligned() const { return IsQPXStackUnaligned; } Align getPlatformStackAlignment() const { + if ((hasQPX() || isBGQ()) && !isQPXStackUnaligned()) + return Align(32); + return Align(16); } @@ -314,6 +325,9 @@ class PPCSubtarget : public PPCGenSubtargetInfo { const Triple &getTargetTriple() const { return TargetTriple; } + /// isBGQ - True if this is a BG/Q platform. + bool isBGQ() const { return TargetTriple.getVendor() == Triple::BGQ; } + bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index 27de5b29cd341..f15f9c7f49429 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -63,6 +63,10 @@ static cl:: opt<bool> DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden, cl::desc("Disable VSX Swap Removal for PPC")); +static cl:: +opt<bool> DisableQPXLoadSplat("disable-ppc-qpx-load-splat", cl::Hidden, + cl::desc("Disable QPX load splat simplification")); + static cl:: opt<bool> DisableMIPeephole("disable-ppc-peephole", cl::Hidden, cl::desc("Disable machine peepholes for PPC")); @@ -110,6 +114,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() { initializePPCReduceCRLogicalsPass(PR); initializePPCBSelPass(PR); initializePPCBranchCoalescingPass(PR); + initializePPCQPXLoadSplatPass(PR); initializePPCBoolRetToIntPass(PR); initializePPCExpandISELPass(PR); initializePPCPreEmitPeepholePass(PR); @@ -406,9 +411,14 @@ void PPCPassConfig::addIRPasses() { // Lower generic MASSV routines to PowerPC subtarget-specific entries. addPass(createPPCLowerMASSVEntriesPass()); - - // If explicitly requested, add explicit data prefetch intrinsics. + + // For the BG/Q (or if explicitly requested), add explicit data prefetch // intrinsics.
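+  // (The EnablePrefetch option tested just below overrides this default in
+  // either direction when it appears on the command line.)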
+ bool UsePrefetching = TM->getTargetTriple().getVendor() == Triple::BGQ && + getOptLevel() != CodeGenOpt::None; if (EnablePrefetch.getNumOccurrences() > 0) + UsePrefetching = EnablePrefetch; + if (UsePrefetching) addPass(createLoopDataPrefetchPass()); if (TM->getOptLevel() >= CodeGenOpt::Default && EnableGEPOpt) { @@ -505,8 +515,15 @@ void PPCPassConfig::addPreRegAlloc() { } void PPCPassConfig::addPreSched2() { - if (getOptLevel() != CodeGenOpt::None) + if (getOptLevel() != CodeGenOpt::None) { addPass(&IfConverterID); + + // This optimization must happen after anything that might do store-to-load + // forwarding. Here we're after RA (and, thus, when spills are inserted) + // but before post-RA scheduling. + if (!DisableQPXLoadSplat) + addPass(createPPCQPXLoadSplatPass()); + } } void PPCPassConfig::addPreEmitPass() { diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index ee8842f4d8663..bbb4239d36da5 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -25,7 +25,8 @@ using namespace llvm; static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting", cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden); -// This is currently only used for the data prefetch pass +// This is currently only used for the data prefetch pass which is only enabled +// for BG/Q by default. static cl::opt<unsigned> CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64), cl::desc("The loop prefetch cache line size")); @@ -103,6 +104,55 @@ PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); return new StoreInst(II.getArgOperand(0), Ptr, false, Align(1)); } + case Intrinsic::ppc_qpx_qvlfs: + // Turn PPC QPX qvlfs -> load if the pointer is known aligned. + if (getOrEnforceKnownAlignment( + II.getArgOperand(0), Align(16), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) { + Type *VTy = + VectorType::get(IC.Builder.getFloatTy(), + cast<VectorType>(II.getType())->getElementCount()); + Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(0), + PointerType::getUnqual(VTy)); + Value *Load = IC.Builder.CreateLoad(VTy, Ptr); + return new FPExtInst(Load, II.getType()); + } + break; + case Intrinsic::ppc_qpx_qvlfd: + // Turn PPC QPX qvlfd -> load if the pointer is known aligned. + if (getOrEnforceKnownAlignment( + II.getArgOperand(0), Align(32), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 32) { + Value *Ptr = IC.Builder.CreateBitCast( + II.getArgOperand(0), PointerType::getUnqual(II.getType())); + return new LoadInst(II.getType(), Ptr, "", false, Align(32)); + } + break; + case Intrinsic::ppc_qpx_qvstfs: + // Turn PPC QPX qvstfs -> store if the pointer is known aligned. + if (getOrEnforceKnownAlignment( + II.getArgOperand(1), Align(16), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) { + Type *VTy = VectorType::get( + IC.Builder.getFloatTy(), + cast<VectorType>(II.getArgOperand(0)->getType())->getElementCount()); + Value *TOp = IC.Builder.CreateFPTrunc(II.getArgOperand(0), VTy); + Type *OpPtrTy = PointerType::getUnqual(VTy); + Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); + return new StoreInst(TOp, Ptr, false, Align(16)); + } + break; + case Intrinsic::ppc_qpx_qvstfd: + // Turn PPC QPX qvstfd -> store if the pointer is known aligned.
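+    // Note the asymmetry with the qvlfs/qvstfs cases above: the f32 forms
+    // only need 16-byte alignment, while the v4f64 forms require the full
+    // 32-byte QPX alignment.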
+ if (getOrEnforceKnownAlignment( + II.getArgOperand(1), Align(32), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 32) { + Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType()); + Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); + return new StoreInst(II.getArgOperand(0), Ptr, false, Align(32)); + } + break; + case Intrinsic::ppc_altivec_vperm: // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. // Note that ppc_altivec_vperm has a big-endian bias, so when creating @@ -686,7 +736,10 @@ bool PPCTTIImpl::useColdCCForColdCall(Function &F) { } bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { - // On the A2, always unroll aggressively. + // On the A2, always unroll aggressively. For QPX unaligned loads, we depend + // on combining the loads generated for consecutive accesses, and failure to + // do so is particularly expensive. Aggressive unrolling makes that combining + // much more likely (compared to only using concatenation unrolling). if (ST->getCPUDirective() == PPC::DIR_A2) return true; @@ -746,6 +799,7 @@ const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const { unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const { if (Vector) { + if (ST->hasQPX()) return 256; if (ST->hasAltivec()) return 128; return 0; } @@ -774,6 +828,8 @@ unsigned PPCTTIImpl::getCacheLineSize() const { } unsigned PPCTTIImpl::getPrefetchDistance() const { + // This seems like a reasonable default for the BG/Q (this pass is enabled, by + // default, only on the BG/Q). return 300; } @@ -862,7 +918,7 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - // PPC, for both Altivec/VSX, support cheap arbitrary permutations + // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations // (at least in the sense that there need only be one non-loop-invariant // instruction). We need one such shuffle instruction for each actual // register (this is not true for arbitrary shuffles, but is true for the @@ -918,6 +974,13 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { return Cost; + } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) { + // Floating point scalars are already located in index #0. + if (Index == 0) + return 0; + + return Cost; + } else if (Val->getScalarType()->isIntegerTy() && Index != -1U) { if (ST->hasP9Altivec()) { if (ISD == ISD::INSERT_VECTOR_ELT) @@ -992,6 +1055,8 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, LT.second == MVT::v4i32 || LT.second == MVT::v4f32); bool IsVSXType = ST->hasVSX() && (LT.second == MVT::v2f64 || LT.second == MVT::v2i64); + bool IsQPXType = ST->hasQPX() && + (LT.second == MVT::v4f64 || LT.second == MVT::v4f32); // VSX has 32b/64b load instructions. Legalization can handle loading of // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and @@ -1014,7 +1079,8 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, // for Altivec types using the VSX instructions, but that's more expensive // than using the permutation-based load sequence. On the P8, that's no // longer true. - if (Opcode == Instruction::Load && (!ST->hasP8Vector() && IsAltivecType) && + if (Opcode == Instruction::Load && + ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) && *Alignment >= LT.second.getScalarType().getStoreSize()) return Cost + LT.first; // Add the cost of the permutations.
@@ -1067,7 +1133,7 @@ int PPCTTIImpl::getInterleavedMemoryOpCost( getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind); - // PPC, for both Altivec/VSX, support cheap arbitrary permutations + // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations // (at least in the sense that there need only be one non-loop-invariant // instruction). For each result vector, we need one shuffle per incoming // vector (except that the first shuffle can take two incoming vectors diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index aa06e8144f634..427abde4277d4 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4751,14 +4751,15 @@ struct VarArgPowerPC64Helper : public VarArgHelper { // For PowerPC, we need to deal with alignment of stack arguments - // they are mostly aligned to 8 bytes, but vectors and i128 arrays // are aligned to 16 bytes, byvals can be aligned to 8 or 16 bytes, - // For that reason, we compute current offset from stack pointer (which is - // always properly aligned), and offset for the first vararg, then subtract - // them. + // and QPX vectors are aligned to 32 bytes. For that reason, we + // compute current offset from stack pointer (which is always properly + // aligned), and offset for the first vararg, then subtract them. unsigned VAArgBase; Triple TargetTriple(F.getParent()->getTargetTriple()); // Parameter save area starts at 48 bytes from frame pointer for ABIv1, // and 32 bytes for ABIv2. This is usually determined by target // endianness, but in theory could be overridden by function attribute. + // For simplicity, we ignore it here (it'd only matter for QPX vectors).
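+  // (Per the comment above, the Triple::ppc64 check below selects the
+  // 48-byte ABIv1 save-area base; other 64-bit PPC triples get the 32-byte
+  // ABIv2 base.)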
if (TargetTriple.getArch() == Triple::ppc64) VAArgBase = 48; else diff --git a/llvm/test/Analysis/BasicAA/phi-spec-order.ll b/llvm/test/Analysis/BasicAA/phi-spec-order.ll index e5d435c09ccc7..f8586f094c2ce 100644 --- a/llvm/test/Analysis/BasicAA/phi-spec-order.ll +++ b/llvm/test/Analysis/BasicAA/phi-spec-order.ll @@ -1,5 +1,5 @@ target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64le-unknown-linux" +target triple = "powerpc64-bgq-linux" ; RUN: opt < %s -basic-aa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s @X = external global [16000 x double], align 32 diff --git a/llvm/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll b/llvm/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll index e5fbf070cf32a..3b1bc3b3fdbc0 100644 --- a/llvm/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll +++ b/llvm/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll @@ -218,6 +218,42 @@ entry: ; CHECK: cost of 2 for instruction: %r = load <4 x double>, <4 x double>* %p, align 8 } +define <4 x float> @test_l_qv4float(<4 x float>* %p) #1 { +entry: + %r = load <4 x float>, <4 x float>* %p, align 4 + ret <4 x float> %r + +; CHECK-LABEL: test_l_qv4float +; CHECK: cost of 2 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4 +} + +define <8 x float> @test_l_qv8float(<8 x float>* %p) #1 { +entry: + %r = load <8 x float>, <8 x float>* %p, align 4 + ret <8 x float> %r + +; CHECK-LABEL: test_l_qv8float +; CHECK: cost of 4 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4 +} + +define <4 x double> @test_l_qv4double(<4 x double>* %p) #1 { +entry: + %r = load <4 x double>, <4 x double>* %p, align 8 + ret <4 x double> %r + +; CHECK-LABEL: test_l_qv4double +; CHECK: cost of 2 for instruction: %r = load <4 x double>, <4 x double>* %p, align 8 +} + +define <8 x double> @test_l_qv8double(<8 x double>* %p) #1 { +entry: + %r = load <8 x double>, <8 x double>* %p, align 8 + ret <8 x double> %r + +; CHECK-LABEL: test_l_qv8double +; CHECK: cost of 4 for instruction: %r = load <8 x double>, <8 x double>* %p, align 8 +} + define void @test_s_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 { entry: store <16 x i8> %v, <16 x i8>* %p, align 1 @@ -326,6 +362,43 @@ entry: ; CHECK: cost of 2 for instruction: store <4 x double> %v, <4 x double>* %p, align 8 } +define void @test_s_qv4float(<4 x float>* %p, <4 x float> %v) #1 { +entry: + store <4 x float> %v, <4 x float>* %p, align 4 + ret void + +; CHECK-LABEL: test_s_qv4float +; CHECK: cost of 7 for instruction: store <4 x float> %v, <4 x float>* %p, align 4 +} + +define void @test_s_qv8float(<8 x float>* %p, <8 x float> %v) #1 { +entry: + store <8 x float> %v, <8 x float>* %p, align 4 + ret void + +; CHECK-LABEL: test_s_qv8float +; CHECK: cost of 15 for instruction: store <8 x float> %v, <8 x float>* %p, align 4 +} + +define void @test_s_qv4double(<4 x double>* %p, <4 x double> %v) #1 { +entry: + store <4 x double> %v, <4 x double>* %p, align 8 + ret void + +; CHECK-LABEL: test_s_qv4double +; CHECK: cost of 7 for instruction: store <4 x double> %v, <4 x double>* %p, align 8 +} + +define void @test_s_qv8double(<8 x double>* %p, <8 x double> %v) #1 { +entry: + store <8 x double> %v, <8 x double>* %p, align 8 + ret void + +; CHECK-LABEL: test_s_qv8double +; CHECK: cost of 15 for instruction: store <8 x double> %v, <8 x double>* %p, align 8 +} + attributes #0 = { nounwind "target-cpu"="pwr7" } +attributes #1 = { nounwind "target-cpu"="a2q" } attributes #2 = 
{ nounwind "target-cpu"="pwr8" } diff --git a/llvm/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll b/llvm/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll index d93f192b1274d..69f9cff5c525f 100644 --- a/llvm/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll +++ b/llvm/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux -enable-misched < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux -enable-misched < %s | FileCheck %s ; ; PR14315: misched should not move the physreg copy of %t below the calls. diff --git a/llvm/test/CodeGen/PowerPC/DisableHoistingDueToBlockHotnessProfileData.mir b/llvm/test/CodeGen/PowerPC/DisableHoistingDueToBlockHotnessProfileData.mir index a0139879f8c91..738aa1df5dd9d 100644 --- a/llvm/test/CodeGen/PowerPC/DisableHoistingDueToBlockHotnessProfileData.mir +++ b/llvm/test/CodeGen/PowerPC/DisableHoistingDueToBlockHotnessProfileData.mir @@ -55,7 +55,7 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(i8*, i8**) #1 - attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind } !llvm.module.flags = !{!0, !1} diff --git a/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir b/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir index 01ce79995512a..bcd51d31c6cfd 100644 --- a/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir +++ b/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir @@ -30,7 +30,7 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(i8*, i8**) #1 - attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" 
"target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind } !llvm.ident = !{!0} diff --git a/llvm/test/CodeGen/PowerPC/a2q-stackalign.ll b/llvm/test/CodeGen/PowerPC/a2q-stackalign.ll new file mode 100644 index 0000000000000..17e3df6d58ccc --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/a2q-stackalign.ll @@ -0,0 +1,23 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64-- -mcpu=a2 | FileCheck -check-prefix=CHECK-A2 %s +; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64-- -mcpu=a2q | FileCheck -check-prefix=CHECK-A2Q %s +; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-bgq-linux -mcpu=a2 | FileCheck -check-prefix=CHECK-BGQ %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +declare i32 @bar(i8* %a) nounwind; +define i32 @foo() nounwind { + %p = alloca i8, i8 115 + store i8 0, i8* %p + %r = call i32 @bar(i8* %p) + ret i32 %r +} + +; Without QPX, the allocated stack frame is 240 bytes, but with QPX +; (because we require 32-byte alignment), it is 256 bytes. +; CHECK-A2: @foo +; CHECK-A2: stdu 1, -240(1) +; CHECK-A2Q: @foo +; CHECK-A2Q: stdu 1, -256(1) +; CHECK-BGQ: @foo +; CHECK-BGQ: stdu 1, -256(1) + diff --git a/llvm/test/CodeGen/PowerPC/a2q.ll b/llvm/test/CodeGen/PowerPC/a2q.ll new file mode 100644 index 0000000000000..84e2dfa991d78 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/a2q.ll @@ -0,0 +1,10 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64-- -mcpu=a2q | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64-- -mcpu=a2 -mattr=+qpx | FileCheck %s + +define void @foo() { +entry: + ret void +} + +; CHECK: @foo + diff --git a/llvm/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll b/llvm/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll index d629148535aa7..1b0ea26f1fdea 100644 --- a/llvm/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll +++ b/llvm/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll @@ -298,7 +298,7 @@ _ZN10SubProcess12SafeSyscalls5fcntlEiil.exit: ; preds = %_ZN10SubProcess12Sa ; Function Attrs: nounwind argmemonly declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) #1 -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind argmemonly } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/asm-Zy.ll b/llvm/test/CodeGen/PowerPC/asm-Zy.ll index c8b5e9f1aa1d1..78bb0f4c73eca 100644 --- a/llvm/test/CodeGen/PowerPC/asm-Zy.ll +++ b/llvm/test/CodeGen/PowerPC/asm-Zy.ll @@ -1,5 +1,6 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2 -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mcpu=a2 | FileCheck %s target datalayout = 
"E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-bgq-linux" define i32 @zytest(i32 %a) nounwind { entry: diff --git a/llvm/test/CodeGen/PowerPC/asm-constraints.ll b/llvm/test/CodeGen/PowerPC/asm-constraints.ll index da77d1a169792..a3e573d8935e9 100644 --- a/llvm/test/CodeGen/PowerPC/asm-constraints.ll +++ b/llvm/test/CodeGen/PowerPC/asm-constraints.ll @@ -65,7 +65,7 @@ entry: } -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-R0-special-handling.mir b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-R0-special-handling.mir index 904210ee13477..2081e6fd02f51 100644 --- a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-R0-special-handling.mir +++ b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-R0-special-handling.mir @@ -63,8 +63,8 @@ ret i64 %2 } - attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" 
"stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir index f46d4fc0a42a4..b52e0a4103add 100644 --- a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir +++ b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir @@ -187,7 +187,7 @@ ret i64 %cond } - attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir index ba950dc3d3ae9..4d2595e1abdcb 100644 --- a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir +++ b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir @@ -983,10 +983,10 @@ ret i64 %xor } - attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #2 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" 
"target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #3 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #2 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,-vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #3 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/CodeGen/PowerPC/ctr-minmaxnum.ll b/llvm/test/CodeGen/PowerPC/ctr-minmaxnum.ll index 75640d1d26072..ed3c9f07c1a85 100644 --- a/llvm/test/CodeGen/PowerPC/ctr-minmaxnum.ll +++ b/llvm/test/CodeGen/PowerPC/ctr-minmaxnum.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s +; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s --check-prefix=QPX declare float @fabsf(float) @@ -63,6 +64,11 @@ loop_exit: ; CHECK-NOT: xsmindp ; CHECK: blr +; QPX-LABEL: test1v: +; QPX: mtctr +; QPX-NOT: bl fminf +; QPX: blr + define void @test1a(float %f, float* %fp) { entry: br label 
%loop_body @@ -133,6 +139,11 @@ loop_exit: ; CHECK-NOT: xsmaxdp ; CHECK: blr +; QPX-LABEL: test2v: +; QPX: mtctr +; QPX-NOT: bl fmax +; QPX: blr + define void @test2a(float %f, float* %fp) { entry: br label %loop_body diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-shortLoops.ll b/llvm/test/CodeGen/PowerPC/ctrloop-shortLoops.ll index 636c86b815c8c..44acfcdd6e66a 100644 --- a/llvm/test/CodeGen/PowerPC/ctrloop-shortLoops.ll +++ b/llvm/test/CodeGen/PowerPC/ctrloop-shortLoops.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=pwr8 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR8 +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=a2q | FileCheck %s --check-prefixes=CHECK,CHECK-A2Q ; Verify that we do NOT generate the mtctr instruction for loop trip counts < 4 ; The latency of the mtctr is only justified if there are more than 4 comparisons that are removed as a result. @@ -85,8 +86,11 @@ for.body: ; preds = %entry, %for.body } ; Function Attrs: norecurse nounwind +; On core a2q, IssueWidth is 1. On core pwr8, IssueWidth is 8. +; a2q should use mtctr, but pwr8 should not use mtctr. define signext i32 @testTripCount2NonSmallLoop() { ; CHECK-LABEL: testTripCount2NonSmallLoop: +; CHECK-A2Q: mtctr ; CHECK-PWR8-NOT: mtctr ; CHECK: blr @@ -117,9 +121,12 @@ for.end: ; preds = %if.end ret i32 %conv } +; On core a2q, IssueWidth is 1. On core pwr8, IssueWidth is 8. +; a2q should use mtctr, but pwr8 should not use mtctr. define signext i32 @testTripCount5() { ; CHECK-LABEL: testTripCount5: ; CHECK-PWR8-NOT: mtctr +; CHECK-A2Q: mtctr entry: %.prea = load i32, i32* @a, align 4 diff --git a/llvm/test/CodeGen/PowerPC/ec-input.ll b/llvm/test/CodeGen/PowerPC/ec-input.ll index 425bc1985d419..9a1c121699a69 100644 --- a/llvm/test/CodeGen/PowerPC/ec-input.ll +++ b/llvm/test/CodeGen/PowerPC/ec-input.ll @@ -5,7 +5,7 @@ ; that were both inputs to the inline asm and also early-clobber outputs). 
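; Hedged sketch (not part of the patch): the ctrloop-shortLoops.ll comments
; above explain that mtctr pays off only when it removes enough compare/branch
; work for the core's issue width (1 on a2q, 8 on pwr8). The hypothetical
; counted loop below can be fed to the same two configurations,
;   llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=a2q
;   llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=pwr8
; to observe whether a CTR loop (mtctr/bdnz) is formed; the name @sum5 and the
; 5-iteration trip count are illustrative assumptions, not taken from the tests.
define signext i32 @sum5(i32* %a) {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
  %sum = phi i32 [ 0, %entry ], [ %sum.next, %for.body ]
  %p = getelementptr inbounds i32, i32* %a, i64 %i
  %v = load i32, i32* %p, align 4
  %sum.next = add nsw i32 %sum, %v
  %i.next = add nuw nsw i64 %i, 1
  %exitcond = icmp eq i64 %i.next, 5
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret i32 %sum.next
}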
target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64le-unknown-linux" +target triple = "powerpc64-bgq-linux" %struct._IO_FILE.119.8249.32639.195239.200117.211499.218003.221255.222881.224507.226133.240767.244019.245645.248897.260279.271661.281417.283043.302555.304181.325319.326945.344713 = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker.118.8248.32638.195238.200116.211498.218002.221254.222880.224506.226132.240766.244018.245644.248896.260278.271660.281416.283042.302554.304180.325318.326944.344712*, %struct._IO_FILE.119.8249.32639.195239.200117.211499.218003.221255.222881.224507.226133.240767.244019.245645.248897.260279.271661.281417.283043.302555.304181.325319.326945.344713*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } %struct._IO_marker.118.8248.32638.195238.200116.211498.218002.221254.222880.224506.226132.240766.244018.245644.248896.260278.271660.281416.283042.302554.304180.325318.326944.344712 = type { %struct._IO_marker.118.8248.32638.195238.200116.211498.218002.221254.222880.224506.226132.240766.244018.245644.248896.260278.271660.281416.283042.302554.304180.325318.326944.344712*, %struct._IO_FILE.119.8249.32639.195239.200117.211499.218003.221255.222881.224507.226133.240767.244019.245645.248897.260279.271661.281417.283043.302555.304181.325319.326945.344713*, i32 } diff --git a/llvm/test/CodeGen/PowerPC/extra-toc-reg-deps.ll b/llvm/test/CodeGen/PowerPC/extra-toc-reg-deps.ll index 023928bcb5896..e066b45d3ca4b 100644 --- a/llvm/test/CodeGen/PowerPC/extra-toc-reg-deps.ll +++ b/llvm/test/CodeGen/PowerPC/extra-toc-reg-deps.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-unknown-linux" +target triple = "powerpc64-bgq-linux" %"class.Foam::messageStream.6" = type <{ %"class.Foam::string.5", i32, i32, i32, [4 x i8] }> %"class.Foam::string.5" = type { %"class.std::basic_string.4" } @@ -419,8 +419,8 @@ declare void @_ZN4Foam11regIOobjectD2Ev() #0 declare void @_ZN4Foam6reduceIiNS_5sumOpIiEEEEvRKNS_4ListINS_8UPstream11commsStructEEERT_RKT0_ii() #0 -attributes #0 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { inlinehint "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="a2q" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { inlinehint "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="a2q" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0} diff --git a/llvm/test/CodeGen/PowerPC/fast-isel-icmp-split.ll b/llvm/test/CodeGen/PowerPC/fast-isel-icmp-split.ll index b08b050f2c2fd..fdd0fc2767803 100644 --- a/llvm/test/CodeGen/PowerPC/fast-isel-icmp-split.ll +++ b/llvm/test/CodeGen/PowerPC/fast-isel-icmp-split.ll @@ -1,6 +1,6 @@ ; RUN: llc -verify-machineinstrs -O0 -relocation-model=pic < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" 
-target triple = "powerpc64le-unknown-linux" +target triple = "powerpc64-bgq-linux" %"class.std::__1::__tree_node.130.151" = type { %"class.std::__1::__tree_node_base.base.128.149", %"class.boost::serialization::extended_type_info.129.150"* } %"class.std::__1::__tree_node_base.base.128.149" = type <{ %"class.std::__1::__tree_end_node.127.148", %"class.std::__1::__tree_node_base.126.147"*, %"class.std::__1::__tree_node_base.126.147"*, i8 }> diff --git a/llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll b/llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll index a336fc796ca52..eef6e0ccac02b 100644 --- a/llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll +++ b/llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll @@ -33,4 +33,4 @@ define float @f(float %xf) #0 { ret float %25 } -attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll b/llvm/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll index 3b555cf898f57..2feb4556dfab7 100644 --- a/llvm/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll +++ b/llvm/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll @@ -1,5 +1,6 @@ -; RUN: llc -verify-machineinstrs -mcpu=a2 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=a2 < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-bgq-linux" define linkonce_odr double @test1(ppc_fp128 %input) { entry: diff --git a/llvm/test/CodeGen/PowerPC/glob-comp-aa-crash.ll b/llvm/test/CodeGen/PowerPC/glob-comp-aa-crash.ll index 2aa5239f25eb8..54c3e11528b7b 100644 --- a/llvm/test/CodeGen/PowerPC/glob-comp-aa-crash.ll +++ b/llvm/test/CodeGen/PowerPC/glob-comp-aa-crash.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux -mcpu=a2 < %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux -mcpu=a2 < %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64le-unknown-linux" +target triple = "powerpc64-bgq-linux" %"class.std::__1::__assoc_sub_state" = type { %"class.std::__1::__shared_count", %"class.std::__exception_ptr::exception_ptr", %"class.std::__1::mutex", %"class.std::__1::condition_variable", i32 } %"class.std::__1::__shared_count" = type { i32 (...)**, i64 } diff --git a/llvm/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll b/llvm/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll index a2d0eb599f91d..74bfa75e5e313 100644 --- a/llvm/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll +++ b/llvm/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll @@ -33,5 +33,5 @@ declare i8* @_ZN11__sanitizer21internal_start_threadEPFvPvES0_(void (i8*)*, i8*) declare hidden void @_ZN11__sanitizer16BackgroundThreadEPv(i8* nocapture readnone) #5 -attributes #0 = { nounwind 
uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #7 = { nobuiltin nounwind } diff --git a/llvm/test/CodeGen/PowerPC/inlineasm-i64-reg.ll b/llvm/test/CodeGen/PowerPC/inlineasm-i64-reg.ll index 6f1bc76d816ae..e4dfd6c58f0e8 100644 --- a/llvm/test/CodeGen/PowerPC/inlineasm-i64-reg.ll +++ b/llvm/test/CodeGen/PowerPC/inlineasm-i64-reg.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux -mcpu=a2 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux -mcpu=a2 < %s | FileCheck %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64le-unknown-linux" +target triple = "powerpc64-bgq-linux" %struct.BG_CoordinateMapping_t = type { [4 x i8] } diff --git a/llvm/test/CodeGen/PowerPC/load-two-flts.ll b/llvm/test/CodeGen/PowerPC/load-two-flts.ll index 19e21faf47232..1cfcff5e01601 100644 --- a/llvm/test/CodeGen/PowerPC/load-two-flts.ll +++ b/llvm/test/CodeGen/PowerPC/load-two-flts.ll @@ -1,5 +1,6 @@ -; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s +; RUN: llc -verify-machineinstrs < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-bgq-linux" define void @_Z4testSt7complexIfE(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) { entry: diff --git a/llvm/test/CodeGen/PowerPC/loop-data-prefetch-inner.ll b/llvm/test/CodeGen/PowerPC/loop-data-prefetch-inner.ll index 2cbb70bb14cb5..f4664788930d4 100644 --- a/llvm/test/CodeGen/PowerPC/loop-data-prefetch-inner.ll +++ b/llvm/test/CodeGen/PowerPC/loop-data-prefetch-inner.ll @@ -1,6 +1,6 @@ -; RUN: llc -enable-ppc-prefetching=true -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -verify-machineinstrs < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64le-unknown-linux" +target triple = "powerpc64-bgq-linux" ; Function Attrs: nounwind define void @foo(double* %x, double* nocapture readonly %y) #0 { diff --git a/llvm/test/CodeGen/PowerPC/loop-data-prefetch.ll b/llvm/test/CodeGen/PowerPC/loop-data-prefetch.ll index defc52eec8e0d..f4821564c202b 100644 --- a/llvm/test/CodeGen/PowerPC/loop-data-prefetch.ll +++ b/llvm/test/CodeGen/PowerPC/loop-data-prefetch.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -enable-ppc-prefetching=true -mcpu=a2 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=a2 < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64le-unknown-linux" +target triple = "powerpc64-bgq-linux" ; Function Attrs: nounwind define void @foo(double* nocapture %a, double* nocapture readonly %b) #0 { diff --git 
a/llvm/test/CodeGen/PowerPC/loop-prep-all.ll b/llvm/test/CodeGen/PowerPC/loop-prep-all.ll index 7fdabcd4be210..a13192d3e6586 100644 --- a/llvm/test/CodeGen/PowerPC/loop-prep-all.ll +++ b/llvm/test/CodeGen/PowerPC/loop-prep-all.ll @@ -1,4 +1,5 @@ ; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BGQ target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -20,6 +21,7 @@ for.body: ; preds = %for.body, %entry ; CHECK-LABEL: @foo +; CHECK-BGQ-DAG: dcbt 4, 5 ; CHECK-DAG: lfdu [[REG1:[0-9]+]], 8({{[0-9]+}}) ; CHECK-DAG: fadd [[REG2:[0-9]+]], [[REG1]], 0 ; CHECK-DAG: stfdu [[REG2]], 8({{[0-9]+}}) @@ -32,13 +34,15 @@ for.cond.cleanup6: ; preds = %for.body7 for.body7: ; preds = %for.body, %for.body7 %i3.017 = phi i32 [ %inc9, %for.body7 ], [ 0, %for.body ] - tail call void bitcast (void (...)* @bar to void ()*)() #0 + tail call void bitcast (void (...)* @bar to void ()*)() #2 %inc9 = add nuw nsw i32 %i3.017, 1 %exitcond = icmp eq i32 %inc9, 1024 br i1 %exitcond, label %for.cond.cleanup6, label %for.body7 } -declare void @bar(...) +declare void @bar(...) #1 -attributes #0 = { nounwind } +attributes #0 = { nounwind "target-cpu"="a2q" } +attributes #1 = { "target-cpu"="a2q" } +attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/lxv-aligned-stack-slots.ll b/llvm/test/CodeGen/PowerPC/lxv-aligned-stack-slots.ll index aa618d2b732c7..93868007d0d36 100644 --- a/llvm/test/CodeGen/PowerPC/lxv-aligned-stack-slots.ll +++ b/llvm/test/CodeGen/PowerPC/lxv-aligned-stack-slots.ll @@ -41,6 +41,6 @@ define void @aligned_slot() #0 { ; Function Attrs: argmemonly nounwind declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1 -attributes #0 = { nounwind "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/machine-combiner.ll b/llvm/test/CodeGen/PowerPC/machine-combiner.ll index 16fc3ee3e5202..2e834b1fe788c 100644 --- a/llvm/test/CodeGen/PowerPC/machine-combiner.ll +++ b/llvm/test/CodeGen/PowerPC/machine-combiner.ll @@ -1,4 +1,5 @@ ; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr7 < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-PWR +; RUN: llc -verify-machineinstrs -O3 -mcpu=a2q < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-QPX ; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 < %s | FileCheck %s -check-prefix=FIXPOINT target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -92,6 +93,9 @@ define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, floa define <4 x float> @vector_reassociate_adds1(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds1: ; CHECK: # %bb.0: +; CHECK-QPX: qvfadds [[REG0:[0-9]+]], 1, 2 +; CHECK-QPX: qvfadds [[REG1:[0-9]+]], 3, 4 +; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] @@ -106,6 
+110,9 @@ define <4 x float> @vector_reassociate_adds1(<4 x float> %x0, <4 x float> %x1, < define <4 x float> @vector_reassociate_adds2(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds2: ; CHECK: # %bb.0: +; CHECK-QPX: qvfadds [[REG0:[0-9]+]], 1, 2 +; CHECK-QPX: qvfadds [[REG1:[0-9]+]], 3, 4 +; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] @@ -120,6 +127,9 @@ define <4 x float> @vector_reassociate_adds2(<4 x float> %x0, <4 x float> %x1, < define <4 x float> @vector_reassociate_adds3(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds3: ; CHECK: # %bb.0: +; CHECK-QPX: qvfadds [[REG0:[0-9]+]], 1, 2 +; CHECK-QPX: qvfadds [[REG1:[0-9]+]], 3, 4 +; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] @@ -134,6 +144,9 @@ define <4 x float> @vector_reassociate_adds3(<4 x float> %x0, <4 x float> %x1, < define <4 x float> @vector_reassociate_adds4(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds4: ; CHECK: # %bb.0: +; CHECK-QPX: qvfadds [[REG0:[0-9]+]], 1, 2 +; CHECK-QPX: qvfadds [[REG1:[0-9]+]], 3, 4 +; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] @@ -204,6 +217,9 @@ define i64 @reassociate_mulld(i64 %x0, i64 %x1, i64 %x2, i64 %x3) { define double @reassociate_mamaa_double(double %0, double %1, double %2, double %3, double %4, double %5) { ; CHECK-LABEL: reassociate_mamaa_double: ; CHECK: # %bb.0: +; CHECK-QPX-DAG: fmadd [[REG0:[0-9]+]], 4, 3, 2 +; CHECK-QPX-DAG: fmadd [[REG1:[0-9]+]], 6, 5, 1 +; CHECK-QPX: fadd 1, [[REG0]], [[REG1]] ; CHECK-PWR-DAG: xsmaddadp 1, 6, 5 ; CHECK-PWR-DAG: xsmaddadp 2, 4, 3 ; CHECK-PWR: xsadddp 1, 2, 1 @@ -234,6 +250,9 @@ define float @reassociate_mamaa_float(float %0, float %1, float %2, float %3, fl define <4 x float> @reassociate_mamaa_vec(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5) { ; CHECK-LABEL: reassociate_mamaa_vec: ; CHECK: # %bb.0: +; CHECK-QPX-DAG: qvfmadds [[REG0:[0-9]+]], 4, 3, 2 +; CHECK-QPX-DAG: qvfmadds [[REG1:[0-9]+]], 6, 5, 1 +; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR-DAG: xvmaddasp [[REG0:[0-9]+]], 39, 38 ; CHECK-PWR-DAG: xvmaddasp [[REG1:[0-9]+]], 37, 36 ; CHECK-PWR: xvaddsp 34, [[REG1]], [[REG0]] @@ -249,6 +268,11 @@ define <4 x float> @reassociate_mamaa_vec(<4 x float> %0, <4 x float> %1, <4 x f define double @reassociate_mamama_double(double %0, double %1, double %2, double %3, double %4, double %5, double %6, double %7, double %8) { ; CHECK-LABEL: reassociate_mamama_double: ; CHECK: # %bb.0: +; CHECK-QPX: fmadd [[REG0:[0-9]+]], 2, 1, 7 +; CHECK-QPX-DAG: fmul [[REG1:[0-9]+]], 4, 3 +; CHECK-QPX-DAG: fmadd [[REG2:[0-9]+]], 6, 5, [[REG0]] +; CHECK-QPX-DAG: fmadd [[REG3:[0-9]+]], 9, 8, [[REG1]] +; CHECK-QPX: fadd 1, [[REG2]], [[REG3]] ; CHECK-PWR: xsmaddadp 7, 2, 1 ; CHECK-PWR-DAG: xsmuldp [[REG0:[0-9]+]], 4, 3 ; CHECK-PWR-DAG: xsmaddadp 7, 6, 5 diff --git a/llvm/test/CodeGen/PowerPC/mc-instrlat.ll b/llvm/test/CodeGen/PowerPC/mc-instrlat.ll index f807f4fa20d25..e135986a2894c 100644 --- a/llvm/test/CodeGen/PowerPC/mc-instrlat.ll +++ 
b/llvm/test/CodeGen/PowerPC/mc-instrlat.ll @@ -19,7 +19,7 @@ entry: declare void @bar(double) #1 -attributes #0 = { nounwind "no-infs-fp-math"="true" "no-nans-fp-math"="true" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #0 = { nounwind "no-infs-fp-math"="true" "no-nans-fp-math"="true" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-qpx,-vsx" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #1 = { "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-qpx,-vsx" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/mcount-insertion.ll b/llvm/test/CodeGen/PowerPC/mcount-insertion.ll index 502347a3af198..cbb7947be2198 100644 --- a/llvm/test/CodeGen/PowerPC/mcount-insertion.ll +++ b/llvm/test/CodeGen/PowerPC/mcount-insertion.ll @@ -1,8 +1,9 @@ -; RUN: opt -ee-instrument < %s | opt -inline | llc -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s +; RUN: opt -ee-instrument < %s | opt -inline | llc | FileCheck %s ; The run-line mimics how Clang might run the instrumentation passes. target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-bgq-linux" define void @leaf_function() #0 { diff --git a/llvm/test/CodeGen/PowerPC/memcpy-vec.ll b/llvm/test/CodeGen/PowerPC/memcpy-vec.ll index c4e60f8c4b1f5..cd0abd6149bde 100644 --- a/llvm/test/CodeGen/PowerPC/memcpy-vec.ll +++ b/llvm/test/CodeGen/PowerPC/memcpy-vec.ll @@ -1,5 +1,6 @@ ; RUN: llc -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s -check-prefix=PWR7 ; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s -check-prefix=PWR8 +; RUN: llc -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s -check-prefix=A2Q target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -24,6 +25,12 @@ entry: ; PWR8: lxvw4x ; PWR8: stxvw4x ; PWR8: blr + +; A2Q-LABEL: @foo1 +; A2Q-NOT: bl memcpy +; A2Q: ld {{[0-9]+}}, {{[0-9]+}}(4) +; A2Q: std {{[0-9]+}}, {{[0-9]+}}(3) +; A2Q: blr } ; Function Attrs: nounwind @@ -45,6 +52,12 @@ entry: ; PWR8: lxvw4x ; PWR8: stxvw4x ; PWR8: blr + +; A2Q-LABEL: @foo2 +; A2Q-NOT: bl memcpy +; A2Q: ld {{[0-9]+}}, {{[0-9]+}}(4) +; A2Q: std {{[0-9]+}}, {{[0-9]+}}(3) +; A2Q: blr } ; Function Attrs: nounwind @@ -63,6 +76,11 @@ entry: ; PWR8-NOT: bl memset ; PWR8: stxvw4x ; PWR8: blr + +; A2Q-LABEL: @bar1 +; A2Q-NOT: bl memset +; A2Q: std {{[0-9]+}}, {{[0-9]+}}(3) +; A2Q: blr } ; Function Attrs: nounwind @@ -81,6 +99,11 @@ entry: ; PWR8-NOT: bl memset ; PWR8: stxvw4x ; PWR8: blr + +; A2Q-LABEL: @bar2 +; A2Q-NOT: bl memset +; A2Q: qvstfdx +; A2Q: blr } ; Function Attrs: nounwind diff --git a/llvm/test/CodeGen/PowerPC/memset-nc.ll b/llvm/test/CodeGen/PowerPC/memset-nc.ll new file mode 100644 index 0000000000000..663d0cb1d6785 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/memset-nc.ll @@ -0,0 +1,48 @@ +; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 < %s | FileCheck %s -check-prefix=CHECK-O0 +target datalayout = 
"E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-bgq-linux" + +; Function Attrs: nounwind +define void @test_qpx() unnamed_addr #0 align 2 { +entry: + %0 = load i32, i32* undef, align 4 + %1 = trunc i32 %0 to i8 + call void @llvm.memset.p0i8.i64(i8* align 32 null, i8 %1, i64 64, i1 false) + ret void + +; CHECK-LABEL: @test_qpx +; CHECK: qvstfdx +; CHECK: qvstfdx +; CHECK: blr + +; CHECK-O0-LABEL: @test_qpx +; CHECK-O0-NOT: qvstfdx +; CHECK-O0: blr +} + +; Function Attrs: nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) #1 + +; Function Attrs: nounwind +define void @test_vsx() unnamed_addr #2 align 2 { +entry: + %0 = load i32, i32* undef, align 4 + %1 = trunc i32 %0 to i8 + call void @llvm.memset.p0i8.i64(i8* null, i8 %1, i64 32, i1 false) + ret void + +; CHECK-LABEL: @test_vsx +; CHECK: stxvw4x +; CHECK: stxvw4x +; CHECK: blr + +; CHECK-O0-LABEL: @test_vsx +; CHECK-O0-NOT: stxvw4x +; CHECK-O0: blr +} + +attributes #0 = { nounwind "target-cpu"="a2q" } +attributes #1 = { nounwind } +attributes #2 = { nounwind "target-cpu"="pwr7" } + diff --git a/llvm/test/CodeGen/PowerPC/misched-inorder-latency.ll b/llvm/test/CodeGen/PowerPC/misched-inorder-latency.ll index 089c947713b9d..26663d81f3575 100644 --- a/llvm/test/CodeGen/PowerPC/misched-inorder-latency.ll +++ b/llvm/test/CodeGen/PowerPC/misched-inorder-latency.ll @@ -1,7 +1,8 @@ ; RUN: llc -verify-machineinstrs < %s -enable-misched -pre-RA-sched=source -scheditins=false \ -; RUN: -disable-ifcvt-triangle-false -disable-post-ra -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s +; RUN: -disable-ifcvt-triangle-false -disable-post-ra | FileCheck %s ; target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-bgq-linux" ; %val1 is a load live out of %entry. It should be hoisted ; above the add. 
diff --git a/llvm/test/CodeGen/PowerPC/misched.ll b/llvm/test/CodeGen/PowerPC/misched.ll index 9a75fe44b7176..1c868b3f171c9 100644 --- a/llvm/test/CodeGen/PowerPC/misched.ll +++ b/llvm/test/CodeGen/PowerPC/misched.ll @@ -1,6 +1,7 @@ ; RUN: llc < %s -enable-misched -verify-machineinstrs ; PR14302 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-bgq-linux" @b = external global [16000 x double], align 32 diff --git a/llvm/test/CodeGen/PowerPC/optnone-crbits-i1-ret.ll b/llvm/test/CodeGen/PowerPC/optnone-crbits-i1-ret.ll index ad5976318fe3a..f59df4291c48f 100644 --- a/llvm/test/CodeGen/PowerPC/optnone-crbits-i1-ret.ll +++ b/llvm/test/CodeGen/PowerPC/optnone-crbits-i1-ret.ll @@ -1,5 +1,6 @@ -; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s +; RUN: llc -verify-machineinstrs < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-bgq-linux" declare zeroext i1 @ri1() declare void @se1() diff --git a/llvm/test/CodeGen/PowerPC/pcrel-local-caller-toc.ll b/llvm/test/CodeGen/PowerPC/pcrel-local-caller-toc.ll index 2871e077df565..2e248506c7b7b 100644 --- a/llvm/test/CodeGen/PowerPC/pcrel-local-caller-toc.ll +++ b/llvm/test/CodeGen/PowerPC/pcrel-local-caller-toc.ll @@ -92,7 +92,7 @@ entry: ; Left the target features in this test because it is important that caller has ; -pcrelative-memops while callee has +pcrelative-memops -attributes #0 = { nounwind "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+pcrelative-memops,+power8-vector,+power9-vector,+vsx,-htm,-spe" } -attributes #1 = { "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+pcrelative-memops,+power8-vector,+power9-vector,+vsx,-htm,-spe" } -attributes #2 = { nounwind "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+power9-vector,+vsx,-htm,-pcrelative-memops,-spe" } +attributes #0 = { nounwind "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+pcrelative-memops,+power8-vector,+power9-vector,+vsx,-htm,-qpx,-spe" } +attributes #1 = { "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+pcrelative-memops,+power8-vector,+power9-vector,+vsx,-htm,-qpx,-spe" } +attributes #2 = { nounwind "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+power9-vector,+vsx,-htm,-pcrelative-memops,-qpx,-spe" } attributes #3 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/popcnt.ll b/llvm/test/CodeGen/PowerPC/popcnt.ll index 695863d87f16e..a06c59d4b945a 100644 --- a/llvm/test/CodeGen/PowerPC/popcnt.ll +++ b/llvm/test/CodeGen/PowerPC/popcnt.ll @@ -1,6 +1,8 @@ ; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mattr=+popcntd < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mattr=+slow-popcntd < %s | FileCheck %s --check-prefix=SLOWPC ; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mcpu=pwr7 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mcpu=a2q < %s | FileCheck %s --check-prefix=SLOWPC +; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mcpu=a2q -mattr=+popcntd < %s | FileCheck %s define i64 @_cntb64(i64 %x) nounwind readnone { %cnt = tail call i64 @llvm.ppc.popcntb(i64 %x) diff --git a/llvm/test/CodeGen/PowerPC/ppc-passname.ll b/llvm/test/CodeGen/PowerPC/ppc-passname.ll index 06f13278d84cd..98343bdb535c2 100644 --- a/llvm/test/CodeGen/PowerPC/ppc-passname.ll +++ b/llvm/test/CodeGen/PowerPC/ppc-passname.ll @@ -105,3 
+105,14 @@ ; STOP-AFTER-BRANCH-COALESCING-NOT: "ppc-branch-coalescing" pass is not registered. ; STOP-AFTER-BRANCH-COALESCING: Branch Coalescing + +; Test pass name: ppc-qpx-load-splat. +; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-qpx-load-splat -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-QPX-LOAD-SPLAT +; STOP-BEFORE-QPX-LOAD-SPLAT-NOT: -ppc-qpx-load-splat +; STOP-BEFORE-QPX-LOAD-SPLAT-NOT: "ppc-qpx-load-splat" pass is not registered. +; STOP-BEFORE-QPX-LOAD-SPLAT-NOT: PowerPC QPX Load Splat Simplification + +; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-after=ppc-qpx-load-splat -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-AFTER-QPX-LOAD-SPLAT +; STOP-AFTER-QPX-LOAD-SPLAT: -ppc-qpx-load-splat +; STOP-AFTER-QPX-LOAD-SPLAT-NOT: "ppc-qpx-load-splat" pass is not registered. +; STOP-AFTER-QPX-LOAD-SPLAT: PowerPC QPX Load Splat Simplification diff --git a/llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll b/llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll index 357f28e88b184..fc0e71f878cab 100644 --- a/llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll +++ b/llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-SCO -; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO -; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO +; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO-HASQPX +; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO-HASQPX ; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -code-model=small | FileCheck %s -check-prefix=SCM ; No combination of "powerpc64le-unknown-linux-gnu" + "CHECK-SCO", because @@ -117,6 +117,23 @@ define void @caller_local_sret_32(%S_32* %a) #1 { attributes #0 = { noinline nounwind } attributes #1 = { nounwind } +; vector <4 x i1> test + +define void @callee_v4i1(i8 %a, <4 x i1> %b, <4 x i1> %c) { ret void } +define void @caller_v4i1_reorder(i8 %a, <4 x i1> %b, <4 x i1> %c) { + tail call void @callee_v4i1(i8 %a, <4 x i1> %c, <4 x i1> %b) + ret void + +; <4 x i1> is 32 bytes aligned, if subtarget doesn't support qpx, then we can't +; place b, c to qpx register, so we can't do sco on caller_v4i1_reorder + +; CHECK-SCO-LABEL: caller_v4i1_reorder: +; CHECK-SCO: bl callee_v4i1 + +; CHECK-SCO-HASQPX-LABEL: caller_v4i1_reorder: +; CHECK-SCO-HASQPX: b callee_v4i1 +} + define void @f128_callee(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) { ret void } define void @f128_caller(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) { tail call void @f128_callee(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) diff --git a/llvm/test/CodeGen/PowerPC/pr24546.ll b/llvm/test/CodeGen/PowerPC/pr24546.ll index 028fd2d8f0064..28c03293680e5 100644 --- a/llvm/test/CodeGen/PowerPC/pr24546.ll +++ b/llvm/test/CodeGen/PowerPC/pr24546.ll @@ -47,8 +47,8 @@ declare double @pow(double, double) #0 
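; Hedged contrast (not part of the patch) to caller_v4i1_reorder in
; ppc64-sibcall.ll above: when no <4 x i1> argument needs a 32-byte-aligned
; QPX register slot, nothing blocks sibling-call optimization, so a
; same-signature tail call like the hypothetical pair below can lower to a
; direct branch ("b callee_plain") on either subtarget. The names are
; illustrative assumptions, not from the test.
define void @callee_plain(i8 %a) { ret void }
define void @caller_plain(i8 %a) {
  tail call void @callee_plain(i8 %a)
  ret void
}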
; Function Attrs: nounwind readnone declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2 -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #2 = { nounwind readnone } attributes #3 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/pr27350.ll b/llvm/test/CodeGen/PowerPC/pr27350.ll index 93dbd10fecdeb..982023a1fcdc8 100644 --- a/llvm/test/CodeGen/PowerPC/pr27350.ll +++ b/llvm/test/CodeGen/PowerPC/pr27350.ll @@ -18,7 +18,7 @@ entry: declare fastcc void @bar([2 x i64], [2 x i64]) unnamed_addr #1 align 2 attributes #0 = { argmemonly nounwind } -attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #2 = { nounwind } !llvm.ident = !{!0} diff --git a/llvm/test/CodeGen/PowerPC/pr28130.ll b/llvm/test/CodeGen/PowerPC/pr28130.ll index 4da415bd29269..cb703dfda8a59 100644 --- a/llvm/test/CodeGen/PowerPC/pr28130.ll +++ b/llvm/test/CodeGen/PowerPC/pr28130.ll @@ -67,4 +67,4 @@ bb: ret void } -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" 
"no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/PowerPC/preinc-ld-sel-crash.ll b/llvm/test/CodeGen/PowerPC/preinc-ld-sel-crash.ll index 35aec57ec2640..04dee1ee182bb 100644 --- a/llvm/test/CodeGen/PowerPC/preinc-ld-sel-crash.ll +++ b/llvm/test/CodeGen/PowerPC/preinc-ld-sel-crash.ll @@ -1,6 +1,6 @@ ; RUN: llc -verify-machineinstrs < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64le-unknown-linux" +target triple = "powerpc64-bgq-linux" %t1 = type { %t2*, %t3* } %t2 = type <{ %t3*, i32, [4 x i8] }> diff --git a/llvm/test/CodeGen/PowerPC/qpx-bv-sint.ll b/llvm/test/CodeGen/PowerPC/qpx-bv-sint.ll new file mode 100644 index 0000000000000..4e0aef4c3df71 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/qpx-bv-sint.ll @@ -0,0 +1,33 @@ +; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-bgq-linux" + +define void @s452(i32 %inp1) nounwind { +entry: + br label %for.body4 + +for.body4: ; preds = %for.body4, %entry + %conv.4 = sitofp i32 %inp1 to double + %conv.5 = sitofp i32 %inp1 to double + %mul.4.v.i0.1 = insertelement <2 x double> undef, double %conv.4, i32 0 + %v = insertelement <2 x double> %mul.4.v.i0.1, double %conv.5, i32 1 + %vv = fmul <2 x double> %v, %v + %add7.4 = fadd <2 x double> %vv, %vv + store <2 x double> %add7.4, <2 x double>* undef, align 16 + br i1 undef, label %for.end, label %for.body4 + +for.end: ; preds = %for.body4 + unreachable +; CHECK-LABEL: @s452 +; CHECK: lfiwax [[REG1:[0-9]+]], +; CHECK: fcfid [[REG2:[0-9]+]], [[REG1]] +; FIXME: We could 'promote' this to a vector earlier and remove this splat. +; CHECK: qvesplati {{[0-9]+}}, [[REG2]], 0 +; CHECK: qvfmul +; CHECK: qvfadd +; CHECK: qvesplati {{[0-9]+}}, +; FIXME: We can use qvstfcdx here instead of two stores. 
+; CHECK: stfd +; CHECK: stfd +} + diff --git a/llvm/test/CodeGen/PowerPC/qpx-bv.ll b/llvm/test/CodeGen/PowerPC/qpx-bv.ll new file mode 100644 index 0000000000000..93a739b864c1d --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/qpx-bv.ll @@ -0,0 +1,37 @@ +; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-bgq-linux" + +define <4 x double> @foo(double %f1, double %f2, double %f3, double %f4) { + %v1 = insertelement <4 x double> undef, double %f1, i32 0 + %v2 = insertelement <4 x double> %v1, double %f2, i32 1 + %v3 = insertelement <4 x double> %v2, double %f3, i32 2 + %v4 = insertelement <4 x double> %v3, double %f4, i32 3 + ret <4 x double> %v4 + +; CHECK-LABEL: @foo +; CHECK: qvgpci [[REG1:[0-9]+]], 275 +; CHECK-DAG: qvgpci [[REG2:[0-9]+]], 101 +; CHECK-DAG: qvfperm [[REG3:[0-9]+]], 3, 4, [[REG1]] +; CHECK-DAG: qvfperm [[REG4:[0-9]+]], 1, 2, [[REG1]] +; CHECK-DAG: qvfperm 1, [[REG4]], [[REG3]], [[REG2]] +; CHECK: blr +} + +define <4 x float> @goo(float %f1, float %f2, float %f3, float %f4) { + %v1 = insertelement <4 x float> undef, float %f1, i32 0 + %v2 = insertelement <4 x float> %v1, float %f2, i32 1 + %v3 = insertelement <4 x float> %v2, float %f3, i32 2 + %v4 = insertelement <4 x float> %v3, float %f4, i32 3 + ret <4 x float> %v4 + +; CHECK-LABEL: @goo +; CHECK: qvgpci [[REG1:[0-9]+]], 275 +; CHECK-DAG: qvgpci [[REG2:[0-9]+]], 101 +; CHECK-DAG: qvfperm [[REG3:[0-9]+]], 3, 4, [[REG1]] +; CHECK-DAG: qvfperm [[REG4:[0-9]+]], 1, 2, [[REG1]] +; CHECK-DAG: qvfperm 1, [[REG4]], [[REG3]], [[REG2]] +; CHECK: blr +} + diff --git a/llvm/test/CodeGen/PowerPC/qpx-func-clobber.ll b/llvm/test/CodeGen/PowerPC/qpx-func-clobber.ll new file mode 100644 index 0000000000000..ccbbd162a0cdb --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/qpx-func-clobber.ll @@ -0,0 +1,22 @@ +; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s +target triple = "powerpc64-bgq-linux" + +declare <4 x double> @foo(<4 x double> %p) + +define <4 x double> @bar(<4 x double> %p, <4 x double> %q) { +entry: + %v = call <4 x double> @foo(<4 x double> %p) + %w = call <4 x double> @foo(<4 x double> %q) + %x = fadd <4 x double> %v, %w + ret <4 x double> %x + +; CHECK-LABEL: @bar +; CHECK: qvstfdx 2, +; CHECK: bl foo +; CHECK: qvstfdx 1, +; CHECK: qvlfdx 1, +; CHECK: bl foo +; CHECK: qvlfdx [[REG:[0-9]+]], +; CHECK: qvfadd 1, [[REG]], 1 +} + diff --git a/llvm/test/CodeGen/PowerPC/qpx-load-splat.ll b/llvm/test/CodeGen/PowerPC/qpx-load-splat.ll new file mode 100644 index 0000000000000..50b864980d985 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/qpx-load-splat.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -ppc-vsr-nums-as-vr \ +; RUN: -ppc-asm-full-reg-names -verify-machineinstrs < %s | FileCheck %s + +; Function Attrs: norecurse nounwind readonly +define <4 x double> @foo(double* nocapture readonly %a) #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvdsx v2, 0, r3 +; CHECK-NEXT: vmr v3, v2 +; CHECK-NEXT: blr +entry: + %0 = load double, double* %a, align 8 + %vecinit.i = insertelement <4 x double> undef, double %0, i32 0 + %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer + ret <4 x double> %shuffle.i +} + +define <4 x double> @foox(double* nocapture readonly %a, i64 %idx) #0 { +; CHECK-LABEL: 
foox: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi r4, r4, 3 +; CHECK-NEXT: lxvdsx v2, r3, r4 +; CHECK-NEXT: vmr v3, v2 +; CHECK-NEXT: blr +entry: + %p = getelementptr double, double* %a, i64 %idx + %0 = load double, double* %p, align 8 + %vecinit.i = insertelement <4 x double> undef, double %0, i32 0 + %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer + ret <4 x double> %shuffle.i +} + +define <4 x double> @fooxu(double* nocapture readonly %a, i64 %idx, double** %pptr) #0 { +; CHECK-LABEL: fooxu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi r4, r4, 3 +; CHECK-NEXT: add r6, r3, r4 +; CHECK-NEXT: std r6, 0(r5) +; CHECK-NEXT: lxvdsx v2, r3, r4 +; CHECK-NEXT: vmr v3, v2 +; CHECK-NEXT: blr +entry: + %p = getelementptr double, double* %a, i64 %idx + %0 = load double, double* %p, align 8 + %vecinit.i = insertelement <4 x double> undef, double %0, i32 0 + %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer + store double* %p, double** %pptr, align 8 + ret <4 x double> %shuffle.i +} + +define <4 x float> @foof(float* nocapture readonly %a) #0 { +; CHECK-LABEL: foof: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lfiwzx f0, 0, r3 +; CHECK-NEXT: xxspltw v2, vs0, 1 +; CHECK-NEXT: blr +entry: + %0 = load float, float* %a, align 4 + %vecinit.i = insertelement <4 x float> undef, float %0, i32 0 + %shuffle.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer + ret <4 x float> %shuffle.i +} + +define <4 x float> @foofx(float* nocapture readonly %a, i64 %idx) #0 { +; CHECK-LABEL: foofx: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi r4, r4, 2 +; CHECK-NEXT: lfiwzx f0, r3, r4 +; CHECK-NEXT: xxspltw v2, vs0, 1 +; CHECK-NEXT: blr +entry: + %p = getelementptr float, float* %a, i64 %idx + %0 = load float, float* %p, align 4 + %vecinit.i = insertelement <4 x float> undef, float %0, i32 0 + %shuffle.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer + ret <4 x float> %shuffle.i +} + + diff --git a/llvm/test/CodeGen/PowerPC/qpx-load.ll b/llvm/test/CodeGen/PowerPC/qpx-load.ll new file mode 100644 index 0000000000000..514f0934b6cfc --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/qpx-load.ll @@ -0,0 +1,26 @@ +; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s +target triple = "powerpc64-bgq-linux" + +define <4 x double> @foo(<4 x double>* %p) { +entry: + %v = load <4 x double>, <4 x double>* %p, align 8 + ret <4 x double> %v +} + +; CHECK: @foo +; CHECK-DAG: li [[REG1:[0-9]+]], 31 +; CHECK-DAG: qvlfdx [[REG4:[0-9]+]], 0, 3 +; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], 3, [[REG1]] +; CHECK-DAG: qvlpcldx [[REG3:[0-9]+]], 0, 3 +; CHECK-DAG: qvfperm 1, [[REG4]], [[REG2]], [[REG3]] +; CHECK: blr + +define <4 x double> @bar(<4 x double>* %p) { +entry: + %v = load <4 x double>, <4 x double>* %p, align 32 + ret <4 x double> %v +} + +; CHECK: @bar +; CHECK: qvlfdx + diff --git a/llvm/test/CodeGen/PowerPC/qpx-qvfmadd.ll b/llvm/test/CodeGen/PowerPC/qpx-qvfmadd.ll new file mode 100644 index 0000000000000..eab4d6af7e9fc --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/qpx-qvfmadd.ll @@ -0,0 +1,79 @@ +; RUN: llc -verify-machineinstrs -stop-after=finalize-isel < %s -mcpu=a2q | FileCheck %s +target triple = "powerpc64-bgq-linux" + +define <2 x double> @test_qvfmadd(<2 x double> %0, <2 x double> %1, <2 x double> %2) { +; CHECK: test_qvfmadd +; CHECK: QVFMADD %2, %1, %0, implicit $rm +; + %4 = fmul reassoc nsz <2 x double> %2, %1 + %5 = fadd reassoc nsz <2 x double> %4, 
%0 + ret <2 x double> %5 +} + +define <4 x float> @test_qvfmadds(<4 x float> %0, <4 x float> %1, <4 x float> %2) { +; CHECK: test_qvfmadds +; CHECK: QVFMADDSs %2, %1, %0, implicit $rm +; + %4 = fmul reassoc nsz <4 x float> %2, %1 + %5 = fadd reassoc nsz <4 x float> %4, %0 + ret <4 x float> %5 +} + +define <2 x double> @test_qvfnmadd(<2 x double> %0, <2 x double> %1, <2 x double> %2) { +; CHECK: test_qvfnmadd +; CHECK: QVFNMADD %2, %1, %0, implicit $rm +; + %4 = fmul reassoc nsz <2 x double> %2, %1 + %5 = fadd reassoc nsz <2 x double> %4, %0 + %6 = fneg reassoc nsz <2 x double> %5 + ret <2 x double> %6 +} + +define <4 x float> @test_qvfnmadds(<4 x float> %0, <4 x float> %1, <4 x float> %2) { +; CHECK: test_qvfnmadds +; CHECK: QVFNMADDSs %2, %1, %0, implicit $rm +; + %4 = fmul reassoc nsz <4 x float> %2, %1 + %5 = fadd reassoc nsz <4 x float> %4, %0 + %6 = fneg reassoc nsz <4 x float> %5 + ret <4 x float> %6 +} + +define <2 x double> @test_qvfmsub(<2 x double> %0, <2 x double> %1, <2 x double> %2) { +; CHECK: test_qvfmsub +; CHECK: QVFMSUB %2, %1, %0, implicit $rm +; + %4 = fmul reassoc nsz <2 x double> %2, %1 + %5 = fsub reassoc nsz <2 x double> %4, %0 + ret <2 x double> %5 +} + +define <4 x float> @test_qvfmsubs(<4 x float> %0, <4 x float> %1, <4 x float> %2) { +; CHECK: test_qvfmsubs +; CHECK: QVFMSUBSs %2, %1, %0, implicit $rm +; + %4 = fmul reassoc nsz <4 x float> %2, %1 + %5 = fsub reassoc nsz <4 x float> %4, %0 + ret <4 x float> %5 +} + +define <2 x double> @test_qvfnmsub(<2 x double> %0, <2 x double> %1, <2 x double> %2) { +; CHECK: test_qvfnmsub +; CHECK: QVFNMSUB %2, %1, %0, implicit $rm +; + %4 = fmul reassoc nsz <2 x double> %2, %1 + %5 = fsub reassoc nsz <2 x double> %4, %0 + %6 = fneg reassoc nsz <2 x double> %5 + ret <2 x double> %6 +} + +define <4 x float> @test_qvfnmsubs(<4 x float> %0, <4 x float> %1, <4 x float> %2) { +; CHECK: test_qvfnmsubs +; CHECK: QVFNMSUBSs %2, %1, %0, implicit $rm +; + %4 = fmul reassoc nsz <4 x float> %2, %1 + %5 = fsub reassoc nsz <4 x float> %4, %0 + %6 = fneg reassoc nsz <4 x float> %5 + ret <4 x float> %6 +} + diff --git a/llvm/test/CodeGen/PowerPC/qpx-recipest.ll b/llvm/test/CodeGen/PowerPC/qpx-recipest.ll new file mode 100644 index 0000000000000..498ab62819ced --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/qpx-recipest.ll @@ -0,0 +1,473 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2q | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define <4 x double> @foo_fmf(<4 x double> %a, <4 x double> %b) nounwind { +; CHECK-LABEL: foo_fmf: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha +; CHECK-NEXT: qvfrsqrte 3, 2 +; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l +; CHECK-NEXT: qvlfdx 0, 0, 3 +; CHECK-NEXT: qvfmul 4, 3, 3 +; CHECK-NEXT: qvfmsub 2, 2, 0, 2 +; CHECK-NEXT: qvfnmsub 4, 2, 4, 0 +; CHECK-NEXT: qvfmul 3, 3, 4 +; CHECK-NEXT: qvfmul 4, 3, 3 +; CHECK-NEXT: qvfnmsub 0, 2, 4, 0 +; CHECK-NEXT: qvfmul 0, 3, 0 +; CHECK-NEXT: qvfmul 1, 1, 0 +; CHECK-NEXT: blr +entry: + %x = call ninf afn reassoc <4 x double> @llvm.sqrt.v4f64(<4 x double> %b) + %r = fdiv arcp reassoc <4 x double> %a, %x + ret <4 x double> %r +} + +define <4 x double> @foo_safe(<4 x double> %a, <4 x 
double> %b) nounwind { +; CHECK-LABEL: foo_safe: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: qvesplati 5, 2, 3 +; CHECK-NEXT: qvesplati 3, 2, 1 +; CHECK-NEXT: qvesplati 4, 2, 2 +; CHECK-NEXT: fsqrt 2, 2 +; CHECK-NEXT: fsqrt 5, 5 +; CHECK-NEXT: fsqrt 4, 4 +; CHECK-NEXT: fsqrt 3, 3 +; CHECK-NEXT: qvesplati 6, 1, 3 +; CHECK-NEXT: qvgpci 0, 275 +; CHECK-NEXT: fdiv 2, 1, 2 +; CHECK-NEXT: fdiv 5, 6, 5 +; CHECK-NEXT: qvesplati 6, 1, 2 +; CHECK-NEXT: qvesplati 1, 1, 1 +; CHECK-NEXT: fdiv 4, 6, 4 +; CHECK-NEXT: fdiv 1, 1, 3 +; CHECK-NEXT: qvfperm 3, 4, 5, 0 +; CHECK-NEXT: qvfperm 0, 2, 1, 0 +; CHECK-NEXT: qvgpci 1, 101 +; CHECK-NEXT: qvfperm 1, 0, 3, 1 +; CHECK-NEXT: blr +entry: + %x = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %b) + %r = fdiv <4 x double> %a, %x + ret <4 x double> %r +} + +define <4 x double> @foof_fmf(<4 x double> %a, <4 x float> %b) nounwind { +; CHECK-LABEL: foof_fmf: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addis 3, 2, .LCPI2_0@toc@ha +; CHECK-NEXT: qvfrsqrtes 3, 2 +; CHECK-NEXT: addi 3, 3, .LCPI2_0@toc@l +; CHECK-NEXT: qvlfsx 0, 0, 3 +; CHECK-NEXT: qvfmuls 4, 3, 3 +; CHECK-NEXT: qvfmsubs 2, 2, 0, 2 +; CHECK-NEXT: qvfnmsubs 0, 2, 4, 0 +; CHECK-NEXT: qvfmuls 0, 3, 0 +; CHECK-NEXT: qvfmul 1, 1, 0 +; CHECK-NEXT: blr +entry: + %x = call afn ninf reassoc <4 x float> @llvm.sqrt.v4f32(<4 x float> %b) + %y = fpext <4 x float> %x to <4 x double> + %r = fdiv arcp reassoc nsz <4 x double> %a, %y + ret <4 x double> %r +} + +define <4 x double> @foof_safe(<4 x double> %a, <4 x float> %b) nounwind { +; CHECK-LABEL: foof_safe: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: qvesplati 0, 2, 3 +; CHECK-NEXT: qvesplati 3, 2, 2 +; CHECK-NEXT: fsqrts 4, 2 +; CHECK-NEXT: qvesplati 2, 2, 1 +; CHECK-NEXT: fsqrts 0, 0 +; CHECK-NEXT: fsqrts 3, 3 +; CHECK-NEXT: fsqrts 2, 2 +; CHECK-NEXT: qvgpci 5, 275 +; CHECK-NEXT: qvgpci 6, 101 +; CHECK-NEXT: qvfperm 0, 3, 0, 5 +; CHECK-NEXT: qvesplati 3, 1, 2 +; CHECK-NEXT: qvfperm 2, 4, 2, 5 +; CHECK-NEXT: qvfperm 0, 2, 0, 6 +; CHECK-NEXT: qvesplati 2, 1, 3 +; CHECK-NEXT: qvesplati 4, 0, 3 +; CHECK-NEXT: fdiv 2, 2, 4 +; CHECK-NEXT: qvesplati 4, 0, 2 +; CHECK-NEXT: fdiv 3, 3, 4 +; CHECK-NEXT: qvesplati 4, 1, 1 +; CHECK-NEXT: fdiv 1, 1, 0 +; CHECK-NEXT: qvesplati 0, 0, 1 +; CHECK-NEXT: fdiv 0, 4, 0 +; CHECK-NEXT: qvfperm 2, 3, 2, 5 +; CHECK-NEXT: qvfperm 0, 1, 0, 5 +; CHECK-NEXT: qvfperm 1, 0, 2, 6 +; CHECK-NEXT: blr +entry: + %x = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %b) + %y = fpext <4 x float> %x to <4 x double> + %r = fdiv <4 x double> %a, %y + ret <4 x double> %r +} + +define <4 x float> @food_fmf(<4 x float> %a, <4 x double> %b) nounwind { +; CHECK-LABEL: food_fmf: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addis 3, 2, .LCPI4_0@toc@ha +; CHECK-NEXT: qvfrsqrte 3, 2 +; CHECK-NEXT: addi 3, 3, .LCPI4_0@toc@l +; CHECK-NEXT: qvlfdx 0, 0, 3 +; CHECK-NEXT: qvfmul 4, 3, 3 +; CHECK-NEXT: qvfmsub 2, 2, 0, 2 +; CHECK-NEXT: qvfnmsub 4, 2, 4, 0 +; CHECK-NEXT: qvfmul 3, 3, 4 +; CHECK-NEXT: qvfmul 4, 3, 3 +; CHECK-NEXT: qvfnmsub 0, 2, 4, 0 +; CHECK-NEXT: qvfmul 0, 3, 0 +; CHECK-NEXT: qvfrsp 0, 0 +; CHECK-NEXT: qvfmuls 1, 1, 0 +; CHECK-NEXT: blr +entry: + %x = call afn ninf reassoc <4 x double> @llvm.sqrt.v4f64(<4 x double> %b) + %y = fptrunc <4 x double> %x to <4 x float> + %r = fdiv arcp reassoc <4 x float> %a, %y + ret <4 x float> %r +} + +define <4 x float> @food_safe(<4 x float> %a, <4 x double> %b) nounwind { +; CHECK-LABEL: food_safe: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: qvesplati 0, 2, 3 +; CHECK-NEXT: qvesplati 3, 2, 2 +; CHECK-NEXT: fsqrt 4, 2 +; 
CHECK-NEXT: qvesplati 2, 2, 1 +; CHECK-NEXT: fsqrt 0, 0 +; CHECK-NEXT: fsqrt 3, 3 +; CHECK-NEXT: fsqrt 2, 2 +; CHECK-NEXT: qvgpci 5, 275 +; CHECK-NEXT: qvgpci 6, 101 +; CHECK-NEXT: qvfperm 0, 3, 0, 5 +; CHECK-NEXT: qvesplati 3, 1, 2 +; CHECK-NEXT: qvfperm 2, 4, 2, 5 +; CHECK-NEXT: qvfperm 0, 2, 0, 6 +; CHECK-NEXT: qvesplati 2, 1, 3 +; CHECK-NEXT: qvfrsp 0, 0 +; CHECK-NEXT: qvesplati 4, 0, 3 +; CHECK-NEXT: fdivs 2, 2, 4 +; CHECK-NEXT: qvesplati 4, 0, 2 +; CHECK-NEXT: fdivs 3, 3, 4 +; CHECK-NEXT: qvesplati 4, 1, 1 +; CHECK-NEXT: fdivs 1, 1, 0 +; CHECK-NEXT: qvesplati 0, 0, 1 +; CHECK-NEXT: fdivs 0, 4, 0 +; CHECK-NEXT: qvfperm 2, 3, 2, 5 +; CHECK-NEXT: qvfperm 0, 1, 0, 5 +; CHECK-NEXT: qvfperm 1, 0, 2, 6 +; CHECK-NEXT: blr +entry: + %x = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %b) + %y = fptrunc <4 x double> %x to <4 x float> + %r = fdiv <4 x float> %a, %y + ret <4 x float> %r +} + +define <4 x float> @goo_fmf(<4 x float> %a, <4 x float> %b) nounwind { +; CHECK-LABEL: goo_fmf: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addis 3, 2, .LCPI6_0@toc@ha +; CHECK-NEXT: qvfrsqrtes 3, 2 +; CHECK-NEXT: addi 3, 3, .LCPI6_0@toc@l +; CHECK-NEXT: qvlfsx 0, 0, 3 +; CHECK-NEXT: qvfmuls 4, 3, 3 +; CHECK-NEXT: qvfmsubs 2, 2, 0, 2 +; CHECK-NEXT: qvfnmsubs 0, 2, 4, 0 +; CHECK-NEXT: qvfmuls 0, 3, 0 +; CHECK-NEXT: qvfmuls 1, 1, 0 +; CHECK-NEXT: blr +entry: + %x = call afn ninf reassoc <4 x float> @llvm.sqrt.v4f32(<4 x float> %b) + %r = fdiv arcp reassoc nsz <4 x float> %a, %x + ret <4 x float> %r +} + +define <4 x float> @goo_safe(<4 x float> %a, <4 x float> %b) nounwind { +; CHECK-LABEL: goo_safe: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: qvesplati 5, 2, 3 +; CHECK-NEXT: qvesplati 3, 2, 1 +; CHECK-NEXT: qvesplati 4, 2, 2 +; CHECK-NEXT: fsqrts 2, 2 +; CHECK-NEXT: fsqrts 5, 5 +; CHECK-NEXT: fsqrts 4, 4 +; CHECK-NEXT: fsqrts 3, 3 +; CHECK-NEXT: qvesplati 6, 1, 3 +; CHECK-NEXT: qvgpci 0, 275 +; CHECK-NEXT: fdivs 2, 1, 2 +; CHECK-NEXT: fdivs 5, 6, 5 +; CHECK-NEXT: qvesplati 6, 1, 2 +; CHECK-NEXT: qvesplati 1, 1, 1 +; CHECK-NEXT: fdivs 4, 6, 4 +; CHECK-NEXT: fdivs 1, 1, 3 +; CHECK-NEXT: qvfperm 3, 4, 5, 0 +; CHECK-NEXT: qvfperm 0, 2, 1, 0 +; CHECK-NEXT: qvgpci 1, 101 +; CHECK-NEXT: qvfperm 1, 0, 3, 1 +; CHECK-NEXT: blr +entry: + %x = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %b) + %r = fdiv <4 x float> %a, %x + ret <4 x float> %r +} + +define <4 x double> @foo2_fmf(<4 x double> %a, <4 x double> %b) nounwind { +; CHECK-LABEL: foo2_fmf: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addis 3, 2, .LCPI8_0@toc@ha +; CHECK-NEXT: qvfre 3, 2 +; CHECK-NEXT: addi 3, 3, .LCPI8_0@toc@l +; CHECK-NEXT: qvlfdx 0, 0, 3 +; CHECK-NEXT: qvfmadd 0, 2, 3, 0 +; CHECK-NEXT: qvfnmsub 0, 3, 0, 3 +; CHECK-NEXT: qvfmul 3, 1, 0 +; CHECK-NEXT: qvfnmsub 1, 2, 3, 1 +; CHECK-NEXT: qvfmadd 1, 0, 1, 3 +; CHECK-NEXT: blr +entry: + %r = fdiv arcp reassoc nsz ninf <4 x double> %a, %b + ret <4 x double> %r +} + +define <4 x double> @foo2_safe(<4 x double> %a, <4 x double> %b) nounwind { +; CHECK-LABEL: foo2_safe: +; CHECK: # %bb.0: +; CHECK-NEXT: qvesplati 3, 2, 3 +; CHECK-NEXT: qvesplati 4, 1, 3 +; CHECK-NEXT: qvesplati 5, 2, 2 +; CHECK-NEXT: qvgpci 0, 275 +; CHECK-NEXT: fdiv 3, 4, 3 +; CHECK-NEXT: qvesplati 4, 1, 2 +; CHECK-NEXT: fdiv 4, 4, 5 +; CHECK-NEXT: fdiv 5, 1, 2 +; CHECK-NEXT: qvesplati 2, 2, 1 +; CHECK-NEXT: qvesplati 1, 1, 1 +; CHECK-NEXT: fdiv 1, 1, 2 +; CHECK-NEXT: qvfperm 2, 4, 3, 0 +; CHECK-NEXT: qvfperm 0, 5, 1, 0 +; CHECK-NEXT: qvgpci 1, 101 +; CHECK-NEXT: qvfperm 1, 0, 2, 1 +; CHECK-NEXT: blr + %r = fdiv <4 x double> %a, %b + ret 
<4 x double> %r +} + +define <4 x float> @goo2_fmf(<4 x float> %a, <4 x float> %b) nounwind { +; CHECK-LABEL: goo2_fmf: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: qvfres 0, 2 +; CHECK-NEXT: qvfmuls 3, 1, 0 +; CHECK-NEXT: qvfnmsubs 1, 2, 3, 1 +; CHECK-NEXT: qvfmadds 1, 0, 1, 3 +; CHECK-NEXT: blr +entry: + %r = fdiv arcp reassoc ninf <4 x float> %a, %b + ret <4 x float> %r +} + +define <4 x float> @goo2_safe(<4 x float> %a, <4 x float> %b) nounwind { +; CHECK-LABEL: goo2_safe: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: qvesplati 3, 2, 3 +; CHECK-NEXT: qvesplati 4, 1, 3 +; CHECK-NEXT: qvesplati 5, 2, 2 +; CHECK-NEXT: qvgpci 0, 275 +; CHECK-NEXT: fdivs 3, 4, 3 +; CHECK-NEXT: qvesplati 4, 1, 2 +; CHECK-NEXT: fdivs 4, 4, 5 +; CHECK-NEXT: fdivs 5, 1, 2 +; CHECK-NEXT: qvesplati 2, 2, 1 +; CHECK-NEXT: qvesplati 1, 1, 1 +; CHECK-NEXT: fdivs 1, 1, 2 +; CHECK-NEXT: qvfperm 2, 4, 3, 0 +; CHECK-NEXT: qvfperm 0, 5, 1, 0 +; CHECK-NEXT: qvgpci 1, 101 +; CHECK-NEXT: qvfperm 1, 0, 2, 1 +; CHECK-NEXT: blr +entry: + %r = fdiv <4 x float> %a, %b + ret <4 x float> %r +} + +define <4 x double> @foo3_fmf_denorm_on(<4 x double> %a) #0 { +; CHECK-LABEL: foo3_fmf_denorm_on: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addis 3, 2, .LCPI12_0@toc@ha +; CHECK-NEXT: qvfrsqrte 0, 1 +; CHECK-NEXT: addi 3, 3, .LCPI12_0@toc@l +; CHECK-NEXT: qvlfdx 2, 0, 3 +; CHECK-NEXT: addis 3, 2, .LCPI12_1@toc@ha +; CHECK-NEXT: addi 3, 3, .LCPI12_1@toc@l +; CHECK-NEXT: qvfmul 3, 0, 0 +; CHECK-NEXT: qvfmsub 4, 1, 2, 1 +; CHECK-NEXT: qvfnmsub 3, 4, 3, 2 +; CHECK-NEXT: qvfmul 0, 0, 3 +; CHECK-NEXT: qvfmul 3, 0, 0 +; CHECK-NEXT: qvfnmsub 2, 4, 3, 2 +; CHECK-NEXT: qvfmul 0, 0, 2 +; CHECK-NEXT: qvlfdx 2, 0, 3 +; CHECK-NEXT: addis 3, 2, .LCPI12_2@toc@ha +; CHECK-NEXT: addi 3, 3, .LCPI12_2@toc@l +; CHECK-NEXT: qvlfdx 3, 0, 3 +; CHECK-NEXT: qvfmul 0, 0, 1 +; CHECK-NEXT: qvfabs 1, 1 +; CHECK-NEXT: qvfcmplt 1, 1, 2 +; CHECK-NEXT: qvfsel 1, 1, 3, 0 +; CHECK-NEXT: blr +entry: + %r = call reassoc ninf afn <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) + ret <4 x double> %r +} + +define <4 x double> @foo3_fmf_denorm_off(<4 x double> %a) #1 { +; CHECK-LABEL: foo3_fmf_denorm_off: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addis 3, 2, .LCPI13_0@toc@ha +; CHECK-NEXT: qvfrsqrte 0, 1 +; CHECK-NEXT: addi 3, 3, .LCPI13_0@toc@l +; CHECK-NEXT: qvlfdx 2, 0, 3 +; CHECK-NEXT: addis 3, 2, .LCPI13_1@toc@ha +; CHECK-NEXT: addi 3, 3, .LCPI13_1@toc@l +; CHECK-NEXT: qvfmul 3, 0, 0 +; CHECK-NEXT: qvfmsub 4, 1, 2, 1 +; CHECK-NEXT: qvfnmsub 3, 4, 3, 2 +; CHECK-NEXT: qvfmul 0, 0, 3 +; CHECK-NEXT: qvfmul 3, 0, 0 +; CHECK-NEXT: qvfnmsub 2, 4, 3, 2 +; CHECK-NEXT: qvfmul 0, 0, 2 +; CHECK-NEXT: qvlfdx 2, 0, 3 +; CHECK-NEXT: qvfmul 0, 0, 1 +; CHECK-NEXT: qvfcmpeq 1, 1, 2 +; CHECK-NEXT: qvfsel 1, 1, 2, 0 +; CHECK-NEXT: blr +entry: + %r = call afn reassoc ninf <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) + ret <4 x double> %r +} + +define <4 x double> @foo3_safe_denorm_on(<4 x double> %a) #0 { +; CHECK-LABEL: foo3_safe_denorm_on: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: qvesplati 2, 1, 3 +; CHECK-NEXT: qvesplati 3, 1, 2 +; CHECK-NEXT: fsqrt 4, 1 +; CHECK-NEXT: qvesplati 1, 1, 1 +; CHECK-NEXT: fsqrt 2, 2 +; CHECK-NEXT: fsqrt 3, 3 +; CHECK-NEXT: fsqrt 1, 1 +; CHECK-NEXT: qvgpci 0, 275 +; CHECK-NEXT: qvfperm 2, 3, 2, 0 +; CHECK-NEXT: qvfperm 0, 4, 1, 0 +; CHECK-NEXT: qvgpci 1, 101 +; CHECK-NEXT: qvfperm 1, 0, 2, 1 +; CHECK-NEXT: blr +entry: + %r = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) + ret <4 x double> %r +} + +define <4 x double> @foo3_safe_denorm_off(<4 x double> %a) #1 
{ +; CHECK-LABEL: foo3_safe_denorm_off: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: qvesplati 2, 1, 3 +; CHECK-NEXT: qvesplati 3, 1, 2 +; CHECK-NEXT: fsqrt 4, 1 +; CHECK-NEXT: qvesplati 1, 1, 1 +; CHECK-NEXT: fsqrt 2, 2 +; CHECK-NEXT: fsqrt 3, 3 +; CHECK-NEXT: fsqrt 1, 1 +; CHECK-NEXT: qvgpci 0, 275 +; CHECK-NEXT: qvfperm 2, 3, 2, 0 +; CHECK-NEXT: qvfperm 0, 4, 1, 0 +; CHECK-NEXT: qvgpci 1, 101 +; CHECK-NEXT: qvfperm 1, 0, 2, 1 +; CHECK-NEXT: blr +entry: + %r = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) + ret <4 x double> %r +} + +define <4 x float> @goo3_fmf_denorm_on(<4 x float> %a) #0 { +; CHECK-LABEL: goo3_fmf_denorm_on: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addis 3, 2, .LCPI16_1@toc@ha +; CHECK-NEXT: qvfrsqrtes 2, 1 +; CHECK-NEXT: addi 3, 3, .LCPI16_1@toc@l +; CHECK-NEXT: qvlfsx 0, 0, 3 +; CHECK-NEXT: addis 3, 2, .LCPI16_0@toc@ha +; CHECK-NEXT: addi 3, 3, .LCPI16_0@toc@l +; CHECK-NEXT: qvfmuls 4, 2, 2 +; CHECK-NEXT: qvfmsubs 3, 1, 0, 1 +; CHECK-NEXT: qvfnmsubs 0, 3, 4, 0 +; CHECK-NEXT: qvlfsx 3, 0, 3 +; CHECK-NEXT: addis 3, 2, .LCPI16_2@toc@ha +; CHECK-NEXT: addi 3, 3, .LCPI16_2@toc@l +; CHECK-NEXT: qvlfsx 4, 0, 3 +; CHECK-NEXT: qvfmuls 0, 2, 0 +; CHECK-NEXT: qvfabs 2, 1 +; CHECK-NEXT: qvfmuls 0, 0, 1 +; CHECK-NEXT: qvfcmplt 1, 2, 3 +; CHECK-NEXT: qvfsel 1, 1, 4, 0 +; CHECK-NEXT: blr +entry: + %r = call reassoc afn ninf nsz <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) + ret <4 x float> %r +} + +define <4 x float> @goo3_fmf_denorm_off(<4 x float> %a) #1 { +; CHECK-LABEL: goo3_fmf_denorm_off: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addis 3, 2, .LCPI17_1@toc@ha +; CHECK-NEXT: qvfrsqrtes 2, 1 +; CHECK-NEXT: addi 3, 3, .LCPI17_1@toc@l +; CHECK-NEXT: qvlfsx 0, 0, 3 +; CHECK-NEXT: addis 3, 2, .LCPI17_0@toc@ha +; CHECK-NEXT: addi 3, 3, .LCPI17_0@toc@l +; CHECK-NEXT: qvfmuls 4, 2, 2 +; CHECK-NEXT: qvfmsubs 3, 1, 0, 1 +; CHECK-NEXT: qvfnmsubs 0, 3, 4, 0 +; CHECK-NEXT: qvlfsx 3, 0, 3 +; CHECK-NEXT: qvfmuls 0, 2, 0 +; CHECK-NEXT: qvfmuls 0, 0, 1 +; CHECK-NEXT: qvfcmpeq 1, 1, 3 +; CHECK-NEXT: qvfsel 1, 1, 3, 0 +; CHECK-NEXT: blr +entry: + %r = call reassoc ninf afn nsz <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) + ret <4 x float> %r +} + +define <4 x float> @goo3_safe(<4 x float> %a) nounwind { +; CHECK-LABEL: goo3_safe: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: qvesplati 2, 1, 3 +; CHECK-NEXT: qvesplati 3, 1, 2 +; CHECK-NEXT: fsqrts 4, 1 +; CHECK-NEXT: qvesplati 1, 1, 1 +; CHECK-NEXT: fsqrts 2, 2 +; CHECK-NEXT: fsqrts 3, 3 +; CHECK-NEXT: fsqrts 1, 1 +; CHECK-NEXT: qvgpci 0, 275 +; CHECK-NEXT: qvfperm 2, 3, 2, 0 +; CHECK-NEXT: qvfperm 0, 4, 1, 0 +; CHECK-NEXT: qvgpci 1, 101 +; CHECK-NEXT: qvfperm 1, 0, 2, 1 +; CHECK-NEXT: blr +entry: + %r = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) + ret <4 x float> %r +} + +attributes #0 = { nounwind "denormal-fp-math"="ieee,ieee" } +attributes #1 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } diff --git a/llvm/test/CodeGen/PowerPC/qpx-rounding-ops.ll b/llvm/test/CodeGen/PowerPC/qpx-rounding-ops.ll new file mode 100644 index 0000000000000..ee3357156a6c0 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/qpx-rounding-ops.ll @@ -0,0 +1,109 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2q | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2q -enable-unsafe-fp-math | FileCheck -check-prefix=CHECK-FM %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target 
triple = "powerpc64-unknown-linux-gnu" + +define <4 x float> @test1(<4 x float> %x) nounwind { + %call = tail call <4 x float> @llvm.floor.v4f32(<4 x float> %x) nounwind readnone + ret <4 x float> %call + +; CHECK: test1: +; CHECK: qvfrim 1, 1 + +; CHECK-FM: test1: +; CHECK-FM: qvfrim 1, 1 +} + +declare <4 x float> @llvm.floor.v4f32(<4 x float>) nounwind readnone + +define <4 x double> @test2(<4 x double> %x) nounwind { + %call = tail call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone + ret <4 x double> %call + +; CHECK: test2: +; CHECK: qvfrim 1, 1 + +; CHECK-FM: test2: +; CHECK-FM: qvfrim 1, 1 +} + +declare <4 x double> @llvm.floor.v4f64(<4 x double>) nounwind readnone + +define <4 x float> @test3(<4 x float> %x) nounwind { + %call = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %x) nounwind readnone + ret <4 x float> %call + +; CHECK: test3: +; CHECK-NOT: qvfrin + +; CHECK-FM: test3: +; CHECK-FM-NOT: qvfrin +} + +declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) nounwind readnone + +define <4 x double> @test4(<4 x double> %x) nounwind { + %call = tail call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %x) nounwind readnone + ret <4 x double> %call + +; CHECK: test4: +; CHECK-NOT: qvfrin + +; CHECK-FM: test4: +; CHECK-FM-NOT: qvfrin +} + +declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) nounwind readnone + +define <4 x float> @test5(<4 x float> %x) nounwind { + %call = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone + ret <4 x float> %call + +; CHECK: test5: +; CHECK: qvfrip 1, 1 + +; CHECK-FM: test5: +; CHECK-FM: qvfrip 1, 1 +} + +declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone + +define <4 x double> @test6(<4 x double> %x) nounwind { + %call = tail call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone + ret <4 x double> %call + +; CHECK: test6: +; CHECK: qvfrip 1, 1 + +; CHECK-FM: test6: +; CHECK-FM: qvfrip 1, 1 +} + +declare <4 x double> @llvm.ceil.v4f64(<4 x double>) nounwind readnone + +define <4 x float> @test9(<4 x float> %x) nounwind { + %call = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone + ret <4 x float> %call + +; CHECK: test9: +; CHECK: qvfriz 1, 1 + +; CHECK-FM: test9: +; CHECK-FM: qvfriz 1, 1 +} + +declare <4 x float> @llvm.trunc.v4f32(<4 x float>) nounwind readnone + +define <4 x double> @test10(<4 x double> %x) nounwind { + %call = tail call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone + ret <4 x double> %call + +; CHECK: test10: +; CHECK: qvfriz 1, 1 + +; CHECK-FM: test10: +; CHECK-FM: qvfriz 1, 1 +} + +declare <4 x double> @llvm.trunc.v4f64(<4 x double>) nounwind readnone + diff --git a/llvm/test/CodeGen/PowerPC/qpx-s-load.ll b/llvm/test/CodeGen/PowerPC/qpx-s-load.ll new file mode 100644 index 0000000000000..57d7e3b0ded3c --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/qpx-s-load.ll @@ -0,0 +1,26 @@ +; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s +target triple = "powerpc64-bgq-linux" + +define <4 x float> @foo(<4 x float>* %p) { +entry: + %v = load <4 x float>, <4 x float>* %p, align 4 + ret <4 x float> %v +} + +; CHECK: @foo +; CHECK-DAG: li [[REG1:[0-9]+]], 15 +; CHECK-DAG: qvlfsx [[REG4:[0-9]+]], 0, 3 +; CHECK-DAG: qvlfsx [[REG2:[0-9]+]], 3, [[REG1]] +; CHECK-DAG: qvlpclsx [[REG3:[0-9]+]], 0, 3 +; CHECK-DAG: qvfperm 1, [[REG4]], [[REG2]], [[REG3]] +; CHECK: blr + +define <4 x float> @bar(<4 x float>* %p) { +entry: + %v = load <4 x float>, <4 x float>* %p, align 16 + ret <4 x float> %v +} + +; 
CHECK: @bar +; CHECK: qvlfsx + diff --git a/llvm/test/CodeGen/PowerPC/qpx-s-sel.ll b/llvm/test/CodeGen/PowerPC/qpx-s-sel.ll new file mode 100644 index 0000000000000..5d42b9a529953 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/qpx-s-sel.ll @@ -0,0 +1,143 @@ +; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s +target triple = "powerpc64-bgq-linux" + +@R = global <4 x i1> , align 16 + +define <4 x float> @test1(<4 x float> %a, <4 x float> %b, <4 x i1> %c) nounwind readnone { +entry: + %r = select <4 x i1> %c, <4 x float> %a, <4 x float> %b + ret <4 x float> %r + +; CHECK-LABEL: @test1 +; CHECK: qvfsel 1, 3, 1, 2 +; CHECK: blr +} + +define <4 x float> @test2(<4 x float> %a, <4 x float> %b, i1 %c1, i1 %c2, i1 %c3, i1 %c4) nounwind readnone { +entry: + %v = insertelement <4 x i1> undef, i1 %c1, i32 0 + %v2 = insertelement <4 x i1> %v, i1 %c2, i32 1 + %v3 = insertelement <4 x i1> %v2, i1 %c3, i32 2 + %v4 = insertelement <4 x i1> %v3, i1 %c4, i32 3 + %r = select <4 x i1> %v4, <4 x float> %a, <4 x float> %b + ret <4 x float> %r + +; CHECK-LABEL: @test2 +; CHECK: stw +; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]], +; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], +; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]] +; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]] +; CHECK: qvfsel 1, [[REG4]], 1, 2 +; CHECK: blr +} + +define <4 x i1> @test3(<4 x i1> %a) nounwind readnone { +entry: + %v = and <4 x i1> %a, + ret <4 x i1> %v + +; CHECK-LABEL: @test3 +; CHECK: qvlfsx [[REG:[0-9]+]], +; qvflogical 1, 1, [[REG]], 1 +; blr +} + +define <4 x i1> @test4(<4 x i1> %a, <4 x i1>* %t) nounwind { +entry: + %q = load <4 x i1>, <4 x i1>* %t, align 16 + %v = and <4 x i1> %a, %q + ret <4 x i1> %v + +; CHECK-LABEL: @test4 +; CHECK-DAG: lbz +; CHECK-DAG: qvlfdx [[REG1:[0-9]+]], +; CHECK-DAG: stw +; CHECK-DAG: qvlfiwzx [[REG2:[0-9]+]], +; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG2]] +; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG1]] +; CHECK: qvfand 1, 1, [[REG4]] +; CHECK: blr +} + +define void @test5(<4 x i1> %a) nounwind { +entry: + store <4 x i1> %a, <4 x i1>* @R + ret void + +; CHECK-LABEL: @test5 +; CHECK: qvlfdx [[REG1:[0-9]+]], +; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] +; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] +; CHECK: qvstfiwx [[REG3]], +; CHECK: lwz +; CHECK: stb +; CHECK: blr +} + +define i1 @test6(<4 x i1> %a) nounwind { +entry: + %r = extractelement <4 x i1> %a, i32 2 + ret i1 %r + +; CHECK-LABEL: @test6 +; CHECK: qvlfdx [[REG1:[0-9]+]], +; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] +; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] +; CHECK: qvstfiwx [[REG3]], +; CHECK: lwz +; CHECK: blr +} + +define i1 @test7(<4 x i1> %a) nounwind { +entry: + %r = extractelement <4 x i1> %a, i32 2 + %s = extractelement <4 x i1> %a, i32 3 + %q = and i1 %r, %s + ret i1 %q + +; CHECK-LABEL: @test7 +; CHECK: qvlfdx [[REG1:[0-9]+]], +; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] +; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] +; CHECK: qvstfiwx [[REG3]], +; CHECK-DAG: lwz [[REG4:[0-9]+]], +; FIXME: We're storing the vector twice, and that's silly. 
+; CHECK-DAG: qvstfiwx [[REG3]], +; CHECK: lwz [[REG5:[0-9]+]], +; CHECK: and 3, +; CHECK: blr +} + +define i1 @test8(<3 x i1> %a) nounwind { +entry: + %r = extractelement <3 x i1> %a, i32 2 + ret i1 %r + +; CHECK-LABEL: @test8 +; CHECK: qvlfdx [[REG1:[0-9]+]], +; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] +; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] +; CHECK: qvstfiwx [[REG3]], +; CHECK: lwz +; CHECK: blr +} + +define <3 x float> @test9(<3 x float> %a, <3 x float> %b, i1 %c1, i1 %c2, i1 %c3) nounwind readnone { +entry: + %v = insertelement <3 x i1> undef, i1 %c1, i32 0 + %v2 = insertelement <3 x i1> %v, i1 %c2, i32 1 + %v3 = insertelement <3 x i1> %v2, i1 %c3, i32 2 + %r = select <3 x i1> %v3, <3 x float> %a, <3 x float> %b + ret <3 x float> %r + +; CHECK-LABEL: @test9 +; CHECK: stw +; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]], +; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], +; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]] +; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]] +; CHECK: qvfsel 1, [[REG4]], 1, 2 +; CHECK: blr +} + diff --git a/llvm/test/CodeGen/PowerPC/qpx-s-store.ll b/llvm/test/CodeGen/PowerPC/qpx-s-store.ll new file mode 100644 index 0000000000000..81cff7b6457f1 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/qpx-s-store.ll @@ -0,0 +1,25 @@ +; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s +target triple = "powerpc64-bgq-linux" + +define void @foo(<4 x float> %v, <4 x float>* %p) { +entry: + store <4 x float> %v, <4 x float>* %p, align 4 + ret void +} + +; CHECK: @foo +; CHECK: stfs +; CHECK: stfs +; CHECK: stfs +; CHECK: stfs +; CHECK: blr + +define void @bar(<4 x float> %v, <4 x float>* %p) { +entry: + store <4 x float> %v, <4 x float>* %p, align 16 + ret void +} + +; CHECK: @bar +; CHECK: qvstfsx + diff --git a/llvm/test/CodeGen/PowerPC/qpx-sel.ll b/llvm/test/CodeGen/PowerPC/qpx-sel.ll new file mode 100644 index 0000000000000..abc92c9e98b13 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/qpx-sel.ll @@ -0,0 +1,151 @@ +; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s +target triple = "powerpc64-bgq-linux" + +@R = global <4 x i1> , align 16 + +define <4 x double> @test1(<4 x double> %a, <4 x double> %b, <4 x i1> %c) nounwind readnone { +entry: + %r = select <4 x i1> %c, <4 x double> %a, <4 x double> %b + ret <4 x double> %r + +; CHECK-LABEL: @test1 +; CHECK: qvfsel 1, 3, 1, 2 +; CHECK: blr +} + +define <4 x double> @test2(<4 x double> %a, <4 x double> %b, i1 %c1, i1 %c2, i1 %c3, i1 %c4) nounwind readnone { +entry: + %v = insertelement <4 x i1> undef, i1 %c1, i32 0 + %v2 = insertelement <4 x i1> %v, i1 %c2, i32 1 + %v3 = insertelement <4 x i1> %v2, i1 %c3, i32 2 + %v4 = insertelement <4 x i1> %v3, i1 %c4, i32 3 + %r = select <4 x i1> %v4, <4 x double> %a, <4 x double> %b + ret <4 x double> %r + +; CHECK-LABEL: @test2 + +; FIXME: This load/store sequence is unnecessary. 
+; CHECK-DAG: lbz +; CHECK-DAG: stw + +; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]], +; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], +; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]] +; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]] +; CHECK: qvfsel 1, [[REG4]], 1, 2 +; CHECK: blr +} + +define <4 x i1> @test3(<4 x i1> %a) nounwind readnone { +entry: + %v = and <4 x i1> %a, + ret <4 x i1> %v + +; CHECK-LABEL: @test3 +; CHECK: qvlfsx [[REG:[0-9]+]], +; qvflogical 1, 1, [[REG]], 1 +; blr +} + +define <4 x i1> @test4(<4 x i1> %a, <4 x i1>* %t) nounwind { +entry: + %q = load <4 x i1>, <4 x i1>* %t, align 16 + %v = and <4 x i1> %a, %q + ret <4 x i1> %v + +; CHECK-LABEL: @test4 +; CHECK-DAG: lbz +; CHECK-DAG: qvlfdx [[REG1:[0-9]+]], +; CHECK-DAG: stw +; CHECK-DAG: qvlfiwzx [[REG2:[0-9]+]], +; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG2]] +; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG1]] +; CHECK: qvfand 1, 1, [[REG4]] +; CHECK: blr +} + +define void @test5(<4 x i1> %a) nounwind { +entry: + store <4 x i1> %a, <4 x i1>* @R + ret void + +; CHECK-LABEL: @test5 +; CHECK: qvlfdx [[REG1:[0-9]+]], +; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] +; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] +; CHECK: qvstfiwx [[REG3]], +; CHECK: lwz +; CHECK: stb +; CHECK: blr +} + +define i1 @test6(<4 x i1> %a) nounwind { +entry: + %r = extractelement <4 x i1> %a, i32 2 + ret i1 %r + +; CHECK-LABEL: @test6 +; CHECK: qvlfdx [[REG1:[0-9]+]], +; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] +; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] +; CHECK: qvstfiwx [[REG3]], +; CHECK: lwz +; CHECK: blr +} + +define i1 @test7(<4 x i1> %a) nounwind { +entry: + %r = extractelement <4 x i1> %a, i32 2 + %s = extractelement <4 x i1> %a, i32 3 + %q = and i1 %r, %s + ret i1 %q + +; CHECK-LABEL: @test7 +; CHECK: qvlfdx [[REG1:[0-9]+]], +; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] +; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] +; CHECK: qvstfiwx [[REG3]], +; CHECK-DAG: lwz [[REG4:[0-9]+]], +; FIXME: We're storing the vector twice, and that's silly. +; CHECK-DAG: qvstfiwx [[REG3]], +; CHECK-DAG: lwz [[REG5:[0-9]+]], +; CHECK: and 3, +; CHECK: blr +} + +define i1 @test8(<3 x i1> %a) nounwind { +entry: + %r = extractelement <3 x i1> %a, i32 2 + ret i1 %r + +; CHECK-LABEL: @test8 +; CHECK: qvlfdx [[REG1:[0-9]+]], +; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] +; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] +; CHECK: qvstfiwx [[REG3]], +; CHECK: lwz +; CHECK: blr +} + +define <3 x double> @test9(<3 x double> %a, <3 x double> %b, i1 %c1, i1 %c2, i1 %c3) nounwind readnone { +entry: + %v = insertelement <3 x i1> undef, i1 %c1, i32 0 + %v2 = insertelement <3 x i1> %v, i1 %c2, i32 1 + %v3 = insertelement <3 x i1> %v2, i1 %c3, i32 2 + %r = select <3 x i1> %v3, <3 x double> %a, <3 x double> %b + ret <3 x double> %r + +; CHECK-LABEL: @test9 + +; FIXME: This load/store sequence is unnecessary. 
+; CHECK-DAG: lbz +; CHECK-DAG: stw + +; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]], +; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], +; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]] +; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]] +; CHECK: qvfsel 1, [[REG4]], 1, 2 +; CHECK: blr +} + diff --git a/llvm/test/CodeGen/PowerPC/qpx-split-vsetcc.ll b/llvm/test/CodeGen/PowerPC/qpx-split-vsetcc.ll new file mode 100644 index 0000000000000..df3e0befaef8a --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/qpx-split-vsetcc.ll @@ -0,0 +1,31 @@ +; RUN: llc -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-bgq-linux" + +; Function Attrs: nounwind +define void @gsl_sf_legendre_Pl_deriv_array(<4 x i32> %inp1, <4 x double> %inp2) #0 { +entry: + br label %vector.body198 + +vector.body198: ; preds = %vector.body198, %for.body46.lr.ph + %0 = icmp ne <4 x i32> %inp1, zeroinitializer + %1 = select <4 x i1> %0, <4 x double> , <4 x double> + %2 = fmul <4 x double> %inp2, %1 + %3 = fmul <4 x double> %inp2, %2 + %4 = fmul <4 x double> %3, %inp2 + store <4 x double> %4, <4 x double>* undef, align 8 + br label %return + +; CHECK-LABEL: @gsl_sf_legendre_Pl_deriv_array +; CHECK: qvlfiwzx +; CHECK: qvfcfidu +; CHECK: qvfcmpeq +; CHECK: qvfsel +; CHECK: qvfmul + +return: ; preds = %if.else.i + ret void +} + +attributes #0 = { nounwind } + diff --git a/llvm/test/CodeGen/PowerPC/qpx-store.ll b/llvm/test/CodeGen/PowerPC/qpx-store.ll new file mode 100644 index 0000000000000..2b96576ce4493 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/qpx-store.ll @@ -0,0 +1,25 @@ +; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s +target triple = "powerpc64-bgq-linux" + +define void @foo(<4 x double> %v, <4 x double>* %p) { +entry: + store <4 x double> %v, <4 x double>* %p, align 8 + ret void +} + +; CHECK: @foo +; CHECK: stfd +; CHECK: stfd +; CHECK: stfd +; CHECK: stfd +; CHECK: blr + +define void @bar(<4 x double> %v, <4 x double>* %p) { +entry: + store <4 x double> %v, <4 x double>* %p, align 32 + ret void +} + +; CHECK: @bar +; CHECK: qvstfdx + diff --git a/llvm/test/CodeGen/PowerPC/qpx-unal-cons-lds.ll b/llvm/test/CodeGen/PowerPC/qpx-unal-cons-lds.ll new file mode 100644 index 0000000000000..e7ab92db6efc9 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/qpx-unal-cons-lds.ll @@ -0,0 +1,217 @@ +; RUN: llc -verify-machineinstrs < %s | FileCheck %s +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +; Function Attrs: nounwind +define void @foo(double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 { +entry: + br label %vector.body + +; CHECK-LABEL: @foo +; Make sure that the offset constants we use are all even (only the last should be odd). 
+; CHECK-DAG: li {{[0-9]+}}, 1056 +; CHECK-DAG: li {{[0-9]+}}, 1088 +; CHECK-DAG: li {{[0-9]+}}, 1152 +; CHECK-DAG: li {{[0-9]+}}, 1216 +; CHECK-DAG: li {{[0-9]+}}, 1280 +; CHECK-DAG: li {{[0-9]+}}, 1344 +; CHECK-DAG: li {{[0-9]+}}, 1408 +; CHECK-DAG: li {{[0-9]+}}, 1472 +; CHECK-DAG: li {{[0-9]+}}, 1536 +; CHECK-DAG: li {{[0-9]+}}, 1600 +; CHECK-DAG: li {{[0-9]+}}, 1568 +; CHECK-DAG: li {{[0-9]+}}, 1664 +; CHECK-DAG: li {{[0-9]+}}, 1632 +; CHECK-DAG: li {{[0-9]+}}, 1728 +; CHECK-DAG: li {{[0-9]+}}, 1696 +; CHECK-DAG: li {{[0-9]+}}, 1792 +; CHECK-DAG: li {{[0-9]+}}, 1760 +; CHECK-DAG: li {{[0-9]+}}, 1856 +; CHECK-DAG: li {{[0-9]+}}, 1824 +; CHECK-DAG: li {{[0-9]+}}, 1920 +; CHECK-DAG: li {{[0-9]+}}, 1888 +; CHECK-DAG: li {{[0-9]+}}, 1984 +; CHECK-DAG: li {{[0-9]+}}, 1952 +; CHECK-DAG: li {{[0-9]+}}, 2016 +; CHECK-DAG: li {{[0-9]+}}, 1024 +; CHECK-DAG: li {{[0-9]+}}, 1120 +; CHECK-DAG: li {{[0-9]+}}, 1184 +; CHECK-DAG: li {{[0-9]+}}, 1248 +; CHECK-DAG: li {{[0-9]+}}, 1312 +; CHECK-DAG: li {{[0-9]+}}, 1376 +; CHECK-DAG: li {{[0-9]+}}, 1440 +; CHECK-DAG: li {{[0-9]+}}, 1504 +; CHECK-DAG: li {{[0-9]+}}, 2047 +; CHECK: blr + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next.15, %vector.body ] + %0 = shl i64 %index, 1 + %1 = getelementptr inbounds double, double* %b, i64 %0 + %2 = bitcast double* %1 to <8 x double>* + %wide.vec = load <8 x double>, <8 x double>* %2, align 8 + %strided.vec = shufflevector <8 x double> %wide.vec, <8 x double> undef, <4 x i32> + %3 = fadd <4 x double> %strided.vec, + %4 = getelementptr inbounds double, double* %a, i64 %index + %5 = bitcast double* %4 to <4 x double>* + store <4 x double> %3, <4 x double>* %5, align 8 + %index.next = or i64 %index, 4 + %6 = shl i64 %index.next, 1 + %7 = getelementptr inbounds double, double* %b, i64 %6 + %8 = bitcast double* %7 to <8 x double>* + %wide.vec.1 = load <8 x double>, <8 x double>* %8, align 8 + %strided.vec.1 = shufflevector <8 x double> %wide.vec.1, <8 x double> undef, <4 x i32> + %9 = fadd <4 x double> %strided.vec.1, + %10 = getelementptr inbounds double, double* %a, i64 %index.next + %11 = bitcast double* %10 to <4 x double>* + store <4 x double> %9, <4 x double>* %11, align 8 + %index.next.1 = or i64 %index, 8 + %12 = shl i64 %index.next.1, 1 + %13 = getelementptr inbounds double, double* %b, i64 %12 + %14 = bitcast double* %13 to <8 x double>* + %wide.vec.2 = load <8 x double>, <8 x double>* %14, align 8 + %strided.vec.2 = shufflevector <8 x double> %wide.vec.2, <8 x double> undef, <4 x i32> + %15 = fadd <4 x double> %strided.vec.2, + %16 = getelementptr inbounds double, double* %a, i64 %index.next.1 + %17 = bitcast double* %16 to <4 x double>* + store <4 x double> %15, <4 x double>* %17, align 8 + %index.next.2 = or i64 %index, 12 + %18 = shl i64 %index.next.2, 1 + %19 = getelementptr inbounds double, double* %b, i64 %18 + %20 = bitcast double* %19 to <8 x double>* + %wide.vec.3 = load <8 x double>, <8 x double>* %20, align 8 + %strided.vec.3 = shufflevector <8 x double> %wide.vec.3, <8 x double> undef, <4 x i32> + %21 = fadd <4 x double> %strided.vec.3, + %22 = getelementptr inbounds double, double* %a, i64 %index.next.2 + %23 = bitcast double* %22 to <4 x double>* + store <4 x double> %21, <4 x double>* %23, align 8 + %index.next.3 = or i64 %index, 16 + %24 = shl i64 %index.next.3, 1 + %25 = getelementptr inbounds double, double* %b, i64 %24 + %26 = bitcast double* %25 to <8 x double>* + %wide.vec.4 = load <8 x double>, <8 x double>* %26, align 8 + %strided.vec.4 = 
shufflevector <8 x double> %wide.vec.4, <8 x double> undef, <4 x i32> + %27 = fadd <4 x double> %strided.vec.4, + %28 = getelementptr inbounds double, double* %a, i64 %index.next.3 + %29 = bitcast double* %28 to <4 x double>* + store <4 x double> %27, <4 x double>* %29, align 8 + %index.next.4 = or i64 %index, 20 + %30 = shl i64 %index.next.4, 1 + %31 = getelementptr inbounds double, double* %b, i64 %30 + %32 = bitcast double* %31 to <8 x double>* + %wide.vec.5 = load <8 x double>, <8 x double>* %32, align 8 + %strided.vec.5 = shufflevector <8 x double> %wide.vec.5, <8 x double> undef, <4 x i32> + %33 = fadd <4 x double> %strided.vec.5, + %34 = getelementptr inbounds double, double* %a, i64 %index.next.4 + %35 = bitcast double* %34 to <4 x double>* + store <4 x double> %33, <4 x double>* %35, align 8 + %index.next.5 = or i64 %index, 24 + %36 = shl i64 %index.next.5, 1 + %37 = getelementptr inbounds double, double* %b, i64 %36 + %38 = bitcast double* %37 to <8 x double>* + %wide.vec.6 = load <8 x double>, <8 x double>* %38, align 8 + %strided.vec.6 = shufflevector <8 x double> %wide.vec.6, <8 x double> undef, <4 x i32> + %39 = fadd <4 x double> %strided.vec.6, + %40 = getelementptr inbounds double, double* %a, i64 %index.next.5 + %41 = bitcast double* %40 to <4 x double>* + store <4 x double> %39, <4 x double>* %41, align 8 + %index.next.6 = or i64 %index, 28 + %42 = shl i64 %index.next.6, 1 + %43 = getelementptr inbounds double, double* %b, i64 %42 + %44 = bitcast double* %43 to <8 x double>* + %wide.vec.7 = load <8 x double>, <8 x double>* %44, align 8 + %strided.vec.7 = shufflevector <8 x double> %wide.vec.7, <8 x double> undef, <4 x i32> + %45 = fadd <4 x double> %strided.vec.7, + %46 = getelementptr inbounds double, double* %a, i64 %index.next.6 + %47 = bitcast double* %46 to <4 x double>* + store <4 x double> %45, <4 x double>* %47, align 8 + %index.next.7 = or i64 %index, 32 + %48 = shl i64 %index.next.7, 1 + %49 = getelementptr inbounds double, double* %b, i64 %48 + %50 = bitcast double* %49 to <8 x double>* + %wide.vec.8 = load <8 x double>, <8 x double>* %50, align 8 + %strided.vec.8 = shufflevector <8 x double> %wide.vec.8, <8 x double> undef, <4 x i32> + %51 = fadd <4 x double> %strided.vec.8, + %52 = getelementptr inbounds double, double* %a, i64 %index.next.7 + %53 = bitcast double* %52 to <4 x double>* + store <4 x double> %51, <4 x double>* %53, align 8 + %index.next.8 = or i64 %index, 36 + %54 = shl i64 %index.next.8, 1 + %55 = getelementptr inbounds double, double* %b, i64 %54 + %56 = bitcast double* %55 to <8 x double>* + %wide.vec.9 = load <8 x double>, <8 x double>* %56, align 8 + %strided.vec.9 = shufflevector <8 x double> %wide.vec.9, <8 x double> undef, <4 x i32> + %57 = fadd <4 x double> %strided.vec.9, + %58 = getelementptr inbounds double, double* %a, i64 %index.next.8 + %59 = bitcast double* %58 to <4 x double>* + store <4 x double> %57, <4 x double>* %59, align 8 + %index.next.9 = or i64 %index, 40 + %60 = shl i64 %index.next.9, 1 + %61 = getelementptr inbounds double, double* %b, i64 %60 + %62 = bitcast double* %61 to <8 x double>* + %wide.vec.10 = load <8 x double>, <8 x double>* %62, align 8 + %strided.vec.10 = shufflevector <8 x double> %wide.vec.10, <8 x double> undef, <4 x i32> + %63 = fadd <4 x double> %strided.vec.10, + %64 = getelementptr inbounds double, double* %a, i64 %index.next.9 + %65 = bitcast double* %64 to <4 x double>* + store <4 x double> %63, <4 x double>* %65, align 8 + %index.next.10 = or i64 %index, 44 + %66 = shl i64 %index.next.10, 1 + 
%67 = getelementptr inbounds double, double* %b, i64 %66 + %68 = bitcast double* %67 to <8 x double>* + %wide.vec.11 = load <8 x double>, <8 x double>* %68, align 8 + %strided.vec.11 = shufflevector <8 x double> %wide.vec.11, <8 x double> undef, <4 x i32> + %69 = fadd <4 x double> %strided.vec.11, + %70 = getelementptr inbounds double, double* %a, i64 %index.next.10 + %71 = bitcast double* %70 to <4 x double>* + store <4 x double> %69, <4 x double>* %71, align 8 + %index.next.11 = or i64 %index, 48 + %72 = shl i64 %index.next.11, 1 + %73 = getelementptr inbounds double, double* %b, i64 %72 + %74 = bitcast double* %73 to <8 x double>* + %wide.vec.12 = load <8 x double>, <8 x double>* %74, align 8 + %strided.vec.12 = shufflevector <8 x double> %wide.vec.12, <8 x double> undef, <4 x i32> + %75 = fadd <4 x double> %strided.vec.12, + %76 = getelementptr inbounds double, double* %a, i64 %index.next.11 + %77 = bitcast double* %76 to <4 x double>* + store <4 x double> %75, <4 x double>* %77, align 8 + %index.next.12 = or i64 %index, 52 + %78 = shl i64 %index.next.12, 1 + %79 = getelementptr inbounds double, double* %b, i64 %78 + %80 = bitcast double* %79 to <8 x double>* + %wide.vec.13 = load <8 x double>, <8 x double>* %80, align 8 + %strided.vec.13 = shufflevector <8 x double> %wide.vec.13, <8 x double> undef, <4 x i32> + %81 = fadd <4 x double> %strided.vec.13, + %82 = getelementptr inbounds double, double* %a, i64 %index.next.12 + %83 = bitcast double* %82 to <4 x double>* + store <4 x double> %81, <4 x double>* %83, align 8 + %index.next.13 = or i64 %index, 56 + %84 = shl i64 %index.next.13, 1 + %85 = getelementptr inbounds double, double* %b, i64 %84 + %86 = bitcast double* %85 to <8 x double>* + %wide.vec.14 = load <8 x double>, <8 x double>* %86, align 8 + %strided.vec.14 = shufflevector <8 x double> %wide.vec.14, <8 x double> undef, <4 x i32> + %87 = fadd <4 x double> %strided.vec.14, + %88 = getelementptr inbounds double, double* %a, i64 %index.next.13 + %89 = bitcast double* %88 to <4 x double>* + store <4 x double> %87, <4 x double>* %89, align 8 + %index.next.14 = or i64 %index, 60 + %90 = shl i64 %index.next.14, 1 + %91 = getelementptr inbounds double, double* %b, i64 %90 + %92 = bitcast double* %91 to <8 x double>* + %wide.vec.15 = load <8 x double>, <8 x double>* %92, align 8 + %strided.vec.15 = shufflevector <8 x double> %wide.vec.15, <8 x double> undef, <4 x i32> + %93 = fadd <4 x double> %strided.vec.15, + %94 = getelementptr inbounds double, double* %a, i64 %index.next.14 + %95 = bitcast double* %94 to <4 x double>* + store <4 x double> %93, <4 x double>* %95, align 8 + %index.next.15 = add nsw i64 %index, 64 + %96 = icmp eq i64 %index.next.15, 1600 + br i1 %96, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +attributes #0 = { nounwind "target-cpu"="a2q" } + diff --git a/llvm/test/CodeGen/PowerPC/qpx-unalperm.ll b/llvm/test/CodeGen/PowerPC/qpx-unalperm.ll new file mode 100644 index 0000000000000..fdee919fdfc32 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/qpx-unalperm.ll @@ -0,0 +1,64 @@ +; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-bgq-linux" + +define <4 x double> @foo(<4 x double>* %a) { +entry: + %r = load <4 x double>, <4 x double>* %a, align 32 + ret <4 x double> %r +; CHECK: qvlfdx +; CHECK: blr +} + +define <4 x double> @bar(<4 x double>* 
%a) { +entry: + %r = load <4 x double>, <4 x double>* %a, align 8 + %b = getelementptr <4 x double>, <4 x double>* %a, i32 16 + %s = load <4 x double>, <4 x double>* %b, align 32 + %t = fadd <4 x double> %r, %s + ret <4 x double> %t +; CHECK: qvlpcldx +; CHECK: qvlfdx +; CHECK: qvfperm +; CHECK: blr +} + +define <4 x double> @bar1(<4 x double>* %a) { +entry: + %r = load <4 x double>, <4 x double>* %a, align 8 + %b = getelementptr <4 x double>, <4 x double>* %a, i32 16 + %s = load <4 x double>, <4 x double>* %b, align 8 + %t = fadd <4 x double> %r, %s + ret <4 x double> %t +} + +define <4 x double> @bar2(<4 x double>* %a) { +entry: + %r = load <4 x double>, <4 x double>* %a, align 8 + %b = getelementptr <4 x double>, <4 x double>* %a, i32 1 + %s = load <4 x double>, <4 x double>* %b, align 32 + %t = fadd <4 x double> %r, %s + ret <4 x double> %t +} + +define <4 x double> @bar3(<4 x double>* %a) { +entry: + %r = load <4 x double>, <4 x double>* %a, align 8 + %b = getelementptr <4 x double>, <4 x double>* %a, i32 1 + %s = load <4 x double>, <4 x double>* %b, align 8 + %t = fadd <4 x double> %r, %s + ret <4 x double> %t +} + +define <4 x double> @bar4(<4 x double>* %a) { +entry: + %r = load <4 x double>, <4 x double>* %a, align 8 + %b = getelementptr <4 x double>, <4 x double>* %a, i32 1 + %s = load <4 x double>, <4 x double>* %b, align 8 + %c = getelementptr <4 x double>, <4 x double>* %b, i32 1 + %t = load <4 x double>, <4 x double>* %c, align 8 + %u = fadd <4 x double> %r, %s + %v = fadd <4 x double> %u, %t + ret <4 x double> %v +} + diff --git a/llvm/test/CodeGen/PowerPC/rlwimi-and.ll b/llvm/test/CodeGen/PowerPC/rlwimi-and.ll index d512f51a76e7a..e8fc409527588 100644 --- a/llvm/test/CodeGen/PowerPC/rlwimi-and.ll +++ b/llvm/test/CodeGen/PowerPC/rlwimi-and.ll @@ -1,4 +1,6 @@ -; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-crbits -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-crbits < %s | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-bgq-linux" define void @test() align 2 { entry: diff --git a/llvm/test/CodeGen/PowerPC/rlwinm_rldicl_to_andi.mir b/llvm/test/CodeGen/PowerPC/rlwinm_rldicl_to_andi.mir index dbe314b5251fe..e3aeb5605b42c 100644 --- a/llvm/test/CodeGen/PowerPC/rlwinm_rldicl_to_andi.mir +++ b/llvm/test/CodeGen/PowerPC/rlwinm_rldicl_to_andi.mir @@ -60,7 +60,7 @@ ret i64 %cond } - attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" 
"unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/CodeGen/PowerPC/s000-alias-misched.ll b/llvm/test/CodeGen/PowerPC/s000-alias-misched.ll index 20071ea1710c5..5c15145af2378 100644 --- a/llvm/test/CodeGen/PowerPC/s000-alias-misched.ll +++ b/llvm/test/CodeGen/PowerPC/s000-alias-misched.ll @@ -1,6 +1,7 @@ -; RUN: llc -verify-machineinstrs < %s -enable-misched -mcpu=a2 -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -enable-misched -enable-aa-sched-mi -mcpu=a2 -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -enable-misched -mcpu=a2 | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -enable-misched -enable-aa-sched-mi -mcpu=a2 | FileCheck %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-bgq-linux" @aa = external global [256 x [256 x double]], align 32 @bb = external global [256 x [256 x double]], align 32 diff --git a/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll b/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll index 9f458ebcf0a6e..80ac733156197 100644 --- a/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll +++ b/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll @@ -1225,5 +1225,576 @@ entry: ; CHECK: blr } +define <4 x double> @testqv4doubleslt(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp slt i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 + ret <4 x double> %cond + +; CHECK-LABEL: @testqv4doubleslt +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]] +; CHECK: .LBB[[BB1]]: +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB2]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x double> @testqv4doubleult(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp ult i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 + ret <4 x double> %cond + +; CHECK-LABEL: @testqv4doubleult +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]] +; CHECK: .LBB[[BB1]]: +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB2]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x double> @testqv4doublesle(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp sle i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 + ret <4 x double> %cond + +; CHECK-LABEL: @testqv4doublesle +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 12, 2, .LBB[[BB]] +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x double> @testqv4doubleule(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp ule i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 + ret <4 x double> %cond + 
+; CHECK-LABEL: @testqv4doubleule +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 4, 2, .LBB[[BB]] +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x double> @testqv4doubleeq(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp eq i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 + ret <4 x double> %cond + +; CHECK-LABEL: @testqv4doubleeq +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: creqv [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}} +; CHECK: bclr 12, [[REG1]], 0 +; CHECK: qvfmr 1, 6 +; CHECK: blr +} + +define <4 x double> @testqv4doublesge(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp sge i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 + ret <4 x double> %cond + +; CHECK-LABEL: @testqv4doublesge +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 4, 2, .LBB[[BB]] +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x double> @testqv4doubleuge(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp uge i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 + ret <4 x double> %cond + +; CHECK-LABEL: @testqv4doubleuge +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 12, 2, .LBB[[BB]] +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x double> @testqv4doublesgt(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp sgt i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 + ret <4 x double> %cond + +; CHECK-LABEL: @testqv4doublesgt +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]] +; CHECK: .LBB[[BB1]]: +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB2]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x double> @testqv4doubleugt(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp ugt i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 + ret <4 x double> %cond + +; CHECK-LABEL: @testqv4doubleugt +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]] +; CHECK: .LBB[[BB1]]: +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB2]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x double> @testqv4doublene(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp ne i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 + 
ret <4 x double> %cond + +; CHECK-LABEL: @testqv4doublene +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}} +; CHECK: bclr 12, [[REG1]], 0 +; CHECK: qvfmr 1, 6 +; CHECK: blr +} + +define <4 x float> @testqv4floatslt(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp slt i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 + ret <4 x float> %cond + +; CHECK-LABEL: @testqv4floatslt +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]] +; CHECK: .LBB[[BB1]]: +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB2]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x float> @testqv4floatult(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp ult i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 + ret <4 x float> %cond + +; CHECK-LABEL: @testqv4floatult +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]] +; CHECK: .LBB[[BB1]]: +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB2]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x float> @testqv4floatsle(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp sle i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 + ret <4 x float> %cond + +; CHECK-LABEL: @testqv4floatsle +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 12, 2, .LBB[[BB]] +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x float> @testqv4floatule(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp ule i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 + ret <4 x float> %cond + +; CHECK-LABEL: @testqv4floatule +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 4, 2, .LBB[[BB]] +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x float> @testqv4floateq(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp eq i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 + ret <4 x float> %cond + +; CHECK-LABEL: @testqv4floateq +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: creqv [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}} +; CHECK: bclr 12, [[REG1]], 0 +; CHECK: qvfmr 1, 6 +; CHECK: blr +} + +define <4 x float> @testqv4floatsge(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp sge i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 + ret <4 x float> %cond + +; CHECK-LABEL: 
@testqv4floatsge +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 4, 2, .LBB[[BB]] +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x float> @testqv4floatuge(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp uge i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 + ret <4 x float> %cond + +; CHECK-LABEL: @testqv4floatuge +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 12, 2, .LBB[[BB]] +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x float> @testqv4floatsgt(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp sgt i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 + ret <4 x float> %cond + +; CHECK-LABEL: @testqv4floatsgt +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]] +; CHECK: .LBB[[BB1]]: +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB2]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x float> @testqv4floatugt(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp ugt i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 + ret <4 x float> %cond + +; CHECK-LABEL: @testqv4floatugt +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]] +; CHECK: .LBB[[BB1]]: +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB2]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x float> @testqv4floatne(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp ne i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 + ret <4 x float> %cond + +; CHECK-LABEL: @testqv4floatne +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}} +; CHECK: bclr 12, [[REG1]], 0 +; CHECK: qvfmr 1, 6 +; CHECK: blr +} + +define <4 x i1> @testqv4i1slt(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp slt i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 + ret <4 x i1> %cond + +; CHECK-LABEL: @testqv4i1slt +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]] +; CHECK: .LBB[[BB1]]: +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB2]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x i1> @testqv4i1ult(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp ult i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 + ret <4 x i1> %cond + +; CHECK-LABEL: @testqv4i1ult +; CHECK-DAG: 
fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]] +; CHECK: .LBB[[BB1]]: +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB2]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x i1> @testqv4i1sle(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp sle i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 + ret <4 x i1> %cond + +; CHECK-LABEL: @testqv4i1sle +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 12, 2, .LBB[[BB]] +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x i1> @testqv4i1ule(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp ule i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 + ret <4 x i1> %cond + +; CHECK-LABEL: @testqv4i1ule +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 4, 2, .LBB[[BB]] +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x i1> @testqv4i1eq(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp eq i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 + ret <4 x i1> %cond + +; CHECK-LABEL: @testqv4i1eq +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: creqv [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}} +; CHECK: bclr 12, [[REG1]], 0 +; CHECK: qvfmr 1, 6 +; CHECK: blr +} + +define <4 x i1> @testqv4i1sge(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp sge i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 + ret <4 x i1> %cond + +; CHECK-LABEL: @testqv4i1sge +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 4, 2, .LBB[[BB]] +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x i1> @testqv4i1uge(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp uge i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 + ret <4 x i1> %cond + +; CHECK-LABEL: @testqv4i1uge +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 12, 2, .LBB[[BB]] +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x i1> @testqv4i1sgt(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp sgt i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 + ret <4 x i1> %cond + +; CHECK-LABEL: @testqv4i1sgt +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]] +; CHECK: .LBB[[BB1]]: +; CHECK: qvfmr 5, 6 +; CHECK: 
.LBB[[BB2]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x i1> @testqv4i1ugt(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp ugt i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 + ret <4 x i1> %cond + +; CHECK-LABEL: @testqv4i1ugt +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]] +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]] +; CHECK: .LBB[[BB1]]: +; CHECK: qvfmr 5, 6 +; CHECK: .LBB[[BB2]]: +; CHECK: qvfmr 1, 5 +; CHECK: blr +} + +define <4 x i1> @testqv4i1ne(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { +entry: + %cmp1 = fcmp oeq float %c3, %c4 + %cmp3tmp = fcmp oeq float %c1, %c2 + %cmp3 = icmp ne i1 %cmp3tmp, %cmp1 + %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 + ret <4 x i1> %cond + +; CHECK-LABEL: @testqv4i1ne +; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 +; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 +; CHECK: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}} +; CHECK: bclr 12, [[REG1]], 0 +; CHECK: qvfmr 1, 6 +; CHECK: blr +} + attributes #0 = { nounwind readnone "target-cpu"="pwr7" } +attributes #1 = { nounwind readnone "target-cpu"="a2q" } diff --git a/llvm/test/CodeGen/PowerPC/selectiondag-extload-computeknownbits.ll b/llvm/test/CodeGen/PowerPC/selectiondag-extload-computeknownbits.ll index 73fce78c33aa7..53d17d8668270 100644 --- a/llvm/test/CodeGen/PowerPC/selectiondag-extload-computeknownbits.ll +++ b/llvm/test/CodeGen/PowerPC/selectiondag-extload-computeknownbits.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux < %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux < %s ; Check that llc does not crash due to an illegal APInt operation diff --git a/llvm/test/CodeGen/PowerPC/setcr_bc.mir b/llvm/test/CodeGen/PowerPC/setcr_bc.mir index 564ee7d45957b..e9d81da681fcc 100644 --- a/llvm/test/CodeGen/PowerPC/setcr_bc.mir +++ b/llvm/test/CodeGen/PowerPC/setcr_bc.mir @@ -32,8 +32,8 @@ ret i32 %call2.i.sink } - attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" 
"unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } ... --- diff --git a/llvm/test/CodeGen/PowerPC/setcr_bc2.mir b/llvm/test/CodeGen/PowerPC/setcr_bc2.mir index 513cb85e1580a..582284d6d0a59 100644 --- a/llvm/test/CodeGen/PowerPC/setcr_bc2.mir +++ b/llvm/test/CodeGen/PowerPC/setcr_bc2.mir @@ -32,8 +32,8 @@ ret i32 %call2.i.sink } - attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } ... 
--- diff --git a/llvm/test/CodeGen/PowerPC/stwu-sched.ll b/llvm/test/CodeGen/PowerPC/stwu-sched.ll index 36afaf84a296b..0afd2ee406894 100644 --- a/llvm/test/CodeGen/PowerPC/stwu-sched.ll +++ b/llvm/test/CodeGen/PowerPC/stwu-sched.ll @@ -58,7 +58,7 @@ define void @initCombList(%0* nocapture, i32 signext) local_unnamed_addr #0 { ret void } -attributes #0 = { norecurse nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.ident = !{!0} diff --git a/llvm/test/CodeGen/PowerPC/unal-vec-ldst.ll b/llvm/test/CodeGen/PowerPC/unal-vec-ldst.ll index 79a368dd095ac..497add38e0444 100644 --- a/llvm/test/CodeGen/PowerPC/unal-vec-ldst.ll +++ b/llvm/test/CodeGen/PowerPC/unal-vec-ldst.ll @@ -327,6 +327,72 @@ entry: } +define <4 x float> @test_l_qv4float(<4 x float>* %p) #1 { +; CHECK-LABEL: test_l_qv4float: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 4, 15 +; CHECK-NEXT: qvlpclsx 0, 0, 3 +; CHECK-NEXT: qvlfsx 1, 3, 4 +; CHECK-NEXT: qvlfsx 2, 0, 3 +; CHECK-NEXT: qvfperm 1, 2, 1, 0 +; CHECK-NEXT: blr +entry: + %r = load <4 x float>, <4 x float>* %p, align 4 + ret <4 x float> %r + +} + +define <8 x float> @test_l_qv8float(<8 x float>* %p) #1 { +; CHECK-LABEL: test_l_qv8float: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 4, 31 +; CHECK-NEXT: qvlpclsx 1, 0, 3 +; CHECK-NEXT: qvlfsx 0, 3, 4 +; CHECK-NEXT: li 4, 16 +; CHECK-NEXT: qvlfsx 3, 3, 4 +; CHECK-NEXT: qvlfsx 4, 0, 3 +; CHECK-NEXT: qvfperm 2, 3, 0, 1 +; CHECK-NEXT: qvfperm 1, 4, 3, 1 +; CHECK-NEXT: blr +entry: + %r = load <8 x float>, <8 x float>* %p, align 4 + ret <8 x float> %r + +} + +define <4 x double> @test_l_qv4double(<4 x double>* %p) #1 { +; CHECK-LABEL: test_l_qv4double: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 4, 31 +; CHECK-NEXT: qvlpcldx 0, 0, 3 +; CHECK-NEXT: qvlfdx 1, 3, 4 +; CHECK-NEXT: qvlfdx 2, 0, 3 +; CHECK-NEXT: qvfperm 1, 2, 1, 0 +; CHECK-NEXT: blr +entry: + %r = load <4 x double>, <4 x double>* %p, align 8 + ret <4 x double> %r + +} + +define <8 x double> @test_l_qv8double(<8 x double>* %p) #1 { +; CHECK-LABEL: test_l_qv8double: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 4, 63 +; CHECK-NEXT: qvlpcldx 1, 0, 3 +; CHECK-NEXT: qvlfdx 0, 3, 4 +; CHECK-NEXT: li 4, 32 +; CHECK-NEXT: qvlfdx 3, 3, 4 +; CHECK-NEXT: qvlfdx 4, 0, 3 +; CHECK-NEXT: qvfperm 2, 3, 0, 1 +; CHECK-NEXT: qvfperm 1, 4, 3, 1 +; CHECK-NEXT: blr +entry: + %r = load <8 x double>, <8 x double>* %p, align 8 + ret <8 x double> %r + +} + define void @test_s_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 { ; CHECK-LABEL: test_s_v16i8: ; CHECK: # %bb.0: # 
%entry @@ -471,6 +537,89 @@ entry: } +define void @test_s_qv4float(<4 x float>* %p, <4 x float> %v) #1 { +; CHECK-LABEL: test_s_qv4float: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: qvesplati 0, 1, 3 +; CHECK-NEXT: stfs 1, 0(3) +; CHECK-NEXT: stfs 0, 12(3) +; CHECK-NEXT: qvesplati 0, 1, 2 +; CHECK-NEXT: qvesplati 1, 1, 1 +; CHECK-NEXT: stfs 0, 8(3) +; CHECK-NEXT: stfs 1, 4(3) +; CHECK-NEXT: blr +entry: + store <4 x float> %v, <4 x float>* %p, align 4 + ret void + +} + +define void @test_s_qv8float(<8 x float>* %p, <8 x float> %v) #1 { +; CHECK-LABEL: test_s_qv8float: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: qvesplati 0, 2, 3 +; CHECK-NEXT: stfs 2, 16(3) +; CHECK-NEXT: stfs 0, 28(3) +; CHECK-NEXT: qvesplati 0, 2, 2 +; CHECK-NEXT: qvesplati 2, 2, 1 +; CHECK-NEXT: stfs 1, 0(3) +; CHECK-NEXT: stfs 0, 24(3) +; CHECK-NEXT: qvesplati 0, 1, 3 +; CHECK-NEXT: stfs 2, 20(3) +; CHECK-NEXT: qvesplati 2, 1, 2 +; CHECK-NEXT: qvesplati 1, 1, 1 +; CHECK-NEXT: stfs 0, 12(3) +; CHECK-NEXT: stfs 2, 8(3) +; CHECK-NEXT: stfs 1, 4(3) +; CHECK-NEXT: blr +entry: + store <8 x float> %v, <8 x float>* %p, align 4 + ret void + +} + +define void @test_s_qv4double(<4 x double>* %p, <4 x double> %v) #1 { +; CHECK-LABEL: test_s_qv4double: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: qvesplati 0, 1, 3 +; CHECK-NEXT: stfd 1, 0(3) +; CHECK-NEXT: stfd 0, 24(3) +; CHECK-NEXT: qvesplati 0, 1, 2 +; CHECK-NEXT: qvesplati 1, 1, 1 +; CHECK-NEXT: stfd 0, 16(3) +; CHECK-NEXT: stfd 1, 8(3) +; CHECK-NEXT: blr +entry: + store <4 x double> %v, <4 x double>* %p, align 8 + ret void + +} + +define void @test_s_qv8double(<8 x double>* %p, <8 x double> %v) #1 { +; CHECK-LABEL: test_s_qv8double: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: qvesplati 0, 2, 3 +; CHECK-NEXT: stfd 2, 32(3) +; CHECK-NEXT: stfd 0, 56(3) +; CHECK-NEXT: qvesplati 0, 2, 2 +; CHECK-NEXT: qvesplati 2, 2, 1 +; CHECK-NEXT: stfd 1, 0(3) +; CHECK-NEXT: stfd 0, 48(3) +; CHECK-NEXT: qvesplati 0, 1, 3 +; CHECK-NEXT: stfd 2, 40(3) +; CHECK-NEXT: qvesplati 2, 1, 2 +; CHECK-NEXT: qvesplati 1, 1, 1 +; CHECK-NEXT: stfd 0, 24(3) +; CHECK-NEXT: stfd 2, 16(3) +; CHECK-NEXT: stfd 1, 8(3) +; CHECK-NEXT: blr +entry: + store <8 x double> %v, <8 x double>* %p, align 8 + ret void + +} + attributes #0 = { nounwind "target-cpu"="pwr7" } +attributes #1 = { nounwind "target-cpu"="a2q" } attributes #2 = { nounwind "target-cpu"="pwr8" } diff --git a/llvm/test/CodeGen/PowerPC/uwtables.ll b/llvm/test/CodeGen/PowerPC/uwtables.ll index e302934ab8d6b..7523d04d73d38 100644 --- a/llvm/test/CodeGen/PowerPC/uwtables.ll +++ b/llvm/test/CodeGen/PowerPC/uwtables.ll @@ -47,5 +47,5 @@ declare i32 @__gxx_personality_v0(...) 
declare void @__cxa_call_unexpected(i8*) local_unnamed_addr -attributes #0 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll b/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll index 33f3d82c3683d..36da7add88015 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll @@ -55,6 +55,21 @@ define i32 @bar2() { ; CHECK: store <2 x i64> zeroinitializer, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_va_arg_tls to i64), i64 8) to <2 x i64>*), align 8 ; CHECK: store {{.*}} 24, {{.*}} @__msan_va_arg_overflow_size_tls +; Check QPX vector argument. +define i32 @bar3() "target-features"="+qpx" { + %1 = call i32 (i32, ...) @foo(i32 0, i32 1, i32 2, <4 x double> ) + ret i32 %1 +} + +; That one is even stranger: the parameter save area starts at offset 48 from +; (32-byte aligned) stack pointer, the vector parameter is at 96 bytes from +; the stack pointer, so its offset from parameter save area is misaligned. +; CHECK-LABEL: @bar3 +; CHECK: store i32 0, i32* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_va_arg_tls to i64), i64 4) to i32*), align 8 +; CHECK: store i32 0, i32* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_va_arg_tls to i64), i64 12) to i32*), align 8 +; CHECK: store <4 x i64> zeroinitializer, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_va_arg_tls to i64), i64 40) to <4 x i64>*), align 8 +; CHECK: store {{.*}} 72, {{.*}} @__msan_va_arg_overflow_size_tls + ; Check i64 array. define i32 @bar4() { %1 = call i32 (i32, ...) 
@foo(i32 0, [2 x i64] [i64 1, i64 2]) diff --git a/llvm/test/MC/Disassembler/PowerPC/qpx.txt b/llvm/test/MC/Disassembler/PowerPC/qpx.txt new file mode 100644 index 0000000000000..00e598bd4356e --- /dev/null +++ b/llvm/test/MC/Disassembler/PowerPC/qpx.txt @@ -0,0 +1,371 @@ +# RUN: llvm-mc --disassemble %s -triple powerpc64-bgq-linux -mcpu=a2q | FileCheck %s + +# CHECK: qvfabs 3, 5 +0x10 0x60 0x2a 0x10 + +# CHECK: qvfadd 3, 4, 5 +0x10 0x64 0x28 0x2a + +# CHECK: qvfadds 3, 4, 5 +0x00 0x64 0x28 0x2a + +# CHECK: qvfandc 3, 4, 5 +0x10 0x64 0x2a 0x08 + +# CHECK: qvfand 3, 4, 5 +0x10 0x64 0x28 0x88 + +# CHECK: qvfcfid 3, 5 +0x10 0x60 0x2e 0x9c + +# CHECK: qvfcfids 3, 5 +0x00 0x60 0x2e 0x9c + +# CHECK: qvfcfidu 3, 5 +0x10 0x60 0x2f 0x9c + +# CHECK: qvfcfidus 3, 5 +0x00 0x60 0x2f 0x9c + +# CHECK: qvfclr 3 +0x10 0x63 0x18 0x08 + +# CHECK: qvfcpsgn 3, 4, 5 +0x10 0x64 0x28 0x10 + +# CHECK: qvfctfb 3, 4 +0x10 0x64 0x22 0x88 + +# CHECK: qvfctid 3, 5 +0x10 0x60 0x2e 0x5c + +# CHECK: qvfctidu 3, 5 +0x10 0x60 0x2f 0x5c + +# CHECK: qvfctiduz 3, 5 +0x10 0x60 0x2f 0x5e + +# CHECK: qvfctidz 3, 5 +0x10 0x60 0x2e 0x5e + +# CHECK: qvfctiw 3, 5 +0x10 0x60 0x28 0x1c + +# CHECK: qvfctiwu 3, 5 +0x10 0x60 0x29 0x1c + +# CHECK: qvfctiwuz 3, 5 +0x10 0x60 0x29 0x1e + +# CHECK: qvfctiwz 3, 5 +0x10 0x60 0x28 0x1e + +# CHECK: qvfequ 3, 4, 5 +0x10 0x64 0x2c 0x88 + +# CHECK: qvflogical 3, 4, 5, 12 +0x10 0x64 0x2e 0x08 + +# CHECK: qvfmadd 3, 4, 6, 5 +0x10 0x64 0x29 0xba + +# CHECK: qvfmadds 3, 4, 6, 5 +0x00 0x64 0x29 0xba + +# CHECK: qvfmr 3, 5 +0x10 0x60 0x28 0x90 + +# CHECK: qvfmsub 3, 4, 6, 5 +0x10 0x64 0x29 0xb8 + +# CHECK: qvfmsubs 3, 4, 6, 5 +0x00 0x64 0x29 0xb8 + +# CHECK: qvfmul 3, 4, 6 +0x10 0x64 0x01 0xb2 + +# CHECK: qvfmuls 3, 4, 6 +0x00 0x64 0x01 0xb2 + +# CHECK: qvfnabs 3, 5 +0x10 0x60 0x29 0x10 + +# CHECK: qvfnand 3, 4, 5 +0x10 0x64 0x2f 0x08 + +# CHECK: qvfneg 3, 5 +0x10 0x60 0x28 0x50 + +# CHECK: qvfnmadd 3, 4, 6, 5 +0x10 0x64 0x29 0xbe + +# CHECK: qvfnmadds 3, 4, 6, 5 +0x00 0x64 0x29 0xbe + +# CHECK: qvfnmsub 3, 4, 6, 5 +0x10 0x64 0x29 0xbc + +# CHECK: qvfnmsubs 3, 4, 6, 5 +0x00 0x64 0x29 0xbc + +# CHECK: qvfnor 3, 4, 5 +0x10 0x64 0x2c 0x08 + +# CHECK: qvfnot 3, 4 +0x10 0x64 0x25 0x08 + +# CHECK: qvforc 3, 4, 5 +0x10 0x64 0x2e 0x88 + +# CHECK: qvfor 3, 4, 5 +0x10 0x64 0x2b 0x88 + +# CHECK: qvfperm 3, 4, 5, 6 +0x10 0x64 0x29 0x8c + +# CHECK: qvfre 3, 5 +0x10 0x60 0x28 0x30 + +# CHECK: qvfres 3, 5 +0x00 0x60 0x28 0x30 + +# CHECK: qvfrim 3, 5 +0x10 0x60 0x2b 0xd0 + +# CHECK: qvfrin 3, 5 +0x10 0x60 0x2b 0x10 + +# CHECK: qvfrip 3, 5 +0x10 0x60 0x2b 0x90 + +# CHECK: qvfriz 3, 5 +0x10 0x60 0x2b 0x50 + +# CHECK: qvfrsp 3, 5 +0x10 0x60 0x28 0x18 + +# CHECK: qvfrsqrte 3, 5 +0x10 0x60 0x28 0x34 + +# CHECK: qvfrsqrtes 3, 5 +0x00 0x60 0x28 0x34 + +# CHECK: qvfsel 3, 4, 6, 5 +0x10 0x64 0x29 0xae + +# CHECK: qvfset 3 +0x10 0x63 0x1f 0x88 + +# CHECK: qvfsub 3, 4, 5 +0x10 0x64 0x28 0x28 + +# CHECK: qvfsubs 3, 4, 5 +0x00 0x64 0x28 0x28 + +# CHECK: qvfxmadd 3, 4, 6, 5 +0x10 0x64 0x29 0x92 + +# CHECK: qvfxmadds 3, 4, 6, 5 +0x00 0x64 0x29 0x92 + +# CHECK: qvfxmul 3, 4, 6 +0x10 0x64 0x01 0xa2 + +# CHECK: qvfxmuls 3, 4, 6 +0x00 0x64 0x01 0xa2 + +# CHECK: qvfxor 3, 4, 5 +0x10 0x64 0x2b 0x08 + +# CHECK: qvfxxcpnmadd 3, 4, 6, 5 +0x10 0x64 0x29 0x86 + +# CHECK: qvfxxcpnmadds 3, 4, 6, 5 +0x00 0x64 0x29 0x86 + +# CHECK: qvfxxmadd 3, 4, 6, 5 +0x10 0x64 0x29 0x82 + +# CHECK: qvfxxmadds 3, 4, 6, 5 +0x00 0x64 0x29 0x82 + +# CHECK: qvfxxnpmadd 3, 4, 6, 5 +0x10 0x64 0x29 0x96 + +# CHECK: qvfxxnpmadds 3, 4, 6, 5 +0x00 0x64 0x29 0x96 + +# CHECK: 
qvlfcduxa 3, 9, 11 +0x7c 0x69 0x58 0xcf + +# CHECK: qvlfcdux 3, 9, 11 +0x7c 0x69 0x58 0xce + +# CHECK: qvlfcdxa 3, 10, 11 +0x7c 0x6a 0x58 0x8f + +# CHECK: qvlfcdx 3, 10, 11 +0x7c 0x6a 0x58 0x8e + +# CHECK: qvlfcsuxa 3, 9, 11 +0x7c 0x69 0x58 0x4f + +# CHECK: qvlfcsux 3, 9, 11 +0x7c 0x69 0x58 0x4e + +# CHECK: qvlfcsxa 3, 10, 11 +0x7c 0x6a 0x58 0x0f + +# CHECK: qvlfcsx 3, 10, 11 +0x7c 0x6a 0x58 0x0e + +# CHECK: qvlfduxa 3, 9, 11 +0x7c 0x69 0x5c 0xcf + +# CHECK: qvlfdux 3, 9, 11 +0x7c 0x69 0x5c 0xce + +# CHECK: qvlfdxa 3, 10, 11 +0x7c 0x6a 0x5c 0x8f + +# CHECK: qvlfdx 3, 10, 11 +0x7c 0x6a 0x5c 0x8e + +# CHECK: qvlfiwaxa 3, 10, 11 +0x7c 0x6a 0x5e 0xcf + +# CHECK: qvlfiwax 3, 10, 11 +0x7c 0x6a 0x5e 0xce + +# CHECK: qvlfiwzxa 3, 10, 11 +0x7c 0x6a 0x5e 0x8f + +# CHECK: qvlfiwzx 3, 10, 11 +0x7c 0x6a 0x5e 0x8e + +# CHECK: qvlfsuxa 3, 9, 11 +0x7c 0x69 0x5c 0x4f + +# CHECK: qvlfsux 3, 9, 11 +0x7c 0x69 0x5c 0x4e + +# CHECK: qvlfsxa 3, 10, 11 +0x7c 0x6a 0x5c 0x0f + +# CHECK: qvlfsx 3, 10, 11 +0x7c 0x6a 0x5c 0x0e + +# CHECK: qvlpcldx 3, 10, 11 +0x7c 0x6a 0x5c 0x8c + +# CHECK: qvlpclsx 3, 10, 11 +0x7c 0x6a 0x5c 0x0c + +# CHECK: qvlpcrdx 3, 10, 11 +0x7c 0x6a 0x58 0x8c + +# CHECK: qvlpcrsx 3, 10, 11 +0x7c 0x6a 0x58 0x0c + +# CHECK: qvstfcduxa 2, 9, 11 +0x7c 0x49 0x59 0xcf + +# CHECK: qvstfcduxia 2, 9, 11 +0x7c 0x49 0x59 0xcb + +# CHECK: qvstfcduxi 2, 9, 11 +0x7c 0x49 0x59 0xca + +# CHECK: qvstfcdux 2, 9, 11 +0x7c 0x49 0x59 0xce + +# CHECK: qvstfcdxa 2, 10, 11 +0x7c 0x4a 0x59 0x8f + +# CHECK: qvstfcdxia 2, 10, 11 +0x7c 0x4a 0x59 0x8b + +# CHECK: qvstfcdxi 2, 10, 11 +0x7c 0x4a 0x59 0x8a + +# CHECK: qvstfcdx 2, 10, 11 +0x7c 0x4a 0x59 0x8e + +# CHECK: qvstfcsuxa 2, 9, 11 +0x7c 0x49 0x59 0x4f + +# CHECK: qvstfcsuxia 2, 9, 11 +0x7c 0x49 0x59 0x4b + +# CHECK: qvstfcsuxi 2, 9, 11 +0x7c 0x49 0x59 0x4a + +# CHECK: qvstfcsux 2, 9, 11 +0x7c 0x49 0x59 0x4e + +# CHECK: qvstfcsxa 2, 10, 11 +0x7c 0x4a 0x59 0x0f + +# CHECK: qvstfcsxia 2, 10, 11 +0x7c 0x4a 0x59 0x0b + +# CHECK: qvstfcsxi 2, 10, 11 +0x7c 0x4a 0x59 0x0a + +# CHECK: qvstfcsx 2, 10, 11 +0x7c 0x4a 0x59 0x0e + +# CHECK: qvstfduxa 2, 9, 11 +0x7c 0x49 0x5d 0xcf + +# CHECK: qvstfduxia 2, 9, 11 +0x7c 0x49 0x5d 0xcb + +# CHECK: qvstfduxi 2, 9, 11 +0x7c 0x49 0x5d 0xca + +# CHECK: qvstfdux 2, 9, 11 +0x7c 0x49 0x5d 0xce + +# CHECK: qvstfdxa 2, 10, 11 +0x7c 0x4a 0x5d 0x8f + +# CHECK: qvstfdxia 2, 10, 11 +0x7c 0x4a 0x5d 0x8b + +# CHECK: qvstfdxi 2, 10, 11 +0x7c 0x4a 0x5d 0x8a + +# CHECK: qvstfdx 2, 10, 11 +0x7c 0x4a 0x5d 0x8e + +# CHECK: qvstfiwxa 2, 10, 11 +0x7c 0x4a 0x5f 0x8f + +# CHECK: qvstfiwx 2, 10, 11 +0x7c 0x4a 0x5f 0x8e + +# CHECK: qvstfsuxa 2, 9, 11 +0x7c 0x49 0x5d 0x4f + +# CHECK: qvstfsuxia 2, 9, 11 +0x7c 0x49 0x5d 0x4b + +# CHECK: qvstfsuxi 2, 9, 11 +0x7c 0x49 0x5d 0x4a + +# CHECK: qvstfsux 2, 9, 11 +0x7c 0x49 0x5d 0x4e + +# CHECK: qvstfsxa 2, 10, 11 +0x7c 0x4a 0x5d 0x0f + +# CHECK: qvstfsxia 2, 10, 11 +0x7c 0x4a 0x5d 0x0b + +# CHECK: qvstfsxi 2, 10, 11 +0x7c 0x4a 0x5d 0x0a + +# CHECK: qvstfsx 2, 10, 11 +0x7c 0x4a 0x5d 0x0e + diff --git a/llvm/test/MC/PowerPC/qpx.s b/llvm/test/MC/PowerPC/qpx.s new file mode 100644 index 0000000000000..a1fb2090f8fff --- /dev/null +++ b/llvm/test/MC/PowerPC/qpx.s @@ -0,0 +1,252 @@ +# RUN: llvm-mc -triple powerpc64-bgq-linux --show-encoding %s | FileCheck %s + +# CHECK: qvfabs 3, 5 # encoding: [0x10,0x60,0x2a,0x10] + qvfabs %q3, %q5 + +# CHECK: qvfabs 3, 5 # encoding: [0x10,0x60,0x2a,0x10] + qvfabs 3, 5 +# CHECK: qvfadd 3, 4, 5 # encoding: [0x10,0x64,0x28,0x2a] + qvfadd 3, 4, 5 +# CHECK: qvfadds 3, 4, 5 # encoding: 
[0x00,0x64,0x28,0x2a] + qvfadds 3, 4, 5 +# CHECK: qvfandc 3, 4, 5 # encoding: [0x10,0x64,0x2a,0x08] + qvfandc 3, 4, 5 +# CHECK: qvfand 3, 4, 5 # encoding: [0x10,0x64,0x28,0x88] + qvfand 3, 4, 5 +# CHECK: qvfcfid 3, 5 # encoding: [0x10,0x60,0x2e,0x9c] + qvfcfid 3, 5 +# CHECK: qvfcfids 3, 5 # encoding: [0x00,0x60,0x2e,0x9c] + qvfcfids 3, 5 +# CHECK: qvfcfidu 3, 5 # encoding: [0x10,0x60,0x2f,0x9c] + qvfcfidu 3, 5 +# CHECK: qvfcfidus 3, 5 # encoding: [0x00,0x60,0x2f,0x9c] + qvfcfidus 3, 5 +# CHECK: qvfclr 3 # encoding: [0x10,0x63,0x18,0x08] + qvfclr 3 +# CHECK: qvfcpsgn 3, 4, 5 # encoding: [0x10,0x64,0x28,0x10] + qvfcpsgn 3, 4, 5 +# CHECK: qvfctfb 3, 4 # encoding: [0x10,0x64,0x22,0x88] + qvfctfb 3, 4 +# CHECK: qvfctid 3, 5 # encoding: [0x10,0x60,0x2e,0x5c] + qvfctid 3, 5 +# CHECK: qvfctidu 3, 5 # encoding: [0x10,0x60,0x2f,0x5c] + qvfctidu 3, 5 +# CHECK: qvfctiduz 3, 5 # encoding: [0x10,0x60,0x2f,0x5e] + qvfctiduz 3, 5 +# CHECK: qvfctidz 3, 5 # encoding: [0x10,0x60,0x2e,0x5e] + qvfctidz 3, 5 +# CHECK: qvfctiw 3, 5 # encoding: [0x10,0x60,0x28,0x1c] + qvfctiw 3, 5 +# CHECK: qvfctiwu 3, 5 # encoding: [0x10,0x60,0x29,0x1c] + qvfctiwu 3, 5 +# CHECK: qvfctiwuz 3, 5 # encoding: [0x10,0x60,0x29,0x1e] + qvfctiwuz 3, 5 +# CHECK: qvfctiwz 3, 5 # encoding: [0x10,0x60,0x28,0x1e] + qvfctiwz 3, 5 +# CHECK: qvfequ 3, 4, 5 # encoding: [0x10,0x64,0x2c,0x88] + qvfequ 3, 4, 5 +# CHECK: qvflogical 3, 4, 5, 12 # encoding: [0x10,0x64,0x2e,0x08] + qvflogical 3, 4, 5, 12 +# CHECK: qvfmadd 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0xba] + qvfmadd 3, 4, 6, 5 +# CHECK: qvfmadds 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0xba] + qvfmadds 3, 4, 6, 5 +# CHECK: qvfmr 3, 5 # encoding: [0x10,0x60,0x28,0x90] + qvfmr 3, 5 +# CHECK: qvfmsub 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0xb8] + qvfmsub 3, 4, 6, 5 +# CHECK: qvfmsubs 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0xb8] + qvfmsubs 3, 4, 6, 5 +# CHECK: qvfmul 3, 4, 6 # encoding: [0x10,0x64,0x01,0xb2] + qvfmul 3, 4, 6 +# CHECK: qvfmuls 3, 4, 6 # encoding: [0x00,0x64,0x01,0xb2] + qvfmuls 3, 4, 6 +# CHECK: qvfnabs 3, 5 # encoding: [0x10,0x60,0x29,0x10] + qvfnabs 3, 5 +# CHECK: qvfnand 3, 4, 5 # encoding: [0x10,0x64,0x2f,0x08] + qvfnand 3, 4, 5 +# CHECK: qvfneg 3, 5 # encoding: [0x10,0x60,0x28,0x50] + qvfneg 3, 5 +# CHECK: qvfnmadd 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0xbe] + qvfnmadd 3, 4, 6, 5 +# CHECK: qvfnmadds 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0xbe] + qvfnmadds 3, 4, 6, 5 +# CHECK: qvfnmsub 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0xbc] + qvfnmsub 3, 4, 6, 5 +# CHECK: qvfnmsubs 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0xbc] + qvfnmsubs 3, 4, 6, 5 +# CHECK: qvfnor 3, 4, 5 # encoding: [0x10,0x64,0x2c,0x08] + qvfnor 3, 4, 5 +# CHECK: qvfnot 3, 4 # encoding: [0x10,0x64,0x25,0x08] + qvfnot 3, 4 +# CHECK: qvforc 3, 4, 5 # encoding: [0x10,0x64,0x2e,0x88] + qvforc 3, 4, 5 +# CHECK: qvfor 3, 4, 5 # encoding: [0x10,0x64,0x2b,0x88] + qvfor 3, 4, 5 +# CHECK: qvfperm 3, 4, 5, 6 # encoding: [0x10,0x64,0x29,0x8c] + qvfperm 3, 4, 5, 6 +# CHECK: qvfre 3, 5 # encoding: [0x10,0x60,0x28,0x30] + qvfre 3, 5 +# CHECK: qvfres 3, 5 # encoding: [0x00,0x60,0x28,0x30] + qvfres 3, 5 +# CHECK: qvfrim 3, 5 # encoding: [0x10,0x60,0x2b,0xd0] + qvfrim 3, 5 +# CHECK: qvfrin 3, 5 # encoding: [0x10,0x60,0x2b,0x10] + qvfrin 3, 5 +# CHECK: qvfrip 3, 5 # encoding: [0x10,0x60,0x2b,0x90] + qvfrip 3, 5 +# CHECK: qvfriz 3, 5 # encoding: [0x10,0x60,0x2b,0x50] + qvfriz 3, 5 +# CHECK: qvfrsp 3, 5 # encoding: [0x10,0x60,0x28,0x18] + qvfrsp 3, 5 +# CHECK: qvfrsqrte 3, 5 # encoding: [0x10,0x60,0x28,0x34] + qvfrsqrte 3, 5 +# CHECK: 
qvfrsqrtes 3, 5 # encoding: [0x00,0x60,0x28,0x34] + qvfrsqrtes 3, 5 +# CHECK: qvfsel 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0xae] + qvfsel 3, 4, 6, 5 +# CHECK: qvfset 3 # encoding: [0x10,0x63,0x1f,0x88] + qvfset 3 +# CHECK: qvfsub 3, 4, 5 # encoding: [0x10,0x64,0x28,0x28] + qvfsub 3, 4, 5 +# CHECK: qvfsubs 3, 4, 5 # encoding: [0x00,0x64,0x28,0x28] + qvfsubs 3, 4, 5 +# CHECK: qvfxmadd 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0x92] + qvfxmadd 3, 4, 6, 5 +# CHECK: qvfxmadds 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0x92] + qvfxmadds 3, 4, 6, 5 +# CHECK: qvfxmul 3, 4, 6 # encoding: [0x10,0x64,0x01,0xa2] + qvfxmul 3, 4, 6 +# CHECK: qvfxmuls 3, 4, 6 # encoding: [0x00,0x64,0x01,0xa2] + qvfxmuls 3, 4, 6 +# CHECK: qvfxor 3, 4, 5 # encoding: [0x10,0x64,0x2b,0x08] + qvfxor 3, 4, 5 +# CHECK: qvfxxcpnmadd 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0x86] + qvfxxcpnmadd 3, 4, 6, 5 +# CHECK: qvfxxcpnmadds 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0x86] + qvfxxcpnmadds 3, 4, 6, 5 +# CHECK: qvfxxmadd 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0x82] + qvfxxmadd 3, 4, 6, 5 +# CHECK: qvfxxmadds 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0x82] + qvfxxmadds 3, 4, 6, 5 +# CHECK: qvfxxnpmadd 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0x96] + qvfxxnpmadd 3, 4, 6, 5 +# CHECK: qvfxxnpmadds 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0x96] + qvfxxnpmadds 3, 4, 6, 5 +# CHECK: qvlfcduxa 3, 9, 11 # encoding: [0x7c,0x69,0x58,0xcf] + qvlfcduxa 3, 9, 11 +# CHECK: qvlfcdux 3, 9, 11 # encoding: [0x7c,0x69,0x58,0xce] + qvlfcdux 3, 9, 11 +# CHECK: qvlfcdxa 3, 10, 11 # encoding: [0x7c,0x6a,0x58,0x8f] + qvlfcdxa 3, 10, 11 +# CHECK: qvlfcdx 3, 10, 11 # encoding: [0x7c,0x6a,0x58,0x8e] + qvlfcdx 3, 10, 11 +# CHECK: qvlfcsuxa 3, 9, 11 # encoding: [0x7c,0x69,0x58,0x4f] + qvlfcsuxa 3, 9, 11 +# CHECK: qvlfcsux 3, 9, 11 # encoding: [0x7c,0x69,0x58,0x4e] + qvlfcsux 3, 9, 11 +# CHECK: qvlfcsxa 3, 10, 11 # encoding: [0x7c,0x6a,0x58,0x0f] + qvlfcsxa 3, 10, 11 +# CHECK: qvlfcsx 3, 10, 11 # encoding: [0x7c,0x6a,0x58,0x0e] + qvlfcsx 3, 10, 11 +# CHECK: qvlfduxa 3, 9, 11 # encoding: [0x7c,0x69,0x5c,0xcf] + qvlfduxa 3, 9, 11 +# CHECK: qvlfdux 3, 9, 11 # encoding: [0x7c,0x69,0x5c,0xce] + qvlfdux 3, 9, 11 +# CHECK: qvlfdxa 3, 10, 11 # encoding: [0x7c,0x6a,0x5c,0x8f] + qvlfdxa 3, 10, 11 +# CHECK: qvlfdx 3, 10, 11 # encoding: [0x7c,0x6a,0x5c,0x8e] + qvlfdx 3, 10, 11 +# CHECK: qvlfiwaxa 3, 10, 11 # encoding: [0x7c,0x6a,0x5e,0xcf] + qvlfiwaxa 3, 10, 11 +# CHECK: qvlfiwax 3, 10, 11 # encoding: [0x7c,0x6a,0x5e,0xce] + qvlfiwax 3, 10, 11 +# CHECK: qvlfiwzxa 3, 10, 11 # encoding: [0x7c,0x6a,0x5e,0x8f] + qvlfiwzxa 3, 10, 11 +# CHECK: qvlfiwzx 3, 10, 11 # encoding: [0x7c,0x6a,0x5e,0x8e] + qvlfiwzx 3, 10, 11 +# CHECK: qvlfsuxa 3, 9, 11 # encoding: [0x7c,0x69,0x5c,0x4f] + qvlfsuxa 3, 9, 11 +# CHECK: qvlfsux 3, 9, 11 # encoding: [0x7c,0x69,0x5c,0x4e] + qvlfsux 3, 9, 11 +# CHECK: qvlfsxa 3, 10, 11 # encoding: [0x7c,0x6a,0x5c,0x0f] + qvlfsxa 3, 10, 11 +# CHECK: qvlfsx 3, 10, 11 # encoding: [0x7c,0x6a,0x5c,0x0e] + qvlfsx 3, 10, 11 +# CHECK: qvlpcldx 3, 10, 11 # encoding: [0x7c,0x6a,0x5c,0x8c] + qvlpcldx 3, 10, 11 +# CHECK: qvlpclsx 3, 10, 11 # encoding: [0x7c,0x6a,0x5c,0x0c] + qvlpclsx 3, 10, 11 +# CHECK: qvlpcrdx 3, 10, 11 # encoding: [0x7c,0x6a,0x58,0x8c] + qvlpcrdx 3, 10, 11 +# CHECK: qvlpcrsx 3, 10, 11 # encoding: [0x7c,0x6a,0x58,0x0c] + qvlpcrsx 3, 10, 11 +# CHECK: qvstfcduxa 2, 9, 11 # encoding: [0x7c,0x49,0x59,0xcf] + qvstfcduxa 2, 9, 11 +# CHECK: qvstfcduxia 2, 9, 11 # encoding: [0x7c,0x49,0x59,0xcb] + qvstfcduxia 2, 9, 11 +# CHECK: qvstfcduxi 2, 9, 11 # encoding: 
[0x7c,0x49,0x59,0xca] + qvstfcduxi 2, 9, 11 +# CHECK: qvstfcdux 2, 9, 11 # encoding: [0x7c,0x49,0x59,0xce] + qvstfcdux 2, 9, 11 +# CHECK: qvstfcdxa 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x8f] + qvstfcdxa 2, 10, 11 +# CHECK: qvstfcdxia 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x8b] + qvstfcdxia 2, 10, 11 +# CHECK: qvstfcdxi 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x8a] + qvstfcdxi 2, 10, 11 +# CHECK: qvstfcdx 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x8e] + qvstfcdx 2, 10, 11 +# CHECK: qvstfcsuxa 2, 9, 11 # encoding: [0x7c,0x49,0x59,0x4f] + qvstfcsuxa 2, 9, 11 +# CHECK: qvstfcsuxia 2, 9, 11 # encoding: [0x7c,0x49,0x59,0x4b] + qvstfcsuxia 2, 9, 11 +# CHECK: qvstfcsuxi 2, 9, 11 # encoding: [0x7c,0x49,0x59,0x4a] + qvstfcsuxi 2, 9, 11 +# CHECK: qvstfcsux 2, 9, 11 # encoding: [0x7c,0x49,0x59,0x4e] + qvstfcsux 2, 9, 11 +# CHECK: qvstfcsxa 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x0f] + qvstfcsxa 2, 10, 11 +# CHECK: qvstfcsxia 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x0b] + qvstfcsxia 2, 10, 11 +# CHECK: qvstfcsxi 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x0a] + qvstfcsxi 2, 10, 11 +# CHECK: qvstfcsx 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x0e] + qvstfcsx 2, 10, 11 +# CHECK: qvstfduxa 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0xcf] + qvstfduxa 2, 9, 11 +# CHECK: qvstfduxia 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0xcb] + qvstfduxia 2, 9, 11 +# CHECK: qvstfduxi 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0xca] + qvstfduxi 2, 9, 11 +# CHECK: qvstfdux 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0xce] + qvstfdux 2, 9, 11 +# CHECK: qvstfdxa 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x8f] + qvstfdxa 2, 10, 11 +# CHECK: qvstfdxia 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x8b] + qvstfdxia 2, 10, 11 +# CHECK: qvstfdxi 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x8a] + qvstfdxi 2, 10, 11 +# CHECK: qvstfdx 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x8e] + qvstfdx 2, 10, 11 +# CHECK: qvstfiwxa 2, 10, 11 # encoding: [0x7c,0x4a,0x5f,0x8f] + qvstfiwxa 2, 10, 11 +# CHECK: qvstfiwx 2, 10, 11 # encoding: [0x7c,0x4a,0x5f,0x8e] + qvstfiwx 2, 10, 11 +# CHECK: qvstfsuxa 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0x4f] + qvstfsuxa 2, 9, 11 +# CHECK: qvstfsuxia 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0x4b] + qvstfsuxia 2, 9, 11 +# CHECK: qvstfsuxi 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0x4a] + qvstfsuxi 2, 9, 11 +# CHECK: qvstfsux 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0x4e] + qvstfsux 2, 9, 11 +# CHECK: qvstfsxa 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x0f] + qvstfsxa 2, 10, 11 +# CHECK: qvstfsxia 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x0b] + qvstfsxia 2, 10, 11 +# CHECK: qvstfsxi 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x0a] + qvstfsxi 2, 10, 11 +# CHECK: qvstfsx 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x0e] + qvstfsx 2, 10, 11 + diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/fp-bc-icmp-const-fold.ll b/llvm/test/Transforms/Attributor/IPConstantProp/fp-bc-icmp-const-fold.ll index 67c22f9470779..c45c48d502343 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/fp-bc-icmp-const-fold.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/fp-bc-icmp-const-fold.ll @@ -4,7 +4,7 @@ ; RUN: opt -attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM target datalayout = 
"E-m:e-i64:64-n32:64" -target triple = "powerpc64le-unknown-linux" +target triple = "powerpc64-bgq-linux" define void @test(i32 signext %n) { ; IS__TUNIT____: Function Attrs: nofree noreturn nosync nounwind readnone diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll index faf7041bfc387..6cd77a59df6b1 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64le-unknown-linux" +target triple = "powerpc64-bgq-linux" %"struct.std::complex" = type { { float, float } } diff --git a/llvm/test/Transforms/DeadStoreElimination/combined-partial-overwrites.ll b/llvm/test/Transforms/DeadStoreElimination/combined-partial-overwrites.ll index a57693a1da38e..2a61fff15ade0 100644 --- a/llvm/test/Transforms/DeadStoreElimination/combined-partial-overwrites.ll +++ b/llvm/test/Transforms/DeadStoreElimination/combined-partial-overwrites.ll @@ -1,6 +1,6 @@ ; RUN: opt -S -dse -enable-dse-partial-store-merging=false < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64le-unknown-linux" +target triple = "powerpc64-bgq-linux" %"struct.std::complex" = type { { float, float } } diff --git a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll index 5a6daa2c9a008..4c5f18a26657c 100644 --- a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll +++ b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll @@ -4,7 +4,7 @@ ; RUN: opt -passes="function(ee-instrument),function(ee-instrument),cgscc(inline),function(post-inline-ee-instrument),function(post-inline-ee-instrument)" -S < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64le-unknown-linux" +target triple = "powerpc64-bgq-linux" define void @leaf_function() #0 { entry: diff --git a/llvm/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll b/llvm/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll new file mode 100644 index 0000000000000..e9710df5670cd --- /dev/null +++ b/llvm/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll @@ -0,0 +1,165 @@ +; RUN: opt -S -instcombine < %s | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +declare <4 x double> @llvm.ppc.qpx.qvlfs(i8*) #1 + +define <4 x double> @test1(<4 x float>* %h) #0 { +entry: + %h1 = getelementptr <4 x float>, <4 x float>* %h, i64 1 + %hv = bitcast <4 x float>* %h1 to i8* + %vl = call <4 x double> @llvm.ppc.qpx.qvlfs(i8* %hv) + +; CHECK-LABEL: @test1 +; CHECK: @llvm.ppc.qpx.qvlfs +; CHECK: ret <4 x double> + + %v0 = load <4 x float>, <4 x float>* %h, align 8 + %v0e = fpext <4 x float> %v0 to <4 x double> + %a = fadd <4 x double> %v0e, %vl + ret <4 x double> %a +} + +define <4 x double> @test1a(<4 x float>* align 16 %h) #0 { +entry: + %h1 = getelementptr <4 x float>, <4 x float>* %h, i64 1 + %hv = bitcast <4 x float>* %h1 to i8* + %vl = call <4 x double> @llvm.ppc.qpx.qvlfs(i8* %hv) + +; CHECK-LABEL: @test1a +; CHECK-NOT: @llvm.ppc.qpx.qvlfs +; CHECK-NOT: load 
<4 x double> +; CHECK: ret <4 x double> + + %v0 = load <4 x float>, <4 x float>* %h, align 8 + %v0e = fpext <4 x float> %v0 to <4 x double> + %a = fadd <4 x double> %v0e, %vl + ret <4 x double> %a +} + +declare void @llvm.ppc.qpx.qvstfs(<4 x double>, i8*) #0 + +define <4 x float> @test2(<4 x float>* %h, <4 x double> %d) #0 { +entry: + %h1 = getelementptr <4 x float>, <4 x float>* %h, i64 1 + %hv = bitcast <4 x float>* %h1 to i8* + call void @llvm.ppc.qpx.qvstfs(<4 x double> %d, i8* %hv) + + %v0 = load <4 x float>, <4 x float>* %h, align 8 + ret <4 x float> %v0 + +; CHECK-LABEL: @test2 +; CHECK: @llvm.ppc.qpx.qvstfs +; CHECK: ret <4 x float> +} + +define <4 x float> @test2a(<4 x float>* align 16 %h, <4 x double> %d) #0 { +entry: + %h1 = getelementptr <4 x float>, <4 x float>* %h, i64 1 + %hv = bitcast <4 x float>* %h1 to i8* + call void @llvm.ppc.qpx.qvstfs(<4 x double> %d, i8* %hv) + + %v0 = load <4 x float>, <4 x float>* %h, align 8 + ret <4 x float> %v0 + +; CHECK-LABEL: @test2 +; CHECK: fptrunc <4 x double> %d to <4 x float> +; CHECK-NOT: @llvm.ppc.qpx.qvstfs +; CHECK-NOT: store <4 x double> +; CHECK: ret <4 x float> +} + +declare <4 x double> @llvm.ppc.qpx.qvlfd(i8*) #1 + +define <4 x double> @test1l(<4 x double>* %h) #0 { +entry: + %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1 + %hv = bitcast <4 x double>* %h1 to i8* + %vl = call <4 x double> @llvm.ppc.qpx.qvlfd(i8* %hv) + +; CHECK-LABEL: @test1l +; CHECK: @llvm.ppc.qpx.qvlfd +; CHECK: ret <4 x double> + + %v0 = load <4 x double>, <4 x double>* %h, align 8 + %a = fadd <4 x double> %v0, %vl + ret <4 x double> %a +} + +define <4 x double> @test1ln(<4 x double>* align 16 %h) #0 { +entry: + %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1 + %hv = bitcast <4 x double>* %h1 to i8* + %vl = call <4 x double> @llvm.ppc.qpx.qvlfd(i8* %hv) + +; CHECK-LABEL: @test1ln +; CHECK: @llvm.ppc.qpx.qvlfd +; CHECK: ret <4 x double> + + %v0 = load <4 x double>, <4 x double>* %h, align 8 + %a = fadd <4 x double> %v0, %vl + ret <4 x double> %a +} + +define <4 x double> @test1la(<4 x double>* align 32 %h) #0 { +entry: + %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1 + %hv = bitcast <4 x double>* %h1 to i8* + %vl = call <4 x double> @llvm.ppc.qpx.qvlfd(i8* %hv) + +; CHECK-LABEL: @test1la +; CHECK-NOT: @llvm.ppc.qpx.qvlfd +; CHECK: ret <4 x double> + + %v0 = load <4 x double>, <4 x double>* %h, align 8 + %a = fadd <4 x double> %v0, %vl + ret <4 x double> %a +} + +declare void @llvm.ppc.qpx.qvstfd(<4 x double>, i8*) #0 + +define <4 x double> @test2l(<4 x double>* %h, <4 x double> %d) #0 { +entry: + %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1 + %hv = bitcast <4 x double>* %h1 to i8* + call void @llvm.ppc.qpx.qvstfd(<4 x double> %d, i8* %hv) + + %v0 = load <4 x double>, <4 x double>* %h, align 8 + ret <4 x double> %v0 + +; CHECK-LABEL: @test2l +; CHECK: @llvm.ppc.qpx.qvstfd +; CHECK: ret <4 x double> +} + +define <4 x double> @test2ln(<4 x double>* align 16 %h, <4 x double> %d) #0 { +entry: + %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1 + %hv = bitcast <4 x double>* %h1 to i8* + call void @llvm.ppc.qpx.qvstfd(<4 x double> %d, i8* %hv) + + %v0 = load <4 x double>, <4 x double>* %h, align 8 + ret <4 x double> %v0 + +; CHECK-LABEL: @test2ln +; CHECK: @llvm.ppc.qpx.qvstfd +; CHECK: ret <4 x double> +} + +define <4 x double> @test2la(<4 x double>* align 32 %h, <4 x double> %d) #0 { +entry: + %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1 + %hv = bitcast <4 x double>* %h1 to i8* + call void 
@llvm.ppc.qpx.qvstfd(<4 x double> %d, i8* %hv) + + %v0 = load <4 x double>, <4 x double>* %h, align 8 + ret <4 x double> %v0 + +; CHECK-LABEL: @test2l +; CHECK-NOT: @llvm.ppc.qpx.qvstfd +; CHECK: ret <4 x double> +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } + diff --git a/llvm/test/Transforms/LoopDataPrefetch/PowerPC/basic.ll b/llvm/test/Transforms/LoopDataPrefetch/PowerPC/basic.ll index 68c75af14f3e9..ea46fd0d5a8f8 100644 --- a/llvm/test/Transforms/LoopDataPrefetch/PowerPC/basic.ll +++ b/llvm/test/Transforms/LoopDataPrefetch/PowerPC/basic.ll @@ -1,6 +1,7 @@ -; RUN: opt -mcpu=a2 -loop-data-prefetch -mtriple=powerpc64le-unknown-linux -enable-ppc-prefetching -S < %s | FileCheck %s -; RUN: opt -mcpu=a2 -passes=loop-data-prefetch -mtriple=powerpc64le-unknown-linux -enable-ppc-prefetching -S < %s | FileCheck %s +; RUN: opt -mcpu=a2 -loop-data-prefetch -S < %s | FileCheck %s +; RUN: opt -mcpu=a2 -passes=loop-data-prefetch -S < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-bgq-linux" define void @foo(double* nocapture %a, double* nocapture readonly %b) { entry: diff --git a/llvm/test/Transforms/LoopSimplify/dup-preds.ll b/llvm/test/Transforms/LoopSimplify/dup-preds.ll index 362d834686d41..c9253fa51a65f 100644 --- a/llvm/test/Transforms/LoopSimplify/dup-preds.ll +++ b/llvm/test/Transforms/LoopSimplify/dup-preds.ll @@ -1,6 +1,6 @@ ; RUN: opt -loop-simplify -S %s | FileCheck %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64le-unknown-linux" +target triple = "powerpc64-bgq-linux" define fastcc void @do_update_md([3 x float]* nocapture readonly %x) #0 { entry: diff --git a/llvm/test/Transforms/LoopUnroll/pr14167.ll b/llvm/test/Transforms/LoopUnroll/pr14167.ll index 3097c234fb933..9aac70115d9ae 100644 --- a/llvm/test/Transforms/LoopUnroll/pr14167.ll +++ b/llvm/test/Transforms/LoopUnroll/pr14167.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -S -loop-unroll -unroll-runtime | FileCheck %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" -target triple = "powerpc64le-unknown-linux" +target triple = "powerpc64-bgq-linux" define void @test1() nounwind { ; Ensure that we don't crash when the trip count == -1. 
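For orientation before the next diff: the new LoopVectorize test agg-interleave-a2.ll added below exercises aggressive interleave-only vectorization when targeting the A2Q core (its CHECK lines require two back-to-back fmul <4 x double> instructions). The following is a hypothetical C++ source-level rendering of the test's IR loop, included here purely as a reading aid; it is not part of the patch, and the names foo/a/b/c simply mirror the IR.

// Hypothetical C++ equivalent of the IR loop in agg-interleave-a2.ll.
// Each iteration computes 2*b[i]^2 + 3*c[i]^2 + 4*b[i]*c[i] + 1.0, with
// the multiply chains kept separate so the vectorizer has independent
// fmuls to interleave; __restrict mirrors the IR's noalias arguments.
void foo(double *__restrict a, const double *__restrict b,
         const double *__restrict c) {
  for (long i = 0; i < 1600; ++i) {
    double mul3  = b[i] * (b[i] * 2.0);  // %mul, %mul3
    double mul9  = c[i] * (c[i] * 3.0);  // %mul6, %mul9
    double mul15 = (b[i] * 4.0) * c[i];  // %mul12, %mul15
    a[i] = mul3 + mul9 + mul15 + 1.0;    // %add, %add16, %add17
  }
}

The adjacent fmul/fmul CHECK-NEXT pair is consistent with the vectorizer interleaving these independent products, presumably because the in-order A2 core benefits from extra parallel work to hide floating-point latency.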
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll new file mode 100644 index 0000000000000..9fdfb6f90e7bf --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll @@ -0,0 +1,40 @@ +; RUN: opt -S -basic-aa -loop-vectorize < %s | FileCheck %s +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +; Function Attrs: nounwind +define void @foo(double* noalias nocapture %a, double* noalias nocapture readonly %b, double* noalias nocapture readonly %c) #0 { +entry: + br label %for.body + +; CHECK-LABEL: @foo +; CHECK: fmul <4 x double> %{{[^,]+}}, +; CHECK-NEXT: fmul <4 x double> %{{[^,]+}}, + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv + %0 = load double, double* %arrayidx, align 8 + %mul = fmul double %0, 2.000000e+00 + %mul3 = fmul double %0, %mul + %arrayidx5 = getelementptr inbounds double, double* %c, i64 %indvars.iv + %1 = load double, double* %arrayidx5, align 8 + %mul6 = fmul double %1, 3.000000e+00 + %mul9 = fmul double %1, %mul6 + %add = fadd double %mul3, %mul9 + %mul12 = fmul double %0, 4.000000e+00 + %mul15 = fmul double %mul12, %1 + %add16 = fadd double %mul15, %add + %add17 = fadd double %add16, 1.000000e+00 + %arrayidx19 = getelementptr inbounds double, double* %a, i64 %indvars.iv + store double %add17, double* %arrayidx19, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1600 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind "target-cpu"="a2q" } + diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll index cddddba579473..8abc25ece35c6 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll @@ -1,6 +1,6 @@ ; RUN: opt -S -loop-vectorize < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64le-unknown-linux" +target triple = "powerpc64-bgq-linux" ; Function Attrs: nounwind define zeroext i32 @test() #0 { diff --git a/llvm/test/Transforms/NewGVN/pr31483.ll b/llvm/test/Transforms/NewGVN/pr31483.ll index 5bf7e1a695011..999ff74ad5881 100644 --- a/llvm/test/Transforms/NewGVN/pr31483.ll +++ b/llvm/test/Transforms/NewGVN/pr31483.ll @@ -100,7 +100,7 @@ declare signext i32 @zot(i8*, ...) 
#1 ; Function Attrs: nounwind declare void @llvm.va_end(i8*) #2 -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-qpx,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-qpx,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #2 = { nounwind } diff --git a/llvm/test/Transforms/SCCP/fp-bc-icmp-const-fold.ll b/llvm/test/Transforms/SCCP/fp-bc-icmp-const-fold.ll index 20c44384504e2..8f97225ca446b 100644 --- a/llvm/test/Transforms/SCCP/fp-bc-icmp-const-fold.ll +++ b/llvm/test/Transforms/SCCP/fp-bc-icmp-const-fold.ll @@ -1,6 +1,6 @@ ; RUN: opt -S -ipsccp < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64le-unknown-linux" +target triple = "powerpc64-bgq-linux" define void @test(i32 signext %n) { diff --git a/llvm/unittests/ADT/TripleTest.cpp b/llvm/unittests/ADT/TripleTest.cpp index 1852d7b6a1b0d..dc7a28c72f208 100644 --- a/llvm/unittests/ADT/TripleTest.cpp +++ b/llvm/unittests/ADT/TripleTest.cpp @@ -111,6 +111,41 @@ TEST(TripleTest, ParsedIDs) { EXPECT_EQ(Triple::Linux, T.getOS()); EXPECT_EQ(Triple::Musl, T.getEnvironment()); + T = Triple("powerpc-bgp-linux"); + EXPECT_EQ(Triple::ppc, T.getArch()); + EXPECT_EQ(Triple::BGP, T.getVendor()); + EXPECT_EQ(Triple::Linux, T.getOS()); + EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); + + T = Triple("powerpc-bgp-cnk"); + EXPECT_EQ(Triple::ppc, T.getArch()); + EXPECT_EQ(Triple::BGP, T.getVendor()); + EXPECT_EQ(Triple::CNK, T.getOS()); + EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); + + T = Triple("ppc-bgp-linux"); + EXPECT_EQ(Triple::ppc, T.getArch()); + EXPECT_EQ(Triple::BGP, T.getVendor()); + EXPECT_EQ(Triple::Linux, T.getOS()); + EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); + + T = Triple("ppc32-bgp-linux"); + EXPECT_EQ(Triple::ppc, T.getArch()); + EXPECT_EQ(Triple::BGP, T.getVendor()); + EXPECT_EQ(Triple::Linux, T.getOS()); + EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); + + T = Triple("powerpc64-bgq-linux"); + EXPECT_EQ(Triple::ppc64, T.getArch()); + EXPECT_EQ(Triple::BGQ, T.getVendor()); + EXPECT_EQ(Triple::Linux, T.getOS()); + EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); + + T = Triple("ppc64-bgq-linux"); + EXPECT_EQ(Triple::ppc64, T.getArch()); + EXPECT_EQ(Triple::BGQ, T.getVendor()); + EXPECT_EQ(Triple::Linux, 
T.getOS()); + T = Triple("powerpc-ibm-aix"); EXPECT_EQ(Triple::ppc, T.getArch()); EXPECT_EQ(Triple::IBM, T.getVendor()); diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn index 3a452fc6e0601..043a672a76e1e 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn @@ -61,6 +61,7 @@ static_library("LLVMPowerPCCodeGen") { "PPCMachineScheduler.cpp", "PPCMacroFusion.cpp", "PPCPreEmitPeephole.cpp", + "PPCQPXLoadSplat.cpp", "PPCReduceCRLogicals.cpp", "PPCRegisterInfo.cpp", "PPCSubtarget.cpp", diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index bb6cee740ace7..933573bc810cb 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -1116,6 +1116,9 @@ extern kmp_uint64 __kmp_now_nsec(); #if KMP_OS_WINDOWS #define KMP_INIT_WAIT 64U /* initial number of spin-tests */ #define KMP_NEXT_WAIT 32U /* susequent number of spin-tests */ +#elif KMP_OS_CNK +#define KMP_INIT_WAIT 16U /* initial number of spin-tests */ +#define KMP_NEXT_WAIT 8U /* susequent number of spin-tests */ #elif KMP_OS_LINUX #define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ #define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp index f6fb1e602c297..b5c641cc7273c 100644 --- a/openmp/runtime/src/kmp_csupport.cpp +++ b/openmp/runtime/src/kmp_csupport.cpp @@ -680,6 +680,17 @@ void __kmpc_flush(ident_t *loc) { // Nothing to see here move along #elif KMP_ARCH_PPC64 // Nothing needed here (we have a real MB above). +#if KMP_OS_CNK + // The flushing thread needs to yield here; this prevents a + // busy-waiting thread from saturating the pipeline. flush is + // often used in loops like this: + // while (!flag) { + // #pragma omp flush(flag) + // } + // and adding the yield here is good for at least a 10x speedup + // when running >2 threads per core (on the NAS LU benchmark). + __kmp_yield(); +#endif #else #error Unknown or unsupported architecture #endif diff --git a/openmp/runtime/src/kmp_lock.h b/openmp/runtime/src/kmp_lock.h index b80e54777e8c2..e54f6812b8b34 100644 --- a/openmp/runtime/src/kmp_lock.h +++ b/openmp/runtime/src/kmp_lock.h @@ -159,7 +159,7 @@ extern void __kmp_destroy_nested_tas_lock(kmp_tas_lock_t *lck); #define KMP_LOCK_ACQUIRED_NEXT 0 #ifndef KMP_USE_FUTEX #define KMP_USE_FUTEX \ - (KMP_OS_LINUX && \ + (KMP_OS_LINUX && !KMP_OS_CNK && \ (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)) #endif #if KMP_USE_FUTEX diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h index 33735cf455c7e..bfe7765b2a967 100644 --- a/openmp/runtime/src/kmp_os.h +++ b/openmp/runtime/src/kmp_os.h @@ -69,7 +69,7 @@ #error Unknown compiler #endif -#if (KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_FREEBSD) +#if (KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_FREEBSD) && !KMP_OS_CNK #define KMP_AFFINITY_SUPPORTED 1 #if KMP_OS_WINDOWS && KMP_ARCH_X86_64 #define KMP_GROUP_AFFINITY 1 diff --git a/openmp/runtime/src/kmp_platform.h b/openmp/runtime/src/kmp_platform.h index 4296ca31d67d9..779c08e9771d5 100644 --- a/openmp/runtime/src/kmp_platform.h +++ b/openmp/runtime/src/kmp_platform.h @@ -22,6 +22,7 @@ #define KMP_OS_OPENBSD 0 #define KMP_OS_DARWIN 0 #define KMP_OS_WINDOWS 0 +#define KMP_OS_CNK 0 #define KMP_OS_HURD 0 #define KMP_OS_UNIX 0 /* disjunction of KMP_OS_LINUX, KMP_OS_DARWIN etc. 
*/ @@ -65,6 +66,11 @@ #define KMP_OS_OPENBSD 1 #endif +#if (defined __bgq__) +#undef KMP_OS_CNK +#define KMP_OS_CNK 1 +#endif + #if (defined __GNU__) #undef KMP_OS_HURD #define KMP_OS_HURD 1 diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S index 16059a3762bf4..8090ff759fe1b 100644 --- a/openmp/runtime/src/z_Linux_asm.S +++ b/openmp/runtime/src/z_Linux_asm.S @@ -1433,8 +1433,13 @@ __kmp_invoke_microtask: add 12, 0, 12 neg 12, 12 -// We need to make sure that the stack frame stays aligned (to 16 bytes). +// We need to make sure that the stack frame stays aligned (to 16 bytes, except +// under the BG/Q CNK, where it must be to 32 bytes). +# if KMP_OS_CNK + li 0, -32 +# else li 0, -16 +# endif and 12, 0, 12 // Establish the local stack frame. diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp index 58cc4d25f6080..3b5910fc95e89 100644 --- a/openmp/runtime/src/z_Linux_util.cpp +++ b/openmp/runtime/src/z_Linux_util.cpp @@ -31,7 +31,7 @@ #include #include -#if KMP_OS_LINUX +#if KMP_OS_LINUX && !KMP_OS_CNK #include #if KMP_USE_FUTEX // We should really include , but that causes compatibility problems on diff --git a/polly/lib/External/isl/config.sub b/polly/lib/External/isl/config.sub index bc4db70f82abf..1d8e98bcee23a 100755 --- a/polly/lib/External/isl/config.sub +++ b/polly/lib/External/isl/config.sub @@ -152,6 +152,9 @@ case $os in os= basic_machine=$1 ;; + -bluegene*) + os=-cnk + ;; -sim | -cisco | -oki | -wec | -winbond) os= basic_machine=$1 @@ -536,6 +539,10 @@ case $basic_machine in basic_machine=bfin-`echo "$basic_machine" | sed 's/^[^-]*-//'` os=-linux ;; + bluegene*) + basic_machine=powerpc-ibm + os=-cnk + ;; c54x-*) basic_machine=tic54x-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; @@ -1357,7 +1364,7 @@ case $os in # Each alternative MUST end in a * to match a version number. # -sysv* is not here because it comes later, after sysvr4. -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ - | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\ + | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ | -sym* | -kopensolaris* | -plan9* \ | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ @@ -1721,7 +1728,7 @@ case $basic_machine in -sunos*) vendor=sun ;; - -aix*) + -cnk*|-aix*) vendor=ibm ;; -beos*) diff --git a/polly/lib/External/ppcg/config.sub b/polly/lib/External/ppcg/config.sub index d97f3009f9f09..6205f8423d6aa 100644 --- a/polly/lib/External/ppcg/config.sub +++ b/polly/lib/External/ppcg/config.sub @@ -160,6 +160,9 @@ case $os in os= basic_machine=$1 ;; + -bluegene*) + os=-cnk + ;; -sim | -cisco | -oki | -wec | -winbond) os= basic_machine=$1 @@ -521,6 +524,10 @@ case $basic_machine in basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'` os=-linux ;; + bluegene*) + basic_machine=powerpc-ibm + os=-cnk + ;; c54x-*) basic_machine=tic54x-`echo $basic_machine | sed 's/^[^-]*-//'` ;; @@ -1337,7 +1344,7 @@ case $os in # Each alternative MUST END IN A *, to match a version number. # -sysv* is not here because it comes later, after sysvr4. 
-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ - | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\ + | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ | -sym* | -kopensolaris* \ | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ @@ -1702,7 +1709,7 @@ case $basic_machine in -sunos*) vendor=sun ;; - -aix*) + -cnk*|-aix*) vendor=ibm ;; -beos*) From 4b5412b5dbc87096e420de5172837b4bd5ab9485 Mon Sep 17 00:00:00 2001 From: Sridhar Gopinath Date: Fri, 24 Jul 2020 10:44:48 -0700 Subject: [PATCH 0246/1035] Fix the move constructor of MMI to move MachineFunctions map The move constructor of MachineModuleInfo currently does not copy the MachineFunctions map. This commit fixes this issue. Patch by Sridhar Gopinath. Thanks! Differential Revision: https://reviews.llvm.org/D84274 --- llvm/lib/CodeGen/MachineModuleInfo.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/MachineModuleInfo.cpp b/llvm/lib/CodeGen/MachineModuleInfo.cpp index f866c7ca53c68..be08f0ae31171 100644 --- a/llvm/lib/CodeGen/MachineModuleInfo.cpp +++ b/llvm/lib/CodeGen/MachineModuleInfo.cpp @@ -178,7 +178,8 @@ void MachineModuleInfo::finalize() { MachineModuleInfo::MachineModuleInfo(MachineModuleInfo &&MMI) : TM(std::move(MMI.TM)), Context(MMI.TM.getMCAsmInfo(), MMI.TM.getMCRegisterInfo(), - MMI.TM.getObjFileLowering(), nullptr, nullptr, false) { + MMI.TM.getObjFileLowering(), nullptr, nullptr, false), + MachineFunctions(std::move(MMI.MachineFunctions)) { ObjFileMMI = MMI.ObjFileMMI; CurCallSite = MMI.CurCallSite; UsesMSVCFloatingPoint = MMI.UsesMSVCFloatingPoint; From ef748b58d3b3edfaf0278d454cb30f7816c04aee Mon Sep 17 00:00:00 2001 From: Fred Riss Date: Mon, 27 Jul 2020 13:51:07 -0700 Subject: [PATCH 0247/1035] [lldb] NFC: Use early exit in ArchSpec::IsEqualTo --- lldb/source/Utility/ArchSpec.cpp | 113 +++++++++++++++---------------- 1 file changed, 53 insertions(+), 60 deletions(-) diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp index a77ae8633070e..cd382a322da70 100644 --- a/lldb/source/Utility/ArchSpec.cpp +++ b/lldb/source/Utility/ArchSpec.cpp @@ -1010,77 +1010,70 @@ static bool IsCompatibleEnvironment(llvm::Triple::EnvironmentType lhs, bool ArchSpec::IsEqualTo(const ArchSpec &rhs, bool exact_match) const { // explicitly ignoring m_distribution_id in this method. 
- if (GetByteOrder() != rhs.GetByteOrder()) + if (GetByteOrder() != rhs.GetByteOrder() || + !cores_match(GetCore(), rhs.GetCore(), true, exact_match)) return false; - const ArchSpec::Core lhs_core = GetCore(); - const ArchSpec::Core rhs_core = rhs.GetCore(); + const llvm::Triple &lhs_triple = GetTriple(); + const llvm::Triple &rhs_triple = rhs.GetTriple(); + + const llvm::Triple::VendorType lhs_triple_vendor = lhs_triple.getVendor(); + const llvm::Triple::VendorType rhs_triple_vendor = rhs_triple.getVendor(); + if (lhs_triple_vendor != rhs_triple_vendor) { + const bool rhs_vendor_specified = rhs.TripleVendorWasSpecified(); + const bool lhs_vendor_specified = TripleVendorWasSpecified(); + // Both architectures had the vendor specified, so if they aren't equal + // then we return false + if (rhs_vendor_specified && lhs_vendor_specified) + return false; - const bool core_match = cores_match(lhs_core, rhs_core, true, exact_match); - - if (core_match) { - const llvm::Triple &lhs_triple = GetTriple(); - const llvm::Triple &rhs_triple = rhs.GetTriple(); - - const llvm::Triple::VendorType lhs_triple_vendor = lhs_triple.getVendor(); - const llvm::Triple::VendorType rhs_triple_vendor = rhs_triple.getVendor(); - if (lhs_triple_vendor != rhs_triple_vendor) { - const bool rhs_vendor_specified = rhs.TripleVendorWasSpecified(); - const bool lhs_vendor_specified = TripleVendorWasSpecified(); - // Both architectures had the vendor specified, so if they aren't equal - // then we return false - if (rhs_vendor_specified && lhs_vendor_specified) - return false; - - // Only fail if both vendor types are not unknown - if (lhs_triple_vendor != llvm::Triple::UnknownVendor && - rhs_triple_vendor != llvm::Triple::UnknownVendor) - return false; - } + // Only fail if both vendor types are not unknown + if (lhs_triple_vendor != llvm::Triple::UnknownVendor && + rhs_triple_vendor != llvm::Triple::UnknownVendor) + return false; + } - const llvm::Triple::OSType lhs_triple_os = lhs_triple.getOS(); - const llvm::Triple::OSType rhs_triple_os = rhs_triple.getOS(); - const llvm::Triple::EnvironmentType lhs_triple_env = + const llvm::Triple::OSType lhs_triple_os = lhs_triple.getOS(); + const llvm::Triple::OSType rhs_triple_os = rhs_triple.getOS(); + const llvm::Triple::EnvironmentType lhs_triple_env = lhs_triple.getEnvironment(); - const llvm::Triple::EnvironmentType rhs_triple_env = + const llvm::Triple::EnvironmentType rhs_triple_env = rhs_triple.getEnvironment(); - if (!exact_match) { - // x86_64-apple-ios-macabi, x86_64-apple-macosx are compatible, no match. - if ((lhs_triple_os == llvm::Triple::IOS && - lhs_triple_env == llvm::Triple::MacABI && - rhs_triple_os == llvm::Triple::MacOSX) || - (lhs_triple_os == llvm::Triple::MacOSX && - rhs_triple_os == llvm::Triple::IOS && - rhs_triple_env == llvm::Triple::MacABI)) - return true; - } - - if (lhs_triple_os != rhs_triple_os) { - const bool rhs_os_specified = rhs.TripleOSWasSpecified(); - const bool lhs_os_specified = TripleOSWasSpecified(); - // Both architectures had the OS specified, so if they aren't equal then - // we return false - if (rhs_os_specified && lhs_os_specified) - return false; - - // Only fail if both os types are not unknown - if (lhs_triple_os != llvm::Triple::UnknownOS && - rhs_triple_os != llvm::Triple::UnknownOS) - return false; - } + if (!exact_match) { + // x86_64-apple-ios-macabi, x86_64-apple-macosx are compatible, no match. 
+    if ((lhs_triple_os == llvm::Triple::IOS &&
+         lhs_triple_env == llvm::Triple::MacABI &&
+         rhs_triple_os == llvm::Triple::MacOSX) ||
+        (lhs_triple_os == llvm::Triple::MacOSX &&
+         rhs_triple_os == llvm::Triple::IOS &&
+         rhs_triple_env == llvm::Triple::MacABI))
+      return true;
+  }

-    // x86_64-apple-ios-macabi and x86_64-apple-ios are not compatible.
-    if (lhs_triple_os == llvm::Triple::IOS &&
-        rhs_triple_os == llvm::Triple::IOS &&
-        (lhs_triple_env == llvm::Triple::MacABI ||
-         rhs_triple_env == llvm::Triple::MacABI) &&
-        lhs_triple_env != rhs_triple_env)
+  if (lhs_triple_os != rhs_triple_os) {
+    const bool rhs_os_specified = rhs.TripleOSWasSpecified();
+    const bool lhs_os_specified = TripleOSWasSpecified();
+    // Both architectures had the OS specified, so if they aren't equal then
+    // we return false
+    if (rhs_os_specified && lhs_os_specified)
       return false;

-    return IsCompatibleEnvironment(lhs_triple_env, rhs_triple_env);
+    // Only fail if both os types are not unknown
+    if (lhs_triple_os != llvm::Triple::UnknownOS &&
+        rhs_triple_os != llvm::Triple::UnknownOS)
+      return false;
   }
-  return false;
+
+  // x86_64-apple-ios-macabi and x86_64-apple-ios are not compatible.
+  if (lhs_triple_os == llvm::Triple::IOS &&
+      rhs_triple_os == llvm::Triple::IOS &&
+      (lhs_triple_env == llvm::Triple::MacABI ||
+       rhs_triple_env == llvm::Triple::MacABI) &&
+      lhs_triple_env != rhs_triple_env)
+    return false;
+
+  return IsCompatibleEnvironment(lhs_triple_env, rhs_triple_env);
 }

 void ArchSpec::UpdateCore() {

From c37bb5e2a541df867d371278f6ddfaf85c299771 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks
Date: Mon, 27 Jul 2020 14:58:42 -0700
Subject: [PATCH 0248/1035] [DFSan] Remove unused DataFlowSanitizer vars

Reviewed By: morehouse

Differential Revision: https://reviews.llvm.org/D84704
---
 .../include/llvm/Transforms/Instrumentation.h |  3 +-
 .../Instrumentation/DataFlowSanitizer.cpp     | 58 +++++--------------
 2 files changed, 17 insertions(+), 44 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Instrumentation.h b/llvm/include/llvm/Transforms/Instrumentation.h
index d4373d7b39eaa..f2e084041dd67 100644
--- a/llvm/include/llvm/Transforms/Instrumentation.h
+++ b/llvm/include/llvm/Transforms/Instrumentation.h
@@ -144,8 +144,7 @@ ModulePass *createInstrOrderFilePass();

 // Insert DataFlowSanitizer (dynamic data flow analysis) instrumentation
 ModulePass *createDataFlowSanitizerPass(
-    const std::vector<std::string> &ABIListFiles = std::vector<std::string>(),
-    void *(*getArgTLS)() = nullptr, void *(*getRetValTLS)() = nullptr);
+    const std::vector<std::string> &ABIListFiles = std::vector<std::string>());

 // Options for sanitizer coverage instrumentation.
 struct SanitizerCoverageOptions {
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 2846319007318..a5a785cb55f78 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -344,8 +344,6 @@ class DataFlowSanitizer : public ModulePass {
   ConstantInt *ShadowPtrMul;
   Constant *ArgTLS;
   Constant *RetvalTLS;
-  void *(*GetArgTLSPtr)();
-  void *(*GetRetvalTLSPtr)();
   FunctionType *GetArgTLSTy;
   FunctionType *GetRetvalTLSTy;
   Constant *GetArgTLS;
@@ -395,9 +393,8 @@ class DataFlowSanitizer : public ModulePass {
 public:
   static char ID;

-  DataFlowSanitizer(
-      const std::vector<std::string> &ABIListFiles = std::vector<std::string>(),
-      void *(*getArgTLS)() = nullptr, void *(*getRetValTLS)() = nullptr);
+  DataFlowSanitizer(const std::vector<std::string> &ABIListFiles =
+                        std::vector<std::string>());

   bool doInitialization(Module &M) override;
   bool runOnModule(Module &M) override;
@@ -490,17 +487,14 @@ char DataFlowSanitizer::ID;
 INITIALIZE_PASS(DataFlowSanitizer, "dfsan",
                 "DataFlowSanitizer: dynamic data flow analysis.", false, false)

-ModulePass *
-llvm::createDataFlowSanitizerPass(const std::vector<std::string> &ABIListFiles,
-                                  void *(*getArgTLS)(),
-                                  void *(*getRetValTLS)()) {
-  return new DataFlowSanitizer(ABIListFiles, getArgTLS, getRetValTLS);
+ModulePass *llvm::createDataFlowSanitizerPass(
+    const std::vector<std::string> &ABIListFiles) {
+  return new DataFlowSanitizer(ABIListFiles);
 }

 DataFlowSanitizer::DataFlowSanitizer(
-    const std::vector<std::string> &ABIListFiles, void *(*getArgTLS)(),
-    void *(*getRetValTLS)())
-    : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS) {
+    const std::vector<std::string> &ABIListFiles)
+    : ModulePass(ID) {
   std::vector<std::string> AllABIListFiles(std::move(ABIListFiles));
   AllABIListFiles.insert(AllABIListFiles.end(), ClABIListFiles.begin(),
                          ClABIListFiles.end());
@@ -613,22 +607,6 @@ bool DataFlowSanitizer::doInitialization(Module &M) {
       FunctionType::get(Type::getVoidTy(*Ctx), DFSanMemTransferCallbackArgs,
                         /*isVarArg=*/false);

-  if (GetArgTLSPtr) {
-    Type *ArgTLSTy = ArrayType::get(ShadowTy, 64);
-    ArgTLS = nullptr;
-    GetArgTLSTy = FunctionType::get(PointerType::getUnqual(ArgTLSTy), false);
-    GetArgTLS = ConstantExpr::getIntToPtr(
-        ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)),
-        PointerType::getUnqual(GetArgTLSTy));
-  }
-  if (GetRetvalTLSPtr) {
-    RetvalTLS = nullptr;
-    GetRetvalTLSTy = FunctionType::get(PointerType::getUnqual(ShadowTy), false);
-    GetRetvalTLS = ConstantExpr::getIntToPtr(
-        ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)),
-        PointerType::getUnqual(GetRetvalTLSTy));
-  }
-
   ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000);
   return true;
 }
@@ -816,20 +794,16 @@ bool DataFlowSanitizer::runOnModule(Module &M) {

   bool Changed = false;

-  if (!GetArgTLSPtr) {
-    Type *ArgTLSTy = ArrayType::get(ShadowTy, 64);
-    ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy);
-    if (GlobalVariable *G = dyn_cast<GlobalVariable>(ArgTLS)) {
-      Changed |= G->getThreadLocalMode() != GlobalVariable::InitialExecTLSModel;
-      G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
-    }
+  Type *ArgTLSTy = ArrayType::get(ShadowTy, 64);
+  ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy);
+  if (GlobalVariable *G = dyn_cast<GlobalVariable>(ArgTLS)) {
+    Changed |= G->getThreadLocalMode() != GlobalVariable::InitialExecTLSModel;
+    G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
   }
-  if (!GetRetvalTLSPtr) {
-    RetvalTLS = Mod->getOrInsertGlobal("__dfsan_retval_tls", ShadowTy);
-    if (GlobalVariable *G = dyn_cast<GlobalVariable>(RetvalTLS)) {
-      Changed |= G->getThreadLocalMode() != GlobalVariable::InitialExecTLSModel;
-      G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
-    }
+  RetvalTLS = Mod->getOrInsertGlobal("__dfsan_retval_tls", ShadowTy);
+  if (GlobalVariable *G = dyn_cast<GlobalVariable>(RetvalTLS)) {
+    Changed |= G->getThreadLocalMode() != GlobalVariable::InitialExecTLSModel;
+    G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
   }

   ExternalShadowMask =

From e57464151d4c4912a7ec4d6fd0920056b2f75c7c Mon Sep 17 00:00:00 2001
From: peter klausler
Date: Wed, 22 Jul 2020 16:56:37 -0700
Subject: [PATCH 0249/1035] [flang] Allow omission of comma in FORMAT(1PE5.2)
 in runtime

A comma is not required between a scale factor and a following data
edit descriptor (C1302).

Reviewed By: PeteSteinfeld

Differential Revision: https://reviews.llvm.org/D84369
---
 flang/runtime/format-implementation.h | 12 +++++++-----
 flang/unittests/Runtime/hello.cpp     |  2 ++
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/flang/runtime/format-implementation.h b/flang/runtime/format-implementation.h
index a4453cd172ea5..ad8bbcbcdcbdf 100644
--- a/flang/runtime/format-implementation.h
+++ b/flang/runtime/format-implementation.h
@@ -330,11 +330,13 @@ int FormatControl<CONTEXT>::CueUpNextDataEdit(Context &context, bool stop) {
       offset_ += *repeat;
     } else if (ch >= 'A' && ch <= 'Z') {
       int start{offset_ - 1};
-      CharType next{Capitalize(PeekNext())};
-      if (next >= 'A' && next <= 'Z') {
-        ++offset_;
-      } else {
-        next = '\0';
+      CharType next{'\0'};
+      if (ch != 'P') { // 1PE5.2 - comma not required (C1302)
+        CharType peek{Capitalize(PeekNext())};
+        if (peek >= 'A' && peek <= 'Z') {
+          next = peek;
+          ++offset_;
+        }
       }
       if (ch == 'E' ||
           (!next &&
diff --git a/flang/unittests/Runtime/hello.cpp b/flang/unittests/Runtime/hello.cpp
index f6db4a8e47dc8..c38aedf4f6549 100644
--- a/flang/unittests/Runtime/hello.cpp
+++ b/flang/unittests/Runtime/hello.cpp
@@ -175,6 +175,7 @@ int main() {
       {"(E32.17E0,';')", " 0.00000000000000000E+0;"},
      {"(G32.17E0,';')", " 0.0000000000000000 ;"},
       {"(1P,E32.17,';')", " 0.00000000000000000E+00;"},
+      {"(1PE32.17,';')", " 0.00000000000000000E+00;"}, // no comma
       {"(1P,F32.17,';')", " 0.00000000000000000;"},
       {"(1P,G32.17,';')", " 0.0000000000000000 ;"},
       {"(2P,E32.17,';')", " 00.0000000000000000E+00;"},
@@ -195,6 +196,7 @@ int main() {
       {"(E32.17E4,';')", " 0.10000000000000000E+0001;"},
       {"(G32.17E4,';')", " 1.0000000000000000 ;"},
       {"(1P,E32.17,';')", " 1.00000000000000000E+00;"},
+      {"(1PE32.17,';')", " 1.00000000000000000E+00;"}, // no comma
       {"(1P,F32.17,';')", " 0.10000000000000000;"},
       {"(1P,G32.17,';')", " 1.0000000000000000 ;"},
       {"(ES32.17,';')", " 1.00000000000000000E+00;"},

From adb28e0fb2b0e97ea9dce422c09b36979cf7cd2f Mon Sep 17 00:00:00 2001
From: Francesco Petrogalli
Date: Mon, 6 Jul 2020 17:46:59 +0000
Subject: [PATCH 0250/1035] [llvm][CodeGen] Addressing modes for SVE ldN.
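This selects between the reg+imm and reg+reg forms of the SVE
structured loads (ld2/ld3/ld4) based on the offset. As a rough sketch
(the helper name is hypothetical; the actual logic lives in
findAddrModeSVELoadStore below), the reg+imm form is legal when the
offset is an in-range multiple of the number of vectors loaded:

  // Sketch only: legality of the "#imm, mul vl" form for structured
  // loads. The tests below exercise exactly these bounds, e.g. ld2b
  // accepts [-16, 14] in steps of 2, ld4b accepts [-32, 28] in steps
  // of 4; everything else falls back to the reg+reg form via rdvl.
  static bool isLegalSVEStructLoadImm(int64_t Offset, unsigned NumVecs) {
    if (Offset % int64_t(NumVecs) != 0)
      return false;
    const int64_t K = Offset / int64_t(NumVecs);
    return K >= -8 && K <= 7; // signed 4-bit multiple-of-NumVecs field
  }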
Reviewers: c-rhodes, efriedma, sdesmalen Subscribers: huihuiz, tschuett, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D77251 --- .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 64 ++- .../sve-intrinsics-ldN-reg+imm-addr-mode.ll | 495 ++++++++++++++++++ .../sve-intrinsics-ldN-reg+reg-addr-mode.ll | 259 +++++++++ 3 files changed, 798 insertions(+), 20 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+imm-addr-mode.ll create mode 100644 llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+reg-addr-mode.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index dbd7db7ee8e6f..7799ebfbd68e6 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -245,7 +245,8 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { unsigned SubRegIdx); void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); - void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, const unsigned Opc); + void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, unsigned Scale, + unsigned Opc_rr, unsigned Opc_ri); bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm); /// SVE Reg+Imm addressing mode. @@ -1434,14 +1435,23 @@ AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr, } void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs, - const unsigned Opc) { + unsigned Scale, unsigned Opc_ri, + unsigned Opc_rr) { + assert(Scale < 4 && "Invalid scaling value."); SDLoc DL(N); EVT VT = N->getValueType(0); SDValue Chain = N->getOperand(0); + // Optimize addressing mode. + SDValue Base, Offset; + unsigned Opc; + std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore( + N, Opc_rr, Opc_ri, N->getOperand(2), + CurDAG->getTargetConstant(0, DL, MVT::i64), Scale); + SDValue Ops[] = {N->getOperand(1), // Predicate - N->getOperand(2), // Memory operand - CurDAG->getTargetConstant(0, DL, MVT::i64), Chain}; + Base, // Memory operand + Offset, Chain}; const EVT ResTys[] = {MVT::Untyped, MVT::Other}; @@ -4726,51 +4736,51 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case AArch64ISD::SVE_LD2_MERGE_ZERO: { if (VT == MVT::nxv16i8) { - SelectPredicatedLoad(Node, 2, AArch64::LD2B_IMM); + SelectPredicatedLoad(Node, 2, 0, AArch64::LD2B_IMM, AArch64::LD2B); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedLoad(Node, 2, AArch64::LD2H_IMM); + SelectPredicatedLoad(Node, 2, 1, AArch64::LD2H_IMM, AArch64::LD2H); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedLoad(Node, 2, AArch64::LD2W_IMM); + SelectPredicatedLoad(Node, 2, 2, AArch64::LD2W_IMM, AArch64::LD2W); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedLoad(Node, 2, AArch64::LD2D_IMM); + SelectPredicatedLoad(Node, 2, 3, AArch64::LD2D_IMM, AArch64::LD2D); return; } break; } case AArch64ISD::SVE_LD3_MERGE_ZERO: { if (VT == MVT::nxv16i8) { - SelectPredicatedLoad(Node, 3, AArch64::LD3B_IMM); + SelectPredicatedLoad(Node, 3, 0, AArch64::LD3B_IMM, AArch64::LD3B); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedLoad(Node, 3, AArch64::LD3H_IMM); + SelectPredicatedLoad(Node, 3, 1, AArch64::LD3H_IMM, AArch64::LD3H); return; } else if (VT == MVT::nxv4i32 || VT == 
MVT::nxv4f32) { - SelectPredicatedLoad(Node, 3, AArch64::LD3W_IMM); + SelectPredicatedLoad(Node, 3, 2, AArch64::LD3W_IMM, AArch64::LD3W); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedLoad(Node, 3, AArch64::LD3D_IMM); + SelectPredicatedLoad(Node, 3, 3, AArch64::LD3D_IMM, AArch64::LD3D); return; } break; } case AArch64ISD::SVE_LD4_MERGE_ZERO: { if (VT == MVT::nxv16i8) { - SelectPredicatedLoad(Node, 4, AArch64::LD4B_IMM); + SelectPredicatedLoad(Node, 4, 0, AArch64::LD4B_IMM, AArch64::LD4B); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedLoad(Node, 4, AArch64::LD4H_IMM); + SelectPredicatedLoad(Node, 4, 1, AArch64::LD4H_IMM, AArch64::LD4H); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedLoad(Node, 4, AArch64::LD4W_IMM); + SelectPredicatedLoad(Node, 4, 2, AArch64::LD4W_IMM, AArch64::LD4W); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedLoad(Node, 4, AArch64::LD4D_IMM); + SelectPredicatedLoad(Node, 4, 3, AArch64::LD4D_IMM, AArch64::LD4D); return; } break; @@ -4790,10 +4800,14 @@ FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM, /// When \p PredVT is a scalable vector predicate in the form /// MVT::nxxi1, it builds the correspondent scalable vector of -/// integers MVT::nxxi s.t. M x bits = 128. If the input +/// integers MVT::nxxi s.t. M x bits = 128. When targeting +/// structured vectors (NumVec >1), the output data type is +/// MVT::nxxi s.t. M x bits = 128. If the input /// PredVT is not in the form MVT::nxxi1, it returns an invalid /// EVT. -static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT) { +static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT, + unsigned NumVec) { + assert(NumVec > 0 && NumVec < 5 && "Invalid number of vectors."); if (!PredVT.isScalableVector() || PredVT.getVectorElementType() != MVT::i1) return EVT(); @@ -4803,7 +4817,8 @@ static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT) { ElementCount EC = PredVT.getVectorElementCount(); EVT ScalarVT = EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.Min); - EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC); + EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC * NumVec); + return MemVT; } @@ -4827,6 +4842,15 @@ static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) { return cast(Root->getOperand(3))->getVT(); case AArch64ISD::ST1_PRED: return cast(Root->getOperand(4))->getVT(); + case AArch64ISD::SVE_LD2_MERGE_ZERO: + return getPackedVectorTypeFromPredicateType( + Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/2); + case AArch64ISD::SVE_LD3_MERGE_ZERO: + return getPackedVectorTypeFromPredicateType( + Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/3); + case AArch64ISD::SVE_LD4_MERGE_ZERO: + return getPackedVectorTypeFromPredicateType( + Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/4); default: break; } @@ -4842,7 +4866,7 @@ static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) { // We are using an SVE prefetch intrinsic. Type must be inferred // from the width of the predicate. 
return getPackedVectorTypeFromPredicateType( - Ctx, Root->getOperand(2)->getValueType(0)); + Ctx, Root->getOperand(2)->getValueType(0), /*NumVec=*/1); } /// SelectAddrModeIndexedSVE - Attempt selection of the addressing mode: diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+imm-addr-mode.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+imm-addr-mode.ll new file mode 100644 index 0000000000000..1ffa78ec27352 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+imm-addr-mode.ll @@ -0,0 +1,495 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=sve < %s | FileCheck %s + +; NOTE: invalid, upper and lower bound immediate values of the regimm +; addressing mode are checked only for the byte version of each +; instruction (`ldb`), as the code for detecting the immediate is +; common to all instructions, and varies only for the number of +; elements of the structure store, which is = 2, 3, 4. + +; ld2b +define @ld2.nxv32i8( %Pg, *%addr) { +; CHECK-LABEL: ld2.nxv32i8: +; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 2 +%base_ptr = bitcast * %base to i8* +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld2.nxv32i8_lower_bound( %Pg, *%addr) { +; CHECK-LABEL: ld2.nxv32i8_lower_bound: +; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -16 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld2.nxv32i8_upper_bound( %Pg, *%addr) { +; CHECK-LABEL: ld2.nxv32i8_upper_bound: +; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0, #14, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 14 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld2.nxv32i8_not_multiple_of_2( %Pg, *%addr) { +; CHECK-LABEL: ld2.nxv32i8_not_multiple_of_2: +; CHECK: rdvl x[[OFFSET:[0-9]]], #3 +; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 3 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld2.nxv32i8_outside_lower_bound( %Pg, *%addr) { +; CHECK-LABEL: ld2.nxv32i8_outside_lower_bound: +; CHECK: rdvl x[[OFFSET:[0-9]]], #-18 +; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -18 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld2.nxv32i8_outside_upper_bound( %Pg, *%addr) { +; CHECK-LABEL: ld2.nxv32i8_outside_upper_bound: +; CHECK: rdvl x[[OFFSET:[0-9]]], #16 +; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 16 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +; ld2h +define @ld2.nxv16i16( %Pg, * %addr) { +; CHECK-LABEL: ld2.nxv16i16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, #14, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 14 +%base_ptr = bitcast * %base to i16 * +%res = call @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16( %Pg, i16 *%base_ptr) +ret %res +} + +define @ld2.nxv16f16( %Pg, * %addr) { +; CHECK-LABEL: ld2.nxv16f16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, #-16, mul vl] +; 
CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -16 +%base_ptr = bitcast * %base to half * +%res = call @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16( %Pg, half *%base_ptr) +ret %res +} + +define @ld2.nxv16bf16( %Pg, * %addr) #0 { +; CHECK-LABEL: ld2.nxv16bf16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, #12, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 12 +%base_ptr = bitcast * %base to bfloat * +%res = call @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16( %Pg, bfloat *%base_ptr) +ret %res +} + +; ld2w +define @ld2.nxv8i32( %Pg, * %addr) { +; CHECK-LABEL: ld2.nxv8i32: +; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0, #14, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 14 +%base_ptr = bitcast * %base to i32 * +%res = call @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32( %Pg, i32 *%base_ptr) +ret %res +} + +define @ld2.nxv8f32( %Pg, * %addr) { +; CHECK-LABEL: ld2.nxv8f32: +; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -16 +%base_ptr = bitcast * %base to float * +%res = call @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32( %Pg, float *%base_ptr) +ret %res +} + +; ld2d +define @ld2.nxv4i64( %Pg, * %addr) { +; CHECK-LABEL: ld2.nxv4i64: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0, #14, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 14 +%base_ptr = bitcast * %base to i64 * +%res = call @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64( %Pg, i64 *%base_ptr) +ret %res +} + +define @ld2.nxv4f64( %Pg, * %addr) { +; CHECK-LABEL: ld2.nxv4f64: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -16 +%base_ptr = bitcast * %base to double * +%res = call @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64( %Pg, double *%base_ptr) +ret %res +} + +; ld3b +define @ld3.nxv48i8( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8: +; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, #3, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 3 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld3.nxv48i8_lower_bound( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8_lower_bound: +; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -24 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld3.nxv48i8_upper_bound( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8_upper_bound: +; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 21 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld3.nxv48i8_not_multiple_of_3_01( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8_not_multiple_of_3_01: +; CHECK: rdvl x[[OFFSET:[0-9]]], #4 +; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 4 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld3.nxv48i8_not_multiple_of_3_02( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8_not_multiple_of_3_02: +; CHECK: rdvl x[[OFFSET:[0-9]]], #5 +; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 5 +%base_ptr = bitcast * %base to i8 * +%res = call 
@llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld3.nxv48i8_outside_lower_bound( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8_outside_lower_bound: +; CHECK: rdvl x[[OFFSET:[0-9]]], #-27 +; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -27 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld3.nxv48i8_outside_upper_bound( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8_outside_upper_bound: +; CHECK: rdvl x[[OFFSET:[0-9]]], #24 +; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 24 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +; ld3h +define @ld3.nxv24i16( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv24i16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 21 +%base_ptr = bitcast * %base to i16 * +%res = call @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16( %Pg, i16 *%base_ptr) +ret %res +} + +define @ld3.nxv24f16( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv24f16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 21 +%base_ptr = bitcast * %base to half * +%res = call @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16( %Pg, half *%base_ptr) +ret %res +} + +define @ld3.nxv24bf16( %Pg, *%addr) #0 { +; CHECK-LABEL: ld3.nxv24bf16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -24 +%base_ptr = bitcast * %base to bfloat * +%res = call @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16( %Pg, bfloat *%base_ptr) +ret %res +} + +; ld3w +define @ld3.nxv12i32( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv12i32: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 21 +%base_ptr = bitcast * %base to i32 * +%res = call @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32( %Pg, i32 *%base_ptr) +ret %res +} + +define @ld3.nxv12f32( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv12f32: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -24 +%base_ptr = bitcast * %base to float * +%res = call @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32( %Pg, float *%base_ptr) +ret %res +} + +; ld3d +define @ld3.nxv6i64( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv6i64: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 21 +%base_ptr = bitcast * %base to i64 * +%res = call @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64( %Pg, i64 *%base_ptr) +ret %res +} + +define @ld3.nxv6f64( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv6f64: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -24 +%base_ptr = bitcast * %base to double * +%res = call @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64( %Pg, double *%base_ptr) +ret %res +} + +; ; ld4b +define @ld4.nxv64i8( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8: +; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, #4, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 4 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_lower_bound( %Pg, *%addr) { 
+; CHECK-LABEL: ld4.nxv64i8_lower_bound: +; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -32 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_upper_bound( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_upper_bound: +; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, #28, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 28 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_not_multiple_of_4_01( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_01: +; CHECK: rdvl x[[OFFSET:[0-9]]], #5 +; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 5 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_not_multiple_of_4_02( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_02: +; CHECK: rdvl x[[OFFSET:[0-9]]], #6 +; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 6 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_not_multiple_of_4_03( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_03: +; CHECK: rdvl x[[OFFSET:[0-9]]], #7 +; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 7 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_outside_lower_bound( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_outside_lower_bound: +; FIXME: optimize OFFSET computation so that xOFFSET = (mul (RDVL #4) #9) +; xM = -9 * 2^6 +; xP = RDVL * 2^-4 +; xOFFSET = RDVL * 2^-4 * -9 * 2^6 = RDVL * -36 +; CHECK: rdvl x[[N:[0-9]]], #1 +; CHECK-DAG: mov x[[M:[0-9]]], #-576 +; CHECK-DAG: lsr x[[P:[0-9]]], x[[N]], #4 +; CHECK-DAG: mul x[[OFFSET:[0-9]]], x[[P]], x[[M]] +; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -36 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_outside_upper_bound( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_outside_upper_bound: +; FIXME: optimize OFFSET computation so that xOFFSET = (mul (RDVL #16) #2) +; xM = 2^9 +; xP = RDVL * 2^-4 +; xOFFSET = RDVL * 2^-4 * 2^9 = RDVL * 32 +; CHECK: rdvl x[[N:[0-9]]], #1 +; CHECK-DAG: mov w[[M:[0-9]]], #512 +; CHECK-DAG: lsr x[[P:[0-9]]], x[[N]], #4 +; CHECK-DAG: mul x[[OFFSET:[0-9]]], x[[P]], x[[M]] +; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 32 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +; ld4h +define @ld4.nxv32i16( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv32i16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, #8, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 8 +%base_ptr = bitcast * %base to i16 * +%res = call @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16( %Pg, i16 *%base_ptr) 
+ret %res +} + +define @ld4.nxv32f16( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv32f16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, #28, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 28 +%base_ptr = bitcast * %base to half * +%res = call @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16( %Pg, half *%base_ptr) +ret %res +} + +define @ld4.nxv32bf16( %Pg, *%addr) #0 { +; CHECK-LABEL: ld4.nxv32bf16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -32 +%base_ptr = bitcast * %base to bfloat * +%res = call @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16( %Pg, bfloat *%base_ptr) +ret %res +} + +; ld4w +define @ld4.nxv16i32( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv16i32: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, #28, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 28 +%base_ptr = bitcast * %base to i32 * +%res = call @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32( %Pg, i32 *%base_ptr) +ret %res +} + +define @ld4.nxv16f32( %Pg, * %addr) { +; CHECK-LABEL: ld4.nxv16f32: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -32 +%base_ptr = bitcast * %base to float * +%res = call @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32( %Pg, float *%base_ptr) +ret %res +} + +; ld4d +define @ld4.nxv8i64( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv8i64: +; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, #28, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 28 +%base_ptr = bitcast * %base to i64 * +%res = call @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64( %Pg, i64 *%base_ptr) +ret %res +} + +define @ld4.nxv8f64( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv8f64: +; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -32 +%base_ptr = bitcast * %base to double * +%res = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64( %Pg, double * %base_ptr) +ret %res +} + +declare @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8(, i8*) +declare @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16(, i16*) +declare @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32(, i32*) +declare @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64(, i64*) +declare @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16(, half*) +declare @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16(, bfloat*) +declare @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32(, float*) +declare @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64(, double*) + +declare @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8(, i8*) +declare @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16(, i16*) +declare @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32(, i32*) +declare @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64(, i64*) +declare @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16(, half*) +declare @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16(, bfloat*) +declare @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32(, float*) +declare @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64(, double*) + +declare @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8(, i8*) +declare @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16(, i16*) +declare @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32(, i32*) +declare @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64(, i64*) +declare @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16(, half*) +declare @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(, bfloat*) +declare @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(, float*) +declare @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(, double*) + +; +bf16 is required for the bfloat version. 
+attributes #0 = { "target-features"="+sve,+bf16" } diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+reg-addr-mode.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+reg-addr-mode.ll new file mode 100644 index 0000000000000..ab59c84137958 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+reg-addr-mode.ll @@ -0,0 +1,259 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=sve < %s | FileCheck %s + +; ld2b +define @ld2.nxv32i8( %Pg, i8 *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv32i8: +; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0, x1] +; CHECK-NEXT: ret +%addr2 = getelementptr i8, i8 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%addr2) +ret %res +} + +; ld2h +define @ld2.nxv16i16( %Pg, i16 *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv16i16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr i16, i16 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16( %Pg, i16 *%addr2) +ret %res +} + +define @ld2.nxv16f16( %Pg, half *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv16f16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr half, half * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16( %Pg, half *%addr2) +ret %res +} + +define @ld2.nxv16bf16( %Pg, bfloat *%addr, i64 %a) #0 { +; CHECK-LABEL: ld2.nxv16bf16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr bfloat, bfloat * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16( %Pg, bfloat *%addr2) +ret %res +} + +; ld2w +define @ld2.nxv8i32( %Pg, i32 *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv8i32: +; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret +%addr2 = getelementptr i32, i32 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32( %Pg, i32 *%addr2) +ret %res +} + +define @ld2.nxv8f32( %Pg, float *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv8f32: +; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret +%addr2 = getelementptr float, float * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32( %Pg, float *%addr2) +ret %res +} + +; ld2d +define @ld2.nxv4i64( %Pg, i64 *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv4i64: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret +%addr2 = getelementptr i64, i64 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64( %Pg, i64 *%addr2) +ret %res +} + +define @ld2.nxv4f64( %Pg, double *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv4f64: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret +%addr2 = getelementptr double, double * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64( %Pg, double *%addr2) +ret %res +} + +; ld3b +define @ld3.nxv48i8( %Pg, i8 *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv48i8: +; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x1] +; CHECK-NEXT: ret +%addr2 = getelementptr i8, i8 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%addr2) +ret %res +} + +; ld3h +define @ld3.nxv24i16( %Pg, i16 *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv24i16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr i16, i16 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16( %Pg, i16 *%addr2) +ret %res +} + +define @ld3.nxv24f16( %Pg, half *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv24f16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, 
x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr half, half * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16( %Pg, half *%addr2) +ret %res +} + +define @ld3.nxv24bf16( %Pg, bfloat *%addr, i64 %a) #0 { +; CHECK-LABEL: ld3.nxv24bf16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr bfloat, bfloat * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16( %Pg, bfloat *%addr2) +ret %res +} + +; ld3w +define @ld3.nxv12i32( %Pg, i32 *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv12i32: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret +%addr2 = getelementptr i32, i32 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32( %Pg, i32 *%addr2) +ret %res +} + +define @ld3.nxv12f32( %Pg, float *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv12f32: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret +%addr2 = getelementptr float, float * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32( %Pg, float *%addr2) +ret %res +} + +; ld3d +define @ld3.nxv6i64( %Pg, i64 *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv6i64: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret +%addr2 = getelementptr i64, i64 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64( %Pg, i64 *%addr2) +ret %res +} + +define @ld3.nxv6f64( %Pg, double *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv6f64: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret +%addr2 = getelementptr double, double * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64( %Pg, double *%addr2) +ret %res +} + +; ld4b +define @ld4.nxv64i8( %Pg, i8 *%addr, i64 %a) { +; CHECK-LABEL: ld4.nxv64i8: +; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x1] +; CHECK-NEXT: ret +%addr2 = getelementptr i8, i8 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%addr2) +ret %res +} + +; ld4h +define @ld4.nxv32i16( %Pg, i16 *%addr, i64 %a) { +; CHECK-LABEL: ld4.nxv32i16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr i16, i16 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16( %Pg, i16 *%addr2) +ret %res +} + +define @ld4.nxv32f16( %Pg, half *%addr, i64 %a) { +; CHECK-LABEL: ld4.nxv32f16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr half, half * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16( %Pg, half *%addr2) +ret %res +} + +define @ld4.nxv32bf16( %Pg, bfloat *%addr, i64 %a) #0 { +; CHECK-LABEL: ld4.nxv32bf16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr bfloat, bfloat * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16( %Pg, bfloat *%addr2) +ret %res +} + +; ld4w +define @ld4.nxv16i32( %Pg, i32 *%addr, i64 %a) { +; CHECK-LABEL: ld4.nxv16i32: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret +%addr2 = getelementptr i32, i32 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32( %Pg, i32 *%addr2) +ret %res +} + +define @ld4.nxv16f32( %Pg, float *%addr, i64 %a) { +; CHECK-LABEL: ld4.nxv16f32: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret +%addr2 = getelementptr float, float * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32( 
<vscale x 4 x i1> %Pg, float *%addr2)
+ret %res
+}
+
+; ld4d
+define <vscale x 8 x i64> @ld4.nxv8i64(<vscale x 2 x i1> %Pg, i64 *%addr, i64 %a) {
+; CHECK-LABEL: ld4.nxv8i64:
+; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+%addr2 = getelementptr i64, i64 * %addr, i64 %a
+%res = call <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64(<vscale x 2 x i1> %Pg, i64 *%addr2)
+ret %res
+}
+
+define <vscale x 8 x double> @ld4.nxv8f64(<vscale x 2 x i1> %Pg, double *%addr, i64 %a) {
+; CHECK-LABEL: ld4.nxv8f64:
+; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+%addr2 = getelementptr double, double * %addr, i64 %a
+%res = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(<vscale x 2 x i1> %Pg, double *%addr2)
+ret %res
+}
+
+declare <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8(<vscale x 16 x i1>, i8*)
+declare <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64(<vscale x 2 x i1>, i64*)
+declare <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16(<vscale x 8 x i1>, half*)
+declare <vscale x 16 x bfloat> @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16(<vscale x 8 x i1>, bfloat*)
+declare <vscale x 8 x float> @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
+declare <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)
+
+declare <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8(<vscale x 16 x i1>, i8*)
+declare <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64(<vscale x 2 x i1>, i64*)
+declare <vscale x 24 x half> @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16(<vscale x 8 x i1>, half*)
+declare <vscale x 24 x bfloat> @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16(<vscale x 8 x i1>, bfloat*)
+declare <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
+declare <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)
+
+declare <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8(<vscale x 16 x i1>, i8*)
+declare <vscale x 32 x i16> @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64(<vscale x 2 x i1>, i64*)
+declare <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16(<vscale x 8 x i1>, half*)
+declare <vscale x 32 x bfloat> @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(<vscale x 8 x i1>, bfloat*)
+declare <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
+declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)
+
+; +bf16 is required for the bfloat version.
+attributes #0 = { "target-features"="+sve,+bf16" }

From ee3feef5aaaa3c385fbe08bdb2d48829ad440b56 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Mon, 13 Jul 2020 08:59:38 -0400
Subject: [PATCH 0251/1035] TableGen/GlobalISel: Allow output instructions with
 multiple defs

The DAG behavior allows matching input patterns with a single result
to the first result of an output instruction that defines multiple
results. The remaining defs are implicitly dead.

This starts to fix using manual selection for AMDGPU add/sub (although
it's still needed, mostly because it's also still needed for
G_PTR_ADD).
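In rough C++ terms (a paraphrase of the importExplicitDefRenderers()
change in the diff below, not additional code on top of it), every def
after the first is given a scratch virtual register and rendered as an
implicitly dead def:

  // Sketch: pattern results only bind the first def. Each remaining def
  // gets a fresh temporary register (typed from the pattern's extra
  // result, *OpTy) and is emitted with RegState::Define|RegState::Dead.
  for (unsigned I = 1; I < DstI->Operands.NumDefs; ++I) {
    unsigned TempRegID = M.allocateTempRegID();
    InsertPt = M.insertAction<MakeTempRegisterAction>(InsertPt, *OpTy, TempRegID);
    DstMIBuilder.addRenderer<TempRegRenderer>(TempRegID, /*IsDef=*/true,
                                              /*SubReg=*/nullptr, /*IsDead=*/true);
  }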
---
 .../AMDGPU/GlobalISel/inst-select-add.mir     |  3 +-
 .../Common/GlobalISelEmitterCommon.td         |  3 +
 .../GlobalISelEmitter-output-discard.td       | 27 ++++++++
 llvm/utils/TableGen/GlobalISelEmitter.cpp     | 61 ++++++++++++++-----
 4 files changed, 78 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/TableGen/GlobalISelEmitter-output-discard.td

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir
index baed490c07581..47b4a5c400d37 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir
@@ -94,8 +94,7 @@ body: |
     ; GFX6-LABEL: name: add_neg_inline_const_64_to_sub_s32_v
     ; GFX6: liveins: $vgpr0
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -64, implicit $exec
-    ; GFX6: %2:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX6: %2:vgpr_32, dead %3:sreg_64 = V_SUB_CO_U32_e64 [[COPY]], 64, 0, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit %2
     ; GFX9-LABEL: name: add_neg_inline_const_64_to_sub_s32_v
     ; GFX9: liveins: $vgpr0
diff --git a/llvm/test/TableGen/Common/GlobalISelEmitterCommon.td b/llvm/test/TableGen/Common/GlobalISelEmitterCommon.td
index 2fe84fb95296e..9ae6c8be7dc2c 100644
--- a/llvm/test/TableGen/Common/GlobalISelEmitterCommon.td
+++ b/llvm/test/TableGen/Common/GlobalISelEmitterCommon.td
@@ -12,6 +12,9 @@ def GPR32Op : RegisterOperand<GPR32>;
 def F0 : Register<"f0"> { let Namespace = "MyTarget"; }
 def FPR32 : RegisterClass<"MyTarget", [f32], 32, (add F0)>;
 def FPR32Op : RegisterOperand<FPR32>;
+def B0 : Register<"b0"> { let Namespace = "MyTarget"; }
+def GPR8 : RegisterClass<"MyTarget", [i8], 8, (add B0)>;
+
 def p0 : PtrValueType<i64, 0>;

 class I<dag OOps, dag IOps, list<dag> Pat>
diff --git a/llvm/test/TableGen/GlobalISelEmitter-output-discard.td b/llvm/test/TableGen/GlobalISelEmitter-output-discard.td
new file mode 100644
index 0000000000000..c755d8377e61d
--- /dev/null
+++ b/llvm/test/TableGen/GlobalISelEmitter-output-discard.td
@@ -0,0 +1,27 @@
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../include -I %p/Common %s -o - < %s | FileCheck -check-prefix=GISEL %s
+
+include "llvm/Target/Target.td"
+include "GlobalISelEmitterCommon.td"
+
+// Test that extra explicit results are treated as dead defs.
+def ADD_CO : I<(outs GPR32:$dst, GPR8:$flag), + (ins GPR32Op:$src0, GPR32Op:$src1), []>; + +// GISEL: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_ADD, +// GISEL-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32, +// GISEL-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, +// GISEL-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32, +// GISEL-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID, +// GISEL-NEXT: // (add:{ *:[i32] } i32:{ *:[i32] }:$src0, i32:{ *:[i32] }:$src1) => (ADD_CO:{ *:[i32] }:{ *:[i8] } GPR32:{ *:[i32] }:$src0, GPR32:{ *:[i32] }:$src1) +// GISEL-NEXT: GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/GILLT_s8, +// GISEL-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::ADD_CO, +// GISEL-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst +// GISEL-NEXT: GIR_AddTempRegister, /*InsnID*/0, /*TempRegID*/0, /*TempRegFlags*/RegState::Define|RegState::Dead, +// GISEL-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // src0 +// GISEL-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/2, // src1 +// GISEL-NEXT: GIR_EraseFromParent, /*InsnID*/0, +// GISEL-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0, +def : Pat < + (add i32:$src0, i32:$src1), + (ADD_CO GPR32:$src0, GPR32:$src1) +>; diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index 4e8dcc52fc202..a9ebf8f1beaf2 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -2628,12 +2628,14 @@ class TempRegRenderer : public OperandRenderer { unsigned TempRegID; const CodeGenSubRegIndex *SubRegIdx; bool IsDef; + bool IsDead; public: TempRegRenderer(unsigned InsnID, unsigned TempRegID, bool IsDef = false, - const CodeGenSubRegIndex *SubReg = nullptr) + const CodeGenSubRegIndex *SubReg = nullptr, + bool IsDead = false) : OperandRenderer(OR_Register), InsnID(InsnID), TempRegID(TempRegID), - SubRegIdx(SubReg), IsDef(IsDef) {} + SubRegIdx(SubReg), IsDef(IsDef), IsDead(IsDead) {} static bool classof(const OperandRenderer *R) { return R->getKind() == OR_TempRegister; @@ -2650,9 +2652,13 @@ class TempRegRenderer : public OperandRenderer { << MatchTable::Comment("TempRegID") << MatchTable::IntValue(TempRegID) << MatchTable::Comment("TempRegFlags"); - if (IsDef) - Table << MatchTable::NamedValue("RegState::Define"); - else + if (IsDef) { + SmallString<32> RegFlags; + RegFlags += "RegState::Define"; + if (IsDead) + RegFlags += "|RegState::Dead"; + Table << MatchTable::NamedValue(RegFlags); + } else Table << MatchTable::IntValue(0); if (SubRegIdx) @@ -3394,7 +3400,11 @@ class GlobalISelEmitter { Expected createInstructionRenderer(action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst); - void importExplicitDefRenderers(BuildMIAction &DstMIBuilder); + + Expected + importExplicitDefRenderers(action_iterator InsertPt, RuleMatcher &M, + BuildMIAction &DstMIBuilder, + const TreePatternNode *Dst); Expected importExplicitUseRenderers(action_iterator InsertPt, RuleMatcher &M, @@ -4220,7 +4230,9 @@ Expected GlobalISelEmitter::createAndImportInstructionRenderer( CopyToPhysRegMIBuilder.addRenderer(PhysInput.first); } - importExplicitDefRenderers(DstMIBuilder); + if (auto Error = importExplicitDefRenderers(InsertPt, M, DstMIBuilder, Dst) + .takeError()) + return std::move(Error); if (auto Error = importExplicitUseRenderers(InsertPt, M, DstMIBuilder, Dst) .takeError()) @@ -4372,13 +4384,34 @@ Expected GlobalISelEmitter::createInstructionRenderer( DstI); } -void 
GlobalISelEmitter::importExplicitDefRenderers( - BuildMIAction &DstMIBuilder) { +Expected GlobalISelEmitter::importExplicitDefRenderers( + action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, + const TreePatternNode *Dst) { const CodeGenInstruction *DstI = DstMIBuilder.getCGI(); - for (unsigned I = 0; I < DstI->Operands.NumDefs; ++I) { - const CGIOperandList::OperandInfo &DstIOperand = DstI->Operands[I]; - DstMIBuilder.addRenderer(DstIOperand.Name); + const unsigned NumDefs = DstI->Operands.NumDefs; + if (NumDefs == 0) + return InsertPt; + + DstMIBuilder.addRenderer(DstI->Operands[0].Name); + + // Patterns only handle a single result, so any result after the first is an + // implicitly dead def. + for (unsigned I = 1; I < NumDefs; ++I) { + const TypeSetByHwMode &ExtTy = Dst->getExtType(I); + if (!ExtTy.isMachineValueType()) + return failedImport("unsupported typeset"); + + auto OpTy = MVTToLLT(ExtTy.getMachineValueType().SimpleTy); + if (!OpTy) + return failedImport("unsupported type"); + + unsigned TempRegID = M.allocateTempRegID(); + InsertPt = + M.insertAction(InsertPt, *OpTy, TempRegID); + DstMIBuilder.addRenderer(TempRegID, true, nullptr, true); } + + return InsertPt; } Expected GlobalISelEmitter::importExplicitUseRenderers( @@ -4814,8 +4847,8 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { auto &DstI = Target.getInstruction(DstOp); StringRef DstIName = DstI.TheDef->getName(); - if (DstI.Operands.NumDefs != Src->getExtTypes().size()) - return failedImport("Src pattern results and dst MI defs are different (" + + if (DstI.Operands.NumDefs < Src->getExtTypes().size()) + return failedImport("Src pattern result has more defs than dst MI (" + to_string(Src->getExtTypes().size()) + " def(s) vs " + to_string(DstI.Operands.NumDefs) + " def(s))"); From f9fec0447e12da9e8cf4b628f6d45f4941e7d182 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 27 Jul 2020 15:36:16 -0700 Subject: [PATCH 0252/1035] [llvm] Make ZLIB handling compatible with multi-configuration generators The CMAKE_BUILD_TYPE is only meaningful to single-configuration generators (such as make and Ninja). For multi-configuration generators like Xcode and MSVC this variable won't be set, resulting in a CMake error. --- llvm/lib/Support/CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index 82b7ceb856f25..7b45dc628160e 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -200,8 +200,11 @@ add_llvm_component_library(LLVMSupport set(llvm_system_libs ${system_libs}) if(LLVM_ENABLE_ZLIB) - string(TOUPPER ${CMAKE_BUILD_TYPE} build_type) - get_property(zlib_library TARGET ZLIB::ZLIB PROPERTY LOCATION_${build_type}) + # CMAKE_BUILD_TYPE is only meaningful to single-configuration generators. 
+  if(CMAKE_BUILD_TYPE)
+    string(TOUPPER ${CMAKE_BUILD_TYPE} build_type)
+    get_property(zlib_library TARGET ZLIB::ZLIB PROPERTY LOCATION_${build_type})
+  endif()
   if(NOT zlib_library)
     get_property(zlib_library TARGET ZLIB::ZLIB PROPERTY LOCATION)
   endif()

From 8b81d0633f1ebaf1392e6509b34694de05c94de7 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Tue, 21 Jul 2020 20:53:57 -0400
Subject: [PATCH 0253/1035] AMDGPU: global_atomic_csub is not always
 dereferenceable

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index fa2a96d33952b..c92065f7495a2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1137,7 +1137,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.align.reset();
     Info.flags = MachineMemOperand::MOLoad |
                  MachineMemOperand::MOStore |
-                 MachineMemOperand::MODereferenceable |
                  MachineMemOperand::MOVolatile;
     return true;
   }

From ce944af33c1e011b69665b6892eb3ea142afcdf0 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Sat, 25 Jul 2020 10:17:49 -0400
Subject: [PATCH 0254/1035] AMDGPU/GlobalISel: Mark G_ATOMICRMW_{NAND|FSUB} as
 lower

These aren't implemented and we're still relying on the AtomicExpand
pass, but mark these as lower to eliminate a few of the few remaining
no rules defined cases.
---
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index f1962db35bc01..bf0ebd322aa9e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1493,6 +1493,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
         G_FCOPYSIGN,

         G_ATOMIC_CMPXCHG_WITH_SUCCESS,
+        G_ATOMICRMW_NAND,
+        G_ATOMICRMW_FSUB,
         G_READ_REGISTER,
         G_WRITE_REGISTER,

From 5f802be4e554f4eeb2c2c4137bed91b5b4ae740e Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Tue, 21 Jul 2020 19:29:37 -0400
Subject: [PATCH 0255/1035] GlobalISel: Don't fail translate on intrinsics with
 metadata

---
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp  |  9 ++++----
 .../GlobalISel/irtranslator-metadata.ll       | 22 +++++++++++++++++++
 2 files changed, 27 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll

diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index e6b3d51bf1ec4..6433c13e990bf 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1758,10 +1758,6 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
     MIB->copyIRFlags(CI);

   for (auto &Arg : enumerate(CI.arg_operands())) {
-    // Some intrinsics take metadata parameters. Reject them.
-    if (isa<MetadataAsValue>(Arg.value()))
-      return false;
-
     // If this is required to be an immediate, don't materialize it in a
     // register.
     if (CI.paramHasAttr(Arg.index(), Attribute::ImmArg)) {
@@ -1774,6 +1770,11 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
       } else {
         MIB.addFPImm(cast<ConstantFP>(Arg.value()));
       }
+    } else if (auto MD = dyn_cast<MetadataAsValue>(Arg.value())) {
+      auto *MDN = dyn_cast<MDNode>(MD->getMetadata());
+      if (!MDN) // This was probably an MDString.
+        return false;
+      MIB.addMetadata(MDN);
     } else {
       ArrayRef<Register> VRegs = getOrCreateVRegs(*Arg.value());
       if (VRegs.size() > 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll
new file mode 100644
index 0000000000000..6a3b3b8128661
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -stop-after=irtranslator -o - %s | FileCheck %s
+; Make sure intrinsics with metadata arguments are translated
+
+define i32 @reloc_constant() {
+  ; CHECK-LABEL: name: reloc_constant
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.reloc.constant), !0
+  ; CHECK:   $vgpr0 = COPY [[INT]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0
+  %val = call i32 @llvm.amdgcn.reloc.constant(metadata !0)
+  ret i32 %val
+}
+
+declare i32 @llvm.amdgcn.reloc.constant(metadata) #0
+
+attributes #0 = { nounwind readnone speculatable willreturn }
+
+!0 = !{!"arst"}

From f1d4db4f0cdcbfeaee0840bf8a4fb5dc1b9b56fd Mon Sep 17 00:00:00 2001
From: Alina Sbirlea
Date: Thu, 16 Jul 2020 15:46:54 -0700
Subject: [PATCH 0256/1035] [GraphDiff] Use class method getChildren instead of
 GraphTraits.

Summary:
Use getChildren() method in GraphDiff instead of GraphTraits.
This simplifies the code and allows for refactorings inside GraphDiff.
Not all use cases need a light-weight/copyable range.
Clean GraphTraits implementation.

Reviewers: dblaikie

Subscribers: hiraditya, llvm-commits, george.burgess.iv

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D84562
---
 .../llvm/Analysis/IteratedDominanceFrontier.h |  8 +-
 llvm/include/llvm/Analysis/MemorySSAUpdater.h |  2 -
 llvm/include/llvm/Support/CFGDiff.h           | 98 +------------------
 llvm/lib/Analysis/MemorySSAUpdater.cpp        | 11 +--
 4 files changed, 8 insertions(+), 111 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IteratedDominanceFrontier.h b/llvm/include/llvm/Analysis/IteratedDominanceFrontier.h
index fb66052851563..8166b52aa226c 100644
--- a/llvm/include/llvm/Analysis/IteratedDominanceFrontier.h
+++ b/llvm/include/llvm/Analysis/IteratedDominanceFrontier.h
@@ -73,13 +73,7 @@ ChildrenGetterTy<NodeTy, IsPostDom>::get(const NodeRef &N) {
     return {Children.begin(), Children.end()};
   }

-  using SnapShotBBPairTy =
-      std::pair<const GraphDiff<OrderedNodeTy> *, OrderedNodeTy>;
-
-  ChildrenTy Ret;
-  for (const auto &SnapShotBBPair : children<SnapShotBBPairTy>({GD, N}))
-    Ret.emplace_back(SnapShotBBPair.second);
-  return Ret;
+  return GD->template getChildren<IsPostDom>(N);
 }

 } // end of namespace IDFCalculatorDetail
diff --git a/llvm/include/llvm/Analysis/MemorySSAUpdater.h b/llvm/include/llvm/Analysis/MemorySSAUpdater.h
index 20588ef083c59..d41b932099794 100644
--- a/llvm/include/llvm/Analysis/MemorySSAUpdater.h
+++ b/llvm/include/llvm/Analysis/MemorySSAUpdater.h
@@ -52,8 +52,6 @@ class LoopBlocksRPO;
 using ValueToValueMapTy = ValueMap<const Value *, WeakVH>;
 using PhiToDefMap = SmallDenseMap<MemoryPhi *, MemoryAccess *>;
 using CFGUpdate = cfg::Update<BasicBlock *>;
-using GraphDiffInvBBPair =
-    std::pair<const GraphDiff<BasicBlock *> *, Inverse<BasicBlock *>>;

 class MemorySSAUpdater {
 private:
diff --git a/llvm/include/llvm/Support/CFGDiff.h b/llvm/include/llvm/Support/CFGDiff.h
index 269984872bfac..9cbf311f68014 100644
--- a/llvm/include/llvm/Support/CFGDiff.h
+++ b/llvm/include/llvm/Support/CFGDiff.h
@@ -30,44 +30,6 @@
 // a non-inversed graph, the children are naturally the successors when
 // InverseEdge is false and the predecessors when InverseEdge is true.
// a non-inversed graph, the children are naturally the successors when // InverseEdge is false and the predecessors when InverseEdge is true. -// We define two base clases that call into GraphDiff, one for successors -// (CFGSuccessors), where InverseEdge is false, and one for predecessors -// (CFGPredecessors), where InverseEdge is true. -// FIXME: Further refactoring may merge the two base classes into a single one -// templated / parametrized on using succ_iterator/pred_iterator and false/true -// for the InverseEdge. - -// CFGViewChildren and CFGViewPredecessors, both can be parametrized to -// consider the graph inverted or not (i.e. InverseGraph). Successors -// implicitly has InverseEdge = false and Predecessors implicitly has -// InverseEdge = true (see calls to GraphDiff methods in there). The GraphTraits -// instantiations that follow define the value of InverseGraph. - -// GraphTraits instantiations: -// - GraphDiff is equivalent to InverseGraph = false -// - GraphDiff> is equivalent to InverseGraph = true -// - second pair item is BasicBlock *, then InverseEdge = false (so it inherits -// from CFGViewChildren). -// - second pair item is Inverse, then InverseEdge = true (so it -// inherits from CFGViewPredecessors). - -// The 4 GraphTraits are as follows: -// 1. std::pair *, BasicBlock *>> : -// CFGViewChildren -// Regular CFG, children means successors, InverseGraph = false, -// InverseEdge = false. -// 2. std::pair> *, BasicBlock *>> : -// CFGViewChildren -// Reverse the graph, get successors but reverse-apply updates, -// InverseGraph = true, InverseEdge = false. -// 3. std::pair *, Inverse>> : -// CFGViewPredecessors -// Regular CFG, reverse edges, so children mean predecessors, -// InverseGraph = false, InverseEdge = true. -// 4. std::pair> *, Inverse> -// : CFGViewPredecessors -// Reverse the graph and the edges, InverseGraph = true, InverseEdge = true. - namespace llvm { namespace detail { @@ -87,9 +49,9 @@ template auto reverse_if(Range &&R) { } } // namespace detail -// GraphDiff defines a CFG snapshot: given a set of Update, provide -// utilities to skip edges marked as deleted and return a set of edges marked as -// newly inserted. The current diff treats the CFG as a graph rather than a +// GraphDiff defines a CFG snapshot: given a set of Update, provides +// a getChildren method to get a Node's children based on the additional updates +// in the snapshot. The current diff treats the CFG as a graph rather than a // multigraph. Added edges are pruned to be unique, and deleted edges will // remove all existing edges between two blocks. 
template class GraphDiff { @@ -191,7 +153,6 @@ template class GraphDiff { } using VectRet = SmallVector; - template VectRet getChildren(NodePtr N) const { using DirectedNodeT = std::conditional_t, NodePtr>; @@ -228,59 +189,6 @@ template class GraphDiff { LLVM_DUMP_METHOD void dump() const { print(dbgs()); } #endif }; - -template > -struct CFGViewChildren { - using DataRef = const GraphDiff *; - using NodeRef = std::pair; - - template - static auto makeChildRange(Range &&R, DataRef DR) { - using Iter = WrappedPairNodeDataIterator(R).begin()), NodeRef, DataRef>; - return make_range(Iter(R.begin(), DR), Iter(R.end(), DR)); - } - - static auto children(NodeRef N) { - - // filter iterator init: - auto R = make_range(GT::child_begin(N.second), GT::child_end(N.second)); - auto RR = detail::reverse_if(R); - // This lambda is copied into the iterators and persists to callers, ensure - // captures are by value or otherwise have sufficient lifetime. - auto First = make_filter_range(makeChildRange(RR, N.first), [N](NodeRef C) { - return !C.first->ignoreChild(N.second, C.second, InverseEdge); - }); - - // new inserts iterator init: - auto InsertVec = N.first->getAddedChildren(N.second, InverseEdge); - auto Second = makeChildRange(InsertVec, N.first); - - auto CR = concat(First, Second); - - // concat_range contains references to other ranges, returning it would - // leave those references dangling - the iterators contain - // other iterators by value so they're safe to return. - return make_range(CR.begin(), CR.end()); - } - - static auto child_begin(NodeRef N) { - return children(N).begin(); - } - - static auto child_end(NodeRef N) { - return children(N).end(); - } - - using ChildIteratorType = decltype(child_end(std::declval())); -}; - -template -struct GraphTraits *, T>> - : CFGViewChildren {}; -template -struct GraphTraits *, Inverse>> - : CFGViewChildren, B, true> {}; } // end namespace llvm #endif // LLVM_SUPPORT_CFGDIFF_H diff --git a/llvm/lib/Analysis/MemorySSAUpdater.cpp b/llvm/lib/Analysis/MemorySSAUpdater.cpp index 85af091772e7e..21cbdcd67147a 100644 --- a/llvm/lib/Analysis/MemorySSAUpdater.cpp +++ b/llvm/lib/Analysis/MemorySSAUpdater.cpp @@ -832,8 +832,8 @@ void MemorySSAUpdater::applyInsertUpdates(ArrayRef Updates, // Check number of predecessors, we only care if there's more than one. unsigned Count = 0; BasicBlock *Pred = nullptr; - for (auto &Pair : children({GD, BB})) { - Pred = Pair.second; + for (auto *Pi : GD->template getChildren(BB)) { + Pred = Pi; Count++; if (Count == 2) break; @@ -926,8 +926,7 @@ void MemorySSAUpdater::applyInsertUpdates(ArrayRef Updates, auto *BB = BBPredPair.first; const auto &AddedBlockSet = BBPredPair.second.Added; auto &PrevBlockSet = BBPredPair.second.Prev; - for (auto &Pair : children({GD, BB})) { - BasicBlock *Pi = Pair.second; + for (auto *Pi : GD->template getChildren(BB)) { if (!AddedBlockSet.count(Pi)) PrevBlockSet.insert(Pi); EdgeCountMap[{Pi, BB}]++; @@ -1078,10 +1077,8 @@ void MemorySSAUpdater::applyInsertUpdates(ArrayRef Updates, for (unsigned I = 0, E = IDFPhi->getNumIncomingValues(); I < E; ++I) IDFPhi->setIncomingValue(I, GetLastDef(IDFPhi->getIncomingBlock(I))); } else { - for (auto &Pair : children({GD, BBIDF})) { - BasicBlock *Pi = Pair.second; + for (auto *Pi : GD->template getChildren(BBIDF)) IDFPhi->addIncoming(GetLastDef(Pi), Pi); - } } } } From f250eb37cd4fabcc9f222ca2da80b62d110d9fff Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Mon, 27 Jul 2020 19:22:05 -0400 Subject: [PATCH 0257/1035] [OpenMP][Docs] Update `present` modifier status --- clang/docs/OpenMPSupport.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index 26fbfab96bc8c..a1d1b120bcecc 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -268,5 +268,7 @@ want to help with the implementation. +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | loop extension | Loop tiling transformation | :part:`claimed` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device extension | 'present' map type modifier | :part:`worked on` | D83061, D83062 | +| device extension | 'present' map type modifier | :part:`mostly done` | D83061, D83062, D84422 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device extension | 'present' motion modifier | :part:`worked on` | D84711, D84712 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ From 64d99cc6abed78c00a2a7863b02ce54911a5264f Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Mon, 27 Jul 2020 11:55:52 -0700 Subject: [PATCH 0258/1035] [CMake] Move find_package(ZLIB) to LLVMConfig This way, downstream projects don't have to invoke find_package(ZLIB) reducing the amount of boilerplate. Differential Revision: https://reviews.llvm.org/D84691 --- clang/CMakeLists.txt | 4 ---- lld/CMakeLists.txt | 4 ---- lldb/cmake/modules/LLDBStandalone.cmake | 4 ---- llvm/cmake/modules/LLVMConfig.cmake.in | 3 +++ mlir/examples/standalone/CMakeLists.txt | 4 ---- 5 files changed, 3 insertions(+), 16 deletions(-) diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt index 1c4c22b1aaad0..1a6a20a271f36 100644 --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -114,10 +114,6 @@ if( CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR ) option(CLANG_ENABLE_BOOTSTRAP "Generate the clang bootstrap target" OFF) option(LLVM_ENABLE_LIBXML2 "Use libxml2 if available." ON) - if(LLVM_ENABLE_ZLIB) - find_package(ZLIB) - endif() - include(AddLLVM) include(TableGen) include(HandleLLVMOptions) diff --git a/lld/CMakeLists.txt b/lld/CMakeLists.txt index bcfc2c6270b3f..e9bd1bd29c5cf 100644 --- a/lld/CMakeLists.txt +++ b/lld/CMakeLists.txt @@ -51,10 +51,6 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin) find_program(LLVM_TABLEGEN_EXE "llvm-tblgen" ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH) - if(LLVM_ENABLE_ZLIB) - find_package(ZLIB) - endif() - include(AddLLVM) include(TableGen) include(HandleLLVMOptions) diff --git a/lldb/cmake/modules/LLDBStandalone.cmake b/lldb/cmake/modules/LLDBStandalone.cmake index edd2b34ec8655..94781c3583744 100644 --- a/lldb/cmake/modules/LLDBStandalone.cmake +++ b/lldb/cmake/modules/LLDBStandalone.cmake @@ -74,10 +74,6 @@ endif() # CMake modules to be in that directory as well. 
list(APPEND CMAKE_MODULE_PATH "${LLVM_DIR}") -if(LLVM_ENABLE_ZLIB) - find_package(ZLIB) -endif() - include(AddLLVM) include(TableGen) include(HandleLLVMOptions) diff --git a/llvm/cmake/modules/LLVMConfig.cmake.in b/llvm/cmake/modules/LLVMConfig.cmake.in index e729a839f614d..17cc5eacc57b7 100644 --- a/llvm/cmake/modules/LLVMConfig.cmake.in +++ b/llvm/cmake/modules/LLVMConfig.cmake.in @@ -50,6 +50,9 @@ set(LLVM_ENABLE_THREADS @LLVM_ENABLE_THREADS@) set(LLVM_ENABLE_UNWIND_TABLES @LLVM_ENABLE_UNWIND_TABLES@) set(LLVM_ENABLE_ZLIB @LLVM_ENABLE_ZLIB@) +if(LLVM_ENABLE_ZLIB) + find_package(ZLIB) +endif() set(LLVM_LIBXML2_ENABLED @LLVM_LIBXML2_ENABLED@) diff --git a/mlir/examples/standalone/CMakeLists.txt b/mlir/examples/standalone/CMakeLists.txt index 59d3c693546f4..3f46dda4e4f64 100644 --- a/mlir/examples/standalone/CMakeLists.txt +++ b/mlir/examples/standalone/CMakeLists.txt @@ -29,10 +29,6 @@ set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}") list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}") -if(LLVM_ENABLE_ZLIB) - find_package(ZLIB) -endif() - include(TableGen) include(AddLLVM) include(AddMLIR) From 8120eba5fce378083ef22651f2b7b6dcaa54a098 Mon Sep 17 00:00:00 2001 From: Fred Riss Date: Mon, 27 Jul 2020 17:05:27 -0700 Subject: [PATCH 0259/1035] [lldb/ArchSpec] Always match simulator environment in IsEqualTo Summary: Initially, Apple simulator binary triples didn't use a `-simulator` environment and were just differentiated based on the architecture. For example, `x86_64-apple-ios` would obviously be a simulator as iOS doesn't run on x86_64. With Catalyst, we made the distinction explicit and today, all simulator triples (even the legacy ones) are constructed with an environment. This is especially important on Apple Silicon where the architecture is not different from that of the simulated device. This change makes the simulator part of the environment always part of the criteria to detect whether two `ArchSpec`s are equal or compatible. Reviewers: aprantl Subscribers: inglorion, dexonsmith, lldb-commits Tags: #lldb Differential Revision: https://reviews.llvm.org/D84716 --- lldb/source/Utility/ArchSpec.cpp | 6 ++++++ lldb/unittests/Utility/ArchSpecTest.cpp | 8 ++++++++ 2 files changed, 14 insertions(+) diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp index cd382a322da70..6e4f1b5326dd9 100644 --- a/lldb/source/Utility/ArchSpec.cpp +++ b/lldb/source/Utility/ArchSpec.cpp @@ -987,6 +987,12 @@ static bool IsCompatibleEnvironment(llvm::Triple::EnvironmentType lhs, if (lhs == rhs) return true; + // Apple simulators are a different platform than what they simulate. + // As the environments are different at this point, if one of them is a + // simulator, then they are different.
+ if (lhs == llvm::Triple::Simulator || rhs == llvm::Triple::Simulator) + return false; + // If any of the environment is unknown then they are compatible if (lhs == llvm::Triple::UnknownEnvironment || rhs == llvm::Triple::UnknownEnvironment) diff --git a/lldb/unittests/Utility/ArchSpecTest.cpp b/lldb/unittests/Utility/ArchSpecTest.cpp index a8f43ed7dc7c9..ad0a8ac18cd19 100644 --- a/lldb/unittests/Utility/ArchSpecTest.cpp +++ b/lldb/unittests/Utility/ArchSpecTest.cpp @@ -306,6 +306,14 @@ TEST(ArchSpecTest, Compatibility) { ASSERT_FALSE(A.IsExactMatch(B)); ASSERT_FALSE(A.IsCompatibleMatch(B)); } + { + ArchSpec A("arm64-apple-ios"); + ArchSpec B("arm64-apple-ios-simulator"); + ASSERT_FALSE(A.IsExactMatch(B)); + ASSERT_FALSE(A.IsCompatibleMatch(B)); + ASSERT_FALSE(B.IsCompatibleMatch(A)); + ASSERT_FALSE(B.IsCompatibleMatch(A)); + } { ArchSpec A("arm64-*-*"); ArchSpec B("arm64-apple-ios"); From 18c725e735b1bfb756c2320d812bb095f2c98574 Mon Sep 17 00:00:00 2001 From: Alina Sbirlea Date: Mon, 27 Jul 2020 16:41:55 -0700 Subject: [PATCH 0260/1035] [DomTree] Remove dead code. [NFC] --- llvm/include/llvm/Support/GenericDomTreeConstruction.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/llvm/include/llvm/Support/GenericDomTreeConstruction.h b/llvm/include/llvm/Support/GenericDomTreeConstruction.h index 709276ab7a29f..5a1f03c879db4 100644 --- a/llvm/include/llvm/Support/GenericDomTreeConstruction.h +++ b/llvm/include/llvm/Support/GenericDomTreeConstruction.h @@ -92,13 +92,9 @@ struct SemiNCAInfo { BatchUpdateInfo *BatchUpdates; using BatchUpdatePtr = BatchUpdateInfo *; - std::unique_ptr EmptyGD; // If BUI is a nullptr, then there's no batch update in progress. - SemiNCAInfo(BatchUpdatePtr BUI) : BatchUpdates(BUI) { - if (!BatchUpdates) - EmptyGD = std::make_unique(); - } + SemiNCAInfo(BatchUpdatePtr BUI) : BatchUpdates(BUI) {} void clear() { NumToNode = {nullptr}; // Restore to initial state with a dummy start node. From fbca317694bf635bff4309768df11259387eb371 Mon Sep 17 00:00:00 2001 From: Alina Sbirlea Date: Fri, 24 Jul 2020 16:29:58 -0700 Subject: [PATCH 0261/1035] [CFGDiff] Refactor Succ/Pred maps. Summary: Refactor Succ/Pred maps to have a single map lookup when constructing children. The previous design made sense when used by GraphTraits. This more closely matches the previous approach in DomTree. Reviewers: dblaikie Subscribers: llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D84567 --- llvm/include/llvm/ADT/STLExtras.h | 8 ++ llvm/include/llvm/Support/CFGDiff.h | 115 ++++++++++++---------------- 2 files changed, 57 insertions(+), 66 deletions(-) diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h index 92eea4e83f693..b9b5e175f50ea 100644 --- a/llvm/include/llvm/ADT/STLExtras.h +++ b/llvm/include/llvm/ADT/STLExtras.h @@ -1653,6 +1653,14 @@ void erase_if(Container &C, UnaryPredicate P) { C.erase(remove_if(C, P), C.end()); } +/// Wrapper function to remove a value from a container: +/// +/// C.erase(remove(C.begin(), C.end(), V), C.end()); +template +void erase_value(Container &C, ValueType V) { + C.erase(std::remove(C.begin(), C.end(), V), C.end()); +} + /// Given a sequence container Cont, replace the range [ContIt, ContEnd) with /// the range [ValIt, ValEnd) (which is not from the same container).
template diff --git a/llvm/include/llvm/Support/CFGDiff.h b/llvm/include/llvm/Support/CFGDiff.h index 9cbf311f68014..a4a4b2ca44b1f 100644 --- a/llvm/include/llvm/Support/CFGDiff.h +++ b/llvm/include/llvm/Support/CFGDiff.h @@ -55,21 +55,18 @@ template auto reverse_if(Range &&R) { // multigraph. Added edges are pruned to be unique, and deleted edges will // remove all existing edges between two blocks. template class GraphDiff { - using UpdateMapType = SmallDenseMap>; - struct EdgesInsertedDeleted { - UpdateMapType Succ; - UpdateMapType Pred; + struct DeletesInserts { + SmallVector DI[2]; }; - // Store Deleted edges on position 0, and Inserted edges on position 1. - EdgesInsertedDeleted Edges[2]; + using UpdateMapType = SmallDenseMap; + UpdateMapType Succ; + UpdateMapType Pred; + // By default, it is assumed that, given a CFG and a set of updates, we wish // to apply these updates as given. If UpdatedAreReverseApplied is set, the // updates will be applied in reverse: deleted edges are considered re-added // and inserted edges are considered deleted when returning children. bool UpdatedAreReverseApplied; - // Using a singleton empty vector for all node requests with no - // children. - SmallVector Empty; // Keep the list of legalized updates for a deterministic order of updates // when using a GraphDiff for incremental updates in the DominatorTree. @@ -77,14 +74,19 @@ template class GraphDiff { SmallVector, 4> LegalizedUpdates; void printMap(raw_ostream &OS, const UpdateMapType &M) const { - for (auto Pair : M) - for (auto Child : Pair.second) { - OS << "("; - Pair.first->printAsOperand(OS, false); - OS << ", "; - Child->printAsOperand(OS, false); - OS << ") "; + StringRef DIText[2] = {"Delete", "Insert"}; + for (auto Pair : M) { + for (unsigned IsInsert = 0; IsInsert <= 1; ++IsInsert) { + OS << DIText[IsInsert] << " edges: \n"; + for (auto Child : Pair.second.DI[IsInsert]) { + OS << "("; + Pair.first->printAsOperand(OS, false); + OS << ", "; + Child->printAsOperand(OS, false); + OS << ") "; + } } + } OS << "\n"; } @@ -93,13 +95,11 @@ template class GraphDiff { GraphDiff(ArrayRef> Updates, bool ReverseApplyUpdates = false) { cfg::LegalizeUpdates(Updates, LegalizedUpdates, InverseGraph); - // The legalized updates are stored in reverse so we can pop_back when doing - // incremental updates. 
for (auto U : LegalizedUpdates) { unsigned IsInsert = (U.getKind() == cfg::UpdateKind::Insert) == !ReverseApplyUpdates; - Edges[IsInsert].Succ[U.getFrom()].push_back(U.getTo()); - Edges[IsInsert].Pred[U.getTo()].push_back(U.getFrom()); + Succ[U.getFrom()].DI[IsInsert].push_back(U.getTo()); + Pred[U.getTo()].DI[IsInsert].push_back(U.getFrom()); } UpdatedAreReverseApplied = ReverseApplyUpdates; } @@ -115,73 +115,56 @@ template class GraphDiff { auto U = LegalizedUpdates.pop_back_val(); unsigned IsInsert = (U.getKind() == cfg::UpdateKind::Insert) == !UpdatedAreReverseApplied; - auto &SuccList = Edges[IsInsert].Succ[U.getFrom()]; + auto &SuccDIList = Succ[U.getFrom()]; + auto &SuccList = SuccDIList.DI[IsInsert]; assert(SuccList.back() == U.getTo()); SuccList.pop_back(); - if (SuccList.empty()) - Edges[IsInsert].Succ.erase(U.getFrom()); + if (SuccList.empty() && SuccDIList.DI[!IsInsert].empty()) + Succ.erase(U.getFrom()); - auto &PredList = Edges[IsInsert].Pred[U.getTo()]; + auto &PredDIList = Pred[U.getTo()]; + auto &PredList = PredDIList.DI[IsInsert]; assert(PredList.back() == U.getFrom()); PredList.pop_back(); - if (PredList.empty()) - Edges[IsInsert].Pred.erase(U.getTo()); + if (PredList.empty() && PredDIList.DI[!IsInsert].empty()) + Pred.erase(U.getTo()); return U; } - bool ignoreChild(const NodePtr BB, NodePtr EdgeEnd, bool InverseEdge) const { - // Used to filter nullptr in clang. - if (EdgeEnd == nullptr) - return true; - auto &DeleteChildren = - (InverseEdge != InverseGraph) ? Edges[0].Pred : Edges[0].Succ; - auto It = DeleteChildren.find(BB); - if (It == DeleteChildren.end()) - return false; - auto &EdgesForBB = It->second; - return llvm::find(EdgesForBB, EdgeEnd) != EdgesForBB.end(); - } - - iterator_range::const_iterator> - getAddedChildren(const NodePtr BB, bool InverseEdge) const { - auto &InsertChildren = - (InverseEdge != InverseGraph) ? Edges[1].Pred : Edges[1].Succ; - auto It = InsertChildren.find(BB); - if (It == InsertChildren.end()) - return make_range(Empty.begin(), Empty.end()); - return make_range(It->second.begin(), It->second.end()); - } - using VectRet = SmallVector; template VectRet getChildren(NodePtr N) const { using DirectedNodeT = std::conditional_t, NodePtr>; auto R = children(N); - auto CurrentCFGChildren = detail::reverse_if(R); + VectRet Res = VectRet(detail::reverse_if(R)); + + // Remove nullptr children for clang. + llvm::erase_value(Res, nullptr); + + auto &Children = (InverseEdge != InverseGraph) ? Pred : Succ; + auto It = Children.find(N); + if (It == Children.end()) + return Res; + + // Remove children present in the CFG but not in the snapshot. + for (auto *Child : It->second.DI[0]) + llvm::erase_value(Res, Child); - VectRet UpdatedCFGChildren; - for (auto Child : CurrentCFGChildren) - if (Child && !ignoreChild(N, Child, InverseEdge)) - UpdatedCFGChildren.push_back(Child); + // Add children present in the snapshot but not in the real CFG. + auto &AddedChildren = It->second.DI[1]; + Res.insert(Res.end(), AddedChildren.begin(), AddedChildren.end()); - auto AddedCFGChildren = getAddedChildren(N, InverseEdge); - UpdatedCFGChildren.insert(UpdatedCFGChildren.end(), - AddedCFGChildren.begin(), AddedCFGChildren.end()); - return UpdatedCFGChildren; + return Res; } void print(raw_ostream &OS) const { OS << "===== GraphDiff: CFG edge changes to create a CFG snapshot.
\n" "===== (Note: notion of children/inverse_children depends on " "the direction of edges and the graph.)\n"; - OS << "Children to insert:\n\t"; - printMap(OS, Edges[1].Succ); - OS << "Children to delete:\n\t"; - printMap(OS, Edges[0].Succ); - OS << "Inverse_children to insert:\n\t"; - printMap(OS, Edges[1].Pred); - OS << "Inverse_children to delete:\n\t"; - printMap(OS, Edges[0].Pred); + OS << "Children to delete/insert:\n\t"; + printMap(OS, Succ); + OS << "Inverse_children to delete/insert:\n\t"; + printMap(OS, Pred); OS << "\n"; } From 930fc0b300b01890d3cafabfa85a8a50b2ca890e Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 27 Jul 2020 20:29:53 -0400 Subject: [PATCH 0262/1035] TableGen: Check if pattern outputs matches instruction defs Attempt to fix address sanitizer bots when building ARM. --- llvm/utils/TableGen/GlobalISelEmitter.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index a9ebf8f1beaf2..52fe9b2af2f08 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -4394,6 +4394,11 @@ Expected GlobalISelEmitter::importExplicitDefRenderers( DstMIBuilder.addRenderer(DstI->Operands[0].Name); + // Some instructions have multiple defs, but are missing a type entry + // (e.g. s_cc_out operands). + if (Dst->getExtTypes().size() < NumDefs) + return failedImport("unhandled discarded def"); + // Patterns only handle a single result, so any result after the first is an // implicitly dead def. for (unsigned I = 1; I < NumDefs; ++I) { From 6bf989b9474ace6a35021e6123d13b7fd59bf9f4 Mon Sep 17 00:00:00 2001 From: Xing GUO Date: Tue, 28 Jul 2020 09:53:59 +0800 Subject: [PATCH 0263/1035] [llvm-readelf] Fix emitting incorrect number of spaces in '--hex-dump'. This patch helps teach llvm-readelf to emit a correct number spaces when dumping in hex format. Before this patch, when the hex data doesn't fill the 4th column, some spaces are missing. ``` Hex dump of section '.sec': 0x00000000 00000000 00000000 00000000 00000000 ................ 0x00000010 00000000 00000000 00000000 0000 .............. ``` After this patch: ``` Hex dump of section '.sec': 0x00000000 00000000 00000000 00000000 00000000 ................ 0x00000010 00000000 00000000 00000000 0000 .............. ``` Reviewed By: grimar Differential Revision: https://reviews.llvm.org/D84640 --- .../test/tools/llvm-readobj/ELF/hex-dump.test | 43 ++++++++++++++++++- llvm/tools/llvm-readobj/ObjDumper.cpp | 5 ++- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/llvm/test/tools/llvm-readobj/ELF/hex-dump.test b/llvm/test/tools/llvm-readobj/ELF/hex-dump.test index b1f54009110e2..5954153c63451 100644 --- a/llvm/test/tools/llvm-readobj/ELF/hex-dump.test +++ b/llvm/test/tools/llvm-readobj/ELF/hex-dump.test @@ -1,6 +1,6 @@ ## Test that the -x alias can be used flexibly. Create a baseline and ensure ## all other combinations are identical. -# RUN: yaml2obj %s -o %t +# RUN: yaml2obj --docnum=1 %s -o %t # RUN: llvm-readelf --file-header --hex-dump=.shstrtab %t > %t.hexdump.out # RUN: llvm-readelf -h --hex-dump .shstrtab %t > %t.hexdump.1 # RUN: llvm-readelf -h -x .shstrtab %t > %t.hexdump.2 @@ -48,3 +48,44 @@ FileHeader: Data: ELFDATA2LSB Type: ET_DYN Machine: EM_386 + +## Test that llvm-readelf emits a correct amount of spaces between the hex data +## and its ascii representation. + +## a) When the hex data doesn't fill the column whose index isn't 4. 
+# RUN: yaml2obj --docnum=2 -DSIZE=18 %s -o %t2.out1 +# RUN: llvm-readelf --hex-dump=.sec %t2.out1 | \ +# RUN: FileCheck %s --match-full-lines --strict-whitespace --check-prefix=SPACES1 + +# SPACES1:Hex dump of section '.sec': +# SPACES1-NEXT:0x00000000 00000000 00000000 00000000 00000000 ................ +# SPACES1-NEXT:0x00000010 0000 .. + +## b) When the hex data doesn't fill the column whose index is 4. +# RUN: yaml2obj --docnum=2 -DSIZE=30 %s -o %t2.out2 +# RUN: llvm-readelf --hex-dump=.sec %t2.out2 | \ +# RUN: FileCheck %s --match-full-lines --strict-whitespace --check-prefix=SPACES2 + +# SPACES2:Hex dump of section '.sec': +# SPACES2-NEXT:0x00000000 00000000 00000000 00000000 00000000 ................ +# SPACES2-NEXT:0x00000010 00000000 00000000 00000000 0000 .............. + +## c) When the hex data fills the column. +# RUN: yaml2obj --docnum=2 -DSIZE=28 %s -o %t2.out3 +# RUN: llvm-readelf --hex-dump=.sec %t2.out3 | \ +# RUN: FileCheck %s --match-full-lines --strict-whitespace --check-prefix=SPACES3 + +# SPACES3:Hex dump of section '.sec': +# SPACES3-NEXT:0x00000000 00000000 00000000 00000000 00000000 ................ +# SPACES3-NEXT:0x00000010 00000000 00000000 00000000 ............ + +--- !ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_386 +Sections: + - Name: .sec + Type: SHT_PROGBITS + Size: [[SIZE]] diff --git a/llvm/tools/llvm-readobj/ObjDumper.cpp b/llvm/tools/llvm-readobj/ObjDumper.cpp index ce61f1c53a4dd..7b7c9553827f5 100644 --- a/llvm/tools/llvm-readobj/ObjDumper.cpp +++ b/llvm/tools/llvm-readobj/ObjDumper.cpp @@ -155,8 +155,9 @@ void ObjDumper::printSectionsAsHex(const object::ObjectFile *Obj, // Least, if we cut in a middle of a row, we add the remaining characters, // which is (8 - (k * 2)). if (i < 4) - W.startLine() << format("%*c", (4 - i) * 8 + (4 - i) + (8 - (k * 2)), - ' '); + W.startLine() << format("%*c", (4 - i) * 8 + (4 - i), ' '); + if (k < 4) + W.startLine() << format("%*c", 8 - k * 2, ' '); TmpSecPtr = SecPtr; for (i = 0; TmpSecPtr + i < SecEnd && i < 16; ++i) From 23d6525cbdc9de7cbfe7640d1e9e4f25a0c5dd85 Mon Sep 17 00:00:00 2001 From: Richard Smith Date: Mon, 27 Jul 2020 17:28:06 -0700 Subject: [PATCH 0264/1035] Don't form a 'context-independent expr' reference to a member during name annotation. Instead, defer forming the member access expression or DeclRefExpr until we build the use of ClassifyName's result. Just build an UnresolvedLookupExpr to track the LookupResult until we're ready to consume it. This also reverts commit 2f7269b6773de2750f9cd1417ef5f21cd6cf7a91 (other than its testcase). That change was an attempted workaround for the same problem. 
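For illustration, consider a minimal, hypothetical example (not taken from this patch's tests) of the kind of code this deferral is about. When the parser annotates the name `f` below, `f` denotes an overload set of class members, so no member access expression can be formed yet; under this change the classification is carried as an UnresolvedLookupExpr, and the implicit `(*this).f` access is built only when the surrounding call expression is actually formed:

```
struct S {
  void f(int);
  void f(double);
  void g() {
    // 'f' cannot be resolved to a single declaration at annotation time;
    // it stays an unresolved overload set until the call is built.
    f(1.0);
  }
};
```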
--- clang/include/clang/Basic/TokenKinds.def | 8 ++-- clang/include/clang/Sema/Sema.h | 25 +++++----- clang/lib/Parse/ParseDecl.cpp | 2 +- clang/lib/Parse/ParseExpr.cpp | 18 ++----- clang/lib/Parse/ParseTentative.cpp | 9 ---- clang/lib/Parse/Parser.cpp | 5 +- clang/lib/Sema/SemaDecl.cpp | 47 ++++++++++++++----- clang/lib/Sema/SemaExprMember.cpp | 12 ++--- .../test/SemaTemplate/member-access-expr.cpp | 11 +++++ 9 files changed, 74 insertions(+), 63 deletions(-) diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index 2b353269ed52d..daaa54c3db7c2 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -757,10 +757,10 @@ ANNOTATION(non_type_undeclared) // annotation for an undeclared identifier that // was assumed to be an ADL-only function name ANNOTATION(non_type_dependent) // annotation for an assumed non-type member of // a dependent base class -ANNOTATION(primary_expr) // annotation for a primary expression -ANNOTATION( - uneval_primary_expr) // annotation for a primary expression which should be - // transformed to potentially evaluated +ANNOTATION(overload_set) // annotation for an unresolved overload set +ANNOTATION(primary_expr) // annotation for a primary expression, used when + // tentatively parsing a lambda init-capture or ObjC + // message send ANNOTATION(decltype) // annotation for a decltype expression, // e.g., "decltype(foo.bar())" diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 0721720f79085..63e2d0d17fca2 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -2117,9 +2117,11 @@ class Sema final { /// resolved. ActOnNameClassifiedAsDependentNonType should be called to /// convert the result to an expression. NC_DependentNonType, - /// The name was classified as a non-type, and an expression representing - /// that name has been formed. - NC_ContextIndependentExpr, + /// The name was classified as an overload set, and an expression + /// representing that overload set has been formed. + /// ActOnNameClassifiedAsOverloadSet should be called to form a suitable + /// expression referencing the overload set. + NC_OverloadSet, /// The name was classified as a template whose specializations are types. NC_TypeTemplate, /// The name was classified as a variable template name. @@ -2156,8 +2158,8 @@ class Sema final { return NameClassification(NC_Unknown); } - static NameClassification ContextIndependentExpr(ExprResult E) { - NameClassification Result(NC_ContextIndependentExpr); + static NameClassification OverloadSet(ExprResult E) { + NameClassification Result(NC_OverloadSet); Result.Expr = E; return Result; } @@ -2209,7 +2211,7 @@ class Sema final { NameClassificationKind getKind() const { return Kind; } ExprResult getExpression() const { - assert(Kind == NC_ContextIndependentExpr); + assert(Kind == NC_OverloadSet); return Expr; } @@ -2289,6 +2291,8 @@ class Sema final { NamedDecl *Found, SourceLocation NameLoc, const Token &NextToken); + /// Act on the result of classifying a name as an overload set. + ExprResult ActOnNameClassifiedAsOverloadSet(Scope *S, Expr *OverloadSet); /// Describes the detailed kind of a template name. Used in diagnostics. 
enum class TemplateNameKindForDiagnostics { @@ -4846,11 +4850,10 @@ class Sema final { Expr *baseObjectExpr = nullptr, SourceLocation opLoc = SourceLocation()); - ExprResult BuildPossibleImplicitMemberExpr(const CXXScopeSpec &SS, - SourceLocation TemplateKWLoc, - LookupResult &R, - const TemplateArgumentListInfo *TemplateArgs, - const Scope *S); + ExprResult BuildPossibleImplicitMemberExpr( + const CXXScopeSpec &SS, SourceLocation TemplateKWLoc, LookupResult &R, + const TemplateArgumentListInfo *TemplateArgs, const Scope *S, + UnresolvedLookupExpr *AsULE = nullptr); ExprResult BuildImplicitMemberExpr(const CXXScopeSpec &SS, SourceLocation TemplateKWLoc, LookupResult &R, diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index c87d240a8206a..7b3a98edb3726 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -2837,7 +2837,7 @@ Parser::DiagnoseMissingSemiAfterTagDefinition(DeclSpec &DS, AccessSpecifier AS, case Sema::NC_Unknown: case Sema::NC_NonType: case Sema::NC_DependentNonType: - case Sema::NC_ContextIndependentExpr: + case Sema::NC_OverloadSet: case Sema::NC_VarTemplate: case Sema::NC_FunctionTemplate: case Sema::NC_Concept: diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index 81e87582c6ade..4f662f00e1df3 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -1007,23 +1007,11 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, Res = Actions.ActOnCXXNullPtrLiteral(ConsumeToken()); break; - case tok::annot_uneval_primary_expr: case tok::annot_primary_expr: + case tok::annot_overload_set: Res = getExprAnnotation(Tok); - if (SavedKind == tok::annot_uneval_primary_expr) { - if (Expr *E = Res.get()) { - if (!E->isTypeDependent() && !E->containsErrors()) { - // TransformToPotentiallyEvaluated expects that it will still be in a - // (temporary) unevaluated context and then looks through that context - // to build it in the surrounding context. So we need to push an - // unevaluated context to balance things out. - EnterExpressionEvaluationContext Unevaluated( - Actions, Sema::ExpressionEvaluationContext::Unevaluated, - Sema::ReuseLambdaContextDecl); - Res = Actions.TransformToPotentiallyEvaluated(Res.get()); - } - } - } + if (!Res.isInvalid() && Tok.getKind() == tok::annot_overload_set) + Res = Actions.ActOnNameClassifiedAsOverloadSet(getCurScope(), Res.get()); ConsumeAnnotationToken(); if (!Res.isInvalid() && Tok.is(tok::less)) checkPotentialAngleBracket(Res); diff --git a/clang/lib/Parse/ParseTentative.cpp b/clang/lib/Parse/ParseTentative.cpp index f026f3a1bfb29..d0f1d2e09a872 100644 --- a/clang/lib/Parse/ParseTentative.cpp +++ b/clang/lib/Parse/ParseTentative.cpp @@ -1276,15 +1276,6 @@ Parser::isCXXDeclarationSpecifier(Parser::TPResult BracedCastResult, // this is ambiguous. Typo-correct to type and expression keywords and // to types and identifiers, in order to try to recover from errors. TentativeParseCCC CCC(Next); - // Tentative parsing may not be done in the right evaluation context - // for the ultimate expression. Enter an unevaluated context to prevent - // Sema from immediately e.g. treating this lookup as a potential ODR-use. - // If we generate an expression annotation token and the parser actually - // claims it as an expression, we'll transform the expression to a - // potentially-evaluated one then. 
- EnterExpressionEvaluationContext Unevaluated( - Actions, Sema::ExpressionEvaluationContext::Unevaluated, - Sema::ReuseLambdaContextDecl); switch (TryAnnotateName(&CCC)) { case ANK_Error: return TPResult::Error; diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index 764d4e8e9d522..45cf855cf8c9b 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -1691,9 +1691,8 @@ Parser::TryAnnotateName(CorrectionCandidateCallback *CCC) { return ANK_Success; } - case Sema::NC_ContextIndependentExpr: - Tok.setKind(Actions.isUnevaluatedContext() ? tok::annot_uneval_primary_expr - : tok::annot_primary_expr); + case Sema::NC_OverloadSet: + Tok.setKind(tok::annot_overload_set); setExprAnnotation(Tok, Classification.getExpression()); Tok.setAnnotationEndLoc(NameLoc); if (SS.isNotEmpty()) diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 53f5132a46b65..869e4de02cc41 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -1184,23 +1184,20 @@ Sema::NameClassification Sema::ClassifyName(Scope *S, CXXScopeSpec &SS, return ParsedType::make(T); } - // FIXME: This is context-dependent. We need to defer building the member - // expression until the classification is consumed. - if (FirstDecl->isCXXClassMember()) - return NameClassification::ContextIndependentExpr( - BuildPossibleImplicitMemberExpr(SS, SourceLocation(), Result, nullptr, - S)); - // If we already know which single declaration is referenced, just annotate - // that declaration directly. + // that declaration directly. Defer resolving even non-overloaded class + // member accesses, as we need to defer certain access checks until we know + // the context. bool ADL = UseArgumentDependentLookup(SS, Result, NextToken.is(tok::l_paren)); - if (Result.isSingleResult() && !ADL) + if (Result.isSingleResult() && !ADL && !FirstDecl->isCXXClassMember()) return NameClassification::NonType(Result.getRepresentativeDecl()); - // Build an UnresolvedLookupExpr. Note that this doesn't depend on the - // context in which we performed classification, so it's safe to do now. - return NameClassification::ContextIndependentExpr( - BuildDeclarationNameExpr(SS, Result, ADL)); + // Otherwise, this is an overload set that we will need to resolve later. + Result.suppressDiagnostics(); + return NameClassification::OverloadSet(UnresolvedLookupExpr::Create( + Context, Result.getNamingClass(), SS.getWithLocInContext(Context), + Result.getLookupNameInfo(), ADL, Result.isOverloadedResult(), + Result.begin(), Result.end())); } ExprResult @@ -1240,6 +1237,30 @@ ExprResult Sema::ActOnNameClassifiedAsNonType(Scope *S, const CXXScopeSpec &SS, return BuildDeclarationNameExpr(SS, Result, ADL); } +ExprResult Sema::ActOnNameClassifiedAsOverloadSet(Scope *S, Expr *E) { + // For an implicit class member access, transform the result into a member + // access expression if necessary. + auto *ULE = cast(E); + if ((*ULE->decls_begin())->isCXXClassMember()) { + CXXScopeSpec SS; + SS.Adopt(ULE->getQualifierLoc()); + + // Reconstruct the lookup result. + LookupResult Result(*this, ULE->getName(), ULE->getNameLoc(), + LookupOrdinaryName); + Result.setNamingClass(ULE->getNamingClass()); + for (auto I = ULE->decls_begin(), E = ULE->decls_end(); I != E; ++I) + Result.addDecl(*I, I.getAccess()); + Result.resolveKind(); + return BuildPossibleImplicitMemberExpr(SS, SourceLocation(), Result, + nullptr, S); + } + + // Otherwise, this is already in the form we needed, and no further checks + // are necessary. 
+ return ULE; +} + Sema::TemplateNameKindForDiagnostics Sema::getTemplateNameKindForDiagnostics(TemplateName Name) { auto *TD = Name.getAsTemplateDecl(); diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp index ebfc1ec4b9749..466d1fe59c715 100644 --- a/clang/lib/Sema/SemaExprMember.cpp +++ b/clang/lib/Sema/SemaExprMember.cpp @@ -231,12 +231,10 @@ static void diagnoseInstanceReference(Sema &SemaRef, } /// Builds an expression which might be an implicit member expression. -ExprResult -Sema::BuildPossibleImplicitMemberExpr(const CXXScopeSpec &SS, - SourceLocation TemplateKWLoc, - LookupResult &R, - const TemplateArgumentListInfo *TemplateArgs, - const Scope *S) { +ExprResult Sema::BuildPossibleImplicitMemberExpr( + const CXXScopeSpec &SS, SourceLocation TemplateKWLoc, LookupResult &R, + const TemplateArgumentListInfo *TemplateArgs, const Scope *S, + UnresolvedLookupExpr *AsULE) { switch (ClassifyImplicitMemberAccess(*this, R)) { case IMA_Instance: return BuildImplicitMemberExpr(SS, TemplateKWLoc, R, TemplateArgs, true, S); @@ -257,7 +255,7 @@ Sema::BuildPossibleImplicitMemberExpr(const CXXScopeSpec &SS, case IMA_Unresolved_StaticContext: if (TemplateArgs || TemplateKWLoc.isValid()) return BuildTemplateIdExpr(SS, TemplateKWLoc, R, false, TemplateArgs); - return BuildDeclarationNameExpr(SS, R, false); + return AsULE ? AsULE : BuildDeclarationNameExpr(SS, R, false); case IMA_Error_StaticContext: case IMA_Error_Unrelated: diff --git a/clang/test/SemaTemplate/member-access-expr.cpp b/clang/test/SemaTemplate/member-access-expr.cpp index 36d6022577289..d6627b954a287 100644 --- a/clang/test/SemaTemplate/member-access-expr.cpp +++ b/clang/test/SemaTemplate/member-access-expr.cpp @@ -160,3 +160,14 @@ namespace test6 { } }; } + +namespace test7 { + struct C { void g(); }; + template struct A { + T x; + static void f() { + (x.g()); // expected-error {{invalid use of member 'x' in static member function}} + } + }; + void h() { A::f(); } +} From a23f62343cb79a3306fa64545db1d61c2d76b9ca Mon Sep 17 00:00:00 2001 From: Wei Mi Date: Wed, 8 Jul 2020 15:19:44 -0700 Subject: [PATCH 0265/1035] Supplement instr profile with sample profile. PGO profile is usually more precise than sample profile. However, PGO profile needs to be collected from a loadtest, and the loadtest may not be representative enough of the production workload. Sample profile collected from production can be used as a supplement -- for functions cold in loadtest but warm/hot in production, we can scale up the related function in PGO profile if the function is warm or hot in sample profile. The implementation contains changes on the compiler side and the llvm-profdata side. Given an instr profile and a sample profile, for a function cold in PGO profile but warm/hot in sample profile, llvm-profdata will either mark all the counters in the profile as -1 or scale up the max count in the function to be above the hot threshold, depending on the zero-counter ratio in the profile. The assumption is that if too many counters in the function profile are zero, the profile is more likely to cause harm than good, so llvm-profdata will mark all the counters as -1, indicating that the function is hot but the profile is unaccountable. On the compiler side, if a function profile with all -1 counters is seen, the function entry count will be set above the hot threshold but its internal profile will be dropped.
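As an illustration of the adjustment rule above, here is a simplified sketch (hypothetical code, not part of this patch; the helper name adjustCounts is invented, and the real logic lives in updateInstrProfileEntry in llvm-profdata.cpp below, which additionally uses saturating multiplication when scaling):

```
#include <cstdint>
#include <vector>

// Drop (all counters -> -1) or scale the counters of a function that is
// cold in the instr profile but warm/hot in the sample profile.
void adjustCounts(std::vector<uint64_t> &Counts, uint64_t MaxCount,
                  float ZeroCounterRatio, float ZeroCounterThreshold,
                  uint64_t HotThreshold) {
  if (MaxCount == 0 || ZeroCounterRatio > ZeroCounterThreshold) {
    // Too many zero counters: mark the profile unaccountable. The compiler
    // then treats the function as hot but drops its internal counters.
    for (uint64_t &C : Counts)
      C = (uint64_t)-1;
    return;
  }
  // Otherwise scale the counts so the max lands a few times above the hot
  // threshold (overflow handling omitted in this sketch).
  const unsigned MultiplyFactor = 3;
  for (uint64_t &C : Counts)
    C = C * (HotThreshold * MultiplyFactor) / MaxCount;
}
```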
In the long run, it may be useful to let the compiler support using PGO profile and sample profile at the same time, but that requires more careful design and more substantial changes to make two profiles work seamlessly. The patch here serves as a simple intermediate solution. Differential Revision: https://reviews.llvm.org/D81981 --- llvm/docs/CommandGuide/llvm-profdata.rst | 24 +++ llvm/include/llvm/ProfileData/InstrProf.h | 12 +- .../llvm/ProfileData/InstrProfWriter.h | 2 + llvm/lib/ProfileData/InstrProf.cpp | 15 +- llvm/lib/ProfileData/InstrProfWriter.cpp | 2 +- .../lib/ProfileData/ProfileSummaryBuilder.cpp | 11 +- .../Instrumentation/PGOInstrumentation.cpp | 25 ++- .../PGOProfile/Inputs/sample-profile.proftext | 12 ++ .../PGOProfile/Inputs/suppl-profile.proftext | 15 ++ .../Transforms/PGOProfile/suppl-profile.ll | 37 ++++ .../llvm-profdata/Inputs/mix_instr.proftext | 25 +++ .../llvm-profdata/Inputs/mix_sample.proftext | 17 ++ .../tools/llvm-profdata/overflow-instr.test | 14 +- .../suppl-instr-with-sample.test | 102 +++++++++ llvm/tools/llvm-profdata/llvm-profdata.cpp | 201 ++++++++++++++++++ 15 files changed, 488 insertions(+), 26 deletions(-) create mode 100644 llvm/test/Transforms/PGOProfile/Inputs/sample-profile.proftext create mode 100644 llvm/test/Transforms/PGOProfile/Inputs/suppl-profile.proftext create mode 100644 llvm/test/Transforms/PGOProfile/suppl-profile.ll create mode 100644 llvm/test/tools/llvm-profdata/Inputs/mix_instr.proftext create mode 100644 llvm/test/tools/llvm-profdata/Inputs/mix_sample.proftext create mode 100644 llvm/test/tools/llvm-profdata/suppl-instr-with-sample.test diff --git a/llvm/docs/CommandGuide/llvm-profdata.rst b/llvm/docs/CommandGuide/llvm-profdata.rst index 13a66dc48cef4..647232020e4b1 100644 --- a/llvm/docs/CommandGuide/llvm-profdata.rst +++ b/llvm/docs/CommandGuide/llvm-profdata.rst @@ -161,6 +161,30 @@ OPTIONS coverage for the optimized target. This option can only be used with sample-based profile in extbinary format. +.. option:: -supplement-instr-with-sample=path_to_sample_profile + + Supplement an instrumentation profile with sample profile. The sample profile + is the input of the flag. Output will be in instrumentation format (only works + with -instr). + +.. option:: -zero-counter-threshold=threshold_float_number + + For a function that is cold in instr profile but hot in sample profile, if + the ratio of the number of zero counters divided by the total number of + counters is above the threshold, the profile of the function will be regarded + as being harmful for performance and will be dropped. + +.. option:: -instr-prof-cold-threshold=threshold_int_number + + User-specified cold threshold for instr profile which will override the cold + threshold obtained from profile summary. + +.. option:: -suppl-min-size-threshold=threshold_int_number + + If the size of a function is smaller than the threshold, assume it can be + inlined by the PGO early inliner and its profile will not be adjusted based + on sample profile. + EXAMPLES ^^^^^^^^ Basic Usage diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index a3359ca901339..50c4857537812 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -678,8 +678,8 @@ struct InstrProfValueSiteRecord { /// Optionally scale merged counts by \p Weight. void merge(InstrProfValueSiteRecord &Input, uint64_t Weight, function_ref Warn); - /// Scale up value profile data counts.
- void scale(uint64_t Weight, function_ref Warn); + /// Scale up value profile data counts by N (Numerator) / D (Denominator). + void scale(uint64_t N, uint64_t D, function_ref Warn); /// Compute the overlap b/w this record and Input record. void overlap(InstrProfValueSiteRecord &Input, uint32_t ValueKind, @@ -753,8 +753,8 @@ struct InstrProfRecord { function_ref Warn); /// Scale up profile counts (including value profile data) by - /// \p Weight. - void scale(uint64_t Weight, function_ref Warn); + /// a factor of (N / D). + void scale(uint64_t N, uint64_t D, function_ref Warn); /// Sort value profile data (per site) by count. void sortValueData() { @@ -839,8 +839,8 @@ struct InstrProfRecord { uint64_t Weight, function_ref Warn); - // Scale up value profile data count. - void scaleValueProfData(uint32_t ValueKind, uint64_t Weight, + // Scale up value profile data count by N (Numerator) / D (Denominator). + void scaleValueProfData(uint32_t ValueKind, uint64_t N, uint64_t D, function_ref Warn); }; diff --git a/llvm/include/llvm/ProfileData/InstrProfWriter.h b/llvm/include/llvm/ProfileData/InstrProfWriter.h index 2d69bba26a29c..35c2669d55a69 100644 --- a/llvm/include/llvm/ProfileData/InstrProfWriter.h +++ b/llvm/include/llvm/ProfileData/InstrProfWriter.h @@ -48,6 +48,8 @@ class InstrProfWriter { InstrProfWriter(bool Sparse = false, bool InstrEntryBBEnabled = false); ~InstrProfWriter(); + StringMap &getProfileData() { return FunctionData; } + /// Add function counts for the given function. If there are already counts /// for this function and the hash and number of counts match, each counter is /// summed. Optionally scale counts by \p Weight. diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 9b429bf37d742..fb788ef4c7655 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -625,11 +625,11 @@ void InstrProfValueSiteRecord::merge(InstrProfValueSiteRecord &Input, } } -void InstrProfValueSiteRecord::scale(uint64_t Weight, +void InstrProfValueSiteRecord::scale(uint64_t N, uint64_t D, function_ref Warn) { for (auto I = ValueData.begin(), IE = ValueData.end(); I != IE; ++I) { bool Overflowed; - I->Count = SaturatingMultiply(I->Count, Weight, &Overflowed); + I->Count = SaturatingMultiply(I->Count, N, &Overflowed) / D; if (Overflowed) Warn(instrprof_error::counter_overflow); } @@ -678,22 +678,23 @@ void InstrProfRecord::merge(InstrProfRecord &Other, uint64_t Weight, } void InstrProfRecord::scaleValueProfData( - uint32_t ValueKind, uint64_t Weight, + uint32_t ValueKind, uint64_t N, uint64_t D, function_ref Warn) { for (auto &R : getValueSitesForKind(ValueKind)) - R.scale(Weight, Warn); + R.scale(N, D, Warn); } -void InstrProfRecord::scale(uint64_t Weight, +void InstrProfRecord::scale(uint64_t N, uint64_t D, function_ref Warn) { + assert(D != 0 && "D cannot be 0"); for (auto &Count : this->Counts) { bool Overflowed; - Count = SaturatingMultiply(Count, Weight, &Overflowed); + Count = SaturatingMultiply(Count, N, &Overflowed) / D; if (Overflowed) Warn(instrprof_error::counter_overflow); } for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) - scaleValueProfData(Kind, Weight, Warn); + scaleValueProfData(Kind, N, D, Warn); } // Map indirect call target name hash to name string. 
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index 88445f186e835..d07668322354e 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -241,7 +241,7 @@ void InstrProfWriter::addRecord(StringRef Name, uint64_t Hash, // We've never seen a function with this name and hash, add it. Dest = std::move(I); if (Weight > 1) - Dest.scale(Weight, MapWarn); + Dest.scale(Weight, 1, MapWarn); } else { // We're updating a function we've seen before. Dest.merge(I, Weight, MapWarn); diff --git a/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp b/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp index 5d3a076409427..d2603097c550b 100644 --- a/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp +++ b/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp @@ -119,13 +119,22 @@ std::unique_ptr InstrProfSummaryBuilder::getSummary() { } void InstrProfSummaryBuilder::addEntryCount(uint64_t Count) { - addCount(Count); NumFunctions++; + + // Skip invalid count. + if (Count == (uint64_t)-1) + return; + + addCount(Count); if (Count > MaxFunctionCount) MaxFunctionCount = Count; } void InstrProfSummaryBuilder::addInternalCount(uint64_t Count) { + // Skip invalid count. + if (Count == (uint64_t)-1) + return; + addCount(Count); if (Count > MaxInternalBlockCount) MaxInternalBlockCount = Count; diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index c4a43abaa53cc..7a14f777b565a 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -1020,7 +1020,8 @@ class PGOUseFunc { FreqAttr(FFA_Normal), IsCS(IsCS) {} // Read counts for the instrumented BB from profile. - bool readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros); + bool readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros, + bool &AllMinusOnes); // Populate the counts for all BBs. void populateCounters(); @@ -1203,7 +1204,8 @@ void PGOUseFunc::setEdgeCount(DirectEdges &Edges, uint64_t Value) { // Read the profile from ProfileFileName and assign the value to the // instrumented BB and the edges. This function also updates ProgramMaxCount. // Return true if the profile are successfully read, and false on errors. -bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros) { +bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros, + bool &AllMinusOnes) { auto &Ctx = M->getContext(); Expected Result = PGOReader->getInstrProfRecord(FuncInfo.FuncName, FuncInfo.FunctionHash); @@ -1246,10 +1248,13 @@ bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros) IsCS ? NumOfCSPGOFunc++ : NumOfPGOFunc++; LLVM_DEBUG(dbgs() << CountFromProfile.size() << " counts\n"); + AllMinusOnes = (CountFromProfile.size() > 0); uint64_t ValueSum = 0; for (unsigned I = 0, S = CountFromProfile.size(); I < S; I++) { LLVM_DEBUG(dbgs() << " " << I << ": " << CountFromProfile[I] << "\n"); ValueSum += CountFromProfile[I]; + if (CountFromProfile[I] != (uint64_t)-1) + AllMinusOnes = false; } AllZeros = (ValueSum == 0); @@ -1657,8 +1662,13 @@ static bool annotateAllFunctions( SplitIndirectBrCriticalEdges(F, BPI, BFI); PGOUseFunc Func(F, &M, TLI, ComdatMembers, BPI, BFI, PSI, IsCS, InstrumentFuncEntry); + // When AllMinusOnes is true, it means the profile for the function + // is unrepresentative and this function is actually hot. 
Set the + // entry count of the function to be multiple times of hot threshold + // and drop all its internal counters. + bool AllMinusOnes = false; bool AllZeros = false; - if (!Func.readCounters(PGOReader.get(), AllZeros)) + if (!Func.readCounters(PGOReader.get(), AllZeros, AllMinusOnes)) continue; if (AllZeros) { F.setEntryCount(ProfileCount(0, Function::PCT_Real)); @@ -1666,6 +1676,15 @@ static bool annotateAllFunctions( ColdFunctions.push_back(&F); continue; } + const unsigned MultiplyFactor = 3; + if (AllMinusOnes) { + uint64_t HotThreshold = PSI->getHotCountThreshold(); + if (HotThreshold) + F.setEntryCount( + ProfileCount(HotThreshold * MultiplyFactor, Function::PCT_Real)); + HotFunctions.push_back(&F); + continue; + } Func.populateCounters(); Func.setBranchWeights(); Func.annotateValueSites(); diff --git a/llvm/test/Transforms/PGOProfile/Inputs/sample-profile.proftext b/llvm/test/Transforms/PGOProfile/Inputs/sample-profile.proftext new file mode 100644 index 0000000000000..0ab7207783eba --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/Inputs/sample-profile.proftext @@ -0,0 +1,12 @@ +test_simple_for:4000:4000 + 1: 1000 + 2: 1000 + 3: 1000 + 4: 1000 + +moo:10:10 + 1: 2 + 2: 2 + 3: 2 + 4: 2 + 5: 2 diff --git a/llvm/test/Transforms/PGOProfile/Inputs/suppl-profile.proftext b/llvm/test/Transforms/PGOProfile/Inputs/suppl-profile.proftext new file mode 100644 index 0000000000000..c82311b3e0c06 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/Inputs/suppl-profile.proftext @@ -0,0 +1,15 @@ +# :ir is the flag to indicate this is IR level profile. +:ir +test_simple_for +34137660316 +2 +0 +0 + +foo +2582734 +4 +1000 +270 +180 +760 diff --git a/llvm/test/Transforms/PGOProfile/suppl-profile.ll b/llvm/test/Transforms/PGOProfile/suppl-profile.ll new file mode 100644 index 0000000000000..e47838883dc6d --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/suppl-profile.ll @@ -0,0 +1,37 @@ +; Supplement instr profile suppl-profile.proftext with sample profile +; sample-profile.proftext. +; RUN: llvm-profdata merge -instr -suppl-min-size-threshold=0 \ +; RUN: -supplement-instr-with-sample=%p/Inputs/sample-profile.proftext \ +; RUN: %S/Inputs/suppl-profile.proftext -o %t.profdata +; RUN: opt < %s -pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s +; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Check test_simple_for has a non-zero entry count and doesn't have any other +; prof metadata. +; CHECK: @test_simple_for(i32 %n) {{.*}} !prof ![[ENTRY_COUNT:[0-9]+]] +; CHECK-NOT: !prof ! 
+; CHECK: ![[ENTRY_COUNT]] = !{!"function_entry_count", i64 540} +define i32 @test_simple_for(i32 %n) { +entry: + br label %for.cond + +for.cond: + %i = phi i32 [ 0, %entry ], [ %inc1, %for.inc ] + %sum = phi i32 [ 1, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i, %n + br i1 %cmp, label %for.body, label %for.end + +for.body: + %inc = add nsw i32 %sum, 1 + br label %for.inc + +for.inc: + %inc1 = add nsw i32 %i, 1 + br label %for.cond + +for.end: + ret i32 %sum +} diff --git a/llvm/test/tools/llvm-profdata/Inputs/mix_instr.proftext b/llvm/test/tools/llvm-profdata/Inputs/mix_instr.proftext new file mode 100644 index 0000000000000..d7059e8c4cd7e --- /dev/null +++ b/llvm/test/tools/llvm-profdata/Inputs/mix_instr.proftext @@ -0,0 +1,25 @@ +:ir +foo +7 +5 +12 +13 +0 +0 +0 + +goo +5 +3 +0 +0 +0 + +moo +9 +4 +3000 +1000 +2000 +500 + diff --git a/llvm/test/tools/llvm-profdata/Inputs/mix_sample.proftext b/llvm/test/tools/llvm-profdata/Inputs/mix_sample.proftext new file mode 100644 index 0000000000000..f61ec7fd5ede7 --- /dev/null +++ b/llvm/test/tools/llvm-profdata/Inputs/mix_sample.proftext @@ -0,0 +1,17 @@ +foo:2000:2000 + 1: 2000 +goo:3000:1500 + 1: 1200 + 2: 800 + 3: 1000 +moo:1000:1000 + 1: 1000 +hoo:50:1 + 1: 1 + 2: 2 + 3: 3 + 4: 4 + 5: 5 + 6: 6 + 7: 7 + 8: 8 diff --git a/llvm/test/tools/llvm-profdata/overflow-instr.test b/llvm/test/tools/llvm-profdata/overflow-instr.test index 5b9a94af9b294..73acbd937dd3b 100644 --- a/llvm/test/tools/llvm-profdata/overflow-instr.test +++ b/llvm/test/tools/llvm-profdata/overflow-instr.test @@ -2,16 +2,14 @@ Tests for overflow when merging instrumented profiles. 1- Merge profile having maximum counts with itself and verify overflow detected and saturation occurred RUN: llvm-profdata merge -instr %p/Inputs/overflow-instr.proftext %p/Inputs/overflow-instr.proftext -o %t.out 2>&1 | FileCheck %s -check-prefix=MERGE_OVERFLOW -RUN: llvm-profdata show -instr %t.out | FileCheck %s --check-prefix=SHOW_OVERFLOW +RUN: llvm-profdata show -instr -all-functions -counts %t.out | FileCheck %s --check-prefix=SHOW_OVERFLOW MERGE_OVERFLOW: {{.*}}: overflow: Counter overflow -SHOW_OVERFLOW: Total functions: 1 -SHOW_OVERFLOW-NEXT: Maximum function count: 18446744073709551615 -SHOW_OVERFLOW-NEXT: Maximum internal block count: 18446744073709551615 +SHOW_OVERFLOW: Function count: 18446744073709551615 +SHOW_OVERFLOW-NEXT: Block counts: [18446744073709551615, 18446744073709551615] 2- Merge profile having maximum counts by itself and verify no overflow RUN: llvm-profdata merge -instr %p/Inputs/overflow-instr.proftext -o %t.out 2>&1 | FileCheck %s -check-prefix=MERGE_NO_OVERFLOW -allow-empty -RUN: llvm-profdata show -instr %t.out | FileCheck %s --check-prefix=SHOW_NO_OVERFLOW +RUN: llvm-profdata show -instr -all-functions -counts %t.out | FileCheck %s --check-prefix=SHOW_NO_OVERFLOW MERGE_NO_OVERFLOW-NOT: {{.*}}: overflow: Counter overflow -SHOW_NO_OVERFLOW: Total functions: 1 -SHOW_NO_OVERFLOW-NEXT: Maximum function count: 18446744073709551615 -SHOW_NO_OVERFLOW-NEXT: Maximum internal block count: 18446744073709551615 +SHOW_NO_OVERFLOW: Function count: 18446744073709551615 +SHOW_NO_OVERFLOW-NEXT: Block counts: [9223372036854775808, 18446744073709551615] diff --git a/llvm/test/tools/llvm-profdata/suppl-instr-with-sample.test b/llvm/test/tools/llvm-profdata/suppl-instr-with-sample.test new file mode 100644 index 0000000000000..29d3c7c66b0f3 --- /dev/null +++ b/llvm/test/tools/llvm-profdata/suppl-instr-with-sample.test @@ -0,0 +1,102 @@ +Some basic tests for supplementing 
instrumentation profile with sample profile. + +Test all of goo's counters will be set to -1. +RUN: llvm-profdata merge \ +RUN: -supplement-instr-with-sample=%p/Inputs/mix_sample.proftext \ +RUN: -suppl-min-size-threshold=0 %p/Inputs/mix_instr.proftext -o %t +RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=MIX1 + +MIX1: foo: +MIX1-NEXT: Hash: 0x0000000000000007 +MIX1-NEXT: Counters: 5 +MIX1-NEXT: Block counts: [12, 13, 0, 0, 0] +MIX1: goo: +MIX1-NEXT: Hash: 0x0000000000000005 +MIX1-NEXT: Counters: 3 +MIX1-NEXT: Block counts: [18446744073709551615, 18446744073709551615, 18446744073709551615] +MIX1: moo: +MIX1-NEXT: Hash: 0x0000000000000009 +MIX1-NEXT: Counters: 4 +MIX1-NEXT: Block counts: [3000, 1000, 2000, 500] + +Test when the zero counter ratio of foo is higher than zero-counter-threshold. +RUN: llvm-profdata merge \ +RUN: -supplement-instr-with-sample=%p/Inputs/mix_sample.proftext \ +RUN: -suppl-min-size-threshold=0 -zero-counter-threshold=0.5 \ +RUN: -instr-prof-cold-threshold=30 %p/Inputs/mix_instr.proftext -o %t +RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=MIX2 + +MIX2: foo: +MIX2-NEXT: Hash: 0x0000000000000007 +MIX2-NEXT: Counters: 5 +MIX2-NEXT: Block counts: [18446744073709551615, 18446744073709551615, 18446744073709551615, 18446744073709551615, 18446744073709551615] +MIX2: goo: +MIX2-NEXT: Hash: 0x0000000000000005 +MIX2-NEXT: Counters: 3 +MIX2-NEXT: Block counts: [18446744073709551615, 18446744073709551615, 18446744073709551615] +MIX2: moo: +MIX2-NEXT: Hash: 0x0000000000000009 +MIX2-NEXT: Counters: 4 +MIX2-NEXT: Block counts: [3000, 1000, 2000, 500] + +Test when the zero counter ratio of foo is lower than zero-counter-threshold. +RUN: llvm-profdata merge \ +RUN: -supplement-instr-with-sample=%p/Inputs/mix_sample.proftext \ +RUN: -suppl-min-size-threshold=0 -zero-counter-threshold=0.7 \ +RUN: -instr-prof-cold-threshold=30 %p/Inputs/mix_instr.proftext -o %t +RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=MIX3 + +MIX3: foo: +MIX3-NEXT: Hash: 0x0000000000000007 +MIX3-NEXT: Counters: 5 +MIX3-NEXT: Block counts: [1384, 1500, 0, 0, 0] +MIX3: goo: +MIX3-NEXT: Hash: 0x0000000000000005 +MIX3-NEXT: Counters: 3 +MIX3-NEXT: Block counts: [18446744073709551615, 18446744073709551615, 18446744073709551615] +MIX3: moo: +MIX3-NEXT: Hash: 0x0000000000000009 +MIX3-NEXT: Counters: 4 +MIX3-NEXT: Block counts: [3000, 1000, 2000, 500] + +Test foo's profile won't be adjusted because its size is smaller +than suppl-min-size-threshold. +RUN: llvm-profdata merge \ +RUN: -supplement-instr-with-sample=%p/Inputs/mix_sample.proftext \ +RUN: -suppl-min-size-threshold=2 -zero-counter-threshold=0.7 \ +RUN: -instr-prof-cold-threshold=30 %p/Inputs/mix_instr.proftext -o %t +RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=MIX4 + +MIX4: foo: +MIX4-NEXT: Hash: 0x0000000000000007 +MIX4-NEXT: Counters: 5 +MIX4-NEXT: Block counts: [12, 13, 0, 0, 0] +MIX4: goo: +MIX4-NEXT: Hash: 0x0000000000000005 +MIX4-NEXT: Counters: 3 +MIX4-NEXT: Block counts: [18446744073709551615, 18446744073709551615, 18446744073709551615] +MIX4: moo: +MIX4-NEXT: Hash: 0x0000000000000009 +MIX4-NEXT: Counters: 4 +MIX4-NEXT: Block counts: [3000, 1000, 2000, 500] + +Test profile summary won't be affected by -1 counter. 
+RUN: llvm-profdata merge \
+RUN:     -supplement-instr-with-sample=%p/Inputs/mix_sample.proftext \
+RUN:     -suppl-min-size-threshold=0 %p/Inputs/mix_instr.proftext -o %t
+RUN: llvm-profdata show %t -detailed-summary | FileCheck %s --check-prefix=MIX5
+
+MIX5: Instrumentation level: IR
+MIX5-NEXT: Total functions: 3
+MIX5-NEXT: Maximum function count: 3000
+MIX5-NEXT: Maximum internal block count: 2000
+MIX5-NEXT: Total number of blocks: 9
+MIX5-NEXT: Total count: 6525
+MIX5-NEXT: Detailed summary:
+MIX5-NEXT: 3 blocks with count >= 1000 account for 80 percentage of the total counts.
+MIX5-NEXT: 3 blocks with count >= 1000 account for 90 percentage of the total counts.
+MIX5-NEXT: 4 blocks with count >= 500 account for 95 percentage of the total counts.
+MIX5-NEXT: 4 blocks with count >= 500 account for 99 percentage of the total counts.
+MIX5-NEXT: 6 blocks with count >= 12 account for 99.9 percentage of the total counts.
+MIX5-NEXT: 6 blocks with count >= 12 account for 99.99 percentage of the total counts.
+MIX5-NEXT: 6 blocks with count >= 12 account for 99.999 percentage of the total counts.
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 41f6a4d723eec..771aec89720eb 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -386,6 +386,172 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs,
   writeInstrProfile(OutputFilename, OutputFormat, Contexts[0]->Writer);
 }
 
+/// The profile entry for a function in instrumentation profile.
+struct InstrProfileEntry {
+  uint64_t MaxCount = 0;
+  float ZeroCounterRatio = 0.0;
+  InstrProfRecord *ProfRecord;
+  InstrProfileEntry(InstrProfRecord *Record);
+  InstrProfileEntry() = default;
+};
+
+InstrProfileEntry::InstrProfileEntry(InstrProfRecord *Record) {
+  ProfRecord = Record;
+  uint64_t CntNum = Record->Counts.size();
+  uint64_t ZeroCntNum = 0;
+  for (size_t I = 0; I < CntNum; ++I) {
+    MaxCount = std::max(MaxCount, Record->Counts[I]);
+    ZeroCntNum += !Record->Counts[I];
+  }
+  ZeroCounterRatio = (float)ZeroCntNum / CntNum;
+}
+
+/// Either set all the counters in the instr profile entry \p IFE to -1
+/// in order to drop the profile, or scale up the counters in \p IFE to
+/// be above hot threshold. We use the ratio of zero counters in the
+/// profile of a function to decide whether the profile is helpful or
+/// harmful for performance, and to choose whether to scale up or drop it.
+static void updateInstrProfileEntry(InstrProfileEntry &IFE,
+                                    uint64_t HotInstrThreshold,
+                                    float ZeroCounterThreshold) {
+  InstrProfRecord *ProfRecord = IFE.ProfRecord;
+  if (!IFE.MaxCount || IFE.ZeroCounterRatio > ZeroCounterThreshold) {
+    // If all or most of the counters of the function are zero, the
+    // profile is unaccountable and should be dropped. Reset all the
+    // counters to be -1 and PGO profile-use will drop the profile.
+    // All counters being -1 also implies that the function is hot so
+    // PGO profile-use will also set the entry count metadata to be
+    // above hot threshold.
+    for (size_t I = 0; I < ProfRecord->Counts.size(); ++I)
+      ProfRecord->Counts[I] = -1;
+    return;
+  }
+
+  // Scale up the MaxCount to be multiple times above hot threshold.
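+  // (Illustrative numbers, not from this patch: with HotInstrThreshold = 100
+  // and IFE.MaxCount = 50, the scale below multiplies every counter by
+  // 300/50 = 6, so the hottest counter ends up at 3x the hot threshold.)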
+  const unsigned MultiplyFactor = 3;
+  uint64_t Numerator = HotInstrThreshold * MultiplyFactor;
+  uint64_t Denominator = IFE.MaxCount;
+  ProfRecord->scale(Numerator, Denominator, [&](instrprof_error E) {
+    warn(toString(make_error<InstrProfError>(E)));
+  });
+}
+
+const uint64_t ColdPercentileIdx = 15;
+const uint64_t HotPercentileIdx = 11;
+
+/// Adjust the instr profile in \p WC based on the sample profile in
+/// \p Reader.
+static void
+adjustInstrProfile(std::unique_ptr<WriterContext> &WC,
+                   std::unique_ptr<sampleprof::SampleProfileReader> &Reader,
+                   unsigned SupplMinSizeThreshold, float ZeroCounterThreshold,
+                   unsigned InstrProfColdThreshold) {
+  // Map from a function name to its entry in the instr profile.
+  StringMap<InstrProfileEntry> InstrProfileMap;
+  InstrProfSummaryBuilder IPBuilder(ProfileSummaryBuilder::DefaultCutoffs);
+  for (auto &PD : WC->Writer.getProfileData()) {
+    // Populate IPBuilder.
+    for (const auto &PDV : PD.getValue()) {
+      InstrProfRecord Record = PDV.second;
+      IPBuilder.addRecord(Record);
+    }
+
+    // If a function has multiple entries in instr profile, skip it.
+    if (PD.getValue().size() != 1)
+      continue;
+
+    // Initialize InstrProfileMap.
+    InstrProfRecord *R = &PD.getValue().begin()->second;
+    InstrProfileMap[PD.getKey()] = InstrProfileEntry(R);
+  }
+
+  ProfileSummary InstrPS = *IPBuilder.getSummary();
+  ProfileSummary SamplePS = Reader->getSummary();
+
+  // Compute cold thresholds for instr profile and sample profile.
+  uint64_t ColdSampleThreshold =
+      ProfileSummaryBuilder::getEntryForPercentile(
+          SamplePS.getDetailedSummary(),
+          ProfileSummaryBuilder::DefaultCutoffs[ColdPercentileIdx])
+          .MinCount;
+  uint64_t HotInstrThreshold =
+      ProfileSummaryBuilder::getEntryForPercentile(
+          InstrPS.getDetailedSummary(),
+          ProfileSummaryBuilder::DefaultCutoffs[HotPercentileIdx])
+          .MinCount;
+  uint64_t ColdInstrThreshold =
+      InstrProfColdThreshold
+          ? InstrProfColdThreshold
+          : ProfileSummaryBuilder::getEntryForPercentile(
+                InstrPS.getDetailedSummary(),
+                ProfileSummaryBuilder::DefaultCutoffs[ColdPercentileIdx])
+                .MinCount;
+
+  // Find hot/warm functions in sample profile which are cold in instr profile
+  // and adjust the profiles of those functions in the instr profile.
+  for (const auto &PD : Reader->getProfiles()) {
+    StringRef FName = PD.getKey();
+    const sampleprof::FunctionSamples &FS = PD.getValue();
+    auto It = InstrProfileMap.find(FName);
+    if (FS.getHeadSamples() > ColdSampleThreshold &&
+        It != InstrProfileMap.end() &&
+        It->second.MaxCount <= ColdInstrThreshold &&
+        FS.getBodySamples().size() >= SupplMinSizeThreshold) {
+      updateInstrProfileEntry(It->second, HotInstrThreshold,
+                              ZeroCounterThreshold);
+    }
+  }
+}
+
+/// The main function to supplement instr profile with sample profile.
+/// \p Inputs contains the instr profile. \p SampleFilename specifies the
+/// sample profile. \p OutputFilename specifies the output profile name.
+/// \p OutputFormat specifies the output profile format. \p OutputSparse
+/// specifies whether to generate sparse profile. \p SupplMinSizeThreshold
+/// specifies the minimal size for the functions whose profile will be
+/// adjusted. \p ZeroCounterThreshold is the threshold to check whether
+/// a function contains too many zero counters and whether its profile
+/// should be dropped. \p InstrProfColdThreshold is the user-specified
+/// cold threshold which will override the cold threshold obtained from
+/// the instr profile summary.
+static void supplementInstrProfile(
+    const WeightedFileVector &Inputs, StringRef SampleFilename,
+    StringRef OutputFilename, ProfileFormat OutputFormat, bool OutputSparse,
+    unsigned SupplMinSizeThreshold, float ZeroCounterThreshold,
+    unsigned InstrProfColdThreshold) {
+  if (OutputFilename.compare("-") == 0)
+    exitWithError("Cannot write indexed profdata format to stdout.");
+  if (Inputs.size() != 1)
+    exitWithError("Expect one input to be an instr profile.");
+  if (Inputs[0].Weight != 1)
+    exitWithError("Expect instr profile doesn't have weight.");
+
+  StringRef InstrFilename = Inputs[0].Filename;
+
+  // Read sample profile.
+  LLVMContext Context;
+  auto ReaderOrErr =
+      sampleprof::SampleProfileReader::create(SampleFilename.str(), Context);
+  if (std::error_code EC = ReaderOrErr.getError())
+    exitWithErrorCode(EC, SampleFilename);
+  auto Reader = std::move(ReaderOrErr.get());
+  if (std::error_code EC = Reader->read())
+    exitWithErrorCode(EC, SampleFilename);
+
+  // Read instr profile.
+  std::mutex ErrorLock;
+  SmallSet<instrprof_error, 4> WriterErrorCodes;
+  auto WC = std::make_unique<WriterContext>(OutputSparse, ErrorLock,
+                                            WriterErrorCodes);
+  loadInput(Inputs[0], nullptr, WC.get());
+  if (WC->Errors.size() > 0)
+    exitWithError(std::move(WC->Errors[0].first), InstrFilename);
+
+  adjustInstrProfile(WC, Reader, SupplMinSizeThreshold, ZeroCounterThreshold,
+                     InstrProfColdThreshold);
+  writeInstrProfile(OutputFilename, OutputFormat, WC->Writer);
+}
+
 /// Make a copy of the given function samples with all symbol names remapped
 /// by the provided symbol remapper.
 static sampleprof::FunctionSamples
@@ -680,6 +846,28 @@ static int merge_main(int argc, const char *argv[]) {
   cl::opt<bool> GenPartialProfile(
       "gen-partial-profile", cl::init(false), cl::Hidden,
       cl::desc("Generate a partial profile (only meaningful for -extbinary)"));
+  cl::opt<std::string> SupplInstrWithSample(
+      "supplement-instr-with-sample", cl::init(""), cl::Hidden,
+      cl::desc("Supplement an instr profile with sample profile, to correct "
+               "the profile unrepresentativeness issue. The sample "
+               "profile is the input of the flag. Output will be in instr "
+               "format (The flag only works with -instr)"));
+  cl::opt<float> ZeroCounterThreshold(
+      "zero-counter-threshold", cl::init(0.7), cl::Hidden,
+      cl::desc("For the function which is cold in instr profile but hot in "
+               "sample profile, if the ratio of the number of zero counters "
+               "divided by the total number of counters is above the "
+               "threshold, the profile of the function will be regarded as "
+               "being harmful for performance and will be dropped. "));
+  cl::opt<unsigned> SupplMinSizeThreshold(
+      "suppl-min-size-threshold", cl::init(10), cl::Hidden,
+      cl::desc("If the size of a function is smaller than the threshold, "
+               "assume it can be inlined by PGO early inliner and it won't "
+               "be adjusted based on sample profile. "));
+  cl::opt<unsigned> InstrProfColdThreshold(
+      "instr-prof-cold-threshold", cl::init(0), cl::Hidden,
+      cl::desc("User-specified cold threshold for instr profile which will "
+               "override the cold threshold obtained from profile summary. "));
 
   cl::ParseCommandLineOptions(argc, argv, "LLVM profile data merger\n");
 
@@ -708,6 +896,17 @@ static int merge_main(int argc, const char *argv[]) {
   if (!RemappingFile.empty())
     Remapper = SymbolRemapper::create(RemappingFile);
 
+  if (!SupplInstrWithSample.empty()) {
+    if (ProfileKind != instr)
+      exitWithError(
+          "-supplement-instr-with-sample can only work with -instr. 
"); + + supplementInstrProfile(WeightedInputs, SupplInstrWithSample, OutputFilename, + OutputFormat, OutputSparse, SupplMinSizeThreshold, + ZeroCounterThreshold, InstrProfColdThreshold); + return 0; + } + if (ProfileKind == instr) mergeInstrProfile(WeightedInputs, Remapper.get(), OutputFilename, OutputFormat, OutputSparse, NumThreads, FailureMode); @@ -904,6 +1103,8 @@ static int showInstrProfile(const std::string &Filename, bool ShowCounts, uint64_t FuncMax = 0; uint64_t FuncSum = 0; for (size_t I = 0, E = Func.Counts.size(); I < E; ++I) { + if (Func.Counts[I] == (uint64_t)-1) + continue; FuncMax = std::max(FuncMax, Func.Counts[I]); FuncSum += Func.Counts[I]; } From a0ebac52df6d890fcba52e7db9ac66d0fc7c2582 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 27 Jul 2020 21:11:42 -0700 Subject: [PATCH 0266/1035] [X86] Properly encode a 32-bit address with an index register and no base register in 16-bit mode. In 16-bit mode we can encode a 32-bit address using 0x67 prefix. We were failing to do this when the index register was a 32-bit register, the base register was not present, and the displacement fit in 16-bits. Fixes PR46866. --- .../X86/MCTargetDesc/X86MCCodeEmitter.cpp | 20 +++++++++++-------- llvm/test/MC/X86/code16gcc.s | 2 ++ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 7dea0760a8310..5f1b5b5e2b96e 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -164,17 +164,20 @@ static MCFixupKind getImmFixupKind(uint64_t TSFlags) { /// \returns true if the specified instruction has a 16-bit memory operand. static bool is16BitMemOperand(const MCInst &MI, unsigned Op, const MCSubtargetInfo &STI) { - const MCOperand &BaseReg = MI.getOperand(Op + X86::AddrBaseReg); - const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg); + const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg); + const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg); const MCOperand &Disp = MI.getOperand(Op + X86::AddrDisp); - if (STI.hasFeature(X86::Mode16Bit) && BaseReg.getReg() == 0 && Disp.isImm() && - Disp.getImm() < 0x10000) + unsigned BaseReg = Base.getReg(); + unsigned IndexReg = Index.getReg(); + + if (STI.hasFeature(X86::Mode16Bit) && BaseReg == 0 && IndexReg == 0 && + Disp.isImm() && Disp.getImm() < 0x10000) return true; - if ((BaseReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) || - (IndexReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg()))) + if ((BaseReg != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) || + (IndexReg != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg))) return true; return false; } @@ -498,6 +501,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // This is the [REG]+disp16 case. emitByte(modRMByte(2, RegOpcodeField, RMfield), OS); } else { + assert(IndexReg.getReg() == 0 && "Unexpected index register!"); // There is no BaseReg; this is the plain [disp16] case. 
      emitByte(modRMByte(0, RegOpcodeField, 6), OS);
     }
diff --git a/llvm/test/MC/X86/code16gcc.s b/llvm/test/MC/X86/code16gcc.s
index 2391aeffe99a0..8d8589a1117da 100644
--- a/llvm/test/MC/X86/code16gcc.s
+++ b/llvm/test/MC/X86/code16gcc.s
@@ -62,6 +62,8 @@
 //CHECK: popfl # encoding: [0x66,0x9d]
 	pushw 4
 //CHECK: pushw 4 # encoding: [0xff,0x36,0x04,0x00]
+	addw $1, (,%eax,4)
+	//CHECK: addw $1, (,%eax,4) # encoding: [0x67,0x83,0x04,0x85,0x00,0x00,0x00,0x00,0x01]

From 25f193fb46dbdcc178946765aa929535199e2a4b Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 27 Jul 2020 21:11:48 -0700
Subject: [PATCH 0267/1035] [X86] Add support for {disp32} to control size of
 jmp and jcc instructions in the assembler

By default we pick a 1 byte displacement and let relaxation enlarge it
if necessary. The GNU assembler supports a pseudo prefix to basically
pre-relax the instruction to the larger size.

I plan to add {disp8} and {disp32} support for memory operands in
another patch which is why I've included the parsing code and enum for
{disp8} pseudo prefix as well.

Reviewed By: echristo

Differential Revision: https://reviews.llvm.org/D84709
---
 .../lib/Target/X86/AsmParser/X86AsmParser.cpp | 33 +++++++++++++++++++
 llvm/test/MC/X86/x86-16.s                     | 11 +++++++
 llvm/test/MC/X86/x86-32.s                     | 11 +++++++
 3 files changed, 55 insertions(+)

diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index a3014b2aba92c..bb9919a25847b 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -87,6 +87,14 @@ class X86AsmParser : public MCTargetAsmParser {
 
   VEXEncoding ForcedVEXEncoding = VEXEncoding_Default;
 
+  enum DispEncoding {
+    DispEncoding_Default,
+    DispEncoding_Disp8,
+    DispEncoding_Disp32,
+  };
+
+  DispEncoding ForcedDispEncoding = DispEncoding_Default;
+
 private:
   SMLoc consumeToken() {
     MCAsmParser &Parser = getParser();
@@ -2592,6 +2600,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
 
   // Reset the forced VEX encoding.
   ForcedVEXEncoding = VEXEncoding_Default;
+  ForcedDispEncoding = DispEncoding_Default;
 
   // Parse pseudo prefixes.
   while (1) {
@@ -2610,6 +2619,10 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
         ForcedVEXEncoding = VEXEncoding_VEX3;
       else if (Prefix == "evex")
         ForcedVEXEncoding = VEXEncoding_EVEX;
+      else if (Prefix == "disp8")
+        ForcedDispEncoding = DispEncoding_Disp8;
+      else if (Prefix == "disp32")
+        ForcedDispEncoding = DispEncoding_Disp32;
       else
         return Error(NameLoc, "unknown prefix");
 
@@ -3118,6 +3131,26 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
   switch (Inst.getOpcode()) {
   default: return false;
+  case X86::JMP_1:
+    // {disp32} forces a larger displacement as if the instruction was relaxed.
+    // NOTE: 16-bit mode uses 16-bit displacement even though it says {disp32}.
+    // This matches GNU assembler.
+    if (ForcedDispEncoding == DispEncoding_Disp32) {
+      Inst.setOpcode(is16BitMode() ? X86::JMP_2 : X86::JMP_4);
+      return true;
+    }
+
+    return false;
+  case X86::JCC_1:
+    // {disp32} forces a larger displacement as if the instruction was relaxed.
+    // NOTE: 16-bit mode uses 16-bit displacement even though it says {disp32}.
+    // This matches GNU assembler.
+    if (ForcedDispEncoding == DispEncoding_Disp32) {
+      Inst.setOpcode(is16BitMode() ? X86::JCC_2 : X86::JCC_4);
+      return true;
+    }
+
+    return false;
   case X86::VMOVZPQILo2PQIrr:
   case X86::VMOVAPDrr:
   case X86::VMOVAPDYrr:
diff --git a/llvm/test/MC/X86/x86-16.s b/llvm/test/MC/X86/x86-16.s
index ed3540902894e..f92164e57314a 100644
--- a/llvm/test/MC/X86/x86-16.s
+++ b/llvm/test/MC/X86/x86-16.s
@@ -1045,3 +1045,14 @@ xsusldtrk
 // CHECK: xresldtrk
 // CHECK: encoding: [0xf2,0x0f,0x01,0xe9]
 xresldtrk
+
+// CHECK: jmp foo
+// CHECK: encoding: [0xe9,A,A]
+// CHECK: fixup A - offset: 1, value: foo-2, kind: FK_PCRel_2
+{disp32} jmp foo
+foo:
+
+// CHECK: je foo
+// CHECK: encoding: [0x0f,0x84,A,A]
+// CHECK: fixup A - offset: 2, value: foo-2, kind: FK_PCRel_2
+{disp32} je foo
diff --git a/llvm/test/MC/X86/x86-32.s b/llvm/test/MC/X86/x86-32.s
index fdd3c53ed88f5..256d8351e74d8 100644
--- a/llvm/test/MC/X86/x86-32.s
+++ b/llvm/test/MC/X86/x86-32.s
@@ -1109,3 +1109,14 @@ ptwritel 0xdeadbeef(%ebx,%ecx,8)
 // CHECK: ptwritel %eax
 // CHECK: encoding: [0xf3,0x0f,0xae,0xe0]
 ptwritel %eax
+
+// CHECK: jmp foo
+// CHECK: encoding: [0xe9,A,A,A,A]
+// CHECK: fixup A - offset: 1, value: foo-4, kind: FK_PCRel_4
+{disp32} jmp foo
+foo:
+
+// CHECK: je foo
+// CHECK: encoding: [0x0f,0x84,A,A,A,A]
+// CHECK: fixup A - offset: 2, value: foo-4, kind: FK_PCRel_4
+{disp32} je foo

From 647e861e080382593648b234668ad2f5a376ac5e Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 27 Jul 2020 21:11:54 -0700
Subject: [PATCH 0268/1035] [X86] Detect if EFLAGS is live across XBEGIN
 pseudo instruction. Add it as livein to the basic blocks created when
 expanding the pseudo

XBEGIN causes several basic blocks to be inserted. If flags
are live across it we need to make eflags live in the new basic
blocks to avoid machine verifier errors.

Fixes PR46827

Reviewed By: ivanbaev

Differential Revision: https://reviews.llvm.org/D84479
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 57 ++++++++++++++++---------
 llvm/test/CodeGen/X86/pr46827.ll        | 39 +++++++++++++++++
 2 files changed, 75 insertions(+), 21 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/pr46827.ll

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b986c42b9563f..390a20c3e71f5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30954,6 +30954,34 @@ bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
 // X86 Scheduler Hooks
 //===----------------------------------------------------------------------===//
 
+// Returns true if EFLAGS is consumed after this iterator in the rest of the
+// basic block or any successors of the basic block.
+static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
+                              MachineBasicBlock *BB) {
+  // Scan forward through BB for a use/def of EFLAGS.
+  for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end();
+       miI != miE; ++miI) {
+    const MachineInstr& mi = *miI;
+    if (mi.readsRegister(X86::EFLAGS))
+      return true;
+    // If we found a def, we can stop searching.
+    if (mi.definesRegister(X86::EFLAGS))
+      return false;
+  }
+
+  // If we hit the end of the block, check whether EFLAGS is live into a
+  // successor.
+  for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
+                                        sEnd = BB->succ_end();
+       sItr != sEnd; ++sItr) {
+    MachineBasicBlock* succ = *sItr;
+    if (succ->isLiveIn(X86::EFLAGS))
+      return true;
+  }
+
+  return false;
+}
+
 /// Utility function to emit xbegin specifying the start of an RTM region.
static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII) { @@ -30986,6 +31014,12 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, MF->insert(I, fallMBB); MF->insert(I, sinkMBB); + if (isEFLAGSLiveAfter(MI, MBB)) { + mainMBB->addLiveIn(X86::EFLAGS); + fallMBB->addLiveIn(X86::EFLAGS); + sinkMBB->addLiveIn(X86::EFLAGS); + } + // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); @@ -31374,27 +31408,8 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock* BB, const TargetRegisterInfo* TRI) { - // Scan forward through BB for a use/def of EFLAGS. - MachineBasicBlock::iterator miI(std::next(SelectItr)); - for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { - const MachineInstr& mi = *miI; - if (mi.readsRegister(X86::EFLAGS)) - return false; - if (mi.definesRegister(X86::EFLAGS)) - break; // Should have kill-flag - update below. - } - - // If we hit the end of the block, check whether EFLAGS is live into a - // successor. - if (miI == BB->end()) { - for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), - sEnd = BB->succ_end(); - sItr != sEnd; ++sItr) { - MachineBasicBlock* succ = *sItr; - if (succ->isLiveIn(X86::EFLAGS)) - return false; - } - } + if (isEFLAGSLiveAfter(SelectItr, BB)) + return false; // We found a def, or hit the end of the basic block and EFLAGS wasn't live // out. SelectMI should have a kill flag on EFLAGS. diff --git a/llvm/test/CodeGen/X86/pr46827.ll b/llvm/test/CodeGen/X86/pr46827.ll new file mode 100644 index 0000000000000..438b13c3400fe --- /dev/null +++ b/llvm/test/CodeGen/X86/pr46827.ll @@ -0,0 +1,39 @@ +; RUN: llc < %s -mtriple=i686-pc-linux -mattr=+rtm -verify-machineinstrs -stop-after=finalize-isel | FileCheck %s + +; CHECK: body: | +; CHECK: bb.0.bb107: +; CHECK: successors: %bb.3(0x40000000), %bb.4(0x40000000) +; CHECK: %0:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.0, align 16) +; CHECK: %1:gr32 = SUB32ri8 %0, 1, implicit-def $eflags +; CHECK: XBEGIN_4 %bb.4, implicit-def $eax +; CHECK: bb.3.bb107: +; CHECK: successors: %bb.5(0x80000000) +; CHECK: liveins: $eflags +; CHECK: %3:gr32 = MOV32ri -1 +; CHECK: JMP_1 %bb.5 +; CHECK: bb.4.bb107: +; CHECK: successors: %bb.5(0x80000000) +; CHECK: liveins: $eflags +; CHECK: XABORT_DEF implicit-def $eax +; CHECK: %4:gr32 = COPY $eax +; CHECK: bb.5.bb107: +; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) +; CHECK: liveins: $eflags +; CHECK: %2:gr32 = PHI %3, %bb.3, %4, %bb.4 +; CHECK: JCC_1 %bb.2, 5, implicit $eflags +; CHECK: JMP_1 %bb.1 + +declare i32 @llvm.x86.xbegin() #0 + +define void @wobble.12(i32 %tmp116) { +bb107: ; preds = %bb42 + %tmp117 = icmp eq i32 %tmp116, 1 + %tmp127 = tail call i32 @llvm.x86.xbegin() #0 + br i1 %tmp117, label %bb129, label %bb250 + +bb129: ; preds = %bb107 + unreachable + +bb250: ; preds = %bb107 + unreachable +} From 8c9241a051fd677cfbfd9c79c6af9d714be7c792 Mon Sep 17 00:00:00 2001 From: Artem Dergachev Date: Thu, 23 Jul 2020 11:37:45 -0700 Subject: [PATCH 0269/1035] [clang-tidy] Suppress one unittest on macOS. Possibly a linker bug but I'm in a hurry to fix a buildbot. 
Differential Revision: https://reviews.llvm.org/D84453
---
 .../clang-tidy/ClangTidyDiagnosticConsumerTest.cpp    | 11 +++++++++++
 .../unittests/clang-tidy/ClangTidyOptionsTest.cpp     |  1 +
 2 files changed, 12 insertions(+)

diff --git a/clang-tools-extra/unittests/clang-tidy/ClangTidyDiagnosticConsumerTest.cpp b/clang-tools-extra/unittests/clang-tidy/ClangTidyDiagnosticConsumerTest.cpp
index 2a4ed64b88508..a8729660bdcec 100644
--- a/clang-tools-extra/unittests/clang-tidy/ClangTidyDiagnosticConsumerTest.cpp
+++ b/clang-tools-extra/unittests/clang-tidy/ClangTidyDiagnosticConsumerTest.cpp
@@ -21,6 +21,16 @@ class TestCheck : public ClangTidyCheck {
   }
 };
 
+// FIXME: This test seems to cause a strange linking interference
+// with the ValidConfiguration.ValidEnumOptions test on macOS.
+// If both tests are enabled, this test will fail as if
+// runCheckOnCode() is not invoked at all. Looks like a linker bug.
+// For now both tests are disabled on macOS. It is not sufficient
+// to only disable the other test because this test keeps failing
+// under Address Sanitizer, which may be an indication of more
+// such linking interference with other tests and this test
+// seems to be in the center of it.
+#ifndef __APPLE__
 TEST(ClangTidyDiagnosticConsumer, SortsErrors) {
   std::vector<ClangTidyError> Errors;
   runCheckOnCode<TestCheck>("int a;", &Errors);
@@ -28,6 +38,7 @@ TEST(ClangTidyDiagnosticConsumer, SortsErrors) {
   EXPECT_EQ("type specifier", Errors[0].Message.Message);
   EXPECT_EQ("variable", Errors[1].Message.Message);
 }
+#endif
 
 } // namespace test
 } // namespace tidy
diff --git a/clang-tools-extra/unittests/clang-tidy/ClangTidyOptionsTest.cpp b/clang-tools-extra/unittests/clang-tidy/ClangTidyOptionsTest.cpp
index 63f9a06e91bec..c4239af0e7673 100644
--- a/clang-tools-extra/unittests/clang-tidy/ClangTidyOptionsTest.cpp
+++ b/clang-tools-extra/unittests/clang-tidy/ClangTidyOptionsTest.cpp
@@ -223,6 +223,7 @@ TEST(CheckOptionsValidation, ValidIntOptions) {
 }
 
 // FIXME: Figure out why this test causes crashes on mac os.
+// See also comments around the ClangTidyDiagnosticConsumer.SortsErrors test.
 #ifndef __APPLE__
 TEST(ValidConfiguration, ValidEnumOptions) {

From 486d2750c7151d3d93b785a4669e2d7d5c9286ac Mon Sep 17 00:00:00 2001
From: Ehsan Toosi
Date: Fri, 24 Jul 2020 15:43:59 +0200
Subject: [PATCH 0270/1035] [mlir][NFC] Polish copy removal transform

Address a few remaining comments in copy removal transform.

Differential Revision: https://reviews.llvm.org/D84529
---
 mlir/lib/Transforms/CopyRemoval.cpp | 41 ++++++++++++++++-------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/mlir/lib/Transforms/CopyRemoval.cpp b/mlir/lib/Transforms/CopyRemoval.cpp
index 28648e0b4294a..ccfd02630ac28 100644
--- a/mlir/lib/Transforms/CopyRemoval.cpp
+++ b/mlir/lib/Transforms/CopyRemoval.cpp
@@ -19,16 +19,28 @@ namespace {
 //===----------------------------------------------------------------------===//
 // CopyRemovalPass
 //===----------------------------------------------------------------------===//
+
 /// This pass removes the redundant Copy operations. Additionally, it
 /// removes the leftover definition and deallocation operations by erasing the
 /// copy operation.
 class CopyRemovalPass : public PassWrapper<CopyRemovalPass, OperationPass<>> {
+public:
+  void runOnOperation() override {
+    getOperation()->walk([&](CopyOpInterface copyOp) {
+      reuseCopySourceAsTarget(copyOp);
+      reuseCopyTargetAsSource(copyOp);
+    });
+    for (Operation *op : eraseList)
+      op->erase();
+  }
+
 private:
   /// List of operations that need to be removed.
   DenseSet<Operation *> eraseList;
 
   /// Returns the deallocation operation for `value` in `block` if it exists.
   Operation *getDeallocationInBlock(Value value, Block *block) {
+    assert(block && "Block cannot be null");
     auto valueUsers = value.getUsers();
    auto it = llvm::find_if(valueUsers, [&](Operation *op) {
      auto effects = dyn_cast<MemoryEffectOpInterface>(op);
@@ -40,12 +52,12 @@ class CopyRemovalPass : public PassWrapper<CopyRemovalPass, OperationPass<>> {
   /// Returns true if an operation between start and end operations has memory
   /// effect.
   bool hasMemoryEffectOpBetween(Operation *start, Operation *end) {
+    assert((start || end) && "Start and end operations cannot be null");
     assert(start->getBlock() == end->getBlock() &&
            "Start and end operations should be in the same block.");
     Operation *op = start->getNextNode();
     while (op->isBeforeInBlock(end)) {
-      auto effects = dyn_cast<MemoryEffectOpInterface>(op);
-      if (effects)
+      if (isa<MemoryEffectOpInterface>(op))
         return true;
       op = op->getNextNode();
     }
@@ -55,6 +67,7 @@ class CopyRemovalPass : public PassWrapper<CopyRemovalPass, OperationPass<>> {
   /// Returns true if `val` value has at least a user between `start` and
   /// `end` operations.
   bool hasUsersBetween(Value val, Operation *start, Operation *end) {
+    assert((start || end) && "Start and end operations cannot be null");
     Block *block = start->getBlock();
     assert(block == end->getBlock() &&
            "Start and end operations should be in the same block.");
@@ -65,10 +78,11 @@ class CopyRemovalPass : public PassWrapper<CopyRemovalPass, OperationPass<>> {
   };
 
   bool areOpsInTheSameBlock(ArrayRef<Operation *> operations) {
-    llvm::SmallPtrSet<Block *, 2> blocks;
-    for (Operation *op : operations)
-      blocks.insert(op->getBlock());
-    return blocks.size() == 1;
+    assert(!operations.empty() &&
+           "The operations list should contain at least a single operation");
+    Block *block = operations.front()->getBlock();
+    return llvm::none_of(
+        operations, [&](Operation *op) { return block != op->getBlock(); });
   }
 
   /// Input:
@@ -97,7 +111,7 @@ class CopyRemovalPass : public PassWrapper<CopyRemovalPass, OperationPass<>> {
   /// TODO: Alias analysis is not available at the moment. Currently, we check
   /// if there are any operations with memory effects between copy and
   /// deallocation operations.
-  void ReuseCopySourceAsTarget(CopyOpInterface copyOp) {
+  void reuseCopySourceAsTarget(CopyOpInterface copyOp) {
     if (eraseList.count(copyOp))
       return;
@@ -147,7 +161,7 @@ class CopyRemovalPass : public PassWrapper<CopyRemovalPass, OperationPass<>> {
   /// TODO: Alias analysis is not available at the moment. Currently, we check
   /// if there are any operations with memory effects between copy and
   /// deallocation operations.
-  void ReuseCopyTargetAsSource(CopyOpInterface copyOp) {
+  void reuseCopyTargetAsSource(CopyOpInterface copyOp) {
     if (eraseList.count(copyOp))
       return;
@@ -169,16 +183,6 @@ class CopyRemovalPass : public PassWrapper<CopyRemovalPass, OperationPass<>> {
     eraseList.insert(fromDefiningOp);
     eraseList.insert(fromFreeingOp);
   }
-
-public:
-  void runOnOperation() override {
-    getOperation()->walk([&](CopyOpInterface copyOp) {
-      ReuseCopySourceAsTarget(copyOp);
-      ReuseCopyTargetAsSource(copyOp);
-    });
-    for (Operation *op : eraseList)
-      op->erase();
-  }
 };
 
 } // end anonymous namespace
@@ -186,6 +190,7 @@ class CopyRemovalPass : public PassWrapper<CopyRemovalPass, OperationPass<>> {
 //===----------------------------------------------------------------------===//
 // CopyRemovalPass construction
 //===----------------------------------------------------------------------===//
+
 std::unique_ptr<Pass> mlir::createCopyRemovalPass() {
   return std::make_unique<CopyRemovalPass>();
 }

From e40315d2b4ed1e38962a8f33ff151693ed4ada63 Mon Sep 17 00:00:00 2001
From: Roman Lebedev
Date: Tue, 28 Jul 2020 10:16:52 +0300
Subject: [PATCH 0271/1035] [GVN] Rewrite IsValueFullyAvailableInBlock(): no
 recursion, less false-negatives

While this doesn't appear to help with the perf issue being exposed by
D84108, the function as-is is very weird, convoluted, and what's worse,
recursive.

There was no need for `SpeculativelyAvaliableAndUsedForSpeculation`;
a tri-state choice is enough. We never even check for that state.

The basic idea here is that we need to perform a depth-first traversal
of the predecessors of the basic block in question, either finding a
preexisting state for the block in a map, or inserting a "placeholder"
`SpeculativelyAvaliable`. If we encounter an `Unavaliable` block, then we
need to give up the search and back-propagate the `Unavaliable` state
to each successor of said block, more specifically to each
`SpeculativelyAvaliable` placeholder we've just created.

However, if we have traversed the entirety of the predecessors and have
not encountered an `Unavaliable` block, then it must mean the value is
fully available.
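In pseudo-C++, that traversal has roughly the following shape (an
illustrative sketch only, with a hypothetical backPropagateUnavailability
helper; the real code in the diff below additionally tracks a speculation
budget and statistics):

    SmallVector<BasicBlock *, 32> Worklist = {BB};
    while (!Worklist.empty()) {
      BasicBlock *Cur = Worklist.pop_back_val(); // LIFO, i.e. depth-first.
      // Find the existing state, or optimistically insert a placeholder.
      auto IV = States.try_emplace(Cur, SpeculativelyAvaliable);
      if (!IV.second) {
        if (IV.first->second == Unavaliable)
          return backPropagateUnavailability(Cur); // hypothetical helper
        continue; // Avaliable, or an in-flight placeholder: nothing to do.
      }
      if (pred_empty(Cur)) // The value cannot be live-in here.
        return backPropagateUnavailability(Cur);
      Worklist.append(pred_begin(Cur), pred_end(Cur));
    }
    return true; // Every placeholder turned out to be available.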
We could update each inserted `SpeculativelyAvaliable` into an
`Avaliable`, but we don't need to: as the assertions verify, if we see a
`SpeculativelyAvaliable` entry, we can assume it is actually `Avaliable`,
because if, at the time we produced it, we had found that it has an
`Unavaliable` predecessor, we would have already updated its successors,
including this block, to `Unavaliable`.

Reviewed By: fhahn

Differential Revision: https://reviews.llvm.org/D84181
---
 llvm/lib/Transforms/Scalar/GVN.cpp            | 198 +++++++++++-------
 .../GVN/loadpre-missed-opportunity.ll         |  34 ++-
 2 files changed, 155 insertions(+), 77 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 0b416cc4afb86..1d82664ed4076 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -98,21 +98,30 @@ STATISTIC(NumGVNSimpl, "Number of instructions simplified");
 STATISTIC(NumGVNEqProp, "Number of equalities propagated");
 STATISTIC(NumPRELoad, "Number of loads PRE'd");
 
+STATISTIC(IsValueFullyAvailableInBlockNumSpeculationsMax,
+          "Number of blocks speculated as available in "
+          "IsValueFullyAvailableInBlock(), max");
+STATISTIC(MaxBBSpeculationCutoffReachedTimes,
+          "Number of times we reached gvn-max-block-speculations cut-off "
+          "preventing further exploration");
+
 static cl::opt<bool> GVNEnablePRE("enable-pre", cl::init(true), cl::Hidden);
 static cl::opt<bool> GVNEnableLoadPRE("enable-load-pre", cl::init(true));
 static cl::opt<bool> GVNEnableLoadInLoopPRE("enable-load-in-loop-pre",
                                             cl::init(true));
 static cl::opt<bool> GVNEnableMemDep("enable-gvn-memdep", cl::init(true));
 
-// Maximum allowed recursion depth.
-static cl::opt<uint32_t>
-MaxRecurseDepth("gvn-max-recurse-depth", cl::Hidden, cl::init(1000),
-                cl::ZeroOrMore,
-                cl::desc("Max recurse depth in GVN (default = 1000)"));
-
 static cl::opt<uint32_t> MaxNumDeps(
     "gvn-max-num-deps", cl::Hidden, cl::init(100), cl::ZeroOrMore,
     cl::desc("Max number of dependences to attempt Load PRE (default = 100)"));
 
+// This is based on IsValueFullyAvailableInBlockNumSpeculationsMax stat.
+static cl::opt<uint32_t> MaxBBSpeculations(
+    "gvn-max-block-speculations", cl::Hidden, cl::init(600), cl::ZeroOrMore,
+    cl::desc("Max number of blocks we're willing to speculate on (and recurse "
+             "into) when deducing if a value is fully available or not in GVN "
+             "(default = 600)"));
+
 struct llvm::GVN::Expression {
   uint32_t opcode;
   bool commutative = false;
@@ -669,15 +678,14 @@ LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value *> &d) const {
 
 enum class AvaliabilityState : char {
   /// We know the block *is not* fully available. This is a fixpoint.
-  Unavaliable = 0,
+  Unavailable = 0,
 
   /// We know the block *is* fully available. This is a fixpoint.
-  Avaliable = 1,
+  Available = 1,
 
   /// We do not know whether the block is fully available or not,
   /// but we are currently speculating that it will be.
-  SpeculativelyAvaliable = 2,
-  /// We are speculating for this block and have used that
-  /// to speculate for other blocks.
-  SpeculativelyAvaliableAndUsedForSpeculation = 3,
+  /// If it would have turned out that the block was, in fact, not fully
+  /// available, this would have been cleaned up into an Unavailable.
+  SpeculativelyAvailable = 2,
 };

 /// Return true if we can prove that the value
 /// we're analyzing is fully available in the specified block. As we go, keep
 /// track of which blocks we know are fully alive in FullyAvailableBlocks. This
 /// map is actually a tri-state map with the following values:
 ///   1) we know the block *is* fully available.
 ///   2) we do not know whether the block is fully available or not, but we are
 ///      currently speculating that it will be.
-///   3) we are speculating for this block and have used that to speculate for
-///      other blocks.
 static bool IsValueFullyAvailableInBlock(
     BasicBlock *BB,
-    DenseMap<BasicBlock *, AvaliabilityState> &FullyAvailableBlocks,
-    uint32_t RecurseDepth) {
-  if (RecurseDepth > MaxRecurseDepth)
-    return false;
-
-  // Optimistically assume that the block is speculatively available and check
-  // to see if we already know about this block in one lookup.
-  std::pair<DenseMap<BasicBlock *, AvaliabilityState>::iterator, bool> IV =
-      FullyAvailableBlocks.insert(
-          std::make_pair(BB, AvaliabilityState::SpeculativelyAvaliable));
-
-  // If the entry already existed for this block, return the precomputed value.
-  if (!IV.second) {
-    // If this is a speculative "available" value, mark it as being used for
-    // speculation of other blocks.
-    if (IV.first->second == AvaliabilityState::SpeculativelyAvaliable)
-      IV.first->second =
-          AvaliabilityState::SpeculativelyAvaliableAndUsedForSpeculation;
-    return IV.first->second != AvaliabilityState::Unavaliable;
-  }
+    DenseMap<BasicBlock *, AvaliabilityState> &FullyAvailableBlocks) {
+  SmallVector<BasicBlock *, 32> Worklist;
+  Optional<BasicBlock *> UnavailableBB;
 
-  // Otherwise, see if it is fully available in all predecessors.
-  pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+  // The number of times we didn't find an entry for a block in a map and
+  // optimistically inserted an entry marking block as speculatively available.
+  unsigned NumNewNewSpeculativelyAvailableBBs = 0;
 
-  // If this block has no predecessors, it isn't live-in here.
-  if (PI == PE)
-    goto SpeculationFailure;
+#ifndef NDEBUG
+  SmallSet<BasicBlock *, 32> NewSpeculativelyAvailableBBs;
+  SmallVector<BasicBlock *, 32> AvailableBBs;
+#endif
 
-  for (; PI != PE; ++PI)
-    // If the value isn't fully available in one of our predecessors, then it
-    // isn't fully available in this block either. Undo our previous
-    // optimistic assumption and bail out.
-    if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks,RecurseDepth+1))
-      goto SpeculationFailure;
+  Worklist.emplace_back(BB);
+  while (!Worklist.empty()) {
+    BasicBlock *CurrBB = Worklist.pop_back_val(); // LIFO - depth-first!
 
-  return true;
+    // Optimistically assume that the block is Speculatively Available and check
+    // to see if we already know about this block in one lookup.
+    std::pair<DenseMap<BasicBlock *, AvaliabilityState>::iterator, bool> IV =
+        FullyAvailableBlocks.try_emplace(
+            CurrBB, AvaliabilityState::SpeculativelyAvailable);
+    AvaliabilityState &State = IV.first->second;
+
+    // Did the entry already exist for this block?
+    if (!IV.second) {
+      if (State == AvaliabilityState::Unavailable) {
+        UnavailableBB = CurrBB;
+        break; // Backpropagate unavailability info.
+      }
 
-// If we get here, we found out that this is not, after
-// all, a fully-available block. We have a problem if we speculated on this and
-// used the speculation to mark other blocks as available.
-SpeculationFailure:
-  AvaliabilityState &BBVal = FullyAvailableBlocks[BB];
+#ifndef NDEBUG
+      AvailableBBs.emplace_back(CurrBB);
+#endif
+      continue; // Don't recurse further, but continue processing worklist.
+    }
 
-  // If we didn't speculate on this, just return with it set to unavaliable.
-  if (BBVal == AvaliabilityState::SpeculativelyAvaliable) {
-    BBVal = AvaliabilityState::Unavaliable;
-    return false;
+    // No entry found for block.
+    ++NumNewNewSpeculativelyAvailableBBs;
+    bool OutOfBudget = NumNewNewSpeculativelyAvailableBBs > MaxBBSpeculations;
+
+    // If we have exhausted our budget, mark this block as unavailable.
+    // Also, if this block has no predecessors, the value isn't live-in here.
+    if (OutOfBudget || pred_empty(CurrBB)) {
+      MaxBBSpeculationCutoffReachedTimes += (int)OutOfBudget;
+      State = AvaliabilityState::Unavailable;
+      UnavailableBB = CurrBB;
+      break; // Backpropagate unavailability info.
+    }
 
+    // Tentatively consider this block as speculatively available.
+#ifndef NDEBUG
+    NewSpeculativelyAvailableBBs.insert(CurrBB);
+#endif
+    // And further recurse into block's predecessors, in depth-first order!
+    Worklist.append(pred_begin(CurrBB), pred_end(CurrBB));
   }
 
-  // If we did speculate on this value, we could have blocks set to
-  // speculatively avaliable that are incorrect. Walk the (transitive)
-  // successors of this block and mark them as unavaliable instead.
-  SmallVector<BasicBlock *, 32> BBWorklist;
-  BBWorklist.push_back(BB);
-
-  do {
-    BasicBlock *Entry = BBWorklist.pop_back_val();
-    // Note that this sets blocks to unavailable if they happen to not
-    // already be in FullyAvailableBlocks. This is safe.
-    AvaliabilityState &EntryVal = FullyAvailableBlocks[Entry];
-    if (EntryVal == AvaliabilityState::Unavaliable)
-      continue; // Already unavailable.
-
-    // Mark as unavailable.
-    EntryVal = AvaliabilityState::Unavaliable;
+#if LLVM_ENABLE_STATS
+  IsValueFullyAvailableInBlockNumSpeculationsMax.updateMax(
+      NumNewNewSpeculativelyAvailableBBs);
+#endif
 
-    BBWorklist.append(succ_begin(Entry), succ_end(Entry));
-  } while (!BBWorklist.empty());
+  // If the block isn't marked as a fixpoint yet
+  // (the Unavailable and Available states are fixpoints)
+  auto MarkAsFixpointAndEnqueueSuccessors =
+      [&](BasicBlock *BB, AvaliabilityState FixpointState) {
+        auto It = FullyAvailableBlocks.find(BB);
+        if (It == FullyAvailableBlocks.end())
+          return; // Never queried this block, leave as-is.
+        switch (AvaliabilityState &State = It->second) {
+        case AvaliabilityState::Unavailable:
+        case AvaliabilityState::Available:
+          return; // Don't backpropagate further, continue processing worklist.
+        case AvaliabilityState::SpeculativelyAvailable: // Fix it!
+          State = FixpointState;
+#ifndef NDEBUG
+          assert(NewSpeculativelyAvailableBBs.erase(BB) &&
+                 "Found a speculatively available successor leftover?");
+#endif
+          // Queue successors for further processing.
+          Worklist.append(succ_begin(BB), succ_end(BB));
+          return;
+        }
+      };
+
+  if (UnavailableBB) {
+    // Okay, we have encountered an unavailable block.
+    // Mark speculatively available blocks reachable from UnavailableBB as
+    // unavailable as well. Paths are terminated when they reach blocks not in
+    // FullyAvailableBlocks or they are not marked as speculatively available.
+    Worklist.clear();
+    Worklist.append(succ_begin(*UnavailableBB), succ_end(*UnavailableBB));
+    while (!Worklist.empty())
+      MarkAsFixpointAndEnqueueSuccessors(Worklist.pop_back_val(),
+                                         AvaliabilityState::Unavailable);
+  }
+
+#ifndef NDEBUG
+  Worklist.clear();
+  for (BasicBlock *AvailableBB : AvailableBBs)
+    Worklist.append(succ_begin(AvailableBB), succ_end(AvailableBB));
+  while (!Worklist.empty())
+    MarkAsFixpointAndEnqueueSuccessors(Worklist.pop_back_val(),
+                                       AvaliabilityState::Available);
+
+  assert(NewSpeculativelyAvailableBBs.empty() &&
+         "Must have fixed all the new speculatively available blocks.");
+#endif
 
-  return false;
+  return !UnavailableBB;
 }
 
 /// Given a set of loads specified by ValuesPerBlock,
@@ -1126,9 +1172,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
   MapVector<BasicBlock *, Value *> PredLoads;
   DenseMap<BasicBlock *, AvaliabilityState> FullyAvailableBlocks;
   for (const AvailableValueInBlock &AV : ValuesPerBlock)
-    FullyAvailableBlocks[AV.BB] = AvaliabilityState::Avaliable;
+    FullyAvailableBlocks[AV.BB] = AvaliabilityState::Available;
   for (BasicBlock *UnavailableBB : UnavailableBlocks)
-    FullyAvailableBlocks[UnavailableBB] = AvaliabilityState::Unavaliable;
+    FullyAvailableBlocks[UnavailableBB] = AvaliabilityState::Unavailable;
 
   SmallVector<BasicBlock *, 4> CriticalEdgePred;
   for (BasicBlock *Pred : predecessors(LoadBB)) {
@@ -1141,7 +1187,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
       return false;
     }
 
-    if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) {
+    if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks)) {
       continue;
     }
 
diff --git a/llvm/test/Transforms/GVN/loadpre-missed-opportunity.ll b/llvm/test/Transforms/GVN/loadpre-missed-opportunity.ll
index 1c967f48d546c..013672fee11e9 100644
--- a/llvm/test/Transforms/GVN/loadpre-missed-opportunity.ll
+++ b/llvm/test/Transforms/GVN/loadpre-missed-opportunity.ll
@@ -1,7 +1,39 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -gvn -S | FileCheck %s
+; RUN: opt < %s -gvn -gvn-max-block-speculations=1 -S | FileCheck -check-prefixes=ALL,PRE %s
+; RUN: opt < %s -gvn -gvn-max-block-speculations=0 -S | FileCheck -check-prefixes=ALL,CHECK %s
 
 define i32 @loadpre_opportunity(i32** %arg, i1 %arg1, i1 %arg2, i1 %arg3) {
+; PRE-LABEL: @loadpre_opportunity(
+; PRE-NEXT:  bb:
+; PRE-NEXT:    [[I:%.*]] = load i32*, i32** [[ARG:%.*]], align 8
+; PRE-NEXT:    [[I6:%.*]] = call i32 @use(i32* [[I]])
+; PRE-NEXT:    br label [[BB11:%.*]]
+; PRE:       bb7:
+; PRE-NEXT:    [[I8:%.*]] = phi i32* [ [[I8_PRE:%.*]], [[BB17_BB7_CRIT_EDGE:%.*]] ], [ [[I81:%.*]], [[BB11]] ]
+; PRE-NEXT:    [[I10:%.*]] = call i32 @use(i32* [[I8]])
+; PRE-NEXT:    br label [[BB11]]
+; PRE:       bb11:
+; PRE-NEXT:    [[I81]] = phi i32* [ [[I]], [[BB:%.*]] ], [ [[I8]], [[BB7:%.*]] ]
+; PRE-NEXT:    [[I12:%.*]] = phi i32 [ [[I6]], [[BB]] ], [ [[I10]], [[BB7]] ]
+; PRE-NEXT:    br i1 [[ARG1:%.*]], label [[BB7]], label [[BB13:%.*]]
+; PRE:       bb13:
+; PRE-NEXT:    call void @somecall()
+; PRE-NEXT:    br i1 [[ARG2:%.*]], label [[BB14:%.*]], label [[BB17:%.*]]
+; PRE:       bb14:
+; PRE-NEXT:    br label [[BB15:%.*]]
+; PRE:       bb15:
+; PRE-NEXT:    br i1 [[ARG3:%.*]], label [[BB16:%.*]], label [[BB15]]
+; PRE:       bb16:
+; PRE-NEXT:    br label [[BB17]]
+; PRE:       bb17:
+; PRE-NEXT:    [[I18:%.*]] = call i1 @cond()
+; PRE-NEXT:    br i1 [[I18]], label [[BB17_BB7_CRIT_EDGE]], label [[BB19:%.*]]
+; PRE:       bb17.bb7_crit_edge:
+; PRE-NEXT:    [[I8_PRE]] = load i32*, i32** [[ARG]], align 8
+; PRE-NEXT:    br label [[BB7]]
+; PRE:       bb19:
+; PRE-NEXT:    ret i32 [[I12]]
+;
 ; CHECK-LABEL: @loadpre_opportunity(
; 
CHECK-NEXT: bb: ; CHECK-NEXT: [[I:%.*]] = load i32*, i32** [[ARG:%.*]], align 8 From c2abdec722f119ebda0cee330fe8dd7bf9c6d506 Mon Sep 17 00:00:00 2001 From: Chen Zheng Date: Tue, 28 Jul 2020 02:56:35 -0400 Subject: [PATCH 0272/1035] [PowerPC] test case for adding dq form to isLegalAddressingMode, nfc --- llvm/test/CodeGen/PowerPC/prefer-dqform.ll | 96 ++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 llvm/test/CodeGen/PowerPC/prefer-dqform.ll diff --git a/llvm/test/CodeGen/PowerPC/prefer-dqform.ll b/llvm/test/CodeGen/PowerPC/prefer-dqform.ll new file mode 100644 index 0000000000000..0d1992763d0ff --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/prefer-dqform.ll @@ -0,0 +1,96 @@ +; RUN: llc -verify-machineinstrs -disable-ppc-instr-form-prep=true -mcpu=pwr9 < %s \ +; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names | FileCheck %s -check-prefix=CHECK-P9 +; RUN: llc -verify-machineinstrs -disable-ppc-instr-form-prep=true -mcpu=pwr10 < %s \ +; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names | FileCheck %s -check-prefix=CHECK-P10 + +target triple = "powerpc64le-unknown-linux-gnu" + +%_elem_type_of_a = type <{ double }> +%_elem_type_of_x = type <{ double }> +%_elem_type_of_y = type <{ double }> + +define void @test(i32* dereferenceable(4) %.ial, i32* noalias dereferenceable(4) %.m, i32* noalias dereferenceable(4) %.n, [0 x %_elem_type_of_a]* %.a, i32* noalias dereferenceable(4) %.lda, [0 x %_elem_type_of_x]* noalias %.x, [0 x %_elem_type_of_y]* noalias %.y) { +; CHECK-P9-LABEL: test: +; CHECK-P9: .LBB0_2: # %_loop_2_do_ +; CHECK-P9: lxvx +; CHECK-P9: lxvx +; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: xvmaddadp +; CHECK-P9-DAG: xvmaddadp +; CHECK-P9-DAG: stxvx +; CHECK-P9: stxvx +; CHECK-P9: bdnz .LBB0_2 +; +; CHECK-P10-LABEL: test: +; CHECK-P10: .LBB0_2: # %_loop_2_do_ +; CHECK-P10: lxvx +; CHECK-P10: lxvx +; CHECK-P10-DAG: lxvx +; CHECK-P10-DAG: lxvx +; CHECK-P10-DAG: xvmaddadp +; CHECK-P10-DAG: xvmaddadp +; CHECK-P10-DAG: stxvx +; CHECK-P10: stxvx +; CHECK-P10: bdnz .LBB0_2 +test_entry: + %_conv5 = ptrtoint [0 x %_elem_type_of_a]* %.a to i64 + %_andi_tmp = and i64 %_conv5, 15 + %_equ_tmp = icmp eq i64 %_andi_tmp, 0 + %. = select i1 %_equ_tmp, i32 1, i32 2 + %_val_m_ = load i32, i32* %.m, align 4 + %_sub_tmp9 = sub nsw i32 1, %. 
+ %_add_tmp10 = add i32 %_sub_tmp9, %_val_m_ + %_mod_tmp = srem i32 %_add_tmp10, 16 + %_sub_tmp11 = sub i32 %_val_m_, %_mod_tmp + %_val_n_ = load i32, i32* %.n, align 4 + %x_rvo_based_addr_17 = getelementptr inbounds [0 x %_elem_type_of_x], [0 x %_elem_type_of_x]* %.x, i64 0, i64 -1 + %_div_tmp = sdiv i32 %_val_n_, 2 + %_conv16 = sext i32 %_div_tmp to i64 + %_ind_cast = getelementptr inbounds %_elem_type_of_x, %_elem_type_of_x* %x_rvo_based_addr_17, i64 %_conv16, i32 0 + %_val_x_ = load double, double* %_ind_cast, align 8 + %.splatinsert = insertelement <2 x double> undef, double %_val_x_, i32 0 + %.splat = shufflevector <2 x double> %.splatinsert, <2 x double> undef, <2 x i32> zeroinitializer + %_grt_tmp21 = icmp sgt i32 %., %_sub_tmp11 + br i1 %_grt_tmp21, label %_return_bb, label %_loop_2_do_.lr.ph + +_loop_2_do_.lr.ph: ; preds = %test_entry + %_val_lda_ = load i32, i32* %.lda, align 4 + %_conv = sext i32 %_val_lda_ to i64 + %_mult_tmp = shl nsw i64 %_conv, 3 + %_sub_tmp4 = sub nuw nsw i64 -8, %_mult_tmp + %y_rvo_based_addr_19 = getelementptr inbounds [0 x %_elem_type_of_y], [0 x %_elem_type_of_y]* %.y, i64 0, i64 -1 + %a_byte_ptr_ = bitcast [0 x %_elem_type_of_a]* %.a to i8* + %a_rvo_based_addr_ = getelementptr inbounds i8, i8* %a_byte_ptr_, i64 %_sub_tmp4 + %0 = zext i32 %. to i64 + %1 = sext i32 %_sub_tmp11 to i64 + br label %_loop_2_do_ + +_loop_2_do_: ; preds = %_loop_2_do_.lr.ph, %_loop_2_do_ + %indvars.iv = phi i64 [ %0, %_loop_2_do_.lr.ph ], [ %indvars.iv.next, %_loop_2_do_ ] + %_ix_x_len19 = shl nuw nsw i64 %indvars.iv, 3 + %y_ix_dim_0_20 = getelementptr inbounds %_elem_type_of_y, %_elem_type_of_y* %y_rvo_based_addr_19, i64 %indvars.iv + %2 = bitcast %_elem_type_of_y* %y_ix_dim_0_20 to <2 x double>* + %3 = load <2 x double>, <2 x double>* %2, align 1 + %4 = getelementptr %_elem_type_of_y, %_elem_type_of_y* %y_ix_dim_0_20, i64 2 + %5 = bitcast %_elem_type_of_y* %4 to <2 x double>* + %6 = load <2 x double>, <2 x double>* %5, align 1 + %a_ix_dim_1_ = getelementptr inbounds i8, i8* %a_rvo_based_addr_, i64 %_ix_x_len19 + %7 = bitcast i8* %a_ix_dim_1_ to <2 x double>* + %8 = load <2 x double>, <2 x double>* %7, align 1 + %9 = getelementptr i8, i8* %a_ix_dim_1_, i64 16 + %10 = bitcast i8* %9 to <2 x double>* + %11 = load <2 x double>, <2 x double>* %10, align 1 + %12 = tail call nsz contract <2 x double> @llvm.fma.v2f64(<2 x double> %8, <2 x double> %3, <2 x double> %3) + %13 = tail call nsz contract <2 x double> @llvm.fma.v2f64(<2 x double> %11, <2 x double> %.splat, <2 x double> %6) + store <2 x double> %12, <2 x double>* %2, align 1 + store <2 x double> %13, <2 x double>* %5, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16 + %_grt_tmp = icmp sgt i64 %indvars.iv.next, %1 + br i1 %_grt_tmp, label %_return_bb, label %_loop_2_do_ + +_return_bb: ; preds = %_loop_2_do_, %test_entry + ret void +} + +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) From ad4ab81dccaa72d9b5137433a0923d325ff76135 Mon Sep 17 00:00:00 2001 From: Kristina Bessonova Date: Fri, 29 May 2020 13:14:51 +0200 Subject: [PATCH 0273/1035] [clang][cmake] Force CMAKE_LINKER for multistage build in case of BOOTSTRAP_LLVM_ENABLE_LLD and MSVC The issue with LLVM_ENABLE_LLD is that it just passes -fuse-ld=lld to compiler/linker options which makes sense only for those platforms where cmake invokes a compiler driver for linking. On Windows (MSVC) cmake invokes the linker directly and requires CMAKE_LINKER to be specified otherwise it defaults CMAKE_LINKER to be link.exe. 
This patch allows BOOTSTRAP_LLVM_ENABLE_LLD to set CMAKE_LINKER in two cases:
* if building for host Windows,
* if cross-compiling for target Windows.
It also skips adding '-fuse-ld=lld' so that lld-link does not warn about an
'unknown argument'.

This fixes the build with `clang/cmake/caches/DistributionExample.cmake` on
Windows.

Reviewed By: phosek

Differential Revision: https://reviews.llvm.org/D80873
---
 clang/CMakeLists.txt                       | 8 ++++++++
 llvm/cmake/modules/HandleLLVMOptions.cmake | 7 ++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index 1a6a20a271f36..0f08538495fca 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -753,6 +753,14 @@ if (CLANG_ENABLE_BOOTSTRAP)
       -DCMAKE_ASM_COMPILER=${LLVM_RUNTIME_OUTPUT_INTDIR}/${C_COMPILER}
       -DCMAKE_ASM_COMPILER_ID=Clang)
 
+  # cmake requires CMAKE_LINKER to be specified if the compiler is MSVC-like,
+  # otherwise it defaults the linker to be link.exe.
+  if(BOOTSTRAP_LLVM_ENABLE_LLD)
+    if((WIN32 AND NOT BOOTSTRAP_CMAKE_SYSTEM_NAME) OR BOOTSTRAP_CMAKE_SYSTEM_NAME STREQUAL "Windows")
+      set(${CLANG_STAGE}_LINKER -DCMAKE_LINKER=${LLVM_RUNTIME_OUTPUT_INTDIR}/lld-link${CMAKE_EXECUTABLE_SUFFIX})
+    endif()
+  endif()
+
   if(BOOTSTRAP_CMAKE_SYSTEM_NAME)
     set(${CLANG_STAGE}_CONFIG -DLLVM_CONFIG_PATH=${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-config)
     set(${CLANG_STAGE}_TABLEGEN
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 62dd0ef79cf48..89f7016a7db44 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -261,7 +261,12 @@ if( LLVM_ENABLE_LLD )
   if ( LLVM_USE_LINKER )
     message(FATAL_ERROR "LLVM_ENABLE_LLD and LLVM_USE_LINKER can't be set at the same time")
   endif()
-  set(LLVM_USE_LINKER "lld")
+  # In case of MSVC cmake always invokes the linker directly, so the linker
+  # should be specified by CMAKE_LINKER cmake variable instead of by -fuse-ld
+  # compiler option.
+  if ( NOT MSVC )
+    set(LLVM_USE_LINKER "lld")
+  endif()
 endif()
 
 if( LLVM_USE_LINKER )

From ee068aafbc5c6722158d5113290a211503e1cfe4 Mon Sep 17 00:00:00 2001
From: Georgii Rymar
Date: Thu, 23 Jul 2020 13:32:05 +0300
Subject: [PATCH 0274/1035] [llvm-readelf] - Do not treat SHT_ANDROID_RELR
 sections the same as SHT_RELR.

Currently, when dumping section headers, llvm-readelf prints "RELR" for
SHT_ANDROID_RELR/SHT_RELR sections.

The behavior was introduced in D47919 and revealed in D84330.
But "SHT_ANDROID_RELR" has a different value from "SHT_RELR".
Also, "SHT_ANDROID_REL/SHT_ANDROID_RELA" are printed as
"ANDROID_REL/ANDROID_RELA", which makes the handling of
"SHT_ANDROID_RELR" inconsistent.

This patch makes llvm-readelf print "ANDROID_RELR" instead of "RELR".
Differential revision: https://reviews.llvm.org/D84393
---
 llvm/test/tools/llvm-readobj/ELF/section-types.test | 2 +-
 llvm/tools/llvm-readobj/ELFDumper.cpp               | 5 -----
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/llvm/test/tools/llvm-readobj/ELF/section-types.test b/llvm/test/tools/llvm-readobj/ELF/section-types.test
index 8718f5894c68a..edbc59772f88b 100644
--- a/llvm/test/tools/llvm-readobj/ELF/section-types.test
+++ b/llvm/test/tools/llvm-readobj/ELF/section-types.test
@@ -110,7 +110,7 @@
 # GNU-NEXT: relr                     RELR
 # GNU-NEXT: android_rel              ANDROID_REL
 # GNU-NEXT: android_rela             ANDROID_RELA
-# GNU-NEXT: android_relr             RELR
+# GNU-NEXT: android_relr             ANDROID_RELR
 # GNU-NEXT: llvm_odrtab              LLVM_ODRTAB
 # GNU-NEXT: linker_options           LLVM_LINKER_OPTIONS
 # GNU-NEXT: llvm_call_graph_profile  LLVM_CALL_GRAPH_PROFILE
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index 16270cb0f58c2..72dcb9c5a15aa 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -3757,11 +3757,6 @@ static std::string getSectionTypeString(unsigned Machine, unsigned Type) {
   if (Name == "SHT_SYMTAB_SHNDX")
     return "SYMTAB SECTION INDICES";
 
-  // The SHT_ANDROID_RELR is special, all other SHT_ANDROID_* types are handled
-  // in the common block below.
-  if (Name == "SHT_ANDROID_RELR")
-    return "RELR";
-
   if (Name.startswith("SHT_"))
     return Name.drop_front(4).str();
   return getSectionTypeOffsetString(Type);

From 3218c064d6d8c2cc910570c9a13cb859a2e91d08 Mon Sep 17 00:00:00 2001
From: serge-sans-paille
Date: Tue, 28 Jul 2020 10:55:09 +0200
Subject: [PATCH 0275/1035] [legacyPM] Do not compute preserved analysis if
 there's no local change

All analyses are preserved if there's no local change, and thanks to
3667d87a33d3c8d4072a41fd84bb880c59347dc0 this property is enforced for
all passes.

Skipping the dependency computation improves performance when there are
a lot of small functions and only a few changes happen.
Thanks to Nikita Popov who provided these numbers (extract below)

https://llvm-compile-time-tracker.com/compare.php?from=183342c0a9850e60dd7a004b651c83dfb3a7d25e&to=f2f91e6a2743070471cc9471e4e8c646e50c653c&stat=instructions

O3: (number of instructions)

Benchmark         Old       New
kimwitu++         60783M    59968M  (-1.34%)
sqlite3           73200M    73083M  (-0.16%)
consumer-typeset  52776M    52712M  (-0.12%)
Bullet            133709M   132940M (-0.58%)
tramp3d-v4        123864M   123186M (-0.55%)
mafft             55534M    55477M  (-0.10%)
ClamAV            76292M    76164M  (-0.17%)
lencod            103190M   103061M (-0.13%)
SPASS             64068M    63713M  (-0.55%)
7zip              197332M   196308M (-0.52%)
geomean           85750M    85389M  (-0.42%)

Differential Revision: https://reviews.llvm.org/D80707
---
 llvm/lib/IR/LegacyPassManager.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp
index 74869fa62c66f..c01696e4e575e 100644
--- a/llvm/lib/IR/LegacyPassManager.cpp
+++ b/llvm/lib/IR/LegacyPassManager.cpp
@@ -1614,7 +1614,8 @@ bool FPPassManager::runOnFunction(Function &F) {
       dumpUsedSet(FP);
 
       verifyPreservedAnalysis(FP);
-      removeNotPreservedAnalysis(FP);
+      if (LocalChanged)
+        removeNotPreservedAnalysis(FP);
       recordAvailableAnalysis(FP);
       removeDeadPasses(FP, F.getName(), ON_FUNCTION_MSG);
     }
@@ -1723,7 +1724,8 @@ MPPassManager::runOnModule(Module &M) {
       dumpUsedSet(MP);
 
       verifyPreservedAnalysis(MP);
-      removeNotPreservedAnalysis(MP);
+      if (LocalChanged)
+        removeNotPreservedAnalysis(MP);
       recordAvailableAnalysis(MP);
       removeDeadPasses(MP, M.getModuleIdentifier(), ON_MODULE_MSG);
     }

From 182111777b4ec215eeebe8ab5cc2a324e2f055ff Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 28 Jul 2020 09:52:38 +0100
Subject: [PATCH 0276/1035] [X86][SSE] Attempt to match
 OP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y))

An initial backend patch towards fixing the various poor HADD combines
(PR34724, PR41813, PR45747 etc.).

This extends isHorizontalBinOp to check if we have per-element
horizontal ops (odd+even element pairs), but not in the expected serial
order - in which case we build a "post shuffle mask" that we can apply
to the HOP result, assuming we have fast-hops/optsize etc.

The next step will be to extend the SHUFFLE(HOP(X,Y)) combines as
suggested on PR41813 - accepting more post-shuffle masks even on
slow-hop targets if we can fold it into another shuffle.

Differential Revision: https://reviews.llvm.org/D83789
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  74 ++++++--
 llvm/test/CodeGen/X86/haddsub-3.ll            |  54 ++++--
 llvm/test/CodeGen/X86/haddsub-4.ll            | 169 +++++-------------
 llvm/test/CodeGen/X86/haddsub-shuf.ll         | 125 ++++---------
 llvm/test/CodeGen/X86/haddsub-undef.ll        |  98 ++++++----
 .../test/CodeGen/X86/vector-shuffle-256-v4.ll |  48 +++--
 .../test/CodeGen/X86/vector-shuffle-256-v8.ll | 162 ++++-------------
 7 files changed, 321 insertions(+), 409 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 390a20c3e71f5..5eadd9c287c79 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -44371,8 +44371,8 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
 /// A horizontal-op B, for some already available A and B, and if so then LHS is
 /// set to A, RHS to B, and the routine returns 'true'.
 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
-                              const X86Subtarget &Subtarget,
-                              bool IsCommutative) {
+                              const X86Subtarget &Subtarget, bool IsCommutative,
+                              SmallVectorImpl<int> &PostShuffleMask) {
   // If either operand is undef, bail out. The binop should be simplified.
   if (LHS.isUndef() || RHS.isUndef())
     return false;
@@ -44465,6 +44465,12 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
       RMask.push_back(i);
   }
 
+  // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split).
+  if (!Subtarget.hasAVX2() && VT.isFloatingPoint() &&
+      (isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) ||
+       isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask)))
+    return false;
+
   // If A and B occur in reverse order in RHS, then canonicalize by commuting
   // RHS operands and shuffle mask.
   if (A != C) {
@@ -44475,6 +44481,9 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
   if (!(A == C && B == D))
     return false;
 
+  PostShuffleMask.clear();
+  PostShuffleMask.append(NumElts, SM_SentinelUndef);
+
   // LHS and RHS are now:
   //   LHS = shuffle A, B, LMask
   //   RHS = shuffle A, B, RMask
@@ -44483,6 +44492,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
   // so we just repeat the inner loop if this is a 256-bit op.
   unsigned Num128BitChunks = VT.getSizeInBits() / 128;
   unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
+  unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
   assert((NumEltsPer128BitChunk % 2 == 0) &&
          "Vector type should have an even number of elements in each lane");
   for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
@@ -44494,25 +44504,40 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
         continue;
 
+      // Check that successive odd/even elements are being operated on. If not,
+      // this is not a horizontal operation.
+      if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
+          !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
+        return false;
+
+      // Compute the post-shuffle mask index based on where the element
+      // is stored in the HOP result, and where it needs to be moved to.
+      int Base = LIdx & ~1u;
+      int Index = ((Base % NumEltsPer128BitChunk) / 2) +
+                  ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
+
       // The low half of the 128-bit result must choose from A.
       // The high half of the 128-bit result must choose from B,
      // unless B is undef. In that case, we are always choosing from A.
-      unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
-      unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
-
-      // Check that successive elements are being operated on. If not, this is
-      // not a horizontal operation.
-      int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
-      if (!(LIdx == Index && RIdx == Index + 1) &&
-          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
-        return false;
+      if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
+        Index += NumEltsPer64BitChunk;
+
+      PostShuffleMask[i + j] = Index;
     }
   }
 
   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
- if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget)) + bool IsIdentityPostShuffle = + isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0); + if (IsIdentityPostShuffle) + PostShuffleMask.clear(); + + // Assume a SingleSource HOP if we only shuffle one input and don't need to + // shuffle the result. + if (!shouldUseHorizontalOp(LHS == RHS && + (NumShuffles < 2 || !IsIdentityPostShuffle), + DAG, Subtarget)) return false; LHS = DAG.getBitcast(VT, LHS); @@ -44531,10 +44556,16 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode"); // Try to synthesize horizontal add/sub from adds/subs of shuffles. + SmallVector PostShuffleMask; if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && - isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd)) - return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); + isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) { + SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); + if (!PostShuffleMask.empty()) + HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp, + DAG.getUNDEF(VT), PostShuffleMask); + return HorizBinOp; + } // NOTE: isHorizontalBinOp may have changed LHS/RHS variables. @@ -47636,17 +47667,22 @@ static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG, bool IsAdd = N->getOpcode() == ISD::ADD; assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode"); + SmallVector PostShuffleMask; if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasSSSE3() && - isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd)) { + isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask)) { auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { - return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, - DL, Ops[0].getValueType(), Ops); + return DAG.getNode(IsAdd ? 
X86ISD::HADD : X86ISD::HSUB, DL, + Ops[0].getValueType(), Ops); }; - return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, - HOpBuilder); + SDValue HorizBinOp = + SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder); + if (!PostShuffleMask.empty()) + HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp, + DAG.getUNDEF(VT), PostShuffleMask); + return HorizBinOp; } return SDValue(); diff --git a/llvm/test/CodeGen/X86/haddsub-3.ll b/llvm/test/CodeGen/X86/haddsub-3.ll index f603ace202a1e..6abba1bbfe9f3 100644 --- a/llvm/test/CodeGen/X86/haddsub-3.ll +++ b/llvm/test/CodeGen/X86/haddsub-3.ll @@ -17,22 +17,46 @@ define float @pr26491(<4 x float> %a0) { ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; -; SSSE3-LABEL: pr26491: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSSE3-NEXT: addps %xmm0, %xmm1 -; SSSE3-NEXT: movaps %xmm1, %xmm0 -; SSSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSSE3-NEXT: addss %xmm1, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: pr26491: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: pr26491: -; AVX: # %bb.0: -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: pr26491: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0 +; SSSE3-FAST-NEXT: movaps %xmm0, %xmm1 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSSE3-FAST-NEXT: addss %xmm0, %xmm1 +; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: pr26491: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: pr26491: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: pr26491: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> %2 = fadd <4 x float> %1, %a0 %3 = extractelement <4 x float> %2, i32 2 diff --git a/llvm/test/CodeGen/X86/haddsub-4.ll b/llvm/test/CodeGen/X86/haddsub-4.ll index 5c8e9a7c72f2a..4c1dc71982aa4 100644 --- a/llvm/test/CodeGen/X86/haddsub-4.ll +++ b/llvm/test/CodeGen/X86/haddsub-4.ll @@ -9,30 +9,16 @@ define <8 x i16> @hadd_reverse_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind { ; SSE-LABEL: hadd_reverse_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [14,15,10,11,6,7,2,3,14,15,10,11,12,13,14,15] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pshufb %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pshufb %xmm2, %xmm4 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [12,13,8,9,4,5,0,1,8,9,12,13,12,13,14,15] -; SSE-NEXT: pshufb %xmm2, %xmm1 -; SSE-NEXT: pshufb %xmm2, 
%xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: paddw %xmm4, %xmm0 +; SSE-NEXT: phaddw %xmm1, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; SSE-NEXT: retq ; ; AVX-LABEL: hadd_reverse_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,10,11,6,7,2,3,14,15,10,11,12,13,14,15] -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,8,9,4,5,0,1,8,9,12,13,12,13,14,15] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vpaddw %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; AVX-NEXT: retq %lhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> %rhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> @@ -67,67 +53,34 @@ define <8 x i16> @hadd_reverse2_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind { define <16 x i16> @hadd_reverse_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind { ; SSE-LABEL: hadd_reverse_v16i16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [14,15,10,11,6,7,2,3,14,15,10,11,12,13,14,15] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pshufb %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pshufb %xmm0, %xmm6 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pshufb %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: pshufb %xmm0, %xmm7 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [12,13,8,9,4,5,0,1,8,9,12,13,12,13,14,15] -; SSE-NEXT: pshufb %xmm0, %xmm2 -; SSE-NEXT: pshufb %xmm0, %xmm4 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-NEXT: paddw %xmm6, %xmm4 -; SSE-NEXT: pshufb %xmm0, %xmm3 -; SSE-NEXT: pshufb %xmm0, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: paddw %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: phaddw %xmm3, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,5,4] +; SSE-NEXT: phaddw %xmm2, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4] +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: hadd_reverse_v16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [14,15,10,11,6,7,2,3,14,15,10,11,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm6 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm6 -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13,8,9,4,5,0,1,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] -; AVX1-NEXT: vpaddw %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpaddw 
%xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vphaddw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; AVX1-NEXT: vphaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,0],ymm3[6,4],ymm2[6,4] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,12,13,8,9,4,5,0,1,u,u,u,u,u,u,u,u,28,29,24,25,20,21,16,17] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,u,u,u,u,u,u,u,u,28,29,24,25,20,21,16,17,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq %lhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> %rhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> @@ -209,21 +162,11 @@ define <8 x double> @hadd_reverse_v8f64(<8 x double> %a0, <8 x double> %a1) noun ; ; AVX2-LABEL: hadd_reverse_v8f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[0,3,2,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,3,2,1] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm3[0,3,2,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[0,3,2,1] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,0,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,0,3] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX2-NEXT: vaddpd %ymm0, %ymm4, %ymm2 -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[2,1,0,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-NEXT: vaddpd %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vmovapd %ymm2, %ymm1 +; AVX2-NEXT: vhaddpd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX2-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vmovapd %ymm3, %ymm0 ; AVX2-NEXT: retq %lhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> %rhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> @@ -290,22 +233,14 @@ define <16 x float> @hadd_reverse_v16f32(<16 x float> %a0, <16 x float> %a1) nou ; SSE: # %bb.0: ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm4[3,1] -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm0[3,1] -; SSE-NEXT: movaps %xmm7, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm6[3,1] -; SSE-NEXT: 
movaps %xmm3, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm2[3,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,0] -; SSE-NEXT: addps %xmm1, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,0] -; SSE-NEXT: addps %xmm9, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,0] -; SSE-NEXT: addps %xmm10, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,0] -; SSE-NEXT: addps %xmm11, %xmm3 +; SSE-NEXT: haddps %xmm2, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0,3,2] +; SSE-NEXT: haddps %xmm6, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0,3,2] +; SSE-NEXT: haddps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0,3,2] +; SSE-NEXT: haddps %xmm4, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0,3,2] ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps %xmm7, %xmm1 ; SSE-NEXT: movaps %xmm5, %xmm2 @@ -316,29 +251,23 @@ define <16 x float> @hadd_reverse_v16f32(<16 x float> %a0, <16 x float> %a1) nou ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm2[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm0[3,1],ymm4[7,5],ymm0[7,5] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm3[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 -; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm5[3,1],ymm3[3,1],ymm5[7,5],ymm3[7,5] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm0[2,0],ymm4[6,4],ymm0[6,4] -; AVX1-NEXT: vaddps %ymm0, %ymm2, %ymm1 -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm5[2,0],ymm3[2,0],ymm5[6,4],ymm3[6,4] -; AVX1-NEXT: vaddps %ymm0, %ymm6, %ymm0 +; AVX1-NEXT: vhaddps %ymm0, %ymm4, %ymm2 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm3[2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[1,0,3,2,5,4,7,6] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse_v16f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,1],ymm2[3,1],ymm0[7,5],ymm2[7,5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,0,3,1] -; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,1],ymm3[3,1],ymm1[7,5],ymm3[7,5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,0,3,1] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,0],ymm0[6,4],ymm2[6,4] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] -; AVX2-NEXT: vaddps %ymm0, %ymm4, %ymm2 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm3[2,0],ymm1[6,4],ymm3[6,4] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] -; AVX2-NEXT: vaddps %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vmovaps %ymm2, %ymm1 +; AVX2-NEXT: vhaddps %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,0,3,1] +; AVX2-NEXT: vhaddps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,0,3,1] +; AVX2-NEXT: vmovaps %ymm3, %ymm0 ; AVX2-NEXT: retq %lhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> %rhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll index 7bedbeb581099..76ef7afbebf33 100644 --- a/llvm/test/CodeGen/X86/haddsub-shuf.ll +++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -879,77 +879,59 @@ declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) { ; SSSE3_SLOW-LABEL: PR34724_1: ; SSSE3_SLOW: # %bb.0: 
-; SSSE3_SLOW-NEXT: movaps %xmm1, %xmm2 -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,2] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] ; SSSE3_SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSSE3_SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3_SLOW-NEXT: addps %xmm0, %xmm2 -; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSSE3_SLOW-NEXT: addps %xmm1, %xmm0 -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[1,0] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0] -; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm0 +; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm0 +; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSSE3_SLOW-NEXT: addps %xmm1, %xmm2 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0] ; SSSE3_SLOW-NEXT: retq ; ; SSSE3_FAST-LABEL: PR34724_1: ; SSSE3_FAST: # %bb.0: -; SSSE3_FAST-NEXT: movaps %xmm1, %xmm2 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,2] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] ; SSSE3_FAST-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSSE3_FAST-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3_FAST-NEXT: addps %xmm0, %xmm2 +; SSSE3_FAST-NEXT: haddps %xmm1, %xmm0 ; SSSE3_FAST-NEXT: haddps %xmm1, %xmm1 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[1,0] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0] -; SSSE3_FAST-NEXT: movaps %xmm2, %xmm0 +; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] +; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] ; SSSE3_FAST-NEXT: retq ; ; AVX1_SLOW-LABEL: PR34724_1: ; AVX1_SLOW: # %bb.0: -; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX1_SLOW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero -; AVX1_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX1_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; AVX1_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3] ; AVX1_SLOW-NEXT: retq ; ; AVX1_FAST-LABEL: PR34724_1: ; AVX1_FAST: # %bb.0: -; AVX1_FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX1_FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero -; AVX1_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX1_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX1_FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3] ; AVX1_FAST-NEXT: retq ; ; AVX2_SLOW-LABEL: PR34724_1: ; AVX2_SLOW: # %bb.0: -; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX2_SLOW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero -; AVX2_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX2_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2_SLOW-NEXT: vhaddps %xmm1, %xmm0, 
%xmm0 ; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3] ; AVX2_SLOW-NEXT: retq ; ; AVX2_FAST-LABEL: PR34724_1: ; AVX2_FAST: # %bb.0: -; AVX2_FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX2_FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero -; AVX2_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX2_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX2_FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3] ; AVX2_FAST-NEXT: retq %t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> %t1 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> @@ -964,78 +946,49 @@ define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) { define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) { ; SSSE3_SLOW-LABEL: PR34724_2: ; SSSE3_SLOW: # %bb.0: -; SSSE3_SLOW-NEXT: movaps %xmm1, %xmm2 -; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSSE3_SLOW-NEXT: addps %xmm1, %xmm3 -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,0] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] +; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm0 +; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; SSSE3_SLOW-NEXT: addps %xmm1, %xmm2 -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[1,0] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[2,0] -; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm0 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSSE3_SLOW-NEXT: retq ; ; SSSE3_FAST-LABEL: PR34724_2: ; SSSE3_FAST: # %bb.0: -; SSSE3_FAST-NEXT: movaps %xmm1, %xmm3 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[2,0] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] -; SSSE3_FAST-NEXT: movaps %xmm1, %xmm2 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,0] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] -; SSSE3_FAST-NEXT: addps %xmm3, %xmm2 +; SSSE3_FAST-NEXT: haddps %xmm1, %xmm0 ; SSSE3_FAST-NEXT: haddps %xmm1, %xmm1 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[1,0] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0] -; SSSE3_FAST-NEXT: movaps %xmm2, %xmm0 +; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] +; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSSE3_FAST-NEXT: retq ; ; AVX1_SLOW-LABEL: PR34724_2: ; AVX1_SLOW: # %bb.0: -; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] -; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] -; AVX1_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX1_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; AVX1_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX1_SLOW-NEXT: 
vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX1_SLOW-NEXT: retq ; ; AVX1_FAST-LABEL: PR34724_2: ; AVX1_FAST: # %bb.0: -; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] -; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] -; AVX1_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX1_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX1_FAST-NEXT: retq ; ; AVX2_SLOW-LABEL: PR34724_2: ; AVX2_SLOW: # %bb.0: -; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] -; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] -; AVX2_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX2_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX2_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2_SLOW-NEXT: retq ; ; AVX2_FAST-LABEL: PR34724_2: ; AVX2_FAST: # %bb.0: -; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] -; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] -; AVX2_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX2_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2_FAST-NEXT: retq %t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll index f8648f5b7018f..ae53f2d8905fe 100644 --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -818,12 +818,25 @@ define <4 x double> @PR44694(<4 x double> %0, <4 x double> %1) { ; SSE-FAST-NEXT: movapd %xmm2, %xmm1 ; SSE-FAST-NEXT: retq ; -; AVX-LABEL: PR44694: -; AVX: # %bb.0: -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vhaddpd %ymm0, %ymm1, %ymm0 -; AVX-NEXT: retq +; AVX1-SLOW-LABEL: PR44694: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-SLOW-NEXT: vhaddpd %ymm0, %ymm1, %ymm0 +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: PR44694: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-FAST-NEXT: vhaddpd %ymm0, %ymm1, %ymm0 +; AVX1-FAST-NEXT: retq +; +; AVX512-LABEL: PR44694: +; AVX512: # %bb.0: +; AVX512-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> %5 = fadd <4 x double> 
%3, %4 @@ -831,20 +844,30 @@ define <4 x double> @PR44694(<4 x double> %0, <4 x double> %1) { } define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind { -; SSE-LABEL: PR45747_1: -; SSE: # %bb.0: -; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE-NEXT: addps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-SLOW-LABEL: PR45747_1: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE-SLOW-NEXT: addps %xmm0, %xmm1 +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq ; -; AVX-LABEL: PR45747_1: -; AVX: # %bb.0: -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: retq +; SSE-FAST-LABEL: PR45747_1: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: PR45747_1: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: PR45747_1: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %t0 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %t1 = fadd <4 x float> %t0, %a %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> @@ -852,19 +875,32 @@ define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind { } define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind { -; SSE-LABEL: PR45747_2: -; SSE: # %bb.0: -; SSE-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE-NEXT: addps %xmm1, %xmm0 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: retq +; SSE-SLOW-LABEL: PR45747_2: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-SLOW-NEXT: retq ; -; AVX-LABEL: PR45747_2: -; AVX: # %bb.0: -; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-NEXT: retq +; SSE-FAST-LABEL: PR45747_2: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: movaps %xmm1, %xmm0 +; SSE-FAST-NEXT: haddps %xmm1, %xmm0 +; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: PR45747_2: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: PR45747_2: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm0 +; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX-FAST-NEXT: retq %t0 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> %t1 = fadd <4 x float> %t0, %b %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll index 5195f5f0e0c75..1fd61912ed4af 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -1766,12 +1766,24 @@ define <4 x i64> @shuffle_v4i64_1z2z(<4 x i64> %a, <4 x i64> %b) { } define <4 x double> @add_v4f64_0246_1357(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: add_v4f64_0246_1357: -; ALL: 
# %bb.0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: add_v4f64_0246_1357: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: add_v4f64_0246_1357: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: add_v4f64_0246_1357: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-NEXT: retq entry: %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> %shuffle1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -1780,12 +1792,24 @@ entry: } define <4 x double> @add_v4f64_4602_5713(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: add_v4f64_4602_5713: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: add_v4f64_4602_5713: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: add_v4f64_4602_5713: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: add_v4f64_4602_5713: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX512VL-NEXT: retq entry: %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> %shuffle1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll index 4798b4b1d38a2..3077428386a26 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -3039,32 +3039,11 @@ define <8 x float> @add_v8f32_02468ACE_13579BDF(<8 x float> %a, <8 x float> %b) ; AVX1-NEXT: vhaddps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: add_v8f32_02468ACE_13579BDF: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-SLOW-LABEL: add_v8f32_02468ACE_13579BDF: -; AVX512VL-SLOW: # %bb.0: # %entry -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-SLOW-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; AVX512VL-SLOW-NEXT: retq -; -; AVX512VL-FAST-LABEL: add_v8f32_02468ACE_13579BDF: -; AVX512VL-FAST: # %bb.0: # %entry -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14] -; AVX512VL-FAST-NEXT: vpermi2ps %ymm1, %ymm0, %ymm2 -; 
AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15] -; AVX512VL-FAST-NEXT: vpermi2ps %ymm1, %ymm0, %ymm3 -; AVX512VL-FAST-NEXT: vaddps %ymm3, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX2OR512VL-LABEL: add_v8f32_02468ACE_13579BDF: +; AVX2OR512VL: # %bb.0: # %entry +; AVX2OR512VL-NEXT: vhaddps %ymm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2OR512VL-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> %shuffle1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -3080,32 +3059,11 @@ define <8 x float> @add_v8f32_8ACE0246_9BDF1357(<8 x float> %a, <8 x float> %b) ; AVX1-NEXT: vhaddps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: add_v8f32_8ACE0246_9BDF1357: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-SLOW-LABEL: add_v8f32_8ACE0246_9BDF1357: -; AVX512VL-SLOW: # %bb.0: # %entry -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-SLOW-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; AVX512VL-SLOW-NEXT: retq -; -; AVX512VL-FAST-LABEL: add_v8f32_8ACE0246_9BDF1357: -; AVX512VL-FAST: # %bb.0: # %entry -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14] -; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15] -; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm3 -; AVX512VL-FAST-NEXT: vaddps %ymm3, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX2OR512VL-LABEL: add_v8f32_8ACE0246_9BDF1357: +; AVX2OR512VL: # %bb.0: # %entry +; AVX2OR512VL-NEXT: vhaddps %ymm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2OR512VL-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> %shuffle1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -3116,45 +3074,21 @@ entry: define <8 x i32> @add_v8i32_02468ACE_13579BDF(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: add_v8i32_02468ACE_13579BDF: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[3] ; AVX1-NEXT: retq ; -; AVX2-LABEL: add_v8i32_02468ACE_13579BDF: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = 
ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-SLOW-LABEL: add_v8i32_02468ACE_13579BDF: -; AVX512VL-SLOW: # %bb.0: # %entry -; AVX512VL-SLOW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VL-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 -; AVX512VL-SLOW-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-SLOW-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX512VL-SLOW-NEXT: retq -; -; AVX512VL-FAST-LABEL: add_v8i32_02468ACE_13579BDF: -; AVX512VL-FAST: # %bb.0: # %entry -; AVX512VL-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VL-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 -; AVX512VL-FAST-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15] -; AVX512VL-FAST-NEXT: vpermi2d %ymm1, %ymm0, %ymm3 -; AVX512VL-FAST-NEXT: vpaddd %ymm3, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX2OR512VL-LABEL: add_v8i32_02468ACE_13579BDF: +; AVX2OR512VL: # %bb.0: # %entry +; AVX2OR512VL-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2OR512VL-NEXT: retq entry: %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> %shuffle1 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -3165,45 +3099,21 @@ entry: define <8 x i32> @add_v8i32_8ACE0246_9BDF1357(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: add_v8i32_8ACE0246_9BDF1357: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[2],ymm0[2] ; AVX1-NEXT: retq ; -; AVX2-LABEL: add_v8i32_8ACE0246_9BDF1357: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-SLOW-LABEL: add_v8i32_8ACE0246_9BDF1357: -; AVX512VL-SLOW: # %bb.0: # %entry -; AVX512VL-SLOW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512VL-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm2 -; AVX512VL-SLOW-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-SLOW-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX512VL-SLOW-NEXT: retq -; -; 
AVX512VL-FAST-LABEL: add_v8i32_8ACE0246_9BDF1357: -; AVX512VL-FAST: # %bb.0: # %entry -; AVX512VL-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512VL-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm2 -; AVX512VL-FAST-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15] -; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm3 -; AVX512VL-FAST-NEXT: vpaddd %ymm3, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX2OR512VL-LABEL: add_v8i32_8ACE0246_9BDF1357: +; AVX2OR512VL: # %bb.0: # %entry +; AVX2OR512VL-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2OR512VL-NEXT: retq entry: %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> %shuffle1 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> From e2f5444c9ce2e0558a69fb32bd001017244eeebc Mon Sep 17 00:00:00 2001 From: Joachim Protze Date: Tue, 28 Jul 2020 11:08:24 +0200 Subject: [PATCH 0277/1035] [OpenMP][Tests] Enable nvptx64 testing for most libomptarget tests Also add $BUILD/lib to the LIBRARY_PATH to fix https://bugs.llvm.org/show_bug.cgi?id=46836. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D84557 --- openmp/libomptarget/test/env/omp_target_debug.c | 2 ++ openmp/libomptarget/test/lit.cfg | 3 +++ openmp/libomptarget/test/mapping/alloc_fail.c | 4 ++++ openmp/libomptarget/test/mapping/declare_mapper_api.cpp | 1 + openmp/libomptarget/test/mapping/declare_mapper_target.cpp | 2 +- .../libomptarget/test/mapping/declare_mapper_target_data.cpp | 2 +- .../test/mapping/declare_mapper_target_data_enter_exit.cpp | 2 +- .../test/mapping/declare_mapper_target_update.cpp | 2 +- openmp/libomptarget/test/mapping/delete_inf_refcount.c | 1 + openmp/libomptarget/test/mapping/pr38704.c | 1 + openmp/libomptarget/test/offloading/d2d_memcpy.c | 5 ++--- openmp/libomptarget/test/offloading/dynamic_module.c | 1 + openmp/libomptarget/test/offloading/dynamic_module_load.c | 1 + openmp/libomptarget/test/offloading/looptripcnt.c | 1 + openmp/libomptarget/test/offloading/offloading_success.c | 1 + openmp/libomptarget/test/offloading/offloading_success.cpp | 1 + .../libomptarget/test/offloading/parallel_offloading_map.c | 1 + openmp/libomptarget/test/offloading/requires.c | 1 + openmp/libomptarget/test/offloading/target_depend_nowait.cpp | 1 + 19 files changed, 26 insertions(+), 7 deletions(-) diff --git a/openmp/libomptarget/test/env/omp_target_debug.c b/openmp/libomptarget/test/env/omp_target_debug.c index ce84c9842f64f..34a71793d9271 100644 --- a/openmp/libomptarget/test/env/omp_target_debug.c +++ b/openmp/libomptarget/test/env/omp_target_debug.c @@ -6,6 +6,8 @@ // RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=NDEBUG // RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG // RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=NDEBUG +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-nvptx64-nvidia-cuda 2>&1 | %fcheck-nvptx64-nvidia-cuda -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-nvptx64-nvidia-cuda 2>&1 | 
%fcheck-nvptx64-nvidia-cuda -allow-empty -check-prefix=NDEBUG // REQUIRES: libomptarget-debug int main(void) { diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg index 3dbe8789d5162..77476c6fec793 100644 --- a/openmp/libomptarget/test/lit.cfg +++ b/openmp/libomptarget/test/lit.cfg @@ -71,6 +71,9 @@ else: # Unices append_dynamic_library_path('LD_LIBRARY_PATH', config.library_dir, ":") append_dynamic_library_path('LD_LIBRARY_PATH', \ config.omp_host_rtl_directory, ":") + append_dynamic_library_path('LIBRARY_PATH', config.library_dir, ":") + append_dynamic_library_path('LIBRARY_PATH', \ + config.omp_host_rtl_directory, ":") # substitutions # - for targets that exist in the system create the actual command. diff --git a/openmp/libomptarget/test/mapping/alloc_fail.c b/openmp/libomptarget/test/mapping/alloc_fail.c index ca15a429f130c..6d1f708dcb2ec 100644 --- a/openmp/libomptarget/test/mapping/alloc_fail.c +++ b/openmp/libomptarget/test/mapping/alloc_fail.c @@ -14,6 +14,10 @@ // RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \ // RUN: | %fcheck-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda +// RUN: %libomptarget-run-fail-nvptx64-nvidia-cuda 2>&1 \ +// RUN: | %fcheck-nvptx64-nvidia-cuda + // CHECK: Libomptarget fatal error 1: failure of target construct while offloading is mandatory int main() { diff --git a/openmp/libomptarget/test/mapping/declare_mapper_api.cpp b/openmp/libomptarget/test/mapping/declare_mapper_api.cpp index 45bc076d4110a..54a5ad61538b9 100644 --- a/openmp/libomptarget/test/mapping/declare_mapper_api.cpp +++ b/openmp/libomptarget/test/mapping/declare_mapper_api.cpp @@ -2,6 +2,7 @@ // RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu // RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu // RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda #include #include diff --git a/openmp/libomptarget/test/mapping/declare_mapper_target.cpp b/openmp/libomptarget/test/mapping/declare_mapper_target.cpp index 6246285283393..096431f6edc0b 100644 --- a/openmp/libomptarget/test/mapping/declare_mapper_target.cpp +++ b/openmp/libomptarget/test/mapping/declare_mapper_target.cpp @@ -1,8 +1,8 @@ // RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda // RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu // RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu // RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda #include #include diff --git a/openmp/libomptarget/test/mapping/declare_mapper_target_data.cpp b/openmp/libomptarget/test/mapping/declare_mapper_target_data.cpp index b457048a2e580..e2636f157d659 100644 --- a/openmp/libomptarget/test/mapping/declare_mapper_target_data.cpp +++ b/openmp/libomptarget/test/mapping/declare_mapper_target_data.cpp @@ -1,8 +1,8 @@ // RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda // RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu // RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu // RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda #include #include diff --git 
a/openmp/libomptarget/test/mapping/declare_mapper_target_data_enter_exit.cpp b/openmp/libomptarget/test/mapping/declare_mapper_target_data_enter_exit.cpp index ac915a08ea8e2..44fcc20e8ee13 100644 --- a/openmp/libomptarget/test/mapping/declare_mapper_target_data_enter_exit.cpp +++ b/openmp/libomptarget/test/mapping/declare_mapper_target_data_enter_exit.cpp @@ -1,8 +1,8 @@ // RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda // RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu // RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu // RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda #include #include diff --git a/openmp/libomptarget/test/mapping/declare_mapper_target_update.cpp b/openmp/libomptarget/test/mapping/declare_mapper_target_update.cpp index 689275962f2c5..c894c92dc09a6 100644 --- a/openmp/libomptarget/test/mapping/declare_mapper_target_update.cpp +++ b/openmp/libomptarget/test/mapping/declare_mapper_target_update.cpp @@ -1,8 +1,8 @@ // RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda // RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu // RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu // RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda #include #include diff --git a/openmp/libomptarget/test/mapping/delete_inf_refcount.c b/openmp/libomptarget/test/mapping/delete_inf_refcount.c index b4106be04ab73..e6b6e94326968 100644 --- a/openmp/libomptarget/test/mapping/delete_inf_refcount.c +++ b/openmp/libomptarget/test/mapping/delete_inf_refcount.c @@ -2,6 +2,7 @@ // RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu // RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu // RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda #include #include diff --git a/openmp/libomptarget/test/mapping/pr38704.c b/openmp/libomptarget/test/mapping/pr38704.c index 3e7135e284114..c4e80ca44f6b9 100644 --- a/openmp/libomptarget/test/mapping/pr38704.c +++ b/openmp/libomptarget/test/mapping/pr38704.c @@ -2,6 +2,7 @@ // RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu // RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu // RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda // Clang 6.0 doesn't use the new map interface, undefined behavior when // the compiler emits "old" interface code for structures. 
diff --git a/openmp/libomptarget/test/offloading/d2d_memcpy.c b/openmp/libomptarget/test/offloading/d2d_memcpy.c index 4c5f2c2ef5ffa..968f7112cd8b0 100644 --- a/openmp/libomptarget/test/offloading/d2d_memcpy.c +++ b/openmp/libomptarget/test/offloading/d2d_memcpy.c @@ -2,6 +2,7 @@ // RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64-ibm-linux-gnu | %fcheck-powerpc64-ibm-linux-gnu // RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64le-ibm-linux-gnu | %fcheck-powerpc64le-ibm-linux-gnu // RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-x86_64-pc-linux-gnu | %fcheck-x86_64-pc-linux-gnu -allow-empty +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-nvptx64-nvidia-cuda | %fcheck-nvptx64-nvidia-cuda -allow-empty #include #include @@ -21,9 +22,7 @@ int main(int argc, char *argv[]) { } const int src_device = 0; - int dst_device = 1; - if (dst_device >= num_devices) - dst_device = num_devices - 1; + int dst_device = num_devices - 1; int length = N * sizeof(int); int *src_ptr = omp_target_alloc(length, src_device); diff --git a/openmp/libomptarget/test/offloading/dynamic_module.c b/openmp/libomptarget/test/offloading/dynamic_module.c index 4cb6f35ba8992..ba3c025ba30d0 100644 --- a/openmp/libomptarget/test/offloading/dynamic_module.c +++ b/openmp/libomptarget/test/offloading/dynamic_module.c @@ -2,6 +2,7 @@ // RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -DSHARED -fPIC -shared -o %t.so && %libomptarget-compile-powerpc64-ibm-linux-gnu %t.so && %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu // RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -DSHARED -fPIC -shared -o %t.so && %libomptarget-compile-powerpc64le-ibm-linux-gnu %t.so && %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu // RUN: %libomptarget-compile-x86_64-pc-linux-gnu -DSHARED -fPIC -shared -o %t.so && %libomptarget-compile-x86_64-pc-linux-gnu %t.so && %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -DSHARED -fPIC -shared -o %t.so && %libomptarget-compile-nvptx64-nvidia-cuda %t.so && %libomptarget-run-nvptx64-nvidia-cuda 2>&1 | %fcheck-nvptx64-nvidia-cuda #ifdef SHARED void foo() {} diff --git a/openmp/libomptarget/test/offloading/dynamic_module_load.c b/openmp/libomptarget/test/offloading/dynamic_module_load.c index 958fe2dec351c..f70d76523dc82 100644 --- a/openmp/libomptarget/test/offloading/dynamic_module_load.c +++ b/openmp/libomptarget/test/offloading/dynamic_module_load.c @@ -2,6 +2,7 @@ // RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -DSHARED -fPIC -shared -o %t.so && %clang %flags %s -o %t-powerpc64-ibm-linux-gnu -ldl && %libomptarget-run-powerpc64-ibm-linux-gnu %t.so 2>&1 | %fcheck-powerpc64-ibm-linux-gnu // RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -DSHARED -fPIC -shared -o %t.so && %clang %flags %s -o %t-powerpc64le-ibm-linux-gnu -ldl && %libomptarget-run-powerpc64le-ibm-linux-gnu %t.so 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu // RUN: %libomptarget-compile-x86_64-pc-linux-gnu -DSHARED -fPIC -shared -o %t.so && %clang %flags %s -o %t-x86_64-pc-linux-gnu -ldl && %libomptarget-run-x86_64-pc-linux-gnu %t.so 2>&1 | %fcheck-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -DSHARED -fPIC -shared -o %t.so && %clang %flags %s -o 
%t-nvptx64-nvidia-cuda -ldl && %libomptarget-run-nvptx64-nvidia-cuda %t.so 2>&1 | %fcheck-nvptx64-nvidia-cuda #ifdef SHARED #include diff --git a/openmp/libomptarget/test/offloading/looptripcnt.c b/openmp/libomptarget/test/offloading/looptripcnt.c index 025231b0c6d32..d4c3d6013328b 100644 --- a/openmp/libomptarget/test/offloading/looptripcnt.c +++ b/openmp/libomptarget/test/offloading/looptripcnt.c @@ -2,6 +2,7 @@ // RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG // RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG // RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-nvptx64-nvidia-cuda 2>&1 | %fcheck-nvptx64-nvidia-cuda -allow-empty -check-prefix=DEBUG // REQUIRES: libomptarget-debug /* diff --git a/openmp/libomptarget/test/offloading/offloading_success.c b/openmp/libomptarget/test/offloading/offloading_success.c index 12e78fac1f5a3..000a38cb5478c 100644 --- a/openmp/libomptarget/test/offloading/offloading_success.c +++ b/openmp/libomptarget/test/offloading/offloading_success.c @@ -2,6 +2,7 @@ // RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu // RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu // RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda #include #include diff --git a/openmp/libomptarget/test/offloading/offloading_success.cpp b/openmp/libomptarget/test/offloading/offloading_success.cpp index eecd97a3f317d..910cb1790e9d4 100644 --- a/openmp/libomptarget/test/offloading/offloading_success.cpp +++ b/openmp/libomptarget/test/offloading/offloading_success.cpp @@ -2,6 +2,7 @@ // RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu // RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu // RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda #include #include diff --git a/openmp/libomptarget/test/offloading/parallel_offloading_map.c b/openmp/libomptarget/test/offloading/parallel_offloading_map.c index 3bd59574747d5..afac7225f37a4 100644 --- a/openmp/libomptarget/test/offloading/parallel_offloading_map.c +++ b/openmp/libomptarget/test/offloading/parallel_offloading_map.c @@ -2,6 +2,7 @@ // RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64-ibm-linux-gnu | %fcheck-powerpc64-ibm-linux-gnu // RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64le-ibm-linux-gnu | %fcheck-powerpc64le-ibm-linux-gnu // RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-x86_64-pc-linux-gnu | %fcheck-x86_64-pc-linux-gnu -allow-empty +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-nvptx64-nvidia-cuda | %fcheck-nvptx64-nvidia-cuda -allow-empty #include #include diff --git a/openmp/libomptarget/test/offloading/requires.c b/openmp/libomptarget/test/offloading/requires.c index 
079ce5cb9348c..60346b1039ac9 100644 --- a/openmp/libomptarget/test/offloading/requires.c +++ b/openmp/libomptarget/test/offloading/requires.c @@ -2,6 +2,7 @@ // RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG // RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG // RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-nvptx64-nvidia-cuda 2>&1 | %fcheck-nvptx64-nvidia-cuda -allow-empty -check-prefix=DEBUG // REQUIRES: libomptarget-debug /* diff --git a/openmp/libomptarget/test/offloading/target_depend_nowait.cpp b/openmp/libomptarget/test/offloading/target_depend_nowait.cpp index 2c1c7e7191882..32a85d9725392 100644 --- a/openmp/libomptarget/test/offloading/target_depend_nowait.cpp +++ b/openmp/libomptarget/test/offloading/target_depend_nowait.cpp @@ -2,6 +2,7 @@ // RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu // RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu // RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda #include #include From f5acd11d2c0ea228452aa5ed3abbc2c502009d56 Mon Sep 17 00:00:00 2001 From: Bruno Ricci Date: Mon, 27 Jul 2020 23:19:02 +0100 Subject: [PATCH 0278/1035] [clang-format][NFC] Be more careful about the layout of FormatToken. The underlying ABI forces FormatToken to have a lot of padding. Currently (on x86-64 linux) `sizeof(FormatToken) == 288`. After this patch `sizeof(FormatToken) == 232`. No functional changes. Reviewed By: MyDeveloperDay Differential Revision: https://reviews.llvm.org/D84306 --- clang/lib/Format/ContinuationIndenter.cpp | 20 +- clang/lib/Format/Format.cpp | 4 +- clang/lib/Format/FormatToken.cpp | 4 +- clang/lib/Format/FormatToken.h | 198 ++++++++++++-------- clang/lib/Format/TokenAnnotator.cpp | 43 +++-- clang/lib/Format/UnwrappedLineFormatter.cpp | 8 +- clang/lib/Format/UnwrappedLineParser.cpp | 44 ++--- clang/lib/Format/WhitespaceManager.cpp | 2 +- 8 files changed, 183 insertions(+), 140 deletions(-) diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index b1497651a8fef..f3202bcb5bc14 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -284,7 +284,7 @@ bool ContinuationIndenter::canBreak(const LineState &State) { // The opening "{" of a braced list has to be on the same line as the first // element if it is nested in another braced init list or function call. if (!Current.MustBreakBefore && Previous.is(tok::l_brace) && - Previous.isNot(TT_DictLiteral) && Previous.BlockKind == BK_BracedInit && + Previous.isNot(TT_DictLiteral) && Previous.is(BK_BracedInit) && Previous.Previous && Previous.Previous->isOneOf(tok::l_brace, tok::l_paren, tok::comma)) return false; @@ -501,7 +501,7 @@ bool ContinuationIndenter::mustBreak(const LineState &State) { // The following could be precomputed as they do not depend on the state. 
// However, as they should take effect only if the UnwrappedLine does not fit // into the ColumnLimit, they are checked here in the ContinuationIndenter. - if (Style.ColumnLimit != 0 && Previous.BlockKind == BK_Block && + if (Style.ColumnLimit != 0 && Previous.is(BK_Block) && Previous.is(tok::l_brace) && !Current.isOneOf(tok::r_brace, tok::comment)) return true; @@ -627,7 +627,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, // opening parenthesis. Don't break if it doesn't conserve columns. if (Style.AlignAfterOpenBracket == FormatStyle::BAS_AlwaysBreak && (Previous.isOneOf(tok::l_paren, TT_TemplateOpener, tok::l_square) || - (Previous.is(tok::l_brace) && Previous.BlockKind != BK_Block && + (Previous.is(tok::l_brace) && Previous.isNot(BK_Block) && Style.Cpp11BracedListStyle)) && State.Column > getNewLineColumn(State) && (!Previous.Previous || !Previous.Previous->isOneOf( @@ -648,7 +648,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, if (Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign && !State.Stack.back().IsCSharpGenericTypeConstraint && Previous.opensScope() && Previous.isNot(TT_ObjCMethodExpr) && - (Current.isNot(TT_LineComment) || Previous.BlockKind == BK_BracedInit)) { + (Current.isNot(TT_LineComment) || Previous.is(BK_BracedInit))) { State.Stack.back().Indent = State.Column + Spaces; State.Stack.back().IsAligned = true; } @@ -972,7 +972,7 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) { return (Style.IndentWidth * State.Line->First->IndentLevel) + Style.IndentWidth; - if (NextNonComment->is(tok::l_brace) && NextNonComment->BlockKind == BK_Block) + if (NextNonComment->is(tok::l_brace) && NextNonComment->is(BK_Block)) return Current.NestingLevel == 0 ? 
State.FirstIndent : State.Stack.back().Indent; if ((Current.isOneOf(tok::r_brace, tok::r_square) || @@ -982,8 +982,7 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) { State.Stack.size() > 1) { if (Current.closesBlockOrBlockTypeList(Style)) return State.Stack[State.Stack.size() - 2].NestedBlockIndent; - if (Current.MatchingParen && - Current.MatchingParen->BlockKind == BK_BracedInit) + if (Current.MatchingParen && Current.MatchingParen->is(BK_BracedInit)) return State.Stack[State.Stack.size() - 2].LastSpace; return State.FirstIndent; } @@ -1417,7 +1416,7 @@ void ContinuationIndenter::moveStatePastScopeOpener(LineState &State, State.Stack.back().IsCSharpGenericTypeConstraint) return; - if (Current.MatchingParen && Current.BlockKind == BK_Block) { + if (Current.MatchingParen && Current.is(BK_Block)) { moveStateToNewBlock(State); return; } @@ -1486,9 +1485,8 @@ void ContinuationIndenter::moveStatePastScopeOpener(LineState &State, (State.Line->MustBeDeclaration && !BinPackDeclaration) || (!State.Line->MustBeDeclaration && !Style.BinPackArguments) || (Style.ExperimentalAutoDetectBinPacking && - (Current.PackingKind == PPK_OnePerLine || - (!BinPackInconclusiveFunctions && - Current.PackingKind == PPK_Inconclusive))); + (Current.is(PPK_OnePerLine) || + (!BinPackInconclusiveFunctions && Current.is(PPK_Inconclusive)))); if (Current.is(TT_ObjCMethodExpr) && Current.MatchingParen && Style.ObjCBreakBeforeNestedBlockParam) { diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 3966f0a38639b..8c1d7c90e02a0 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -1575,9 +1575,9 @@ class Formatter : public TokenAnalyzer { continue; FormatToken *Tok = AnnotatedLines[i]->First->Next; while (Tok->Next) { - if (Tok->PackingKind == PPK_BinPacked) + if (Tok->is(PPK_BinPacked)) HasBinPackedFunction = true; - if (Tok->PackingKind == PPK_OnePerLine) + if (Tok->is(PPK_OnePerLine)) HasOnePerLineFunction = true; Tok = Tok->Next; diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp index 7d792974cd577..4bc865b043fd2 100644 --- a/clang/lib/Format/FormatToken.cpp +++ b/clang/lib/Format/FormatToken.cpp @@ -85,8 +85,8 @@ unsigned CommaSeparatedList::formatAfterToken(LineState &State, const FormatToken *LBrace = State.NextToken->Previous->getPreviousNonComment(); if (!LBrace || !LBrace->isOneOf(tok::l_brace, TT_ArrayInitializerLSquare) || - LBrace->BlockKind == BK_Block || LBrace->getType() == TT_DictLiteral || - LBrace->Next->getType() == TT_DesignatedInitializerPeriod) + LBrace->is(BK_Block) || LBrace->is(TT_DictLiteral) || + LBrace->Next->is(TT_DesignatedInitializerPeriod)) return 0; // Calculate the number of code points we have to format this list. As the diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index d4287f53fde37..ece1bf4b97f7b 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -140,80 +140,156 @@ class AnnotatedLine; /// A wrapper around a \c Token storing information about the /// whitespace characters preceding it. 
struct FormatToken { - FormatToken() {} + FormatToken() + : HasUnescapedNewline(false), IsMultiline(false), IsFirst(false), + MustBreakBefore(false), MustBreakAlignBefore(false), + IsUnterminatedLiteral(false), CanBreakBefore(false), + ClosesTemplateDeclaration(false), StartsBinaryExpression(false), + EndsBinaryExpression(false), PartOfMultiVariableDeclStmt(false), + ContinuesLineCommentSection(false), Finalized(false), + BlockKind(BK_Unknown), Type(TT_Unknown), Decision(FD_Unformatted), + PackingKind(PPK_Inconclusive) {} /// The \c Token. Token Tok; - /// The number of newlines immediately before the \c Token. + /// The raw text of the token. /// - /// This can be used to determine what the user wrote in the original code - /// and thereby e.g. leave an empty line between two function definitions. - unsigned NewlinesBefore = 0; + /// Contains the raw token text without leading whitespace and without leading + /// escaped newlines. + StringRef TokenText; - /// Whether there is at least one unescaped newline before the \c - /// Token. - bool HasUnescapedNewline = false; + /// A token can have a special role that can carry extra information + /// about the token's formatting. + std::unique_ptr Role; /// The range of the whitespace immediately preceding the \c Token. SourceRange WhitespaceRange; - /// The offset just past the last '\n' in this token's leading - /// whitespace (relative to \c WhiteSpaceStart). 0 if there is no '\n'. - unsigned LastNewlineOffset = 0; - - /// The width of the non-whitespace parts of the token (or its first - /// line for multi-line tokens) in columns. - /// We need this to correctly measure number of columns a token spans. - unsigned ColumnWidth = 0; - - /// Contains the width in columns of the last line of a multi-line - /// token. - unsigned LastLineColumnWidth = 0; + /// Whether there is at least one unescaped newline before the \c + /// Token. + unsigned HasUnescapedNewline : 1; /// Whether the token text contains newlines (escaped or not). - bool IsMultiline = false; + unsigned IsMultiline : 1; /// Indicates that this is the first token of the file. - bool IsFirst = false; + unsigned IsFirst : 1; /// Whether there must be a line break before this token. /// /// This happens for example when a preprocessor directive ended directly /// before the token. - bool MustBreakBefore = false; + unsigned MustBreakBefore : 1; /// Whether to not align across this token /// /// This happens for example when a preprocessor directive ended directly /// before the token, but very rarely otherwise. - bool MustBreakAlignBefore = false; + unsigned MustBreakAlignBefore : 1; - /// The raw text of the token. + /// Set to \c true if this token is an unterminated literal. + unsigned IsUnterminatedLiteral : 1; + + /// \c true if it is allowed to break before this token. + unsigned CanBreakBefore : 1; + + /// \c true if this is the ">" of "template<..>". + unsigned ClosesTemplateDeclaration : 1; + + /// \c true if this token starts a binary expression, i.e. has at least + /// one fake l_paren with a precedence greater than prec::Unknown. + unsigned StartsBinaryExpression : 1; + /// \c true if this token ends a binary expression. + unsigned EndsBinaryExpression : 1; + + /// Is this token part of a \c DeclStmt defining multiple variables? /// - /// Contains the raw token text without leading whitespace and without leading - /// escaped newlines. - StringRef TokenText; + /// Only set if \c Type == \c TT_StartOfName. 
+ unsigned PartOfMultiVariableDeclStmt : 1; - /// Set to \c true if this token is an unterminated literal. - bool IsUnterminatedLiteral = 0; + /// Does this line comment continue a line comment section? + /// + /// Only set to true if \c Type == \c TT_LineComment. + unsigned ContinuesLineCommentSection : 1; + /// If \c true, this token has been fully formatted (indented and + /// potentially re-formatted inside), and we do not allow further formatting + /// changes. + unsigned Finalized : 1; + +private: /// Contains the kind of block if this token is a brace. - BraceBlockKind BlockKind = BK_Unknown; + unsigned BlockKind : 2; +public: + BraceBlockKind getBlockKind() const { + return static_cast(BlockKind); + } + void setBlockKind(BraceBlockKind BBK) { + BlockKind = BBK; + assert(getBlockKind() == BBK && "BraceBlockKind overflow!"); + } + +private: + unsigned Type : 8; + +public: /// Returns the token's type, e.g. whether "<" is a template opener or /// binary operator. - TokenType getType() const { return Type; } - void setType(TokenType T) { Type = T; } + TokenType getType() const { return static_cast(Type); } + void setType(TokenType T) { + Type = T; + assert(getType() == T && "TokenType overflow!"); + } - /// The number of spaces that should be inserted before this token. - unsigned SpacesRequiredBefore = 0; +private: + /// Stores the formatting decision for the token once it was made. + unsigned Decision : 2; - /// \c true if it is allowed to break before this token. - bool CanBreakBefore = false; +public: + FormatDecision getDecision() const { + return static_cast(Decision); + } + void setDecision(FormatDecision D) { + Decision = D; + assert(getDecision() == D && "FormatDecision overflow!"); + } - /// \c true if this is the ">" of "template<..>". - bool ClosesTemplateDeclaration = false; +private: + /// If this is an opening parenthesis, how are the parameters packed? + unsigned PackingKind : 2; + +public: + ParameterPackingKind getPackingKind() const { + return static_cast(PackingKind); + } + void setPackingKind(ParameterPackingKind K) { + PackingKind = K; + assert(getPackingKind() == K && "ParameterPackingKind overflow!"); + } + + /// The number of newlines immediately before the \c Token. + /// + /// This can be used to determine what the user wrote in the original code + /// and thereby e.g. leave an empty line between two function definitions. + unsigned NewlinesBefore = 0; + + /// The offset just past the last '\n' in this token's leading + /// whitespace (relative to \c WhiteSpaceStart). 0 if there is no '\n'. + unsigned LastNewlineOffset = 0; + + /// The width of the non-whitespace parts of the token (or its first + /// line for multi-line tokens) in columns. + /// We need this to correctly measure number of columns a token spans. + unsigned ColumnWidth = 0; + + /// Contains the width in columns of the last line of a multi-line + /// token. + unsigned LastLineColumnWidth = 0; + + /// The number of spaces that should be inserted before this token. + unsigned SpacesRequiredBefore = 0; /// Number of parameters, if this is "(", "[" or "<". unsigned ParameterCount = 0; @@ -226,13 +302,6 @@ struct FormatToken { /// the surrounding bracket. tok::TokenKind ParentBracket = tok::unknown; - /// A token can have a special role that can carry extra information - /// about the token's formatting. - std::unique_ptr Role; - - /// If this is an opening parenthesis, how are the parameters packed? 
- ParameterPackingKind PackingKind = PPK_Inconclusive; - /// The total length of the unwrapped line up to and including this /// token. unsigned TotalLength = 0; @@ -286,12 +355,6 @@ struct FormatToken { /// Insert this many fake ) after this token for correct indentation. unsigned FakeRParens = 0; - /// \c true if this token starts a binary expression, i.e. has at least - /// one fake l_paren with a precedence greater than prec::Unknown. - bool StartsBinaryExpression = false; - /// \c true if this token ends a binary expression. - bool EndsBinaryExpression = false; - /// If this is an operator (or "."/"->") in a sequence of operators /// with the same precedence, contains the 0-based operator index. unsigned OperatorIndex = 0; @@ -300,16 +363,6 @@ struct FormatToken { /// with the same precedence, points to the next operator. FormatToken *NextOperator = nullptr; - /// Is this token part of a \c DeclStmt defining multiple variables? - /// - /// Only set if \c Type == \c TT_StartOfName. - bool PartOfMultiVariableDeclStmt = false; - - /// Does this line comment continue a line comment section? - /// - /// Only set to true if \c Type == \c TT_LineComment. - bool ContinuesLineCommentSection = false; - /// If this is a bracket, this points to the matching one. FormatToken *MatchingParen = nullptr; @@ -323,16 +376,8 @@ struct FormatToken { /// in it. SmallVector Children; - /// Stores the formatting decision for the token once it was made. - FormatDecision Decision = FD_Unformatted; - - /// If \c true, this token has been fully formatted (indented and - /// potentially re-formatted inside), and we do not allow further formatting - /// changes. - bool Finalized = false; - bool is(tok::TokenKind Kind) const { return Tok.is(Kind); } - bool is(TokenType TT) const { return Type == TT; } + bool is(TokenType TT) const { return getType() == TT; } bool is(const IdentifierInfo *II) const { return II && II == Tok.getIdentifierInfo(); } @@ -340,6 +385,9 @@ struct FormatToken { return Tok.getIdentifierInfo() && Tok.getIdentifierInfo()->getPPKeywordID() == Kind; } + bool is(BraceBlockKind BBK) const { return getBlockKind() == BBK; } + bool is(ParameterPackingKind PPK) const { return getPackingKind() == PPK; } + template bool isOneOf(A K1, B K2) const { return is(K1) || is(K2); } @@ -355,7 +403,7 @@ struct FormatToken { } bool closesScopeAfterBlock() const { - if (BlockKind == BK_Block) + if (getBlockKind() == BK_Block) return true; if (closesScope()) return Previous->closesScopeAfterBlock(); @@ -525,13 +573,13 @@ struct FormatToken { /// list that should be indented with a block indent. bool opensBlockOrBlockTypeList(const FormatStyle &Style) const { // C# Does not indent object initialisers as continuations. 
- if (is(tok::l_brace) && BlockKind == BK_BracedInit && Style.isCSharp()) + if (is(tok::l_brace) && getBlockKind() == BK_BracedInit && Style.isCSharp()) return true; if (is(TT_TemplateString) && opensScope()) return true; return is(TT_ArrayInitializerLSquare) || is(TT_ProtoExtensionLSquare) || (is(tok::l_brace) && - (BlockKind == BK_Block || is(TT_DictLiteral) || + (getBlockKind() == BK_Block || is(TT_DictLiteral) || (!Style.Cpp11BracedListStyle && NestingLevel == 0))) || (is(tok::less) && (Style.Language == FormatStyle::LK_Proto || Style.Language == FormatStyle::LK_TextProto)); @@ -602,8 +650,6 @@ struct FormatToken { return Previous->endsSequenceInternal(K1, Tokens...); return is(K1) && Previous && Previous->endsSequenceInternal(Tokens...); } - - TokenType Type = TT_Unknown; }; class ContinuationIndenter; diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 0ab09b4a1218a..b19fc34bcc802 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -343,11 +343,11 @@ class AnnotatingParser { CurrentToken->setType(TT_AttributeSquare); if (!HasMultipleLines) - Left->PackingKind = PPK_Inconclusive; + Left->setPackingKind(PPK_Inconclusive); else if (HasMultipleParametersOnALine) - Left->PackingKind = PPK_BinPacked; + Left->setPackingKind(PPK_BinPacked); else - Left->PackingKind = PPK_OnePerLine; + Left->setPackingKind(PPK_OnePerLine); next(); return true; @@ -704,7 +704,7 @@ class AnnotatingParser { ScopedContextCreator ContextCreator(*this, tok::l_brace, 1); Contexts.back().ColonIsDictLiteral = true; - if (Left->BlockKind == BK_BracedInit) + if (Left->is(BK_BracedInit)) Contexts.back().IsExpression = true; if (Style.Language == FormatStyle::LK_JavaScript && Left->Previous && Left->Previous->is(TT_JsTypeColon)) @@ -751,7 +751,7 @@ class AnnotatingParser { // For ObjC methods, the number of parameters is calculated differently as // method declarations have a different structure (the parameters are not // inside a bracket scope). - if (Current->is(tok::l_brace) && Current->BlockKind == BK_Block) + if (Current->is(tok::l_brace) && Current->is(BK_Block)) ++Left->BlockParameterCount; if (Current->is(tok::comma)) { ++Left->ParameterCount; @@ -2420,7 +2420,7 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) { if (isFunctionDeclarationName(*Current, Line)) Current->setType(TT_FunctionDeclarationName); if (Current->is(TT_LineComment)) { - if (Current->Previous->BlockKind == BK_BracedInit && + if (Current->Previous->is(BK_BracedInit) && Current->Previous->opensScope()) Current->SpacesRequiredBefore = (Style.Cpp11BracedListStyle && !Style.SpacesInParentheses) ? 
0 : 1; @@ -2755,8 +2755,8 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, if (Left.isOneOf(tok::hashhash, tok::hash)) return Right.is(tok::hash); if ((Left.is(tok::l_paren) && Right.is(tok::r_paren)) || - (Left.is(tok::l_brace) && Left.BlockKind != BK_Block && - Right.is(tok::r_brace) && Right.BlockKind != BK_Block)) + (Left.is(tok::l_brace) && Left.isNot(BK_Block) && + Right.is(tok::r_brace) && Right.isNot(BK_Block))) return Style.SpaceInEmptyParentheses; if (Style.SpacesInConditionalStatement) { if (Left.is(tok::l_paren) && Left.Previous && @@ -2836,7 +2836,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, return Right.Tok.isLiteral() || Right.is(TT_BlockComment) || (Right.isOneOf(Keywords.kw_override, Keywords.kw_final) && !Right.is(TT_StartOfName)) || - (Right.is(tok::l_brace) && Right.BlockKind == BK_Block) || + (Right.is(tok::l_brace) && Right.is(BK_Block)) || (!Right.isOneOf(TT_PointerOrReference, TT_ArraySubscriptLSquare, tok::l_paren) && (Style.PointerAlignment != FormatStyle::PAS_Right && @@ -2921,9 +2921,9 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, return false; if (Left.is(tok::l_brace) && Right.is(tok::r_brace)) return !Left.Children.empty(); // No spaces in "{}". - if ((Left.is(tok::l_brace) && Left.BlockKind != BK_Block) || + if ((Left.is(tok::l_brace) && Left.isNot(BK_Block)) || (Right.is(tok::r_brace) && Right.MatchingParen && - Right.MatchingParen->BlockKind != BK_Block)) + Right.MatchingParen->isNot(BK_Block))) return Style.Cpp11BracedListStyle ? Style.SpacesInParentheses : true; if (Left.is(TT_BlockComment)) // No whitespace in x(/*foo=*/1), except for JavaScript. @@ -2967,7 +2967,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, tok::r_paren) || Left.isSimpleTypeSpecifier()) && Right.is(tok::l_brace) && Right.getNextNonComment() && - Right.BlockKind != BK_Block) + Right.isNot(BK_Block)) return false; if (Left.is(tok::period) || Right.is(tok::period)) return false; @@ -3009,7 +3009,7 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, if (Style.isCpp()) { if (Left.is(tok::kw_operator)) return Right.is(tok::coloncolon); - if (Right.is(tok::l_brace) && Right.BlockKind == BK_BracedInit && + if (Right.is(tok::l_brace) && Right.is(BK_BracedInit) && !Left.opensScope() && Style.SpaceBeforeCpp11BracedList) return true; } else if (Style.Language == FormatStyle::LK_Proto || @@ -3362,7 +3362,7 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, // Returns 'true' if 'Tok' is a brace we'd want to break before in Allman style. 
static bool isAllmanBrace(const FormatToken &Tok) { - return Tok.is(tok::l_brace) && Tok.BlockKind == BK_Block && + return Tok.is(tok::l_brace) && Tok.is(BK_Block) && !Tok.isOneOf(TT_ObjCBlockLBrace, TT_LambdaLBrace, TT_DictLiteral); } @@ -3398,7 +3398,7 @@ static bool isOneChildWithoutMustBreakBefore(const FormatToken &Tok) { return true; } static bool isAllmanLambdaBrace(const FormatToken &Tok) { - return (Tok.is(tok::l_brace) && Tok.BlockKind == BK_Block && + return (Tok.is(tok::l_brace) && Tok.is(BK_Block) && !Tok.isOneOf(TT_ObjCBlockLBrace, TT_DictLiteral)); } @@ -3498,7 +3498,7 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, if ((Left.isOneOf(tok::l_brace, TT_ArrayInitializerLSquare) || (Style.Language == FormatStyle::LK_JavaScript && Left.is(tok::l_paren))) && - Left.BlockKind != BK_Block && Left.MatchingParen) + Left.isNot(BK_Block) && Left.MatchingParen) BeforeClosingBrace = Left.MatchingParen->Previous; else if (Right.MatchingParen && (Right.MatchingParen->isOneOf(tok::l_brace, @@ -3512,8 +3512,7 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, } if (Right.is(tok::comment)) - return Left.BlockKind != BK_BracedInit && - Left.isNot(TT_CtorInitializerColon) && + return Left.isNot(BK_BracedInit) && Left.isNot(TT_CtorInitializerColon) && (Right.NewlinesBefore > 0 && Right.HasUnescapedNewline); if (Left.isTrailingComment()) return true; @@ -3822,7 +3821,7 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, // The first comment in a braced lists is always interpreted as belonging to // the first list element. Otherwise, it should be placed outside of the // list. - return Left.BlockKind == BK_BracedInit || + return Left.is(BK_BracedInit) || (Left.is(TT_CtorInitializerColon) && Style.BreakConstructorInitializers == FormatStyle::BCIS_AfterColon); if (Left.is(tok::question) && Right.is(tok::colon)) @@ -3923,7 +3922,7 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, // We only break before r_brace if there was a corresponding break before // the l_brace, which is tracked by BreakBeforeClosingBrace. if (Right.is(tok::r_brace)) - return Right.MatchingParen && Right.MatchingParen->BlockKind == BK_Block; + return Right.MatchingParen && Right.MatchingParen->is(BK_Block); // Allow breaking after a trailing annotation, e.g. after a method // declaration. 
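The FormatToken.h hunks above implement a standard struct-packing idiom: bool
members become 1-bit bitfields, enum-typed members are stored in narrow
unsigned bitfields behind accessors that assert the value survives the round
trip, and the full-width members are grouped afterwards so padding is not
scattered through the struct. A standalone sketch of the idiom (hypothetical
names, not the clang-format code):

#include <cassert>

enum BraceBlockKind { BK_Unknown, BK_Block, BK_BracedInit }; // fits in 2 bits

struct PackedToken {
  PackedToken() : IsFirst(false), MustBreakBefore(false), Kind(BK_Unknown) {}

  // One-bit flags pack into a single word instead of one byte per bool.
  unsigned IsFirst : 1;
  unsigned MustBreakBefore : 1;

private:
  // Stored as unsigned rather than as the enum type: the signedness and
  // layout of enum bit-fields are implementation-defined; unsigned is
  // portable.
  unsigned Kind : 2;

public:
  BraceBlockKind getKind() const { return static_cast<BraceBlockKind>(Kind); }
  void setKind(BraceBlockKind K) {
    Kind = K;
    assert(getKind() == K && "BraceBlockKind overflow!");
  }
};

The assert in the setter mirrors the guard added in the patch: if a new
enumerator ever exceeds the field width, the truncation is caught immediately
instead of silently corrupting token state.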
@@ -4008,9 +4007,9 @@ void TokenAnnotator::printDebugInfo(const AnnotatedLine &Line) { << " T=" << getTokenTypeName(Tok->getType()) << " S=" << Tok->SpacesRequiredBefore << " F=" << Tok->Finalized << " B=" << Tok->BlockParameterCount - << " BK=" << Tok->BlockKind << " P=" << Tok->SplitPenalty + << " BK=" << Tok->getBlockKind() << " P=" << Tok->SplitPenalty << " Name=" << Tok->Tok.getName() << " L=" << Tok->TotalLength - << " PPK=" << Tok->PackingKind << " FakeLParens="; + << " PPK=" << Tok->getPackingKind() << " FakeLParens="; for (unsigned i = 0, e = Tok->FakeLParens.size(); i != e; ++i) llvm::errs() << Tok->FakeLParens[i] << "/"; llvm::errs() << " FakeRParens=" << Tok->FakeRParens; diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp index 22f27a668dccd..ec0c628fe750d 100644 --- a/clang/lib/Format/UnwrappedLineFormatter.cpp +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp @@ -606,7 +606,7 @@ class LineJoiner { if (I[1]->Last->is(TT_LineComment)) return 0; do { - if (Tok->is(tok::l_brace) && Tok->BlockKind != BK_BracedInit) + if (Tok->is(tok::l_brace) && Tok->isNot(BK_BracedInit)) return 0; Tok = Tok->Next; } while (Tok); @@ -767,8 +767,8 @@ class LineFormatter { unsigned &Penalty) { const FormatToken *LBrace = State.NextToken->getPreviousNonComment(); FormatToken &Previous = *State.NextToken->Previous; - if (!LBrace || LBrace->isNot(tok::l_brace) || - LBrace->BlockKind != BK_Block || Previous.Children.size() == 0) + if (!LBrace || LBrace->isNot(tok::l_brace) || LBrace->isNot(BK_Block) || + Previous.Children.size() == 0) // The previous token does not open a block. Nothing to do. We don't // assert so that we can simply call this function for all tokens. return true; @@ -979,7 +979,7 @@ class OptimizingLineFormatter : public LineFormatter { // State already examined with lower penalty. continue; - FormatDecision LastFormat = Node->State.NextToken->Decision; + FormatDecision LastFormat = Node->State.NextToken->getDecision(); if (LastFormat == FD_Unformatted || LastFormat == FD_Continue) addNextStateToQueue(Penalty, Node, /*NewLine=*/false, &Count, &Queue); if (LastFormat == FD_Unformatted || LastFormat == FD_Break) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index a37386425aaed..b599168b48e17 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -472,19 +472,19 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { // individual members in a type member list, which would normally // trigger BK_Block. In both cases, this must be parsed as an inline // braced init. - Tok->BlockKind = BK_BracedInit; + Tok->setBlockKind(BK_BracedInit); else if (PrevTok->is(tok::r_paren)) // `) { }` can only occur in function or method declarations in JS. 
- Tok->BlockKind = BK_Block; + Tok->setBlockKind(BK_Block); } else { - Tok->BlockKind = BK_Unknown; + Tok->setBlockKind(BK_Unknown); } LBraceStack.push_back(Tok); break; case tok::r_brace: if (LBraceStack.empty()) break; - if (LBraceStack.back()->BlockKind == BK_Unknown) { + if (LBraceStack.back()->is(BK_Unknown)) { bool ProbablyBracedList = false; if (Style.Language == FormatStyle::LK_Proto) { ProbablyBracedList = NextTok->isOneOf(tok::comma, tok::r_square); @@ -524,11 +524,11 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { } } if (ProbablyBracedList) { - Tok->BlockKind = BK_BracedInit; - LBraceStack.back()->BlockKind = BK_BracedInit; + Tok->setBlockKind(BK_BracedInit); + LBraceStack.back()->setBlockKind(BK_BracedInit); } else { - Tok->BlockKind = BK_Block; - LBraceStack.back()->BlockKind = BK_Block; + Tok->setBlockKind(BK_Block); + LBraceStack.back()->setBlockKind(BK_Block); } } LBraceStack.pop_back(); @@ -545,8 +545,8 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { case tok::kw_switch: case tok::kw_try: case tok::kw___try: - if (!LBraceStack.empty() && LBraceStack.back()->BlockKind == BK_Unknown) - LBraceStack.back()->BlockKind = BK_Block; + if (!LBraceStack.empty() && LBraceStack.back()->is(BK_Unknown)) + LBraceStack.back()->setBlockKind(BK_Block); break; default: break; @@ -557,8 +557,8 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { // Assume other blocks for all unclosed opening braces. for (unsigned i = 0, e = LBraceStack.size(); i != e; ++i) { - if (LBraceStack[i]->BlockKind == BK_Unknown) - LBraceStack[i]->BlockKind = BK_Block; + if (LBraceStack[i]->is(BK_Unknown)) + LBraceStack[i]->setBlockKind(BK_Block); } FormatTok = Tokens->setPosition(StoredPosition); @@ -584,7 +584,7 @@ void UnwrappedLineParser::parseBlock(bool MustBeDeclaration, bool AddLevel, assert(FormatTok->isOneOf(tok::l_brace, TT_MacroBlockBegin) && "'{' or macro block token expected"); const bool MacroBlock = FormatTok->is(TT_MacroBlockBegin); - FormatTok->BlockKind = BK_Block; + FormatTok->setBlockKind(BK_Block); size_t PPStartHash = computePPHash(); @@ -614,7 +614,7 @@ void UnwrappedLineParser::parseBlock(bool MustBeDeclaration, bool AddLevel, if (MacroBlock ? !FormatTok->is(TT_MacroBlockEnd) : !FormatTok->is(tok::r_brace)) { Line->Level = InitialLevel; - FormatTok->BlockKind = BK_Block; + FormatTok->setBlockKind(BK_Block); return; } @@ -690,7 +690,7 @@ static bool ShouldBreakBeforeBrace(const FormatStyle &Style, } void UnwrappedLineParser::parseChildBlock() { - FormatTok->BlockKind = BK_Block; + FormatTok->setBlockKind(BK_Block); nextToken(); { bool SkipIndent = (Style.Language == FormatStyle::LK_JavaScript && @@ -1476,7 +1476,7 @@ void UnwrappedLineParser::parseStructuralElement() { // C# needs this change to ensure that array initialisers and object // initialisers are indented the same way. 
if (Style.isCSharp()) - FormatTok->BlockKind = BK_BracedInit; + FormatTok->setBlockKind(BK_BracedInit); nextToken(); parseBracedList(); } else if (Style.Language == FormatStyle::LK_Proto && @@ -1747,10 +1747,10 @@ void UnwrappedLineParser::tryToParseJSFunction() { } bool UnwrappedLineParser::tryToParseBracedList() { - if (FormatTok->BlockKind == BK_Unknown) + if (FormatTok->is(BK_Unknown)) calculateBraceTypes(); - assert(FormatTok->BlockKind != BK_Unknown); - if (FormatTok->BlockKind == BK_Block) + assert(FormatTok->isNot(BK_Unknown)); + if (FormatTok->is(BK_Block)) return false; nextToken(); parseBracedList(); @@ -1830,7 +1830,7 @@ bool UnwrappedLineParser::parseBracedList(bool ContinueOnSemicolons, case tok::l_brace: // Assume there are no blocks inside a braced init list apart // from the ones we explicitly parse out (like lambdas). - FormatTok->BlockKind = BK_BracedInit; + FormatTok->setBlockKind(BK_BracedInit); nextToken(); parseBracedList(); break; @@ -2318,7 +2318,7 @@ bool UnwrappedLineParser::parseEnum() { // Just a declaration or something is wrong. if (FormatTok->isNot(tok::l_brace)) return true; - FormatTok->BlockKind = BK_Block; + FormatTok->setBlockKind(BK_Block); if (Style.Language == FormatStyle::LK_Java) { // Java enums are different. @@ -2726,7 +2726,7 @@ void UnwrappedLineParser::parseJavaScriptEs6ImportExport() { return; } if (FormatTok->is(tok::l_brace)) { - FormatTok->BlockKind = BK_Block; + FormatTok->setBlockKind(BK_Block); nextToken(); parseBracedList(); } else { diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp index 32e0b685ea0f4..2d479817118db 100644 --- a/clang/lib/Format/WhitespaceManager.cpp +++ b/clang/lib/Format/WhitespaceManager.cpp @@ -49,7 +49,7 @@ void WhitespaceManager::replaceWhitespace(FormatToken &Tok, unsigned Newlines, bool IsAligned, bool InPPDirective) { if (Tok.Finalized) return; - Tok.Decision = (Newlines > 0) ? FD_Break : FD_Continue; + Tok.setDecision((Newlines > 0) ? FD_Break : FD_Continue); Changes.push_back(Change(Tok, /*CreateReplacement=*/true, Tok.WhitespaceRange, Spaces, StartOfTokenColumn, Newlines, "", "", IsAligned, InPPDirective && !Tok.IsFirst, From eb10b065f2a870b425dcc2040b9955e0eee464b4 Mon Sep 17 00:00:00 2001 From: Bruno Ricci Date: Mon, 27 Jul 2020 23:22:21 +0100 Subject: [PATCH 0279/1035] [clang] Pass the NamedDecl* instead of the DeclarationName into many diagnostics. Background: ----------- There are two related argument types which can be sent into a diagnostic to display the name of an entity: DeclarationName (ak_declarationname) or NamedDecl* (ak_nameddecl) (there is also ak_identifierinfo for IdentifierInfo*, but we are not concerned with it here). A DeclarationName in a diagnostic will just be streamed to the output, which will directly result in a call to DeclarationName::print. A NamedDecl* in a diagnostic will also ultimately result in a call to DeclarationName::print, but with two customisation points along the way: The first customisation point is NamedDecl::getNameForDiagnostic which is overloaded by FunctionDecl, ClassTemplateSpecializationDecl and VarTemplateSpecializationDecl to print the template arguments, if any. The second customisation point is NamedDecl::printName. By default it just streams the stored DeclarationName into the output but it can be customised to provide a user-friendly name for an entity. It is currently overloaded by DecompositionDecl and MSGuidDecl. 
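The distinction is easy to see with a small mock of the two customisation
points (hypothetical types and names, not the clang classes; the real hooks
are virtual functions on NamedDecl in clang/AST/Decl.h):

#include <iostream>
#include <string>

// Mock of NamedDecl's two hooks.
struct MockNamedDecl {
  std::string Name;
  // printName()-style hook: e.g. a decomposition declaration could render
  // itself as "[x, y]" here.
  virtual void printName(std::ostream &OS) const { OS << Name; }
  // getNameForDiagnostic()-style hook: specializations append template args.
  virtual void getNameForDiagnostic(std::ostream &OS) const { printName(OS); }
  virtual ~MockNamedDecl() = default;
};

struct MockFunctionSpecialization : MockNamedDecl {
  std::string TemplateArgs;
  void getNameForDiagnostic(std::ostream &OS) const override {
    printName(OS);
    OS << TemplateArgs;
  }
};

int main() {
  MockFunctionSpecialization FD;
  FD.Name = "ft";
  FD.TemplateArgs = "<int>";
  std::cout << FD.Name << '\n';       // like streaming the DeclarationName: "ft"
  FD.getNameForDiagnostic(std::cout); // like streaming the NamedDecl*: "ft<int>"
  std::cout << '\n';
  return 0;
}

Hence the pattern repeated throughout the diff below: streaming the
declaration itself ("<< FD") instead of its bare name ("<< FD->getDeclName()").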
What this patch does:
---------------------
For many diagnostics a DeclarationName is used instead of the NamedDecl*.
This bypasses the two customisation points mentioned above. This patch
fixes this for diagnostics in Sema.cpp, SemaCast.cpp, SemaChecking.cpp,
SemaDecl.cpp, SemaDeclAttr.cpp, SemaExpr.cpp, SemaOverload.cpp and
SemaStmt.cpp. I have only modified diagnostics where I could construct a
test-case which demonstrates that the change is appropriate (either with
this patch or the next one).

Reviewed By: erichkeane, aaron.ballman

Differential Revision: https://reviews.llvm.org/D84656
---
 clang/lib/Sema/Sema.cpp | 23 +++++------
 clang/lib/Sema/SemaCast.cpp | 6 +--
 clang/lib/Sema/SemaChecking.cpp | 3 +-
 clang/lib/Sema/SemaDecl.cpp | 37 +++++++++--------
 clang/lib/Sema/SemaDeclAttr.cpp | 4 +-
 clang/lib/Sema/SemaExpr.cpp | 40 +++++++------------
 clang/lib/Sema/SemaOverload.cpp | 4 +-
 clang/lib/Sema/SemaStmt.cpp | 16 +++-----
 .../dcl.dcl/dcl.attr/dcl.attr.noreturn/p1.cpp | 5 ++-
 .../CXX/dcl.decl/dcl.meaning/dcl.fct/p3.cpp | 4 ++
 .../stmt.stmt/stmt.iter/stmt.ranged/p1.cpp | 15 +++++++
 clang/test/CXX/temp/temp.param/p15-cxx0x.cpp | 6 +--
 clang/test/Modules/module-private.cpp | 10 +++-
 clang/test/SemaCXX/array-bounds.cpp | 13 +++++-
 clang/test/SemaCXX/attr-unused.cpp | 2 +-
 .../cxx1y-variable-templates_in_class.cpp | 2 +-
 clang/test/SemaCXX/default2.cpp | 2 +-
 clang/test/SemaCXX/incomplete-call.cpp | 2 +-
 clang/test/SemaCXX/references.cpp | 11 ++++-
 clang/test/SemaCXX/return-void.cpp | 2 +-
 clang/test/SemaCXX/warn-func-not-needed.cpp | 2 +-
 .../test/SemaCXX/warn-large-by-value-copy.cpp | 8 ++++
 clang/test/SemaCXX/warn-member-not-needed.cpp | 4 +-
 .../warn-pure-virtual-call-from-ctor-dtor.cpp | 8 ++--
 clang/test/SemaCXX/warn-pure-virtual-kext.cpp | 2 +-
 clang/test/SemaCXX/warn-unused-filescoped.cpp | 8 ++--
 .../test/SemaCXX/warn-variable-not-needed.cpp | 2 +-
 27 files changed, 137 insertions(+), 104 deletions(-)

diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 9c8f3fdcda4a6..7415d0d0766ba 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -1197,7 +1197,7 @@ void Sema::ActOnEndOfTranslationUnit() {
       if (DiagD->isReferenced()) {
         if (isa(DiagD))
           Diag(DiagD->getLocation(), diag::warn_unneeded_member_function)
-              << DiagD->getDeclName();
+              << DiagD;
         else {
           if (FD->getStorageClass() == SC_Static &&
               !FD->isInlineSpecified() &&
@@ -1205,20 +1205,20 @@ void Sema::ActOnEndOfTranslationUnit() {
                   SourceMgr.getExpansionLoc(FD->getLocation())))
             Diag(DiagD->getLocation(),
                  diag::warn_unneeded_static_internal_decl)
-                << DiagD->getDeclName();
+                << DiagD;
           else
             Diag(DiagD->getLocation(), diag::warn_unneeded_internal_decl)
-                << /*function*/0 << DiagD->getDeclName();
+                << /*function*/ 0 << DiagD;
         }
       } else {
         if (FD->getDescribedFunctionTemplate())
           Diag(DiagD->getLocation(), diag::warn_unused_template)
-              << /*function*/0 << DiagD->getDeclName();
+              << /*function*/ 0 << DiagD;
         else
-          Diag(DiagD->getLocation(),
-               isa(DiagD) ? diag::warn_unused_member_function
+          Diag(DiagD->getLocation(), isa(DiagD)
+                                         ?
diag::warn_unused_member_function : diag::warn_unused_function) - << DiagD->getDeclName(); + << DiagD; } } else { const VarDecl *DiagD = cast(*I)->getDefinition(); @@ -1226,20 +1226,19 @@ void Sema::ActOnEndOfTranslationUnit() { DiagD = cast(*I); if (DiagD->isReferenced()) { Diag(DiagD->getLocation(), diag::warn_unneeded_internal_decl) - << /*variable*/1 << DiagD->getDeclName(); + << /*variable*/ 1 << DiagD; } else if (DiagD->getType().isConstQualified()) { const SourceManager &SM = SourceMgr; if (SM.getMainFileID() != SM.getFileID(DiagD->getLocation()) || !PP.getLangOpts().IsHeaderFile) Diag(DiagD->getLocation(), diag::warn_unused_const_variable) - << DiagD->getDeclName(); + << DiagD; } else { if (DiagD->getDescribedVarTemplate()) Diag(DiagD->getLocation(), diag::warn_unused_template) - << /*variable*/1 << DiagD->getDeclName(); + << /*variable*/ 1 << DiagD; else - Diag(DiagD->getLocation(), diag::warn_unused_variable) - << DiagD->getDeclName(); + Diag(DiagD->getLocation(), diag::warn_unused_variable) << DiagD; } } } diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index 2efe26052c78c..58cf3a1be7301 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -510,12 +510,10 @@ static void diagnoseBadCast(Sema &S, unsigned msg, CastType castType, if (RecFrom && RecTo) { auto DeclFrom = RecFrom->getAsCXXRecordDecl(); if (!DeclFrom->isCompleteDefinition()) - S.Diag(DeclFrom->getLocation(), diag::note_type_incomplete) - << DeclFrom->getDeclName(); + S.Diag(DeclFrom->getLocation(), diag::note_type_incomplete) << DeclFrom; auto DeclTo = RecTo->getAsCXXRecordDecl(); if (!DeclTo->isCompleteDefinition()) - S.Diag(DeclTo->getLocation(), diag::note_type_incomplete) - << DeclTo->getDeclName(); + S.Diag(DeclTo->getLocation(), diag::note_type_incomplete) << DeclTo; } } } diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index f445272b020bf..77d5f3ff816ed 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -13962,8 +13962,7 @@ void Sema::CheckArrayAccess(const Expr *BaseExpr, const Expr *IndexExpr, if (ND) DiagRuntimeBehavior(ND->getBeginLoc(), BaseExpr, - PDiag(diag::note_array_declared_here) - << ND->getDeclName()); + PDiag(diag::note_array_declared_here) << ND); } void Sema::CheckArrayAccess(const Expr *expr) { diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 869e4de02cc41..ddbf086ea6385 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -1926,7 +1926,7 @@ static void CheckPoppedLabel(LabelDecl *L, Sema &S) { else Diagnose = L->getStmt() == nullptr; if (Diagnose) - S.Diag(L->getLocation(), diag::err_undeclared_label_use) <getDeclName(); + S.Diag(L->getLocation(), diag::err_undeclared_label_use) << L; } void Sema::ActOnPopScope(SourceLocation Loc, Scope *S) { @@ -7194,9 +7194,10 @@ NamedDecl *Sema::ActOnVariableDeclarator( << FixItHint::CreateRemoval(D.getDeclSpec().getModulePrivateSpecLoc()); else if (NewVD->hasLocalStorage()) Diag(NewVD->getLocation(), diag::err_module_private_local) - << 0 << NewVD->getDeclName() - << SourceRange(D.getDeclSpec().getModulePrivateSpecLoc()) - << FixItHint::CreateRemoval(D.getDeclSpec().getModulePrivateSpecLoc()); + << 0 << NewVD + << SourceRange(D.getDeclSpec().getModulePrivateSpecLoc()) + << FixItHint::CreateRemoval( + D.getDeclSpec().getModulePrivateSpecLoc()); else { NewVD->setModulePrivate(); if (NewTemplate) @@ -12429,7 +12430,7 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) { 
!Context.getTargetInfo().getCXXABI().isMicrosoft()) { Diag(Var->getLocation(), diag::err_constexpr_static_mem_var_requires_init) - << Var->getDeclName(); + << Var; Var->setInvalidDecl(); return; } @@ -12562,8 +12563,7 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) { // definitions with reference type. if (Type->isReferenceType()) { Diag(Var->getLocation(), diag::err_reference_var_requires_init) - << Var->getDeclName() - << SourceRange(Var->getLocation(), Var->getLocation()); + << Var << SourceRange(Var->getLocation(), Var->getLocation()); Var->setInvalidDecl(); return; } @@ -12696,7 +12696,7 @@ void Sema::ActOnCXXForRangeDecl(Decl *D) { } if (Error != -1) { Diag(VD->getOuterLocStart(), diag::err_for_range_storage_class) - << VD->getDeclName() << Error; + << VD << Error; D->setInvalidDecl(); } } @@ -13477,9 +13477,8 @@ Decl *Sema::ActOnParamDeclarator(Scope *S, Declarator &D) { if (D.getDeclSpec().isModulePrivateSpecified()) Diag(New->getLocation(), diag::err_module_private_local) - << 1 << New->getDeclName() - << SourceRange(D.getDeclSpec().getModulePrivateSpecLoc()) - << FixItHint::CreateRemoval(D.getDeclSpec().getModulePrivateSpecLoc()); + << 1 << New << SourceRange(D.getDeclSpec().getModulePrivateSpecLoc()) + << FixItHint::CreateRemoval(D.getDeclSpec().getModulePrivateSpecLoc()); if (New->hasAttr()) { Diag(New->getLocation(), diag::err_block_on_nonlocal); @@ -13531,8 +13530,7 @@ void Sema::DiagnoseSizeOfParametersAndReturnValue( if (!ReturnTy->isDependentType() && ReturnTy.isPODType(Context)) { unsigned Size = Context.getTypeSizeInChars(ReturnTy).getQuantity(); if (Size > LangOpts.NumLargeByValueCopy) - Diag(D->getLocation(), diag::warn_return_value_size) - << D->getDeclName() << Size; + Diag(D->getLocation(), diag::warn_return_value_size) << D << Size; } // Warn if any parameter is pass-by-value and larger than the specified @@ -13544,7 +13542,7 @@ void Sema::DiagnoseSizeOfParametersAndReturnValue( unsigned Size = Context.getTypeSizeInChars(T).getQuantity(); if (Size > LangOpts.NumLargeByValueCopy) Diag(Parameter->getLocation(), diag::warn_parameter_size) - << Parameter->getDeclName() << Size; + << Parameter << Size; } } @@ -13852,9 +13850,9 @@ Sema::CheckForFunctionRedefinition(FunctionDecl *FD, if (getLangOpts().GNUMode && Definition->isInlineSpecified() && Definition->getStorageClass() == SC_Extern) Diag(FD->getLocation(), diag::err_redefinition_extern_inline) - << FD->getDeclName() << getLangOpts().CPlusPlus; + << FD << getLangOpts().CPlusPlus; else - Diag(FD->getLocation(), diag::err_redefinition) << FD->getDeclName(); + Diag(FD->getLocation(), diag::err_redefinition) << FD; Diag(Definition->getLocation(), diag::note_previous_definition); FD->setInvalidDecl(); @@ -14909,9 +14907,10 @@ TypedefDecl *Sema::ParseTypedefDecl(Scope *S, Declarator &D, QualType T, if (D.getDeclSpec().isModulePrivateSpecified()) { if (CurContext->isFunctionOrMethod()) Diag(NewTD->getLocation(), diag::err_module_private_local) - << 2 << NewTD->getDeclName() - << SourceRange(D.getDeclSpec().getModulePrivateSpecLoc()) - << FixItHint::CreateRemoval(D.getDeclSpec().getModulePrivateSpecLoc()); + << 2 << NewTD + << SourceRange(D.getDeclSpec().getModulePrivateSpecLoc()) + << FixItHint::CreateRemoval( + D.getDeclSpec().getModulePrivateSpecLoc()); else NewTD->setModulePrivate(); } diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index ece93cbd6a9bd..58602a4c58d4e 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -3643,8 +3643,8 @@ static void 
handleTransparentUnionAttr(Sema &S, Decl *D, const ParsedAttr &AL) { unsigned FieldBits = isSize? S.Context.getTypeSize(FieldType) : S.Context.getTypeAlign(FieldType); S.Diag(Field->getLocation(), - diag::warn_transparent_union_attribute_field_size_align) - << isSize << Field->getDeclName() << FieldBits; + diag::warn_transparent_union_attribute_field_size_align) + << isSize << *Field << FieldBits; unsigned FirstBits = isSize? FirstSize : FirstAlign; S.Diag(FirstField->getLocation(), diag::note_transparent_union_first_field_size_align) diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index bb5a07e8079dd..21d3bbf419a9a 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -94,7 +94,7 @@ static void DiagnoseUnusedOfDecl(Sema &S, NamedDecl *D, SourceLocation Loc) { A->getSemanticSpelling() != UnusedAttr::C2x_maybe_unused) { const Decl *DC = cast_or_null(S.getCurObjCLexicalContext()); if (DC && !DC->hasAttr()) - S.Diag(Loc, diag::warn_used_but_marked_unused) << D->getDeclName(); + S.Diag(Loc, diag::warn_used_but_marked_unused) << D; } } } @@ -5567,9 +5567,8 @@ bool Sema::CheckCXXDefaultArgExpr(SourceLocation CallLoc, FunctionDecl *FD, return true; } - Diag(CallLoc, - diag::err_use_of_default_argument_to_function_declared_later) << - FD << cast(FD->getDeclContext())->getDeclName(); + Diag(CallLoc, diag::err_use_of_default_argument_to_function_declared_later) + << FD << cast(FD->getDeclContext()); Diag(UnparsedDefaultArgLocs[Param], diag::note_default_argument_declared_here); return true; @@ -16932,8 +16931,7 @@ static bool isVariableCapturable(CapturingScopeInfo *CSI, VarDecl *Var, if (Var->getType()->isVariablyModifiedType() && IsBlock) { if (Diagnose) { S.Diag(Loc, diag::err_ref_vm_type); - S.Diag(Var->getLocation(), diag::note_previous_decl) - << Var->getDeclName(); + S.Diag(Var->getLocation(), diag::note_previous_decl) << Var; } return false; } @@ -16945,10 +16943,8 @@ static bool isVariableCapturable(CapturingScopeInfo *CSI, VarDecl *Var, if (IsBlock) S.Diag(Loc, diag::err_ref_flexarray_type); else - S.Diag(Loc, diag::err_lambda_capture_flexarray_type) - << Var->getDeclName(); - S.Diag(Var->getLocation(), diag::note_previous_decl) - << Var->getDeclName(); + S.Diag(Loc, diag::err_lambda_capture_flexarray_type) << Var; + S.Diag(Var->getLocation(), diag::note_previous_decl) << Var; } return false; } @@ -16958,10 +16954,8 @@ static bool isVariableCapturable(CapturingScopeInfo *CSI, VarDecl *Var, // variables; they don't support the expected semantics. 
if (HasBlocksAttr && (IsLambda || isa(CSI))) { if (Diagnose) { - S.Diag(Loc, diag::err_capture_block_variable) - << Var->getDeclName() << !IsLambda; - S.Diag(Var->getLocation(), diag::note_previous_decl) - << Var->getDeclName(); + S.Diag(Loc, diag::err_capture_block_variable) << Var << !IsLambda; + S.Diag(Var->getLocation(), diag::note_previous_decl) << Var; } return false; } @@ -16992,8 +16986,7 @@ static bool captureInBlock(BlockScopeInfo *BSI, VarDecl *Var, if (!Invalid && !S.getLangOpts().OpenCL && CaptureType->isArrayType()) { if (BuildAndDiagnose) { S.Diag(Loc, diag::err_ref_array_type); - S.Diag(Var->getLocation(), diag::note_previous_decl) - << Var->getDeclName(); + S.Diag(Var->getLocation(), diag::note_previous_decl) << Var; Invalid = true; } else { return false; @@ -17006,8 +16999,7 @@ static bool captureInBlock(BlockScopeInfo *BSI, VarDecl *Var, if (BuildAndDiagnose) { S.Diag(Loc, diag::err_arc_autoreleasing_capture) << /*block*/ 0; - S.Diag(Var->getLocation(), diag::note_previous_decl) - << Var->getDeclName(); + S.Diag(Var->getLocation(), diag::note_previous_decl) << Var; Invalid = true; } else { return false; @@ -17277,9 +17269,8 @@ bool Sema::tryCaptureVariable( if (BuildAndDiagnose) { LambdaScopeInfo *LSI = cast(CSI); if (LSI->ImpCaptureStyle == CapturingScopeInfo::ImpCap_None) { - Diag(ExprLoc, diag::err_lambda_impcap) << Var->getDeclName(); - Diag(Var->getLocation(), diag::note_previous_decl) - << Var->getDeclName(); + Diag(ExprLoc, diag::err_lambda_impcap) << Var; + Diag(Var->getLocation(), diag::note_previous_decl) << Var; Diag(LSI->Lambda->getBeginLoc(), diag::note_lambda_decl); } else diagnoseUncapturableValueReference(*this, ExprLoc, Var, DC); @@ -17353,9 +17344,8 @@ bool Sema::tryCaptureVariable( // No capture-default, and this is not an explicit capture // so cannot capture this variable. 
if (BuildAndDiagnose) { - Diag(ExprLoc, diag::err_lambda_impcap) << Var->getDeclName(); - Diag(Var->getLocation(), diag::note_previous_decl) - << Var->getDeclName(); + Diag(ExprLoc, diag::err_lambda_impcap) << Var; + Diag(Var->getLocation(), diag::note_previous_decl) << Var; if (cast(CSI)->Lambda) Diag(cast(CSI)->Lambda->getBeginLoc(), diag::note_lambda_decl); @@ -18319,7 +18309,7 @@ bool Sema::CheckCallReturnType(QualType ReturnType, SourceLocation Loc, } S.Diag(Loc, diag::err_call_function_incomplete_return) - << CE->getSourceRange() << FD->getDeclName() << T; + << CE->getSourceRange() << FD << T; S.Diag(FD->getLocation(), diag::note_entity_declared_at) << FD->getDeclName(); } diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 5b4e7a2fdafa5..00563cff62cf6 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -14193,12 +14193,12 @@ Sema::BuildCallToMemberFunction(Scope *S, Expr *MemExprE, Diag(MemExpr->getBeginLoc(), diag::warn_call_to_pure_virtual_member_function_from_ctor_dtor) << MD->getDeclName() << isa(CurContext) - << MD->getParent()->getDeclName(); + << MD->getParent(); Diag(MD->getBeginLoc(), diag::note_previous_decl) << MD->getDeclName(); if (getLangOpts().AppleKext) Diag(MemExpr->getBeginLoc(), diag::note_pure_qualified_call_kext) - << MD->getParent()->getDeclName() << MD->getDeclName(); + << MD->getParent() << MD->getDeclName(); } } diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index 948c187804dcc..9ca2411b33e78 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -3625,12 +3625,11 @@ StmtResult Sema::BuildReturnStmt(SourceLocation ReturnLoc, Expr *RetValExp) { if (FD->hasAttrs()) Attrs = &FD->getAttrs(); if (FD->isNoReturn()) - Diag(ReturnLoc, diag::warn_noreturn_function_has_return_expr) - << FD->getDeclName(); + Diag(ReturnLoc, diag::warn_noreturn_function_has_return_expr) << FD; if (FD->isMain() && RetValExp) if (isa(RetValExp)) Diag(ReturnLoc, diag::warn_main_returns_bool_literal) - << RetValExp->getSourceRange(); + << RetValExp->getSourceRange(); if (FD->hasAttr() && RetValExp) { if (const auto *RT = dyn_cast(FnRetType.getCanonicalType())) { if (RT->getDecl()->isOrContainsUnion()) @@ -3701,8 +3700,7 @@ StmtResult Sema::BuildReturnStmt(SourceLocation ReturnLoc, Expr *RetValExp) { FunctionKind = 3; Diag(ReturnLoc, diag::err_return_init_list) - << CurDecl->getDeclName() << FunctionKind - << RetValExp->getSourceRange(); + << CurDecl << FunctionKind << RetValExp->getSourceRange(); // Drop the expression. RetValExp = nullptr; @@ -3729,9 +3727,8 @@ StmtResult Sema::BuildReturnStmt(SourceLocation ReturnLoc, Expr *RetValExp) { // return of void in constructor/destructor is illegal in C++. if (D == diag::err_ctor_dtor_returns_void) { NamedDecl *CurDecl = getCurFunctionOrMethodDecl(); - Diag(ReturnLoc, D) - << CurDecl->getDeclName() << isa(CurDecl) - << RetValExp->getSourceRange(); + Diag(ReturnLoc, D) << CurDecl << isa(CurDecl) + << RetValExp->getSourceRange(); } // return (some void expression); is legal in C++. 
else if (D != diag::ext_return_has_void_expr || @@ -3747,8 +3744,7 @@ StmtResult Sema::BuildReturnStmt(SourceLocation ReturnLoc, Expr *RetValExp) { FunctionKind = 3; Diag(ReturnLoc, D) - << CurDecl->getDeclName() << FunctionKind - << RetValExp->getSourceRange(); + << CurDecl << FunctionKind << RetValExp->getSourceRange(); } } diff --git a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.noreturn/p1.cpp b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.noreturn/p1.cpp index 0d4d34ac0e147..d92356c1ec0b2 100644 --- a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.noreturn/p1.cpp +++ b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.noreturn/p1.cpp @@ -8,9 +8,10 @@ void a2 [[noreturn]] () { } template void a3 [[noreturn]] () {} -template <> void a3 () { return; } // expected-warning {{function 'a3' declared 'noreturn' should not return}} +template <> void a3() { return; } // expected-warning {{function 'a3' declared 'noreturn' should not return}} -template void a4 [[noreturn]] () { return; } // expected-warning 2{{function 'a4' declared 'noreturn' should not return}} +template void a4 [[noreturn]] () { return; } // expected-warning {{function 'a4' declared 'noreturn' should not return}} + // expected-warning@-1 {{function 'a4' declared 'noreturn' should not return}} void a4_test() { a4(); } // expected-note {{in instantiation of function template specialization 'a4' requested here}} [[noreturn, noreturn]] void b() { throw 0; } // expected-error {{attribute 'noreturn' cannot appear multiple times in an attribute specifier}} diff --git a/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p3.cpp b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p3.cpp index ad827fb7b3142..415d634d5f98c 100644 --- a/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p3.cpp +++ b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p3.cpp @@ -1,3 +1,7 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s void f(int) { } // expected-note {{previous definition is here}} void f(const int) { } // expected-error {{redefinition of 'f'}} + +template void ft(T) {} +template <> void ft(int) {} // expected-note {{previous definition is here}} +template <> void ft(int) {} // expected-error {{redefinition of 'ft'}} diff --git a/clang/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp b/clang/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp index 01c758bef19a5..7c95a3ca88eb3 100644 --- a/clang/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp +++ b/clang/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp @@ -64,6 +64,21 @@ namespace X { }; constexpr int operator*(const C::It &) { return 0; } + + struct D { + D(); + using Ty = int[2]; + Ty *begin(); + Ty *end(); + }; + + void test_D() { +#if __cplusplus >= 201703L + for (extern auto [x, y] : D()) { + } // expected-error@-1 {{decomposition declaration cannot be declared 'extern'}} + // expected-error@-2 {{loop variable '[x, y]' may not be declared 'extern'}} +#endif + } } using X::A; diff --git a/clang/test/CXX/temp/temp.param/p15-cxx0x.cpp b/clang/test/CXX/temp/temp.param/p15-cxx0x.cpp index 667152da1cbcd..63f56640b1ce9 100644 --- a/clang/test/CXX/temp/temp.param/p15-cxx0x.cpp +++ b/clang/test/CXX/temp/temp.param/p15-cxx0x.cpp @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -std=c++11 -verify %s -template struct X; // expected-note {{'X' is incomplete}} +template struct X; // expected-note {{'X>>' is incomplete}} template struct Y; X> *x1; @@ -14,8 +14,8 @@ typedef X X_int; struct Z : X_int { }; void f(const X x) { - (void)reinterpret_cast>(x); // expected-error{{reinterpret_cast from}} - (void)reinterpret_cast>>>(x); // 
expected-error{{reinterpret_cast from}} + (void)reinterpret_cast>(x); // expected-error{{reinterpret_cast from 'const X' to 'X' is not allowed}} + (void)reinterpret_cast>>>(x); // expected-error{{reinterpret_cast from 'const X' to 'X>>' is not allowed}} X> *x1; } diff --git a/clang/test/Modules/module-private.cpp b/clang/test/Modules/module-private.cpp index 30957865d1cd0..a4b3b0fd21d39 100644 --- a/clang/test/Modules/module-private.cpp +++ b/clang/test/Modules/module-private.cpp @@ -1,7 +1,7 @@ // RUN: rm -rf %t -// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -x objective-c++ -fmodules-cache-path=%t -fmodule-name=module_private_left -emit-module %S/Inputs/module.map -// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -x objective-c++ -fmodules-cache-path=%t -fmodule-name=module_private_right -emit-module %S/Inputs/module.map -// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -x objective-c++ -fmodules-cache-path=%t -I %S/Inputs %s -verify +// RUN: %clang_cc1 -std=c++17 -fmodules -fimplicit-module-maps -x objective-c++ -fmodules-cache-path=%t -fmodule-name=module_private_left -emit-module %S/Inputs/module.map +// RUN: %clang_cc1 -std=c++17 -fmodules -fimplicit-module-maps -x objective-c++ -fmodules-cache-path=%t -fmodule-name=module_private_right -emit-module %S/Inputs/module.map +// RUN: %clang_cc1 -std=c++17 -fmodules -fimplicit-module-maps -x objective-c++ -fmodules-cache-path=%t -I %S/Inputs %s -verify // FIXME: When we have a syntax for modules in C++, use that. @import module_private_left; @@ -79,11 +79,15 @@ __module_private__ struct public_class { }; // expected-error{{partial spec void local_var_private(__module_private__ int param) { // expected-error{{parameter 'param' cannot be declared __module_private__}} __module_private__ struct Local { int x, y; } local; //expected-error{{local variable 'local' cannot be declared __module_private__}} + __module_private__ auto [x, y] = local; // expected-error {{local variable '[x, y]' cannot be declared __module_private__}} + __module_private__ struct OtherLocal { int x; }; // expected-error{{local struct cannot be declared __module_private__}} typedef __module_private__ int local_typedef; // expected-error{{typedef 'local_typedef' cannot be declared __module_private__}} } +void param_private(__module_private__ int) {} // expected-error {{parameter '' cannot be declared __module_private}} + // Check struct size struct LikeVisibleStruct { int field; diff --git a/clang/test/SemaCXX/array-bounds.cpp b/clang/test/SemaCXX/array-bounds.cpp index 495ccaf71bd6a..47be6c2423dc1 100644 --- a/clang/test/SemaCXX/array-bounds.cpp +++ b/clang/test/SemaCXX/array-bounds.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -verify -std=c++11 %s +// RUN: %clang_cc1 -verify -std=c++14 %s int foo() { int x[2]; // expected-note 4 {{array 'x' declared here}} @@ -309,3 +309,14 @@ namespace PR41087 { foo(); // expected-note 1{{in instantiation of function template specialization}} }; } + +namespace var_template_array { +template int arr[2]; // expected-note {{array 'arr' declared here}} +template <> int arr[1]; // expected-note {{array 'arr' declared here}} + +void test() { + arr[1] = 0; // ok + arr[2] = 0; // expected-warning {{array index 2 is past the end of the array (which contains 2 elements)}} + arr[1] = 0; // expected-warning {{array index 1 is past the end of the array (which contains 1 element)}} +} +} // namespace var_template_array diff --git a/clang/test/SemaCXX/attr-unused.cpp b/clang/test/SemaCXX/attr-unused.cpp index e3878152eca97..5bca693864e32 100644 
--- a/clang/test/SemaCXX/attr-unused.cpp +++ b/clang/test/SemaCXX/attr-unused.cpp @@ -15,5 +15,5 @@ void f() { }; (void) i; - C(); // expected-warning {{'C' was marked unused but was used}} + C(); // expected-warning {{'C' was marked unused but was used}} } diff --git a/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp b/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp index fc49ec88d5537..1a24c66805690 100644 --- a/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp +++ b/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp @@ -164,7 +164,7 @@ namespace constexpred { // expected-error {{non-static data member cannot be constexpr; did you intend to make it const?}} template constexpr float right = 5; // expected-error {{non-static data member cannot be constexpr; did you intend to make it static?}} template<> static constexpr int right = 7; - template<> static constexpr float right; // expected-error {{requires an initializer}} + template <> static constexpr float right; // expected-error {{declaration of constexpr static data member 'right' requires an initializer}} template static constexpr int right; // expected-error {{expected '<' after 'template'}} }; } diff --git a/clang/test/SemaCXX/default2.cpp b/clang/test/SemaCXX/default2.cpp index 7651233f8636f..c51d272853e72 100644 --- a/clang/test/SemaCXX/default2.cpp +++ b/clang/test/SemaCXX/default2.cpp @@ -119,7 +119,7 @@ class C2 { template class C3; template <> class C3 { - static void g(int = f()); // expected-error {{use of default argument to function 'f' that is declared later in class 'C3'}} + static void g(int = f()); // expected-error {{use of default argument to function 'f' that is declared later in class 'C3'}} static int f(int = 10); // expected-note {{default argument declared here}} }; diff --git a/clang/test/SemaCXX/incomplete-call.cpp b/clang/test/SemaCXX/incomplete-call.cpp index 46f470e4a8810..208daa2988a66 100644 --- a/clang/test/SemaCXX/incomplete-call.cpp +++ b/clang/test/SemaCXX/incomplete-call.cpp @@ -40,7 +40,7 @@ void g() { A (B::*mfp)() = 0; (b.*mfp)(); // expected-error {{calling function with incomplete return type 'A'}} - ft(42); // expected-error {{calling 'ft' with incomplete return type 'A'}} + ft(42); // expected-error {{calling 'ft' with incomplete return type 'A'}} } diff --git a/clang/test/SemaCXX/references.cpp b/clang/test/SemaCXX/references.cpp index eaab1ae833e4e..f059eb6e64602 100644 --- a/clang/test/SemaCXX/references.cpp +++ b/clang/test/SemaCXX/references.cpp @@ -1,5 +1,6 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s -// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s +// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s +// RUN: %clang_cc1 -std=c++14 -fsyntax-only -verify %s int g(int); void f() { @@ -114,6 +115,14 @@ void test8(int& const,// expected-error{{'const' qualifier may not be applied to void restrict_ref(int &__restrict); // ok } +namespace var_template { +#if __cplusplus >= 201402L +int i; +template int &ref = i; // ok +template <> int &ref; // expected-error {{declaration of reference variable 'ref' requires an initializer}} +#endif +} // namespace var_template + template int const_param(const T) {} int const_ref_param = const_param(const_ref_param); // no-warning diff --git a/clang/test/SemaCXX/return-void.cpp b/clang/test/SemaCXX/return-void.cpp index b3aa203133dc3..c72fbdfae9fa9 100644 --- a/clang/test/SemaCXX/return-void.cpp +++ b/clang/test/SemaCXX/return-void.cpp @@ -4,7 +4,7 @@ void f1() { return {1,2}; } // expected-error {{void function 'f1' 
must not return a value}} template void f2() { return {1,2}; } // expected-error {{void function 'f2' must not return a value}} -template <> void f2() { return {1,2}; } // expected-error {{void function 'f2' must not return a value}} +template <> void f2() { return {1, 2}; } // expected-error {{void function 'f2' must not return a value}} void test_f2() { f2(); diff --git a/clang/test/SemaCXX/warn-func-not-needed.cpp b/clang/test/SemaCXX/warn-func-not-needed.cpp index 5040aaad94601..cb3cae4cd6c76 100644 --- a/clang/test/SemaCXX/warn-func-not-needed.cpp +++ b/clang/test/SemaCXX/warn-func-not-needed.cpp @@ -11,7 +11,7 @@ void foo() { namespace test1_template { template static void f() {} -template <> void f() {} // expected-warning {{function 'f' is not needed and will not be emitted}} +template <> void f() {} // expected-warning {{function 'f' is not needed and will not be emitted}} template void foo() { f(); diff --git a/clang/test/SemaCXX/warn-large-by-value-copy.cpp b/clang/test/SemaCXX/warn-large-by-value-copy.cpp index 3e419ec08f075..309fdc75dbd79 100644 --- a/clang/test/SemaCXX/warn-large-by-value-copy.cpp +++ b/clang/test/SemaCXX/warn-large-by-value-copy.cpp @@ -16,6 +16,14 @@ S100 f100(S100 s) { return s; } S101 f101(S101 s) { return s; } // expected-warning {{return value of 'f101' is a large (101 bytes) pass-by-value object}} \ // expected-warning {{'s' is a large (101 bytes) pass-by-value argument}} +void f101_no_param_name(S101) {} // expected-warning {{'' is a large (101 bytes) pass-by-value argument}} + +// FIXME: Don't warn when the return value is subject to (N)RVO. + +template T foo_template(T); +template <> S101 foo_template(S101) { return S101(); } // expected-warning {{return value of 'foo_template' is a large}} + // expected-warning@-1 {{'' is a large (101 bytes) pass-by-value argument}} + typedef int Arr[200]; void farr(Arr a) { } diff --git a/clang/test/SemaCXX/warn-member-not-needed.cpp b/clang/test/SemaCXX/warn-member-not-needed.cpp index 95241f4f7fee0..c48447719ba99 100644 --- a/clang/test/SemaCXX/warn-member-not-needed.cpp +++ b/clang/test/SemaCXX/warn-member-not-needed.cpp @@ -4,8 +4,8 @@ namespace { class A { void g() {} // expected-warning {{member function 'g' is not needed and will not be emitted}} template void gt(T) {} - template <> void gt(int) {} // expected-warning {{member function 'gt' is not needed and will not be emitted}} - template <> void gt(float) {} // expected-warning {{member function 'gt' is not needed and will not be emitted}} + template <> void gt(int) {} // expected-warning {{member function 'gt' is not needed and will not be emitted}} + template <> void gt(float) {} // expected-warning {{member function 'gt' is not needed and will not be emitted}} template void foo() { diff --git a/clang/test/SemaCXX/warn-pure-virtual-call-from-ctor-dtor.cpp b/clang/test/SemaCXX/warn-pure-virtual-call-from-ctor-dtor.cpp index 789935e3470ac..9acf84c6ce8c5 100644 --- a/clang/test/SemaCXX/warn-pure-virtual-call-from-ctor-dtor.cpp +++ b/clang/test/SemaCXX/warn-pure-virtual-call-from-ctor-dtor.cpp @@ -22,8 +22,8 @@ struct C { }; template struct TA { - TA() { f(); } // expected-warning {{call to pure virtual member function 'f' has undefined behavior; overrides of 'f' in subclasses are not available in the constructor of 'TA'}} - ~TA() { f(); } // expected-warning {{call to pure virtual member function 'f' has undefined behavior; overrides of 'f' in subclasses are not available in the destructor of 'TA'}} + TA() { f(); } // expected-warning {{call to pure virtual 
member function 'f' has undefined behavior; overrides of 'f' in subclasses are not available in the constructor of 'TA'}} + ~TA() { f(); } // expected-warning {{call to pure virtual member function 'f' has undefined behavior; overrides of 'f' in subclasses are not available in the destructor of 'TA'}} virtual void f() = 0; // expected-note 2{{'f' declared here}} }; @@ -35,8 +35,8 @@ template <> struct TA { }; template <> struct TA { - TA() { f(); } // expected-warning {{call to pure virtual member function 'f' has undefined behavior; overrides of 'f' in subclasses are not available in the constructor of 'TA'}} - ~TA() { f(); } // expected-warning {{call to pure virtual member function 'f' has undefined behavior; overrides of 'f' in subclasses are not available in the destructor of 'TA'}} + TA() { f(); } // expected-warning {{call to pure virtual member function 'f' has undefined behavior; overrides of 'f' in subclasses are not available in the constructor of 'TA'}} + ~TA() { f(); } // expected-warning {{call to pure virtual member function 'f' has undefined behavior; overrides of 'f' in subclasses are not available in the destructor of 'TA'}} virtual void f() = 0; // expected-note 2{{'f' declared here}} }; diff --git a/clang/test/SemaCXX/warn-pure-virtual-kext.cpp b/clang/test/SemaCXX/warn-pure-virtual-kext.cpp index 8431e202ad714..d23456fa4fd5c 100644 --- a/clang/test/SemaCXX/warn-pure-virtual-kext.cpp +++ b/clang/test/SemaCXX/warn-pure-virtual-kext.cpp @@ -10,7 +10,7 @@ struct A { template struct TA { virtual void f() = 0; // expected-note {{'f' declared here}} - TA() { TA::f(); } // expected-warning {{call to pure virtual member function 'f' has undefined behavior; overrides of 'f' in subclasses are not available in the constructor of 'TA'}} // expected-note {{qualified call to 'TA'::'f' is treated as a virtual call to 'f' due to -fapple-kext}} + TA() { TA::f(); } // expected-warning {{call to pure virtual member function 'f' has undefined behavior; overrides of 'f' in subclasses are not available in the constructor of 'TA'}} // expected-note {{qualified call to 'TA'::'f' is treated as a virtual call to 'f' due to -fapple-kext}} }; struct B : TA { // expected-note {{in instantiation of member function 'TA::TA' requested here}} diff --git a/clang/test/SemaCXX/warn-unused-filescoped.cpp b/clang/test/SemaCXX/warn-unused-filescoped.cpp index 056543d5eeb08..d53608003b16d 100644 --- a/clang/test/SemaCXX/warn-unused-filescoped.cpp +++ b/clang/test/SemaCXX/warn-unused-filescoped.cpp @@ -67,7 +67,7 @@ struct S { template void tf() {} // expected-warning{{unused function template 'tf'}} - template <> void tf() {} // expected-warning{{unused function 'tf'}} + template <> void tf() {} // expected-warning{{unused function 'tf'}} struct VS { virtual void vm() { } @@ -102,7 +102,7 @@ struct S2 { template int vt = 0; // expected-warning {{unused variable template 'vt'}} template int vt = 0; - template <> int vt = 0; // expected-warning {{unused variable 'vt'}} + template <> int vt = 0; // expected-warning {{unused variable 'vt'}} } namespace PR8841 { @@ -132,7 +132,7 @@ namespace test4 { namespace rdar8733476 { static void foo() {} // expected-warning {{function 'foo' is not needed and will not be emitted}} template static void foo_t() {} // expected-warning {{unused function template 'foo_t'}} -template <> void foo_t() {} // expected-warning {{function 'foo_t' is not needed and will not be emitted}} +template <> void foo_t() {} // expected-warning {{function 'foo_t' is not needed and will not be 
emitted}} template void bar() { @@ -157,7 +157,7 @@ namespace test5 { namespace { // FIXME: Should be "unused variable template 'var_t'" instead. template const double var_t = 0; // expected-warning {{unused variable 'var_t'}} - template <> const double var_t = 0; // expected-warning {{variable 'var_t' is not needed and will not be emitted}} + template <> const double var_t = 0; // expected-warning {{variable 'var_t' is not needed and will not be emitted}} int z = sizeof(var_t); // expected-warning {{unused variable 'z'}} } // namespace } diff --git a/clang/test/SemaCXX/warn-variable-not-needed.cpp b/clang/test/SemaCXX/warn-variable-not-needed.cpp index 139c2923f4bae..103be189068f8 100644 --- a/clang/test/SemaCXX/warn-variable-not-needed.cpp +++ b/clang/test/SemaCXX/warn-variable-not-needed.cpp @@ -5,7 +5,7 @@ namespace test1 { namespace { template int abc_template = 0; - template <> int abc_template = 0; // expected-warning {{variable 'abc_template' is not needed and will not be emitted}} + template <> int abc_template = 0; // expected-warning {{variable 'abc_template' is not needed and will not be emitted}} } // namespace template int foo(void) { From 536736995bf5d073853c7e884968c9847b4ae64d Mon Sep 17 00:00:00 2001 From: Serge Pavlov Date: Wed, 22 Apr 2020 23:00:12 +0700 Subject: [PATCH 0280/1035] [Support] Add file lock/unlock functions This is a recommit of f51bc4fb60fb, reverted in 8577595e03fa, because the function `flock` is not available on Solaris. In this variant `flock` was replaced with `fcntl`, which is a POSIX function. New functions `lockFile`, `tryLockFile` and `unlockFile` implement simple file locking. They lock or unlock the entire file. This should be enough to support simultaneous writes to log files in parallel builds. Differential Revision: https://reviews.llvm.org/D78896 --- llvm/include/llvm/Support/FileSystem.h | 37 ++++++++++++++++++ llvm/lib/Support/Unix/Path.inc | 45 +++++++++++++++++++++ llvm/lib/Support/Windows/Path.inc | 37 ++++++++++++++++++ llvm/unittests/Support/ProgramTest.cpp | 54 ++++++++++++++++++++++++++ 4 files changed, 173 insertions(+) diff --git a/llvm/include/llvm/Support/FileSystem.h b/llvm/include/llvm/Support/FileSystem.h index a29a9d787947f..b6d2a9f3aad53 100644 --- a/llvm/include/llvm/Support/FileSystem.h +++ b/llvm/include/llvm/Support/FileSystem.h @@ -1131,6 +1131,43 @@ Expected openNativeFileForRead(const Twine &Name, OpenFlags Flags = OF_None, SmallVectorImpl *RealPath = nullptr); +/// Try to lock the file within the specified time. +/// +/// This function implements advisory locking on the entire file. If it returns +/// errc::success, the file is locked by the calling process. Until the +/// process unlocks the file by calling \a unlockFile, all attempts to lock the +/// same file will fail/block. The process that locked the file may assume that +/// no other process reads or writes this file, provided that all processes +/// lock the file prior to accessing its content. +/// +/// @param FD The descriptor representing the file to lock. +/// @param Timeout Time in milliseconds that the process should wait before +/// reporting lock failure. A zero value means the lock is tried only +/// once. +/// @returns errc::success if lock is successfully obtained, +/// errc::no_lock_available if the file cannot be locked, or platform-specific +/// error_code otherwise. 
+/// +/// @note Care should be taken when using this function in a multithreaded +/// context, as it may not prevent other threads in the same process from +/// obtaining a lock on the same file, even if they are using a different file +/// descriptor. +std::error_code +tryLockFile(int FD, + std::chrono::milliseconds Timeout = std::chrono::milliseconds(0)); + +/// Lock the file. +/// +/// This function acts as @ref tryLockFile but it waits infinitely. +std::error_code lockFile(int FD); + +/// Unlock the file. +/// +/// @param File The descriptor representing the file to unlock. +/// @returns errc::success if lock is successfully released or platform-specific +/// error_code otherwise. +std::error_code unlockFile(int FD); + /// @brief Close the file object. This should be used instead of ::close for /// portability. On error, the caller should assume the file is closed, as is /// the case for Process::SafelyCloseFileDescriptor diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc index d91b269cc6d33..fa4682dd33d2f 100644 --- a/llvm/lib/Support/Unix/Path.inc +++ b/llvm/lib/Support/Unix/Path.inc @@ -33,6 +33,7 @@ #include #include +#include #ifdef __APPLE__ #include @@ -1078,6 +1079,50 @@ Expected readNativeFileSlice(file_t FD, MutableArrayRef Buf, return NumRead; } +std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) { + auto Start = std::chrono::steady_clock::now(); + auto End = Start + Timeout; + do { + struct flock Lock; + memset(&Lock, 0, sizeof(Lock)); + Lock.l_type = F_WRLCK; + Lock.l_whence = SEEK_SET; + Lock.l_start = 0; + Lock.l_len = 0; + if (::fcntl(FD, F_SETLK, &Lock) != -1) + return std::error_code(); + int Error = errno; + if (Error != EACCES && Error != EAGAIN) + return std::error_code(Error, std::generic_category()); + usleep(1000); + } while (std::chrono::steady_clock::now() < End); + return make_error_code(errc::no_lock_available); +} + +std::error_code lockFile(int FD) { + struct flock Lock; + memset(&Lock, 0, sizeof(Lock)); + Lock.l_type = F_WRLCK; + Lock.l_whence = SEEK_SET; + Lock.l_start = 0; + Lock.l_len = 0; + if (::fcntl(FD, F_SETLKW, &Lock) != -1) + return std::error_code(); + int Error = errno; + return std::error_code(Error, std::generic_category()); +} + +std::error_code unlockFile(int FD) { + struct flock Lock; + Lock.l_type = F_UNLCK; + Lock.l_whence = SEEK_SET; + Lock.l_start = 0; + Lock.l_len = 0; + if (::fcntl(FD, F_SETLK, &Lock) != -1) + return std::error_code(); + return std::error_code(errno, std::generic_category()); +} + std::error_code closeFile(file_t &F) { file_t TmpF = F; F = kInvalidFile; diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index e352beb77616b..3570d1d6e0563 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -1260,6 +1260,43 @@ Expected readNativeFileSlice(file_t FileHandle, return readNativeFileImpl(FileHandle, Buf, &Overlapped); } +std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) { + DWORD Flags = LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY; + OVERLAPPED OV = {0}; + file_t File = convertFDToNativeFile(FD); + auto Start = std::chrono::steady_clock::now(); + auto End = Start + Timeout; + do { + if (::LockFileEx(File, Flags, 0, MAXDWORD, MAXDWORD, &OV)) + return std::error_code(); + DWORD Error = ::GetLastError(); + if (Error == ERROR_LOCK_VIOLATION) { + ::Sleep(1); + continue; + } + return mapWindowsError(Error); + } while (std::chrono::steady_clock::now() < End); + return 
mapWindowsError(ERROR_LOCK_VIOLATION); +} + +std::error_code lockFile(int FD) { + DWORD Flags = LOCKFILE_EXCLUSIVE_LOCK; + OVERLAPPED OV = {0}; + file_t File = convertFDToNativeFile(FD); + if (::LockFileEx(File, Flags, 0, MAXDWORD, MAXDWORD, &OV)) + return std::error_code(); + DWORD Error = ::GetLastError(); + return mapWindowsError(Error); +} + +std::error_code unlockFile(int FD) { + OVERLAPPED OV = {0}; + file_t File = convertFDToNativeFile(FD); + if (::UnlockFileEx(File, 0, MAXDWORD, MAXDWORD, &OV)) + return std::error_code(); + return mapWindowsError(::GetLastError()); +} + std::error_code closeFile(file_t &F) { file_t TmpF = F; F = kInvalidFile; diff --git a/llvm/unittests/Support/ProgramTest.cpp b/llvm/unittests/Support/ProgramTest.cpp index 9052b66b5fb9e..84a5d3f64cfe6 100644 --- a/llvm/unittests/Support/ProgramTest.cpp +++ b/llvm/unittests/Support/ProgramTest.cpp @@ -14,6 +14,7 @@ #include "llvm/Support/Path.h" #include "gtest/gtest.h" #include +#include #if defined(__APPLE__) # include #elif !defined(_MSC_VER) @@ -361,4 +362,57 @@ TEST_F(ProgramEnvTest, TestExecuteAndWaitStatistics) { ASSERT_GE(ProcStat->TotalTime, ProcStat->UserTime); } +TEST_F(ProgramEnvTest, TestLockFile) { + using namespace llvm::sys; + + if (const char *LockedFile = getenv("LLVM_PROGRAM_TEST_LOCKED_FILE")) { + // Child process. + int FD2; + ASSERT_NO_ERROR(fs::openFileForReadWrite(LockedFile, FD2, + fs::CD_OpenExisting, fs::OF_None)); + + std::error_code ErrC = fs::tryLockFile(FD2, std::chrono::seconds(5)); + ASSERT_NO_ERROR(ErrC); + ASSERT_NO_ERROR(fs::unlockFile(FD2)); + close(FD2); + exit(0); + } + + // Create file that will be locked. + SmallString<64> LockedFile; + int FD1; + ASSERT_NO_ERROR( + fs::createTemporaryFile("TestLockFile", "temp", FD1, LockedFile)); + + std::string Executable = + sys::fs::getMainExecutable(TestMainArgv0, &ProgramTestStringArg1); + StringRef argv[] = {Executable, "--gtest_filter=ProgramEnvTest.TestLockFile"}; + + // Add LLVM_PROGRAM_TEST_LOCKED_FILE to the environment of the child. + std::string EnvVar = "LLVM_PROGRAM_TEST_LOCKED_FILE="; + EnvVar += LockedFile.str(); + addEnvVar(EnvVar); + + // Lock the file. + ASSERT_NO_ERROR(fs::tryLockFile(FD1)); + + std::string Error; + bool ExecutionFailed; + ProcessInfo PI2 = ExecuteNoWait(Executable, argv, getEnviron(), {}, 0, &Error, + &ExecutionFailed); + ASSERT_FALSE(ExecutionFailed) << Error; + ASSERT_TRUE(Error.empty()); + ASSERT_NE(PI2.Pid, ProcessInfo::InvalidPid) << "Invalid process id"; + + // Wait some time to give the child process a chance to start. 
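// A minimal sketch of a typical client of the new locking API, assuming a
// hypothetical log file "build.log" and helper appendLogRecord(); most error
// handling is elided:
//   int FD;
//   if (std::error_code EC = fs::openFileForReadWrite(
//           "build.log", FD, fs::CD_OpenExisting, fs::OF_None))
//     return EC;
//   if (!fs::tryLockFile(FD, std::chrono::milliseconds(500))) {
//     appendLogRecord(FD); // this process has exclusive access here
//     fs::unlockFile(FD);
//   }
//   close(FD);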
+ std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + ASSERT_NO_ERROR(fs::unlockFile(FD1)); + ProcessInfo WaitResult = llvm::sys::Wait(PI2, 5 /* seconds */, true, &Error); + ASSERT_TRUE(Error.empty()); + ASSERT_EQ(0, WaitResult.ReturnCode); + ASSERT_EQ(WaitResult.Pid, PI2.Pid); + sys::fs::remove(LockedFile); +} + } // end anonymous namespace From b81fd5aeecd8047ef62348b67cab2cf9a1577d8e Mon Sep 17 00:00:00 2001 From: Bruno Ricci Date: Tue, 28 Jul 2020 10:58:34 +0100 Subject: [PATCH 0281/1035] [clang-format][NFC] Fix a Wdocumentation warning in TokenAnnotator.cpp --- clang/lib/Format/TokenAnnotator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index b19fc34bcc802..6cbaf8a30812b 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -27,7 +27,7 @@ namespace format { namespace { /// Returns \c true if the token can be used as an identifier in -/// an Objective-C \c @selector, \c false otherwise. +/// an Objective-C \c \@selector, \c false otherwise. /// /// Because getFormattingLangOpts() always lexes source code as /// Objective-C++, C++ keywords like \c new and \c delete are From 5ee07dc53fcaaad35d33478e8702e443ac1cb058 Mon Sep 17 00:00:00 2001 From: Luofan Chen Date: Tue, 28 Jul 2020 18:02:49 +0800 Subject: [PATCH 0282/1035] [Attributor] Track AA dependency using dependency graph This patch adds a dependency graph to the Attributor so that we can dump the dependencies between AAs more easily. We can also apply general graph algorithms to the graph, making it easier for us to create deep wrappers. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D78861 --- llvm/include/llvm/Transforms/IPO/Attributor.h | 103 ++++++++-- llvm/lib/Transforms/IPO/Attributor.cpp | 180 ++++++++++++++++-- .../Transforms/IPO/AttributorAttributes.cpp | 8 +- llvm/test/Transforms/Attributor/depgraph.ll | 174 +++++++++++++++++ 4 files changed, 428 insertions(+), 37 deletions(-) create mode 100644 llvm/test/Transforms/Attributor/depgraph.ll diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index 1eead8ee788de..48c65c37eec78 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -97,8 +97,10 @@ #ifndef LLVM_TRANSFORMS_IPO_ATTRIBUTOR_H #define LLVM_TRANSFORMS_IPO_ATTRIBUTOR_H +#include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumeBundleQueries.h" @@ -116,10 +118,15 @@ #include "llvm/IR/ConstantRange.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/DOTGraphTraits.h" +#include "llvm/Support/GraphWriter.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" namespace llvm { +struct AADepGraphNode; +struct AADepGraph; struct Attributor; struct AbstractAttribute; struct InformationCache; @@ -144,6 +151,70 @@ enum class DepClassTy { }; ///}
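// A sketch of what the graph added below enables, mirroring
// AADepGraph::print() in Attributor.cpp: every registered abstract attribute
// hangs off the synthetic root, so the whole set can be walked uniformly.
// Here DG names the AADepGraph owned by an Attributor after a run:
//   for (AbstractAttribute *AA : DG.SyntheticRoot)
//     AA->printWithDeps(outs());
// The new -attributor-print-dep, -attributor-dump-dep-graph and
// -attributor-view-dep-graph options are essentially wrappers around such
// traversals.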
+ TinyPtrVector Deps; + + static AADepGraphNode *DepGetVal(DepTy &DT) { return DT.getPointer(); } + static AbstractAttribute *DepGetValAA(DepTy &DT) { + return cast(DT.getPointer()); + } + + operator AbstractAttribute *() { return cast(this); } + +public: + using iterator = + mapped_iterator::iterator, decltype(&DepGetVal)>; + using aaiterator = + mapped_iterator::iterator, decltype(&DepGetValAA)>; + + aaiterator begin() { return aaiterator(Deps.begin(), &DepGetValAA); } + aaiterator end() { return aaiterator(Deps.end(), &DepGetValAA); } + iterator child_begin() { return iterator(Deps.begin(), &DepGetVal); } + iterator child_end() { return iterator(Deps.end(), &DepGetVal); } + + virtual void print(raw_ostream &OS) const { OS << "AADepNode Impl\n"; } + TinyPtrVector &getDeps() { return Deps; } + + friend struct Attributor; + friend struct AADepGraph; +}; + +struct AADepGraph { + AADepGraph() {} + ~AADepGraph() {} + + using DepTy = AADepGraphNode::DepTy; + static AADepGraphNode *DepGetVal(DepTy &DT) { return DT.getPointer(); } + using iterator = + mapped_iterator::iterator, decltype(&DepGetVal)>; + + /// There is no root node for the dependency graph. But the SCCIterator + /// requires a single entry point, so we maintain a fake("synthetic") root + /// node that depends on every node. + AADepGraphNode SyntheticRoot; + + AADepGraphNode *GetEntryNode() { return &SyntheticRoot; } + + iterator begin() { return SyntheticRoot.child_begin(); } + iterator end() { return SyntheticRoot.child_end(); } + + void viewGraph(); + + /// Dump graph to file + void dumpGraph(); + + /// Print dependency graph + void print(); +}; + /// Helper to describe and deal with positions in the LLVM-IR. /// /// A position in the IR is described by an anchor value and an "offset" that @@ -1020,7 +1091,9 @@ struct Attributor { assert(!AAPtr && "Attribute already in map!"); AAPtr = &AA; - AllAbstractAttributes.push_back(&AA); + DG.SyntheticRoot.Deps.push_back( + AADepGraphNode::DepTy(&AA, unsigned(DepClassTy::REQUIRED))); + return AA; } @@ -1382,12 +1455,6 @@ struct Attributor { /// See getOrCreateAAFor. bool shouldSeedAttribute(AbstractAttribute &AA); - /// The set of all abstract attributes. - ///{ - using AAVector = SmallVector; - AAVector AllAbstractAttributes; - ///} - /// A nested map to lookup abstract attributes based on the argument position /// on the outer level, and the addresses of the static member (AAType::ID) on /// the inner level. @@ -1409,6 +1476,9 @@ struct Attributor { /// Helper to update an underlying call graph. CallGraphUpdater &CGUpdater; + /// Abstract Attribute dependency graph + AADepGraph DG; + /// Set of functions for which we modified the content such that it might /// impact the call graph. SmallPtrSet CGModifiedFunctions; @@ -1458,6 +1528,8 @@ struct Attributor { SmallPtrSet ToBeDeletedBlocks; SmallDenseSet ToBeDeletedInsts; ///} + + friend AADepGraph; }; /// An interface to query the internal state of an abstract attribute. @@ -2030,7 +2102,7 @@ struct IRAttribute : public BaseType { /// both directions will be added in the future. /// NOTE: The mechanics of adding a new "concrete" abstract attribute are /// described in the file comment. -struct AbstractAttribute : public IRPosition { +struct AbstractAttribute : public IRPosition, public AADepGraphNode { using StateType = AbstractState; AbstractAttribute(const IRPosition &IRP) : IRPosition(IRP) {} @@ -2038,6 +2110,14 @@ struct AbstractAttribute : public IRPosition { /// Virtual destructor. 
virtual ~AbstractAttribute() {} + /// This function is used to identify if an \p DGN is of type + /// AbstractAttribute so that the dyn_cast and cast can use such information + /// to cast an AADepGraphNode to an AbstractAttribute. + /// + /// We eagerly return true here because all AADepGraphNodes except for the + /// synthetic root node are of type AbstractAttribute + static bool classof(const AADepGraphNode *DGN) { return true; } + /// Initialize the state with the information in the Attributor \p A. /// /// This function is called by the Attributor once all abstract attributes @@ -2059,6 +2139,7 @@ /// Helper functions, for debug purposes only. ///{ virtual void print(raw_ostream &OS) const; + virtual void printWithDeps(raw_ostream &OS) const; void dump() const { print(dbgs()); } /// This function should return the "summarized" assumed state as string. @@ -2106,12 +2187,6 @@ /// /// \Return CHANGED if the internal state changed, otherwise UNCHANGED. virtual ChangeStatus updateImpl(Attributor &A) = 0; - -private: - /// Set of abstract attributes which were queried by this one. The bit encodes - /// if there is an optional of required dependence. - using DepTy = PointerIntPair; - TinyPtrVector Deps; }; /// Forward declarations of output streams for debug purposes. diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index f96dac5f3515c..003baf39e8bdf 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -15,7 +15,10 @@ #include "llvm/Transforms/IPO/Attributor.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/TinyPtrVector.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/ValueTracking.h" @@ -25,10 +28,15 @@ #include "llvm/InitializePasses.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/GraphWriter.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include +#include using namespace llvm; @@ -85,6 +93,23 @@ static cl::list "allowed to be seeded."), cl::ZeroOrMore, cl::CommaSeparated); +static cl::opt + DumpDepGraph("attributor-dump-dep-graph", cl::Hidden, + cl::desc("Dump the dependency graph to dot files."), + cl::init(false)); + +static cl::opt DepGraphDotFileNamePrefix( + "attributor-depgraph-dot-filename-prefix", cl::Hidden, + cl::desc("The prefix used for the dependency graph dot file names.")); + +static cl::opt ViewDepGraph("attributor-view-dep-graph", cl::Hidden, + cl::desc("View the dependency graph."), + cl::init(false)); + +static cl::opt PrintDependencies("attributor-print-dep", cl::Hidden, + cl::desc("Print attribute dependencies"), + cl::init(false)); + /// Logic operators for the change status enum class. /// ///{ @@ -498,8 +523,10 @@ Attributor::getAssumedConstant(const Value &V, const AbstractAttribute &AA, Attributor::~Attributor() { // The abstract attributes are allocated via the BumpPtrAllocator Allocator, // thus we cannot delete them. We can, and want to, destruct them though. 
- for (AbstractAttribute *AA : AllAbstractAttributes) + for (auto &DepAA : DG.SyntheticRoot.Deps) { + AbstractAttribute *AA = cast(DepAA.getPointer()); AA->~AbstractAttribute(); + } } bool Attributor::isAssumedDead(const AbstractAttribute &AA, @@ -904,7 +931,7 @@ bool Attributor::checkForAllReadWriteInstructions( void Attributor::runTillFixpoint() { LLVM_DEBUG(dbgs() << "[Attributor] Identified and initialized " - << AllAbstractAttributes.size() + << DG.SyntheticRoot.Deps.size() << " abstract attributes.\n"); // Now that all abstract attributes are collected and initialized we start @@ -914,11 +941,11 @@ void Attributor::runTillFixpoint() { SmallVector ChangedAAs; SetVector Worklist, InvalidAAs; - Worklist.insert(AllAbstractAttributes.begin(), AllAbstractAttributes.end()); + Worklist.insert(DG.SyntheticRoot.begin(), DG.SyntheticRoot.end()); do { // Remember the size to determine new attributes. - size_t NumAAs = AllAbstractAttributes.size(); + size_t NumAAs = DG.SyntheticRoot.Deps.size(); LLVM_DEBUG(dbgs() << "\n\n[Attributor] #Iteration: " << IterationCounter << ", Worklist size: " << Worklist.size() << "\n"); @@ -935,7 +962,7 @@ void Attributor::runTillFixpoint() { while (!InvalidAA->Deps.empty()) { const auto &Dep = InvalidAA->Deps.back(); InvalidAA->Deps.pop_back(); - AbstractAttribute *DepAA = Dep.getPointer(); + AbstractAttribute *DepAA = cast(Dep.getPointer()); if (Dep.getInt() == unsigned(DepClassTy::OPTIONAL)) { Worklist.insert(DepAA); continue; @@ -953,7 +980,8 @@ void Attributor::runTillFixpoint() { // changed to the work list. for (AbstractAttribute *ChangedAA : ChangedAAs) while (!ChangedAA->Deps.empty()) { - Worklist.insert(ChangedAA->Deps.back().getPointer()); + Worklist.insert( + cast(ChangedAA->Deps.back().getPointer())); ChangedAA->Deps.pop_back(); } @@ -981,8 +1009,8 @@ void Attributor::runTillFixpoint() { // Add attributes to the changed set if they have been created in the last // iteration. - ChangedAAs.append(AllAbstractAttributes.begin() + NumAAs, - AllAbstractAttributes.end()); + ChangedAAs.append(DG.SyntheticRoot.begin() + NumAAs, + DG.SyntheticRoot.end()); // Reset the work list and repopulate with the changed abstract attributes. // Note that dependent ones are added above. 
@@ -1015,7 +1043,8 @@ void Attributor::runTillFixpoint() { } while (!ChangedAA->Deps.empty()) { - ChangedAAs.push_back(ChangedAA->Deps.back().getPointer()); + ChangedAAs.push_back( + cast(ChangedAA->Deps.back().getPointer())); ChangedAA->Deps.pop_back(); } } @@ -1037,12 +1066,13 @@ void Attributor::runTillFixpoint() { } ChangeStatus Attributor::manifestAttributes() { - size_t NumFinalAAs = AllAbstractAttributes.size(); + size_t NumFinalAAs = DG.SyntheticRoot.Deps.size(); unsigned NumManifested = 0; unsigned NumAtFixpoint = 0; ChangeStatus ManifestChange = ChangeStatus::UNCHANGED; - for (AbstractAttribute *AA : AllAbstractAttributes) { + for (auto &DepAA : DG.SyntheticRoot.Deps) { + AbstractAttribute *AA = cast(DepAA.getPointer()); AbstractState &State = AA->getState(); // If there is not already a fixpoint reached, we can now take the @@ -1082,11 +1112,14 @@ ChangeStatus Attributor::manifestAttributes() { NumAttributesValidFixpoint += NumAtFixpoint; (void)NumFinalAAs; - if (NumFinalAAs != AllAbstractAttributes.size()) { - for (unsigned u = NumFinalAAs; u < AllAbstractAttributes.size(); ++u) - errs() << "Unexpected abstract attribute: " << *AllAbstractAttributes[u] + if (NumFinalAAs != DG.SyntheticRoot.Deps.size()) { + for (unsigned u = NumFinalAAs; u < DG.SyntheticRoot.Deps.size(); ++u) + errs() << "Unexpected abstract attribute: " + << cast(DG.SyntheticRoot.Deps[u].getPointer()) << " :: " - << AllAbstractAttributes[u]->getIRPosition().getAssociatedValue() + << cast(DG.SyntheticRoot.Deps[u].getPointer()) + ->getIRPosition() + .getAssociatedValue() << "\n"; llvm_unreachable("Expected the final number of abstract attributes to " "remain unchanged!"); @@ -1265,6 +1298,17 @@ ChangeStatus Attributor::cleanupIR() { ChangeStatus Attributor::run() { SeedingPeriod = false; runTillFixpoint(); + + // dump graphs on demand + if (DumpDepGraph) + DG.dumpGraph(); + + if (ViewDepGraph) + DG.viewGraph(); + + if (PrintDependencies) + DG.print(); + ChangeStatus ManifestChange = manifestAttributes(); ChangeStatus CleanupChange = cleanupIR(); return ManifestChange | CleanupChange; @@ -2028,8 +2072,31 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const AbstractAttribute &AA) { } void AbstractAttribute::print(raw_ostream &OS) const { - OS << "[P: " << getIRPosition() << "][" << getAsStr() << "][S: " << getState() - << "]"; + OS << "["; + OS << getName(); + OS << "] for CtxI "; + + if (auto *I = getCtxI()) { + OS << "'"; + I->print(OS); + OS << "'"; + } else + OS << "<>"; + + OS << " at position " << getIRPosition() << " with state " << getAsStr() + << '\n'; +} + +void AbstractAttribute::printWithDeps(raw_ostream &OS) const { + print(OS); + + for (const auto &DepAA : Deps) { + auto *AA = DepAA.getPointer(); + OS << " updates "; + AA->print(OS); + } + + OS << '\n'; } ///} @@ -2064,8 +2131,8 @@ static bool runAttributorOnFunctions(InformationCache &InfoCache, NumFnWithoutExactDefinition++; // We look at internal functions only on-demand but if any use is not a - // direct call or outside the current set of analyzed functions, we have to - // do it eagerly. + // direct call or outside the current set of analyzed functions, we have + // to do it eagerly. 
if (F->hasLocalLinkage()) { if (llvm::all_of(F->uses(), [&Functions](const Use &U) { const auto *CB = dyn_cast(U.getUser()); @@ -2081,11 +2148,41 @@ static bool runAttributorOnFunctions(InformationCache &InfoCache, } ChangeStatus Changed = A.run(); + LLVM_DEBUG(dbgs() << "[Attributor] Done with " << Functions.size() << " functions, result: " << Changed << ".\n"); return Changed == ChangeStatus::CHANGED; } +void AADepGraph::viewGraph() { llvm::ViewGraph(this, "Dependency Graph"); } + +void AADepGraph::dumpGraph() { + static std::atomic CallTimes; + std::string Prefix; + + if (!DepGraphDotFileNamePrefix.empty()) + Prefix = DepGraphDotFileNamePrefix; + else + Prefix = "dep_graph"; + std::string Filename = + Prefix + "_" + std::to_string(CallTimes.load()) + ".dot"; + + outs() << "Dependency graph dump to " << Filename << ".\n"; + + std::error_code EC; + + raw_fd_ostream File(Filename, EC, sys::fs::OF_Text); + if (!EC) + llvm::WriteGraph(File, this); + + CallTimes++; +} + +void AADepGraph::print() { + for (auto DepAA : SyntheticRoot.Deps) + cast(DepAA.getPointer())->printWithDeps(outs()); +} + PreservedAnalyses AttributorPass::run(Module &M, ModuleAnalysisManager &AM) { FunctionAnalysisManager &FAM = AM.getResult(M).getManager(); @@ -2132,6 +2229,51 @@ PreservedAnalyses AttributorCGSCCPass::run(LazyCallGraph::SCC &C, return PreservedAnalyses::all(); } +namespace llvm { + +template <> struct GraphTraits { + using NodeRef = AADepGraphNode *; + using DepTy = PointerIntPair; + using EdgeRef = PointerIntPair; + + static NodeRef getEntryNode(AADepGraphNode *DGN) { return DGN; } + static NodeRef DepGetVal(DepTy &DT) { return DT.getPointer(); } + + using ChildIteratorType = + mapped_iterator::iterator, decltype(&DepGetVal)>; + using ChildEdgeIteratorType = TinyPtrVector::iterator; + + static ChildIteratorType child_begin(NodeRef N) { return N->child_begin(); } + + static ChildIteratorType child_end(NodeRef N) { return N->child_end(); } +}; + +template <> +struct GraphTraits : public GraphTraits { + static NodeRef getEntryNode(AADepGraph *DG) { return DG->GetEntryNode(); } + + using nodes_iterator = + mapped_iterator::iterator, decltype(&DepGetVal)>; + + static nodes_iterator nodes_begin(AADepGraph *DG) { return DG->begin(); } + + static nodes_iterator nodes_end(AADepGraph *DG) { return DG->end(); } +}; + +template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { + DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + + static std::string getNodeLabel(const AADepGraphNode *Node, + const AADepGraph *DG) { + std::string AAString = ""; + raw_string_ostream O(AAString); + Node->print(O); + return AAString; + } +}; + +} // end namespace llvm + namespace { struct AttributorLegacyPass : public ModulePass { diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index dc916089d394e..cb25030ce71c4 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -1052,9 +1052,10 @@ ChangeStatus AAReturnedValuesImpl::updateImpl(Attributor &A) { // map, NewRVsMap. 
decltype(ReturnedValues) NewRVsMap; - auto HandleReturnValue = [&](Value *RV, SmallSetVector &RIs) { - LLVM_DEBUG(dbgs() << "[AAReturnedValues] Returned value: " << *RV - << " by #" << RIs.size() << " RIs\n"); + auto HandleReturnValue = [&](Value *RV, + SmallSetVector &RIs) { + LLVM_DEBUG(dbgs() << "[AAReturnedValues] Returned value: " << *RV << " by #" + << RIs.size() << " RIs\n"); CallBase *CB = dyn_cast(RV); if (!CB || UnresolvedCalls.count(CB)) return; @@ -3425,7 +3426,6 @@ struct AADereferenceableFloating : AADereferenceableImpl { T.GlobalState &= DS.GlobalState; } - // For now we do not try to "increase" dereferenceability due to negative // indices as we first have to come up with code to deal with loops and // for overflows of the dereferenceable bytes. diff --git a/llvm/test/Transforms/Attributor/depgraph.ll b/llvm/test/Transforms/Attributor/depgraph.ll new file mode 100644 index 0000000000000..059587789035e --- /dev/null +++ b/llvm/test/Transforms/Attributor/depgraph.ll @@ -0,0 +1,174 @@ +; RUN: opt -passes=attributor-cgscc -disable-output -attributor-print-dep < %s 2>&1 | FileCheck %s --check-prefixes=GRAPH +; RUN: opt -passes=attributor-cgscc -disable-output -attributor-dump-dep-graph -attributor-depgraph-dot-filename-prefix=%t < %s 2>/dev/null +; RUN: FileCheck %s -input-file=%t_0.dot --check-prefix=DOT + +; Test 0 +; +; test copied from the attributor introduction video: checkAndAdvance(), and the C code is: +; int *checkAndAdvance(int * __attribute__((aligned(16))) p) { +; if (*p == 0) +; return checkAndAdvance(p + 4); +; return p; +; } +; +define i32* @checkAndAdvance(i32* align 16 %0) { + %2 = load i32, i32* %0, align 4 + %3 = icmp eq i32 %2, 0 + br i1 %3, label %4, label %7 + +4: ; preds = %1 + %5 = getelementptr inbounds i32, i32* %0, i64 4 + %6 = call i32* @checkAndAdvance(i32* %5) + br label %8 + +7: ; preds = %1 + br label %8 + +8: ; preds = %7, %4 + %.0 = phi i32* [ %6, %4 ], [ %0, %7 ] + ret i32* %.0 +} + +; +; Check for graph +; + +;GRAPH: [AANoUnwind] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nounwind +;GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: updates [AANoUnwind] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nounwind +;GRAPH: updates [AANoUnwind] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nounwind +;GRAPH: updates [AANoUnwind] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nounwind +;GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: [AANoSync] 
for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nosync +;GRAPH: updates [AANoSync] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nosync +;GRAPH: updates [AANoSync] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nosync +;GRAPH: [AANoFree] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nofree +;GRAPH: updates [AANoFree] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state nofree +;GRAPH: updates [AANoFree] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state nofree +;GRAPH: updates [AANoFree] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nofree +;GRAPH: updates [AANoFree] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nofree +;GRAPH: [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state readonly +;GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: updates [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state readonly +;GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: updates [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state readonly +;GRAPH: updates [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state readonly +;GRAPH: updates [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state readonly +;GRAPH: updates [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state readonly +;GRAPH: updates [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state readonly +;GRAPH: updates [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state readonly +;GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: updates [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state readonly +;GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: [AAMemoryLocation] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state memory:argument +;GRAPH: updates [AAMemoryLocation] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state memory:argument +;GRAPH: updates [AAMemoryLocation] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at 
position {cs: [@-1]} with state memory:argument +;GRAPH: [AAAlign] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state align<0-16> +;GRAPH: updates [AAAlign] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state align<1-16> +;GRAPH: updates [AAAlign] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state align<1-16> +;GRAPH: [AANonNull] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state nonnull +;GRAPH: updates [AANonNull] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state nonnull +;GRAPH: updates [AANonNull] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state nonnull +;GRAPH: [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: updates [AANoCapture] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: updates [AANoCapture] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state readonly +;GRAPH: updates [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state readonly +;GRAPH: updates [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state readonly +;GRAPH: [AANoFree] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state nofree +;GRAPH: updates [AANoFree] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state nofree +;GRAPH: [AANoUnwind] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nounwind +;GRAPH: updates [AAIsDead] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state assumed-live +;GRAPH: updates [AAIsDead] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state assumed-live +;GRAPH: updates [AANoUnwind] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nounwind +;GRAPH: [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state readonly +;GRAPH: updates [AAIsDead] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state assumed-live +;GRAPH: updates [AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state readonly +;GRAPH: [AANoCapture] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: updates [AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned +;GRAPH: [AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position 
{cs_arg: [@0]} with state readonly
+;GRAPH: updates [AAMemoryLocation] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state memory:argument
+;GRAPH: updates [AAMemoryLocation] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state memory:argument
+;GRAPH: updates [AAMemoryLocation] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state memory:argument
+;GRAPH: [AANonNull] for CtxI ' %5 = getelementptr inbounds i32, i32* %0, i64 4' at position {flt: [@-1]} with state nonnull
+;GRAPH: updates [AANonNull] for CtxI ' %5 = getelementptr inbounds i32, i32* %0, i64 4' at position {flt: [@-1]} with state nonnull
+;GRAPH: updates [AANonNull] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_arg: [@0]} with state nonnull
+;GRAPH: updates [AANonNull] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state nonnull
+;GRAPH: updates [AANonNull] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state nonnull
+;GRAPH: [AANoSync] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nosync
+;GRAPH: updates [AANoSync] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nosync
+;GRAPH: [AANoFree] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state nofree
+;GRAPH: updates [AANoFree] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nofree
+;GRAPH: [AAMemoryLocation] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs: [@-1]} with state memory:argument
+;GRAPH: updates [AAMemoryLocation] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state memory:argument
+;GRAPH: [AAAlign] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state align<1-16>
+;GRAPH: updates [AAAlign] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state align<0-16>
+;GRAPH: [AANonNull] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position {cs_ret: [@-1]} with state nonnull
+;GRAPH: updates [AANonNull] for CtxI ' %2 = load i32, i32* %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state nonnull
+
+;
+; Check for .dot file
+;
+
+; DOT-DAG: Node[[Node6:0x[a-z0-9]+]] [shape=record,label="{[AANoUnwind] for CtxI ' %2 = load i32, i32* %0, align 4' at position \{fn:checkAndAdvance [checkAndAdvance@-1]\}
+; DOT-DAG: Node[[Node34:0x[a-z0-9]+]] [shape=record,label="{[AANoCapture] for CtxI ' %2 = load i32, i32* %0, align 4' at position \{arg: [@0]\}
+; DOT-DAG: Node[[Node39:0x[a-z0-9]+]] [shape=record,label="{[AANoUnwind] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position \{cs: [@-1]\}
+; DOT-DAG: Node[[Node7:0x[a-z0-9]+]] [shape=record,label="{[AANoSync] for CtxI ' %2 = load i32, i32* %0, align 4' at position \{fn:checkAndAdvance [checkAndAdvance@-1]\}
+; DOT-DAG: Node[[Node61:0x[a-z0-9]+]] [shape=record,label="{[AANoSync] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position \{cs: [@-1]\}
+; DOT-DAG: Node[[Node13:0x[a-z0-9]+]] [shape=record,label="{[AANoFree] for CtxI ' %2 = load i32, i32* %0, align 4' at position \{fn:checkAndAdvance [checkAndAdvance@-1]\}
+; DOT-DAG: Node[[Node36:0x[a-z0-9]+]] [shape=record,label="{[AANoFree] for CtxI ' %2 = load i32, i32* %0, align 4' at position \{arg: [@0]\}
+; DOT-DAG: Node[[Node62:0x[a-z0-9]+]] [shape=record,label="{[AANoFree] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position \{cs: [@-1]\}
+; DOT-DAG: Node[[Node16:0x[a-z0-9]+]] [shape=record,label="{[AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position \{fn:checkAndAdvance [checkAndAdvance@-1]\}
+; DOT-DAG: Node[[Node35:0x[a-z0-9]+]] [shape=record,label="{[AAMemoryBehavior] for CtxI ' %2 = load i32, i32* %0, align 4' at position \{arg: [@0]\}
+; DOT-DAG: Node[[Node40:0x[a-z0-9]+]] [shape=record,label="{[AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position \{cs: [@-1]\}
+; DOT-DAG: Node[[Node17:0x[a-z0-9]+]] [shape=record,label="{[AAMemoryLocation] for CtxI ' %2 = load i32, i32* %0, align 4' at position \{fn:checkAndAdvance [checkAndAdvance@-1]\}
+; DOT-DAG: Node[[Node63:0x[a-z0-9]+]] [shape=record,label="{[AAMemoryLocation] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position \{cs: [@-1]\}
+; DOT-DAG: Node[[Node22:0x[a-z0-9]+]] [shape=record,label="{[AAAlign] for CtxI ' %2 = load i32, i32* %0, align 4' at position \{fn_ret:checkAndAdvance [checkAndAdvance@-1]\}
+; DOT-DAG: Node[[Node65:0x[a-z0-9]+]] [shape=record,label="{[AAAlign] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position \{cs_ret: [@-1]\}
+; DOT-DAG: Node[[Node23:0x[a-z0-9]+]] [shape=record,label="{[AANonNull] for CtxI ' %2 = load i32, i32* %0, align 4' at position \{fn_ret:checkAndAdvance [checkAndAdvance@-1]\}
+; DOT-DAG: Node[[Node67:0x[a-z0-9]+]] [shape=record,label="{[AANonNull] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position \{cs_ret: [@-1]\}
+; DOT-DAG: Node[[Node43:0x[a-z0-9]+]] [shape=record,label="{[AANoCapture] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position \{cs_arg: [@0]\}
+; DOT-DAG: Node[[Node45:0x[a-z0-9]+]] [shape=record,label="{[AAMemoryBehavior] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position \{cs_arg: [@0]\}
+; DOT-DAG: Node[[Node46:0x[a-z0-9]+]] [shape=record,label="{[AANoFree] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position \{cs_arg: [@0]\}
+; DOT-DAG: Node[[Node38:0x[a-z0-9]+]] [shape=record,label="{[AAIsDead] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position \{cs_ret: [@-1]\}
+; DOT-DAG: Node[[Node55:0x[a-z0-9]+]] [shape=record,label="{[AANonNull] for CtxI ' %5 = getelementptr inbounds i32, i32* %0, i64 4' at position \{flt: [@-1]\}
+; DOT-DAG: Node[[Node31:0x[a-z0-9]+]] [shape=record,label="{[AANonNull] for CtxI ' %6 = call i32* @checkAndAdvance(i32* %5)' at position \{cs_arg: [@0]\}
+
+; DOT-DAG: Node[[Node6]] -> Node[[Node34]]
+; DOT-DAG: Node[[Node6]] -> Node[[Node39]]
+; DOT-DAG: Node[[Node7]] -> Node[[Node61]]
+; DOT-DAG: Node[[Node13]] -> Node[[Node36]]
+; DOT-DAG: Node[[Node13]] -> Node[[Node62]]
+; DOT-DAG: Node[[Node16]] -> Node[[Node34]]
+; DOT-DAG: Node[[Node16]] -> Node[[Node35]]
+; DOT-DAG: Node[[Node16]] -> Node[[Node40]]
+; DOT-DAG: Node[[Node17]] -> Node[[Node63]]
+; DOT-DAG: Node[[Node22]] -> Node[[Node65]]
+; DOT-DAG: Node[[Node23]] -> Node[[Node67]]
+; DOT-DAG: Node[[Node34]] -> Node[[Node43]]
+; DOT-DAG: Node[[Node35]] -> Node[[Node45]]
+; DOT-DAG: Node[[Node36]] -> Node[[Node46]]
+; DOT-DAG: Node[[Node39]] -> Node[[Node38]]
+; DOT-DAG: Node[[Node39]] -> Node[[Node6]]
+; DOT-DAG: Node[[Node40]] -> Node[[Node38]]
+; DOT-DAG: Node[[Node40]] -> Node[[Node16]]
+; DOT-DAG: Node[[Node43]] -> Node[[Node34]]
+; DOT-DAG: Node[[Node45]] -> Node[[Node17]]
+; DOT-DAG: Node[[Node55]] -> Node[[Node55]]
+; DOT-DAG: Node[[Node55]] -> Node[[Node31]]
+; DOT-DAG: Node[[Node55]] -> Node[[Node23]]
+; DOT-DAG: Node[[Node61]] -> Node[[Node7]]
+; DOT-DAG: Node[[Node62]] -> Node[[Node13]]
+; DOT-DAG: Node[[Node63]] -> Node[[Node17]]
+; DOT-DAG: Node[[Node65]] -> Node[[Node22]]
+; DOT-DAG: Node[[Node67]] -> Node[[Node23]]

From c0bd9fa137c28a3ef833b46b7f9770b060275281 Mon Sep 17 00:00:00 2001
From: Haojian Wu
Date: Tue, 28 Jul 2020 11:59:49 +0200
Subject: [PATCH 0283/1035] [Concepts] Fix ast dump for immediately declared constraint.

Reviewed By: nridge

Differential Revision: https://reviews.llvm.org/D84461
---
 clang/lib/AST/TextNodeDumper.cpp     |  2 +-
 clang/test/AST/ast-dump-concepts.cpp | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/AST/ast-dump-concepts.cpp

diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp
index 91b984820cd22..5b6c6085e02cf 100644
--- a/clang/lib/AST/TextNodeDumper.cpp
+++ b/clang/lib/AST/TextNodeDumper.cpp
@@ -1994,7 +1994,7 @@ void TextNodeDumper::VisitTemplateTypeParmDecl(const TemplateTypeParmDecl *D) {
       dumpBareDeclRef(TC->getFoundDecl());
       OS << ")";
     }
-    Visit(TC->getImmediatelyDeclaredConstraint());
+    AddChild([=] { Visit(TC->getImmediatelyDeclaredConstraint()); });
   } else if (D->wasDeclaredWithTypename())
     OS << " typename";
   else
diff --git a/clang/test/AST/ast-dump-concepts.cpp b/clang/test/AST/ast-dump-concepts.cpp
new file mode 100644
index 0000000000000..530c1baeffa77
--- /dev/null
+++ b/clang/test/AST/ast-dump-concepts.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -std=c++2a -ast-dump -ast-dump-filter Foo %s | FileCheck -strict-whitespace %s
+
+// Test with serialization:
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -x c++ -std=c++20 -triple x86_64-unknown-unknown -include-pch %t \
+// RUN: -ast-dump-all -ast-dump-filter Foo /dev/null \
+// RUN: | FileCheck --strict-whitespace %s
+
+template
+concept binary_concept = true;
+
+template
+struct Foo {
+  // CHECK: TemplateTypeParmDecl {{.*}} referenced Concept {{.*}} 'binary_concept'
+  // CHECK-NEXT: |-ConceptSpecializationExpr {{.*}} 'bool'
+  // CHECK-NEXT: `-TemplateArgument {{.*}} type 'int'
+  template R>
+  Foo(R);
+};

From 67070d98fae5c49e183ff5d68ae8038e6fd8f5a9 Mon Sep 17 00:00:00 2001
From: Mikhail Kalashnikov
Date: Tue, 28 Jul 2020 13:06:51 +0300
Subject: [PATCH 0284/1035] [llvm-readelf] Symbol index in symbol table printing is not reset

Stop using static variables for keeping track of symbol indices.
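The pitfall, for the record: a function-local `static` keeps its value across every call, so the running index survived from one symbol table to the next (and from one input file to the next). A minimal sketch of the before/after logic — the helper names here are hypothetical, not the dumper's actual interface:

```
// Old pattern: Idx persists across calls, so it keeps counting across
// unrelated symbol tables unless it is explicitly reset.
void printSymbolIndexOld() {
  static int Idx = 0;
  outs() << to_string(format_decimal(Idx++, 6)) << ":";
}

// New pattern: derive the index from the data itself; no hidden state.
void printSymbolIndexNew(const Elf_Sym *Symbol, const Elf_Sym *FirstSym) {
  outs() << to_string(format_decimal(Symbol - FirstSym, 6)) << ":";
}
```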
Bugfix for: https://bugs.llvm.org/show_bug.cgi?id=46777 Differential revision: https://reviews.llvm.org/D84606 --- llvm/test/tools/llvm-readobj/ELF/symbols.test | 10 ++++++++++ llvm/tools/llvm-readobj/ELFDumper.cpp | 13 +------------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/llvm/test/tools/llvm-readobj/ELF/symbols.test b/llvm/test/tools/llvm-readobj/ELF/symbols.test index 69d59311f662b..0878827ab6765 100644 --- a/llvm/test/tools/llvm-readobj/ELF/symbols.test +++ b/llvm/test/tools/llvm-readobj/ELF/symbols.test @@ -103,6 +103,16 @@ # RUN: llvm-readobj --dyn-symbols %t64 | FileCheck %s --implicit-check-not="{{^}}Symbols [" # RUN: llvm-readelf --dyn-symbols %t64 | FileCheck %s --implicit-check-not="Symbol table '.symtab'" +## Case 6: Test that the Num index starts from zero at every new symbol table. +# RUN: llvm-readelf --symbols %t64 %t64 | FileCheck %s --check-prefix=NUM-INDEX + +# NUM-INDEX: Symbol table '.symtab' contains 3 entries: +# NUM-INDEX-NEXT: Num: {{.*}} +# NUM-INDEX-NEXT: 0: {{.*}} +# NUM-INDEX: Symbol table '.symtab' contains 3 entries: +# NUM-INDEX-NEXT: Num: {{.*}} +# NUM-INDEX-NEXT: 0: {{.*}} + --- !ELF FileHeader: Class: ELFCLASS[[BITS]] diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 72dcb9c5a15aa..590108a86fa7e 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -3924,21 +3924,10 @@ void GNUStyle::printSymbol(const ELFO *Obj, const Elf_Sym *Symbol, const Elf_Sym *FirstSym, Optional StrTable, bool IsDynamic, bool NonVisibilityBitsUsed) { - static int Idx = 0; - static bool Dynamic = true; - - // If this function was called with a different value from IsDynamic - // from last call, happens when we move from dynamic to static symbol - // table, "Num" field should be reset. - if (!Dynamic != !IsDynamic) { - Idx = 0; - Dynamic = false; - } - unsigned Bias = ELFT::Is64Bits ? 8 : 0; Field Fields[8] = {0, 8, 17 + Bias, 23 + Bias, 31 + Bias, 38 + Bias, 48 + Bias, 51 + Bias}; - Fields[0].Str = to_string(format_decimal(Idx++, 6)) + ":"; + Fields[0].Str = to_string(format_decimal(Symbol - FirstSym, 6)) + ":"; Fields[1].Str = to_string( format_hex_no_prefix(Symbol->st_value, ELFT::Is64Bits ? 16 : 8)); Fields[2].Str = to_string(format_decimal(Symbol->st_size, 5)); From 93b7915504b708f39a75d72e08448443a899345e Mon Sep 17 00:00:00 2001 From: Georgii Rymar Date: Tue, 28 Jul 2020 13:31:13 +0300 Subject: [PATCH 0285/1035] [llvm-readobj] - Add comments and formatting to mips-options-sec.test and mips-reginfo.test. NFCI. This will allow to extend them (needed for D84651). --- .../llvm-readobj/ELF/mips-options-sec.test | 24 ++++++++++--------- .../tools/llvm-readobj/ELF/mips-reginfo.test | 20 +++++++++------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/llvm/test/tools/llvm-readobj/ELF/mips-options-sec.test b/llvm/test/tools/llvm-readobj/ELF/mips-options-sec.test index 3636d56cfe6e2..f53f04c55e9bf 100644 --- a/llvm/test/tools/llvm-readobj/ELF/mips-options-sec.test +++ b/llvm/test/tools/llvm-readobj/ELF/mips-options-sec.test @@ -1,12 +1,14 @@ -RUN: llvm-readobj -A %p/Inputs/options.obj.elf-mipsel | FileCheck %s +## Check that we are able to dump the SHT_MIPS_OPTIONS section using -A properly. 
-CHECK: MIPS Options { -CHECK-NEXT: ODK_REGINFO { -CHECK-NEXT: GP: 0x0 -CHECK-NEXT: General Mask: 0xF2000017 -CHECK-NEXT: Co-Proc Mask0: 0x0 -CHECK-NEXT: Co-Proc Mask1: 0x0 -CHECK-NEXT: Co-Proc Mask2: 0x0 -CHECK-NEXT: Co-Proc Mask3: 0x0 -CHECK-NEXT: } -CHECK-NEXT: } +# RUN: llvm-readobj -A %p/Inputs/options.obj.elf-mipsel | FileCheck %s + +# CHECK: MIPS Options { +# CHECK-NEXT: ODK_REGINFO { +# CHECK-NEXT: GP: 0x0 +# CHECK-NEXT: General Mask: 0xF2000017 +# CHECK-NEXT: Co-Proc Mask0: 0x0 +# CHECK-NEXT: Co-Proc Mask1: 0x0 +# CHECK-NEXT: Co-Proc Mask2: 0x0 +# CHECK-NEXT: Co-Proc Mask3: 0x0 +# CHECK-NEXT: } +# CHECK-NEXT: } diff --git a/llvm/test/tools/llvm-readobj/ELF/mips-reginfo.test b/llvm/test/tools/llvm-readobj/ELF/mips-reginfo.test index 20177a99d8cb7..0074631843cd0 100644 --- a/llvm/test/tools/llvm-readobj/ELF/mips-reginfo.test +++ b/llvm/test/tools/llvm-readobj/ELF/mips-reginfo.test @@ -1,10 +1,12 @@ -RUN: llvm-readobj -A %p/Inputs/reginfo.obj.elf-mipsel | FileCheck %s +## Check that we are able to dump the SHT_MIPS_REGINFO section using -A properly. -CHECK: MIPS RegInfo { -CHECK-NEXT: GP: 0x7FEF -CHECK-NEXT: General Mask: 0xB00001F6 -CHECK-NEXT: Co-Proc Mask0: 0x0 -CHECK-NEXT: Co-Proc Mask1: 0x0 -CHECK-NEXT: Co-Proc Mask2: 0x0 -CHECK-NEXT: Co-Proc Mask3: 0x0 -CHECK-NEXT: } +# RUN: llvm-readobj -A %p/Inputs/reginfo.obj.elf-mipsel | FileCheck %s + +# CHECK: MIPS RegInfo { +# CHECK-NEXT: GP: 0x7FEF +# CHECK-NEXT: General Mask: 0xB00001F6 +# CHECK-NEXT: Co-Proc Mask0: 0x0 +# CHECK-NEXT: Co-Proc Mask1: 0x0 +# CHECK-NEXT: Co-Proc Mask2: 0x0 +# CHECK-NEXT: Co-Proc Mask3: 0x0 +# CHECK-NEXT: } From 7e8d5a90f2c101388d3b0bbce8555e871c670232 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Tue, 28 Jul 2020 12:26:37 +0200 Subject: [PATCH 0286/1035] Avoid use of std::make_unique in compiler-rt/lib/scudo/standalone/tests/combined_test.cpp make_unique is a C++14 feature, and this prevents us from building on Ubuntu Trusty. While we do use a C++14 compatible toolchain for building in general, we fall back to the system toolchain for building the compiler-rt tests. The reason is that those tests get cross-compiled for e.g. 32-bit and 64-bit x86, and while the toolchain provides libstdc++ in those flavours, the resulting compiler-rt test binaries don't get RPATH set and so won't start if they're linked with that toolchain. We've tried linking the test binaries against libstdc++ statically, by passing COMPILER_RT_TEST_COMPILER_CFLAGS=-static-libstdc++. That mostly works, but some test targets append -lstdc++ to the compiler invocation. So, after spending way too much time on this, let's just avoid C++14 here for now. 
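For reference, the substitution is mechanical; a sketch of the two spellings, with `AllocatorT` standing in for the concrete allocator type used by each test:

```
// C++14: needs std::make_unique from the system C++ library.
auto Allocator = std::make_unique<AllocatorT>();

// C++11-compatible equivalent used in this patch; same ownership and
// initialization semantics for a default-constructed object.
auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT());
```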
--- .../lib/scudo/standalone/tests/combined_test.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp index c144ad0ae32a3..6cefe18b8f15c 100644 --- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp @@ -78,7 +78,7 @@ template struct TestAllocator : scudo::Allocator { template static void testAllocator() { using AllocatorT = TestAllocator; - auto Allocator = std::make_unique(); + auto Allocator = std::unique_ptr(new AllocatorT()); EXPECT_FALSE(Allocator->isOwned(&Mutex)); EXPECT_FALSE(Allocator->isOwned(&Allocator)); @@ -352,7 +352,7 @@ template static void stressAllocator(AllocatorT *A) { template static void testAllocatorThreaded() { using AllocatorT = TestAllocator; - auto Allocator = std::make_unique(); + auto Allocator = std::unique_ptr(new AllocatorT()); std::thread Threads[32]; for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++) Threads[I] = std::thread(stressAllocator, Allocator.get()); @@ -399,7 +399,7 @@ struct DeathConfig { TEST(ScudoCombinedTest, DeathCombined) { using AllocatorT = TestAllocator; - auto Allocator = std::make_unique(); + auto Allocator = std::unique_ptr(new AllocatorT()); const scudo::uptr Size = 1000U; void *P = Allocator->allocate(Size, Origin); @@ -434,7 +434,7 @@ TEST(ScudoCombinedTest, DeathCombined) { // operation without issue. TEST(ScudoCombinedTest, ReleaseToOS) { using AllocatorT = TestAllocator; - auto Allocator = std::make_unique(); + auto Allocator = std::unique_ptr(new AllocatorT()); Allocator->releaseToOS(); } @@ -443,7 +443,7 @@ TEST(ScudoCombinedTest, ReleaseToOS) { // fulfill the allocation through a larger size class. TEST(ScudoCombinedTest, FullRegion) { using AllocatorT = TestAllocator; - auto Allocator = std::make_unique(); + auto Allocator = std::unique_ptr(new AllocatorT()); std::vector V; scudo::uptr FailedAllocationsCount = 0; @@ -474,7 +474,7 @@ TEST(ScudoCombinedTest, FullRegion) { TEST(ScudoCombinedTest, OddEven) { using AllocatorT = TestAllocator; using SizeClassMap = AllocatorT::PrimaryT::SizeClassMap; - auto Allocator = std::make_unique(); + auto Allocator = std::unique_ptr(new AllocatorT()); if (!Allocator->useMemoryTagging()) return; From 946be75b9ec131519837e85487fc3e8bf475d001 Mon Sep 17 00:00:00 2001 From: lorenzo chelini Date: Tue, 28 Jul 2020 12:29:54 +0200 Subject: [PATCH 0287/1035] [MLIR][Linalg] Retire C++ DotOp in favor of a linalg-ods-gen'd op - replace DotOp, now that DRR rules have been dropped. - Capture arguments mismatch in the parser. The number of parsed arguments must equal the number of expected arguments. 
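For illustration, the written form of the op changes from the old C++-parser syntax to the generic form used by the ods-gen'd named ops; roughly (the memref element types, elided in some diffs below, are shown here as the plain 1-D f32 memrefs the integration test uses):

```
// Before:
linalg.dot(%A, %B, %C) : memref<?xf32>, memref<?xf32>, memref<f32>

// After:
linalg.dot %A, %B, %C : (memref<?xf32>, memref<?xf32>, memref<f32>)
```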
Reviewed By: ftynse, nicolasvasilache Differential Revision: https://reviews.llvm.org/D82952 --- .../Linalg/IR/LinalgNamedStructuredOpsSpec.tc | 5 ++++ .../mlir/Dialect/Linalg/IR/LinalgOps.h | 6 ++--- .../Dialect/Linalg/IR/LinalgStructuredOps.td | 25 ------------------- .../LinalgToStandard/LinalgToStandard.cpp | 4 +-- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 19 +++++++++++--- mlir/lib/Dialect/Linalg/Transforms/Loops.cpp | 16 ++---------- mlir/test/Dialect/Linalg/invalid.mlir | 4 +-- mlir/test/Dialect/Linalg/loops.mlir | 6 +++-- mlir/test/Dialect/Linalg/roundtrip.mlir | 16 ++++++------ mlir/test/Dialect/Linalg/standard.mlir | 6 ++--- mlir/test/Dialect/Linalg/tile.mlir | 8 +++--- .../transform-patterns-matmul-to-vector.mlir | 2 +- .../Dialect/Linalg/transform-patterns.mlir | 10 ++++---- .../linalg_integration_test.mlir | 2 +- 14 files changed, 56 insertions(+), 73 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOpsSpec.tc b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOpsSpec.tc index bbd398585e5ff..056f0723e92dd 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOpsSpec.tc +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOpsSpec.tc @@ -8,6 +8,11 @@ def matvec(A: f32(M, N), y: f32(N)) -> (x: f32(M)) { x(m) = std_addf(std_mulf(A(m, n), y(n))); } +ods_def: +def dot(A: f32(M), B: f32(M)) -> (C: f32()) { + C() = std_addf(std_mulf(A(m), B(m))); +} + ods_def: def batch_matmul(A: f32(Batch, M, K), B: f32(Batch, K, N)) -> (C: f32(Batch, M, N)) { C(b, m, n) = std_addf(std_mulf(A(b, m, k), B(b, k, n))); diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h index f89965a96857f..21bff4185abf8 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h @@ -51,9 +51,9 @@ using ReassociationExprs = SmallVector; /// 1. linalg.fill(%A, %f) : memref, f32 /// name mangles into `linalg_fill_viewf32_f32_impl` /// -/// 2. linalg.dot(%A, %B, %C) : -/// memref, -/// memref, memref +/// 2. linalg.dot %A, %B, %C : +/// (memref, +/// memref, memref) /// name mangles into `linalg_dot_viewxf32_viewxf32_viewf32_impl` /// /// 3. linalg.matmul(...) : diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td index 81f911a37cea2..1e3321af981e6 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -180,31 +180,6 @@ def FillOp : LinalgStructured_Op<"fill", [NInputs<0>, NOutputs<1>]> { let hasFolder = 1; } -def DotOp : LinalgStructured_Op<"dot", [NInputs<2>, NOutputs<1>]> { - - let arguments = (ins AnyStridedMemRefOfRank<1>, - AnyStridedMemRefOfRank<1>, - AnyStridedMemRefOfRank<0>); - - let extraClassDeclaration = libraryCallName # [{ - llvm::Optional> referenceIterators() { - return SmallVector{getReductionIteratorTypeName()}; - } - - // A(r_i) * B(r_i) -> C() - llvm::Optional> referenceIndexingMaps() { - MLIRContext *context = getContext(); - auto r_i = getAffineDimExpr(0, context); - return SmallVector{ - AffineMap::get(1, 0, {r_i}, context), - AffineMap::get(1, 0, {r_i}, context), - AffineMap::get(1, 0, {}, context)}; - } - }]; - - let hasFolder = 1; -} - /// A base class for pooling operation such as conv. 
The arguments must contain /// optional arguments `strides`, `dilations` and `padding` with following type: /// OptionalAttr:$strides diff --git a/mlir/lib/Conversion/LinalgToStandard/LinalgToStandard.cpp b/mlir/lib/Conversion/LinalgToStandard/LinalgToStandard.cpp index 43f9d88253275..8a54c93d7685e 100644 --- a/mlir/lib/Conversion/LinalgToStandard/LinalgToStandard.cpp +++ b/mlir/lib/Conversion/LinalgToStandard/LinalgToStandard.cpp @@ -235,13 +235,13 @@ void mlir::populateLinalgToStandardConversionPatterns( LinalgOpConversion, LinalgOpConversion, LinalgOpConversion, - LinalgOpConversion, - LinalgOpConversion, + LinalgOpConversion, LinalgOpConversion, LinalgOpConversion, LinalgOpConversion>(ctx); // TODO: collect all auto-generated named ops with a tblgen directive. patterns.insert< + LinalgOpConversion, LinalgOpConversion, LinalgOpConversion, LinalgOpConversion>(ctx); diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 4c68f0265677c..192179b3ff506 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -1173,10 +1173,6 @@ LogicalResult CopyOp::fold(ArrayRef, SmallVectorImpl &) { return foldMemRefCast(*this); } -LogicalResult DotOp::fold(ArrayRef, - SmallVectorImpl &) { - return foldMemRefCast(*this); -} LogicalResult FillOp::fold(ArrayRef, SmallVectorImpl &) { return foldMemRefCast(*this); @@ -1280,6 +1276,17 @@ static ParseResult parseNamedStructuredOp(OpAsmParser &parser, if (!tensorResultTypes.empty()) result.addTypes(tensorResultTypes); + // The number of parsed arguments must equal + // the number of expected arguments for the current operation. + auto parsedArgs = operandsInfo.size(); + auto expectedArgs = NamedStructuredOpType::getNumInputs() + + NamedStructuredOpType::getNumOutputs(); + if (parsedArgs != expectedArgs) + return parser.emitError(parser.getNameLoc(), + "expects " + std::to_string(expectedArgs) + + " operands, but found " + + std::to_string(parsedArgs)); + buildNamedStructuredOpRegionAndAttributes( parser.getBuilder(), result, operandTypes, tensorResultTypes); @@ -1299,6 +1306,10 @@ LogicalResult BatchMatmulOp::fold(ArrayRef, SmallVectorImpl &) { return foldMemRefCast(*this); } +LogicalResult DotOp::fold(ArrayRef, + SmallVectorImpl &) { + return foldMemRefCast(*this); +} LogicalResult MatmulOp::fold(ArrayRef, SmallVectorImpl &) { return foldMemRefCast(*this); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp b/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp index 7fb1018fc5881..32e50cb597d7a 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp @@ -295,18 +295,6 @@ void emitScalarImplementation(ArrayRef allIvs, FillOp fillOp) { nPar > 0 ? O(ivs) = fillOp.value() : O() = fillOp.value(); } -template -void emitScalarImplementation(ArrayRef allIvs, DotOp dotOp) { - assert(dotOp.hasBufferSemantics() && - "expected linalg op with buffer semantics"); - assert(allIvs.size() == 1); - Value r_i(allIvs[0]); - IndexedValueType A(dotOp.getInput(0)), B(dotOp.getInput(1)), - C(dotOp.getOutputBuffer(0)); - // Emit scalar form. 
- C() = C() + A(r_i) * B(r_i); -} - template Value getConvOpInput(ConvOp convOp, StdIndexedValue im, MutableArrayRef imIdx) { @@ -673,8 +661,6 @@ static Optional linalgOpToLoopsImplSwitch(Operation *op, return linalgOpToLoopsImpl(op, builder); if (isa(op)) return linalgOpToLoopsImpl(op, builder); - if (isa(op)) - return linalgOpToLoopsImpl(op, builder); if (isa(op)) return linalgOpToLoopsImpl(op, builder); if (isa(op)) @@ -693,6 +679,8 @@ static Optional linalgOpToLoopsImplSwitch(Operation *op, return linalgOpToLoopsImpl(op, builder); if (isa(op)) return linalgOpToLoopsImpl(op, builder); + if (isa(op)) + return linalgOpToLoopsImpl(op, builder); if (isa(op)) return linalgOpToLoopsImpl(op, builder); llvm_unreachable("Unexpected op in linalgOpToLoopsImpl"); diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir index 79d61d8d78921..ca59ecd387ec3 100644 --- a/mlir/test/Dialect/Linalg/invalid.mlir +++ b/mlir/test/Dialect/Linalg/invalid.mlir @@ -422,8 +422,8 @@ func @generic(%arg0: memref) { // ----- func @generic_result_0_element_type(%arg0: memref) { - // expected-error @+1 {{'linalg.dot' op expected 3 operands, but found 2}} - linalg.dot(%arg0, %arg0): memref, memref + // expected-error @+1 {{'linalg.dot' expects 3 operands, but found 2}} + linalg.dot %arg0, %arg0 : (memref, memref) } // ----- diff --git a/mlir/test/Dialect/Linalg/loops.mlir b/mlir/test/Dialect/Linalg/loops.mlir index 30bb90bdd43a1..b01beb7e8f175 100644 --- a/mlir/test/Dialect/Linalg/loops.mlir +++ b/mlir/test/Dialect/Linalg/loops.mlir @@ -123,7 +123,7 @@ func @dot(%arg0: memref, %M: index) { %1 = view %arg0[%c0][%M] : memref to memref %2 = view %arg0[%c0][%M] : memref to memref %3 = view %arg0[%c0][] : memref to memref - linalg.dot(%1, %2, %3) : memref, memref, memref + linalg.dot %1, %2, %3 : (memref, memref, memref) return } // CHECKLOOP-LABEL: func @dot(%{{.*}}: memref, @@ -154,7 +154,9 @@ func @dot(%arg0: memref, %M: index) { func @dot_view(%arg0: memref, %arg1: memref, %arg2: memref) { - linalg.dot(%arg0, %arg1, %arg2) : memref, memref, memref + linalg.dot %arg0, %arg1, %arg2 : (memref, + memref, + memref) return } // CHECKLOOP-LABEL: func @dot_view( diff --git a/mlir/test/Dialect/Linalg/roundtrip.mlir b/mlir/test/Dialect/Linalg/roundtrip.mlir index 9e6c275479302..2696643246972 100644 --- a/mlir/test/Dialect/Linalg/roundtrip.mlir +++ b/mlir/test/Dialect/Linalg/roundtrip.mlir @@ -88,10 +88,10 @@ func @ops(%arg0: memref, memref) linalg.matvec %arg0, %arg1, %arg2 : (memref, memref, - memref) - linalg.dot(%arg1, %arg2, %arg3) : memref, - memref, - memref + memref) + linalg.dot %arg1, %arg2, %arg3 : (memref, + memref, + memref) return } // CHECK-LABEL: func @ops(% @@ -103,10 +103,10 @@ func @ops(%arg0: memref, // CHECK-SAME: (memref, // CHECK-SAME: memref, // CHECK-SAME: memref) -// CHECK-NEXT: linalg.dot(%{{.*}}, %{{.*}}, %{{.*}}) : -// CHECK-SAME: memref, -// CHECK-SAME: memref, -// CHECK-SAME: memref +// CHECK-NEXT: linalg.dot %{{.*}}, %{{.*}}, %{{.*}} : +// CHECK-SAME: (memref, +// CHECK-SAME: memref, +// CHECK-SAME: memref) // ----- diff --git a/mlir/test/Dialect/Linalg/standard.mlir b/mlir/test/Dialect/Linalg/standard.mlir index 0ba3465443fa1..60b348110c4f6 100644 --- a/mlir/test/Dialect/Linalg/standard.mlir +++ b/mlir/test/Dialect/Linalg/standard.mlir @@ -13,9 +13,9 @@ func @dot(%arg0: memref, %arg1: memref, %arg2: memref) { - linalg.dot(%arg0, %arg1, %arg2) : memref, - memref, - memref + linalg.dot %arg0, %arg1, %arg2 : (memref, + memref, + memref) return } // CHECK-LABEL: func 
@dot( diff --git a/mlir/test/Dialect/Linalg/tile.mlir b/mlir/test/Dialect/Linalg/tile.mlir index 049fb571bd51d..9a1bbfc1dc184 100644 --- a/mlir/test/Dialect/Linalg/tile.mlir +++ b/mlir/test/Dialect/Linalg/tile.mlir @@ -271,7 +271,9 @@ func @matvec(%arg0: memref, %arg1: memref, memref, memref) func @dot(%arg0: memref, %arg1: memref, %arg2: memref) { - linalg.dot(%arg0, %arg1, %arg2) : memref, memref, memref + linalg.dot %arg0, %arg1, %arg2 : (memref, + memref, + memref) return } // TILE-2-LABEL: func @dot( @@ -285,7 +287,7 @@ func @dot(%arg0: memref, %arg1: memref to memref -// TILE-2: linalg.dot(%[[sAi]], %[[sBi]], {{.*}}) : memref, memref, memref +// TILE-2: linalg.dot %[[sAi]], %[[sBi]], {{.*}} : (memref, memref, memref) // TILE-02-LABEL: func @dot( // TILE-02-NOT: scf.for @@ -304,7 +306,7 @@ func @dot(%arg0: memref, %arg1: memref to memref -// TILE-234: linalg.dot(%[[sAi]], %[[sBi]], %{{.*}}) : memref, memref, memref +// TILE-234: linalg.dot %[[sAi]], %[[sBi]], %{{.*}} : (memref, memref, memref) func @fill_static(%arg0: memref<127x99xf32>, %arg1: f32) { linalg.fill(%arg0, %arg1) : memref<127x99xf32>, f32 diff --git a/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir b/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir index b0702f9fdcfd8..83e9461d66cc9 100644 --- a/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir +++ b/mlir/test/Dialect/Linalg/transform-patterns-matmul-to-vector.mlir @@ -36,7 +36,7 @@ func @matmul(%A: memref<1584x1584xf32, offset: 0, strides: [1584, 1]>, func @contraction_dot(%A: memref<1584xf32>, %B: memref<1584xf32>, %C: memref) { // VECTOR-CONTRACTION: vector.contract // VECTOR-CONTRACTION-SAME: vector<1584xf32>, vector<1584xf32> into f32 - linalg.dot(%A, %B, %C) : memref<1584xf32>, memref<1584xf32>, memref + linalg.dot %A, %B, %C : (memref<1584xf32>, memref<1584xf32>, memref) return } diff --git a/mlir/test/Dialect/Linalg/transform-patterns.mlir b/mlir/test/Dialect/Linalg/transform-patterns.mlir index 3f7d16497253d..1a4100403b007 100644 --- a/mlir/test/Dialect/Linalg/transform-patterns.mlir +++ b/mlir/test/Dialect/Linalg/transform-patterns.mlir @@ -14,10 +14,10 @@ func @dot(%x: memref, %y: memref, %v: memref) { - linalg.dot(%x, %y, %v) { __internal_linalg_transform__ = "MEM" } : - memref, - memref, - memref + linalg.dot %x, %y, %v { __internal_linalg_transform__ = "MEM" } : + (memref, + memref, + memref) return } // CHECK-LABEL: func @dot @@ -28,8 +28,8 @@ func @dot(%x: memref, // CHECK: scf.for {{.*}} = %[[c0]] to {{.*}} step %[[c1]] { // CHECK: load // CHECK: load -// CHECK: mulf // CHECK: load +// CHECK: mulf // CHECK: addf // CHECK: store diff --git a/mlir/test/mlir-cpu-runner/linalg_integration_test.mlir b/mlir/test/mlir-cpu-runner/linalg_integration_test.mlir index dd6feb96240ee..ba2ea59cb22d7 100644 --- a/mlir/test/mlir-cpu-runner/linalg_integration_test.mlir +++ b/mlir/test/mlir-cpu-runner/linalg_integration_test.mlir @@ -51,7 +51,7 @@ func @dot() -> f32 { %B = view %bB[%c0][%c16] : memref to memref %C = view %bC[%c0][] : memref to memref - linalg.dot(%A, %B, %C) : memref, memref, memref + linalg.dot %A, %B, %C : (memref, memref, memref) %res = load %C[] : memref dealloc %bC : memref From 7294ca3f6ecacd05a197bbf0637e10afcb99b6d6 Mon Sep 17 00:00:00 2001 From: Kai Nacke Date: Thu, 2 Jul 2020 14:43:42 +0200 Subject: [PATCH 0288/1035] [SystemZ/ZOS] Implement setLastAccessAndModificationTime() The function setLastAccessAndModificationTime() uses function futimens() or futimes() by default. 
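For context, the default path of this function boils down to the POSIX calls (a rough sketch under assumed feature macros, not the exact preprocessor layout of Path.inc):

```
#if defined(HAVE_FUTIMENS)
  if (::futimens(FD, Times))
    return std::error_code(errno, std::generic_category());
  return std::error_code();
#elif defined(HAVE_FUTIMES)
  if (::futimes(FD, Times))
    return std::error_code(errno, std::generic_category());
  return std::error_code();
#endif
```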
Both functions are not available in z/OS, therefore functionality is
implemented using __fchattr() on z/OS.

Reviewed by: abhina.sreeskantharajan

Differential Revision: https://reviews.llvm.org/D83945
---
 llvm/lib/Support/Unix/Path.inc | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc
index fa4682dd33d2f..01903ea10e814 100644
--- a/llvm/lib/Support/Unix/Path.inc
+++ b/llvm/lib/Support/Unix/Path.inc
@@ -792,6 +792,16 @@ std::error_code setLastAccessAndModificationTime(int FD, TimePoint<> AccessTime,
   if (::futimes(FD, Times))
     return std::error_code(errno, std::generic_category());
   return std::error_code();
+#elif defined(__MVS__)
+  attrib_t Attr;
+  memset(&Attr, 0, sizeof(Attr));
+  Attr.att_atimechg = 1;
+  Attr.att_atime = sys::toTimeT(AccessTime);
+  Attr.att_mtimechg = 1;
+  Attr.att_mtime = sys::toTimeT(ModificationTime);
+  if (::__fchattr(FD, &Attr, sizeof(Attr)) != 0)
+    return std::error_code(errno, std::generic_category());
+  return std::error_code();
 #else
 #warning Missing futimes() and futimens()
   return make_error_code(errc::function_not_supported);

From 3a2b05f9fe74fcf9560632cf2695058d47d8683b Mon Sep 17 00:00:00 2001
From: Evgeniy Brevnov
Date: Fri, 24 Jul 2020 18:57:10 +0700
Subject: [PATCH 0289/1035] [BPI][NFC] Consolidate code to deal with SCCs under a dedicated data structure.

In order to facilitate review of D79485 here is a small NFC change which
restructures code around handling of SCCs in BPI.

Reviewed By: davidxl

Differential Revision: https://reviews.llvm.org/D84514
---
 .../llvm/Analysis/BranchProbabilityInfo.h     |  71 +++++++-
 llvm/lib/Analysis/BranchProbabilityInfo.cpp   | 164 +++++++++++-------
 2 files changed, 169 insertions(+), 66 deletions(-)

diff --git a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h
index 3e72afba36c30..7feb5b6259380 100644
--- a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h
+++ b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h
@@ -151,13 +151,66 @@ class BranchProbabilityInfo {
   /// Forget analysis results for the given basic block.
   void eraseBlock(const BasicBlock *BB);

-  // Use to track SCCs for handling irreducible loops.
-  using SccMap = DenseMap<const BasicBlock *, int>;
-  using SccHeaderMap = DenseMap<const BasicBlock *, bool>;
-  using SccHeaderMaps = std::vector<SccHeaderMap>;
-  struct SccInfo {
+  class SccInfo {
+    // Enum of types to classify basic blocks in SCC. Basic block belonging to
+    // SCC is 'Inner' until it is either 'Header' or 'Exiting'. Note that a
+    // basic block can be 'Header' and 'Exiting' at the same time.
+    enum SccBlockType {
+      Inner = 0x0,
+      Header = 0x1,
+      Exiting = 0x2,
+    };
+    // Map of basic blocks to SCC IDs they belong to. If basic block doesn't
+    // belong to any SCC it is not in the map.
+    using SccMap = DenseMap<const BasicBlock *, int>;
+    // Each basic block in SCC is attributed with one or several types from
+    // SccBlockType. Map value has uint32_t type (instead of SccBlockType)
+    // since basic block may be for example "Header" and "Exiting" at the same
+    // time and we need to be able to keep more than one value from
+    // SccBlockType.
+    using SccBlockTypeMap = DenseMap<const BasicBlock *, uint32_t>;
+    // Vector containing classification of basic blocks for all SCCs where i'th
+    // vector element corresponds to SCC with ID equal to i.
+    using SccBlockTypeMaps = std::vector<SccBlockTypeMap>;

     SccMap SccNums;
-    SccHeaderMaps SccHeaders;
+    SccBlockTypeMaps SccBlocks;
+
+  public:
+    explicit SccInfo(const Function &F);
+
+    /// If \p BB belongs to some SCC then ID of that SCC is returned, otherwise
+    /// -1 is returned. If \p BB belongs to more than one SCC at the same time
+    /// result is undefined.
+    int getSCCNum(const BasicBlock *BB) const;
+    /// Returns true if \p BB is a 'header' block in SCC with \p SccNum ID,
+    /// false otherwise.
+    bool isSCCHeader(const BasicBlock *BB, int SccNum) const {
+      return getSccBlockType(BB, SccNum) & Header;
+    }
+    /// Returns true if \p BB is an 'exiting' block in SCC with \p SccNum ID,
+    /// false otherwise.
+    bool isSCCExitingBlock(const BasicBlock *BB, int SccNum) const {
+      return getSccBlockType(BB, SccNum) & Exiting;
+    }
+    /// Fills in \p Enters vector with all such blocks that don't belong to
+    /// SCC with \p SccNum ID but there is an edge to a block belonging to the
+    /// SCC.
+    void getSccEnterBlocks(int SccNum,
+                           SmallVectorImpl<BasicBlock *> &Enters) const;
+    /// Fills in \p Exits vector with all such blocks that don't belong to
+    /// SCC with \p SccNum ID but there is an edge from a block belonging to the
+    /// SCC.
+    void getSccExitBlocks(int SccNum,
+                          SmallVectorImpl<BasicBlock *> &Exits) const;
+
+  private:
+    /// Returns \p BB's type according to classification given by SccBlockType
+    /// enum. Please note that \p BB must belong to SCC with \p SccNum ID.
+    uint32_t getSccBlockType(const BasicBlock *BB, int SccNum) const;
+    /// Calculates \p BB's type and stores it in internal data structures for
+    /// future use. Please note that \p BB must belong to SCC with \p SccNum ID.
+    void calculateSccBlockType(const BasicBlock *BB, int SccNum);
   };

 private:
@@ -196,6 +249,9 @@ class BranchProbabilityInfo {
   /// Track the last function we run over for printing.
   const Function *LastF = nullptr;

+  /// Keeps information about all SCCs in a function.
+  std::unique_ptr<SccInfo> SccI;
+
   /// Track the set of blocks directly succeeded by a returning block.
   SmallPtrSet PostDominatedByUnreachable;

@@ -210,8 +266,7 @@
   bool calcMetadataWeights(const BasicBlock *BB);
   bool calcColdCallHeuristics(const BasicBlock *BB);
   bool calcPointerHeuristics(const BasicBlock *BB);
-  bool calcLoopBranchHeuristics(const BasicBlock *BB, const LoopInfo &LI,
-                                SccInfo &SccI);
+  bool calcLoopBranchHeuristics(const BasicBlock *BB, const LoopInfo &LI);
   bool calcZeroHeuristics(const BasicBlock *BB, const TargetLibraryInfo *TLI);
   bool calcFloatingPointHeuristics(const BasicBlock *BB);
   bool calcInvokeHeuristics(const BasicBlock *BB);
diff --git a/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/llvm/lib/Analysis/BranchProbabilityInfo.cpp
index a396b5ad21c6a..195fc69d9601d 100644
--- a/llvm/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/llvm/lib/Analysis/BranchProbabilityInfo.cpp
@@ -148,6 +148,105 @@ static const uint32_t IH_TAKEN_WEIGHT = 1024 * 1024 - 1;
 /// instruction. This is essentially never taken.
 static const uint32_t IH_NONTAKEN_WEIGHT = 1;

+BranchProbabilityInfo::SccInfo::SccInfo(const Function &F) {
+  // Record SCC numbers of blocks in the CFG to identify irreducible loops.
+  // FIXME: We could only calculate this if the CFG is known to be irreducible
+  // (perhaps cache this info in LoopInfo if we can easily calculate it there?).
+  int SccNum = 0;
+  for (scc_iterator<const Function *> It = scc_begin(&F); !It.isAtEnd();
+       ++It, ++SccNum) {
+    // Ignore single-block SCCs since they either aren't loops or LoopInfo will
+    // catch them.
+ const std::vector &Scc = *It; + if (Scc.size() == 1) + continue; + + LLVM_DEBUG(dbgs() << "BPI: SCC " << SccNum << ":"); + for (const auto *BB : Scc) { + LLVM_DEBUG(dbgs() << " " << BB->getName()); + SccNums[BB] = SccNum; + calculateSccBlockType(BB, SccNum); + } + LLVM_DEBUG(dbgs() << "\n"); + } +} + +int BranchProbabilityInfo::SccInfo::getSCCNum(const BasicBlock *BB) const { + auto SccIt = SccNums.find(BB); + if (SccIt == SccNums.end()) + return -1; + return SccIt->second; +} + +void BranchProbabilityInfo::SccInfo::getSccEnterBlocks( + int SccNum, SmallVectorImpl &Enters) const { + + for (auto MapIt : SccBlocks[SccNum]) { + const auto *BB = MapIt.first; + if (isSCCHeader(BB, SccNum)) + for (const auto *Pred : predecessors(BB)) + if (getSCCNum(Pred) != SccNum) + Enters.push_back(const_cast(BB)); + } +} + +void BranchProbabilityInfo::SccInfo::getSccExitBlocks( + int SccNum, SmallVectorImpl &Exits) const { + for (auto MapIt : SccBlocks[SccNum]) { + const auto *BB = MapIt.first; + if (isSCCExitingBlock(BB, SccNum)) + for (const auto *Succ : successors(BB)) + if (getSCCNum(Succ) != SccNum) + Exits.push_back(const_cast(BB)); + } +} + +uint32_t BranchProbabilityInfo::SccInfo::getSccBlockType(const BasicBlock *BB, + int SccNum) const { + assert(getSCCNum(BB) == SccNum); + + assert(SccBlocks.size() > static_cast(SccNum) && "Unknown SCC"); + const auto &SccBlockTypes = SccBlocks[SccNum]; + + auto It = SccBlockTypes.find(BB); + if (It != SccBlockTypes.end()) { + return It->second; + } + return Inner; +} + +void BranchProbabilityInfo::SccInfo::calculateSccBlockType(const BasicBlock *BB, + int SccNum) { + assert(getSCCNum(BB) == SccNum); + uint32_t BlockType = Inner; + + if (llvm::any_of(make_range(pred_begin(BB), pred_end(BB)), + [&](const BasicBlock *Pred) { + // Consider any block that is an entry point to the SCC as + // a header. + return getSCCNum(Pred) != SccNum; + })) + BlockType |= Header; + + if (llvm::any_of( + make_range(succ_begin(BB), succ_end(BB)), + [&](const BasicBlock *Succ) { return getSCCNum(Succ) != SccNum; })) + BlockType |= Exiting; + + // Lazily compute the set of headers for a given SCC and cache the results + // in the SccHeaderMap. + if (SccBlocks.size() <= static_cast(SccNum)) + SccBlocks.resize(SccNum + 1); + auto &SccBlockTypes = SccBlocks[SccNum]; + + if (BlockType != Inner) { + bool IsInserted; + std::tie(std::ignore, IsInserted) = + SccBlockTypes.insert(std::make_pair(BB, BlockType)); + assert(IsInserted && "Duplicated block in SCC"); + } +} + static void UpdatePDTWorklist(const BasicBlock *BB, PostDominatorTree *PDT, SmallVectorImpl &WorkList, SmallPtrSetImpl &TargetSet) { @@ -511,38 +610,6 @@ bool BranchProbabilityInfo::calcPointerHeuristics(const BasicBlock *BB) { return true; } -static int getSCCNum(const BasicBlock *BB, - const BranchProbabilityInfo::SccInfo &SccI) { - auto SccIt = SccI.SccNums.find(BB); - if (SccIt == SccI.SccNums.end()) - return -1; - return SccIt->second; -} - -// Consider any block that is an entry point to the SCC as a header. -static bool isSCCHeader(const BasicBlock *BB, int SccNum, - BranchProbabilityInfo::SccInfo &SccI) { - assert(getSCCNum(BB, SccI) == SccNum); - - // Lazily compute the set of headers for a given SCC and cache the results - // in the SccHeaderMap. 
- if (SccI.SccHeaders.size() <= static_cast(SccNum)) - SccI.SccHeaders.resize(SccNum + 1); - auto &HeaderMap = SccI.SccHeaders[SccNum]; - bool Inserted; - BranchProbabilityInfo::SccHeaderMap::iterator HeaderMapIt; - std::tie(HeaderMapIt, Inserted) = HeaderMap.insert(std::make_pair(BB, false)); - if (Inserted) { - bool IsHeader = llvm::any_of(make_range(pred_begin(BB), pred_end(BB)), - [&](const BasicBlock *Pred) { - return getSCCNum(Pred, SccI) != SccNum; - }); - HeaderMapIt->second = IsHeader; - return IsHeader; - } else - return HeaderMapIt->second; -} - // Compute the unlikely successors to the block BB in the loop L, specifically // those that are unlikely because this is a loop, and add them to the // UnlikelyBlocks set. @@ -653,12 +720,11 @@ computeUnlikelySuccessors(const BasicBlock *BB, Loop *L, // Calculate Edge Weights using "Loop Branch Heuristics". Predict backedges // as taken, exiting edges as not-taken. bool BranchProbabilityInfo::calcLoopBranchHeuristics(const BasicBlock *BB, - const LoopInfo &LI, - SccInfo &SccI) { + const LoopInfo &LI) { int SccNum; Loop *L = LI.getLoopFor(BB); if (!L) { - SccNum = getSCCNum(BB, SccI); + SccNum = SccI->getSCCNum(BB); if (SccNum < 0) return false; } @@ -685,9 +751,9 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(const BasicBlock *BB, else InEdges.push_back(I.getSuccessorIndex()); } else { - if (getSCCNum(*I, SccI) != SccNum) + if (SccI->getSCCNum(*I) != SccNum) ExitingEdges.push_back(I.getSuccessorIndex()); - else if (isSCCHeader(*I, SccNum, SccI)) + else if (SccI->isSCCHeader(*I, SccNum)) BackEdges.push_back(I.getSuccessorIndex()); else InEdges.push_back(I.getSuccessorIndex()); @@ -1072,26 +1138,7 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI, assert(PostDominatedByUnreachable.empty()); assert(PostDominatedByColdCall.empty()); - // Record SCC numbers of blocks in the CFG to identify irreducible loops. - // FIXME: We could only calculate this if the CFG is known to be irreducible - // (perhaps cache this info in LoopInfo if we can easily calculate it there?). - int SccNum = 0; - SccInfo SccI; - for (scc_iterator It = scc_begin(&F); !It.isAtEnd(); - ++It, ++SccNum) { - // Ignore single-block SCCs since they either aren't loops or LoopInfo will - // catch them. - const std::vector &Scc = *It; - if (Scc.size() == 1) - continue; - - LLVM_DEBUG(dbgs() << "BPI: SCC " << SccNum << ":"); - for (auto *BB : Scc) { - LLVM_DEBUG(dbgs() << " " << BB->getName()); - SccI.SccNums[BB] = SccNum; - } - LLVM_DEBUG(dbgs() << "\n"); - } + SccI = std::make_unique(F); std::unique_ptr PDTPtr; @@ -1119,7 +1166,7 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI, continue; if (calcColdCallHeuristics(BB)) continue; - if (calcLoopBranchHeuristics(BB, LI, SccI)) + if (calcLoopBranchHeuristics(BB, LI)) continue; if (calcPointerHeuristics(BB)) continue; @@ -1131,6 +1178,7 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI, PostDominatedByUnreachable.clear(); PostDominatedByColdCall.clear(); + SccI.release(); if (PrintBranchProb && (PrintBranchProbFuncName.empty() || From 97470897c436a6a5d682fb8ab296d0bcdc6e32a4 Mon Sep 17 00:00:00 2001 From: Stefan Pintilie Date: Mon, 27 Jul 2020 12:27:30 -0500 Subject: [PATCH 0290/1035] [PowerPC] Split s34imm into two types Currently the instruction paddi always takes s34imm as the type for the 34 bit immediate. However, the PC Relative form of the instruction should not produce the same fixup as the non PC Relative form. 
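In assembly terms the two flavors look almost identical, which is why they previously shared a fixup; for example (the PC-relative syntax below is shown for illustration):

```
paddi 3, 13, symbol@toc, 0   # R=0, non-PC-relative: wants fixup_ppc_imm34
paddi 3, 0, symbol@PCREL, 1  # R=1, PC-relative: wants fixup_ppc_pcrel34
```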
This patch splits the s34imm type into s34imm and s34imm_pcrel so that two different fixups can be emitted. Reviewed By: nemanjai, #powerpc, kamaub Differential Revision: https://reviews.llvm.org/D83255 --- .../PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 4 +++ .../MCTargetDesc/PPCELFObjectWriter.cpp | 3 ++ .../PowerPC/MCTargetDesc/PPCFixupKinds.h | 3 ++ .../PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp | 30 ++++++++++++++----- .../PowerPC/MCTargetDesc/PPCMCCodeEmitter.h | 9 +++++- llvm/lib/Target/PowerPC/PPCInstrInfo.td | 8 ++++- llvm/lib/Target/PowerPC/PPCInstrPrefix.td | 4 +-- llvm/test/MC/PowerPC/ppc64-errors-emit-obj.s | 7 +++++ 8 files changed, 57 insertions(+), 11 deletions(-) create mode 100644 llvm/test/MC/PowerPC/ppc64-errors-emit-obj.s diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index dbaf221db9fc9..59cb2b994a4b3 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -46,6 +46,7 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { case PPC::fixup_ppc_half16ds: return Value & 0xfffc; case PPC::fixup_ppc_pcrel34: + case PPC::fixup_ppc_imm34: return Value & 0x3ffffffff; } } @@ -68,6 +69,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case PPC::fixup_ppc_br24_notoc: return 4; case PPC::fixup_ppc_pcrel34: + case PPC::fixup_ppc_imm34: case FK_Data_8: return 8; case PPC::fixup_ppc_nofixup: @@ -100,6 +102,7 @@ class PPCAsmBackend : public MCAsmBackend { { "fixup_ppc_half16", 0, 16, 0 }, { "fixup_ppc_half16ds", 0, 14, 0 }, { "fixup_ppc_pcrel34", 0, 34, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_ppc_imm34", 0, 34, 0 }, { "fixup_ppc_nofixup", 0, 0, 0 } }; const static MCFixupKindInfo InfosLE[PPC::NumTargetFixupKinds] = { @@ -112,6 +115,7 @@ class PPCAsmBackend : public MCAsmBackend { { "fixup_ppc_half16", 0, 16, 0 }, { "fixup_ppc_half16ds", 2, 14, 0 }, { "fixup_ppc_pcrel34", 0, 34, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_ppc_imm34", 0, 34, 0 }, { "fixup_ppc_nofixup", 0, 0, 0 } }; diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index d8b3301e97f12..1cd190c6b04ea 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -409,6 +409,9 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, break; } break; + case PPC::fixup_ppc_imm34: + report_fatal_error("Unsupported Modifier for fixup_ppc_imm34."); + break; case FK_Data_8: switch (Modifier) { default: llvm_unreachable("Unsupported Modifier"); diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h index 2fb8947fd4e0f..73292f7b7938f 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -43,6 +43,9 @@ enum Fixups { // A 34-bit fixup corresponding to PC-relative paddi. fixup_ppc_pcrel34, + // A 34-bit fixup corresponding to Non-PC-relative paddi. + fixup_ppc_imm34, + /// Not a true fixup, but ties a symbol to a call to __tls_get_addr for the /// TLS general and local dynamic models, or inserts the thread-pointer /// register number. 
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index fb65e7320f2b0..8c0e0a80b1e2c 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -104,20 +104,36 @@ unsigned PPCMCCodeEmitter::getImm16Encoding(const MCInst &MI, unsigned OpNo, return 0; } -uint64_t -PPCMCCodeEmitter::getImm34Encoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { +uint64_t PPCMCCodeEmitter::getImm34Encoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI, + MCFixupKind Fixup) const { const MCOperand &MO = MI.getOperand(OpNo); - if (MO.isReg() || MO.isImm()) + assert(!MO.isReg() && "Not expecting a register for this operand."); + if (MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); // Add a fixup for the immediate field. - Fixups.push_back(MCFixup::create(0, MO.getExpr(), - (MCFixupKind)PPC::fixup_ppc_pcrel34)); + Fixups.push_back(MCFixup::create(0, MO.getExpr(), Fixup)); return 0; } +uint64_t +PPCMCCodeEmitter::getImm34EncodingNoPCRel(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + return getImm34Encoding(MI, OpNo, Fixups, STI, + (MCFixupKind)PPC::fixup_ppc_imm34); +} + +uint64_t +PPCMCCodeEmitter::getImm34EncodingPCRel(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + return getImm34Encoding(MI, OpNo, Fixups, STI, + (MCFixupKind)PPC::fixup_ppc_pcrel34); +} + unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const { diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h index 588aa76bd8064..4504cc6a7405e 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h @@ -52,7 +52,14 @@ class PPCMCCodeEmitter : public MCCodeEmitter { const MCSubtargetInfo &STI) const; uint64_t getImm34Encoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + const MCSubtargetInfo &STI, + MCFixupKind Fixup) const; + uint64_t getImm34EncodingNoPCRel(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint64_t getImm34EncodingPCRel(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index c565758973bf5..f807d61c75d23 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -757,7 +757,13 @@ def PPCS34ImmAsmOperand : AsmOperandClass { } def s34imm : Operand { let PrintMethod = "printS34ImmOperand"; - let EncoderMethod = "getImm34Encoding"; + let EncoderMethod = "getImm34EncodingNoPCRel"; + let ParserMatchClass = PPCS34ImmAsmOperand; + let DecoderMethod = "decodeSImmOperand<34>"; +} +def s34imm_pcrel : Operand { + let PrintMethod = "printS34ImmOperand"; + let EncoderMethod = "getImm34EncodingPCRel"; let ParserMatchClass = PPCS34ImmAsmOperand; let DecoderMethod = "decodeSImmOperand<34>"; } diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td 
b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td index 418ef3b377282..fa21c54efc28b 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -459,7 +459,7 @@ let Predicates = [PrefixInstrs] in { let Interpretation64Bit = 1, isCodeGenOnly = 1 in { defm PADDI8 : MLS_DForm_R_SI34_RTA5_p<14, (outs g8rc:$RT), (ins g8rc:$RA, s34imm:$SI), - (ins immZero:$RA, s34imm:$SI), + (ins immZero:$RA, s34imm_pcrel:$SI), "paddi $RT, $RA, $SI", IIC_LdStLFD>; let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { def PLI8 : MLS_DForm_SI34_RT5<14, (outs g8rc:$RT), @@ -469,7 +469,7 @@ let Predicates = [PrefixInstrs] in { } defm PADDI : MLS_DForm_R_SI34_RTA5_p<14, (outs gprc:$RT), (ins gprc:$RA, s34imm:$SI), - (ins immZero:$RA, s34imm:$SI), + (ins immZero:$RA, s34imm_pcrel:$SI), "paddi $RT, $RA, $SI", IIC_LdStLFD>; let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { def PLI : MLS_DForm_SI34_RT5<14, (outs gprc:$RT), diff --git a/llvm/test/MC/PowerPC/ppc64-errors-emit-obj.s b/llvm/test/MC/PowerPC/ppc64-errors-emit-obj.s new file mode 100644 index 0000000000000..0d2c879380e0a --- /dev/null +++ b/llvm/test/MC/PowerPC/ppc64-errors-emit-obj.s @@ -0,0 +1,7 @@ +# RUN: not --crash llvm-mc -triple powerpc64-- --filetype=obj < %s 2> %t +# RUN: FileCheck < %t %s +# RUN: not --crash llvm-mc -triple powerpc64le-- --filetype=obj < %s 2> %t +# RUN: FileCheck < %t %s + +# CHECK: Unsupported Modifier for fixup_ppc_imm34. +paddi 3, 13, symbol@toc, 0 From 443e734fb98df422c90cbc8177520a8182597912 Mon Sep 17 00:00:00 2001 From: Rainer Orth Date: Tue, 28 Jul 2020 13:21:36 +0200 Subject: [PATCH 0291/1035] [compiler-rt][cmake] Don't pass --version-script to Illumos ld Neither the Illumos `ld` nor the Solaris 11.3 one support the `--version-script` and `z gnu-linker-script-compat` options, which breaks the `compiler-rt` build. This patch checks for both options instead of hardcoding their use. Tested on `amd-pc-solaris2.11` (all of Solaris 11.4, 11.3, and Illumos). Differential Revision: https://reviews.llvm.org/D84559 --- compiler-rt/cmake/config-ix.cmake | 13 +++++++++++++ compiler-rt/lib/asan/CMakeLists.txt | 4 ++-- compiler-rt/lib/ubsan/CMakeLists.txt | 4 ++-- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index 0a27910ed4943..74fef8933ef90 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -157,6 +157,19 @@ check_library_exists(stdc++ __cxa_throw "" COMPILER_RT_HAS_LIBSTDCXX) check_linker_flag("-Wl,-z,text" COMPILER_RT_HAS_Z_TEXT) check_linker_flag("-fuse-ld=lld" COMPILER_RT_HAS_FUSE_LD_LLD_FLAG) +set(VERS_COMPAT_OPTION "-Wl,-z,gnu-version-script-compat") +check_linker_flag("${VERS_COMPAT_OPTION}" COMPILER_RT_HAS_GNU_VERSION_SCRIPT_COMPAT) + +set(DUMMY_VERS ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/dummy.vers) +file(WRITE ${DUMMY_VERS} "{};") +set(VERS_OPTION "-Wl,--version-script,${DUMMY_VERS}") +if(COMPILER_RT_HAS_GNU_VERSION_SCRIPT_COMPAT) + # Solaris 11.4 ld only supports --version-script with + # -z gnu-version-script-compat. 
+ string(APPEND VERS_OPTION " ${VERS_COMPAT_OPTION}") +endif() +check_linker_flag("${VERS_OPTION}" COMPILER_RT_HAS_VERSION_SCRIPT) + if(ANDROID) check_linker_flag("-Wl,-z,global" COMPILER_RT_HAS_Z_GLOBAL) check_library_exists(log __android_log_write "" COMPILER_RT_HAS_LIBLOG) diff --git a/compiler-rt/lib/asan/CMakeLists.txt b/compiler-rt/lib/asan/CMakeLists.txt index 2a1bbb58cce41..0c29893ebfe23 100644 --- a/compiler-rt/lib/asan/CMakeLists.txt +++ b/compiler-rt/lib/asan/CMakeLists.txt @@ -224,7 +224,7 @@ else() PARENT_TARGET asan) foreach(arch ${ASAN_SUPPORTED_ARCH}) - if (UNIX) + if (UNIX AND COMPILER_RT_HAS_VERSION_SCRIPT) add_sanitizer_rt_version_list(clang_rt.asan-dynamic-${arch} LIBS clang_rt.asan-${arch} clang_rt.asan_cxx-${arch} EXTRA asan.syms.extra) @@ -232,7 +232,7 @@ else() -Wl,--version-script,${CMAKE_CURRENT_BINARY_DIR}/clang_rt.asan-dynamic-${arch}.vers) # The Solaris 11.4 linker supports a subset of GNU ld version scripts, # but requires a special option to enable it. - if (OS_NAME MATCHES "SunOS") + if (COMPILER_RT_HAS_GNU_VERSION_SCRIPT_COMPAT) list(APPEND VERSION_SCRIPT_FLAG -Wl,-z,gnu-version-script-compat) endif() set_property(SOURCE diff --git a/compiler-rt/lib/ubsan/CMakeLists.txt b/compiler-rt/lib/ubsan/CMakeLists.txt index dca02a65e9713..00b74a63cbe53 100644 --- a/compiler-rt/lib/ubsan/CMakeLists.txt +++ b/compiler-rt/lib/ubsan/CMakeLists.txt @@ -200,7 +200,7 @@ else() CFLAGS ${UBSAN_CXXFLAGS} PARENT_TARGET ubsan) - if (FUCHSIA OR UNIX) + if ((FUCHSIA OR UNIX) AND COMPILER_RT_HAS_VERSION_SCRIPT) file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "") add_compiler_rt_object_libraries(RTUbsan_dynamic_version_script_dummy ARCHS ${UBSAN_SUPPORTED_ARCH} @@ -216,7 +216,7 @@ else() -Wl,--version-script,${CMAKE_CURRENT_BINARY_DIR}/clang_rt.ubsan_standalone-dynamic-${arch}.vers) # The Solaris 11.4 linker supports a subset of GNU ld version scripts, # but requires a special option to enable it. - if (OS_NAME MATCHES "SunOS") + if (COMPILER_RT_HAS_GNU_VERSION_SCRIPT_COMPAT) list(APPEND VERSION_SCRIPT_FLAG -Wl,-z,gnu-version-script-compat) endif() set_property(SOURCE From 0f62a53db64a943972e51d3d58610595d22779fd Mon Sep 17 00:00:00 2001 From: Luofan Chen Date: Tue, 28 Jul 2020 19:19:23 +0800 Subject: [PATCH 0292/1035] [Attributor] Add override keyword to the print function of AA The print() function in the `AbstractAttribute` structure overrides the function in the `AADepGraphNode`, so we need to mark it as override. This should fix a buildbot failure introduced by 5ee07dc. --- llvm/include/llvm/Transforms/IPO/Attributor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index 48c65c37eec78..a8076d33018cd 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -2138,7 +2138,7 @@ struct AbstractAttribute : public IRPosition, public AADepGraphNode { /// Helper functions, for debug purposes only. 
///{ - virtual void print(raw_ostream &OS) const; + void print(raw_ostream &OS) const override; virtual void printWithDeps(raw_ostream &OS) const; void dump() const { print(dbgs()); } From d3557ecede8e72b60df6a6d933d6c6ec16c48154 Mon Sep 17 00:00:00 2001 From: Anirudh Prasad Date: Tue, 28 Jul 2020 06:40:47 -0400 Subject: [PATCH 0293/1035] [Support] Use InitLLVM in llvm-stress, sancov and TableGen This patch refactors the llvm tools namely, llvm-stress and sancov, as well as the llvm TableGen utility, to use the new InitLLVM interface which encapsulates PrettyStackTrace. This is from https://reviews.llvm.org/D70702, but only for LLVM. Reviewed-by: Kai Differential Revision: https://reviews.llvm.org/D83484 --- llvm/tools/llvm-stress/llvm-stress.cpp | 7 ++----- llvm/tools/sancov/sancov.cpp | 9 ++------- llvm/utils/TableGen/TableGen.cpp | 9 ++------- 3 files changed, 6 insertions(+), 19 deletions(-) diff --git a/llvm/tools/llvm-stress/llvm-stress.cpp b/llvm/tools/llvm-stress/llvm-stress.cpp index 22f530dde1670..77cc59839129a 100644 --- a/llvm/tools/llvm-stress/llvm-stress.cpp +++ b/llvm/tools/llvm-stress/llvm-stress.cpp @@ -38,8 +38,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" -#include "llvm/Support/ManagedStatic.h" -#include "llvm/Support/PrettyStackTrace.h" +#include "llvm/Support/InitLLVM.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/raw_ostream.h" #include @@ -733,10 +732,8 @@ static void IntroduceControlFlow(Function *F, Random &R) { int main(int argc, char **argv) { using namespace llvm; - // Init LLVM, call llvm_shutdown() on exit, parse args, etc. - PrettyStackTraceProgram X(argc, argv); + InitLLVM X(argc, argv); cl::ParseCommandLineOptions(argc, argv, "llvm codegen stress-tester\n"); - llvm_shutdown_obj Y; auto M = std::make_unique("/tmp/autogen.bc", Context); Function *F = GenEmptyFunction(M.get()); diff --git a/llvm/tools/sancov/sancov.cpp b/llvm/tools/sancov/sancov.cpp index 37d06e385475e..f1d756f216d1a 100644 --- a/llvm/tools/sancov/sancov.cpp +++ b/llvm/tools/sancov/sancov.cpp @@ -32,15 +32,13 @@ #include "llvm/Support/Errc.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/InitLLVM.h" #include "llvm/Support/JSON.h" #include "llvm/Support/MD5.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" -#include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/Regex.h" #include "llvm/Support/SHA1.h" -#include "llvm/Support/Signals.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/SpecialCaseList.h" #include "llvm/Support/TargetRegistry.h" @@ -1134,10 +1132,7 @@ readSymbolizeAndMergeCmdArguments(std::vector FileNames) { } // namespace int main(int Argc, char **Argv) { - // Print stack trace if we signal out. - sys::PrintStackTraceOnErrorSignal(Argv[0]); - PrettyStackTraceProgram X(Argc, Argv); - llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. + llvm::InitLLVM X(Argc, Argv); llvm::InitializeAllTargetInfos(); llvm::InitializeAllTargetMCs(); diff --git a/llvm/utils/TableGen/TableGen.cpp b/llvm/utils/TableGen/TableGen.cpp index 8015a58471cad..5215c30b707f4 100644 --- a/llvm/utils/TableGen/TableGen.cpp +++ b/llvm/utils/TableGen/TableGen.cpp @@ -12,9 +12,7 @@ #include "TableGenBackends.h" // Declares all backends. 
#include "llvm/Support/CommandLine.h" -#include "llvm/Support/ManagedStatic.h" -#include "llvm/Support/PrettyStackTrace.h" -#include "llvm/Support/Signals.h" +#include "llvm/Support/InitLLVM.h" #include "llvm/TableGen/Main.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/SetTheory.h" @@ -278,12 +276,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) { } int main(int argc, char **argv) { - sys::PrintStackTraceOnErrorSignal(argv[0]); - PrettyStackTraceProgram X(argc, argv); + InitLLVM X(argc, argv); cl::ParseCommandLineOptions(argc, argv); - llvm_shutdown_obj Y; - return TableGenMain(argv[0], &LLVMTableGenMain); } From 6d10d317d8b0f1975dbb17850efd7c069f6ee8fd Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Tue, 28 Jul 2020 13:09:45 +0200 Subject: [PATCH 0294/1035] [MLIR][Shape] Support transforming shape.num_elements on tensors The current transformation to shape.reduce does not support tensor values. This adds the required changes to make that work, including fixing the builder for shape.reduce. Differential Revision: https://reviews.llvm.org/D84744 --- mlir/lib/Dialect/Shape/IR/Shape.cpp | 8 +++++++- .../Shape/Transforms/ShapeToShapeLowering.cpp | 13 +++++++++---- mlir/test/Dialect/Shape/shape-to-shape.mlir | 15 +++++++++++++++ 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp index 4887c87c1e5f1..3c71e3409923e 100644 --- a/mlir/lib/Dialect/Shape/IR/Shape.cpp +++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp @@ -834,7 +834,13 @@ void ReduceOp::build(OpBuilder &builder, OperationState &result, Value shape, bodyRegion->push_back(new Block); Block &bodyBlock = bodyRegion->front(); bodyBlock.addArgument(builder.getIndexType()); - bodyBlock.addArgument(SizeType::get(builder.getContext())); + + Type elementType; + if (auto tensorType = shape.getType().dyn_cast()) + elementType = tensorType.getElementType(); + else + elementType = SizeType::get(builder.getContext()); + bodyBlock.addArgument(elementType); for (Type initValType : initVals.getTypes()) { bodyBlock.addArgument(initValType); diff --git a/mlir/lib/Dialect/Shape/Transforms/ShapeToShapeLowering.cpp b/mlir/lib/Dialect/Shape/Transforms/ShapeToShapeLowering.cpp index bb2b03b8ec081..a84fad1f94602 100644 --- a/mlir/lib/Dialect/Shape/Transforms/ShapeToShapeLowering.cpp +++ b/mlir/lib/Dialect/Shape/Transforms/ShapeToShapeLowering.cpp @@ -9,6 +9,7 @@ #include "PassDetail.h" #include "mlir/Dialect/Shape/IR/Shape.h" #include "mlir/Dialect/Shape/Transforms/Passes.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Builders.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Pass/Pass.h" @@ -32,14 +33,18 @@ LogicalResult NumElementsOpConverter::matchAndRewrite(NumElementsOp op, PatternRewriter &rewriter) const { auto loc = op.getLoc(); - Value init = rewriter.create(loc, rewriter.getIndexAttr(1)); + Type valueType = op.getResult().getType(); + Value init = op.getDialect() + ->materializeConstant(rewriter, rewriter.getIndexAttr(1), + valueType, loc) + ->getResult(0); ReduceOp reduce = rewriter.create(loc, op.shape(), init); // Generate reduce operator. 
Block *body = reduce.getBody(); OpBuilder b = OpBuilder::atBlockEnd(body); - Value product = b.create(loc, b.getType(), - body->getArgument(1), body->getArgument(2)); + Value product = b.create(loc, valueType, body->getArgument(1), + body->getArgument(2)); b.create(loc, product); rewriter.replaceOp(op, reduce.result()); @@ -60,7 +65,7 @@ void ShapeToShapeLowering::runOnFunction() { populateShapeRewritePatterns(&ctx, patterns); ConversionTarget target(getContext()); - target.addLegalDialect(); + target.addLegalDialect(); target.addIllegalOp(); if (failed(mlir::applyPartialConversion(getFunction(), target, patterns))) signalPassFailure(); diff --git a/mlir/test/Dialect/Shape/shape-to-shape.mlir b/mlir/test/Dialect/Shape/shape-to-shape.mlir index d1b00bc12a22c..481d682942bb0 100644 --- a/mlir/test/Dialect/Shape/shape-to-shape.mlir +++ b/mlir/test/Dialect/Shape/shape-to-shape.mlir @@ -14,3 +14,18 @@ func @num_elements_to_reduce(%shape : !shape.shape) -> !shape.size { // CHECK: } // CHECK: return [[NUM_ELEMENTS]] : !shape.size +// ----- + +// CHECK-LABEL: func @num_elements_to_reduce_on_index +// CHECK-SAME: ([[ARG:%.*]]: tensor) -> index +func @num_elements_to_reduce_on_index(%shape : tensor) -> index { + %num_elements = shape.num_elements %shape : tensor -> index + return %num_elements : index +} +// CHECK: [[C1:%.*]] = constant 1 : index +// CHECK: [[NUM_ELEMENTS:%.*]] = shape.reduce([[ARG]], [[C1]]) : tensor -> index +// CHECK: ^bb0({{.*}}: index, [[DIM:%.*]]: index, [[ACC:%.*]]: index +// CHECK: [[NEW_ACC:%.*]] = shape.mul [[DIM]], [[ACC]] +// CHECK: shape.yield [[NEW_ACC]] : index +// CHECK: } +// CHECK: return [[NUM_ELEMENTS]] : index From 39108f4c7a2c52be88f73bd6abaa613f4b28d327 Mon Sep 17 00:00:00 2001 From: Tim Northover Date: Mon, 27 Jul 2020 14:37:14 +0100 Subject: [PATCH 0295/1035] ARM: make Thumb1 instructions non-flag-setting in IT block. Many Thumb1 instructions are defined to set CPSR if executed outside an IT block, but leave it alone from inside one. In MachineIR this is represented by whether an optional register is CPSR or NoReg (0), and affects how the instructions are printed. This sets the instruction to the appropriate form during if-conversion. --- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 12 ++++++++++++ llvm/lib/Target/ARM/ARMInstrFormats.td | 5 +++-- llvm/test/CodeGen/ARM/thumb2-it-block.ll | 4 +--- llvm/test/CodeGen/Thumb2/ifcvt-rescan-diamonds.ll | 3 ++- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 4cc2b6bf7e7e0..d340931f31d8d 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -537,6 +537,18 @@ bool ARMBaseInstrInfo::PredicateInstruction( MachineOperand &PMO = MI.getOperand(PIdx); PMO.setImm(Pred[0].getImm()); MI.getOperand(PIdx+1).setReg(Pred[1].getReg()); + + // Thumb 1 arithmetic instructions do not set CPSR when executed inside an + // IT block. This affects how they are printed. 
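To make the printing difference concrete (an illustrative fragment, not part of the patch): in unified assembly the same 16-bit Thumb1 ADD encoding is written with or without the flag-setting 's' suffix depending on whether it sits inside an IT block:

```
    adds  r0, r0, r1   @ outside an IT block: optional def is CPSR, sets flags
    it    eq
    addeq r0, r0, r1   @ inside an IT block: optional def is NoReg, CPSR untouched
```

This is why PredicateInstruction must clear the CPSR operand below: leaving it set would make the printer emit the flag-setting form inside the IT block.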
+ const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.TSFlags & ARMII::ThumbArithFlagSetting) { + assert(MCID.OpInfo[1].isOptionalDef() && "CPSR def isn't expected operand"); + assert((MI.getOperand(1).isDead() || + MI.getOperand(1).getReg() != ARM::CPSR) && + "if conversion tried to stop defining used CPSR"); + MI.getOperand(1).setReg(ARM::NoRegister); + } + return true; } return false; diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td index e13f3437cc7bd..85da7c5a535e9 100644 --- a/llvm/lib/Target/ARM/ARMInstrFormats.td +++ b/llvm/lib/Target/ARM/ARMInstrFormats.td @@ -403,8 +403,9 @@ class InstTemplate Date: Tue, 28 Jul 2020 19:50:40 +0700 Subject: [PATCH 0296/1035] [BPI] Fix memory leak reported by sanitizer bots There is a silly mistake where release() is used instead of reset() to free the resources of a unique pointer. Reviewed By: ebrevnov Differential Revision: https://reviews.llvm.org/D84747 --- llvm/lib/Analysis/BranchProbabilityInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/llvm/lib/Analysis/BranchProbabilityInfo.cpp index 195fc69d9601d..7e34f3b6c869b 100644 --- a/llvm/lib/Analysis/BranchProbabilityInfo.cpp +++ b/llvm/lib/Analysis/BranchProbabilityInfo.cpp @@ -1178,7 +1178,7 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI, PostDominatedByUnreachable.clear(); PostDominatedByColdCall.clear(); - SccI.release(); + SccI.reset(); if (PrintBranchProb && (PrintBranchProbFuncName.empty() || From 9b4826d18b5fba5fe638e1045c9c47d3fe8da8c8 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Tue, 28 Jul 2020 08:57:03 -0400 Subject: [PATCH 0297/1035] [OpenMP] Fix libomptarget negative tests to expect abort On runtime failures, D83963 causes the runtime to abort instead of merely exiting with a non-zero value, but many tests in the libomptarget test suite still expect the former behavior. This patch updates the test suite and was discussed in post-commit comments on D83963 and D84557. --- openmp/libomptarget/test/lit.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg index 77476c6fec793..6936bfa0c3195 100644 --- a/openmp/libomptarget/test/lit.cfg +++ b/openmp/libomptarget/test/lit.cfg @@ -114,7 +114,7 @@ for libomptarget_target in config.libomptarget_all_targets: "%t-" + libomptarget_target)) config.substitutions.append(("%libomptarget-run-fail-" + \ libomptarget_target, \ - "%not %t-" + libomptarget_target)) + "%not --crash %t-" + libomptarget_target)) config.substitutions.append(("%clangxx-" + libomptarget_target, \ "%clangxx %openmp_flags %flags -fopenmp-targets=" + libomptarget_target)) config.substitutions.append(("%clang-" + libomptarget_target, \ From bd93f5ce07ef2fb9c1897bddc576fe4afb464788 Mon Sep 17 00:00:00 2001 From: Georgii Rymar Date: Tue, 28 Jul 2020 12:53:06 +0300 Subject: [PATCH 0298/1035] [yaml2obj] - Add a way to override sh_type section field. This adds the `ShType` key similar to the other `Sh*` keys we have. My use case is the following. Imagine we have a `SHT_SYMTAB_SHNDX` section and want to hide it from a dumper. The natural way would be to do something like: ``` - Name: .symtab_shndx Type: [[TYPE=SHT_SYMTAB_SHNDX]] Entries: [ 0, 1 ] ``` and then change the TYPE from `SHT_SYMTAB_SHNDX` to something else, for example to `SHT_PROGBITS`.
But we have a problem: regular sections do not have an `Entries` key, so yaml2obj would be unable to produce the section. The solution is to introduce a `ShType` key to override the final type. This is not the first time I have faced the need to change the type. I was able to invent workarounds or solve issues differently in the past, but finally came to the conclusion that we should just support the `ShType` key. Differential revision: https://reviews.llvm.org/D84738 --- llvm/include/llvm/ObjectYAML/ELFYAML.h | 6 +++ llvm/lib/ObjectYAML/ELFEmitter.cpp | 2 + llvm/lib/ObjectYAML/ELFYAML.cpp | 4 +- .../tools/yaml2obj/ELF/override-shtype.yaml | 44 +++++++++++++++++++ 4 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 llvm/test/tools/yaml2obj/ELF/override-shtype.yaml diff --git a/llvm/include/llvm/ObjectYAML/ELFYAML.h b/llvm/include/llvm/ObjectYAML/ELFYAML.h index b1ffb20681ea8..9a5c06fdfcd5c 100644 --- a/llvm/include/llvm/ObjectYAML/ELFYAML.h +++ b/llvm/include/llvm/ObjectYAML/ELFYAML.h @@ -206,6 +206,12 @@ struct Section : public Chunk { // This can be used to override the sh_flags field. Optional ShFlags; + + // This can be used to override the sh_type field. It is useful when we + // want to use specific YAML keys for a section of a particular type to + // describe the content, but still want to have a different final type + // for the section. + Optional ShType; }; // Fill is a block of data which is placed outside of sections. It is diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp index 65b03050c7cdb..bc27c03cb6877 100644 --- a/llvm/lib/ObjectYAML/ELFEmitter.cpp +++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp @@ -551,6 +551,8 @@ static void overrideFields(ELFYAML::Section *From, typename ELFT::Shdr &To) { To.sh_offset = *From->ShOffset; if (From->ShSize) To.sh_size = *From->ShSize; + if (From->ShType) + To.sh_type = *From->ShType; } template diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index f85d6a5d30efa..f460a387540d2 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -1094,11 +1094,13 @@ static void commonSectionMapping(IO &IO, ELFYAML::Section &Section) { // automatically when they are not explicitly defined. assert(!IO.outputting() || (!Section.ShOffset.hasValue() && !Section.ShSize.hasValue() && - !Section.ShName.hasValue() && !Section.ShFlags.hasValue())); + !Section.ShName.hasValue() && !Section.ShFlags.hasValue() && + !Section.ShType.hasValue())); IO.mapOptional("ShName", Section.ShName); IO.mapOptional("ShOffset", Section.ShOffset); IO.mapOptional("ShSize", Section.ShSize); IO.mapOptional("ShFlags", Section.ShFlags); + IO.mapOptional("ShType", Section.ShType); } static void sectionMapping(IO &IO, ELFYAML::DynamicSection &Section) { diff --git a/llvm/test/tools/yaml2obj/ELF/override-shtype.yaml b/llvm/test/tools/yaml2obj/ELF/override-shtype.yaml new file mode 100644 index 0000000000000..ac29b3b0e8bc5 --- /dev/null +++ b/llvm/test/tools/yaml2obj/ELF/override-shtype.yaml @@ -0,0 +1,44 @@ +## Check we are able to override the sh_type field for different sections. +## When doing this we are still able to use YAML keys that can be normally used +## to describe a section with the original type specified with the Type key. + +# RUN: yaml2obj %s -o %t1 +# RUN: llvm-readobj --sections --section-data %t1 | FileCheck %s --check-prefixes=COMMON,ORIGINAL + +## Check we can use a hex value for the ShType. SHT_PROGBITS == 0x1.
+# RUN: yaml2obj -DTYPE=0x1 %s -o %t2 +# RUN: llvm-readobj --sections --section-data %t2 | FileCheck %s --check-prefixes=COMMON,OVERRIDE + +# COMMON: Name: .gnu.version_r +# ORIGINAL-NEXT: Type: SHT_GNU_verneed +# OVERRIDE-NEXT: Type: SHT_PROGBITS +# COMMON: SectionData ( +# COMMON-NEXT: 0000: 01000100 04000000 10000000 00000000 | +# COMMON-NEXT: 0010: 91070000 00000300 01000000 00000000 | +# COMMON-NEXT: ) + +## Check we can use a string type name for the ShType. +# RUN: yaml2obj -DTYPE=SHT_PROGBITS %s -o %t3 +# RUN: cmp %t2 %t3 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .gnu.version_r + Type: SHT_GNU_verneed + Flags: [ SHF_ALLOC ] + ShType: [[TYPE=SHT_GNU_verneed]] + Info: 0 + Dependencies: + - Version: 1 + File: dso.so.0 + Entries: + - Name: v1 + Hash: 1937 + Flags: 0 + Other: 3 +DynamicSymbols: [] From 7bae3188e08746566733148a4ceccdb3cf24e93b Mon Sep 17 00:00:00 2001 From: Nathan James Date: Tue, 28 Jul 2020 14:52:32 +0100 Subject: [PATCH 0299/1035] [clang-tidy][NFC] Make OptionsView methods as const where missing --- clang-tools-extra/clang-tidy/ClangTidyCheck.cpp | 7 +++---- clang-tools-extra/clang-tidy/ClangTidyCheck.h | 17 +++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp index c24b8553999cc..ffd5bf974ba2f 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp @@ -168,10 +168,9 @@ void ClangTidyCheck::OptionsView::store( store(Options, LocalName, Value ? StringRef("true") : StringRef("false")); } -llvm::Expected -ClangTidyCheck::OptionsView::getEnumInt(StringRef LocalName, - ArrayRef Mapping, - bool CheckGlobal, bool IgnoreCase) { +llvm::Expected ClangTidyCheck::OptionsView::getEnumInt( + StringRef LocalName, ArrayRef Mapping, bool CheckGlobal, + bool IgnoreCase) const { auto Iter = CheckGlobal ? findPriorityOption(CheckOptions, NamePrefix, LocalName) : CheckOptions.find((NamePrefix + LocalName).str()); diff --git a/clang-tools-extra/clang-tidy/ClangTidyCheck.h b/clang-tools-extra/clang-tidy/ClangTidyCheck.h index 54b7251267524..4df8071c841e0 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyCheck.h +++ b/clang-tools-extra/clang-tidy/ClangTidyCheck.h @@ -330,7 +330,7 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { /// supply the mapping required to convert between ``T`` and a string. template std::enable_if_t::value, llvm::Expected> - get(StringRef LocalName, bool IgnoreCase = false) { + get(StringRef LocalName, bool IgnoreCase = false) const { if (llvm::Expected ValueOr = getEnumInt(LocalName, typeEraseMapping(), false, IgnoreCase)) return static_cast(*ValueOr); @@ -349,7 +349,7 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { /// supply the mapping required to convert between ``T`` and a string. template std::enable_if_t::value, T> - get(StringRef LocalName, T Default, bool IgnoreCase = false) { + get(StringRef LocalName, T Default, bool IgnoreCase = false) const { if (auto ValueOr = get(LocalName, IgnoreCase)) return *ValueOr; else @@ -370,8 +370,7 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { /// supply the mapping required to convert between ``T`` and a string. 
template std::enable_if_t::value, llvm::Expected> - getLocalOrGlobal(StringRef LocalName, - bool IgnoreCase = false) { + getLocalOrGlobal(StringRef LocalName, bool IgnoreCase = false) const { if (llvm::Expected ValueOr = getEnumInt(LocalName, typeEraseMapping(), true, IgnoreCase)) return static_cast(*ValueOr); @@ -391,7 +390,8 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { /// supply the mapping required to convert between ``T`` and a string. template std::enable_if_t::value, T> - getLocalOrGlobal(StringRef LocalName, T Default, bool IgnoreCase = false) { + getLocalOrGlobal(StringRef LocalName, T Default, + bool IgnoreCase = false) const { if (auto ValueOr = getLocalOrGlobal(LocalName, IgnoreCase)) return *ValueOr; else @@ -420,7 +420,8 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { /// supply the mapping required to convert between ``T`` and a string. template std::enable_if_t::value> - store(ClangTidyOptions::OptionMap &Options, StringRef LocalName, T Value) { + store(ClangTidyOptions::OptionMap &Options, StringRef LocalName, + T Value) const { ArrayRef> Mapping = OptionEnumMapping::getEnumMapping(); auto Iter = llvm::find_if( @@ -436,11 +437,11 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { llvm::Expected getEnumInt(StringRef LocalName, ArrayRef Mapping, - bool CheckGlobal, bool IgnoreCase); + bool CheckGlobal, bool IgnoreCase) const; template std::enable_if_t::value, std::vector> - typeEraseMapping() { + typeEraseMapping() const { ArrayRef> Mapping = OptionEnumMapping::getEnumMapping(); std::vector Result; From d28f86723f37b2329428dfbcf847d3261f38dcc8 Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Tue, 28 Jul 2020 03:02:20 +0000 Subject: [PATCH 0300/1035] Re-land "[PowerPC] Remove QPX/A2Q BGQ/BGP CNK support" This reverts commit bf544fa1c3cb80f24d85e84559fb11193846259f. Fixed the typo in PPCInstrInfo.cpp. 
--- clang/lib/Basic/Targets/PPC.cpp | 39 +- clang/lib/Basic/Targets/PPC.h | 3 - clang/lib/Driver/ToolChains/Arch/PPC.cpp | 1 - clang/lib/Driver/ToolChains/Clang.cpp | 12 - clang/test/Driver/clang-translation.c | 6 - clang/test/Driver/ppc-abi.c | 20 - clang/test/Misc/target-invalid-cpu-note.c | 2 +- clang/test/Preprocessor/init-ppc64.c | 16 - llvm/docs/LangRef.rst | 11 +- llvm/include/llvm/ADT/Triple.h | 3 - llvm/include/llvm/IR/IntrinsicsPowerPC.td | 176 --- llvm/lib/Support/Triple.cpp | 6 - .../Target/PowerPC/AsmParser/PPCAsmParser.cpp | 18 - llvm/lib/Target/PowerPC/CMakeLists.txt | 1 - .../PowerPC/Disassembler/PPCDisassembler.cpp | 15 +- .../PowerPC/MCTargetDesc/PPCInstPrinter.cpp | 12 - .../PowerPC/MCTargetDesc/PPCMCTargetDesc.h | 1 - llvm/lib/Target/PowerPC/PPC.h | 2 - llvm/lib/Target/PowerPC/PPC.td | 14 +- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 3 - llvm/lib/Target/PowerPC/PPCCallingConv.td | 16 - llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 16 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 1025 +------------- llvm/lib/Target/PowerPC/PPCISelLowering.h | 20 - llvm/lib/Target/PowerPC/PPCInstrFormats.td | 52 - llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 25 +- llvm/lib/Target/PowerPC/PPCInstrInfo.h | 23 +- llvm/lib/Target/PowerPC/PPCInstrInfo.td | 11 - llvm/lib/Target/PowerPC/PPCInstrQPX.td | 1212 ----------------- llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp | 161 --- llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 3 - llvm/lib/Target/PowerPC/PPCRegisterInfo.h | 1 - llvm/lib/Target/PowerPC/PPCRegisterInfo.td | 23 - llvm/lib/Target/PowerPC/PPCScheduleP9.td | 9 +- llvm/lib/Target/PowerPC/PPCSubtarget.cpp | 12 +- llvm/lib/Target/PowerPC/PPCSubtarget.h | 14 - llvm/lib/Target/PowerPC/PPCTargetMachine.cpp | 23 +- .../Target/PowerPC/PPCTargetTransformInfo.cpp | 76 +- .../Instrumentation/MemorySanitizer.cpp | 7 +- llvm/test/Analysis/BasicAA/phi-spec-order.ll | 2 +- .../CostModel/PowerPC/unal-vec-ldst.ll | 73 - .../CodeGen/PowerPC/2012-11-16-mischedcall.ll | 2 +- ...leHoistingDueToBlockHotnessProfileData.mir | 2 +- .../NoCRFieldRedefWhenSpillingCRBIT.mir | 2 +- llvm/test/CodeGen/PowerPC/a2q-stackalign.ll | 23 - llvm/test/CodeGen/PowerPC/a2q.ll | 10 - .../PowerPC/aantidep-inline-asm-use.ll | 2 +- llvm/test/CodeGen/PowerPC/asm-Zy.ll | 3 +- llvm/test/CodeGen/PowerPC/asm-constraints.ll | 2 +- ...rt-rr-to-ri-instrs-R0-special-handling.mir | 4 +- .../convert-rr-to-ri-instrs-out-of-range.mir | 2 +- .../PowerPC/convert-rr-to-ri-instrs.mir | 8 +- llvm/test/CodeGen/PowerPC/ctr-minmaxnum.ll | 11 - .../CodeGen/PowerPC/ctrloop-shortLoops.ll | 7 - llvm/test/CodeGen/PowerPC/ec-input.ll | 2 +- .../CodeGen/PowerPC/extra-toc-reg-deps.ll | 8 +- .../CodeGen/PowerPC/fast-isel-icmp-split.ll | 2 +- .../PowerPC/fma-mutate-duplicate-vreg.ll | 2 +- .../CodeGen/PowerPC/fp2int2fp-ppcfp128.ll | 3 +- .../CodeGen/PowerPC/glob-comp-aa-crash.ll | 4 +- .../PowerPC/ifcvt-forked-bug-2016-08-08.ll | 2 +- .../test/CodeGen/PowerPC/inlineasm-i64-reg.ll | 4 +- llvm/test/CodeGen/PowerPC/load-two-flts.ll | 3 +- .../PowerPC/loop-data-prefetch-inner.ll | 4 +- .../CodeGen/PowerPC/loop-data-prefetch.ll | 4 +- llvm/test/CodeGen/PowerPC/loop-prep-all.ll | 10 +- .../PowerPC/lxv-aligned-stack-slots.ll | 2 +- llvm/test/CodeGen/PowerPC/machine-combiner.ll | 24 - llvm/test/CodeGen/PowerPC/mc-instrlat.ll | 4 +- llvm/test/CodeGen/PowerPC/mcount-insertion.ll | 3 +- llvm/test/CodeGen/PowerPC/memcpy-vec.ll | 23 - llvm/test/CodeGen/PowerPC/memset-nc.ll | 48 - .../PowerPC/misched-inorder-latency.ll | 3 +- llvm/test/CodeGen/PowerPC/misched.ll | 1 - 
.../CodeGen/PowerPC/optnone-crbits-i1-ret.ll | 3 +- .../CodeGen/PowerPC/pcrel-local-caller-toc.ll | 6 +- llvm/test/CodeGen/PowerPC/popcnt.ll | 2 - llvm/test/CodeGen/PowerPC/ppc-passname.ll | 11 - llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll | 21 +- llvm/test/CodeGen/PowerPC/pr24546.ll | 4 +- llvm/test/CodeGen/PowerPC/pr27350.ll | 2 +- llvm/test/CodeGen/PowerPC/pr28130.ll | 2 +- .../CodeGen/PowerPC/preinc-ld-sel-crash.ll | 2 +- llvm/test/CodeGen/PowerPC/qpx-bv-sint.ll | 33 - llvm/test/CodeGen/PowerPC/qpx-bv.ll | 37 - llvm/test/CodeGen/PowerPC/qpx-func-clobber.ll | 22 - llvm/test/CodeGen/PowerPC/qpx-load-splat.ll | 80 -- llvm/test/CodeGen/PowerPC/qpx-load.ll | 26 - llvm/test/CodeGen/PowerPC/qpx-qvfmadd.ll | 79 -- llvm/test/CodeGen/PowerPC/qpx-recipest.ll | 473 ------- llvm/test/CodeGen/PowerPC/qpx-rounding-ops.ll | 109 -- llvm/test/CodeGen/PowerPC/qpx-s-load.ll | 26 - llvm/test/CodeGen/PowerPC/qpx-s-sel.ll | 143 -- llvm/test/CodeGen/PowerPC/qpx-s-store.ll | 25 - llvm/test/CodeGen/PowerPC/qpx-sel.ll | 151 -- llvm/test/CodeGen/PowerPC/qpx-split-vsetcc.ll | 31 - llvm/test/CodeGen/PowerPC/qpx-store.ll | 25 - .../test/CodeGen/PowerPC/qpx-unal-cons-lds.ll | 217 --- llvm/test/CodeGen/PowerPC/qpx-unalperm.ll | 64 - llvm/test/CodeGen/PowerPC/rlwimi-and.ll | 4 +- .../CodeGen/PowerPC/rlwinm_rldicl_to_andi.mir | 2 +- .../CodeGen/PowerPC/s000-alias-misched.ll | 5 +- llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll | 571 -------- .../selectiondag-extload-computeknownbits.ll | 2 +- llvm/test/CodeGen/PowerPC/setcr_bc.mir | 4 +- llvm/test/CodeGen/PowerPC/setcr_bc2.mir | 4 +- llvm/test/CodeGen/PowerPC/stwu-sched.ll | 2 +- llvm/test/CodeGen/PowerPC/unal-vec-ldst.ll | 149 -- llvm/test/CodeGen/PowerPC/uwtables.ll | 2 +- .../MemorySanitizer/PowerPC/vararg-ppc64.ll | 15 - llvm/test/MC/Disassembler/PowerPC/qpx.txt | 371 ----- llvm/test/MC/PowerPC/qpx.s | 252 ---- .../IPConstantProp/fp-bc-icmp-const-fold.ll | 2 +- .../MSSA/combined-partial-overwrites.ll | 2 +- .../combined-partial-overwrites.ll | 2 +- .../EntryExitInstrumenter/mcount.ll | 2 +- .../InstCombine/PowerPC/aligned-qpx.ll | 165 --- .../LoopDataPrefetch/PowerPC/basic.ll | 5 +- .../test/Transforms/LoopSimplify/dup-preds.ll | 2 +- llvm/test/Transforms/LoopUnroll/pr14167.ll | 2 +- .../PowerPC/agg-interleave-a2.ll | 40 - .../PowerPC/vectorize-only-for-real.ll | 2 +- llvm/test/Transforms/NewGVN/pr31483.ll | 4 +- .../Transforms/SCCP/fp-bc-icmp-const-fold.ll | 2 +- llvm/unittests/ADT/TripleTest.cpp | 35 - .../llvm/lib/Target/PowerPC/BUILD.gn | 1 - openmp/runtime/src/kmp.h | 3 - openmp/runtime/src/kmp_csupport.cpp | 11 - openmp/runtime/src/kmp_lock.h | 2 +- openmp/runtime/src/kmp_os.h | 2 +- openmp/runtime/src/kmp_platform.h | 6 - openmp/runtime/src/z_Linux_asm.S | 7 +- openmp/runtime/src/z_Linux_util.cpp | 2 +- polly/lib/External/isl/config.sub | 11 +- polly/lib/External/ppcg/config.sub | 11 +- 135 files changed, 174 insertions(+), 6523 deletions(-) delete mode 100644 llvm/lib/Target/PowerPC/PPCInstrQPX.td delete mode 100644 llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp delete mode 100644 llvm/test/CodeGen/PowerPC/a2q-stackalign.ll delete mode 100644 llvm/test/CodeGen/PowerPC/a2q.ll delete mode 100644 llvm/test/CodeGen/PowerPC/memset-nc.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-bv-sint.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-bv.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-func-clobber.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-load-splat.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-load.ll delete mode 100644 
llvm/test/CodeGen/PowerPC/qpx-qvfmadd.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-recipest.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-rounding-ops.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-s-load.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-s-sel.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-s-store.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-sel.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-split-vsetcc.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-store.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-unal-cons-lds.ll delete mode 100644 llvm/test/CodeGen/PowerPC/qpx-unalperm.ll delete mode 100644 llvm/test/MC/Disassembler/PowerPC/qpx.txt delete mode 100644 llvm/test/MC/PowerPC/qpx.s delete mode 100644 llvm/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll delete mode 100644 llvm/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp index f0de2bf070ea4..5f716a541ae92 100644 --- a/clang/lib/Basic/Targets/PPC.cpp +++ b/clang/lib/Basic/Targets/PPC.cpp @@ -46,8 +46,6 @@ bool PPCTargetInfo::handleTargetFeatures(std::vector &Features, HasP8Crypto = true; } else if (Feature == "+direct-move") { HasDirectMove = true; - } else if (Feature == "+qpx") { - HasQPX = true; } else if (Feature == "+htm") { HasHTM = true; } else if (Feature == "+float128") { @@ -99,7 +97,7 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts, } // ABI options. - if (ABI == "elfv1" || ABI == "elfv1-qpx") + if (ABI == "elfv1") Builder.defineMacro("_CALL_ELF", "1"); if (ABI == "elfv2") Builder.defineMacro("_CALL_ELF", "2"); @@ -159,22 +157,11 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("_ARCH_PWR10"); if (ArchDefs & ArchDefineA2) Builder.defineMacro("_ARCH_A2"); - if (ArchDefs & ArchDefineA2q) { - Builder.defineMacro("_ARCH_A2Q"); - Builder.defineMacro("_ARCH_QP"); - } if (ArchDefs & ArchDefineE500) Builder.defineMacro("__NO_LWSYNC__"); if (ArchDefs & ArchDefineFuture) Builder.defineMacro("_ARCH_PWR_FUTURE"); - if (getTriple().getVendor() == llvm::Triple::BGQ) { - Builder.defineMacro("__bg__"); - Builder.defineMacro("__THW_BLUEGENE__"); - Builder.defineMacro("__bgq__"); - Builder.defineMacro("__TOS_BGQ__"); - } - if (HasAltivec) { Builder.defineMacro("__VEC__", "10206"); Builder.defineMacro("__ALTIVEC__"); @@ -277,7 +264,6 @@ bool PPCTargetInfo::initFeatureMap( .Case("ppc64le", true) .Default(false); - Features["qpx"] = (CPU == "a2q"); Features["power9-vector"] = (CPU == "pwr9"); Features["crypto"] = llvm::StringSwitch(CPU) .Case("ppc64le", true) @@ -373,7 +359,6 @@ bool PPCTargetInfo::hasFeature(StringRef Feature) const { .Case("power8-vector", HasP8Vector) .Case("crypto", HasP8Crypto) .Case("direct-move", HasDirectMove) - .Case("qpx", HasQPX) .Case("htm", HasHTM) .Case("bpermd", HasBPERMD) .Case("extdiv", HasExtDiv) @@ -503,17 +488,17 @@ ArrayRef PPCTargetInfo::getGCCAddlRegNames() const { } static constexpr llvm::StringLiteral ValidCPUNames[] = { - {"generic"}, {"440"}, {"450"}, {"601"}, {"602"}, - {"603"}, {"603e"}, {"603ev"}, {"604"}, {"604e"}, - {"620"}, {"630"}, {"g3"}, {"7400"}, {"g4"}, - {"7450"}, {"g4+"}, {"750"}, {"8548"}, {"970"}, - {"g5"}, {"a2"}, {"a2q"}, {"e500"}, {"e500mc"}, - {"e5500"}, {"power3"}, {"pwr3"}, {"power4"}, {"pwr4"}, - {"power5"}, {"pwr5"}, {"power5x"}, {"pwr5x"}, {"power6"}, - {"pwr6"}, {"power6x"}, {"pwr6x"}, {"power7"}, {"pwr7"}, - {"power8"}, {"pwr8"}, {"power9"}, {"pwr9"}, {"power10"}, - 
{"pwr10"}, {"powerpc"}, {"ppc"}, {"powerpc64"}, {"ppc64"}, - {"powerpc64le"}, {"ppc64le"}, {"future"}}; + {"generic"}, {"440"}, {"450"}, {"601"}, {"602"}, + {"603"}, {"603e"}, {"603ev"}, {"604"}, {"604e"}, + {"620"}, {"630"}, {"g3"}, {"7400"}, {"g4"}, + {"7450"}, {"g4+"}, {"750"}, {"8548"}, {"970"}, + {"g5"}, {"a2"}, {"e500"}, {"e500mc"}, {"e5500"}, + {"power3"}, {"pwr3"}, {"power4"}, {"pwr4"}, {"power5"}, + {"pwr5"}, {"power5x"}, {"pwr5x"}, {"power6"}, {"pwr6"}, + {"power6x"}, {"pwr6x"}, {"power7"}, {"pwr7"}, {"power8"}, + {"pwr8"}, {"power9"}, {"pwr9"}, {"power10"}, {"pwr10"}, + {"powerpc"}, {"ppc"}, {"powerpc64"}, {"ppc64"}, {"powerpc64le"}, + {"ppc64le"}, {"future"}}; bool PPCTargetInfo::isValidCPUName(StringRef Name) const { return llvm::find(ValidCPUNames, Name) != std::end(ValidCPUNames); diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index ff8579b6c3cf4..c2048b2145918 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -46,7 +46,6 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { ArchDefinePwr10 = 1 << 14, ArchDefineFuture = 1 << 15, ArchDefineA2 = 1 << 16, - ArchDefineA2q = 1 << 17, ArchDefineE500 = 1 << 18 } ArchDefineTypes; @@ -63,7 +62,6 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { bool HasP8Vector = false; bool HasP8Crypto = false; bool HasDirectMove = false; - bool HasQPX = false; bool HasHTM = false; bool HasBPERMD = false; bool HasExtDiv = false; @@ -118,7 +116,6 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { .Case("970", ArchDefineName | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) .Case("a2", ArchDefineA2) - .Case("a2q", ArchDefineName | ArchDefineA2 | ArchDefineA2q) .Cases("power3", "pwr3", ArchDefinePpcgr) .Cases("power4", "pwr4", ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.cpp b/clang/lib/Driver/ToolChains/Arch/PPC.cpp index 144e276a6bd87..bcaecf4b2d980 100644 --- a/clang/lib/Driver/ToolChains/Arch/PPC.cpp +++ b/clang/lib/Driver/ToolChains/Arch/PPC.cpp @@ -57,7 +57,6 @@ std::string ppc::getPPCTargetCPU(const ArgList &Args) { .Case("970", "970") .Case("G5", "g5") .Case("a2", "a2") - .Case("a2q", "a2q") .Case("e500", "e500") .Case("e500mc", "e500mc") .Case("e5500", "e5500") diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 7a73eea013bdf..b0de225f8abf5 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1883,18 +1883,6 @@ void Clang::AddPPCTargetArgs(const ArgList &Args, if (T.isOSBinFormatELF()) { switch (getToolChain().getArch()) { case llvm::Triple::ppc64: { - // When targeting a processor that supports QPX, or if QPX is - // specifically enabled, default to using the ABI that supports QPX (so - // long as it is not specifically disabled). 
- bool HasQPX = false; - if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) - HasQPX = A->getValue() == StringRef("a2q"); - HasQPX = Args.hasFlag(options::OPT_mqpx, options::OPT_mno_qpx, HasQPX); - if (HasQPX) { - ABIName = "elfv1-qpx"; - break; - } - if (T.isMusl() || (T.isOSFreeBSD() && T.getOSMajorVersion() >= 13)) ABIName = "elfv2"; else diff --git a/clang/test/Driver/clang-translation.c b/clang/test/Driver/clang-translation.c index 2f02970a2a8ee..d1daeb80004b7 100644 --- a/clang/test/Driver/clang-translation.c +++ b/clang/test/Driver/clang-translation.c @@ -167,12 +167,6 @@ // PPCPWR8: "-cc1" // PPCPWR8: "-target-cpu" "pwr8" -// RUN: %clang -target powerpc64-unknown-linux-gnu \ -// RUN: -### -S %s -mcpu=a2q 2>&1 | FileCheck -check-prefix=PPCA2Q %s -// PPCA2Q: clang -// PPCA2Q: "-cc1" -// PPCA2Q: "-target-cpu" "a2q" - // RUN: %clang -target powerpc64-unknown-linux-gnu \ // RUN: -### -S %s -mcpu=630 2>&1 | FileCheck -check-prefix=PPC630 %s // PPC630: clang diff --git a/clang/test/Driver/ppc-abi.c b/clang/test/Driver/ppc-abi.c index acc4981a2eee6..2b5cc463e7c3d 100644 --- a/clang/test/Driver/ppc-abi.c +++ b/clang/test/Driver/ppc-abi.c @@ -5,14 +5,6 @@ // RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ // RUN: -mabi=elfv1 | FileCheck -check-prefix=CHECK-ELFv1 %s // RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mabi=elfv1-qpx | FileCheck -check-prefix=CHECK-ELFv1-QPX %s -// RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mcpu=a2q | FileCheck -check-prefix=CHECK-ELFv1-QPX %s -// RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mcpu=a2 -mqpx | FileCheck -check-prefix=CHECK-ELFv1-QPX %s -// RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mcpu=a2q -mno-qpx | FileCheck -check-prefix=CHECK-ELFv1 %s -// RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ // RUN: -mabi=elfv2 | FileCheck -check-prefix=CHECK-ELFv2-BE %s // RUN: %clang -target powerpc64le-unknown-linux-gnu %s -### -o %t.o 2>&1 \ @@ -34,8 +26,6 @@ // CHECK-ELFv1: "-target-abi" "elfv1" // CHECK-ELFv1-LE: "-mrelocation-model" "static" // CHECK-ELFv1-LE: "-target-abi" "elfv1" -// CHECK-ELFv1-QPX: "-mrelocation-model" "static" -// CHECK-ELFv1-QPX: "-target-abi" "elfv1-qpx" // CHECK-ELFv2: "-mrelocation-model" "static" // CHECK-ELFv2: "-target-abi" "elfv2" // CHECK-ELFv2-BE: "-mrelocation-model" "static" @@ -48,14 +38,6 @@ // RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ // RUN: -mabi=elfv1 | FileCheck -check-prefix=CHECK-ELFv1-PIC %s // RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mabi=elfv1-qpx | FileCheck -check-prefix=CHECK-ELFv1-QPX-PIC %s -// RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mcpu=a2q | FileCheck -check-prefix=CHECK-ELFv1-QPX-PIC %s -// RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mcpu=a2 -mqpx | FileCheck -check-prefix=CHECK-ELFv1-QPX-PIC %s -// RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mcpu=a2q -mno-qpx | FileCheck -check-prefix=CHECK-ELFv1-PIC %s -// RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ // RUN: -mabi=elfv2 | FileCheck -check-prefix=CHECK-ELFv2-PIC %s // RUN: %clang -fPIC -target powerpc64le-unknown-linux-gnu %s -### -o %t.o 2>&1 \ @@ -69,8 +51,6 @@ // CHECK-ELFv1-PIC: "-mrelocation-model" "pic" "-pic-level" 
"2" // CHECK-ELFv1-PIC: "-target-abi" "elfv1" -// CHECK-ELFv1-QPX-PIC: "-mrelocation-model" "pic" "-pic-level" "2" -// CHECK-ELFv1-QPX-PIC: "-target-abi" "elfv1-qpx" // CHECK-ELFv2-PIC: "-mrelocation-model" "pic" "-pic-level" "2" // CHECK-ELFv2-PIC: "-target-abi" "elfv2" diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c index 3a376a7caab46..bf6eaefe0b3ca 100644 --- a/clang/test/Misc/target-invalid-cpu-note.c +++ b/clang/test/Misc/target-invalid-cpu-note.c @@ -79,7 +79,7 @@ // PPC: error: unknown target CPU 'not-a-cpu' // PPC: note: valid target CPU values are: generic, 440, 450, 601, 602, 603, // PPC-SAME: 603e, 603ev, 604, 604e, 620, 630, g3, 7400, g4, 7450, g4+, 750, -// PPC-SAME: 8548, 970, g5, a2, a2q, e500, e500mc, e5500, power3, pwr3, power4, +// PPC-SAME: 8548, 970, g5, a2, e500, e500mc, e5500, power3, pwr3, power4, // PPC-SAME: pwr4, power5, pwr5, power5x, pwr5x, power6, pwr6, power6x, pwr6x, // PPC-SAME: power7, pwr7, power8, pwr8, power9, pwr9, power10, pwr10, powerpc, ppc, powerpc64, // PPC-SAME: ppc64, powerpc64le, ppc64le, future diff --git a/clang/test/Preprocessor/init-ppc64.c b/clang/test/Preprocessor/init-ppc64.c index ed8601636554e..48d35c95aa570 100644 --- a/clang/test/Preprocessor/init-ppc64.c +++ b/clang/test/Preprocessor/init-ppc64.c @@ -408,21 +408,6 @@ // PPC64LE:#define __ppc64__ 1 // PPC64LE:#define __ppc__ 1 // -// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu a2q -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCA2Q %s -// -// PPCA2Q:#define _ARCH_A2 1 -// PPCA2Q:#define _ARCH_A2Q 1 -// PPCA2Q:#define _ARCH_PPC 1 -// PPCA2Q:#define _ARCH_PPC64 1 -// PPCA2Q:#define _ARCH_QP 1 -// -// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-bgq-linux -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCBGQ %s -// -// PPCBGQ:#define __THW_BLUEGENE__ 1 -// PPCBGQ:#define __TOS_BGQ__ 1 -// PPCBGQ:#define __bg__ 1 -// PPCBGQ:#define __bgq__ 1 -// // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu 630 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPC630 %s // // PPC630:#define _ARCH_630 1 @@ -1069,7 +1054,6 @@ // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64-unknown-linux-gnu < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64-unknown-linux-gnu -target-abi elfv1 < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s -// RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64-unknown-linux-gnu -target-abi elfv1-qpx < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64-unknown-linux-gnu -target-abi elfv2 < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv2 %s // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64le-unknown-linux-gnu < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv2 %s // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64le-unknown-linux-gnu -target-abi elfv1 < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 6b9c5c6899819..af93a6ed5c56e 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -4310,14 +4310,9 @@ PowerPC: - ``r``: A 32 or 
64-bit integer register. - ``b``: A 32 or 64-bit integer register, excluding ``R0`` (that is: ``R1-R31``). -- ``f``: A 32 or 64-bit float register (``F0-F31``), or when QPX is enabled, a - 128 or 256-bit QPX register (``Q0-Q31``; aliases the ``F`` registers). -- ``v``: For ``4 x f32`` or ``4 x f64`` types, when QPX is enabled, a - 128 or 256-bit QPX register (``Q0-Q31``), otherwise a 128-bit - altivec vector register (``V0-V31``). - - .. FIXME: is this a bug that v accepts QPX registers? I think this - is supposed to only use the altivec vector registers? +- ``f``: A 32 or 64-bit float register (``F0-F31``), +- ``v``: For ``4 x f32`` or ``4 x f64`` types, a 128-bit altivec vector + register (``V0-V31``). - ``y``: Condition register (``CR0-CR7``). - ``wc``: An individual CR bit in a CR register. diff --git a/llvm/include/llvm/ADT/Triple.h b/llvm/include/llvm/ADT/Triple.h index 6bad18f19244e..c578c097c6f64 100644 --- a/llvm/include/llvm/ADT/Triple.h +++ b/llvm/include/llvm/ADT/Triple.h @@ -142,8 +142,6 @@ class Triple { Apple, PC, SCEI, - BGP, - BGQ, Freescale, IBM, ImaginationTechnologies, @@ -179,7 +177,6 @@ class Triple { Minix, RTEMS, NaCl, // Native Client - CNK, // BG/P Compute-Node Kernel AIX, CUDA, // NVIDIA CUDA NVCL, // NVIDIA OpenCL diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td index 23bcf3ce1959c..853d26c67ee3d 100644 --- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td +++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td @@ -1109,182 +1109,6 @@ def int_ppc_vsx_xxblendvd: GCCBuiltin<"__builtin_vsx_xxblendvd">, [IntrNoMem]>; } -//===----------------------------------------------------------------------===// -// PowerPC QPX Intrinsics. -// - -let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.". - /// PowerPC_QPX_Intrinsic - Base class for all QPX intrinsics. - class PowerPC_QPX_Intrinsic ret_types, - list param_types, - list properties> - : GCCBuiltin, - Intrinsic; -} - -//===----------------------------------------------------------------------===// -// PowerPC QPX Intrinsic Class Definitions. -// - -/// PowerPC_QPX_FF_Intrinsic - A PowerPC intrinsic that takes one v4f64 -/// vector and returns one. These intrinsics have no side effects. -class PowerPC_QPX_FF_Intrinsic - : PowerPC_QPX_Intrinsic; - -/// PowerPC_QPX_FFF_Intrinsic - A PowerPC intrinsic that takes two v4f64 -/// vectors and returns one. These intrinsics have no side effects. -class PowerPC_QPX_FFF_Intrinsic - : PowerPC_QPX_Intrinsic; - -/// PowerPC_QPX_FFFF_Intrinsic - A PowerPC intrinsic that takes three v4f64 -/// vectors and returns one. These intrinsics have no side effects. -class PowerPC_QPX_FFFF_Intrinsic - : PowerPC_QPX_Intrinsic; - -/// PowerPC_QPX_Load_Intrinsic - A PowerPC intrinsic that takes a pointer -/// and returns a v4f64. -class PowerPC_QPX_Load_Intrinsic - : PowerPC_QPX_Intrinsic; - -/// PowerPC_QPX_LoadPerm_Intrinsic - A PowerPC intrinsic that takes a pointer -/// and returns a v4f64 permutation. -class PowerPC_QPX_LoadPerm_Intrinsic - : PowerPC_QPX_Intrinsic; - -/// PowerPC_QPX_Store_Intrinsic - A PowerPC intrinsic that takes a pointer -/// and stores a v4f64. -class PowerPC_QPX_Store_Intrinsic - : PowerPC_QPX_Intrinsic; - -//===----------------------------------------------------------------------===// -// PowerPC QPX Intrinsic Definitions. - -let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.". 
- // Add Instructions - def int_ppc_qpx_qvfadd : PowerPC_QPX_FFF_Intrinsic<"qvfadd">; - def int_ppc_qpx_qvfadds : PowerPC_QPX_FFF_Intrinsic<"qvfadds">; - def int_ppc_qpx_qvfsub : PowerPC_QPX_FFF_Intrinsic<"qvfsub">; - def int_ppc_qpx_qvfsubs : PowerPC_QPX_FFF_Intrinsic<"qvfsubs">; - - // Estimate Instructions - def int_ppc_qpx_qvfre : PowerPC_QPX_FF_Intrinsic<"qvfre">; - def int_ppc_qpx_qvfres : PowerPC_QPX_FF_Intrinsic<"qvfres">; - def int_ppc_qpx_qvfrsqrte : PowerPC_QPX_FF_Intrinsic<"qvfrsqrte">; - def int_ppc_qpx_qvfrsqrtes : PowerPC_QPX_FF_Intrinsic<"qvfrsqrtes">; - - // Multiply Instructions - def int_ppc_qpx_qvfmul : PowerPC_QPX_FFF_Intrinsic<"qvfmul">; - def int_ppc_qpx_qvfmuls : PowerPC_QPX_FFF_Intrinsic<"qvfmuls">; - def int_ppc_qpx_qvfxmul : PowerPC_QPX_FFF_Intrinsic<"qvfxmul">; - def int_ppc_qpx_qvfxmuls : PowerPC_QPX_FFF_Intrinsic<"qvfxmuls">; - - // Multiply-add instructions - def int_ppc_qpx_qvfmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfmadd">; - def int_ppc_qpx_qvfmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfmadds">; - def int_ppc_qpx_qvfnmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfnmadd">; - def int_ppc_qpx_qvfnmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfnmadds">; - def int_ppc_qpx_qvfmsub : PowerPC_QPX_FFFF_Intrinsic<"qvfmsub">; - def int_ppc_qpx_qvfmsubs : PowerPC_QPX_FFFF_Intrinsic<"qvfmsubs">; - def int_ppc_qpx_qvfnmsub : PowerPC_QPX_FFFF_Intrinsic<"qvfnmsub">; - def int_ppc_qpx_qvfnmsubs : PowerPC_QPX_FFFF_Intrinsic<"qvfnmsubs">; - def int_ppc_qpx_qvfxmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxmadd">; - def int_ppc_qpx_qvfxmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxmadds">; - def int_ppc_qpx_qvfxxnpmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxxnpmadd">; - def int_ppc_qpx_qvfxxnpmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxxnpmadds">; - def int_ppc_qpx_qvfxxcpnmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxxcpnmadd">; - def int_ppc_qpx_qvfxxcpnmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxxcpnmadds">; - def int_ppc_qpx_qvfxxmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxxmadd">; - def int_ppc_qpx_qvfxxmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxxmadds">; - - // Select Instruction - def int_ppc_qpx_qvfsel : PowerPC_QPX_FFFF_Intrinsic<"qvfsel">; - - // Permute Instruction - def int_ppc_qpx_qvfperm : PowerPC_QPX_FFFF_Intrinsic<"qvfperm">; - - // Convert and Round Instructions - def int_ppc_qpx_qvfctid : PowerPC_QPX_FF_Intrinsic<"qvfctid">; - def int_ppc_qpx_qvfctidu : PowerPC_QPX_FF_Intrinsic<"qvfctidu">; - def int_ppc_qpx_qvfctidz : PowerPC_QPX_FF_Intrinsic<"qvfctidz">; - def int_ppc_qpx_qvfctiduz : PowerPC_QPX_FF_Intrinsic<"qvfctiduz">; - def int_ppc_qpx_qvfctiw : PowerPC_QPX_FF_Intrinsic<"qvfctiw">; - def int_ppc_qpx_qvfctiwu : PowerPC_QPX_FF_Intrinsic<"qvfctiwu">; - def int_ppc_qpx_qvfctiwz : PowerPC_QPX_FF_Intrinsic<"qvfctiwz">; - def int_ppc_qpx_qvfctiwuz : PowerPC_QPX_FF_Intrinsic<"qvfctiwuz">; - def int_ppc_qpx_qvfcfid : PowerPC_QPX_FF_Intrinsic<"qvfcfid">; - def int_ppc_qpx_qvfcfidu : PowerPC_QPX_FF_Intrinsic<"qvfcfidu">; - def int_ppc_qpx_qvfcfids : PowerPC_QPX_FF_Intrinsic<"qvfcfids">; - def int_ppc_qpx_qvfcfidus : PowerPC_QPX_FF_Intrinsic<"qvfcfidus">; - def int_ppc_qpx_qvfrsp : PowerPC_QPX_FF_Intrinsic<"qvfrsp">; - def int_ppc_qpx_qvfriz : PowerPC_QPX_FF_Intrinsic<"qvfriz">; - def int_ppc_qpx_qvfrin : PowerPC_QPX_FF_Intrinsic<"qvfrin">; - def int_ppc_qpx_qvfrip : PowerPC_QPX_FF_Intrinsic<"qvfrip">; - def int_ppc_qpx_qvfrim : PowerPC_QPX_FF_Intrinsic<"qvfrim">; - - // Move Instructions - def int_ppc_qpx_qvfneg : PowerPC_QPX_FF_Intrinsic<"qvfneg">; - def int_ppc_qpx_qvfabs : PowerPC_QPX_FF_Intrinsic<"qvfabs">; - def 
int_ppc_qpx_qvfnabs : PowerPC_QPX_FF_Intrinsic<"qvfnabs">; - def int_ppc_qpx_qvfcpsgn : PowerPC_QPX_FFF_Intrinsic<"qvfcpsgn">; - - // Compare Instructions - def int_ppc_qpx_qvftstnan : PowerPC_QPX_FFF_Intrinsic<"qvftstnan">; - def int_ppc_qpx_qvfcmplt : PowerPC_QPX_FFF_Intrinsic<"qvfcmplt">; - def int_ppc_qpx_qvfcmpgt : PowerPC_QPX_FFF_Intrinsic<"qvfcmpgt">; - def int_ppc_qpx_qvfcmpeq : PowerPC_QPX_FFF_Intrinsic<"qvfcmpeq">; - - // Load instructions - def int_ppc_qpx_qvlfd : PowerPC_QPX_Load_Intrinsic<"qvlfd">; - def int_ppc_qpx_qvlfda : PowerPC_QPX_Load_Intrinsic<"qvlfda">; - def int_ppc_qpx_qvlfs : PowerPC_QPX_Load_Intrinsic<"qvlfs">; - def int_ppc_qpx_qvlfsa : PowerPC_QPX_Load_Intrinsic<"qvlfsa">; - - def int_ppc_qpx_qvlfcda : PowerPC_QPX_Load_Intrinsic<"qvlfcda">; - def int_ppc_qpx_qvlfcd : PowerPC_QPX_Load_Intrinsic<"qvlfcd">; - def int_ppc_qpx_qvlfcsa : PowerPC_QPX_Load_Intrinsic<"qvlfcsa">; - def int_ppc_qpx_qvlfcs : PowerPC_QPX_Load_Intrinsic<"qvlfcs">; - def int_ppc_qpx_qvlfiwaa : PowerPC_QPX_Load_Intrinsic<"qvlfiwaa">; - def int_ppc_qpx_qvlfiwa : PowerPC_QPX_Load_Intrinsic<"qvlfiwa">; - def int_ppc_qpx_qvlfiwza : PowerPC_QPX_Load_Intrinsic<"qvlfiwza">; - def int_ppc_qpx_qvlfiwz : PowerPC_QPX_Load_Intrinsic<"qvlfiwz">; - - def int_ppc_qpx_qvlpcld : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcld">; - def int_ppc_qpx_qvlpcls : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcls">; - def int_ppc_qpx_qvlpcrd : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcrd">; - def int_ppc_qpx_qvlpcrs : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcrs">; - - // Store instructions - def int_ppc_qpx_qvstfd : PowerPC_QPX_Store_Intrinsic<"qvstfd">; - def int_ppc_qpx_qvstfda : PowerPC_QPX_Store_Intrinsic<"qvstfda">; - def int_ppc_qpx_qvstfs : PowerPC_QPX_Store_Intrinsic<"qvstfs">; - def int_ppc_qpx_qvstfsa : PowerPC_QPX_Store_Intrinsic<"qvstfsa">; - - def int_ppc_qpx_qvstfcda : PowerPC_QPX_Store_Intrinsic<"qvstfcda">; - def int_ppc_qpx_qvstfcd : PowerPC_QPX_Store_Intrinsic<"qvstfcd">; - def int_ppc_qpx_qvstfcsa : PowerPC_QPX_Store_Intrinsic<"qvstfcsa">; - def int_ppc_qpx_qvstfcs : PowerPC_QPX_Store_Intrinsic<"qvstfcs">; - def int_ppc_qpx_qvstfiwa : PowerPC_QPX_Store_Intrinsic<"qvstfiwa">; - def int_ppc_qpx_qvstfiw : PowerPC_QPX_Store_Intrinsic<"qvstfiw">; - - // Logical and permutation formation - def int_ppc_qpx_qvflogical : PowerPC_QPX_Intrinsic<"qvflogical", - [llvm_v4f64_ty], - [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i32_ty], - [IntrNoMem]>; - def int_ppc_qpx_qvgpci : PowerPC_QPX_Intrinsic<"qvgpci", - [llvm_v4f64_ty], [llvm_i32_ty], [IntrNoMem]>; -} - //===----------------------------------------------------------------------===// // PowerPC HTM Intrinsic Definitions. 
diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp index fec1985ccacae..72648273b4cd5 100644 --- a/llvm/lib/Support/Triple.cpp +++ b/llvm/lib/Support/Triple.cpp @@ -160,8 +160,6 @@ StringRef Triple::getVendorTypeName(VendorType Kind) { case AMD: return "amd"; case Apple: return "apple"; - case BGP: return "bgp"; - case BGQ: return "bgq"; case CSR: return "csr"; case Freescale: return "fsl"; case IBM: return "ibm"; @@ -187,7 +185,6 @@ StringRef Triple::getOSTypeName(OSType Kind) { case AMDHSA: return "amdhsa"; case AMDPAL: return "amdpal"; case Ananas: return "ananas"; - case CNK: return "cnk"; case CUDA: return "cuda"; case CloudABI: return "cloudabi"; case Contiki: return "contiki"; @@ -470,8 +467,6 @@ static Triple::VendorType parseVendor(StringRef VendorName) { .Case("apple", Triple::Apple) .Case("pc", Triple::PC) .Case("scei", Triple::SCEI) - .Case("bgp", Triple::BGP) - .Case("bgq", Triple::BGQ) .Case("fsl", Triple::Freescale) .Case("ibm", Triple::IBM) .Case("img", Triple::ImaginationTechnologies) @@ -508,7 +503,6 @@ static Triple::OSType parseOS(StringRef OSName) { .StartsWith("minix", Triple::Minix) .StartsWith("rtems", Triple::RTEMS) .StartsWith("nacl", Triple::NaCl) - .StartsWith("cnk", Triple::CNK) .StartsWith("aix", Triple::AIX) .StartsWith("cuda", Triple::CUDA) .StartsWith("nvcl", Triple::NVCL) diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 13fd7d05ab9f4..81008d3ea5662 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -492,21 +492,6 @@ struct PPCOperand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createReg(VSSRegs[getVSReg()])); } - void addRegQFRCOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createReg(QFRegs[getReg()])); - } - - void addRegQSRCOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createReg(QFRegs[getReg()])); - } - - void addRegQBRCOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createReg(QFRegs[getReg()])); - } - void addRegSPE4RCOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(RRegs[getReg()])); @@ -1207,9 +1192,6 @@ bool PPCAsmParser::MatchRegisterName(unsigned &RegNo, int64_t &IntVal) { } else if (Name.startswith_lower("v") && !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { RegNo = VRegs[IntVal]; - } else if (Name.startswith_lower("q") && - !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { - RegNo = QFRegs[IntVal]; } else if (Name.startswith_lower("cr") && !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 8) { RegNo = CRRegs[IntVal]; diff --git a/llvm/lib/Target/PowerPC/CMakeLists.txt b/llvm/lib/Target/PowerPC/CMakeLists.txt index 91021d4e584e1..5a06faa16be19 100644 --- a/llvm/lib/Target/PowerPC/CMakeLists.txt +++ b/llvm/lib/Target/PowerPC/CMakeLists.txt @@ -36,7 +36,6 @@ add_llvm_target(PowerPCCodeGen PPCMacroFusion.cpp PPCMIPeephole.cpp PPCRegisterInfo.cpp - PPCQPXLoadSplat.cpp PPCSubtarget.cpp PPCTargetMachine.cpp PPCTargetObjectFile.cpp diff --git a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 74c6fd3733f03..362ddf7204557 100644 --- 
a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -167,12 +167,6 @@ static DecodeStatus DecodeG8RC_NOX0RegisterClass(MCInst &Inst, uint64_t RegNo, #define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass #define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass -static DecodeStatus DecodeQFRCRegisterClass(MCInst &Inst, uint64_t RegNo, - uint64_t Address, - const void *Decoder) { - return decodeRegisterClass(Inst, RegNo, QFRegs); -} - static DecodeStatus DecodeSPERCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { @@ -401,14 +395,9 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // Read the instruction in the proper endianness. uint64_t Inst = ReadFunc(Bytes.data()); - if (STI.getFeatureBits()[PPC::FeatureQPX]) { - DecodeStatus result = - decodeInstruction(DecoderTableQPX32, MI, Inst, Address, this, STI); - if (result != MCDisassembler::Fail) - return result; - } else if (STI.getFeatureBits()[PPC::FeatureSPE]) { + if (STI.getFeatureBits()[PPC::FeatureSPE]) { DecodeStatus result = - decodeInstruction(DecoderTableSPE32, MI, Inst, Address, this, STI); + decodeInstruction(DecoderTableSPE32, MI, Inst, Address, this, STI); if (result != MCDisassembler::Fail) return result; } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp index 222bf2fa82836..ce1a43a0c25b2 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp @@ -49,18 +49,6 @@ FullRegNamesWithPercent("ppc-reg-with-percent-prefix", cl::Hidden, void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { const char *RegName = getRegisterName(RegNo); - if (RegName[0] == 'q' /* QPX */) { - // The system toolchain on the BG/Q does not understand QPX register names - // in .cfi_* directives, so print the name of the floating-point - // subregister instead. 
- std::string RN(RegName); - - RN[0] = 'f'; - OS << RN; - - return; - } - OS << RegName; } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index 719e005d98135..325ede0fc17ac 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -159,7 +159,6 @@ using llvm::MCPhysReg; static const MCPhysReg SPERegs[32] = PPC_REGS0_31(PPC::S); \ static const MCPhysReg VFRegs[32] = PPC_REGS0_31(PPC::VF); \ static const MCPhysReg VRegs[32] = PPC_REGS0_31(PPC::V); \ - static const MCPhysReg QFRegs[32] = PPC_REGS0_31(PPC::QF); \ static const MCPhysReg RRegsNoR0[32] = \ PPC_REGS_NO0_31(PPC::ZERO, PPC::R); \ static const MCPhysReg XRegsNoX0[32] = \ diff --git a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h index 3106290442afa..24a9d419d3ea5 100644 --- a/llvm/lib/Target/PowerPC/PPC.h +++ b/llvm/lib/Target/PowerPC/PPC.h @@ -44,7 +44,6 @@ namespace llvm { FunctionPass *createPPCMIPeepholePass(); FunctionPass *createPPCBranchSelectionPass(); FunctionPass *createPPCBranchCoalescingPass(); - FunctionPass *createPPCQPXLoadSplatPass(); FunctionPass *createPPCISelDag(PPCTargetMachine &TM, CodeGenOpt::Level OL); FunctionPass *createPPCTLSDynamicCallPass(); FunctionPass *createPPCBoolRetToIntPass(); @@ -68,7 +67,6 @@ namespace llvm { void initializePPCReduceCRLogicalsPass(PassRegistry&); void initializePPCBSelPass(PassRegistry&); void initializePPCBranchCoalescingPass(PassRegistry&); - void initializePPCQPXLoadSplatPass(PassRegistry&); void initializePPCBoolRetToIntPass(PassRegistry&); void initializePPCExpandISELPass(PassRegistry &); void initializePPCPreEmitPeepholePass(PassRegistry &); diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index 9ad78bf67fe6c..adb9366217d51 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -132,9 +132,6 @@ def FeaturePPC4xx : SubtargetFeature<"ppc4xx", "IsPPC4xx", "true", "Enable PPC 4xx instructions">; def FeaturePPC6xx : SubtargetFeature<"ppc6xx", "IsPPC6xx", "true", "Enable PPC 6xx instructions">; -def FeatureQPX : SubtargetFeature<"qpx","HasQPX", "true", - "Enable QPX instructions", - [FeatureFPU]>; def FeatureVSX : SubtargetFeature<"vsx","HasVSX", "true", "Enable VSX instructions", [FeatureAltivec]>; @@ -193,7 +190,7 @@ def FeatureFloat128 : def FeaturePOPCNTD : SubtargetFeature<"popcntd","HasPOPCNTD", "POPCNTD_Fast", "Enable the popcnt[dw] instructions">; -// Note that for the a2/a2q processor models we should not use popcnt[dw] by +// Note that for the a2 processor models we should not use popcnt[dw] by // default. These processors do support the instructions, but they're // microcoded, and the software emulation is about twice as fast. 
def FeatureSlowPOPCNTD : SubtargetFeature<"slow-popcntd","HasPOPCNTD", @@ -514,15 +511,6 @@ def : ProcessorModel<"a2", PPCA2Model, FeatureFPRND, FeatureFPCVT, FeatureISEL, FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX, Feature64Bit /*, Feature64BitRegs */, FeatureMFTB]>; -def : ProcessorModel<"a2q", PPCA2Model, - [DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF, - FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES, - FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec, - FeatureSTFIWX, FeatureLFIWAX, - FeatureFPRND, FeatureFPCVT, FeatureISEL, - FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX, - Feature64Bit /*, Feature64BitRegs */, FeatureQPX, - FeatureMFTB]>; def : ProcessorModel<"pwr3", G5Model, [DirectivePwr3, FeatureAltivec, FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF, diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 540e620a845bc..5affddd8d147a 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -549,9 +549,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { if (Subtarget->hasSPE()) { if (PPC::F4RCRegClass.contains(Reg) || PPC::F8RCRegClass.contains(Reg) || - PPC::QBRCRegClass.contains(Reg) || - PPC::QFRCRegClass.contains(Reg) || - PPC::QSRCRegClass.contains(Reg) || PPC::VFRCRegClass.contains(Reg) || PPC::VRRCRegClass.contains(Reg) || PPC::VSFRCRegClass.contains(Reg) || diff --git a/llvm/lib/Target/PowerPC/PPCCallingConv.td b/llvm/lib/Target/PowerPC/PPCCallingConv.td index 1eaa7f7a44b39..9a15490f1fb0d 100644 --- a/llvm/lib/Target/PowerPC/PPCCallingConv.td +++ b/llvm/lib/Target/PowerPC/PPCCallingConv.td @@ -61,9 +61,6 @@ def RetCC_PPC_Cold : CallingConv<[ CCIfType<[f64], CCAssignToReg<[F1]>>, CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2]>>>, - CCIfType<[v4f64, v4f32, v4i1], - CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1]>>>, - CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2]>>> @@ -98,10 +95,6 @@ def RetCC_PPC : CallingConv<[ CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>, - // QPX vectors are returned in QF1 and QF2. - CCIfType<[v4f64, v4f32, v4i1], - CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>, - // Vector types returned as "direct" go into V2 .. V9; note that only the // ELFv2 ABI fully utilizes all these registers. CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], @@ -158,8 +151,6 @@ def RetCC_PPC64_ELF_FIS : CallingConv<[ CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>, - CCIfType<[v4f64, v4f32, v4i1], - CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>, CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>> @@ -223,9 +214,6 @@ def CC_PPC32_SVR4_Common : CallingConv<[ CCIfType<[f32], CCIfSubtarget<"hasSPE()", CCAssignToStack<4, 4>>>, CCIfType<[f64], CCIfSubtarget<"hasSPE()", CCAssignToStack<8, 8>>>, - // QPX vectors that are stored in double precision need 32-byte alignment. - CCIfType<[v4f64, v4i1], CCAssignToStack<32, 32>>, - // Vectors and float128 get 16-byte stack slots that are 16-byte aligned. 
CCIfType<[v16i8, v8i16, v4i32, v4f32, v2f64, v2i64], CCAssignToStack<16, 16>>, CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToStack<16, 16>>> @@ -243,10 +231,6 @@ def CC_PPC32_SVR4_VarArg : CallingConv<[ // put vector arguments in vector registers before putting them on the stack. let Entry = 1 in def CC_PPC32_SVR4 : CallingConv<[ - // QPX vectors mirror the scalar FP convention. - CCIfType<[v4f64, v4f32, v4i1], CCIfSubtarget<"hasQPX()", - CCAssignToReg<[QF1, QF2, QF3, QF4, QF5, QF6, QF7, QF8]>>>, - // The first 12 Vector arguments are passed in AltiVec registers. CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 8ffd89ef5ccd2..3e218e14d8d44 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -4142,7 +4142,7 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) { // Altivec Vector compare instructions do not set any CR register by default and // vector compare operations return the same type as the operands. if (LHS.getValueType().isVector()) { - if (Subtarget->hasQPX() || Subtarget->hasSPE()) + if (Subtarget->hasSPE()) return false; EVT VecVT = LHS.getValueType(); @@ -4813,8 +4813,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) { assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load"); switch (LoadedVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Invalid PPC load type!"); - case MVT::v4f64: Opcode = PPC::QVLFDUX; break; // QPX - case MVT::v4f32: Opcode = PPC::QVLFSUX; break; // QPX case MVT::f64: Opcode = PPC::LFDUX; break; case MVT::f32: Opcode = PPC::LFSUX; break; case MVT::i32: Opcode = PPC::LWZUX; break; @@ -5095,12 +5093,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) { SelectCCOp = PPC::SELECT_CC_F16; else if (Subtarget->hasSPE()) SelectCCOp = PPC::SELECT_CC_SPE; - else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4f64) - SelectCCOp = PPC::SELECT_CC_QFRC; - else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4f32) - SelectCCOp = PPC::SELECT_CC_QSRC; - else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4i1) - SelectCCOp = PPC::SELECT_CC_QBRC; else if (N->getValueType(0) == MVT::v2f64 || N->getValueType(0) == MVT::v2i64) SelectCCOp = PPC::SELECT_CC_VSRC; @@ -5856,9 +5848,6 @@ void PPCDAGToDAGISel::PeepholeCROps() { case PPC::SELECT_I8: case PPC::SELECT_F4: case PPC::SELECT_F8: - case PPC::SELECT_QFRC: - case PPC::SELECT_QSRC: - case PPC::SELECT_QBRC: case PPC::SELECT_SPE: case PPC::SELECT_SPE4: case PPC::SELECT_VRRC: @@ -6177,9 +6166,6 @@ void PPCDAGToDAGISel::PeepholeCROps() { case PPC::SELECT_I8: case PPC::SELECT_F4: case PPC::SELECT_F8: - case PPC::SELECT_QFRC: - case PPC::SELECT_QSRC: - case PPC::SELECT_QBRC: case PPC::SELECT_SPE: case PPC::SELECT_SPE4: case PPC::SELECT_VRRC: diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index ae840a9fa37de..db3833d595797 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1095,161 +1095,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, } } - if (Subtarget.hasQPX()) { - setOperationAction(ISD::FADD, MVT::v4f64, Legal); - setOperationAction(ISD::FSUB, MVT::v4f64, Legal); - setOperationAction(ISD::FMUL, MVT::v4f64, Legal); - setOperationAction(ISD::FREM, MVT::v4f64, Expand); - - setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, 
Legal); - setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand); - - setOperationAction(ISD::LOAD , MVT::v4f64, Custom); - setOperationAction(ISD::STORE , MVT::v4f64, Custom); - - setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom); - - if (!Subtarget.useCRBits()) - setOperationAction(ISD::SELECT, MVT::v4f64, Expand); - setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal); - setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand); - setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand); - setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom); - - setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal); - setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand); - - setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal); - setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal); - - setOperationAction(ISD::FNEG , MVT::v4f64, Legal); - setOperationAction(ISD::FABS , MVT::v4f64, Legal); - setOperationAction(ISD::FSIN , MVT::v4f64, Expand); - setOperationAction(ISD::FCOS , MVT::v4f64, Expand); - setOperationAction(ISD::FPOW , MVT::v4f64, Expand); - setOperationAction(ISD::FLOG , MVT::v4f64, Expand); - setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand); - setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand); - setOperationAction(ISD::FEXP , MVT::v4f64, Expand); - setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand); - - setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal); - setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal); - - setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal); - setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal); - - addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass); - - setOperationAction(ISD::FADD, MVT::v4f32, Legal); - setOperationAction(ISD::FSUB, MVT::v4f32, Legal); - setOperationAction(ISD::FMUL, MVT::v4f32, Legal); - setOperationAction(ISD::FREM, MVT::v4f32, Expand); - - setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal); - setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand); - - setOperationAction(ISD::LOAD , MVT::v4f32, Custom); - setOperationAction(ISD::STORE , MVT::v4f32, Custom); - - if (!Subtarget.useCRBits()) - setOperationAction(ISD::SELECT, MVT::v4f32, Expand); - setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal); - setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand); - setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand); - setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); - - setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal); - setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand); - - setOperationAction(ISD::FNEG , MVT::v4f32, Legal); - setOperationAction(ISD::FABS , MVT::v4f32, Legal); - setOperationAction(ISD::FSIN , MVT::v4f32, Expand); - setOperationAction(ISD::FCOS , MVT::v4f32, Expand); - setOperationAction(ISD::FPOW , MVT::v4f32, Expand); - setOperationAction(ISD::FLOG , MVT::v4f32, Expand); - setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand); - setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand); - 
setOperationAction(ISD::FEXP , MVT::v4f32, Expand); - setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand); - - setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); - - setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal); - setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal); - - addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass); - - setOperationAction(ISD::AND , MVT::v4i1, Legal); - setOperationAction(ISD::OR , MVT::v4i1, Legal); - setOperationAction(ISD::XOR , MVT::v4i1, Legal); - - if (!Subtarget.useCRBits()) - setOperationAction(ISD::SELECT, MVT::v4i1, Expand); - setOperationAction(ISD::VSELECT, MVT::v4i1, Legal); - - setOperationAction(ISD::LOAD , MVT::v4i1, Custom); - setOperationAction(ISD::STORE , MVT::v4i1, Custom); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand); - setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand); - setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); - - setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom); - - addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass); - - setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); - setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); - setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); - setOperationAction(ISD::FROUND, MVT::v4f64, Legal); - - setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); - setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); - setOperationAction(ISD::FROUND, MVT::v4f32, Legal); - - setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand); - setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); - - // These need to set FE_INEXACT, and so cannot be vectorized here. - setOperationAction(ISD::FRINT, MVT::v4f64, Expand); - setOperationAction(ISD::FRINT, MVT::v4f32, Expand); - - if (TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FDIV, MVT::v4f64, Legal); - setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); - - setOperationAction(ISD::FDIV, MVT::v4f32, Legal); - setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); - } else { - setOperationAction(ISD::FDIV, MVT::v4f64, Expand); - setOperationAction(ISD::FSQRT, MVT::v4f64, Expand); - - setOperationAction(ISD::FDIV, MVT::v4f32, Expand); - setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); - } - - // TODO: Handle constrained floating-point operations of v4f64 - } - if (Subtarget.has64BitSupport()) setOperationAction(ISD::PREFETCH, MVT::Other, Legal); @@ -1438,8 +1283,8 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty, // 16byte and wider vectors are passed on 16byte boundary. // The rest is 8 on PPC64 and 4 on PPC32 boundary. Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4); - if (Subtarget.hasAltivec() || Subtarget.hasQPX()) - getMaxByValAlign(Ty, Alignment, Subtarget.hasQPX() ? 
Align(32) : Align(16)); + if (Subtarget.hasAltivec()) + getMaxByValAlign(Ty, Alignment, Align(16)); return Alignment.value(); } @@ -1577,12 +1422,6 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; case PPCISD::VABSD: return "PPCISD::VABSD"; - case PPCISD::QVFPERM: return "PPCISD::QVFPERM"; - case PPCISD::QVGPCI: return "PPCISD::QVGPCI"; - case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI"; - case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI"; - case PPCISD::QBFLT: return "PPCISD::QBFLT"; - case PPCISD::QVLFSb: return "PPCISD::QVLFSb"; case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64"; case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE"; @@ -1601,9 +1440,6 @@ EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C, if (!VT.isVector()) return Subtarget.useCRBits() ? MVT::i1 : MVT::i32; - if (Subtarget.hasQPX()) - return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements()); - return VT.changeVectorElementTypeToInteger(); } @@ -2777,16 +2613,9 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, return false; } - // PowerPC doesn't have preinc load/store instructions for vectors (except - // for QPX, which does have preinc r+r forms). - if (VT.isVector()) { - if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) { - return false; - } else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) { - AM = ISD::PRE_INC; - return true; - } - } + // PowerPC doesn't have preinc load/store instructions for vectors + if (VT.isVector()) + return false; if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) { // Common code will reject creating a pre-inc form if the base pointer @@ -3508,11 +3337,6 @@ static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13}; -/// QFPR - The set of QPX registers that should be allocated for arguments. -static const MCPhysReg QFPR[] = { - PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7, - PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13}; - /// CalculateStackSlotSize - Calculates the size reserved for this argument on /// the stack. static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags, @@ -3542,10 +3366,6 @@ static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 || ArgVT == MVT::v1i128 || ArgVT == MVT::f128) Alignment = Align(16); - // QPX vector types stored in double-precision are padded to a 32 byte - // boundary. - else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1) - Alignment = Align(32); // ByVal parameters are aligned as requested. if (Flags.isByVal()) { @@ -3577,14 +3397,11 @@ static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, /// stack slot (instead of being passed in registers). ArgOffset, /// AvailableFPRs, and AvailableVRs must hold the current argument /// position, and will be updated to account for this argument. 
-static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
-                                   ISD::ArgFlagsTy Flags,
-                                   unsigned PtrByteSize,
-                                   unsigned LinkageSize,
-                                   unsigned ParamAreaSize,
-                                   unsigned &ArgOffset,
+static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, ISD::ArgFlagsTy Flags,
+                                   unsigned PtrByteSize, unsigned LinkageSize,
+                                   unsigned ParamAreaSize, unsigned &ArgOffset,
                                    unsigned &AvailableFPRs,
-                                   unsigned &AvailableVRs, bool HasQPX) {
+                                   unsigned &AvailableVRs) {
   bool UseMemory = false;
 
   // Respect alignment of argument on the stack.
@@ -3608,11 +3425,7 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
   // However, if the argument is actually passed in an FPR or a VR,
   // we don't use memory after all.
   if (!Flags.isByVal()) {
-    if (ArgVT == MVT::f32 || ArgVT == MVT::f64 ||
-        // QPX registers overlap with the scalar FP registers.
-        (HasQPX && (ArgVT == MVT::v4f32 ||
-                    ArgVT == MVT::v4f64 ||
-                    ArgVT == MVT::v4i1)))
+    if (ArgVT == MVT::f32 || ArgVT == MVT::f64)
       if (AvailableFPRs > 0) {
         --AvailableFPRs;
         return false;
       }
@@ -3751,18 +3564,12 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
       RC = &PPC::VRRCRegClass;
       break;
     case MVT::v4f32:
-      RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass;
+      RC = &PPC::VRRCRegClass;
       break;
     case MVT::v2f64:
     case MVT::v2i64:
       RC = &PPC::VRRCRegClass;
       break;
-    case MVT::v4f64:
-      RC = &PPC::QFRCRegClass;
-      break;
-    case MVT::v4i1:
-      RC = &PPC::QBRCRegClass;
-      break;
     }
 
     SDValue ArgValue;
@@ -3961,7 +3768,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
   const unsigned Num_GPR_Regs = array_lengthof(GPR);
   const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
   const unsigned Num_VR_Regs = array_lengthof(VR);
-  const unsigned Num_QFPR_Regs = Num_FPR_Regs;
 
   // Do a first pass over the arguments to determine whether the ABI
   // guarantees that our caller has allocated the parameter save area
@@ -3980,8 +3786,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
     if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,
                                PtrByteSize, LinkageSize, ParamAreaSize,
-                               NumBytes, AvailableFPRs, AvailableVRs,
-                               Subtarget.hasQPX()))
+                               NumBytes, AvailableFPRs, AvailableVRs))
       HasParameterArea = true;
   }
 
@@ -3991,7 +3796,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
   unsigned ArgOffset = LinkageSize;
   unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
-  unsigned &QFPR_idx = FPR_idx;
   SmallVector<SDValue, 8> MemOps;
   Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
   unsigned CurArgIdx = 0;
@@ -4234,51 +4038,20 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
     case MVT::v2i64:
     case MVT::v1i128:
     case MVT::f128:
-      if (!Subtarget.hasQPX()) {
-        // These can be scalar arguments or elements of a vector array type
-        // passed directly. The latter are used to implement ELFv2 homogenous
-        // vector aggregates.
-        if (VR_idx != Num_VR_Regs) {
-          unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
-          ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
-          ++VR_idx;
-        } else {
-          if (CallConv == CallingConv::Fast)
-            ComputeArgOffset();
-          needsLoad = true;
-        }
-        if (CallConv != CallingConv::Fast || needsLoad)
-          ArgOffset += 16;
-        break;
-      } // not QPX
-
-      assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 &&
-             "Invalid QPX parameter type");
-      LLVM_FALLTHROUGH;
-
-    case MVT::v4f64:
-    case MVT::v4i1:
-      // QPX vectors are treated like their scalar floating-point subregisters
-      // (except that they're larger).
-      unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ?
16 : 32; - if (QFPR_idx != Num_QFPR_Regs) { - const TargetRegisterClass *RC; - switch (ObjectVT.getSimpleVT().SimpleTy) { - case MVT::v4f64: RC = &PPC::QFRCRegClass; break; - case MVT::v4f32: RC = &PPC::QSRCRegClass; break; - default: RC = &PPC::QBRCRegClass; break; - } - - unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC); + // These can be scalar arguments or elements of a vector array type + // passed directly. The latter are used to implement ELFv2 homogenous + // vector aggregates. + if (VR_idx != Num_VR_Regs) { + unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); - ++QFPR_idx; + ++VR_idx; } else { if (CallConv == CallingConv::Fast) ComputeArgOffset(); needsLoad = true; } if (CallConv != CallingConv::Fast || needsLoad) - ArgOffset += Sz; + ArgOffset += 16; break; } @@ -4831,10 +4604,9 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget, for (const ISD::OutputArg& Param : Outs) { if (Param.Flags.isNest()) continue; - if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, - PtrByteSize, LinkageSize, ParamAreaSize, - NumBytes, AvailableFPRs, AvailableVRs, - Subtarget.hasQPX())) + if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, PtrByteSize, + LinkageSize, ParamAreaSize, NumBytes, + AvailableFPRs, AvailableVRs)) return true; } return false; @@ -6064,7 +5836,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); unsigned NumBytes = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; - unsigned &QFPR_idx = FPR_idx; static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, @@ -6078,7 +5849,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( const unsigned NumGPRs = array_lengthof(GPR); const unsigned NumFPRs = useSoftFloat() ? 0 : 13; const unsigned NumVRs = array_lengthof(VR); - const unsigned NumQFPRs = NumFPRs; // On ELFv2, we can avoid allocating the parameter area if all the arguments // can be passed to the callee in registers. @@ -6093,9 +5863,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( for (unsigned i = 0; i != NumOps; ++i) { if (Outs[i].Flags.isNest()) continue; if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags, - PtrByteSize, LinkageSize, ParamAreaSize, - NumBytesTmp, AvailableFPRs, AvailableVRs, - Subtarget.hasQPX())) + PtrByteSize, LinkageSize, ParamAreaSize, + NumBytesTmp, AvailableFPRs, AvailableVRs)) HasParameterArea = true; } } @@ -6143,20 +5912,11 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( continue; break; case MVT::v4f32: - // When using QPX, this is handled like a FP register, otherwise, it - // is an Altivec register. - if (Subtarget.hasQPX()) { - if (++NumFPRsUsed <= NumFPRs) - continue; - } else { - if (++NumVRsUsed <= NumVRs) - continue; - } + if (++NumVRsUsed <= NumVRs) + continue; break; case MVT::f32: case MVT::f64: - case MVT::v4f64: // QPX - case MVT::v4i1: // QPX if (++NumFPRsUsed <= NumFPRs) continue; break; @@ -6518,7 +6278,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( case MVT::v2i64: case MVT::v1i128: case MVT::f128: - if (!Subtarget.hasQPX()) { // These can be scalar arguments or elements of a vector array type // passed directly. The latter are used to implement ELFv2 homogenous // vector aggregates. 
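// Aside: a reduced sketch of the register-vs-memory decision these hunks
// keep rewriting (simplified names, not the exact code):
//   if (VR_idx != Num_VR_Regs)
//     RegsToPass.push_back({VR[VR_idx++], Arg}); // a free Altivec reg remains
//   else
//     needsLoad = true;                          // use the parameter save area
//   if (CallConv != CallingConv::Fast || needsLoad)
//     ArgOffset += 16;                           // vectors take 16-byte slots
// With QPX gone, v4f32 no longer competes with f32/f64 for the 13 FPRs and
// always takes the Altivec path.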
@@ -6574,63 +6333,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
       if (!IsFastCall)
         ArgOffset += 16;
       break;
-      } // not QPX
-
-      assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 &&
-             "Invalid QPX parameter type");
-
-      LLVM_FALLTHROUGH;
-    case MVT::v4f64:
-    case MVT::v4i1: {
-      bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32;
-      if (CFlags.IsVarArg) {
-        assert(HasParameterArea &&
-               "Parameter area must exist if we have a varargs call.");
-        // We could elide this store in the case where the object fits
-        // entirely in R registers. Maybe later.
-        SDValue Store =
-            DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
-        MemOpChains.push_back(Store);
-        if (QFPR_idx != NumQFPRs) {
-          SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store,
-                                     PtrOff, MachinePointerInfo());
-          MemOpChains.push_back(Load.getValue(1));
-          RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load));
-        }
-        ArgOffset += (IsF32 ? 16 : 32);
-        for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) {
-          if (GPR_idx == NumGPRs)
-            break;
-          SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
-                                   DAG.getConstant(i, dl, PtrVT));
-          SDValue Load =
-              DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
-          MemOpChains.push_back(Load.getValue(1));
-          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
-        }
-        break;
-      }
-
-      // Non-varargs QPX params go into registers or on the stack.
-      if (QFPR_idx != NumQFPRs) {
-        RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg));
-      } else {
-        if (IsFastCall)
-          ComputePtrOff();
-
-        assert(HasParameterArea &&
-               "Parameter area must exist to pass an argument in memory.");
-        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
-                         true, CFlags.IsTailCall, true, MemOpChains,
-                         TailCallArguments, dl);
-        if (IsFastCall)
-          ArgOffset += (IsF32 ? 16 : 32);
-      }
-
-      if (!IsFastCall)
-        ArgOffset += (IsF32 ? 16 : 32);
-      break;
-    }
     }
   }
 
@@ -7301,8 +7003,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
   const PPCSubtarget &Subtarget =
       static_cast<const PPCSubtarget &>(DAG.getSubtarget());
-  if (Subtarget.hasQPX())
-    report_fatal_error("QPX support is not supported on AIX.");
 
   const bool IsPPC64 = Subtarget.isPPC64();
   const unsigned PtrByteSize = IsPPC64 ?
8 : 4;
@@ -7522,8 +7222,6 @@ SDValue PPCTargetLowering::LowerCall_AIX(
   const PPCSubtarget& Subtarget =
       static_cast<const PPCSubtarget&>(DAG.getSubtarget());
-  if (Subtarget.hasQPX())
-    report_fatal_error("QPX is not supported on AIX.");
   if (Subtarget.hasAltivec())
     report_fatal_error("Altivec support is unimplemented on AIX.");
 
@@ -7991,8 +7689,6 @@ SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
 }
 
 SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
-  if (Op.getValueType().isVector())
-    return LowerVectorLoad(Op, DAG);
 
   assert(Op.getValueType() == MVT::i1 &&
          "Custom lowering only for i1 loads");
@@ -8016,9 +7712,6 @@ SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
 }
 
 SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
-  if (Op.getOperand(1).getValueType().isVector())
-    return LowerVectorStore(Op, DAG);
-
   assert(Op.getOperand(1).getValueType() == MVT::i1 &&
          "Custom lowering only for i1 stores");
@@ -8595,27 +8288,6 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
   if (Op.getValueType() == MVT::f128)
     return Op;
 
-  if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) {
-    if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64)
-      return SDValue();
-
-    SDValue Value = Op.getOperand(0);
-    // The values are now known to be -1 (false) or 1 (true). To convert this
-    // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
-    // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
-    Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
-
-    SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
-
-    Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
-
-    if (Op.getValueType() != MVT::v4f64)
-      Value = DAG.getNode(ISD::FP_ROUND, dl,
-                          Op.getValueType(), Value,
-                          DAG.getIntPtrConstant(1, dl));
-    return Value;
-  }
-
   // Don't handle ppc_fp128 here; let it be lowered to a libcall.
   if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
     return SDValue();
@@ -9184,110 +8856,6 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
   assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
 
-  if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) {
-    // We first build an i32 vector, load it into a QPX register,
-    // then convert it to a floating-point vector and compare it
-    // to a zero vector to get the boolean result.
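// Aside: the (V+1.0)*0.5 identity the deleted QPX code relies on is exact
// and easy to check with a standalone snippet (hypothetical test, not part
// of this patch):
//   #include <cassert>
//   #include <cmath>
//   int main() {
//     assert(std::fma(-1.0, 0.5, 0.5) == 0.0); // QPX false (-1) -> 0
//     assert(std::fma(1.0, 0.5, 0.5) == 1.0);  // QPX true  (+1) -> 1
//   }
// Both 0.5 and 1.0 are exactly representable, so no rounding occurs.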
-    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
-    int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
-    MachinePointerInfo PtrInfo =
-        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
-    EVT PtrVT = getPointerTy(DAG.getDataLayout());
-    SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
-
-    assert(BVN->getNumOperands() == 4 &&
-           "BUILD_VECTOR for v4i1 does not have 4 operands");
-
-    bool IsConst = true;
-    for (unsigned i = 0; i < 4; ++i) {
-      if (BVN->getOperand(i).isUndef()) continue;
-      if (!isa<ConstantSDNode>(BVN->getOperand(i))) {
-        IsConst = false;
-        break;
-      }
-    }
-
-    if (IsConst) {
-      Constant *One =
-          ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0);
-      Constant *NegOne =
-          ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0);
-
-      Constant *CV[4];
-      for (unsigned i = 0; i < 4; ++i) {
-        if (BVN->getOperand(i).isUndef())
-          CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext()));
-        else if (isNullConstant(BVN->getOperand(i)))
-          CV[i] = NegOne;
-        else
-          CV[i] = One;
-      }
-
-      Constant *CP = ConstantVector::get(CV);
-      SDValue CPIdx =
-          DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), Align(16));
-
-      SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
-      SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other});
-      return DAG.getMemIntrinsicNode(
-          PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32,
-          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
-    }
-
-    SmallVector<SDValue, 4> Stores;
-    for (unsigned i = 0; i < 4; ++i) {
-      if (BVN->getOperand(i).isUndef()) continue;
-
-      unsigned Offset = 4*i;
-      SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
-      Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
-
-      unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize();
-      if (StoreSize > 4) {
-        Stores.push_back(
-            DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx,
-                              PtrInfo.getWithOffset(Offset), MVT::i32));
-      } else {
-        SDValue StoreValue = BVN->getOperand(i);
-        if (StoreSize < 4)
-          StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue);
-
-        Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx,
-                                      PtrInfo.getWithOffset(Offset)));
-      }
-    }
-
-    SDValue StoreChain;
-    if (!Stores.empty())
-      StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
-    else
-      StoreChain = DAG.getEntryNode();
-
-    // Now load from v4i32 into the QPX register; this will extend it to
-    // v4i64 but not yet convert it to a floating point. Nevertheless, this
-    // is typed as v4f64 because the QPX register integer states are not
-    // explicitly represented.
-
-    SDValue Ops[] = {StoreChain,
-                     DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32),
-                     FIdx};
-    SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other});
-
-    SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN,
-                                                 dl, VTs, Ops, MVT::v4i32, PtrInfo);
-    LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
-        DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32),
-        LoadedVect);
-
-    SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64);
-
-    return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ);
-  }
-
-  // All other QPX vectors are handled by generic code.
-  if (Subtarget.hasQPX())
-    return SDValue();
-
   // Check if this is a splat of a constant value.
APInt APSplatBits, APSplatUndef; unsigned SplatBitSize; @@ -10080,42 +9648,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, } } - if (Subtarget.hasQPX()) { - if (VT.getVectorNumElements() != 4) - return SDValue(); - - if (V2.isUndef()) V2 = V1; - - int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp); - if (AlignIdx != -1) { - return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2, - DAG.getConstant(AlignIdx, dl, MVT::i32)); - } else if (SVOp->isSplat()) { - int SplatIdx = SVOp->getSplatIndex(); - if (SplatIdx >= 4) { - std::swap(V1, V2); - SplatIdx -= 4; - } - - return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1, - DAG.getConstant(SplatIdx, dl, MVT::i32)); - } - - // Lower this into a qvgpci/qvfperm pair. - - // Compute the qvgpci literal - unsigned idx = 0; - for (unsigned i = 0; i < 4; ++i) { - int m = SVOp->getMaskElt(i); - unsigned mm = m >= 0 ? (unsigned) m : i; - idx |= mm << (3-i)*3; - } - - SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64, - DAG.getConstant(idx, dl, MVT::i32)); - return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3); - } - // Cases that are handled by instructions that take permute immediates // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be // selected by the instruction selector. @@ -10703,279 +10235,6 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, return Op; } -SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - SDNode *N = Op.getNode(); - - assert(N->getOperand(0).getValueType() == MVT::v4i1 && - "Unknown extract_vector_elt type"); - - SDValue Value = N->getOperand(0); - - // The first part of this is like the store lowering except that we don't - // need to track the chain. - - // The values are now known to be -1 (false) or 1 (true). To convert this - // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). - // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 - Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); - - // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to - // understand how to form the extending load. - SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); - - Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); - - // Now convert to an integer and store. - Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, - DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), - Value); - - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - int FrameIdx = MFI.CreateStackObject(16, Align(16), false); - MachinePointerInfo PtrInfo = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - - SDValue StoreChain = DAG.getEntryNode(); - SDValue Ops[] = {StoreChain, - DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), - Value, FIdx}; - SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); - - StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, - dl, VTs, Ops, MVT::v4i32, PtrInfo); - - // Extract the value requested. 
-  unsigned Offset = 4*cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
-  SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
-  Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
-
-  SDValue IntVal =
-      DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset));
-
-  if (!Subtarget.useCRBits())
-    return IntVal;
-
-  return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal);
-}
-
-/// Lowering for QPX v4i1 loads
-SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  SDLoc dl(Op);
-  LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
-  SDValue LoadChain = LN->getChain();
-  SDValue BasePtr = LN->getBasePtr();
-
-  if (Op.getValueType() == MVT::v4f64 ||
-      Op.getValueType() == MVT::v4f32) {
-    EVT MemVT = LN->getMemoryVT();
-    unsigned Alignment = LN->getAlignment();
-
-    // If this load is properly aligned, then it is legal.
-    if (Alignment >= MemVT.getStoreSize())
-      return Op;
-
-    EVT ScalarVT = Op.getValueType().getScalarType(),
-        ScalarMemVT = MemVT.getScalarType();
-    unsigned Stride = ScalarMemVT.getStoreSize();
-
-    SDValue Vals[4], LoadChains[4];
-    for (unsigned Idx = 0; Idx < 4; ++Idx) {
-      SDValue Load;
-      if (ScalarVT != ScalarMemVT)
-        Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain,
-                              BasePtr,
-                              LN->getPointerInfo().getWithOffset(Idx * Stride),
-                              ScalarMemVT, MinAlign(Alignment, Idx * Stride),
-                              LN->getMemOperand()->getFlags(), LN->getAAInfo());
-      else
-        Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr,
-                           LN->getPointerInfo().getWithOffset(Idx * Stride),
-                           MinAlign(Alignment, Idx * Stride),
-                           LN->getMemOperand()->getFlags(), LN->getAAInfo());
-
-      if (Idx == 0 && LN->isIndexed()) {
-        assert(LN->getAddressingMode() == ISD::PRE_INC &&
-               "Unknown addressing mode on vector load");
-        Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(),
-                                  LN->getAddressingMode());
-      }
-
-      Vals[Idx] = Load;
-      LoadChains[Idx] = Load.getValue(1);
-
-      BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
-                            DAG.getConstant(Stride, dl,
-                                            BasePtr.getValueType()));
-    }
-
-    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
-    SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals);
-
-    if (LN->isIndexed()) {
-      SDValue RetOps[] = { Value, Vals[0].getValue(1), TF };
-      return DAG.getMergeValues(RetOps, dl);
-    }
-
-    SDValue RetOps[] = { Value, TF };
-    return DAG.getMergeValues(RetOps, dl);
-  }
-
-  assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower");
-  assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported");
-
-  // To lower v4i1 from a byte array, we load the byte elements of the
-  // vector and then reuse the BUILD_VECTOR logic.
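// Aside: the scalarization above is plain base + Idx * Stride address
// arithmetic. The same idea outside SelectionDAG (hypothetical helper, for
// illustration only):
//   template <typename T, unsigned N>
//   void loadElements(const char *Base, T (&Out)[N]) {
//     for (unsigned I = 0; I != N; ++I)   // one scalar load per element
//       std::memcpy(&Out[I], Base + I * sizeof(T), sizeof(T));
//   }
// Each scalar access only needs MinAlign(Alignment, Idx * Stride), which is
// why an under-aligned v4f64/v4f32 load could still be expanded legally.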
-
-  SDValue VectElmts[4], VectElmtChains[4];
-  for (unsigned i = 0; i < 4; ++i) {
-    SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
-    Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);
-
-    VectElmts[i] = DAG.getExtLoad(
-        ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx,
-        LN->getPointerInfo().getWithOffset(i), MVT::i8,
-        /* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo());
-    VectElmtChains[i] = VectElmts[i].getValue(1);
-  }
-
-  LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains);
-  SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts);
-
-  SDValue RVals[] = { Value, LoadChain };
-  return DAG.getMergeValues(RVals, dl);
-}
-
-/// Lowering for QPX v4i1 stores
-SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  SDLoc dl(Op);
-  StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
-  SDValue StoreChain = SN->getChain();
-  SDValue BasePtr = SN->getBasePtr();
-  SDValue Value = SN->getValue();
-
-  if (Value.getValueType() == MVT::v4f64 ||
-      Value.getValueType() == MVT::v4f32) {
-    EVT MemVT = SN->getMemoryVT();
-    unsigned Alignment = SN->getAlignment();
-
-    // If this store is properly aligned, then it is legal.
-    if (Alignment >= MemVT.getStoreSize())
-      return Op;
-
-    EVT ScalarVT = Value.getValueType().getScalarType(),
-        ScalarMemVT = MemVT.getScalarType();
-    unsigned Stride = ScalarMemVT.getStoreSize();
-
-    SDValue Stores[4];
-    for (unsigned Idx = 0; Idx < 4; ++Idx) {
-      SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value,
-                               DAG.getVectorIdxConstant(Idx, dl));
-      SDValue Store;
-      if (ScalarVT != ScalarMemVT)
-        Store =
-            DAG.getTruncStore(StoreChain, dl, Ex, BasePtr,
-                              SN->getPointerInfo().getWithOffset(Idx * Stride),
-                              ScalarMemVT, MinAlign(Alignment, Idx * Stride),
-                              SN->getMemOperand()->getFlags(), SN->getAAInfo());
-      else
-        Store = DAG.getStore(StoreChain, dl, Ex, BasePtr,
-                             SN->getPointerInfo().getWithOffset(Idx * Stride),
-                             MinAlign(Alignment, Idx * Stride),
-                             SN->getMemOperand()->getFlags(), SN->getAAInfo());
-
-      if (Idx == 0 && SN->isIndexed()) {
-        assert(SN->getAddressingMode() == ISD::PRE_INC &&
-               "Unknown addressing mode on vector store");
-        Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(),
-                                    SN->getAddressingMode());
-      }
-
-      BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
-                            DAG.getConstant(Stride, dl,
-                                            BasePtr.getValueType()));
-      Stores[Idx] = Store;
-    }
-
-    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
-
-    if (SN->isIndexed()) {
-      SDValue RetOps[] = { TF, Stores[0].getValue(1) };
-      return DAG.getMergeValues(RetOps, dl);
-    }
-
-    return TF;
-  }
-
-  assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported");
-  assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower");
-
-  // The values are now known to be -1 (false) or 1 (true). To convert this
-  // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
-  // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
-  Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
-
-  // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
-  // understand how to form the extending load.
-  SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
-
-  Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
-
-  // Now convert to an integer and store.
- Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, - DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), - Value); - - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - int FrameIdx = MFI.CreateStackObject(16, Align(16), false); - MachinePointerInfo PtrInfo = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - - SDValue Ops[] = {StoreChain, - DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), - Value, FIdx}; - SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); - - StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, - dl, VTs, Ops, MVT::v4i32, PtrInfo); - - // Move data into the byte array. - SDValue Loads[4], LoadChains[4]; - for (unsigned i = 0; i < 4; ++i) { - unsigned Offset = 4*i; - SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); - Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); - - Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx, - PtrInfo.getWithOffset(Offset)); - LoadChains[i] = Loads[i].getValue(1); - } - - StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); - - SDValue Stores[4]; - for (unsigned i = 0; i < 4; ++i) { - SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); - Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); - - Stores[i] = DAG.getTruncStore( - StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i), - MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(), - SN->getAAInfo()); - } - - StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); - - return StoreChain; -} - SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); if (Op.getValueType() == MVT::v4i32) { @@ -11204,7 +10463,6 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); - case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::ABS: return LowerABS(Op, DAG); @@ -12148,9 +11406,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } else if (MI.getOpcode() == PPC::SELECT_CC_F4 || MI.getOpcode() == PPC::SELECT_CC_F8 || MI.getOpcode() == PPC::SELECT_CC_F16 || - MI.getOpcode() == PPC::SELECT_CC_QFRC || - MI.getOpcode() == PPC::SELECT_CC_QSRC || - MI.getOpcode() == PPC::SELECT_CC_QBRC || MI.getOpcode() == PPC::SELECT_CC_VRRC || MI.getOpcode() == PPC::SELECT_CC_VSFRC || MI.getOpcode() == PPC::SELECT_CC_VSSRC || @@ -12160,9 +11415,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 || MI.getOpcode() == PPC::SELECT_F16 || - MI.getOpcode() == PPC::SELECT_QFRC || - MI.getOpcode() == PPC::SELECT_QSRC || - MI.getOpcode() == PPC::SELECT_QBRC || MI.getOpcode() == PPC::SELECT_SPE || MI.getOpcode() == PPC::SELECT_SPE4 || MI.getOpcode() == PPC::SELECT_VRRC || @@ -12200,9 +11452,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.getOpcode() == PPC::SELECT_F16 || MI.getOpcode() == PPC::SELECT_SPE4 || MI.getOpcode() == PPC::SELECT_SPE || - MI.getOpcode() == PPC::SELECT_QFRC || - MI.getOpcode() == PPC::SELECT_QSRC || - 
MI.getOpcode() == PPC::SELECT_QBRC ||
             MI.getOpcode() == PPC::SELECT_VRRC ||
             MI.getOpcode() == PPC::SELECT_VSFRC ||
             MI.getOpcode() == PPC::SELECT_VSSRC ||
@@ -12895,9 +12144,7 @@ SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
   if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
       (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
       (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
-      (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
-      (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
-      (VT == MVT::v4f64 && Subtarget.hasQPX())) {
+      (VT == MVT::v2f64 && Subtarget.hasVSX())) {
     if (RefinementSteps == ReciprocalEstimate::Unspecified)
       RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
@@ -12916,9 +12163,7 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
   if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
       (VT == MVT::f64 && Subtarget.hasFRE()) ||
       (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
-      (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
-      (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
-      (VT == MVT::v4f64 && Subtarget.hasQPX())) {
+      (VT == MVT::v2f64 && Subtarget.hasVSX())) {
     if (RefinementSteps == ReciprocalEstimate::Unspecified)
       RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
     return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);
@@ -13016,24 +12261,6 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
     EVT VT;
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
     default: return false;
-    case Intrinsic::ppc_qpx_qvlfd:
-    case Intrinsic::ppc_qpx_qvlfda:
-      VT = MVT::v4f64;
-      break;
-    case Intrinsic::ppc_qpx_qvlfs:
-    case Intrinsic::ppc_qpx_qvlfsa:
-      VT = MVT::v4f32;
-      break;
-    case Intrinsic::ppc_qpx_qvlfcd:
-    case Intrinsic::ppc_qpx_qvlfcda:
-      VT = MVT::v2f64;
-      break;
-    case Intrinsic::ppc_qpx_qvlfcs:
-    case Intrinsic::ppc_qpx_qvlfcsa:
-      VT = MVT::v2f32;
-      break;
-    case Intrinsic::ppc_qpx_qvlfiwa:
-    case Intrinsic::ppc_qpx_qvlfiwz:
     case Intrinsic::ppc_altivec_lvx:
     case Intrinsic::ppc_altivec_lvxl:
     case Intrinsic::ppc_vsx_lxvw4x:
@@ -13062,24 +12289,6 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
     EVT VT;
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
     default: return false;
-    case Intrinsic::ppc_qpx_qvstfd:
-    case Intrinsic::ppc_qpx_qvstfda:
-      VT = MVT::v4f64;
-      break;
-    case Intrinsic::ppc_qpx_qvstfs:
-    case Intrinsic::ppc_qpx_qvstfsa:
-      VT = MVT::v4f32;
-      break;
-    case Intrinsic::ppc_qpx_qvstfcd:
-    case Intrinsic::ppc_qpx_qvstfcda:
-      VT = MVT::v2f64;
-      break;
-    case Intrinsic::ppc_qpx_qvstfcs:
-    case Intrinsic::ppc_qpx_qvstfcsa:
-      VT = MVT::v2f32;
-      break;
-    case Intrinsic::ppc_qpx_qvstfiw:
-    case Intrinsic::ppc_qpx_qvstfiwa:
     case Intrinsic::ppc_altivec_stvx:
     case Intrinsic::ppc_altivec_stvxl:
     case Intrinsic::ppc_vsx_stxvw4x:
@@ -15077,18 +14286,14 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     EVT MemVT = LD->getMemoryVT();
     Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
     Align ABIAlignment = DAG.getDataLayout().getABITypeAlign(Ty);
-    Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext());
-    Align ScalarABIAlignment = DAG.getDataLayout().getABITypeAlign(STy);
     if (LD->isUnindexed() && VT.isVector() &&
         ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
          // P8 and later hardware should just use LOAD.
!Subtarget.hasP8Vector() &&
          (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
-          VT == MVT::v4f32)) ||
-         (Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) &&
-          LD->getAlign() >= ScalarABIAlignment)) &&
+          VT == MVT::v4f32))) &&
         LD->getAlign() < ABIAlignment) {
-      // This is a type-legal unaligned Altivec or QPX load.
+      // This is a type-legal unaligned Altivec load.
       SDValue Chain = LD->getChain();
       SDValue Ptr = LD->getBasePtr();
       bool isLittleEndian = Subtarget.isLittleEndian();
@@ -15119,24 +14324,13 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       // optimization later.
       Intrinsic::ID Intr, IntrLD, IntrPerm;
       MVT PermCntlTy, PermTy, LDTy;
-      if (Subtarget.hasAltivec()) {
-        Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr :
-                                Intrinsic::ppc_altivec_lvsl;
-        IntrLD = Intrinsic::ppc_altivec_lvx;
-        IntrPerm = Intrinsic::ppc_altivec_vperm;
-        PermCntlTy = MVT::v16i8;
-        PermTy = MVT::v4i32;
-        LDTy = MVT::v4i32;
-      } else {
-        Intr = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld :
-                                     Intrinsic::ppc_qpx_qvlpcls;
-        IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd :
-                                       Intrinsic::ppc_qpx_qvlfs;
-        IntrPerm = Intrinsic::ppc_qpx_qvfperm;
-        PermCntlTy = MVT::v4f64;
-        PermTy = MVT::v4f64;
-        LDTy = MemVT.getSimpleVT();
-      }
+      Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr
+                            : Intrinsic::ppc_altivec_lvsl;
+      IntrLD = Intrinsic::ppc_altivec_lvx;
+      IntrPerm = Intrinsic::ppc_altivec_vperm;
+      PermCntlTy = MVT::v16i8;
+      PermTy = MVT::v4i32;
+      LDTy = MVT::v4i32;
 
       SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy);
 
@@ -15207,10 +14401,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                   BaseLoad, ExtraLoad, PermCntl, DAG, dl);
 
       if (VT != PermTy)
-        Perm = Subtarget.hasAltivec() ?
-                 DAG.getNode(ISD::BITCAST, dl, VT, Perm) :
-                 DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX
-                             DAG.getTargetConstant(1, dl, MVT::i64));
+        Perm = Subtarget.hasAltivec()
+                   ? DAG.getNode(ISD::BITCAST, dl, VT, Perm)
+                   : DAG.getNode(ISD::FP_ROUND, dl, VT, Perm,
+                                 DAG.getTargetConstant(1, dl, MVT::i64));
                             // second argument is 1 because this rounding
                             // is always exact.
 
@@ -15226,14 +14420,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
     Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr :
                                            Intrinsic::ppc_altivec_lvsl);
-    if ((IID == Intr ||
-         IID == Intrinsic::ppc_qpx_qvlpcld ||
-         IID == Intrinsic::ppc_qpx_qvlpcls) &&
-        N->getOperand(1)->getOpcode() == ISD::ADD) {
+    if (IID == Intr && N->getOperand(1)->getOpcode() == ISD::ADD) {
       SDValue Add = N->getOperand(1);
-      int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ?
-                 5 /* 32 byte alignment */ : 4 /* 16 byte alignment */;
+      int Bits = 4 /* 16 byte alignment */;
 
       if (DAG.MaskedValueIsZero(Add->getOperand(1),
                                 APInt::getAllOnesValue(Bits /* alignment */)
                                     .zext(Add.getScalarValueSizeInBits()))) {
         SDNode *BasePtr = Add->getOperand(0).getNode();
         for (SDNode::use_iterator UI = BasePtr->use_begin(),
              UE = BasePtr->use_end(); UI != UE; ++UI) {
           if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
-              cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == IID) {
+              cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() ==
+                  IID) {
             // We've found another LVSL/LVSR, and this address is an aligned
             // multiple of that one. The results will be the same, so use the
             // one we've just found instead.
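// Aside: the reuse above is safe because lvsl/lvsr compute their permute
// control purely from the low four bits of the address, so two pointers
// differing by a multiple of 16 yield identical controls. In effect:
//   bool SamePermCntl(uint64_t A, uint64_t B) {
//     return (A & 0xF) == (B & 0xF); // 16-byte alignment granule
//   }
// MaskedValueIsZero on the ADD offset proves its low Bits bits are zero,
// which is exactly this condition (QPX used 5 bits for its 32-byte granule).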
@@ -15792,17 +14983,9 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &PPC::F4RCRegClass); if (VT == MVT::f64 || VT == MVT::i64) return std::make_pair(0U, &PPC::F8RCRegClass); - if (VT == MVT::v4f64 && Subtarget.hasQPX()) - return std::make_pair(0U, &PPC::QFRCRegClass); - if (VT == MVT::v4f32 && Subtarget.hasQPX()) - return std::make_pair(0U, &PPC::QSRCRegClass); } break; case 'v': - if (VT == MVT::v4f64 && Subtarget.hasQPX()) - return std::make_pair(0U, &PPC::QFRCRegClass); - if (VT == MVT::v4f32 && Subtarget.hasQPX()) - return std::make_pair(0U, &PPC::QSRCRegClass); if (Subtarget.hasAltivec()) return std::make_pair(0U, &PPC::VRRCRegClass); break; @@ -16094,12 +15277,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineFunction &MF, unsigned Intrinsic) const { switch (Intrinsic) { - case Intrinsic::ppc_qpx_qvlfd: - case Intrinsic::ppc_qpx_qvlfs: - case Intrinsic::ppc_qpx_qvlfcd: - case Intrinsic::ppc_qpx_qvlfcs: - case Intrinsic::ppc_qpx_qvlfiwa: - case Intrinsic::ppc_qpx_qvlfiwz: case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: case Intrinsic::ppc_altivec_lvebx: @@ -16121,18 +15298,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::ppc_vsx_lxvd2x: VT = MVT::v2f64; break; - case Intrinsic::ppc_qpx_qvlfd: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvlfs: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvlfcd: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvlfcs: - VT = MVT::v2f32; - break; default: VT = MVT::v4i32; break; @@ -16147,45 +15312,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOLoad; return true; } - case Intrinsic::ppc_qpx_qvlfda: - case Intrinsic::ppc_qpx_qvlfsa: - case Intrinsic::ppc_qpx_qvlfcda: - case Intrinsic::ppc_qpx_qvlfcsa: - case Intrinsic::ppc_qpx_qvlfiwaa: - case Intrinsic::ppc_qpx_qvlfiwza: { - EVT VT; - switch (Intrinsic) { - case Intrinsic::ppc_qpx_qvlfda: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvlfsa: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvlfcda: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvlfcsa: - VT = MVT::v2f32; - break; - default: - VT = MVT::v4i32; - break; - } - - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = VT; - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.size = VT.getStoreSize(); - Info.align = Align(1); - Info.flags = MachineMemOperand::MOLoad; - return true; - } - case Intrinsic::ppc_qpx_qvstfd: - case Intrinsic::ppc_qpx_qvstfs: - case Intrinsic::ppc_qpx_qvstfcd: - case Intrinsic::ppc_qpx_qvstfcs: - case Intrinsic::ppc_qpx_qvstfiw: case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: case Intrinsic::ppc_altivec_stvebx: @@ -16207,18 +15333,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::ppc_vsx_stxvd2x: VT = MVT::v2f64; break; - case Intrinsic::ppc_qpx_qvstfd: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvstfs: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvstfcd: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvstfcs: - VT = MVT::v2f32; - break; default: VT = MVT::v4i32; break; @@ -16233,39 +15347,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOStore; return true; } - case Intrinsic::ppc_qpx_qvstfda: - case Intrinsic::ppc_qpx_qvstfsa: - case Intrinsic::ppc_qpx_qvstfcda: - case Intrinsic::ppc_qpx_qvstfcsa: - case Intrinsic::ppc_qpx_qvstfiwa: { - 
EVT VT; - switch (Intrinsic) { - case Intrinsic::ppc_qpx_qvstfda: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvstfsa: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvstfcda: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvstfcsa: - VT = MVT::v2f32; - break; - default: - VT = MVT::v4i32; - break; - } - - Info.opc = ISD::INTRINSIC_VOID; - Info.memVT = VT; - Info.ptrVal = I.getArgOperand(1); - Info.offset = 0; - Info.size = VT.getStoreSize(); - Info.align = Align(1); - Info.flags = MachineMemOperand::MOStore; - return true; - } default: break; } @@ -16278,14 +15359,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, EVT PPCTargetLowering::getOptimalMemOpType( const MemOp &Op, const AttributeList &FuncAttributes) const { if (getTargetMachine().getOptLevel() != CodeGenOpt::None) { - // When expanding a memset, require at least two QPX instructions to cover - // the cost of loading the value to be stored from the constant pool. - if (Subtarget.hasQPX() && Op.size() >= 32 && - (Op.isMemcpy() || Op.size() >= 64) && Op.isAligned(Align(32)) && - !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { - return MVT::v4f64; - } - // We should use Altivec/VSX loads and stores when available. For unaligned // addresses, unaligned VSX loads are only fast starting with the P8. if (Subtarget.hasAltivec() && Op.size() >= 16 && @@ -16504,7 +15577,7 @@ PPCTargetLowering::shouldExpandBuildVectorWithShuffles( if (VT == MVT::v2i64) return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves - if (Subtarget.hasVSX() || Subtarget.hasQPX()) + if (Subtarget.hasVSX()) return true; return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues); @@ -16550,8 +15623,7 @@ SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, switch (Opc) { case PPCISD::FNMSUB: - // TODO: QPX subtarget is deprecated. No transformation here. - if (!Op.hasOneUse() || !isTypeLegal(VT) || Subtarget.hasQPX()) + if (!Op.hasOneUse() || !isTypeLegal(VT)) break; const TargetOptions &Options = getTargetMachine().Options; @@ -17032,8 +16104,7 @@ SDValue PPCTargetLowering::combineFMALike(SDNode *N, bool LegalOps = !DCI.isBeforeLegalizeOps(); SDLoc Loc(N); - // TODO: QPX subtarget is deprecated. No transformation here. - if (Subtarget.hasQPX() || !isOperationLegal(ISD::FMA, VT)) + if (!isOperationLegal(ISD::FMA, VT)) return SDValue(); // Allowing transformation to FNMSUB may change sign of zeroes when ab-c=0 diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 768eaa43e0135..8cc42226d7f0b 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -427,22 +427,6 @@ namespace llvm { /// => VABSDUW((XVNEGSP a), (XVNEGSP b)) VABSD, - /// QVFPERM = This corresponds to the QPX qvfperm instruction. - QVFPERM, - - /// QVGPCI = This corresponds to the QPX qvgpci instruction. - QVGPCI, - - /// QVALIGNI = This corresponds to the QPX qvaligni instruction. - QVALIGNI, - - /// QVESPLATI = This corresponds to the QPX qvesplati instruction. - QVESPLATI, - - /// QBFLT = Access the underlying QPX floating-point boolean - /// representation. - QBFLT, - /// FP_EXTEND_HALF(VECTOR, IDX) - Custom extend upper (IDX=0) half or /// lower (IDX=1) half of v4f32 to v2f64. FP_EXTEND_HALF, @@ -519,10 +503,6 @@ namespace llvm { /// Store scalar integers from VSR. ST_VSR_SCAL_INT, - /// QBRC, CHAIN = QVLFSb CHAIN, Ptr - /// The 4xf32 load used for v4i1 constants. 
-    QVLFSb,
-
     /// ATOMIC_CMP_SWAP - the exact same as the target-independent nodes
     /// except they ensure that the compare input is zero-extended for
     /// sub-word versions because the atomic loads zero-extend.
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
index 632d4d9deb8a2..5ff5fc78326ba 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
@@ -642,7 +642,6 @@ class XForm_17a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   let FRA = 0;
 }
 
-// Used for QPX
 class XForm_18<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
                InstrItinClass itin, list<dag> pattern>
   : I<opcode, OOL, IOL, asmstr, itin> {
   let FRA = 0;
 }
@@ -1781,14 +1780,6 @@ class AForm_4<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
   let Inst{31} = 0;
 }
 
-// Used for QPX
-class AForm_4a<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
-               InstrItinClass itin, list<dag> pattern>
-  : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
-  let FRA = 0;
-  let FRC = 0;
-}
-
 // 1.7.13 M-Form
 class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
               InstrItinClass itin, list<dag> pattern>
@@ -2099,49 +2090,6 @@ class VX_RD5_RSp5_PS1_XO9<bits<9> xo, dag OOL, dag IOL, string asmstr,
   let Inst{23-31} = xo;
 }
 
-// Z23-Form (used by QPX)
-class Z23Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
-                InstrItinClass itin, list<dag> pattern>
-  : I<opcode, OOL, IOL, asmstr, itin> {
-  bits<5> FRT;
-  bits<5> FRA;
-  bits<5> FRB;
-  bits<2> idx;
-
-  let Pattern = pattern;
-
-  bit RC = 0; // set by isRecordForm
-
-  let Inst{6-10} = FRT;
-  let Inst{11-15} = FRA;
-  let Inst{16-20} = FRB;
-  let Inst{21-22} = idx;
-  let Inst{23-30} = xo;
-  let Inst{31} = RC;
-}
-
-class Z23Form_2<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
-                InstrItinClass itin, list<dag> pattern>
-  : Z23Form_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
-  let FRB = 0;
-}
-
-class Z23Form_3<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
-                InstrItinClass itin, list<dag> pattern>
-  : I<opcode, OOL, IOL, asmstr, itin> {
-  bits<5> FRT;
-  bits<12> idx;
-
-  let Pattern = pattern;
-
-  bit RC = 0; // set by isRecordForm
-
-  let Inst{6-10} = FRT;
-  let Inst{11-22} = idx;
-  let Inst{23-30} = xo;
-  let Inst{31} = RC;
-}
-
 class Z23Form_8<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
                 InstrItinClass itin, list<dag> pattern>
   : I<opcode, OOL, IOL, asmstr, itin> {
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 99e25bb130ce4..34618ed058050 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -259,14 +259,6 @@ bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
   case PPC::XVMULDP:
   case PPC::XVMULSP:
   case PPC::XSMULSP:
-  // QPX Add:
-  case PPC::QVFADD:
-  case PPC::QVFADDS:
-  case PPC::QVFADDSs:
-  // QPX Multiply:
-  case PPC::QVFMUL:
-  case PPC::QVFMULS:
-  case PPC::QVFMULSs:
     return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
            Inst.getFlag(MachineInstr::MIFlag::FmNsz);
   // Fixed point:
@@ -300,9 +292,7 @@ static const uint16_t FMAOpIdxInfo[][5] = {
     {PPC::XVMADDADP, PPC::XVADDDP, PPC::XVMULDP, 1, 2},
     {PPC::XVMADDASP, PPC::XVADDSP, PPC::XVMULSP, 1, 2},
     {PPC::FMADD, PPC::FADD, PPC::FMUL, 3, 1},
-    {PPC::FMADDS, PPC::FADDS, PPC::FMULS, 3, 1},
-    {PPC::QVFMADDSs, PPC::QVFADDSs, PPC::QVFMULSs, 3, 1},
-    {PPC::QVFMADD, PPC::QVFADD, PPC::QVFMUL, 3, 1}};
+    {PPC::FMADDS, PPC::FADDS, PPC::FMULS, 3, 1}};
 
 // Check if an opcode is a FMA instruction. If it is, return the index in array
 // FMAOpIdxInfo. Otherwise, return -1.
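// Aside: a sketch of how a table like FMAOpIdxInfo is typically scanned
// (hypothetical lookup, mirroring the comment above):
//   static int getFMAOpIdxInfo(unsigned Opcode) {
//     for (unsigned I = 0; I < array_lengthof(FMAOpIdxInfo); ++I)
//       if (FMAOpIdxInfo[I][0] == Opcode) // col 0: the FMA opcode
//         return I;                       // cols 1-2: add/mul, 3-4: operand idxs
//     return -1;
//   }
// Because the table is scanned rather than indexed by opcode, dropping the
// two QVFMADD rows needs no other bookkeeping.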
@@ -666,7 +656,6 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case PPC::LI8: case PPC::LIS: case PPC::LIS8: - case PPC::QVGPCI: case PPC::ADDIStocHA: case PPC::ADDIStocHA8: case PPC::ADDItocL: @@ -1343,12 +1332,6 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, else if (PPC::VSFRCRegClass.contains(DestReg, SrcReg) || PPC::VSSRCRegClass.contains(DestReg, SrcReg)) Opc = (Subtarget.hasP9Vector()) ? PPC::XSCPSGNDP : PPC::XXLORf; - else if (PPC::QFRCRegClass.contains(DestReg, SrcReg)) - Opc = PPC::QVFMR; - else if (PPC::QSRCRegClass.contains(DestReg, SrcReg)) - Opc = PPC::QVFMRs; - else if (PPC::QBRCRegClass.contains(DestReg, SrcReg)) - Opc = PPC::QVFMRb; else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg)) Opc = PPC::CROR; else if (PPC::SPERCRegClass.contains(DestReg, SrcReg)) @@ -1393,12 +1376,6 @@ static unsigned getSpillIndex(const TargetRegisterClass *RC) { OpcodeIndex = SOK_VectorFloat4Spill; } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_VRSaveSpill; - } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_QuadFloat8Spill; - } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_QuadFloat4Spill; - } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_QuadBitSpill; } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_SpillToVSR; } else { diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index 43973c627fcf1..bdcfa76505daf 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -123,9 +123,6 @@ enum SpillOpcodeKey { SOK_VectorFloat8Spill, SOK_VectorFloat4Spill, SOK_VRSaveSpill, - SOK_QuadFloat8Spill, - SOK_QuadFloat4Spill, - SOK_QuadBitSpill, SOK_SpillToVSR, SOK_SPESpill, SOK_LastOpcodeSpill // This must be last on the enum. @@ -136,32 +133,28 @@ enum SpillOpcodeKey { { \ PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXVD2X, PPC::LXSDX, PPC::LXSSPX, \ - PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, PPC::QVLFDXb, \ - PPC::SPILLTOVSR_LD, PPC::EVLDD \ + PPC::RESTORE_VRSAVE, PPC::SPILLTOVSR_LD, PPC::EVLDD \ } #define Pwr9LoadOpcodes \ { \ PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, \ - PPC::DFLOADf32, PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, \ - PPC::QVLFDXb, PPC::SPILLTOVSR_LD \ + PPC::DFLOADf32, PPC::RESTORE_VRSAVE, PPC::SPILLTOVSR_LD \ } #define Pwr8StoreOpcodes \ { \ PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \ PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX, PPC::SPILL_VRSAVE, \ - PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb, PPC::SPILLTOVSR_ST, \ - PPC::EVSTDD \ + PPC::SPILLTOVSR_ST, PPC::EVSTDD \ } #define Pwr9StoreOpcodes \ { \ PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \ PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, \ - PPC::SPILL_VRSAVE, PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb, \ - PPC::SPILLTOVSR_ST \ + PPC::SPILL_VRSAVE, PPC::SPILLTOVSR_ST \ } // Initialize arrays for load and store spill opcodes on supported subtargets. 
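The Pwr8/Pwr9 macros above expand to brace-initialized opcode lists that line up one-to-one with SpillOpcodeKey, so spill/restore selection is a register-class-to-key step (getSpillIndex) followed by an array lookup. A hedged sketch of how such a table is consumed; the wiring below is illustrative, not the in-tree plumbing:

// Illustrative only: the macro is a braced initializer, so it can populate
// an array indexed by SpillOpcodeKey.
static const uint16_t Pwr9LoadTable[SOK_LastOpcodeSpill] = Pwr9LoadOpcodes;

// Restore (reload) opcode for a register class on a Power9 subtarget,
// e.g. a VRSAVE spill maps back to PPC::RESTORE_VRSAVE.
unsigned getRestoreOpcodeFor(const TargetRegisterClass *RC) {
  return Pwr9LoadTable[getSpillIndex(RC)];
}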
@@ -273,10 +266,10 @@ class PPCInstrInfo : public PPCGenInstrInfo {
   }

   static bool isSameClassPhysRegCopy(unsigned Opcode) {
-    unsigned CopyOpcodes[] =
-      { PPC::OR, PPC::OR8, PPC::FMR, PPC::VOR, PPC::XXLOR, PPC::XXLORf,
-        PPC::XSCPSGNDP, PPC::MCRF, PPC::QVFMR, PPC::QVFMRs, PPC::QVFMRb,
-        PPC::CROR, PPC::EVOR, -1U };
+    unsigned CopyOpcodes[] = {PPC::OR, PPC::OR8, PPC::FMR,
+                              PPC::VOR, PPC::XXLOR, PPC::XXLORf,
+                              PPC::XSCPSGNDP, PPC::MCRF, PPC::CROR,
+                              PPC::EVOR, -1U};
     for (int i = 0; CopyOpcodes[i] != -1U; i++)
       if (Opcode == CopyOpcodes[i])
         return true;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index f807d61c75d23..264002c229b9e 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -203,16 +203,6 @@ def PPCvecinsert : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsert, []>;
 def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>;
 def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>;

-def PPCqvfperm : SDNode<"PPCISD::QVFPERM", SDT_PPCqvfperm, []>;
-def PPCqvgpci : SDNode<"PPCISD::QVGPCI", SDT_PPCqvgpci, []>;
-def PPCqvaligni : SDNode<"PPCISD::QVALIGNI", SDT_PPCqvaligni, []>;
-def PPCqvesplati : SDNode<"PPCISD::QVESPLATI", SDT_PPCqvesplati, []>;
-
-def PPCqbflt : SDNode<"PPCISD::QBFLT", SDT_PPCqbflt, []>;
-
-def PPCqvlfsb : SDNode<"PPCISD::QVLFSb", SDT_PPCqvlfsb,
-                       [SDNPHasChain, SDNPMayLoad]>;
-
 def PPCcmpb : SDNode<"PPCISD::CMPB", SDTIntBinOp, []>;

 // These nodes represent the 32-bit PPC shifts that operate on 6-bit shift
@@ -3467,7 +3457,6 @@ include "PPCInstrAltivec.td"
 include "PPCInstrSPE.td"
 include "PPCInstr64Bit.td"
 include "PPCInstrVSX.td"
-include "PPCInstrQPX.td"
 include "PPCInstrHTM.td"

 def crnot : OutPatFrag<(ops node:$in),
diff --git a/llvm/lib/Target/PowerPC/PPCInstrQPX.td b/llvm/lib/Target/PowerPC/PPCInstrQPX.td
deleted file mode 100644
index 2265af2815cb5..0000000000000
--- a/llvm/lib/Target/PowerPC/PPCInstrQPX.td
+++ /dev/null
@@ -1,1212 +0,0 @@
-//===- PPCInstrQPX.td - The PowerPC QPX Extension --*- tablegen -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the QPX extension to the PowerPC instruction set.
-// Reference:
-// Book Q: QPX Architecture Definition. IBM (as updated in) 2011.
-//
-//===----------------------------------------------------------------------===//
-
-def PPCRegQFRCAsmOperand : AsmOperandClass {
-  let Name = "RegQFRC"; let PredicateMethod = "isRegNumber";
-}
-def qfrc : RegisterOperand<QFRC> {
-  let ParserMatchClass = PPCRegQFRCAsmOperand;
-}
-def PPCRegQSRCAsmOperand : AsmOperandClass {
-  let Name = "RegQSRC"; let PredicateMethod = "isRegNumber";
-}
-def qsrc : RegisterOperand<QSRC> {
-  let ParserMatchClass = PPCRegQSRCAsmOperand;
-}
-def PPCRegQBRCAsmOperand : AsmOperandClass {
-  let Name = "RegQBRC"; let PredicateMethod = "isRegNumber";
-}
-def qbrc : RegisterOperand<QBRC> {
-  let ParserMatchClass = PPCRegQBRCAsmOperand;
-}
-
-//===----------------------------------------------------------------------===//
-// Helpers for defining instructions that directly correspond to intrinsics.
-
-// QPXA1_Int - An AForm_1 intrinsic definition.
-class QPXA1_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
-  : AForm_1;
-// QPXA1s_Int - An AForm_1 intrinsic definition (simple instructions).
-class QPXA1s_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
-  : AForm_1;
-// QPXA2_Int - An AForm_2 intrinsic definition.
-class QPXA2_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
-  : AForm_2;
-// QPXA3_Int - An AForm_3 intrinsic definition.
-class QPXA3_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
-  : AForm_3;
-// QPXA4_Int - An AForm_4a intrinsic definition.
-class QPXA4_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
-  : AForm_4a;
-// QPXX18_Int - An XForm_18 intrinsic definition.
-class QPXX18_Int<bits<6> opcode, bits<10> xo, string opc, Intrinsic IntID>
-  : XForm_18;
-// QPXX19_Int - An XForm_19 intrinsic definition.
-class QPXX19_Int<bits<6> opcode, bits<10> xo, string opc, Intrinsic IntID>
-  : XForm_19;
-
-//===----------------------------------------------------------------------===//
-// Pattern Frags.
-
-def extloadv4f32 : PatFrag<(ops node:$ptr), (extload node:$ptr), [{
-  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::v4f32;
-}]>;
-
-def truncstorev4f32 : PatFrag<(ops node:$val, node:$ptr),
-                              (truncstore node:$val, node:$ptr), [{
-  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4f32;
-}]>;
-def pre_truncstv4f32 : PatFrag<(ops node:$val, node:$base, node:$offset),
-                               (pre_truncst node:$val,
-                                            node:$base, node:$offset), [{
-  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4f32;
-}]>;
-
-def fround_inexact : PatFrag<(ops node:$val), (fpround node:$val), [{
-  return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() == 0;
-}]>;
-
-def fround_exact : PatFrag<(ops node:$val), (fpround node:$val), [{
-  return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() == 1;
-}]>;
-
-let FastIselShouldIgnore = 1 in // FastIsel should ignore all u12 instrs.
-  def u12 : ImmLeaf<i32, [{ return isUInt<12>(Imm); }]>;
-
-//===----------------------------------------------------------------------===//
-// Instruction Definitions.
-
-def HasQPX : Predicate<"Subtarget->hasQPX()">;
-let Predicates = [HasQPX] in {
-let DecoderNamespace = "QPX" in {
-let hasSideEffects = 0 in { // QPX instructions don't have side effects.
-let Uses = [RM] in { - // Add Instructions - let isCommutable = 1 in { - def QVFADD : AForm_2<4, 21, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfadd $FRT, $FRA, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (fadd v4f64:$FRA, v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFADDS : QPXA2_Int<0, 21, "qvfadds", int_ppc_qpx_qvfadds>; - def QVFADDSs : AForm_2<0, 21, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfadds $FRT, $FRA, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fadd v4f32:$FRA, v4f32:$FRB))]>; - } - def QVFSUB : AForm_2<4, 20, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfsub $FRT, $FRA, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (fsub v4f64:$FRA, v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFSUBS : QPXA2_Int<0, 20, "qvfsubs", int_ppc_qpx_qvfsubs>; - def QVFSUBSs : AForm_2<0, 20, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfsubs $FRT, $FRA, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fsub v4f32:$FRA, v4f32:$FRB))]>; - - // Estimate Instructions - def QVFRE : AForm_4a<4, 24, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfre $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (PPCfre v4f64:$FRB))]>; - def QVFRES : QPXA4_Int<0, 24, "qvfres", int_ppc_qpx_qvfres>; - let isCodeGenOnly = 1 in - def QVFRESs : AForm_4a<0, 24, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfres $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (PPCfre v4f32:$FRB))]>; - - def QVFRSQRTE : AForm_4a<4, 26, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfrsqrte $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (PPCfrsqrte v4f64:$FRB))]>; - def QVFRSQRTES : QPXA4_Int<0, 26, "qvfrsqrtes", int_ppc_qpx_qvfrsqrtes>; - let isCodeGenOnly = 1 in - def QVFRSQRTESs : AForm_4a<0, 26, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfrsqrtes $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (PPCfrsqrte v4f32:$FRB))]>; - - // Multiply Instructions - let isCommutable = 1 in { - def QVFMUL : AForm_3<4, 25, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC), - "qvfmul $FRT, $FRA, $FRC", IIC_FPGeneral, - [(set v4f64:$FRT, (fmul v4f64:$FRA, v4f64:$FRC))]>; - let isCodeGenOnly = 1 in - def QVFMULS : QPXA3_Int<0, 25, "qvfmuls", int_ppc_qpx_qvfmuls>; - def QVFMULSs : AForm_3<0, 25, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC), - "qvfmuls $FRT, $FRA, $FRC", IIC_FPGeneral, - [(set v4f32:$FRT, (fmul v4f32:$FRA, v4f32:$FRC))]>; - } - def QVFXMUL : QPXA3_Int<4, 17, "qvfxmul", int_ppc_qpx_qvfxmul>; - def QVFXMULS : QPXA3_Int<0, 17, "qvfxmuls", int_ppc_qpx_qvfxmuls>; - - // Multiply-add instructions - def QVFMADD : AForm_1<4, 29, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), - "qvfmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC, v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFMADDS : QPXA1_Int<0, 29, "qvfmadds", int_ppc_qpx_qvfmadds>; - def QVFMADDSs : AForm_1<0, 29, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), - "qvfmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC, v4f32:$FRB))]>; - def QVFNMADD : AForm_1<4, 31, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), - "qvfnmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC, - v4f64:$FRB)))]>; - let isCodeGenOnly = 1 in - def QVFNMADDS : QPXA1_Int<0, 31, "qvfnmadds", int_ppc_qpx_qvfnmadds>; - def QVFNMADDSs : AForm_1<0, 31, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), - "qvfnmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC, - v4f32:$FRB)))]>; - def QVFMSUB : AForm_1<4, 
28, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), - "qvfmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC, - (fneg v4f64:$FRB)))]>; - let isCodeGenOnly = 1 in - def QVFMSUBS : QPXA1_Int<0, 28, "qvfmsubs", int_ppc_qpx_qvfmsubs>; - def QVFMSUBSs : AForm_1<0, 28, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), - "qvfmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC, - (fneg v4f32:$FRB)))]>; - def QVFNMSUB : AForm_1<4, 30, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), - "qvfnmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC, - (fneg v4f64:$FRB))))]>; - let isCodeGenOnly = 1 in - def QVFNMSUBS : QPXA1_Int<0, 30, "qvfnmsubs", int_ppc_qpx_qvfnmsubs>; - def QVFNMSUBSs : AForm_1<0, 30, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), - "qvfnmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC, - (fneg v4f32:$FRB))))]>; - def QVFXMADD : QPXA1_Int<4, 9, "qvfxmadd", int_ppc_qpx_qvfxmadd>; - def QVFXMADDS : QPXA1_Int<0, 9, "qvfxmadds", int_ppc_qpx_qvfxmadds>; - def QVFXXNPMADD : QPXA1_Int<4, 11, "qvfxxnpmadd", int_ppc_qpx_qvfxxnpmadd>; - def QVFXXNPMADDS : QPXA1_Int<0, 11, "qvfxxnpmadds", int_ppc_qpx_qvfxxnpmadds>; - def QVFXXCPNMADD : QPXA1_Int<4, 3, "qvfxxcpnmadd", int_ppc_qpx_qvfxxcpnmadd>; - def QVFXXCPNMADDS : QPXA1_Int<0, 3, "qvfxxcpnmadds", int_ppc_qpx_qvfxxcpnmadds>; - def QVFXXMADD : QPXA1_Int<4, 1, "qvfxxmadd", int_ppc_qpx_qvfxxmadd>; - def QVFXXMADDS : QPXA1_Int<0, 1, "qvfxxmadds", int_ppc_qpx_qvfxxmadds>; - - // Select Instruction - let isCodeGenOnly = 1 in - def QVFSEL : QPXA1s_Int<4, 23, "qvfsel", int_ppc_qpx_qvfsel>; - def QVFSELb : AForm_1<4, 23, (outs qfrc:$FRT), - (ins qbrc:$FRA, qfrc:$FRB, qfrc:$FRC), - "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (vselect v4i1:$FRA, - v4f64:$FRC, v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFSELbs : AForm_1<4, 23, (outs qsrc:$FRT), - (ins qbrc:$FRA, qsrc:$FRB, qsrc:$FRC), - "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (vselect v4i1:$FRA, - v4f32:$FRC, v4f32:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFSELbb: AForm_1<4, 23, (outs qbrc:$FRT), - (ins qbrc:$FRA, qbrc:$FRB, qbrc:$FRC), - "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm, - [(set v4i1:$FRT, (vselect v4i1:$FRA, - v4i1:$FRC, v4i1:$FRB))]>; - - // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after - // instruction selection into a branch sequence. - def SELECT_CC_QFRC: PPCCustomInserterPseudo<(outs qfrc:$dst), (ins crrc:$cond, qfrc:$T, qfrc:$F, - i32imm:$BROPC), "#SELECT_CC_QFRC", - []>; - def SELECT_CC_QSRC: PPCCustomInserterPseudo<(outs qsrc:$dst), (ins crrc:$cond, qsrc:$T, qsrc:$F, - i32imm:$BROPC), "#SELECT_CC_QSRC", - []>; - def SELECT_CC_QBRC: PPCCustomInserterPseudo<(outs qbrc:$dst), (ins crrc:$cond, qbrc:$T, qbrc:$F, - i32imm:$BROPC), "#SELECT_CC_QBRC", - []>; - - // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition - // register bit directly. 
- def SELECT_QFRC: PPCCustomInserterPseudo<(outs qfrc:$dst), (ins crbitrc:$cond, - qfrc:$T, qfrc:$F), "#SELECT_QFRC", - [(set v4f64:$dst, - (select i1:$cond, v4f64:$T, v4f64:$F))]>; - def SELECT_QSRC: PPCCustomInserterPseudo<(outs qsrc:$dst), (ins crbitrc:$cond, - qsrc:$T, qsrc:$F), "#SELECT_QSRC", - [(set v4f32:$dst, - (select i1:$cond, v4f32:$T, v4f32:$F))]>; - def SELECT_QBRC: PPCCustomInserterPseudo<(outs qbrc:$dst), (ins crbitrc:$cond, - qbrc:$T, qbrc:$F), "#SELECT_QBRC", - [(set v4i1:$dst, - (select i1:$cond, v4i1:$T, v4i1:$F))]>; - - // Convert and Round Instructions - def QVFCTID : QPXX19_Int<4, 814, "qvfctid", int_ppc_qpx_qvfctid>; - let isCodeGenOnly = 1 in - def QVFCTIDb : XForm_19<4, 814, (outs qbrc:$FRT), (ins qbrc:$FRB), - "qvfctid $FRT, $FRB", IIC_FPGeneral, []>; - - def QVFCTIDU : QPXX19_Int<4, 942, "qvfctidu", int_ppc_qpx_qvfctidu>; - def QVFCTIDZ : QPXX19_Int<4, 815, "qvfctidz", int_ppc_qpx_qvfctidz>; - def QVFCTIDUZ : QPXX19_Int<4, 943, "qvfctiduz", int_ppc_qpx_qvfctiduz>; - def QVFCTIW : QPXX19_Int<4, 14, "qvfctiw", int_ppc_qpx_qvfctiw>; - def QVFCTIWU : QPXX19_Int<4, 142, "qvfctiwu", int_ppc_qpx_qvfctiwu>; - def QVFCTIWZ : QPXX19_Int<4, 15, "qvfctiwz", int_ppc_qpx_qvfctiwz>; - def QVFCTIWUZ : QPXX19_Int<4, 143, "qvfctiwuz", int_ppc_qpx_qvfctiwuz>; - def QVFCFID : QPXX19_Int<4, 846, "qvfcfid", int_ppc_qpx_qvfcfid>; - let isCodeGenOnly = 1 in - def QVFCFIDb : XForm_19<4, 846, (outs qbrc:$FRT), (ins qbrc:$FRB), - "qvfcfid $FRT, $FRB", IIC_FPGeneral, []>; - - def QVFCFIDU : QPXX19_Int<4, 974, "qvfcfidu", int_ppc_qpx_qvfcfidu>; - def QVFCFIDS : QPXX19_Int<0, 846, "qvfcfids", int_ppc_qpx_qvfcfids>; - def QVFCFIDUS : QPXX19_Int<0, 974, "qvfcfidus", int_ppc_qpx_qvfcfidus>; - - let isCodeGenOnly = 1 in - def QVFRSP : QPXX19_Int<4, 12, "qvfrsp", int_ppc_qpx_qvfrsp>; - def QVFRSPs : XForm_19<4, 12, - (outs qsrc:$FRT), (ins qfrc:$FRB), - "qvfrsp $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fround_inexact v4f64:$FRB))]>; - - def QVFRIZ : XForm_19<4, 424, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfriz $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (ftrunc v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFRIZs : XForm_19<4, 424, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfriz $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (ftrunc v4f32:$FRB))]>; - - def QVFRIN : XForm_19<4, 392, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfrin $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (fround v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFRINs : XForm_19<4, 392, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfrin $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fround v4f32:$FRB))]>; - - def QVFRIP : XForm_19<4, 456, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfrip $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (fceil v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFRIPs : XForm_19<4, 456, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfrip $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fceil v4f32:$FRB))]>; - - def QVFRIM : XForm_19<4, 488, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfrim $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (ffloor v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFRIMs : XForm_19<4, 488, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfrim $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (ffloor v4f32:$FRB))]>; - - // Move Instructions - def QVFMR : XForm_19<4, 72, - (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfmr $FRT, $FRB", IIC_VecPerm, - [/* (set v4f64:$FRT, v4f64:$FRB) */]>; - let isCodeGenOnly = 1 in { - def QVFMRs : XForm_19<4, 72, - (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfmr 
$FRT, $FRB", IIC_VecPerm, - [/* (set v4f32:$FRT, v4f32:$FRB) */]>; - def QVFMRb : XForm_19<4, 72, - (outs qbrc:$FRT), (ins qbrc:$FRB), - "qvfmr $FRT, $FRB", IIC_VecPerm, - [/* (set v4i1:$FRT, v4i1:$FRB) */]>; - } - def QVFNEG : XForm_19<4, 40, - (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfneg $FRT, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (fneg v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFNEGs : XForm_19<4, 40, - (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfneg $FRT, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (fneg v4f32:$FRB))]>; - def QVFABS : XForm_19<4, 264, - (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfabs $FRT, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (fabs v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFABSs : XForm_19<4, 264, - (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfabs $FRT, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (fabs v4f32:$FRB))]>; - def QVFNABS : XForm_19<4, 136, - (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfnabs $FRT, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (fneg (fabs v4f64:$FRB)))]>; - let isCodeGenOnly = 1 in - def QVFNABSs : XForm_19<4, 136, - (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfnabs $FRT, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (fneg (fabs v4f32:$FRB)))]>; - def QVFCPSGN : XForm_18<4, 8, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (fcopysign v4f64:$FRB, v4f64:$FRA))]>; - let isCodeGenOnly = 1 in - def QVFCPSGNs : XForm_18<4, 8, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (fcopysign v4f32:$FRB, v4f32:$FRA))]>; - - def QVALIGNI : Z23Form_1<4, 5, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u2imm:$idx), - "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm, - [(set v4f64:$FRT, - (PPCqvaligni v4f64:$FRA, v4f64:$FRB, - (i32 imm:$idx)))]>; - let isCodeGenOnly = 1 in - def QVALIGNIs : Z23Form_1<4, 5, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, u2imm:$idx), - "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm, - [(set v4f32:$FRT, - (PPCqvaligni v4f32:$FRA, v4f32:$FRB, - (i32 imm:$idx)))]>; - let isCodeGenOnly = 1 in - def QVALIGNIb : Z23Form_1<4, 5, - (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u2imm:$idx), - "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm, - [(set v4i1:$FRT, - (PPCqvaligni v4i1:$FRA, v4i1:$FRB, - (i32 imm:$idx)))]>; - - def QVESPLATI : Z23Form_2<4, 37, - (outs qfrc:$FRT), (ins qfrc:$FRA, u2imm:$idx), - "qvesplati $FRT, $FRA, $idx", IIC_VecPerm, - [(set v4f64:$FRT, - (PPCqvesplati v4f64:$FRA, (i32 imm:$idx)))]>; - let isCodeGenOnly = 1 in - def QVESPLATIs : Z23Form_2<4, 37, - (outs qsrc:$FRT), (ins qsrc:$FRA, u2imm:$idx), - "qvesplati $FRT, $FRA, $idx", IIC_VecPerm, - [(set v4f32:$FRT, - (PPCqvesplati v4f32:$FRA, (i32 imm:$idx)))]>; - let isCodeGenOnly = 1 in - def QVESPLATIb : Z23Form_2<4, 37, - (outs qbrc:$FRT), (ins qbrc:$FRA, u2imm:$idx), - "qvesplati $FRT, $FRA, $idx", IIC_VecPerm, - [(set v4i1:$FRT, - (PPCqvesplati v4i1:$FRA, (i32 imm:$idx)))]>; - - def QVFPERM : AForm_1<4, 6, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC), - "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm, - [(set v4f64:$FRT, - (PPCqvfperm v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>; - let isCodeGenOnly = 1 in - def QVFPERMs : AForm_1<4, 6, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qfrc:$FRC), - "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm, - [(set v4f32:$FRT, - (PPCqvfperm v4f32:$FRA, v4f32:$FRB, v4f64:$FRC))]>; - - let isReMaterializable = 1, isAsCheapAsAMove = 1 in - def QVGPCI : Z23Form_3<4, 133, - (outs qfrc:$FRT), (ins u12imm:$idx), - "qvgpci 
$FRT, $idx", IIC_VecPerm, - [(set v4f64:$FRT, (PPCqvgpci (u12:$idx)))]>; - - // Compare Instruction - let isCodeGenOnly = 1 in - def QVFTSTNAN : QPXX18_Int<4, 64, "qvftstnan", int_ppc_qpx_qvftstnan>; - def QVFTSTNANb : XForm_18<4, 64, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f64:$FRA, v4f64:$FRB, SETUO))]>; - let isCodeGenOnly = 1 in - def QVFTSTNANbs : XForm_18<4, 64, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f32:$FRA, v4f32:$FRB, SETUO))]>; - let isCodeGenOnly = 1 in - def QVFCMPLT : QPXX18_Int<4, 96, "qvfcmplt", int_ppc_qpx_qvfcmplt>; - def QVFCMPLTb : XForm_18<4, 96, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f64:$FRA, v4f64:$FRB, SETOLT))]>; - let isCodeGenOnly = 1 in - def QVFCMPLTbs : XForm_18<4, 96, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f32:$FRA, v4f32:$FRB, SETOLT))]>; - let isCodeGenOnly = 1 in - def QVFCMPGT : QPXX18_Int<4, 32, "qvfcmpgt", int_ppc_qpx_qvfcmpgt>; - def QVFCMPGTb : XForm_18<4, 32, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f64:$FRA, v4f64:$FRB, SETOGT))]>; - let isCodeGenOnly = 1 in - def QVFCMPGTbs : XForm_18<4, 32, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f32:$FRA, v4f32:$FRB, SETOGT))]>; - let isCodeGenOnly = 1 in - def QVFCMPEQ : QPXX18_Int<4, 0, "qvfcmpeq", int_ppc_qpx_qvfcmpeq>; - def QVFCMPEQb : XForm_18<4, 0, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f64:$FRA, v4f64:$FRB, SETOEQ))]>; - let isCodeGenOnly = 1 in - def QVFCMPEQbs : XForm_18<4, 0, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f32:$FRA, v4f32:$FRB, SETOEQ))]>; - - let isCodeGenOnly = 1 in - def QVFLOGICAL : XForm_20<4, 4, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u12imm:$tttt), - "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>; - def QVFLOGICALb : XForm_20<4, 4, - (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt), - "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>; - let isCodeGenOnly = 1 in - def QVFLOGICALs : XForm_20<4, 4, - (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt), - "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>; - - // Load indexed instructions - let mayLoad = 1 in { - def QVLFDX : XForm_1_memOp<31, 583, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfdx $FRT, $src", IIC_LdStLFD, - [(set v4f64:$FRT, (load xoaddr:$src))]>; - let isCodeGenOnly = 1 in - def QVLFDXb : XForm_1_memOp<31, 583, - (outs qbrc:$FRT), (ins memrr:$src), - "qvlfdx $FRT, $src", IIC_LdStLFD, []>; - - let RC = 1 in - def QVLFDXA : XForm_1<31, 583, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfdxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFDUX : XForm_1<31, 615, - (outs qfrc:$FRT, ptr_rc_nor0:$ea_result), - (ins memrr:$src), - "qvlfdux $FRT, $src", IIC_LdStLFDU, []>, - RegConstraint<"$src.ptrreg = $ea_result">, - NoEncode<"$ea_result">; - let RC = 1 in - def QVLFDUXA : XForm_1<31, 615, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfduxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFSX : XForm_1_memOp<31, 519, - (outs qfrc:$FRT), (ins memrr:$src), - 
"qvlfsx $FRT, $src", IIC_LdStLFD, - [(set v4f64:$FRT, (extloadv4f32 xoaddr:$src))]>; - - let isCodeGenOnly = 1 in - def QVLFSXb : XForm_1<31, 519, - (outs qbrc:$FRT), (ins memrr:$src), - "qvlfsx $FRT, $src", IIC_LdStLFD, - [(set v4i1:$FRT, (PPCqvlfsb xoaddr:$src))]>; - let isCodeGenOnly = 1 in - def QVLFSXs : XForm_1_memOp<31, 519, - (outs qsrc:$FRT), (ins memrr:$src), - "qvlfsx $FRT, $src", IIC_LdStLFD, - [(set v4f32:$FRT, (load xoaddr:$src))]>; - - let RC = 1 in - def QVLFSXA : XForm_1<31, 519, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfsxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFSUX : XForm_1<31, 551, - (outs qsrc:$FRT, ptr_rc_nor0:$ea_result), - (ins memrr:$src), - "qvlfsux $FRT, $src", IIC_LdStLFDU, []>, - RegConstraint<"$src.ptrreg = $ea_result">, - NoEncode<"$ea_result">; - - let RC = 1 in - def QVLFSUXA : XForm_1<31, 551, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfsuxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFCDX : XForm_1<31, 71, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcdx $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFCDXA : XForm_1<31, 71, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcdxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFCDUX : XForm_1<31, 103, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcdux $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFCDUXA : XForm_1<31, 103, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcduxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFCSX : XForm_1<31, 7, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcsx $FRT, $src", IIC_LdStLFD, []>; - let isCodeGenOnly = 1 in - def QVLFCSXs : XForm_1<31, 7, - (outs qsrc:$FRT), (ins memrr:$src), - "qvlfcsx $FRT, $src", IIC_LdStLFD, []>; - - let RC = 1 in - def QVLFCSXA : XForm_1<31, 7, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcsxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFCSUX : XForm_1<31, 39, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcsux $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFCSUXA : XForm_1<31, 39, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcsuxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFIWAX : XForm_1<31, 871, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfiwax $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFIWAXA : XForm_1<31, 871, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfiwaxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFIWZX : XForm_1<31, 839, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfiwzx $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFIWZXA : XForm_1<31, 839, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfiwzxa $FRT, $src", IIC_LdStLFD, []>; - } - - - def QVLPCLDX : XForm_1<31, 582, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlpcldx $FRT, $src", IIC_LdStLFD, []>; - def QVLPCLSX : XForm_1<31, 518, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlpclsx $FRT, $src", IIC_LdStLFD, []>; - let isCodeGenOnly = 1 in - def QVLPCLSXint : XForm_11<31, 518, - (outs qfrc:$FRT), (ins G8RC:$src), - "qvlpclsx $FRT, 0, $src", IIC_LdStLFD, []>; - def QVLPCRDX : XForm_1<31, 70, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlpcrdx $FRT, $src", IIC_LdStLFD, []>; - def QVLPCRSX : XForm_1<31, 6, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlpcrsx $FRT, $src", IIC_LdStLFD, []>; - - // Store indexed instructions - let mayStore = 1 in { - def QVSTFDX : XForm_8_memOp<31, 711, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfdx $FRT, $dst", IIC_LdStSTFD, - [(store qfrc:$FRT, xoaddr:$dst)]>; - let isCodeGenOnly = 1 in - def QVSTFDXb : XForm_8_memOp<31, 711, - (outs), (ins qbrc:$FRT, memrr:$dst), - "qvstfdx $FRT, $dst", IIC_LdStSTFD, []>; - - let RC = 1 in - def 
QVSTFDXA : XForm_8<31, 711, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfdxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFDUX : XForm_8<31, 743, (outs ptr_rc_nor0:$ea_res), - (ins qfrc:$FRT, memrr:$dst), - "qvstfdux $FRT, $dst", IIC_LdStSTFDU, []>, - RegConstraint<"$dst.ptrreg = $ea_res">, - NoEncode<"$ea_res">; - - let RC = 1 in - def QVSTFDUXA : XForm_8<31, 743, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfduxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFDXI : XForm_8<31, 709, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfdxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFDXIA : XForm_8<31, 709, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfdxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFDUXI : XForm_8<31, 741, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfduxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFDUXIA : XForm_8<31, 741, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfduxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFSX : XForm_8_memOp<31, 647, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsx $FRT, $dst", IIC_LdStSTFD, - [(truncstorev4f32 qfrc:$FRT, xoaddr:$dst)]>; - let isCodeGenOnly = 1 in - def QVSTFSXs : XForm_8_memOp<31, 647, - (outs), (ins qsrc:$FRT, memrr:$dst), - "qvstfsx $FRT, $dst", IIC_LdStSTFD, - [(store qsrc:$FRT, xoaddr:$dst)]>; - - let RC = 1 in - def QVSTFSXA : XForm_8<31, 647, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFSUX : XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res), - (ins qsrc:$FRT, memrr:$dst), - "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>, - RegConstraint<"$dst.ptrreg = $ea_res">, - NoEncode<"$ea_res">; - let isCodeGenOnly = 1 in - def QVSTFSUXs: XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res), - (ins qfrc:$FRT, memrr:$dst), - "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>, - RegConstraint<"$dst.ptrreg = $ea_res">, - NoEncode<"$ea_res">; - - let RC = 1 in - def QVSTFSUXA : XForm_8<31, 679, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsuxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFSXI : XForm_8<31, 645, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFSXIA : XForm_8<31, 645, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFSUXI : XForm_8<31, 677, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsuxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFSUXIA : XForm_8<31, 677, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsuxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCDX : XForm_8<31, 199, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdx $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCDXA : XForm_8<31, 199, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCSX : XForm_8<31, 135, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>; - let isCodeGenOnly = 1 in - def QVSTFCSXs : XForm_8<31, 135, - (outs), (ins qsrc:$FRT, memrr:$dst), - "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>; - - let RC = 1 in - def QVSTFCSXA : XForm_8<31, 135, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCDUX : XForm_8<31, 231, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdux $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCDUXA : XForm_8<31, 231, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcduxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCSUX : XForm_8<31, 167, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsux $FRT, $dst", 
IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCSUXA : XForm_8<31, 167, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsuxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCDXI : XForm_8<31, 197, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCDXIA : XForm_8<31, 197, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCSXI : XForm_8<31, 133, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCSXIA : XForm_8<31, 133, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCDUXI : XForm_8<31, 229, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcduxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCDUXIA : XForm_8<31, 229, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcduxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCSUXI : XForm_8<31, 165, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsuxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCSUXIA : XForm_8<31, 165, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsuxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFIWX : XForm_8<31, 967, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfiwx $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFIWXA : XForm_8<31, 967, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfiwxa $FRT, $dst", IIC_LdStSTFD, []>; - } -} - -} // neverHasSideEffects -} - -def : InstAlias<"qvfclr $FRT", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 0)>; -def : InstAlias<"qvfand $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 1)>; -def : InstAlias<"qvfandc $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 4)>; -def : InstAlias<"qvfctfb $FRT, $FRA", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 5)>; -def : InstAlias<"qvfxor $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 6)>; -def : InstAlias<"qvfor $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 7)>; -def : InstAlias<"qvfnor $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 8)>; -def : InstAlias<"qvfequ $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 9)>; -def : InstAlias<"qvfnot $FRT, $FRA", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 10)>; -def : InstAlias<"qvforc $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 13)>; -def : InstAlias<"qvfnand $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 14)>; -def : InstAlias<"qvfset $FRT", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 15)>; - -//===----------------------------------------------------------------------===// -// Additional QPX Patterns -// - -def : Pat<(v4f64 (scalar_to_vector f64:$A)), - (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), $A, sub_64)>; -def : Pat<(v4f32 (scalar_to_vector f32:$A)), - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $A, sub_64)>; - -def : Pat<(f64 (extractelt v4f64:$S, 0)), - (EXTRACT_SUBREG $S, sub_64)>; -def : Pat<(f32 (extractelt v4f32:$S, 0)), - (EXTRACT_SUBREG $S, sub_64)>; - -def : Pat<(f64 (extractelt v4f64:$S, 1)), - (EXTRACT_SUBREG (QVESPLATI $S, 1), sub_64)>; -def : Pat<(f64 (extractelt v4f64:$S, 2)), - (EXTRACT_SUBREG (QVESPLATI $S, 2), sub_64)>; -def : Pat<(f64 (extractelt v4f64:$S, 3)), - (EXTRACT_SUBREG (QVESPLATI $S, 3), sub_64)>; - -def : Pat<(f32 (extractelt v4f32:$S, 1)), - (EXTRACT_SUBREG (QVESPLATIs $S, 1), sub_64)>; -def : Pat<(f32 (extractelt v4f32:$S, 2)), - (EXTRACT_SUBREG (QVESPLATIs $S, 2), sub_64)>; 
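These extractelt patterns (the f32 lane-3 case continues just below) all encode the same trick: splat the requested lane across the register with qvesplati, then read the scalar sub_64 subregister, which aliases lane 0. A scalar model of the idea, as illustration only:

// Models (EXTRACT_SUBREG (QVESPLATI $S, I), sub_64) on a 4-wide vector.
double extractLane(const double V[4], unsigned I) {
  double Splat[4];
  for (unsigned L = 0; L != 4; ++L) // qvesplati: lane I replicated everywhere
    Splat[L] = V[I];
  return Splat[0];                  // sub_64 reads lane 0
}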
-def : Pat<(f32 (extractelt v4f32:$S, 3)), - (EXTRACT_SUBREG (QVESPLATIs $S, 3), sub_64)>; - -def : Pat<(f64 (extractelt v4f64:$S, i64:$F)), - (EXTRACT_SUBREG (QVFPERM $S, $S, - (QVLPCLSXint (RLDICR $F, 2, - /* 63-2 = */ 61))), - sub_64)>; -def : Pat<(f32 (extractelt v4f32:$S, i64:$F)), - (EXTRACT_SUBREG (QVFPERMs $S, $S, - (QVLPCLSXint (RLDICR $F, 2, - /* 63-2 = */ 61))), - sub_64)>; - -def : Pat<(int_ppc_qpx_qvfperm v4f64:$A, v4f64:$B, v4f64:$C), - (QVFPERM $A, $B, $C)>; - -def : Pat<(int_ppc_qpx_qvfcpsgn v4f64:$A, v4f64:$B), - (QVFCPSGN $A, $B)>; - -// FCOPYSIGN's operand types need not agree. -def : Pat<(fcopysign v4f64:$frB, v4f32:$frA), - (QVFCPSGN (COPY_TO_REGCLASS $frA, QFRC), $frB)>; -def : Pat<(fcopysign QSRC:$frB, QFRC:$frA), - (QVFCPSGNs (COPY_TO_REGCLASS $frA, QSRC), $frB)>; - -def : Pat<(int_ppc_qpx_qvfneg v4f64:$A), (QVFNEG $A)>; -def : Pat<(int_ppc_qpx_qvfabs v4f64:$A), (QVFABS $A)>; -def : Pat<(int_ppc_qpx_qvfnabs v4f64:$A), (QVFNABS $A)>; - -def : Pat<(int_ppc_qpx_qvfriz v4f64:$A), (QVFRIZ $A)>; -def : Pat<(int_ppc_qpx_qvfrin v4f64:$A), (QVFRIN $A)>; -def : Pat<(int_ppc_qpx_qvfrip v4f64:$A), (QVFRIP $A)>; -def : Pat<(int_ppc_qpx_qvfrim v4f64:$A), (QVFRIM $A)>; - -def : Pat<(int_ppc_qpx_qvfre v4f64:$A), (QVFRE $A)>; -def : Pat<(int_ppc_qpx_qvfrsqrte v4f64:$A), (QVFRSQRTE $A)>; - -def : Pat<(int_ppc_qpx_qvfadd v4f64:$A, v4f64:$B), - (QVFADD $A, $B)>; -def : Pat<(int_ppc_qpx_qvfsub v4f64:$A, v4f64:$B), - (QVFSUB $A, $B)>; -def : Pat<(int_ppc_qpx_qvfmul v4f64:$A, v4f64:$B), - (QVFMUL $A, $B)>; - -// Additional QVFNMSUB patterns: -a*c + b == -(a*c - b) -def : Pat<(fma (fneg v4f64:$A), v4f64:$C, v4f64:$B), - (QVFNMSUB $A, $C, $B)>; -def : Pat<(fma v4f64:$A, (fneg v4f64:$C), v4f64:$B), - (QVFNMSUB $A, $C, $B)>; -def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B), - (QVFNMSUBSs $A, $C, $B)>; -def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B), - (QVFNMSUBSs $A, $C, $B)>; - -def : Pat<(int_ppc_qpx_qvfmadd v4f64:$A, v4f64:$B, v4f64:$C), - (QVFMADD $A, $B, $C)>; -def : Pat<(int_ppc_qpx_qvfnmadd v4f64:$A, v4f64:$B, v4f64:$C), - (QVFNMADD $A, $B, $C)>; -def : Pat<(int_ppc_qpx_qvfmsub v4f64:$A, v4f64:$B, v4f64:$C), - (QVFMSUB $A, $B, $C)>; -def : Pat<(int_ppc_qpx_qvfnmsub v4f64:$A, v4f64:$B, v4f64:$C), - (QVFNMSUB $A, $B, $C)>; - -def : Pat<(int_ppc_qpx_qvlfd xoaddr:$src), - (QVLFDX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfda xoaddr:$src), - (QVLFDXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfs xoaddr:$src), - (QVLFSX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfsa xoaddr:$src), - (QVLFSXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfcda xoaddr:$src), - (QVLFCDXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfcd xoaddr:$src), - (QVLFCDX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfcsa xoaddr:$src), - (QVLFCSXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfcs xoaddr:$src), - (QVLFCSX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfda xoaddr:$src), - (QVLFDXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfiwaa xoaddr:$src), - (QVLFIWAXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfiwa xoaddr:$src), - (QVLFIWAX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfiwza xoaddr:$src), - (QVLFIWZXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfiwz xoaddr:$src), - (QVLFIWZX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfsa xoaddr:$src), - (QVLFSXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlpcld xoaddr:$src), - (QVLPCLDX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlpcls xoaddr:$src), - (QVLPCLSX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlpcrd xoaddr:$src), - (QVLPCRDX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlpcrs xoaddr:$src), - 
(QVLPCRSX xoaddr:$src)>; - -def : Pat<(int_ppc_qpx_qvstfd v4f64:$T, xoaddr:$dst), - (QVSTFDX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfs v4f64:$T, xoaddr:$dst), - (QVSTFSX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfcda v4f64:$T, xoaddr:$dst), - (QVSTFCDXA $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfcd v4f64:$T, xoaddr:$dst), - (QVSTFCDX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfcsa v4f64:$T, xoaddr:$dst), - (QVSTFCSXA $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfcs v4f64:$T, xoaddr:$dst), - (QVSTFCSX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfda v4f64:$T, xoaddr:$dst), - (QVSTFDXA $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfiwa v4f64:$T, xoaddr:$dst), - (QVSTFIWXA $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfiw v4f64:$T, xoaddr:$dst), - (QVSTFIWX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfsa v4f64:$T, xoaddr:$dst), - (QVSTFSXA $T, xoaddr:$dst)>; - -def : Pat<(pre_store v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff), - (QVSTFDUX $rS, $ptrreg, $ptroff)>; -def : Pat<(pre_store v4f32:$rS, iPTR:$ptrreg, iPTR:$ptroff), - (QVSTFSUX $rS, $ptrreg, $ptroff)>; -def : Pat<(pre_truncstv4f32 v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff), - (QVSTFSUXs $rS, $ptrreg, $ptroff)>; - -def : Pat<(int_ppc_qpx_qvflogical v4f64:$A, v4f64:$B, (i32 imm:$idx)), - (QVFLOGICAL $A, $B, imm:$idx)>; -def : Pat<(int_ppc_qpx_qvgpci (u12:$idx)), - (QVGPCI imm:$idx)>; - -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOGE), - (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOLE), - (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETONE), - (QVFLOGICALb (QVFCMPEQb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETO), - (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUEQ), - (QVFLOGICALb (QVFCMPEQb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGT), - (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGE), - (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), - (QVFCMPLTb $FRA, $FRB), (i32 13))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETULT), - (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETULE), - (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), - (QVFCMPGTb $FRA, $FRB), (i32 13))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUNE), - (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), - (QVFCMPEQb $FRA, $FRB), (i32 13))>; - -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETEQ), - (QVFCMPEQb $FRA, $FRB)>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGT), - (QVFCMPGTb $FRA, $FRB)>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGE), - (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), - (QVFCMPLTb $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLT), - (QVFCMPLTb $FRA, $FRB)>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLE), - (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), - (QVFCMPGTb $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETNE), - (QVFLOGICALb (QVFCMPEQb $FRA, $FRB), - (QVFCMPEQb $FRA, $FRB), (i32 10))>; - -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOGE), - (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOLE), - (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 8))>; -def : 
Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETONE), - (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETO), - (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUEQ), - (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGT), - (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGE), - (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), - (QVFCMPLTbs $FRA, $FRB), (i32 13))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULT), - (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULE), - (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), - (QVFCMPGTbs $FRA, $FRB), (i32 13))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUNE), - (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), - (QVFCMPEQbs $FRA, $FRB), (i32 13))>; - -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETEQ), - (QVFCMPEQbs $FRA, $FRB)>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGT), - (QVFCMPGTbs $FRA, $FRB)>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGE), - (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), - (QVFCMPLTbs $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLT), - (QVFCMPLTbs $FRA, $FRB)>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLE), - (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), - (QVFCMPGTbs $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETNE), - (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB), - (QVFCMPEQbs $FRA, $FRB), (i32 10))>; - -def : Pat<(and v4i1:$FRA, (not v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 4))>; -def : Pat<(not (or v4i1:$FRA, v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 8))>; -def : Pat<(not (xor v4i1:$FRA, v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 9))>; -def : Pat<(or v4i1:$FRA, (not v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 13))>; -def : Pat<(not (and v4i1:$FRA, v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 14))>; - -def : Pat<(and v4i1:$FRA, v4i1:$FRB), - (QVFLOGICALb $FRA, $FRB, (i32 1))>; -def : Pat<(or v4i1:$FRA, v4i1:$FRB), - (QVFLOGICALb $FRA, $FRB, (i32 7))>; -def : Pat<(xor v4i1:$FRA, v4i1:$FRB), - (QVFLOGICALb $FRA, $FRB, (i32 6))>; -def : Pat<(not v4i1:$FRA), - (QVFLOGICALb $FRA, $FRA, (i32 10))>; - -def : Pat<(v4f64 (fpextend v4f32:$src)), - (COPY_TO_REGCLASS $src, QFRC)>; - -def : Pat<(v4f32 (fround_exact v4f64:$src)), - (COPY_TO_REGCLASS $src, QSRC)>; - -// Extract the underlying floating-point values from the -// QPX (-1.0, 1.0) boolean representation. 
-def : Pat<(v4f64 (PPCqbflt v4i1:$src)), - (COPY_TO_REGCLASS $src, QFRC)>; - -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLT)), - (SELECT_QFRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETULT)), - (SELECT_QFRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLE)), - (SELECT_QFRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETULE)), - (SELECT_QFRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETEQ)), - (SELECT_QFRC (CREQV $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGE)), - (SELECT_QFRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETUGE)), - (SELECT_QFRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGT)), - (SELECT_QFRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETUGT)), - (SELECT_QFRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETNE)), - (SELECT_QFRC (CRXOR $lhs, $rhs), $tval, $fval)>; - -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLT)), - (SELECT_QSRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETULT)), - (SELECT_QSRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLE)), - (SELECT_QSRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETULE)), - (SELECT_QSRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETEQ)), - (SELECT_QSRC (CREQV $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGE)), - (SELECT_QSRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETUGE)), - (SELECT_QSRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGT)), - (SELECT_QSRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETUGT)), - (SELECT_QSRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETNE)), - (SELECT_QSRC (CRXOR $lhs, $rhs), $tval, $fval)>; - -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLT)), - (SELECT_QBRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETULT)), - (SELECT_QBRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLE)), - (SELECT_QBRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETULE)), - (SELECT_QBRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETEQ)), - (SELECT_QBRC (CREQV $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGE)), - (SELECT_QBRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, 
i1:$rhs, v4i1:$tval, v4i1:$fval, SETUGE)),
-          (SELECT_QBRC (CRORC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGT)),
-          (SELECT_QBRC (CRANDC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETUGT)),
-          (SELECT_QBRC (CRANDC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETNE)),
-          (SELECT_QBRC (CRXOR $lhs, $rhs), $tval, $fval)>;
-
-} // end HasQPX
-
-let Predicates = [HasQPX, NoNaNsFPMath] in {
-def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB),
-          (QVFSELb (QVFCMPLTb $FRA, $FRB), $FRB, $FRA)>;
-def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB),
-          (QVFSELb (QVFCMPGTb $FRA, $FRB), $FRB, $FRA)>;
-
-def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB),
-          (QVFSELbs (QVFCMPLTbs $FRA, $FRB), $FRB, $FRA)>;
-def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB),
-          (QVFSELbs (QVFCMPGTbs $FRA, $FRB), $FRB, $FRA)>;
-}
-
-let Predicates = [HasQPX, NaNsFPMath] in {
-// When either of these operands is NaN, we should return the other operand.
-// QVFCMPLT/QVFCMPGT return false if either operand is NaN, which means we need
-// to explicitly OR with a NaN test on the second operand.
-def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB),
-          (QVFSELb (QVFLOGICALb (QVFCMPLTb $FRA, $FRB),
-                                (QVFTSTNANb $FRB, $FRB), (i32 7)),
-                   $FRB, $FRA)>;
-def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB),
-          (QVFSELb (QVFLOGICALb (QVFCMPGTb $FRA, $FRB),
-                                (QVFTSTNANb $FRB, $FRB), (i32 7)),
-                   $FRB, $FRA)>;
-
-def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB),
-          (QVFSELbs (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB),
-                                 (QVFTSTNANbs $FRB, $FRB), (i32 7)),
-                    $FRB, $FRA)>;
-def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB),
-          (QVFSELbs (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB),
-                                 (QVFTSTNANbs $FRB, $FRB), (i32 7)),
-                    $FRB, $FRA)>;
-}
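The NaNsFPMath patterns above OR the ordered compare with a NaN test on the second operand because qvfcmplt/qvfcmpgt produce false when an input is NaN; without the OR, a NaN in $FRB would win the select. A scalar model of the fminnum case (an illustration, not compiler output):

#include <cmath>

// Mirrors (QVFSELb (QVFLOGICALb (QVFCMPLTb a, b), (QVFTSTNANb b, b),
// (i32 7) /*OR, per the qvfor alias*/), b, a): pick a when a < b or when b
// is NaN; otherwise pick b. The fmaxnum analog swaps the compare direction.
double fminnumModel(double a, double b) {
  bool PickA = (a < b) || std::isnan(b);
  return PickA ? a : b;
}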
diff --git a/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp b/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
deleted file mode 100644
index 6e90426438208..0000000000000
--- a/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
+++ /dev/null
@@ -1,161 +0,0 @@
-//===----- PPCQPXLoadSplat.cpp - QPX Load Splat Simplification ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The QPX vector registers overlay the scalar floating-point registers, and
-// any scalar floating-point loads splat their value across all vector lanes.
-// Thus, if we have a scalar load followed by a splat, we can remove the splat
-// (i.e. replace the load with a load-and-splat pseudo instruction).
-//
-// This pass must run after anything that might do store-to-load forwarding.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PPC.h"
-#include "PPCInstrBuilder.h"
-#include "PPCInstrInfo.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetMachine.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "ppc-qpx-load-splat"
-
-STATISTIC(NumSimplified, "Number of QPX load splats simplified");
-
-namespace {
-  struct PPCQPXLoadSplat : public MachineFunctionPass {
-    static char ID;
-    PPCQPXLoadSplat() : MachineFunctionPass(ID) {
-      initializePPCQPXLoadSplatPass(*PassRegistry::getPassRegistry());
-    }
-
-    bool runOnMachineFunction(MachineFunction &Fn) override;
-
-    StringRef getPassName() const override {
-      return "PowerPC QPX Load Splat Simplification";
-    }
-  };
-  char PPCQPXLoadSplat::ID = 0;
-}
-
-INITIALIZE_PASS(PPCQPXLoadSplat, "ppc-qpx-load-splat",
-                "PowerPC QPX Load Splat Simplification",
-                false, false)
-
-FunctionPass *llvm::createPPCQPXLoadSplatPass() {
-  return new PPCQPXLoadSplat();
-}
-
-bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) {
-  if (skipFunction(MF.getFunction()))
-    return false;
-
-  bool MadeChange = false;
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-
-  for (auto MFI = MF.begin(), MFIE = MF.end(); MFI != MFIE; ++MFI) {
-    MachineBasicBlock *MBB = &*MFI;
-    SmallVector<MachineInstr *, 4> Splats;
-
-    for (auto MBBI = MBB->rbegin(); MBBI != MBB->rend(); ++MBBI) {
-      MachineInstr *MI = &*MBBI;
-
-      if (MI->hasUnmodeledSideEffects() || MI->isCall()) {
-        Splats.clear();
-        continue;
-      }
-
-      // We're looking for a sequence like this:
-      // %f0 = LFD 0, killed %x3, implicit-def %qf0; mem:LD8[%a](tbaa=!2)
-      // %qf1 = QVESPLATI killed %qf0, 0, implicit %rm
-
-      for (auto SI = Splats.begin(); SI != Splats.end();) {
-        MachineInstr *SMI = *SI;
-        Register SplatReg = SMI->getOperand(0).getReg();
-        Register SrcReg = SMI->getOperand(1).getReg();
-
-        if (MI->modifiesRegister(SrcReg, TRI)) {
-          switch (MI->getOpcode()) {
-          default:
-            SI = Splats.erase(SI);
-            continue;
-          case PPC::LFS:
-          case PPC::LFD:
-          case PPC::LFSU:
-          case PPC::LFDU:
-          case PPC::LFSUX:
-          case PPC::LFDUX:
-          case PPC::LFSX:
-          case PPC::LFDX:
-          case PPC::LFIWAX:
-          case PPC::LFIWZX:
-            if (SplatReg != SrcReg) {
-              // We need to change the load to define the scalar subregister of
-              // the QPX splat source register.
-              unsigned SubRegIndex =
-                TRI->getSubRegIndex(SrcReg, MI->getOperand(0).getReg());
-              Register SplatSubReg = TRI->getSubReg(SplatReg, SubRegIndex);
-
-              // Substitute both the explicit defined register, and also the
-              // implicit def of the containing QPX register.
-              MI->getOperand(0).setReg(SplatSubReg);
-              MI->substituteRegister(SrcReg, SplatReg, 0, *TRI);
-            }
-
-            SI = Splats.erase(SI);
-
-            // If SMI is directly after MI, then MBBI's base iterator is
-            // pointing at SMI. Adjust MBBI around the call to erase SMI to
-            // avoid invalidating MBBI.
-            ++MBBI;
-            SMI->eraseFromParent();
-            --MBBI;
-
-            ++NumSimplified;
-            MadeChange = true;
-            continue;
-          }
-        }
-
-        // If this instruction defines the splat register, then we cannot move
-        // the previous definition above it. If it reads from the splat
-        // register, then it must already be alive from some previous
-        // definition, and if the splat register is different from the source
-        // register, then this definition must not be the load for which we're
-        // searching.
-        if (MI->modifiesRegister(SplatReg, TRI) ||
-            (SrcReg != SplatReg &&
-             MI->readsRegister(SplatReg, TRI))) {
-          SI = Splats.erase(SI);
-          continue;
-        }
-
-        ++SI;
-      }
-
-      if (MI->getOpcode() != PPC::QVESPLATI &&
-          MI->getOpcode() != PPC::QVESPLATIs &&
-          MI->getOpcode() != PPC::QVESPLATIb)
-        continue;
-      if (MI->getOperand(2).getImm() != 0)
-        continue;
-
-      // If there are other uses of the scalar value after this, replacing
-      // those uses might be non-trivial.
-      if (!MI->getOperand(1).isKill())
-        continue;
-
-      Splats.push_back(MI);
-    }
-  }
-
-  return MadeChange;
-}
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index ed8948a639728..96666ad58dfe5 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -404,9 +404,6 @@ unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
   }
   case PPC::F8RCRegClassID:
   case PPC::F4RCRegClassID:
-  case PPC::QFRCRegClassID:
-  case PPC::QSRCRegClassID:
-  case PPC::QBRCRegClassID:
   case PPC::VRRCRegClassID:
   case PPC::VFRCRegClassID:
   case PPC::VSLRCRegClassID:
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
index 61acd955e1cba..a931967862c7b 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -153,7 +153,6 @@ class PPCRegisterInfo : public PPCGenRegisterInfo {
     switch (RegName[0]) {
     case 'r':
     case 'f':
-    case 'q': // for QPX
     case 'v':
       if (RegName[1] == 's')
         return RegName + 2;
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
index b45757c1acc5e..e07b960ae305b 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -54,13 +54,6 @@ class FPR<bits<5> num, string n> : PPCReg<n> {
   let HWEncoding{4-0} = num;
 }
-// QFPR - One of the 32 256-bit floating-point vector registers (used for QPX)
-class QFPR<FPR SubReg, string n> : PPCReg<n> {
-  let HWEncoding = SubReg.HWEncoding;
-  let SubRegs = [SubReg];
-  let SubRegIndices = [sub_64];
-}
-
 // VF - One of the 32 64-bit floating-point subregisters of the vector
 // registers (used by VSX).
 class VF<bits<5> num, string n> : PPCReg<n> {
@@ -132,12 +125,6 @@ foreach Index = 0-31 in {
              DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>;
 }
-// QPX Floating-point registers
-foreach Index = 0-31 in {
-  def QF#Index : QFPR<!cast<FPR>("F"#Index), "q"#Index>,
-                 DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>;
-}
-
 // Vector registers
 foreach Index = 0-31 in {
   def V#Index : VR<!cast<VF>("VF"#Index), "v"#Index>,
@@ -343,16 +330,6 @@ def SPILLTOVSRRC : RegisterClass<"PPC", [i64, f64], 64, (add G8RC, (sub VSFRC,
 // Register class for single precision scalars in VSX registers
 def VSSRC : RegisterClass<"PPC", [f32], 32, (add VSFRC)>;
-// For QPX
-def QFRC : RegisterClass<"PPC", [v4f64], 256, (add (sequence "QF%u", 0, 13),
-                                                   (sequence "QF%u", 31, 14))>;
-def QSRC : RegisterClass<"PPC", [v4f32], 128, (add QFRC)>;
-def QBRC : RegisterClass<"PPC", [v4i1], 256, (add QFRC)> {
-  // These are actually stored as floating-point values where a positive
-  // number is true and anything else (including NaN) is false.
- let Size = 256; -} - def CRBITRC : RegisterClass<"PPC", [i1], 32, (add CR2LT, CR2GT, CR2EQ, CR2UN, CR3LT, CR3GT, CR3EQ, CR3UN, diff --git a/llvm/lib/Target/PowerPC/PPCScheduleP9.td b/llvm/lib/Target/PowerPC/PPCScheduleP9.td index 0a1ae7e55b3c2..311d5cf165f63 100644 --- a/llvm/lib/Target/PowerPC/PPCScheduleP9.td +++ b/llvm/lib/Target/PowerPC/PPCScheduleP9.td @@ -40,12 +40,9 @@ def P9Model : SchedMachineModel { let CompleteModel = 1; - // Do not support QPX (Quad Processing eXtension), SPE (Signal Processing - // Engine), prefixed instructions on Power 9, PC relative mem ops, or - // instructions introduced in ISA 3.1. - let UnsupportedFeatures = [HasQPX, HasSPE, PrefixInstrs, PCRelativeMemops, - IsISA3_1]; - + // Do not support SPE (Signal Processing Engine), prefixed instructions on + // Power 9, PC relative mem ops, or instructions introduced in ISA 3.1. + let UnsupportedFeatures = [HasSPE, PrefixInstrs, PCRelativeMemops, IsISA3_1]; } let SchedModel = P9Model in { diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index 3836cc960394f..85d2966654970 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -35,10 +35,6 @@ using namespace llvm; static cl::opt<bool> UseSubRegLiveness("ppc-track-subreg-liveness", cl::desc("Enable subregister liveness tracking for PPC"), cl::Hidden); -static cl::opt<bool> QPXStackUnaligned("qpx-stack-unaligned", - cl::desc("Even when QPX is enabled the stack is not 32-byte aligned"), - cl::Hidden); - static cl::opt<bool> EnableMachinePipeliner("ppc-enable-pipeliner", cl::desc("Enable Machine Pipeliner for PPC"), @@ -70,7 +66,6 @@ void PPCSubtarget::initializeEnvironment() { HasAltivec = false; HasSPE = false; HasFPU = false; - HasQPX = false; HasVSX = false; NeedsTwoConstNR = false; HasP8Vector = false; @@ -109,7 +104,6 @@ void PPCSubtarget::initializeEnvironment() { HasInvariantFunctionDescriptors = false; HasPartwordAtomics = false; HasDirectMove = false; - IsQPXStackUnaligned = false; HasHTM = false; HasFloat128 = false; HasFusion = false; @@ -158,7 +152,7 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (HasSPE && IsPPC64) report_fatal_error( "SPE is only supported for 32-bit targets.\n", false); - if (HasSPE && (HasAltivec || HasQPX || HasVSX || HasFPU)) + if (HasSPE && (HasAltivec || HasVSX || HasFPU)) report_fatal_error( "SPE and traditional floating point cannot both be enabled.\n", false); @@ -166,10 +160,6 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (!HasSPE) HasFPU = true; - // QPX requires a 32-byte aligned stack. Note that we need to do this if - // we're compiling for a BG/Q system regardless of whether or not QPX - // is enabled because external functions will assume this alignment. - IsQPXStackUnaligned = QPXStackUnaligned; StackAlignment = getPlatformStackAlignment(); // Determine endianness.
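
With QPX and the BG/Q triple gone, the 32-byte stack-alignment rule described in the deleted comment above has no remaining users, and getPlatformStackAlignment (removed from PPCSubtarget.h in the next hunk) collapses to a constant. A minimal before/after sketch in C++, with the subtarget queries flattened to plain booleans (illustrative stand-ins, not the literal LLVM API):

// Before this patch: QPX, or any BG/Q triple, forced a 32-byte aligned stack
// unless the -qpx-stack-unaligned escape hatch was set.
unsigned oldPlatformStackAlignment(bool HasQPX, bool IsBGQ,
                                   bool QPXStackUnaligned) {
  if ((HasQPX || IsBGQ) && !QPXStackUnaligned)
    return 32;
  return 16;
}

// After this patch: the special case disappears and the 16-byte PPC64 ELF
// default is used unconditionally.
unsigned newPlatformStackAlignment() { return 16; }
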
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index ec329022c4572..8a4041518e3c2 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -97,7 +97,6 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool HasAltivec; bool HasFPU; bool HasSPE; - bool HasQPX; bool HasVSX; bool NeedsTwoConstNR; bool HasP8Vector; @@ -150,11 +149,6 @@ class PPCSubtarget : public PPCGenSubtargetInfo { POPCNTDKind HasPOPCNTD; - /// When targeting QPX running a stock PPC64 Linux kernel where the stack - /// alignment has not been changed, we need to keep the 16-byte alignment - /// of the stack. - bool IsQPXStackUnaligned; - const PPCTargetMachine &TM; PPCFrameLowering FrameLowering; PPCInstrInfo InstrInfo; @@ -255,7 +249,6 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool hasAltivec() const { return HasAltivec; } bool hasSPE() const { return HasSPE; } bool hasFPU() const { return HasFPU; } - bool hasQPX() const { return HasQPX; } bool hasVSX() const { return HasVSX; } bool needsTwoConstNR() const { return NeedsTwoConstNR; } bool hasP8Vector() const { return HasP8Vector; } @@ -291,11 +284,7 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool hasPartwordAtomics() const { return HasPartwordAtomics; } bool hasDirectMove() const { return HasDirectMove; } - bool isQPXStackUnaligned() const { return IsQPXStackUnaligned; } Align getPlatformStackAlignment() const { - if ((hasQPX() || isBGQ()) && !isQPXStackUnaligned()) - return Align(32); - return Align(16); } @@ -325,9 +314,6 @@ class PPCSubtarget : public PPCGenSubtargetInfo { const Triple &getTargetTriple() const { return TargetTriple; } - /// isBGQ - True if this is a BG/Q platform. - bool isBGQ() const { return TargetTriple.getVendor() == Triple::BGQ; } - bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index f15f9c7f49429..27de5b29cd341 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -63,10 +63,6 @@ static cl:: opt<bool> DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden, cl::desc("Disable VSX Swap Removal for PPC")); -static cl:: -opt<bool> DisableQPXLoadSplat("disable-ppc-qpx-load-splat", cl::Hidden, - cl::desc("Disable QPX load splat simplification")); - static cl:: opt<bool> DisableMIPeephole("disable-ppc-peephole", cl::Hidden, cl::desc("Disable machine peepholes for PPC")); @@ -114,7 +110,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() { initializePPCReduceCRLogicalsPass(PR); initializePPCBSelPass(PR); initializePPCBranchCoalescingPass(PR); - initializePPCQPXLoadSplatPass(PR); initializePPCBoolRetToIntPass(PR); initializePPCExpandISELPass(PR); initializePPCPreEmitPeepholePass(PR); @@ -411,14 +406,9 @@ void PPCPassConfig::addIRPasses() { // Lower generic MASSV routines to PowerPC subtarget-specific entries. addPass(createPPCLowerMASSVEntriesPass()); - - // For the BG/Q (or if explicitly requested), add explicit data prefetch - // intrinsics. - bool UsePrefetching = TM->getTargetTriple().getVendor() == Triple::BGQ && - getOptLevel() != CodeGenOpt::None; + + // If explicitly requested, add explicit data prefetch intrinsics.
if (EnablePrefetch.getNumOccurrences() > 0) - UsePrefetching = EnablePrefetch; - if (UsePrefetching) addPass(createLoopDataPrefetchPass()); if (TM->getOptLevel() >= CodeGenOpt::Default && EnableGEPOpt) { @@ -515,15 +505,8 @@ void PPCPassConfig::addPreRegAlloc() { } void PPCPassConfig::addPreSched2() { - if (getOptLevel() != CodeGenOpt::None) { + if (getOptLevel() != CodeGenOpt::None) addPass(&IfConverterID); - - // This optimization must happen after anything that might do store-to-load - // forwarding. Here we're after RA (and, thus, when spills are inserted) - // but before post-RA scheduling. - if (!DisableQPXLoadSplat) - addPass(createPPCQPXLoadSplatPass()); - } } void PPCPassConfig::addPreEmitPass() { diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index bbb4239d36da5..ee8842f4d8663 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -25,8 +25,7 @@ using namespace llvm; static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting", cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden); -// This is currently only used for the data prefetch pass which is only enabled -// for BG/Q by default. +// This is currently only used for the data prefetch pass. static cl::opt<unsigned> CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64), cl::desc("The loop prefetch cache line size")); @@ -104,55 +103,6 @@ PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); return new StoreInst(II.getArgOperand(0), Ptr, false, Align(1)); } - case Intrinsic::ppc_qpx_qvlfs: - // Turn PPC QPX qvlfs -> load if the pointer is known aligned. - if (getOrEnforceKnownAlignment( - II.getArgOperand(0), Align(16), IC.getDataLayout(), &II, - &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) { - Type *VTy = - VectorType::get(IC.Builder.getFloatTy(), - cast<VectorType>(II.getType())->getElementCount()); - Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(0), - PointerType::getUnqual(VTy)); - Value *Load = IC.Builder.CreateLoad(VTy, Ptr); - return new FPExtInst(Load, II.getType()); - } - break; - case Intrinsic::ppc_qpx_qvlfd: - // Turn PPC QPX qvlfd -> load if the pointer is known aligned. - if (getOrEnforceKnownAlignment( - II.getArgOperand(0), Align(32), IC.getDataLayout(), &II, - &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 32) { - Value *Ptr = IC.Builder.CreateBitCast( - II.getArgOperand(0), PointerType::getUnqual(II.getType())); - return new LoadInst(II.getType(), Ptr, "", false, Align(32)); - } - break; - case Intrinsic::ppc_qpx_qvstfs: - // Turn PPC QPX qvstfs -> store if the pointer is known aligned. - if (getOrEnforceKnownAlignment( - II.getArgOperand(1), Align(16), IC.getDataLayout(), &II, - &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) { - Type *VTy = VectorType::get( - IC.Builder.getFloatTy(), - cast<VectorType>(II.getArgOperand(0)->getType())->getElementCount()); - Value *TOp = IC.Builder.CreateFPTrunc(II.getArgOperand(0), VTy); - Type *OpPtrTy = PointerType::getUnqual(VTy); - Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); - return new StoreInst(TOp, Ptr, false, Align(16)); - } - break; - case Intrinsic::ppc_qpx_qvstfd: - // Turn PPC QPX qvstfd -> store if the pointer is known aligned.
- if (getOrEnforceKnownAlignment( - II.getArgOperand(1), Align(32), IC.getDataLayout(), &II, - &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 32) { - Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType()); - Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); - return new StoreInst(II.getArgOperand(0), Ptr, false, Align(32)); - } - break; - case Intrinsic::ppc_altivec_vperm: // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. // Note that ppc_altivec_vperm has a big-endian bias, so when creating @@ -736,10 +686,7 @@ bool PPCTTIImpl::useColdCCForColdCall(Function &F) { } bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { - // On the A2, always unroll aggressively. For QPX unaligned loads, we depend - // on combining the loads generated for consecutive accesses, and failure to - // do so is particularly expensive. This makes it much more likely (compared - // to only using concatenation unrolling). + // On the A2, always unroll aggressively. if (ST->getCPUDirective() == PPC::DIR_A2) return true; @@ -799,7 +746,6 @@ const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const { unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const { if (Vector) { - if (ST->hasQPX()) return 256; if (ST->hasAltivec()) return 128; return 0; } @@ -828,8 +774,6 @@ unsigned PPCTTIImpl::getCacheLineSize() const { } unsigned PPCTTIImpl::getPrefetchDistance() const { - // This seems like a reasonable default for the BG/Q (this pass is enabled, by - // default, only on the BG/Q). return 300; } @@ -918,7 +862,7 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations + // PPC, for both Altivec and VSX, supports cheap arbitrary permutations // (at least in the sense that there need only be one non-loop-invariant // instruction). We need one such shuffle instruction for each actual // register (this is not true for arbitrary shuffles, but is true for the @@ -974,13 +918,6 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { return Cost; - } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) { - // Floating point scalars are already located in index #0. - if (Index == 0) - return 0; - - return Cost; - } else if (Val->getScalarType()->isIntegerTy() && Index != -1U) { if (ST->hasP9Altivec()) { if (ISD == ISD::INSERT_VECTOR_ELT) @@ -1055,8 +992,6 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, LT.second == MVT::v4i32 || LT.second == MVT::v4f32); bool IsVSXType = ST->hasVSX() && (LT.second == MVT::v2f64 || LT.second == MVT::v2i64); - bool IsQPXType = ST->hasQPX() && - (LT.second == MVT::v4f64 || LT.second == MVT::v4f32); // VSX has 32b/64b load instructions. Legalization can handle loading of // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and @@ -1079,8 +1014,7 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, // for Altivec types using the VSX instructions, but that's more expensive // than using the permutation-based load sequence. On the P8, that's no // longer true. - if (Opcode == Instruction::Load && - ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) && + if (Opcode == Instruction::Load && (!ST->hasP8Vector() && IsAltivecType) && *Alignment >= LT.second.getScalarType().getStoreSize()) return Cost + LT.first; // Add the cost of the permutations.
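
To make the getMemoryOpCost change above concrete: before this patch, a sufficiently aligned QPX vector load (v4f64 or v4f32 with hasQPX()) took the same cheap early-out as pre-P8 Altivec loads; afterwards only the Altivec case survives. A hedged sketch of just that predicate, with the subtarget and MVT checks reduced to booleans (illustrative stand-ins for the real TTI code):

// Early-out condition under which an aligned vector load is charged the
// cheap cost (Cost + LT.first). "AlignedEnough" stands for: alignment >=
// store size of the legalized scalar type.
bool cheapAlignedVectorLoadBefore(bool HasP8Vector, bool IsAltivecType,
                                  bool HasQPX, bool IsV4F64OrV4F32,
                                  bool AlignedEnough) {
  bool IsQPXType = HasQPX && IsV4F64OrV4F32;
  return ((!HasP8Vector && IsAltivecType) || IsQPXType) && AlignedEnough;
}

// After the patch, only the pre-P8 Altivec case keeps the early-out.
bool cheapAlignedVectorLoadAfter(bool HasP8Vector, bool IsAltivecType,
                                 bool AlignedEnough) {
  return (!HasP8Vector && IsAltivecType) && AlignedEnough;
}
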
@@ -1133,7 +1067,7 @@ int PPCTTIImpl::getInterleavedMemoryOpCost( getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind); - // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations + // PPC, for both Altivec and VSX, supports cheap arbitrary permutations // (at least in the sense that there need only be one non-loop-invariant // instruction). For each result vector, we need one shuffle per incoming // vector (except that the first shuffle can take two incoming vectors diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 427abde4277d4..aa06e8144f634 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4751,15 +4751,14 @@ struct VarArgPowerPC64Helper : public VarArgHelper { // For PowerPC, we need to deal with alignment of stack arguments - // they are mostly aligned to 8 bytes, but vectors and i128 arrays // are aligned to 16 bytes, byvals can be aligned to 8 or 16 bytes, - // and QPX vectors are aligned to 32 bytes. For that reason, we - // compute current offset from stack pointer (which is always properly - // aligned), and offset for the first vararg, then subtract them. + // For that reason, we compute current offset from stack pointer (which is + // always properly aligned), and offset for the first vararg, then subtract + // them. unsigned VAArgBase; Triple TargetTriple(F.getParent()->getTargetTriple()); // Parameter save area starts at 48 bytes from frame pointer for ABIv1, // and 32 bytes for ABIv2. This is usually determined by target // endianness, but in theory could be overridden by function attribute. - // For simplicity, we ignore it here (it'd only matter for QPX vectors).
if (TargetTriple.getArch() == Triple::ppc64) VAArgBase = 48; else diff --git a/llvm/test/Analysis/BasicAA/phi-spec-order.ll b/llvm/test/Analysis/BasicAA/phi-spec-order.ll index f8586f094c2ce..e5d435c09ccc7 100644 --- a/llvm/test/Analysis/BasicAA/phi-spec-order.ll +++ b/llvm/test/Analysis/BasicAA/phi-spec-order.ll @@ -1,5 +1,5 @@ target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" ; RUN: opt < %s -basic-aa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s @X = external global [16000 x double], align 32 diff --git a/llvm/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll b/llvm/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll index 3b1bc3b3fdbc0..e5fbf070cf32a 100644 --- a/llvm/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll +++ b/llvm/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll @@ -218,42 +218,6 @@ entry: ; CHECK: cost of 2 for instruction: %r = load <4 x double>, <4 x double>* %p, align 8 } -define <4 x float> @test_l_qv4float(<4 x float>* %p) #1 { -entry: - %r = load <4 x float>, <4 x float>* %p, align 4 - ret <4 x float> %r - -; CHECK-LABEL: test_l_qv4float -; CHECK: cost of 2 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4 -} - -define <8 x float> @test_l_qv8float(<8 x float>* %p) #1 { -entry: - %r = load <8 x float>, <8 x float>* %p, align 4 - ret <8 x float> %r - -; CHECK-LABEL: test_l_qv8float -; CHECK: cost of 4 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4 -} - -define <4 x double> @test_l_qv4double(<4 x double>* %p) #1 { -entry: - %r = load <4 x double>, <4 x double>* %p, align 8 - ret <4 x double> %r - -; CHECK-LABEL: test_l_qv4double -; CHECK: cost of 2 for instruction: %r = load <4 x double>, <4 x double>* %p, align 8 -} - -define <8 x double> @test_l_qv8double(<8 x double>* %p) #1 { -entry: - %r = load <8 x double>, <8 x double>* %p, align 8 - ret <8 x double> %r - -; CHECK-LABEL: test_l_qv8double -; CHECK: cost of 4 for instruction: %r = load <8 x double>, <8 x double>* %p, align 8 -} - define void @test_s_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 { entry: store <16 x i8> %v, <16 x i8>* %p, align 1 @@ -362,43 +326,6 @@ entry: ; CHECK: cost of 2 for instruction: store <4 x double> %v, <4 x double>* %p, align 8 } -define void @test_s_qv4float(<4 x float>* %p, <4 x float> %v) #1 { -entry: - store <4 x float> %v, <4 x float>* %p, align 4 - ret void - -; CHECK-LABEL: test_s_qv4float -; CHECK: cost of 7 for instruction: store <4 x float> %v, <4 x float>* %p, align 4 -} - -define void @test_s_qv8float(<8 x float>* %p, <8 x float> %v) #1 { -entry: - store <8 x float> %v, <8 x float>* %p, align 4 - ret void - -; CHECK-LABEL: test_s_qv8float -; CHECK: cost of 15 for instruction: store <8 x float> %v, <8 x float>* %p, align 4 -} - -define void @test_s_qv4double(<4 x double>* %p, <4 x double> %v) #1 { -entry: - store <4 x double> %v, <4 x double>* %p, align 8 - ret void - -; CHECK-LABEL: test_s_qv4double -; CHECK: cost of 7 for instruction: store <4 x double> %v, <4 x double>* %p, align 8 -} - -define void @test_s_qv8double(<8 x double>* %p, <8 x double> %v) #1 { -entry: - store <8 x double> %v, <8 x double>* %p, align 8 - ret void - -; CHECK-LABEL: test_s_qv8double -; CHECK: cost of 15 for instruction: store <8 x double> %v, <8 x double>* %p, align 8 -} - attributes #0 = { nounwind "target-cpu"="pwr7" } -attributes #1 = { nounwind "target-cpu"="a2q" } attributes #2 = 
{ nounwind "target-cpu"="pwr8" } diff --git a/llvm/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll b/llvm/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll index 69f9cff5c525f..d93f192b1274d 100644 --- a/llvm/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll +++ b/llvm/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux -enable-misched < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux -enable-misched < %s | FileCheck %s ; ; PR14315: misched should not move the physreg copy of %t below the calls. diff --git a/llvm/test/CodeGen/PowerPC/DisableHoistingDueToBlockHotnessProfileData.mir b/llvm/test/CodeGen/PowerPC/DisableHoistingDueToBlockHotnessProfileData.mir index 738aa1df5dd9d..a0139879f8c91 100644 --- a/llvm/test/CodeGen/PowerPC/DisableHoistingDueToBlockHotnessProfileData.mir +++ b/llvm/test/CodeGen/PowerPC/DisableHoistingDueToBlockHotnessProfileData.mir @@ -55,7 +55,7 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(i8*, i8**) #1 - attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind } !llvm.module.flags = !{!0, !1} diff --git a/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir b/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir index bcd51d31c6cfd..01ce79995512a 100644 --- a/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir +++ b/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir @@ -30,7 +30,7 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(i8*, i8**) #1 - attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" 
"target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind } !llvm.ident = !{!0} diff --git a/llvm/test/CodeGen/PowerPC/a2q-stackalign.ll b/llvm/test/CodeGen/PowerPC/a2q-stackalign.ll deleted file mode 100644 index 17e3df6d58ccc..0000000000000 --- a/llvm/test/CodeGen/PowerPC/a2q-stackalign.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64-- -mcpu=a2 | FileCheck -check-prefix=CHECK-A2 %s -; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64-- -mcpu=a2q | FileCheck -check-prefix=CHECK-A2Q %s -; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-bgq-linux -mcpu=a2 | FileCheck -check-prefix=CHECK-BGQ %s -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - -declare i32 @bar(i8* %a) nounwind; -define i32 @foo() nounwind { - %p = alloca i8, i8 115 - store i8 0, i8* %p - %r = call i32 @bar(i8* %p) - ret i32 %r -} - -; Without QPX, the allocated stack frame is 240 bytes, but with QPX -; (because we require 32-byte alignment), it is 256 bytes. -; CHECK-A2: @foo -; CHECK-A2: stdu 1, -240(1) -; CHECK-A2Q: @foo -; CHECK-A2Q: stdu 1, -256(1) -; CHECK-BGQ: @foo -; CHECK-BGQ: stdu 1, -256(1) - diff --git a/llvm/test/CodeGen/PowerPC/a2q.ll b/llvm/test/CodeGen/PowerPC/a2q.ll deleted file mode 100644 index 84e2dfa991d78..0000000000000 --- a/llvm/test/CodeGen/PowerPC/a2q.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64-- -mcpu=a2q | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64-- -mcpu=a2 -mattr=+qpx | FileCheck %s - -define void @foo() { -entry: - ret void -} - -; CHECK: @foo - diff --git a/llvm/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll b/llvm/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll index 1b0ea26f1fdea..d629148535aa7 100644 --- a/llvm/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll +++ b/llvm/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll @@ -298,7 +298,7 @@ _ZN10SubProcess12SafeSyscalls5fcntlEiil.exit: ; preds = %_ZN10SubProcess12Sa ; Function Attrs: nounwind argmemonly declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) #1 -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind argmemonly } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/asm-Zy.ll b/llvm/test/CodeGen/PowerPC/asm-Zy.ll index 78bb0f4c73eca..c8b5e9f1aa1d1 100644 --- a/llvm/test/CodeGen/PowerPC/asm-Zy.ll +++ b/llvm/test/CodeGen/PowerPC/asm-Zy.ll @@ -1,6 +1,5 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2 | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mcpu=a2 -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s target datalayout = 
"E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" define i32 @zytest(i32 %a) nounwind { entry: diff --git a/llvm/test/CodeGen/PowerPC/asm-constraints.ll b/llvm/test/CodeGen/PowerPC/asm-constraints.ll index a3e573d8935e9..da77d1a169792 100644 --- a/llvm/test/CodeGen/PowerPC/asm-constraints.ll +++ b/llvm/test/CodeGen/PowerPC/asm-constraints.ll @@ -65,7 +65,7 @@ entry: } -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-R0-special-handling.mir b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-R0-special-handling.mir index 2081e6fd02f51..904210ee13477 100644 --- a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-R0-special-handling.mir +++ b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-R0-special-handling.mir @@ -63,8 +63,8 @@ ret i64 %2 } - attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" 
"stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir index b52e0a4103add..f46d4fc0a42a4 100644 --- a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir +++ b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir @@ -187,7 +187,7 @@ ret i64 %cond } - attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir index 4d2595e1abdcb..ba950dc3d3ae9 100644 --- a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir +++ b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir @@ -983,10 +983,10 @@ ret i64 %xor } - attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #2 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" 
"target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,-vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #3 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #2 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #3 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/CodeGen/PowerPC/ctr-minmaxnum.ll b/llvm/test/CodeGen/PowerPC/ctr-minmaxnum.ll index ed3c9f07c1a85..75640d1d26072 100644 --- a/llvm/test/CodeGen/PowerPC/ctr-minmaxnum.ll +++ b/llvm/test/CodeGen/PowerPC/ctr-minmaxnum.ll @@ -1,5 +1,4 @@ ; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s -; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s --check-prefix=QPX declare float @fabsf(float) @@ -64,11 +63,6 @@ loop_exit: ; CHECK-NOT: xsmindp ; CHECK: blr -; QPX-LABEL: test1v: -; QPX: mtctr -; QPX-NOT: bl fminf -; QPX: blr - define void @test1a(float %f, float* %fp) { entry: br label %loop_body @@ 
-139,11 +133,6 @@ loop_exit: ; CHECK-NOT: xsmaxdp ; CHECK: blr -; QPX-LABEL: test2v: -; QPX: mtctr -; QPX-NOT: bl fmax -; QPX: blr - define void @test2a(float %f, float* %fp) { entry: br label %loop_body diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-shortLoops.ll b/llvm/test/CodeGen/PowerPC/ctrloop-shortLoops.ll index 44acfcdd6e66a..636c86b815c8c 100644 --- a/llvm/test/CodeGen/PowerPC/ctrloop-shortLoops.ll +++ b/llvm/test/CodeGen/PowerPC/ctrloop-shortLoops.ll @@ -1,5 +1,4 @@ ; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=pwr8 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR8 -; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=a2q | FileCheck %s --check-prefixes=CHECK,CHECK-A2Q ; Verify that we do NOT generate the mtctr instruction for loop trip counts < 4 ; The latency of the mtctr is only justified if there are more than 4 comparisons that are removed as a result. @@ -86,11 +85,8 @@ for.body: ; preds = %entry, %for.body } ; Function Attrs: norecurse nounwind -; On core a2q, IssueWidth is 1. On core pwr8, IssueWidth is 8. -; a2q should use mtctr, but pwr8 should not use mtctr. define signext i32 @testTripCount2NonSmallLoop() { ; CHECK-LABEL: testTripCount2NonSmallLoop: -; CHECK-A2Q: mtctr ; CHECK-PWR8-NOT: mtctr ; CHECK: blr @@ -121,12 +117,9 @@ for.end: ; preds = %if.end ret i32 %conv } -; On core a2q, IssueWidth is 1. On core pwr8, IssueWidth is 8. -; a2q should use mtctr, but pwr8 should not use mtctr. define signext i32 @testTripCount5() { ; CHECK-LABEL: testTripCount5: ; CHECK-PWR8-NOT: mtctr -; CHECK-A2Q: mtctr entry: %.prea = load i32, i32* @a, align 4 diff --git a/llvm/test/CodeGen/PowerPC/ec-input.ll b/llvm/test/CodeGen/PowerPC/ec-input.ll index 9a1c121699a69..425bc1985d419 100644 --- a/llvm/test/CodeGen/PowerPC/ec-input.ll +++ b/llvm/test/CodeGen/PowerPC/ec-input.ll @@ -5,7 +5,7 @@ ; that were both inputs to the inline asm and also early-clobber outputs). 
target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" %struct._IO_FILE.119.8249.32639.195239.200117.211499.218003.221255.222881.224507.226133.240767.244019.245645.248897.260279.271661.281417.283043.302555.304181.325319.326945.344713 = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker.118.8248.32638.195238.200116.211498.218002.221254.222880.224506.226132.240766.244018.245644.248896.260278.271660.281416.283042.302554.304180.325318.326944.344712*, %struct._IO_FILE.119.8249.32639.195239.200117.211499.218003.221255.222881.224507.226133.240767.244019.245645.248897.260279.271661.281417.283043.302555.304181.325319.326945.344713*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } %struct._IO_marker.118.8248.32638.195238.200116.211498.218002.221254.222880.224506.226132.240766.244018.245644.248896.260278.271660.281416.283042.302554.304180.325318.326944.344712 = type { %struct._IO_marker.118.8248.32638.195238.200116.211498.218002.221254.222880.224506.226132.240766.244018.245644.248896.260278.271660.281416.283042.302554.304180.325318.326944.344712*, %struct._IO_FILE.119.8249.32639.195239.200117.211499.218003.221255.222881.224507.226133.240767.244019.245645.248897.260279.271661.281417.283043.302555.304181.325319.326945.344713*, i32 } diff --git a/llvm/test/CodeGen/PowerPC/extra-toc-reg-deps.ll b/llvm/test/CodeGen/PowerPC/extra-toc-reg-deps.ll index e066b45d3ca4b..023928bcb5896 100644 --- a/llvm/test/CodeGen/PowerPC/extra-toc-reg-deps.ll +++ b/llvm/test/CodeGen/PowerPC/extra-toc-reg-deps.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64-unknown-linux" %"class.Foam::messageStream.6" = type <{ %"class.Foam::string.5", i32, i32, i32, [4 x i8] }> %"class.Foam::string.5" = type { %"class.std::basic_string.4" } @@ -419,8 +419,8 @@ declare void @_ZN4Foam11regIOobjectD2Ev() #0 declare void @_ZN4Foam6reduceIiNS_5sumOpIiEEEEvRKNS_4ListINS_8UPstream11commsStructEEERT_RKT0_ii() #0 -attributes #0 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="a2q" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { inlinehint "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="a2q" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { inlinehint "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0} diff --git a/llvm/test/CodeGen/PowerPC/fast-isel-icmp-split.ll b/llvm/test/CodeGen/PowerPC/fast-isel-icmp-split.ll index fdd0fc2767803..b08b050f2c2fd 100644 --- a/llvm/test/CodeGen/PowerPC/fast-isel-icmp-split.ll +++ b/llvm/test/CodeGen/PowerPC/fast-isel-icmp-split.ll @@ -1,6 +1,6 @@ ; RUN: llc -verify-machineinstrs -O0 -relocation-model=pic < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" 
-target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" %"class.std::__1::__tree_node.130.151" = type { %"class.std::__1::__tree_node_base.base.128.149", %"class.boost::serialization::extended_type_info.129.150"* } %"class.std::__1::__tree_node_base.base.128.149" = type <{ %"class.std::__1::__tree_end_node.127.148", %"class.std::__1::__tree_node_base.126.147"*, %"class.std::__1::__tree_node_base.126.147"*, i8 }> diff --git a/llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll b/llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll index eef6e0ccac02b..a336fc796ca52 100644 --- a/llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll +++ b/llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll @@ -33,4 +33,4 @@ define float @f(float %xf) #0 { ret float %25 } -attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll b/llvm/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll index 2feb4556dfab7..3b555cf898f57 100644 --- a/llvm/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll +++ b/llvm/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll @@ -1,6 +1,5 @@ -; RUN: llc -verify-machineinstrs -mcpu=a2 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=a2 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" define linkonce_odr double @test1(ppc_fp128 %input) { entry: diff --git a/llvm/test/CodeGen/PowerPC/glob-comp-aa-crash.ll b/llvm/test/CodeGen/PowerPC/glob-comp-aa-crash.ll index 54c3e11528b7b..2aa5239f25eb8 100644 --- a/llvm/test/CodeGen/PowerPC/glob-comp-aa-crash.ll +++ b/llvm/test/CodeGen/PowerPC/glob-comp-aa-crash.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux -mcpu=a2 < %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux -mcpu=a2 < %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" %"class.std::__1::__assoc_sub_state" = type { %"class.std::__1::__shared_count", %"class.std::__exception_ptr::exception_ptr", %"class.std::__1::mutex", %"class.std::__1::condition_variable", i32 } %"class.std::__1::__shared_count" = type { i32 (...)**, i64 } diff --git a/llvm/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll b/llvm/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll index 74bfa75e5e313..a2d0eb599f91d 100644 --- a/llvm/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll +++ b/llvm/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll @@ -33,5 +33,5 @@ declare i8* @_ZN11__sanitizer21internal_start_threadEPFvPvES0_(void (i8*)*, i8*) declare hidden void @_ZN11__sanitizer16BackgroundThreadEPv(i8* nocapture readnone) #5 -attributes #0 = { nounwind 
uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #7 = { nobuiltin nounwind } diff --git a/llvm/test/CodeGen/PowerPC/inlineasm-i64-reg.ll b/llvm/test/CodeGen/PowerPC/inlineasm-i64-reg.ll index e4dfd6c58f0e8..6f1bc76d816ae 100644 --- a/llvm/test/CodeGen/PowerPC/inlineasm-i64-reg.ll +++ b/llvm/test/CodeGen/PowerPC/inlineasm-i64-reg.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux -mcpu=a2 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux -mcpu=a2 < %s | FileCheck %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" %struct.BG_CoordinateMapping_t = type { [4 x i8] } diff --git a/llvm/test/CodeGen/PowerPC/load-two-flts.ll b/llvm/test/CodeGen/PowerPC/load-two-flts.ll index 1cfcff5e01601..19e21faf47232 100644 --- a/llvm/test/CodeGen/PowerPC/load-two-flts.ll +++ b/llvm/test/CodeGen/PowerPC/load-two-flts.ll @@ -1,6 +1,5 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" define void @_Z4testSt7complexIfE(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) { entry: diff --git a/llvm/test/CodeGen/PowerPC/loop-data-prefetch-inner.ll b/llvm/test/CodeGen/PowerPC/loop-data-prefetch-inner.ll index f4664788930d4..2cbb70bb14cb5 100644 --- a/llvm/test/CodeGen/PowerPC/loop-data-prefetch-inner.ll +++ b/llvm/test/CodeGen/PowerPC/loop-data-prefetch-inner.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -enable-ppc-prefetching=true -verify-machineinstrs < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" ; Function Attrs: nounwind define void @foo(double* %x, double* nocapture readonly %y) #0 { diff --git a/llvm/test/CodeGen/PowerPC/loop-data-prefetch.ll b/llvm/test/CodeGen/PowerPC/loop-data-prefetch.ll index f4821564c202b..defc52eec8e0d 100644 --- a/llvm/test/CodeGen/PowerPC/loop-data-prefetch.ll +++ b/llvm/test/CodeGen/PowerPC/loop-data-prefetch.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -mcpu=a2 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -enable-ppc-prefetching=true -mcpu=a2 < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" ; Function Attrs: nounwind define void @foo(double* nocapture %a, double* nocapture readonly %b) #0 { diff --git 
a/llvm/test/CodeGen/PowerPC/loop-prep-all.ll b/llvm/test/CodeGen/PowerPC/loop-prep-all.ll index a13192d3e6586..7fdabcd4be210 100644 --- a/llvm/test/CodeGen/PowerPC/loop-prep-all.ll +++ b/llvm/test/CodeGen/PowerPC/loop-prep-all.ll @@ -1,5 +1,4 @@ ; RUN: llc -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BGQ target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -21,7 +20,6 @@ for.body: ; preds = %for.body, %entry ; CHECK-LABEL: @foo -; CHECK-BGQ-DAG: dcbt 4, 5 ; CHECK-DAG: lfdu [[REG1:[0-9]+]], 8({{[0-9]+}}) ; CHECK-DAG: fadd [[REG2:[0-9]+]], [[REG1]], 0 ; CHECK-DAG: stfdu [[REG2]], 8({{[0-9]+}}) @@ -34,15 +32,13 @@ for.cond.cleanup6: ; preds = %for.body7 for.body7: ; preds = %for.body, %for.body7 %i3.017 = phi i32 [ %inc9, %for.body7 ], [ 0, %for.body ] - tail call void bitcast (void (...)* @bar to void ()*)() #2 + tail call void bitcast (void (...)* @bar to void ()*)() #0 %inc9 = add nuw nsw i32 %i3.017, 1 %exitcond = icmp eq i32 %inc9, 1024 br i1 %exitcond, label %for.cond.cleanup6, label %for.body7 } -declare void @bar(...) #1 +declare void @bar(...) -attributes #0 = { nounwind "target-cpu"="a2q" } -attributes #1 = { "target-cpu"="a2q" } -attributes #2 = { nounwind } +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/lxv-aligned-stack-slots.ll b/llvm/test/CodeGen/PowerPC/lxv-aligned-stack-slots.ll index 93868007d0d36..aa618d2b732c7 100644 --- a/llvm/test/CodeGen/PowerPC/lxv-aligned-stack-slots.ll +++ b/llvm/test/CodeGen/PowerPC/lxv-aligned-stack-slots.ll @@ -41,6 +41,6 @@ define void @aligned_slot() #0 { ; Function Attrs: argmemonly nounwind declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1 -attributes #0 = { nounwind "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/machine-combiner.ll b/llvm/test/CodeGen/PowerPC/machine-combiner.ll index 2e834b1fe788c..16fc3ee3e5202 100644 --- a/llvm/test/CodeGen/PowerPC/machine-combiner.ll +++ b/llvm/test/CodeGen/PowerPC/machine-combiner.ll @@ -1,5 +1,4 @@ ; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr7 < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-PWR -; RUN: llc -verify-machineinstrs -O3 -mcpu=a2q < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-QPX ; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 < %s | FileCheck %s -check-prefix=FIXPOINT target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -93,9 +92,6 @@ define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, floa define <4 x float> @vector_reassociate_adds1(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds1: ; CHECK: # %bb.0: -; CHECK-QPX: qvfadds [[REG0:[0-9]+]], 1, 2 -; CHECK-QPX: qvfadds [[REG1:[0-9]+]], 3, 4 -; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] @@ -110,9 
+106,6 @@ define <4 x float> @vector_reassociate_adds1(<4 x float> %x0, <4 x float> %x1, < define <4 x float> @vector_reassociate_adds2(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds2: ; CHECK: # %bb.0: -; CHECK-QPX: qvfadds [[REG0:[0-9]+]], 1, 2 -; CHECK-QPX: qvfadds [[REG1:[0-9]+]], 3, 4 -; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] @@ -127,9 +120,6 @@ define <4 x float> @vector_reassociate_adds2(<4 x float> %x0, <4 x float> %x1, < define <4 x float> @vector_reassociate_adds3(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds3: ; CHECK: # %bb.0: -; CHECK-QPX: qvfadds [[REG0:[0-9]+]], 1, 2 -; CHECK-QPX: qvfadds [[REG1:[0-9]+]], 3, 4 -; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] @@ -144,9 +134,6 @@ define <4 x float> @vector_reassociate_adds3(<4 x float> %x0, <4 x float> %x1, < define <4 x float> @vector_reassociate_adds4(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds4: ; CHECK: # %bb.0: -; CHECK-QPX: qvfadds [[REG0:[0-9]+]], 1, 2 -; CHECK-QPX: qvfadds [[REG1:[0-9]+]], 3, 4 -; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] @@ -217,9 +204,6 @@ define i64 @reassociate_mulld(i64 %x0, i64 %x1, i64 %x2, i64 %x3) { define double @reassociate_mamaa_double(double %0, double %1, double %2, double %3, double %4, double %5) { ; CHECK-LABEL: reassociate_mamaa_double: ; CHECK: # %bb.0: -; CHECK-QPX-DAG: fmadd [[REG0:[0-9]+]], 4, 3, 2 -; CHECK-QPX-DAG: fmadd [[REG1:[0-9]+]], 6, 5, 1 -; CHECK-QPX: fadd 1, [[REG0]], [[REG1]] ; CHECK-PWR-DAG: xsmaddadp 1, 6, 5 ; CHECK-PWR-DAG: xsmaddadp 2, 4, 3 ; CHECK-PWR: xsadddp 1, 2, 1 @@ -250,9 +234,6 @@ define float @reassociate_mamaa_float(float %0, float %1, float %2, float %3, fl define <4 x float> @reassociate_mamaa_vec(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5) { ; CHECK-LABEL: reassociate_mamaa_vec: ; CHECK: # %bb.0: -; CHECK-QPX-DAG: qvfmadds [[REG0:[0-9]+]], 4, 3, 2 -; CHECK-QPX-DAG: qvfmadds [[REG1:[0-9]+]], 6, 5, 1 -; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR-DAG: xvmaddasp [[REG0:[0-9]+]], 39, 38 ; CHECK-PWR-DAG: xvmaddasp [[REG1:[0-9]+]], 37, 36 ; CHECK-PWR: xvaddsp 34, [[REG1]], [[REG0]] @@ -268,11 +249,6 @@ define <4 x float> @reassociate_mamaa_vec(<4 x float> %0, <4 x float> %1, <4 x f define double @reassociate_mamama_double(double %0, double %1, double %2, double %3, double %4, double %5, double %6, double %7, double %8) { ; CHECK-LABEL: reassociate_mamama_double: ; CHECK: # %bb.0: -; CHECK-QPX: fmadd [[REG0:[0-9]+]], 2, 1, 7 -; CHECK-QPX-DAG: fmul [[REG1:[0-9]+]], 4, 3 -; CHECK-QPX-DAG: fmadd [[REG2:[0-9]+]], 6, 5, [[REG0]] -; CHECK-QPX-DAG: fmadd [[REG3:[0-9]+]], 9, 8, [[REG1]] -; CHECK-QPX: fadd 1, [[REG2]], [[REG3]] ; CHECK-PWR: xsmaddadp 7, 2, 1 ; CHECK-PWR-DAG: xsmuldp [[REG0:[0-9]+]], 4, 3 ; CHECK-PWR-DAG: xsmaddadp 7, 6, 5 diff --git a/llvm/test/CodeGen/PowerPC/mc-instrlat.ll b/llvm/test/CodeGen/PowerPC/mc-instrlat.ll index e135986a2894c..f807f4fa20d25 100644 --- a/llvm/test/CodeGen/PowerPC/mc-instrlat.ll +++ 
b/llvm/test/CodeGen/PowerPC/mc-instrlat.ll @@ -19,7 +19,7 @@ entry: declare void @bar(double) #1 -attributes #0 = { nounwind "no-infs-fp-math"="true" "no-nans-fp-math"="true" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-qpx,-vsx" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-qpx,-vsx" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #0 = { nounwind "no-infs-fp-math"="true" "no-nans-fp-math"="true" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #1 = { "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/mcount-insertion.ll b/llvm/test/CodeGen/PowerPC/mcount-insertion.ll index cbb7947be2198..502347a3af198 100644 --- a/llvm/test/CodeGen/PowerPC/mcount-insertion.ll +++ b/llvm/test/CodeGen/PowerPC/mcount-insertion.ll @@ -1,9 +1,8 @@ -; RUN: opt -ee-instrument < %s | opt -inline | llc | FileCheck %s +; RUN: opt -ee-instrument < %s | opt -inline | llc -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s ; The run-line mimics how Clang might run the instrumentation passes. target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" define void @leaf_function() #0 { diff --git a/llvm/test/CodeGen/PowerPC/memcpy-vec.ll b/llvm/test/CodeGen/PowerPC/memcpy-vec.ll index cd0abd6149bde..c4e60f8c4b1f5 100644 --- a/llvm/test/CodeGen/PowerPC/memcpy-vec.ll +++ b/llvm/test/CodeGen/PowerPC/memcpy-vec.ll @@ -1,6 +1,5 @@ ; RUN: llc -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s -check-prefix=PWR7 ; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s -check-prefix=PWR8 -; RUN: llc -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s -check-prefix=A2Q target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -25,12 +24,6 @@ entry: ; PWR8: lxvw4x ; PWR8: stxvw4x ; PWR8: blr - -; A2Q-LABEL: @foo1 -; A2Q-NOT: bl memcpy -; A2Q: ld {{[0-9]+}}, {{[0-9]+}}(4) -; A2Q: std {{[0-9]+}}, {{[0-9]+}}(3) -; A2Q: blr } ; Function Attrs: nounwind @@ -52,12 +45,6 @@ entry: ; PWR8: lxvw4x ; PWR8: stxvw4x ; PWR8: blr - -; A2Q-LABEL: @foo2 -; A2Q-NOT: bl memcpy -; A2Q: ld {{[0-9]+}}, {{[0-9]+}}(4) -; A2Q: std {{[0-9]+}}, {{[0-9]+}}(3) -; A2Q: blr } ; Function Attrs: nounwind @@ -76,11 +63,6 @@ entry: ; PWR8-NOT: bl memset ; PWR8: stxvw4x ; PWR8: blr - -; A2Q-LABEL: @bar1 -; A2Q-NOT: bl memset -; A2Q: std {{[0-9]+}}, {{[0-9]+}}(3) -; A2Q: blr } ; Function Attrs: nounwind @@ -99,11 +81,6 @@ entry: ; PWR8-NOT: bl memset ; PWR8: stxvw4x ; PWR8: blr - -; A2Q-LABEL: @bar2 -; A2Q-NOT: bl memset -; A2Q: qvstfdx -; A2Q: blr } ; Function Attrs: nounwind diff --git a/llvm/test/CodeGen/PowerPC/memset-nc.ll b/llvm/test/CodeGen/PowerPC/memset-nc.ll deleted file mode 100644 index 663d0cb1d6785..0000000000000 --- a/llvm/test/CodeGen/PowerPC/memset-nc.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -O0 < %s | FileCheck %s -check-prefix=CHECK-O0 -target datalayout = 
"E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" - -; Function Attrs: nounwind -define void @test_qpx() unnamed_addr #0 align 2 { -entry: - %0 = load i32, i32* undef, align 4 - %1 = trunc i32 %0 to i8 - call void @llvm.memset.p0i8.i64(i8* align 32 null, i8 %1, i64 64, i1 false) - ret void - -; CHECK-LABEL: @test_qpx -; CHECK: qvstfdx -; CHECK: qvstfdx -; CHECK: blr - -; CHECK-O0-LABEL: @test_qpx -; CHECK-O0-NOT: qvstfdx -; CHECK-O0: blr -} - -; Function Attrs: nounwind -declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) #1 - -; Function Attrs: nounwind -define void @test_vsx() unnamed_addr #2 align 2 { -entry: - %0 = load i32, i32* undef, align 4 - %1 = trunc i32 %0 to i8 - call void @llvm.memset.p0i8.i64(i8* null, i8 %1, i64 32, i1 false) - ret void - -; CHECK-LABEL: @test_vsx -; CHECK: stxvw4x -; CHECK: stxvw4x -; CHECK: blr - -; CHECK-O0-LABEL: @test_vsx -; CHECK-O0-NOT: stxvw4x -; CHECK-O0: blr -} - -attributes #0 = { nounwind "target-cpu"="a2q" } -attributes #1 = { nounwind } -attributes #2 = { nounwind "target-cpu"="pwr7" } - diff --git a/llvm/test/CodeGen/PowerPC/misched-inorder-latency.ll b/llvm/test/CodeGen/PowerPC/misched-inorder-latency.ll index 26663d81f3575..089c947713b9d 100644 --- a/llvm/test/CodeGen/PowerPC/misched-inorder-latency.ll +++ b/llvm/test/CodeGen/PowerPC/misched-inorder-latency.ll @@ -1,8 +1,7 @@ ; RUN: llc -verify-machineinstrs < %s -enable-misched -pre-RA-sched=source -scheditins=false \ -; RUN: -disable-ifcvt-triangle-false -disable-post-ra | FileCheck %s +; RUN: -disable-ifcvt-triangle-false -disable-post-ra -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s ; target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" ; %val1 is a load live out of %entry. It should be hoisted ; above the add. 
diff --git a/llvm/test/CodeGen/PowerPC/misched.ll b/llvm/test/CodeGen/PowerPC/misched.ll index 1c868b3f171c9..9a75fe44b7176 100644 --- a/llvm/test/CodeGen/PowerPC/misched.ll +++ b/llvm/test/CodeGen/PowerPC/misched.ll @@ -1,7 +1,6 @@ ; RUN: llc < %s -enable-misched -verify-machineinstrs ; PR14302 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" @b = external global [16000 x double], align 32 diff --git a/llvm/test/CodeGen/PowerPC/optnone-crbits-i1-ret.ll b/llvm/test/CodeGen/PowerPC/optnone-crbits-i1-ret.ll index f59df4291c48f..ad5976318fe3a 100644 --- a/llvm/test/CodeGen/PowerPC/optnone-crbits-i1-ret.ll +++ b/llvm/test/CodeGen/PowerPC/optnone-crbits-i1-ret.ll @@ -1,6 +1,5 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" declare zeroext i1 @ri1() declare void @se1() diff --git a/llvm/test/CodeGen/PowerPC/pcrel-local-caller-toc.ll b/llvm/test/CodeGen/PowerPC/pcrel-local-caller-toc.ll index 2e248506c7b7b..2871e077df565 100644 --- a/llvm/test/CodeGen/PowerPC/pcrel-local-caller-toc.ll +++ b/llvm/test/CodeGen/PowerPC/pcrel-local-caller-toc.ll @@ -92,7 +92,7 @@ entry: ; Left the target features in this test because it is important that caller has ; -pcrelative-memops while callee has +pcrelative-memops -attributes #0 = { nounwind "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+pcrelative-memops,+power8-vector,+power9-vector,+vsx,-htm,-qpx,-spe" } -attributes #1 = { "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+pcrelative-memops,+power8-vector,+power9-vector,+vsx,-htm,-qpx,-spe" } -attributes #2 = { nounwind "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+power9-vector,+vsx,-htm,-pcrelative-memops,-qpx,-spe" } +attributes #0 = { nounwind "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+pcrelative-memops,+power8-vector,+power9-vector,+vsx,-htm,-spe" } +attributes #1 = { "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+pcrelative-memops,+power8-vector,+power9-vector,+vsx,-htm,-spe" } +attributes #2 = { nounwind "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+power9-vector,+vsx,-htm,-pcrelative-memops,-spe" } attributes #3 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/popcnt.ll b/llvm/test/CodeGen/PowerPC/popcnt.ll index a06c59d4b945a..695863d87f16e 100644 --- a/llvm/test/CodeGen/PowerPC/popcnt.ll +++ b/llvm/test/CodeGen/PowerPC/popcnt.ll @@ -1,8 +1,6 @@ ; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mattr=+popcntd < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mattr=+slow-popcntd < %s | FileCheck %s --check-prefix=SLOWPC ; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mcpu=pwr7 < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mcpu=a2q < %s | FileCheck %s --check-prefix=SLOWPC -; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mcpu=a2q -mattr=+popcntd < %s | FileCheck %s define i64 @_cntb64(i64 %x) nounwind readnone { %cnt = tail call i64 @llvm.ppc.popcntb(i64 %x) diff --git a/llvm/test/CodeGen/PowerPC/ppc-passname.ll b/llvm/test/CodeGen/PowerPC/ppc-passname.ll index 98343bdb535c2..06f13278d84cd 100644 --- a/llvm/test/CodeGen/PowerPC/ppc-passname.ll +++ b/llvm/test/CodeGen/PowerPC/ppc-passname.ll @@ -105,14 
+105,3 @@ ; STOP-AFTER-BRANCH-COALESCING-NOT: "ppc-branch-coalescing" pass is not registered. ; STOP-AFTER-BRANCH-COALESCING: Branch Coalescing - -; Test pass name: ppc-qpx-load-splat. -; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-qpx-load-splat -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-QPX-LOAD-SPLAT -; STOP-BEFORE-QPX-LOAD-SPLAT-NOT: -ppc-qpx-load-splat -; STOP-BEFORE-QPX-LOAD-SPLAT-NOT: "ppc-qpx-load-splat" pass is not registered. -; STOP-BEFORE-QPX-LOAD-SPLAT-NOT: PowerPC QPX Load Splat Simplification - -; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-after=ppc-qpx-load-splat -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-AFTER-QPX-LOAD-SPLAT -; STOP-AFTER-QPX-LOAD-SPLAT: -ppc-qpx-load-splat -; STOP-AFTER-QPX-LOAD-SPLAT-NOT: "ppc-qpx-load-splat" pass is not registered. -; STOP-AFTER-QPX-LOAD-SPLAT: PowerPC QPX Load Splat Simplification diff --git a/llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll b/llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll index fc0e71f878cab..357f28e88b184 100644 --- a/llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll +++ b/llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-SCO -; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO-HASQPX -; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO-HASQPX +; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO +; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO ; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -code-model=small | FileCheck %s -check-prefix=SCM ; No combination of "powerpc64le-unknown-linux-gnu" + "CHECK-SCO", because @@ -117,23 +117,6 @@ define void @caller_local_sret_32(%S_32* %a) #1 { attributes #0 = { noinline nounwind } attributes #1 = { nounwind } -; vector <4 x i1> test - -define void @callee_v4i1(i8 %a, <4 x i1> %b, <4 x i1> %c) { ret void } -define void @caller_v4i1_reorder(i8 %a, <4 x i1> %b, <4 x i1> %c) { - tail call void @callee_v4i1(i8 %a, <4 x i1> %c, <4 x i1> %b) - ret void - -; <4 x i1> is 32 bytes aligned, if subtarget doesn't support qpx, then we can't -; place b, c to qpx register, so we can't do sco on caller_v4i1_reorder - -; CHECK-SCO-LABEL: caller_v4i1_reorder: -; CHECK-SCO: bl callee_v4i1 - -; CHECK-SCO-HASQPX-LABEL: caller_v4i1_reorder: -; CHECK-SCO-HASQPX: b callee_v4i1 -} - define void @f128_callee(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) { ret void } define void @f128_caller(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) { tail call void @f128_callee(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) diff --git a/llvm/test/CodeGen/PowerPC/pr24546.ll b/llvm/test/CodeGen/PowerPC/pr24546.ll index 28c03293680e5..028fd2d8f0064 100644 --- a/llvm/test/CodeGen/PowerPC/pr24546.ll +++ b/llvm/test/CodeGen/PowerPC/pr24546.ll @@ -47,8 +47,8 @@ declare double @pow(double, double) #0 
; Function Attrs: nounwind readnone declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2 -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #2 = { nounwind readnone } attributes #3 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/pr27350.ll b/llvm/test/CodeGen/PowerPC/pr27350.ll index 982023a1fcdc8..93dbd10fecdeb 100644 --- a/llvm/test/CodeGen/PowerPC/pr27350.ll +++ b/llvm/test/CodeGen/PowerPC/pr27350.ll @@ -18,7 +18,7 @@ entry: declare fastcc void @bar([2 x i64], [2 x i64]) unnamed_addr #1 align 2 attributes #0 = { argmemonly nounwind } -attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #2 = { nounwind } !llvm.ident = !{!0} diff --git a/llvm/test/CodeGen/PowerPC/pr28130.ll b/llvm/test/CodeGen/PowerPC/pr28130.ll index cb703dfda8a59..4da415bd29269 100644 --- a/llvm/test/CodeGen/PowerPC/pr28130.ll +++ b/llvm/test/CodeGen/PowerPC/pr28130.ll @@ -67,4 +67,4 @@ bb: ret void } -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" 
"no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/PowerPC/preinc-ld-sel-crash.ll b/llvm/test/CodeGen/PowerPC/preinc-ld-sel-crash.ll index 04dee1ee182bb..35aec57ec2640 100644 --- a/llvm/test/CodeGen/PowerPC/preinc-ld-sel-crash.ll +++ b/llvm/test/CodeGen/PowerPC/preinc-ld-sel-crash.ll @@ -1,6 +1,6 @@ ; RUN: llc -verify-machineinstrs < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" %t1 = type { %t2*, %t3* } %t2 = type <{ %t3*, i32, [4 x i8] }> diff --git a/llvm/test/CodeGen/PowerPC/qpx-bv-sint.ll b/llvm/test/CodeGen/PowerPC/qpx-bv-sint.ll deleted file mode 100644 index 4e0aef4c3df71..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-bv-sint.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" - -define void @s452(i32 %inp1) nounwind { -entry: - br label %for.body4 - -for.body4: ; preds = %for.body4, %entry - %conv.4 = sitofp i32 %inp1 to double - %conv.5 = sitofp i32 %inp1 to double - %mul.4.v.i0.1 = insertelement <2 x double> undef, double %conv.4, i32 0 - %v = insertelement <2 x double> %mul.4.v.i0.1, double %conv.5, i32 1 - %vv = fmul <2 x double> %v, %v - %add7.4 = fadd <2 x double> %vv, %vv - store <2 x double> %add7.4, <2 x double>* undef, align 16 - br i1 undef, label %for.end, label %for.body4 - -for.end: ; preds = %for.body4 - unreachable -; CHECK-LABEL: @s452 -; CHECK: lfiwax [[REG1:[0-9]+]], -; CHECK: fcfid [[REG2:[0-9]+]], [[REG1]] -; FIXME: We could 'promote' this to a vector earlier and remove this splat. -; CHECK: qvesplati {{[0-9]+}}, [[REG2]], 0 -; CHECK: qvfmul -; CHECK: qvfadd -; CHECK: qvesplati {{[0-9]+}}, -; FIXME: We can use qvstfcdx here instead of two stores. 
-; CHECK: stfd -; CHECK: stfd -} - diff --git a/llvm/test/CodeGen/PowerPC/qpx-bv.ll b/llvm/test/CodeGen/PowerPC/qpx-bv.ll deleted file mode 100644 index 93a739b864c1d..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-bv.ll +++ /dev/null @@ -1,37 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s - -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" - -define <4 x double> @foo(double %f1, double %f2, double %f3, double %f4) { - %v1 = insertelement <4 x double> undef, double %f1, i32 0 - %v2 = insertelement <4 x double> %v1, double %f2, i32 1 - %v3 = insertelement <4 x double> %v2, double %f3, i32 2 - %v4 = insertelement <4 x double> %v3, double %f4, i32 3 - ret <4 x double> %v4 - -; CHECK-LABEL: @foo -; CHECK: qvgpci [[REG1:[0-9]+]], 275 -; CHECK-DAG: qvgpci [[REG2:[0-9]+]], 101 -; CHECK-DAG: qvfperm [[REG3:[0-9]+]], 3, 4, [[REG1]] -; CHECK-DAG: qvfperm [[REG4:[0-9]+]], 1, 2, [[REG1]] -; CHECK-DAG: qvfperm 1, [[REG4]], [[REG3]], [[REG2]] -; CHECK: blr -} - -define <4 x float> @goo(float %f1, float %f2, float %f3, float %f4) { - %v1 = insertelement <4 x float> undef, float %f1, i32 0 - %v2 = insertelement <4 x float> %v1, float %f2, i32 1 - %v3 = insertelement <4 x float> %v2, float %f3, i32 2 - %v4 = insertelement <4 x float> %v3, float %f4, i32 3 - ret <4 x float> %v4 - -; CHECK-LABEL: @goo -; CHECK: qvgpci [[REG1:[0-9]+]], 275 -; CHECK-DAG: qvgpci [[REG2:[0-9]+]], 101 -; CHECK-DAG: qvfperm [[REG3:[0-9]+]], 3, 4, [[REG1]] -; CHECK-DAG: qvfperm [[REG4:[0-9]+]], 1, 2, [[REG1]] -; CHECK-DAG: qvfperm 1, [[REG4]], [[REG3]], [[REG2]] -; CHECK: blr -} - diff --git a/llvm/test/CodeGen/PowerPC/qpx-func-clobber.ll b/llvm/test/CodeGen/PowerPC/qpx-func-clobber.ll deleted file mode 100644 index ccbbd162a0cdb..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-func-clobber.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -declare <4 x double> @foo(<4 x double> %p) - -define <4 x double> @bar(<4 x double> %p, <4 x double> %q) { -entry: - %v = call <4 x double> @foo(<4 x double> %p) - %w = call <4 x double> @foo(<4 x double> %q) - %x = fadd <4 x double> %v, %w - ret <4 x double> %x - -; CHECK-LABEL: @bar -; CHECK: qvstfdx 2, -; CHECK: bl foo -; CHECK: qvstfdx 1, -; CHECK: qvlfdx 1, -; CHECK: bl foo -; CHECK: qvlfdx [[REG:[0-9]+]], -; CHECK: qvfadd 1, [[REG]], 1 -} - diff --git a/llvm/test/CodeGen/PowerPC/qpx-load-splat.ll b/llvm/test/CodeGen/PowerPC/qpx-load-splat.ll deleted file mode 100644 index 50b864980d985..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-load-splat.ll +++ /dev/null @@ -1,80 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -ppc-vsr-nums-as-vr \ -; RUN: -ppc-asm-full-reg-names -verify-machineinstrs < %s | FileCheck %s - -; Function Attrs: norecurse nounwind readonly -define <4 x double> @foo(double* nocapture readonly %a) #0 { -; CHECK-LABEL: foo: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lxvdsx v2, 0, r3 -; CHECK-NEXT: vmr v3, v2 -; CHECK-NEXT: blr -entry: - %0 = load double, double* %a, align 8 - %vecinit.i = insertelement <4 x double> undef, double %0, i32 0 - %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer - ret <4 x double> %shuffle.i -} - -define <4 x double> @foox(double* nocapture readonly %a, i64 %idx) #0 { -; 
CHECK-LABEL: foox: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: sldi r4, r4, 3 -; CHECK-NEXT: lxvdsx v2, r3, r4 -; CHECK-NEXT: vmr v3, v2 -; CHECK-NEXT: blr -entry: - %p = getelementptr double, double* %a, i64 %idx - %0 = load double, double* %p, align 8 - %vecinit.i = insertelement <4 x double> undef, double %0, i32 0 - %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer - ret <4 x double> %shuffle.i -} - -define <4 x double> @fooxu(double* nocapture readonly %a, i64 %idx, double** %pptr) #0 { -; CHECK-LABEL: fooxu: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: sldi r4, r4, 3 -; CHECK-NEXT: add r6, r3, r4 -; CHECK-NEXT: std r6, 0(r5) -; CHECK-NEXT: lxvdsx v2, r3, r4 -; CHECK-NEXT: vmr v3, v2 -; CHECK-NEXT: blr -entry: - %p = getelementptr double, double* %a, i64 %idx - %0 = load double, double* %p, align 8 - %vecinit.i = insertelement <4 x double> undef, double %0, i32 0 - %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer - store double* %p, double** %pptr, align 8 - ret <4 x double> %shuffle.i -} - -define <4 x float> @foof(float* nocapture readonly %a) #0 { -; CHECK-LABEL: foof: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lfiwzx f0, 0, r3 -; CHECK-NEXT: xxspltw v2, vs0, 1 -; CHECK-NEXT: blr -entry: - %0 = load float, float* %a, align 4 - %vecinit.i = insertelement <4 x float> undef, float %0, i32 0 - %shuffle.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer - ret <4 x float> %shuffle.i -} - -define <4 x float> @foofx(float* nocapture readonly %a, i64 %idx) #0 { -; CHECK-LABEL: foofx: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: sldi r4, r4, 2 -; CHECK-NEXT: lfiwzx f0, r3, r4 -; CHECK-NEXT: xxspltw v2, vs0, 1 -; CHECK-NEXT: blr -entry: - %p = getelementptr float, float* %a, i64 %idx - %0 = load float, float* %p, align 4 - %vecinit.i = insertelement <4 x float> undef, float %0, i32 0 - %shuffle.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer - ret <4 x float> %shuffle.i -} - - diff --git a/llvm/test/CodeGen/PowerPC/qpx-load.ll b/llvm/test/CodeGen/PowerPC/qpx-load.ll deleted file mode 100644 index 514f0934b6cfc..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-load.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -define <4 x double> @foo(<4 x double>* %p) { -entry: - %v = load <4 x double>, <4 x double>* %p, align 8 - ret <4 x double> %v -} - -; CHECK: @foo -; CHECK-DAG: li [[REG1:[0-9]+]], 31 -; CHECK-DAG: qvlfdx [[REG4:[0-9]+]], 0, 3 -; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], 3, [[REG1]] -; CHECK-DAG: qvlpcldx [[REG3:[0-9]+]], 0, 3 -; CHECK-DAG: qvfperm 1, [[REG4]], [[REG2]], [[REG3]] -; CHECK: blr - -define <4 x double> @bar(<4 x double>* %p) { -entry: - %v = load <4 x double>, <4 x double>* %p, align 32 - ret <4 x double> %v -} - -; CHECK: @bar -; CHECK: qvlfdx - diff --git a/llvm/test/CodeGen/PowerPC/qpx-qvfmadd.ll b/llvm/test/CodeGen/PowerPC/qpx-qvfmadd.ll deleted file mode 100644 index eab4d6af7e9fc..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-qvfmadd.ll +++ /dev/null @@ -1,79 +0,0 @@ -; RUN: llc -verify-machineinstrs -stop-after=finalize-isel < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -define <2 x double> @test_qvfmadd(<2 x double> %0, <2 x double> %1, <2 x double> %2) { -; CHECK: test_qvfmadd -; CHECK: QVFMADD %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <2 x double> %2, %1 - %5 = fadd reassoc 
nsz <2 x double> %4, %0 - ret <2 x double> %5 -} - -define <4 x float> @test_qvfmadds(<4 x float> %0, <4 x float> %1, <4 x float> %2) { -; CHECK: test_qvfmadds -; CHECK: QVFMADDSs %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <4 x float> %2, %1 - %5 = fadd reassoc nsz <4 x float> %4, %0 - ret <4 x float> %5 -} - -define <2 x double> @test_qvfnmadd(<2 x double> %0, <2 x double> %1, <2 x double> %2) { -; CHECK: test_qvfnmadd -; CHECK: QVFNMADD %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <2 x double> %2, %1 - %5 = fadd reassoc nsz <2 x double> %4, %0 - %6 = fneg reassoc nsz <2 x double> %5 - ret <2 x double> %6 -} - -define <4 x float> @test_qvfnmadds(<4 x float> %0, <4 x float> %1, <4 x float> %2) { -; CHECK: test_qvfnmadds -; CHECK: QVFNMADDSs %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <4 x float> %2, %1 - %5 = fadd reassoc nsz <4 x float> %4, %0 - %6 = fneg reassoc nsz <4 x float> %5 - ret <4 x float> %6 -} - -define <2 x double> @test_qvfmsub(<2 x double> %0, <2 x double> %1, <2 x double> %2) { -; CHECK: test_qvfmsub -; CHECK: QVFMSUB %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <2 x double> %2, %1 - %5 = fsub reassoc nsz <2 x double> %4, %0 - ret <2 x double> %5 -} - -define <4 x float> @test_qvfmsubs(<4 x float> %0, <4 x float> %1, <4 x float> %2) { -; CHECK: test_qvfmsubs -; CHECK: QVFMSUBSs %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <4 x float> %2, %1 - %5 = fsub reassoc nsz <4 x float> %4, %0 - ret <4 x float> %5 -} - -define <2 x double> @test_qvfnmsub(<2 x double> %0, <2 x double> %1, <2 x double> %2) { -; CHECK: test_qvfnmsub -; CHECK: QVFNMSUB %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <2 x double> %2, %1 - %5 = fsub reassoc nsz <2 x double> %4, %0 - %6 = fneg reassoc nsz <2 x double> %5 - ret <2 x double> %6 -} - -define <4 x float> @test_qvfnmsubs(<4 x float> %0, <4 x float> %1, <4 x float> %2) { -; CHECK: test_qvfnmsubs -; CHECK: QVFNMSUBSs %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <4 x float> %2, %1 - %5 = fsub reassoc nsz <4 x float> %4, %0 - %6 = fneg reassoc nsz <4 x float> %5 - ret <4 x float> %6 -} - diff --git a/llvm/test/CodeGen/PowerPC/qpx-recipest.ll b/llvm/test/CodeGen/PowerPC/qpx-recipest.ll deleted file mode 100644 index 498ab62819ced..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-recipest.ll +++ /dev/null @@ -1,473 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2q | FileCheck %s -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - -declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) -declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) - -define <4 x double> @foo_fmf(<4 x double> %a, <4 x double> %b) nounwind { -; CHECK-LABEL: foo_fmf: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha -; CHECK-NEXT: qvfrsqrte 3, 2 -; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l -; CHECK-NEXT: qvlfdx 0, 0, 3 -; CHECK-NEXT: qvfmul 4, 3, 3 -; CHECK-NEXT: qvfmsub 2, 2, 0, 2 -; CHECK-NEXT: qvfnmsub 4, 2, 4, 0 -; CHECK-NEXT: qvfmul 3, 3, 4 -; CHECK-NEXT: qvfmul 4, 3, 3 -; CHECK-NEXT: qvfnmsub 0, 2, 4, 0 -; CHECK-NEXT: qvfmul 0, 3, 0 -; CHECK-NEXT: qvfmul 1, 1, 0 -; CHECK-NEXT: blr -entry: - %x = call ninf afn reassoc <4 x double> @llvm.sqrt.v4f64(<4 x double> %b) - %r = fdiv arcp reassoc <4 x double> %a, %x - ret <4 x double> %r -} - -define <4 x double> @foo_safe(<4 
x double> %a, <4 x double> %b) nounwind { -; CHECK-LABEL: foo_safe: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 5, 2, 3 -; CHECK-NEXT: qvesplati 3, 2, 1 -; CHECK-NEXT: qvesplati 4, 2, 2 -; CHECK-NEXT: fsqrt 2, 2 -; CHECK-NEXT: fsqrt 5, 5 -; CHECK-NEXT: fsqrt 4, 4 -; CHECK-NEXT: fsqrt 3, 3 -; CHECK-NEXT: qvesplati 6, 1, 3 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: fdiv 2, 1, 2 -; CHECK-NEXT: fdiv 5, 6, 5 -; CHECK-NEXT: qvesplati 6, 1, 2 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fdiv 4, 6, 4 -; CHECK-NEXT: fdiv 1, 1, 3 -; CHECK-NEXT: qvfperm 3, 4, 5, 0 -; CHECK-NEXT: qvfperm 0, 2, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 3, 1 -; CHECK-NEXT: blr -entry: - %x = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %b) - %r = fdiv <4 x double> %a, %x - ret <4 x double> %r -} - -define <4 x double> @foof_fmf(<4 x double> %a, <4 x float> %b) nounwind { -; CHECK-LABEL: foof_fmf: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI2_0@toc@ha -; CHECK-NEXT: qvfrsqrtes 3, 2 -; CHECK-NEXT: addi 3, 3, .LCPI2_0@toc@l -; CHECK-NEXT: qvlfsx 0, 0, 3 -; CHECK-NEXT: qvfmuls 4, 3, 3 -; CHECK-NEXT: qvfmsubs 2, 2, 0, 2 -; CHECK-NEXT: qvfnmsubs 0, 2, 4, 0 -; CHECK-NEXT: qvfmuls 0, 3, 0 -; CHECK-NEXT: qvfmul 1, 1, 0 -; CHECK-NEXT: blr -entry: - %x = call afn ninf reassoc <4 x float> @llvm.sqrt.v4f32(<4 x float> %b) - %y = fpext <4 x float> %x to <4 x double> - %r = fdiv arcp reassoc nsz <4 x double> %a, %y - ret <4 x double> %r -} - -define <4 x double> @foof_safe(<4 x double> %a, <4 x float> %b) nounwind { -; CHECK-LABEL: foof_safe: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 0, 2, 3 -; CHECK-NEXT: qvesplati 3, 2, 2 -; CHECK-NEXT: fsqrts 4, 2 -; CHECK-NEXT: qvesplati 2, 2, 1 -; CHECK-NEXT: fsqrts 0, 0 -; CHECK-NEXT: fsqrts 3, 3 -; CHECK-NEXT: fsqrts 2, 2 -; CHECK-NEXT: qvgpci 5, 275 -; CHECK-NEXT: qvgpci 6, 101 -; CHECK-NEXT: qvfperm 0, 3, 0, 5 -; CHECK-NEXT: qvesplati 3, 1, 2 -; CHECK-NEXT: qvfperm 2, 4, 2, 5 -; CHECK-NEXT: qvfperm 0, 2, 0, 6 -; CHECK-NEXT: qvesplati 2, 1, 3 -; CHECK-NEXT: qvesplati 4, 0, 3 -; CHECK-NEXT: fdiv 2, 2, 4 -; CHECK-NEXT: qvesplati 4, 0, 2 -; CHECK-NEXT: fdiv 3, 3, 4 -; CHECK-NEXT: qvesplati 4, 1, 1 -; CHECK-NEXT: fdiv 1, 1, 0 -; CHECK-NEXT: qvesplati 0, 0, 1 -; CHECK-NEXT: fdiv 0, 4, 0 -; CHECK-NEXT: qvfperm 2, 3, 2, 5 -; CHECK-NEXT: qvfperm 0, 1, 0, 5 -; CHECK-NEXT: qvfperm 1, 0, 2, 6 -; CHECK-NEXT: blr -entry: - %x = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %b) - %y = fpext <4 x float> %x to <4 x double> - %r = fdiv <4 x double> %a, %y - ret <4 x double> %r -} - -define <4 x float> @food_fmf(<4 x float> %a, <4 x double> %b) nounwind { -; CHECK-LABEL: food_fmf: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI4_0@toc@ha -; CHECK-NEXT: qvfrsqrte 3, 2 -; CHECK-NEXT: addi 3, 3, .LCPI4_0@toc@l -; CHECK-NEXT: qvlfdx 0, 0, 3 -; CHECK-NEXT: qvfmul 4, 3, 3 -; CHECK-NEXT: qvfmsub 2, 2, 0, 2 -; CHECK-NEXT: qvfnmsub 4, 2, 4, 0 -; CHECK-NEXT: qvfmul 3, 3, 4 -; CHECK-NEXT: qvfmul 4, 3, 3 -; CHECK-NEXT: qvfnmsub 0, 2, 4, 0 -; CHECK-NEXT: qvfmul 0, 3, 0 -; CHECK-NEXT: qvfrsp 0, 0 -; CHECK-NEXT: qvfmuls 1, 1, 0 -; CHECK-NEXT: blr -entry: - %x = call afn ninf reassoc <4 x double> @llvm.sqrt.v4f64(<4 x double> %b) - %y = fptrunc <4 x double> %x to <4 x float> - %r = fdiv arcp reassoc <4 x float> %a, %y - ret <4 x float> %r -} - -define <4 x float> @food_safe(<4 x float> %a, <4 x double> %b) nounwind { -; CHECK-LABEL: food_safe: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 0, 2, 3 -; CHECK-NEXT: qvesplati 3, 2, 2 -; CHECK-NEXT: 
fsqrt 4, 2 -; CHECK-NEXT: qvesplati 2, 2, 1 -; CHECK-NEXT: fsqrt 0, 0 -; CHECK-NEXT: fsqrt 3, 3 -; CHECK-NEXT: fsqrt 2, 2 -; CHECK-NEXT: qvgpci 5, 275 -; CHECK-NEXT: qvgpci 6, 101 -; CHECK-NEXT: qvfperm 0, 3, 0, 5 -; CHECK-NEXT: qvesplati 3, 1, 2 -; CHECK-NEXT: qvfperm 2, 4, 2, 5 -; CHECK-NEXT: qvfperm 0, 2, 0, 6 -; CHECK-NEXT: qvesplati 2, 1, 3 -; CHECK-NEXT: qvfrsp 0, 0 -; CHECK-NEXT: qvesplati 4, 0, 3 -; CHECK-NEXT: fdivs 2, 2, 4 -; CHECK-NEXT: qvesplati 4, 0, 2 -; CHECK-NEXT: fdivs 3, 3, 4 -; CHECK-NEXT: qvesplati 4, 1, 1 -; CHECK-NEXT: fdivs 1, 1, 0 -; CHECK-NEXT: qvesplati 0, 0, 1 -; CHECK-NEXT: fdivs 0, 4, 0 -; CHECK-NEXT: qvfperm 2, 3, 2, 5 -; CHECK-NEXT: qvfperm 0, 1, 0, 5 -; CHECK-NEXT: qvfperm 1, 0, 2, 6 -; CHECK-NEXT: blr -entry: - %x = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %b) - %y = fptrunc <4 x double> %x to <4 x float> - %r = fdiv <4 x float> %a, %y - ret <4 x float> %r -} - -define <4 x float> @goo_fmf(<4 x float> %a, <4 x float> %b) nounwind { -; CHECK-LABEL: goo_fmf: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI6_0@toc@ha -; CHECK-NEXT: qvfrsqrtes 3, 2 -; CHECK-NEXT: addi 3, 3, .LCPI6_0@toc@l -; CHECK-NEXT: qvlfsx 0, 0, 3 -; CHECK-NEXT: qvfmuls 4, 3, 3 -; CHECK-NEXT: qvfmsubs 2, 2, 0, 2 -; CHECK-NEXT: qvfnmsubs 0, 2, 4, 0 -; CHECK-NEXT: qvfmuls 0, 3, 0 -; CHECK-NEXT: qvfmuls 1, 1, 0 -; CHECK-NEXT: blr -entry: - %x = call afn ninf reassoc <4 x float> @llvm.sqrt.v4f32(<4 x float> %b) - %r = fdiv arcp reassoc nsz <4 x float> %a, %x - ret <4 x float> %r -} - -define <4 x float> @goo_safe(<4 x float> %a, <4 x float> %b) nounwind { -; CHECK-LABEL: goo_safe: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 5, 2, 3 -; CHECK-NEXT: qvesplati 3, 2, 1 -; CHECK-NEXT: qvesplati 4, 2, 2 -; CHECK-NEXT: fsqrts 2, 2 -; CHECK-NEXT: fsqrts 5, 5 -; CHECK-NEXT: fsqrts 4, 4 -; CHECK-NEXT: fsqrts 3, 3 -; CHECK-NEXT: qvesplati 6, 1, 3 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: fdivs 2, 1, 2 -; CHECK-NEXT: fdivs 5, 6, 5 -; CHECK-NEXT: qvesplati 6, 1, 2 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fdivs 4, 6, 4 -; CHECK-NEXT: fdivs 1, 1, 3 -; CHECK-NEXT: qvfperm 3, 4, 5, 0 -; CHECK-NEXT: qvfperm 0, 2, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 3, 1 -; CHECK-NEXT: blr -entry: - %x = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %b) - %r = fdiv <4 x float> %a, %x - ret <4 x float> %r -} - -define <4 x double> @foo2_fmf(<4 x double> %a, <4 x double> %b) nounwind { -; CHECK-LABEL: foo2_fmf: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI8_0@toc@ha -; CHECK-NEXT: qvfre 3, 2 -; CHECK-NEXT: addi 3, 3, .LCPI8_0@toc@l -; CHECK-NEXT: qvlfdx 0, 0, 3 -; CHECK-NEXT: qvfmadd 0, 2, 3, 0 -; CHECK-NEXT: qvfnmsub 0, 3, 0, 3 -; CHECK-NEXT: qvfmul 3, 1, 0 -; CHECK-NEXT: qvfnmsub 1, 2, 3, 1 -; CHECK-NEXT: qvfmadd 1, 0, 1, 3 -; CHECK-NEXT: blr -entry: - %r = fdiv arcp reassoc nsz ninf <4 x double> %a, %b - ret <4 x double> %r -} - -define <4 x double> @foo2_safe(<4 x double> %a, <4 x double> %b) nounwind { -; CHECK-LABEL: foo2_safe: -; CHECK: # %bb.0: -; CHECK-NEXT: qvesplati 3, 2, 3 -; CHECK-NEXT: qvesplati 4, 1, 3 -; CHECK-NEXT: qvesplati 5, 2, 2 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: fdiv 3, 4, 3 -; CHECK-NEXT: qvesplati 4, 1, 2 -; CHECK-NEXT: fdiv 4, 4, 5 -; CHECK-NEXT: fdiv 5, 1, 2 -; CHECK-NEXT: qvesplati 2, 2, 1 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fdiv 1, 1, 2 -; CHECK-NEXT: qvfperm 2, 4, 3, 0 -; CHECK-NEXT: qvfperm 0, 5, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 2, 1 -; CHECK-NEXT: blr - %r = fdiv <4 x 
double> %a, %b - ret <4 x double> %r -} - -define <4 x float> @goo2_fmf(<4 x float> %a, <4 x float> %b) nounwind { -; CHECK-LABEL: goo2_fmf: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvfres 0, 2 -; CHECK-NEXT: qvfmuls 3, 1, 0 -; CHECK-NEXT: qvfnmsubs 1, 2, 3, 1 -; CHECK-NEXT: qvfmadds 1, 0, 1, 3 -; CHECK-NEXT: blr -entry: - %r = fdiv arcp reassoc ninf <4 x float> %a, %b - ret <4 x float> %r -} - -define <4 x float> @goo2_safe(<4 x float> %a, <4 x float> %b) nounwind { -; CHECK-LABEL: goo2_safe: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 3, 2, 3 -; CHECK-NEXT: qvesplati 4, 1, 3 -; CHECK-NEXT: qvesplati 5, 2, 2 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: fdivs 3, 4, 3 -; CHECK-NEXT: qvesplati 4, 1, 2 -; CHECK-NEXT: fdivs 4, 4, 5 -; CHECK-NEXT: fdivs 5, 1, 2 -; CHECK-NEXT: qvesplati 2, 2, 1 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fdivs 1, 1, 2 -; CHECK-NEXT: qvfperm 2, 4, 3, 0 -; CHECK-NEXT: qvfperm 0, 5, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 2, 1 -; CHECK-NEXT: blr -entry: - %r = fdiv <4 x float> %a, %b - ret <4 x float> %r -} - -define <4 x double> @foo3_fmf_denorm_on(<4 x double> %a) #0 { -; CHECK-LABEL: foo3_fmf_denorm_on: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI12_0@toc@ha -; CHECK-NEXT: qvfrsqrte 0, 1 -; CHECK-NEXT: addi 3, 3, .LCPI12_0@toc@l -; CHECK-NEXT: qvlfdx 2, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI12_1@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI12_1@toc@l -; CHECK-NEXT: qvfmul 3, 0, 0 -; CHECK-NEXT: qvfmsub 4, 1, 2, 1 -; CHECK-NEXT: qvfnmsub 3, 4, 3, 2 -; CHECK-NEXT: qvfmul 0, 0, 3 -; CHECK-NEXT: qvfmul 3, 0, 0 -; CHECK-NEXT: qvfnmsub 2, 4, 3, 2 -; CHECK-NEXT: qvfmul 0, 0, 2 -; CHECK-NEXT: qvlfdx 2, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI12_2@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI12_2@toc@l -; CHECK-NEXT: qvlfdx 3, 0, 3 -; CHECK-NEXT: qvfmul 0, 0, 1 -; CHECK-NEXT: qvfabs 1, 1 -; CHECK-NEXT: qvfcmplt 1, 1, 2 -; CHECK-NEXT: qvfsel 1, 1, 3, 0 -; CHECK-NEXT: blr -entry: - %r = call reassoc ninf afn <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) - ret <4 x double> %r -} - -define <4 x double> @foo3_fmf_denorm_off(<4 x double> %a) #1 { -; CHECK-LABEL: foo3_fmf_denorm_off: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI13_0@toc@ha -; CHECK-NEXT: qvfrsqrte 0, 1 -; CHECK-NEXT: addi 3, 3, .LCPI13_0@toc@l -; CHECK-NEXT: qvlfdx 2, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI13_1@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI13_1@toc@l -; CHECK-NEXT: qvfmul 3, 0, 0 -; CHECK-NEXT: qvfmsub 4, 1, 2, 1 -; CHECK-NEXT: qvfnmsub 3, 4, 3, 2 -; CHECK-NEXT: qvfmul 0, 0, 3 -; CHECK-NEXT: qvfmul 3, 0, 0 -; CHECK-NEXT: qvfnmsub 2, 4, 3, 2 -; CHECK-NEXT: qvfmul 0, 0, 2 -; CHECK-NEXT: qvlfdx 2, 0, 3 -; CHECK-NEXT: qvfmul 0, 0, 1 -; CHECK-NEXT: qvfcmpeq 1, 1, 2 -; CHECK-NEXT: qvfsel 1, 1, 2, 0 -; CHECK-NEXT: blr -entry: - %r = call afn reassoc ninf <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) - ret <4 x double> %r -} - -define <4 x double> @foo3_safe_denorm_on(<4 x double> %a) #0 { -; CHECK-LABEL: foo3_safe_denorm_on: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 2, 1, 3 -; CHECK-NEXT: qvesplati 3, 1, 2 -; CHECK-NEXT: fsqrt 4, 1 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fsqrt 2, 2 -; CHECK-NEXT: fsqrt 3, 3 -; CHECK-NEXT: fsqrt 1, 1 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: qvfperm 2, 3, 2, 0 -; CHECK-NEXT: qvfperm 0, 4, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 2, 1 -; CHECK-NEXT: blr -entry: - %r = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) - ret <4 x double> %r -} - -define <4 x double> 
@foo3_safe_denorm_off(<4 x double> %a) #1 { -; CHECK-LABEL: foo3_safe_denorm_off: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 2, 1, 3 -; CHECK-NEXT: qvesplati 3, 1, 2 -; CHECK-NEXT: fsqrt 4, 1 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fsqrt 2, 2 -; CHECK-NEXT: fsqrt 3, 3 -; CHECK-NEXT: fsqrt 1, 1 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: qvfperm 2, 3, 2, 0 -; CHECK-NEXT: qvfperm 0, 4, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 2, 1 -; CHECK-NEXT: blr -entry: - %r = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) - ret <4 x double> %r -} - -define <4 x float> @goo3_fmf_denorm_on(<4 x float> %a) #0 { -; CHECK-LABEL: goo3_fmf_denorm_on: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI16_1@toc@ha -; CHECK-NEXT: qvfrsqrtes 2, 1 -; CHECK-NEXT: addi 3, 3, .LCPI16_1@toc@l -; CHECK-NEXT: qvlfsx 0, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI16_0@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI16_0@toc@l -; CHECK-NEXT: qvfmuls 4, 2, 2 -; CHECK-NEXT: qvfmsubs 3, 1, 0, 1 -; CHECK-NEXT: qvfnmsubs 0, 3, 4, 0 -; CHECK-NEXT: qvlfsx 3, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI16_2@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI16_2@toc@l -; CHECK-NEXT: qvlfsx 4, 0, 3 -; CHECK-NEXT: qvfmuls 0, 2, 0 -; CHECK-NEXT: qvfabs 2, 1 -; CHECK-NEXT: qvfmuls 0, 0, 1 -; CHECK-NEXT: qvfcmplt 1, 2, 3 -; CHECK-NEXT: qvfsel 1, 1, 4, 0 -; CHECK-NEXT: blr -entry: - %r = call reassoc afn ninf nsz <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) - ret <4 x float> %r -} - -define <4 x float> @goo3_fmf_denorm_off(<4 x float> %a) #1 { -; CHECK-LABEL: goo3_fmf_denorm_off: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI17_1@toc@ha -; CHECK-NEXT: qvfrsqrtes 2, 1 -; CHECK-NEXT: addi 3, 3, .LCPI17_1@toc@l -; CHECK-NEXT: qvlfsx 0, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI17_0@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI17_0@toc@l -; CHECK-NEXT: qvfmuls 4, 2, 2 -; CHECK-NEXT: qvfmsubs 3, 1, 0, 1 -; CHECK-NEXT: qvfnmsubs 0, 3, 4, 0 -; CHECK-NEXT: qvlfsx 3, 0, 3 -; CHECK-NEXT: qvfmuls 0, 2, 0 -; CHECK-NEXT: qvfmuls 0, 0, 1 -; CHECK-NEXT: qvfcmpeq 1, 1, 3 -; CHECK-NEXT: qvfsel 1, 1, 3, 0 -; CHECK-NEXT: blr -entry: - %r = call reassoc ninf afn nsz <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) - ret <4 x float> %r -} - -define <4 x float> @goo3_safe(<4 x float> %a) nounwind { -; CHECK-LABEL: goo3_safe: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 2, 1, 3 -; CHECK-NEXT: qvesplati 3, 1, 2 -; CHECK-NEXT: fsqrts 4, 1 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fsqrts 2, 2 -; CHECK-NEXT: fsqrts 3, 3 -; CHECK-NEXT: fsqrts 1, 1 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: qvfperm 2, 3, 2, 0 -; CHECK-NEXT: qvfperm 0, 4, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 2, 1 -; CHECK-NEXT: blr -entry: - %r = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) - ret <4 x float> %r -} - -attributes #0 = { nounwind "denormal-fp-math"="ieee,ieee" } -attributes #1 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } diff --git a/llvm/test/CodeGen/PowerPC/qpx-rounding-ops.ll b/llvm/test/CodeGen/PowerPC/qpx-rounding-ops.ll deleted file mode 100644 index ee3357156a6c0..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-rounding-ops.ll +++ /dev/null @@ -1,109 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2q | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2q -enable-unsafe-fp-math | FileCheck -check-prefix=CHECK-FM %s -target datalayout = 
"E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - -define <4 x float> @test1(<4 x float> %x) nounwind { - %call = tail call <4 x float> @llvm.floor.v4f32(<4 x float> %x) nounwind readnone - ret <4 x float> %call - -; CHECK: test1: -; CHECK: qvfrim 1, 1 - -; CHECK-FM: test1: -; CHECK-FM: qvfrim 1, 1 -} - -declare <4 x float> @llvm.floor.v4f32(<4 x float>) nounwind readnone - -define <4 x double> @test2(<4 x double> %x) nounwind { - %call = tail call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone - ret <4 x double> %call - -; CHECK: test2: -; CHECK: qvfrim 1, 1 - -; CHECK-FM: test2: -; CHECK-FM: qvfrim 1, 1 -} - -declare <4 x double> @llvm.floor.v4f64(<4 x double>) nounwind readnone - -define <4 x float> @test3(<4 x float> %x) nounwind { - %call = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %x) nounwind readnone - ret <4 x float> %call - -; CHECK: test3: -; CHECK-NOT: qvfrin - -; CHECK-FM: test3: -; CHECK-FM-NOT: qvfrin -} - -declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) nounwind readnone - -define <4 x double> @test4(<4 x double> %x) nounwind { - %call = tail call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %x) nounwind readnone - ret <4 x double> %call - -; CHECK: test4: -; CHECK-NOT: qvfrin - -; CHECK-FM: test4: -; CHECK-FM-NOT: qvfrin -} - -declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) nounwind readnone - -define <4 x float> @test5(<4 x float> %x) nounwind { - %call = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone - ret <4 x float> %call - -; CHECK: test5: -; CHECK: qvfrip 1, 1 - -; CHECK-FM: test5: -; CHECK-FM: qvfrip 1, 1 -} - -declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone - -define <4 x double> @test6(<4 x double> %x) nounwind { - %call = tail call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone - ret <4 x double> %call - -; CHECK: test6: -; CHECK: qvfrip 1, 1 - -; CHECK-FM: test6: -; CHECK-FM: qvfrip 1, 1 -} - -declare <4 x double> @llvm.ceil.v4f64(<4 x double>) nounwind readnone - -define <4 x float> @test9(<4 x float> %x) nounwind { - %call = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone - ret <4 x float> %call - -; CHECK: test9: -; CHECK: qvfriz 1, 1 - -; CHECK-FM: test9: -; CHECK-FM: qvfriz 1, 1 -} - -declare <4 x float> @llvm.trunc.v4f32(<4 x float>) nounwind readnone - -define <4 x double> @test10(<4 x double> %x) nounwind { - %call = tail call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone - ret <4 x double> %call - -; CHECK: test10: -; CHECK: qvfriz 1, 1 - -; CHECK-FM: test10: -; CHECK-FM: qvfriz 1, 1 -} - -declare <4 x double> @llvm.trunc.v4f64(<4 x double>) nounwind readnone - diff --git a/llvm/test/CodeGen/PowerPC/qpx-s-load.ll b/llvm/test/CodeGen/PowerPC/qpx-s-load.ll deleted file mode 100644 index 57d7e3b0ded3c..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-s-load.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -define <4 x float> @foo(<4 x float>* %p) { -entry: - %v = load <4 x float>, <4 x float>* %p, align 4 - ret <4 x float> %v -} - -; CHECK: @foo -; CHECK-DAG: li [[REG1:[0-9]+]], 15 -; CHECK-DAG: qvlfsx [[REG4:[0-9]+]], 0, 3 -; CHECK-DAG: qvlfsx [[REG2:[0-9]+]], 3, [[REG1]] -; CHECK-DAG: qvlpclsx [[REG3:[0-9]+]], 0, 3 -; CHECK-DAG: qvfperm 1, [[REG4]], [[REG2]], [[REG3]] -; CHECK: blr - -define <4 x 
float> @bar(<4 x float>* %p) { -entry: - %v = load <4 x float>, <4 x float>* %p, align 16 - ret <4 x float> %v -} - -; CHECK: @bar -; CHECK: qvlfsx - diff --git a/llvm/test/CodeGen/PowerPC/qpx-s-sel.ll b/llvm/test/CodeGen/PowerPC/qpx-s-sel.ll deleted file mode 100644 index 5d42b9a529953..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-s-sel.ll +++ /dev/null @@ -1,143 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -@R = global <4 x i1> , align 16 - -define <4 x float> @test1(<4 x float> %a, <4 x float> %b, <4 x i1> %c) nounwind readnone { -entry: - %r = select <4 x i1> %c, <4 x float> %a, <4 x float> %b - ret <4 x float> %r - -; CHECK-LABEL: @test1 -; CHECK: qvfsel 1, 3, 1, 2 -; CHECK: blr -} - -define <4 x float> @test2(<4 x float> %a, <4 x float> %b, i1 %c1, i1 %c2, i1 %c3, i1 %c4) nounwind readnone { -entry: - %v = insertelement <4 x i1> undef, i1 %c1, i32 0 - %v2 = insertelement <4 x i1> %v, i1 %c2, i32 1 - %v3 = insertelement <4 x i1> %v2, i1 %c3, i32 2 - %v4 = insertelement <4 x i1> %v3, i1 %c4, i32 3 - %r = select <4 x i1> %v4, <4 x float> %a, <4 x float> %b - ret <4 x float> %r - -; CHECK-LABEL: @test2 -; CHECK: stw -; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]], -; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], -; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]] -; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]] -; CHECK: qvfsel 1, [[REG4]], 1, 2 -; CHECK: blr -} - -define <4 x i1> @test3(<4 x i1> %a) nounwind readnone { -entry: - %v = and <4 x i1> %a, - ret <4 x i1> %v - -; CHECK-LABEL: @test3 -; CHECK: qvlfsx [[REG:[0-9]+]], -; qvflogical 1, 1, [[REG]], 1 -; blr -} - -define <4 x i1> @test4(<4 x i1> %a, <4 x i1>* %t) nounwind { -entry: - %q = load <4 x i1>, <4 x i1>* %t, align 16 - %v = and <4 x i1> %a, %q - ret <4 x i1> %v - -; CHECK-LABEL: @test4 -; CHECK-DAG: lbz -; CHECK-DAG: qvlfdx [[REG1:[0-9]+]], -; CHECK-DAG: stw -; CHECK-DAG: qvlfiwzx [[REG2:[0-9]+]], -; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG1]] -; CHECK: qvfand 1, 1, [[REG4]] -; CHECK: blr -} - -define void @test5(<4 x i1> %a) nounwind { -entry: - store <4 x i1> %a, <4 x i1>* @R - ret void - -; CHECK-LABEL: @test5 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK: lwz -; CHECK: stb -; CHECK: blr -} - -define i1 @test6(<4 x i1> %a) nounwind { -entry: - %r = extractelement <4 x i1> %a, i32 2 - ret i1 %r - -; CHECK-LABEL: @test6 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK: lwz -; CHECK: blr -} - -define i1 @test7(<4 x i1> %a) nounwind { -entry: - %r = extractelement <4 x i1> %a, i32 2 - %s = extractelement <4 x i1> %a, i32 3 - %q = and i1 %r, %s - ret i1 %q - -; CHECK-LABEL: @test7 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK-DAG: lwz [[REG4:[0-9]+]], -; FIXME: We're storing the vector twice, and that's silly. 
-; CHECK-DAG: qvstfiwx [[REG3]], -; CHECK: lwz [[REG5:[0-9]+]], -; CHECK: and 3, -; CHECK: blr -} - -define i1 @test8(<3 x i1> %a) nounwind { -entry: - %r = extractelement <3 x i1> %a, i32 2 - ret i1 %r - -; CHECK-LABEL: @test8 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK: lwz -; CHECK: blr -} - -define <3 x float> @test9(<3 x float> %a, <3 x float> %b, i1 %c1, i1 %c2, i1 %c3) nounwind readnone { -entry: - %v = insertelement <3 x i1> undef, i1 %c1, i32 0 - %v2 = insertelement <3 x i1> %v, i1 %c2, i32 1 - %v3 = insertelement <3 x i1> %v2, i1 %c3, i32 2 - %r = select <3 x i1> %v3, <3 x float> %a, <3 x float> %b - ret <3 x float> %r - -; CHECK-LABEL: @test9 -; CHECK: stw -; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]], -; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], -; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]] -; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]] -; CHECK: qvfsel 1, [[REG4]], 1, 2 -; CHECK: blr -} - diff --git a/llvm/test/CodeGen/PowerPC/qpx-s-store.ll b/llvm/test/CodeGen/PowerPC/qpx-s-store.ll deleted file mode 100644 index 81cff7b6457f1..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-s-store.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -define void @foo(<4 x float> %v, <4 x float>* %p) { -entry: - store <4 x float> %v, <4 x float>* %p, align 4 - ret void -} - -; CHECK: @foo -; CHECK: stfs -; CHECK: stfs -; CHECK: stfs -; CHECK: stfs -; CHECK: blr - -define void @bar(<4 x float> %v, <4 x float>* %p) { -entry: - store <4 x float> %v, <4 x float>* %p, align 16 - ret void -} - -; CHECK: @bar -; CHECK: qvstfsx - diff --git a/llvm/test/CodeGen/PowerPC/qpx-sel.ll b/llvm/test/CodeGen/PowerPC/qpx-sel.ll deleted file mode 100644 index abc92c9e98b13..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-sel.ll +++ /dev/null @@ -1,151 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -@R = global <4 x i1> , align 16 - -define <4 x double> @test1(<4 x double> %a, <4 x double> %b, <4 x i1> %c) nounwind readnone { -entry: - %r = select <4 x i1> %c, <4 x double> %a, <4 x double> %b - ret <4 x double> %r - -; CHECK-LABEL: @test1 -; CHECK: qvfsel 1, 3, 1, 2 -; CHECK: blr -} - -define <4 x double> @test2(<4 x double> %a, <4 x double> %b, i1 %c1, i1 %c2, i1 %c3, i1 %c4) nounwind readnone { -entry: - %v = insertelement <4 x i1> undef, i1 %c1, i32 0 - %v2 = insertelement <4 x i1> %v, i1 %c2, i32 1 - %v3 = insertelement <4 x i1> %v2, i1 %c3, i32 2 - %v4 = insertelement <4 x i1> %v3, i1 %c4, i32 3 - %r = select <4 x i1> %v4, <4 x double> %a, <4 x double> %b - ret <4 x double> %r - -; CHECK-LABEL: @test2 - -; FIXME: This load/store sequence is unnecessary. 
-; CHECK-DAG: lbz -; CHECK-DAG: stw - -; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]], -; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], -; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]] -; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]] -; CHECK: qvfsel 1, [[REG4]], 1, 2 -; CHECK: blr -} - -define <4 x i1> @test3(<4 x i1> %a) nounwind readnone { -entry: - %v = and <4 x i1> %a, - ret <4 x i1> %v - -; CHECK-LABEL: @test3 -; CHECK: qvlfsx [[REG:[0-9]+]], -; qvflogical 1, 1, [[REG]], 1 -; blr -} - -define <4 x i1> @test4(<4 x i1> %a, <4 x i1>* %t) nounwind { -entry: - %q = load <4 x i1>, <4 x i1>* %t, align 16 - %v = and <4 x i1> %a, %q - ret <4 x i1> %v - -; CHECK-LABEL: @test4 -; CHECK-DAG: lbz -; CHECK-DAG: qvlfdx [[REG1:[0-9]+]], -; CHECK-DAG: stw -; CHECK-DAG: qvlfiwzx [[REG2:[0-9]+]], -; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG1]] -; CHECK: qvfand 1, 1, [[REG4]] -; CHECK: blr -} - -define void @test5(<4 x i1> %a) nounwind { -entry: - store <4 x i1> %a, <4 x i1>* @R - ret void - -; CHECK-LABEL: @test5 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK: lwz -; CHECK: stb -; CHECK: blr -} - -define i1 @test6(<4 x i1> %a) nounwind { -entry: - %r = extractelement <4 x i1> %a, i32 2 - ret i1 %r - -; CHECK-LABEL: @test6 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK: lwz -; CHECK: blr -} - -define i1 @test7(<4 x i1> %a) nounwind { -entry: - %r = extractelement <4 x i1> %a, i32 2 - %s = extractelement <4 x i1> %a, i32 3 - %q = and i1 %r, %s - ret i1 %q - -; CHECK-LABEL: @test7 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK-DAG: lwz [[REG4:[0-9]+]], -; FIXME: We're storing the vector twice, and that's silly. -; CHECK-DAG: qvstfiwx [[REG3]], -; CHECK-DAG: lwz [[REG5:[0-9]+]], -; CHECK: and 3, -; CHECK: blr -} - -define i1 @test8(<3 x i1> %a) nounwind { -entry: - %r = extractelement <3 x i1> %a, i32 2 - ret i1 %r - -; CHECK-LABEL: @test8 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK: lwz -; CHECK: blr -} - -define <3 x double> @test9(<3 x double> %a, <3 x double> %b, i1 %c1, i1 %c2, i1 %c3) nounwind readnone { -entry: - %v = insertelement <3 x i1> undef, i1 %c1, i32 0 - %v2 = insertelement <3 x i1> %v, i1 %c2, i32 1 - %v3 = insertelement <3 x i1> %v2, i1 %c3, i32 2 - %r = select <3 x i1> %v3, <3 x double> %a, <3 x double> %b - ret <3 x double> %r - -; CHECK-LABEL: @test9 - -; FIXME: This load/store sequence is unnecessary. 
-; CHECK-DAG: lbz -; CHECK-DAG: stw - -; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]], -; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], -; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]] -; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]] -; CHECK: qvfsel 1, [[REG4]], 1, 2 -; CHECK: blr -} - diff --git a/llvm/test/CodeGen/PowerPC/qpx-split-vsetcc.ll b/llvm/test/CodeGen/PowerPC/qpx-split-vsetcc.ll deleted file mode 100644 index df3e0befaef8a..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-split-vsetcc.ll +++ /dev/null @@ -1,31 +0,0 @@ -; RUN: llc -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s -target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" - -; Function Attrs: nounwind -define void @gsl_sf_legendre_Pl_deriv_array(<4 x i32> %inp1, <4 x double> %inp2) #0 { -entry: - br label %vector.body198 - -vector.body198: ; preds = %vector.body198, %for.body46.lr.ph - %0 = icmp ne <4 x i32> %inp1, zeroinitializer - %1 = select <4 x i1> %0, <4 x double> , <4 x double> - %2 = fmul <4 x double> %inp2, %1 - %3 = fmul <4 x double> %inp2, %2 - %4 = fmul <4 x double> %3, %inp2 - store <4 x double> %4, <4 x double>* undef, align 8 - br label %return - -; CHECK-LABEL: @gsl_sf_legendre_Pl_deriv_array -; CHECK: qvlfiwzx -; CHECK: qvfcfidu -; CHECK: qvfcmpeq -; CHECK: qvfsel -; CHECK: qvfmul - -return: ; preds = %if.else.i - ret void -} - -attributes #0 = { nounwind } - diff --git a/llvm/test/CodeGen/PowerPC/qpx-store.ll b/llvm/test/CodeGen/PowerPC/qpx-store.ll deleted file mode 100644 index 2b96576ce4493..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-store.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -define void @foo(<4 x double> %v, <4 x double>* %p) { -entry: - store <4 x double> %v, <4 x double>* %p, align 8 - ret void -} - -; CHECK: @foo -; CHECK: stfd -; CHECK: stfd -; CHECK: stfd -; CHECK: stfd -; CHECK: blr - -define void @bar(<4 x double> %v, <4 x double>* %p) { -entry: - store <4 x double> %v, <4 x double>* %p, align 32 - ret void -} - -; CHECK: @bar -; CHECK: qvstfdx - diff --git a/llvm/test/CodeGen/PowerPC/qpx-unal-cons-lds.ll b/llvm/test/CodeGen/PowerPC/qpx-unal-cons-lds.ll deleted file mode 100644 index e7ab92db6efc9..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-unal-cons-lds.ll +++ /dev/null @@ -1,217 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s -target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - -; Function Attrs: nounwind -define void @foo(double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 { -entry: - br label %vector.body - -; CHECK-LABEL: @foo -; Make sure that the offset constants we use are all even (only the last should be odd). 
-; CHECK-DAG: li {{[0-9]+}}, 1056 -; CHECK-DAG: li {{[0-9]+}}, 1088 -; CHECK-DAG: li {{[0-9]+}}, 1152 -; CHECK-DAG: li {{[0-9]+}}, 1216 -; CHECK-DAG: li {{[0-9]+}}, 1280 -; CHECK-DAG: li {{[0-9]+}}, 1344 -; CHECK-DAG: li {{[0-9]+}}, 1408 -; CHECK-DAG: li {{[0-9]+}}, 1472 -; CHECK-DAG: li {{[0-9]+}}, 1536 -; CHECK-DAG: li {{[0-9]+}}, 1600 -; CHECK-DAG: li {{[0-9]+}}, 1568 -; CHECK-DAG: li {{[0-9]+}}, 1664 -; CHECK-DAG: li {{[0-9]+}}, 1632 -; CHECK-DAG: li {{[0-9]+}}, 1728 -; CHECK-DAG: li {{[0-9]+}}, 1696 -; CHECK-DAG: li {{[0-9]+}}, 1792 -; CHECK-DAG: li {{[0-9]+}}, 1760 -; CHECK-DAG: li {{[0-9]+}}, 1856 -; CHECK-DAG: li {{[0-9]+}}, 1824 -; CHECK-DAG: li {{[0-9]+}}, 1920 -; CHECK-DAG: li {{[0-9]+}}, 1888 -; CHECK-DAG: li {{[0-9]+}}, 1984 -; CHECK-DAG: li {{[0-9]+}}, 1952 -; CHECK-DAG: li {{[0-9]+}}, 2016 -; CHECK-DAG: li {{[0-9]+}}, 1024 -; CHECK-DAG: li {{[0-9]+}}, 1120 -; CHECK-DAG: li {{[0-9]+}}, 1184 -; CHECK-DAG: li {{[0-9]+}}, 1248 -; CHECK-DAG: li {{[0-9]+}}, 1312 -; CHECK-DAG: li {{[0-9]+}}, 1376 -; CHECK-DAG: li {{[0-9]+}}, 1440 -; CHECK-DAG: li {{[0-9]+}}, 1504 -; CHECK-DAG: li {{[0-9]+}}, 2047 -; CHECK: blr - -vector.body: ; preds = %vector.body, %entry - %index = phi i64 [ 0, %entry ], [ %index.next.15, %vector.body ] - %0 = shl i64 %index, 1 - %1 = getelementptr inbounds double, double* %b, i64 %0 - %2 = bitcast double* %1 to <8 x double>* - %wide.vec = load <8 x double>, <8 x double>* %2, align 8 - %strided.vec = shufflevector <8 x double> %wide.vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> - %3 = fadd <4 x double> %strided.vec, <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00> - %4 = getelementptr inbounds double, double* %a, i64 %index - %5 = bitcast double* %4 to <4 x double>* - store <4 x double> %3, <4 x double>* %5, align 8 - %index.next = or i64 %index, 4 - %6 = shl i64 %index.next, 1 - %7 = getelementptr inbounds double, double* %b, i64 %6 - %8 = bitcast double* %7 to <8 x double>* - %wide.vec.1 = load <8 x double>, <8 x double>* %8, align 8 - %strided.vec.1 = shufflevector <8 x double> %wide.vec.1, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> - %9 = fadd <4 x double> %strided.vec.1, <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00> - %10 = getelementptr inbounds double, double* %a, i64 %index.next - %11 = bitcast double* %10 to <4 x double>* - store <4 x double> %9, <4 x double>* %11, align 8 - %index.next.1 = or i64 %index, 8 - %12 = shl i64 %index.next.1, 1 - %13 = getelementptr inbounds double, double* %b, i64 %12 - %14 = bitcast double* %13 to <8 x double>* - %wide.vec.2 = load <8 x double>, <8 x double>* %14, align 8 - %strided.vec.2 = shufflevector <8 x double> %wide.vec.2, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> - %15 = fadd <4 x double> %strided.vec.2, <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00> - %16 = getelementptr inbounds double, double* %a, i64 %index.next.1 - %17 = bitcast double* %16 to <4 x double>* - store <4 x double> %15, <4 x double>* %17, align 8 - %index.next.2 = or i64 %index, 12 - %18 = shl i64 %index.next.2, 1 - %19 = getelementptr inbounds double, double* %b, i64 %18 - %20 = bitcast double* %19 to <8 x double>* - %wide.vec.3 = load <8 x double>, <8 x double>* %20, align 8 - %strided.vec.3 = shufflevector <8 x double> %wide.vec.3, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> - %21 = fadd <4 x double> %strided.vec.3, <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00> - %22 = getelementptr inbounds double, double* %a, i64 %index.next.2 - %23 = bitcast double* %22 to <4 x double>* - store <4 x double> %21, <4 x double>* %23, align 8 - %index.next.3 = or i64 %index, 16 - %24 = shl i64 %index.next.3, 1 - %25 = getelementptr inbounds double, double* %b, i64 %24 - %26 = bitcast double* %25 to <8 x double>* - %wide.vec.4 = load <8 x double>, <8 x double>* %26, align 8 - %strided.vec.4 =
shufflevector <8 x double> %wide.vec.4, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> - %27 = fadd <4 x double> %strided.vec.4, <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00> - %28 = getelementptr inbounds double, double* %a, i64 %index.next.3 - %29 = bitcast double* %28 to <4 x double>* - store <4 x double> %27, <4 x double>* %29, align 8 - %index.next.4 = or i64 %index, 20 - %30 = shl i64 %index.next.4, 1 - %31 = getelementptr inbounds double, double* %b, i64 %30 - %32 = bitcast double* %31 to <8 x double>* - %wide.vec.5 = load <8 x double>, <8 x double>* %32, align 8 - %strided.vec.5 = shufflevector <8 x double> %wide.vec.5, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> - %33 = fadd <4 x double> %strided.vec.5, <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00> - %34 = getelementptr inbounds double, double* %a, i64 %index.next.4 - %35 = bitcast double* %34 to <4 x double>* - store <4 x double> %33, <4 x double>* %35, align 8 - %index.next.5 = or i64 %index, 24 - %36 = shl i64 %index.next.5, 1 - %37 = getelementptr inbounds double, double* %b, i64 %36 - %38 = bitcast double* %37 to <8 x double>* - %wide.vec.6 = load <8 x double>, <8 x double>* %38, align 8 - %strided.vec.6 = shufflevector <8 x double> %wide.vec.6, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> - %39 = fadd <4 x double> %strided.vec.6, <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00> - %40 = getelementptr inbounds double, double* %a, i64 %index.next.5 - %41 = bitcast double* %40 to <4 x double>* - store <4 x double> %39, <4 x double>* %41, align 8 - %index.next.6 = or i64 %index, 28 - %42 = shl i64 %index.next.6, 1 - %43 = getelementptr inbounds double, double* %b, i64 %42 - %44 = bitcast double* %43 to <8 x double>* - %wide.vec.7 = load <8 x double>, <8 x double>* %44, align 8 - %strided.vec.7 = shufflevector <8 x double> %wide.vec.7, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> - %45 = fadd <4 x double> %strided.vec.7, <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00> - %46 = getelementptr inbounds double, double* %a, i64 %index.next.6 - %47 = bitcast double* %46 to <4 x double>* - store <4 x double> %45, <4 x double>* %47, align 8 - %index.next.7 = or i64 %index, 32 - %48 = shl i64 %index.next.7, 1 - %49 = getelementptr inbounds double, double* %b, i64 %48 - %50 = bitcast double* %49 to <8 x double>* - %wide.vec.8 = load <8 x double>, <8 x double>* %50, align 8 - %strided.vec.8 = shufflevector <8 x double> %wide.vec.8, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> - %51 = fadd <4 x double> %strided.vec.8, <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00> - %52 = getelementptr inbounds double, double* %a, i64 %index.next.7 - %53 = bitcast double* %52 to <4 x double>* - store <4 x double> %51, <4 x double>* %53, align 8 - %index.next.8 = or i64 %index, 36 - %54 = shl i64 %index.next.8, 1 - %55 = getelementptr inbounds double, double* %b, i64 %54 - %56 = bitcast double* %55 to <8 x double>* - %wide.vec.9 = load <8 x double>, <8 x double>* %56, align 8 - %strided.vec.9 = shufflevector <8 x double> %wide.vec.9, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> - %57 = fadd <4 x double> %strided.vec.9, <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00> - %58 = getelementptr inbounds double, double* %a, i64 %index.next.8 - %59 = bitcast double* %58 to <4 x double>* - store <4 x double> %57, <4 x double>* %59, align 8 - %index.next.9 = or i64 %index, 40 - %60 = shl i64 %index.next.9, 1 - %61 = getelementptr inbounds double, double* %b, i64 %60 - %62 = bitcast double* %61 to <8 x double>* - %wide.vec.10 = load <8 x double>, <8 x double>* %62, align 8 - %strided.vec.10 = shufflevector <8 x double> %wide.vec.10, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> - %63 = fadd <4 x double> %strided.vec.10, <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00> - %64 = getelementptr inbounds double, double* %a, i64 %index.next.9 - %65 = bitcast double* %64 to <4 x double>* - store <4 x double> %63, <4 x double>* %65, align 8 - %index.next.10 = or i64 %index, 44 - %66 = shl i64 %index.next.10, 1 -
%67 = getelementptr inbounds double, double* %b, i64 %66 - %68 = bitcast double* %67 to <8 x double>* - %wide.vec.11 = load <8 x double>, <8 x double>* %68, align 8 - %strided.vec.11 = shufflevector <8 x double> %wide.vec.11, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> - %69 = fadd <4 x double> %strided.vec.11, <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00> - %70 = getelementptr inbounds double, double* %a, i64 %index.next.10 - %71 = bitcast double* %70 to <4 x double>* - store <4 x double> %69, <4 x double>* %71, align 8 - %index.next.11 = or i64 %index, 48 - %72 = shl i64 %index.next.11, 1 - %73 = getelementptr inbounds double, double* %b, i64 %72 - %74 = bitcast double* %73 to <8 x double>* - %wide.vec.12 = load <8 x double>, <8 x double>* %74, align 8 - %strided.vec.12 = shufflevector <8 x double> %wide.vec.12, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> - %75 = fadd <4 x double> %strided.vec.12, <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00> - %76 = getelementptr inbounds double, double* %a, i64 %index.next.11 - %77 = bitcast double* %76 to <4 x double>* - store <4 x double> %75, <4 x double>* %77, align 8 - %index.next.12 = or i64 %index, 52 - %78 = shl i64 %index.next.12, 1 - %79 = getelementptr inbounds double, double* %b, i64 %78 - %80 = bitcast double* %79 to <8 x double>* - %wide.vec.13 = load <8 x double>, <8 x double>* %80, align 8 - %strided.vec.13 = shufflevector <8 x double> %wide.vec.13, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> - %81 = fadd <4 x double> %strided.vec.13, <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00> - %82 = getelementptr inbounds double, double* %a, i64 %index.next.12 - %83 = bitcast double* %82 to <4 x double>* - store <4 x double> %81, <4 x double>* %83, align 8 - %index.next.13 = or i64 %index, 56 - %84 = shl i64 %index.next.13, 1 - %85 = getelementptr inbounds double, double* %b, i64 %84 - %86 = bitcast double* %85 to <8 x double>* - %wide.vec.14 = load <8 x double>, <8 x double>* %86, align 8 - %strided.vec.14 = shufflevector <8 x double> %wide.vec.14, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> - %87 = fadd <4 x double> %strided.vec.14, <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00> - %88 = getelementptr inbounds double, double* %a, i64 %index.next.13 - %89 = bitcast double* %88 to <4 x double>* - store <4 x double> %87, <4 x double>* %89, align 8 - %index.next.14 = or i64 %index, 60 - %90 = shl i64 %index.next.14, 1 - %91 = getelementptr inbounds double, double* %b, i64 %90 - %92 = bitcast double* %91 to <8 x double>* - %wide.vec.15 = load <8 x double>, <8 x double>* %92, align 8 - %strided.vec.15 = shufflevector <8 x double> %wide.vec.15, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> - %93 = fadd <4 x double> %strided.vec.15, <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00> - %94 = getelementptr inbounds double, double* %a, i64 %index.next.14 - %95 = bitcast double* %94 to <4 x double>* - store <4 x double> %93, <4 x double>* %95, align 8 - %index.next.15 = add nsw i64 %index, 64 - %96 = icmp eq i64 %index.next.15, 1600 - br i1 %96, label %for.cond.cleanup, label %vector.body - -for.cond.cleanup: ; preds = %vector.body - ret void -} - -attributes #0 = { nounwind "target-cpu"="a2q" } - diff --git a/llvm/test/CodeGen/PowerPC/qpx-unalperm.ll b/llvm/test/CodeGen/PowerPC/qpx-unalperm.ll deleted file mode 100644 index fdee919fdfc32..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-unalperm.ll +++ /dev/null @@ -1,64 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" - -define <4 x double> @foo(<4 x double>* %a) { -entry: - %r = load <4 x double>, <4 x double>* %a, align 32 - ret <4 x double> %r -; CHECK: qvlfdx -; CHECK: blr -} - -define <4 x double> @bar(<4 x
double>* %a) { -entry: - %r = load <4 x double>, <4 x double>* %a, align 8 - %b = getelementptr <4 x double>, <4 x double>* %a, i32 16 - %s = load <4 x double>, <4 x double>* %b, align 32 - %t = fadd <4 x double> %r, %s - ret <4 x double> %t -; CHECK: qvlpcldx -; CHECK: qvlfdx -; CHECK: qvfperm -; CHECK: blr -} - -define <4 x double> @bar1(<4 x double>* %a) { -entry: - %r = load <4 x double>, <4 x double>* %a, align 8 - %b = getelementptr <4 x double>, <4 x double>* %a, i32 16 - %s = load <4 x double>, <4 x double>* %b, align 8 - %t = fadd <4 x double> %r, %s - ret <4 x double> %t -} - -define <4 x double> @bar2(<4 x double>* %a) { -entry: - %r = load <4 x double>, <4 x double>* %a, align 8 - %b = getelementptr <4 x double>, <4 x double>* %a, i32 1 - %s = load <4 x double>, <4 x double>* %b, align 32 - %t = fadd <4 x double> %r, %s - ret <4 x double> %t -} - -define <4 x double> @bar3(<4 x double>* %a) { -entry: - %r = load <4 x double>, <4 x double>* %a, align 8 - %b = getelementptr <4 x double>, <4 x double>* %a, i32 1 - %s = load <4 x double>, <4 x double>* %b, align 8 - %t = fadd <4 x double> %r, %s - ret <4 x double> %t -} - -define <4 x double> @bar4(<4 x double>* %a) { -entry: - %r = load <4 x double>, <4 x double>* %a, align 8 - %b = getelementptr <4 x double>, <4 x double>* %a, i32 1 - %s = load <4 x double>, <4 x double>* %b, align 8 - %c = getelementptr <4 x double>, <4 x double>* %b, i32 1 - %t = load <4 x double>, <4 x double>* %c, align 8 - %u = fadd <4 x double> %r, %s - %v = fadd <4 x double> %u, %t - ret <4 x double> %v -} - diff --git a/llvm/test/CodeGen/PowerPC/rlwimi-and.ll b/llvm/test/CodeGen/PowerPC/rlwimi-and.ll index e8fc409527588..d512f51a76e7a 100644 --- a/llvm/test/CodeGen/PowerPC/rlwimi-and.ll +++ b/llvm/test/CodeGen/PowerPC/rlwimi-and.ll @@ -1,6 +1,4 @@ -; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-crbits < %s | FileCheck %s -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-crbits -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s define void @test() align 2 { entry: diff --git a/llvm/test/CodeGen/PowerPC/rlwinm_rldicl_to_andi.mir b/llvm/test/CodeGen/PowerPC/rlwinm_rldicl_to_andi.mir index e3aeb5605b42c..dbe314b5251fe 100644 --- a/llvm/test/CodeGen/PowerPC/rlwinm_rldicl_to_andi.mir +++ b/llvm/test/CodeGen/PowerPC/rlwinm_rldicl_to_andi.mir @@ -60,7 +60,7 @@ ret i64 %cond } - attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" 
"unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/CodeGen/PowerPC/s000-alias-misched.ll b/llvm/test/CodeGen/PowerPC/s000-alias-misched.ll index 5c15145af2378..20071ea1710c5 100644 --- a/llvm/test/CodeGen/PowerPC/s000-alias-misched.ll +++ b/llvm/test/CodeGen/PowerPC/s000-alias-misched.ll @@ -1,7 +1,6 @@ -; RUN: llc -verify-machineinstrs < %s -enable-misched -mcpu=a2 | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -enable-misched -enable-aa-sched-mi -mcpu=a2 | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -enable-misched -mcpu=a2 -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -enable-misched -enable-aa-sched-mi -mcpu=a2 -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" @aa = external global [256 x [256 x double]], align 32 @bb = external global [256 x [256 x double]], align 32 diff --git a/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll b/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll index 80ac733156197..9f458ebcf0a6e 100644 --- a/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll +++ b/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll @@ -1225,576 +1225,5 @@ entry: ; CHECK: blr } -define <4 x double> @testqv4doubleslt(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp slt i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - ret <4 x double> %cond - -; CHECK-LABEL: @testqv4doubleslt -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x double> @testqv4doubleult(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ult i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - ret <4 x double> %cond - -; CHECK-LABEL: @testqv4doubleult -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x double> @testqv4doublesle(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp sle i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - ret <4 x double> %cond - -; CHECK-LABEL: @testqv4doublesle -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x double> @testqv4doubleule(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ule i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - ret <4 x double> %cond - 
-; CHECK-LABEL: @testqv4doubleule -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x double> @testqv4doubleeq(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp eq i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - ret <4 x double> %cond - -; CHECK-LABEL: @testqv4doubleeq -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: creqv [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}} -; CHECK: bclr 12, [[REG1]], 0 -; CHECK: qvfmr 1, 6 -; CHECK: blr -} - -define <4 x double> @testqv4doublesge(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp sge i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - ret <4 x double> %cond - -; CHECK-LABEL: @testqv4doublesge -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x double> @testqv4doubleuge(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp uge i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - ret <4 x double> %cond - -; CHECK-LABEL: @testqv4doubleuge -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x double> @testqv4doublesgt(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp sgt i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - ret <4 x double> %cond - -; CHECK-LABEL: @testqv4doublesgt -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x double> @testqv4doubleugt(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ugt i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - ret <4 x double> %cond - -; CHECK-LABEL: @testqv4doubleugt -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x double> @testqv4doublene(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ne i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - 
ret <4 x double> %cond - -; CHECK-LABEL: @testqv4doublene -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}} -; CHECK: bclr 12, [[REG1]], 0 -; CHECK: qvfmr 1, 6 -; CHECK: blr -} - -define <4 x float> @testqv4floatslt(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp slt i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: @testqv4floatslt -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x float> @testqv4floatult(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ult i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: @testqv4floatult -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x float> @testqv4floatsle(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp sle i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: @testqv4floatsle -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x float> @testqv4floatule(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ule i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: @testqv4floatule -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x float> @testqv4floateq(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp eq i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: @testqv4floateq -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: creqv [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}} -; CHECK: bclr 12, [[REG1]], 0 -; CHECK: qvfmr 1, 6 -; CHECK: blr -} - -define <4 x float> @testqv4floatsge(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp sge i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: 
@testqv4floatsge -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x float> @testqv4floatuge(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp uge i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: @testqv4floatuge -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x float> @testqv4floatsgt(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp sgt i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: @testqv4floatsgt -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x float> @testqv4floatugt(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ugt i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: @testqv4floatugt -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x float> @testqv4floatne(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ne i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: @testqv4floatne -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}} -; CHECK: bclr 12, [[REG1]], 0 -; CHECK: qvfmr 1, 6 -; CHECK: blr -} - -define <4 x i1> @testqv4i1slt(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp slt i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1slt -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x i1> @testqv4i1ult(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ult i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1ult -; CHECK-DAG: 
fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x i1> @testqv4i1sle(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp sle i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1sle -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x i1> @testqv4i1ule(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ule i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1ule -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x i1> @testqv4i1eq(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp eq i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1eq -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: creqv [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}} -; CHECK: bclr 12, [[REG1]], 0 -; CHECK: qvfmr 1, 6 -; CHECK: blr -} - -define <4 x i1> @testqv4i1sge(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp sge i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1sge -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x i1> @testqv4i1uge(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp uge i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1uge -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x i1> @testqv4i1sgt(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp sgt i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1sgt -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: 
.LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x i1> @testqv4i1ugt(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ugt i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1ugt -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x i1> @testqv4i1ne(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ne i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1ne -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}} -; CHECK: bclr 12, [[REG1]], 0 -; CHECK: qvfmr 1, 6 -; CHECK: blr -} - attributes #0 = { nounwind readnone "target-cpu"="pwr7" } -attributes #1 = { nounwind readnone "target-cpu"="a2q" } diff --git a/llvm/test/CodeGen/PowerPC/selectiondag-extload-computeknownbits.ll b/llvm/test/CodeGen/PowerPC/selectiondag-extload-computeknownbits.ll index 53d17d8668270..73fce78c33aa7 100644 --- a/llvm/test/CodeGen/PowerPC/selectiondag-extload-computeknownbits.ll +++ b/llvm/test/CodeGen/PowerPC/selectiondag-extload-computeknownbits.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux < %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux < %s ; Check that llc does not crash due to an illegal APInt operation diff --git a/llvm/test/CodeGen/PowerPC/setcr_bc.mir b/llvm/test/CodeGen/PowerPC/setcr_bc.mir index e9d81da681fcc..564ee7d45957b 100644 --- a/llvm/test/CodeGen/PowerPC/setcr_bc.mir +++ b/llvm/test/CodeGen/PowerPC/setcr_bc.mir @@ -32,8 +32,8 @@ ret i32 %call2.i.sink } - attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" 
"target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } ... --- diff --git a/llvm/test/CodeGen/PowerPC/setcr_bc2.mir b/llvm/test/CodeGen/PowerPC/setcr_bc2.mir index 582284d6d0a59..513cb85e1580a 100644 --- a/llvm/test/CodeGen/PowerPC/setcr_bc2.mir +++ b/llvm/test/CodeGen/PowerPC/setcr_bc2.mir @@ -32,8 +32,8 @@ ret i32 %call2.i.sink } - attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } ... 
--- diff --git a/llvm/test/CodeGen/PowerPC/stwu-sched.ll b/llvm/test/CodeGen/PowerPC/stwu-sched.ll index 0afd2ee406894..36afaf84a296b 100644 --- a/llvm/test/CodeGen/PowerPC/stwu-sched.ll +++ b/llvm/test/CodeGen/PowerPC/stwu-sched.ll @@ -58,7 +58,7 @@ define void @initCombList(%0* nocapture, i32 signext) local_unnamed_addr #0 { ret void } -attributes #0 = { norecurse nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.ident = !{!0} diff --git a/llvm/test/CodeGen/PowerPC/unal-vec-ldst.ll b/llvm/test/CodeGen/PowerPC/unal-vec-ldst.ll index 497add38e0444..79a368dd095ac 100644 --- a/llvm/test/CodeGen/PowerPC/unal-vec-ldst.ll +++ b/llvm/test/CodeGen/PowerPC/unal-vec-ldst.ll @@ -327,72 +327,6 @@ entry: } -define <4 x float> @test_l_qv4float(<4 x float>* %p) #1 { -; CHECK-LABEL: test_l_qv4float: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: li 4, 15 -; CHECK-NEXT: qvlpclsx 0, 0, 3 -; CHECK-NEXT: qvlfsx 1, 3, 4 -; CHECK-NEXT: qvlfsx 2, 0, 3 -; CHECK-NEXT: qvfperm 1, 2, 1, 0 -; CHECK-NEXT: blr -entry: - %r = load <4 x float>, <4 x float>* %p, align 4 - ret <4 x float> %r - -} - -define <8 x float> @test_l_qv8float(<8 x float>* %p) #1 { -; CHECK-LABEL: test_l_qv8float: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: li 4, 31 -; CHECK-NEXT: qvlpclsx 1, 0, 3 -; CHECK-NEXT: qvlfsx 0, 3, 4 -; CHECK-NEXT: li 4, 16 -; CHECK-NEXT: qvlfsx 3, 3, 4 -; CHECK-NEXT: qvlfsx 4, 0, 3 -; CHECK-NEXT: qvfperm 2, 3, 0, 1 -; CHECK-NEXT: qvfperm 1, 4, 3, 1 -; CHECK-NEXT: blr -entry: - %r = load <8 x float>, <8 x float>* %p, align 4 - ret <8 x float> %r - -} - -define <4 x double> @test_l_qv4double(<4 x double>* %p) #1 { -; CHECK-LABEL: test_l_qv4double: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: li 4, 31 -; CHECK-NEXT: qvlpcldx 0, 0, 3 -; CHECK-NEXT: qvlfdx 1, 3, 4 -; CHECK-NEXT: qvlfdx 2, 0, 3 -; CHECK-NEXT: qvfperm 1, 2, 1, 0 -; CHECK-NEXT: blr -entry: - %r = load <4 x double>, <4 x double>* %p, align 8 - ret <4 x double> %r - -} - -define <8 x double> @test_l_qv8double(<8 x double>* %p) #1 { -; CHECK-LABEL: test_l_qv8double: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: li 4, 63 -; CHECK-NEXT: qvlpcldx 1, 0, 3 -; CHECK-NEXT: qvlfdx 0, 3, 4 -; CHECK-NEXT: li 4, 32 -; CHECK-NEXT: qvlfdx 3, 3, 4 -; CHECK-NEXT: qvlfdx 4, 0, 3 -; CHECK-NEXT: qvfperm 2, 3, 0, 1 -; CHECK-NEXT: qvfperm 1, 4, 3, 1 -; CHECK-NEXT: blr -entry: - %r = load <8 x double>, <8 x double>* %p, align 8 - ret <8 x double> %r - -} - define void @test_s_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 { ; CHECK-LABEL: test_s_v16i8: ; CHECK: # %bb.0: # 
%entry @@ -537,89 +471,6 @@ entry: } -define void @test_s_qv4float(<4 x float>* %p, <4 x float> %v) #1 { -; CHECK-LABEL: test_s_qv4float: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 0, 1, 3 -; CHECK-NEXT: stfs 1, 0(3) -; CHECK-NEXT: stfs 0, 12(3) -; CHECK-NEXT: qvesplati 0, 1, 2 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: stfs 0, 8(3) -; CHECK-NEXT: stfs 1, 4(3) -; CHECK-NEXT: blr -entry: - store <4 x float> %v, <4 x float>* %p, align 4 - ret void - -} - -define void @test_s_qv8float(<8 x float>* %p, <8 x float> %v) #1 { -; CHECK-LABEL: test_s_qv8float: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 0, 2, 3 -; CHECK-NEXT: stfs 2, 16(3) -; CHECK-NEXT: stfs 0, 28(3) -; CHECK-NEXT: qvesplati 0, 2, 2 -; CHECK-NEXT: qvesplati 2, 2, 1 -; CHECK-NEXT: stfs 1, 0(3) -; CHECK-NEXT: stfs 0, 24(3) -; CHECK-NEXT: qvesplati 0, 1, 3 -; CHECK-NEXT: stfs 2, 20(3) -; CHECK-NEXT: qvesplati 2, 1, 2 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: stfs 0, 12(3) -; CHECK-NEXT: stfs 2, 8(3) -; CHECK-NEXT: stfs 1, 4(3) -; CHECK-NEXT: blr -entry: - store <8 x float> %v, <8 x float>* %p, align 4 - ret void - -} - -define void @test_s_qv4double(<4 x double>* %p, <4 x double> %v) #1 { -; CHECK-LABEL: test_s_qv4double: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 0, 1, 3 -; CHECK-NEXT: stfd 1, 0(3) -; CHECK-NEXT: stfd 0, 24(3) -; CHECK-NEXT: qvesplati 0, 1, 2 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: stfd 0, 16(3) -; CHECK-NEXT: stfd 1, 8(3) -; CHECK-NEXT: blr -entry: - store <4 x double> %v, <4 x double>* %p, align 8 - ret void - -} - -define void @test_s_qv8double(<8 x double>* %p, <8 x double> %v) #1 { -; CHECK-LABEL: test_s_qv8double: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 0, 2, 3 -; CHECK-NEXT: stfd 2, 32(3) -; CHECK-NEXT: stfd 0, 56(3) -; CHECK-NEXT: qvesplati 0, 2, 2 -; CHECK-NEXT: qvesplati 2, 2, 1 -; CHECK-NEXT: stfd 1, 0(3) -; CHECK-NEXT: stfd 0, 48(3) -; CHECK-NEXT: qvesplati 0, 1, 3 -; CHECK-NEXT: stfd 2, 40(3) -; CHECK-NEXT: qvesplati 2, 1, 2 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: stfd 0, 24(3) -; CHECK-NEXT: stfd 2, 16(3) -; CHECK-NEXT: stfd 1, 8(3) -; CHECK-NEXT: blr -entry: - store <8 x double> %v, <8 x double>* %p, align 8 - ret void - -} - attributes #0 = { nounwind "target-cpu"="pwr7" } -attributes #1 = { nounwind "target-cpu"="a2q" } attributes #2 = { nounwind "target-cpu"="pwr8" } diff --git a/llvm/test/CodeGen/PowerPC/uwtables.ll b/llvm/test/CodeGen/PowerPC/uwtables.ll index 7523d04d73d38..e302934ab8d6b 100644 --- a/llvm/test/CodeGen/PowerPC/uwtables.ll +++ b/llvm/test/CodeGen/PowerPC/uwtables.ll @@ -47,5 +47,5 @@ declare i32 @__gxx_personality_v0(...) 
declare void @__cxa_call_unexpected(i8*) local_unnamed_addr -attributes #0 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll b/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll index 36da7add88015..33f3d82c3683d 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll @@ -55,21 +55,6 @@ define i32 @bar2() { ; CHECK: store <2 x i64> zeroinitializer, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_va_arg_tls to i64), i64 8) to <2 x i64>*), align 8 ; CHECK: store {{.*}} 24, {{.*}} @__msan_va_arg_overflow_size_tls -; Check QPX vector argument. -define i32 @bar3() "target-features"="+qpx" { - %1 = call i32 (i32, ...) @foo(i32 0, i32 1, i32 2, <4 x double> ) - ret i32 %1 -} - -; That one is even stranger: the parameter save area starts at offset 48 from -; (32-byte aligned) stack pointer, the vector parameter is at 96 bytes from -; the stack pointer, so its offset from parameter save area is misaligned. -; CHECK-LABEL: @bar3 -; CHECK: store i32 0, i32* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_va_arg_tls to i64), i64 4) to i32*), align 8 -; CHECK: store i32 0, i32* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_va_arg_tls to i64), i64 12) to i32*), align 8 -; CHECK: store <4 x i64> zeroinitializer, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_va_arg_tls to i64), i64 40) to <4 x i64>*), align 8 -; CHECK: store {{.*}} 72, {{.*}} @__msan_va_arg_overflow_size_tls - ; Check i64 array. define i32 @bar4() { %1 = call i32 (i32, ...) 
@foo(i32 0, [2 x i64] [i64 1, i64 2]) diff --git a/llvm/test/MC/Disassembler/PowerPC/qpx.txt b/llvm/test/MC/Disassembler/PowerPC/qpx.txt deleted file mode 100644 index 00e598bd4356e..0000000000000 --- a/llvm/test/MC/Disassembler/PowerPC/qpx.txt +++ /dev/null @@ -1,371 +0,0 @@ -# RUN: llvm-mc --disassemble %s -triple powerpc64-bgq-linux -mcpu=a2q | FileCheck %s - -# CHECK: qvfabs 3, 5 -0x10 0x60 0x2a 0x10 - -# CHECK: qvfadd 3, 4, 5 -0x10 0x64 0x28 0x2a - -# CHECK: qvfadds 3, 4, 5 -0x00 0x64 0x28 0x2a - -# CHECK: qvfandc 3, 4, 5 -0x10 0x64 0x2a 0x08 - -# CHECK: qvfand 3, 4, 5 -0x10 0x64 0x28 0x88 - -# CHECK: qvfcfid 3, 5 -0x10 0x60 0x2e 0x9c - -# CHECK: qvfcfids 3, 5 -0x00 0x60 0x2e 0x9c - -# CHECK: qvfcfidu 3, 5 -0x10 0x60 0x2f 0x9c - -# CHECK: qvfcfidus 3, 5 -0x00 0x60 0x2f 0x9c - -# CHECK: qvfclr 3 -0x10 0x63 0x18 0x08 - -# CHECK: qvfcpsgn 3, 4, 5 -0x10 0x64 0x28 0x10 - -# CHECK: qvfctfb 3, 4 -0x10 0x64 0x22 0x88 - -# CHECK: qvfctid 3, 5 -0x10 0x60 0x2e 0x5c - -# CHECK: qvfctidu 3, 5 -0x10 0x60 0x2f 0x5c - -# CHECK: qvfctiduz 3, 5 -0x10 0x60 0x2f 0x5e - -# CHECK: qvfctidz 3, 5 -0x10 0x60 0x2e 0x5e - -# CHECK: qvfctiw 3, 5 -0x10 0x60 0x28 0x1c - -# CHECK: qvfctiwu 3, 5 -0x10 0x60 0x29 0x1c - -# CHECK: qvfctiwuz 3, 5 -0x10 0x60 0x29 0x1e - -# CHECK: qvfctiwz 3, 5 -0x10 0x60 0x28 0x1e - -# CHECK: qvfequ 3, 4, 5 -0x10 0x64 0x2c 0x88 - -# CHECK: qvflogical 3, 4, 5, 12 -0x10 0x64 0x2e 0x08 - -# CHECK: qvfmadd 3, 4, 6, 5 -0x10 0x64 0x29 0xba - -# CHECK: qvfmadds 3, 4, 6, 5 -0x00 0x64 0x29 0xba - -# CHECK: qvfmr 3, 5 -0x10 0x60 0x28 0x90 - -# CHECK: qvfmsub 3, 4, 6, 5 -0x10 0x64 0x29 0xb8 - -# CHECK: qvfmsubs 3, 4, 6, 5 -0x00 0x64 0x29 0xb8 - -# CHECK: qvfmul 3, 4, 6 -0x10 0x64 0x01 0xb2 - -# CHECK: qvfmuls 3, 4, 6 -0x00 0x64 0x01 0xb2 - -# CHECK: qvfnabs 3, 5 -0x10 0x60 0x29 0x10 - -# CHECK: qvfnand 3, 4, 5 -0x10 0x64 0x2f 0x08 - -# CHECK: qvfneg 3, 5 -0x10 0x60 0x28 0x50 - -# CHECK: qvfnmadd 3, 4, 6, 5 -0x10 0x64 0x29 0xbe - -# CHECK: qvfnmadds 3, 4, 6, 5 -0x00 0x64 0x29 0xbe - -# CHECK: qvfnmsub 3, 4, 6, 5 -0x10 0x64 0x29 0xbc - -# CHECK: qvfnmsubs 3, 4, 6, 5 -0x00 0x64 0x29 0xbc - -# CHECK: qvfnor 3, 4, 5 -0x10 0x64 0x2c 0x08 - -# CHECK: qvfnot 3, 4 -0x10 0x64 0x25 0x08 - -# CHECK: qvforc 3, 4, 5 -0x10 0x64 0x2e 0x88 - -# CHECK: qvfor 3, 4, 5 -0x10 0x64 0x2b 0x88 - -# CHECK: qvfperm 3, 4, 5, 6 -0x10 0x64 0x29 0x8c - -# CHECK: qvfre 3, 5 -0x10 0x60 0x28 0x30 - -# CHECK: qvfres 3, 5 -0x00 0x60 0x28 0x30 - -# CHECK: qvfrim 3, 5 -0x10 0x60 0x2b 0xd0 - -# CHECK: qvfrin 3, 5 -0x10 0x60 0x2b 0x10 - -# CHECK: qvfrip 3, 5 -0x10 0x60 0x2b 0x90 - -# CHECK: qvfriz 3, 5 -0x10 0x60 0x2b 0x50 - -# CHECK: qvfrsp 3, 5 -0x10 0x60 0x28 0x18 - -# CHECK: qvfrsqrte 3, 5 -0x10 0x60 0x28 0x34 - -# CHECK: qvfrsqrtes 3, 5 -0x00 0x60 0x28 0x34 - -# CHECK: qvfsel 3, 4, 6, 5 -0x10 0x64 0x29 0xae - -# CHECK: qvfset 3 -0x10 0x63 0x1f 0x88 - -# CHECK: qvfsub 3, 4, 5 -0x10 0x64 0x28 0x28 - -# CHECK: qvfsubs 3, 4, 5 -0x00 0x64 0x28 0x28 - -# CHECK: qvfxmadd 3, 4, 6, 5 -0x10 0x64 0x29 0x92 - -# CHECK: qvfxmadds 3, 4, 6, 5 -0x00 0x64 0x29 0x92 - -# CHECK: qvfxmul 3, 4, 6 -0x10 0x64 0x01 0xa2 - -# CHECK: qvfxmuls 3, 4, 6 -0x00 0x64 0x01 0xa2 - -# CHECK: qvfxor 3, 4, 5 -0x10 0x64 0x2b 0x08 - -# CHECK: qvfxxcpnmadd 3, 4, 6, 5 -0x10 0x64 0x29 0x86 - -# CHECK: qvfxxcpnmadds 3, 4, 6, 5 -0x00 0x64 0x29 0x86 - -# CHECK: qvfxxmadd 3, 4, 6, 5 -0x10 0x64 0x29 0x82 - -# CHECK: qvfxxmadds 3, 4, 6, 5 -0x00 0x64 0x29 0x82 - -# CHECK: qvfxxnpmadd 3, 4, 6, 5 -0x10 0x64 0x29 0x96 - -# CHECK: qvfxxnpmadds 3, 4, 6, 5 -0x00 0x64 0x29 0x96 - -# 
CHECK: qvlfcduxa 3, 9, 11 -0x7c 0x69 0x58 0xcf - -# CHECK: qvlfcdux 3, 9, 11 -0x7c 0x69 0x58 0xce - -# CHECK: qvlfcdxa 3, 10, 11 -0x7c 0x6a 0x58 0x8f - -# CHECK: qvlfcdx 3, 10, 11 -0x7c 0x6a 0x58 0x8e - -# CHECK: qvlfcsuxa 3, 9, 11 -0x7c 0x69 0x58 0x4f - -# CHECK: qvlfcsux 3, 9, 11 -0x7c 0x69 0x58 0x4e - -# CHECK: qvlfcsxa 3, 10, 11 -0x7c 0x6a 0x58 0x0f - -# CHECK: qvlfcsx 3, 10, 11 -0x7c 0x6a 0x58 0x0e - -# CHECK: qvlfduxa 3, 9, 11 -0x7c 0x69 0x5c 0xcf - -# CHECK: qvlfdux 3, 9, 11 -0x7c 0x69 0x5c 0xce - -# CHECK: qvlfdxa 3, 10, 11 -0x7c 0x6a 0x5c 0x8f - -# CHECK: qvlfdx 3, 10, 11 -0x7c 0x6a 0x5c 0x8e - -# CHECK: qvlfiwaxa 3, 10, 11 -0x7c 0x6a 0x5e 0xcf - -# CHECK: qvlfiwax 3, 10, 11 -0x7c 0x6a 0x5e 0xce - -# CHECK: qvlfiwzxa 3, 10, 11 -0x7c 0x6a 0x5e 0x8f - -# CHECK: qvlfiwzx 3, 10, 11 -0x7c 0x6a 0x5e 0x8e - -# CHECK: qvlfsuxa 3, 9, 11 -0x7c 0x69 0x5c 0x4f - -# CHECK: qvlfsux 3, 9, 11 -0x7c 0x69 0x5c 0x4e - -# CHECK: qvlfsxa 3, 10, 11 -0x7c 0x6a 0x5c 0x0f - -# CHECK: qvlfsx 3, 10, 11 -0x7c 0x6a 0x5c 0x0e - -# CHECK: qvlpcldx 3, 10, 11 -0x7c 0x6a 0x5c 0x8c - -# CHECK: qvlpclsx 3, 10, 11 -0x7c 0x6a 0x5c 0x0c - -# CHECK: qvlpcrdx 3, 10, 11 -0x7c 0x6a 0x58 0x8c - -# CHECK: qvlpcrsx 3, 10, 11 -0x7c 0x6a 0x58 0x0c - -# CHECK: qvstfcduxa 2, 9, 11 -0x7c 0x49 0x59 0xcf - -# CHECK: qvstfcduxia 2, 9, 11 -0x7c 0x49 0x59 0xcb - -# CHECK: qvstfcduxi 2, 9, 11 -0x7c 0x49 0x59 0xca - -# CHECK: qvstfcdux 2, 9, 11 -0x7c 0x49 0x59 0xce - -# CHECK: qvstfcdxa 2, 10, 11 -0x7c 0x4a 0x59 0x8f - -# CHECK: qvstfcdxia 2, 10, 11 -0x7c 0x4a 0x59 0x8b - -# CHECK: qvstfcdxi 2, 10, 11 -0x7c 0x4a 0x59 0x8a - -# CHECK: qvstfcdx 2, 10, 11 -0x7c 0x4a 0x59 0x8e - -# CHECK: qvstfcsuxa 2, 9, 11 -0x7c 0x49 0x59 0x4f - -# CHECK: qvstfcsuxia 2, 9, 11 -0x7c 0x49 0x59 0x4b - -# CHECK: qvstfcsuxi 2, 9, 11 -0x7c 0x49 0x59 0x4a - -# CHECK: qvstfcsux 2, 9, 11 -0x7c 0x49 0x59 0x4e - -# CHECK: qvstfcsxa 2, 10, 11 -0x7c 0x4a 0x59 0x0f - -# CHECK: qvstfcsxia 2, 10, 11 -0x7c 0x4a 0x59 0x0b - -# CHECK: qvstfcsxi 2, 10, 11 -0x7c 0x4a 0x59 0x0a - -# CHECK: qvstfcsx 2, 10, 11 -0x7c 0x4a 0x59 0x0e - -# CHECK: qvstfduxa 2, 9, 11 -0x7c 0x49 0x5d 0xcf - -# CHECK: qvstfduxia 2, 9, 11 -0x7c 0x49 0x5d 0xcb - -# CHECK: qvstfduxi 2, 9, 11 -0x7c 0x49 0x5d 0xca - -# CHECK: qvstfdux 2, 9, 11 -0x7c 0x49 0x5d 0xce - -# CHECK: qvstfdxa 2, 10, 11 -0x7c 0x4a 0x5d 0x8f - -# CHECK: qvstfdxia 2, 10, 11 -0x7c 0x4a 0x5d 0x8b - -# CHECK: qvstfdxi 2, 10, 11 -0x7c 0x4a 0x5d 0x8a - -# CHECK: qvstfdx 2, 10, 11 -0x7c 0x4a 0x5d 0x8e - -# CHECK: qvstfiwxa 2, 10, 11 -0x7c 0x4a 0x5f 0x8f - -# CHECK: qvstfiwx 2, 10, 11 -0x7c 0x4a 0x5f 0x8e - -# CHECK: qvstfsuxa 2, 9, 11 -0x7c 0x49 0x5d 0x4f - -# CHECK: qvstfsuxia 2, 9, 11 -0x7c 0x49 0x5d 0x4b - -# CHECK: qvstfsuxi 2, 9, 11 -0x7c 0x49 0x5d 0x4a - -# CHECK: qvstfsux 2, 9, 11 -0x7c 0x49 0x5d 0x4e - -# CHECK: qvstfsxa 2, 10, 11 -0x7c 0x4a 0x5d 0x0f - -# CHECK: qvstfsxia 2, 10, 11 -0x7c 0x4a 0x5d 0x0b - -# CHECK: qvstfsxi 2, 10, 11 -0x7c 0x4a 0x5d 0x0a - -# CHECK: qvstfsx 2, 10, 11 -0x7c 0x4a 0x5d 0x0e - diff --git a/llvm/test/MC/PowerPC/qpx.s b/llvm/test/MC/PowerPC/qpx.s deleted file mode 100644 index a1fb2090f8fff..0000000000000 --- a/llvm/test/MC/PowerPC/qpx.s +++ /dev/null @@ -1,252 +0,0 @@ -# RUN: llvm-mc -triple powerpc64-bgq-linux --show-encoding %s | FileCheck %s - -# CHECK: qvfabs 3, 5 # encoding: [0x10,0x60,0x2a,0x10] - qvfabs %q3, %q5 - -# CHECK: qvfabs 3, 5 # encoding: [0x10,0x60,0x2a,0x10] - qvfabs 3, 5 -# CHECK: qvfadd 3, 4, 5 # encoding: [0x10,0x64,0x28,0x2a] - qvfadd 3, 4, 5 -# CHECK: qvfadds 3, 4, 5 # encoding: 
[0x00,0x64,0x28,0x2a] - qvfadds 3, 4, 5 -# CHECK: qvfandc 3, 4, 5 # encoding: [0x10,0x64,0x2a,0x08] - qvfandc 3, 4, 5 -# CHECK: qvfand 3, 4, 5 # encoding: [0x10,0x64,0x28,0x88] - qvfand 3, 4, 5 -# CHECK: qvfcfid 3, 5 # encoding: [0x10,0x60,0x2e,0x9c] - qvfcfid 3, 5 -# CHECK: qvfcfids 3, 5 # encoding: [0x00,0x60,0x2e,0x9c] - qvfcfids 3, 5 -# CHECK: qvfcfidu 3, 5 # encoding: [0x10,0x60,0x2f,0x9c] - qvfcfidu 3, 5 -# CHECK: qvfcfidus 3, 5 # encoding: [0x00,0x60,0x2f,0x9c] - qvfcfidus 3, 5 -# CHECK: qvfclr 3 # encoding: [0x10,0x63,0x18,0x08] - qvfclr 3 -# CHECK: qvfcpsgn 3, 4, 5 # encoding: [0x10,0x64,0x28,0x10] - qvfcpsgn 3, 4, 5 -# CHECK: qvfctfb 3, 4 # encoding: [0x10,0x64,0x22,0x88] - qvfctfb 3, 4 -# CHECK: qvfctid 3, 5 # encoding: [0x10,0x60,0x2e,0x5c] - qvfctid 3, 5 -# CHECK: qvfctidu 3, 5 # encoding: [0x10,0x60,0x2f,0x5c] - qvfctidu 3, 5 -# CHECK: qvfctiduz 3, 5 # encoding: [0x10,0x60,0x2f,0x5e] - qvfctiduz 3, 5 -# CHECK: qvfctidz 3, 5 # encoding: [0x10,0x60,0x2e,0x5e] - qvfctidz 3, 5 -# CHECK: qvfctiw 3, 5 # encoding: [0x10,0x60,0x28,0x1c] - qvfctiw 3, 5 -# CHECK: qvfctiwu 3, 5 # encoding: [0x10,0x60,0x29,0x1c] - qvfctiwu 3, 5 -# CHECK: qvfctiwuz 3, 5 # encoding: [0x10,0x60,0x29,0x1e] - qvfctiwuz 3, 5 -# CHECK: qvfctiwz 3, 5 # encoding: [0x10,0x60,0x28,0x1e] - qvfctiwz 3, 5 -# CHECK: qvfequ 3, 4, 5 # encoding: [0x10,0x64,0x2c,0x88] - qvfequ 3, 4, 5 -# CHECK: qvflogical 3, 4, 5, 12 # encoding: [0x10,0x64,0x2e,0x08] - qvflogical 3, 4, 5, 12 -# CHECK: qvfmadd 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0xba] - qvfmadd 3, 4, 6, 5 -# CHECK: qvfmadds 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0xba] - qvfmadds 3, 4, 6, 5 -# CHECK: qvfmr 3, 5 # encoding: [0x10,0x60,0x28,0x90] - qvfmr 3, 5 -# CHECK: qvfmsub 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0xb8] - qvfmsub 3, 4, 6, 5 -# CHECK: qvfmsubs 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0xb8] - qvfmsubs 3, 4, 6, 5 -# CHECK: qvfmul 3, 4, 6 # encoding: [0x10,0x64,0x01,0xb2] - qvfmul 3, 4, 6 -# CHECK: qvfmuls 3, 4, 6 # encoding: [0x00,0x64,0x01,0xb2] - qvfmuls 3, 4, 6 -# CHECK: qvfnabs 3, 5 # encoding: [0x10,0x60,0x29,0x10] - qvfnabs 3, 5 -# CHECK: qvfnand 3, 4, 5 # encoding: [0x10,0x64,0x2f,0x08] - qvfnand 3, 4, 5 -# CHECK: qvfneg 3, 5 # encoding: [0x10,0x60,0x28,0x50] - qvfneg 3, 5 -# CHECK: qvfnmadd 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0xbe] - qvfnmadd 3, 4, 6, 5 -# CHECK: qvfnmadds 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0xbe] - qvfnmadds 3, 4, 6, 5 -# CHECK: qvfnmsub 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0xbc] - qvfnmsub 3, 4, 6, 5 -# CHECK: qvfnmsubs 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0xbc] - qvfnmsubs 3, 4, 6, 5 -# CHECK: qvfnor 3, 4, 5 # encoding: [0x10,0x64,0x2c,0x08] - qvfnor 3, 4, 5 -# CHECK: qvfnot 3, 4 # encoding: [0x10,0x64,0x25,0x08] - qvfnot 3, 4 -# CHECK: qvforc 3, 4, 5 # encoding: [0x10,0x64,0x2e,0x88] - qvforc 3, 4, 5 -# CHECK: qvfor 3, 4, 5 # encoding: [0x10,0x64,0x2b,0x88] - qvfor 3, 4, 5 -# CHECK: qvfperm 3, 4, 5, 6 # encoding: [0x10,0x64,0x29,0x8c] - qvfperm 3, 4, 5, 6 -# CHECK: qvfre 3, 5 # encoding: [0x10,0x60,0x28,0x30] - qvfre 3, 5 -# CHECK: qvfres 3, 5 # encoding: [0x00,0x60,0x28,0x30] - qvfres 3, 5 -# CHECK: qvfrim 3, 5 # encoding: [0x10,0x60,0x2b,0xd0] - qvfrim 3, 5 -# CHECK: qvfrin 3, 5 # encoding: [0x10,0x60,0x2b,0x10] - qvfrin 3, 5 -# CHECK: qvfrip 3, 5 # encoding: [0x10,0x60,0x2b,0x90] - qvfrip 3, 5 -# CHECK: qvfriz 3, 5 # encoding: [0x10,0x60,0x2b,0x50] - qvfriz 3, 5 -# CHECK: qvfrsp 3, 5 # encoding: [0x10,0x60,0x28,0x18] - qvfrsp 3, 5 -# CHECK: qvfrsqrte 3, 5 # encoding: [0x10,0x60,0x28,0x34] - qvfrsqrte 3, 5 -# CHECK: 
qvfrsqrtes 3, 5 # encoding: [0x00,0x60,0x28,0x34] - qvfrsqrtes 3, 5 -# CHECK: qvfsel 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0xae] - qvfsel 3, 4, 6, 5 -# CHECK: qvfset 3 # encoding: [0x10,0x63,0x1f,0x88] - qvfset 3 -# CHECK: qvfsub 3, 4, 5 # encoding: [0x10,0x64,0x28,0x28] - qvfsub 3, 4, 5 -# CHECK: qvfsubs 3, 4, 5 # encoding: [0x00,0x64,0x28,0x28] - qvfsubs 3, 4, 5 -# CHECK: qvfxmadd 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0x92] - qvfxmadd 3, 4, 6, 5 -# CHECK: qvfxmadds 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0x92] - qvfxmadds 3, 4, 6, 5 -# CHECK: qvfxmul 3, 4, 6 # encoding: [0x10,0x64,0x01,0xa2] - qvfxmul 3, 4, 6 -# CHECK: qvfxmuls 3, 4, 6 # encoding: [0x00,0x64,0x01,0xa2] - qvfxmuls 3, 4, 6 -# CHECK: qvfxor 3, 4, 5 # encoding: [0x10,0x64,0x2b,0x08] - qvfxor 3, 4, 5 -# CHECK: qvfxxcpnmadd 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0x86] - qvfxxcpnmadd 3, 4, 6, 5 -# CHECK: qvfxxcpnmadds 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0x86] - qvfxxcpnmadds 3, 4, 6, 5 -# CHECK: qvfxxmadd 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0x82] - qvfxxmadd 3, 4, 6, 5 -# CHECK: qvfxxmadds 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0x82] - qvfxxmadds 3, 4, 6, 5 -# CHECK: qvfxxnpmadd 3, 4, 6, 5 # encoding: [0x10,0x64,0x29,0x96] - qvfxxnpmadd 3, 4, 6, 5 -# CHECK: qvfxxnpmadds 3, 4, 6, 5 # encoding: [0x00,0x64,0x29,0x96] - qvfxxnpmadds 3, 4, 6, 5 -# CHECK: qvlfcduxa 3, 9, 11 # encoding: [0x7c,0x69,0x58,0xcf] - qvlfcduxa 3, 9, 11 -# CHECK: qvlfcdux 3, 9, 11 # encoding: [0x7c,0x69,0x58,0xce] - qvlfcdux 3, 9, 11 -# CHECK: qvlfcdxa 3, 10, 11 # encoding: [0x7c,0x6a,0x58,0x8f] - qvlfcdxa 3, 10, 11 -# CHECK: qvlfcdx 3, 10, 11 # encoding: [0x7c,0x6a,0x58,0x8e] - qvlfcdx 3, 10, 11 -# CHECK: qvlfcsuxa 3, 9, 11 # encoding: [0x7c,0x69,0x58,0x4f] - qvlfcsuxa 3, 9, 11 -# CHECK: qvlfcsux 3, 9, 11 # encoding: [0x7c,0x69,0x58,0x4e] - qvlfcsux 3, 9, 11 -# CHECK: qvlfcsxa 3, 10, 11 # encoding: [0x7c,0x6a,0x58,0x0f] - qvlfcsxa 3, 10, 11 -# CHECK: qvlfcsx 3, 10, 11 # encoding: [0x7c,0x6a,0x58,0x0e] - qvlfcsx 3, 10, 11 -# CHECK: qvlfduxa 3, 9, 11 # encoding: [0x7c,0x69,0x5c,0xcf] - qvlfduxa 3, 9, 11 -# CHECK: qvlfdux 3, 9, 11 # encoding: [0x7c,0x69,0x5c,0xce] - qvlfdux 3, 9, 11 -# CHECK: qvlfdxa 3, 10, 11 # encoding: [0x7c,0x6a,0x5c,0x8f] - qvlfdxa 3, 10, 11 -# CHECK: qvlfdx 3, 10, 11 # encoding: [0x7c,0x6a,0x5c,0x8e] - qvlfdx 3, 10, 11 -# CHECK: qvlfiwaxa 3, 10, 11 # encoding: [0x7c,0x6a,0x5e,0xcf] - qvlfiwaxa 3, 10, 11 -# CHECK: qvlfiwax 3, 10, 11 # encoding: [0x7c,0x6a,0x5e,0xce] - qvlfiwax 3, 10, 11 -# CHECK: qvlfiwzxa 3, 10, 11 # encoding: [0x7c,0x6a,0x5e,0x8f] - qvlfiwzxa 3, 10, 11 -# CHECK: qvlfiwzx 3, 10, 11 # encoding: [0x7c,0x6a,0x5e,0x8e] - qvlfiwzx 3, 10, 11 -# CHECK: qvlfsuxa 3, 9, 11 # encoding: [0x7c,0x69,0x5c,0x4f] - qvlfsuxa 3, 9, 11 -# CHECK: qvlfsux 3, 9, 11 # encoding: [0x7c,0x69,0x5c,0x4e] - qvlfsux 3, 9, 11 -# CHECK: qvlfsxa 3, 10, 11 # encoding: [0x7c,0x6a,0x5c,0x0f] - qvlfsxa 3, 10, 11 -# CHECK: qvlfsx 3, 10, 11 # encoding: [0x7c,0x6a,0x5c,0x0e] - qvlfsx 3, 10, 11 -# CHECK: qvlpcldx 3, 10, 11 # encoding: [0x7c,0x6a,0x5c,0x8c] - qvlpcldx 3, 10, 11 -# CHECK: qvlpclsx 3, 10, 11 # encoding: [0x7c,0x6a,0x5c,0x0c] - qvlpclsx 3, 10, 11 -# CHECK: qvlpcrdx 3, 10, 11 # encoding: [0x7c,0x6a,0x58,0x8c] - qvlpcrdx 3, 10, 11 -# CHECK: qvlpcrsx 3, 10, 11 # encoding: [0x7c,0x6a,0x58,0x0c] - qvlpcrsx 3, 10, 11 -# CHECK: qvstfcduxa 2, 9, 11 # encoding: [0x7c,0x49,0x59,0xcf] - qvstfcduxa 2, 9, 11 -# CHECK: qvstfcduxia 2, 9, 11 # encoding: [0x7c,0x49,0x59,0xcb] - qvstfcduxia 2, 9, 11 -# CHECK: qvstfcduxi 2, 9, 11 # encoding: 
[0x7c,0x49,0x59,0xca] - qvstfcduxi 2, 9, 11 -# CHECK: qvstfcdux 2, 9, 11 # encoding: [0x7c,0x49,0x59,0xce] - qvstfcdux 2, 9, 11 -# CHECK: qvstfcdxa 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x8f] - qvstfcdxa 2, 10, 11 -# CHECK: qvstfcdxia 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x8b] - qvstfcdxia 2, 10, 11 -# CHECK: qvstfcdxi 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x8a] - qvstfcdxi 2, 10, 11 -# CHECK: qvstfcdx 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x8e] - qvstfcdx 2, 10, 11 -# CHECK: qvstfcsuxa 2, 9, 11 # encoding: [0x7c,0x49,0x59,0x4f] - qvstfcsuxa 2, 9, 11 -# CHECK: qvstfcsuxia 2, 9, 11 # encoding: [0x7c,0x49,0x59,0x4b] - qvstfcsuxia 2, 9, 11 -# CHECK: qvstfcsuxi 2, 9, 11 # encoding: [0x7c,0x49,0x59,0x4a] - qvstfcsuxi 2, 9, 11 -# CHECK: qvstfcsux 2, 9, 11 # encoding: [0x7c,0x49,0x59,0x4e] - qvstfcsux 2, 9, 11 -# CHECK: qvstfcsxa 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x0f] - qvstfcsxa 2, 10, 11 -# CHECK: qvstfcsxia 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x0b] - qvstfcsxia 2, 10, 11 -# CHECK: qvstfcsxi 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x0a] - qvstfcsxi 2, 10, 11 -# CHECK: qvstfcsx 2, 10, 11 # encoding: [0x7c,0x4a,0x59,0x0e] - qvstfcsx 2, 10, 11 -# CHECK: qvstfduxa 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0xcf] - qvstfduxa 2, 9, 11 -# CHECK: qvstfduxia 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0xcb] - qvstfduxia 2, 9, 11 -# CHECK: qvstfduxi 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0xca] - qvstfduxi 2, 9, 11 -# CHECK: qvstfdux 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0xce] - qvstfdux 2, 9, 11 -# CHECK: qvstfdxa 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x8f] - qvstfdxa 2, 10, 11 -# CHECK: qvstfdxia 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x8b] - qvstfdxia 2, 10, 11 -# CHECK: qvstfdxi 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x8a] - qvstfdxi 2, 10, 11 -# CHECK: qvstfdx 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x8e] - qvstfdx 2, 10, 11 -# CHECK: qvstfiwxa 2, 10, 11 # encoding: [0x7c,0x4a,0x5f,0x8f] - qvstfiwxa 2, 10, 11 -# CHECK: qvstfiwx 2, 10, 11 # encoding: [0x7c,0x4a,0x5f,0x8e] - qvstfiwx 2, 10, 11 -# CHECK: qvstfsuxa 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0x4f] - qvstfsuxa 2, 9, 11 -# CHECK: qvstfsuxia 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0x4b] - qvstfsuxia 2, 9, 11 -# CHECK: qvstfsuxi 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0x4a] - qvstfsuxi 2, 9, 11 -# CHECK: qvstfsux 2, 9, 11 # encoding: [0x7c,0x49,0x5d,0x4e] - qvstfsux 2, 9, 11 -# CHECK: qvstfsxa 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x0f] - qvstfsxa 2, 10, 11 -# CHECK: qvstfsxia 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x0b] - qvstfsxia 2, 10, 11 -# CHECK: qvstfsxi 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x0a] - qvstfsxi 2, 10, 11 -# CHECK: qvstfsx 2, 10, 11 # encoding: [0x7c,0x4a,0x5d,0x0e] - qvstfsx 2, 10, 11 - diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/fp-bc-icmp-const-fold.ll b/llvm/test/Transforms/Attributor/IPConstantProp/fp-bc-icmp-const-fold.ll index c45c48d502343..67c22f9470779 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/fp-bc-icmp-const-fold.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/fp-bc-icmp-const-fold.ll @@ -4,7 +4,7 @@ ; RUN: opt -attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM target datalayout = 
"E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" define void @test(i32 signext %n) { ; IS__TUNIT____: Function Attrs: nofree noreturn nosync nounwind readnone diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll index 6cd77a59df6b1..faf7041bfc387 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" %"struct.std::complex" = type { { float, float } } diff --git a/llvm/test/Transforms/DeadStoreElimination/combined-partial-overwrites.ll b/llvm/test/Transforms/DeadStoreElimination/combined-partial-overwrites.ll index 2a61fff15ade0..a57693a1da38e 100644 --- a/llvm/test/Transforms/DeadStoreElimination/combined-partial-overwrites.ll +++ b/llvm/test/Transforms/DeadStoreElimination/combined-partial-overwrites.ll @@ -1,6 +1,6 @@ ; RUN: opt -S -dse -enable-dse-partial-store-merging=false < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" %"struct.std::complex" = type { { float, float } } diff --git a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll index 4c5f18a26657c..5a6daa2c9a008 100644 --- a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll +++ b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll @@ -4,7 +4,7 @@ ; RUN: opt -passes="function(ee-instrument),function(ee-instrument),cgscc(inline),function(post-inline-ee-instrument),function(post-inline-ee-instrument)" -S < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" define void @leaf_function() #0 { entry: diff --git a/llvm/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll b/llvm/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll deleted file mode 100644 index e9710df5670cd..0000000000000 --- a/llvm/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll +++ /dev/null @@ -1,165 +0,0 @@ -; RUN: opt -S -instcombine < %s | FileCheck %s -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - -declare <4 x double> @llvm.ppc.qpx.qvlfs(i8*) #1 - -define <4 x double> @test1(<4 x float>* %h) #0 { -entry: - %h1 = getelementptr <4 x float>, <4 x float>* %h, i64 1 - %hv = bitcast <4 x float>* %h1 to i8* - %vl = call <4 x double> @llvm.ppc.qpx.qvlfs(i8* %hv) - -; CHECK-LABEL: @test1 -; CHECK: @llvm.ppc.qpx.qvlfs -; CHECK: ret <4 x double> - - %v0 = load <4 x float>, <4 x float>* %h, align 8 - %v0e = fpext <4 x float> %v0 to <4 x double> - %a = fadd <4 x double> %v0e, %vl - ret <4 x double> %a -} - -define <4 x double> @test1a(<4 x float>* align 16 %h) #0 { -entry: - %h1 = getelementptr <4 x float>, <4 x float>* %h, i64 1 - %hv = bitcast <4 x float>* %h1 to i8* - %vl = call <4 x double> @llvm.ppc.qpx.qvlfs(i8* %hv) - -; CHECK-LABEL: @test1a -; CHECK-NOT: @llvm.ppc.qpx.qvlfs -; CHECK-NOT: 
load <4 x double> -; CHECK: ret <4 x double> - - %v0 = load <4 x float>, <4 x float>* %h, align 8 - %v0e = fpext <4 x float> %v0 to <4 x double> - %a = fadd <4 x double> %v0e, %vl - ret <4 x double> %a -} - -declare void @llvm.ppc.qpx.qvstfs(<4 x double>, i8*) #0 - -define <4 x float> @test2(<4 x float>* %h, <4 x double> %d) #0 { -entry: - %h1 = getelementptr <4 x float>, <4 x float>* %h, i64 1 - %hv = bitcast <4 x float>* %h1 to i8* - call void @llvm.ppc.qpx.qvstfs(<4 x double> %d, i8* %hv) - - %v0 = load <4 x float>, <4 x float>* %h, align 8 - ret <4 x float> %v0 - -; CHECK-LABEL: @test2 -; CHECK: @llvm.ppc.qpx.qvstfs -; CHECK: ret <4 x float> -} - -define <4 x float> @test2a(<4 x float>* align 16 %h, <4 x double> %d) #0 { -entry: - %h1 = getelementptr <4 x float>, <4 x float>* %h, i64 1 - %hv = bitcast <4 x float>* %h1 to i8* - call void @llvm.ppc.qpx.qvstfs(<4 x double> %d, i8* %hv) - - %v0 = load <4 x float>, <4 x float>* %h, align 8 - ret <4 x float> %v0 - -; CHECK-LABEL: @test2 -; CHECK: fptrunc <4 x double> %d to <4 x float> -; CHECK-NOT: @llvm.ppc.qpx.qvstfs -; CHECK-NOT: store <4 x double> -; CHECK: ret <4 x float> -} - -declare <4 x double> @llvm.ppc.qpx.qvlfd(i8*) #1 - -define <4 x double> @test1l(<4 x double>* %h) #0 { -entry: - %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1 - %hv = bitcast <4 x double>* %h1 to i8* - %vl = call <4 x double> @llvm.ppc.qpx.qvlfd(i8* %hv) - -; CHECK-LABEL: @test1l -; CHECK: @llvm.ppc.qpx.qvlfd -; CHECK: ret <4 x double> - - %v0 = load <4 x double>, <4 x double>* %h, align 8 - %a = fadd <4 x double> %v0, %vl - ret <4 x double> %a -} - -define <4 x double> @test1ln(<4 x double>* align 16 %h) #0 { -entry: - %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1 - %hv = bitcast <4 x double>* %h1 to i8* - %vl = call <4 x double> @llvm.ppc.qpx.qvlfd(i8* %hv) - -; CHECK-LABEL: @test1ln -; CHECK: @llvm.ppc.qpx.qvlfd -; CHECK: ret <4 x double> - - %v0 = load <4 x double>, <4 x double>* %h, align 8 - %a = fadd <4 x double> %v0, %vl - ret <4 x double> %a -} - -define <4 x double> @test1la(<4 x double>* align 32 %h) #0 { -entry: - %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1 - %hv = bitcast <4 x double>* %h1 to i8* - %vl = call <4 x double> @llvm.ppc.qpx.qvlfd(i8* %hv) - -; CHECK-LABEL: @test1la -; CHECK-NOT: @llvm.ppc.qpx.qvlfd -; CHECK: ret <4 x double> - - %v0 = load <4 x double>, <4 x double>* %h, align 8 - %a = fadd <4 x double> %v0, %vl - ret <4 x double> %a -} - -declare void @llvm.ppc.qpx.qvstfd(<4 x double>, i8*) #0 - -define <4 x double> @test2l(<4 x double>* %h, <4 x double> %d) #0 { -entry: - %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1 - %hv = bitcast <4 x double>* %h1 to i8* - call void @llvm.ppc.qpx.qvstfd(<4 x double> %d, i8* %hv) - - %v0 = load <4 x double>, <4 x double>* %h, align 8 - ret <4 x double> %v0 - -; CHECK-LABEL: @test2l -; CHECK: @llvm.ppc.qpx.qvstfd -; CHECK: ret <4 x double> -} - -define <4 x double> @test2ln(<4 x double>* align 16 %h, <4 x double> %d) #0 { -entry: - %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1 - %hv = bitcast <4 x double>* %h1 to i8* - call void @llvm.ppc.qpx.qvstfd(<4 x double> %d, i8* %hv) - - %v0 = load <4 x double>, <4 x double>* %h, align 8 - ret <4 x double> %v0 - -; CHECK-LABEL: @test2ln -; CHECK: @llvm.ppc.qpx.qvstfd -; CHECK: ret <4 x double> -} - -define <4 x double> @test2la(<4 x double>* align 32 %h, <4 x double> %d) #0 { -entry: - %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1 - %hv = bitcast <4 x double>* %h1 to i8* - call void 
@llvm.ppc.qpx.qvstfd(<4 x double> %d, i8* %hv) - - %v0 = load <4 x double>, <4 x double>* %h, align 8 - ret <4 x double> %v0 - -; CHECK-LABEL: @test2l -; CHECK-NOT: @llvm.ppc.qpx.qvstfd -; CHECK: ret <4 x double> -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readonly } - diff --git a/llvm/test/Transforms/LoopDataPrefetch/PowerPC/basic.ll b/llvm/test/Transforms/LoopDataPrefetch/PowerPC/basic.ll index ea46fd0d5a8f8..68c75af14f3e9 100644 --- a/llvm/test/Transforms/LoopDataPrefetch/PowerPC/basic.ll +++ b/llvm/test/Transforms/LoopDataPrefetch/PowerPC/basic.ll @@ -1,7 +1,6 @@ -; RUN: opt -mcpu=a2 -loop-data-prefetch -S < %s | FileCheck %s -; RUN: opt -mcpu=a2 -passes=loop-data-prefetch -S < %s | FileCheck %s +; RUN: opt -mcpu=a2 -loop-data-prefetch -mtriple=powerpc64le-unknown-linux -enable-ppc-prefetching -S < %s | FileCheck %s +; RUN: opt -mcpu=a2 -passes=loop-data-prefetch -mtriple=powerpc64le-unknown-linux -enable-ppc-prefetching -S < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" define void @foo(double* nocapture %a, double* nocapture readonly %b) { entry: diff --git a/llvm/test/Transforms/LoopSimplify/dup-preds.ll b/llvm/test/Transforms/LoopSimplify/dup-preds.ll index c9253fa51a65f..362d834686d41 100644 --- a/llvm/test/Transforms/LoopSimplify/dup-preds.ll +++ b/llvm/test/Transforms/LoopSimplify/dup-preds.ll @@ -1,6 +1,6 @@ ; RUN: opt -loop-simplify -S %s | FileCheck %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" define fastcc void @do_update_md([3 x float]* nocapture readonly %x) #0 { entry: diff --git a/llvm/test/Transforms/LoopUnroll/pr14167.ll b/llvm/test/Transforms/LoopUnroll/pr14167.ll index 9aac70115d9ae..3097c234fb933 100644 --- a/llvm/test/Transforms/LoopUnroll/pr14167.ll +++ b/llvm/test/Transforms/LoopUnroll/pr14167.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -S -loop-unroll -unroll-runtime | FileCheck %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" define void @test1() nounwind { ; Ensure that we don't crash when the trip count == -1. 
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll deleted file mode 100644 index 9fdfb6f90e7bf..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll +++ /dev/null @@ -1,40 +0,0 @@ -; RUN: opt -S -basic-aa -loop-vectorize < %s | FileCheck %s -target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - -; Function Attrs: nounwind -define void @foo(double* noalias nocapture %a, double* noalias nocapture readonly %b, double* noalias nocapture readonly %c) #0 { -entry: - br label %for.body - -; CHECK-LABEL: @foo -; CHECK: fmul <4 x double> %{{[^,]+}}, -; CHECK-NEXT: fmul <4 x double> %{{[^,]+}}, - -for.cond.cleanup: ; preds = %for.body - ret void - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv - %0 = load double, double* %arrayidx, align 8 - %mul = fmul double %0, 2.000000e+00 - %mul3 = fmul double %0, %mul - %arrayidx5 = getelementptr inbounds double, double* %c, i64 %indvars.iv - %1 = load double, double* %arrayidx5, align 8 - %mul6 = fmul double %1, 3.000000e+00 - %mul9 = fmul double %1, %mul6 - %add = fadd double %mul3, %mul9 - %mul12 = fmul double %0, 4.000000e+00 - %mul15 = fmul double %mul12, %1 - %add16 = fadd double %mul15, %add - %add17 = fadd double %add16, 1.000000e+00 - %arrayidx19 = getelementptr inbounds double, double* %a, i64 %indvars.iv - store double %add17, double* %arrayidx19, align 8 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 1600 - br i1 %exitcond, label %for.cond.cleanup, label %for.body -} - -attributes #0 = { nounwind "target-cpu"="a2q" } - diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll index 8abc25ece35c6..cddddba579473 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll @@ -1,6 +1,6 @@ ; RUN: opt -S -loop-vectorize < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" ; Function Attrs: nounwind define zeroext i32 @test() #0 { diff --git a/llvm/test/Transforms/NewGVN/pr31483.ll b/llvm/test/Transforms/NewGVN/pr31483.ll index 999ff74ad5881..5bf7e1a695011 100644 --- a/llvm/test/Transforms/NewGVN/pr31483.ll +++ b/llvm/test/Transforms/NewGVN/pr31483.ll @@ -100,7 +100,7 @@ declare signext i32 @zot(i8*, ...) 
#1 ; Function Attrs: nounwind declare void @llvm.va_end(i8*) #2 -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-qpx,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-qpx,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #2 = { nounwind } diff --git a/llvm/test/Transforms/SCCP/fp-bc-icmp-const-fold.ll b/llvm/test/Transforms/SCCP/fp-bc-icmp-const-fold.ll index 8f97225ca446b..20c44384504e2 100644 --- a/llvm/test/Transforms/SCCP/fp-bc-icmp-const-fold.ll +++ b/llvm/test/Transforms/SCCP/fp-bc-icmp-const-fold.ll @@ -1,6 +1,6 @@ ; RUN: opt -S -ipsccp < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" define void @test(i32 signext %n) { diff --git a/llvm/unittests/ADT/TripleTest.cpp b/llvm/unittests/ADT/TripleTest.cpp index dc7a28c72f208..1852d7b6a1b0d 100644 --- a/llvm/unittests/ADT/TripleTest.cpp +++ b/llvm/unittests/ADT/TripleTest.cpp @@ -111,41 +111,6 @@ TEST(TripleTest, ParsedIDs) { EXPECT_EQ(Triple::Linux, T.getOS()); EXPECT_EQ(Triple::Musl, T.getEnvironment()); - T = Triple("powerpc-bgp-linux"); - EXPECT_EQ(Triple::ppc, T.getArch()); - EXPECT_EQ(Triple::BGP, T.getVendor()); - EXPECT_EQ(Triple::Linux, T.getOS()); - EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); - - T = Triple("powerpc-bgp-cnk"); - EXPECT_EQ(Triple::ppc, T.getArch()); - EXPECT_EQ(Triple::BGP, T.getVendor()); - EXPECT_EQ(Triple::CNK, T.getOS()); - EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); - - T = Triple("ppc-bgp-linux"); - EXPECT_EQ(Triple::ppc, T.getArch()); - EXPECT_EQ(Triple::BGP, T.getVendor()); - EXPECT_EQ(Triple::Linux, T.getOS()); - EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); - - T = Triple("ppc32-bgp-linux"); - EXPECT_EQ(Triple::ppc, T.getArch()); - EXPECT_EQ(Triple::BGP, T.getVendor()); - EXPECT_EQ(Triple::Linux, T.getOS()); - EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); - - T = Triple("powerpc64-bgq-linux"); - EXPECT_EQ(Triple::ppc64, T.getArch()); - EXPECT_EQ(Triple::BGQ, T.getVendor()); - EXPECT_EQ(Triple::Linux, T.getOS()); - EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); - - T = Triple("ppc64-bgq-linux"); - EXPECT_EQ(Triple::ppc64, T.getArch()); - EXPECT_EQ(Triple::BGQ, T.getVendor()); - EXPECT_EQ(Triple::Linux, 
T.getOS()); - T = Triple("powerpc-ibm-aix"); EXPECT_EQ(Triple::ppc, T.getArch()); EXPECT_EQ(Triple::IBM, T.getVendor()); diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn index 043a672a76e1e..3a452fc6e0601 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn @@ -61,7 +61,6 @@ static_library("LLVMPowerPCCodeGen") { "PPCMachineScheduler.cpp", "PPCMacroFusion.cpp", "PPCPreEmitPeephole.cpp", - "PPCQPXLoadSplat.cpp", "PPCReduceCRLogicals.cpp", "PPCRegisterInfo.cpp", "PPCSubtarget.cpp", diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index 933573bc810cb..bb6cee740ace7 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -1116,9 +1116,6 @@ extern kmp_uint64 __kmp_now_nsec(); #if KMP_OS_WINDOWS #define KMP_INIT_WAIT 64U /* initial number of spin-tests */ #define KMP_NEXT_WAIT 32U /* susequent number of spin-tests */ -#elif KMP_OS_CNK -#define KMP_INIT_WAIT 16U /* initial number of spin-tests */ -#define KMP_NEXT_WAIT 8U /* susequent number of spin-tests */ #elif KMP_OS_LINUX #define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ #define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp index b5c641cc7273c..f6fb1e602c297 100644 --- a/openmp/runtime/src/kmp_csupport.cpp +++ b/openmp/runtime/src/kmp_csupport.cpp @@ -680,17 +680,6 @@ void __kmpc_flush(ident_t *loc) { // Nothing to see here move along #elif KMP_ARCH_PPC64 // Nothing needed here (we have a real MB above). -#if KMP_OS_CNK - // The flushing thread needs to yield here; this prevents a - // busy-waiting thread from saturating the pipeline. flush is - // often used in loops like this: - // while (!flag) { - // #pragma omp flush(flag) - // } - // and adding the yield here is good for at least a 10x speedup - // when running >2 threads per core (on the NAS LU benchmark). - __kmp_yield(); -#endif #else #error Unknown or unsupported architecture #endif diff --git a/openmp/runtime/src/kmp_lock.h b/openmp/runtime/src/kmp_lock.h index e54f6812b8b34..b80e54777e8c2 100644 --- a/openmp/runtime/src/kmp_lock.h +++ b/openmp/runtime/src/kmp_lock.h @@ -159,7 +159,7 @@ extern void __kmp_destroy_nested_tas_lock(kmp_tas_lock_t *lck); #define KMP_LOCK_ACQUIRED_NEXT 0 #ifndef KMP_USE_FUTEX #define KMP_USE_FUTEX \ - (KMP_OS_LINUX && !KMP_OS_CNK && \ + (KMP_OS_LINUX && \ (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)) #endif #if KMP_USE_FUTEX diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h index bfe7765b2a967..33735cf455c7e 100644 --- a/openmp/runtime/src/kmp_os.h +++ b/openmp/runtime/src/kmp_os.h @@ -69,7 +69,7 @@ #error Unknown compiler #endif -#if (KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_FREEBSD) && !KMP_OS_CNK +#if (KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_FREEBSD) #define KMP_AFFINITY_SUPPORTED 1 #if KMP_OS_WINDOWS && KMP_ARCH_X86_64 #define KMP_GROUP_AFFINITY 1 diff --git a/openmp/runtime/src/kmp_platform.h b/openmp/runtime/src/kmp_platform.h index 779c08e9771d5..4296ca31d67d9 100644 --- a/openmp/runtime/src/kmp_platform.h +++ b/openmp/runtime/src/kmp_platform.h @@ -22,7 +22,6 @@ #define KMP_OS_OPENBSD 0 #define KMP_OS_DARWIN 0 #define KMP_OS_WINDOWS 0 -#define KMP_OS_CNK 0 #define KMP_OS_HURD 0 #define KMP_OS_UNIX 0 /* disjunction of KMP_OS_LINUX, KMP_OS_DARWIN etc. 
*/ @@ -66,11 +65,6 @@ #define KMP_OS_OPENBSD 1 #endif -#if (defined __bgq__) -#undef KMP_OS_CNK -#define KMP_OS_CNK 1 -#endif - #if (defined __GNU__) #undef KMP_OS_HURD #define KMP_OS_HURD 1 diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S index 8090ff759fe1b..16059a3762bf4 100644 --- a/openmp/runtime/src/z_Linux_asm.S +++ b/openmp/runtime/src/z_Linux_asm.S @@ -1433,13 +1433,8 @@ __kmp_invoke_microtask: add 12, 0, 12 neg 12, 12 -// We need to make sure that the stack frame stays aligned (to 16 bytes, except -// under the BG/Q CNK, where it must be to 32 bytes). -# if KMP_OS_CNK - li 0, -32 -# else +// We need to make sure that the stack frame stays aligned (to 16 bytes). li 0, -16 -# endif and 12, 0, 12 // Establish the local stack frame. diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp index 3b5910fc95e89..58cc4d25f6080 100644 --- a/openmp/runtime/src/z_Linux_util.cpp +++ b/openmp/runtime/src/z_Linux_util.cpp @@ -31,7 +31,7 @@ #include #include -#if KMP_OS_LINUX && !KMP_OS_CNK +#if KMP_OS_LINUX #include #if KMP_USE_FUTEX // We should really include , but that causes compatibility problems on diff --git a/polly/lib/External/isl/config.sub b/polly/lib/External/isl/config.sub index 1d8e98bcee23a..bc4db70f82abf 100755 --- a/polly/lib/External/isl/config.sub +++ b/polly/lib/External/isl/config.sub @@ -152,9 +152,6 @@ case $os in os= basic_machine=$1 ;; - -bluegene*) - os=-cnk - ;; -sim | -cisco | -oki | -wec | -winbond) os= basic_machine=$1 @@ -539,10 +536,6 @@ case $basic_machine in basic_machine=bfin-`echo "$basic_machine" | sed 's/^[^-]*-//'` os=-linux ;; - bluegene*) - basic_machine=powerpc-ibm - os=-cnk - ;; c54x-*) basic_machine=tic54x-`echo "$basic_machine" | sed 's/^[^-]*-//'` ;; @@ -1364,7 +1357,7 @@ case $os in # Each alternative MUST end in a * to match a version number. # -sysv* is not here because it comes later, after sysvr4. -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ - | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ + | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ | -sym* | -kopensolaris* | -plan9* \ | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ @@ -1728,7 +1721,7 @@ case $basic_machine in -sunos*) vendor=sun ;; - -cnk*|-aix*) + -aix*) vendor=ibm ;; -beos*) diff --git a/polly/lib/External/ppcg/config.sub b/polly/lib/External/ppcg/config.sub index 6205f8423d6aa..d97f3009f9f09 100644 --- a/polly/lib/External/ppcg/config.sub +++ b/polly/lib/External/ppcg/config.sub @@ -160,9 +160,6 @@ case $os in os= basic_machine=$1 ;; - -bluegene*) - os=-cnk - ;; -sim | -cisco | -oki | -wec | -winbond) os= basic_machine=$1 @@ -524,10 +521,6 @@ case $basic_machine in basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'` os=-linux ;; - bluegene*) - basic_machine=powerpc-ibm - os=-cnk - ;; c54x-*) basic_machine=tic54x-`echo $basic_machine | sed 's/^[^-]*-//'` ;; @@ -1344,7 +1337,7 @@ case $os in # Each alternative MUST END IN A *, to match a version number. # -sysv* is not here because it comes later, after sysvr4. 
-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ - | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ + | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ | -sym* | -kopensolaris* \ | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ @@ -1709,7 +1702,7 @@ case $basic_machine in -sunos*) vendor=sun ;; - -cnk*|-aix*) + -aix*) vendor=ibm ;; -beos*) From 22ec861d28c0aa0cdf76b9618151b9ee87ba2221 Mon Sep 17 00:00:00 2001 From: Xing GUO Date: Tue, 28 Jul 2020 22:10:44 +0800 Subject: [PATCH 0301/1035] [DWARFYAML] Add support for emitting custom range list content. This patch adds support for emitting custom range list content. We are able to handcraft a custom range list via the following syntax. ``` debug_rnglists: - Lists: - Entries: - Operator: DW_RLE_startx_endx Values: [ 0x1234, 0x1234 ] - Content: '1234567890abcdef' - Content: 'abcdef1234567890' ``` Reviewed By: jhenderson Differential Revision: https://reviews.llvm.org/D84618 --- llvm/include/llvm/ObjectYAML/DWARFYAML.h | 6 +- llvm/lib/ObjectYAML/DWARFEmitter.cpp | 17 ++++-- llvm/lib/ObjectYAML/DWARFYAML.cpp | 9 +++ .../yaml2obj/ELF/DWARF/debug-rnglists.yaml | 61 +++++++++++++++++++ 4 files changed, 86 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/ObjectYAML/DWARFYAML.h b/llvm/include/llvm/ObjectYAML/DWARFYAML.h index 88ac404b21b13..f4ace738ef8cf 100644 --- a/llvm/include/llvm/ObjectYAML/DWARFYAML.h +++ b/llvm/include/llvm/ObjectYAML/DWARFYAML.h @@ -18,6 +18,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/ObjectYAML/YAML.h" #include "llvm/Support/YAMLTraits.h" #include #include @@ -189,7 +190,8 @@ struct RnglistEntry { }; template struct ListEntries { - std::vector Entries; + Optional> Entries; + Optional Content; }; template struct ListTable { @@ -328,6 +330,8 @@ struct MappingTraits> { template struct MappingTraits> { static void mapping(IO &IO, DWARFYAML::ListEntries &ListEntries); + static StringRef validate(IO &IO, + DWARFYAML::ListEntries &ListEntries); }; template <> struct MappingTraits { diff --git a/llvm/lib/ObjectYAML/DWARFEmitter.cpp b/llvm/lib/ObjectYAML/DWARFEmitter.cpp index f61191022fb9b..b39bb003db9a5 100644 --- a/llvm/lib/ObjectYAML/DWARFEmitter.cpp +++ b/llvm/lib/ObjectYAML/DWARFEmitter.cpp @@ -658,12 +658,17 @@ Error writeDWARFLists(raw_ostream &OS, for (const DWARFYAML::ListEntries &List : Table.Lists) { Offsets.push_back(ListBufferOS.tell()); - for (const EntryType &Entry : List.Entries) { - Expected EntrySize = - writeListEntry(ListBufferOS, Entry, AddrSize, IsLittleEndian); - if (!EntrySize) - return EntrySize.takeError(); - Length += *EntrySize; + if (List.Content) { + List.Content->writeAsBinary(ListBufferOS, UINT64_MAX); + Length += List.Content->binary_size(); + } else if (List.Entries) { + for (const EntryType &Entry : *List.Entries) { + Expected EntrySize = + writeListEntry(ListBufferOS, Entry, AddrSize, IsLittleEndian); + if (!EntrySize) + return EntrySize.takeError(); + Length += *EntrySize; + } } } diff --git a/llvm/lib/ObjectYAML/DWARFYAML.cpp b/llvm/lib/ObjectYAML/DWARFYAML.cpp index e5c77bc3721fb..adc167249226b 100644 --- a/llvm/lib/ObjectYAML/DWARFYAML.cpp +++ b/llvm/lib/ObjectYAML/DWARFYAML.cpp @@ -246,6 +246,15 @@ template void MappingTraits>::mapping( IO &IO, DWARFYAML::ListEntries &ListEntries) { IO.mapOptional("Entries", ListEntries.Entries); + IO.mapOptional("Content", 
ListEntries.Content); +} + +template +StringRef MappingTraits>::validate( + IO &IO, DWARFYAML::ListEntries &ListEntries) { + if (ListEntries.Entries && ListEntries.Content) + return "Entries and Content can't be used together"; + return StringRef(); } template diff --git a/llvm/test/tools/yaml2obj/ELF/DWARF/debug-rnglists.yaml b/llvm/test/tools/yaml2obj/ELF/DWARF/debug-rnglists.yaml index 80dc9b9fdeb35..248cb190235b2 100644 --- a/llvm/test/tools/yaml2obj/ELF/DWARF/debug-rnglists.yaml +++ b/llvm/test/tools/yaml2obj/ELF/DWARF/debug-rnglists.yaml @@ -608,3 +608,64 @@ FileHeader: Machine: EM_X86_64 DWARF: debug_rnglists: [] + +## s) Test that we are able to generate a range list via raw binary data. + +# RUN: yaml2obj --docnum=17 %s -o %t17.o +# RUN: llvm-readelf --hex-dump=.debug_rnglists %t17.o | \ +# RUN: FileCheck %s --check-prefix=CUSTOM-LIST + +# CUSTOM-LIST: Hex dump of section '.debug_rnglists': +# CUSTOM-LIST-NEXT: 0x00000000 29000000 05000800 03000000 0c000000 )............... +## ^------- unit_length (4-byte) +## ^--- version (2-byte) +## ^- address_size (1-byte) +## ^- segment_selector_size (1-byte) +## ^------- offset_entry_count (4-byte) +## ^------- offsets[0] (4-byte) +# CUSTOM-LIST-NEXT: 0x00000010 11000000 19000000 02b424b4 24123456 ..........$.$.4V +## ^------- offsets[1] (4-byte) +## ^------- offsets[2] (4-byte) +## ^- DW_RLE_startx_endx +## ^--- operands[0] (ULEB128) 0x1234 +## ^---- operands[1] (ULEB128) 0x1234 +## ^----- custom list content +# CUSTOM-LIST-NEXT: 0x00000020 7890abcd efabcdef 12345678 90 x........4Vx. +## ----------- +## ^----------------- custom list content + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +DWARF: + debug_rnglists: + - Lists: + - Entries: + - Operator: DW_RLE_startx_endx + Values: [ 0x1234, 0x1234 ] + - Content: '1234567890abcdef' + - Content: 'abcdef1234567890' + +## t) Test that yaml2obj emits an error message when 'Content' and 'Entries' are specified +## at the same time. + +# RUN: not yaml2obj --docnum=18 %s 2>&1 | FileCheck %s --check-prefix=ERR + +# ERR: YAML:{{.*}}: error: Entries and Content can't be used together +# ERR-NEXT: - Entries: [] +# ERR-NEXT: ^ + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +DWARF: + debug_rnglists: + - Lists: + - Entries: [] + Content: '' From c64c04bbaadbc35e265f12644b45787d6d077587 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Tue, 28 Jul 2020 16:29:29 +0200 Subject: [PATCH 0302/1035] Clean up cuda-runtime-wrappers API. Do not return error code, instead return created resource handles or void. Error reporting is done by the library function. 
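For illustration, here is a minimal caller-side sketch (hypothetical host code, not
part of this patch) of how the reworked wrappers compose once handles are
returned directly; the `runExample` driver, the "kernel" name, and the launch
dimensions are invented, while the declarations mirror the new signatures in
the diff below:

```
#include <cstdint>
#include "cuda.h"

// New wrapper API: resource handles out, errors reported inside the wrappers.
extern "C" CUmodule mgpuModuleLoad(void *data);
extern "C" CUfunction mgpuModuleGetFunction(CUmodule module, const char *name);
extern "C" CUstream mgpuStreamCreate();
extern "C" void mgpuLaunchKernel(CUfunction function, intptr_t gridX,
                                 intptr_t gridY, intptr_t gridZ,
                                 intptr_t blockX, intptr_t blockY,
                                 intptr_t blockZ, int32_t smem, CUstream stream,
                                 void **params, void **extra);
extern "C" void mgpuStreamSynchronize(CUstream stream);

// Hypothetical driver; callers need no status checks because a failure is
// already reported (failing expression plus error name) by the wrapper itself.
void runExample(void *cubin, void **params) {
  CUmodule module = mgpuModuleLoad(cubin);        // was: int32_t + out-param
  CUfunction function = mgpuModuleGetFunction(module, "kernel");
  CUstream stream = mgpuStreamCreate();           // was: mgpuGetStreamHelper
  mgpuLaunchKernel(function, /*grid=*/1, 1, 1, /*block=*/1, 1, 1,
                   /*smem=*/0, stream, params, /*extra=*/nullptr);
  mgpuStreamSynchronize(stream);
}
```

This also simplifies the lowering: the generated calls consume SSA results
directly instead of going through alloca'd out-parameters, as the
ConvertLaunchFuncToRuntimeCalls.cpp changes below show.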
Reviewed By: herhut Differential Revision: https://reviews.llvm.org/D84660 --- .../ConvertLaunchFuncToRuntimeCalls.cpp | 96 +++++++------------ ...ower-launch-func-to-gpu-runtime-calls.mlir | 10 +- .../cuda-runtime-wrappers.cpp | 89 ++++++++--------- .../rocm-runtime-wrappers.cpp | 80 ++++++++-------- 4 files changed, 121 insertions(+), 154 deletions(-) diff --git a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp index 37b056263ab46..14011e08de027 100644 --- a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp +++ b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp @@ -39,7 +39,7 @@ static constexpr const char *kGpuModuleLoadName = "mgpuModuleLoad"; static constexpr const char *kGpuModuleGetFunctionName = "mgpuModuleGetFunction"; static constexpr const char *kGpuLaunchKernelName = "mgpuLaunchKernel"; -static constexpr const char *kGpuGetStreamHelperName = "mgpuGetStreamHelper"; +static constexpr const char *kGpuStreamCreateName = "mgpuStreamCreate"; static constexpr const char *kGpuStreamSynchronizeName = "mgpuStreamSynchronize"; static constexpr const char *kGpuMemHostRegisterName = "mgpuMemHostRegister"; @@ -100,12 +100,6 @@ class GpuLaunchFuncToGpuRuntimeCallsPass getLLVMDialect(), module.getDataLayout().getPointerSizeInBits()); } - LLVM::LLVMType getGpuRuntimeResultType() { - // This is declared as an enum in both CUDA and ROCm (HIP), but helpers - // use i32. - return getInt32Type(); - } - // Allocate a void pointer on the stack. Value allocatePointer(OpBuilder &builder, Location loc) { auto one = builder.create(loc, getInt32Type(), @@ -168,27 +162,21 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions( if (!module.lookupSymbol(kGpuModuleLoadName)) { builder.create( loc, kGpuModuleLoadName, - LLVM::LLVMType::getFunctionTy( - getGpuRuntimeResultType(), - { - getPointerPointerType(), /* CUmodule *module */ - getPointerType() /* void *cubin */ - }, - /*isVarArg=*/false)); + LLVM::LLVMType::getFunctionTy(getPointerType(), + {getPointerType()}, /* void *cubin */ + /*isVarArg=*/false)); } if (!module.lookupSymbol(kGpuModuleGetFunctionName)) { // The helper uses void* instead of CUDA's opaque CUmodule and // CUfunction, or ROCm (HIP)'s opaque hipModule_t and hipFunction_t. builder.create( loc, kGpuModuleGetFunctionName, - LLVM::LLVMType::getFunctionTy( - getGpuRuntimeResultType(), - { - getPointerPointerType(), /* void **function */ - getPointerType(), /* void *module */ - getPointerType() /* char *name */ - }, - /*isVarArg=*/false)); + LLVM::LLVMType::getFunctionTy(getPointerType(), + { + getPointerType(), /* void *module */ + getPointerType() /* char *name */ + }, + /*isVarArg=*/false)); } if (!module.lookupSymbol(kGpuLaunchKernelName)) { // Other than the CUDA or ROCm (HIP) api, the wrappers use uintptr_t to @@ -198,7 +186,7 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions( builder.create( loc, kGpuLaunchKernelName, LLVM::LLVMType::getFunctionTy( - getGpuRuntimeResultType(), + getVoidType(), { getPointerType(), /* void* f */ getIntPtrType(), /* intptr_t gridXDim */ @@ -214,18 +202,18 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions( }, /*isVarArg=*/false)); } - if (!module.lookupSymbol(kGpuGetStreamHelperName)) { + if (!module.lookupSymbol(kGpuStreamCreateName)) { // Helper function to get the current GPU compute stream. Uses void* // instead of CUDA's opaque CUstream, or ROCm (HIP)'s opaque hipStream_t. 
builder.create( - loc, kGpuGetStreamHelperName, + loc, kGpuStreamCreateName, LLVM::LLVMType::getFunctionTy(getPointerType(), /*isVarArg=*/false)); } if (!module.lookupSymbol(kGpuStreamSynchronizeName)) { builder.create( loc, kGpuStreamSynchronizeName, - LLVM::LLVMType::getFunctionTy(getGpuRuntimeResultType(), - getPointerType() /* CUstream stream */, + LLVM::LLVMType::getFunctionTy(getVoidType(), + {getPointerType()}, /* void *stream */ /*isVarArg=*/false)); } if (!module.lookupSymbol(kGpuMemHostRegisterName)) { @@ -365,17 +353,13 @@ Value GpuLaunchFuncToGpuRuntimeCallsPass::generateKernelNameConstant( // hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR. // // %0 = call %binarygetter -// %1 = alloca sizeof(void*) -// call %moduleLoad(%2, %1) -// %2 = alloca sizeof(void*) -// %3 = load %1 -// %4 = -// call %moduleGetFunction(%2, %3, %4) -// %5 = call %getStreamHelper() -// %6 = load %2 -// %7 = -// call %launchKernel(%6, , 0, %5, %7, nullptr) -// call %streamSynchronize(%5) +// %1 = call %moduleLoad(%0) +// %2 = +// %3 = call %moduleGetFunction(%1, %2) +// %4 = call %streamCreate() +// %5 = +// call %launchKernel(%3, , 0, %4, %5, nullptr) +// call %streamSynchronize(%4) void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls( mlir::gpu::LaunchFuncOp launchOp) { OpBuilder builder(launchOp); @@ -405,36 +389,30 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls( // Emit the load module call to load the module data. Error checking is done // in the called helper function. - auto gpuModule = allocatePointer(builder, loc); auto gpuModuleLoad = getOperation().lookupSymbol(kGpuModuleLoadName); - builder.create(loc, ArrayRef{getGpuRuntimeResultType()}, - builder.getSymbolRefAttr(gpuModuleLoad), - ArrayRef{gpuModule, data}); + auto module = builder.create( + loc, ArrayRef{getPointerType()}, + builder.getSymbolRefAttr(gpuModuleLoad), ArrayRef{data}); // Get the function from the module. The name corresponds to the name of // the kernel function. - auto gpuOwningModuleRef = - builder.create(loc, getPointerType(), gpuModule); auto kernelName = generateKernelNameConstant( launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, builder); - auto gpuFunction = allocatePointer(builder, loc); auto gpuModuleGetFunction = getOperation().lookupSymbol(kGpuModuleGetFunctionName); - builder.create( - loc, ArrayRef{getGpuRuntimeResultType()}, + auto function = builder.create( + loc, ArrayRef{getPointerType()}, builder.getSymbolRefAttr(gpuModuleGetFunction), - ArrayRef{gpuFunction, gpuOwningModuleRef, kernelName}); + ArrayRef{module.getResult(0), kernelName}); // Grab the global stream needed for execution. - auto gpuGetStreamHelper = - getOperation().lookupSymbol(kGpuGetStreamHelperName); - auto gpuStream = builder.create( + auto gpuStreamCreate = + getOperation().lookupSymbol(kGpuStreamCreateName); + auto stream = builder.create( loc, ArrayRef{getPointerType()}, - builder.getSymbolRefAttr(gpuGetStreamHelper), ArrayRef{}); + builder.getSymbolRefAttr(gpuStreamCreate), ArrayRef{}); // Invoke the function with required arguments. 
auto gpuLaunchKernel = getOperation().lookupSymbol(kGpuLaunchKernelName); - auto gpuFunctionRef = - builder.create(loc, getPointerType(), gpuFunction); auto paramsArray = setupParamsArray(launchOp, builder); if (!paramsArray) { launchOp.emitOpError() << "cannot pass given parameters to the kernel"; @@ -443,21 +421,21 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls( auto nullpointer = builder.create(loc, getPointerPointerType(), zero); builder.create( - loc, ArrayRef{getGpuRuntimeResultType()}, + loc, ArrayRef{getVoidType()}, builder.getSymbolRefAttr(gpuLaunchKernel), - ArrayRef{gpuFunctionRef, launchOp.getOperand(0), + ArrayRef{function.getResult(0), launchOp.getOperand(0), launchOp.getOperand(1), launchOp.getOperand(2), launchOp.getOperand(3), launchOp.getOperand(4), launchOp.getOperand(5), zero, /* sharedMemBytes */ - gpuStream.getResult(0), /* stream */ + stream.getResult(0), /* stream */ paramsArray, /* kernel params */ nullpointer /* extra */}); // Sync on the stream to make it synchronous. auto gpuStreamSync = getOperation().lookupSymbol(kGpuStreamSynchronizeName); - builder.create(loc, ArrayRef{getGpuRuntimeResultType()}, + builder.create(loc, ArrayRef{getVoidType()}, builder.getSymbolRefAttr(gpuStreamSync), - ArrayRef(gpuStream.getResult(0))); + ArrayRef(stream.getResult(0))); launchOp.erase(); } diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir index a3381465ebf2c..bdcde0be60c20 100644 --- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir @@ -20,13 +20,11 @@ module attributes {gpu.container_module} { // CHECK: %[[addressof:.*]] = llvm.mlir.addressof @[[global]] // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) - // CHECK: %[[binary_ptr:.*]] = llvm.getelementptr %[[addressof]][%[[c0]], %[[c0]]] + // CHECK: %[[binary:.*]] = llvm.getelementptr %[[addressof]][%[[c0]], %[[c0]]] // CHECK-SAME: -> !llvm<"i8*"> - // CHECK: %[[module_ptr:.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**"> - // CHECK: llvm.call @mgpuModuleLoad(%[[module_ptr]], %[[binary_ptr]]) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32 - // CHECK: %[[func_ptr:.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**"> - // CHECK: llvm.call @mgpuModuleGetFunction(%[[func_ptr]], {{.*}}, {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">, !llvm<"i8*">) -> !llvm.i32 - // CHECK: llvm.call @mgpuGetStreamHelper + // CHECK: %[[module:.*]] = llvm.call @mgpuModuleLoad(%[[binary]]) : (!llvm<"i8*">) -> !llvm<"i8*"> + // CHECK: %[[func:.*]] = llvm.call @mgpuModuleGetFunction(%[[module]], {{.*}}) : (!llvm<"i8*">, !llvm<"i8*">) -> !llvm<"i8*"> + // CHECK: llvm.call @mgpuStreamCreate // CHECK: llvm.call @mgpuLaunchKernel // CHECK: llvm.call @mgpuStreamSynchronize "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = @kernel_module::@kernel } diff --git a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp index 2b71eb34703bd..8e2dc029fa9f0 100644 --- a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp +++ b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp @@ -21,54 +21,50 @@ #include "cuda.h" -namespace { -int32_t reportErrorIfAny(CUresult result, const char *where) { - if (result != CUDA_SUCCESS) { - llvm::errs() << "CUDA failed with " << result << " in " << where << "\n"; - } - return 
result; +#define CUDA_REPORT_IF_ERROR(expr) \ + [](CUresult result) { \ + if (!result) \ + return; \ + const char *name = nullptr; \ + cuGetErrorName(result, &name); \ + if (!name) \ + name = ""; \ + llvm::errs() << "'" << #expr << "' failed with '" << name << "'\n"; \ + }(expr) + +extern "C" CUmodule mgpuModuleLoad(void *data) { + CUmodule module = nullptr; + CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data)); + return module; } -} // anonymous namespace -extern "C" int32_t mgpuModuleLoad(void **module, void *data) { - int32_t err = reportErrorIfAny( - cuModuleLoadData(reinterpret_cast(module), data), - "ModuleLoad"); - return err; -} - -extern "C" int32_t mgpuModuleGetFunction(void **function, void *module, - const char *name) { - return reportErrorIfAny( - cuModuleGetFunction(reinterpret_cast(function), - reinterpret_cast(module), name), - "GetFunction"); +extern "C" CUfunction mgpuModuleGetFunction(CUmodule module, const char *name) { + CUfunction function = nullptr; + CUDA_REPORT_IF_ERROR(cuModuleGetFunction(&function, module, name)); + return function; } // The wrapper uses intptr_t instead of CUDA's unsigned int to match // the type of MLIR's index type. This avoids the need for casts in the // generated MLIR code. -extern "C" int32_t mgpuLaunchKernel(void *function, intptr_t gridX, - intptr_t gridY, intptr_t gridZ, - intptr_t blockX, intptr_t blockY, - intptr_t blockZ, int32_t smem, void *stream, - void **params, void **extra) { - return reportErrorIfAny( - cuLaunchKernel(reinterpret_cast(function), gridX, gridY, - gridZ, blockX, blockY, blockZ, smem, - reinterpret_cast(stream), params, extra), - "LaunchKernel"); +extern "C" void mgpuLaunchKernel(CUfunction function, intptr_t gridX, + intptr_t gridY, intptr_t gridZ, + intptr_t blockX, intptr_t blockY, + intptr_t blockZ, int32_t smem, CUstream stream, + void **params, void **extra) { + CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX, + blockY, blockZ, smem, stream, params, + extra)); } -extern "C" void *mgpuGetStreamHelper() { - CUstream stream; - reportErrorIfAny(cuStreamCreate(&stream, CU_STREAM_DEFAULT), "StreamCreate"); +extern "C" CUstream mgpuStreamCreate() { + CUstream stream = nullptr; + CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); return stream; } -extern "C" int32_t mgpuStreamSynchronize(void *stream) { - return reportErrorIfAny( - cuStreamSynchronize(reinterpret_cast(stream)), "StreamSync"); +extern "C" void mgpuStreamSynchronize(CUstream stream) { + CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream)); } /// Helper functions for writing mlir example code @@ -76,17 +72,16 @@ extern "C" int32_t mgpuStreamSynchronize(void *stream) { // Allows to register byte array with the CUDA runtime. Helpful until we have // transfer functions implemented. extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) { - reportErrorIfAny(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0), - "MemHostRegister"); + CUDA_REPORT_IF_ERROR(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0)); } // Allows to register a MemRef with the CUDA runtime. Initializes array with // value. Helpful until we have transfer functions implemented. 
template -void mgpuMemHostRegisterMemRef(const DynamicMemRefType &mem_ref, T value) { - llvm::SmallVector denseStrides(mem_ref.rank); - llvm::ArrayRef sizes(mem_ref.sizes, mem_ref.rank); - llvm::ArrayRef strides(mem_ref.strides, mem_ref.rank); +void mgpuMemHostRegisterMemRef(const DynamicMemRefType &memRef, T value) { + llvm::SmallVector denseStrides(memRef.rank); + llvm::ArrayRef sizes(memRef.sizes, memRef.rank); + llvm::ArrayRef strides(memRef.strides, memRef.rank); std::partial_sum(sizes.rbegin(), sizes.rend(), denseStrides.rbegin(), std::multiplies()); @@ -98,17 +93,17 @@ void mgpuMemHostRegisterMemRef(const DynamicMemRefType &mem_ref, T value) { denseStrides.back() = 1; assert(strides == llvm::makeArrayRef(denseStrides)); - auto *pointer = mem_ref.data + mem_ref.offset; + auto *pointer = memRef.data + memRef.offset; std::fill_n(pointer, count, value); mgpuMemHostRegister(pointer, count * sizeof(T)); } extern "C" void mgpuMemHostRegisterFloat(int64_t rank, void *ptr) { - UnrankedMemRefType mem_ref = {rank, ptr}; - mgpuMemHostRegisterMemRef(DynamicMemRefType(mem_ref), 1.23f); + UnrankedMemRefType memRef = {rank, ptr}; + mgpuMemHostRegisterMemRef(DynamicMemRefType(memRef), 1.23f); } extern "C" void mgpuMemHostRegisterInt32(int64_t rank, void *ptr) { - UnrankedMemRefType mem_ref = {rank, ptr}; - mgpuMemHostRegisterMemRef(DynamicMemRefType(mem_ref), 123); + UnrankedMemRefType memRef = {rank, ptr}; + mgpuMemHostRegisterMemRef(DynamicMemRefType(memRef), 123); } diff --git a/mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp b/mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp index f49e6c91ea65e..b97ce695ac42c 100644 --- a/mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp +++ b/mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp @@ -21,56 +21,52 @@ #include "hip/hip_runtime.h" -namespace { -int32_t reportErrorIfAny(hipError_t result, const char *where) { - if (result != hipSuccess) { - llvm::errs() << "HIP failed with " << result << " in " << where << "\n"; - } - return result; +#define HIP_REPORT_IF_ERROR(expr) \ + [](hipError_t result) { \ + if (!result) \ + return; \ + const char *name = nullptr; \ + hipGetErrorName(result, &name); \ + if (!name) \ + name = ""; \ + llvm::errs() << "'" << #expr << "' failed with '" << name << "'\n"; \ + }(expr) + +extern "C" hipModule_t mgpuModuleLoad(void *data) { + hipModule_t module = nullptr; + HIP_REPORT_IF_ERROR(hipModuleLoadData(&module, data)); + return module; } -} // anonymous namespace -extern "C" int32_t mgpuModuleLoad(void **module, void *data) { - int32_t err = reportErrorIfAny( - hipModuleLoadData(reinterpret_cast(module), data), - "ModuleLoad"); - return err; -} - -extern "C" int32_t mgpuModuleGetFunction(void **function, void *module, - const char *name) { - return reportErrorIfAny( - hipModuleGetFunction(reinterpret_cast(function), - reinterpret_cast(module), name), - "GetFunction"); +extern "C" hipFunction_t mgpuModuleGetFunction(hipModule_t module, + const char *name) { + hipFunction_t function = nullptr; + HIP_REPORT_IF_ERROR(hipModuleGetFunction(&function, module, name)); + return function; } // The wrapper uses intptr_t instead of ROCM's unsigned int to match // the type of MLIR's index type. This avoids the need for casts in the // generated MLIR code. 
-extern "C" int32_t mgpuLaunchKernel(void *function, intptr_t gridX, - intptr_t gridY, intptr_t gridZ, - intptr_t blockX, intptr_t blockY, - intptr_t blockZ, int32_t smem, void *stream, - void **params, void **extra) { - return reportErrorIfAny( - hipModuleLaunchKernel(reinterpret_cast(function), gridX, - gridY, gridZ, blockX, blockY, blockZ, smem, - reinterpret_cast(stream), params, - extra), - "LaunchKernel"); +extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX, + intptr_t gridY, intptr_t gridZ, + intptr_t blockX, intptr_t blockY, + intptr_t blockZ, int32_t smem, + hipStream_t stream, void **params, + void **extra) { + HIP_REPORT_IF_ERROR(hipModuleLaunchKernel(function, gridX, gridY, gridZ, + blockX, blockY, blockZ, smem, + stream, params, extra)); } -extern "C" void *mgpuGetStreamHelper() { - hipStream_t stream; - reportErrorIfAny(hipStreamCreate(&stream), "StreamCreate"); +extern "C" void *mgpuStreamCreate() { + hipStream_t stream = nullptr; + HIP_REPORT_IF_ERROR(hipStreamCreate(&stream)); return stream; } -extern "C" int32_t mgpuStreamSynchronize(void *stream) { - return reportErrorIfAny( - hipStreamSynchronize(reinterpret_cast(stream)), - "StreamSync"); +extern "C" void mgpuStreamSynchronize(hipStream_t stream) { + return HIP_REPORT_IF_ERROR(hipStreamSynchronize(stream)); } /// Helper functions for writing mlir example code @@ -78,8 +74,8 @@ extern "C" int32_t mgpuStreamSynchronize(void *stream) { // Allows to register byte array with the ROCM runtime. Helpful until we have // transfer functions implemented. extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) { - reportErrorIfAny(hipHostRegister(ptr, sizeBytes, /*flags=*/0), - "MemHostRegister"); + HIP_REPORT_IF_ERROR(hipHostRegister(ptr, sizeBytes, /*flags=*/0), + "MemHostRegister"); } // Allows to register a MemRef with the ROCM runtime. Initializes array with @@ -120,8 +116,8 @@ extern "C" void mgpuMemHostRegisterInt32(int64_t rank, void *ptr) { template void mgpuMemGetDevicePointer(T *hostPtr, T **devicePtr) { - reportErrorIfAny(hipSetDevice(0), "hipSetDevice"); - reportErrorIfAny( + HIP_REPORT_IF_ERROR(hipSetDevice(0), "hipSetDevice"); + HIP_REPORT_IF_ERROR( hipHostGetDevicePointer((void **)devicePtr, hostPtr, /*flags=*/0), "hipHostGetDevicePointer"); } From 01302ec08609ef3b113e13aba2a08443e4fa3046 Mon Sep 17 00:00:00 2001 From: Bruno Ricci Date: Tue, 28 Jul 2020 15:35:31 +0100 Subject: [PATCH 0303/1035] [Support][NFC] Fix a Wdocumentation warning in FileSystem.h --- llvm/include/llvm/Support/FileSystem.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Support/FileSystem.h b/llvm/include/llvm/Support/FileSystem.h index b6d2a9f3aad53..18d787a75f611 100644 --- a/llvm/include/llvm/Support/FileSystem.h +++ b/llvm/include/llvm/Support/FileSystem.h @@ -1140,7 +1140,7 @@ openNativeFileForRead(const Twine &Name, OpenFlags Flags = OF_None, /// none of other processes read or write this file, provided that all processes /// lock the file prior to accessing its content. /// -/// @param File The descriptor representing the file to lock. +/// @param FD The descriptor representing the file to lock. /// @param Timeout Time in milliseconds that the process should wait before /// reporting lock failure. Zero value means try to get lock only /// once. @@ -1163,7 +1163,7 @@ std::error_code lockFile(int FD); /// Unlock the file. /// -/// @param File The descriptor representing the file to unlock. +/// @param FD The descriptor representing the file to unlock. 
/// @returns errc::success if lock is successfully released or platform-specific /// error_code otherwise. std::error_code unlockFile(int FD); From 984cf99055a292b3afe4535c013d38914a3da880 Mon Sep 17 00:00:00 2001 From: Bruno Ricci Date: Tue, 28 Jul 2020 15:39:17 +0100 Subject: [PATCH 0304/1035] [clang][NFC] Add some documentation about the use of NamedDecl::getDeclName in diagnostics. As explained in eb10b065f2a870b425dcc2040b9955e0eee464b4, sending a NamedDecl* in a diagnostic should generally be preferred over sending the DeclarationName from getDeclName(). Let's document that. --- clang/include/clang/AST/Decl.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index 28faa2c1fc780..4dd5e14d36e18 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -269,6 +269,19 @@ class NamedDecl : public Decl { /// Get the actual, stored name of the declaration, which may be a special /// name. + /// + /// Note that generally in diagnostics, the non-null \p NamedDecl* itself + /// should be sent into the diagnostic instead of using the result of + /// \p getDeclName(). + /// + /// A \p DeclarationName in a diagnostic will just be streamed to the output, + /// which will directly result in a call to \p DeclarationName::print. + /// + /// A \p NamedDecl* in a diagnostic will also ultimately result in a call to + /// \p DeclarationName::print, but with two customisation points along the + /// way (\p getNameForDiagnostic and \p printName). These are used to print + /// the template arguments if any, and to provide a user-friendly name for + /// some entities (such as unnamed variables and anonymous records). DeclarationName getDeclName() const { return Name; } /// Set the name of this declaration. From 54492a5843a34684ce21ae201dd8ca3e509288fd Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Mon, 27 Jul 2020 12:57:41 +0100 Subject: [PATCH 0305/1035] [AArch64][SVE] Don't support fixedStack for SVE objects. Fixed stack objects are preallocated and defined to be allocated before any of the regular stack objects. These are normally used to model stack arguments. The AAPCS does not support passing SVE registers on the stack by value (only by reference). The current layout also doesn't place them before all stack objects, but rather before all SVE objects. Removing this simplifies the code that emits the allocation/deallocation around callee-saved registers (D84042). This patch also removes all uses of fixedStack from from framelayout-sve.mir, where this was used purely for testing purposes. Reviewers: paulwalker-arm, efriedma, rengolin Reviewed By: paulwalker-arm Differential Revision: https://reviews.llvm.org/D84538 --- .../Target/AArch64/AArch64FrameLowering.cpp | 13 +- llvm/test/CodeGen/AArch64/framelayout-sve.mir | 125 +++++++++--------- 2 files changed, 66 insertions(+), 72 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index efa3fd5ca9cef..cc563dd706326 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2595,20 +2595,21 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex, bool AssignOffsets) { +#ifndef NDEBUG // First process all fixed stack objects. 
- int64_t Offset = 0; for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) - if (MFI.getStackID(I) == TargetStackID::SVEVector) { - int64_t FixedOffset = -MFI.getObjectOffset(I); - if (FixedOffset > Offset) - Offset = FixedOffset; - } + assert(MFI.getStackID(I) != TargetStackID::SVEVector && + "SVE vectors should never be passed on the stack by value, only by " + "reference."); +#endif auto Assign = [&MFI](int FI, int64_t Offset) { LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n"); MFI.setObjectOffset(FI, Offset); }; + int64_t Offset = 0; + // Then process all callee saved slots. if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) { // Make sure to align the last callee save slot. diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir index 7903df64863bf..575c839fbd151 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir @@ -41,10 +41,10 @@ # +----------+ # |scratchreg| // x29 is used as scratch reg. # +----------+ -# | %fixed- | // scalable SVE object of n * 18 bytes, aligned to 16 bytes, -# | stack.0 | // to be materialized with 2*ADDVL (<=> 2 * n * 16bytes) +# | %stack.0 | // scalable SVE object of n * 18 bytes, aligned to 16 bytes, +# | | // to be materialized with 2*ADDVL (<=> 2 * n * 16bytes) # +----------+ -# | %stack.0 | // not scalable +# | %stack.1 | // not scalable # +----------+ <- SP # CHECK-LABEL: name: test_allocate_sve @@ -60,10 +60,9 @@ # CHECK-NEXT: $sp, $[[SCRATCH]] = frame-destroy LDRXpost $sp, 16 # CHECK-NEXT: RET_ReallyLR name: test_allocate_sve -fixedStack: - - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 8 } + - { id: 0, stack-id: sve-vec, size: 18, alignment: 2 } + - { id: 1, stack-id: default, size: 16, alignment: 8 } body: | bb.0.entry: RET_ReallyLR @@ -73,10 +72,9 @@ body: | # | x20, x21 | // callee saves # |scratchreg| // x29 is used as scratch reg. 
# +----------+ -# | %fixed- | // scalable objects -# | stack.0 | +# | %stack.0 | // scalable objects # +----------+ -# | %stack.0 | // not scalable +# | %stack.1 | // not scalable # +----------+ <- SP # CHECK-LABEL: name: test_allocate_sve_gpr_callee_saves @@ -95,10 +93,9 @@ body: | # CHECK-NEXT: $sp, $[[SCRATCH]] = frame-destroy LDRXpost $sp, 32 # CHECK-NEXT: RET_ReallyLR name: test_allocate_sve_gpr_callee_saves -fixedStack: - - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 8 } + - { id: 0, stack-id: sve-vec, size: 18, alignment: 2 } + - { id: 1, stack-id: default, size: 16, alignment: 8 } body: | bb.0.entry: $x20 = IMPLICIT_DEF @@ -109,11 +106,10 @@ body: | # +----------+ # | lr, fp | // frame record # +----------+ <- FP -# | %fixed- | // scalable objects -# | stack.0 | +# | %stack.0 | // scalable objects # +----------+ # |//////////| // alignment gap -# | %stack.0 | // not scalable +# | %stack.1 | // not scalable # +----------+ <- SP # CHECK-LABEL: name: test_allocate_sve_gpr_realigned # CHECK: stackSize: 32 @@ -128,10 +124,9 @@ body: | # CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 # CHECK-NEXT: RET_ReallyLR name: test_allocate_sve_gpr_realigned -fixedStack: - - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 32 } + - { id: 0, stack-id: sve-vec, size: 18, alignment: 2 } + - { id: 1, stack-id: default, size: 16, alignment: 32 } body: | bb.0.entry: RET_ReallyLR @@ -144,7 +139,7 @@ body: | # | %stack.1 | // scalable @ SP + 16b + 16 scalable bytes # | %stack.2 | // scalable @ SP + 16b + 14 scalable bytes # +----------+ -# | %stack.0 | // not scalable +# | %stack.3 | // not scalable # +----------+ <- SP # CHECK-LABEL: name: test_address_sve @@ -169,19 +164,18 @@ body: | name: test_address_sve frameInfo: maxAlignment: 16 -fixedStack: - - { id: 0, stack-id: sve-vec, size: 16, alignment: 8, offset: -16 } - - { id: 1, stack-id: sve-vec, size: 16, alignment: 8, offset: -32 } - - { id: 2, stack-id: sve-vec, size: 2, alignment: 2, offset: -34 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 8 } + - { id: 0, stack-id: sve-vec, size: 16, alignment: 8 } + - { id: 1, stack-id: sve-vec, size: 16, alignment: 8 } + - { id: 2, stack-id: sve-vec, size: 2, alignment: 2 } + - { id: 3, stack-id: default, size: 16, alignment: 8 } body: | bb.0.entry: liveins: $z0, $z1, $p0 - STR_ZXI $z0, %fixed-stack.0, 0 - STR_ZXI $z1, %fixed-stack.1, 0 - STR_PXI $p0, %fixed-stack.2, 0 + STR_ZXI $z0, %stack.0, 0 + STR_ZXI $z1, %stack.1, 0 + STR_PXI $p0, %stack.2, 0 RET_ReallyLR --- @@ -190,11 +184,11 @@ body: | # | x20, x21 | // callee saves # | lr, fp | // frame record # +-----------+ <- FP -# | %fstack.0 | // scalable @ FP - 16 scalable bytes -# | %fstack.1 | // scalable @ FP - 32 scalable bytes -# | %fstack.2 | // scalable @ FP - 34 scalable bytes +# | %stack.0 | // scalable @ FP - 16 scalable bytes +# | %stack.1 | // scalable @ FP - 32 scalable bytes +# | %stack.2 | // scalable @ FP - 34 scalable bytes # +-----------+ -# | %stack.0 | // not scalable +# | %stack.3 | // not scalable # +-----------+ <- SP # CHECK-LABEL: name: test_address_sve_fp @@ -218,19 +212,18 @@ name: test_address_sve_fp frameInfo: maxAlignment: 16 isFrameAddressTaken: true -fixedStack: - - { id: 0, stack-id: sve-vec, size: 16, alignment: 8, offset: -16 } - - { id: 1, stack-id: sve-vec, size: 16, alignment: 8, offset: -32 } - - { id: 2, stack-id: sve-vec, size: 2, 
alignment: 2, offset: -34 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 8 } + - { id: 0, stack-id: sve-vec, size: 16, alignment: 8 } + - { id: 1, stack-id: sve-vec, size: 16, alignment: 8 } + - { id: 2, stack-id: sve-vec, size: 2, alignment: 2 } + - { id: 3, stack-id: default, size: 16, alignment: 8 } body: | bb.0.entry: liveins: $z0, $z1, $p0 - STR_ZXI $z0, %fixed-stack.0, 0 - STR_ZXI $z1, %fixed-stack.1, 0 - STR_PXI $p0, %fixed-stack.2, 0 + STR_ZXI $z0, %stack.0, 0 + STR_ZXI $z1, %stack.1, 0 + STR_PXI $p0, %stack.2, 0 RET_ReallyLR --- @@ -240,9 +233,9 @@ body: | # +-----------+ # |callee save| // register saved as scratch reg. # +-----------+ -# | %fstack.1 | // vector of 16 scalable bytes +# | %stack.0 | // vector of 16 scalable bytes # +---------- + -# | %stack.0 | // not scalable, 16 bytes +# | %stack.1 | // not scalable, 16 bytes # +-----------+ <- SP # CHECK-LABEL: name: test_stack_arg_sve # CHECK: stackSize: 32 @@ -262,9 +255,9 @@ body: | name: test_stack_arg_sve fixedStack: - { id: 0, stack-id: default, size: 16, alignment: 16, offset: 0 } - - { id: 1, stack-id: sve-vec, size: 16, alignment: 16, offset: -16 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 16 } + - { id: 0, stack-id: sve-vec, size: 16, alignment: 16 } + - { id: 1, stack-id: default, size: 16, alignment: 16 } body: | bb.0.entry: liveins: $x0 @@ -320,17 +313,17 @@ body: | name: test_address_sve_out_of_range frameInfo: maxAlignment: 16 -fixedStack: - - { id: 0, stack-id: sve-vec, size: 16, alignment: 16, offset: -16 } - - { id: 1, stack-id: sve-vec, size: 3584, alignment: 16, offset: -3600 } - - { id: 2, stack-id: sve-vec, size: 512, alignment: 16, offset: -4112 } +stack: + - { id: 0, stack-id: sve-vec, size: 16, alignment: 16 } + - { id: 1, stack-id: sve-vec, size: 3584, alignment: 16 } + - { id: 2, stack-id: sve-vec, size: 512, alignment: 16 } body: | bb.0.entry: liveins: $z0, $p0 - STR_ZXI $z0, %fixed-stack.0, 0 - STR_PXI $p0, %fixed-stack.1, 0 + STR_ZXI $z0, %stack.0, 0 + STR_PXI $p0, %stack.1, 0 RET_ReallyLR --- @@ -340,11 +333,11 @@ body: | # access from the FP when there are also SVE objects on the stack. 
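# (Sketch added for clarity, not from the original test: with the
# variable-sized %stack.2, the distance from SP to %stack.1 is unknown at
# compile time, and the distance from FP involves the scalable size of
# %stack.0, so neither anchor gives a fixed byte offset. The expected
# lowering is to materialise a base pointer, x19 on AArch64, just above the
# variable-length area and to address %stack.1 relative to it, shown as @BP
# in the diagram below.)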
# # +----------+ <- FP -# | %fstack.0| // 16 scalable bytes +# | %stack.0 | // 16 scalable bytes # +----------+ <- @FP - 16 scalable bytes -# | %stack.0 | // 16 bytes +# | %stack.1 | // 16 bytes # +----------+ <- @BP -# : %stack.1 : // variable length +# : %stack.2 : // variable length # +----------+ <- SP # CHECK-LABEL: name: test_address_gpr_vla @@ -354,16 +347,15 @@ body: | name: test_address_gpr_vla frameInfo: maxAlignment: 16 -fixedStack: - - { id: 0, stack-id: sve-vec, size: 16, alignment: 8, offset: -16 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 8 } - - { id: 1, stack-id: default, type: variable-sized } + - { id: 0, stack-id: sve-vec, size: 16, alignment: 8 } + - { id: 1, stack-id: default, size: 16, alignment: 8 } + - { id: 2, stack-id: default, type: variable-sized } body: | bb.0.entry: liveins: $xzr - STRXui $xzr, %stack.0, 0 + STRXui $xzr, %stack.1, 0 RET_ReallyLR --- @@ -429,7 +421,7 @@ body: | # CHECK-LABEL: name: save_restore_sve # CHECK: $sp = frame-setup STPXpre killed ${{[a-z0-9]+}}, killed $x21, $sp, -4 # CHECK: frame-setup STPXi killed $x20, killed $x19, $sp, 2 -# CHECK: $sp = frame-setup ADDVL_XXI $sp, -19 +# CHECK: $sp = frame-setup ADDVL_XXI $sp, -18 # CHECK: frame-setup STR_PXI killed $p15, $sp, 4 # CHECK: frame-setup STR_PXI killed $p14, $sp, 5 # CHECK: frame-setup STR_PXI killed $p5, $sp, 14 @@ -438,9 +430,11 @@ body: | # CHECK: frame-setup STR_ZXI killed $z22, $sp, 3 # CHECK: frame-setup STR_ZXI killed $z9, $sp, 16 # CHECK: frame-setup STR_ZXI killed $z8, $sp, 17 +# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK: $sp = frame-setup SUBXri $sp, 32, 0 # CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 +# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 1 # CHECK: $p15 = frame-destroy LDR_PXI $sp, 4 # CHECK: $p14 = frame-destroy LDR_PXI $sp, 5 # CHECK: $p5 = frame-destroy LDR_PXI $sp, 14 @@ -449,15 +443,14 @@ body: | # CHECK: $z22 = frame-destroy LDR_ZXI $sp, 3 # CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16 # CHECK: $z8 = frame-destroy LDR_ZXI $sp, 17 -# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 19 +# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 18 # CHECK: $x20, $x19 = frame-destroy LDPXi $sp, 2 # CHECK: $sp, ${{[a-z0-9]+}}, $x21 = frame-destroy LDPXpost $sp, 4 # CHECK: RET_ReallyLR name: save_restore_sve -fixedStack: - - { id: 0, stack-id: sve-vec, size: 16, alignment: 16, offset: -16 } stack: - - { id: 0, stack-id: default, size: 32, alignment: 16 } + - { id: 0, stack-id: sve-vec, size: 16, alignment: 16 } + - { id: 1, stack-id: default, size: 32, alignment: 16 } body: | bb.0.entry: @@ -494,7 +487,7 @@ body: | # CHECK-LABEL: name: save_restore_sve_realign # CHECK: $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 # CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -19 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -18 # CHECK-NEXT: STR_PXI killed $p15, $sp, 4 # CHECK-NEXT: STR_PXI killed $p14, $sp, 5 # CHECK: STR_PXI killed $p5, $sp, 14 @@ -503,6 +496,7 @@ body: | # CHECK-NEXT: STR_ZXI killed $z22, $sp, 3 # CHECK: STR_ZXI killed $z9, $sp, 16 # CHECK-NEXT: STR_ZXI killed $z8, $sp, 17 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 # CHECK-NEXT: $sp = ANDXri killed $[[TMP]] @@ -519,10 +513,9 @@ body: | # CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 # CHECK-NEXT: RET_ReallyLR name: save_restore_sve_realign -fixedStack: - - { id: 0, stack-id: sve-vec, size: 16, alignment: 16, offset: -16 } stack: - - { id: 0, stack-id: default, 
size: 16, alignment: 32 }
+  - { id: 0, stack-id: sve-vec, size: 16, alignment: 16 }
+  - { id: 1, stack-id: default, size: 16, alignment: 32 }
body: |
  bb.0.entry:

From 26b4ef3694973ea2fa656d3d3a7f67f16f135654 Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Mon, 27 Jul 2020 14:16:55 +0100
Subject: [PATCH 0306/1035] [AArch64][SVE] Don't align the last SVE callee
 save.

Instead of aligning the last callee-saved-register slot to the stack
alignment (16 bytes), just align the SVE callee-saved block. This also
simplifies the code that allocates space for the callee-saves.

This change is needed to make sure the offset to which the callee-saved
register is spilled corresponds to the offset used for, e.g., unwind call
frame instructions.

Reviewers: efriedma, paulwalker-arm, david-arm, rengolin

Reviewed By: efriedma

Differential Revision: https://reviews.llvm.org/D84042
---
 .../Target/AArch64/AArch64FrameLowering.cpp   | 25 +++++++------------
 llvm/test/CodeGen/AArch64/framelayout-sve.mir |  2 +-
 llvm/test/CodeGen/AArch64/sve-trunc.ll        |  2 +-
 3 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index cc563dd706326..1b49c692f2931 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1192,7 +1192,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,

   // Process the SVE callee-saves to determine what space needs to be
   // allocated.
-  if (AFI->getSVECalleeSavedStackSize()) {
+  if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
     // Find callee save instructions in frame.
     CalleeSavesBegin = MBBI;
     assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
@@ -1200,11 +1200,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
       ++MBBI;
     CalleeSavesEnd = MBBI;

-    int64_t OffsetToFirstCalleeSaveFromSP =
-        MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex());
-    StackOffset OffsetToCalleeSavesFromSP =
-        StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize;
-    AllocateBefore -= OffsetToCalleeSavesFromSP;
+    AllocateBefore = {CalleeSavedSize, MVT::nxv1i8};
     AllocateAfter = SVEStackSize - AllocateBefore;
   }

@@ -1582,7 +1578,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   // deallocated.
   StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
   MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
-  if (AFI->getSVECalleeSavedStackSize()) {
+  if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
    RestoreBegin = std::prev(RestoreEnd);
    while (IsSVECalleeSave(RestoreBegin) && RestoreBegin != MBB.begin())
      --RestoreBegin;
    ++RestoreBegin;

    assert(IsSVECalleeSave(RestoreBegin) &&
           IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");

-    int64_t OffsetToFirstCalleeSaveFromSP =
-        MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex());
-    StackOffset OffsetToCalleeSavesFromSP =
-        StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize;
-    DeallocateBefore = OffsetToCalleeSavesFromSP;
-    DeallocateAfter = SVEStackSize - DeallocateBefore;
+    StackOffset CalleeSavedSizeAsOffset = {CalleeSavedSize, MVT::nxv1i8};
+    DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
+    DeallocateAfter = CalleeSavedSizeAsOffset;
   }

   // Deallocate the SVE area.
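To make the new bookkeeping concrete, a hedged worked example (the sizes are
invented, not taken from the patch): suppose the SVE area is 48 scalable
bytes, of which the callee saves occupy CalleeSavedSize = 16. The prologue
then splits the allocation as

    AllocateBefore = {16, MVT::nxv1i8};             // just the callee-save block
    AllocateAfter  = SVEStackSize - AllocateBefore; // {32, MVT::nxv1i8} for spills/locals

and the epilogue mirrors it with DeallocateBefore = SVEStackSize - {16,
MVT::nxv1i8} and DeallocateAfter = {16, MVT::nxv1i8}, so both directions now
measure the callee-save block directly instead of deriving it from a frame
index offset.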
@@ -2612,9 +2605,6 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,

   // Then process all callee saved slots.
   if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
-    // Make sure to align the last callee save slot.
-    MFI.setObjectAlignment(MaxCSFrameIndex, Align(16));
-
     // Assign offsets to the callee save slots.
     for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
       Offset += MFI.getObjectSize(I);
@@ -2624,6 +2614,9 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
     }
   }

+  // Ensure that the Callee-save area is aligned to 16 bytes.
+  Offset = alignTo(Offset, Align(16U));
+
   // Create a buffer of SVE objects to allocate and sort it.
   SmallVector<int, 8> ObjectsToAllocate;
   for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
index 575c839fbd151..75a65a6ad5226 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
@@ -573,7 +573,7 @@ body: |
# CHECK-NEXT: stack-id: sve-vec, callee-saved-register: '$z23',
# CHECK: - { id: 8, name: '', type: spill-slot, offset: -34, size: 2, alignment: 2,
# CHECK-NEXT: stack-id: sve-vec, callee-saved-register: '$p4',
-# CHECK: - { id: 9, name: '', type: spill-slot, offset: -48, size: 2, alignment: 16,
+# CHECK: - { id: 9, name: '', type: spill-slot, offset: -36, size: 2, alignment: 2,
# CHECK-NEXT: stack-id: sve-vec, callee-saved-register: '$p15',
# CHECK: - { id: 10, name: '', type: spill-slot, offset: -16, size: 8, alignment: 16,
# CHECK-NEXT: stack-id: default, callee-saved-register: '$fp',
diff --git a/llvm/test/CodeGen/AArch64/sve-trunc.ll b/llvm/test/CodeGen/AArch64/sve-trunc.ll
index 46d152bbf7acb..af50176f6b101 100644
--- a/llvm/test/CodeGen/AArch64/sve-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-trunc.ll
@@ -117,7 +117,7 @@ define <vscale x 16 x i1> @trunc_i64toi1_split3(<vscale x 16 x i64> %in) {
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset p4, -16
+; CHECK-NEXT: .cfi_offset p4, -2
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: and z7.d, z7.d, #0x1

From cda2eb3ad2bbe923e74d6eb083af196a0622d800 Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Tue, 28 Jul 2020 12:11:09 +0100
Subject: [PATCH 0307/1035] [AArch64][SVE] Fix epilogue for SVE when the stack
 is realigned.

While deallocating the stack frame, the offset used to reload the
callee-saved registers was not pointing to the SVE callee-saves, but
rather to the whole SVE area.

   +--------------+
   | GPR callee   |
   |   saves      |
   +--------------+ <- FP
   | SVE callee   |
   |   saves      |
   +--------------+ <- Should restore SVE callee saves from here
   |  SVE Spills  |
   |  and Locals  |
   +--------------+ <- instead of from here.
   |              |
   :              :
   |              |
   +--------------+ <- SP

Reviewed By: paulwalker-arm

Differential Revision: https://reviews.llvm.org/D84539
---
 llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 9 +++++----
 llvm/test/CodeGen/AArch64/framelayout-sve.mir    | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 1b49c692f2931..4789a9f02937a 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1596,12 +1596,13 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,

   // Deallocate the SVE area.
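  // Aside with invented sizes, not part of the patch: if the callee saves
  // occupy 16 scalable bytes of a 48-scalable-byte SVE area, the
  // realigned-stack path below must restore from SP = FP - 16 scalable
  // bytes (the callee-save block) rather than FP - 48 (the whole SVE area),
  // otherwise the reloads would read from the spills-and-locals region
  // sketched in the commit message above.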
   if (SVEStackSize) {
     if (AFI->isStackRealigned()) {
-      if (AFI->getSVECalleeSavedStackSize())
-        // Set SP to start of SVE area, from which the callee-save reloads
-        // can be done. The code below will deallocate the stack space
+      if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize())
+        // Set SP to start of SVE callee-save area from which they can
+        // be reloaded. The code below will deallocate the stack space
         // by moving FP -> SP.
         emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
-                        -SVEStackSize, TII, MachineInstr::FrameDestroy);
+                        {-CalleeSavedSize, MVT::nxv1i8}, TII,
+                        MachineInstr::FrameDestroy);
     } else {
       if (AFI->getSVECalleeSavedStackSize()) {
         // Deallocate the non-SVE locals first before we can deallocate (and
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
index 75a65a6ad5226..668b243dd79e0 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
@@ -500,7 +500,7 @@ body: |
# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0
# CHECK-NEXT: $sp = ANDXri killed $[[TMP]]

-# CHECK: $sp = frame-destroy ADDVL_XXI $fp, -19
+# CHECK: $sp = frame-destroy ADDVL_XXI $fp, -18
# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4
# CHECK-NEXT: $p14 = frame-destroy LDR_PXI $sp, 5
# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14

From 6784d82d5b859cc14a63358d34ed50e6e2f9bf43 Mon Sep 17 00:00:00 2001
From: Xing GUO
Date: Tue, 28 Jul 2020 22:20:10 +0800
Subject: [PATCH 0308/1035] [DWARFYAML] Rename checkListEntryOperands() to
 checkOperandCount(). NFC.

This patch renames checkListEntryOperands() to checkOperandCount(), so
that we are able to check DWARF expression operands using the same
function.

Reviewed By: jhenderson, labath

Differential Revision: https://reviews.llvm.org/D84624
---
 llvm/lib/ObjectYAML/DWARFEmitter.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/ObjectYAML/DWARFEmitter.cpp b/llvm/lib/ObjectYAML/DWARFEmitter.cpp
index b39bb003db9a5..66fabe875f7b0 100644
--- a/llvm/lib/ObjectYAML/DWARFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/DWARFEmitter.cpp
@@ -543,9 +543,9 @@ Error DWARFYAML::emitDebugStrOffsets(raw_ostream &OS, const Data &DI) {
   return Error::success();
 }

-static Error checkListEntryOperands(StringRef EncodingString,
-                                    ArrayRef<yaml::Hex64> Values,
-                                    uint64_t ExpectedOperands) {
+static Error checkOperandCount(StringRef EncodingString,
+                               ArrayRef<yaml::Hex64> Values,
+                               uint64_t ExpectedOperands) {
   if (Values.size() != ExpectedOperands)
     return createStringError(
         errc::invalid_argument,
@@ -578,7 +578,7 @@ static Expected writeListEntry(raw_ostream &OS,
   StringRef EncodingName = dwarf::RangeListEncodingString(Entry.Operator);

   auto CheckOperands = [&](uint64_t ExpectedOperands) -> Error {
-    return checkListEntryOperands(EncodingName, Entry.Values, ExpectedOperands);
+    return checkOperandCount(EncodingName, Entry.Values, ExpectedOperands);
   };

   auto WriteAddress = [&](uint64_t Addr) -> Error {

From ca0bf440dbf9977340db4a32ba61740930c2be03 Mon Sep 17 00:00:00 2001
From: Camille Coti
Date: Mon, 27 Jul 2020 16:58:39 -0600
Subject: [PATCH 0309/1035] Order of libraries and source files in the f18
 frontend

When the f18 frontend calls the link editor, put the libraries and object
files in the correct order.
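As a hedged illustration of why the order matters (the command line and the
names main.o and -lm are invented for the example): a typical ELF link
editor scans its inputs once, left to right, and pulls a member out of a
static archive only if it satisfies a symbol that is already undefined, so

    f18 main.o -lm   # main.o leaves 'sin' undefined; the later -lm resolves it
    f18 -lm main.o   # libm is scanned before 'sin' is requested; the link may fail

which is why the driver now emits the compiler's own arguments first, then
the object files, then -o, and only then the libraries.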
Fixes the issues reported here:
https://github.com/flang-compiler/flang/issues/897

Reviewed By: sscalpone, AlexisPerry

Differential Revision: https://reviews.llvm.org/D84340
---
 flang/tools/f18/f18.cpp | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/flang/tools/f18/f18.cpp b/flang/tools/f18/f18.cpp
index 2b2eacc2e6f11..dc5ddd12295f6 100644
--- a/flang/tools/f18/f18.cpp
+++ b/flang/tools/f18/f18.cpp
@@ -369,20 +369,24 @@ std::string CompileOtherLanguage(std::string path, DriverOptions &driver) {
   return {};
 }

-void Link(std::vector<std::string> &relocatables, DriverOptions &driver) {
+void Link(std::vector<std::string> &liblist, std::vector<std::string> &objects,
+    DriverOptions &driver) {
   if (!ParentProcess()) {
     std::vector<char *> argv;
     for (size_t j{0}; j < driver.F18_FCArgs.size(); ++j) {
       argv.push_back(driver.F18_FCArgs[j].data());
     }
-    for (auto &relo : relocatables) {
-      argv.push_back(relo.data());
+    for (auto &obj : objects) {
+      argv.push_back(obj.data());
     }
     if (!driver.outputPath.empty()) {
       char dashO[3] = "-o";
       argv.push_back(dashO);
       argv.push_back(driver.outputPath.data());
     }
+    for (auto &lib : liblist) {
+      argv.push_back(lib.data());
+    }
     Exec(argv, driver.verbose);
   }
 }
@@ -397,6 +401,7 @@ int main(int argc, char *const argv[]) {
   bool isPGF90{driver.F18_FCArgs.back().rfind("pgf90") != std::string::npos};

   std::list<std::string> args{argList(argc, argv)};
+  std::vector<std::string> objlist, liblist;
   std::string prefix{args.front()};
   args.pop_front();
   prefix += ": ";
@@ -413,32 +418,37 @@ int main(int argc, char *const argv[]) {

   Fortran::common::IntrinsicTypeDefaultKinds defaultKinds;

-  std::vector<std::string> fortranSources, otherSources, relocatables;
+  std::vector<std::string> fortranSources, otherSources;
   bool anyFiles{false};
   while (!args.empty()) {
     std::string arg{std::move(args.front())};
+    auto dot{arg.rfind(".")};
+    std::string suffix{arg.substr(dot + 1)};
+    std::string prefix{arg.substr(0, 2)};
     args.pop_front();
     if (arg.empty()) {
     } else if (arg.at(0) != '-') {
       anyFiles = true;
-      auto dot{arg.rfind(".")};
       if (dot == std::string::npos) {
         driver.F18_FCArgs.push_back(arg);
       } else {
-        std::string suffix{arg.substr(dot + 1)};
         if (suffix == "f" || suffix == "F" || suffix == "ff" ||
             suffix == "f90" || suffix == "F90" || suffix == "ff90" ||
             suffix == "f95" || suffix == "F95" || suffix == "ff95" ||
             suffix == "cuf" || suffix == "CUF" || suffix == "f18" ||
             suffix == "F18" || suffix == "ff18") {
           fortranSources.push_back(arg);
-        } else if (suffix == "o" || suffix == "a") {
-          relocatables.push_back(arg);
+        } else if (suffix == "o" || suffix == "so") {
+          objlist.push_back(arg);
+        } else if (suffix == "a") {
+          liblist.push_back(arg);
         } else {
           otherSources.push_back(arg);
         }
       }
+    } else if (prefix == "-l" || suffix == "a") {
+      liblist.push_back(arg);
     } else if (arg == "-") {
       fortranSources.push_back("-");
     } else if (arg == "--") {
@@ -682,17 +692,17 @@ int main(int argc, char *const argv[]) {
   for (const auto &path : fortranSources) {
     std::string relo{CompileFortran(path, options, driver, defaultKinds)};
     if (!driver.compileOnly && !relo.empty()) {
-      relocatables.push_back(relo);
+      objlist.push_back(relo);
     }
   }
   for (const auto &path : otherSources) {
     std::string relo{CompileOtherLanguage(path, driver)};
     if (!driver.compileOnly && !relo.empty()) {
-      relocatables.push_back(relo);
+      objlist.push_back(relo);
     }
   }
-  if (!relocatables.empty()) {
-    Link(relocatables, driver);
+  if (!objlist.empty()) {
+    Link(liblist, objlist, driver);
   }
   return exitStatus;
 }

From be2ea29ee16bc132626cba07559e9f023ad6ac13 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Sun, 12 Jul 2020 23:05:43 +0100 Subject: [PATCH 0310/1035] [SCEV] Add additional tests. Increase test coverage for upcoming changes to how SCEV deals with LCSSA phis. --- .../Analysis/ScalarEvolution/trivial-phis.ll | 191 ++++++ .../scev-expander-preserve-lcssa.ll | 587 ++++++++++++++++++ .../Transforms/LoopStrengthReduce/funclet.ll | 212 +++++-- .../LoopStrengthReduce/scev-expander-lcssa.ll | 144 +++++ 4 files changed, 1098 insertions(+), 36 deletions(-) create mode 100644 llvm/test/Analysis/ScalarEvolution/trivial-phis.ll create mode 100644 llvm/test/Transforms/IndVarSimplify/scev-expander-preserve-lcssa.ll create mode 100644 llvm/test/Transforms/LoopStrengthReduce/scev-expander-lcssa.ll diff --git a/llvm/test/Analysis/ScalarEvolution/trivial-phis.ll b/llvm/test/Analysis/ScalarEvolution/trivial-phis.ll new file mode 100644 index 0000000000000..3a897e957d1b2 --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/trivial-phis.ll @@ -0,0 +1,191 @@ +; RUN: opt < %s -analyze -scalar-evolution | FileCheck %s + +; CHECK-LABEL @test1 +; CHECK %add.lcssa.wide = phi i64 [ %indvars.iv.next, %do.body ] +; CHECK-NEXT --> %add.lcssa.wide U: [1,2147483648) S: [1,2147483648) + +define i64 @test1(i32 signext %n, float* %A) { +entry: + %0 = sext i32 %n to i64 + br label %do.body + +do.body: ; preds = %do.body, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %do.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float 1.000000e+00, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %cmp = icmp slt i64 %indvars.iv.next, %0 + br i1 %cmp, label %do.body, label %do.end + +do.end: ; preds = %do.body + %add.lcssa.wide = phi i64 [ %indvars.iv.next, %do.body ] + ret i64 %add.lcssa.wide +} + +; CHECK-LABEL @test2 +; CHECK: %tmp24 = phi i64 [ %tmp14, %bb22 ], [ %tmp14, %bb13 ] +; CHECK-NEXT: --> %tmp24 U: full-set S: full-set Exits: <> LoopDispositions: { %bb13: Variant, %bb8: Variant, %bb17: Invariant, %bb27: Invariant } + +define void @test2(i64 %arg, i32* noalias %arg1) { +bb: + %tmp = icmp slt i64 0, %arg + br i1 %tmp, label %bb7, label %bb48 + +bb7: ; preds = %bb + br label %bb8 + +bb8: ; preds = %bb44, %bb7 + %tmp9 = phi i64 [ 0, %bb7 ], [ %tmp45, %bb44 ] + %tmp10 = add nsw i64 %arg, -1 + %tmp11 = icmp slt i64 1, %tmp10 + br i1 %tmp11, label %bb12, label %bb43 + +bb12: ; preds = %bb8 + br label %bb13 + +bb13: ; preds = %bb39, %bb12 + %tmp14 = phi i64 [ 1, %bb12 ], [ %tmp40, %bb39 ] + %tmp15 = icmp slt i64 0, %arg + br i1 %tmp15, label %bb16, label %bb23 + +bb16: ; preds = %bb13 + br label %bb17 + +bb17: ; preds = %bb19, %bb16 + %tmp18 = phi i64 [ 0, %bb16 ], [ %tmp20, %bb19 ] + br label %bb19 + +bb19: ; preds = %bb17 + %tmp20 = add nuw nsw i64 %tmp18, 1 + %tmp21 = icmp slt i64 %tmp20, %arg + br i1 %tmp21, label %bb17, label %bb22 + +bb22: ; preds = %bb19 + br label %bb23 + +bb23: ; preds = %bb22, %bb13 + %tmp24 = phi i64 [ %tmp14, %bb22 ], [ %tmp14, %bb13 ] + %tmp25 = icmp slt i64 0, %arg + br i1 %tmp25, label %bb26, label %bb37 + +bb26: ; preds = %bb23 + br label %bb27 + +bb27: ; preds = %bb33, %bb26 + %tmp28 = phi i64 [ 0, %bb26 ], [ %tmp34, %bb33 ] + %tmp29 = mul nsw i64 %tmp9, %arg + %tmp30 = getelementptr inbounds i32, i32* %arg1, i64 %tmp24 + %tmp31 = getelementptr inbounds i32, i32* %tmp30, i64 %tmp29 + %tmp32 = load i32, i32* %tmp31, align 4 + br label %bb33 + +bb33: ; preds = %bb27 + %tmp34 = add nuw nsw i64 %tmp28, 1 + %tmp35 = icmp slt i64 %tmp34, %arg + br i1 %tmp35, label %bb27, label %bb36 + +bb36: ; preds = 
%bb33 + br label %bb37 + +bb37: ; preds = %bb36, %bb23 + %tmp38 = phi i64 [ %tmp24, %bb36 ], [ %tmp24, %bb23 ] + br label %bb39 + +bb39: ; preds = %bb37 + %tmp40 = add nuw nsw i64 %tmp38, 1 + %tmp41 = icmp slt i64 %tmp40, %tmp10 + br i1 %tmp41, label %bb13, label %bb42 + +bb42: ; preds = %bb39 + br label %bb43 + +bb43: ; preds = %bb42, %bb8 + br label %bb44 + +bb44: ; preds = %bb43 + %tmp45 = add nuw nsw i64 %tmp9, 1 + %tmp46 = icmp slt i64 %tmp45, %arg + br i1 %tmp46, label %bb8, label %bb47 + +bb47: ; preds = %bb44 + br label %bb48 + +bb48: ; preds = %bb47, %bb + ret void +} + +; CHECK-LABEL @test3 + +; CHECK: %tmp14 = phi i64 [ %tmp40, %bb39 ], [ 1, %bb8 ] +; CHECK-NEXT: --> {1,+,1}<%bb13> U: [1,9223372036854775807) S: [1,9223372036854775807) +; CHECK-SAME: Exits: (-2 + %arg) LoopDispositions: { %bb13: Computable, %bb8: Variant, %bb17_a: Invariant, %bb27: Invariant } +; CHECK: %tmp18 = phi i64 [ %tmp20, %bb17 ], [ 0, %bb13 ] +; CHECK-NEXT: --> {0,+,1}<%bb17_a> U: [0,9223372036854775807) S: [0,9223372036854775807) +; CHECK-SAME: Exits: (-1 + %arg) LoopDispositions: { %bb17_a: Computable, %bb13: Variant, %bb8: Variant } + +; CHECK: %tmp24 = phi i64 [ %tmp14, %bb13 ], [ %tmp14, %bb17 ] +; CHECK-NEXT: --> {1,+,1}<%bb13> U: [1,9223372036854775807) S: [1,9223372036854775807) +; CHECK-SAME: Exits: (-2 + %arg) LoopDispositions: { %bb13: Computable, %bb8: Variant, %bb17_a: Invariant, %bb27: Invariant } +; CHECK: %tmp28 = phi i64 [ %tmp34, %bb27 ], [ 0, %bb23 ] +; CHECK-NEXT: --> {0,+,1}<%bb27> U: [0,9223372036854775807) S: [0,9223372036854775807) +; CHECK-SAME: Exits: (-1 + %arg) LoopDispositions: { %bb27: Computable, %bb13: Variant, %bb8: Variant } + +; CHECK: %tmp38 = phi i64 [ %tmp24, %bb23 ], [ %tmp24, %bb27 ] +; CHECK-NEXT: --> {1,+,1}<%bb13> U: [1,9223372036854775807) S: [1,9223372036854775807) +; CHECK-SAME: Exits: (-2 + %arg) LoopDispositions: { %bb13: Computable, %bb8: Variant, %bb17_a: Invariant, %bb27: Invariant } + +define void @test3(i64 %arg, i32* %arg1) { +bb: + %tmp = icmp slt i64 0, %arg + br i1 %tmp, label %bb8, label %bb48 + +bb8: ; preds = %bb, %bb44 + %tmp9 = phi i64 [ %tmp45, %bb44 ], [ 0, %bb ] + %tmp10 = add nsw i64 %arg, -1 + %tmp11 = icmp slt i64 1, %tmp10 + br i1 %tmp11, label %bb13, label %bb44 + +bb13: ; preds = %bb8, %bb39 + %tmp14 = phi i64 [ %tmp40, %bb39 ], [ 1, %bb8 ] + %tmp15 = icmp slt i64 0, %arg + br i1 %tmp15, label %bb17_a, label %bb23 + +bb17_a: + %tmp18 = phi i64 [ %tmp20, %bb17 ], [ 0, %bb13 ] + %tmp20 = add nuw nsw i64 %tmp18, 1 + + br label %bb17 + +bb17: ; preds = %bb13, %bb17 + %tmp21 = icmp slt i64 %tmp20, %arg + br i1 %tmp21, label %bb17_a, label %bb23 + +bb23: ; preds = %bb17, %bb13 + %tmp24 = phi i64 [ %tmp14, %bb13 ], [ %tmp14, %bb17 ] + %tmp25 = icmp slt i64 0, %arg + br i1 %tmp25, label %bb27, label %bb39 + +bb27: ; preds = %bb23, %bb27 + %tmp28 = phi i64 [ %tmp34, %bb27 ], [ 0, %bb23 ] + %tmp29 = mul nsw i64 %tmp9, %arg + %tmp30 = getelementptr inbounds i32, i32* %arg1, i64 %tmp24 + %tmp31 = getelementptr inbounds i32, i32* %tmp30, i64 %tmp29 + %tmp32 = load i32, i32* %tmp31, align 4 + %tmp34 = add nuw nsw i64 %tmp28, 1 + %tmp35 = icmp slt i64 %tmp34, %arg + br i1 %tmp35, label %bb27, label %bb39 + +bb39: ; preds = %bb23, %bb27 + %tmp38 = phi i64 [ %tmp24, %bb23 ], [ %tmp24, %bb27 ] + %tmp40 = add nuw nsw i64 %tmp38, 1 + %tmp41 = icmp slt i64 %tmp40, %tmp10 + br i1 %tmp41, label %bb13, label %bb44 + +bb44: ; preds = %bb8, %bb39 + %tmp45 = add nuw nsw i64 %tmp9, 1 + %tmp46 = icmp slt i64 %tmp45, %arg + br i1 %tmp46, label %bb8, label %bb48 + 
+bb48: ; preds = %bb44, %bb + ret void +} diff --git a/llvm/test/Transforms/IndVarSimplify/scev-expander-preserve-lcssa.ll b/llvm/test/Transforms/IndVarSimplify/scev-expander-preserve-lcssa.ll new file mode 100644 index 0000000000000..d26d3b23b7724 --- /dev/null +++ b/llvm/test/Transforms/IndVarSimplify/scev-expander-preserve-lcssa.ll @@ -0,0 +1,587 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -indvars -S -indvars -verify-loop-lcssa %s | FileCheck %s + +; Make sure SCEVExpander does not crash and introduce unnecessary LCSSA PHI nodes. +; The tests are a collection of cases with crashes when preserving LCSSA PHI +; nodes directly in SCEVExpander. + +declare i1 @cond() readnone + +define void @test1(i8 %x, [512 x i8]* %ptr) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LAND_LHS_TRUE:%.*]] +; CHECK: land.lhs.true: +; CHECK-NEXT: br label [[WHILE_COND22:%.*]] +; CHECK: while.cond22: +; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_1]], label [[WHILE_COND22]], label [[WHILE_COND29_PREHEADER:%.*]] +; CHECK: while.cond29.preheader: +; CHECK-NEXT: br label [[WHILE_BODY35:%.*]] +; CHECK: while.body35: +; CHECK-NEXT: [[I_1107:%.*]] = phi i32 [ [[I_9:%.*]], [[IF_END224:%.*]] ], [ 0, [[WHILE_COND29_PREHEADER]] ] +; CHECK-NEXT: br label [[WHILE_COND192:%.*]] +; CHECK: while.cond192: +; CHECK-NEXT: switch i8 [[X:%.*]], label [[WHILE_BODY205:%.*]] [ +; CHECK-NEXT: i8 59, label [[WHILE_COND215_PREHEADER:%.*]] +; CHECK-NEXT: i8 10, label [[IF_END224_LOOPEXIT1:%.*]] +; CHECK-NEXT: ] +; CHECK: while.cond215.preheader: +; CHECK-NEXT: br label [[WHILE_COND215:%.*]] +; CHECK: while.body205: +; CHECK-NEXT: br label [[WHILE_COND192]] +; CHECK: while.cond215: +; CHECK-NEXT: [[I_8_IN:%.*]] = phi i32 [ [[I_8:%.*]], [[WHILE_COND215]] ], [ [[I_1107]], [[WHILE_COND215_PREHEADER]] ] +; CHECK-NEXT: [[I_8]] = add nsw i32 [[I_8_IN]], 1 +; CHECK-NEXT: [[IDXPROM216:%.*]] = sext i32 [[I_8]] to i64 +; CHECK-NEXT: [[ARRAYIDX217:%.*]] = getelementptr inbounds [512 x i8], [512 x i8]* [[PTR:%.*]], i64 0, i64 [[IDXPROM216]] +; CHECK-NEXT: [[C_2:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_2]], label [[WHILE_COND215]], label [[IF_END224_LOOPEXIT:%.*]] +; CHECK: if.end224.loopexit: +; CHECK-NEXT: [[I_8_LCSSA:%.*]] = phi i32 [ [[I_8]], [[WHILE_COND215]] ] +; CHECK-NEXT: br label [[IF_END224]] +; CHECK: if.end224.loopexit1: +; CHECK-NEXT: br label [[IF_END224]] +; CHECK: if.end224: +; CHECK-NEXT: [[I_9]] = phi i32 [ [[I_8_LCSSA]], [[IF_END224_LOOPEXIT]] ], [ [[I_1107]], [[IF_END224_LOOPEXIT1]] ] +; CHECK-NEXT: [[C_3:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_3]], label [[WHILE_END225:%.*]], label [[WHILE_BODY35]] +; CHECK: while.end225: +; CHECK-NEXT: br label [[LAND_LHS_TRUE]] +; +entry: + br label %land.lhs.true + +land.lhs.true: ; preds = %while.end225, %entry + br label %while.cond22 + +while.cond22: ; preds = %while.cond22, %land.lhs.true + %c.1 = call i1 @cond() + br i1 %c.1, label %while.cond22, label %while.cond29.preheader + +while.cond29.preheader: ; preds = %while.cond22 + br label %while.body35 + +while.body35: ; preds = %if.end224, %while.cond29.preheader + %i.1107 = phi i32 [ %i.9, %if.end224 ], [ 0, %while.cond29.preheader ] + br label %while.cond192 + +while.cond192: ; preds = %while.body205, %while.body35 + %i.7 = phi i32 [ %i.1107, %while.body35 ], [ %inc206, %while.body205 ] + switch i8 %x, label %while.body205 [ + i8 59, label %while.cond215 + i8 10, label %if.end224 + ] + +while.body205: ; preds = %while.cond192 
+ %inc206 = add nsw i32 %i.7, 1 + br label %while.cond192 + +while.cond215: ; preds = %while.cond215, %while.cond192 + %i.8.in = phi i32 [ %i.8, %while.cond215 ], [ %i.7, %while.cond192 ] + %i.8 = add nsw i32 %i.8.in, 1 + %idxprom216 = sext i32 %i.8 to i64 + %arrayidx217 = getelementptr inbounds [512 x i8], [512 x i8]* %ptr, i64 0, i64 %idxprom216 + %c.2 = call i1 @cond() + br i1 %c.2, label %while.cond215, label %if.end224 + +if.end224: ; preds = %while.cond215, %while.cond192 + %i.9 = phi i32 [ %i.8, %while.cond215 ], [ %i.7, %while.cond192 ] + %c.3 = call i1 @cond() + br i1 %c.3, label %while.end225, label %while.body35 + +while.end225: ; preds = %if.end224 + br label %land.lhs.true +} + +define void @test2(i16 %x) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_1]], label [[FOR_COND_PREHEADER:%.*]], label [[RETURN:%.*]] +; CHECK: for.cond.preheader: +; CHECK-NEXT: br label [[FOR_COND:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: switch i16 [[X:%.*]], label [[RETURN_LOOPEXIT1:%.*]] [ +; CHECK-NEXT: i16 41, label [[FOR_END:%.*]] +; CHECK-NEXT: i16 43, label [[FOR_COND]] +; CHECK-NEXT: ] +; CHECK: for.end: +; CHECK-NEXT: [[I_0_LCSSA2:%.*]] = phi i32 [ 0, [[FOR_COND]] ] +; CHECK-NEXT: [[CMP8243:%.*]] = icmp sgt i32 [[I_0_LCSSA2]], 0 +; CHECK-NEXT: br i1 [[CMP8243]], label [[FOR_BODY84_PREHEADER:%.*]], label [[RETURN]] +; CHECK: for.body84.preheader: +; CHECK-NEXT: br label [[FOR_BODY84:%.*]] +; CHECK: for.body84: +; CHECK-NEXT: [[I_144:%.*]] = phi i32 [ [[INC:%.*]], [[IF_END106:%.*]] ], [ 0, [[FOR_BODY84_PREHEADER]] ] +; CHECK-NEXT: [[C_2:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_2]], label [[IF_END106]], label [[RETURN_LOOPEXIT:%.*]] +; CHECK: if.end106: +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_144]], 1 +; CHECK-NEXT: [[CMP82:%.*]] = icmp slt i32 [[INC]], [[I_0_LCSSA2]] +; CHECK-NEXT: br i1 [[CMP82]], label [[FOR_BODY84]], label [[RETURN_LOOPEXIT]] +; CHECK: return.loopexit: +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return.loopexit1: +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: ret void +; +entry: + %c.1 = call i1 @cond() + br i1 %c.1, label %for.cond, label %return + +for.cond: ; preds = %for.cond, %entry + %i.0 = phi i32 [ %sub, %for.cond ], [ 0, %entry ] + %sub = add nsw i32 %i.0, -1 + switch i16 %x, label %return [ + i16 41, label %for.end + i16 43, label %for.cond + ] + +for.end: ; preds = %for.cond + %cmp8243 = icmp sgt i32 %i.0, 0 + br i1 %cmp8243, label %for.body84, label %return + +for.body84: ; preds = %if.end106, %for.end + %i.144 = phi i32 [ %inc, %if.end106 ], [ 0, %for.end ] + %c.2 = call i1 @cond() + br i1 %c.2, label %if.end106, label %return + +if.end106: ; preds = %for.body84 + %inc = add nuw nsw i32 %i.144, 1 + %cmp82 = icmp slt i32 %inc, %i.0 + br i1 %cmp82, label %for.body84, label %return + +return: ; preds = %if.end106, %for.body84, %for.end, %for.cond, %entry + ret void +} + +declare i32 @get.i32() readnone + +define void @test3(i32* %ptr) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_BODY:%.*]] +; CHECK: while.body: +; CHECK-NEXT: br label [[FOR_BODY1208:%.*]] +; CHECK: for.body1208: +; CHECK-NEXT: [[M_0804:%.*]] = phi i32 [ 1, [[WHILE_BODY]] ], [ [[INC1499:%.*]], [[FOR_INC1498:%.*]] ] +; CHECK-NEXT: [[V:%.*]] = call i32 @get.i32() +; CHECK-NEXT: [[CMP1358:%.*]] = icmp eq i32 [[V]], 0 +; CHECK-NEXT: br i1 [[CMP1358]], label [[IF_THEN1360:%.*]], label [[FOR_INC1498]] +; CHECK: if.then1360: +; CHECK-NEXT: [[M_0804_LCSSA:%.*]] = phi 
i32 [ [[M_0804]], [[FOR_BODY1208]] ] +; CHECK-NEXT: br label [[FOR_COND1390:%.*]] +; CHECK: for.cond1390: +; CHECK-NEXT: [[M_2_IN:%.*]] = phi i32 [ [[M_0804_LCSSA]], [[IF_THEN1360]] ], [ 0, [[FOR_BODY1394:%.*]] ] +; CHECK-NEXT: [[C_2:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_2]], label [[FOR_BODY1394]], label [[FOR_END1469:%.*]] +; CHECK: for.body1394: +; CHECK-NEXT: br label [[FOR_COND1390]] +; CHECK: for.end1469: +; CHECK-NEXT: [[M_2_IN_LCSSA:%.*]] = phi i32 [ [[M_2_IN]], [[FOR_COND1390]] ] +; CHECK-NEXT: store i32 [[M_2_IN_LCSSA]], i32* [[PTR:%.*]], align 4 +; CHECK-NEXT: br label [[WHILE_BODY]] +; CHECK: for.inc1498: +; CHECK-NEXT: [[INC1499]] = add nuw nsw i32 [[M_0804]], 1 +; CHECK-NEXT: br label [[FOR_BODY1208]] +; +entry: + br label %while.body + +while.body: ; preds = %for.end1469, %entry + br label %for.body1208 + +for.body1208: ; preds = %for.inc1498, %while.body + %m.0804 = phi i32 [ 1, %while.body ], [ %inc1499, %for.inc1498 ] + %v = call i32 @get.i32() + %cmp1358 = icmp eq i32 %v, 0 + br i1 %cmp1358, label %if.then1360, label %for.inc1498 + +if.then1360: ; preds = %for.body1208 + br label %for.cond1390 + +for.cond1390: ; preds = %for.body1394, %if.then1360 + %m.2.in = phi i32 [ %m.0804, %if.then1360 ], [ 0, %for.body1394 ] + %c.2 = call i1 @cond() + br i1 %c.2, label %for.body1394, label %for.end1469 + +for.body1394: ; preds = %for.cond1390 + br label %for.cond1390 + +for.end1469: ; preds = %for.cond1390 + store i32 %m.2.in, i32* %ptr, align 4 + br label %while.body + +for.inc1498: ; preds = %for.body1208 + %inc1499 = add nuw nsw i32 %m.0804, 1 + br label %for.body1208 +} + +define void @test4(i32* %ptr) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_BODY:%.*]] +; CHECK: while.body: +; CHECK-NEXT: br label [[FOR_COND1204_PREHEADER:%.*]] +; CHECK: for.cond1204.preheader: +; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_1]], label [[IF_THEN1504:%.*]], label [[FOR_BODY1208_LR_PH:%.*]] +; CHECK: for.body1208.lr.ph: +; CHECK-NEXT: br label [[FOR_BODY1208:%.*]] +; CHECK: for.body1208: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 0, [[FOR_BODY1208_LR_PH]] ], [ [[TMP1:%.*]], [[FOR_INC1498:%.*]] ] +; CHECK-NEXT: [[M_0804:%.*]] = phi i32 [ 1, [[FOR_BODY1208_LR_PH]] ], [ [[INC1499:%.*]], [[FOR_INC1498]] ] +; CHECK-NEXT: [[IDXPROM1212:%.*]] = zext i32 [[M_0804]] to i64 +; CHECK-NEXT: [[V:%.*]] = call i32 @get.i32() +; CHECK-NEXT: [[CMP1215:%.*]] = icmp eq i32 0, [[V]] +; CHECK-NEXT: [[YPOS1223:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 [[IDXPROM1212]] +; CHECK-NEXT: br i1 [[CMP1215]], label [[IF_THEN1217:%.*]], label [[IF_ELSE1351:%.*]] +; CHECK: if.then1217: +; CHECK-NEXT: [[M_0804_LCSSA:%.*]] = phi i32 [ [[M_0804]], [[FOR_BODY1208]] ] +; CHECK-NEXT: br label [[FOR_COND1247:%.*]] +; CHECK: for.cond1247: +; CHECK-NEXT: [[M_1_IN:%.*]] = phi i32 [ [[M_0804_LCSSA]], [[IF_THEN1217]] ], [ [[M_1:%.*]], [[IF_THEN1260:%.*]] ] +; CHECK-NEXT: [[M_1]] = add nuw nsw i32 [[M_1_IN]], 1 +; CHECK-NEXT: br label [[FOR_BODY1251:%.*]] +; CHECK: for.body1251: +; CHECK-NEXT: [[IDXPROM1255:%.*]] = zext i32 [[M_1]] to i64 +; CHECK-NEXT: [[XPOS1257:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IDXPROM1255]] +; CHECK-NEXT: [[C_2:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_2]], label [[IF_THEN1260]], label [[FOR_END1326:%.*]] +; CHECK: if.then1260: +; CHECK-NEXT: br label [[FOR_COND1247]] +; CHECK: for.end1326: +; CHECK-NEXT: br label [[IF_END1824:%.*]] +; CHECK: if.else1351: +; CHECK-NEXT: [[V_2:%.*]] = call i32 @get.i32() +; 
CHECK-NEXT: [[CMP1358:%.*]] = icmp eq i32 [[V_2]], 0 +; CHECK-NEXT: br i1 [[CMP1358]], label [[IF_THEN1360:%.*]], label [[FOR_INC1498]] +; CHECK: if.then1360: +; CHECK-NEXT: [[DOTLCSSA2:%.*]] = phi i32 [ [[TMP0]], [[IF_ELSE1351]] ] +; CHECK-NEXT: [[M_0804_LCSSA1:%.*]] = phi i32 [ [[M_0804]], [[IF_ELSE1351]] ] +; CHECK-NEXT: [[CMP1392:%.*]] = icmp slt i32 [[M_0804_LCSSA1]], [[DOTLCSSA2]] +; CHECK-NEXT: unreachable +; CHECK: for.inc1498: +; CHECK-NEXT: [[INC1499]] = add nuw nsw i32 [[M_0804]], 1 +; CHECK-NEXT: [[TMP1]] = load i32, i32* [[PTR]], align 8 +; CHECK-NEXT: br label [[FOR_BODY1208]] +; CHECK: if.then1504: +; CHECK-NEXT: unreachable +; CHECK: if.end1824: +; CHECK-NEXT: br label [[WHILE_BODY]] +; +entry: + br label %while.body + +while.body: ; preds = %if.end1824, %entry + br label %for.cond1204.preheader + +for.cond1204.preheader: ; preds = %while.body + %c.1 = call i1 @cond() + br i1 %c.1, label %if.then1504, label %for.body1208.lr.ph + +for.body1208.lr.ph: ; preds = %for.cond1204.preheader + br label %for.body1208 + +for.body1208: ; preds = %for.inc1498, %for.body1208.lr.ph + %0 = phi i32 [ 0, %for.body1208.lr.ph ], [ %1, %for.inc1498 ] + %m.0804 = phi i32 [ 1, %for.body1208.lr.ph ], [ %inc1499, %for.inc1498 ] + %idxprom1212 = zext i32 %m.0804 to i64 + %v = call i32 @get.i32() + %cmp1215 = icmp eq i32 0, %v + %ypos1223 = getelementptr inbounds i32, i32* %ptr , i64 %idxprom1212 + br i1 %cmp1215, label %if.then1217, label %if.else1351 + +if.then1217: ; preds = %for.body1208 + br label %for.cond1247 + +for.cond1247: ; preds = %if.then1260, %if.then1217 + %m.1.in = phi i32 [ %m.0804, %if.then1217 ], [ %m.1, %if.then1260 ] + %m.1 = add nuw nsw i32 %m.1.in, 1 + %cmp1249 = icmp slt i32 %m.1.in, %0 + br label %for.body1251 + +for.body1251: ; preds = %for.cond1247 + %idxprom1255 = zext i32 %m.1 to i64 + %xpos1257 = getelementptr inbounds i32, i32* %ptr, i64 %idxprom1255 + %c.2 = call i1 @cond() + br i1 %c.2, label %if.then1260, label %for.end1326 + +if.then1260: ; preds = %for.body1251 + br label %for.cond1247 + +for.end1326: ; preds = %for.body1251 + br label %if.end1824 + +if.else1351: ; preds = %for.body1208 + %v.2 = call i32 @get.i32() + %cmp1358 = icmp eq i32 %v.2, 0 + br i1 %cmp1358, label %if.then1360, label %for.inc1498 + +if.then1360: ; preds = %if.else1351 + %cmp1392 = icmp slt i32 %m.0804, %0 + unreachable + +for.inc1498: ; preds = %if.else1351 + %inc1499 = add nuw nsw i32 %m.0804, 1 + %1 = load i32, i32* %ptr, align 8 + br label %for.body1208 + +if.then1504: ; preds = %for.cond1204.preheader + unreachable + +if.end1824: ; preds = %for.end1326 + br label %while.body +} + +define void @test5(i8* %header, i32 %conv, i8 %n) { +; CHECK-LABEL: @test5( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[POS_42:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD85:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: br label [[FOR_INNER:%.*]] +; CHECK: for.inner: +; CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC_I:%.*]], [[FOR_INNER]] ] +; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +; CHECK-NEXT: [[CMP7_I:%.*]] = icmp slt i8 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7_I]], label [[FOR_INNER]], label [[FOR_INNER_EXIT:%.*]] +; CHECK: for.inner.exit: +; CHECK-NEXT: [[INC_I_LCSSA:%.*]] = phi i32 [ [[INC_I]], [[FOR_INNER]] ] +; CHECK-NEXT: br label [[FOR_INNER_2:%.*]] +; CHECK: for.inner.2: +; CHECK-NEXT: [[I_0_I1:%.*]] = phi i32 [ 0, [[FOR_INNER_EXIT]] ], [ [[INC_I3:%.*]], [[FOR_INNER_2]] ] +; CHECK-NEXT: [[INC_I3]] = add nuw nsw i32 
[[I_0_I1]], 1 +; CHECK-NEXT: [[CMP7_I4:%.*]] = icmp slt i8 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP7_I4]], label [[FOR_INNER_2]], label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC_I3_LCSSA:%.*]] = phi i32 [ [[INC_I3]], [[FOR_INNER_2]] ] +; CHECK-NEXT: [[ADD71:%.*]] = add i32 [[POS_42]], [[INC_I_LCSSA]] +; CHECK-NEXT: [[ADD85]] = add i32 [[ADD71]], [[INC_I3_LCSSA]] +; CHECK-NEXT: br i1 false, label [[FOR_BODY]], label [[WHILE_COND_PREHEADER:%.*]] +; CHECK: while.cond.preheader: +; CHECK-NEXT: [[ADD85_LCSSA:%.*]] = phi i32 [ [[ADD85]], [[FOR_INC]] ] +; CHECK-NEXT: [[SHL:%.*]] = shl nuw nsw i32 [[CONV:%.*]], 2 +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[POS_8:%.*]] = phi i32 [ [[INC114:%.*]], [[WHILE_BODY:%.*]] ], [ [[ADD85_LCSSA]], [[WHILE_COND_PREHEADER]] ] +; CHECK-NEXT: [[CMP112:%.*]] = icmp ult i32 [[POS_8]], [[SHL]] +; CHECK-NEXT: br i1 [[CMP112]], label [[WHILE_BODY]], label [[CLEANUP122:%.*]] +; CHECK: while.body: +; CHECK-NEXT: [[INC114]] = add nuw i32 [[POS_8]], 1 +; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_1]], label [[WHILE_COND]], label [[CLEANUP122]] +; CHECK: cleanup122: +; CHECK-NEXT: ret void +; +entry: + %shl = shl nuw nsw i32 %conv, 2 + br label %for.body + +for.body: ; preds = %entry, %for.inc + %pos.42 = phi i32 [ 0, %entry ], [ %add85, %for.inc ] + br label %for.inner + +for.inner: ; preds = %for.body.i, %for.body + %i.0.i = phi i32 [ 0, %for.body ], [ %inc.i, %for.inner ] + %inc.i = add nuw nsw i32 %i.0.i, 1 + %cmp7.i = icmp slt i8 %n, 0 + br i1 %cmp7.i, label %for.inner, label %for.inner.exit + +for.inner.exit: ; preds = %for.body.i + %add71 = add i32 %pos.42, %inc.i + br label %for.inner.2 + +for.inner.2: ; preds = %for.body.i6, %cleanup.cont74 + %i.0.i1 = phi i32 [ 0, %for.inner.exit ], [ %inc.i3, %for.inner.2] + %inc.i3 = add nuw nsw i32 %i.0.i1, 1 + %cmp7.i4 = icmp slt i8 %n, 0 + br i1 %cmp7.i4, label %for.inner.2, label %for.inc + +for.inc: ; preds = %for.body.i6 + %add85 = add i32 %add71, %inc.i3 + br i1 false, label %for.body, label %while.cond.preheader + +while.cond.preheader: ; preds = %for.inc + br label %while.cond + +while.cond: ; preds = %while.cond.preheader, %while.body + %pos.8 = phi i32 [ %inc114, %while.body ], [ %add85, %while.cond.preheader ] + %cmp112 = icmp ult i32 %pos.8, %shl + br i1 %cmp112, label %while.body, label %cleanup122 + +while.body: ; preds = %while.cond + %inc114 = add nuw i32 %pos.8, 1 + %c.1 = call i1 @cond() + br i1 %c.1, label %while.cond, label %cleanup122 + +cleanup122: ; preds = %while.body, %while.cond + ret void +} + +define void @test6(i8 %x) { +; CHECK-LABEL: @test6( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_1]], label [[LAND_RHS:%.*]], label [[WHILE_END316:%.*]] +; CHECK: land.rhs: +; CHECK-NEXT: br label [[WHILE_BODY35:%.*]] +; CHECK: while.body35: +; CHECK-NEXT: br label [[WHILE_COND192:%.*]] +; CHECK: while.cond192: +; CHECK-NEXT: switch i8 [[X:%.*]], label [[WHILE_BODY205:%.*]] [ +; CHECK-NEXT: i8 59, label [[WHILE_COND215_PREHEADER:%.*]] +; CHECK-NEXT: i8 10, label [[IF_END224:%.*]] +; CHECK-NEXT: ] +; CHECK: while.cond215.preheader: +; CHECK-NEXT: [[I_7_LCSSA:%.*]] = phi i32 [ 0, [[WHILE_COND192]] ] +; CHECK-NEXT: br label [[WHILE_COND215:%.*]] +; CHECK: while.body205: +; CHECK-NEXT: br label [[WHILE_COND192]] +; CHECK: while.cond215: +; CHECK-NEXT: [[I_8_IN:%.*]] = phi i32 [ [[I_8:%.*]], [[WHILE_COND215]] ], [ [[I_7_LCSSA]], [[WHILE_COND215_PREHEADER]] ] +; CHECK-NEXT: [[I_8]] = add nuw nsw i32 
[[I_8_IN]], 1 +; CHECK-NEXT: [[IDXPROM216:%.*]] = sext i32 [[I_8]] to i64 +; CHECK-NEXT: [[ARRAYIDX217:%.*]] = getelementptr inbounds [512 x i8], [512 x i8]* null, i64 0, i64 [[IDXPROM216]] +; CHECK-NEXT: br label [[WHILE_COND215]] +; CHECK: if.end224: +; CHECK-NEXT: [[C_2:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_2]], label [[WHILE_END225:%.*]], label [[WHILE_BODY35]] +; CHECK: while.end225: +; CHECK-NEXT: unreachable +; CHECK: while.end316: +; CHECK-NEXT: ret void +; +entry: + %c.1 = call i1 @cond() + br i1 %c.1, label %land.rhs, label %while.end316 + +land.rhs: ; preds = %entry + br label %while.body35 + +while.body35: ; preds = %if.end224, %land.rhs + br label %while.cond192 + +while.cond192: ; preds = %while.body205, %while.body35 + %i.7 = phi i32 [ 0, %while.body35 ], [ %inc206, %while.body205 ] + switch i8 %x, label %while.body205 [ + i8 59, label %while.cond215 + i8 10, label %if.end224 + ] + +while.body205: ; preds = %while.cond192 + %inc206 = add nsw i32 %i.7, 1 + br label %while.cond192 + +while.cond215: ; preds = %while.cond215, %while.cond192 + %i.8.in = phi i32 [ %i.8, %while.cond215 ], [ %i.7, %while.cond192 ] + %i.8 = add nsw i32 %i.8.in, 1 + %idxprom216 = sext i32 %i.8 to i64 + %arrayidx217 = getelementptr inbounds [512 x i8], [512 x i8]* null, i64 0, i64 %idxprom216 + br label %while.cond215 + +if.end224: ; preds = %while.cond192 + %c.2 = call i1 @cond() + br i1 %c.2, label %while.end225, label %while.body35 + +while.end225: ; preds = %if.end224 + unreachable + +while.end316: ; preds = %entry + ret void +} + +define void @test7(i32* %ptr) { +; CHECK-LABEL: @test7( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_BODY:%.*]] +; CHECK: while.body: +; CHECK-NEXT: br label [[FOR_BODY1208:%.*]] +; CHECK: for.body1208: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ undef, [[WHILE_BODY]] ], [ [[TMP1:%.*]], [[FOR_INC1498:%.*]] ] +; CHECK-NEXT: [[M_048:%.*]] = phi i32 [ 1, [[WHILE_BODY]] ], [ [[INC1499:%.*]], [[FOR_INC1498]] ] +; CHECK-NEXT: [[IDXPROM1212:%.*]] = zext i32 [[M_048]] to i64 +; CHECK-NEXT: [[XPOS1214:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 [[IDXPROM1212]] +; CHECK-NEXT: [[V_1:%.*]] = call i32 @get.i32() +; CHECK-NEXT: [[CMP1215:%.*]] = icmp eq i32 0, [[V_1]] +; CHECK-NEXT: br i1 [[CMP1215]], label [[IF_THEN1217:%.*]], label [[IF_ELSE1351:%.*]] +; CHECK: if.then1217: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP0]], [[FOR_BODY1208]] ] +; CHECK-NEXT: [[M_048_LCSSA:%.*]] = phi i32 [ [[M_048]], [[FOR_BODY1208]] ] +; CHECK-NEXT: [[CMP1249_NOT_NOT:%.*]] = icmp slt i32 [[M_048_LCSSA]], [[DOTLCSSA]] +; CHECK-NEXT: unreachable +; CHECK: if.else1351: +; CHECK-NEXT: [[CMP1358:%.*]] = icmp eq i32 0, undef +; CHECK-NEXT: br i1 [[CMP1358]], label [[IF_THEN1360:%.*]], label [[FOR_INC1498]] +; CHECK: if.then1360: +; CHECK-NEXT: [[M_048_LCSSA1:%.*]] = phi i32 [ [[M_048]], [[IF_ELSE1351]] ] +; CHECK-NEXT: br label [[FOR_COND1390:%.*]] +; CHECK: for.cond1390: +; CHECK-NEXT: [[M_2_IN:%.*]] = phi i32 [ [[M_048_LCSSA1]], [[IF_THEN1360]] ], [ [[M_2:%.*]], [[IF_THEN1403:%.*]] ] +; CHECK-NEXT: [[M_2]] = add nuw nsw i32 [[M_2_IN]], 1 +; CHECK-NEXT: [[IDXPROM1398:%.*]] = zext i32 [[M_2]] to i64 +; CHECK-NEXT: br label [[IF_THEN1403]] +; CHECK: if.then1403: +; CHECK-NEXT: [[XPOS1409:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IDXPROM1398]] +; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_1]], label [[FOR_COND1390]], label [[FOR_END1469:%.*]] +; CHECK: for.end1469: +; CHECK-NEXT: br label [[IF_END1824:%.*]] +; CHECK: for.inc1498: +; 
CHECK-NEXT: [[INC1499]] = add nuw nsw i32 [[M_048]], 1 +; CHECK-NEXT: [[TMP1]] = load i32, i32* undef, align 8 +; CHECK-NEXT: br label [[FOR_BODY1208]] +; CHECK: if.end1824: +; CHECK-NEXT: br label [[WHILE_BODY]] +; +entry: + br label %while.body + +while.body: ; preds = %if.end1824, %entry + br label %for.body1208 + +for.body1208: ; preds = %for.inc1498, %while.body + %0 = phi i32 [ undef, %while.body ], [ %1, %for.inc1498 ] + %m.048 = phi i32 [ 1, %while.body ], [ %inc1499, %for.inc1498 ] + %idxprom1212 = zext i32 %m.048 to i64 + %xpos1214 = getelementptr inbounds i32, i32* %ptr, i64 %idxprom1212 + %v.1 = call i32 @get.i32() + %cmp1215 = icmp eq i32 0, %v.1 + br i1 %cmp1215, label %if.then1217, label %if.else1351 + +if.then1217: ; preds = %for.body1208 + %cmp1249.not.not = icmp slt i32 %m.048, %0 + unreachable + +if.else1351: ; preds = %for.body1208 + %cmp1358 = icmp eq i32 0, undef + br i1 %cmp1358, label %if.then1360, label %for.inc1498 + +if.then1360: ; preds = %if.else1351 + br label %for.cond1390 + +for.cond1390: ; preds = %if.then1403, %if.then1360 + %m.2.in = phi i32 [ %m.048, %if.then1360 ], [ %m.2, %if.then1403 ] + %m.2 = add nuw nsw i32 %m.2.in, 1 + %cmp1392.not.not = icmp slt i32 %m.2.in, %0 + %idxprom1398 = zext i32 %m.2 to i64 + br label %if.then1403 + +if.then1403: ; preds = %for.cond1390 + %xpos1409 = getelementptr inbounds i32, i32* %ptr, i64 %idxprom1398 + %c.1 = call i1 @cond() + br i1 %c.1, label %for.cond1390, label %for.end1469 + +for.end1469: ; preds = %if.then1403 + br label %if.end1824 + +for.inc1498: ; preds = %if.else1351 + %inc1499 = add nuw nsw i32 %m.048, 1 + %1 = load i32, i32* undef, align 8 + br label %for.body1208 + +if.end1824: ; preds = %for.end1469 + br label %while.body +} diff --git a/llvm/test/Transforms/LoopStrengthReduce/funclet.ll b/llvm/test/Transforms/LoopStrengthReduce/funclet.ll index 1bee3706cafab..0f725a13b2078 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/funclet.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/funclet.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -loop-reduce -S | FileCheck %s target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" @@ -10,13 +11,42 @@ declare void @external(i32*) declare void @reserve() define void @f() personality i32 (...)* @_except_handler3 { +; CHECK-LABEL: @f( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[THROW:%.*]] +; CHECK: throw: +; CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, i8* undef, i32 1 +; CHECK-NEXT: invoke void @reserve() +; CHECK-NEXT: to label [[THROW]] unwind label [[PAD:%.*]] +; CHECK: pad: +; CHECK-NEXT: [[PHI2:%.*]] = phi i8* [ [[TMP96]], [[THROW]] ] +; CHECK-NEXT: [[CS:%.*]] = catchswitch within none [label %unreachable] unwind label [[BLAH2:%.*]] +; CHECK: unreachable: +; CHECK-NEXT: [[TMP0:%.*]] = catchpad within [[CS]] [] +; CHECK-NEXT: unreachable +; CHECK: blah2: +; CHECK-NEXT: [[CLEANUPPADI4_I_I_I:%.*]] = cleanuppad within none [] +; CHECK-NEXT: [[PHI21:%.*]] = ptrtoint i8* [[PHI2]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = sub i32 1, [[PHI21]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* undef, i32 [[TMP1]] +; CHECK-NEXT: br label [[LOOP_BODY:%.*]] +; CHECK: loop_body: +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i8* [ [[SCEVGEP2:%.*]], [[ITER:%.*]] ], [ [[SCEVGEP]], [[BLAH2]] ] +; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, i8* [[LSR_IV]], i32 -1 +; CHECK-NEXT: [[TMP100:%.*]] = icmp eq i8* [[SCEVGEP2]], null +; CHECK-NEXT: br i1 [[TMP100]], label [[UNWIND_OUT:%.*]], label [[ITER]] 
+; CHECK: iter: +; CHECK-NEXT: br i1 true, label [[UNWIND_OUT]], label [[LOOP_BODY]] +; CHECK: unwind_out: +; CHECK-NEXT: cleanupret from [[CLEANUPPADI4_I_I_I]] unwind to caller +; entry: br label %throw throw: ; preds = %throw, %entry %tmp96 = getelementptr inbounds i8, i8* undef, i32 1 invoke void @reserve() - to label %throw unwind label %pad + to label %throw unwind label %pad pad: ; preds = %throw %phi2 = phi i8* [ %tmp96, %throw ] @@ -43,18 +73,45 @@ unwind_out: ; preds = %iter, %loop_body cleanupret from %cleanuppadi4.i.i.i unwind to caller } -; CHECK-LABEL: define void @f( -; CHECK: cleanuppad within none [] -; CHECK-NEXT: ptrtoint i8* %phi2 to i32 - define void @g() personality i32 (...)* @_except_handler3 { +; CHECK-LABEL: @g( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[THROW:%.*]] +; CHECK: throw: +; CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, i8* undef, i32 1 +; CHECK-NEXT: invoke void @reserve() +; CHECK-NEXT: to label [[THROW]] unwind label [[PAD:%.*]] +; CHECK: pad: +; CHECK-NEXT: [[PHI2:%.*]] = phi i8* [ [[TMP96]], [[THROW]] ] +; CHECK-NEXT: [[CS:%.*]] = catchswitch within none [label [[UNREACHABLE:%.*]], label %blah] unwind to caller +; CHECK: unreachable: +; CHECK-NEXT: [[TMP0:%.*]] = catchpad within [[CS]] [] +; CHECK-NEXT: unreachable +; CHECK: blah: +; CHECK-NEXT: [[CATCHPAD:%.*]] = catchpad within [[CS]] [] +; CHECK-NEXT: [[PHI21:%.*]] = ptrtoint i8* [[PHI2]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = sub i32 1, [[PHI21]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* undef, i32 [[TMP1]] +; CHECK-NEXT: br label [[LOOP_BODY:%.*]] +; CHECK: unwind_out: +; CHECK-NEXT: catchret from [[CATCHPAD]] to label [[LEAVE:%.*]] +; CHECK: leave: +; CHECK-NEXT: ret void +; CHECK: loop_body: +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i8* [ [[SCEVGEP2:%.*]], [[ITER:%.*]] ], [ [[SCEVGEP]], [[BLAH:%.*]] ] +; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, i8* [[LSR_IV]], i32 -1 +; CHECK-NEXT: [[TMP100:%.*]] = icmp eq i8* [[SCEVGEP2]], null +; CHECK-NEXT: br i1 [[TMP100]], label [[UNWIND_OUT:%.*]], label [[ITER]] +; CHECK: iter: +; CHECK-NEXT: br i1 true, label [[UNWIND_OUT]], label [[LOOP_BODY]] +; entry: br label %throw throw: ; preds = %throw, %entry %tmp96 = getelementptr inbounds i8, i8* undef, i32 1 invoke void @reserve() - to label %throw unwind label %pad + to label %throw unwind label %pad pad: %phi2 = phi i8* [ %tmp96, %throw ] @@ -84,20 +141,45 @@ iter: ; preds = %loop_body br i1 undef, label %unwind_out, label %loop_body } -; CHECK-LABEL: define void @g( -; CHECK: blah: -; CHECK-NEXT: catchpad within %cs [] -; CHECK-NEXT: ptrtoint i8* %phi2 to i32 - - define void @h() personality i32 (...)* @_except_handler3 { +; CHECK-LABEL: @h( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[THROW:%.*]] +; CHECK: throw: +; CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, i8* undef, i32 1 +; CHECK-NEXT: invoke void @reserve() +; CHECK-NEXT: to label [[THROW]] unwind label [[PAD:%.*]] +; CHECK: pad: +; CHECK-NEXT: [[CS:%.*]] = catchswitch within none [label [[UNREACHABLE:%.*]], label %blug] unwind to caller +; CHECK: unreachable: +; CHECK-NEXT: [[TMP0:%.*]] = catchpad within [[CS]] [] +; CHECK-NEXT: unreachable +; CHECK: blug: +; CHECK-NEXT: [[PHI2:%.*]] = phi i8* [ [[TMP96]], [[PAD]] ] +; CHECK-NEXT: [[CATCHPAD:%.*]] = catchpad within [[CS]] [] +; CHECK-NEXT: [[PHI21:%.*]] = ptrtoint i8* [[PHI2]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = sub i32 1, [[PHI21]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* undef, i32 [[TMP1]] +; CHECK-NEXT: br label [[LOOP_BODY:%.*]] +; CHECK: 
unwind_out: +; CHECK-NEXT: catchret from [[CATCHPAD]] to label [[LEAVE:%.*]] +; CHECK: leave: +; CHECK-NEXT: ret void +; CHECK: loop_body: +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i8* [ [[SCEVGEP2:%.*]], [[ITER:%.*]] ], [ [[SCEVGEP]], [[BLUG:%.*]] ] +; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, i8* [[LSR_IV]], i32 -1 +; CHECK-NEXT: [[TMP100:%.*]] = icmp eq i8* [[SCEVGEP2]], null +; CHECK-NEXT: br i1 [[TMP100]], label [[UNWIND_OUT:%.*]], label [[ITER]] +; CHECK: iter: +; CHECK-NEXT: br i1 true, label [[UNWIND_OUT]], label [[LOOP_BODY]] +; entry: br label %throw throw: ; preds = %throw, %entry %tmp96 = getelementptr inbounds i8, i8* undef, i32 1 invoke void @reserve() - to label %throw unwind label %pad + to label %throw unwind label %pad pad: %cs = catchswitch within none [label %unreachable, label %blug] unwind to caller @@ -127,19 +209,45 @@ iter: ; preds = %loop_body br i1 undef, label %unwind_out, label %loop_body } -; CHECK-LABEL: define void @h( -; CHECK: blug: -; CHECK: catchpad within %cs [] -; CHECK-NEXT: ptrtoint i8* %phi2 to i32 - define void @i() personality i32 (...)* @_except_handler3 { +; CHECK-LABEL: @i( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[THROW:%.*]] +; CHECK: throw: +; CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, i8* undef, i32 1 +; CHECK-NEXT: invoke void @reserve() +; CHECK-NEXT: to label [[THROW]] unwind label [[CATCHPAD:%.*]] +; CHECK: catchpad: +; CHECK-NEXT: [[PHI2:%.*]] = phi i8* [ [[TMP96]], [[THROW]] ] +; CHECK-NEXT: [[CS:%.*]] = catchswitch within none [label %cp_body] unwind label [[CLEANUPPAD:%.*]] +; CHECK: cp_body: +; CHECK-NEXT: [[TMP0:%.*]] = catchpad within [[CS]] [] +; CHECK-NEXT: br label [[LOOP_HEAD:%.*]] +; CHECK: cleanuppad: +; CHECK-NEXT: [[TMP1:%.*]] = cleanuppad within none [] +; CHECK-NEXT: br label [[LOOP_HEAD]] +; CHECK: loop_head: +; CHECK-NEXT: [[PHI21:%.*]] = ptrtoint i8* [[PHI2]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 1, [[PHI21]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* undef, i32 [[TMP2]] +; CHECK-NEXT: br label [[LOOP_BODY:%.*]] +; CHECK: loop_body: +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i8* [ [[SCEVGEP2:%.*]], [[ITER:%.*]] ], [ [[SCEVGEP]], [[LOOP_HEAD]] ] +; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, i8* [[LSR_IV]], i32 -1 +; CHECK-NEXT: [[TMP100:%.*]] = icmp eq i8* [[SCEVGEP2]], null +; CHECK-NEXT: br i1 [[TMP100]], label [[UNWIND_OUT:%.*]], label [[ITER]] +; CHECK: iter: +; CHECK-NEXT: br i1 true, label [[UNWIND_OUT]], label [[LOOP_BODY]] +; CHECK: unwind_out: +; CHECK-NEXT: unreachable +; entry: br label %throw throw: ; preds = %throw, %entry %tmp96 = getelementptr inbounds i8, i8* undef, i32 1 invoke void @reserve() - to label %throw unwind label %catchpad + to label %throw unwind label %catchpad catchpad: ; preds = %throw %phi2 = phi i8* [ %tmp96, %throw ] @@ -169,17 +277,41 @@ unwind_out: ; preds = %iter, %loop_body unreachable } -; CHECK-LABEL: define void @i( -; CHECK: ptrtoint i8* %phi2 to i32 - define void @test1(i32* %b, i32* %c) personality i32 (...)* @__CxxFrameHandler3 { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_COND:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: [[D_0:%.*]] = phi i32* [ [[B:%.*]], [[ENTRY:%.*]] ], [ [[INCDEC_PTR:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: invoke void @external(i32* [[D_0]]) +; CHECK-NEXT: to label [[FOR_INC]] unwind label [[CATCH_DISPATCH:%.*]] +; CHECK: for.inc: +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[D_0]], i32 1 +; CHECK-NEXT: br label [[FOR_COND]] +; CHECK: catch.dispatch: +; CHECK-NEXT: 
[[CS:%.*]] = catchswitch within none [label %catch] unwind label [[CATCH_DISPATCH_2:%.*]] +; CHECK: catch: +; CHECK-NEXT: [[TMP0:%.*]] = catchpad within [[CS]] [i8* null, i32 64, i8* null] +; CHECK-NEXT: catchret from [[TMP0]] to label [[TRY_CONT:%.*]] +; CHECK: try.cont: +; CHECK-NEXT: invoke void @external(i32* [[C:%.*]]) +; CHECK-NEXT: to label [[TRY_CONT_7:%.*]] unwind label [[CATCH_DISPATCH_2]] +; CHECK: catch.dispatch.2: +; CHECK-NEXT: [[E_0:%.*]] = phi i32* [ [[C]], [[TRY_CONT]] ], [ [[B]], [[CATCH_DISPATCH]] ] +; CHECK-NEXT: [[CS2:%.*]] = catchswitch within none [label %catch.4] unwind to caller +; CHECK: catch.4: +; CHECK-NEXT: [[TMP1:%.*]] = catchpad within [[CS2]] [i8* null, i32 64, i8* null] +; CHECK-NEXT: unreachable +; CHECK: try.cont.7: +; CHECK-NEXT: ret void +; entry: br label %for.cond for.cond: ; preds = %for.inc, %entry %d.0 = phi i32* [ %b, %entry ], [ %incdec.ptr, %for.inc ] invoke void @external(i32* %d.0) - to label %for.inc unwind label %catch.dispatch + to label %for.inc unwind label %catch.dispatch for.inc: ; preds = %for.cond %incdec.ptr = getelementptr inbounds i32, i32* %d.0, i32 1 @@ -194,7 +326,7 @@ catch: ; preds = %catch.dispatch try.cont: ; preds = %catch invoke void @external(i32* %c) - to label %try.cont.7 unwind label %catch.dispatch.2 + to label %try.cont.7 unwind label %catch.dispatch.2 catch.dispatch.2: ; preds = %try.cont, %catchendblock %e.0 = phi i32* [ %c, %try.cont ], [ %b, %catch.dispatch ] @@ -208,21 +340,33 @@ try.cont.7: ; preds = %try.cont ret void } -; CHECK-LABEL: define void @test1( -; CHECK: for.cond: -; CHECK: %d.0 = phi i32* [ %b, %entry ], [ %incdec.ptr, %for.inc ] - -; CHECK: catch.dispatch.2: -; CHECK: %e.0 = phi i32* [ %c, %try.cont ], [ %b, %catch.dispatch ] - define i32 @test2() personality i32 (...)* @_except_handler3 { +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: invoke void @reserve() +; CHECK-NEXT: to label [[FOR_INC]] unwind label [[CATCH_DISPATCH:%.*]] +; CHECK: catch.dispatch: +; CHECK-NEXT: [[TMP18:%.*]] = catchswitch within none [label %catch.handler] unwind to caller +; CHECK: catch.handler: +; CHECK-NEXT: [[PHI_LCSSA:%.*]] = phi i32 [ [[PHI]], [[CATCH_DISPATCH]] ] +; CHECK-NEXT: [[TMP19:%.*]] = catchpad within [[TMP18]] [i8* null] +; CHECK-NEXT: catchret from [[TMP19]] to label [[DONE:%.*]] +; CHECK: done: +; CHECK-NEXT: ret i32 [[PHI_LCSSA]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC]] = add i32 [[PHI]], 1 +; CHECK-NEXT: br label [[FOR_BODY]] +; entry: br label %for.body for.body: ; preds = %for.inc, %entry %phi = phi i32 [ %inc, %for.inc ], [ 0, %entry ] invoke void @reserve() - to label %for.inc unwind label %catch.dispatch + to label %for.inc unwind label %catch.dispatch catch.dispatch: ; preds = %for.body %tmp18 = catchswitch within none [label %catch.handler] unwind to caller @@ -239,7 +383,3 @@ for.inc: ; preds = %for.body %inc = add i32 %phi, 1 br label %for.body } - -; CHECK-LABEL: define i32 @test2( -; CHECK: %phi.lcssa = phi i32 [ %phi, %catch.dispatch ] -; CHECK-NEXT: catchpad within diff --git a/llvm/test/Transforms/LoopStrengthReduce/scev-expander-lcssa.ll b/llvm/test/Transforms/LoopStrengthReduce/scev-expander-lcssa.ll new file mode 100644 index 0000000000000..e05dcda85138a --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/scev-expander-lcssa.ll @@ -0,0 +1,144 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py +; RUN: opt -loop-reduce -S %s | FileCheck %s + +; Make sure SCEVExpander does not crash and introduce unnecessary LCSSA PHI nodes. + +define void @schedule_block() { +; CHECK-LABEL: @schedule_block( +; CHECK-NEXT: entry: +; CHECK-NEXT: switch i16 undef, label [[IF_END156_I:%.*]] [ +; CHECK-NEXT: i16 27, label [[IF_THEN_I:%.*]] +; CHECK-NEXT: i16 28, label [[IF_THEN_I]] +; CHECK-NEXT: i16 29, label [[IF_THEN13_I:%.*]] +; CHECK-NEXT: i16 32, label [[LAND_LHS_TRUE136_I:%.*]] +; CHECK-NEXT: ] +; CHECK: if.then.i: +; CHECK-NEXT: unreachable +; CHECK: if.then13.i: +; CHECK-NEXT: unreachable +; CHECK: land.lhs.true136.i: +; CHECK-NEXT: unreachable +; CHECK: if.end156.i: +; CHECK-NEXT: switch i16 undef, label [[WHILE_END256:%.*]] [ +; CHECK-NEXT: i16 29, label [[IF_THEN210:%.*]] +; CHECK-NEXT: i16 28, label [[IF_THEN210]] +; CHECK-NEXT: i16 27, label [[LAND_LHS_TRUE191:%.*]] +; CHECK-NEXT: i16 32, label [[IF_END248:%.*]] +; CHECK-NEXT: ] +; CHECK: land.lhs.true191: +; CHECK-NEXT: unreachable +; CHECK: if.then210: +; CHECK-NEXT: unreachable +; CHECK: if.end248: +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: while.end256: +; CHECK-NEXT: unreachable +; CHECK: for.end: +; CHECK-NEXT: br label [[WHILE_BODY1013:%.*]] +; CHECK: while.body1013: +; CHECK-NEXT: br label [[FOR_COND_I2472:%.*]] +; CHECK: for.cond.i2472: +; CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[WHILE_BODY1013]] ], [ [[TMP2:%.*]], [[FOR_END34_I:%.*]] ] +; CHECK-NEXT: br i1 false, label [[FOR_COND3_PREHEADER_I:%.*]], label [[IF_END107_I_LOOPEXIT:%.*]] +; CHECK: for.cond3.preheader.i: +; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[I_0_I]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], 1 +; CHECK-NEXT: br label [[FOR_COND3_I:%.*]] +; CHECK: for.cond3.i: +; CHECK-NEXT: [[INDVARS_IV301_I2691:%.*]] = phi i64 [ [[INDVARS_IV_NEXT302_I:%.*]], [[FOR_BODY5_I:%.*]] ], [ [[TMP1]], [[FOR_COND3_PREHEADER_I]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT302_I]] = add nsw i64 [[INDVARS_IV301_I2691]], 1 +; CHECK-NEXT: br label [[FOR_BODY5_I]] +; CHECK: for.body5.i: +; CHECK-NEXT: br i1 false, label [[FOR_COND3_I]], label [[FOR_BODY5_I_FOR_END_I2475_LOOPEXIT_CRIT_EDGE:%.*]] +; CHECK: for.body5.i.for.end.i2475.loopexit_crit_edge: +; CHECK-NEXT: [[TMP2]] = trunc i64 [[INDVARS_IV_NEXT302_I]] to i32 +; CHECK-NEXT: br label [[FOR_END34_I]] +; CHECK: for.end34.i: +; CHECK-NEXT: br i1 false, label [[FOR_COND_I2472]], label [[IF_ELSE_I2488:%.*]] +; CHECK: if.else.i2488: +; CHECK-NEXT: br i1 undef, label [[IF_END107_I:%.*]], label [[FOR_BODY45_PREHEADER_I:%.*]] +; CHECK: for.body45.preheader.i: +; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[I_0_I]] to i64 +; CHECK-NEXT: unreachable +; CHECK: if.end107.i.loopexit: +; CHECK-NEXT: br label [[IF_END107_I]] +; CHECK: if.end107.i: +; CHECK-NEXT: unreachable +; +entry: + switch i16 undef, label %if.end156.i [ + i16 27, label %if.then.i + i16 28, label %if.then.i + i16 29, label %if.then13.i + i16 32, label %land.lhs.true136.i + ] + +if.then.i: ; preds = %entry, %entry + unreachable + +if.then13.i: ; preds = %entry + unreachable + +land.lhs.true136.i: ; preds = %entry + unreachable + +if.end156.i: ; preds = %entry + switch i16 undef, label %while.end256 [ + i16 29, label %if.then210 + i16 28, label %if.then210 + i16 27, label %land.lhs.true191 + i16 32, label %if.end248 + ] + +land.lhs.true191: ; preds = %if.end156.i + unreachable + +if.then210: ; preds = %if.end156.i, %if.end156.i + unreachable + +if.end248: ; preds = %if.end156.i + br label %for.end + +while.end256: ; preds = %if.end156.i + unreachable 
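+; Background (illustrative sketch only, not part of the original test): in
+; LCSSA form, a value defined in a loop and used outside of it must flow
+; through a PHI in the exit block, e.g.
+;   exit:
+;     %v.lcssa = phi i32 [ %v, %loop ]
+; The concern exercised here is that SCEVExpander, when expanding for LSR,
+; must not create such PHIs for values that have no use outside their loop.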
+
+for.end: ; preds = %if.end248
+  br label %while.body1013
+
+while.body1013: ; preds = %for.end
+  br label %for.cond.i2472
+
+for.cond.i2472: ; preds = %for.end34.i, %while.body1013
+  %i.0.i = phi i32 [ 0, %while.body1013 ], [ %2, %for.end34.i ]
+  br i1 undef, label %for.cond3.preheader.i, label %if.end107.i
+
+for.cond3.preheader.i: ; preds = %for.cond.i2472
+  %0 = sext i32 %i.0.i to i64
+  %1 = add nsw i64 %0, 1
+  br label %for.cond3.i
+
+for.cond3.i: ; preds = %for.body5.i, %for.cond3.preheader.i
+  %indvars.iv301.i2691 = phi i64 [ %indvars.iv.next302.i, %for.body5.i ], [ %1, %for.cond3.preheader.i ]
+  %indvars.iv.next302.i = add nsw i64 %indvars.iv301.i2691, 1
+  br label %for.body5.i
+
+for.body5.i: ; preds = %for.cond3.i
+  br i1 undef, label %for.cond3.i, label %for.body5.i.for.end.i2475.loopexit_crit_edge
+
+for.body5.i.for.end.i2475.loopexit_crit_edge: ; preds = %for.body5.i
+  %2 = trunc i64 %indvars.iv.next302.i to i32
+  br label %for.end34.i
+
+for.end34.i: ; preds = %for.body5.i.for.end.i2475.loopexit_crit_edge
+  br i1 undef, label %for.cond.i2472, label %if.else.i2488
+
+if.else.i2488: ; preds = %for.end34.i
+  br i1 undef, label %if.end107.i, label %for.body45.preheader.i
+
+for.body45.preheader.i: ; preds = %if.else.i2488
+  %3 = sext i32 %i.0.i to i64
+  unreachable
+
+if.end107.i: ; preds = %if.else.i2488, %for.cond.i2472
+  unreachable
+}

From a4edc04693f76eec9068db0556d6533e4c201d74 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sun, 12 Jul 2020 14:16:36 -0400
Subject: [PATCH 0311/1035] AMDGPU/GlobalISel: Use clamp modifier for
 [us]addsat/[us]subsat

We have also never handled this for SelectionDAG, which needs additional
work.
---
 llvm/lib/Target/AMDGPU/AMDGPUGISel.td         |    5 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   11 +-
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h      |    4 +
 llvm/lib/Target/AMDGPU/VOP2Instructions.td    |   18 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td    |   14 +
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |   15 +
 llvm/lib/Target/AMDGPU/VOPInstructions.td     |   13 +
 .../AMDGPU/GlobalISel/legalize-saddsat.mir    |  172 +-
 .../AMDGPU/GlobalISel/legalize-ssubsat.mir    |  172 +-
 .../AMDGPU/GlobalISel/legalize-uaddsat.mir    |  211 +-
 .../AMDGPU/GlobalISel/legalize-usubsat.mir    |  142 +-
 .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 3401 +++++----------------
 .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 3399 +++++---------------
 .../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll | 2550 ++++++----------
 .../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll | 2161 ++++++-------
 15 files changed, 2714 insertions(+), 9574 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 3f12addbcc79b..056f91db24ff6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -51,6 +51,11 @@ def gi_vop3opselmods :
     GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
     GIComplexPatternEquiv<VOP3OpSelMods>;
 
+// FIXME: Why do we have both VOP3OpSel and VOP3OpSelMods?
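+//
+// gi_vop3opsel below ties the VOP3OpSel complex pattern to the GlobalISel
+// selector so the op_sel/clamp patterns added in VOP3Instructions.td can be
+// imported. For context, a minimal sketch (illustrative, not authoritative):
+// with the clamp bit set the VALU saturates instead of wrapping, so IR like
+//   %r = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
+// can be selected to a single instruction along the lines of
+//   v_add_u32_e64 v0, v0, v1 clamp
+// rather than being expanded to a compare/min/max sequence.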
+def gi_vop3opsel :
+    GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
+    GIComplexPatternEquiv<VOP3OpSel>;
+
 def gi_smrd_imm :
     GIComplexOperandMatcher<s64, "selectSmrdImm">,
     GIComplexPatternEquiv<SMRDImm>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index bf0ebd322aa9e..e14623e650d42 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -422,7 +422,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
     .scalarize(0);
 
-  if (ST.hasVOP3PInsts()) {
+  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
+    // Full set of gfx9 features.
     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
       .legalFor({S32, S16, V2S16})
       .clampScalar(0, S16, S32)
@@ -431,7 +432,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .widenScalarToNextPow2(0, 32);
 
     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
-      .lowerFor({S32, S16, V2S16}) // FIXME: legal and merge with add/sub/mul
+      .legalFor({S32, S16, V2S16}) // Clamp modifier
       .minScalar(0, S16)
       .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
@@ -447,7 +448,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     // Technically the saturating operations require clamp bit support, but this
     // was introduced at the same time as 16-bit operations.
     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
-      .lowerFor({S32, S16}) // FIXME: legal with clamp modifier
+      .legalFor({S32, S16}) // Clamp modifier
       .minScalar(0, S16)
       .scalarize(0)
       .widenScalarToNextPow2(0, 16)
@@ -467,7 +468,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
 
   if (ST.hasIntClamp()) {
     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
-      .lowerFor({S32}) // FIXME: legal with clamp modifier.
+      .legalFor({S32}) // Clamp modifier.
       .scalarize(0)
       .minScalarOrElt(0, S32)
       .lower();
@@ -479,6 +480,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .lower();
   }
 
+  // FIXME: DAG expansion gets better results. The widening uses the smaller
+  // range values and goes for the min/max lowering directly.
   getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
     .minScalar(0, S32)
     .scalarize(0)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 6b23830491235..6848f762fc276 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -791,6 +791,10 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
     return CIInsts;
   }
 
+  /// \returns true if the target has integer add/sub instructions that do not
+  /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
+  /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
+  /// for saturation.
   bool hasAddNoCarry() const {
     return AddNoCarryInsts;
   }
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index d9dcfdca23b35..3451c23891811 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -828,6 +828,24 @@ def : GCNPat <
 
 } // End Predicates = [Has16BitInsts]
 
+let SubtargetPredicate = HasIntClamp in {
+// Set clamp bit for saturation.
+def : VOPBinOpClampPat<uaddsat, V_ADD_CO_U32_e64, i32>;
+def : VOPBinOpClampPat<usubsat, V_SUB_CO_U32_e64, i32>;
+}
+
+let SubtargetPredicate = HasAddNoCarryInsts, OtherPredicates = [HasIntClamp] in {
+let AddedComplexity = 1 in { // Prefer over form with carry-out.
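+// Note on the AddedComplexity above: both the carry-out and the no-carry
+// patterns can match here, and the no-carry v_add_u32/v_sub_u32 forms do not
+// tie up a second (carry) destination register, so they are given priority
+// on subtargets that have them.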
+def : VOPBinOpClampPat<uaddsat, V_ADD_U32_e64, i32>;
+def : VOPBinOpClampPat<usubsat, V_SUB_U32_e64, i32>;
+}
+}
+
+let SubtargetPredicate = Has16BitInsts, OtherPredicates = [HasIntClamp] in {
+def : VOPBinOpClampPat<uaddsat, V_ADD_U16_e64, i16>;
+def : VOPBinOpClampPat<usubsat, V_SUB_U16_e64, i16>;
+}
+
 //===----------------------------------------------------------------------===//
 // Target-specific instruction encodings.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index dcbfeb547a32d..3048bcc610c76 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -667,6 +667,20 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32>;
 def : ThreeOp_i32_Pats<or, or, V_OR3_B32>;
 def : ThreeOp_i32_Pats<xor, add, V_XAD_U32>;
 
+def : VOPBinOpClampPat<saddsat, V_ADD_I16_e64, i16>;
+def : VOPBinOpClampPat<ssubsat, V_SUB_I16_e64, i16>;
+
+
+// FIXME: Probably should hardcode clamp bit in pseudo and avoid this.
+class OpSelBinOpClampPat<SDPatternOperator node,
+                         Instruction inst> : GCNPat<
+  (node (i16 (VOP3OpSel i16:$src0, i32:$src0_modifiers)),
+        (i16 (VOP3OpSel i16:$src1, i32:$src1_modifiers))),
+  (inst $src0_modifiers, $src0, $src1_modifiers, $src1, DSTCLAMP.ENABLE, 0)
+>;
+
+def : OpSelBinOpClampPat<saddsat, V_ADD_I16_e64>;
+def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_e64>;
 } // End SubtargetPredicate = isGFX9Plus
 
 def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index fc457ad212d48..446e87ab3fc98 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -77,6 +77,8 @@ def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
 def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
 
+let SubtargetPredicate = HasVOP3PInsts in {
+
 // Undo sub x, c -> add x, -c canonicalization since c is more likely
 // an inline immediate than -c.
 // The constant will be emitted as a mov, and folded later.
@@ -86,6 +88,19 @@ def : GCNPat<
   (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1)
 >;
 
+// Integer operations with clamp bit set.
+class VOP3PSatPat<SDPatternOperator pat, Instruction inst> : GCNPat<
+  (pat (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)),
+       (v2i16 (VOP3PMods v2i16:$src1, i32:$src1_modifiers))),
+  (inst $src0_modifiers, $src0, $src1_modifiers, $src1, DSTCLAMP.ENABLE)
+>;
+
+def : VOP3PSatPat<saddsat, V_PK_ADD_I16>;
+def : VOP3PSatPat<ssubsat, V_PK_SUB_I16>;
+def : VOP3PSatPat<uaddsat, V_PK_ADD_U16>;
+def : VOP3PSatPat<usubsat, V_PK_SUB_U16>;
+} // End SubtargetPredicate = HasVOP3PInsts
+
 multiclass MadFmaMixPats<SDPatternOperator fma_like, Instruction mix_inst, Instruction mixlo_inst, Instruction mixhi_inst> {
     !if(!isa<SDNode>(Op), getDivergentFrag<Op>.ret, Op), Op);
 }
 
+class getVSrcOp<ValueType vt> {
+  RegisterOperand ret = !if(!eq(vt.Size, 32), VSrc_b32, VSrc_b16);
+}
+
+// Class for binary integer operations with the clamp bit set for saturation
+// TODO: Add sub with negated inline constant pattern.
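+// The VOPBinOpClampPat class defined next maps a saturating node directly to
+// the corresponding VALU instruction with DSTCLAMP.ENABLE, picking the 32- or
+// 16-bit VSrc operand class via getVSrcOp. Sketch of a resulting match
+// (illustrative): (uaddsat i32:$a, i32:$b) -> V_ADD_U32_e64 $a, $b, clamp.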
+class VOPBinOpClampPat<SDPatternOperator node, Instruction inst,
+                       ValueType vt> :
+  GCNPat<(node vt:$src0, vt:$src1),
+         (inst getVSrcOp<vt>.ret:$src0, getVSrcOp<vt>.ret:$src1,
+               DSTCLAMP.ENABLE)
+>;
+
+
 include "VOPCInstructions.td"
 include "VOP1Instructions.td"
 include "VOP2Instructions.td"
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir
index 28a8efad1d102..51b6e014a9376 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir
@@ -59,17 +59,8 @@ body: |
     ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 9
     ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
     ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16)
-    ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767
-    ; GFX9: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
-    ; GFX9: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
-    ; GFX9: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[SHL]], [[C3]]
-    ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[C1]], [[SMAX]]
-    ; GFX9: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[SHL]], [[C3]]
-    ; GFX9: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[C2]], [[SMIN]]
-    ; GFX9: [[SMAX1:%[0-9]+]]:_(s16) = G_SMAX [[SUB1]], [[SHL1]]
-    ; GFX9: [[SMIN1:%[0-9]+]]:_(s16) = G_SMIN [[SMAX1]], [[SUB]]
-    ; GFX9: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[SHL]], [[SMIN1]]
-    ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[ADD]], [[C]](s16)
+    ; GFX9: [[SADDSAT:%[0-9]+]]:_(s16) = G_SADDSAT [[SHL]], [[SHL1]]
+    ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SADDSAT]], [[C]](s16)
     ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR]](s16)
     ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $vgpr0
@@ -137,17 +128,8 @@ body: |
     ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
     ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
     ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16)
-    ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767
-    ; GFX9: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
-    ; GFX9: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
-    ; GFX9: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[SHL]], [[C3]]
-    ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[C1]], [[SMAX]]
-    ; GFX9: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[SHL]], [[C3]]
-    ; GFX9: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[C2]], [[SMIN]]
-    ; GFX9: [[SMAX1:%[0-9]+]]:_(s16) = G_SMAX [[SUB1]], [[SHL1]]
-    ; GFX9: [[SMIN1:%[0-9]+]]:_(s16) = G_SMIN [[SMAX1]], [[SUB]]
-    ; GFX9: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[SHL]], [[SMIN1]]
-    ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[ADD]], [[C]](s16)
+    ; GFX9: [[SADDSAT:%[0-9]+]]:_(s16) = G_SADDSAT [[SHL]], [[SHL1]]
+    ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SADDSAT]], [[C]](s16)
     ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR]](s16)
     ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $vgpr0
@@ -282,34 +264,19 @@ body: |
     ; GFX9: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
     ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16)
     ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16)
-    ; GFX9: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767
-    ; GFX9: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
-    ; GFX9: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
-    ; GFX9: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[SHL]], [[C6]]
-    ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[C4]], [[SMAX]]
-    ; GFX9: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[SHL]], [[C6]]
-    ; GFX9: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[C5]], [[SMIN]]
-    ; GFX9: [[SMAX1:%[0-9]+]]:_(s16) = G_SMAX [[SUB1]], [[SHL1]]
-    ; GFX9: [[SMIN1:%[0-9]+]]:_(s16) = G_SMIN [[SMAX1]], [[SUB]]
-    ; GFX9: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[SHL]], [[SMIN1]]
-    ; GFX9: [[ASHR:%[0-9]+]]:_(s16) =
G_ASHR [[ADD]], [[C3]](s16) + ; GFX9: [[SADDSAT:%[0-9]+]]:_(s16) = G_SADDSAT [[SHL]], [[SHL1]] + ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SADDSAT]], [[C3]](s16) ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C3]](s16) ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C3]](s16) - ; GFX9: [[SMAX2:%[0-9]+]]:_(s16) = G_SMAX [[SHL2]], [[C6]] - ; GFX9: [[SUB2:%[0-9]+]]:_(s16) = G_SUB [[C4]], [[SMAX2]] - ; GFX9: [[SMIN2:%[0-9]+]]:_(s16) = G_SMIN [[SHL2]], [[C6]] - ; GFX9: [[SUB3:%[0-9]+]]:_(s16) = G_SUB [[C5]], [[SMIN2]] - ; GFX9: [[SMAX3:%[0-9]+]]:_(s16) = G_SMAX [[SUB3]], [[SHL3]] - ; GFX9: [[SMIN3:%[0-9]+]]:_(s16) = G_SMIN [[SMAX3]], [[SUB2]] - ; GFX9: [[ADD1:%[0-9]+]]:_(s16) = G_ADD [[SHL2]], [[SMIN3]] - ; GFX9: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[ADD1]], [[C3]](s16) - ; GFX9: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX9: [[SADDSAT1:%[0-9]+]]:_(s16) = G_SADDSAT [[SHL2]], [[SHL3]] + ; GFX9: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[SADDSAT1]], [[C3]](s16) + ; GFX9: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 ; GFX9: [[COPY2:%[0-9]+]]:_(s16) = COPY [[ASHR]](s16) - ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C7]] + ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C4]] ; GFX9: [[COPY3:%[0-9]+]]:_(s16) = COPY [[ASHR1]](s16) - ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[COPY3]], [[C7]] + ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[COPY3]], [[C4]] ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C3]](s16) ; GFX9: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL4]] ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) @@ -375,17 +342,8 @@ body: | ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 - ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX9: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 - ; GFX9: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[C2]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[C]], [[SMAX]] - ; GFX9: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[C2]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[C1]], [[SMIN]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(s16) = G_SMAX [[SUB1]], [[TRUNC1]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(s16) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX9: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[SMIN1]] - ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ADD]](s16) + ; GFX9: [[SADDSAT:%[0-9]+]]:_(s16) = G_SADDSAT [[TRUNC]], [[TRUNC1]] + ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SADDSAT]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -484,20 +442,8 @@ body: | ; GFX9-LABEL: name: saddsat_v2s16 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768 - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32) - ; GFX9: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[COPY]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC]], [[SMAX]] - ; GFX9: 
[[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[COPY]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC1]], [[SMIN]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB1]], [[COPY1]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX9: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[COPY]], [[SMIN1]] - ; GFX9: $vgpr0 = COPY [[ADD]](<2 x s16>) + ; GFX9: [[SADDSAT:%[0-9]+]]:_(<2 x s16>) = G_SADDSAT [[COPY]], [[COPY1]] + ; GFX9: $vgpr0 = COPY [[SADDSAT]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_SADDSAT %0, %1 @@ -676,30 +622,9 @@ body: | ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32) ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[DEF1]](s32) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 - ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768 - ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32) - ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX9: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C3]](s32), [[C3]](s32) - ; GFX9: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC6]] - ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC4]], [[SMAX]] - ; GFX9: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC6]] - ; GFX9: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC5]], [[SMIN]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB1]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX9: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR_TRUNC]], [[SMIN1]] - ; GFX9: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC9:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C3]](s32), [[C3]](s32) - ; GFX9: [[SMAX2:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC9]] - ; GFX9: [[SUB2:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC7]], [[SMAX2]] - ; GFX9: [[SMIN2:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC9]] - ; GFX9: [[SUB3:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC8]], [[SMIN2]] - ; GFX9: [[SMAX3:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB3]], [[BUILD_VECTOR_TRUNC3]] - ; GFX9: [[SMIN3:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX3]], [[SUB2]] - ; GFX9: [[ADD1:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR_TRUNC1]], [[SMIN3]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[ADD]](<2 x s16>), [[ADD1]](<2 x s16>), [[DEF2]](<2 x s16>) + ; GFX9: [[SADDSAT:%[0-9]+]]:_(<2 x s16>) = G_SADDSAT [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] + ; GFX9: [[SADDSAT1:%[0-9]+]]:_(<2 x s16>) = G_SADDSAT [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[SADDSAT]](<2 x s16>), [[SADDSAT1]](<2 x s16>), [[DEF2]](<2 x s16>) ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; GFX9: 
[[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>) @@ -870,30 +795,9 @@ body: | ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768 - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32) - ; GFX9: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC]], [[SMAX]] - ; GFX9: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC1]], [[SMIN]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB1]], [[UV2]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX9: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[UV]], [[SMIN1]] - ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32) - ; GFX9: [[SMAX2:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV1]], [[BUILD_VECTOR_TRUNC5]] - ; GFX9: [[SUB2:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC3]], [[SMAX2]] - ; GFX9: [[SMIN2:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV1]], [[BUILD_VECTOR_TRUNC5]] - ; GFX9: [[SUB3:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC4]], [[SMIN2]] - ; GFX9: [[SMAX3:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB3]], [[UV3]] - ; GFX9: [[SMIN3:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX3]], [[SUB2]] - ; GFX9: [[ADD1:%[0-9]+]]:_(<2 x s16>) = G_ADD [[UV1]], [[SMIN3]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[ADD]](<2 x s16>), [[ADD1]](<2 x s16>) + ; GFX9: [[SADDSAT:%[0-9]+]]:_(<2 x s16>) = G_SADDSAT [[UV]], [[UV2]] + ; GFX9: [[SADDSAT1:%[0-9]+]]:_(<2 x s16>) = G_SADDSAT [[UV1]], [[UV3]] + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SADDSAT]](<2 x s16>), [[SADDSAT1]](<2 x s16>) ; GFX9: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 @@ -938,17 +842,8 @@ body: | ; GFX9-LABEL: name: saddsat_s32 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX9: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[COPY]], [[C2]] - ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SMAX]] - ; GFX9: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[COPY]], [[C2]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMIN]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[COPY1]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[SMIN1]] - ; GFX9: $vgpr0 = COPY [[ADD]](s32) + ; GFX9: 
[[SADDSAT:%[0-9]+]]:_(s32) = G_SADDSAT [[COPY]], [[COPY1]] + ; GFX9: $vgpr0 = COPY [[SADDSAT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = G_SADDSAT %0, %1 @@ -1014,24 +909,9 @@ body: | ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX9: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[UV]], [[C2]] - ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SMAX]] - ; GFX9: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[UV]], [[C2]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMIN]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[UV2]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV]], [[SMIN1]] - ; GFX9: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[UV1]], [[C2]] - ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SMAX2]] - ; GFX9: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[UV1]], [[C2]] - ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMIN2]] - ; GFX9: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], [[UV3]] - ; GFX9: [[SMIN3:%[0-9]+]]:_(s32) = G_SMIN [[SMAX3]], [[SUB2]] - ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UV1]], [[SMIN3]] - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ADD]](s32), [[ADD1]](s32) + ; GFX9: [[SADDSAT:%[0-9]+]]:_(s32) = G_SADDSAT [[UV]], [[UV2]] + ; GFX9: [[SADDSAT1:%[0-9]+]]:_(s32) = G_SADDSAT [[UV1]], [[UV3]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SADDSAT]](s32), [[SADDSAT1]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir index 40eb12034c97b..f38da863cba90 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir @@ -59,17 +59,8 @@ body: | ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 9 ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 - ; GFX9: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX9: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; GFX9: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[SHL]], [[C3]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SMAX]], [[C1]] - ; GFX9: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[SHL]], [[C3]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[SMIN]], [[C2]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(s16) = G_SMAX [[SUB]], [[SHL1]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(s16) = G_SMIN [[SMAX1]], [[SUB1]] - ; GFX9: [[SUB2:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[SMIN1]] - ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SUB2]], [[C]](s16) + ; GFX9: [[SSUBSAT:%[0-9]+]]:_(s16) = G_SSUBSAT [[SHL]], [[SHL1]] + ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SSUBSAT]], [[C]](s16) ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 @@ -137,17 +128,8 @@ body: | ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX9: [[C1:%[0-9]+]]:_(s16) 
= G_CONSTANT i16 32767 - ; GFX9: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX9: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; GFX9: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[SHL]], [[C3]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SMAX]], [[C1]] - ; GFX9: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[SHL]], [[C3]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[SMIN]], [[C2]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(s16) = G_SMAX [[SUB]], [[SHL1]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(s16) = G_SMIN [[SMAX1]], [[SUB1]] - ; GFX9: [[SUB2:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[SMIN1]] - ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SUB2]], [[C]](s16) + ; GFX9: [[SSUBSAT:%[0-9]+]]:_(s16) = G_SSUBSAT [[SHL]], [[SHL1]] + ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SSUBSAT]], [[C]](s16) ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 @@ -282,34 +264,19 @@ body: | ; GFX9: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16) ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16) - ; GFX9: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 - ; GFX9: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX9: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; GFX9: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[SHL]], [[C6]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SMAX]], [[C4]] - ; GFX9: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[SHL]], [[C6]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[SMIN]], [[C5]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(s16) = G_SMAX [[SUB]], [[SHL1]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(s16) = G_SMIN [[SMAX1]], [[SUB1]] - ; GFX9: [[SUB2:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[SMIN1]] - ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SUB2]], [[C3]](s16) + ; GFX9: [[SSUBSAT:%[0-9]+]]:_(s16) = G_SSUBSAT [[SHL]], [[SHL1]] + ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SSUBSAT]], [[C3]](s16) ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C3]](s16) ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C3]](s16) - ; GFX9: [[SMAX2:%[0-9]+]]:_(s16) = G_SMAX [[SHL2]], [[C6]] - ; GFX9: [[SUB3:%[0-9]+]]:_(s16) = G_SUB [[SMAX2]], [[C4]] - ; GFX9: [[SMIN2:%[0-9]+]]:_(s16) = G_SMIN [[SHL2]], [[C6]] - ; GFX9: [[SUB4:%[0-9]+]]:_(s16) = G_SUB [[SMIN2]], [[C5]] - ; GFX9: [[SMAX3:%[0-9]+]]:_(s16) = G_SMAX [[SUB3]], [[SHL3]] - ; GFX9: [[SMIN3:%[0-9]+]]:_(s16) = G_SMIN [[SMAX3]], [[SUB4]] - ; GFX9: [[SUB5:%[0-9]+]]:_(s16) = G_SUB [[SHL2]], [[SMIN3]] - ; GFX9: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[SUB5]], [[C3]](s16) - ; GFX9: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX9: [[SSUBSAT1:%[0-9]+]]:_(s16) = G_SSUBSAT [[SHL2]], [[SHL3]] + ; GFX9: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[SSUBSAT1]], [[C3]](s16) + ; GFX9: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 ; GFX9: [[COPY2:%[0-9]+]]:_(s16) = COPY [[ASHR]](s16) - ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C7]] + ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C4]] ; GFX9: [[COPY3:%[0-9]+]]:_(s16) = COPY [[ASHR1]](s16) - ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[COPY3]], [[C7]] + ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[COPY3]], [[C4]] ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C3]](s16) ; GFX9: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL4]] ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) @@ -375,17 +342,8 @@ body: | ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC 
[[COPY1]](s32) - ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 - ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX9: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; GFX9: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[C2]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SMAX]], [[C]] - ; GFX9: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[C2]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[SMIN]], [[C1]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(s16) = G_SMAX [[SUB]], [[TRUNC1]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(s16) = G_SMIN [[SMAX1]], [[SUB1]] - ; GFX9: [[SUB2:%[0-9]+]]:_(s16) = G_SUB [[TRUNC]], [[SMIN1]] - ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SUB2]](s16) + ; GFX9: [[SSUBSAT:%[0-9]+]]:_(s16) = G_SSUBSAT [[TRUNC]], [[TRUNC1]] + ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SSUBSAT]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -484,20 +442,8 @@ body: | ; GFX9-LABEL: name: ssubsat_v2s16 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768 - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32) - ; GFX9: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[COPY]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMAX]], [[BUILD_VECTOR_TRUNC]] - ; GFX9: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[COPY]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMIN]], [[BUILD_VECTOR_TRUNC1]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB]], [[COPY1]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX1]], [[SUB1]] - ; GFX9: [[SUB2:%[0-9]+]]:_(<2 x s16>) = G_SUB [[COPY]], [[SMIN1]] - ; GFX9: $vgpr0 = COPY [[SUB2]](<2 x s16>) + ; GFX9: [[SSUBSAT:%[0-9]+]]:_(<2 x s16>) = G_SSUBSAT [[COPY]], [[COPY1]] + ; GFX9: $vgpr0 = COPY [[SSUBSAT]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_SSUBSAT %0, %1 @@ -676,30 +622,9 @@ body: | ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32) ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[DEF1]](s32) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 - ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768 - ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32) - ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C3]](s32), [[C3]](s32) - ; GFX9: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC6]] - ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMAX]], [[BUILD_VECTOR_TRUNC4]] - ; GFX9: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC6]] - ; GFX9: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMIN]], [[BUILD_VECTOR_TRUNC5]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: 
[[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX1]], [[SUB1]] - ; GFX9: [[SUB2:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC]], [[SMIN1]] - ; GFX9: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC9:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C3]](s32), [[C3]](s32) - ; GFX9: [[SMAX2:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC9]] - ; GFX9: [[SUB3:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMAX2]], [[BUILD_VECTOR_TRUNC7]] - ; GFX9: [[SMIN2:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC9]] - ; GFX9: [[SUB4:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMIN2]], [[BUILD_VECTOR_TRUNC8]] - ; GFX9: [[SMAX3:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB3]], [[BUILD_VECTOR_TRUNC3]] - ; GFX9: [[SMIN3:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX3]], [[SUB4]] - ; GFX9: [[SUB5:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC1]], [[SMIN3]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[SUB2]](<2 x s16>), [[SUB5]](<2 x s16>), [[DEF2]](<2 x s16>) + ; GFX9: [[SSUBSAT:%[0-9]+]]:_(<2 x s16>) = G_SSUBSAT [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] + ; GFX9: [[SSUBSAT1:%[0-9]+]]:_(<2 x s16>) = G_SSUBSAT [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[SSUBSAT]](<2 x s16>), [[SSUBSAT1]](<2 x s16>), [[DEF2]](<2 x s16>) ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>) @@ -870,30 +795,9 @@ body: | ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768 - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32) - ; GFX9: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMAX]], [[BUILD_VECTOR_TRUNC]] - ; GFX9: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMIN]], [[BUILD_VECTOR_TRUNC1]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB]], [[UV2]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX1]], [[SUB1]] - ; GFX9: [[SUB2:%[0-9]+]]:_(<2 x s16>) = G_SUB [[UV]], [[SMIN1]] - ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32) - ; GFX9: [[SMAX2:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV1]], [[BUILD_VECTOR_TRUNC5]] - ; GFX9: [[SUB3:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMAX2]], 
[[BUILD_VECTOR_TRUNC3]] - ; GFX9: [[SMIN2:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV1]], [[BUILD_VECTOR_TRUNC5]] - ; GFX9: [[SUB4:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMIN2]], [[BUILD_VECTOR_TRUNC4]] - ; GFX9: [[SMAX3:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB3]], [[UV3]] - ; GFX9: [[SMIN3:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX3]], [[SUB4]] - ; GFX9: [[SUB5:%[0-9]+]]:_(<2 x s16>) = G_SUB [[UV1]], [[SMIN3]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SUB2]](<2 x s16>), [[SUB5]](<2 x s16>) + ; GFX9: [[SSUBSAT:%[0-9]+]]:_(<2 x s16>) = G_SSUBSAT [[UV]], [[UV2]] + ; GFX9: [[SSUBSAT1:%[0-9]+]]:_(<2 x s16>) = G_SSUBSAT [[UV1]], [[UV3]] + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SSUBSAT]](<2 x s16>), [[SSUBSAT1]](<2 x s16>) ; GFX9: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 @@ -938,17 +842,8 @@ body: | ; GFX9-LABEL: name: ssubsat_s32 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[COPY]], [[C2]] - ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SMAX]], [[C]] - ; GFX9: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[COPY]], [[C2]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SMIN]], [[C1]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[COPY1]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB1]] - ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[SMIN1]] - ; GFX9: $vgpr0 = COPY [[SUB2]](s32) + ; GFX9: [[SSUBSAT:%[0-9]+]]:_(s32) = G_SSUBSAT [[COPY]], [[COPY1]] + ; GFX9: $vgpr0 = COPY [[SSUBSAT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = G_SSUBSAT %0, %1 @@ -1014,24 +909,9 @@ body: | ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[UV]], [[C2]] - ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SMAX]], [[C]] - ; GFX9: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[UV]], [[C2]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SMIN]], [[C1]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[UV2]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB1]] - ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[UV]], [[SMIN1]] - ; GFX9: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[UV1]], [[C2]] - ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SMAX2]], [[C]] - ; GFX9: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[UV1]], [[C2]] - ; GFX9: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[SMIN2]], [[C1]] - ; GFX9: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], [[UV3]] - ; GFX9: [[SMIN3:%[0-9]+]]:_(s32) = G_SMIN [[SMAX3]], [[SUB4]] - ; GFX9: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[UV1]], [[SMIN3]] - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SUB2]](s32), [[SUB5]](s32) + ; GFX9: [[SSUBSAT:%[0-9]+]]:_(s32) = G_SSUBSAT [[UV]], [[UV2]] + ; GFX9: [[SSUBSAT1:%[0-9]+]]:_(s32) = G_SSUBSAT [[UV1]], [[UV3]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SSUBSAT]](s32), [[SSUBSAT1]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 
x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir index 690bf34482ddd..9d51870b4fed2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir @@ -32,11 +32,8 @@ body: | ; GFX8: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 9 ; GFX8: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX8: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX8: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; GFX8: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[SHL]], [[C1]] - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[SHL1]] - ; GFX8: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[SHL]], [[UMIN]] - ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[ADD]], [[C]](s16) + ; GFX8: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[SHL]], [[SHL1]] + ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[UADDSAT]], [[C]](s16) ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16) ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: uaddsat_s7 @@ -47,11 +44,8 @@ body: | ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 9 ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; GFX9: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[SHL]], [[C1]] - ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[SHL1]] - ; GFX9: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[SHL]], [[UMIN]] - ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[ADD]], [[C]](s16) + ; GFX9: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[SHL]], [[SHL1]] + ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[UADDSAT]], [[C]](s16) ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 @@ -92,11 +86,8 @@ body: | ; GFX8: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX8: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX8: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX8: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; GFX8: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[SHL]], [[C1]] - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[SHL1]] - ; GFX8: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[SHL]], [[UMIN]] - ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[ADD]], [[C]](s16) + ; GFX8: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[SHL]], [[SHL1]] + ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[UADDSAT]], [[C]](s16) ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16) ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: uaddsat_s8 @@ -107,11 +98,8 @@ body: | ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; GFX9: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[SHL]], [[C1]] - ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[SHL1]] - ; GFX9: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[SHL]], [[UMIN]] - ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[ADD]], [[C]](s16) + ; GFX9: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[SHL]], [[SHL1]] + ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[UADDSAT]], [[C]](s16) ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 @@ -187,24 +175,19 @@ body: | ; GFX8: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX8: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16) ; GFX8: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], 
[[C3]](s16) - ; GFX8: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; GFX8: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[SHL]], [[C4]] - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[SHL1]] - ; GFX8: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[SHL]], [[UMIN]] - ; GFX8: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[ADD]], [[C3]](s16) + ; GFX8: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[SHL]], [[SHL1]] + ; GFX8: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[UADDSAT]], [[C3]](s16) ; GFX8: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX8: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX8: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C3]](s16) ; GFX8: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C3]](s16) - ; GFX8: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[SHL2]], [[C4]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[XOR1]], [[SHL3]] - ; GFX8: [[ADD1:%[0-9]+]]:_(s16) = G_ADD [[SHL2]], [[UMIN1]] - ; GFX8: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[ADD1]], [[C3]](s16) - ; GFX8: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX8: [[UADDSAT1:%[0-9]+]]:_(s16) = G_UADDSAT [[SHL2]], [[SHL3]] + ; GFX8: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[UADDSAT1]], [[C3]](s16) + ; GFX8: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 ; GFX8: [[COPY2:%[0-9]+]]:_(s16) = COPY [[LSHR6]](s16) - ; GFX8: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C5]] + ; GFX8: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C4]] ; GFX8: [[COPY3:%[0-9]+]]:_(s16) = COPY [[LSHR7]](s16) - ; GFX8: [[AND1:%[0-9]+]]:_(s16) = G_AND [[COPY3]], [[C5]] + ; GFX8: [[AND1:%[0-9]+]]:_(s16) = G_AND [[COPY3]], [[C4]] ; GFX8: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C3]](s16) ; GFX8: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL4]] ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) @@ -226,24 +209,19 @@ body: | ; GFX9: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16) ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16) - ; GFX9: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; GFX9: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[SHL]], [[C4]] - ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[SHL1]] - ; GFX9: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[SHL]], [[UMIN]] - ; GFX9: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[ADD]], [[C3]](s16) + ; GFX9: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[SHL]], [[SHL1]] + ; GFX9: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[UADDSAT]], [[C3]](s16) ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C3]](s16) ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C3]](s16) - ; GFX9: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[SHL2]], [[C4]] - ; GFX9: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[XOR1]], [[SHL3]] - ; GFX9: [[ADD1:%[0-9]+]]:_(s16) = G_ADD [[SHL2]], [[UMIN1]] - ; GFX9: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[ADD1]], [[C3]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX9: [[UADDSAT1:%[0-9]+]]:_(s16) = G_UADDSAT [[SHL2]], [[SHL3]] + ; GFX9: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[UADDSAT1]], [[C3]](s16) + ; GFX9: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 ; GFX9: [[COPY2:%[0-9]+]]:_(s16) = COPY [[LSHR6]](s16) - ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C5]] + ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C4]] ; GFX9: [[COPY3:%[0-9]+]]:_(s16) = COPY [[LSHR7]](s16) - ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[COPY3]], [[C5]] + ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[COPY3]], [[C4]] ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C3]](s16) ; GFX9: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL4]] ; GFX9: 
[[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) @@ -286,22 +264,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX8: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; GFX8: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX8: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; GFX8: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C]] - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[TRUNC1]] - ; GFX8: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[UMIN]] - ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ADD]](s16) + ; GFX8: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC]], [[TRUNC1]] + ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UADDSAT]](s16) ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: uaddsat_s16 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; GFX9: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C]] - ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[TRUNC1]] - ; GFX9: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[UMIN]] - ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ADD]](s16) + ; GFX9: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC]], [[TRUNC1]] + ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UADDSAT]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -364,15 +336,10 @@ body: | ; GFX8: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX8: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX8: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; GFX8: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C1]] - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[TRUNC2]] - ; GFX8: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[UMIN]] - ; GFX8: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C1]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[XOR1]], [[TRUNC3]] - ; GFX8: [[ADD1:%[0-9]+]]:_(s16) = G_ADD [[TRUNC1]], [[UMIN1]] - ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ADD]](s16) - ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ADD1]](s16) + ; GFX8: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC]], [[TRUNC2]] + ; GFX8: [[UADDSAT1:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC1]], [[TRUNC3]] + ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT]](s16) + ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT1]](s16) ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) @@ -380,12 +347,8 @@ body: | ; GFX9-LABEL: name: uaddsat_v2s16 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) - ; GFX9: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY]], [[BUILD_VECTOR_TRUNC]] - ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[XOR]], [[COPY1]] - ; GFX9: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[COPY]], [[UMIN]] - ; GFX9: $vgpr0 = COPY [[ADD]](<2 x s16>) + ; GFX9: [[UADDSAT:%[0-9]+]]:_(<2 x s16>) = G_UADDSAT [[COPY]], [[COPY1]] + ; GFX9: $vgpr0 = COPY [[UADDSAT]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_UADDSAT %0, %1 @@ -482,24 +445,17 @@ body: | ; GFX8: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x 
s16>) ; GFX8: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX8: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX8: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; GFX8: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C1]] - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[TRUNC3]] - ; GFX8: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[UMIN]] - ; GFX8: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C1]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[XOR1]], [[TRUNC4]] - ; GFX8: [[ADD1:%[0-9]+]]:_(s16) = G_ADD [[TRUNC1]], [[UMIN1]] - ; GFX8: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C1]] - ; GFX8: [[UMIN2:%[0-9]+]]:_(s16) = G_UMIN [[XOR2]], [[TRUNC5]] - ; GFX8: [[ADD2:%[0-9]+]]:_(s16) = G_ADD [[TRUNC2]], [[UMIN2]] - ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ADD]](s16) - ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ADD1]](s16) + ; GFX8: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC]], [[TRUNC3]] + ; GFX8: [[UADDSAT1:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC1]], [[TRUNC4]] + ; GFX8: [[UADDSAT2:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC2]], [[TRUNC5]] + ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT]](s16) + ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT1]](s16) ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX8: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ADD2]](s16) - ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX8: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) + ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT2]](s16) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; GFX8: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; GFX8: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX8: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) @@ -536,16 +492,9 @@ body: | ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32) ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[DEF1]](s32) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC4]] - ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[XOR]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR_TRUNC]], [[UMIN]] - ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC5]] - ; GFX9: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[XOR1]], [[BUILD_VECTOR_TRUNC3]] - ; GFX9: [[ADD1:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR_TRUNC1]], [[UMIN1]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[ADD]](<2 x s16>), [[ADD1]](<2 x s16>), [[DEF2]](<2 x s16>) + ; GFX9: [[UADDSAT:%[0-9]+]]:_(<2 x s16>) = G_UADDSAT [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] + ; GFX9: [[UADDSAT1:%[0-9]+]]:_(<2 x s16>) = G_UADDSAT [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[UADDSAT]](<2 x s16>), [[UADDSAT1]](<2 x s16>), [[DEF2]](<2 x s16>) ; GFX9: 
[[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>) @@ -650,26 +599,17 @@ body: | ; GFX8: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX8: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; GFX8: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) - ; GFX8: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; GFX8: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C1]] - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[TRUNC4]] - ; GFX8: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[UMIN]] - ; GFX8: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C1]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[XOR1]], [[TRUNC5]] - ; GFX8: [[ADD1:%[0-9]+]]:_(s16) = G_ADD [[TRUNC1]], [[UMIN1]] - ; GFX8: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C1]] - ; GFX8: [[UMIN2:%[0-9]+]]:_(s16) = G_UMIN [[XOR2]], [[TRUNC6]] - ; GFX8: [[ADD2:%[0-9]+]]:_(s16) = G_ADD [[TRUNC2]], [[UMIN2]] - ; GFX8: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC3]], [[C1]] - ; GFX8: [[UMIN3:%[0-9]+]]:_(s16) = G_UMIN [[XOR3]], [[TRUNC7]] - ; GFX8: [[ADD3:%[0-9]+]]:_(s16) = G_ADD [[TRUNC3]], [[UMIN3]] - ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ADD]](s16) - ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ADD1]](s16) + ; GFX8: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC]], [[TRUNC4]] + ; GFX8: [[UADDSAT1:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC1]], [[TRUNC5]] + ; GFX8: [[UADDSAT2:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC2]], [[TRUNC6]] + ; GFX8: [[UADDSAT3:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC3]], [[TRUNC7]] + ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT]](s16) + ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT1]](s16) ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX8: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ADD2]](s16) - ; GFX8: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ADD3]](s16) + ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT2]](s16) + ; GFX8: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT3]](s16) ; GFX8: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32) ; GFX8: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; GFX8: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) @@ -680,16 +620,9 @@ body: | ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) - ; GFX9: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV]], [[BUILD_VECTOR_TRUNC]] - ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[XOR]], [[UV2]] - ; GFX9: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[UV]], [[UMIN]] - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) - ; GFX9: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV1]], [[BUILD_VECTOR_TRUNC1]] - ; GFX9: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[XOR1]], [[UV3]] - ; GFX9: [[ADD1:%[0-9]+]]:_(<2 x s16>) = G_ADD [[UV1]], [[UMIN1]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[ADD]](<2 x s16>), [[ADD1]](<2 x s16>) + ; GFX9: [[UADDSAT:%[0-9]+]]:_(<2 x 
s16>) = G_UADDSAT [[UV]], [[UV2]] + ; GFX9: [[UADDSAT1:%[0-9]+]]:_(<2 x s16>) = G_UADDSAT [[UV1]], [[UV3]] + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UADDSAT]](<2 x s16>), [[UADDSAT1]](<2 x s16>) ; GFX9: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 @@ -714,19 +647,13 @@ body: | ; GFX8-LABEL: name: uaddsat_s32 ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX8: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY]], [[C]] - ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[COPY1]] - ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[UMIN]] - ; GFX8: $vgpr0 = COPY [[ADD]](s32) + ; GFX8: [[UADDSAT:%[0-9]+]]:_(s32) = G_UADDSAT [[COPY]], [[COPY1]] + ; GFX8: $vgpr0 = COPY [[UADDSAT]](s32) ; GFX9-LABEL: name: uaddsat_s32 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY]], [[C]] - ; GFX9: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[COPY1]] - ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[UMIN]] - ; GFX9: $vgpr0 = COPY [[ADD]](s32) + ; GFX9: [[UADDSAT:%[0-9]+]]:_(s32) = G_UADDSAT [[COPY]], [[COPY1]] + ; GFX9: $vgpr0 = COPY [[UADDSAT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = G_UADDSAT %0, %1 @@ -758,28 +685,18 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX8: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV]], [[C]] - ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[UV2]] - ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV]], [[UMIN]] - ; GFX8: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV1]], [[C]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[XOR1]], [[UV3]] - ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UV1]], [[UMIN1]] - ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ADD]](s32), [[ADD1]](s32) + ; GFX8: [[UADDSAT:%[0-9]+]]:_(s32) = G_UADDSAT [[UV]], [[UV2]] + ; GFX8: [[UADDSAT1:%[0-9]+]]:_(s32) = G_UADDSAT [[UV1]], [[UV3]] + ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UADDSAT]](s32), [[UADDSAT1]](s32) ; GFX8: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: uaddsat_v2s32 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV]], [[C]] - ; GFX9: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[UV2]] - ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV]], [[UMIN]] - ; GFX9: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV1]], [[C]] - ; GFX9: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[XOR1]], [[UV3]] - ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UV1]], [[UMIN1]] - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ADD]](s32), [[ADD1]](s32) + ; GFX9: [[UADDSAT:%[0-9]+]]:_(s32) = G_UADDSAT [[UV]], [[UV2]] + ; GFX9: [[UADDSAT1:%[0-9]+]]:_(s32) = G_UADDSAT [[UV1]], [[UV3]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = 
G_BUILD_VECTOR [[UADDSAT]](s32), [[UADDSAT1]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir index 356bb38456ea6..5bb430cf4a062 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir @@ -30,9 +30,8 @@ body: | ; GFX8: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 9 ; GFX8: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX8: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[SHL]], [[SHL1]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[UMIN]] - ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[SUB]], [[C]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL]], [[SHL1]] + ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT]], [[C]](s16) ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16) ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: usubsat_s7 @@ -43,9 +42,8 @@ body: | ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 9 ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[SHL]], [[SHL1]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[UMIN]] - ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[SUB]], [[C]](s16) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL]], [[SHL1]] + ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT]], [[C]](s16) ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 @@ -84,9 +82,8 @@ body: | ; GFX8: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX8: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX8: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[SHL]], [[SHL1]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[UMIN]] - ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[SUB]], [[C]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL]], [[SHL1]] + ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT]], [[C]](s16) ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16) ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: usubsat_s8 @@ -97,9 +94,8 @@ body: | ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[SHL]], [[SHL1]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[UMIN]] - ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[SUB]], [[C]](s16) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL]], [[SHL1]] + ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT]], [[C]](s16) ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 @@ -172,16 +168,14 @@ body: | ; GFX8: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX8: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16) ; GFX8: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[SHL]], [[SHL1]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[UMIN]] - ; GFX8: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[SUB]], [[C3]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL]], [[SHL1]] + ; GFX8: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT]], 
[[C3]](s16) ; GFX8: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX8: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX8: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C3]](s16) ; GFX8: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C3]](s16) - ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[SHL2]], [[SHL3]] - ; GFX8: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[SHL2]], [[UMIN1]] - ; GFX8: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[SUB1]], [[C3]](s16) + ; GFX8: [[USUBSAT1:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL2]], [[SHL3]] + ; GFX8: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT1]], [[C3]](s16) ; GFX8: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 ; GFX8: [[COPY2:%[0-9]+]]:_(s16) = COPY [[LSHR6]](s16) ; GFX8: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C4]] @@ -208,16 +202,14 @@ body: | ; GFX9: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16) ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16) - ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[SHL]], [[SHL1]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[UMIN]] - ; GFX9: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[SUB]], [[C3]](s16) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL]], [[SHL1]] + ; GFX9: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT]], [[C3]](s16) ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C3]](s16) ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C3]](s16) - ; GFX9: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[SHL2]], [[SHL3]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[SHL2]], [[UMIN1]] - ; GFX9: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[SUB1]], [[C3]](s16) + ; GFX9: [[USUBSAT1:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL2]], [[SHL3]] + ; GFX9: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT1]], [[C3]](s16) ; GFX9: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 ; GFX9: [[COPY2:%[0-9]+]]:_(s16) = COPY [[LSHR6]](s16) ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C4]] @@ -263,18 +255,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX8: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; GFX8: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC1]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[TRUNC]], [[UMIN]] - ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SUB]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC]], [[TRUNC1]] + ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[USUBSAT]](s16) ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: usubsat_s16 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC1]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[TRUNC]], [[UMIN]] - ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SUB]](s16) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC]], [[TRUNC1]] + ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[USUBSAT]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -334,12 +324,10 @@ body: | ; GFX8: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX8: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC2]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[TRUNC]], 
[[UMIN]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC1]], [[TRUNC3]] - ; GFX8: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[TRUNC1]], [[UMIN1]] - ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[SUB]](s16) - ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[SUB1]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC]], [[TRUNC2]] + ; GFX8: [[USUBSAT1:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC1]], [[TRUNC3]] + ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT]](s16) + ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT1]](s16) ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) @@ -347,9 +335,8 @@ body: | ; GFX9-LABEL: name: usubsat_v2s16 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[COPY]], [[COPY1]] - ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[COPY]], [[UMIN]] - ; GFX9: $vgpr0 = COPY [[SUB]](<2 x s16>) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[COPY]], [[COPY1]] + ; GFX9: $vgpr0 = COPY [[USUBSAT]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_USUBSAT %0, %1 @@ -442,18 +429,15 @@ body: | ; GFX8: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; GFX8: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX8: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC3]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[TRUNC]], [[UMIN]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC1]], [[TRUNC4]] - ; GFX8: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[TRUNC1]], [[UMIN1]] - ; GFX8: [[UMIN2:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC2]], [[TRUNC5]] - ; GFX8: [[SUB2:%[0-9]+]]:_(s16) = G_SUB [[TRUNC2]], [[UMIN2]] - ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[SUB]](s16) - ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[SUB1]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC]], [[TRUNC3]] + ; GFX8: [[USUBSAT1:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC1]], [[TRUNC4]] + ; GFX8: [[USUBSAT2:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC2]], [[TRUNC5]] + ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT]](s16) + ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT1]](s16) ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX8: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[SUB2]](s16) + ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT2]](s16) ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; GFX8: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] @@ -492,11 +476,9 @@ body: | ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32) ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[DEF1]](s32) - ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC]], [[UMIN]] - ; GFX9: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] - ; GFX9: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC1]], [[UMIN1]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[SUB]](<2 x s16>), 
[[SUB1]](<2 x s16>), [[DEF2]](<2 x s16>) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] + ; GFX9: [[USUBSAT1:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[USUBSAT]](<2 x s16>), [[USUBSAT1]](<2 x s16>), [[DEF2]](<2 x s16>) ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>) @@ -596,21 +578,17 @@ body: | ; GFX8: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX8: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; GFX8: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC4]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[TRUNC]], [[UMIN]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC1]], [[TRUNC5]] - ; GFX8: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[TRUNC1]], [[UMIN1]] - ; GFX8: [[UMIN2:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC2]], [[TRUNC6]] - ; GFX8: [[SUB2:%[0-9]+]]:_(s16) = G_SUB [[TRUNC2]], [[UMIN2]] - ; GFX8: [[UMIN3:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC3]], [[TRUNC7]] - ; GFX8: [[SUB3:%[0-9]+]]:_(s16) = G_SUB [[TRUNC3]], [[UMIN3]] - ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[SUB]](s16) - ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[SUB1]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC]], [[TRUNC4]] + ; GFX8: [[USUBSAT1:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC1]], [[TRUNC5]] + ; GFX8: [[USUBSAT2:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC2]], [[TRUNC6]] + ; GFX8: [[USUBSAT3:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC3]], [[TRUNC7]] + ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT]](s16) + ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT1]](s16) ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX8: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[SUB2]](s16) - ; GFX8: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[SUB3]](s16) + ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT2]](s16) + ; GFX8: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT3]](s16) ; GFX8: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32) ; GFX8: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; GFX8: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) @@ -621,11 +599,9 @@ body: | ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) - ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV]], [[UV2]] - ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[UV]], [[UMIN]] - ; GFX9: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV1]], [[UV3]] - ; GFX9: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[UV1]], [[UMIN1]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SUB]](<2 x s16>), [[SUB1]](<2 x s16>) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[UV]], [[UV2]] + ; GFX9: [[USUBSAT1:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[UV1]], [[UV3]] + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[USUBSAT]](<2 x s16>), [[USUBSAT1]](<2 x s16>) ; GFX9: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY 
$vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 @@ -648,15 +624,13 @@ body: | ; GFX8-LABEL: name: usubsat_s32 ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[COPY]], [[COPY1]] - ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[UMIN]] - ; GFX8: $vgpr0 = COPY [[SUB]](s32) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s32) = G_USUBSAT [[COPY]], [[COPY1]] + ; GFX8: $vgpr0 = COPY [[USUBSAT]](s32) ; GFX9-LABEL: name: usubsat_s32 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[COPY]], [[COPY1]] - ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[UMIN]] - ; GFX9: $vgpr0 = COPY [[SUB]](s32) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(s32) = G_USUBSAT [[COPY]], [[COPY1]] + ; GFX9: $vgpr0 = COPY [[USUBSAT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = G_USUBSAT %0, %1 @@ -685,22 +659,18 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[UV]], [[UV2]] - ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV]], [[UMIN]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[UV1]], [[UV3]] - ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV1]], [[UMIN1]] - ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SUB]](s32), [[SUB1]](s32) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s32) = G_USUBSAT [[UV]], [[UV2]] + ; GFX8: [[USUBSAT1:%[0-9]+]]:_(s32) = G_USUBSAT [[UV1]], [[UV3]] + ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[USUBSAT]](s32), [[USUBSAT1]](s32) ; GFX8: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: usubsat_v2s32 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX9: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[UV]], [[UV2]] - ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV]], [[UMIN]] - ; GFX9: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[UV1]], [[UV3]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV1]], [[UMIN1]] - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SUB]](s32), [[SUB1]](s32) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(s32) = G_USUBSAT [[UV]], [[UV2]] + ; GFX9: [[USUBSAT1:%[0-9]+]]:_(s32) = G_USUBSAT [[UV1]], [[UV3]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[USUBSAT]](s32), [[USUBSAT1]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index ba672883fa562..7b88123b2c9b9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -39,14 +39,8 @@ define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0 -; GFX9-NEXT: v_min_i16_e32 v3, 0, v0 -; GFX9-NEXT: v_max_i16_e32 v2, 0, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1 -; GFX9-NEXT: v_sub_u16_e32 v3, 0x8000, v3 -; GFX9-NEXT: v_sub_u16_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_max_i16_e32 v1, v3, v1 -; 
GFX9-NEXT: v_min_i16_e32 v1, v1, v2 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -57,13 +51,7 @@ define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) { ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 9, v0 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 9, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_i16_e64 v2, v0, 0 -; GFX10-NEXT: v_max_i16_e64 v3, v0, 0 -; GFX10-NEXT: v_sub_nc_u16_e64 v2, 0x8000, v2 -; GFX10-NEXT: v_sub_nc_u16_e64 v3, 0x7fff, v3 -; GFX10-NEXT: v_max_i16_e64 v1, v2, v1 -; GFX10-NEXT: v_min_i16_e64 v1, v1, v3 -; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 +; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp ; GFX10-NEXT: v_ashrrev_i16_e64 v0, 9, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs) @@ -118,54 +106,23 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX9-LABEL: s_saddsat_i7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_sext_i32_i16 s3, s0 -; GFX9-NEXT: s_sext_i32_i16 s4, 0 -; GFX9-NEXT: s_cmp_gt_i32 s3, s4 -; GFX9-NEXT: s_cselect_b32 s5, s3, s4 -; GFX9-NEXT: s_sub_i32 s5, 0x7fff, s5 -; GFX9-NEXT: s_cmp_lt_i32 s3, s4 -; GFX9-NEXT: s_cselect_b32 s3, s3, s4 -; GFX9-NEXT: s_sub_i32 s3, 0xffff8000, s3 -; GFX9-NEXT: s_sext_i32_i16 s3, s3 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_cmp_gt_i32 s3, s1 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_sext_i32_i16 s3, s5 -; GFX9-NEXT: s_cmp_lt_i32 s1, s3 -; GFX9-NEXT: s_cselect_b32 s1, s1, s3 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GFX9-NEXT: s_ashr_i32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp +; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_saddsat_i7: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_bfe_u32 s2, 9, 0x100000 -; GFX10-NEXT: s_sext_i32_i16 s4, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_sext_i32_i16 s3, s0 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cmp_gt_i32 s3, s4 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s5, s3, s4 -; GFX10-NEXT: s_sub_i32 s5, 0x7fff, s5 -; GFX10-NEXT: s_cmp_lt_i32 s3, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s4 -; GFX10-NEXT: s_sub_i32 s3, 0xffff8000, s3 -; GFX10-NEXT: s_sext_i32_i16 s3, s3 -; GFX10-NEXT: s_cmp_gt_i32 s3, s1 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_sext_i32_i16 s3, s5 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cmp_lt_i32 s1, s3 -; GFX10-NEXT: s_cselect_b32 s1, s1, s3 -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: s_sext_i32_i16 s0, s0 -; GFX10-NEXT: s_ashr_i32 s0, s0, s2 +; GFX10-NEXT: v_add_nc_i16 v0, s0, s1 clamp +; GFX10-NEXT: v_ashrrev_i16_e64 v0, 9, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs) ret i7 %result @@ -206,14 +163,8 @@ define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: v_min_i16_e32 v3, 0, v0 -; GFX9-NEXT: v_max_i16_e32 v2, 0, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_sub_u16_e32 v3, 
0x8000, v3 -; GFX9-NEXT: v_sub_u16_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_max_i16_e32 v1, v3, v1 -; GFX9-NEXT: v_min_i16_e32 v1, v1, v2 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -224,13 +175,7 @@ define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) { ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_i16_e64 v2, v0, 0 -; GFX10-NEXT: v_max_i16_e64 v3, v0, 0 -; GFX10-NEXT: v_sub_nc_u16_e64 v2, 0x8000, v2 -; GFX10-NEXT: v_sub_nc_u16_e64 v3, 0x7fff, v3 -; GFX10-NEXT: v_max_i16_e64 v1, v2, v1 -; GFX10-NEXT: v_min_i16_e64 v1, v1, v3 -; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 +; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp ; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs) @@ -285,54 +230,23 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX9-LABEL: s_saddsat_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_sext_i32_i16 s3, s0 -; GFX9-NEXT: s_sext_i32_i16 s4, 0 -; GFX9-NEXT: s_cmp_gt_i32 s3, s4 -; GFX9-NEXT: s_cselect_b32 s5, s3, s4 -; GFX9-NEXT: s_sub_i32 s5, 0x7fff, s5 -; GFX9-NEXT: s_cmp_lt_i32 s3, s4 -; GFX9-NEXT: s_cselect_b32 s3, s3, s4 -; GFX9-NEXT: s_sub_i32 s3, 0xffff8000, s3 -; GFX9-NEXT: s_sext_i32_i16 s3, s3 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_cmp_gt_i32 s3, s1 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_sext_i32_i16 s3, s5 -; GFX9-NEXT: s_cmp_lt_i32 s1, s3 -; GFX9-NEXT: s_cselect_b32 s1, s1, s3 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GFX9-NEXT: s_ashr_i32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp +; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_saddsat_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX10-NEXT: s_sext_i32_i16 s4, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_sext_i32_i16 s3, s0 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cmp_gt_i32 s3, s4 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s5, s3, s4 -; GFX10-NEXT: s_sub_i32 s5, 0x7fff, s5 -; GFX10-NEXT: s_cmp_lt_i32 s3, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s4 -; GFX10-NEXT: s_sub_i32 s3, 0xffff8000, s3 -; GFX10-NEXT: s_sext_i32_i16 s3, s3 -; GFX10-NEXT: s_cmp_gt_i32 s3, s1 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_sext_i32_i16 s3, s5 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cmp_lt_i32 s1, s3 -; GFX10-NEXT: s_cselect_b32 s1, s1, s3 -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: s_sext_i32_i16 s0, s0 -; GFX10-NEXT: s_ashr_i32 s0, s0, s2 +; GFX10-NEXT: v_add_nc_i16 v0, s0, s1 clamp +; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result @@ -408,26 +322,12 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s4, 8 ; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_movk_i32 s5, 0x8000 -; GFX9-NEXT: v_min_i16_e32 v5, 0, v0 ; GFX9-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_sub_u16_e32 v5, s5, v5 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_max_i16_e32 v4, 0, v0 -; GFX9-NEXT: v_sub_u16_e32 v4, s4, v4 -; GFX9-NEXT: v_max_i16_e32 v1, v5, v1 -; GFX9-NEXT: v_min_i16_e32 v1, v1, v4 -; GFX9-NEXT: v_min_i16_e32 v4, 0, v2 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_max_i16_e32 v1, 0, v2 -; GFX9-NEXT: v_sub_u16_e32 v4, s5, v4 -; GFX9-NEXT: v_sub_u16_e32 v1, s4, v1 -; GFX9-NEXT: v_max_i16_e32 v3, v4, v3 -; GFX9-NEXT: v_min_i16_e32 v1, v3, v1 +; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_add_u16_e32 v1, v2, v1 +; GFX9-NEXT: v_add_i16 v1, v2, v3 clamp ; GFX9-NEXT: v_and_b32_sdwa v0, sext(v0), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v1, sext(v1), s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 @@ -438,31 +338,17 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_lshlrev_b16_e64 v2, 8, v0 -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_movk_i32 s5, 0x8000 -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1 -; GFX10-NEXT: v_min_i16_e64 v4, v2, 0 -; GFX10-NEXT: v_min_i16_e64 v5, v0, 0 -; GFX10-NEXT: v_max_i16_e64 v6, v2, 0 -; GFX10-NEXT: v_max_i16_e64 v7, v0, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x7fff -; GFX10-NEXT: v_sub_nc_u16_e64 v4, s5, v4 -; GFX10-NEXT: v_sub_nc_u16_e64 v5, s5, v5 -; GFX10-NEXT: v_sub_nc_u16_e64 v6, s4, v6 -; GFX10-NEXT: v_sub_nc_u16_e64 v7, s4, v7 +; GFX10-NEXT: v_lshlrev_b16_e64 v3, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0 ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_max_i16_e64 v1, v4, v1 -; GFX10-NEXT: v_max_i16_e64 v10, v5, v3 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_i16_e64 v1, v1, v6 -; GFX10-NEXT: v_min_i16_e64 v3, v10, v7 -; GFX10-NEXT: v_add_nc_u16_e64 v1, v2, v1 -; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v3 -; GFX10-NEXT: v_and_b32_sdwa v1, sext(v1), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_add_nc_i16 v1, v2, v1 clamp +; GFX10-NEXT: v_add_nc_i16 v0, v0, v3 clamp +; GFX10-NEXT: v_and_b32_sdwa v1, sext(v1), s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i16 %lhs.arg to <2 x i8> %rhs = bitcast i16 %rhs.arg to 
<2 x i8> @@ -571,112 +457,40 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-LABEL: s_saddsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_bfe_u32 s4, 8, 0x100000 -; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 ; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_sext_i32_i16 s7, s0 -; GFX9-NEXT: s_sext_i32_i16 s8, 0 -; GFX9-NEXT: s_cmp_gt_i32 s7, s8 -; GFX9-NEXT: s_movk_i32 s5, 0x7fff -; GFX9-NEXT: s_cselect_b32 s9, s7, s8 -; GFX9-NEXT: s_sub_i32 s9, s5, s9 -; GFX9-NEXT: s_cmp_lt_i32 s7, s8 -; GFX9-NEXT: s_movk_i32 s6, 0x8000 -; GFX9-NEXT: s_cselect_b32 s7, s7, s8 -; GFX9-NEXT: s_sub_i32 s7, s6, s7 -; GFX9-NEXT: s_sext_i32_i16 s7, s7 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_cmp_gt_i32 s7, s1 -; GFX9-NEXT: s_cselect_b32 s1, s7, s1 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_sext_i32_i16 s7, s9 -; GFX9-NEXT: s_cmp_lt_i32 s1, s7 -; GFX9-NEXT: s_cselect_b32 s1, s1, s7 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GFX9-NEXT: s_lshl_b32 s1, s2, s4 -; GFX9-NEXT: s_lshl_b32 s2, s3, s4 -; GFX9-NEXT: s_ashr_i32 s0, s0, s4 -; GFX9-NEXT: s_sext_i32_i16 s3, s1 -; GFX9-NEXT: s_cmp_gt_i32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s7, s3, s8 -; GFX9-NEXT: s_sub_i32 s5, s5, s7 -; GFX9-NEXT: s_cmp_lt_i32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s3, s3, s8 -; GFX9-NEXT: s_sub_i32 s3, s6, s3 -; GFX9-NEXT: s_sext_i32_i16 s3, s3 -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: s_cmp_gt_i32 s3, s2 -; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: s_sext_i32_i16 s3, s5 -; GFX9-NEXT: s_cmp_lt_i32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s2, s3 -; GFX9-NEXT: s_add_i32 s1, s1, s2 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_movk_i32 s2, 0xff -; GFX9-NEXT: s_ashr_i32 s1, s1, s4 -; GFX9-NEXT: s_and_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s4 +; GFX9-NEXT: s_lshl_b32 s1, s3, s4 +; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp +; GFX9-NEXT: s_lshl_b32 s0, s2, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_i16 v1, s0, v1 clamp +; GFX9-NEXT: s_movk_i32 s0, 0xff +; GFX9-NEXT: v_and_b32_sdwa v0, sext(v0), s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, sext(v1), s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_saddsat_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_sext_i32_i16 s6, 0 -; GFX10-NEXT: s_sext_i32_i16 s5, s0 +; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_bfe_u32 s3, 8, 0x100000 ; GFX10-NEXT: s_lshr_b32 s4, s1, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_cmp_gt_i32 s5, s6 -; GFX10-NEXT: s_movk_i32 s7, 0x7fff -; GFX10-NEXT: s_cselect_b32 s8, s5, s6 -; GFX10-NEXT: s_movk_i32 s9, 0x8000 -; GFX10-NEXT: s_sub_i32 s8, s7, s8 -; GFX10-NEXT: s_cmp_lt_i32 s5, s6 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cselect_b32 s5, s5, s6 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, s3 +; GFX10-NEXT: s_lshl_b32 s3, s4, s3 +; GFX10-NEXT: v_add_nc_i16 v0, s0, s1 clamp +; 
GFX10-NEXT: v_add_nc_i16 v1, s2, s3 clamp +; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_sub_i32 s5, s9, s5 -; GFX10-NEXT: s_sext_i32_i16 s5, s5 -; GFX10-NEXT: s_cmp_gt_i32 s5, s1 -; GFX10-NEXT: s_cselect_b32 s1, s5, s1 -; GFX10-NEXT: s_sext_i32_i16 s5, s8 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cmp_lt_i32 s1, s5 -; GFX10-NEXT: s_cselect_b32 s1, s1, s5 -; GFX10-NEXT: s_lshl_b32 s3, s3, s2 -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s4, s2 -; GFX10-NEXT: s_sext_i32_i16 s0, s0 -; GFX10-NEXT: s_sext_i32_i16 s4, s3 -; GFX10-NEXT: s_ashr_i32 s0, s0, s2 -; GFX10-NEXT: s_cmp_gt_i32 s4, s6 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cselect_b32 s5, s4, s6 -; GFX10-NEXT: s_sub_i32 s5, s7, s5 -; GFX10-NEXT: s_cmp_lt_i32 s4, s6 -; GFX10-NEXT: s_cselect_b32 s4, s4, s6 -; GFX10-NEXT: s_sub_i32 s4, s9, s4 -; GFX10-NEXT: s_sext_i32_i16 s4, s4 -; GFX10-NEXT: s_cmp_gt_i32 s4, s1 -; GFX10-NEXT: s_cselect_b32 s1, s4, s1 -; GFX10-NEXT: s_sext_i32_i16 s4, s5 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cmp_lt_i32 s1, s4 -; GFX10-NEXT: s_cselect_b32 s1, s1, s4 -; GFX10-NEXT: s_add_i32 s3, s3, s1 -; GFX10-NEXT: s_sext_i32_i16 s1, s3 -; GFX10-NEXT: s_movk_i32 s3, 0xff -; GFX10-NEXT: s_ashr_i32 s1, s1, s2 -; GFX10-NEXT: s_and_b32 s0, s0, s3 -; GFX10-NEXT: s_and_b32 s1, s1, s3 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, sext(v1), s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %lhs = bitcast i16 %lhs.arg to <2 x i8> %rhs = bitcast i16 %rhs.arg to <2 x i8> @@ -815,52 +629,25 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s4, 8 ; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_movk_i32 s5, 0x8000 -; GFX9-NEXT: v_min_i16_e32 v10, 0, v0 -; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_sub_u16_e32 v10, s5, v10 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_max_i16_e32 v8, 0, v0 -; GFX9-NEXT: v_sub_u16_e32 v8, s4, v8 -; GFX9-NEXT: v_max_i16_e32 v1, v10, v1 -; GFX9-NEXT: v_min_i16_e32 v1, v1, v8 -; GFX9-NEXT: v_min_i16_e32 v8, 0, v2 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_max_i16_e32 v1, 0, v2 -; GFX9-NEXT: v_sub_u16_e32 v8, s5, v8 -; GFX9-NEXT: v_sub_u16_e32 v1, s4, v1 -; GFX9-NEXT: v_max_i16_e32 v5, v8, v5 -; GFX9-NEXT: v_min_i16_e32 v1, v5, v1 -; GFX9-NEXT: v_add_u16_e32 v1, v2, v1 +; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp +; GFX9-NEXT: v_add_i16 v1, v2, v5 clamp ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX9-NEXT: v_min_i16_e32 v6, 0, v2 -; GFX9-NEXT: v_sub_u16_e32 v6, s5, v6 -; GFX9-NEXT: v_mov_b32_e32 v9, 
0x7fff -; GFX9-NEXT: v_max_i16_e32 v5, 0, v2 -; GFX9-NEXT: v_sub_u16_e32 v5, v9, v5 -; GFX9-NEXT: v_max_i16_e32 v3, v6, v3 -; GFX9-NEXT: v_min_i16_e32 v3, v3, v5 -; GFX9-NEXT: v_add_u16_e32 v2, v2, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v4 -; GFX9-NEXT: v_min_i16_e32 v6, 0, v3 -; GFX9-NEXT: v_max_i16_e32 v5, 0, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v7 -; GFX9-NEXT: v_sub_u16_e32 v6, 0x8000, v6 ; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_sub_u16_e32 v5, v9, v5 -; GFX9-NEXT: v_max_i16_e32 v4, v6, v4 ; GFX9-NEXT: v_and_b32_sdwa v1, sext(v1), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX9-NEXT: v_min_i16_e32 v4, v4, v5 +; GFX9-NEXT: v_add_i16 v2, v2, v3 clamp +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v7 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_add_u16_e32 v3, v3, v4 +; GFX9-NEXT: v_add_i16 v3, v3, v4 clamp ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: v_and_b32_sdwa v1, sext(v2), s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v2, sext(v3), s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD @@ -871,57 +658,30 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b16_e64 v4, 8, v0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: s_mov_b32 s5, 16 +; GFX10-NEXT: v_lshlrev_b16_e64 v5, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_min_i16_e64 v8, v4, 0 -; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_movk_i32 s5, 0x8000 -; GFX10-NEXT: v_min_i16_e64 v9, v2, 0 -; GFX10-NEXT: v_lshlrev_b16_e64 v7, 8, v1 -; GFX10-NEXT: v_sub_nc_u16_e64 v8, s5, v8 -; GFX10-NEXT: v_max_i16_e64 v10, v4, 0 -; GFX10-NEXT: s_mov_b32 s6, 24 -; GFX10-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_movk_i32 s4, 0x7fff -; GFX10-NEXT: v_sub_nc_u16_e64 v15, s5, v9 -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s6, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_max_i16_e64 v11, v2, 0 -; GFX10-NEXT: v_max_i16_e64 v7, v8, v7 -; GFX10-NEXT: v_sub_nc_u16_e64 v10, s4, v10 -; GFX10-NEXT: v_max_i16_e64 v5, v15, v5 -; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s6, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_sub_nc_u16_e64 v8, s4, v11 -; GFX10-NEXT: v_min_i16_e64 v11, v3, 0 -; GFX10-NEXT: v_min_i16_e64 v7, v7, v10 -; GFX10-NEXT: v_min_i16_e64 v10, v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v9, 0x7fff -; GFX10-NEXT: v_min_i16_e64 v5, v5, v8 -; GFX10-NEXT: v_sub_nc_u16_e64 v11, s5, v11 -; GFX10-NEXT: v_max_i16_e64 v8, v3, 0 -; GFX10-NEXT: v_sub_nc_u16_e64 v10, 0x8000, v10 -; GFX10-NEXT: v_max_i16_e64 v12, v0, 0 -; GFX10-NEXT: v_add_nc_u16_e64 v2, v2, v5 -; GFX10-NEXT: v_max_i16_e64 v6, v11, v6 -; GFX10-NEXT: v_sub_nc_u16_e64 v5, v9, v8 -; GFX10-NEXT: v_max_i16_e64 v1, v10, v1 -; GFX10-NEXT: v_sub_nc_u16_e64 v8, v9, v12 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_add_nc_u16_e64 v4, v4, v7 -; GFX10-NEXT: v_and_b32_sdwa v2, sext(v2), s4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_min_i16_e64 v5, v6, v5 -; GFX10-NEXT: v_min_i16_e64 v1, v1, v8 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16_e64 v6, 8, v1 +; GFX10-NEXT: s_mov_b32 s5, 16 +; GFX10-NEXT: s_mov_b32 s4, 24 +; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_nc_i16 v2, v2, v3 clamp +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: s_movk_i32 s5, 0xff +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, sext(v2), s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_add_nc_i16 v5, v5, v6 clamp +; GFX10-NEXT: v_add_nc_i16 v3, v4, v3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_ashrrev_i16_e64 v4, 8, v4 +; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX10-NEXT: v_add_nc_u16_e64 v3, v3, v5 -; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 -; GFX10-NEXT: v_and_or_b32 v1, v4, s4, v2 -; GFX10-NEXT: v_and_b32_sdwa v2, sext(v3), s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX10-NEXT: v_ashrrev_i16_e64 v4, 8, v5 +; GFX10-NEXT: v_and_b32_sdwa v1, sext(v3), s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v2, v4, s5, v2 +; GFX10-NEXT: v_or3_b32 v0, v2, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> @@ -1118,212 +878,70 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX9-LABEL: s_saddsat_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_bfe_u32 s8, 8, 0x100000 -; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshr_b32 s4, s0, 24 -; GFX9-NEXT: s_lshl_b32 s0, s0, s8 ; GFX9-NEXT: s_lshr_b32 s5, s1, 8 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, s8 -; GFX9-NEXT: s_sext_i32_i16 s11, s0 -; GFX9-NEXT: s_sext_i32_i16 s12, 0 -; GFX9-NEXT: s_cmp_gt_i32 s11, s12 -; GFX9-NEXT: s_movk_i32 s9, 0x7fff -; GFX9-NEXT: s_cselect_b32 s13, s11, s12 -; GFX9-NEXT: s_sub_i32 s13, s9, s13 -; GFX9-NEXT: s_cmp_lt_i32 s11, s12 -; GFX9-NEXT: s_movk_i32 s10, 0x8000 -; GFX9-NEXT: s_cselect_b32 s11, s11, s12 -; GFX9-NEXT: s_sub_i32 s11, s10, s11 -; GFX9-NEXT: s_sext_i32_i16 s11, s11 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_cmp_gt_i32 s11, s1 -; GFX9-NEXT: s_cselect_b32 s1, s11, s1 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_sext_i32_i16 s11, s13 -; GFX9-NEXT: s_cmp_lt_i32 s1, s11 -; GFX9-NEXT: s_cselect_b32 s1, s1, s11 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GFX9-NEXT: s_lshl_b32 s1, s2, s8 -; GFX9-NEXT: s_lshl_b32 s2, s5, s8 -; GFX9-NEXT: s_ashr_i32 s0, s0, s8 -; GFX9-NEXT: s_sext_i32_i16 s5, s1 -; GFX9-NEXT: s_cmp_gt_i32 s5, s12 -; GFX9-NEXT: s_cselect_b32 s11, s5, s12 -; GFX9-NEXT: 
s_sub_i32 s11, s9, s11
-; GFX9-NEXT: s_cmp_lt_i32 s5, s12
-; GFX9-NEXT: s_cselect_b32 s5, s5, s12
-; GFX9-NEXT: s_sub_i32 s5, s10, s5
-; GFX9-NEXT: s_sext_i32_i16 s5, s5
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: s_cmp_gt_i32 s5, s2
-; GFX9-NEXT: s_cselect_b32 s2, s5, s2
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: s_sext_i32_i16 s5, s11
-; GFX9-NEXT: s_cmp_lt_i32 s2, s5
-; GFX9-NEXT: s_cselect_b32 s2, s2, s5
-; GFX9-NEXT: s_add_i32 s1, s1, s2
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_lshl_b32 s2, s3, s8
-; GFX9-NEXT: s_lshl_b32 s3, s6, s8
-; GFX9-NEXT: s_ashr_i32 s1, s1, s8
-; GFX9-NEXT: s_sext_i32_i16 s5, s2
-; GFX9-NEXT: s_cmp_gt_i32 s5, s12
-; GFX9-NEXT: s_cselect_b32 s6, s5, s12
-; GFX9-NEXT: s_sub_i32 s6, s9, s6
-; GFX9-NEXT: s_cmp_lt_i32 s5, s12
-; GFX9-NEXT: s_cselect_b32 s5, s5, s12
-; GFX9-NEXT: s_sub_i32 s5, s10, s5
-; GFX9-NEXT: s_sext_i32_i16 s5, s5
-; GFX9-NEXT: s_sext_i32_i16 s3, s3
-; GFX9-NEXT: s_cmp_gt_i32 s5, s3
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
-; GFX9-NEXT: s_sext_i32_i16 s3, s3
-; GFX9-NEXT: s_sext_i32_i16 s5, s6
-; GFX9-NEXT: s_cmp_lt_i32 s3, s5
-; GFX9-NEXT: s_cselect_b32 s3, s3, s5
-; GFX9-NEXT: s_add_i32 s2, s2, s3
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: s_lshl_b32 s3, s4, s8
-; GFX9-NEXT: s_lshl_b32 s4, s7, s8
-; GFX9-NEXT: s_ashr_i32 s2, s2, s8
-; GFX9-NEXT: s_sext_i32_i16 s5, s3
-; GFX9-NEXT: s_cmp_gt_i32 s5, s12
-; GFX9-NEXT: s_cselect_b32 s6, s5, s12
-; GFX9-NEXT: s_sub_i32 s6, s9, s6
-; GFX9-NEXT: s_cmp_lt_i32 s5, s12
-; GFX9-NEXT: s_cselect_b32 s5, s5, s12
-; GFX9-NEXT: s_sub_i32 s5, s10, s5
-; GFX9-NEXT: s_sext_i32_i16 s5, s5
-; GFX9-NEXT: s_sext_i32_i16 s4, s4
-; GFX9-NEXT: s_cmp_gt_i32 s5, s4
-; GFX9-NEXT: s_cselect_b32 s4, s5, s4
-; GFX9-NEXT: s_sext_i32_i16 s4, s4
-; GFX9-NEXT: s_sext_i32_i16 s5, s6
-; GFX9-NEXT: s_cmp_lt_i32 s4, s5
-; GFX9-NEXT: s_cselect_b32 s4, s4, s5
-; GFX9-NEXT: s_add_i32 s3, s3, s4
-; GFX9-NEXT: s_movk_i32 s4, 0xff
-; GFX9-NEXT: s_and_b32 s1, s1, s4
-; GFX9-NEXT: s_and_b32 s0, s0, s4
-; GFX9-NEXT: s_lshl_b32 s1, s1, 8
-; GFX9-NEXT: s_sext_i32_i16 s3, s3
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: s_and_b32 s1, s2, s4
-; GFX9-NEXT: s_lshl_b32 s1, s1, 16
-; GFX9-NEXT: s_ashr_i32 s3, s3, s8
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: s_and_b32 s1, s3, s4
-; GFX9-NEXT: s_lshl_b32 s1, s1, 24
-; GFX9-NEXT: s_or_b32 s0, s0, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: s_lshl_b32 s1, s5, s8
+; GFX9-NEXT: s_lshr_b32 s2, s0, 8
+; GFX9-NEXT: s_lshr_b32 s3, s0, 16
+; GFX9-NEXT: s_lshr_b32 s4, s0, 24
+; GFX9-NEXT: s_lshl_b32 s0, s0, s8
+; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_lshl_b32 s0, s2, s8
+; GFX9-NEXT: s_lshl_b32 s1, s6, s8
+; GFX9-NEXT: v_add_i16 v1, s0, v1 clamp
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: s_lshl_b32 s0, s3, s8
+; GFX9-NEXT: s_lshl_b32 s1, s7, s8
+; GFX9-NEXT: v_add_i16 v2, s0, v2 clamp
+; GFX9-NEXT: s_lshl_b32 s0, s4, s8
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_i16 v3, s0, v3 clamp
+; GFX9-NEXT: s_movk_i32 s0, 0xff
+; GFX9-NEXT: v_and_b32_sdwa v1, sext(v1), s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v1
+; GFX9-NEXT: v_and_b32_sdwa v1, sext(v2), s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v2, sext(v3), s0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_saddsat_v4i8:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_bfe_u32 s6, 8, 0x100000
+; GFX10-NEXT: s_bfe_u32 s5, 8, 0x100000
 ; GFX10-NEXT: s_lshr_b32 s2, s0, 8
+; GFX10-NEXT: s_lshr_b32 s6, s1, 8
+; GFX10-NEXT: s_lshl_b32 s2, s2, s5
+; GFX10-NEXT: s_lshl_b32 s6, s6, s5
 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16
+; GFX10-NEXT: v_add_nc_i16 v1, s2, s6 clamp
 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24
-; GFX10-NEXT: s_lshl_b32 s0, s0, s6
-; GFX10-NEXT: s_sext_i32_i16 s10, 0
-; GFX10-NEXT: s_sext_i32_i16 s9, s0
-; GFX10-NEXT: s_lshr_b32 s5, s1, 8
-; GFX10-NEXT: s_lshr_b32 s7, s1, 16
-; GFX10-NEXT: s_lshr_b32 s8, s1, 24
-; GFX10-NEXT: s_lshl_b32 s1, s1, s6
-; GFX10-NEXT: s_cmp_gt_i32 s9, s10
-; GFX10-NEXT: s_movk_i32 s11, 0x7fff
-; GFX10-NEXT: s_cselect_b32 s12, s9, s10
-; GFX10-NEXT: s_movk_i32 s13, 0x8000
-; GFX10-NEXT: s_sub_i32 s12, s11, s12
-; GFX10-NEXT: s_cmp_lt_i32 s9, s10
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cselect_b32 s9, s9, s10
+; GFX10-NEXT: s_movk_i32 s2, 0xff
+; GFX10-NEXT: s_lshl_b32 s0, s0, s5
+; GFX10-NEXT: s_lshl_b32 s7, s1, s5
+; GFX10-NEXT: v_and_b32_sdwa v1, sext(v1), s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_add_nc_i16 v0, s0, s7 clamp
+; GFX10-NEXT: s_lshr_b32 s0, s1, 16
+; GFX10-NEXT: s_lshr_b32 s1, s1, 24
+; GFX10-NEXT: s_lshl_b32 s3, s3, s5
+; GFX10-NEXT: s_lshl_b32 s0, s0, s5
+; GFX10-NEXT: s_lshl_b32 s4, s4, s5
+; GFX10-NEXT: s_lshl_b32 s1, s1, s5
+; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX10-NEXT: v_add_nc_i16 v2, s3, s0 clamp
+; GFX10-NEXT: v_add_nc_i16 v3, s4, s1 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_sub_i32 s9, s13, s9
-; GFX10-NEXT: s_sext_i32_i16 s9, s9
-; GFX10-NEXT: s_cmp_gt_i32 s9, s1
-; GFX10-NEXT: s_cselect_b32 s1, s9, s1
-; GFX10-NEXT: s_sext_i32_i16 s9, s12
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cmp_lt_i32 s1, s9
-; GFX10-NEXT: s_cselect_b32 s1, s1, s9
-; GFX10-NEXT: s_lshl_b32 s5, s5, s6
-; GFX10-NEXT: s_add_i32 s0, s0, s1
-; GFX10-NEXT: s_lshl_b32 s1, s2, s6
-; GFX10-NEXT: s_sext_i32_i16 s0, s0
-; GFX10-NEXT: s_sext_i32_i16 s2, s1
-; GFX10-NEXT: s_ashr_i32 s0, s0, s6
-; GFX10-NEXT: s_cmp_gt_i32 s2, s10
-; GFX10-NEXT: s_sext_i32_i16 s5, s5
-; GFX10-NEXT: s_cselect_b32 s9, s2, s10
-; GFX10-NEXT: s_sub_i32 s9, s11, s9
-; GFX10-NEXT: s_cmp_lt_i32 s2, s10
-; GFX10-NEXT: s_cselect_b32 s2, s2, s10
-; GFX10-NEXT: s_sub_i32 s2, s13, s2
-; GFX10-NEXT: s_sext_i32_i16 s2, s2
-; GFX10-NEXT: s_cmp_gt_i32 s2, s5
-; GFX10-NEXT: s_cselect_b32 s2, s2, s5
-; GFX10-NEXT: s_sext_i32_i16 s5, s9
-; GFX10-NEXT: s_sext_i32_i16 s2, s2
-; GFX10-NEXT: s_cmp_lt_i32 s2, s5
-; GFX10-NEXT: s_cselect_b32 s2, s2, s5
-; GFX10-NEXT: s_lshl_b32 s3, s3, s6
-; GFX10-NEXT: s_add_i32 s1, s1, s2
-; GFX10-NEXT: s_sext_i32_i16 s5, s3
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_lshl_b32 s2, s7, s6
-; GFX10-NEXT: s_ashr_i32 s1, s1, s6
-; GFX10-NEXT: s_cmp_gt_i32 s5, s10
-; GFX10-NEXT: s_sext_i32_i16 s2, s2
-; GFX10-NEXT: s_cselect_b32 s7, s5, s10
-; GFX10-NEXT: s_sub_i32 s7, s11, s7
-; GFX10-NEXT: s_cmp_lt_i32 s5, s10
-; GFX10-NEXT: s_cselect_b32 s5, s5, s10
-; GFX10-NEXT: s_sub_i32 s5, s13, s5
-; GFX10-NEXT: s_sext_i32_i16 s5, s5
-; GFX10-NEXT: s_cmp_gt_i32 s5, s2
-; GFX10-NEXT: s_cselect_b32 s2, s5, s2
-; GFX10-NEXT: s_sext_i32_i16 s5, s7
-; GFX10-NEXT: s_sext_i32_i16 s2, s2
-; GFX10-NEXT: s_cmp_lt_i32 s2, s5
-; GFX10-NEXT: s_cselect_b32 s2, s2, s5
-; GFX10-NEXT: s_lshl_b32 s4, s4, s6
-; GFX10-NEXT: s_add_i32 s3, s3, s2
-; GFX10-NEXT: s_sext_i32_i16 s5, s4
-; GFX10-NEXT: s_sext_i32_i16 s3, s3
-; GFX10-NEXT: s_lshl_b32 s2, s8, s6
-; GFX10-NEXT: s_ashr_i32 s3, s3, s6
-; GFX10-NEXT: s_cmp_gt_i32 s5, s10
-; GFX10-NEXT: s_sext_i32_i16 s2, s2
-; GFX10-NEXT: s_cselect_b32 s7, s5, s10
-; GFX10-NEXT: s_sub_i32 s7, s11, s7
-; GFX10-NEXT: s_cmp_lt_i32 s5, s10
-; GFX10-NEXT: s_cselect_b32 s5, s5, s10
-; GFX10-NEXT: s_sub_i32 s5, s13, s5
-; GFX10-NEXT: s_sext_i32_i16 s5, s5
-; GFX10-NEXT: s_cmp_gt_i32 s5, s2
-; GFX10-NEXT: s_cselect_b32 s2, s5, s2
-; GFX10-NEXT: s_sext_i32_i16 s5, s7
-; GFX10-NEXT: s_sext_i32_i16 s2, s2
-; GFX10-NEXT: s_movk_i32 s7, 0xff
-; GFX10-NEXT: s_cmp_lt_i32 s2, s5
-; GFX10-NEXT: s_cselect_b32 s2, s2, s5
-; GFX10-NEXT: s_and_b32 s1, s1, s7
-; GFX10-NEXT: s_add_i32 s4, s4, s2
-; GFX10-NEXT: s_and_b32 s2, s3, s7
-; GFX10-NEXT: s_sext_i32_i16 s3, s4
-; GFX10-NEXT: s_and_b32 s0, s0, s7
-; GFX10-NEXT: s_lshl_b32 s1, s1, 8
-; GFX10-NEXT: s_ashr_i32 s3, s3, s6
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_lshl_b32 s1, s2, 16
-; GFX10-NEXT: s_and_b32 s2, s3, s7
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_lshl_b32 s1, s2, 24
-; GFX10-NEXT: s_or_b32 s0, s0, s1
+; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v1
+; GFX10-NEXT: v_and_b32_sdwa v1, sext(v2), s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v2, sext(v3), s2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
 %lhs = bitcast i32 %lhs.arg to <4 x i8>
 %rhs = bitcast i32 %rhs.arg to <4 x i8>
@@ -1368,14 +986,8 @@ define i24 @v_saddsat_i24(i24 %lhs, i24 %rhs) {
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GFX9-NEXT: v_min_i32_e32 v3, 0, v0
-; GFX9-NEXT: v_max_i32_e32 v2, 0, v0
 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT: v_sub_u32_e32 v3, 0x80000000, v3
-; GFX9-NEXT: v_sub_u32_e32 v2, 0x7fffffff, v2
-; GFX9-NEXT: v_max_i32_e32 v1, v3, v1
-; GFX9-NEXT: v_min_i32_e32 v1, v1, v2
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT: v_add_i32 v0, v0, v1 clamp
 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1386,13 +998,7 @@ define i24 @v_saddsat_i24(i24 %lhs, i24 %rhs) {
 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_min_i32_e32 v2, 0, v0
-; GFX10-NEXT: v_max_i32_e32 v3, 0, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0x80000000, v2
-; GFX10-NEXT: v_sub_nc_u32_e32 v3, 0x7fffffff, v3
-; GFX10-NEXT: v_max_i32_e32 v1, v2, v1
-; GFX10-NEXT: v_min_i32_e32 v1, v1, v3
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GFX10-NEXT: v_add_nc_i32 v0, v0, v1 clamp
 ; GFX10-NEXT: v_ashrrev_i32_e32 v0, 8, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call i24 @llvm.sadd.sat.i24(i24 %lhs, i24 %rhs)
@@ -1439,39 +1045,22 @@ define amdgpu_ps i24 @s_saddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
 ;
 ; GFX9-LABEL: s_saddsat_i24:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl_b32 s0, s0, 8
 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8
-; GFX9-NEXT: s_cmp_gt_i32 s0, 0
-; GFX9-NEXT: s_cselect_b32 s2, s0, 0
-; GFX9-NEXT: s_sub_i32 s2, 0x7fffffff, s2
-; GFX9-NEXT: s_cmp_lt_i32 s0, 0
-; GFX9-NEXT: s_cselect_b32 s3, s0, 0
-; GFX9-NEXT: s_sub_i32 s3, 0x80000000, s3
-; GFX9-NEXT: s_cmp_gt_i32 s3, s1
-; GFX9-NEXT: s_cselect_b32 s1, s3, s1
-; GFX9-NEXT: s_cmp_lt_i32 s1, s2
-; GFX9-NEXT: s_cselect_b32 s1, s1, s2
-; GFX9-NEXT: s_add_i32 s0, s0, s1
-; GFX9-NEXT: s_ashr_i32 s0, s0, 8
+; GFX9-NEXT: s_lshl_b32 s0, s0, 8
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp
+; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_saddsat_i24:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_lshl_b32 s0, s0, 8
 ; GFX10-NEXT: s_lshl_b32 s1, s1, 8
-; GFX10-NEXT: s_cmp_gt_i32 s0, 0
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s2, s0, 0
-; GFX10-NEXT: s_sub_i32 s2, 0x7fffffff, s2
-; GFX10-NEXT: s_cmp_lt_i32 s0, 0
-; GFX10-NEXT: s_cselect_b32 s3, s0, 0
-; GFX10-NEXT: s_sub_i32 s3, 0x80000000, s3
-; GFX10-NEXT: s_cmp_gt_i32 s3, s1
-; GFX10-NEXT: s_cselect_b32 s1, s3, s1
-; GFX10-NEXT: s_cmp_lt_i32 s1, s2
-; GFX10-NEXT: s_cselect_b32 s1, s1, s2
-; GFX10-NEXT: s_add_i32 s0, s0, s1
-; GFX10-NEXT: s_ashr_i32 s0, s0, 8
+; GFX10-NEXT: v_add_nc_i32 v0, s0, s1 clamp
+; GFX10-NEXT: v_ashrrev_i32_e32 v0, 8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call i24 @llvm.sadd.sat.i24(i24 %lhs, i24 %rhs)
 ret i24 %result
@@ -1505,27 +1094,15 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX9-LABEL: v_saddsat_i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_i32_e32 v3, 0, v0
-; GFX9-NEXT: v_max_i32_e32 v2, 0, v0
-; GFX9-NEXT: v_sub_u32_e32 v3, 0x80000000, v3
-; GFX9-NEXT: v_sub_u32_e32 v2, 0x7fffffff, v2
-; GFX9-NEXT: v_max_i32_e32 v1, v3, v1
-; GFX9-NEXT: v_min_i32_e32 v1, v1, v2
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT: v_add_i32 v0, v0, v1 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_saddsat_i32:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_min_i32_e32 v2, 0, v0
-; GFX10-NEXT: v_max_i32_e32 v3, 0, v0
+; GFX10-NEXT: v_add_nc_i32 v0, v0, v1 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0x80000000, v2
-; GFX10-NEXT: v_sub_nc_u32_e32 v3, 0x7fffffff, v3
-; GFX10-NEXT: v_max_i32_e32 v1, v2, v1
-; GFX10-NEXT: v_min_i32_e32 v1, v1, v3
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
 ret i32 %result
@@ -1579,33 +1156,16 @@ define amdgpu_ps i32 @s_saddsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
 ;
 ; GFX9-LABEL: s_saddsat_i32:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_cmp_gt_i32 s0, 0
-; GFX9-NEXT: s_cselect_b32 s2, s0, 0
-; GFX9-NEXT: s_sub_i32 s2, 0x7fffffff, s2
-; GFX9-NEXT: s_cmp_lt_i32 s0, 0
-; GFX9-NEXT: s_cselect_b32 s3, s0, 0
-; GFX9-NEXT: s_sub_i32 s3, 0x80000000, s3
-; GFX9-NEXT: s_cmp_gt_i32 s3, s1
-; GFX9-NEXT: s_cselect_b32 s1, s3, s1
-; GFX9-NEXT: s_cmp_lt_i32 s1, s2
-; GFX9-NEXT: s_cselect_b32 s1, s1, s2
-; GFX9-NEXT: s_add_i32 s0, s0, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_saddsat_i32:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_cmp_gt_i32 s0, 0
+; GFX10-NEXT: v_add_nc_i32 v0, s0, s1 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s2, s0, 0
-; GFX10-NEXT: s_sub_i32 s2, 0x7fffffff, s2
-; GFX10-NEXT: s_cmp_lt_i32 s0, 0
-; GFX10-NEXT: s_cselect_b32 s3, s0, 0
-; GFX10-NEXT: s_sub_i32 s3, 0x80000000, s3
-; GFX10-NEXT: s_cmp_gt_i32 s3, s1
-; GFX10-NEXT: s_cselect_b32 s1, s3, s1
-; GFX10-NEXT: s_cmp_lt_i32 s1, s2
-; GFX10-NEXT: s_cselect_b32 s1, s1, s2
-; GFX10-NEXT: s_add_i32 s0, s0, s1
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
 ret i32 %result
@@ -1640,29 +1200,13 @@ define amdgpu_ps float @saddsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
 ;
 ; GFX9-LABEL: saddsat_i32_sv:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_cmp_gt_i32 s0, 0
-; GFX9-NEXT: s_cselect_b32 s1, s0, 0
-; GFX9-NEXT: s_sub_i32 s1, 0x7fffffff, s1
-; GFX9-NEXT: s_cmp_lt_i32 s0, 0
-; GFX9-NEXT: s_cselect_b32 s2, s0, 0
-; GFX9-NEXT: s_sub_i32 s2, 0x80000000, s2
-; GFX9-NEXT: v_max_i32_e32 v0, s2, v0
-; GFX9-NEXT: v_min_i32_e32 v0, s1, v0
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: saddsat_i32_sv:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_cmp_gt_i32 s0, 0
+; GFX10-NEXT: v_add_nc_i32 v0, s0, v0 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s1, s0, 0
-; GFX10-NEXT: s_sub_i32 s1, 0x7fffffff, s1
-; GFX10-NEXT: s_cmp_lt_i32 s0, 0
-; GFX10-NEXT: s_cselect_b32 s2, s0, 0
-; GFX10-NEXT: s_sub_i32 s2, 0x80000000, s2
-; GFX10-NEXT: v_max_i32_e32 v0, s2, v0
-; GFX10-NEXT: v_min_i32_e32 v0, s1, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
 %cast = bitcast i32 %result to float
@@ -1694,25 +1238,13 @@ define amdgpu_ps float @saddsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
 ;
 ; GFX9-LABEL: saddsat_i32_vs:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: v_min_i32_e32 v2, 0, v0
-; GFX9-NEXT: v_max_i32_e32 v1, 0, v0
-; GFX9-NEXT: v_sub_u32_e32 v2, 0x80000000, v2
-; GFX9-NEXT: v_sub_u32_e32 v1, 0x7fffffff, v1
-; GFX9-NEXT: v_max_i32_e32 v2, s0, v2
-; GFX9-NEXT: v_min_i32_e32 v1, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT: v_add_i32 v0, v0, s0 clamp
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: saddsat_i32_vs:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: v_min_i32_e32 v1, 0, v0
-; GFX10-NEXT: v_max_i32_e32 v2, 0, v0
+; GFX10-NEXT: v_add_nc_i32 v0, v0, s0 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x80000000, v1
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0x7fffffff, v2
-; GFX10-NEXT: v_max_i32_e32 v1, s0, v1
-; GFX10-NEXT: v_min_i32_e32 v1, v1, v2
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
 %cast = bitcast i32 %result to float
@@ -1765,45 +1297,17 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX9-LABEL: v_saddsat_v2i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_brev_b32 s5, 1
-; GFX9-NEXT: v_min_i32_e32 v5, 0, v0
-; GFX9-NEXT: v_sub_u32_e32 v5, s5, v5
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_max_i32_e32 v4, 0, v0
-; GFX9-NEXT: v_sub_u32_e32 v4, s4, v4
-; GFX9-NEXT: v_max_i32_e32 v2, v5, v2
-; GFX9-NEXT: v_min_i32_e32 v2, v2, v4
-; GFX9-NEXT: v_min_i32_e32 v4, 0, v1
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT: v_max_i32_e32 v2, 0, v1
-; GFX9-NEXT: v_sub_u32_e32 v4, s5, v4
-; GFX9-NEXT: v_sub_u32_e32 v2, s4, v2
-; GFX9-NEXT: v_max_i32_e32 v3, v4, v3
-; GFX9-NEXT: v_min_i32_e32 v2, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-NEXT: v_add_i32 v0, v0, v2 clamp
+; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_saddsat_v2i32:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_min_i32_e32 v4, 0, v0
-; GFX10-NEXT: v_min_i32_e32 v5, 0, v1
-; GFX10-NEXT: s_brev_b32 s4, 1
-; GFX10-NEXT: v_max_i32_e32 v6, 0, v0
-; GFX10-NEXT: v_max_i32_e32 v7, 0, v1
-; GFX10-NEXT: v_sub_nc_u32_e32 v4, s4, v4
-; GFX10-NEXT: v_sub_nc_u32_e32 v5, s4, v5
-; GFX10-NEXT: s_brev_b32 s4, -2
+; GFX10-NEXT: v_add_nc_i32 v0, v0, v2 clamp
+; GFX10-NEXT: v_add_nc_i32 v1, v1, v3 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_sub_nc_u32_e32 v6, s4, v6
-; GFX10-NEXT: v_max_i32_e32 v11, v4, v2
-; GFX10-NEXT: v_sub_nc_u32_e32 v7, s4, v7
-; GFX10-NEXT: v_max_i32_e32 v10, v5, v3
-; GFX10-NEXT: v_min_i32_e32 v2, v11, v6
-; GFX10-NEXT: v_min_i32_e32 v3, v10, v7
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
 ret <2 x i32> %result
@@ -1868,59 +1372,21 @@ define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 ;
 ; GFX9-LABEL: s_saddsat_v2i32:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_cmp_gt_i32 s0, 0
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: s_cselect_b32 s6, s0, 0
-; GFX9-NEXT: s_sub_i32 s6, s4, s6
-; GFX9-NEXT: s_cmp_lt_i32 s0, 0
-; GFX9-NEXT: s_brev_b32 s5, 1
-; GFX9-NEXT: s_cselect_b32 s7, s0, 0
-; GFX9-NEXT: s_sub_i32 s7, s5, s7
-; GFX9-NEXT: s_cmp_gt_i32 s7, s2
-; GFX9-NEXT: s_cselect_b32 s2, s7, s2
-; GFX9-NEXT: s_cmp_lt_i32 s2, s6
-; GFX9-NEXT: s_cselect_b32 s2, s2, s6
-; GFX9-NEXT: s_add_i32 s0, s0, s2
-; GFX9-NEXT: s_cmp_gt_i32 s1, 0
-; GFX9-NEXT: s_cselect_b32 s2, s1, 0
-; GFX9-NEXT: s_sub_i32 s2, s4, s2
-; GFX9-NEXT: s_cmp_lt_i32 s1, 0
-; GFX9-NEXT: s_cselect_b32 s4, s1, 0
-; GFX9-NEXT: s_sub_i32 s4, s5, s4
-; GFX9-NEXT: s_cmp_gt_i32 s4, s3
-; GFX9-NEXT: s_cselect_b32 s3, s4, s3
-; GFX9-NEXT: s_cmp_lt_i32 s3, s2
-; GFX9-NEXT: s_cselect_b32 s2, s3, s2
-; GFX9-NEXT: s_add_i32 s1, s1, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp
+; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_saddsat_v2i32:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_cmp_gt_i32 s0, 0
-; GFX10-NEXT: s_brev_b32 s4, -2
-; GFX10-NEXT: s_cselect_b32 s5, s0, 0
-; GFX10-NEXT: s_brev_b32 s6, 1
-; GFX10-NEXT: s_sub_i32 s5, s4, s5
-; GFX10-NEXT: s_cmp_lt_i32 s0, 0
+; GFX10-NEXT: v_add_nc_i32 v0, s0, s2 clamp
+; GFX10-NEXT: v_add_nc_i32 v1, s1, s3 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s7, s0, 0
-; GFX10-NEXT: s_sub_i32 s7, s6, s7
-; GFX10-NEXT: s_cmp_gt_i32 s7, s2
-; GFX10-NEXT: s_cselect_b32 s2, s7, s2
-; GFX10-NEXT: s_cmp_lt_i32 s2, s5
-; GFX10-NEXT: s_cselect_b32 s2, s2, s5
-; GFX10-NEXT: s_add_i32 s0, s0, s2
-; GFX10-NEXT: s_cmp_gt_i32 s1, 0
-; GFX10-NEXT: s_cselect_b32 s2, s1, 0
-; GFX10-NEXT: s_sub_i32 s2, s4, s2
-; GFX10-NEXT: s_cmp_lt_i32 s1, 0
-; GFX10-NEXT: s_cselect_b32 s4, s1, 0
-; GFX10-NEXT: s_sub_i32 s4, s6, s4
-; GFX10-NEXT: s_cmp_gt_i32 s4, s3
-; GFX10-NEXT: s_cselect_b32 s3, s4, s3
-; GFX10-NEXT: s_cmp_lt_i32 s3, s2
-; GFX10-NEXT: s_cselect_b32 s2, s3, s2
-; GFX10-NEXT: s_add_i32 s1, s1, s2
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
 ret <2 x i32> %result
@@ -1986,59 +1452,19 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX9-LABEL: v_saddsat_v3i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_brev_b32 s5, 1
-; GFX9-NEXT: v_min_i32_e32 v7, 0, v0
-; GFX9-NEXT: v_sub_u32_e32 v7, s5, v7
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_max_i32_e32 v6, 0, v0
-; GFX9-NEXT: v_sub_u32_e32 v6, s4, v6
-; GFX9-NEXT: v_max_i32_e32 v3, v7, v3
-; GFX9-NEXT: v_min_i32_e32 v3, v3, v6
-; GFX9-NEXT: v_min_i32_e32 v6, 0, v1
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v3
-; GFX9-NEXT: v_max_i32_e32 v3, 0, v1
-; GFX9-NEXT: v_sub_u32_e32 v6, s5, v6
-; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3
-; GFX9-NEXT: v_max_i32_e32 v4, v6, v4
-; GFX9-NEXT: v_min_i32_e32 v3, v4, v3
-; GFX9-NEXT: v_min_i32_e32 v4, 0, v2
-; GFX9-NEXT: v_sub_u32_e32 v4, s5, v4
-; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT: v_max_i32_e32 v3, 0, v2
-; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3
-; GFX9-NEXT: v_max_i32_e32 v4, v4, v5
-; GFX9-NEXT: v_min_i32_e32 v3, v4, v3
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v3
+; GFX9-NEXT: v_add_i32 v0, v0, v3 clamp
+; GFX9-NEXT: v_add_i32 v1, v1, v4 clamp
+; GFX9-NEXT: v_add_i32 v2, v2, v5 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_saddsat_v3i32:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_min_i32_e32 v7, 0, v0
-; GFX10-NEXT: v_min_i32_e32 v8, 0, v1
-; GFX10-NEXT: v_min_i32_e32 v9, 0, v2
-; GFX10-NEXT: s_brev_b32 s5, 1
-; GFX10-NEXT: v_max_i32_e32 v6, 0, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v14, s5, v7
-; GFX10-NEXT: v_sub_nc_u32_e32 v15, s5, v8
-; GFX10-NEXT: v_sub_nc_u32_e32 v19, s5, v9
-; GFX10-NEXT: v_max_i32_e32 v10, 0, v1
-; GFX10-NEXT: v_max_i32_e32 v11, 0, v2
-; GFX10-NEXT: s_brev_b32 s4, -2
-; GFX10-NEXT: v_max_i32_e32 v3, v14, v3
-; GFX10-NEXT: v_sub_nc_u32_e32 v6, s4, v6
-; GFX10-NEXT: v_sub_nc_u32_e32 v7, s4, v10
-; GFX10-NEXT: v_max_i32_e32 v4, v15, v4
-; GFX10-NEXT: v_sub_nc_u32_e32 v8, s4, v11
-; GFX10-NEXT: v_max_i32_e32 v5, v19, v5
-; GFX10-NEXT: v_min_i32_e32 v3, v3, v6
+; GFX10-NEXT: v_add_nc_i32 v0, v0, v3 clamp
+; GFX10-NEXT: v_add_nc_i32 v1, v1, v4 clamp
+; GFX10-NEXT: v_add_nc_i32 v2, v2, v5 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_min_i32_e32 v4, v4, v7
-; GFX10-NEXT: v_min_i32_e32 v5, v5, v8
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v3
-; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v4
-; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v5
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <3 x i32> @llvm.sadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
 ret <3 x i32> %result
@@ -2125,81 +1551,26 @@ define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 ;
 ; GFX9-LABEL: s_saddsat_v3i32:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_cmp_gt_i32 s0, 0
-; GFX9-NEXT: s_brev_b32 s6, -2
-; GFX9-NEXT: s_cselect_b32 s8, s0, 0
-; GFX9-NEXT: s_sub_i32 s8, s6, s8
-; GFX9-NEXT: s_cmp_lt_i32 s0, 0
-; GFX9-NEXT: s_brev_b32 s7, 1
-; GFX9-NEXT: s_cselect_b32 s9, s0, 0
-; GFX9-NEXT: s_sub_i32 s9, s7, s9
-; GFX9-NEXT: s_cmp_gt_i32 s9, s3
-; GFX9-NEXT: s_cselect_b32 s3, s9, s3
-; GFX9-NEXT: s_cmp_lt_i32 s3, s8
-; GFX9-NEXT: s_cselect_b32 s3, s3, s8
-; GFX9-NEXT: s_add_i32 s0, s0, s3
-; GFX9-NEXT: s_cmp_gt_i32 s1, 0
-; GFX9-NEXT: s_cselect_b32 s3, s1, 0
-; GFX9-NEXT: s_sub_i32 s3, s6, s3
-; GFX9-NEXT: s_cmp_lt_i32 s1, 0
-; GFX9-NEXT: s_cselect_b32 s8, s1, 0
-; GFX9-NEXT: s_sub_i32 s8, s7, s8
-; GFX9-NEXT: s_cmp_gt_i32 s8, s4
-; GFX9-NEXT: s_cselect_b32 s4, s8, s4
-; GFX9-NEXT: s_cmp_lt_i32 s4, s3
-; GFX9-NEXT: s_cselect_b32 s3, s4, s3
-; GFX9-NEXT: s_add_i32 s1, s1, s3
-; GFX9-NEXT: s_cmp_gt_i32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s3, s2, 0
-; GFX9-NEXT: s_sub_i32 s3, s6, s3
-; GFX9-NEXT: s_cmp_lt_i32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s4, s2, 0
-; GFX9-NEXT: s_sub_i32 s4, s7, s4
-; GFX9-NEXT: s_cmp_gt_i32 s4, s5
-; GFX9-NEXT: s_cselect_b32 s4, s4, s5
-; GFX9-NEXT: s_cmp_lt_i32 s4, s3
-; GFX9-NEXT: s_cselect_b32 s3, s4, s3
-; GFX9-NEXT: s_add_i32 s2, s2, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp
+; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp
+; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_saddsat_v3i32:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_cmp_gt_i32 s0, 0
-; GFX10-NEXT: s_brev_b32 s6, -2
-; GFX10-NEXT: s_cselect_b32 s7, s0, 0
-; GFX10-NEXT: s_brev_b32 s8, 1
-; GFX10-NEXT: s_sub_i32 s7, s6, s7
-; GFX10-NEXT: s_cmp_lt_i32 s0, 0
+; GFX10-NEXT: v_add_nc_i32 v0, s0, s3 clamp
+; GFX10-NEXT: v_add_nc_i32 v1, s1, s4 clamp
+; GFX10-NEXT: v_add_nc_i32 v2, s2, s5 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s9, s0, 0
-; GFX10-NEXT: s_sub_i32 s9, s8, s9
-; GFX10-NEXT: s_cmp_gt_i32 s9, s3
-; GFX10-NEXT: s_cselect_b32 s3, s9, s3
-; GFX10-NEXT: s_cmp_lt_i32 s3, s7
-; GFX10-NEXT: s_cselect_b32 s3, s3, s7
-; GFX10-NEXT: s_add_i32 s0, s0, s3
-; GFX10-NEXT: s_cmp_gt_i32 s1, 0
-; GFX10-NEXT: s_cselect_b32 s3, s1, 0
-; GFX10-NEXT: s_sub_i32 s3, s6, s3
-; GFX10-NEXT: s_cmp_lt_i32 s1, 0
-; GFX10-NEXT: s_cselect_b32 s7, s1, 0
-; GFX10-NEXT: s_sub_i32 s7, s8, s7
-; GFX10-NEXT: s_cmp_gt_i32 s7, s4
-; GFX10-NEXT: s_cselect_b32 s4, s7, s4
-; GFX10-NEXT: s_cmp_lt_i32 s4, s3
-; GFX10-NEXT: s_cselect_b32 s3, s4, s3
-; GFX10-NEXT: s_add_i32 s1, s1, s3
-; GFX10-NEXT: s_cmp_gt_i32 s2, 0
-; GFX10-NEXT: s_cselect_b32 s3, s2, 0
-; GFX10-NEXT: s_sub_i32 s3, s6, s3
-; GFX10-NEXT: s_cmp_lt_i32 s2, 0
-; GFX10-NEXT: s_cselect_b32 s4, s2, 0
-; GFX10-NEXT: s_sub_i32 s4, s8, s4
-; GFX10-NEXT: s_cmp_gt_i32 s4, s5
-; GFX10-NEXT: s_cselect_b32 s4, s4, s5
-; GFX10-NEXT: s_cmp_lt_i32 s4, s3
-; GFX10-NEXT: s_cselect_b32 s3, s4, s3
-; GFX10-NEXT: s_add_i32 s2, s2, s3
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <3 x i32> @llvm.sadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
 ret <3 x i32> %result
@@ -2279,73 +1650,21 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX9-LABEL: v_saddsat_v4i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_brev_b32 s5, 1
-; GFX9-NEXT: v_min_i32_e32 v9, 0, v0
-; GFX9-NEXT: v_sub_u32_e32 v9, s5, v9
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_max_i32_e32 v8, 0, v0
-; GFX9-NEXT: v_sub_u32_e32 v8, s4, v8
-; GFX9-NEXT: v_max_i32_e32 v4, v9, v4
-; GFX9-NEXT: v_min_i32_e32 v4, v4, v8
-; GFX9-NEXT: v_min_i32_e32 v8, 0, v1
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v4
-; GFX9-NEXT: v_max_i32_e32 v4, 0, v1
-; GFX9-NEXT: v_sub_u32_e32 v8, s5, v8
-; GFX9-NEXT: v_sub_u32_e32 v4, s4, v4
-; GFX9-NEXT: v_max_i32_e32 v5, v8, v5
-; GFX9-NEXT: v_min_i32_e32 v4, v5, v4
-; GFX9-NEXT: v_min_i32_e32 v5, 0, v2
-; GFX9-NEXT: v_sub_u32_e32 v5, s5, v5
-; GFX9-NEXT: v_add_u32_e32 v1, v1, v4
-; GFX9-NEXT: v_max_i32_e32 v4, 0, v2
-; GFX9-NEXT: v_sub_u32_e32 v4, s4, v4
-; GFX9-NEXT: v_max_i32_e32 v5, v5, v6
-; GFX9-NEXT: v_min_i32_e32 v4, v5, v4
-; GFX9-NEXT: v_min_i32_e32 v5, 0, v3
-; GFX9-NEXT: v_sub_u32_e32 v5, 0x80000000, v5
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT: v_max_i32_e32 v4, 0, v3
-; GFX9-NEXT: v_sub_u32_e32 v4, 0x7fffffff, v4
-; GFX9-NEXT: v_max_i32_e32 v5, v5, v7
-; GFX9-NEXT: v_min_i32_e32 v4, v5, v4
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v4
+; GFX9-NEXT: v_add_i32 v0, v0, v4 clamp
+; GFX9-NEXT: v_add_i32 v1, v1, v5 clamp
+; GFX9-NEXT: v_add_i32 v2, v2, v6 clamp
+; GFX9-NEXT: v_add_i32 v3, v3, v7 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_saddsat_v4i32:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_min_i32_e32 v8, 0, v0
-; GFX10-NEXT: s_brev_b32 s4, 1
-; GFX10-NEXT: v_min_i32_e32 v11, 0, v1
-; GFX10-NEXT: v_min_i32_e32 v12, 0, v3
-; GFX10-NEXT: v_max_i32_e32 v9, 0, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v15, s4, v8
-; GFX10-NEXT: v_min_i32_e32 v8, 0, v2
-; GFX10-NEXT: v_sub_nc_u32_e32 v11, s4, v11
-; GFX10-NEXT: v_sub_nc_u32_e32 v12, 0x80000000, v12
-; GFX10-NEXT: v_max_i32_e32 v10, 0, v1
-; GFX10-NEXT: v_max_i32_e32 v13, 0, v2
-; GFX10-NEXT: v_sub_nc_u32_e32 v8, s4, v8
-; GFX10-NEXT: v_max_i32_e32 v14, 0, v3
-; GFX10-NEXT: s_brev_b32 s5, -2
-; GFX10-NEXT: v_max_i32_e32 v5, v11, v5
-; GFX10-NEXT: v_sub_nc_u32_e32 v10, s5, v10
-; GFX10-NEXT: v_max_i32_e32 v6, v8, v6
-; GFX10-NEXT: v_sub_nc_u32_e32 v11, s5, v13
-; GFX10-NEXT: v_sub_nc_u32_e32 v9, s5, v9
-; GFX10-NEXT: v_max_i32_e32 v4, v15, v4
-; GFX10-NEXT: v_sub_nc_u32_e32 v8, 0x7fffffff, v14
-; GFX10-NEXT: v_max_i32_e32 v7, v12, v7
-; GFX10-NEXT: v_min_i32_e32 v11, v6, v11
-; GFX10-NEXT: v_min_i32_e32 v19, v5, v10
-; GFX10-NEXT: v_min_i32_e32 v15, v4, v9
+; GFX10-NEXT: v_add_nc_i32 v0, v0, v4 clamp
+; GFX10-NEXT: v_add_nc_i32 v1, v1, v5 clamp
+; GFX10-NEXT: v_add_nc_i32 v2, v2, v6 clamp
+; GFX10-NEXT: v_add_nc_i32 v3, v3, v7 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_min_i32_e32 v6, v7, v8
-; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v11
-; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v19
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v15
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v6
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
 ret <4 x i32> %result
@@ -2454,103 +1773,31 @@ define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 ;
 ; GFX9-LABEL: s_saddsat_v4i32:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_cmp_gt_i32 s0, 0
-; GFX9-NEXT: s_brev_b32 s8, -2
-; GFX9-NEXT: s_cselect_b32 s10, s0, 0
-; GFX9-NEXT: s_sub_i32 s10, s8, s10
-; GFX9-NEXT: s_cmp_lt_i32 s0, 0
-; GFX9-NEXT: s_brev_b32 s9, 1
-; GFX9-NEXT: s_cselect_b32 s11, s0, 0
-; GFX9-NEXT: s_sub_i32 s11, s9, s11
-; GFX9-NEXT: s_cmp_gt_i32 s11, s4
-; GFX9-NEXT: s_cselect_b32 s4, s11, s4
-; GFX9-NEXT: s_cmp_lt_i32 s4, s10
-; GFX9-NEXT: s_cselect_b32 s4, s4, s10
-; GFX9-NEXT: s_add_i32 s0, s0, s4
-; GFX9-NEXT: s_cmp_gt_i32 s1, 0
-; GFX9-NEXT: s_cselect_b32 s4, s1, 0
-; GFX9-NEXT: s_sub_i32 s4, s8, s4
-; GFX9-NEXT: s_cmp_lt_i32 s1, 0
-; GFX9-NEXT: s_cselect_b32 s10, s1, 0
-; GFX9-NEXT: s_sub_i32 s10, s9, s10
-; GFX9-NEXT: s_cmp_gt_i32 s10, s5
-; GFX9-NEXT: s_cselect_b32 s5, s10, s5
-; GFX9-NEXT: s_cmp_lt_i32 s5, s4
-; GFX9-NEXT: s_cselect_b32 s4, s5, s4
-; GFX9-NEXT: s_add_i32 s1, s1, s4
-; GFX9-NEXT: s_cmp_gt_i32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s4, s2, 0
-; GFX9-NEXT: s_sub_i32 s4, s8, s4
-; GFX9-NEXT: s_cmp_lt_i32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s5, s2, 0
-; GFX9-NEXT: s_sub_i32 s5, s9, s5
-; GFX9-NEXT: s_cmp_gt_i32 s5, s6
-; GFX9-NEXT: s_cselect_b32 s5, s5, s6
-; GFX9-NEXT: s_cmp_lt_i32 s5, s4
-; GFX9-NEXT: s_cselect_b32 s4, s5, s4
-; GFX9-NEXT: s_add_i32 s2, s2, s4
-; GFX9-NEXT: s_cmp_gt_i32 s3, 0
-; GFX9-NEXT: s_cselect_b32 s4, s3, 0
-; GFX9-NEXT: s_sub_i32 s4, s8, s4
-; GFX9-NEXT: s_cmp_lt_i32 s3, 0
-; GFX9-NEXT: s_cselect_b32 s5, s3, 0
-; GFX9-NEXT: s_sub_i32 s5, s9, s5
-; GFX9-NEXT: s_cmp_gt_i32 s5, s7
-; GFX9-NEXT: s_cselect_b32 s5, s5, s7
-; GFX9-NEXT: s_cmp_lt_i32 s5, s4
-; GFX9-NEXT: s_cselect_b32 s4, s5, s4
-; GFX9-NEXT: s_add_i32 s3, s3, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp
+; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp
+; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp
+; GFX9-NEXT: v_add_i32 v3, s3, v3 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_saddsat_v4i32:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_cmp_gt_i32 s0, 0
-; GFX10-NEXT: s_brev_b32 s8, -2
-; GFX10-NEXT: s_cselect_b32 s9, s0, 0
-; GFX10-NEXT: s_brev_b32 s10, 1
-; GFX10-NEXT: s_sub_i32 s9, s8, s9
-; GFX10-NEXT: s_cmp_lt_i32 s0, 0
+; GFX10-NEXT: v_add_nc_i32 v0, s0, s4 clamp
+; GFX10-NEXT: v_add_nc_i32 v1, s1, s5 clamp
+; GFX10-NEXT: v_add_nc_i32 v2, s2, s6 clamp
+; GFX10-NEXT: v_add_nc_i32 v3, s3, s7 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s11, s0, 0
-; GFX10-NEXT: s_sub_i32 s11, s10, s11
-; GFX10-NEXT: s_cmp_gt_i32 s11, s4
-; GFX10-NEXT: s_cselect_b32 s4, s11, s4
-; GFX10-NEXT: s_cmp_lt_i32 s4, s9
-; GFX10-NEXT: s_cselect_b32 s4, s4, s9
-; GFX10-NEXT: s_add_i32 s0, s0, s4
-; GFX10-NEXT: s_cmp_gt_i32 s1, 0
-; GFX10-NEXT: s_cselect_b32 s4, s1, 0
-; GFX10-NEXT: s_sub_i32 s4, s8, s4
-; GFX10-NEXT: s_cmp_lt_i32 s1, 0
-; GFX10-NEXT: s_cselect_b32 s9, s1, 0
-; GFX10-NEXT: s_sub_i32 s9, s10, s9
-; GFX10-NEXT: s_cmp_gt_i32 s9, s5
-; GFX10-NEXT: s_cselect_b32 s5, s9, s5
-; GFX10-NEXT: s_cmp_lt_i32 s5, s4
-; GFX10-NEXT: s_cselect_b32 s4, s5, s4
-; GFX10-NEXT: s_add_i32 s1, s1, s4
-; GFX10-NEXT: s_cmp_gt_i32 s2, 0
-; GFX10-NEXT: s_cselect_b32 s4, s2, 0
-; GFX10-NEXT: s_sub_i32 s4, s8, s4
-; GFX10-NEXT: s_cmp_lt_i32 s2, 0
-; GFX10-NEXT: s_cselect_b32 s5, s2, 0
-; GFX10-NEXT: s_sub_i32 s5, s10, s5
-; GFX10-NEXT: s_cmp_gt_i32 s5, s6
-; GFX10-NEXT: s_cselect_b32 s5, s5, s6
-; GFX10-NEXT: s_cmp_lt_i32 s5, s4
-; GFX10-NEXT: s_cselect_b32 s4, s5, s4
-; GFX10-NEXT: s_add_i32 s2, s2, s4
-; GFX10-NEXT: s_cmp_gt_i32 s3, 0
-; GFX10-NEXT: s_cselect_b32 s4, s3, 0
-; GFX10-NEXT: s_sub_i32 s4, s8, s4
-; GFX10-NEXT: s_cmp_lt_i32 s3, 0
-; GFX10-NEXT: s_cselect_b32 s5, s3, 0
-; GFX10-NEXT: s_sub_i32 s5, s10, s5
-; GFX10-NEXT: s_cmp_gt_i32 s5, s7
-; GFX10-NEXT: s_cselect_b32 s5, s5, s7
-; GFX10-NEXT: s_cmp_lt_i32 s5, s4
-; GFX10-NEXT: s_cselect_b32 s4, s5, s4
-; GFX10-NEXT: s_add_i32 s3, s3, s4
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
 ret <4 x i32> %result
@@ -2648,90 +1895,22 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX9-LABEL: v_saddsat_v5i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_brev_b32 s5, 1
-; GFX9-NEXT: v_min_i32_e32 v12, 0, v0
-; GFX9-NEXT: v_sub_u32_e32 v12, s5, v12
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_max_i32_e32 v10, 0, v0
-; GFX9-NEXT: v_sub_u32_e32 v10, s4, v10
-; GFX9-NEXT: v_max_i32_e32 v5, v12, v5
-; GFX9-NEXT: v_min_i32_e32 v5, v5, v10
-; GFX9-NEXT: v_min_i32_e32 v10, 0, v1
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v5
-; GFX9-NEXT: v_max_i32_e32 v5, 0, v1
-; GFX9-NEXT: v_sub_u32_e32 v10, s5, v10
-; GFX9-NEXT: v_sub_u32_e32 v5, s4, v5
-; GFX9-NEXT: v_max_i32_e32 v6, v10, v6
-; GFX9-NEXT: v_min_i32_e32 v5, v6, v5
-; GFX9-NEXT: v_min_i32_e32 v6, 0, v2
-; GFX9-NEXT: v_sub_u32_e32 v6, s5, v6
-; GFX9-NEXT: v_add_u32_e32 v1, v1, v5
-; GFX9-NEXT: v_max_i32_e32 v5, 0, v2
-; GFX9-NEXT: v_sub_u32_e32 v5, s4, v5
-; GFX9-NEXT: v_max_i32_e32 v6, v6, v7
-; GFX9-NEXT: v_min_i32_e32 v5, v6, v5
-; GFX9-NEXT: v_bfrev_b32_e32 v13, 1
-; GFX9-NEXT: v_min_i32_e32 v6, 0, v3
-; GFX9-NEXT: v_sub_u32_e32 v6, v13, v6
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT: v_bfrev_b32_e32 v11, -2
-; GFX9-NEXT: v_max_i32_e32 v5, 0, v3
-; GFX9-NEXT: v_sub_u32_e32 v5, v11, v5
-; GFX9-NEXT: v_max_i32_e32 v6, v6, v8
-; GFX9-NEXT: v_min_i32_e32 v5, v6, v5
-; GFX9-NEXT: v_min_i32_e32 v6, 0, v4
-; GFX9-NEXT: v_sub_u32_e32 v6, v13, v6
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v5
-; GFX9-NEXT: v_max_i32_e32 v5, 0, v4
-; GFX9-NEXT: v_sub_u32_e32 v5, v11, v5
-; GFX9-NEXT: v_max_i32_e32 v6, v6, v9
-; GFX9-NEXT: v_min_i32_e32 v5, v6, v5
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v5
+; GFX9-NEXT: v_add_i32 v0, v0, v5 clamp
+; GFX9-NEXT: v_add_i32 v1, v1, v6 clamp
+; GFX9-NEXT: v_add_i32 v2, v2, v7 clamp
+; GFX9-NEXT: v_add_i32 v3, v3, v8 clamp
+; GFX9-NEXT: v_add_i32 v4, v4, v9 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_saddsat_v5i32:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_min_i32_e32 v13, 0, v1
-; GFX10-NEXT: s_brev_b32 s5, 1
-; GFX10-NEXT: v_min_i32_e32 v10, 0, v0
-; GFX10-NEXT: v_min_i32_e32 v16, 0, v2
-; GFX10-NEXT: v_bfrev_b32_e32 v15, 1
-; GFX10-NEXT: v_sub_nc_u32_e32 v13, s5, v13
-; GFX10-NEXT: v_min_i32_e32 v17, 0, v4
-; GFX10-NEXT: v_sub_nc_u32_e32 v10, s5, v10
-; GFX10-NEXT: v_sub_nc_u32_e32 v16, s5, v16
-; GFX10-NEXT: v_max_i32_e32 v11, 0, v0
-; GFX10-NEXT: v_max_i32_e32 v23, v13, v6
-; GFX10-NEXT: v_min_i32_e32 v13, 0, v3
-; GFX10-NEXT: v_max_i32_e32 v5, v10, v5
-; GFX10-NEXT: v_bfrev_b32_e32 v12, -2
-; GFX10-NEXT: v_max_i32_e32 v14, 0, v1
-; GFX10-NEXT: v_max_i32_e32 v10, 0, v2
-; GFX10-NEXT: v_sub_nc_u32_e32 v13, v15, v13
-; GFX10-NEXT: v_sub_nc_u32_e32 v15, v15, v17
-; GFX10-NEXT: v_max_i32_e32 v18, 0, v3
-; GFX10-NEXT: v_max_i32_e32 v19, 0, v4
-; GFX10-NEXT: s_brev_b32 s4, -2
-; GFX10-NEXT: v_max_i32_e32 v7, v16, v7
-; GFX10-NEXT: v_sub_nc_u32_e32 v11, s4, v11
-; GFX10-NEXT: v_sub_nc_u32_e32 v14, s4, v14
-; GFX10-NEXT: v_sub_nc_u32_e32 v10, s4, v10
-; GFX10-NEXT: v_sub_nc_u32_e32 v16, v12, v18
-; GFX10-NEXT: v_max_i32_e32 v27, v13, v8
-; GFX10-NEXT: v_sub_nc_u32_e32 v12, v12, v19
-; GFX10-NEXT: v_max_i32_e32 v9, v15, v9
-; GFX10-NEXT: v_min_i32_e32 v5, v5, v11
-; GFX10-NEXT: v_min_i32_e32 v6, v23, v14
-; GFX10-NEXT: v_min_i32_e32 v7, v7, v10
-; GFX10-NEXT: v_min_i32_e32 v8, v27, v16
-; GFX10-NEXT: v_min_i32_e32 v9, v9, v12
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v5
-; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v6
-; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v7
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v8
-; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v9
+; GFX10-NEXT: v_add_nc_i32 v0, v0, v5 clamp
+; GFX10-NEXT: v_add_nc_i32 v1, v1, v6 clamp
+; GFX10-NEXT: v_add_nc_i32 v2, v2, v7 clamp
+; GFX10-NEXT: v_add_nc_i32 v3, v3, v8 clamp
+; GFX10-NEXT: v_add_nc_i32 v4, v4, v9 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
@@ -2863,125 +2042,36 @@ define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 ;
 ; GFX9-LABEL: s_saddsat_v5i32:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_cmp_gt_i32 s0, 0
-; GFX9-NEXT: s_brev_b32 s10, -2
-; GFX9-NEXT: s_cselect_b32 s12, s0, 0
-; GFX9-NEXT: s_sub_i32 s12, s10, s12
-; GFX9-NEXT: s_cmp_lt_i32 s0, 0
-; GFX9-NEXT: s_brev_b32 s11, 1
-; GFX9-NEXT: s_cselect_b32 s13, s0, 0
-; GFX9-NEXT: s_sub_i32 s13, s11, s13
-; GFX9-NEXT: s_cmp_gt_i32 s13, s5
-; GFX9-NEXT: s_cselect_b32 s5, s13, s5
-; GFX9-NEXT: s_cmp_lt_i32 s5, s12
-; GFX9-NEXT: s_cselect_b32 s5, s5, s12
-; GFX9-NEXT: s_add_i32 s0, s0, s5
-; GFX9-NEXT: s_cmp_gt_i32 s1, 0
-; GFX9-NEXT: s_cselect_b32 s5, s1, 0
-; GFX9-NEXT: s_sub_i32 s5, s10, s5
-; GFX9-NEXT: s_cmp_lt_i32 s1, 0
-; GFX9-NEXT: s_cselect_b32 s12, s1, 0
-; GFX9-NEXT: s_sub_i32 s12, s11, s12
-; GFX9-NEXT: s_cmp_gt_i32 s12, s6
-; GFX9-NEXT: s_cselect_b32 s6, s12, s6
-; GFX9-NEXT: s_cmp_lt_i32 s6, s5
-; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: s_add_i32 s1, s1, s5
-; GFX9-NEXT: s_cmp_gt_i32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s5, s2, 0
-; GFX9-NEXT: s_sub_i32 s5, s10, s5
-; GFX9-NEXT: s_cmp_lt_i32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s6, s2, 0
-; GFX9-NEXT: s_sub_i32 s6, s11, s6
-; GFX9-NEXT: s_cmp_gt_i32 s6, s7
-; GFX9-NEXT: s_cselect_b32 s6, s6, s7
-; GFX9-NEXT: s_cmp_lt_i32 s6, s5
-; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: s_add_i32 s2, s2, s5
-; GFX9-NEXT: s_cmp_gt_i32 s3, 0
-; GFX9-NEXT: s_cselect_b32 s5, s3, 0
-; GFX9-NEXT: s_sub_i32 s5, s10, s5
-; GFX9-NEXT: s_cmp_lt_i32 s3, 0
-; GFX9-NEXT: s_cselect_b32 s6, s3, 0
-; GFX9-NEXT: s_sub_i32 s6, s11, s6
-; GFX9-NEXT: s_cmp_gt_i32 s6, s8
-; GFX9-NEXT: s_cselect_b32 s6, s6, s8
-; GFX9-NEXT: s_cmp_lt_i32 s6, s5
-; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: s_add_i32 s3, s3, s5
-; GFX9-NEXT: s_cmp_gt_i32 s4, 0
-; GFX9-NEXT: s_cselect_b32 s5, s4, 0
-; GFX9-NEXT: s_sub_i32 s5, s10, s5
-; GFX9-NEXT: s_cmp_lt_i32 s4, 0
-; GFX9-NEXT: s_cselect_b32 s6, s4, 0
-; GFX9-NEXT: s_sub_i32 s6, s11, s6
-; GFX9-NEXT: s_cmp_gt_i32 s6, s9
-; GFX9-NEXT: s_cselect_b32 s6, s6, s9
-; GFX9-NEXT: s_cmp_lt_i32 s6, s5
-; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: s_add_i32 s4, s4, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
+; GFX9-NEXT: v_mov_b32_e32 v4, s9
+; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp
+; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp
+; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp
+; GFX9-NEXT: v_add_i32 v3, s3, v3 clamp
+; GFX9-NEXT: v_add_i32 v4, s4, v4 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
+; GFX9-NEXT: v_readfirstlane_b32 s4, v4
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_saddsat_v5i32:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_cmp_gt_i32 s0, 0
-; GFX10-NEXT: s_brev_b32 s10, -2
-; GFX10-NEXT: s_cselect_b32 s11, s0, 0
-; GFX10-NEXT: s_brev_b32 s12, 1
-; GFX10-NEXT: s_sub_i32 s11, s10, s11
-; GFX10-NEXT: s_cmp_lt_i32 s0, 0
+; GFX10-NEXT: v_add_nc_i32 v0, s0, s5 clamp
+; GFX10-NEXT: v_add_nc_i32 v1, s1, s6 clamp
+; GFX10-NEXT: v_add_nc_i32 v2, s2, s7 clamp
+; GFX10-NEXT: v_add_nc_i32 v3, s3, s8 clamp
+; GFX10-NEXT: v_add_nc_i32 v4, s4, s9 clamp
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
+; GFX10-NEXT: v_readfirstlane_b32 s4, v4
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s13, s0, 0
-; GFX10-NEXT: s_sub_i32 s13, s12, s13
-; GFX10-NEXT: s_cmp_gt_i32 s13, s5
-; GFX10-NEXT: s_cselect_b32 s5, s13, s5
-; GFX10-NEXT: s_cmp_lt_i32 s5, s11
-; GFX10-NEXT: s_cselect_b32 s5, s5, s11
-; GFX10-NEXT: s_add_i32 s0, s0, s5
-; GFX10-NEXT: s_cmp_gt_i32 s1, 0
-; GFX10-NEXT: s_cselect_b32 s5, s1, 0
-; GFX10-NEXT: s_sub_i32 s5, s10, s5
-; GFX10-NEXT: s_cmp_lt_i32 s1, 0
-; GFX10-NEXT: s_cselect_b32 s11, s1, 0
-; GFX10-NEXT: s_sub_i32 s11, s12, s11
-; GFX10-NEXT: s_cmp_gt_i32 s11, s6
-; GFX10-NEXT: s_cselect_b32 s6, s11, s6
-; GFX10-NEXT: s_cmp_lt_i32 s6, s5
-; GFX10-NEXT: s_cselect_b32 s5, s6, s5
-; GFX10-NEXT: s_add_i32 s1, s1, s5
-; GFX10-NEXT: s_cmp_gt_i32 s2, 0
-; GFX10-NEXT: s_cselect_b32 s5, s2, 0
-; GFX10-NEXT: s_sub_i32 s5, s10, s5
-; GFX10-NEXT: s_cmp_lt_i32 s2, 0
-; GFX10-NEXT: s_cselect_b32 s6, s2, 0
-; GFX10-NEXT: s_sub_i32 s6, s12, s6
-; GFX10-NEXT: s_cmp_gt_i32 s6, s7
-; GFX10-NEXT: s_cselect_b32 s6, s6, s7
-; GFX10-NEXT: s_cmp_lt_i32 s6, s5
-; GFX10-NEXT: s_cselect_b32 s5, s6, s5
-; GFX10-NEXT: s_add_i32 s2, s2, s5
-; GFX10-NEXT: s_cmp_gt_i32 s3, 0
-; GFX10-NEXT: s_cselect_b32 s5, s3, 0
-; GFX10-NEXT: s_sub_i32 s5, s10, s5
-; GFX10-NEXT: s_cmp_lt_i32 s3, 0
-; GFX10-NEXT: s_cselect_b32 s6, s3, 0
-; GFX10-NEXT: s_sub_i32 s6, s12, s6
-; GFX10-NEXT: s_cmp_gt_i32 s6, s8
-; GFX10-NEXT: s_cselect_b32 s6, s6, s8
-; GFX10-NEXT: s_cmp_lt_i32 s6, s5
-; GFX10-NEXT: s_cselect_b32 s5, s6, s5
-; GFX10-NEXT: s_add_i32 s3, s3, s5
-; GFX10-NEXT: s_cmp_gt_i32 s4, 0
-; GFX10-NEXT: s_cselect_b32 s5, s4, 0
-; GFX10-NEXT: s_sub_i32 s5, s10, s5
-; GFX10-NEXT: s_cmp_lt_i32 s4, 0
-; GFX10-NEXT: s_cselect_b32 s6, s4, 0
-; GFX10-NEXT: s_sub_i32 s6, s12, s6
-; GFX10-NEXT: s_cmp_gt_i32 s6, s9
-; GFX10-NEXT: s_cselect_b32 s6, s6, s9
-; GFX10-NEXT: s_cmp_lt_i32 s6, s5
-; GFX10-NEXT: s_cselect_b32 s5, s6, s5
-; GFX10-NEXT: s_add_i32 s4, s4, s5
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
 ret <5 x i32> %result
@@ -3233,244 +2323,44 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX9-LABEL: v_saddsat_v16i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_brev_b32 s4, 1
-; GFX9-NEXT: v_min_i32_e32 v32, 0, v0
-; GFX9-NEXT: v_sub_u32_e32 v32, s4, v32
-; GFX9-NEXT: v_max_i32_e32 v16, v32, v16
-; GFX9-NEXT: s_brev_b32 s5, -2
-; GFX9-NEXT: v_max_i32_e32 v32, 0, v0
-; GFX9-NEXT: v_sub_u32_e32 v32, s5, v32
-; GFX9-NEXT: v_min_i32_e32 v16, v16, v32
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v16
-; GFX9-NEXT: v_min_i32_e32 v16, 0, v1
-; GFX9-NEXT: v_sub_u32_e32 v16, s4, v16
-; GFX9-NEXT: v_max_i32_e32 v16, v16, v17
-; GFX9-NEXT: v_max_i32_e32 v17, 0, v1
-; GFX9-NEXT: v_sub_u32_e32 v17, s5, v17
-; GFX9-NEXT: v_min_i32_e32 v16, v16, v17
-; GFX9-NEXT: v_add_u32_e32 v1, v1, v16
-; GFX9-NEXT: v_min_i32_e32 v16, 0, v2
-; GFX9-NEXT: v_sub_u32_e32 v16, s4, v16
-; GFX9-NEXT: v_max_i32_e32 v17, 0, v2
-; GFX9-NEXT: v_max_i32_e32 v16, v16, v18
-; GFX9-NEXT: v_sub_u32_e32 v17, s5, v17
-; GFX9-NEXT: v_min_i32_e32 v16, v16, v17
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v16
-; GFX9-NEXT: v_bfrev_b32_e32 v16, 1
-; GFX9-NEXT: v_min_i32_e32 v17, 0, v3
-; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17
-; GFX9-NEXT: v_max_i32_e32 v17, v17, v19
-; GFX9-NEXT: v_bfrev_b32_e32 v18, -2
-; GFX9-NEXT: v_max_i32_e32 v19, 0, v3
-; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19
-; GFX9-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v17
-; GFX9-NEXT: v_min_i32_e32 v17, 0, v4
-; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17
-; GFX9-NEXT: v_max_i32_e32 v19, 0, v4
-; GFX9-NEXT: v_max_i32_e32 v17, v17, v20
-; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19
-; GFX9-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v17
-; GFX9-NEXT: v_min_i32_e32 v17, 0, v5
-; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17
-; GFX9-NEXT: v_max_i32_e32 v19, 0, v5
-; GFX9-NEXT: v_max_i32_e32 v17, v17, v21
-; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19
-; GFX9-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX9-NEXT: v_add_u32_e32 v5, v5, v17
-; GFX9-NEXT: v_min_i32_e32 v17, 0, v6
-; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17
-; GFX9-NEXT: v_max_i32_e32 v19, 0, v6
-; GFX9-NEXT: v_max_i32_e32 v17, v17, v22
-; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19
-; GFX9-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX9-NEXT: v_add_u32_e32 v6, v6, v17
-; GFX9-NEXT: v_min_i32_e32 v17, 0, v7
-; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17
-; GFX9-NEXT: v_max_i32_e32 v19, 0, v7
-; GFX9-NEXT: v_max_i32_e32 v17, v17, v23
-; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19
-; GFX9-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX9-NEXT: v_add_u32_e32 v7, v7, v17
-; GFX9-NEXT: v_min_i32_e32 v17, 0, v8
-; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17
-; GFX9-NEXT: v_max_i32_e32 v19, 0, v8
-; GFX9-NEXT: v_max_i32_e32 v17, v17, v24
-; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19
-; GFX9-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX9-NEXT: v_add_u32_e32 v8, v8, v17
-; GFX9-NEXT: v_min_i32_e32 v17, 0, v9
-; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17
-; GFX9-NEXT: v_max_i32_e32 v19, 0, v9
-; GFX9-NEXT: v_max_i32_e32 v17, v17, v25
-; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19
-; GFX9-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX9-NEXT: v_add_u32_e32 v9, v9, v17
-; GFX9-NEXT: v_min_i32_e32 v17, 0, v10
-; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17
-; GFX9-NEXT: v_max_i32_e32 v19, 0, v10
-; GFX9-NEXT: v_max_i32_e32 v17, v17, v26
-; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19
-; GFX9-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX9-NEXT: v_add_u32_e32 v10, v10, v17
-; GFX9-NEXT: v_min_i32_e32 v17, 0, v11
-; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17
-; GFX9-NEXT: v_max_i32_e32 v19, 0, v11
-; GFX9-NEXT: v_max_i32_e32 v17, v17, v27
-; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19
-; GFX9-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX9-NEXT: v_add_u32_e32 v11, v11, v17
-; GFX9-NEXT: v_min_i32_e32 v17, 0, v12
-; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17
-; GFX9-NEXT: v_max_i32_e32 v19, 0, v12
-; GFX9-NEXT: v_max_i32_e32 v17, v17, v28
-; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19
-; GFX9-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX9-NEXT: v_add_u32_e32 v12, v12, v17
-; GFX9-NEXT: v_min_i32_e32 v17, 0, v13
-; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17
-; GFX9-NEXT: v_max_i32_e32 v19, 0, v13
-; GFX9-NEXT: v_max_i32_e32 v17, v17, v29
-; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19
-; GFX9-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX9-NEXT: v_add_u32_e32 v13, v13, v17
-; GFX9-NEXT: v_min_i32_e32 v17, 0, v14
-; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17
-; GFX9-NEXT: v_max_i32_e32 v19, 0, v14
-; GFX9-NEXT: v_max_i32_e32 v17, v17, v30
-; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19
-; GFX9-NEXT: v_min_i32_e32 v17, v17, v19
-; GFX9-NEXT: v_add_u32_e32 v14, v14, v17
-; GFX9-NEXT: v_max_i32_e32 v17, 0, v15
-; GFX9-NEXT: v_sub_u32_e32 v17, v18, v17
-; GFX9-NEXT: v_min_i32_e32 v18, 0, v15
-; GFX9-NEXT: v_sub_u32_e32 v16, v16, v18
-; GFX9-NEXT: v_max_i32_e32 v16, v16, v31
-; GFX9-NEXT: v_min_i32_e32 v16, v16, v17
-; GFX9-NEXT: v_add_u32_e32 v15, v15, v16
+; GFX9-NEXT: v_add_i32 v0, v0, v16 clamp
+; GFX9-NEXT: v_add_i32 v1, v1, v17 clamp
+; GFX9-NEXT: v_add_i32 v2, v2, v18 clamp
+; GFX9-NEXT: v_add_i32 v3, v3, v19 clamp
+; GFX9-NEXT: v_add_i32 v4, v4, v20 clamp
+; GFX9-NEXT: v_add_i32 v5, v5, v21 clamp
+; GFX9-NEXT: v_add_i32 v6, v6, v22 clamp
+; GFX9-NEXT: v_add_i32 v7, v7, v23 clamp
+; GFX9-NEXT: v_add_i32 v8, v8, v24 clamp
+; GFX9-NEXT: v_add_i32 v9, v9, v25 clamp
+; GFX9-NEXT: v_add_i32 v10, v10, v26 clamp
+; GFX9-NEXT: v_add_i32 v11, v11, v27 clamp
+; GFX9-NEXT: v_add_i32 v12, v12, v28 clamp
+; GFX9-NEXT: v_add_i32 v13, v13, v29 clamp
+; GFX9-NEXT: v_add_i32 v14, v14, v30 clamp
+; GFX9-NEXT: v_add_i32 v15, v15, v31 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_saddsat_v16i32:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_min_i32_e32 v32, 0, v0
-; GFX10-NEXT: s_brev_b32 s4, 1
-; GFX10-NEXT: v_max_i32_e32 v33, 0, v0
-; GFX10-NEXT: s_brev_b32 s5, -2
-; GFX10-NEXT: v_min_i32_e32 v36, 0, v2
-; GFX10-NEXT: v_sub_nc_u32_e32 v35, s4, v32
-; GFX10-NEXT: v_min_i32_e32 v32, 0, v1
-; GFX10-NEXT: v_sub_nc_u32_e32 v33, s5, v33
-; GFX10-NEXT: v_max_i32_e32 v37, 0, v1
-; GFX10-NEXT: v_sub_nc_u32_e32 v36, s4, v36
-; GFX10-NEXT: v_max_i32_e32 v16, v35, v16
-; GFX10-NEXT: v_sub_nc_u32_e32 v32, s4, v32
-; GFX10-NEXT: v_bfrev_b32_e32 v35, 1
-; GFX10-NEXT: v_min_i32_e32 v38, 0, v3
-; GFX10-NEXT: v_max_i32_e32 v18, v36, v18
-; GFX10-NEXT: v_min_i32_e32 v16, v16, v33
-; GFX10-NEXT: v_max_i32_e32 v33, 0, v2
-; GFX10-NEXT: v_max_i32_e32 v39, v32, v17
-; GFX10-NEXT: v_sub_nc_u32_e32 v36, v35, v38
-; GFX10-NEXT: v_sub_nc_u32_e32 v37, s5, v37
-; GFX10-NEXT: v_bfrev_b32_e32 v34, -2
-; GFX10-NEXT: v_sub_nc_u32_e32 v32, s5, v33
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v16
-; GFX10-NEXT: v_max_i32_e32 v33, 0, v3
-; GFX10-NEXT: v_min_i32_e32 v39, v39, v37
-; GFX10-NEXT: v_max_i32_e32 v19, v36, v19
-; GFX10-NEXT: v_min_i32_e32 v16, v18, v32
-; GFX10-NEXT: v_min_i32_e32 v32, 0, v6
-; GFX10-NEXT: v_sub_nc_u32_e32 v18, v34, v33
-; GFX10-NEXT: v_min_i32_e32 v38, 0, v5
-; GFX10-NEXT: v_max_i32_e32 v17, 0, v4
-; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v16
-; GFX10-NEXT: v_min_i32_e32 v16, 0, v4
-; GFX10-NEXT: v_min_i32_e32 v18, v19, v18
-; GFX10-NEXT: v_sub_nc_u32_e32 v19, v35, v38
-; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v39
-; GFX10-NEXT: v_sub_nc_u32_e32 v32, v35, v32
-; GFX10-NEXT: v_sub_nc_u32_e32 v39, v35, v16
-; GFX10-NEXT: v_max_i32_e32 v33, 0, v5
-; GFX10-NEXT: v_max_i32_e32 v36, 0, v6
-; GFX10-NEXT: v_max_i32_e32 v19, v19, v21
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v18
-; GFX10-NEXT: v_max_i32_e32 v16, v39, v20
-; GFX10-NEXT: v_sub_nc_u32_e32 v17, v34, v17
-; GFX10-NEXT: v_sub_nc_u32_e32 v20, v34, v33
-; GFX10-NEXT: v_sub_nc_u32_e32 v21, v34, v36
-; GFX10-NEXT: v_max_i32_e32 v22, v32, v22
-; GFX10-NEXT: v_min_i32_e32 v18, 0, v7
-; GFX10-NEXT: v_min_i32_e32 v39, v16, v17
-; GFX10-NEXT: v_min_i32_e32 v38, v19, v20
-; GFX10-NEXT: v_max_i32_e32 v16, 0, v7
-; GFX10-NEXT: v_min_i32_e32 v19, v22, v21
-; GFX10-NEXT: v_sub_nc_u32_e32 v17, v35, v18
-; GFX10-NEXT: v_min_i32_e32 v18, 0, v8
-; GFX10-NEXT: v_min_i32_e32 v20, 0, v9
-; GFX10-NEXT: v_sub_nc_u32_e32 v16, v34, v16
-; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v19
-; GFX10-NEXT: v_max_i32_e32 v19, 0, v8
-; GFX10-NEXT: v_max_i32_e32 v17, v17, v23
-; GFX10-NEXT: v_sub_nc_u32_e32 v18, v35, v18
-; GFX10-NEXT: v_min_i32_e32 v22, 0, v10
-; GFX10-NEXT: v_max_i32_e32 v21, 0, v9
-; GFX10-NEXT: v_sub_nc_u32_e32 v20, v35, v20
-; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v39
-; GFX10-NEXT: v_max_i32_e32 v18, v18, v24
-; GFX10-NEXT: v_sub_nc_u32_e32 v39, v35, v22
-; GFX10-NEXT: v_min_i32_e32 v16, v17, v16
-; GFX10-NEXT: v_sub_nc_u32_e32 v19, v34, v19
-; GFX10-NEXT: v_max_i32_e32 v23, 0, v10
-; GFX10-NEXT: v_max_i32_e32 v20, v20, v25
-; GFX10-NEXT: v_sub_nc_u32_e32 v21, v34, v21
-; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v16
-; GFX10-NEXT: v_min_i32_e32 v17, v18, v19
-; GFX10-NEXT: v_min_i32_e32 v16, 0, v11
-; GFX10-NEXT: v_sub_nc_u32_e32 v18, v34, v23
-; GFX10-NEXT: v_max_i32_e32 v19, v39, v26
-; GFX10-NEXT: v_min_i32_e32 v22, 0, v12
-; GFX10-NEXT: v_min_i32_e32 v20, v20, v21
-; GFX10-NEXT: v_sub_nc_u32_e32 v16, v35, v16
-; GFX10-NEXT: v_min_i32_e32 v26, 0, v15
-; GFX10-NEXT: v_add_nc_u32_e32 v8, v8, v17
-; GFX10-NEXT: v_min_i32_e32 v17, v19, v18
-; GFX10-NEXT: v_sub_nc_u32_e32 v19, v35, v22
-; GFX10-NEXT: v_min_i32_e32 v22, 0, v14
-; GFX10-NEXT: v_min_i32_e32 v21, 0, v13
-; GFX10-NEXT: v_max_i32_e32 v24, 0, v14
-; GFX10-NEXT: v_max_i32_e32 v25, 0, v15
-; GFX10-NEXT: v_add_nc_u32_e32 v9, v9, v20
-; GFX10-NEXT: v_max_i32_e32 v20, 0, v13
-; GFX10-NEXT: v_sub_nc_u32_e32 v39, v35, v22
-; GFX10-NEXT: v_max_i32_e32 v23, 0, v11
-; GFX10-NEXT: v_add_nc_u32_e32 v10, v10, v17
-; GFX10-NEXT: v_max_i32_e32 v17, 0, v12
-; GFX10-NEXT: v_max_i32_e32 v16, v16, v27
-; GFX10-NEXT: v_sub_nc_u32_e32 v27, v35, v21
-; GFX10-NEXT: v_sub_nc_u32_e32 v26, v35, v26
-; GFX10-NEXT: v_sub_nc_u32_e32 v18, v34, v23
-; GFX10-NEXT: v_sub_nc_u32_e32 v17, v34, v17
-; GFX10-NEXT: v_max_i32_e32 v19, v19, v28
-; GFX10-NEXT: v_sub_nc_u32_e32 v20, v34, v20
-; GFX10-NEXT: v_max_i32_e32 v21, v27, v29
-; GFX10-NEXT: v_sub_nc_u32_e32 v24, v34, v24
-; GFX10-NEXT: v_max_i32_e32 v22, v39, v30
-; GFX10-NEXT: v_sub_nc_u32_e32 v25, v34, v25
-; GFX10-NEXT: v_max_i32_e32 v23, v26, v31
-; GFX10-NEXT: v_min_i32_e32 v16, v16, v18
-; GFX10-NEXT: v_min_i32_e32 v17, v19, v17
-; GFX10-NEXT: v_min_i32_e32 v18, v21, v20
-; GFX10-NEXT: v_min_i32_e32 v19, v22, v24
-; GFX10-NEXT: v_min_i32_e32 v20, v23, v25
-; GFX10-NEXT: v_add_nc_u32_e32 v5, v5, v38
-; GFX10-NEXT: v_add_nc_u32_e32 v11, v11, v16
-; GFX10-NEXT: v_add_nc_u32_e32 v12, v12, v17
-; GFX10-NEXT: v_add_nc_u32_e32 v13, v13, v18
-; GFX10-NEXT: v_add_nc_u32_e32 v14, v14, v19
-; GFX10-NEXT: v_add_nc_u32_e32 v15, v15, v20
+; GFX10-NEXT: v_add_nc_i32 v0, v0, v16 clamp
+; GFX10-NEXT: v_add_nc_i32 v1, v1, v17 clamp
+; GFX10-NEXT: v_add_nc_i32 v2, v2, v18 clamp
+; GFX10-NEXT: v_add_nc_i32 v3, v3, v19 clamp
+; GFX10-NEXT: v_add_nc_i32 v4, v4, v20 clamp
+; GFX10-NEXT: v_add_nc_i32 v5, v5, v21 clamp
+; GFX10-NEXT: v_add_nc_i32 v6, v6, v22 clamp
+; GFX10-NEXT: v_add_nc_i32 v7, v7, v23 clamp
+; GFX10-NEXT: v_add_nc_i32 v8, v8, v24 clamp
+; GFX10-NEXT: v_add_nc_i32 v9, v9, v25 clamp
+; GFX10-NEXT: v_add_nc_i32 v10, v10, v26 clamp
+; GFX10-NEXT: v_add_nc_i32 v11, v11, v27 clamp
+; GFX10-NEXT: v_add_nc_i32 v12, v12, v28 clamp
+; GFX10-NEXT: v_add_nc_i32 v13, v13, v29 clamp
+; GFX10-NEXT: v_add_nc_i32 v14, v14, v30 clamp
+; GFX10-NEXT: v_add_nc_i32 v15, v15, v31 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
@@ -3844,367 +2734,91 @@ define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32>
 ;
 ; GFX9-LABEL: s_saddsat_v16i32:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_cmp_gt_i32 s0, 0
-; GFX9-NEXT: s_brev_b32 s32, -2
-; GFX9-NEXT: s_cselect_b32 s34, s0, 0
-; GFX9-NEXT: s_sub_i32 s34, s32, s34
-; GFX9-NEXT: s_cmp_lt_i32 s0, 0
-; GFX9-NEXT: s_brev_b32 s33, 1
-; GFX9-NEXT: s_cselect_b32 s35, s0, 0
-; GFX9-NEXT: s_sub_i32 s35, s33, s35
-; GFX9-NEXT: s_cmp_gt_i32 s35, s16
-; GFX9-NEXT: s_cselect_b32 s16, s35, s16
-; GFX9-NEXT: s_cmp_lt_i32 s16, s34
-; GFX9-NEXT: s_cselect_b32 s16, s16, s34
-; GFX9-NEXT: s_add_i32 s0, s0, s16
-; GFX9-NEXT: s_cmp_gt_i32 s1, 0
-; GFX9-NEXT: s_cselect_b32 s16, s1, 0
-; GFX9-NEXT: s_sub_i32 s16, s32, s16
-; GFX9-NEXT: s_cmp_lt_i32 s1, 0
-; GFX9-NEXT: s_cselect_b32 s34, s1, 0
-; GFX9-NEXT: s_sub_i32 s34, s33, s34
-; GFX9-NEXT: s_cmp_gt_i32 s34, s17
-; GFX9-NEXT: s_cselect_b32 s17, s34, s17
-; GFX9-NEXT: s_cmp_lt_i32 s17, s16
-; GFX9-NEXT: s_cselect_b32 s16, s17, s16
-; GFX9-NEXT: s_add_i32 s1, s1, s16
-; GFX9-NEXT: s_cmp_gt_i32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s16, s2, 0
-; GFX9-NEXT: s_sub_i32 s16, s32, s16
-; GFX9-NEXT: s_cmp_lt_i32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s17, s2, 0
-; GFX9-NEXT: s_sub_i32 s17, s33, s17
-; GFX9-NEXT: s_cmp_gt_i32 s17, s18
-; GFX9-NEXT: s_cselect_b32 s17, s17, s18
-; GFX9-NEXT: s_cmp_lt_i32 s17, s16
-; GFX9-NEXT: s_cselect_b32 s16, s17, s16
-; GFX9-NEXT: s_add_i32 s2, s2, s16
-; GFX9-NEXT: s_cmp_gt_i32 s3, 0
-; GFX9-NEXT: s_cselect_b32 s16, s3, 0
-; GFX9-NEXT: s_sub_i32 s16, s32, s16
-; GFX9-NEXT: s_cmp_lt_i32 s3, 0
-; GFX9-NEXT: s_cselect_b32 s17, s3, 0
-; GFX9-NEXT: s_sub_i32 s17, s33, s17
-; GFX9-NEXT: s_cmp_gt_i32 s17, s19
-; GFX9-NEXT: s_cselect_b32 s17, s17, s19
-; GFX9-NEXT: s_cmp_lt_i32 s17, s16
-; GFX9-NEXT: s_cselect_b32 s16, s17, s16
-; GFX9-NEXT: s_add_i32 s3, s3, s16
-; GFX9-NEXT: s_cmp_gt_i32 s4, 0
-; GFX9-NEXT: s_cselect_b32 s16, s4, 0
-; GFX9-NEXT: s_sub_i32 s16, s32, s16
-; GFX9-NEXT: s_cmp_lt_i32 s4, 0
-; GFX9-NEXT: s_cselect_b32 s17, s4, 0
-; GFX9-NEXT: s_sub_i32 s17, s33, s17
-; GFX9-NEXT: s_cmp_gt_i32 s17, s20
-; GFX9-NEXT: s_cselect_b32 s17, s17, s20
-; GFX9-NEXT: s_cmp_lt_i32 s17, s16
-; GFX9-NEXT: s_cselect_b32 s16, s17, s16
-; GFX9-NEXT: s_add_i32 s4, s4, s16
-; GFX9-NEXT: s_cmp_gt_i32 s5, 0
-; GFX9-NEXT: s_cselect_b32 s16, s5, 0
-; GFX9-NEXT: s_sub_i32 s16, s32, s16
-; GFX9-NEXT: s_cmp_lt_i32 s5, 0
-; GFX9-NEXT: s_cselect_b32 s17, s5, 0
-; GFX9-NEXT: s_sub_i32 s17, s33, s17
-; GFX9-NEXT: s_cmp_gt_i32 s17, s21
-; GFX9-NEXT: s_cselect_b32 s17, s17, s21
-; GFX9-NEXT: s_cmp_lt_i32 s17, s16
-; GFX9-NEXT: s_cselect_b32 s16, s17, s16
-; GFX9-NEXT: s_add_i32 s5, s5, s16
-; GFX9-NEXT: s_cmp_gt_i32 s6, 0
-; GFX9-NEXT: s_cselect_b32 s16, s6, 0
-; GFX9-NEXT: s_sub_i32 s16, s32, s16
-; GFX9-NEXT: s_cmp_lt_i32 s6, 0
-; GFX9-NEXT: s_cselect_b32 s17, s6, 0
-; GFX9-NEXT: s_sub_i32 s17, s33, s17
-; GFX9-NEXT: s_cmp_gt_i32 s17, s22
-; GFX9-NEXT: s_cselect_b32 s17, s17, s22
-; GFX9-NEXT: s_cmp_lt_i32 s17, s16
-; GFX9-NEXT: s_cselect_b32 s16, s17, s16
-; GFX9-NEXT: s_add_i32 s6, s6, s16
-; GFX9-NEXT: s_cmp_gt_i32 s7, 0
-; GFX9-NEXT: s_cselect_b32 s16, s7, 0
-; GFX9-NEXT: s_sub_i32 s16, s32, s16
-; GFX9-NEXT: s_cmp_lt_i32 s7, 0
-; GFX9-NEXT: s_cselect_b32 s17, s7, 0
-; GFX9-NEXT: s_sub_i32 s17, s33, s17
-; GFX9-NEXT: s_cmp_gt_i32 s17, s23
-; GFX9-NEXT: s_cselect_b32 s17, s17, s23
-; GFX9-NEXT: s_cmp_lt_i32 s17, s16
-; GFX9-NEXT: s_cselect_b32 s16, s17, s16
-; GFX9-NEXT: s_add_i32 s7, s7, s16
-; GFX9-NEXT: s_cmp_gt_i32 s8, 0
-; GFX9-NEXT: s_cselect_b32 s16, s8, 0
-; GFX9-NEXT: s_sub_i32 s16, s32, s16
-; GFX9-NEXT: s_cmp_lt_i32 s8, 0
-; GFX9-NEXT: s_cselect_b32 s17, s8, 0
-; GFX9-NEXT: s_sub_i32 s17, s33, s17
-; GFX9-NEXT: s_cmp_gt_i32 s17, s24
-; GFX9-NEXT: s_cselect_b32 s17, s17, s24
-; GFX9-NEXT: s_cmp_lt_i32 s17, s16
-; GFX9-NEXT: s_cselect_b32 s16, s17, s16
-; GFX9-NEXT: s_add_i32 s8, s8, s16
-; GFX9-NEXT: s_cmp_gt_i32 s9, 0
-; GFX9-NEXT: s_cselect_b32 s16, s9, 0
-; GFX9-NEXT: s_sub_i32 s16, s32, s16
-; GFX9-NEXT: s_cmp_lt_i32 s9, 0
-; GFX9-NEXT: s_cselect_b32 s17, s9, 0
-; GFX9-NEXT: s_sub_i32 s17, s33, s17
-; GFX9-NEXT: s_cmp_gt_i32 s17, s25
-; GFX9-NEXT: s_cselect_b32 s17, s17, s25
-; GFX9-NEXT: s_cmp_lt_i32 s17, s16
-; GFX9-NEXT: s_cselect_b32 s16, s17, s16
-; GFX9-NEXT: s_add_i32 s9, s9, s16
-; GFX9-NEXT: s_cmp_gt_i32 s10, 0
-; GFX9-NEXT: s_cselect_b32 s16, s10, 0
-; GFX9-NEXT: s_sub_i32 s16, s32, s16
-; GFX9-NEXT: s_cmp_lt_i32 s10, 0
-; GFX9-NEXT: s_cselect_b32 s17, s10, 0
-; GFX9-NEXT: s_sub_i32 s17, s33, s17
-; GFX9-NEXT: s_cmp_gt_i32 s17, s26
-; GFX9-NEXT: s_cselect_b32 s17, s17, s26
-; GFX9-NEXT: s_cmp_lt_i32 s17, s16
-; GFX9-NEXT: s_cselect_b32 s16, s17, s16
-; GFX9-NEXT: s_add_i32 s10, s10, s16
-; GFX9-NEXT: s_cmp_gt_i32 s11, 0
-; GFX9-NEXT: s_cselect_b32 s16, s11, 0
-; GFX9-NEXT: s_sub_i32 s16, s32, s16
-; GFX9-NEXT: s_cmp_lt_i32 s11, 0
-; GFX9-NEXT: s_cselect_b32 s17, s11, 0
-; GFX9-NEXT: s_sub_i32 s17, s33, s17
-; GFX9-NEXT: s_cmp_gt_i32 s17, s27
-; GFX9-NEXT: s_cselect_b32 s17, s17, s27
-; GFX9-NEXT: s_cmp_lt_i32 s17, s16
-; GFX9-NEXT: s_cselect_b32 s16, s17, s16
-; GFX9-NEXT: s_add_i32 s11, s11, s16
-; GFX9-NEXT: s_cmp_gt_i32 s12, 0
-; GFX9-NEXT: s_cselect_b32 s16, s12, 0
-; GFX9-NEXT: s_sub_i32 s16, s32, s16
-; GFX9-NEXT: s_cmp_lt_i32 s12, 0
-; GFX9-NEXT: s_cselect_b32 s17, s12, 0
-; GFX9-NEXT: s_sub_i32 s17, s33, s17
-; GFX9-NEXT: s_cmp_gt_i32 s17, s28
-; GFX9-NEXT: s_cselect_b32 s17, s17, s28
-; GFX9-NEXT: s_cmp_lt_i32 s17, s16
-; GFX9-NEXT: s_cselect_b32 s16, s17, s16
-; GFX9-NEXT: s_add_i32 s12, s12, s16
-; GFX9-NEXT: s_cmp_gt_i32 s13, 0
-; GFX9-NEXT: s_cselect_b32 s16, s13, 0
-; GFX9-NEXT: s_sub_i32 s16, s32, s16
-; GFX9-NEXT: s_cmp_lt_i32 s13, 0
-; GFX9-NEXT: s_cselect_b32 s17, s13, 0
-; GFX9-NEXT: s_sub_i32 s17, s33, s17
-; GFX9-NEXT: s_cmp_gt_i32 s17, s29
-; GFX9-NEXT: s_cselect_b32 s17, s17, s29
-; GFX9-NEXT: s_cmp_lt_i32 s17, s16
-; GFX9-NEXT: s_cselect_b32 s16, s17, s16
-; GFX9-NEXT: s_add_i32 s13, s13, s16
-; GFX9-NEXT: s_cmp_gt_i32 s14, 0
-; GFX9-NEXT: s_cselect_b32 s16, s14, 0
-; GFX9-NEXT: s_sub_i32 s16, s32, s16
-; GFX9-NEXT: s_cmp_lt_i32 s14, 0
-; GFX9-NEXT: s_cselect_b32 s17, s14, 0
-; GFX9-NEXT: s_sub_i32 s17, s33, s17
-; GFX9-NEXT: s_cmp_gt_i32 s17, s30
-; GFX9-NEXT: s_cselect_b32 s17, s17, s30
-; GFX9-NEXT: s_cmp_lt_i32 s17, s16
-; GFX9-NEXT: s_cselect_b32 s16, s17, s16
-; GFX9-NEXT: s_add_i32 s14, s14, s16
-; GFX9-NEXT: s_cmp_gt_i32 s15, 0
-; GFX9-NEXT: s_cselect_b32 s16, s15, 0
-; GFX9-NEXT: s_sub_i32 s16, s32, s16
-; GFX9-NEXT: s_cmp_lt_i32 s15, 0
-; GFX9-NEXT: s_cselect_b32 s17, s15, 0
-; GFX9-NEXT: s_sub_i32 s17, s33, s17
-; GFX9-NEXT: s_cmp_gt_i32 s17, s31
-; GFX9-NEXT: s_cselect_b32 s17, s17, s31
-; GFX9-NEXT: s_cmp_lt_i32 s17, s16
-; GFX9-NEXT: s_cselect_b32 s16, s17, s16
-; GFX9-NEXT: s_add_i32 s15, s15, s16
+; GFX9-NEXT: v_mov_b32_e32 v0, s16
+; GFX9-NEXT: v_mov_b32_e32 v1, s17
+; GFX9-NEXT: v_mov_b32_e32 v2, s18
+; GFX9-NEXT: v_mov_b32_e32 v3, s19
+; GFX9-NEXT: v_mov_b32_e32 v4, s20
+; GFX9-NEXT: v_mov_b32_e32 v5, s21
+; GFX9-NEXT: v_mov_b32_e32 v6, s22
+; GFX9-NEXT: v_mov_b32_e32 v7, s23
+; GFX9-NEXT: v_mov_b32_e32 v8, s24
+; GFX9-NEXT: v_mov_b32_e32 v9, s25
+; GFX9-NEXT: v_mov_b32_e32 v10, s26
+; GFX9-NEXT: v_mov_b32_e32 v11, s27
+; GFX9-NEXT: v_mov_b32_e32 v12, s28
+; GFX9-NEXT: v_mov_b32_e32 v13, s29
+; GFX9-NEXT: v_mov_b32_e32 v14, s30
+; GFX9-NEXT: v_mov_b32_e32 v15, s31
+; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp
+; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp
+; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp
+; GFX9-NEXT: v_add_i32 v3, s3, v3 clamp
+; GFX9-NEXT: v_add_i32 v4, s4, v4 clamp
+; GFX9-NEXT: v_add_i32 v5, s5, v5 clamp
+; GFX9-NEXT: v_add_i32 v6, s6, v6 clamp
+; GFX9-NEXT: v_add_i32 v7, s7, v7 clamp
+; GFX9-NEXT: v_add_i32 v8, s8, v8 clamp
+; GFX9-NEXT: v_add_i32 v9, s9, v9 clamp
+; GFX9-NEXT: v_add_i32 v10, s10, v10 clamp
+; GFX9-NEXT: v_add_i32 v11, s11, v11 clamp
+; GFX9-NEXT: v_add_i32 v12, s12, v12 clamp
+; GFX9-NEXT: v_add_i32 v13, s13, v13 clamp
+; GFX9-NEXT: v_add_i32 v14, s14, v14 clamp
+; GFX9-NEXT: v_add_i32 v15, s15, v15 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
+; GFX9-NEXT: v_readfirstlane_b32 s4, v4
+; GFX9-NEXT: v_readfirstlane_b32 s5, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
+; GFX9-NEXT: v_readfirstlane_b32 s7, v7
+; GFX9-NEXT: v_readfirstlane_b32 s8, v8
+; GFX9-NEXT: v_readfirstlane_b32 s9, v9
+; GFX9-NEXT: v_readfirstlane_b32 s10, v10
+; GFX9-NEXT: v_readfirstlane_b32 s11, v11
+; GFX9-NEXT: v_readfirstlane_b32 s12, v12
+; GFX9-NEXT: v_readfirstlane_b32 s13, v13
+; GFX9-NEXT: v_readfirstlane_b32 s14, v14
+; GFX9-NEXT: v_readfirstlane_b32 s15, v15
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_saddsat_v16i32:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_cmp_gt_i32 s0, 0
-; GFX10-NEXT: s_brev_b32 s32, -2
-; GFX10-NEXT: s_cselect_b32 s33, s0, 0
-; GFX10-NEXT: s_brev_b32 s34, 1
-; GFX10-NEXT: s_sub_i32 s46, s32, s33
-; GFX10-NEXT: s_cmp_lt_i32 s0, 0
+; GFX10-NEXT: v_add_nc_i32 v0, s0, s16 clamp
+; GFX10-NEXT: v_add_nc_i32 v1, s1, s17 clamp
+; GFX10-NEXT: v_add_nc_i32 v2, s2, s18 clamp
+; GFX10-NEXT: v_add_nc_i32 v3, s3, s19 clamp
+; GFX10-NEXT: v_add_nc_i32 v4, s4, s20 clamp
+; GFX10-NEXT: v_add_nc_i32 v5, s5, s21 clamp
+; GFX10-NEXT: v_add_nc_i32 v6, s6, s22 clamp
+; GFX10-NEXT: v_add_nc_i32 v7, s7, s23 clamp
+; GFX10-NEXT: v_add_nc_i32 v8, s8, s24 clamp
+; GFX10-NEXT: v_add_nc_i32 v9, s9, s25 clamp
+; GFX10-NEXT: v_add_nc_i32 v10, s10, s26 clamp
+; GFX10-NEXT: v_add_nc_i32 v11, s11, s27 clamp
+; GFX10-NEXT: v_add_nc_i32 v12, s12, s28 clamp
+; GFX10-NEXT: v_add_nc_i32 v13, s13, s29 clamp
+; GFX10-NEXT: v_add_nc_i32 v14, s14, s30 clamp
+; GFX10-NEXT: v_add_nc_i32 v15, s15, s31 clamp
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
+; GFX10-NEXT: v_readfirstlane_b32 s4, v4
+; GFX10-NEXT: v_readfirstlane_b32 s5, v5
+; GFX10-NEXT: v_readfirstlane_b32 s6, v6
+; GFX10-NEXT: v_readfirstlane_b32 s7, v7
+; GFX10-NEXT: v_readfirstlane_b32 s8, v8
+; GFX10-NEXT: v_readfirstlane_b32 s9, v9
+; GFX10-NEXT: v_readfirstlane_b32 s10, v10
+; GFX10-NEXT: v_readfirstlane_b32 s11, v11
+; GFX10-NEXT: v_readfirstlane_b32 s12, v12
+; GFX10-NEXT: v_readfirstlane_b32 s13, v13
+; GFX10-NEXT: v_readfirstlane_b32 s14, v14
+; GFX10-NEXT: v_readfirstlane_b32 s15, v15
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s35, s0, 0
-; GFX10-NEXT: s_sub_i32 s35, s34, s35
-; GFX10-NEXT: s_cmp_gt_i32 s35, s16
-; GFX10-NEXT: s_cselect_b32 s16, s35, s16
-; GFX10-NEXT: s_cmp_lt_i32 s16, s46
-; GFX10-NEXT: s_cselect_b32 s46, s16, s46
-; GFX10-NEXT: s_add_i32 s0, s0, s46
-; GFX10-NEXT: s_cmp_gt_i32 s1, 0
-; GFX10-NEXT: s_cselect_b32 s46, s1, 0
-; GFX10-NEXT: s_sub_i32 s16, s32, s46
-; GFX10-NEXT: s_cmp_lt_i32 s1, 0
-; GFX10-NEXT: s_cselect_b32 s33, s1, 0
-; GFX10-NEXT: s_sub_i32 s46, s34, s33
-; GFX10-NEXT: s_cmp_gt_i32 s46, s17
-; GFX10-NEXT: s_cselect_b32 s17, s46, s17
-; GFX10-NEXT: s_cmp_lt_i32 s17, s16
-; GFX10-NEXT: s_cselect_b32 s46, s17, s16
-; GFX10-NEXT: s_add_i32 s1, s1, s46
-; GFX10-NEXT: s_cmp_gt_i32 s2, 0
-; GFX10-NEXT: s_cselect_b32 s46, s2, 0
-; GFX10-NEXT: s_sub_i32 s16, s32, s46
-; GFX10-NEXT: s_cmp_lt_i32 s2, 0
-; GFX10-NEXT: s_cselect_b32 s17, s2, 0
-; GFX10-NEXT: s_sub_i32 s17, s34, s17
-; GFX10-NEXT: s_cmp_gt_i32 s17, s18
-; GFX10-NEXT: s_cselect_b32 s17, s17, s18
-; GFX10-NEXT: s_cmp_lt_i32 s17, s16
-; GFX10-NEXT: s_cselect_b32 s16, s17, s16
-; GFX10-NEXT: s_add_i32 s2, s2, s16
-; GFX10-NEXT: s_cmp_gt_i32 s3, 0
-; GFX10-NEXT: s_cselect_b32 s46, s3, 0
-; GFX10-NEXT: s_sub_i32 s16, s32, s46
-; GFX10-NEXT: s_cmp_lt_i32 s3, 0
-; GFX10-NEXT: s_cselect_b32 s17, s3, 0
-; GFX10-NEXT: s_sub_i32 s17, s34, s17
-; GFX10-NEXT: s_cmp_gt_i32 s17, s19
-; GFX10-NEXT: s_cselect_b32 s17, s17, s19
-; GFX10-NEXT: s_cmp_lt_i32 s17, s16
-; GFX10-NEXT: s_cselect_b32 s16, s17, s16
-; GFX10-NEXT: s_add_i32 s3, s3, s16
-; GFX10-NEXT: s_cmp_gt_i32 s4, 0
-; GFX10-NEXT: s_cselect_b32 s46, s4, 0
-; GFX10-NEXT: s_sub_i32 s16, s32, s46
-; GFX10-NEXT: s_cmp_lt_i32 s4, 0
-; GFX10-NEXT: s_cselect_b32 s17, s4, 0
-; GFX10-NEXT: s_sub_i32 s17, s34, s17
-; GFX10-NEXT: s_cmp_gt_i32 s17, s20
-; GFX10-NEXT: s_cselect_b32 s17, s17, s20
-; GFX10-NEXT: s_cmp_lt_i32 s17, s16
-; GFX10-NEXT: s_cselect_b32 s16, s17, s16
-; GFX10-NEXT: s_add_i32 s4, s4, s16
-; GFX10-NEXT: s_cmp_gt_i32 s5, 0
-; GFX10-NEXT: s_cselect_b32 s46, s5, 0
-; GFX10-NEXT: s_sub_i32 s16, s32, s46
-; GFX10-NEXT: s_cmp_lt_i32 s5, 0
-; GFX10-NEXT: s_cselect_b32 s17, s5, 0
-; GFX10-NEXT: s_sub_i32 s17, s34, s17
-; GFX10-NEXT: s_cmp_gt_i32 s17, s21
-; GFX10-NEXT: s_cselect_b32 s17, s17, s21
-; GFX10-NEXT: s_cmp_lt_i32 s17, s16
-; GFX10-NEXT: s_cselect_b32 s16, s17, s16
-; GFX10-NEXT: s_add_i32 s5, s5, s16
-; GFX10-NEXT: s_cmp_gt_i32 s6, 0
-; GFX10-NEXT: s_cselect_b32 s46, s6, 0
-; GFX10-NEXT: s_sub_i32 s16, s32, s46
-; GFX10-NEXT: s_cmp_lt_i32 s6, 0
-; GFX10-NEXT: s_cselect_b32 s17, s6, 0
-; GFX10-NEXT: s_sub_i32 s17, s34, s17
-; GFX10-NEXT: s_cmp_gt_i32 s17, s22
-; GFX10-NEXT: s_cselect_b32 s17, s17, s22
-; GFX10-NEXT: s_cmp_lt_i32 s17, s16
-; GFX10-NEXT: s_cselect_b32 s16, s17, s16
-; GFX10-NEXT: s_add_i32 s6, s6, s16
-; GFX10-NEXT: s_cmp_gt_i32 s7, 0
-; GFX10-NEXT: s_cselect_b32 s46, s7, 0
-; GFX10-NEXT: s_sub_i32 s16, s32, s46
-; GFX10-NEXT: s_cmp_lt_i32 s7, 0
-; GFX10-NEXT: s_cselect_b32 s17, s7, 0
-; GFX10-NEXT: s_sub_i32 s17, s34, s17
-; GFX10-NEXT: s_cmp_gt_i32 s17, s23
-; GFX10-NEXT: s_cselect_b32 s17, s17, s23
-; GFX10-NEXT: s_cmp_lt_i32 s17, s16
-; GFX10-NEXT: s_cselect_b32 s16, s17, s16
-; GFX10-NEXT: s_add_i32 s7, s7, s16
-; GFX10-NEXT: s_cmp_gt_i32 s8, 0
-; GFX10-NEXT: s_cselect_b32 s46, s8, 0
-; GFX10-NEXT: s_sub_i32 s16, s32, s46
-; GFX10-NEXT: s_cmp_lt_i32 s8, 0
-; GFX10-NEXT: s_cselect_b32 s17, s8, 0
-; GFX10-NEXT: s_sub_i32 s17, s34, s17
-; GFX10-NEXT: s_cmp_gt_i32 s17, s24
-; GFX10-NEXT: s_cselect_b32 s17, s17, s24
-; GFX10-NEXT: s_cmp_lt_i32 s17, s16
-; GFX10-NEXT: s_cselect_b32 s16, s17, s16
-; GFX10-NEXT: s_add_i32 s8, s8, s16
-; GFX10-NEXT:
s_cmp_gt_i32 s9, 0 -; GFX10-NEXT: s_cselect_b32 s46, s9, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s9, 0 -; GFX10-NEXT: s_cselect_b32 s17, s9, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s25 -; GFX10-NEXT: s_cselect_b32 s17, s17, s25 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s9, s9, s16 -; GFX10-NEXT: s_cmp_gt_i32 s10, 0 -; GFX10-NEXT: s_cselect_b32 s46, s10, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s10, 0 -; GFX10-NEXT: s_cselect_b32 s17, s10, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s26 -; GFX10-NEXT: s_cselect_b32 s17, s17, s26 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s10, s10, s16 -; GFX10-NEXT: s_cmp_gt_i32 s11, 0 -; GFX10-NEXT: s_cselect_b32 s46, s11, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s11, 0 -; GFX10-NEXT: s_cselect_b32 s17, s11, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s27 -; GFX10-NEXT: s_cselect_b32 s17, s17, s27 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s11, s11, s16 -; GFX10-NEXT: s_cmp_gt_i32 s12, 0 -; GFX10-NEXT: s_cselect_b32 s46, s12, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s12, 0 -; GFX10-NEXT: s_cselect_b32 s17, s12, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s28 -; GFX10-NEXT: s_cselect_b32 s17, s17, s28 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s12, s12, s16 -; GFX10-NEXT: s_cmp_gt_i32 s13, 0 -; GFX10-NEXT: s_cselect_b32 s46, s13, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s13, 0 -; GFX10-NEXT: s_cselect_b32 s17, s13, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s29 -; GFX10-NEXT: s_cselect_b32 s17, s17, s29 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s13, s13, s16 -; GFX10-NEXT: s_cmp_gt_i32 s14, 0 -; GFX10-NEXT: s_cselect_b32 s46, s14, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s14, 0 -; GFX10-NEXT: s_cselect_b32 s17, s14, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s30 -; GFX10-NEXT: s_cselect_b32 s17, s17, s30 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s14, s14, s16 -; GFX10-NEXT: s_cmp_gt_i32 s15, 0 -; GFX10-NEXT: s_cselect_b32 s30, s15, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s30 -; GFX10-NEXT: s_cmp_lt_i32 s15, 0 -; GFX10-NEXT: s_cselect_b32 s17, s15, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s31 -; GFX10-NEXT: s_cselect_b32 s17, s17, s31 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s15, s15, s16 ; GFX10-NEXT: ; return to shader part epilog %result = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result @@ -4241,27 +2855,15 @@ define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) { ; GFX9-LABEL: v_saddsat_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_i16_e32 v3, 0, v0 -; GFX9-NEXT: v_max_i16_e32 v2, 0, v0 -; GFX9-NEXT: v_sub_u16_e32 v3, 0x8000, v3 -; GFX9-NEXT: v_sub_u16_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_max_i16_e32 v1, v3, v1 -; GFX9-NEXT: v_min_i16_e32 v1, v1, v2 -; 
GFX9-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_i16_e64 v2, v0, 0 -; GFX10-NEXT: v_max_i16_e64 v3, v0, 0 +; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u16_e64 v2, 0x8000, v2 -; GFX10-NEXT: v_sub_nc_u16_e64 v3, 0x7fff, v3 -; GFX10-NEXT: v_max_i16_e64 v1, v2, v1 -; GFX10-NEXT: v_min_i16_e64 v1, v1, v3 -; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result @@ -4309,45 +2911,16 @@ define amdgpu_ps i16 @s_saddsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; ; GFX9-LABEL: s_saddsat_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sext_i32_i16 s2, s0 -; GFX9-NEXT: s_sext_i32_i16 s3, 0 -; GFX9-NEXT: s_cmp_gt_i32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s4, s2, s3 -; GFX9-NEXT: s_sub_i32 s4, 0x7fff, s4 -; GFX9-NEXT: s_cmp_lt_i32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s2, 0xffff8000, s2 -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_cmp_gt_i32 s2, s1 -; GFX9-NEXT: s_cselect_b32 s1, s2, s1 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_sext_i32_i16 s2, s4 -; GFX9-NEXT: s_cmp_lt_i32 s1, s2 -; GFX9-NEXT: s_cselect_b32 s1, s1, s2 -; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_saddsat_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sext_i32_i16 s2, 0 -; GFX10-NEXT: s_sext_i32_i16 s3, s0 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cmp_gt_i32 s3, s2 +; GFX10-NEXT: v_add_nc_i16 v0, s0, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s4, s3, s2 -; GFX10-NEXT: s_sub_i32 s4, 0x7fff, s4 -; GFX10-NEXT: s_cmp_lt_i32 s3, s2 -; GFX10-NEXT: s_cselect_b32 s2, s3, s2 -; GFX10-NEXT: s_sub_i32 s2, 0xffff8000, s2 -; GFX10-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-NEXT: s_cmp_gt_i32 s2, s1 -; GFX10-NEXT: s_cselect_b32 s1, s2, s1 -; GFX10-NEXT: s_sext_i32_i16 s2, s4 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cmp_lt_i32 s1, s2 -; GFX10-NEXT: s_cselect_b32 s1, s1, s2 -; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result @@ -4387,33 +2960,13 @@ define amdgpu_ps half @saddsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; ; GFX9-LABEL: saddsat_i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sext_i32_i16 s1, s0 -; GFX9-NEXT: s_sext_i32_i16 s2, 0 -; GFX9-NEXT: s_cmp_gt_i32 s1, s2 -; GFX9-NEXT: s_cselect_b32 s3, s1, s2 -; GFX9-NEXT: s_sub_i32 s3, 0x7fff, s3 -; GFX9-NEXT: s_cmp_lt_i32 s1, s2 -; GFX9-NEXT: s_cselect_b32 s1, s1, s2 -; GFX9-NEXT: s_sub_i32 s1, 0xffff8000, s1 -; GFX9-NEXT: v_max_i16_e32 v0, s1, v0 -; GFX9-NEXT: v_min_i16_e32 v0, s3, v0 -; GFX9-NEXT: v_add_u16_e32 v0, s0, v0 +; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: saddsat_i16_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sext_i32_i16 s1, s0 -; GFX10-NEXT: s_sext_i32_i16 s2, 0 +; GFX10-NEXT: v_add_nc_i16 v0, s0, v0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cmp_gt_i32 s1, s2 -; GFX10-NEXT: s_cselect_b32 s3, s1, s2 -; GFX10-NEXT: s_sub_i32 s3, 
0x7fff, s3 -; GFX10-NEXT: s_cmp_lt_i32 s1, s2 -; GFX10-NEXT: s_cselect_b32 s1, s1, s2 -; GFX10-NEXT: s_sub_i32 s1, 0xffff8000, s1 -; GFX10-NEXT: v_max_i16_e64 v0, s1, v0 -; GFX10-NEXT: v_min_i16_e64 v0, v0, s3 -; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) %cast = bitcast i16 %result to half @@ -4448,25 +3001,13 @@ define amdgpu_ps half @saddsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; ; GFX9-LABEL: saddsat_i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_min_i16_e32 v2, 0, v0 -; GFX9-NEXT: v_max_i16_e32 v1, 0, v0 -; GFX9-NEXT: v_sub_u16_e32 v2, 0x8000, v2 -; GFX9-NEXT: v_sub_u16_e32 v1, 0x7fff, v1 -; GFX9-NEXT: v_max_i16_e32 v2, s0, v2 -; GFX9-NEXT: v_min_i16_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_add_i16 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: saddsat_i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_min_i16_e64 v1, v0, 0 -; GFX10-NEXT: v_max_i16_e64 v2, v0, 0 +; GFX10-NEXT: v_add_nc_i16 v0, v0, s0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u16_e64 v1, 0x8000, v1 -; GFX10-NEXT: v_sub_nc_u16_e64 v2, 0x7fff, v2 -; GFX10-NEXT: v_max_i16_e64 v1, v1, s0 -; GFX10-NEXT: v_min_i16_e64 v1, v1, v2 -; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) %cast = bitcast i16 %result to half @@ -4527,29 +3068,15 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX9-LABEL: v_saddsat_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v2, v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fff7fff -; GFX9-NEXT: v_pk_sub_i16 v2, v3, v2 -; GFX9-NEXT: v_pk_min_i16 v3, v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x80008000 -; GFX9-NEXT: v_pk_sub_i16 v3, v4, v3 -; GFX9-NEXT: v_pk_max_i16 v1, v3, v1 -; GFX9-NEXT: v_pk_min_i16 v1, v1, v2 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_i16 v2, v0, 0 -; GFX10-NEXT: v_pk_max_i16 v3, v0, 0 +; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v2, 0x80008000, v2 -; GFX10-NEXT: v_pk_sub_i16 v3, 0x7fff7fff, v3 -; GFX10-NEXT: v_pk_max_i16 v1, v2, v1 -; GFX10-NEXT: v_pk_min_i16 v1, v1, v3 -; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) ret <2 x i16> %result @@ -4642,99 +3169,16 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; ; GFX9-LABEL: s_saddsat_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sext_i32_i16 s2, s0 -; GFX9-NEXT: s_ashr_i32 s3, s0, 16 -; GFX9-NEXT: s_sext_i32_i16 s4, 0 -; GFX9-NEXT: s_cmp_gt_i32 s2, s4 -; GFX9-NEXT: s_cselect_b32 s5, s2, s4 -; GFX9-NEXT: s_cmp_gt_i32 s3, 0 -; GFX9-NEXT: s_cselect_b32 s6, s3, 0 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: s_sub_i32 s5, 0x7fff7fff, s5 -; GFX9-NEXT: s_sub_i32 s6, 0x7fff, s6 -; GFX9-NEXT: s_cmp_lt_i32 s2, s4 -; GFX9-NEXT: s_cselect_b32 s2, s2, s4 -; GFX9-NEXT: s_cmp_lt_i32 s3, 0 -; GFX9-NEXT: s_cselect_b32 s3, s3, 0 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX9-NEXT: s_lshr_b32 s3, s2, 16 -; GFX9-NEXT: s_sub_i32 s2, 
0x80008000, s2 -; GFX9-NEXT: s_sub_i32 s3, 0x8000, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX9-NEXT: s_sext_i32_i16 s3, s2 -; GFX9-NEXT: s_sext_i32_i16 s4, s1 -; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_ashr_i32 s1, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s3, s4 -; GFX9-NEXT: s_cselect_b32 s3, s3, s4 -; GFX9-NEXT: s_cmp_gt_i32 s2, s1 -; GFX9-NEXT: s_cselect_b32 s1, s2, s1 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s3, s1 -; GFX9-NEXT: s_sext_i32_i16 s2, s1 -; GFX9-NEXT: s_ashr_i32 s1, s1, 16 -; GFX9-NEXT: s_sext_i32_i16 s3, s5 -; GFX9-NEXT: s_ashr_i32 s4, s5, 16 -; GFX9-NEXT: s_cmp_lt_i32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s2, s3 -; GFX9-NEXT: s_cmp_lt_i32 s1, s4 -; GFX9-NEXT: s_cselect_b32 s1, s1, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s1 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_saddsat_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sext_i32_i16 s2, s0 -; GFX10-NEXT: s_sext_i32_i16 s3, 0 -; GFX10-NEXT: s_ashr_i32 s4, s0, 16 -; GFX10-NEXT: s_cmp_gt_i32 s2, s3 +; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s5, s2, s3 -; GFX10-NEXT: s_cmp_gt_i32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s6, s4, 0 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-NEXT: s_lshr_b32 s6, s5, 16 -; GFX10-NEXT: s_sub_i32 s5, 0x7fff7fff, s5 -; GFX10-NEXT: s_sub_i32 s6, 0x7fff, s6 -; GFX10-NEXT: s_cmp_lt_i32 s2, s3 -; GFX10-NEXT: s_cselect_b32 s2, s2, s3 -; GFX10-NEXT: s_cmp_lt_i32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s3, s4, 0 -; GFX10-NEXT: s_sext_i32_i16 s4, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX10-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-NEXT: s_lshr_b32 s3, s2, 16 -; GFX10-NEXT: s_sub_i32 s2, 0x80008000, s2 -; GFX10-NEXT: s_sub_i32 s3, 0x8000, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX10-NEXT: s_sext_i32_i16 s3, s2 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_cmp_gt_i32 s3, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s4 -; GFX10-NEXT: s_cmp_gt_i32 s2, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s5, s6 -; GFX10-NEXT: s_cselect_b32 s1, s2, s1 -; GFX10-NEXT: s_sext_i32_i16 s2, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s1 -; GFX10-NEXT: s_ashr_i32 s3, s4, 16 -; GFX10-NEXT: s_sext_i32_i16 s4, s1 -; GFX10-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-NEXT: s_cmp_lt_i32 s4, s2 -; GFX10-NEXT: s_cselect_b32 s2, s4, s2 -; GFX10-NEXT: s_cmp_lt_i32 s1, s3 -; GFX10-NEXT: s_cselect_b32 s1, s1, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s2, s1 -; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: s_lshr_b32 s3, s1, 16 -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: s_add_i32 s2, s2, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) %cast = bitcast <2 x i16> %result to i32 @@ -4810,59 +3254,13 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; ; GFX9-LABEL: saddsat_v2i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sext_i32_i16 s1, s0 -; GFX9-NEXT: s_ashr_i32 s2, s0, 16 -; GFX9-NEXT: s_sext_i32_i16 s3, 0 -; GFX9-NEXT: s_cmp_gt_i32 s1, s3 -; GFX9-NEXT: s_cselect_b32 s4, s1, s3 -; GFX9-NEXT: 
s_cmp_gt_i32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s5, s2, 0 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_sub_i32 s4, 0x7fff7fff, s4 -; GFX9-NEXT: s_sub_i32 s5, 0x7fff, s5 -; GFX9-NEXT: s_cmp_lt_i32 s1, s3 -; GFX9-NEXT: s_cselect_b32 s1, s1, s3 -; GFX9-NEXT: s_cmp_lt_i32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s2, s2, 0 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 -; GFX9-NEXT: s_lshr_b32 s2, s1, 16 -; GFX9-NEXT: s_sub_i32 s1, 0x80008000, s1 -; GFX9-NEXT: s_sub_i32 s2, 0x8000, s2 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: v_pk_max_i16 v0, s1, v0 -; GFX9-NEXT: v_pk_min_i16 v0, v0, s4 -; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 +; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: saddsat_v2i16_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sext_i32_i16 s1, s0 -; GFX10-NEXT: s_sext_i32_i16 s2, 0 -; GFX10-NEXT: s_ashr_i32 s3, s0, 16 -; GFX10-NEXT: s_cmp_gt_i32 s1, s2 +; GFX10-NEXT: v_pk_add_i16 v0, s0, v0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s4, s1, s2 -; GFX10-NEXT: s_cmp_gt_i32 s3, 0 -; GFX10-NEXT: s_cselect_b32 s5, s3, 0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX10-NEXT: s_lshr_b32 s5, s4, 16 -; GFX10-NEXT: s_sub_i32 s4, 0x7fff7fff, s4 -; GFX10-NEXT: s_sub_i32 s5, 0x7fff, s5 -; GFX10-NEXT: s_cmp_lt_i32 s1, s2 -; GFX10-NEXT: s_cselect_b32 s1, s1, s2 -; GFX10-NEXT: s_cmp_lt_i32 s3, 0 -; GFX10-NEXT: s_cselect_b32 s2, s3, 0 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 -; GFX10-NEXT: s_lshr_b32 s2, s1, 16 -; GFX10-NEXT: s_sub_i32 s1, 0x80008000, s1 -; GFX10-NEXT: s_sub_i32 s2, 0x8000, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 -; GFX10-NEXT: v_pk_max_i16 v0, s1, v0 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s4, s5 -; GFX10-NEXT: v_pk_min_i16 v0, v0, s1 -; GFX10-NEXT: v_pk_add_u16 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) %cast = bitcast <2 x i16> %result to float @@ -4926,27 +3324,13 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX9-LABEL: saddsat_v2i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_pk_max_i16 v1, v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fff7fff -; GFX9-NEXT: v_pk_sub_i16 v1, v2, v1 -; GFX9-NEXT: v_pk_min_i16 v2, v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x80008000 -; GFX9-NEXT: v_pk_sub_i16 v2, v3, v2 -; GFX9-NEXT: v_pk_max_i16 v2, v2, s0 -; GFX9-NEXT: v_pk_min_i16 v1, v2, v1 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_add_i16 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: saddsat_v2i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_pk_min_i16 v1, v0, 0 -; GFX10-NEXT: v_pk_max_i16 v2, v0, 0 +; GFX10-NEXT: v_pk_add_i16 v0, v0, s0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v1, 0x80008000, v1 -; GFX10-NEXT: v_pk_sub_i16 v2, 0x7fff7fff, v2 -; GFX10-NEXT: v_pk_max_i16 v1, v1, s0 -; GFX10-NEXT: v_pk_min_i16 v1, v1, v2 -; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) %cast = bitcast <2 x i16> %result to float @@ -5065,43 +3449,17 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX9-LABEL: v_saddsat_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_i16 v6, v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0x80008000 -; GFX9-NEXT: v_pk_sub_i16 v6, v7, v6 -; 
GFX9-NEXT: v_pk_max_i16 v4, v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fff7fff -; GFX9-NEXT: v_pk_sub_i16 v4, v5, v4 -; GFX9-NEXT: v_pk_max_i16 v2, v6, v2 -; GFX9-NEXT: v_pk_min_i16 v2, v2, v4 -; GFX9-NEXT: v_pk_min_i16 v4, v1, 0 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 -; GFX9-NEXT: v_pk_max_i16 v2, v1, 0 -; GFX9-NEXT: v_pk_sub_i16 v4, v7, v4 -; GFX9-NEXT: v_pk_sub_i16 v2, v5, v2 -; GFX9-NEXT: v_pk_max_i16 v3, v4, v3 -; GFX9-NEXT: v_pk_min_i16 v2, v3, v2 -; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 +; GFX9-NEXT: v_pk_add_i16 v0, v0, v2 clamp +; GFX9-NEXT: v_pk_add_i16 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_i16 v4, v0, 0 -; GFX10-NEXT: v_pk_min_i16 v5, v1, 0 -; GFX10-NEXT: v_pk_max_i16 v6, v0, 0 -; GFX10-NEXT: v_pk_max_i16 v7, v1, 0 +; GFX10-NEXT: v_pk_add_i16 v0, v0, v2 clamp +; GFX10-NEXT: v_pk_add_i16 v1, v1, v3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v4, 0x80008000, v4 -; GFX10-NEXT: v_pk_sub_i16 v5, 0x80008000, v5 -; GFX10-NEXT: v_pk_sub_i16 v6, 0x7fff7fff, v6 -; GFX10-NEXT: v_pk_sub_i16 v7, 0x7fff7fff, v7 -; GFX10-NEXT: v_pk_max_i16 v11, v4, v2 -; GFX10-NEXT: v_pk_max_i16 v10, v5, v3 -; GFX10-NEXT: v_pk_min_i16 v2, v11, v6 -; GFX10-NEXT: v_pk_min_i16 v3, v10, v7 -; GFX10-NEXT: v_pk_add_u16 v0, v0, v2 -; GFX10-NEXT: v_pk_add_u16 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %cast = bitcast <4 x i16> %result to <2 x float> @@ -5265,193 +3623,21 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; ; GFX9-LABEL: s_saddsat_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sext_i32_i16 s6, s0 -; GFX9-NEXT: s_ashr_i32 s7, s0, 16 -; GFX9-NEXT: s_sext_i32_i16 s8, 0 -; GFX9-NEXT: s_cmp_gt_i32 s6, s8 -; GFX9-NEXT: s_cselect_b32 s9, s6, s8 -; GFX9-NEXT: s_cmp_gt_i32 s7, 0 -; GFX9-NEXT: s_cselect_b32 s10, s7, 0 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff -; GFX9-NEXT: s_lshr_b32 s11, s9, 16 -; GFX9-NEXT: s_movk_i32 s10, 0x7fff -; GFX9-NEXT: s_sub_i32 s9, s4, s9 -; GFX9-NEXT: s_sub_i32 s11, s10, s11 -; GFX9-NEXT: s_cmp_lt_i32 s6, s8 -; GFX9-NEXT: s_cselect_b32 s6, s6, s8 -; GFX9-NEXT: s_cmp_lt_i32 s7, 0 -; GFX9-NEXT: s_cselect_b32 s7, s7, 0 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s11 -; GFX9-NEXT: s_mov_b32 s5, 0x80008000 -; GFX9-NEXT: s_lshr_b32 s11, s6, 16 -; GFX9-NEXT: s_mov_b32 s7, 0x8000 -; GFX9-NEXT: s_sub_i32 s6, s5, s6 -; GFX9-NEXT: s_sub_i32 s11, s7, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s11 -; GFX9-NEXT: s_sext_i32_i16 s11, s6 -; GFX9-NEXT: s_sext_i32_i16 s12, s2 -; GFX9-NEXT: s_ashr_i32 s6, s6, 16 -; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_cmp_gt_i32 s11, s12 -; GFX9-NEXT: s_cselect_b32 s11, s11, s12 -; GFX9-NEXT: s_cmp_gt_i32 s6, s2 -; GFX9-NEXT: s_cselect_b32 s2, s6, s2 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-NEXT: s_sext_i32_i16 s6, s2 -; GFX9-NEXT: s_sext_i32_i16 s11, s9 -; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_ashr_i32 s9, s9, 16 -; GFX9-NEXT: s_cmp_lt_i32 s6, s11 -; GFX9-NEXT: s_cselect_b32 s6, s6, s11 -; GFX9-NEXT: s_cmp_lt_i32 s2, s9 -; GFX9-NEXT: s_cselect_b32 s2, s2, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s6, s2 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 -; GFX9-NEXT: s_lshr_b32 s9, s2, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s2 -; GFX9-NEXT: s_add_i32 s6, s6, s9 -; 
GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_sext_i32_i16 s2, s1 -; GFX9-NEXT: s_ashr_i32 s6, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s2, s8 -; GFX9-NEXT: s_cselect_b32 s9, s2, s8 -; GFX9-NEXT: s_cmp_gt_i32 s6, 0 -; GFX9-NEXT: s_cselect_b32 s11, s6, 0 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s11 -; GFX9-NEXT: s_lshr_b32 s11, s9, 16 -; GFX9-NEXT: s_sub_i32 s4, s4, s9 -; GFX9-NEXT: s_sub_i32 s9, s10, s11 -; GFX9-NEXT: s_cmp_lt_i32 s2, s8 -; GFX9-NEXT: s_cselect_b32 s2, s2, s8 -; GFX9-NEXT: s_cmp_lt_i32 s6, 0 -; GFX9-NEXT: s_cselect_b32 s6, s6, 0 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX9-NEXT: s_lshr_b32 s6, s2, 16 -; GFX9-NEXT: s_sub_i32 s2, s5, s2 -; GFX9-NEXT: s_sub_i32 s5, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5 -; GFX9-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_cmp_gt_i32 s5, s6 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_cmp_gt_i32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s2 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s9 -; GFX9-NEXT: s_sext_i32_i16 s3, s2 -; GFX9-NEXT: s_sext_i32_i16 s5, s4 -; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_lt_i32 s3, s5 -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_cmp_lt_i32 s2, s4 -; GFX9-NEXT: s_cselect_b32 s2, s2, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_add_i32 s1, s1, s2 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_add_i16 v1, s1, v1 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_saddsat_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sext_i32_i16 s4, s0 -; GFX10-NEXT: s_sext_i32_i16 s5, 0 -; GFX10-NEXT: s_ashr_i32 s6, s0, 16 -; GFX10-NEXT: s_cmp_gt_i32 s4, s5 -; GFX10-NEXT: s_mov_b32 s9, 0x7fff7fff -; GFX10-NEXT: s_cselect_b32 s7, s4, s5 -; GFX10-NEXT: s_cmp_gt_i32 s6, 0 -; GFX10-NEXT: s_mov_b32 s11, 0x80008000 -; GFX10-NEXT: s_cselect_b32 s8, s6, 0 -; GFX10-NEXT: s_sext_i32_i16 s13, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX10-NEXT: s_movk_i32 s8, 0x7fff -; GFX10-NEXT: s_lshr_b32 s10, s7, 16 -; GFX10-NEXT: s_sub_i32 s7, s9, s7 -; GFX10-NEXT: s_sub_i32 s10, s8, s10 -; GFX10-NEXT: s_cmp_lt_i32 s4, s5 +; GFX10-NEXT: v_pk_add_i16 v0, s0, s2 clamp +; GFX10-NEXT: v_pk_add_i16 v1, s1, s3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s4, s4, s5 -; GFX10-NEXT: s_cmp_lt_i32 s6, 0 -; GFX10-NEXT: s_cselect_b32 s6, s6, 0 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s6 -; GFX10-NEXT: s_mov_b32 s6, 0x8000 -; GFX10-NEXT: s_lshr_b32 s12, s4, 16 -; GFX10-NEXT: s_sub_i32 s4, s11, s4 -; GFX10-NEXT: s_sub_i32 s12, s6, s12 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s12 -; GFX10-NEXT: s_sext_i32_i16 s12, s4 -; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_cmp_gt_i32 s12, s13 -; GFX10-NEXT: s_cselect_b32 s12, s12, s13 -; GFX10-NEXT: s_cmp_gt_i32 s4, s2 -; GFX10-NEXT: s_cselect_b32 s2, s4, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s7, s10 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s12, s2 -; GFX10-NEXT: s_sext_i32_i16 s10, s4 -; GFX10-NEXT: s_sext_i32_i16 s7, s2 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_ashr_i32 
s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_i32 s7, s10 -; GFX10-NEXT: s_cselect_b32 s7, s7, s10 -; GFX10-NEXT: s_cmp_lt_i32 s2, s4 -; GFX10-NEXT: s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_sext_i32_i16 s4, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s7, s2 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 -; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_add_i32 s0, s0, s2 -; GFX10-NEXT: s_add_i32 s7, s7, s10 -; GFX10-NEXT: s_ashr_i32 s2, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s4, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX10-NEXT: s_cselect_b32 s10, s4, s5 -; GFX10-NEXT: s_cmp_gt_i32 s2, 0 -; GFX10-NEXT: s_cselect_b32 s12, s2, 0 -; GFX10-NEXT: s_pack_ll_b32_b16 s10, s10, s12 -; GFX10-NEXT: s_lshr_b32 s12, s10, 16 -; GFX10-NEXT: s_sub_i32 s9, s9, s10 -; GFX10-NEXT: s_sub_i32 s8, s8, s12 -; GFX10-NEXT: s_cmp_lt_i32 s4, s5 -; GFX10-NEXT: s_cselect_b32 s4, s4, s5 -; GFX10-NEXT: s_cmp_lt_i32 s2, 0 -; GFX10-NEXT: s_sext_i32_i16 s5, s3 -; GFX10-NEXT: s_cselect_b32 s2, s2, 0 -; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s4, s2 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_sub_i32 s2, s11, s2 -; GFX10-NEXT: s_sub_i32 s4, s6, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX10-NEXT: s_sext_i32_i16 s4, s2 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_cmp_gt_i32 s4, s5 -; GFX10-NEXT: s_cselect_b32 s4, s4, s5 -; GFX10-NEXT: s_cmp_gt_i32 s2, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s9, s8 -; GFX10-NEXT: s_cselect_b32 s2, s2, s3 -; GFX10-NEXT: s_sext_i32_i16 s3, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s4, s2 -; GFX10-NEXT: s_ashr_i32 s4, s5, 16 -; GFX10-NEXT: s_sext_i32_i16 s5, s2 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_cmp_lt_i32 s5, s3 -; GFX10-NEXT: s_cselect_b32 s3, s5, s3 -; GFX10-NEXT: s_cmp_lt_i32 s2, s4 -; GFX10-NEXT: s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s2 -; GFX10-NEXT: s_lshr_b32 s3, s1, 16 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_add_i32 s1, s1, s2 -; GFX10-NEXT: s_add_i32 s3, s3, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %cast = bitcast <4 x i16> %result to <2 x i32> @@ -5612,57 +3798,19 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX9-LABEL: v_saddsat_v6i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_i16 v8, v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0x80008000 -; GFX9-NEXT: v_pk_sub_i16 v8, v9, v8 -; GFX9-NEXT: v_pk_max_i16 v6, v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fff7fff -; GFX9-NEXT: v_pk_sub_i16 v6, v7, v6 -; GFX9-NEXT: v_pk_max_i16 v3, v8, v3 -; GFX9-NEXT: v_pk_min_i16 v3, v3, v6 -; GFX9-NEXT: v_pk_min_i16 v6, v1, 0 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v3 -; GFX9-NEXT: v_pk_max_i16 v3, v1, 0 -; GFX9-NEXT: v_pk_sub_i16 v6, v9, v6 -; GFX9-NEXT: v_pk_sub_i16 v3, v7, v3 -; GFX9-NEXT: v_pk_max_i16 v4, v6, v4 -; GFX9-NEXT: v_pk_min_i16 v3, v4, v3 -; GFX9-NEXT: v_pk_min_i16 v4, v2, 0 -; GFX9-NEXT: v_pk_sub_i16 v4, v9, v4 -; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 -; GFX9-NEXT: v_pk_max_i16 v3, v2, 0 -; GFX9-NEXT: v_pk_sub_i16 v3, v7, v3 -; GFX9-NEXT: v_pk_max_i16 v4, v4, v5 -; GFX9-NEXT: v_pk_min_i16 v3, v4, v3 -; GFX9-NEXT: v_pk_add_u16 v2, v2, v3 +; GFX9-NEXT: v_pk_add_i16 v0, v0, v3 clamp +; GFX9-NEXT: v_pk_add_i16 v1, v1, v4 clamp +; GFX9-NEXT: v_pk_add_i16 v2, v2, v5 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: v_saddsat_v6i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_i16 v7, v0, 0 -; GFX10-NEXT: v_pk_min_i16 v8, v1, 0 -; GFX10-NEXT: v_pk_min_i16 v9, v2, 0 -; GFX10-NEXT: v_pk_max_i16 v6, v0, 0 -; GFX10-NEXT: v_pk_max_i16 v10, v1, 0 -; GFX10-NEXT: v_pk_sub_i16 v14, 0x80008000, v7 -; GFX10-NEXT: v_pk_sub_i16 v15, 0x80008000, v8 -; GFX10-NEXT: v_pk_max_i16 v11, v2, 0 -; GFX10-NEXT: v_pk_sub_i16 v19, 0x80008000, v9 -; GFX10-NEXT: v_pk_sub_i16 v6, 0x7fff7fff, v6 -; GFX10-NEXT: v_pk_max_i16 v3, v14, v3 -; GFX10-NEXT: v_pk_sub_i16 v7, 0x7fff7fff, v10 -; GFX10-NEXT: v_pk_max_i16 v4, v15, v4 -; GFX10-NEXT: v_pk_sub_i16 v8, 0x7fff7fff, v11 -; GFX10-NEXT: v_pk_max_i16 v5, v19, v5 -; GFX10-NEXT: v_pk_min_i16 v3, v3, v6 +; GFX10-NEXT: v_pk_add_i16 v0, v0, v3 clamp +; GFX10-NEXT: v_pk_add_i16 v1, v1, v4 clamp +; GFX10-NEXT: v_pk_add_i16 v2, v2, v5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_min_i16 v4, v4, v7 -; GFX10-NEXT: v_pk_min_i16 v5, v5, v8 -; GFX10-NEXT: v_pk_add_u16 v0, v0, v3 -; GFX10-NEXT: v_pk_add_u16 v1, v1, v4 -; GFX10-NEXT: v_pk_add_u16 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) %cast = bitcast <6 x i16> %result to <3 x float> @@ -5896,279 +4044,26 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; ; GFX9-LABEL: s_saddsat_v6i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sext_i32_i16 s8, s0 -; GFX9-NEXT: s_ashr_i32 s9, s0, 16 -; GFX9-NEXT: s_sext_i32_i16 s10, 0 -; GFX9-NEXT: s_cmp_gt_i32 s8, s10 -; GFX9-NEXT: s_cselect_b32 s11, s8, s10 -; GFX9-NEXT: s_cmp_gt_i32 s9, 0 -; GFX9-NEXT: s_cselect_b32 s12, s9, 0 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 -; GFX9-NEXT: s_mov_b32 s6, 0x7fff7fff -; GFX9-NEXT: s_lshr_b32 s13, s11, 16 -; GFX9-NEXT: s_movk_i32 s12, 0x7fff -; GFX9-NEXT: s_sub_i32 s11, s6, s11 -; GFX9-NEXT: s_sub_i32 s13, s12, s13 -; GFX9-NEXT: s_cmp_lt_i32 s8, s10 -; GFX9-NEXT: s_cselect_b32 s8, s8, s10 -; GFX9-NEXT: s_cmp_lt_i32 s9, 0 -; GFX9-NEXT: s_cselect_b32 s9, s9, 0 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13 -; GFX9-NEXT: s_mov_b32 s7, 0x80008000 -; GFX9-NEXT: s_lshr_b32 s13, s8, 16 -; GFX9-NEXT: s_mov_b32 s9, 0x8000 -; GFX9-NEXT: s_sub_i32 s8, s7, s8 -; GFX9-NEXT: s_sub_i32 s13, s9, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s13 -; GFX9-NEXT: s_sext_i32_i16 s13, s8 -; GFX9-NEXT: s_sext_i32_i16 s14, s3 -; GFX9-NEXT: s_ashr_i32 s8, s8, 16 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_cmp_gt_i32 s13, s14 -; GFX9-NEXT: s_cselect_b32 s13, s13, s14 -; GFX9-NEXT: s_cmp_gt_i32 s8, s3 -; GFX9-NEXT: s_cselect_b32 s3, s8, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s13, s3 -; GFX9-NEXT: s_sext_i32_i16 s8, s3 -; GFX9-NEXT: s_sext_i32_i16 s13, s11 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_ashr_i32 s11, s11, 16 -; GFX9-NEXT: s_cmp_lt_i32 s8, s13 -; GFX9-NEXT: s_cselect_b32 s8, s8, s13 -; GFX9-NEXT: s_cmp_lt_i32 s3, s11 -; GFX9-NEXT: s_cselect_b32 s3, s3, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s8, s3 -; GFX9-NEXT: s_lshr_b32 s8, s0, 16 -; GFX9-NEXT: s_lshr_b32 s11, s3, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s3 -; GFX9-NEXT: s_add_i32 s8, s8, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s8 -; GFX9-NEXT: s_sext_i32_i16 s3, s1 -; GFX9-NEXT: s_ashr_i32 s8, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s3, s10 -; GFX9-NEXT: s_cselect_b32 s11, s3, s10 -; GFX9-NEXT: s_cmp_gt_i32 s8, 0 -; GFX9-NEXT: s_cselect_b32 s13, s8, 0 -; 
GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13 -; GFX9-NEXT: s_lshr_b32 s13, s11, 16 -; GFX9-NEXT: s_sub_i32 s11, s6, s11 -; GFX9-NEXT: s_sub_i32 s13, s12, s13 -; GFX9-NEXT: s_cmp_lt_i32 s3, s10 -; GFX9-NEXT: s_cselect_b32 s3, s3, s10 -; GFX9-NEXT: s_cmp_lt_i32 s8, 0 -; GFX9-NEXT: s_cselect_b32 s8, s8, 0 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s8 -; GFX9-NEXT: s_lshr_b32 s8, s3, 16 -; GFX9-NEXT: s_sub_i32 s3, s7, s3 -; GFX9-NEXT: s_sub_i32 s8, s9, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13 -; GFX9-NEXT: s_sext_i32_i16 s8, s3 -; GFX9-NEXT: s_sext_i32_i16 s13, s4 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_gt_i32 s8, s13 -; GFX9-NEXT: s_cselect_b32 s8, s8, s13 -; GFX9-NEXT: s_cmp_gt_i32 s3, s4 -; GFX9-NEXT: s_cselect_b32 s3, s3, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s8, s3 -; GFX9-NEXT: s_sext_i32_i16 s4, s3 -; GFX9-NEXT: s_sext_i32_i16 s8, s11 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_ashr_i32 s11, s11, 16 -; GFX9-NEXT: s_cmp_lt_i32 s4, s8 -; GFX9-NEXT: s_cselect_b32 s4, s4, s8 -; GFX9-NEXT: s_cmp_lt_i32 s3, s11 -; GFX9-NEXT: s_cselect_b32 s3, s3, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3 -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_lshr_b32 s8, s3, 16 -; GFX9-NEXT: s_add_i32 s1, s1, s3 -; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: s_sext_i32_i16 s3, s2 -; GFX9-NEXT: s_ashr_i32 s4, s2, 16 -; GFX9-NEXT: s_cmp_gt_i32 s3, s10 -; GFX9-NEXT: s_cselect_b32 s8, s3, s10 -; GFX9-NEXT: s_cmp_gt_i32 s4, 0 -; GFX9-NEXT: s_cselect_b32 s11, s4, 0 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s11 -; GFX9-NEXT: s_lshr_b32 s11, s8, 16 -; GFX9-NEXT: s_sub_i32 s6, s6, s8 -; GFX9-NEXT: s_sub_i32 s8, s12, s11 -; GFX9-NEXT: s_cmp_lt_i32 s3, s10 -; GFX9-NEXT: s_cselect_b32 s3, s3, s10 -; GFX9-NEXT: s_cmp_lt_i32 s4, 0 -; GFX9-NEXT: s_cselect_b32 s4, s4, 0 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_sub_i32 s3, s7, s3 -; GFX9-NEXT: s_sub_i32 s4, s9, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-NEXT: s_sext_i32_i16 s4, s3 -; GFX9-NEXT: s_sext_i32_i16 s7, s5 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_ashr_i32 s5, s5, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s4, s4, s7 -; GFX9-NEXT: s_cmp_gt_i32 s3, s5 -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s8 -; GFX9-NEXT: s_sext_i32_i16 s4, s3 -; GFX9-NEXT: s_sext_i32_i16 s5, s6 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_ashr_i32 s6, s6, 16 -; GFX9-NEXT: s_cmp_lt_i32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_cmp_lt_i32 s3, s6 -; GFX9-NEXT: s_cselect_b32 s3, s3, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_add_i32 s4, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_add_i16 v1, s1, v1 clamp +; GFX9-NEXT: v_pk_add_i16 v2, s2, v2 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_saddsat_v6i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sext_i32_i16 s6, s0 -; GFX10-NEXT: s_sext_i32_i16 s7, 0 -; GFX10-NEXT: 
s_ashr_i32 s8, s0, 16 -; GFX10-NEXT: s_cmp_gt_i32 s6, s7 -; GFX10-NEXT: s_mov_b32 s11, 0x7fff7fff -; GFX10-NEXT: s_cselect_b32 s9, s6, s7 -; GFX10-NEXT: s_cmp_gt_i32 s8, 0 -; GFX10-NEXT: s_mov_b32 s13, 0x80008000 -; GFX10-NEXT: s_cselect_b32 s10, s8, 0 -; GFX10-NEXT: s_sext_i32_i16 s15, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX10-NEXT: s_movk_i32 s10, 0x7fff -; GFX10-NEXT: s_lshr_b32 s12, s9, 16 -; GFX10-NEXT: s_sub_i32 s9, s11, s9 -; GFX10-NEXT: s_sub_i32 s12, s10, s12 -; GFX10-NEXT: s_cmp_lt_i32 s6, s7 +; GFX10-NEXT: v_pk_add_i16 v0, s0, s3 clamp +; GFX10-NEXT: v_pk_add_i16 v1, s1, s4 clamp +; GFX10-NEXT: v_pk_add_i16 v2, s2, s5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s6, s6, s7 -; GFX10-NEXT: s_cmp_lt_i32 s8, 0 -; GFX10-NEXT: s_cselect_b32 s8, s8, 0 -; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s8 -; GFX10-NEXT: s_mov_b32 s8, 0x8000 -; GFX10-NEXT: s_lshr_b32 s14, s6, 16 -; GFX10-NEXT: s_sub_i32 s6, s13, s6 -; GFX10-NEXT: s_sub_i32 s14, s8, s14 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s14 -; GFX10-NEXT: s_sext_i32_i16 s14, s6 -; GFX10-NEXT: s_ashr_i32 s6, s6, 16 -; GFX10-NEXT: s_cmp_gt_i32 s14, s15 -; GFX10-NEXT: s_cselect_b32 s14, s14, s15 -; GFX10-NEXT: s_cmp_gt_i32 s6, s3 -; GFX10-NEXT: s_sext_i32_i16 s15, s4 -; GFX10-NEXT: s_cselect_b32 s3, s6, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s9, s12 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s14, s3 -; GFX10-NEXT: s_sext_i32_i16 s12, s6 -; GFX10-NEXT: s_sext_i32_i16 s9, s3 -; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_ashr_i32 s6, s6, 16 -; GFX10-NEXT: s_cmp_lt_i32 s9, s12 -; GFX10-NEXT: s_cselect_b32 s9, s9, s12 -; GFX10-NEXT: s_cmp_lt_i32 s3, s6 -; GFX10-NEXT: s_cselect_b32 s3, s3, s6 -; GFX10-NEXT: s_lshr_b32 s6, s0, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s9, s3 -; GFX10-NEXT: s_lshr_b32 s9, s3, 16 -; GFX10-NEXT: s_add_i32 s0, s0, s3 -; GFX10-NEXT: s_sext_i32_i16 s3, s1 -; GFX10-NEXT: s_add_i32 s6, s6, s9 -; GFX10-NEXT: s_ashr_i32 s9, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s3, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX10-NEXT: s_cselect_b32 s12, s3, s7 -; GFX10-NEXT: s_cmp_gt_i32 s9, 0 -; GFX10-NEXT: s_cselect_b32 s14, s9, 0 -; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s14 -; GFX10-NEXT: s_lshr_b32 s14, s12, 16 -; GFX10-NEXT: s_sub_i32 s12, s11, s12 -; GFX10-NEXT: s_sub_i32 s14, s10, s14 -; GFX10-NEXT: s_cmp_lt_i32 s3, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s14 -; GFX10-NEXT: s_cselect_b32 s3, s3, s7 -; GFX10-NEXT: s_cmp_lt_i32 s9, 0 -; GFX10-NEXT: s_cselect_b32 s9, s9, 0 -; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s9 -; GFX10-NEXT: s_lshr_b32 s9, s3, 16 -; GFX10-NEXT: s_sub_i32 s3, s13, s3 -; GFX10-NEXT: s_sub_i32 s9, s8, s9 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s9 -; GFX10-NEXT: s_sext_i32_i16 s9, s3 -; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_cmp_gt_i32 s9, s15 -; GFX10-NEXT: s_cselect_b32 s9, s9, s15 -; GFX10-NEXT: s_cmp_gt_i32 s3, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s4 -; GFX10-NEXT: s_sext_i32_i16 s4, s12 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s9, s3 -; GFX10-NEXT: s_ashr_i32 s9, s12, 16 -; GFX10-NEXT: s_sext_i32_i16 s12, s3 -; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_cmp_lt_i32 s12, s4 -; GFX10-NEXT: s_cselect_b32 s4, s12, s4 -; GFX10-NEXT: s_cmp_lt_i32 s3, s9 -; GFX10-NEXT: s_sext_i32_i16 s12, s2 -; GFX10-NEXT: s_cselect_b32 s3, s3, s9 -; GFX10-NEXT: s_lshr_b32 s9, s1, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s4, s3 -; GFX10-NEXT: s_ashr_i32 s4, s2, 16 -; GFX10-NEXT: s_lshr_b32 s14, 
s3, 16 -; GFX10-NEXT: s_add_i32 s1, s1, s3 -; GFX10-NEXT: s_add_i32 s9, s9, s14 -; GFX10-NEXT: s_cmp_gt_i32 s12, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s9 -; GFX10-NEXT: s_cselect_b32 s3, s12, s7 -; GFX10-NEXT: s_cmp_gt_i32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s14, s4, 0 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s14 -; GFX10-NEXT: s_lshr_b32 s14, s3, 16 -; GFX10-NEXT: s_sub_i32 s3, s11, s3 -; GFX10-NEXT: s_sub_i32 s10, s10, s14 -; GFX10-NEXT: s_cmp_lt_i32 s12, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s10 -; GFX10-NEXT: s_cselect_b32 s7, s12, s7 -; GFX10-NEXT: s_cmp_lt_i32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s4, s4, 0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s7, s4 -; GFX10-NEXT: s_lshr_b32 s7, s4, 16 -; GFX10-NEXT: s_sub_i32 s4, s13, s4 -; GFX10-NEXT: s_sub_i32 s7, s8, s7 -; GFX10-NEXT: s_sext_i32_i16 s8, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s7 -; GFX10-NEXT: s_ashr_i32 s5, s5, 16 -; GFX10-NEXT: s_sext_i32_i16 s7, s4 -; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_cmp_gt_i32 s7, s8 -; GFX10-NEXT: s_cselect_b32 s7, s7, s8 -; GFX10-NEXT: s_cmp_gt_i32 s4, s5 -; GFX10-NEXT: s_cselect_b32 s4, s4, s5 -; GFX10-NEXT: s_sext_i32_i16 s5, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s7, s4 -; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_sext_i32_i16 s7, s4 -; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_i32 s7, s5 -; GFX10-NEXT: s_cselect_b32 s5, s7, s5 -; GFX10-NEXT: s_cmp_lt_i32 s4, s3 -; GFX10-NEXT: s_cselect_b32 s3, s4, s3 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s5, s3 -; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: s_add_i32 s2, s2, s3 -; GFX10-NEXT: s_add_i32 s4, s4, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: ; return to shader part epilog %result = call <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) %cast = bitcast <6 x i16> %result to <3 x i32> @@ -6358,71 +4253,21 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX9-LABEL: v_saddsat_v8i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_i16 v10, v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v11, 0x80008000 -; GFX9-NEXT: v_pk_sub_i16 v10, v11, v10 -; GFX9-NEXT: v_pk_max_i16 v8, v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fff7fff -; GFX9-NEXT: v_pk_sub_i16 v8, v9, v8 -; GFX9-NEXT: v_pk_max_i16 v4, v10, v4 -; GFX9-NEXT: v_pk_min_i16 v4, v4, v8 -; GFX9-NEXT: v_pk_min_i16 v8, v1, 0 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v4 -; GFX9-NEXT: v_pk_max_i16 v4, v1, 0 -; GFX9-NEXT: v_pk_sub_i16 v8, v11, v8 -; GFX9-NEXT: v_pk_sub_i16 v4, v9, v4 -; GFX9-NEXT: v_pk_max_i16 v5, v8, v5 -; GFX9-NEXT: v_pk_min_i16 v4, v5, v4 -; GFX9-NEXT: v_pk_min_i16 v5, v2, 0 -; GFX9-NEXT: v_pk_sub_i16 v5, v11, v5 -; GFX9-NEXT: v_pk_add_u16 v1, v1, v4 -; GFX9-NEXT: v_pk_max_i16 v4, v2, 0 -; GFX9-NEXT: v_pk_sub_i16 v4, v9, v4 -; GFX9-NEXT: v_pk_max_i16 v5, v5, v6 -; GFX9-NEXT: v_pk_min_i16 v4, v5, v4 -; GFX9-NEXT: v_pk_min_i16 v5, v3, 0 -; GFX9-NEXT: v_pk_sub_i16 v5, v11, v5 -; GFX9-NEXT: v_pk_add_u16 v2, v2, v4 -; GFX9-NEXT: v_pk_max_i16 v4, v3, 0 -; GFX9-NEXT: v_pk_sub_i16 v4, v9, v4 -; GFX9-NEXT: v_pk_max_i16 v5, v5, v7 -; GFX9-NEXT: v_pk_min_i16 v4, v5, v4 -; GFX9-NEXT: v_pk_add_u16 v3, v3, v4 +; GFX9-NEXT: v_pk_add_i16 v0, v0, v4 clamp +; GFX9-NEXT: v_pk_add_i16 v1, v1, v5 clamp +; GFX9-NEXT: v_pk_add_i16 v2, v2, v6 clamp +; GFX9-NEXT: v_pk_add_i16 v3, v3, v7 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; 
; GFX10-LABEL: v_saddsat_v8i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_i16 v8, v0, 0 -; GFX10-NEXT: v_pk_min_i16 v11, v1, 0 -; GFX10-NEXT: v_pk_min_i16 v12, v3, 0 -; GFX10-NEXT: v_pk_max_i16 v9, v0, 0 -; GFX10-NEXT: v_pk_max_i16 v10, v1, 0 -; GFX10-NEXT: v_pk_sub_i16 v15, 0x80008000, v8 -; GFX10-NEXT: v_pk_min_i16 v8, v2, 0 -; GFX10-NEXT: v_pk_sub_i16 v11, 0x80008000, v11 -; GFX10-NEXT: v_pk_sub_i16 v12, 0x80008000, v12 -; GFX10-NEXT: v_pk_max_i16 v13, v2, 0 -; GFX10-NEXT: v_pk_max_i16 v14, v3, 0 -; GFX10-NEXT: v_pk_sub_i16 v8, 0x80008000, v8 -; GFX10-NEXT: v_pk_max_i16 v5, v11, v5 -; GFX10-NEXT: v_pk_sub_i16 v10, 0x7fff7fff, v10 -; GFX10-NEXT: v_pk_sub_i16 v9, 0x7fff7fff, v9 -; GFX10-NEXT: v_pk_max_i16 v4, v15, v4 -; GFX10-NEXT: v_pk_max_i16 v6, v8, v6 -; GFX10-NEXT: v_pk_sub_i16 v11, 0x7fff7fff, v13 -; GFX10-NEXT: v_pk_sub_i16 v8, 0x7fff7fff, v14 -; GFX10-NEXT: v_pk_max_i16 v7, v12, v7 -; GFX10-NEXT: v_pk_min_i16 v15, v4, v9 -; GFX10-NEXT: v_pk_min_i16 v19, v5, v10 -; GFX10-NEXT: v_pk_min_i16 v11, v6, v11 +; GFX10-NEXT: v_pk_add_i16 v0, v0, v4 clamp +; GFX10-NEXT: v_pk_add_i16 v1, v1, v5 clamp +; GFX10-NEXT: v_pk_add_i16 v2, v2, v6 clamp +; GFX10-NEXT: v_pk_add_i16 v3, v3, v7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_min_i16 v6, v7, v8 -; GFX10-NEXT: v_pk_add_u16 v0, v0, v15 -; GFX10-NEXT: v_pk_add_u16 v1, v1, v19 -; GFX10-NEXT: v_pk_add_u16 v2, v2, v11 -; GFX10-NEXT: v_pk_add_u16 v3, v3, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) %cast = bitcast <8 x i16> %result to <4 x float> @@ -6726,365 +4571,31 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; ; GFX9-LABEL: s_saddsat_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sext_i32_i16 s10, s0 -; GFX9-NEXT: s_ashr_i32 s11, s0, 16 -; GFX9-NEXT: s_sext_i32_i16 s12, 0 -; GFX9-NEXT: s_cmp_gt_i32 s10, s12 -; GFX9-NEXT: s_cselect_b32 s13, s10, s12 -; GFX9-NEXT: s_cmp_gt_i32 s11, 0 -; GFX9-NEXT: s_cselect_b32 s14, s11, 0 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14 -; GFX9-NEXT: s_mov_b32 s8, 0x7fff7fff -; GFX9-NEXT: s_lshr_b32 s15, s13, 16 -; GFX9-NEXT: s_movk_i32 s14, 0x7fff -; GFX9-NEXT: s_sub_i32 s13, s8, s13 -; GFX9-NEXT: s_sub_i32 s15, s14, s15 -; GFX9-NEXT: s_cmp_lt_i32 s10, s12 -; GFX9-NEXT: s_cselect_b32 s10, s10, s12 -; GFX9-NEXT: s_cmp_lt_i32 s11, 0 -; GFX9-NEXT: s_cselect_b32 s11, s11, 0 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s15 -; GFX9-NEXT: s_mov_b32 s9, 0x80008000 -; GFX9-NEXT: s_lshr_b32 s15, s10, 16 -; GFX9-NEXT: s_mov_b32 s11, 0x8000 -; GFX9-NEXT: s_sub_i32 s10, s9, s10 -; GFX9-NEXT: s_sub_i32 s15, s11, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s15 -; GFX9-NEXT: s_sext_i32_i16 s15, s10 -; GFX9-NEXT: s_sext_i32_i16 s16, s4 -; GFX9-NEXT: s_ashr_i32 s10, s10, 16 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_gt_i32 s15, s16 -; GFX9-NEXT: s_cselect_b32 s15, s15, s16 -; GFX9-NEXT: s_cmp_gt_i32 s10, s4 -; GFX9-NEXT: s_cselect_b32 s4, s10, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s15, s4 -; GFX9-NEXT: s_sext_i32_i16 s10, s4 -; GFX9-NEXT: s_sext_i32_i16 s15, s13 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_ashr_i32 s13, s13, 16 -; GFX9-NEXT: s_cmp_lt_i32 s10, s15 -; GFX9-NEXT: s_cselect_b32 s10, s10, s15 -; GFX9-NEXT: s_cmp_lt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s4, s4, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s10, s4 -; GFX9-NEXT: s_lshr_b32 s10, s0, 
16
-; GFX9-NEXT: s_lshr_b32 s13, s4, 16
-; GFX9-NEXT: s_add_i32 s0, s0, s4
-; GFX9-NEXT: s_add_i32 s10, s10, s13
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s10
-; GFX9-NEXT: s_sext_i32_i16 s4, s1
-; GFX9-NEXT: s_ashr_i32 s10, s1, 16
-; GFX9-NEXT: s_cmp_gt_i32 s4, s12
-; GFX9-NEXT: s_cselect_b32 s13, s4, s12
-; GFX9-NEXT: s_cmp_gt_i32 s10, 0
-; GFX9-NEXT: s_cselect_b32 s15, s10, 0
-; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s15
-; GFX9-NEXT: s_lshr_b32 s15, s13, 16
-; GFX9-NEXT: s_sub_i32 s13, s8, s13
-; GFX9-NEXT: s_sub_i32 s15, s14, s15
-; GFX9-NEXT: s_cmp_lt_i32 s4, s12
-; GFX9-NEXT: s_cselect_b32 s4, s4, s12
-; GFX9-NEXT: s_cmp_lt_i32 s10, 0
-; GFX9-NEXT: s_cselect_b32 s10, s10, 0
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s10
-; GFX9-NEXT: s_lshr_b32 s10, s4, 16
-; GFX9-NEXT: s_sub_i32 s4, s9, s4
-; GFX9-NEXT: s_sub_i32 s10, s11, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s15
-; GFX9-NEXT: s_sext_i32_i16 s10, s4
-; GFX9-NEXT: s_sext_i32_i16 s15, s5
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_ashr_i32 s5, s5, 16
-; GFX9-NEXT: s_cmp_gt_i32 s10, s15
-; GFX9-NEXT: s_cselect_b32 s10, s10, s15
-; GFX9-NEXT: s_cmp_gt_i32 s4, s5
-; GFX9-NEXT: s_cselect_b32 s4, s4, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s10, s4
-; GFX9-NEXT: s_sext_i32_i16 s5, s4
-; GFX9-NEXT: s_sext_i32_i16 s10, s13
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_ashr_i32 s13, s13, 16
-; GFX9-NEXT: s_cmp_lt_i32 s5, s10
-; GFX9-NEXT: s_cselect_b32 s5, s5, s10
-; GFX9-NEXT: s_cmp_lt_i32 s4, s13
-; GFX9-NEXT: s_cselect_b32 s4, s4, s13
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4
-; GFX9-NEXT: s_lshr_b32 s5, s1, 16
-; GFX9-NEXT: s_lshr_b32 s10, s4, 16
-; GFX9-NEXT: s_add_i32 s1, s1, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5
-; GFX9-NEXT: s_sext_i32_i16 s4, s2
-; GFX9-NEXT: s_ashr_i32 s5, s2, 16
-; GFX9-NEXT: s_cmp_gt_i32 s4, s12
-; GFX9-NEXT: s_cselect_b32 s10, s4, s12
-; GFX9-NEXT: s_cmp_gt_i32 s5, 0
-; GFX9-NEXT: s_cselect_b32 s13, s5, 0
-; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s13
-; GFX9-NEXT: s_lshr_b32 s13, s10, 16
-; GFX9-NEXT: s_sub_i32 s10, s8, s10
-; GFX9-NEXT: s_sub_i32 s13, s14, s13
-; GFX9-NEXT: s_cmp_lt_i32 s4, s12
-; GFX9-NEXT: s_cselect_b32 s4, s4, s12
-; GFX9-NEXT: s_cmp_lt_i32 s5, 0
-; GFX9-NEXT: s_cselect_b32 s5, s5, 0
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT: s_lshr_b32 s5, s4, 16
-; GFX9-NEXT: s_sub_i32 s4, s9, s4
-; GFX9-NEXT: s_sub_i32 s5, s11, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s13
-; GFX9-NEXT: s_sext_i32_i16 s5, s4
-; GFX9-NEXT: s_sext_i32_i16 s13, s6
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_ashr_i32 s6, s6, 16
-; GFX9-NEXT: s_cmp_gt_i32 s5, s13
-; GFX9-NEXT: s_cselect_b32 s5, s5, s13
-; GFX9-NEXT: s_cmp_gt_i32 s4, s6
-; GFX9-NEXT: s_cselect_b32 s4, s4, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4
-; GFX9-NEXT: s_sext_i32_i16 s5, s4
-; GFX9-NEXT: s_sext_i32_i16 s6, s10
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_ashr_i32 s10, s10, 16
-; GFX9-NEXT: s_cmp_lt_i32 s5, s6
-; GFX9-NEXT: s_cselect_b32 s5, s5, s6
-; GFX9-NEXT: s_cmp_lt_i32 s4, s10
-; GFX9-NEXT: s_cselect_b32 s4, s4, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4
-; GFX9-NEXT: s_lshr_b32 s5, s2, 16
-; GFX9-NEXT: s_lshr_b32 s6, s4, 16
-; GFX9-NEXT: s_add_i32 s2, s2, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX9-NEXT: s_sext_i32_i16 s4, s3
-; GFX9-NEXT: s_ashr_i32 s5, s3, 16
-; GFX9-NEXT: s_cmp_gt_i32 s4, s12
-; GFX9-NEXT: s_cselect_b32 s6, s4, s12
-; GFX9-NEXT: s_cmp_gt_i32 s5, 0
-; GFX9-NEXT: s_cselect_b32 s10, s5, 0
-; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s10
-; GFX9-NEXT: s_lshr_b32 s10, s6, 16
-; GFX9-NEXT: s_sub_i32 s6, s8, s6
-; GFX9-NEXT: s_sub_i32 s8, s14, s10
-; GFX9-NEXT: s_cmp_lt_i32 s4, s12
-; GFX9-NEXT: s_cselect_b32 s4, s4, s12
-; GFX9-NEXT: s_cmp_lt_i32 s5, 0
-; GFX9-NEXT: s_cselect_b32 s5, s5, 0
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT: s_lshr_b32 s5, s4, 16
-; GFX9-NEXT: s_sub_i32 s4, s9, s4
-; GFX9-NEXT: s_sub_i32 s5, s11, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s8
-; GFX9-NEXT: s_sext_i32_i16 s5, s4
-; GFX9-NEXT: s_sext_i32_i16 s8, s7
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_ashr_i32 s7, s7, 16
-; GFX9-NEXT: s_cmp_gt_i32 s5, s8
-; GFX9-NEXT: s_cselect_b32 s5, s5, s8
-; GFX9-NEXT: s_cmp_gt_i32 s4, s7
-; GFX9-NEXT: s_cselect_b32 s4, s4, s7
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4
-; GFX9-NEXT: s_sext_i32_i16 s5, s4
-; GFX9-NEXT: s_sext_i32_i16 s7, s6
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_ashr_i32 s6, s6, 16
-; GFX9-NEXT: s_cmp_lt_i32 s5, s7
-; GFX9-NEXT: s_cselect_b32 s5, s5, s7
-; GFX9-NEXT: s_cmp_lt_i32 s4, s6
-; GFX9-NEXT: s_cselect_b32 s4, s4, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4
-; GFX9-NEXT: s_lshr_b32 s5, s3, 16
-; GFX9-NEXT: s_lshr_b32 s6, s4, 16
-; GFX9-NEXT: s_add_i32 s3, s3, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
+; GFX9-NEXT: v_pk_add_i16 v1, s1, v1 clamp
+; GFX9-NEXT: v_pk_add_i16 v2, s2, v2 clamp
+; GFX9-NEXT: v_pk_add_i16 v3, s3, v3 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_saddsat_v8i16:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_sext_i32_i16 s8, s0
-; GFX10-NEXT: s_sext_i32_i16 s9, 0
-; GFX10-NEXT: s_ashr_i32 s10, s0, 16
-; GFX10-NEXT: s_cmp_gt_i32 s8, s9
-; GFX10-NEXT: s_mov_b32 s13, 0x7fff7fff
-; GFX10-NEXT: s_cselect_b32 s11, s8, s9
-; GFX10-NEXT: s_cmp_gt_i32 s10, 0
-; GFX10-NEXT: s_mov_b32 s15, 0x80008000
-; GFX10-NEXT: s_cselect_b32 s12, s10, 0
-; GFX10-NEXT: s_sext_i32_i16 s17, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s12
-; GFX10-NEXT: s_movk_i32 s12, 0x7fff
-; GFX10-NEXT: s_lshr_b32 s14, s11, 16
-; GFX10-NEXT: s_sub_i32 s11, s13, s11
-; GFX10-NEXT: s_sub_i32 s14, s12, s14
-; GFX10-NEXT: s_cmp_lt_i32 s8, s9
+; GFX10-NEXT: v_pk_add_i16 v0, s0, s4 clamp
+; GFX10-NEXT: v_pk_add_i16 v1, s1, s5 clamp
+; GFX10-NEXT: v_pk_add_i16 v2, s2, s6 clamp
+; GFX10-NEXT: v_pk_add_i16 v3, s3, s7 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s8, s8, s9
-; GFX10-NEXT: s_cmp_lt_i32 s10, 0
-; GFX10-NEXT: s_cselect_b32 s10, s10, 0
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s10
-; GFX10-NEXT: s_mov_b32 s10, 0x8000
-; GFX10-NEXT: s_lshr_b32 s16, s8, 16
-; GFX10-NEXT: s_sub_i32 s8, s15, s8
-; GFX10-NEXT: s_sub_i32 s16, s10, s16
-; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s16
-; GFX10-NEXT: s_sext_i32_i16 s16, s8
-; GFX10-NEXT: s_ashr_i32 s8, s8, 16
-; GFX10-NEXT: s_cmp_gt_i32 s16, s17
-; GFX10-NEXT: s_cselect_b32 s16, s16, s17
-; GFX10-NEXT: s_cmp_gt_i32 s8, s4
-; GFX10-NEXT: s_sext_i32_i16 s17, s5
-; GFX10-NEXT: s_cselect_b32 s4, s8, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s8, s11, s14
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX10-NEXT: s_sext_i32_i16 s14, s8
-; GFX10-NEXT: s_sext_i32_i16 s11, s4
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_ashr_i32 s8, s8, 16
-; GFX10-NEXT: s_cmp_lt_i32 s11, s14
-; GFX10-NEXT: s_cselect_b32 s11, s11, s14
-; GFX10-NEXT: s_cmp_lt_i32 s4, s8
-; GFX10-NEXT: s_cselect_b32 s4, s4, s8
-; GFX10-NEXT: s_lshr_b32 s8, s0, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s11, s4
-; GFX10-NEXT: s_lshr_b32 s11, s4, 16
-; GFX10-NEXT: s_add_i32 s0, s0, s4
-; GFX10-NEXT: s_sext_i32_i16 s4, s1
-; GFX10-NEXT: s_add_i32 s8, s8, s11
-; GFX10-NEXT: s_ashr_i32 s11, s1, 16
-; GFX10-NEXT: s_cmp_gt_i32 s4, s9
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s8
-; GFX10-NEXT: s_cselect_b32 s14, s4, s9
-; GFX10-NEXT: s_cmp_gt_i32 s11, 0
-; GFX10-NEXT: s_cselect_b32 s16, s11, 0
-; GFX10-NEXT: s_pack_ll_b32_b16 s14, s14, s16
-; GFX10-NEXT: s_lshr_b32 s16, s14, 16
-; GFX10-NEXT: s_sub_i32 s14, s13, s14
-; GFX10-NEXT: s_sub_i32 s16, s12, s16
-; GFX10-NEXT: s_cmp_lt_i32 s4, s9
-; GFX10-NEXT: s_pack_ll_b32_b16 s14, s14, s16
-; GFX10-NEXT: s_cselect_b32 s4, s4, s9
-; GFX10-NEXT: s_cmp_lt_i32 s11, 0
-; GFX10-NEXT: s_cselect_b32 s11, s11, 0
-; GFX10-NEXT: s_ashr_i32 s5, s5, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s11
-; GFX10-NEXT: s_lshr_b32 s11, s4, 16
-; GFX10-NEXT: s_sub_i32 s4, s15, s4
-; GFX10-NEXT: s_sub_i32 s11, s10, s11
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s11
-; GFX10-NEXT: s_sext_i32_i16 s11, s4
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_cmp_gt_i32 s11, s17
-; GFX10-NEXT: s_cselect_b32 s11, s11, s17
-; GFX10-NEXT: s_cmp_gt_i32 s4, s5
-; GFX10-NEXT: s_sext_i32_i16 s17, s6
-; GFX10-NEXT: s_cselect_b32 s4, s4, s5
-; GFX10-NEXT: s_sext_i32_i16 s5, s14
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s11, s4
-; GFX10-NEXT: s_ashr_i32 s11, s14, 16
-; GFX10-NEXT: s_sext_i32_i16 s14, s4
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_cmp_lt_i32 s14, s5
-; GFX10-NEXT: s_cselect_b32 s5, s14, s5
-; GFX10-NEXT: s_cmp_lt_i32 s4, s11
-; GFX10-NEXT: s_sext_i32_i16 s14, s2
-; GFX10-NEXT: s_cselect_b32 s4, s4, s11
-; GFX10-NEXT: s_lshr_b32 s11, s1, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s5, s4
-; GFX10-NEXT: s_ashr_i32 s5, s2, 16
-; GFX10-NEXT: s_lshr_b32 s16, s4, 16
-; GFX10-NEXT: s_add_i32 s1, s1, s4
-; GFX10-NEXT: s_add_i32 s11, s11, s16
-; GFX10-NEXT: s_cmp_gt_i32 s14, s9
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s11
-; GFX10-NEXT: s_cselect_b32 s4, s14, s9
-; GFX10-NEXT: s_cmp_gt_i32 s5, 0
-; GFX10-NEXT: s_cselect_b32 s16, s5, 0
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s16
-; GFX10-NEXT: s_lshr_b32 s16, s4, 16
-; GFX10-NEXT: s_sub_i32 s4, s13, s4
-; GFX10-NEXT: s_sub_i32 s16, s12, s16
-; GFX10-NEXT: s_cmp_lt_i32 s14, s9
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s16
-; GFX10-NEXT: s_cselect_b32 s14, s14, s9
-; GFX10-NEXT: s_cmp_lt_i32 s5, 0
-; GFX10-NEXT: s_cselect_b32 s5, s5, 0
-; GFX10-NEXT: s_ashr_i32 s6, s6, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s5, s14, s5
-; GFX10-NEXT: s_lshr_b32 s14, s5, 16
-; GFX10-NEXT: s_sub_i32 s5, s15, s5
-; GFX10-NEXT: s_sub_i32 s14, s10, s14
-; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s14
-; GFX10-NEXT: s_sext_i32_i16 s14, s5
-; GFX10-NEXT: s_ashr_i32 s5, s5, 16
-; GFX10-NEXT: s_cmp_gt_i32 s14, s17
-; GFX10-NEXT: s_cselect_b32 s14, s14, s17
-; GFX10-NEXT: s_cmp_gt_i32 s5, s6
-; GFX10-NEXT: s_cselect_b32 s5, s5, s6
-; GFX10-NEXT: s_sext_i32_i16 s6, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s5, s14, s5
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_sext_i32_i16 s14, s5
-; GFX10-NEXT: s_ashr_i32 s5, s5, 16
-; GFX10-NEXT: s_cmp_lt_i32 s14, s6
-; GFX10-NEXT: s_cselect_b32 s6, s14, s6
-; GFX10-NEXT: s_cmp_lt_i32 s5, s4
-; GFX10-NEXT: s_sext_i32_i16 s14, s3
-; GFX10-NEXT: s_cselect_b32 s4, s5, s4
-; GFX10-NEXT: s_lshr_b32 s5, s2, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s4
-; GFX10-NEXT: s_ashr_i32 s6, s3, 16
-; GFX10-NEXT: s_lshr_b32 s16, s4, 16
-; GFX10-NEXT: s_add_i32 s2, s2, s4
-; GFX10-NEXT: s_add_i32 s5, s5, s16
-; GFX10-NEXT: s_cmp_gt_i32 s14, s9
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX10-NEXT: s_cselect_b32 s4, s14, s9
-; GFX10-NEXT: s_cmp_gt_i32 s6, 0
-; GFX10-NEXT: s_cselect_b32 s16, s6, 0
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s16
-; GFX10-NEXT: s_lshr_b32 s16, s4, 16
-; GFX10-NEXT: s_sub_i32 s4, s13, s4
-; GFX10-NEXT: s_sub_i32 s12, s12, s16
-; GFX10-NEXT: s_cmp_lt_i32 s14, s9
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s12
-; GFX10-NEXT: s_cselect_b32 s9, s14, s9
-; GFX10-NEXT: s_cmp_lt_i32 s6, 0
-; GFX10-NEXT: s_cselect_b32 s6, s6, 0
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, s9, s6
-; GFX10-NEXT: s_lshr_b32 s9, s6, 16
-; GFX10-NEXT: s_sub_i32 s6, s15, s6
-; GFX10-NEXT: s_sub_i32 s9, s10, s9
-; GFX10-NEXT: s_sext_i32_i16 s10, s7
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s9
-; GFX10-NEXT: s_ashr_i32 s7, s7, 16
-; GFX10-NEXT: s_sext_i32_i16 s9, s6
-; GFX10-NEXT: s_ashr_i32 s6, s6, 16
-; GFX10-NEXT: s_cmp_gt_i32 s9, s10
-; GFX10-NEXT: s_cselect_b32 s9, s9, s10
-; GFX10-NEXT: s_cmp_gt_i32 s6, s7
-; GFX10-NEXT: s_cselect_b32 s6, s6, s7
-; GFX10-NEXT: s_sext_i32_i16 s7, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, s9, s6
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_sext_i32_i16 s9, s6
-; GFX10-NEXT: s_ashr_i32 s6, s6, 16
-; GFX10-NEXT: s_cmp_lt_i32 s9, s7
-; GFX10-NEXT: s_cselect_b32 s7, s9, s7
-; GFX10-NEXT: s_cmp_lt_i32 s6, s4
-; GFX10-NEXT: s_cselect_b32 s4, s6, s4
-; GFX10-NEXT: s_lshr_b32 s6, s3, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s7, s4
-; GFX10-NEXT: s_lshr_b32 s7, s4, 16
-; GFX10-NEXT: s_add_i32 s3, s3, s4
-; GFX10-NEXT: s_add_i32 s6, s6, s7
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT: ; return to shader part epilog
   %result = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
   %cast = bitcast <8 x i16> %result to <4 x i32>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index ac2a75383cba3..f9e4ccd03955e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -39,14 +39,8 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0
-; GFX9-NEXT: v_max_i16_e32 v2, -1, v0
-; GFX9-NEXT: v_min_i16_e32 v3, -1, v0
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1
-; GFX9-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_subrev_u16_e32 v3, 0x8000, v3
-; GFX9-NEXT: v_max_i16_e32 v1, v2, v1
-; GFX9-NEXT: v_min_i16_e32 v1, v1, v3
-; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1
+; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp
 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -57,13 +51,7 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 9, v0
 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 9, v1
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_max_i16_e64 v2, v0, -1
-; GFX10-NEXT: v_min_i16_e64 v3, v0, -1
-; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, 0x7fff
-; GFX10-NEXT: v_sub_nc_u16_e64 v3, v3, 0x8000
-; GFX10-NEXT: v_max_i16_e64 v1, v2, v1
-; GFX10-NEXT: v_min_i16_e64 v1, v1, v3
-; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1
+; GFX10-NEXT: v_sub_nc_i16 v0, v0, v1 clamp
 ; GFX10-NEXT: v_ashrrev_i16_e64 v0, 9, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %result = call i7 @llvm.ssub.sat.i7(i7 %lhs, i7 %rhs)
@@ -118,54 +106,23 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX9-LABEL: s_ssubsat_i7:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000
-; GFX9-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2
-; GFX9-NEXT: s_sext_i32_i16 s3, s0
-; GFX9-NEXT: s_sext_i32_i16 s4, -1
-; GFX9-NEXT: s_cmp_gt_i32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s5, s3, s4
-; GFX9-NEXT: s_sub_i32 s5, s5, 0x7fff
-; GFX9-NEXT: s_cmp_lt_i32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s3, s3, s4
-; GFX9-NEXT: s_sub_i32 s3, s3, 0xffff8000
-; GFX9-NEXT: s_sext_i32_i16 s4, s5
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_cmp_gt_i32 s4, s1
-; GFX9-NEXT: s_cselect_b32 s1, s4, s1
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_sext_i32_i16 s3, s3
-; GFX9-NEXT: s_cmp_lt_i32 s1, s3
-; GFX9-NEXT: s_cselect_b32 s1, s1, s3
-; GFX9-NEXT: s_sub_i32 s0, s0, s1
-; GFX9-NEXT: s_sext_i32_i16 s0, s0
-; GFX9-NEXT: s_ashr_i32 s0, s0, s2
+; GFX9-NEXT: s_lshl_b32 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp
+; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_ssubsat_i7:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_bfe_u32 s2, 9, 0x100000
-; GFX10-NEXT: s_sext_i32_i16 s4, -1
+; GFX10-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2
-; GFX10-NEXT: s_sext_i32_i16 s3, s0
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cmp_gt_i32 s3, s4
-; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s5, s3, s4
-; GFX10-NEXT: s_sub_i32 s5, s5, 0x7fff
-; GFX10-NEXT: s_cmp_lt_i32 s3, s4
-; GFX10-NEXT: s_cselect_b32 s3, s3, s4
-; GFX10-NEXT: s_sext_i32_i16 s4, s5
-; GFX10-NEXT: s_sub_i32 s3, s3, 0xffff8000
-; GFX10-NEXT: s_cmp_gt_i32 s4, s1
-; GFX10-NEXT: s_sext_i32_i16 s3, s3
-; GFX10-NEXT: s_cselect_b32 s1, s4, s1
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cmp_lt_i32 s1, s3
-; GFX10-NEXT: s_cselect_b32 s1, s1, s3
-; GFX10-NEXT: s_sub_i32 s0, s0, s1
-; GFX10-NEXT: s_sext_i32_i16 s0, s0
-; GFX10-NEXT: s_ashr_i32 s0, s0, s2
+; GFX10-NEXT: v_sub_nc_i16 v0, s0, s1 clamp
+; GFX10-NEXT: v_ashrrev_i16_e64 v0, 9, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
   %result = call i7 @llvm.ssub.sat.i7(i7 %lhs, i7 %rhs)
   ret i7 %result
@@ -206,14 +163,8 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT: v_max_i16_e32 v2, -1, v0
-; GFX9-NEXT: v_min_i16_e32 v3, -1, v0
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_subrev_u16_e32 v3, 0x8000, v3
-; GFX9-NEXT: v_max_i16_e32 v1, v2, v1
-; GFX9-NEXT: v_min_i16_e32 v1, v1, v3
-; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1
+; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp
 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -224,13 +175,7 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0
 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_max_i16_e64 v2, v0, -1
-; GFX10-NEXT: v_min_i16_e64 v3, v0, -1
-; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, 0x7fff
-; GFX10-NEXT: v_sub_nc_u16_e64 v3, v3, 0x8000
-; GFX10-NEXT: v_max_i16_e64 v1, v2, v1
-; GFX10-NEXT: v_min_i16_e64 v1, v1, v3
-; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1
+; GFX10-NEXT: v_sub_nc_i16 v0, v0, v1 clamp
 ; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs)
@@ -285,54 +230,23 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX9-LABEL: s_ssubsat_i8:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000
-; GFX9-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2
-; GFX9-NEXT: s_sext_i32_i16 s3, s0
-; GFX9-NEXT: s_sext_i32_i16 s4, -1
-; GFX9-NEXT: s_cmp_gt_i32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s5, s3, s4
-; GFX9-NEXT: s_sub_i32 s5, s5, 0x7fff
-; GFX9-NEXT: s_cmp_lt_i32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s3, s3, s4
-; GFX9-NEXT: s_sub_i32 s3, s3, 0xffff8000
-; GFX9-NEXT: s_sext_i32_i16 s4, s5
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_cmp_gt_i32 s4, s1
-; GFX9-NEXT: s_cselect_b32 s1, s4, s1
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_sext_i32_i16 s3, s3
-; GFX9-NEXT: s_cmp_lt_i32 s1, s3
-; GFX9-NEXT: s_cselect_b32 s1, s1, s3
-; GFX9-NEXT: s_sub_i32 s0, s0, s1
-; GFX9-NEXT: s_sext_i32_i16 s0, s0
-; GFX9-NEXT: s_ashr_i32 s0, s0, s2
+; GFX9-NEXT: s_lshl_b32 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp
+; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_ssubsat_i8:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000
-; GFX10-NEXT: s_sext_i32_i16 s4, -1
+; GFX10-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2
-; GFX10-NEXT: s_sext_i32_i16 s3, s0
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cmp_gt_i32 s3, s4
-; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s5, s3, s4
-; GFX10-NEXT: s_sub_i32 s5, s5, 0x7fff
-; GFX10-NEXT: s_cmp_lt_i32 s3, s4
-; GFX10-NEXT: s_cselect_b32 s3, s3, s4
-; GFX10-NEXT: s_sext_i32_i16 s4, s5
-; GFX10-NEXT: s_sub_i32 s3, s3, 0xffff8000
-; GFX10-NEXT: s_cmp_gt_i32 s4, s1
-; GFX10-NEXT: s_sext_i32_i16 s3, s3
-; GFX10-NEXT: s_cselect_b32 s1, s4, s1
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cmp_lt_i32 s1, s3
-; GFX10-NEXT: s_cselect_b32 s1, s1, s3
-; GFX10-NEXT: s_sub_i32 s0, s0, s1
-; GFX10-NEXT: s_sext_i32_i16 s0, s0
-; GFX10-NEXT: s_ashr_i32 s0, s0, s2
+; GFX10-NEXT: v_sub_nc_i16 v0, s0, s1 clamp
+; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
   %result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs)
   ret i8 %result
@@ -408,26 +322,12 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: s_mov_b32 s4, 8
 ; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_max_i16_e32 v4, -1, v0
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_subrev_u16_e32 v4, s4, v4
-; GFX9-NEXT: s_movk_i32 s5, 0x8000
-; GFX9-NEXT: v_min_i16_e32 v5, -1, v0
-; GFX9-NEXT: v_max_i16_e32 v1, v4, v1
-; GFX9-NEXT: v_subrev_u16_e32 v5, s5, v5
-; GFX9-NEXT: v_min_i16_e32 v1, v1, v5
-; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1
-; GFX9-NEXT: v_max_i16_e32 v1, -1, v2
-; GFX9-NEXT: v_subrev_u16_e32 v1, s4, v1
-; GFX9-NEXT: v_min_i16_e32 v4, -1, v2
-; GFX9-NEXT: v_subrev_u16_e32 v4, s5, v4
-; GFX9-NEXT: v_max_i16_e32 v1, v1, v3
-; GFX9-NEXT: v_min_i16_e32 v1, v1, v4
+; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp
 ; GFX9-NEXT: s_movk_i32 s4, 0xff
-; GFX9-NEXT: v_sub_u16_e32 v1, v2, v1
+; GFX9-NEXT: v_sub_i16 v1, v2, v3 clamp
 ; GFX9-NEXT: v_and_b32_sdwa v0, sext(v0), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX9-NEXT: v_and_b32_sdwa v1, sext(v1), s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
@@ -438,31 +338,17 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: s_mov_b32 s4, 8
-; GFX10-NEXT: v_lshlrev_b16_e64 v2, 8, v0
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: s_movk_i32 s5, 0x7fff
-; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1
-; GFX10-NEXT: v_max_i16_e64 v4, v2, -1
-; GFX10-NEXT: v_max_i16_e64 v5, v0, -1
-; GFX10-NEXT: v_min_i16_e64 v6, v2, -1
-; GFX10-NEXT: v_min_i16_e64 v7, v0, -1
-; GFX10-NEXT: s_movk_i32 s4, 0x8000
-; GFX10-NEXT: v_sub_nc_u16_e64 v4, v4, s5
-; GFX10-NEXT: v_sub_nc_u16_e64 v5, v5, s5
-; GFX10-NEXT: v_sub_nc_u16_e64 v6, v6, s4
-; GFX10-NEXT: v_sub_nc_u16_e64 v7, v7, s4
+; GFX10-NEXT: v_lshlrev_b16_e64 v3, 8, v1
+; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0
 ; GFX10-NEXT: s_movk_i32 s4, 0xff
-; GFX10-NEXT: v_max_i16_e64 v1, v4, v1
-; GFX10-NEXT: v_max_i16_e64 v10, v5, v3
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_min_i16_e64 v1, v1, v6
-; GFX10-NEXT: v_min_i16_e64 v3, v10, v7
-; GFX10-NEXT: v_sub_nc_u16_e64 v1, v2, v1
-; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v3
-; GFX10-NEXT: v_and_b32_sdwa v1, sext(v1), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT: v_sub_nc_i16 v1, v2, v1 clamp
+; GFX10-NEXT: v_sub_nc_i16 v0, v0, v3 clamp
+; GFX10-NEXT: v_and_b32_sdwa v1, sext(v1), s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %lhs = bitcast i16 %lhs.arg to <2 x i8>
   %rhs = bitcast i16 %rhs.arg to <2 x i8>
@@ -571,112 +457,40 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-LABEL: s_ssubsat_v2i8:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_bfe_u32 s4, 8, 0x100000
-; GFX9-NEXT: s_lshr_b32 s2, s0, 8
-; GFX9-NEXT: s_lshl_b32 s0, s0, s4
 ; GFX9-NEXT: s_lshr_b32 s3, s1, 8
 ; GFX9-NEXT: s_lshl_b32 s1, s1, s4
-; GFX9-NEXT: s_sext_i32_i16 s7, s0
-; GFX9-NEXT: s_sext_i32_i16 s8, -1
-; GFX9-NEXT: s_cmp_gt_i32 s7, s8
-; GFX9-NEXT: s_movk_i32 s5, 0x7fff
-; GFX9-NEXT: s_cselect_b32 s9, s7, s8
-; GFX9-NEXT: s_sub_i32 s9, s9, s5
-; GFX9-NEXT: s_cmp_lt_i32 s7, s8
-; GFX9-NEXT: s_movk_i32 s6, 0x8000
-; GFX9-NEXT: s_cselect_b32 s7, s7, s8
-; GFX9-NEXT: s_sub_i32 s7, s7, s6
-; GFX9-NEXT: s_sext_i32_i16 s9, s9
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_cmp_gt_i32 s9, s1
-; GFX9-NEXT: s_cselect_b32 s1, s9, s1
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_sext_i32_i16 s7, s7
-; GFX9-NEXT: s_cmp_lt_i32 s1, s7
-; GFX9-NEXT: s_cselect_b32 s1, s1, s7
-; GFX9-NEXT: s_sub_i32 s0, s0, s1
-; GFX9-NEXT: s_sext_i32_i16 s0, s0
-; GFX9-NEXT: s_lshl_b32 s1, s2, s4
-; GFX9-NEXT: s_lshl_b32 s2, s3, s4
-; GFX9-NEXT: s_ashr_i32 s0, s0, s4
-; GFX9-NEXT: s_sext_i32_i16 s3, s1
-; GFX9-NEXT: s_cmp_gt_i32 s3, s8
-; GFX9-NEXT: s_cselect_b32 s7, s3, s8
-; GFX9-NEXT: s_sub_i32 s5, s7, s5
-; GFX9-NEXT: s_cmp_lt_i32 s3, s8
-; GFX9-NEXT: s_cselect_b32 s3, s3, s8
-; GFX9-NEXT: s_sub_i32 s3, s3, s6
-; GFX9-NEXT: s_sext_i32_i16 s5, s5
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: s_cmp_gt_i32 s5, s2
-; GFX9-NEXT: s_cselect_b32 s2, s5, s2
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: s_sext_i32_i16 s3, s3
-; GFX9-NEXT: s_cmp_lt_i32 s2, s3
-; GFX9-NEXT: s_cselect_b32 s2, s2, s3
-; GFX9-NEXT: s_sub_i32 s1, s1, s2
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_movk_i32 s2, 0xff
-; GFX9-NEXT: s_ashr_i32 s1, s1, s4
-; GFX9-NEXT: s_and_b32 s1, s1, s2
-; GFX9-NEXT: s_and_b32 s0, s0, s2
-; GFX9-NEXT: s_lshl_b32 s1, s1, s4
-; GFX9-NEXT: s_or_b32 s0, s0, s1
+; GFX9-NEXT: s_lshr_b32 s2, s0, 8
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: s_lshl_b32 s0, s0, s4
+; GFX9-NEXT: s_lshl_b32 s1, s3, s4
+; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp
+; GFX9-NEXT: s_lshl_b32 s0, s2, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_sub_i16 v1, s0, v1 clamp
+; GFX9-NEXT: s_movk_i32 s0, 0xff
+; GFX9-NEXT: v_and_b32_sdwa v0, sext(v0), s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v1, sext(v1), s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_ssubsat_v2i8:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000
-; GFX10-NEXT: s_lshr_b32 s3, s0, 8
-; GFX10-NEXT: s_lshl_b32 s0, s0, s2
-; GFX10-NEXT: s_sext_i32_i16 s6, -1
-; GFX10-NEXT: s_sext_i32_i16 s5, s0
+; GFX10-NEXT: s_lshr_b32 s2, s0, 8
+; GFX10-NEXT: s_bfe_u32 s3, 8, 0x100000
 ; GFX10-NEXT: s_lshr_b32 s4, s1, 8
-; GFX10-NEXT: s_lshl_b32 s1, s1, s2
-; GFX10-NEXT: s_cmp_gt_i32 s5, s6
-; GFX10-NEXT: s_movk_i32 s7, 0x7fff
-; GFX10-NEXT: s_cselect_b32 s8, s5, s6
-; GFX10-NEXT: s_movk_i32 s9, 0x8000
-; GFX10-NEXT: s_sub_i32 s8, s8, s7
-; GFX10-NEXT: s_cmp_lt_i32 s5, s6
-; GFX10-NEXT: s_sext_i32_i16 s8, s8
-; GFX10-NEXT: s_cselect_b32 s5, s5, s6
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_sub_i32 s5, s5, s9
-; GFX10-NEXT: s_cmp_gt_i32 s8, s1
-; GFX10-NEXT: s_sext_i32_i16 s5, s5
-; GFX10-NEXT: s_cselect_b32 s1, s8, s1
+; GFX10-NEXT: s_lshl_b32 s0, s0, s3
+; GFX10-NEXT: s_lshl_b32 s1, s1, s3
+; GFX10-NEXT: s_lshl_b32 s2, s2, s3
+; GFX10-NEXT: s_lshl_b32 s3, s4, s3
+; GFX10-NEXT: v_sub_nc_i16 v0, s0, s1 clamp
+; GFX10-NEXT: v_sub_nc_i16 v1, s2, s3 clamp
+; GFX10-NEXT: s_movk_i32 s0, 0xff
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cmp_lt_i32 s1, s5
-; GFX10-NEXT: s_cselect_b32 s1, s1, s5
-; GFX10-NEXT: s_lshl_b32 s3, s3, s2
-; GFX10-NEXT: s_sub_i32 s0, s0, s1
-; GFX10-NEXT: s_lshl_b32 s1, s4, s2
-; GFX10-NEXT: s_sext_i32_i16 s0, s0
-; GFX10-NEXT: s_sext_i32_i16 s4, s3
-; GFX10-NEXT: s_ashr_i32 s0, s0, s2
-; GFX10-NEXT: s_cmp_gt_i32 s4, s6
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cselect_b32 s5, s4, s6
-; GFX10-NEXT: s_sub_i32 s5, s5, s7
-; GFX10-NEXT: s_cmp_lt_i32 s4, s6
-; GFX10-NEXT: s_sext_i32_i16 s5, s5
-; GFX10-NEXT: s_cselect_b32 s4, s4, s6
-; GFX10-NEXT: s_sub_i32 s4, s4, s9
-; GFX10-NEXT: s_cmp_gt_i32 s5, s1
-; GFX10-NEXT: s_sext_i32_i16 s4, s4
-; GFX10-NEXT: s_cselect_b32 s1, s5, s1
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cmp_lt_i32 s1, s4
-; GFX10-NEXT: s_cselect_b32 s1, s1, s4
-; GFX10-NEXT: s_sub_i32 s1, s3, s1
-; GFX10-NEXT: s_movk_i32 s3, 0xff
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_and_b32 s0, s0, s3
-; GFX10-NEXT: s_ashr_i32 s1, s1, s2
-; GFX10-NEXT: s_and_b32 s1, s1, s3
-; GFX10-NEXT: s_lshl_b32 s1, s1, s2
-; GFX10-NEXT: s_or_b32 s0, s0, s1
+; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v1, sext(v1), s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
   %lhs = bitcast i16 %lhs.arg to <2 x i8>
   %rhs = bitcast i16 %rhs.arg to <2 x i8>
@@ -815,52 +629,25 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: s_mov_b32 s4, 8
 ; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_max_i16_e32 v8, -1, v0
 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1
 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_subrev_u16_e32 v8, s4, v8
-; GFX9-NEXT: s_movk_i32 s5, 0x8000
-; GFX9-NEXT: v_min_i16_e32 v10, -1, v0
-; GFX9-NEXT: v_max_i16_e32 v1, v8, v1
-; GFX9-NEXT: v_subrev_u16_e32 v10, s5, v10
-; GFX9-NEXT: v_min_i16_e32 v1, v1, v10
-; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1
-; GFX9-NEXT: v_max_i16_e32 v1, -1, v2
-; GFX9-NEXT: v_subrev_u16_e32 v1, s4, v1
-; GFX9-NEXT: v_min_i16_e32 v8, -1, v2
-; GFX9-NEXT: v_subrev_u16_e32 v8, s5, v8
-; GFX9-NEXT: v_max_i16_e32 v1, v1, v5
-; GFX9-NEXT: v_min_i16_e32 v1, v1, v8
-; GFX9-NEXT: v_sub_u16_e32 v1, v2, v1
+; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp
+; GFX9-NEXT: v_sub_i16 v1, v2, v5 clamp
 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v3
-; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fff
-; GFX9-NEXT: v_max_i16_e32 v5, -1, v2
 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6
-; GFX9-NEXT: v_min_i16_e32 v6, -1, v2
-; GFX9-NEXT: v_sub_u16_e32 v5, v5, v9
-; GFX9-NEXT: v_subrev_u16_e32 v6, s5, v6
-; GFX9-NEXT: v_max_i16_e32 v3, v5, v3
-; GFX9-NEXT: v_min_i16_e32 v3, v3, v6
-; GFX9-NEXT: v_sub_u16_e32 v2, v2, v3
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v4
-; GFX9-NEXT: v_max_i16_e32 v5, -1, v3
-; GFX9-NEXT: v_min_i16_e32 v6, -1, v3
-; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v7
-; GFX9-NEXT: v_sub_u16_e32 v5, v5, v9
 ; GFX9-NEXT: s_movk_i32 s4, 0xff
-; GFX9-NEXT: v_subrev_u16_e32 v6, 0x8000, v6
-; GFX9-NEXT: v_max_i16_e32 v4, v5, v4
 ; GFX9-NEXT: v_and_b32_sdwa v1, sext(v1), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX9-NEXT: v_min_i16_e32 v4, v4, v6
+; GFX9-NEXT: v_sub_i16 v2, v2, v3 clamp
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v4
+; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v7
 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT: v_sub_u16_e32 v3, v3, v4
+; GFX9-NEXT: v_sub_i16 v3, v3, v4 clamp
 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT: v_and_b32_sdwa v1, sext(v2), s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX9-NEXT: v_and_b32_sdwa v2, sext(v3), s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
@@ -871,57 +658,30 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshlrev_b16_e64 v4, 8, v0
 ; GFX10-NEXT: s_mov_b32 s4, 8
-; GFX10-NEXT: v_lshlrev_b16_e64 v7, 8, v1
+; GFX10-NEXT: v_lshlrev_b16_e64 v5, 8, v0
 ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_max_i16_e64 v8, v4, -1
-; GFX10-NEXT: s_movk_i32 s4, 0x7fff
-; GFX10-NEXT: v_min_i16_e64 v10, v4, -1
-; GFX10-NEXT: v_max_i16_e64 v9, v2, -1
+; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16_e64 v6, 8, v1
 ; GFX10-NEXT: s_mov_b32 s5, 16
-; GFX10-NEXT: v_sub_nc_u16_e64 v8, v8, s4
-; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: s_mov_b32 s6, 24
-; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: s_movk_i32 s5, 0x8000
-; GFX10-NEXT: v_sub_nc_u16_e64 v15, v9, s4
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s6, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_min_i16_e64 v11, v2, -1
-; GFX10-NEXT: v_max_i16_e64 v7, v8, v7
-; GFX10-NEXT: v_sub_nc_u16_e64 v10, v10, s5
-; GFX10-NEXT: v_max_i16_e64 v5, v15, v5
-; GFX10-NEXT: v_mov_b32_e32 v9, 0x7fff
-; GFX10-NEXT: v_sub_nc_u16_e64 v8, v11, s5
-; GFX10-NEXT: v_max_i16_e64 v11, v3, -1
-; GFX10-NEXT: v_min_i16_e64 v7, v7, v10
-; GFX10-NEXT: v_max_i16_e64 v10, v0, -1
-; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s6, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_min_i16_e64 v5, v5, v8
-; GFX10-NEXT: v_sub_nc_u16_e64 v11, v11, v9
-; GFX10-NEXT: v_min_i16_e64 v8, v3, -1
-; GFX10-NEXT: v_sub_nc_u16_e64 v15, v10, v9
-; GFX10-NEXT: v_min_i16_e64 v12, v0, -1
-; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, v5
-; GFX10-NEXT: v_max_i16_e64 v6, v11, v6
-; GFX10-NEXT: v_sub_nc_u16_e64 v5, v8, s5
-; GFX10-NEXT: v_max_i16_e64 v1, v15, v1
-; GFX10-NEXT: v_sub_nc_u16_e64 v8, v12, 0x8000
-; GFX10-NEXT: s_movk_i32 s4, 0xff
-; GFX10-NEXT: v_sub_nc_u16_e64 v4, v4, v7
-; GFX10-NEXT: v_and_b32_sdwa v2, sext(v2), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT: v_min_i16_e64 v5, v6, v5
-; GFX10-NEXT: v_min_i16_e64 v1, v1, v8
+; GFX10-NEXT: s_mov_b32 s4, 24
+; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_sub_nc_i16 v2, v2, v3 clamp
+; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: s_movk_i32 s5, 0xff
+; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v2, sext(v2), s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_sub_nc_i16 v5, v5, v6 clamp
+; GFX10-NEXT: v_sub_nc_i16 v3, v4, v3 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_ashrrev_i16_e64 v4, 8, v4
+; GFX10-NEXT: v_sub_nc_i16 v0, v0, v1 clamp
 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX10-NEXT: v_sub_nc_u16_e64 v3, v3, v5
-; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1
-; GFX10-NEXT: v_and_or_b32 v1, v4, s4, v2
-; GFX10-NEXT: v_and_b32_sdwa v2, sext(v3), s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0
+; GFX10-NEXT: v_ashrrev_i16_e64 v4, 8, v5
+; GFX10-NEXT: v_and_b32_sdwa v1, sext(v3), s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_or_b32 v2, v4, s5, v2
+; GFX10-NEXT: v_or3_b32 v0, v2, v1, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %lhs = bitcast i32 %lhs.arg to <4 x i8>
   %rhs = bitcast i32 %rhs.arg to <4 x i8>
@@ -1118,212 +878,70 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX9-LABEL: s_ssubsat_v4i8:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_bfe_u32 s8, 8, 0x100000
-; GFX9-NEXT: s_lshr_b32 s2, s0, 8
-; GFX9-NEXT: s_lshr_b32 s3, s0, 16
-; GFX9-NEXT: s_lshr_b32 s4, s0, 24
-; GFX9-NEXT: s_lshl_b32 s0, s0, s8
 ; GFX9-NEXT: s_lshr_b32 s5, s1, 8
 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16
 ; GFX9-NEXT: s_lshr_b32 s7, s1, 24
 ; GFX9-NEXT: s_lshl_b32 s1, s1, s8
-; GFX9-NEXT: s_sext_i32_i16 s11, s0
-; GFX9-NEXT: s_sext_i32_i16 s12, -1
-; GFX9-NEXT: s_cmp_gt_i32 s11, s12
-; GFX9-NEXT: s_movk_i32 s9, 0x7fff
-; GFX9-NEXT: s_cselect_b32 s13, s11, s12
-; GFX9-NEXT: s_sub_i32 s13, s13, s9
-; GFX9-NEXT: s_cmp_lt_i32 s11, s12
-; GFX9-NEXT: s_movk_i32 s10, 0x8000
-; GFX9-NEXT: s_cselect_b32 s11, s11, s12
-; GFX9-NEXT: s_sub_i32 s11, s11, s10
-; GFX9-NEXT: s_sext_i32_i16 s13, s13
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_cmp_gt_i32 s13, s1
-; GFX9-NEXT: s_cselect_b32 s1, s13, s1
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_sext_i32_i16 s11, s11
-; GFX9-NEXT: s_cmp_lt_i32 s1, s11
-; GFX9-NEXT: s_cselect_b32 s1, s1, s11
-; GFX9-NEXT: s_sub_i32 s0, s0, s1
-; GFX9-NEXT: s_sext_i32_i16 s0, s0
-; GFX9-NEXT: s_lshl_b32 s1, s2, s8
-; GFX9-NEXT: s_lshl_b32 s2, s5, s8
-; GFX9-NEXT: s_ashr_i32 s0, s0, s8
-; GFX9-NEXT: s_sext_i32_i16 s5, s1
-; GFX9-NEXT: s_cmp_gt_i32 s5, s12
-; GFX9-NEXT: s_cselect_b32 s11, s5, s12
-; GFX9-NEXT: s_sub_i32 s11, s11, s9
-; GFX9-NEXT: s_cmp_lt_i32 s5, s12
-; GFX9-NEXT: s_cselect_b32 s5, s5, s12
-; GFX9-NEXT: s_sub_i32 s5, s5, s10
-; GFX9-NEXT: s_sext_i32_i16 s11, s11
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: s_cmp_gt_i32 s11, s2
-; GFX9-NEXT: s_cselect_b32 s2, s11, s2
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: s_sext_i32_i16 s5, s5
-; GFX9-NEXT: s_cmp_lt_i32 s2, s5
-; GFX9-NEXT: s_cselect_b32 s2, s2, s5
-; GFX9-NEXT: s_sub_i32 s1, s1, s2
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_lshl_b32 s2, s3, s8
-; GFX9-NEXT: s_lshl_b32 s3, s6, s8
-; GFX9-NEXT: s_ashr_i32 s1, s1, s8
-; GFX9-NEXT: s_sext_i32_i16 s5, s2
-; GFX9-NEXT: s_cmp_gt_i32 s5, s12
-; GFX9-NEXT: s_cselect_b32 s6, s5, s12
-; GFX9-NEXT: s_sub_i32 s6, s6, s9
-; GFX9-NEXT: s_cmp_lt_i32 s5, s12
-; GFX9-NEXT: s_cselect_b32 s5, s5, s12
-; GFX9-NEXT: s_sub_i32 s5, s5, s10
-; GFX9-NEXT: s_sext_i32_i16 s6, s6
-; GFX9-NEXT: s_sext_i32_i16 s3, s3
-; GFX9-NEXT: s_cmp_gt_i32 s6, s3
-; GFX9-NEXT: s_cselect_b32 s3, s6, s3
-; GFX9-NEXT: s_sext_i32_i16 s3, s3
-; GFX9-NEXT: s_sext_i32_i16 s5, s5
-; GFX9-NEXT: s_cmp_lt_i32 s3, s5
-; GFX9-NEXT: s_cselect_b32 s3, s3, s5
-; GFX9-NEXT: s_sub_i32 s2, s2, s3
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: s_lshl_b32 s3, s4, s8
-; GFX9-NEXT: s_lshl_b32 s4, s7, s8
-; GFX9-NEXT: s_ashr_i32 s2, s2, s8
-; GFX9-NEXT: s_sext_i32_i16 s5, s3
-; GFX9-NEXT: s_cmp_gt_i32 s5, s12
-; GFX9-NEXT: s_cselect_b32 s6, s5, s12
-; GFX9-NEXT: s_sub_i32 s6, s6, s9
-; GFX9-NEXT: s_cmp_lt_i32 s5, s12
-; GFX9-NEXT: s_cselect_b32 s5, s5, s12
-; GFX9-NEXT: s_sub_i32 s5, s5, s10
-; GFX9-NEXT: s_sext_i32_i16 s6, s6
-; GFX9-NEXT: s_sext_i32_i16 s4, s4
-; GFX9-NEXT: s_cmp_gt_i32 s6, s4
-; GFX9-NEXT: s_cselect_b32 s4, s6, s4
-; GFX9-NEXT: s_sext_i32_i16 s4, s4
-; GFX9-NEXT: s_sext_i32_i16 s5, s5
-; GFX9-NEXT: s_cmp_lt_i32 s4, s5
-; GFX9-NEXT: s_cselect_b32 s4, s4, s5
-; GFX9-NEXT: s_sub_i32 s3, s3, s4
-; GFX9-NEXT: s_movk_i32 s4, 0xff
-; GFX9-NEXT: s_and_b32 s1, s1, s4
-; GFX9-NEXT: s_and_b32 s0, s0, s4
-; GFX9-NEXT: s_lshl_b32 s1, s1, 8
-; GFX9-NEXT: s_sext_i32_i16 s3, s3
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: s_and_b32 s1, s2, s4
-; GFX9-NEXT: s_lshl_b32 s1, s1, 16
-; GFX9-NEXT: s_ashr_i32 s3, s3, s8
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: s_and_b32 s1, s3, s4
-; GFX9-NEXT: s_lshl_b32 s1, s1, 24
-; GFX9-NEXT: s_or_b32 s0, s0, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: s_lshl_b32 s1, s5, s8
+; GFX9-NEXT: s_lshr_b32 s2, s0, 8
+; GFX9-NEXT: s_lshr_b32 s3, s0, 16
+; GFX9-NEXT: s_lshr_b32 s4, s0, 24
+; GFX9-NEXT: s_lshl_b32 s0, s0, s8
+; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_lshl_b32 s0, s2, s8
+; GFX9-NEXT: s_lshl_b32 s1, s6, s8
+; GFX9-NEXT: v_sub_i16 v1, s0, v1 clamp
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: s_lshl_b32 s0, s3, s8
+; GFX9-NEXT: s_lshl_b32 s1, s7, s8
+; GFX9-NEXT: v_sub_i16 v2, s0, v2 clamp
+; GFX9-NEXT: s_lshl_b32 s0, s4, s8
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_sub_i16 v3, s0, v3 clamp
+; GFX9-NEXT: s_movk_i32 s0, 0xff
+; GFX9-NEXT: v_and_b32_sdwa v1, sext(v1), s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v1
+; GFX9-NEXT: v_and_b32_sdwa v1, sext(v2), s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v2, sext(v3), s0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_ssubsat_v4i8:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_bfe_u32 s6, 8, 0x100000
+; GFX10-NEXT: s_bfe_u32 s5, 8, 0x100000
 ; GFX10-NEXT: s_lshr_b32 s2, s0, 8
+; GFX10-NEXT: s_lshr_b32 s6, s1, 8
+; GFX10-NEXT: s_lshl_b32 s2, s2, s5
+; GFX10-NEXT: s_lshl_b32 s6, s6, s5
 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16
+; GFX10-NEXT: v_sub_nc_i16 v1, s2, s6 clamp
 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24
-; GFX10-NEXT: s_lshl_b32 s0, s0, s6
-; GFX10-NEXT: s_sext_i32_i16 s10, -1
-; GFX10-NEXT: s_sext_i32_i16 s9, s0
-; GFX10-NEXT: s_lshr_b32 s5, s1, 8
-; GFX10-NEXT: s_lshr_b32 s7, s1, 16
-; GFX10-NEXT: s_lshr_b32 s8, s1, 24
-; GFX10-NEXT: s_lshl_b32 s1, s1, s6
-; GFX10-NEXT: s_cmp_gt_i32 s9, s10
-; GFX10-NEXT: s_movk_i32 s11, 0x7fff
-; GFX10-NEXT: s_cselect_b32 s12, s9, s10
-; GFX10-NEXT: s_movk_i32 s13, 0x8000
-; GFX10-NEXT: s_sub_i32 s12, s12, s11
-; GFX10-NEXT: s_cmp_lt_i32 s9, s10
-; GFX10-NEXT: s_sext_i32_i16 s12, s12
-; GFX10-NEXT: s_cselect_b32 s9, s9, s10
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_sub_i32 s9, s9, s13
-; GFX10-NEXT: s_cmp_gt_i32 s12, s1
-; GFX10-NEXT: s_sext_i32_i16 s9, s9
-; GFX10-NEXT: s_cselect_b32 s1, s12, s1
+; GFX10-NEXT: s_movk_i32 s2, 0xff
+; GFX10-NEXT: s_lshl_b32 s0, s0, s5
+; GFX10-NEXT: s_lshl_b32 s7, s1, s5
+; GFX10-NEXT: v_and_b32_sdwa v1, sext(v1), s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_sub_nc_i16 v0, s0, s7 clamp
+; GFX10-NEXT: s_lshr_b32 s0, s1, 16
+; GFX10-NEXT: s_lshr_b32 s1, s1, 24
+; GFX10-NEXT: s_lshl_b32 s3, s3, s5
+; GFX10-NEXT: s_lshl_b32 s0, s0, s5
+; GFX10-NEXT: s_lshl_b32 s4, s4, s5
+; GFX10-NEXT: s_lshl_b32 s1, s1, s5
+; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX10-NEXT: v_sub_nc_i16 v2, s3, s0 clamp
+; GFX10-NEXT: v_sub_nc_i16 v3, s4, s1 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cmp_lt_i32 s1, s9
-; GFX10-NEXT: s_cselect_b32 s1, s1, s9
-; GFX10-NEXT: s_lshl_b32 s5, s5, s6
-; GFX10-NEXT: s_sub_i32 s0, s0, s1
-; GFX10-NEXT: s_lshl_b32 s1, s2, s6
-; GFX10-NEXT: s_sext_i32_i16 s0, s0
-; GFX10-NEXT: s_sext_i32_i16 s2, s1
-; GFX10-NEXT: s_ashr_i32 s0, s0, s6
-; GFX10-NEXT: s_cmp_gt_i32 s2, s10
-; GFX10-NEXT: s_sext_i32_i16 s5, s5
-; GFX10-NEXT: s_cselect_b32 s9, s2, s10
-; GFX10-NEXT: s_sub_i32 s9, s9, s11
-; GFX10-NEXT: s_cmp_lt_i32 s2, s10
-; GFX10-NEXT: s_sext_i32_i16 s9, s9
-; GFX10-NEXT: s_cselect_b32 s2, s2, s10
-; GFX10-NEXT: s_sub_i32 s2, s2, s13
-; GFX10-NEXT: s_cmp_gt_i32 s9, s5
-; GFX10-NEXT: s_sext_i32_i16 s2, s2
-; GFX10-NEXT: s_cselect_b32 s5, s9, s5
-; GFX10-NEXT: s_sext_i32_i16 s5, s5
-; GFX10-NEXT: s_cmp_lt_i32 s5, s2
-; GFX10-NEXT: s_cselect_b32 s2, s5, s2
-; GFX10-NEXT: s_lshl_b32 s3, s3, s6
-; GFX10-NEXT: s_sub_i32 s1, s1, s2
-; GFX10-NEXT: s_sext_i32_i16 s5, s3
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_lshl_b32 s2, s7, s6
-; GFX10-NEXT: s_ashr_i32 s1, s1, s6
-; GFX10-NEXT: s_cmp_gt_i32 s5, s10
-; GFX10-NEXT: s_sext_i32_i16 s2, s2
-; GFX10-NEXT: s_cselect_b32 s7, s5, s10
-; GFX10-NEXT: s_sub_i32 s7, s7, s11
-; GFX10-NEXT: s_cmp_lt_i32 s5, s10
-; GFX10-NEXT: s_sext_i32_i16 s7, s7
-; GFX10-NEXT: s_cselect_b32 s5, s5, s10
-; GFX10-NEXT: s_sub_i32 s5, s5, s13
-; GFX10-NEXT: s_cmp_gt_i32 s7, s2
-; GFX10-NEXT: s_sext_i32_i16 s5, s5
-; GFX10-NEXT: s_cselect_b32 s2, s7, s2
-; GFX10-NEXT: s_sext_i32_i16 s2, s2
-; GFX10-NEXT: s_cmp_lt_i32 s2, s5
-; GFX10-NEXT: s_cselect_b32 s2, s2, s5
-; GFX10-NEXT: s_lshl_b32 s4, s4, s6
-; GFX10-NEXT: s_sub_i32 s2, s3, s2
-; GFX10-NEXT: s_sext_i32_i16 s5, s4
-; GFX10-NEXT: s_sext_i32_i16 s2, s2
-; GFX10-NEXT: s_lshl_b32 s3, s8, s6
-; GFX10-NEXT: s_ashr_i32 s2, s2, s6
-; GFX10-NEXT: s_cmp_gt_i32 s5, s10
-; GFX10-NEXT: s_sext_i32_i16 s3, s3
-; GFX10-NEXT: s_cselect_b32 s7, s5, s10
-; GFX10-NEXT: s_sub_i32 s7, s7, s11
-; GFX10-NEXT: s_cmp_lt_i32 s5, s10
-; GFX10-NEXT: s_sext_i32_i16 s7, s7
-; GFX10-NEXT: s_cselect_b32 s5, s5, s10
-; GFX10-NEXT: s_sub_i32 s5, s5, s13
-; GFX10-NEXT: s_cmp_gt_i32 s7, s3
-; GFX10-NEXT: s_sext_i32_i16 s5, s5
-; GFX10-NEXT: s_cselect_b32 s3, s7, s3
-; GFX10-NEXT: s_movk_i32 s7, 0xff
-; GFX10-NEXT: s_sext_i32_i16 s3, s3
-; GFX10-NEXT: s_cmp_lt_i32 s3, s5
-; GFX10-NEXT: s_cselect_b32 s3, s3, s5
-; GFX10-NEXT: s_and_b32 s1, s1, s7
-; GFX10-NEXT: s_sub_i32 s3, s4, s3
-; GFX10-NEXT: s_and_b32 s0, s0, s7
-; GFX10-NEXT: s_sext_i32_i16 s3, s3
-; GFX10-NEXT: s_lshl_b32 s1, s1, 8
-; GFX10-NEXT: s_and_b32 s2, s2, s7
-; GFX10-NEXT: s_ashr_i32 s3, s3, s6
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_lshl_b32 s1, s2, 16
-; GFX10-NEXT: s_and_b32 s2, s3, s7
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_lshl_b32 s1, s2, 24
-; GFX10-NEXT: s_or_b32 s0, s0, s1
+; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v1
+; GFX10-NEXT: v_and_b32_sdwa v1, sext(v2), s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v2, sext(v3), s2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
   %lhs = bitcast i32 %lhs.arg to <4 x i8>
   %rhs = bitcast i32 %rhs.arg to <4 x i8>
@@ -1368,14 +986,8 @@ define i24 @v_ssubsat_i24(i24 %lhs, i24 %rhs) {
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GFX9-NEXT: v_max_i32_e32 v2, -1, v0
-; GFX9-NEXT: v_min_i32_e32 v3, -1, v0
 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT: v_subrev_u32_e32 v2, 0x7fffffff, v2
-; GFX9-NEXT: v_subrev_u32_e32 v3, 0x80000000, v3
-; GFX9-NEXT: v_max_i32_e32 v1, v2, v1
-; GFX9-NEXT: v_min_i32_e32 v1, v1, v3
-; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1
+; GFX9-NEXT: v_sub_i32 v0, v0, v1 clamp
 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1386,13 +998,7 @@ define i24 @v_ssubsat_i24(i24 %lhs, i24 %rhs) {
 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_max_i32_e32 v2, -1, v0
-; GFX10-NEXT: v_min_i32_e32 v3, -1, v0
-; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 0x7fffffff, v2
-; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 0x80000000, v3
-; GFX10-NEXT: v_max_i32_e32 v1, v2, v1
-; GFX10-NEXT: v_min_i32_e32 v1, v1, v3
-; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1
+; GFX10-NEXT: v_sub_nc_i32 v0, v0, v1 clamp
 ; GFX10-NEXT: v_ashrrev_i32_e32 v0, 8, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %result = call i24 @llvm.ssub.sat.i24(i24 %lhs, i24 %rhs)
@@ -1439,39 +1045,22 @@ define amdgpu_ps i24 @s_ssubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
 ;
 ; GFX9-LABEL: s_ssubsat_i24:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl_b32 s0, s0, 8
 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8
-; GFX9-NEXT: s_cmp_gt_i32 s0, -1
-; GFX9-NEXT: s_cselect_b32 s2, s0, -1
-; GFX9-NEXT: s_sub_i32 s2, s2, 0x7fffffff
-; GFX9-NEXT: s_cmp_lt_i32 s0, -1
-; GFX9-NEXT: s_cselect_b32 s3, s0, -1
-; GFX9-NEXT: s_sub_i32 s3, s3, 0x80000000
-; GFX9-NEXT: s_cmp_gt_i32 s2, s1
-; GFX9-NEXT: s_cselect_b32 s1, s2, s1
-; GFX9-NEXT: s_cmp_lt_i32 s1, s3
-; GFX9-NEXT: s_cselect_b32 s1, s1, s3
-; GFX9-NEXT: s_sub_i32 s0, s0, s1
-; GFX9-NEXT: s_ashr_i32 s0, s0, 8
+; GFX9-NEXT: s_lshl_b32 s0, s0, 8
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp
+; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_ssubsat_i24:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_lshl_b32 s0, s0, 8
 ; GFX10-NEXT: s_lshl_b32 s1, s1, 8
-; GFX10-NEXT: s_cmp_gt_i32 s0, -1
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s2, s0, -1
-; GFX10-NEXT: s_sub_i32 s2, s2, 0x7fffffff
-; GFX10-NEXT: s_cmp_lt_i32 s0, -1
-; GFX10-NEXT: s_cselect_b32 s3, s0, -1
-; GFX10-NEXT: s_sub_i32 s3, s3, 0x80000000
-; GFX10-NEXT: s_cmp_gt_i32 s2, s1
-; GFX10-NEXT: s_cselect_b32 s1, s2, s1
-; GFX10-NEXT: s_cmp_lt_i32 s1, s3
-; GFX10-NEXT: s_cselect_b32 s1, s1, s3
-; GFX10-NEXT: s_sub_i32 s0, s0, s1
-; GFX10-NEXT: s_ashr_i32 s0, s0, 8
+; GFX10-NEXT: v_sub_nc_i32 v0, s0, s1 clamp
+; GFX10-NEXT: v_ashrrev_i32_e32 v0, 8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
   %result = call i24 @llvm.ssub.sat.i24(i24 %lhs, i24 %rhs)
   ret i24 %result
@@ -1505,27 +1094,15 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX9-LABEL: v_ssubsat_i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_i32_e32 v2, -1, v0
-; GFX9-NEXT: v_min_i32_e32 v3, -1, v0
-; GFX9-NEXT: v_subrev_u32_e32 v2, 0x7fffffff, v2
-; GFX9-NEXT: v_subrev_u32_e32 v3, 0x80000000, v3
-; GFX9-NEXT: v_max_i32_e32 v1, v2, v1
-; GFX9-NEXT: v_min_i32_e32 v1, v1, v3
-; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1
+; GFX9-NEXT: v_sub_i32 v0, v0, v1 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_i32:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_i32_e32 v2, -1, v0
-; GFX10-NEXT: v_min_i32_e32 v3, -1, v0
+; GFX10-NEXT: v_sub_nc_i32 v0, v0, v1 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 0x7fffffff, v2
-; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 0x80000000, v3
-; GFX10-NEXT: v_max_i32_e32 v1, v2, v1
-; GFX10-NEXT: v_min_i32_e32 v1, v1, v3
-; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs)
   ret i32 %result
@@ -1564,33 +1141,16 @@ define amdgpu_ps i32 @s_ssubsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
 ;
 ; GFX9-LABEL: s_ssubsat_i32:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_cmp_gt_i32 s0, -1
-; GFX9-NEXT: s_cselect_b32 s2, s0, -1
-; GFX9-NEXT: s_sub_i32 s2, s2, 0x7fffffff
-; GFX9-NEXT: s_cmp_lt_i32 s0, -1
-; GFX9-NEXT: s_cselect_b32 s3, s0, -1
-; GFX9-NEXT: s_sub_i32 s3, s3, 0x80000000
-; GFX9-NEXT: s_cmp_gt_i32 s2, s1
-; GFX9-NEXT: s_cselect_b32 s1, s2, s1
-; GFX9-NEXT: s_cmp_lt_i32 s1, s3
-; GFX9-NEXT: s_cselect_b32 s1, s1, s3
-; GFX9-NEXT: s_sub_i32 s0, s0, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_ssubsat_i32:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_cmp_gt_i32 s0, -1
+; GFX10-NEXT: v_sub_nc_i32 v0, s0, s1 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s2, s0, -1
-; GFX10-NEXT: s_sub_i32 s2, s2, 0x7fffffff
-; GFX10-NEXT: s_cmp_lt_i32 s0, -1
-; GFX10-NEXT: s_cselect_b32 s3, s0, -1
-; GFX10-NEXT: s_sub_i32 s3, s3, 0x80000000
-; GFX10-NEXT: s_cmp_gt_i32 s2, s1
-; GFX10-NEXT: s_cselect_b32 s1, s2, s1
-; GFX10-NEXT: s_cmp_lt_i32 s1, s3
-; GFX10-NEXT: s_cselect_b32 s1, s1, s3
-; GFX10-NEXT: s_sub_i32 s0, s0, s1
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
   %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs)
   ret i32 %result
@@ -1625,29 +1185,13 @@ define amdgpu_ps float @ssubsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
 ;
 ; GFX9-LABEL: ssubsat_i32_sv:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_cmp_gt_i32 s0, -1
-; GFX9-NEXT: s_cselect_b32 s1, s0, -1
-; GFX9-NEXT: s_sub_i32 s1, s1, 0x7fffffff
-; GFX9-NEXT: s_cmp_lt_i32 s0, -1
-; GFX9-NEXT: s_cselect_b32 s2, s0, -1
-; GFX9-NEXT: s_sub_i32 s2, s2, 0x80000000
-; GFX9-NEXT: v_max_i32_e32 v0, s1, v0
-; GFX9-NEXT: v_min_i32_e32 v0, s2, v0
-; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
+; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: ssubsat_i32_sv:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_cmp_gt_i32 s0, -1
+; GFX10-NEXT: v_sub_nc_i32 v0, s0, v0 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s1, s0, -1
-; GFX10-NEXT: s_sub_i32 s1, s1, 0x7fffffff
-; GFX10-NEXT: s_cmp_lt_i32 s0, -1
-; GFX10-NEXT: v_max_i32_e32 v0, s1, v0
-; GFX10-NEXT: s_cselect_b32 s1, s0, -1
-; GFX10-NEXT: s_sub_i32 s1, s1, 0x80000000
-; GFX10-NEXT: v_min_i32_e32 v0, s1, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v0, s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
   %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs)
   %cast = bitcast i32 %result to float
@@ -1679,25 +1223,13 @@ define amdgpu_ps float @ssubsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
 ;
 ; GFX9-LABEL: ssubsat_i32_vs:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: v_max_i32_e32 v1, -1, v0
-; GFX9-NEXT: v_subrev_u32_e32 v1, 0x7fffffff, v1
-; GFX9-NEXT: v_min_i32_e32 v2, -1, v0
-; GFX9-NEXT: v_subrev_u32_e32 v2, 0x80000000, v2
-; GFX9-NEXT: v_max_i32_e32 v1, s0, v1
-; GFX9-NEXT: v_min_i32_e32 v1, v1, v2
-; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1
+; GFX9-NEXT: v_sub_i32 v0, v0, s0 clamp
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: ssubsat_i32_vs:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: v_max_i32_e32 v1, -1, v0
-; GFX10-NEXT: v_min_i32_e32 v2, -1, v0
+; GFX10-NEXT: v_sub_nc_i32 v0, v0, s0 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 0x7fffffff, v1
-; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 0x80000000, v2
-; GFX10-NEXT: v_max_i32_e32 v1, s0, v1
-; GFX10-NEXT: v_min_i32_e32 v1, v1, v2
-; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT: ; return to shader part epilog
   %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs)
   %cast = bitcast i32 %result to float
@@ -1750,45 +1282,17 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX9-LABEL: v_ssubsat_v2i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_max_i32_e32 v4, -1, v0
-; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v4
-; GFX9-NEXT: s_brev_b32 s5, 1
-; GFX9-NEXT: v_min_i32_e32 v5, -1, v0
-; GFX9-NEXT: v_max_i32_e32 v2, v4, v2
-; GFX9-NEXT: v_subrev_u32_e32 v5, s5, v5
-; GFX9-NEXT: v_min_i32_e32 v2, v2, v5
-; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2
-; GFX9-NEXT: v_max_i32_e32 v2, -1, v1
-; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2
-; GFX9-NEXT: v_min_i32_e32 v4, -1, v1
-; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v4
-; GFX9-NEXT: v_max_i32_e32 v2, v2, v3
-; GFX9-NEXT: v_min_i32_e32 v2, v2, v4
-; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2
+; GFX9-NEXT: v_sub_i32 v0, v0, v2 clamp
+; GFX9-NEXT: v_sub_i32 v1, v1, v3 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_v2i32:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_i32_e32 v4, -1, v0
-; GFX10-NEXT: v_max_i32_e32 v5, -1, v1
-; GFX10-NEXT: s_brev_b32 s4, -2
-; GFX10-NEXT: v_min_i32_e32 v6, -1, v0
-; GFX10-NEXT: v_min_i32_e32 v7, -1, v1
-; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s4, v4
-; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s4, v5
-; GFX10-NEXT: s_brev_b32 s4, 1
+; GFX10-NEXT: v_sub_nc_i32 v0, v0, v2 clamp
+; GFX10-NEXT: v_sub_nc_i32 v1, v1, v3 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v6
-; GFX10-NEXT: v_max_i32_e32 v11, v4, v2
-; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s4, v7
-; GFX10-NEXT: v_max_i32_e32 v10, v5, v3
-; GFX10-NEXT: v_min_i32_e32 v2, v11, v6
-; GFX10-NEXT: v_min_i32_e32 v3, v10, v7
-; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v2
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v3
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %result = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
   ret <2 x i32> %result
@@ -1853,59 +1357,21 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 ;
 ; GFX9-LABEL: s_ssubsat_v2i32:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_cmp_gt_i32 s0, -1
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: s_cselect_b32 s6, s0, -1
-; GFX9-NEXT: s_sub_i32 s6, s6, s4
-; GFX9-NEXT: s_cmp_lt_i32 s0, -1
-; GFX9-NEXT: s_brev_b32 s5, 1
-; GFX9-NEXT: s_cselect_b32 s7, s0, -1
-; GFX9-NEXT: s_sub_i32 s7, s7, s5
-; GFX9-NEXT: s_cmp_gt_i32 s6, s2
-; GFX9-NEXT: s_cselect_b32 s2, s6, s2
-; GFX9-NEXT: s_cmp_lt_i32 s2, s7
-; GFX9-NEXT: s_cselect_b32 s2, s2, s7
-; GFX9-NEXT: s_sub_i32 s0, s0, s2
-; GFX9-NEXT: s_cmp_gt_i32 s1, -1
-; GFX9-NEXT: s_cselect_b32 s2, s1, -1
-; GFX9-NEXT: s_sub_i32 s2, s2, s4
-; GFX9-NEXT: s_cmp_lt_i32 s1, -1
-; GFX9-NEXT: s_cselect_b32 s4, s1, -1
-; GFX9-NEXT: s_sub_i32 s4, s4, s5
-; GFX9-NEXT: s_cmp_gt_i32 s2, s3
-; GFX9-NEXT: s_cselect_b32 s2, s2, s3
-; GFX9-NEXT: s_cmp_lt_i32 s2, s4
-; GFX9-NEXT: s_cselect_b32 s2, s2, s4
-; GFX9-NEXT: s_sub_i32 s1, s1, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp
+; GFX9-NEXT: v_sub_i32 v1, s1, v1 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_ssubsat_v2i32:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_cmp_gt_i32 s0, -1
-; GFX10-NEXT: s_brev_b32 s4, -2
-; GFX10-NEXT: s_cselect_b32 s5, s0, -1
-; GFX10-NEXT: s_brev_b32 s6, 1
-; GFX10-NEXT: s_sub_i32 s5, s5, s4
-; GFX10-NEXT: s_cmp_lt_i32 s0, -1
+; GFX10-NEXT: v_sub_nc_i32 v0, s0, s2 clamp
+; GFX10-NEXT: v_sub_nc_i32 v1, s1, s3 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s7, s0, -1
-; GFX10-NEXT: s_sub_i32 s7, s7, s6
-; GFX10-NEXT: s_cmp_gt_i32 s5, s2
-; GFX10-NEXT: s_cselect_b32 s2, s5, s2
-; GFX10-NEXT: s_cmp_lt_i32 s2, s7
-; GFX10-NEXT: s_cselect_b32 s2, s2, s7
-; GFX10-NEXT: s_sub_i32 s0, s0, s2
-; GFX10-NEXT: s_cmp_gt_i32 s1, -1
-; GFX10-NEXT: s_cselect_b32 s2, s1, -1
-; GFX10-NEXT: s_sub_i32 s2, s2, s4
-; GFX10-NEXT: s_cmp_lt_i32 s1, -1
-; GFX10-NEXT: s_cselect_b32 s4, s1, -1
-; GFX10-NEXT: s_sub_i32 s4, s4, s6
-; GFX10-NEXT: s_cmp_gt_i32 s2, s3
-; GFX10-NEXT: s_cselect_b32 s2, s2, s3
-; GFX10-NEXT: s_cmp_lt_i32 s2, s4
-; GFX10-NEXT: s_cselect_b32 s2, s2, s4
-; GFX10-NEXT: s_sub_i32 s1, s1, s2
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT: ; return to shader part epilog
   %result = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
   ret <2 x i32> %result
@@ -1971,59 +1437,19 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX9-LABEL: v_ssubsat_v3i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_max_i32_e32 v6, -1, v0
-; GFX9-NEXT: v_subrev_u32_e32 v6, s4, v6
-; GFX9-NEXT: s_brev_b32 s5, 1
-; GFX9-NEXT: v_min_i32_e32 v7, -1, v0
-; GFX9-NEXT: v_max_i32_e32 v3, v6, v3
-; GFX9-NEXT: v_subrev_u32_e32 v7, s5, v7
-; GFX9-NEXT: v_min_i32_e32 v3, v3, v7
-; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT: v_max_i32_e32 v3, -1, v1
-; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v3
-; GFX9-NEXT: v_min_i32_e32 v6, -1, v1
-; GFX9-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v6
-; GFX9-NEXT: v_min_i32_e32 v3, v3, v6
-; GFX9-NEXT: v_sub_u32_e32 v1, v1, v3
-; GFX9-NEXT: v_max_i32_e32 v3, -1, v2
-; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v3
-; GFX9-NEXT: v_min_i32_e32 v4, -1, v2
-; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v4
-; GFX9-NEXT: v_max_i32_e32 v3, v3, v5
-; GFX9-NEXT: v_min_i32_e32 v3, v3, v4
-; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3
+; GFX9-NEXT: v_sub_i32 v0, v0, v3 clamp
+; GFX9-NEXT: v_sub_i32 v1, v1, v4 clamp
+; GFX9-NEXT: v_sub_i32 v2, v2, v5 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_v3i32:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_i32_e32 v6, -1, v0
-; GFX10-NEXT: v_max_i32_e32 v8, -1, v1
-; GFX10-NEXT: v_max_i32_e32 v9, -1, v2
-; GFX10-NEXT: s_brev_b32 s4, -2
-; GFX10-NEXT: v_min_i32_e32 v7, -1, v0
-; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v6
-; GFX10-NEXT: v_subrev_nc_u32_e32 v15, s4, v8
-; GFX10-NEXT: v_subrev_nc_u32_e32 v19, s4, v9
-; GFX10-NEXT: v_min_i32_e32 v10, -1, v1
-; GFX10-NEXT: v_min_i32_e32 v11, -1, v2
-; GFX10-NEXT: s_brev_b32 s5, 1
-; GFX10-NEXT: v_max_i32_e32 v14, v6, v3
-; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s5, v7
-; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s5, v10
-; GFX10-NEXT: v_max_i32_e32 v4, v15, v4
-; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s5, v11
-; GFX10-NEXT: v_max_i32_e32 v5, v19, v5
-; GFX10-NEXT: v_min_i32_e32 v3, v14, v7
+; GFX10-NEXT: v_sub_nc_i32 v0, v0, v3 clamp
+; GFX10-NEXT: v_sub_nc_i32 v1, v1, v4 clamp
+; GFX10-NEXT: v_sub_nc_i32 v2, v2, v5 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_min_i32_e32 v4, v4, v6
-; GFX10-NEXT: v_min_i32_e32 v5, v5, v8
-; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v3
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v4
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v5
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %result = call <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
   ret <3 x i32> %result
@@ -2110,81 +1536,26 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 ;
 ; GFX9-LABEL: s_ssubsat_v3i32:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_cmp_gt_i32 s0, -1
-; GFX9-NEXT: s_brev_b32 s6, -2
-; GFX9-NEXT: s_cselect_b32 s8, s0, -1
-; GFX9-NEXT: s_sub_i32 s8, s8, s6
-; GFX9-NEXT: s_cmp_lt_i32 s0, -1
-; GFX9-NEXT: s_brev_b32 s7, 1
-; GFX9-NEXT: s_cselect_b32 s9, s0, -1
-; GFX9-NEXT: s_sub_i32 s9, s9, s7
-; GFX9-NEXT: s_cmp_gt_i32 s8, s3
-; GFX9-NEXT: s_cselect_b32 s3, s8, s3
-; GFX9-NEXT: s_cmp_lt_i32 s3, s9
-; GFX9-NEXT: s_cselect_b32 s3, s3, s9
-; GFX9-NEXT: s_sub_i32 s0, s0, s3
-; GFX9-NEXT: s_cmp_gt_i32 s1, -1
-; GFX9-NEXT: s_cselect_b32 s3, s1, -1
-; GFX9-NEXT: s_sub_i32 s3, s3, s6
-; GFX9-NEXT: s_cmp_lt_i32 s1, -1
-; GFX9-NEXT: s_cselect_b32 s8, s1, -1
-; GFX9-NEXT: s_sub_i32 s8, s8, s7
-; GFX9-NEXT: s_cmp_gt_i32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s3, s3, s4
-; GFX9-NEXT: s_cmp_lt_i32 s3, s8
-; GFX9-NEXT: s_cselect_b32 s3, s3, s8
-; GFX9-NEXT: s_sub_i32 s1, s1, s3
-; GFX9-NEXT: s_cmp_gt_i32 s2, -1
-; GFX9-NEXT: s_cselect_b32 s3, s2, -1
-; GFX9-NEXT: s_sub_i32 s3, s3, s6
-; GFX9-NEXT: s_cmp_lt_i32 s2, -1
-; GFX9-NEXT: s_cselect_b32 s4, s2, -1
-; GFX9-NEXT: s_sub_i32 s4, s4, s7
-; GFX9-NEXT: s_cmp_gt_i32 s3, s5
-; GFX9-NEXT: s_cselect_b32 s3, s3, s5
-; GFX9-NEXT: s_cmp_lt_i32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s3, s3, s4
-; GFX9-NEXT: s_sub_i32 s2, s2, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp
+; GFX9-NEXT: v_sub_i32 v1, s1, v1 clamp
+; GFX9-NEXT: v_sub_i32 v2, s2, v2 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_ssubsat_v3i32:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_cmp_gt_i32 s0, -1
-; GFX10-NEXT: s_brev_b32 s6, -2
-; GFX10-NEXT: s_cselect_b32 s7, s0, -1
-; GFX10-NEXT: s_brev_b32 s8, 1
-; GFX10-NEXT: s_sub_i32 s7, s7, s6
-; GFX10-NEXT: s_cmp_lt_i32 s0, -1
+; GFX10-NEXT: v_sub_nc_i32 v0, s0, s3 clamp
+; GFX10-NEXT: v_sub_nc_i32 v1, s1, s4 clamp
+; GFX10-NEXT: v_sub_nc_i32 v2, s2, s5 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s9, s0, -1
-; GFX10-NEXT: s_sub_i32 s9, s9, s8
-; GFX10-NEXT: s_cmp_gt_i32 s7, s3
-; GFX10-NEXT: s_cselect_b32 s3, s7, s3
-; GFX10-NEXT: s_cmp_lt_i32 s3, s9
-; GFX10-NEXT: s_cselect_b32 s3, s3, s9
-; GFX10-NEXT: s_sub_i32 s0, s0, s3
-; GFX10-NEXT: s_cmp_gt_i32 s1, -1
-; GFX10-NEXT: s_cselect_b32 s3, s1, -1
-; GFX10-NEXT: s_sub_i32 s3, s3, s6
-; GFX10-NEXT: s_cmp_lt_i32 s1, -1
-; GFX10-NEXT: s_cselect_b32 s7, s1, -1
-; GFX10-NEXT: s_sub_i32 s7, s7, s8
-; GFX10-NEXT: s_cmp_gt_i32 s3, s4
-; GFX10-NEXT: s_cselect_b32 s3, s3, s4
-; GFX10-NEXT: s_cmp_lt_i32 s3, s7
-; GFX10-NEXT: s_cselect_b32 s3, s3, s7
-; GFX10-NEXT: s_sub_i32 s1, s1, s3
-; GFX10-NEXT: s_cmp_gt_i32 s2, -1
-; GFX10-NEXT: s_cselect_b32 s3, s2, -1
-; GFX10-NEXT: s_sub_i32 s3, s3, s6
-; GFX10-NEXT: s_cmp_lt_i32 s2, -1
-; GFX10-NEXT: s_cselect_b32 s4, s2, -1
-; GFX10-NEXT: s_sub_i32 s4, s4, s8
-; GFX10-NEXT: s_cmp_gt_i32 s3, s5
-; GFX10-NEXT: s_cselect_b32 s3, s3, s5
-; GFX10-NEXT: s_cmp_lt_i32 s3, s4
-; GFX10-NEXT: s_cselect_b32 s3, s3, s4
-; GFX10-NEXT: s_sub_i32 s2, s2, s3
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
 ; GFX10-NEXT: ; return to shader part epilog
   %result = call <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
   ret <3 x i32> %result
@@ -2264,73 +1635,21 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX9-LABEL: v_ssubsat_v4i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_max_i32_e32 v8, -1, v0
-; GFX9-NEXT: v_subrev_u32_e32 v8, s4, v8
-; GFX9-NEXT: s_brev_b32 s5, 1
-; GFX9-NEXT: v_min_i32_e32 v9, -1, v0
-; GFX9-NEXT: v_max_i32_e32 v4, v8, v4
-; GFX9-NEXT: v_subrev_u32_e32 v9, s5, v9
-; GFX9-NEXT: v_min_i32_e32 v4, v4, v9
-; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4
-; GFX9-NEXT: v_max_i32_e32 v4, -1, v1
-; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v4
-; GFX9-NEXT: v_min_i32_e32 v8, -1, v1
-; GFX9-NEXT: v_max_i32_e32 v4, v4, v5
-; GFX9-NEXT: v_subrev_u32_e32 v8, s5, v8
-; GFX9-NEXT: v_min_i32_e32 v4, v4, v8
-; GFX9-NEXT: v_sub_u32_e32 v1, v1, v4
-; GFX9-NEXT: v_max_i32_e32 v4, -1, v2
-; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v4
-; GFX9-NEXT: v_min_i32_e32 v5, -1, v2
-; GFX9-NEXT: v_subrev_u32_e32 v5, s5, v5
-; GFX9-NEXT: v_max_i32_e32 v4, v4, v6
-; GFX9-NEXT: v_min_i32_e32 v4, v4, v5
-; GFX9-NEXT: v_sub_u32_e32 v2, v2, v4
-; GFX9-NEXT: v_max_i32_e32 v4, -1, v3
-; GFX9-NEXT: v_subrev_u32_e32 v4, 0x7fffffff, v4
-; GFX9-NEXT: v_min_i32_e32 v5, -1, v3
-; GFX9-NEXT: v_subrev_u32_e32 v5, 0x80000000, v5
-; GFX9-NEXT: v_max_i32_e32 v4, v4, v7
-; GFX9-NEXT: v_min_i32_e32 v4, v4, v5
-; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4
+; GFX9-NEXT: v_sub_i32 v0, v0, v4 clamp
+; GFX9-NEXT: v_sub_i32 v1, v1, v5 clamp
+; GFX9-NEXT: v_sub_i32 v2, v2, v6 clamp
+; GFX9-NEXT: v_sub_i32 v3, v3, v7 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_v4i32:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_i32_e32 v8, -1, v0
-; GFX10-NEXT: s_brev_b32 s4, -2
-; GFX10-NEXT: v_max_i32_e32 v10, -1, v1
-; GFX10-NEXT: v_max_i32_e32 v12, -1, v3
-; GFX10-NEXT: v_min_i32_e32 v9, -1, v0
-; GFX10-NEXT: v_subrev_nc_u32_e32 v15, s4, v8
-; GFX10-NEXT: v_max_i32_e32 v8, -1, v2
-; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s4, v10
-; GFX10-NEXT: v_subrev_nc_u32_e32 v12, 0x7fffffff, v12
-; GFX10-NEXT: v_min_i32_e32 v11, -1, v1
-; GFX10-NEXT: v_min_i32_e32 v13, -1, v2
-; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s4, v8
-; GFX10-NEXT: v_min_i32_e32 v14, -1, v3
-; GFX10-NEXT: s_brev_b32 s5, 1
-; GFX10-NEXT: v_max_i32_e32 v4, v15, v4
-; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s5, v9
-; GFX10-NEXT: v_max_i32_e32 v5, v10, v5
-; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s5, v11
-; GFX10-NEXT: v_max_i32_e32 v15, v8, v6
-; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s5, v13
-; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 0x80000000, v14
-; GFX10-NEXT: v_max_i32_e32 v7, v12, v7
-; GFX10-NEXT: v_min_i32_e32 v19, v4, v9
-; GFX10-NEXT: v_min_i32_e32 v11, v5, v11
-; GFX10-NEXT: v_min_i32_e32 v15, v15, v10
+; GFX10-NEXT: v_sub_nc_i32 v0, v0, v4 clamp
+; GFX10-NEXT: v_sub_nc_i32 v1, v1, v5 clamp
+; GFX10-NEXT: v_sub_nc_i32 v2, v2, v6 clamp
+; GFX10-NEXT: v_sub_nc_i32 v3, v3, v7 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_min_i32_e32 v6, v7, v8
-; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v19
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v11
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v15
-; GFX10-NEXT: v_sub_nc_u32_e32 v3, v3, v6
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
   ret <4 x i32> %result
@@ -2439,103 +1758,31 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 ;
 ; GFX9-LABEL: s_ssubsat_v4i32:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_cmp_gt_i32 s0, -1
-; GFX9-NEXT: s_brev_b32 s8, -2
-; GFX9-NEXT: s_cselect_b32 s10, s0, -1
-; GFX9-NEXT: s_sub_i32 s10, s10, s8
-; GFX9-NEXT: s_cmp_lt_i32 s0, -1
-; GFX9-NEXT: s_brev_b32 s9, 1
-; GFX9-NEXT: s_cselect_b32 s11, s0, -1
-; GFX9-NEXT: s_sub_i32 s11, s11, s9
-; GFX9-NEXT: s_cmp_gt_i32 s10, s4
-; GFX9-NEXT: s_cselect_b32 s4, s10, s4
-; GFX9-NEXT: s_cmp_lt_i32 s4, s11
-; GFX9-NEXT: s_cselect_b32 s4, s4, s11
-; GFX9-NEXT: s_sub_i32 s0, s0, s4
-; GFX9-NEXT: s_cmp_gt_i32 s1, -1
-; GFX9-NEXT: s_cselect_b32 s4, s1, -1
-; GFX9-NEXT: s_sub_i32 s4, s4, s8
-; GFX9-NEXT: s_cmp_lt_i32 s1, -1
-; GFX9-NEXT: s_cselect_b32 s10, s1, -1
-; GFX9-NEXT: s_sub_i32 s10, s10, s9
-; GFX9-NEXT: s_cmp_gt_i32 s4, s5
-; GFX9-NEXT: s_cselect_b32 s4, s4, s5
-; GFX9-NEXT: s_cmp_lt_i32 s4, s10
-; GFX9-NEXT: s_cselect_b32 s4, s4, s10
-; GFX9-NEXT: s_sub_i32 s1, s1, s4
-; GFX9-NEXT: s_cmp_gt_i32 s2, -1
-; GFX9-NEXT: s_cselect_b32 s4, s2, -1
-; GFX9-NEXT: s_sub_i32 s4, s4, s8
-; GFX9-NEXT: s_cmp_lt_i32 s2, -1
-; GFX9-NEXT: s_cselect_b32 s5, s2, -1
-; GFX9-NEXT: s_sub_i32 s5, s5, s9
-; GFX9-NEXT: s_cmp_gt_i32 s4, s6
-; GFX9-NEXT: s_cselect_b32 s4, s4, s6
-; GFX9-NEXT: s_cmp_lt_i32 s4, s5
-; GFX9-NEXT: s_cselect_b32 s4, s4, s5
-; GFX9-NEXT: s_sub_i32 s2, s2, s4
-; GFX9-NEXT: s_cmp_gt_i32 s3, -1
-; GFX9-NEXT: s_cselect_b32 s4, s3, -1
-; GFX9-NEXT: s_sub_i32 s4, s4, s8
-; GFX9-NEXT: s_cmp_lt_i32 s3, -1
-; GFX9-NEXT: s_cselect_b32 s5, s3, -1
-; GFX9-NEXT: s_sub_i32 s5, s5, s9
-; GFX9-NEXT: s_cmp_gt_i32 s4, s7
-; GFX9-NEXT: s_cselect_b32 s4, s4, s7
-; GFX9-NEXT: s_cmp_lt_i32 s4, s5
-; GFX9-NEXT: s_cselect_b32 s4, s4, s5
-; GFX9-NEXT: s_sub_i32 s3, s3, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp
+; GFX9-NEXT: v_sub_i32 v1, s1, v1 clamp
+; GFX9-NEXT: v_sub_i32 v2, s2, v2 clamp
+; GFX9-NEXT: v_sub_i32 v3, s3, v3 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_ssubsat_v4i32:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_cmp_gt_i32 s0, -1
-; GFX10-NEXT: s_brev_b32 s8, -2
-; GFX10-NEXT: s_cselect_b32 s9, s0, -1
-; GFX10-NEXT: s_brev_b32 s10, 1
-; GFX10-NEXT: s_sub_i32 s9, s9, s8
-; GFX10-NEXT: s_cmp_lt_i32 s0, -1
+; GFX10-NEXT: v_sub_nc_i32 v0, s0, s4 clamp
+; GFX10-NEXT: v_sub_nc_i32 v1, s1, s5 clamp
+; GFX10-NEXT: v_sub_nc_i32 v2, s2, s6 clamp
+; GFX10-NEXT: v_sub_nc_i32 v3, s3, s7 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s11, s0, -1
-; GFX10-NEXT: s_sub_i32 s11, s11, s10
-; GFX10-NEXT: s_cmp_gt_i32 s9, s4
-; GFX10-NEXT: s_cselect_b32 s4, s9, s4
-; GFX10-NEXT: s_cmp_lt_i32 s4, s11
-; GFX10-NEXT: s_cselect_b32 s4, s4, s11
-; GFX10-NEXT: s_sub_i32 s0, s0, s4
-; GFX10-NEXT: s_cmp_gt_i32 s1, -1
-; GFX10-NEXT: s_cselect_b32 s4, s1, -1
-; GFX10-NEXT: s_sub_i32 s4, s4, s8
-; GFX10-NEXT: s_cmp_lt_i32 s1, -1
-; GFX10-NEXT: s_cselect_b32 s9, s1, -1
-; GFX10-NEXT: s_sub_i32 s9, s9, s10
-; GFX10-NEXT: s_cmp_gt_i32 s4, s5
-; GFX10-NEXT: s_cselect_b32 s4, s4, s5
-; GFX10-NEXT: s_cmp_lt_i32 s4, s9
-; GFX10-NEXT: s_cselect_b32 s4, s4, s9
-; GFX10-NEXT: s_sub_i32 s1, s1, s4
-; GFX10-NEXT: s_cmp_gt_i32 s2, -1
-; GFX10-NEXT: s_cselect_b32 s4, s2, -1
-; GFX10-NEXT: s_sub_i32 s4, s4, s8
-; GFX10-NEXT: s_cmp_lt_i32 s2, -1
-; GFX10-NEXT: s_cselect_b32 s5, s2, -1
-; GFX10-NEXT: s_sub_i32 s5, s5, s10
-; GFX10-NEXT: s_cmp_gt_i32 s4, s6
-; GFX10-NEXT: s_cselect_b32 s4, s4, s6
-; GFX10-NEXT: s_cmp_lt_i32 s4, s5
-; GFX10-NEXT: s_cselect_b32 s4, s4, s5
-; GFX10-NEXT: s_sub_i32 s2, s2, s4
-; GFX10-NEXT: s_cmp_gt_i32 s3, -1
-; GFX10-NEXT: s_cselect_b32 s4, s3, -1
-; GFX10-NEXT: s_sub_i32 s4, s4, s8
-; GFX10-NEXT: s_cmp_lt_i32 s3, -1
-; GFX10-NEXT: s_cselect_b32 s5, s3, -1
-; GFX10-NEXT: s_sub_i32 s5, s5, s10
-; GFX10-NEXT: s_cmp_gt_i32 s4, s7
-; GFX10-NEXT: s_cselect_b32 s4, s4, s7
-; GFX10-NEXT: s_cmp_lt_i32 s4, s5
-; GFX10-NEXT: s_cselect_b32 s4, s4, s5
-; GFX10-NEXT: s_sub_i32 s3, s3, s4
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT: ; return to shader part epilog
   %result = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
   ret <4 x i32> %result
@@ -2633,90 +1880,22 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX9-LABEL: v_ssubsat_v5i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: 
v_max_i32_e32 v10, -1, v0 -; GFX9-NEXT: v_subrev_u32_e32 v10, s4, v10 -; GFX9-NEXT: s_brev_b32 s5, 1 -; GFX9-NEXT: v_min_i32_e32 v12, -1, v0 -; GFX9-NEXT: v_max_i32_e32 v5, v10, v5 -; GFX9-NEXT: v_subrev_u32_e32 v12, s5, v12 -; GFX9-NEXT: v_min_i32_e32 v5, v5, v12 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v5 -; GFX9-NEXT: v_max_i32_e32 v5, -1, v1 -; GFX9-NEXT: v_subrev_u32_e32 v5, s4, v5 -; GFX9-NEXT: v_min_i32_e32 v10, -1, v1 -; GFX9-NEXT: v_max_i32_e32 v5, v5, v6 -; GFX9-NEXT: v_subrev_u32_e32 v10, s5, v10 -; GFX9-NEXT: v_min_i32_e32 v5, v5, v10 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v5 -; GFX9-NEXT: v_max_i32_e32 v5, -1, v2 -; GFX9-NEXT: v_subrev_u32_e32 v5, s4, v5 -; GFX9-NEXT: v_min_i32_e32 v6, -1, v2 -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v6 -; GFX9-NEXT: v_max_i32_e32 v5, v5, v7 -; GFX9-NEXT: v_min_i32_e32 v5, v5, v6 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_bfrev_b32_e32 v11, -2 -; GFX9-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX9-NEXT: v_sub_u32_e32 v5, v5, v11 -; GFX9-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX9-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX9-NEXT: v_sub_u32_e32 v6, v6, v13 -; GFX9-NEXT: v_max_i32_e32 v5, v5, v8 -; GFX9-NEXT: v_min_i32_e32 v5, v5, v6 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_max_i32_e32 v5, -1, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, v5, v11 -; GFX9-NEXT: v_min_i32_e32 v6, -1, v4 -; GFX9-NEXT: v_sub_u32_e32 v6, v6, v13 -; GFX9-NEXT: v_max_i32_e32 v5, v5, v9 -; GFX9-NEXT: v_min_i32_e32 v5, v5, v6 -; GFX9-NEXT: v_sub_u32_e32 v4, v4, v5 +; GFX9-NEXT: v_sub_i32 v0, v0, v5 clamp +; GFX9-NEXT: v_sub_i32 v1, v1, v6 clamp +; GFX9-NEXT: v_sub_i32 v2, v2, v7 clamp +; GFX9-NEXT: v_sub_i32 v3, v3, v8 clamp +; GFX9-NEXT: v_sub_i32 v4, v4, v9 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_v5i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_i32_e32 v10, -1, v0 -; GFX10-NEXT: v_max_i32_e32 v13, -1, v1 -; GFX10-NEXT: s_brev_b32 s4, -2 -; GFX10-NEXT: v_bfrev_b32_e32 v11, -2 -; GFX10-NEXT: v_max_i32_e32 v17, -1, v4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s4, v10 -; GFX10-NEXT: v_subrev_nc_u32_e32 v13, s4, v13 -; GFX10-NEXT: v_min_i32_e32 v12, -1, v0 -; GFX10-NEXT: v_bfrev_b32_e32 v14, 1 -; GFX10-NEXT: v_min_i32_e32 v15, -1, v1 -; GFX10-NEXT: v_max_i32_e32 v5, v10, v5 -; GFX10-NEXT: v_max_i32_e32 v10, -1, v2 -; GFX10-NEXT: v_max_i32_e32 v6, v13, v6 -; GFX10-NEXT: v_max_i32_e32 v13, -1, v3 -; GFX10-NEXT: v_min_i32_e32 v16, -1, v2 -; GFX10-NEXT: v_min_i32_e32 v23, -1, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s4, v10 -; GFX10-NEXT: v_min_i32_e32 v19, -1, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v13, v13, v11 -; GFX10-NEXT: v_sub_nc_u32_e32 v11, v17, v11 -; GFX10-NEXT: s_brev_b32 s5, 1 -; GFX10-NEXT: v_max_i32_e32 v7, v10, v7 -; GFX10-NEXT: v_subrev_nc_u32_e32 v12, s5, v12 -; GFX10-NEXT: v_subrev_nc_u32_e32 v15, s5, v15 -; GFX10-NEXT: v_subrev_nc_u32_e32 v16, s5, v16 -; GFX10-NEXT: v_max_i32_e32 v8, v13, v8 -; GFX10-NEXT: v_sub_nc_u32_e32 v10, v23, v14 -; GFX10-NEXT: v_sub_nc_u32_e32 v13, v19, v14 -; GFX10-NEXT: v_max_i32_e32 v11, v11, v9 -; GFX10-NEXT: v_min_i32_e32 v5, v5, v12 -; GFX10-NEXT: v_min_i32_e32 v6, v6, v15 -; GFX10-NEXT: v_min_i32_e32 v7, v7, v16 -; GFX10-NEXT: v_min_i32_e32 v8, v8, v10 -; GFX10-NEXT: v_min_i32_e32 v9, v11, v13 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v5 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v7 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, v3, v8 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v9 +; GFX10-NEXT: v_sub_nc_i32 
v0, v0, v5 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, v1, v6 clamp +; GFX10-NEXT: v_sub_nc_i32 v2, v2, v7 clamp +; GFX10-NEXT: v_sub_nc_i32 v3, v3, v8 clamp +; GFX10-NEXT: v_sub_nc_i32 v4, v4, v9 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <5 x i32> @llvm.ssub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) @@ -2848,125 +2027,36 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; ; GFX9-LABEL: s_ssubsat_v5i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_gt_i32 s0, -1 -; GFX9-NEXT: s_brev_b32 s10, -2 -; GFX9-NEXT: s_cselect_b32 s12, s0, -1 -; GFX9-NEXT: s_sub_i32 s12, s12, s10 -; GFX9-NEXT: s_cmp_lt_i32 s0, -1 -; GFX9-NEXT: s_brev_b32 s11, 1 -; GFX9-NEXT: s_cselect_b32 s13, s0, -1 -; GFX9-NEXT: s_sub_i32 s13, s13, s11 -; GFX9-NEXT: s_cmp_gt_i32 s12, s5 -; GFX9-NEXT: s_cselect_b32 s5, s12, s5 -; GFX9-NEXT: s_cmp_lt_i32 s5, s13 -; GFX9-NEXT: s_cselect_b32 s5, s5, s13 -; GFX9-NEXT: s_sub_i32 s0, s0, s5 -; GFX9-NEXT: s_cmp_gt_i32 s1, -1 -; GFX9-NEXT: s_cselect_b32 s5, s1, -1 -; GFX9-NEXT: s_sub_i32 s5, s5, s10 -; GFX9-NEXT: s_cmp_lt_i32 s1, -1 -; GFX9-NEXT: s_cselect_b32 s12, s1, -1 -; GFX9-NEXT: s_sub_i32 s12, s12, s11 -; GFX9-NEXT: s_cmp_gt_i32 s5, s6 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_cmp_lt_i32 s5, s12 -; GFX9-NEXT: s_cselect_b32 s5, s5, s12 -; GFX9-NEXT: s_sub_i32 s1, s1, s5 -; GFX9-NEXT: s_cmp_gt_i32 s2, -1 -; GFX9-NEXT: s_cselect_b32 s5, s2, -1 -; GFX9-NEXT: s_sub_i32 s5, s5, s10 -; GFX9-NEXT: s_cmp_lt_i32 s2, -1 -; GFX9-NEXT: s_cselect_b32 s6, s2, -1 -; GFX9-NEXT: s_sub_i32 s6, s6, s11 -; GFX9-NEXT: s_cmp_gt_i32 s5, s7 -; GFX9-NEXT: s_cselect_b32 s5, s5, s7 -; GFX9-NEXT: s_cmp_lt_i32 s5, s6 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_cmp_gt_i32 s3, -1 -; GFX9-NEXT: s_cselect_b32 s5, s3, -1 -; GFX9-NEXT: s_sub_i32 s5, s5, s10 -; GFX9-NEXT: s_cmp_lt_i32 s3, -1 -; GFX9-NEXT: s_cselect_b32 s6, s3, -1 -; GFX9-NEXT: s_sub_i32 s6, s6, s11 -; GFX9-NEXT: s_cmp_gt_i32 s5, s8 -; GFX9-NEXT: s_cselect_b32 s5, s5, s8 -; GFX9-NEXT: s_cmp_lt_i32 s5, s6 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_sub_i32 s3, s3, s5 -; GFX9-NEXT: s_cmp_gt_i32 s4, -1 -; GFX9-NEXT: s_cselect_b32 s5, s4, -1 -; GFX9-NEXT: s_sub_i32 s5, s5, s10 -; GFX9-NEXT: s_cmp_lt_i32 s4, -1 -; GFX9-NEXT: s_cselect_b32 s6, s4, -1 -; GFX9-NEXT: s_sub_i32 s6, s6, s11 -; GFX9-NEXT: s_cmp_gt_i32 s5, s9 -; GFX9-NEXT: s_cselect_b32 s5, s5, s9 -; GFX9-NEXT: s_cmp_lt_i32 s5, s6 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_sub_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_i32 v1, s1, v1 clamp +; GFX9-NEXT: v_sub_i32 v2, s2, v2 clamp +; GFX9-NEXT: v_sub_i32 v3, s3, v3 clamp +; GFX9-NEXT: v_sub_i32 v4, s4, v4 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_ssubsat_v5i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_gt_i32 s0, -1 -; GFX10-NEXT: s_brev_b32 s10, -2 -; GFX10-NEXT: s_cselect_b32 s11, s0, -1 -; GFX10-NEXT: s_brev_b32 s12, 1 -; GFX10-NEXT: s_sub_i32 s11, s11, s10 -; GFX10-NEXT: s_cmp_lt_i32 s0, -1 +; GFX10-NEXT: v_sub_nc_i32 v0, s0, s5 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, 
s1, s6 clamp +; GFX10-NEXT: v_sub_nc_i32 v2, s2, s7 clamp +; GFX10-NEXT: v_sub_nc_i32 v3, s3, s8 clamp +; GFX10-NEXT: v_sub_nc_i32 v4, s4, s9 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s13, s0, -1 -; GFX10-NEXT: s_sub_i32 s13, s13, s12 -; GFX10-NEXT: s_cmp_gt_i32 s11, s5 -; GFX10-NEXT: s_cselect_b32 s5, s11, s5 -; GFX10-NEXT: s_cmp_lt_i32 s5, s13 -; GFX10-NEXT: s_cselect_b32 s5, s5, s13 -; GFX10-NEXT: s_sub_i32 s0, s0, s5 -; GFX10-NEXT: s_cmp_gt_i32 s1, -1 -; GFX10-NEXT: s_cselect_b32 s5, s1, -1 -; GFX10-NEXT: s_sub_i32 s5, s5, s10 -; GFX10-NEXT: s_cmp_lt_i32 s1, -1 -; GFX10-NEXT: s_cselect_b32 s11, s1, -1 -; GFX10-NEXT: s_sub_i32 s11, s11, s12 -; GFX10-NEXT: s_cmp_gt_i32 s5, s6 -; GFX10-NEXT: s_cselect_b32 s5, s5, s6 -; GFX10-NEXT: s_cmp_lt_i32 s5, s11 -; GFX10-NEXT: s_cselect_b32 s5, s5, s11 -; GFX10-NEXT: s_sub_i32 s1, s1, s5 -; GFX10-NEXT: s_cmp_gt_i32 s2, -1 -; GFX10-NEXT: s_cselect_b32 s5, s2, -1 -; GFX10-NEXT: s_sub_i32 s5, s5, s10 -; GFX10-NEXT: s_cmp_lt_i32 s2, -1 -; GFX10-NEXT: s_cselect_b32 s6, s2, -1 -; GFX10-NEXT: s_sub_i32 s6, s6, s12 -; GFX10-NEXT: s_cmp_gt_i32 s5, s7 -; GFX10-NEXT: s_cselect_b32 s5, s5, s7 -; GFX10-NEXT: s_cmp_lt_i32 s5, s6 -; GFX10-NEXT: s_cselect_b32 s5, s5, s6 -; GFX10-NEXT: s_sub_i32 s2, s2, s5 -; GFX10-NEXT: s_cmp_gt_i32 s3, -1 -; GFX10-NEXT: s_cselect_b32 s5, s3, -1 -; GFX10-NEXT: s_sub_i32 s5, s5, s10 -; GFX10-NEXT: s_cmp_lt_i32 s3, -1 -; GFX10-NEXT: s_cselect_b32 s6, s3, -1 -; GFX10-NEXT: s_sub_i32 s6, s6, s12 -; GFX10-NEXT: s_cmp_gt_i32 s5, s8 -; GFX10-NEXT: s_cselect_b32 s5, s5, s8 -; GFX10-NEXT: s_cmp_lt_i32 s5, s6 -; GFX10-NEXT: s_cselect_b32 s5, s5, s6 -; GFX10-NEXT: s_sub_i32 s3, s3, s5 -; GFX10-NEXT: s_cmp_gt_i32 s4, -1 -; GFX10-NEXT: s_cselect_b32 s5, s4, -1 -; GFX10-NEXT: s_sub_i32 s5, s5, s10 -; GFX10-NEXT: s_cmp_lt_i32 s4, -1 -; GFX10-NEXT: s_cselect_b32 s6, s4, -1 -; GFX10-NEXT: s_sub_i32 s6, s6, s12 -; GFX10-NEXT: s_cmp_gt_i32 s5, s9 -; GFX10-NEXT: s_cselect_b32 s5, s5, s9 -; GFX10-NEXT: s_cmp_lt_i32 s5, s6 -; GFX10-NEXT: s_cselect_b32 s5, s5, s6 -; GFX10-NEXT: s_sub_i32 s4, s4, s5 ; GFX10-NEXT: ; return to shader part epilog %result = call <5 x i32> @llvm.ssub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) ret <5 x i32> %result @@ -3218,244 +2308,44 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX9-LABEL: v_ssubsat_v16i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_brev_b32 s4, -2 -; GFX9-NEXT: v_max_i32_e32 v32, -1, v0 -; GFX9-NEXT: v_subrev_u32_e32 v32, s4, v32 -; GFX9-NEXT: v_max_i32_e32 v16, v32, v16 -; GFX9-NEXT: s_brev_b32 s5, 1 -; GFX9-NEXT: v_min_i32_e32 v32, -1, v0 -; GFX9-NEXT: v_subrev_u32_e32 v32, s5, v32 -; GFX9-NEXT: v_min_i32_e32 v16, v16, v32 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v16 -; GFX9-NEXT: v_max_i32_e32 v16, -1, v1 -; GFX9-NEXT: v_subrev_u32_e32 v16, s4, v16 -; GFX9-NEXT: v_max_i32_e32 v16, v16, v17 -; GFX9-NEXT: v_min_i32_e32 v17, -1, v1 -; GFX9-NEXT: v_subrev_u32_e32 v17, s5, v17 -; GFX9-NEXT: v_min_i32_e32 v16, v16, v17 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v16 -; GFX9-NEXT: v_max_i32_e32 v16, -1, v2 -; GFX9-NEXT: v_subrev_u32_e32 v16, s4, v16 -; GFX9-NEXT: v_min_i32_e32 v17, -1, v2 -; GFX9-NEXT: v_max_i32_e32 v16, v16, v18 -; GFX9-NEXT: v_subrev_u32_e32 v17, s5, v17 -; GFX9-NEXT: v_min_i32_e32 v16, v16, v17 
-; GFX9-NEXT: v_sub_u32_e32 v2, v2, v16 -; GFX9-NEXT: v_bfrev_b32_e32 v16, -2 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v3 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_bfrev_b32_e32 v18, 1 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v3 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v4 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v4 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v20 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v4, v4, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v5 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v5 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v21 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v5, v5, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v6 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v6 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v22 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v6, v6, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v7 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v7 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v7, v7, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v8 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v8 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v24 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v8, v8, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v9 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v9 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v25 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v9, v9, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v10 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v10 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v26 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v10, v10, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v11 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v11 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v27 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v11, v11, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v12 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v12 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v28 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v12, v12, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v13 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v13 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v29 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v13, v13, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v14 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v14 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v30 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; 
GFX9-NEXT: v_sub_u32_e32 v14, v14, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v15 -; GFX9-NEXT: v_sub_u32_e32 v16, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v17, -1, v15 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v18 -; GFX9-NEXT: v_max_i32_e32 v16, v16, v31 -; GFX9-NEXT: v_min_i32_e32 v16, v16, v17 -; GFX9-NEXT: v_sub_u32_e32 v15, v15, v16 +; GFX9-NEXT: v_sub_i32 v0, v0, v16 clamp +; GFX9-NEXT: v_sub_i32 v1, v1, v17 clamp +; GFX9-NEXT: v_sub_i32 v2, v2, v18 clamp +; GFX9-NEXT: v_sub_i32 v3, v3, v19 clamp +; GFX9-NEXT: v_sub_i32 v4, v4, v20 clamp +; GFX9-NEXT: v_sub_i32 v5, v5, v21 clamp +; GFX9-NEXT: v_sub_i32 v6, v6, v22 clamp +; GFX9-NEXT: v_sub_i32 v7, v7, v23 clamp +; GFX9-NEXT: v_sub_i32 v8, v8, v24 clamp +; GFX9-NEXT: v_sub_i32 v9, v9, v25 clamp +; GFX9-NEXT: v_sub_i32 v10, v10, v26 clamp +; GFX9-NEXT: v_sub_i32 v11, v11, v27 clamp +; GFX9-NEXT: v_sub_i32 v12, v12, v28 clamp +; GFX9-NEXT: v_sub_i32 v13, v13, v29 clamp +; GFX9-NEXT: v_sub_i32 v14, v14, v30 clamp +; GFX9-NEXT: v_sub_i32 v15, v15, v31 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_v16i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_i32_e32 v32, -1, v0 -; GFX10-NEXT: s_brev_b32 s4, -2 -; GFX10-NEXT: v_min_i32_e32 v33, -1, v0 -; GFX10-NEXT: s_brev_b32 s5, 1 -; GFX10-NEXT: v_max_i32_e32 v36, -1, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v35, s4, v32 -; GFX10-NEXT: v_max_i32_e32 v32, -1, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v33, s5, v33 -; GFX10-NEXT: v_bfrev_b32_e32 v34, -2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v36, s4, v36 -; GFX10-NEXT: v_max_i32_e32 v16, v35, v16 -; GFX10-NEXT: v_subrev_nc_u32_e32 v32, s4, v32 -; GFX10-NEXT: v_max_i32_e32 v39, -1, v3 -; GFX10-NEXT: v_min_i32_e32 v37, -1, v1 -; GFX10-NEXT: v_max_i32_e32 v18, v36, v18 -; GFX10-NEXT: v_min_i32_e32 v16, v16, v33 -; GFX10-NEXT: v_min_i32_e32 v33, -1, v2 -; GFX10-NEXT: v_max_i32_e32 v38, v32, v17 -; GFX10-NEXT: v_max_i32_e32 v17, -1, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v36, v39, v34 -; GFX10-NEXT: v_bfrev_b32_e32 v35, 1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v32, s5, v33 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v16 -; GFX10-NEXT: v_min_i32_e32 v33, -1, v3 -; GFX10-NEXT: v_sub_nc_u32_e32 v17, v17, v34 -; GFX10-NEXT: v_subrev_nc_u32_e32 v37, s5, v37 -; GFX10-NEXT: v_min_i32_e32 v16, v18, v32 -; GFX10-NEXT: v_max_i32_e32 v19, v36, v19 -; GFX10-NEXT: v_sub_nc_u32_e32 v18, v33, v35 -; GFX10-NEXT: v_max_i32_e32 v17, v17, v20 -; GFX10-NEXT: v_min_i32_e32 v39, v38, v37 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v16 -; GFX10-NEXT: v_min_i32_e32 v16, -1, v4 -; GFX10-NEXT: v_min_i32_e32 v18, v19, v18 -; GFX10-NEXT: v_max_i32_e32 v19, -1, v5 -; GFX10-NEXT: v_max_i32_e32 v32, -1, v6 -; GFX10-NEXT: v_min_i32_e32 v33, -1, v5 -; GFX10-NEXT: v_sub_nc_u32_e32 v16, v16, v35 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v39 -; GFX10-NEXT: v_sub_nc_u32_e32 v19, v19, v34 -; GFX10-NEXT: v_sub_nc_u32_e32 v32, v32, v34 -; GFX10-NEXT: v_min_i32_e32 v36, -1, v6 -; GFX10-NEXT: v_min_i32_e32 v39, v17, v16 -; GFX10-NEXT: v_max_i32_e32 v17, -1, v7 -; GFX10-NEXT: v_min_i32_e32 v16, -1, v7 -; GFX10-NEXT: v_max_i32_e32 v19, v19, v21 -; GFX10-NEXT: v_sub_nc_u32_e32 v20, v33, v35 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v39 -; GFX10-NEXT: v_sub_nc_u32_e32 v17, v17, v34 -; GFX10-NEXT: v_sub_nc_u32_e32 v21, v36, v35 -; GFX10-NEXT: v_max_i32_e32 v22, v32, v22 -; GFX10-NEXT: v_min_i32_e32 v38, v19, v20 -; GFX10-NEXT: v_max_i32_e32 v20, -1, v9 -; GFX10-NEXT: v_max_i32_e32 v39, -1, v8 -; GFX10-NEXT: v_sub_nc_u32_e32 v16, v16, 
v35 -; GFX10-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX10-NEXT: v_min_i32_e32 v19, v22, v21 -; GFX10-NEXT: v_min_i32_e32 v21, -1, v9 -; GFX10-NEXT: v_sub_nc_u32_e32 v20, v20, v34 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, v3, v18 -; GFX10-NEXT: v_sub_nc_u32_e32 v18, v39, v34 -; GFX10-NEXT: v_max_i32_e32 v39, -1, v10 -; GFX10-NEXT: v_min_i32_e32 v16, v17, v16 -; GFX10-NEXT: v_min_i32_e32 v22, -1, v8 -; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v38 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, v6, v19 -; GFX10-NEXT: v_max_i32_e32 v18, v18, v24 -; GFX10-NEXT: v_max_i32_e32 v20, v20, v25 -; GFX10-NEXT: v_sub_nc_u32_e32 v19, v22, v35 -; GFX10-NEXT: v_sub_nc_u32_e32 v21, v21, v35 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, v7, v16 -; GFX10-NEXT: v_max_i32_e32 v16, -1, v11 -; GFX10-NEXT: v_min_i32_e32 v38, -1, v10 -; GFX10-NEXT: v_sub_nc_u32_e32 v23, v39, v34 -; GFX10-NEXT: v_min_i32_e32 v17, v18, v19 -; GFX10-NEXT: v_min_i32_e32 v20, v20, v21 -; GFX10-NEXT: v_sub_nc_u32_e32 v16, v16, v34 -; GFX10-NEXT: v_sub_nc_u32_e32 v18, v38, v35 -; GFX10-NEXT: v_max_i32_e32 v19, v23, v26 -; GFX10-NEXT: v_sub_nc_u32_e32 v8, v8, v17 -; GFX10-NEXT: v_sub_nc_u32_e32 v9, v9, v20 -; GFX10-NEXT: v_max_i32_e32 v20, -1, v13 -; GFX10-NEXT: v_max_i32_e32 v16, v16, v27 -; GFX10-NEXT: v_min_i32_e32 v17, v19, v18 -; GFX10-NEXT: v_max_i32_e32 v19, -1, v12 -; GFX10-NEXT: v_max_i32_e32 v27, -1, v14 -; GFX10-NEXT: v_max_i32_e32 v23, -1, v15 -; GFX10-NEXT: v_min_i32_e32 v18, -1, v11 -; GFX10-NEXT: v_min_i32_e32 v21, -1, v13 -; GFX10-NEXT: v_sub_nc_u32_e32 v19, v19, v34 -; GFX10-NEXT: v_sub_nc_u32_e32 v20, v20, v34 -; GFX10-NEXT: v_min_i32_e32 v24, -1, v14 -; GFX10-NEXT: v_min_i32_e32 v25, -1, v15 -; GFX10-NEXT: v_sub_nc_u32_e32 v26, v23, v34 -; GFX10-NEXT: v_sub_nc_u32_e32 v10, v10, v17 -; GFX10-NEXT: v_min_i32_e32 v17, -1, v12 -; GFX10-NEXT: v_sub_nc_u32_e32 v27, v27, v34 -; GFX10-NEXT: v_sub_nc_u32_e32 v18, v18, v35 -; GFX10-NEXT: v_max_i32_e32 v19, v19, v28 -; GFX10-NEXT: v_sub_nc_u32_e32 v21, v21, v35 -; GFX10-NEXT: v_sub_nc_u32_e32 v17, v17, v35 -; GFX10-NEXT: v_max_i32_e32 v20, v20, v29 -; GFX10-NEXT: v_sub_nc_u32_e32 v24, v24, v35 -; GFX10-NEXT: v_max_i32_e32 v22, v27, v30 -; GFX10-NEXT: v_sub_nc_u32_e32 v25, v25, v35 -; GFX10-NEXT: v_max_i32_e32 v23, v26, v31 -; GFX10-NEXT: v_min_i32_e32 v16, v16, v18 -; GFX10-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX10-NEXT: v_min_i32_e32 v18, v20, v21 -; GFX10-NEXT: v_min_i32_e32 v19, v22, v24 -; GFX10-NEXT: v_min_i32_e32 v20, v23, v25 -; GFX10-NEXT: v_sub_nc_u32_e32 v11, v11, v16 -; GFX10-NEXT: v_sub_nc_u32_e32 v12, v12, v17 -; GFX10-NEXT: v_sub_nc_u32_e32 v13, v13, v18 -; GFX10-NEXT: v_sub_nc_u32_e32 v14, v14, v19 -; GFX10-NEXT: v_sub_nc_u32_e32 v15, v15, v20 +; GFX10-NEXT: v_sub_nc_i32 v0, v0, v16 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, v1, v17 clamp +; GFX10-NEXT: v_sub_nc_i32 v2, v2, v18 clamp +; GFX10-NEXT: v_sub_nc_i32 v3, v3, v19 clamp +; GFX10-NEXT: v_sub_nc_i32 v4, v4, v20 clamp +; GFX10-NEXT: v_sub_nc_i32 v5, v5, v21 clamp +; GFX10-NEXT: v_sub_nc_i32 v6, v6, v22 clamp +; GFX10-NEXT: v_sub_nc_i32 v7, v7, v23 clamp +; GFX10-NEXT: v_sub_nc_i32 v8, v8, v24 clamp +; GFX10-NEXT: v_sub_nc_i32 v9, v9, v25 clamp +; GFX10-NEXT: v_sub_nc_i32 v10, v10, v26 clamp +; GFX10-NEXT: v_sub_nc_i32 v11, v11, v27 clamp +; GFX10-NEXT: v_sub_nc_i32 v12, v12, v28 clamp +; GFX10-NEXT: v_sub_nc_i32 v13, v13, v29 clamp +; GFX10-NEXT: v_sub_nc_i32 v14, v14, v30 clamp +; GFX10-NEXT: v_sub_nc_i32 v15, v15, v31 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> 
@llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) @@ -3829,367 +2719,91 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; ; GFX9-LABEL: s_ssubsat_v16i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_gt_i32 s0, -1 -; GFX9-NEXT: s_brev_b32 s32, -2 -; GFX9-NEXT: s_cselect_b32 s34, s0, -1 -; GFX9-NEXT: s_sub_i32 s34, s34, s32 -; GFX9-NEXT: s_cmp_lt_i32 s0, -1 -; GFX9-NEXT: s_brev_b32 s33, 1 -; GFX9-NEXT: s_cselect_b32 s35, s0, -1 -; GFX9-NEXT: s_sub_i32 s35, s35, s33 -; GFX9-NEXT: s_cmp_gt_i32 s34, s16 -; GFX9-NEXT: s_cselect_b32 s16, s34, s16 -; GFX9-NEXT: s_cmp_lt_i32 s16, s35 -; GFX9-NEXT: s_cselect_b32 s16, s16, s35 -; GFX9-NEXT: s_sub_i32 s0, s0, s16 -; GFX9-NEXT: s_cmp_gt_i32 s1, -1 -; GFX9-NEXT: s_cselect_b32 s16, s1, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s1, -1 -; GFX9-NEXT: s_cselect_b32 s34, s1, -1 -; GFX9-NEXT: s_sub_i32 s34, s34, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_cmp_lt_i32 s16, s34 -; GFX9-NEXT: s_cselect_b32 s16, s16, s34 -; GFX9-NEXT: s_sub_i32 s1, s1, s16 -; GFX9-NEXT: s_cmp_gt_i32 s2, -1 -; GFX9-NEXT: s_cselect_b32 s16, s2, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s2, -1 -; GFX9-NEXT: s_cselect_b32 s17, s2, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s18 -; GFX9-NEXT: s_cselect_b32 s16, s16, s18 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s2, s2, s16 -; GFX9-NEXT: s_cmp_gt_i32 s3, -1 -; GFX9-NEXT: s_cselect_b32 s16, s3, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s3, -1 -; GFX9-NEXT: s_cselect_b32 s17, s3, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s19 -; GFX9-NEXT: s_cselect_b32 s16, s16, s19 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s3, s3, s16 -; GFX9-NEXT: s_cmp_gt_i32 s4, -1 -; GFX9-NEXT: s_cselect_b32 s16, s4, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s4, -1 -; GFX9-NEXT: s_cselect_b32 s17, s4, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s20 -; GFX9-NEXT: s_cselect_b32 s16, s16, s20 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s4, s4, s16 -; GFX9-NEXT: s_cmp_gt_i32 s5, -1 -; GFX9-NEXT: s_cselect_b32 s16, s5, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s5, -1 -; GFX9-NEXT: s_cselect_b32 s17, s5, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s21 -; GFX9-NEXT: s_cselect_b32 s16, s16, s21 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s5, s5, s16 -; GFX9-NEXT: s_cmp_gt_i32 s6, -1 -; GFX9-NEXT: s_cselect_b32 s16, s6, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s6, -1 -; GFX9-NEXT: s_cselect_b32 s17, s6, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s22 -; GFX9-NEXT: s_cselect_b32 s16, s16, s22 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s6, s6, s16 -; GFX9-NEXT: s_cmp_gt_i32 s7, -1 -; GFX9-NEXT: s_cselect_b32 s16, s7, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s7, -1 -; GFX9-NEXT: s_cselect_b32 s17, s7, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s23 -; GFX9-NEXT: s_cselect_b32 s16, s16, s23 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 
s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s7, s7, s16 -; GFX9-NEXT: s_cmp_gt_i32 s8, -1 -; GFX9-NEXT: s_cselect_b32 s16, s8, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s8, -1 -; GFX9-NEXT: s_cselect_b32 s17, s8, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s24 -; GFX9-NEXT: s_cselect_b32 s16, s16, s24 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s8, s8, s16 -; GFX9-NEXT: s_cmp_gt_i32 s9, -1 -; GFX9-NEXT: s_cselect_b32 s16, s9, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s9, -1 -; GFX9-NEXT: s_cselect_b32 s17, s9, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s25 -; GFX9-NEXT: s_cselect_b32 s16, s16, s25 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s9, s9, s16 -; GFX9-NEXT: s_cmp_gt_i32 s10, -1 -; GFX9-NEXT: s_cselect_b32 s16, s10, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s10, -1 -; GFX9-NEXT: s_cselect_b32 s17, s10, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s26 -; GFX9-NEXT: s_cselect_b32 s16, s16, s26 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s10, s10, s16 -; GFX9-NEXT: s_cmp_gt_i32 s11, -1 -; GFX9-NEXT: s_cselect_b32 s16, s11, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s11, -1 -; GFX9-NEXT: s_cselect_b32 s17, s11, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s27 -; GFX9-NEXT: s_cselect_b32 s16, s16, s27 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s11, s11, s16 -; GFX9-NEXT: s_cmp_gt_i32 s12, -1 -; GFX9-NEXT: s_cselect_b32 s16, s12, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s12, -1 -; GFX9-NEXT: s_cselect_b32 s17, s12, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s28 -; GFX9-NEXT: s_cselect_b32 s16, s16, s28 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s12, s12, s16 -; GFX9-NEXT: s_cmp_gt_i32 s13, -1 -; GFX9-NEXT: s_cselect_b32 s16, s13, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s13, -1 -; GFX9-NEXT: s_cselect_b32 s17, s13, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s29 -; GFX9-NEXT: s_cselect_b32 s16, s16, s29 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s13, s13, s16 -; GFX9-NEXT: s_cmp_gt_i32 s14, -1 -; GFX9-NEXT: s_cselect_b32 s16, s14, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s14, -1 -; GFX9-NEXT: s_cselect_b32 s17, s14, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s30 -; GFX9-NEXT: s_cselect_b32 s16, s16, s30 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s14, s14, s16 -; GFX9-NEXT: s_cmp_gt_i32 s15, -1 -; GFX9-NEXT: s_cselect_b32 s16, s15, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s15, -1 -; GFX9-NEXT: s_cselect_b32 s17, s15, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s31 -; GFX9-NEXT: s_cselect_b32 s16, s16, s31 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s15, s15, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, 
s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_i32 v1, s1, v1 clamp +; GFX9-NEXT: v_sub_i32 v2, s2, v2 clamp +; GFX9-NEXT: v_sub_i32 v3, s3, v3 clamp +; GFX9-NEXT: v_sub_i32 v4, s4, v4 clamp +; GFX9-NEXT: v_sub_i32 v5, s5, v5 clamp +; GFX9-NEXT: v_sub_i32 v6, s6, v6 clamp +; GFX9-NEXT: v_sub_i32 v7, s7, v7 clamp +; GFX9-NEXT: v_sub_i32 v8, s8, v8 clamp +; GFX9-NEXT: v_sub_i32 v9, s9, v9 clamp +; GFX9-NEXT: v_sub_i32 v10, s10, v10 clamp +; GFX9-NEXT: v_sub_i32 v11, s11, v11 clamp +; GFX9-NEXT: v_sub_i32 v12, s12, v12 clamp +; GFX9-NEXT: v_sub_i32 v13, s13, v13 clamp +; GFX9-NEXT: v_sub_i32 v14, s14, v14 clamp +; GFX9-NEXT: v_sub_i32 v15, s15, v15 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9-NEXT: v_readfirstlane_b32 s5, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 +; GFX9-NEXT: v_readfirstlane_b32 s7, v7 +; GFX9-NEXT: v_readfirstlane_b32 s8, v8 +; GFX9-NEXT: v_readfirstlane_b32 s9, v9 +; GFX9-NEXT: v_readfirstlane_b32 s10, v10 +; GFX9-NEXT: v_readfirstlane_b32 s11, v11 +; GFX9-NEXT: v_readfirstlane_b32 s12, v12 +; GFX9-NEXT: v_readfirstlane_b32 s13, v13 +; GFX9-NEXT: v_readfirstlane_b32 s14, v14 +; GFX9-NEXT: v_readfirstlane_b32 s15, v15 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_ssubsat_v16i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_gt_i32 s0, -1 -; GFX10-NEXT: s_brev_b32 s46, -2 -; GFX10-NEXT: s_cselect_b32 s33, s0, -1 -; GFX10-NEXT: s_brev_b32 s34, 1 -; GFX10-NEXT: s_sub_i32 s47, s33, s46 -; GFX10-NEXT: s_cmp_lt_i32 s0, -1 +; GFX10-NEXT: v_sub_nc_i32 v0, s0, s16 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, s1, s17 clamp +; GFX10-NEXT: v_sub_nc_i32 v2, s2, s18 clamp +; GFX10-NEXT: v_sub_nc_i32 v3, s3, s19 clamp +; GFX10-NEXT: v_sub_nc_i32 v4, s4, s20 clamp +; GFX10-NEXT: v_sub_nc_i32 v5, s5, s21 clamp +; GFX10-NEXT: v_sub_nc_i32 v6, s6, s22 clamp +; GFX10-NEXT: v_sub_nc_i32 v7, s7, s23 clamp +; GFX10-NEXT: v_sub_nc_i32 v8, s8, s24 clamp +; GFX10-NEXT: v_sub_nc_i32 v9, s9, s25 clamp +; GFX10-NEXT: v_sub_nc_i32 v10, s10, s26 clamp +; GFX10-NEXT: v_sub_nc_i32 v11, s11, s27 clamp +; GFX10-NEXT: v_sub_nc_i32 v12, s12, s28 clamp +; GFX10-NEXT: v_sub_nc_i32 v13, s13, s29 clamp +; GFX10-NEXT: v_sub_nc_i32 v14, s14, s30 clamp +; GFX10-NEXT: v_sub_nc_i32 v15, s15, s31 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-NEXT: v_readfirstlane_b32 s6, v6 +; GFX10-NEXT: v_readfirstlane_b32 s7, v7 +; GFX10-NEXT: v_readfirstlane_b32 s8, v8 +; GFX10-NEXT: v_readfirstlane_b32 s9, v9 +; GFX10-NEXT: v_readfirstlane_b32 s10, v10 +; GFX10-NEXT: v_readfirstlane_b32 s11, v11 +; GFX10-NEXT: v_readfirstlane_b32 s12, v12 +; GFX10-NEXT: v_readfirstlane_b32 s13, v13 +; GFX10-NEXT: v_readfirstlane_b32 s14, v14 +; GFX10-NEXT: v_readfirstlane_b32 s15, v15 ; GFX10-NEXT: ; 
implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s35, s0, -1 -; GFX10-NEXT: s_sub_i32 s35, s35, s34 -; GFX10-NEXT: s_cmp_gt_i32 s47, s16 -; GFX10-NEXT: s_cselect_b32 s16, s47, s16 -; GFX10-NEXT: s_cmp_lt_i32 s16, s35 -; GFX10-NEXT: s_cselect_b32 s47, s16, s35 -; GFX10-NEXT: s_sub_i32 s0, s0, s47 -; GFX10-NEXT: s_cmp_gt_i32 s1, -1 -; GFX10-NEXT: s_cselect_b32 s16, s1, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s1, -1 -; GFX10-NEXT: s_cselect_b32 s33, s1, -1 -; GFX10-NEXT: s_sub_i32 s47, s33, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_cmp_lt_i32 s16, s47 -; GFX10-NEXT: s_cselect_b32 s47, s16, s47 -; GFX10-NEXT: s_sub_i32 s1, s1, s47 -; GFX10-NEXT: s_cmp_gt_i32 s2, -1 -; GFX10-NEXT: s_cselect_b32 s16, s2, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s2, -1 -; GFX10-NEXT: s_cselect_b32 s17, s2, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s18 -; GFX10-NEXT: s_cselect_b32 s16, s16, s18 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s2, s2, s16 -; GFX10-NEXT: s_cmp_gt_i32 s3, -1 -; GFX10-NEXT: s_cselect_b32 s16, s3, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s3, -1 -; GFX10-NEXT: s_cselect_b32 s17, s3, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s19 -; GFX10-NEXT: s_cselect_b32 s16, s16, s19 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s3, s3, s16 -; GFX10-NEXT: s_cmp_gt_i32 s4, -1 -; GFX10-NEXT: s_cselect_b32 s16, s4, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s4, -1 -; GFX10-NEXT: s_cselect_b32 s17, s4, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s20 -; GFX10-NEXT: s_cselect_b32 s16, s16, s20 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s4, s4, s16 -; GFX10-NEXT: s_cmp_gt_i32 s5, -1 -; GFX10-NEXT: s_cselect_b32 s16, s5, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s5, -1 -; GFX10-NEXT: s_cselect_b32 s17, s5, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s21 -; GFX10-NEXT: s_cselect_b32 s16, s16, s21 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s5, s5, s16 -; GFX10-NEXT: s_cmp_gt_i32 s6, -1 -; GFX10-NEXT: s_cselect_b32 s16, s6, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s6, -1 -; GFX10-NEXT: s_cselect_b32 s17, s6, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s22 -; GFX10-NEXT: s_cselect_b32 s16, s16, s22 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s6, s6, s16 -; GFX10-NEXT: s_cmp_gt_i32 s7, -1 -; GFX10-NEXT: s_cselect_b32 s16, s7, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s7, -1 -; GFX10-NEXT: s_cselect_b32 s17, s7, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s23 -; GFX10-NEXT: s_cselect_b32 s16, s16, s23 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s7, s7, s16 -; GFX10-NEXT: s_cmp_gt_i32 s8, -1 -; GFX10-NEXT: s_cselect_b32 s16, s8, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s8, -1 -; GFX10-NEXT: s_cselect_b32 s17, s8, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s24 
-; GFX10-NEXT: s_cselect_b32 s16, s16, s24 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s8, s8, s16 -; GFX10-NEXT: s_cmp_gt_i32 s9, -1 -; GFX10-NEXT: s_cselect_b32 s16, s9, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s9, -1 -; GFX10-NEXT: s_cselect_b32 s17, s9, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s25 -; GFX10-NEXT: s_cselect_b32 s16, s16, s25 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s9, s9, s16 -; GFX10-NEXT: s_cmp_gt_i32 s10, -1 -; GFX10-NEXT: s_cselect_b32 s16, s10, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s10, -1 -; GFX10-NEXT: s_cselect_b32 s17, s10, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s26 -; GFX10-NEXT: s_cselect_b32 s16, s16, s26 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s10, s10, s16 -; GFX10-NEXT: s_cmp_gt_i32 s11, -1 -; GFX10-NEXT: s_cselect_b32 s16, s11, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s11, -1 -; GFX10-NEXT: s_cselect_b32 s17, s11, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s27 -; GFX10-NEXT: s_cselect_b32 s16, s16, s27 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s11, s11, s16 -; GFX10-NEXT: s_cmp_gt_i32 s12, -1 -; GFX10-NEXT: s_cselect_b32 s16, s12, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s12, -1 -; GFX10-NEXT: s_cselect_b32 s17, s12, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s28 -; GFX10-NEXT: s_cselect_b32 s16, s16, s28 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s12, s12, s16 -; GFX10-NEXT: s_cmp_gt_i32 s13, -1 -; GFX10-NEXT: s_cselect_b32 s16, s13, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s13, -1 -; GFX10-NEXT: s_cselect_b32 s17, s13, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s29 -; GFX10-NEXT: s_cselect_b32 s16, s16, s29 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s13, s13, s16 -; GFX10-NEXT: s_cmp_gt_i32 s14, -1 -; GFX10-NEXT: s_cselect_b32 s16, s14, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s14, -1 -; GFX10-NEXT: s_cselect_b32 s17, s14, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s30 -; GFX10-NEXT: s_cselect_b32 s16, s16, s30 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s14, s14, s16 -; GFX10-NEXT: s_cmp_gt_i32 s15, -1 -; GFX10-NEXT: s_cselect_b32 s16, s15, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s15, -1 -; GFX10-NEXT: s_cselect_b32 s17, s15, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s31 -; GFX10-NEXT: s_cselect_b32 s16, s16, s31 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s15, s15, s16 ; GFX10-NEXT: ; return to shader part epilog %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result @@ -4226,27 +2840,15 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) { ; GFX9-LABEL: v_ssubsat_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_i16_e32 v2, -1, v0 -; 
GFX9-NEXT: v_min_i16_e32 v3, -1, v0 -; GFX9-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 -; GFX9-NEXT: v_max_i16_e32 v1, v2, v1 -; GFX9-NEXT: v_min_i16_e32 v1, v1, v3 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_i16_e64 v2, v0, -1 -; GFX10-NEXT: v_min_i16_e64 v3, v0, -1 +; GFX10-NEXT: v_sub_nc_i16 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, 0x7fff -; GFX10-NEXT: v_sub_nc_u16_e64 v3, v3, 0x8000 -; GFX10-NEXT: v_max_i16_e64 v1, v2, v1 -; GFX10-NEXT: v_min_i16_e64 v1, v1, v3 -; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result @@ -4294,45 +2896,16 @@ define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; ; GFX9-LABEL: s_ssubsat_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sext_i32_i16 s2, s0 -; GFX9-NEXT: s_sext_i32_i16 s3, -1 -; GFX9-NEXT: s_cmp_gt_i32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s4, s2, s3 -; GFX9-NEXT: s_sub_i32 s4, s4, 0x7fff -; GFX9-NEXT: s_cmp_lt_i32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, 0xffff8000 -; GFX9-NEXT: s_sext_i32_i16 s3, s4 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_cmp_gt_i32 s3, s1 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: s_cmp_lt_i32 s1, s2 -; GFX9-NEXT: s_cselect_b32 s1, s1, s2 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_ssubsat_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sext_i32_i16 s2, -1 -; GFX10-NEXT: s_sext_i32_i16 s3, s0 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cmp_gt_i32 s3, s2 +; GFX10-NEXT: v_sub_nc_i16 v0, s0, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s4, s3, s2 -; GFX10-NEXT: s_sub_i32 s4, s4, 0x7fff -; GFX10-NEXT: s_cmp_lt_i32 s3, s2 -; GFX10-NEXT: s_cselect_b32 s2, s3, s2 -; GFX10-NEXT: s_sext_i32_i16 s3, s4 -; GFX10-NEXT: s_sub_i32 s2, s2, 0xffff8000 -; GFX10-NEXT: s_cmp_gt_i32 s3, s1 -; GFX10-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cmp_lt_i32 s1, s2 -; GFX10-NEXT: s_cselect_b32 s1, s1, s2 -; GFX10-NEXT: s_sub_i32 s0, s0, s1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result @@ -4372,33 +2945,13 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; ; GFX9-LABEL: ssubsat_i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sext_i32_i16 s1, s0 -; GFX9-NEXT: s_sext_i32_i16 s2, -1 -; GFX9-NEXT: s_cmp_gt_i32 s1, s2 -; GFX9-NEXT: s_cselect_b32 s3, s1, s2 -; GFX9-NEXT: s_sub_i32 s3, s3, 0x7fff -; GFX9-NEXT: s_cmp_lt_i32 s1, s2 -; GFX9-NEXT: s_cselect_b32 s1, s1, s2 -; GFX9-NEXT: s_sub_i32 s1, s1, 0xffff8000 -; GFX9-NEXT: v_max_i16_e32 v0, s3, v0 -; GFX9-NEXT: v_min_i16_e32 v0, s1, v0 -; GFX9-NEXT: v_sub_u16_e32 v0, s0, v0 +; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: ssubsat_i16_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sext_i32_i16 s1, s0 -; 
GFX10-NEXT: s_sext_i32_i16 s2, -1 +; GFX10-NEXT: v_sub_nc_i16 v0, s0, v0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cmp_gt_i32 s1, s2 -; GFX10-NEXT: s_cselect_b32 s3, s1, s2 -; GFX10-NEXT: s_sub_i32 s3, s3, 0x7fff -; GFX10-NEXT: s_cmp_lt_i32 s1, s2 -; GFX10-NEXT: v_max_i16_e64 v0, s3, v0 -; GFX10-NEXT: s_cselect_b32 s1, s1, s2 -; GFX10-NEXT: s_sub_i32 s1, s1, 0xffff8000 -; GFX10-NEXT: v_min_i16_e64 v0, v0, s1 -; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs) %cast = bitcast i16 %result to half @@ -4433,25 +2986,13 @@ define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; ; GFX9-LABEL: ssubsat_i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_max_i16_e32 v1, -1, v0 -; GFX9-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1 -; GFX9-NEXT: v_min_i16_e32 v2, -1, v0 -; GFX9-NEXT: v_subrev_u16_e32 v2, 0x8000, v2 -; GFX9-NEXT: v_max_i16_e32 v1, s0, v1 -; GFX9-NEXT: v_min_i16_e32 v1, v1, v2 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_i16 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: ssubsat_i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_max_i16_e64 v1, v0, -1 -; GFX10-NEXT: v_min_i16_e64 v2, v0, -1 +; GFX10-NEXT: v_sub_nc_i16 v0, v0, s0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u16_e64 v1, v1, 0x7fff -; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, 0x8000 -; GFX10-NEXT: v_max_i16_e64 v1, v1, s0 -; GFX10-NEXT: v_min_i16_e64 v1, v1, v2 -; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs) %cast = bitcast i16 %result to half @@ -4512,29 +3053,15 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX9-LABEL: v_ssubsat_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v2, v0, -1 op_sel_hi:[1,0] -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fff7fff -; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3 -; GFX9-NEXT: v_pk_min_i16 v3, v0, -1 op_sel_hi:[1,0] -; GFX9-NEXT: v_mov_b32_e32 v4, 0x80008000 -; GFX9-NEXT: v_pk_sub_i16 v3, v3, v4 -; GFX9-NEXT: v_pk_max_i16 v1, v2, v1 -; GFX9-NEXT: v_pk_min_i16 v1, v1, v3 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_i16 v2, v0, -1 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_min_i16 v3, v0, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v2, v2, 0x7fff7fff -; GFX10-NEXT: v_pk_sub_i16 v3, v3, 0x80008000 -; GFX10-NEXT: v_pk_max_i16 v1, v2, v1 -; GFX10-NEXT: v_pk_min_i16 v1, v1, v3 -; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) ret <2 x i16> %result @@ -4627,99 +3154,16 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; ; GFX9-LABEL: s_ssubsat_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sext_i32_i16 s2, s0 -; GFX9-NEXT: s_ashr_i32 s3, s0, 16 -; GFX9-NEXT: s_sext_i32_i16 s4, -1 -; GFX9-NEXT: s_cmp_gt_i32 s2, s4 -; GFX9-NEXT: s_cselect_b32 s5, s2, s4 -; GFX9-NEXT: s_cmp_gt_i32 s3, -1 -; GFX9-NEXT: s_cselect_b32 s6, s3, -1 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: s_sub_i32 s5, s5, 
0x7fff7fff
-; GFX9-NEXT: s_sub_i32 s6, s6, 0x7fff
-; GFX9-NEXT: s_cmp_lt_i32 s2, s4
-; GFX9-NEXT: s_cselect_b32 s2, s2, s4
-; GFX9-NEXT: s_cmp_lt_i32 s3, -1
-; GFX9-NEXT: s_cselect_b32 s3, s3, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3
-; GFX9-NEXT: s_lshr_b32 s3, s2, 16
-; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX9-NEXT: s_sub_i32 s2, s2, 0x80008000
-; GFX9-NEXT: s_sub_i32 s3, s3, 0x8000
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3
-; GFX9-NEXT: s_sext_i32_i16 s3, s5
-; GFX9-NEXT: s_ashr_i32 s4, s5, 16
-; GFX9-NEXT: s_sext_i32_i16 s5, s1
-; GFX9-NEXT: s_ashr_i32 s1, s1, 16
-; GFX9-NEXT: s_cmp_gt_i32 s3, s5
-; GFX9-NEXT: s_cselect_b32 s3, s3, s5
-; GFX9-NEXT: s_cmp_gt_i32 s4, s1
-; GFX9-NEXT: s_cselect_b32 s1, s4, s1
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s3, s1
-; GFX9-NEXT: s_sext_i32_i16 s3, s1
-; GFX9-NEXT: s_sext_i32_i16 s4, s2
-; GFX9-NEXT: s_ashr_i32 s1, s1, 16
-; GFX9-NEXT: s_ashr_i32 s2, s2, 16
-; GFX9-NEXT: s_cmp_lt_i32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s3, s3, s4
-; GFX9-NEXT: s_cmp_lt_i32 s1, s2
-; GFX9-NEXT: s_cselect_b32 s1, s1, s2
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s3, s1
-; GFX9-NEXT: s_lshr_b32 s2, s0, 16
-; GFX9-NEXT: s_lshr_b32 s3, s1, 16
-; GFX9-NEXT: s_sub_i32 s0, s0, s1
-; GFX9-NEXT: s_sub_i32 s1, s2, s3
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_ssubsat_v2i16:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_sext_i32_i16 s2, s0
-; GFX10-NEXT: s_sext_i32_i16 s3, -1
-; GFX10-NEXT: s_ashr_i32 s4, s0, 16
-; GFX10-NEXT: s_cmp_gt_i32 s2, s3
+; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s5, s2, s3
-; GFX10-NEXT: s_cmp_gt_i32 s4, -1
-; GFX10-NEXT: s_cselect_b32 s6, s4, -1
-; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX10-NEXT: s_sext_i32_i16 s6, s1
-; GFX10-NEXT: s_lshr_b32 s7, s5, 16
-; GFX10-NEXT: s_sub_i32 s5, s5, 0x7fff7fff
-; GFX10-NEXT: s_sub_i32 s7, s7, 0x7fff
-; GFX10-NEXT: s_cmp_lt_i32 s2, s3
-; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s7
-; GFX10-NEXT: s_cselect_b32 s2, s2, s3
-; GFX10-NEXT: s_cmp_lt_i32 s4, -1
-; GFX10-NEXT: s_sext_i32_i16 s3, s5
-; GFX10-NEXT: s_cselect_b32 s4, s4, -1
-; GFX10-NEXT: s_ashr_i32 s5, s5, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4
-; GFX10-NEXT: s_ashr_i32 s1, s1, 16
-; GFX10-NEXT: s_lshr_b32 s4, s2, 16
-; GFX10-NEXT: s_sub_i32 s2, s2, 0x80008000
-; GFX10-NEXT: s_sub_i32 s4, s4, 0x8000
-; GFX10-NEXT: s_cmp_gt_i32 s3, s6
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4
-; GFX10-NEXT: s_cselect_b32 s3, s3, s6
-; GFX10-NEXT: s_cmp_gt_i32 s5, s1
-; GFX10-NEXT: s_sext_i32_i16 s4, s2
-; GFX10-NEXT: s_cselect_b32 s1, s5, s1
-; GFX10-NEXT: s_ashr_i32 s2, s2, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s1
-; GFX10-NEXT: s_sext_i32_i16 s3, s1
-; GFX10-NEXT: s_ashr_i32 s1, s1, 16
-; GFX10-NEXT: s_cmp_lt_i32 s3, s4
-; GFX10-NEXT: s_cselect_b32 s3, s3, s4
-; GFX10-NEXT: s_cmp_lt_i32 s1, s2
-; GFX10-NEXT: s_cselect_b32 s1, s1, s2
-; GFX10-NEXT: s_lshr_b32 s2, s0, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s1
-; GFX10-NEXT: s_lshr_b32 s3, s1, 16
-; GFX10-NEXT: s_sub_i32 s0, s0, s1
-; GFX10-NEXT: s_sub_i32 s1, s2, s3
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
 %cast = bitcast <2 x i16> %result to i32
@@ -4795,59 +3239,13 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ;
 ; GFX9-LABEL: ssubsat_v2i16_sv:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_sext_i32_i16 s1, s0
-; GFX9-NEXT: s_ashr_i32 s2, s0, 16
-; GFX9-NEXT: s_sext_i32_i16 s3, -1
-; GFX9-NEXT: s_cmp_gt_i32 s1, s3
-; GFX9-NEXT: s_cselect_b32 s4, s1, s3
-; GFX9-NEXT: s_cmp_gt_i32 s2, -1
-; GFX9-NEXT: s_cselect_b32 s5, s2, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT: s_lshr_b32 s5, s4, 16
-; GFX9-NEXT: s_sub_i32 s4, s4, 0x7fff7fff
-; GFX9-NEXT: s_sub_i32 s5, s5, 0x7fff
-; GFX9-NEXT: s_cmp_lt_i32 s1, s3
-; GFX9-NEXT: s_cselect_b32 s1, s1, s3
-; GFX9-NEXT: s_cmp_lt_i32 s2, -1
-; GFX9-NEXT: s_cselect_b32 s2, s2, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
-; GFX9-NEXT: s_lshr_b32 s2, s1, 16
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT: s_sub_i32 s1, s1, 0x80008000
-; GFX9-NEXT: s_sub_i32 s2, s2, 0x8000
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
-; GFX9-NEXT: v_pk_max_i16 v0, s4, v0
-; GFX9-NEXT: v_pk_min_i16 v0, v0, s1
-; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0
+; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: ssubsat_v2i16_sv:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_sext_i32_i16 s1, s0
-; GFX10-NEXT: s_sext_i32_i16 s2, -1
-; GFX10-NEXT: s_ashr_i32 s3, s0, 16
-; GFX10-NEXT: s_cmp_gt_i32 s1, s2
+; GFX10-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s4, s1, s2
-; GFX10-NEXT: s_cmp_gt_i32 s3, -1
-; GFX10-NEXT: s_cselect_b32 s5, s3, -1
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX10-NEXT: s_lshr_b32 s5, s4, 16
-; GFX10-NEXT: s_sub_i32 s4, s4, 0x7fff7fff
-; GFX10-NEXT: s_sub_i32 s5, s5, 0x7fff
-; GFX10-NEXT: s_cmp_lt_i32 s1, s2
-; GFX10-NEXT: s_cselect_b32 s1, s1, s2
-; GFX10-NEXT: s_cmp_lt_i32 s3, -1
-; GFX10-NEXT: s_cselect_b32 s2, s3, -1
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s4, s5
-; GFX10-NEXT: s_lshr_b32 s3, s1, 16
-; GFX10-NEXT: v_pk_max_i16 v0, s2, v0
-; GFX10-NEXT: s_sub_i32 s1, s1, 0x80008000
-; GFX10-NEXT: s_sub_i32 s2, s3, 0x8000
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2
-; GFX10-NEXT: v_pk_min_i16 v0, v0, s1
-; GFX10-NEXT: v_pk_sub_i16 v0, s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
 %cast = bitcast <2 x i16> %result to float
@@ -4911,27 +3309,13 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ;
 ; GFX9-LABEL: ssubsat_v2i16_vs:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: v_pk_max_i16 v1, v0, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fff7fff
-; GFX9-NEXT: v_pk_sub_i16 v1, v1, v2
-; GFX9-NEXT: v_pk_min_i16 v2, v0, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x80008000
-; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3
-; GFX9-NEXT: v_pk_max_i16 v1, v1, s0
-; GFX9-NEXT: v_pk_min_i16 v1, v1, v2
-; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0 clamp
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: ssubsat_v2i16_vs:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: v_pk_max_i16 v1, v0, -1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_min_i16 v2, v0, -1 op_sel_hi:[1,0]
+; GFX10-NEXT: v_pk_sub_i16 v0, v0, s0 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x7fff7fff
-; GFX10-NEXT: v_pk_sub_i16 v2, v2, 0x80008000
-; GFX10-NEXT: v_pk_max_i16 v1, v1, s0
-; GFX10-NEXT: v_pk_min_i16 v1, v1, v2
-; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
 %cast = bitcast <2 x i16> %result to float
@@ -5050,43 +3434,17 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX9-LABEL: v_ssubsat_v4i16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_i16 v4, v0, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fff7fff
-; GFX9-NEXT: v_pk_sub_i16 v4, v4, v5
-; GFX9-NEXT: v_pk_min_i16 v6, v0, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_mov_b32_e32 v7, 0x80008000
-; GFX9-NEXT: v_pk_max_i16 v2, v4, v2
-; GFX9-NEXT: v_pk_sub_i16 v6, v6, v7
-; GFX9-NEXT: v_pk_min_i16 v2, v2, v6
-; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2
-; GFX9-NEXT: v_pk_max_i16 v2, v1, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_sub_i16 v2, v2, v5
-; GFX9-NEXT: v_pk_min_i16 v4, v1, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_sub_i16 v4, v4, v7
-; GFX9-NEXT: v_pk_max_i16 v2, v2, v3
-; GFX9-NEXT: v_pk_min_i16 v2, v2, v4
-; GFX9-NEXT: v_pk_sub_i16 v1, v1, v2
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2 clamp
+; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_v4i16:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_pk_max_i16 v4, v0, -1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_max_i16 v5, v1, -1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_min_i16 v6, v0, -1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_min_i16 v7, v1, -1 op_sel_hi:[1,0]
+; GFX10-NEXT: v_pk_sub_i16 v0, v0, v2 clamp
+; GFX10-NEXT: v_pk_sub_i16 v1, v1, v3 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_pk_sub_i16 v4, v4, 0x7fff7fff
-; GFX10-NEXT: v_pk_sub_i16 v5, v5, 0x7fff7fff
-; GFX10-NEXT: v_pk_sub_i16 v6, v6, 0x80008000
-; GFX10-NEXT: v_pk_sub_i16 v7, v7, 0x80008000
-; GFX10-NEXT: v_pk_max_i16 v11, v4, v2
-; GFX10-NEXT: v_pk_max_i16 v10, v5, v3
-; GFX10-NEXT: v_pk_min_i16 v2, v11, v6
-; GFX10-NEXT: v_pk_min_i16 v3, v10, v7
-; GFX10-NEXT: v_pk_sub_i16 v0, v0, v2
-; GFX10-NEXT: v_pk_sub_i16 v1, v1, v3
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
 %cast = bitcast <4 x i16> %result to <2 x float>
@@ -5250,193 +3608,21 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ;
 ; GFX9-LABEL: s_ssubsat_v4i16:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_sext_i32_i16 s6, s0
-; GFX9-NEXT: s_ashr_i32 s7, s0, 16
-; GFX9-NEXT: s_sext_i32_i16 s8, -1
-; GFX9-NEXT: s_cmp_gt_i32 s6, s8
-; GFX9-NEXT: s_cselect_b32 s9, s6, s8
-; GFX9-NEXT: s_cmp_gt_i32 s7, -1
-; GFX9-NEXT: s_cselect_b32 s10, s7, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10
-; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT: s_lshr_b32 s10, s9, 16
-; GFX9-NEXT: s_movk_i32 s11, 0x7fff
-; GFX9-NEXT: s_sub_i32 s9, s9, s4
-; GFX9-NEXT: s_sub_i32 s10, s10, s11
-; GFX9-NEXT: s_cmp_lt_i32 s6, s8
-; GFX9-NEXT: s_cselect_b32 s6, s6, s8
-; GFX9-NEXT: s_cmp_lt_i32 s7, -1
-; GFX9-NEXT: s_cselect_b32 s7, s7, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7
-; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10
-; GFX9-NEXT: s_mov_b32 s5, 0x80008000
-; GFX9-NEXT: s_lshr_b32 s7, s6, 16
-; GFX9-NEXT: s_mov_b32 s10, 0x8000
-; GFX9-NEXT: s_sub_i32 s6, s6, s5
-; GFX9-NEXT: s_sub_i32 s7, s7, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7
-; GFX9-NEXT: s_sext_i32_i16 s7, s9
-; GFX9-NEXT: s_sext_i32_i16 s12, s2
-; GFX9-NEXT: s_ashr_i32 s9, s9, 16
-; GFX9-NEXT: s_ashr_i32 s2, s2, 16
-; GFX9-NEXT: s_cmp_gt_i32 s7, s12
-; GFX9-NEXT: s_cselect_b32 s7, s7, s12
-; GFX9-NEXT: s_cmp_gt_i32 s9, s2
-; GFX9-NEXT: s_cselect_b32 s2, s9, s2
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s2
-; GFX9-NEXT: s_sext_i32_i16 s7, s2
-; GFX9-NEXT: s_sext_i32_i16 s9, s6
-; GFX9-NEXT: s_ashr_i32 s2, s2, 16
-; GFX9-NEXT: s_ashr_i32 s6, s6, 16
-; GFX9-NEXT: s_cmp_lt_i32 s7, s9
-; GFX9-NEXT: s_cselect_b32 s7, s7, s9
-; GFX9-NEXT: s_cmp_lt_i32 s2, s6
-; GFX9-NEXT: s_cselect_b32 s2, s2, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s2
-; GFX9-NEXT: s_lshr_b32 s6, s0, 16
-; GFX9-NEXT: s_lshr_b32 s7, s2, 16
-; GFX9-NEXT: s_sub_i32 s0, s0, s2
-; GFX9-NEXT: s_sub_i32 s2, s6, s7
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX9-NEXT: s_sext_i32_i16 s2, s1
-; GFX9-NEXT: s_ashr_i32 s6, s1, 16
-; GFX9-NEXT: s_cmp_gt_i32 s2, s8
-; GFX9-NEXT: s_cselect_b32 s7, s2, s8
-; GFX9-NEXT: s_cmp_gt_i32 s6, -1
-; GFX9-NEXT: s_cselect_b32 s9, s6, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s9
-; GFX9-NEXT: s_lshr_b32 s9, s7, 16
-; GFX9-NEXT: s_sub_i32 s4, s7, s4
-; GFX9-NEXT: s_sub_i32 s7, s9, s11
-; GFX9-NEXT: s_cmp_lt_i32 s2, s8
-; GFX9-NEXT: s_cselect_b32 s2, s2, s8
-; GFX9-NEXT: s_cmp_lt_i32 s6, -1
-; GFX9-NEXT: s_cselect_b32 s6, s6, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6
-; GFX9-NEXT: s_lshr_b32 s6, s2, 16
-; GFX9-NEXT: s_sub_i32 s2, s2, s5
-; GFX9-NEXT: s_sub_i32 s5, s6, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s7
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX9-NEXT: s_sext_i32_i16 s5, s4
-; GFX9-NEXT: s_sext_i32_i16 s6, s3
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-NEXT: s_cmp_gt_i32 s5, s6
-; GFX9-NEXT: s_cselect_b32 s5, s5, s6
-; GFX9-NEXT: s_cmp_gt_i32 s4, s3
-; GFX9-NEXT: s_cselect_b32 s3, s4, s3
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s5, s3
-; GFX9-NEXT: s_sext_i32_i16 s4, s3
-; GFX9-NEXT: s_sext_i32_i16 s5, s2
-; GFX9-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-NEXT: s_ashr_i32 s2, s2, 16
-; GFX9-NEXT: s_cmp_lt_i32 s4, s5
-; GFX9-NEXT: s_cselect_b32 s4, s4, s5
-; GFX9-NEXT: s_cmp_lt_i32 s3, s2
-; GFX9-NEXT: s_cselect_b32 s2, s3, s2
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s2
-; GFX9-NEXT: s_lshr_b32 s3, s1, 16
-; GFX9-NEXT: s_lshr_b32 s4, s2, 16
-; GFX9-NEXT: s_sub_i32 s1, s1, s2
-; GFX9-NEXT: s_sub_i32 s2, s3, s4
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
+; GFX9-NEXT: v_pk_sub_i16 v1, s1, v1 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_ssubsat_v4i16:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_sext_i32_i16 s4, s0
-; GFX10-NEXT: s_sext_i32_i16 s5, -1
-; GFX10-NEXT: s_ashr_i32 s6, s0, 16
-; GFX10-NEXT: s_cmp_gt_i32 s4, s5
-; GFX10-NEXT: s_movk_i32 s10, 0x7fff
-; GFX10-NEXT: s_cselect_b32 s7, s4, s5
-; GFX10-NEXT: s_cmp_gt_i32 s6, -1
-; GFX10-NEXT: s_mov_b32 s11, 0x80008000
-; GFX10-NEXT: s_cselect_b32 s8, s6, -1
-; GFX10-NEXT: s_sext_i32_i16 s13, s2
-; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX10-NEXT: s_mov_b32 s8, 0x7fff7fff
-; GFX10-NEXT: s_lshr_b32 s9, s7, 16
-; GFX10-NEXT: s_sub_i32 s7, s7, s8
-; GFX10-NEXT: s_sub_i32 s9, s9, s10
-; GFX10-NEXT: s_cmp_lt_i32 s4, s5
+; GFX10-NEXT: v_pk_sub_i16 v0, s0, s2 clamp
+; GFX10-NEXT: v_pk_sub_i16 v1, s1, s3 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s4, s4, s5
-; GFX10-NEXT: s_cmp_lt_i32 s6, -1
-; GFX10-NEXT: s_cselect_b32 s6, s6, -1
-; GFX10-NEXT: s_ashr_i32 s2, s2, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s6
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, s7, s9
-; GFX10-NEXT: s_lshr_b32 s7, s4, 16
-; GFX10-NEXT: s_mov_b32 s9, 0x8000
-; GFX10-NEXT: s_sext_i32_i16 s12, s6
-; GFX10-NEXT: s_sub_i32 s4, s4, s11
-; GFX10-NEXT: s_sub_i32 s7, s7, s9
-; GFX10-NEXT: s_ashr_i32 s6, s6, 16
-; GFX10-NEXT: s_cmp_gt_i32 s12, s13
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s7
-; GFX10-NEXT: s_cselect_b32 s12, s12, s13
-; GFX10-NEXT: s_cmp_gt_i32 s6, s2
-; GFX10-NEXT: s_sext_i32_i16 s7, s4
-; GFX10-NEXT: s_cselect_b32 s2, s6, s2
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s12, s2
-; GFX10-NEXT: s_sext_i32_i16 s6, s2
-; GFX10-NEXT: s_ashr_i32 s2, s2, 16
-; GFX10-NEXT: s_cmp_lt_i32 s6, s7
-; GFX10-NEXT: s_cselect_b32 s6, s6, s7
-; GFX10-NEXT: s_cmp_lt_i32 s2, s4
-; GFX10-NEXT: s_cselect_b32 s2, s2, s4
-; GFX10-NEXT: s_lshr_b32 s4, s0, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s6, s2
-; GFX10-NEXT: s_lshr_b32 s6, s2, 16
-; GFX10-NEXT: s_sub_i32 s0, s0, s2
-; GFX10-NEXT: s_sub_i32 s2, s4, s6
-; GFX10-NEXT: s_sext_i32_i16 s4, s1
-; GFX10-NEXT: s_ashr_i32 s6, s1, 16
-; GFX10-NEXT: s_cmp_gt_i32 s4, s5
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX10-NEXT: s_cselect_b32 s7, s4, s5
-; GFX10-NEXT: s_cmp_gt_i32 s6, -1
-; GFX10-NEXT: s_cselect_b32 s12, s6, -1
-; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s12
-; GFX10-NEXT: s_lshr_b32 s12, s7, 16
-; GFX10-NEXT: s_sub_i32 s7, s7, s8
-; GFX10-NEXT: s_sub_i32 s8, s12, s10
-; GFX10-NEXT: s_cmp_lt_i32 s4, s5
-; GFX10-NEXT: s_cselect_b32 s4, s4, s5
-; GFX10-NEXT: s_cmp_lt_i32 s6, -1
-; GFX10-NEXT: s_cselect_b32 s5, s6, -1
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX10-NEXT: s_pack_ll_b32_b16 s5, s7, s8
-; GFX10-NEXT: s_lshr_b32 s6, s4, 16
-; GFX10-NEXT: s_sext_i32_i16 s7, s5
-; GFX10-NEXT: s_sext_i32_i16 s8, s3
-; GFX10-NEXT: s_sub_i32 s4, s4, s11
-; GFX10-NEXT: s_sub_i32 s6, s6, s9
-; GFX10-NEXT: s_ashr_i32 s5, s5, 16
-; GFX10-NEXT: s_ashr_i32 s3, s3, 16
-; GFX10-NEXT: s_cmp_gt_i32 s7, s8
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s6
-; GFX10-NEXT: s_cselect_b32 s7, s7, s8
-; GFX10-NEXT: s_cmp_gt_i32 s5, s3
-; GFX10-NEXT: s_sext_i32_i16 s6, s4
-; GFX10-NEXT: s_cselect_b32 s3, s5, s3
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s7, s3
-; GFX10-NEXT: s_sext_i32_i16 s5, s3
-; GFX10-NEXT: s_ashr_i32 s3, s3, 16
-; GFX10-NEXT: s_cmp_lt_i32 s5, s6
-; GFX10-NEXT: s_cselect_b32 s5, s5, s6
-; GFX10-NEXT: s_cmp_lt_i32 s3, s4
-; GFX10-NEXT: s_cselect_b32 s3, s3, s4
-; GFX10-NEXT: s_lshr_b32 s4, s1, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s5, s3
-; GFX10-NEXT: s_lshr_b32 s5, s3, 16
-; GFX10-NEXT: s_sub_i32 s1, s1, s3
-; GFX10-NEXT: s_sub_i32 s3, s4, s5
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
 %cast = bitcast <4 x i16> %result to <2 x i32>
@@ -5597,57 +3783,19 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX9-LABEL: v_ssubsat_v6i16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_i16 v6, v0, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fff7fff
-; GFX9-NEXT: v_pk_sub_i16 v6, v6, v7
-; GFX9-NEXT: v_pk_min_i16 v8, v0, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_mov_b32_e32 v9, 0x80008000
-; GFX9-NEXT: v_pk_max_i16 v3, v6, v3
-; GFX9-NEXT: v_pk_sub_i16 v8, v8, v9
-; GFX9-NEXT: v_pk_min_i16 v3, v3, v8
-; GFX9-NEXT: v_pk_sub_i16 v0, v0, v3
-; GFX9-NEXT: v_pk_max_i16 v3, v1, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_sub_i16 v3, v3, v7
-; GFX9-NEXT: v_pk_min_i16 v6, v1, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_max_i16 v3, v3, v4
-; GFX9-NEXT: v_pk_sub_i16 v6, v6, v9
-; GFX9-NEXT: v_pk_min_i16 v3, v3, v6
-; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3
-; GFX9-NEXT: v_pk_max_i16 v3, v2, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_sub_i16 v3, v3, v7
-; GFX9-NEXT: v_pk_min_i16 v4, v2, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_sub_i16 v4, v4, v9
-; GFX9-NEXT: v_pk_max_i16 v3, v3, v5
-; GFX9-NEXT: v_pk_min_i16 v3, v3, v4
-; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, v3 clamp
+; GFX9-NEXT: v_pk_sub_i16 v1, v1, v4 clamp
+; GFX9-NEXT: v_pk_sub_i16 v2, v2, v5 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_v6i16:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_pk_max_i16 v6, v0, -1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_max_i16 v8, v1, -1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_max_i16 v9, v2, -1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_min_i16 v7, v0, -1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_min_i16 v10, v1, -1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_sub_i16 v6, v6, 0x7fff7fff
-; GFX10-NEXT: v_pk_sub_i16 v15, v8, 0x7fff7fff
-; GFX10-NEXT: v_pk_min_i16 v11, v2, -1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_sub_i16 v19, v9, 0x7fff7fff
-; GFX10-NEXT: v_pk_sub_i16 v7, v7, 0x80008000
-; GFX10-NEXT: v_pk_max_i16 v14, v6, v3
-; GFX10-NEXT: v_pk_sub_i16 v6, v10, 0x80008000
-; GFX10-NEXT: v_pk_max_i16 v4, v15, v4
-; GFX10-NEXT: v_pk_sub_i16 v8, v11, 0x80008000
-; GFX10-NEXT: v_pk_max_i16 v5, v19, v5
-; GFX10-NEXT: v_pk_min_i16 v3, v14, v7
+; GFX10-NEXT: v_pk_sub_i16 v0, v0, v3 clamp
+; GFX10-NEXT: v_pk_sub_i16 v1, v1, v4 clamp
+; GFX10-NEXT: v_pk_sub_i16 v2, v2, v5 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_pk_min_i16 v4, v4, v6
-; GFX10-NEXT: v_pk_min_i16 v5, v5, v8
-; GFX10-NEXT: v_pk_sub_i16 v0, v0, v3
-; GFX10-NEXT: v_pk_sub_i16 v1, v1, v4
-; GFX10-NEXT: v_pk_sub_i16 v2, v2, v5
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <6 x i16> @llvm.ssub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
 %cast = bitcast <6 x i16> %result to <3 x float>
@@ -5881,279 +4029,26 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ;
 ; GFX9-LABEL: s_ssubsat_v6i16:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_sext_i32_i16 s8, s0
-; GFX9-NEXT: s_ashr_i32 s9, s0, 16
-; GFX9-NEXT: s_sext_i32_i16 s10, -1
-; GFX9-NEXT: s_cmp_gt_i32 s8, s10
-; GFX9-NEXT: s_cselect_b32 s11, s8, s10
-; GFX9-NEXT: s_cmp_gt_i32 s9, -1
-; GFX9-NEXT: s_cselect_b32 s12, s9, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12
-; GFX9-NEXT: s_mov_b32 s6, 0x7fff7fff
-; GFX9-NEXT: s_lshr_b32 s12, s11, 16
-; GFX9-NEXT: s_movk_i32 s13, 0x7fff
-; GFX9-NEXT: s_sub_i32 s11, s11, s6
-; GFX9-NEXT: s_sub_i32 s12, s12, s13
-; GFX9-NEXT: s_cmp_lt_i32 s8, s10
-; GFX9-NEXT: s_cselect_b32 s8, s8, s10
-; GFX9-NEXT: s_cmp_lt_i32 s9, -1
-; GFX9-NEXT: s_cselect_b32 s9, s9, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12
-; GFX9-NEXT: s_mov_b32 s7, 0x80008000
-; GFX9-NEXT: s_lshr_b32 s9, s8, 16
-; GFX9-NEXT: s_mov_b32 s12, 0x8000
-; GFX9-NEXT: s_sub_i32 s8, s8, s7
-; GFX9-NEXT: s_sub_i32 s9, s9, s12
-; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX9-NEXT: s_sext_i32_i16 s9, s11
-; GFX9-NEXT: s_sext_i32_i16 s14, s3
-; GFX9-NEXT: s_ashr_i32 s11, s11, 16
-; GFX9-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-NEXT: s_cmp_gt_i32 s9, s14
-; GFX9-NEXT: s_cselect_b32 s9, s9, s14
-; GFX9-NEXT: s_cmp_gt_i32 s11, s3
-; GFX9-NEXT: s_cselect_b32 s3, s11, s3
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s9, s3
-; GFX9-NEXT: s_sext_i32_i16 s9, s3
-; GFX9-NEXT: s_sext_i32_i16 s11, s8
-; GFX9-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-NEXT: s_ashr_i32 s8, s8, 16
-; GFX9-NEXT: s_cmp_lt_i32 s9, s11
-; GFX9-NEXT: s_cselect_b32 s9, s9, s11
-; GFX9-NEXT: s_cmp_lt_i32 s3, s8
-; GFX9-NEXT: s_cselect_b32 s3, s3, s8
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s9, s3
-; GFX9-NEXT: s_lshr_b32 s8, s0, 16
-; GFX9-NEXT: s_lshr_b32 s9, s3, 16
-; GFX9-NEXT: s_sub_i32 s0, s0, s3
-; GFX9-NEXT: s_sub_i32 s3, s8, s9
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3
-; GFX9-NEXT: s_sext_i32_i16 s3, s1
-; GFX9-NEXT: s_ashr_i32 s8, s1, 16
-; GFX9-NEXT: s_cmp_gt_i32 s3, s10
-; GFX9-NEXT: s_cselect_b32 s9, s3, s10
-; GFX9-NEXT: s_cmp_gt_i32 s8, -1
-; GFX9-NEXT: s_cselect_b32 s11, s8, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s11
-; GFX9-NEXT: s_lshr_b32 s11, s9, 16
-; GFX9-NEXT: s_sub_i32 s9, s9, s6
-; GFX9-NEXT: s_sub_i32 s11, s11, s13
-; GFX9-NEXT: s_cmp_lt_i32 s3, s10
-; GFX9-NEXT: s_cselect_b32 s3, s3, s10
-; GFX9-NEXT: s_cmp_lt_i32 s8, -1
-; GFX9-NEXT: s_cselect_b32 s8, s8, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s8
-; GFX9-NEXT: s_lshr_b32 s8, s3, 16
-; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s11
-; GFX9-NEXT: s_sub_i32 s3, s3, s7
-; GFX9-NEXT: s_sub_i32 s8, s8, s12
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s8
-; GFX9-NEXT: s_sext_i32_i16 s8, s9
-; GFX9-NEXT: s_sext_i32_i16 s11, s4
-; GFX9-NEXT: s_ashr_i32 s9, s9, 16
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_cmp_gt_i32 s8, s11
-; GFX9-NEXT: s_cselect_b32 s8, s8, s11
-; GFX9-NEXT: s_cmp_gt_i32 s9, s4
-; GFX9-NEXT: s_cselect_b32 s4, s9, s4
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s4
-; GFX9-NEXT: s_sext_i32_i16 s8, s4
-; GFX9-NEXT: s_sext_i32_i16 s9, s3
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-NEXT: s_cmp_lt_i32 s8, s9
-; GFX9-NEXT: s_cselect_b32 s8, s8, s9
-; GFX9-NEXT: s_cmp_lt_i32 s4, s3
-; GFX9-NEXT: s_cselect_b32 s3, s4, s3
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s8, s3
-; GFX9-NEXT: s_lshr_b32 s4, s1, 16
-; GFX9-NEXT: s_lshr_b32 s8, s3, 16
-; GFX9-NEXT: s_sub_i32 s1, s1, s3
-; GFX9-NEXT: s_sub_i32 s3, s4, s8
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX9-NEXT: s_sext_i32_i16 s3, s2
-; GFX9-NEXT: s_ashr_i32 s4, s2, 16
-; GFX9-NEXT: s_cmp_gt_i32 s3, s10
-; GFX9-NEXT: s_cselect_b32 s8, s3, s10
-; GFX9-NEXT: s_cmp_gt_i32 s4, -1
-; GFX9-NEXT: s_cselect_b32 s9, s4, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX9-NEXT: s_lshr_b32 s9, s8, 16
-; GFX9-NEXT: s_sub_i32 s6, s8, s6
-; GFX9-NEXT: s_sub_i32 s8, s9, s13
-; GFX9-NEXT: s_cmp_lt_i32 s3, s10
-; GFX9-NEXT: s_cselect_b32 s3, s3, s10
-; GFX9-NEXT: s_cmp_lt_i32 s4, -1
-; GFX9-NEXT: s_cselect_b32 s4, s4, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
-; GFX9-NEXT: s_lshr_b32 s4, s3, 16
-; GFX9-NEXT: s_sub_i32 s3, s3, s7
-; GFX9-NEXT: s_sub_i32 s4, s4, s12
-; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s8
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
-; GFX9-NEXT: s_sext_i32_i16 s4, s6
-; GFX9-NEXT: s_sext_i32_i16 s7, s5
-; GFX9-NEXT: s_ashr_i32 s6, s6, 16
-; GFX9-NEXT: s_ashr_i32 s5, s5, 16
-; GFX9-NEXT: s_cmp_gt_i32 s4, s7
-; GFX9-NEXT: s_cselect_b32 s4, s4, s7
-; GFX9-NEXT: s_cmp_gt_i32 s6, s5
-; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT: s_sext_i32_i16 s5, s4
-; GFX9-NEXT: s_sext_i32_i16 s6, s3
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-NEXT: s_cmp_lt_i32 s5, s6
-; GFX9-NEXT: s_cselect_b32 s5, s5, s6
-; GFX9-NEXT: s_cmp_lt_i32 s4, s3
-; GFX9-NEXT: s_cselect_b32 s3, s4, s3
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s5, s3
-; GFX9-NEXT: s_lshr_b32 s4, s2, 16
-; GFX9-NEXT: s_lshr_b32 s5, s3, 16
-; GFX9-NEXT: s_sub_i32 s2, s2, s3
-; GFX9-NEXT: s_sub_i32 s3, s4, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
+; GFX9-NEXT: v_pk_sub_i16 v1, s1, v1 clamp
+; GFX9-NEXT: v_pk_sub_i16 v2, s2, v2 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_ssubsat_v6i16:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_sext_i32_i16 s6, s0
-; GFX10-NEXT: s_sext_i32_i16 s7, -1
-; GFX10-NEXT: s_ashr_i32 s8, s0, 16
-; GFX10-NEXT: s_cmp_gt_i32 s6, s7
-; GFX10-NEXT: s_movk_i32 s12, 0x7fff
-; GFX10-NEXT: s_cselect_b32 s9, s6, s7
-; GFX10-NEXT: s_cmp_gt_i32 s8, -1
-; GFX10-NEXT: s_mov_b32 s13, 0x80008000
-; GFX10-NEXT: s_cselect_b32 s10, s8, -1
-; GFX10-NEXT: s_sext_i32_i16 s15, s3
-; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s10
-; GFX10-NEXT: s_mov_b32 s10, 0x7fff7fff
-; GFX10-NEXT: s_lshr_b32 s11, s9, 16
-; GFX10-NEXT: s_sub_i32 s9, s9, s10
-; GFX10-NEXT: s_sub_i32 s11, s11, s12
-; GFX10-NEXT: s_cmp_lt_i32 s6, s7
+; GFX10-NEXT: v_pk_sub_i16 v0, s0, s3 clamp
+; GFX10-NEXT: v_pk_sub_i16 v1, s1, s4 clamp
+; GFX10-NEXT: v_pk_sub_i16 v2, s2, s5 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s6, s6, s7
-; GFX10-NEXT: s_cmp_lt_i32 s8, -1
-; GFX10-NEXT: s_cselect_b32 s8, s8, -1
-; GFX10-NEXT: s_ashr_i32 s3, s3, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s8
-; GFX10-NEXT: s_pack_ll_b32_b16 s8, s9, s11
-; GFX10-NEXT: s_lshr_b32 s9, s6, 16
-; GFX10-NEXT: s_mov_b32 s11, 0x8000
-; GFX10-NEXT: s_sext_i32_i16 s14, s8
-; GFX10-NEXT: s_sub_i32 s6, s6, s13
-; GFX10-NEXT: s_sub_i32 s9, s9, s11
-; GFX10-NEXT: s_ashr_i32 s8, s8, 16
-; GFX10-NEXT: s_cmp_gt_i32 s14, s15
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s9
-; GFX10-NEXT: s_cselect_b32 s14, s14, s15
-; GFX10-NEXT: s_cmp_gt_i32 s8, s3
-; GFX10-NEXT: s_sext_i32_i16 s9, s6
-; GFX10-NEXT: s_cselect_b32 s3, s8, s3
-; GFX10-NEXT: s_ashr_i32 s6, s6, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s14, s3
-; GFX10-NEXT: s_sext_i32_i16 s15, s4
-; GFX10-NEXT: s_sext_i32_i16 s8, s3
-; GFX10-NEXT: s_ashr_i32 s3, s3, 16
-; GFX10-NEXT: s_cmp_lt_i32 s8, s9
-; GFX10-NEXT: s_cselect_b32 s8, s8, s9
-; GFX10-NEXT: s_cmp_lt_i32 s3, s6
-; GFX10-NEXT: s_cselect_b32 s3, s3, s6
-; GFX10-NEXT: s_lshr_b32 s6, s0, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s8, s3
-; GFX10-NEXT: s_lshr_b32 s8, s3, 16
-; GFX10-NEXT: s_sub_i32 s0, s0, s3
-; GFX10-NEXT: s_sub_i32 s3, s6, s8
-; GFX10-NEXT: s_sext_i32_i16 s6, s1
-; GFX10-NEXT: s_ashr_i32 s8, s1, 16
-; GFX10-NEXT: s_cmp_gt_i32 s6, s7
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3
-; GFX10-NEXT: s_cselect_b32 s9, s6, s7
-; GFX10-NEXT: s_cmp_gt_i32 s8, -1
-; GFX10-NEXT: s_cselect_b32 s14, s8, -1
-; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s14
-; GFX10-NEXT: s_lshr_b32 s14, s9, 16
-; GFX10-NEXT: s_sub_i32 s9, s9, s10
-; GFX10-NEXT: s_sub_i32 s14, s14, s12
-; GFX10-NEXT: s_cmp_lt_i32 s6, s7
-; GFX10-NEXT: s_cselect_b32 s6, s6, s7
-; GFX10-NEXT: s_cmp_lt_i32 s8, -1
-; GFX10-NEXT: s_cselect_b32 s8, s8, -1
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s8
-; GFX10-NEXT: s_pack_ll_b32_b16 s8, s9, s14
-; GFX10-NEXT: s_lshr_b32 s9, s6, 16
-; GFX10-NEXT: s_sext_i32_i16 s14, s8
-; GFX10-NEXT: s_sub_i32 s6, s6, s13
-; GFX10-NEXT: s_sub_i32 s9, s9, s11
-; GFX10-NEXT: s_ashr_i32 s8, s8, 16
-; GFX10-NEXT: s_cmp_gt_i32 s14, s15
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s9
-; GFX10-NEXT: s_cselect_b32 s14, s14, s15
-; GFX10-NEXT: s_cmp_gt_i32 s8, s4
-; GFX10-NEXT: s_sext_i32_i16 s9, s6
-; GFX10-NEXT: s_cselect_b32 s4, s8, s4
-; GFX10-NEXT: s_ashr_i32 s6, s6, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s14, s4
-; GFX10-NEXT: s_sext_i32_i16 s8, s4
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_cmp_lt_i32 s8, s9
-; GFX10-NEXT: s_cselect_b32 s8, s8, s9
-; GFX10-NEXT: s_cmp_lt_i32 s4, s6
-; GFX10-NEXT: s_cselect_b32 s4, s4, s6
-; GFX10-NEXT: s_lshr_b32 s6, s1, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s8, s4
-; GFX10-NEXT: s_lshr_b32 s8, s4, 16
-; GFX10-NEXT: s_sub_i32 s1, s1, s4
-; GFX10-NEXT: s_sub_i32 s4, s6, s8
-; GFX10-NEXT: s_sext_i32_i16 s6, s2
-; GFX10-NEXT: s_ashr_i32 s8, s2, 16
-; GFX10-NEXT: s_cmp_gt_i32 s6, s7
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX10-NEXT: s_cselect_b32 s9, s6, s7
-; GFX10-NEXT: s_cmp_gt_i32 s8, -1
-; GFX10-NEXT: s_cselect_b32 s14, s8, -1
-; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s14
-; GFX10-NEXT: s_lshr_b32 s14, s9, 16
-; GFX10-NEXT: s_sub_i32 s9, s9, s10
-; GFX10-NEXT: s_sub_i32 s10, s14, s12
-; GFX10-NEXT: s_cmp_lt_i32 s6, s7
-; GFX10-NEXT: s_cselect_b32 s6, s6, s7
-; GFX10-NEXT: s_cmp_lt_i32 s8, -1
-; GFX10-NEXT: s_cselect_b32 s7, s8, -1
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s7
-; GFX10-NEXT: s_pack_ll_b32_b16 s7, s9, s10
-; GFX10-NEXT: s_lshr_b32 s8, s6, 16
-; GFX10-NEXT: s_sext_i32_i16 s9, s7
-; GFX10-NEXT: s_sext_i32_i16 s10, s5
-; GFX10-NEXT: s_sub_i32 s6, s6, s13
-; GFX10-NEXT: s_sub_i32 s8, s8, s11
-; GFX10-NEXT: s_ashr_i32 s7, s7, 16
-; GFX10-NEXT: s_ashr_i32 s5, s5, 16
-; GFX10-NEXT: s_cmp_gt_i32 s9, s10
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s8
-; GFX10-NEXT: s_cselect_b32 s9, s9, s10
-; GFX10-NEXT: s_cmp_gt_i32 s7, s5
-; GFX10-NEXT: s_sext_i32_i16 s8, s6
-; GFX10-NEXT: s_cselect_b32 s5, s7, s5
-; GFX10-NEXT: s_ashr_i32 s6, s6, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s5, s9, s5
-; GFX10-NEXT: s_sext_i32_i16 s7, s5
-; GFX10-NEXT: s_ashr_i32 s5, s5, 16
-; GFX10-NEXT: s_cmp_lt_i32 s7, s8
-; GFX10-NEXT: s_cselect_b32 s7, s7, s8
-; GFX10-NEXT: s_cmp_lt_i32 s5, s6
-; GFX10-NEXT: s_cselect_b32 s5, s5, s6
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s7, s5
-; GFX10-NEXT: s_lshr_b32 s5, s2, 16
-; GFX10-NEXT: s_lshr_b32 s6, s3, 16
-; GFX10-NEXT: s_sub_i32 s2, s2, s3
-; GFX10-NEXT: s_sub_i32 s3, s5, s6
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s3
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <6 x i16> @llvm.ssub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
 %cast = bitcast <6 x i16> %result to <3 x i32>
@@ -6343,71 +4238,21 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX9-LABEL: v_ssubsat_v8i16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_i16 v8, v0, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fff7fff
-; GFX9-NEXT: v_pk_sub_i16 v8, v8, v9
-; GFX9-NEXT: v_pk_min_i16 v10, v0, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_mov_b32_e32 v11, 0x80008000
-; GFX9-NEXT: v_pk_max_i16 v4, v8, v4
-; GFX9-NEXT: v_pk_sub_i16 v10, v10, v11
-; GFX9-NEXT: v_pk_min_i16 v4, v4, v10
-; GFX9-NEXT: v_pk_sub_i16 v0, v0, v4
-; GFX9-NEXT: v_pk_max_i16 v4, v1, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_sub_i16 v4, v4, v9
-; GFX9-NEXT: v_pk_min_i16 v8, v1, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_max_i16 v4, v4, v5
-; GFX9-NEXT: v_pk_sub_i16 v8, v8, v11
-; GFX9-NEXT: v_pk_min_i16 v4, v4, v8
-; GFX9-NEXT: v_pk_sub_i16 v1, v1, v4
-; GFX9-NEXT: v_pk_max_i16 v4, v2, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_sub_i16 v4, v4, v9
-; GFX9-NEXT: v_pk_min_i16 v5, v2, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_sub_i16 v5, v5, v11
-; GFX9-NEXT: v_pk_max_i16 v4, v4, v6
-; GFX9-NEXT: v_pk_min_i16 v4, v4, v5
-; GFX9-NEXT: v_pk_sub_i16 v2, v2, v4
-; GFX9-NEXT: v_pk_max_i16 v4, v3, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_sub_i16 v4, v4, v9
-; GFX9-NEXT: v_pk_min_i16 v5, v3, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_sub_i16 v5, v5, v11
-; GFX9-NEXT: v_pk_max_i16 v4, v4, v7
-; GFX9-NEXT: v_pk_min_i16 v4, v4, v5
-; GFX9-NEXT: v_pk_sub_i16 v3, v3, v4
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, v4 clamp
+; GFX9-NEXT: v_pk_sub_i16 v1, v1, v5 clamp
+; GFX9-NEXT: v_pk_sub_i16 v2, v2, v6 clamp
+; GFX9-NEXT: v_pk_sub_i16 v3, v3, v7 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_v8i16:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_pk_max_i16 v8, v0, -1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_max_i16 v10, v1, -1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_max_i16 v12, v3, -1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_min_i16 v9, v0, -1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_min_i16 v11, v1, -1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_sub_i16 v15, v8, 0x7fff7fff
-; GFX10-NEXT: v_pk_max_i16 v8, v2, -1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_sub_i16 v10, v10, 0x7fff7fff
-; GFX10-NEXT: v_pk_sub_i16 v12, v12, 0x7fff7fff
-; GFX10-NEXT: v_pk_min_i16 v13, v2, -1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_min_i16 v14, v3, -1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_sub_i16 v8, v8, 0x7fff7fff
-; GFX10-NEXT: v_pk_max_i16 v4, v15, v4
-; GFX10-NEXT: v_pk_sub_i16 v9, v9, 0x80008000
-; GFX10-NEXT: v_pk_max_i16 v5, v10, v5
-; GFX10-NEXT: v_pk_sub_i16 v11, v11, 0x80008000
-; GFX10-NEXT: v_pk_max_i16 v15, v8, v6
-; GFX10-NEXT: v_pk_sub_i16 v10, v13, 0x80008000
-; GFX10-NEXT: v_pk_sub_i16 v8, v14, 0x80008000
-; GFX10-NEXT: v_pk_max_i16 v7, v12, v7
-; GFX10-NEXT: v_pk_min_i16 v19, v4, v9
-; GFX10-NEXT: v_pk_min_i16 v11, v5, v11
-; GFX10-NEXT: v_pk_min_i16 v15, v15, v10
+; GFX10-NEXT: v_pk_sub_i16 v0, v0, v4 clamp
+; GFX10-NEXT: v_pk_sub_i16 v1, v1, v5 clamp
+; GFX10-NEXT: v_pk_sub_i16 v2, v2, v6 clamp
+; GFX10-NEXT: v_pk_sub_i16 v3, v3, v7 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_pk_min_i16 v6, v7, v8
-; GFX10-NEXT: v_pk_sub_i16 v0, v0, v19
-; GFX10-NEXT: v_pk_sub_i16 v1, v1, v11
-; GFX10-NEXT: v_pk_sub_i16 v2, v2, v15
-; GFX10-NEXT: v_pk_sub_i16 v3, v3, v6
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
 %cast = bitcast <8 x i16> %result to <4 x float>
@@ -6711,365 +4556,31 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ;
 ; GFX9-LABEL: s_ssubsat_v8i16:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_sext_i32_i16 s10, s0
-; GFX9-NEXT: s_ashr_i32 s11, s0, 16
-; GFX9-NEXT: s_sext_i32_i16 s12, -1
-; GFX9-NEXT: s_cmp_gt_i32 s10, s12
-; GFX9-NEXT: s_cselect_b32 s13, s10, s12
-; GFX9-NEXT: s_cmp_gt_i32 s11, -1
-; GFX9-NEXT: s_cselect_b32 s14, s11, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14
-; GFX9-NEXT: s_mov_b32 s8, 0x7fff7fff
-; GFX9-NEXT: s_lshr_b32 s14, s13, 16
-; GFX9-NEXT: s_movk_i32 s15, 0x7fff
-; GFX9-NEXT: s_sub_i32 s13, s13, s8
-; GFX9-NEXT: s_sub_i32 s14, s14, s15
-; GFX9-NEXT: s_cmp_lt_i32 s10, s12
-; GFX9-NEXT: s_cselect_b32 s10, s10, s12
-; GFX9-NEXT: s_cmp_lt_i32 s11, -1
-; GFX9-NEXT: s_cselect_b32 s11, s11, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11
-; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14
-; GFX9-NEXT: s_mov_b32 s9, 0x80008000
-; GFX9-NEXT: s_lshr_b32 s11, s10, 16
-; GFX9-NEXT: s_mov_b32 s14, 0x8000
-; GFX9-NEXT: s_sub_i32 s10, s10, s9
-; GFX9-NEXT: s_sub_i32 s11, s11, s14
-; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11
-; GFX9-NEXT: s_sext_i32_i16 s11, s13
-; GFX9-NEXT: s_sext_i32_i16 s16, s4
-; GFX9-NEXT: s_ashr_i32 s13, s13, 16
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_cmp_gt_i32 s11, s16
-; GFX9-NEXT: s_cselect_b32 s11, s11, s16
-; GFX9-NEXT: s_cmp_gt_i32 s13, s4
-; GFX9-NEXT: s_cselect_b32 s4, s13, s4
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s11, s4
-; GFX9-NEXT: s_sext_i32_i16 s11, s4
-; GFX9-NEXT: s_sext_i32_i16 s13, s10
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_ashr_i32 s10, s10, 16
-; GFX9-NEXT: s_cmp_lt_i32 s11, s13
-; GFX9-NEXT: s_cselect_b32 s11, s11, s13
-; GFX9-NEXT: s_cmp_lt_i32 s4, s10
-; GFX9-NEXT: s_cselect_b32 s4, s4, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s11, s4
-; GFX9-NEXT: s_lshr_b32 s10, s0, 16
-; GFX9-NEXT: s_lshr_b32 s11, s4, 16
-; GFX9-NEXT: s_sub_i32 s0, s0, s4
-; GFX9-NEXT: s_sub_i32 s4, s10, s11
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX9-NEXT: s_sext_i32_i16 s4, s1
-; GFX9-NEXT: s_ashr_i32 s10, s1, 16
-; GFX9-NEXT: s_cmp_gt_i32 s4, s12
-; GFX9-NEXT: s_cselect_b32 s11, s4, s12
-; GFX9-NEXT: s_cmp_gt_i32 s10, -1
-; GFX9-NEXT: s_cselect_b32 s13, s10, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13
-; GFX9-NEXT: s_lshr_b32 s13, s11, 16
-; GFX9-NEXT: s_sub_i32 s11, s11, s8
-; GFX9-NEXT: s_sub_i32 s13, s13, s15
-; GFX9-NEXT: s_cmp_lt_i32 s4, s12
-; GFX9-NEXT: s_cselect_b32 s4, s4, s12
-; GFX9-NEXT: s_cmp_lt_i32 s10, -1
-; GFX9-NEXT: s_cselect_b32 s10, s10, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s10
-; GFX9-NEXT: s_lshr_b32 s10, s4, 16
-; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13
-; GFX9-NEXT: s_sub_i32 s4, s4, s9
-; GFX9-NEXT: s_sub_i32 s10, s10, s14
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s10
-; GFX9-NEXT: s_sext_i32_i16 s10, s11
-; GFX9-NEXT: s_sext_i32_i16 s13, s5
-; GFX9-NEXT: s_ashr_i32 s11, s11, 16
-; GFX9-NEXT: s_ashr_i32 s5, s5, 16
-; GFX9-NEXT: s_cmp_gt_i32 s10, s13
-; GFX9-NEXT: s_cselect_b32 s10, s10, s13
-; GFX9-NEXT: s_cmp_gt_i32 s11, s5
-; GFX9-NEXT: s_cselect_b32 s5, s11, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s5, s10, s5
-; GFX9-NEXT: s_sext_i32_i16 s10, s5
-; GFX9-NEXT: s_sext_i32_i16 s11, s4
-; GFX9-NEXT: s_ashr_i32 s5, s5, 16
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_cmp_lt_i32 s10, s11
-; GFX9-NEXT: s_cselect_b32 s10, s10, s11
-; GFX9-NEXT: s_cmp_lt_i32 s5, s4
-; GFX9-NEXT: s_cselect_b32 s4, s5, s4
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s10, s4
-; GFX9-NEXT: s_lshr_b32 s5, s1, 16
-; GFX9-NEXT: s_lshr_b32 s10, s4, 16
-; GFX9-NEXT: s_sub_i32 s1, s1, s4
-; GFX9-NEXT: s_sub_i32 s4, s5, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX9-NEXT: s_sext_i32_i16 s4, s2
-; GFX9-NEXT: s_ashr_i32 s5, s2, 16
-; GFX9-NEXT: s_cmp_gt_i32 s4, s12
-; GFX9-NEXT: s_cselect_b32 s10, s4, s12
-; GFX9-NEXT: s_cmp_gt_i32 s5, -1
-; GFX9-NEXT: s_cselect_b32 s11, s5, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11
-; GFX9-NEXT: s_lshr_b32 s11, s10, 16
-; GFX9-NEXT: s_sub_i32 s10, s10, s8
-; GFX9-NEXT: s_sub_i32 s11, s11, s15
-; GFX9-NEXT: s_cmp_lt_i32 s4, s12
-; GFX9-NEXT: s_cselect_b32 s4, s4, s12
-; GFX9-NEXT: s_cmp_lt_i32 s5, -1
-; GFX9-NEXT: s_cselect_b32 s5, s5, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT: s_lshr_b32 s5, s4, 16
-; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11
-; GFX9-NEXT: s_sub_i32 s4, s4, s9
-; GFX9-NEXT: s_sub_i32 s5, s5, s14
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT: s_sext_i32_i16 s5, s10
-; GFX9-NEXT: s_sext_i32_i16 s11, s6
-; GFX9-NEXT: s_ashr_i32 s10, s10, 16
-; GFX9-NEXT: s_ashr_i32 s6, s6, 16
-; GFX9-NEXT: s_cmp_gt_i32 s5, s11
-; GFX9-NEXT: s_cselect_b32 s5, s5, s11
-; GFX9-NEXT: s_cmp_gt_i32 s10, s6
-; GFX9-NEXT: s_cselect_b32 s6, s10, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX9-NEXT: s_sext_i32_i16 s6, s5
-; GFX9-NEXT: s_sext_i32_i16 s10, s4
-; GFX9-NEXT: s_ashr_i32 s5, s5, 16
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_cmp_lt_i32 s6, s10
-; GFX9-NEXT: s_cselect_b32 s6, s6, s10
-; GFX9-NEXT: s_cmp_lt_i32 s5, s4
-; GFX9-NEXT: s_cselect_b32 s4, s5, s4
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s4
-; GFX9-NEXT: s_lshr_b32 s5, s2, 16
-; GFX9-NEXT: s_lshr_b32 s6, s4, 16
-; GFX9-NEXT: s_sub_i32 s2, s2, s4
-; GFX9-NEXT: s_sub_i32 s4, s5, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
-; GFX9-NEXT: s_sext_i32_i16 s4, s3
-; GFX9-NEXT: s_ashr_i32 s5, s3, 16
-; GFX9-NEXT: s_cmp_gt_i32 s4, s12
-; GFX9-NEXT: s_cselect_b32 s6, s4, s12
-; GFX9-NEXT: s_cmp_gt_i32 s5, -1
-; GFX9-NEXT: s_cselect_b32 s10, s5, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s10
-; GFX9-NEXT: s_lshr_b32 s10, s6, 16
-; GFX9-NEXT: s_sub_i32 s6, s6, s8
-; GFX9-NEXT: s_sub_i32 s8, s10, s15
-; GFX9-NEXT: s_cmp_lt_i32 s4, s12
-; GFX9-NEXT: s_cselect_b32 s4, s4, s12
-; GFX9-NEXT: s_cmp_lt_i32 s5, -1
-; GFX9-NEXT: s_cselect_b32 s5, s5, -1
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT: s_lshr_b32 s5, s4, 16
-; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s8
-; GFX9-NEXT: s_sub_i32 s4, s4, s9
-; GFX9-NEXT: s_sub_i32 s5, s5, s14
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT: s_sext_i32_i16 s5, s6
-; GFX9-NEXT: s_sext_i32_i16 s8, s7
-; GFX9-NEXT: s_ashr_i32 s6, s6, 16
-; GFX9-NEXT: s_ashr_i32 s7, s7, 16
-; GFX9-NEXT: s_cmp_gt_i32 s5, s8
-; GFX9-NEXT: s_cselect_b32 s5, s5, s8
-; GFX9-NEXT: s_cmp_gt_i32 s6, s7
-; GFX9-NEXT: s_cselect_b32 s6, s6, s7
-; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX9-NEXT: s_sext_i32_i16 s6, s5
-; GFX9-NEXT: s_sext_i32_i16 s7, s4
-; GFX9-NEXT: s_ashr_i32 s5, s5, 16
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_cmp_lt_i32 s6, s7
-; GFX9-NEXT: s_cselect_b32 s6, s6, s7
-; GFX9-NEXT: s_cmp_lt_i32 s5, s4
-; GFX9-NEXT: s_cselect_b32 s4, s5, s4
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s4
-; GFX9-NEXT: s_lshr_b32 s5, s3, 16
-; GFX9-NEXT: s_lshr_b32 s6, s4, 16
-; GFX9-NEXT: s_sub_i32 s3, s3, s4
-; GFX9-NEXT: s_sub_i32 s4, s5, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
+; GFX9-NEXT: v_pk_sub_i16 v1, s1, v1 clamp
+; GFX9-NEXT: v_pk_sub_i16 v2, s2, v2 clamp
+; GFX9-NEXT: v_pk_sub_i16 v3, s3, v3 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_ssubsat_v8i16:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_sext_i32_i16 s8, s0
-; GFX10-NEXT: s_sext_i32_i16 s9, -1
-; GFX10-NEXT: s_ashr_i32 s10, s0, 16
-; GFX10-NEXT: s_cmp_gt_i32 s8, s9
-; GFX10-NEXT: s_movk_i32 s14, 0x7fff
-; GFX10-NEXT: s_cselect_b32 s11, s8, s9
-; GFX10-NEXT: s_cmp_gt_i32 s10, -1
-; GFX10-NEXT: s_mov_b32 s15, 0x80008000
-; GFX10-NEXT: s_cselect_b32 s12, s10, -1
-; GFX10-NEXT: s_sext_i32_i16 s17, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s12
-; GFX10-NEXT: s_mov_b32 s12, 0x7fff7fff
-; GFX10-NEXT: s_lshr_b32 s13, s11, 16
-; GFX10-NEXT: s_sub_i32 s11, s11, s12
-; GFX10-NEXT: s_sub_i32 s13, s13, s14
-; GFX10-NEXT: s_cmp_lt_i32 s8, s9
+; GFX10-NEXT: v_pk_sub_i16 v0, s0, s4 clamp
+; GFX10-NEXT: v_pk_sub_i16 v1, s1, s5 clamp
+; GFX10-NEXT: v_pk_sub_i16 v2, s2, s6 clamp
+; GFX10-NEXT: v_pk_sub_i16 v3, s3, s7 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s8, s8, s9
-; GFX10-NEXT: s_cmp_lt_i32 s10, -1
-; GFX10-NEXT: s_cselect_b32 s10, s10, -1
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s10
-; GFX10-NEXT: s_pack_ll_b32_b16 s10, s11, s13
-; GFX10-NEXT: s_lshr_b32 s11, s8, 16
-; GFX10-NEXT: s_mov_b32 s13, 0x8000
-; GFX10-NEXT: s_sext_i32_i16 s16, s10
-; GFX10-NEXT: s_sub_i32 s8, s8, s15
-; GFX10-NEXT: s_sub_i32 s11, s11, s13
-; GFX10-NEXT: s_ashr_i32 s10, s10, 16
-; GFX10-NEXT: s_cmp_gt_i32 s16, s17
-; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s11
-; GFX10-NEXT: s_cselect_b32 s16, s16, s17
-; GFX10-NEXT: s_cmp_gt_i32 s10, s4
-; GFX10-NEXT: s_sext_i32_i16 s11, s8
-; GFX10-NEXT: s_cselect_b32 s4, s10, s4
-; GFX10-NEXT: s_ashr_i32 s8, s8, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX10-NEXT: s_sext_i32_i16 s17, s5
-; GFX10-NEXT: s_sext_i32_i16 s10, s4
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_cmp_lt_i32 s10, s11
-; GFX10-NEXT: s_cselect_b32 s10, s10, s11
-; GFX10-NEXT: s_cmp_lt_i32 s4, s8
-; GFX10-NEXT: s_cselect_b32 s4, s4, s8
-; GFX10-NEXT: s_lshr_b32 s8, s0, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s10, s4
-; GFX10-NEXT: s_lshr_b32 s10, s4, 16
-; GFX10-NEXT: s_sub_i32 s0, s0, s4
-; GFX10-NEXT: s_sub_i32 s4, s8, s10
-; GFX10-NEXT: s_sext_i32_i16 s8, s1
-; GFX10-NEXT: s_ashr_i32 s10, s1, 16
-; GFX10-NEXT: s_cmp_gt_i32 s8, s9
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX10-NEXT: s_cselect_b32 s11, s8, s9
-; GFX10-NEXT: s_cmp_gt_i32 s10, -1
-; GFX10-NEXT: s_cselect_b32 s16, s10, -1
-; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s16
-; GFX10-NEXT: s_lshr_b32 s16, s11, 16
-; GFX10-NEXT: s_sub_i32 s11, s11, s12
-; GFX10-NEXT: s_sub_i32 s16, s16, s14
-; GFX10-NEXT: s_cmp_lt_i32 s8, s9
-; GFX10-NEXT: s_cselect_b32 s8, s8, s9
-; GFX10-NEXT: s_cmp_lt_i32 s10, -1
-; GFX10-NEXT: s_cselect_b32 s10, s10, -1
-; GFX10-NEXT: s_ashr_i32 s5, s5, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s10
-; GFX10-NEXT: s_pack_ll_b32_b16 s10, s11, s16
-; GFX10-NEXT: s_lshr_b32 s11, s8, 16
-; GFX10-NEXT: s_sext_i32_i16 s16, s10
-; GFX10-NEXT: s_sub_i32 s8, s8, s15
-; GFX10-NEXT: s_sub_i32 s11, s11, s13
-; GFX10-NEXT: s_ashr_i32 s10, s10, 16
-; GFX10-NEXT: s_cmp_gt_i32 s16, s17
-; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s11
-; GFX10-NEXT: s_cselect_b32 s16, s16, s17
-; GFX10-NEXT: s_cmp_gt_i32 s10, s5
-; GFX10-NEXT: s_sext_i32_i16 s11, s8
-; GFX10-NEXT: s_cselect_b32 s5, s10, s5
-; GFX10-NEXT: s_ashr_i32 s8, s8, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX10-NEXT: s_sext_i32_i16 s17, s6
-; GFX10-NEXT: s_sext_i32_i16 s10, s5
-; GFX10-NEXT: s_ashr_i32 s5, s5, 16
-; GFX10-NEXT: s_cmp_lt_i32 s10, s11
-; GFX10-NEXT: s_cselect_b32 s10, s10, s11
-; GFX10-NEXT: s_cmp_lt_i32 s5, s8
-; GFX10-NEXT: s_cselect_b32 s5, s5, s8
-; GFX10-NEXT: s_lshr_b32 s8, s1, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s5, s10, s5
-; GFX10-NEXT: s_lshr_b32 s10, s5, 16
-; GFX10-NEXT: s_sub_i32 s1, s1, s5
-; GFX10-NEXT: s_sub_i32 s5, s8, s10
-; GFX10-NEXT: s_sext_i32_i16 s8, s2
-; GFX10-NEXT: s_ashr_i32 s10, s2, 16
-; GFX10-NEXT: s_cmp_gt_i32 s8, s9
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
-; GFX10-NEXT: s_cselect_b32 s11, s8, s9
-; GFX10-NEXT: s_cmp_gt_i32 s10, -1
-; GFX10-NEXT: s_cselect_b32 s16, s10, -1
-; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s16
-; GFX10-NEXT: s_lshr_b32 s16, s11, 16
-; GFX10-NEXT: s_sub_i32 s11, s11, s12
-; GFX10-NEXT: s_sub_i32 s16, s16, s14
-; GFX10-NEXT: s_cmp_lt_i32 s8, s9
-; GFX10-NEXT: s_cselect_b32 s8, s8, s9
-; GFX10-NEXT: s_cmp_lt_i32 s10, -1
-; GFX10-NEXT: s_cselect_b32 s10, s10, -1
-; GFX10-NEXT: s_ashr_i32 s6, s6, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s10
-; GFX10-NEXT: s_pack_ll_b32_b16 s10, s11, s16
-; GFX10-NEXT: s_lshr_b32 s11, s8, 16
-; GFX10-NEXT: s_sext_i32_i16 s16, s10
-; GFX10-NEXT: s_sub_i32 s8, s8, s15
-; GFX10-NEXT: s_sub_i32 s11, s11, s13
-; GFX10-NEXT: s_ashr_i32 s10, s10, 16
-; GFX10-NEXT: s_cmp_gt_i32 s16, s17
-; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s11
-; GFX10-NEXT: s_cselect_b32 s16, s16, s17
-; GFX10-NEXT: s_cmp_gt_i32 s10, s6
-; GFX10-NEXT: s_sext_i32_i16 s11, s8
-; GFX10-NEXT: s_cselect_b32 s6, s10, s6
-; GFX10-NEXT: s_ashr_i32 s8, s8, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX10-NEXT: s_sext_i32_i16 s10, s6
-; GFX10-NEXT: s_ashr_i32 s6, s6, 16
-; GFX10-NEXT: s_cmp_lt_i32 s10, s11
-; GFX10-NEXT: s_cselect_b32 s10, s10, s11
-; GFX10-NEXT: s_cmp_lt_i32 s6, s8
-; GFX10-NEXT: s_cselect_b32 s6, s6, s8
-; GFX10-NEXT: s_lshr_b32 s8, s2, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, s10, s6
-; GFX10-NEXT: s_lshr_b32 s10, s6, 16
-; GFX10-NEXT: s_sub_i32 s2, s2, s6
-; GFX10-NEXT: s_sub_i32 s6, s8, s10
-; GFX10-NEXT: s_sext_i32_i16 s8, s3
-; GFX10-NEXT: s_ashr_i32 s10, s3, 16
-; GFX10-NEXT: s_cmp_gt_i32 s8, s9
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s6
-; GFX10-NEXT: s_cselect_b32 s11, s8, s9
-; GFX10-NEXT: s_cmp_gt_i32 s10, -1
-; GFX10-NEXT: s_cselect_b32 s16, s10, -1
-; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s16
-; GFX10-NEXT: s_lshr_b32 s16, s11, 16
-; GFX10-NEXT: s_sub_i32 s11, s11, s12
-; GFX10-NEXT: s_sub_i32 s12, s16, s14
-; GFX10-NEXT: s_cmp_lt_i32 s8, s9
-; GFX10-NEXT: s_cselect_b32 s8, s8, s9
-; GFX10-NEXT: s_cmp_lt_i32 s10, -1
-; GFX10-NEXT: s_cselect_b32 s9, s10, -1
-; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX10-NEXT: s_pack_ll_b32_b16 s9, s11, s12
-; GFX10-NEXT: s_lshr_b32 s10, s8, 16
-; GFX10-NEXT: s_sext_i32_i16 s11, s9
-; GFX10-NEXT: s_sext_i32_i16 s12, s7
-; GFX10-NEXT: s_sub_i32 s8, s8, s15
-; GFX10-NEXT: s_sub_i32 s10, s10, s13
-; GFX10-NEXT: s_ashr_i32 s9, s9, 16
-; GFX10-NEXT: s_ashr_i32 s7, s7, 16
-; GFX10-NEXT: s_cmp_gt_i32 s11, s12
-; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s10
-; GFX10-NEXT: s_cselect_b32 s11, s11, s12
-; GFX10-NEXT: s_cmp_gt_i32 s9, s7
-; GFX10-NEXT: s_sext_i32_i16 s10, s8
-; GFX10-NEXT: s_cselect_b32 s7, s9, s7
-; GFX10-NEXT: s_ashr_i32 s8, s8, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s7, s11, s7
-; GFX10-NEXT: s_sext_i32_i16 s9, s7
-; GFX10-NEXT: s_ashr_i32 s7, s7, 16
-; GFX10-NEXT: s_cmp_lt_i32 s9, s10
-; GFX10-NEXT: s_cselect_b32 s9, s9, s10
-; GFX10-NEXT: s_cmp_lt_i32 s7, s8
-; GFX10-NEXT: s_cselect_b32 s4, s7, s8
-; GFX10-NEXT: s_lshr_b32 s5, s3, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s9, s4
-; GFX10-NEXT: s_lshr_b32 s7, s4, 16
-; GFX10-NEXT: s_sub_i32 s3, s3, s4
-; GFX10-NEXT: s_sub_i32 s4, s5, s7
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
 %cast = bitcast <8 x i16> %result to <4 x i32>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 3a742fbcbd919..cbabb07aa9361 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -21,9 +21,7 @@ define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0
 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1
-; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0
-; GFX8-NEXT: v_min_u16_e32 v1, v2, v1
-; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
+; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp
 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -32,9 +30,7 @@ define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1
-; GFX9-NEXT: v_xor_b32_e32 v2, -1, v0
-; GFX9-NEXT: v_min_u16_e32 v1, v2, v1
-; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
+; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp
 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -45,9 +41,7 @@ define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 9, v0
 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 9, v1
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_xor_b32_e32 v2, -1, v0
-; GFX10-NEXT: v_min_u16_e64 v1, v2, v1
-; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1
+; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 clamp
 ; GFX10-NEXT: v_lshrrev_b16_e64 v0, 9, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs)
@@ -69,31 +63,23 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX8-LABEL: s_uaddsat_i7:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_bfe_u32 s2, 9, 0x100000
-; GFX8-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2
-; GFX8-NEXT: s_xor_b32 s3, s0, -1
-; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000
-; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX8-NEXT: s_cmp_lt_u32 s3, s1
-; GFX8-NEXT: s_cselect_b32 s1, s3, s1
-; GFX8-NEXT: s_add_i32 s0, s0, s1
-; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
-; GFX8-NEXT: s_lshr_b32 s0, s0, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
+; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uaddsat_i7:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000
-; GFX9-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2
-; GFX9-NEXT: s_xor_b32 s3, s0, -1
-; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000
-; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX9-NEXT: s_cmp_lt_u32 s3, s1
-; GFX9-NEXT: s_cselect_b32 s1, s3, s1
-; GFX9-NEXT: s_add_i32 s0, s0, s1
-; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000
-; GFX9-NEXT: s_lshr_b32 s0, s0, s2
+; GFX9-NEXT: s_lshl_b32 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp
+; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_uaddsat_i7:
@@ -102,14 +88,9 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2
-; GFX10-NEXT: s_xor_b32 s3, s0, -1
-; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000
-; GFX10-NEXT: s_cmp_lt_u32 s3, s1
-; GFX10-NEXT: s_cselect_b32 s1, s3, s1
-; GFX10-NEXT: s_add_i32 s0, s0, s1
-; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000
-; GFX10-NEXT: s_lshr_b32 s0, s0, s2
+; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, s1 clamp
+; GFX10-NEXT: v_lshrrev_b16_e64 v0, 9, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs)
 ret i7 %result
@@ -132,9 +113,7 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0
-; GFX8-NEXT: v_min_u16_e32 v1, v2, v1
-; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
+; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp
 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -143,9 +122,7 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_xor_b32_e32 v2, -1, v0
-; GFX9-NEXT: v_min_u16_e32 v1, v2, v1
-; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
+; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp
 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -156,9 +133,7 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0
 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_xor_b32_e32 v2, -1, v0
-; GFX10-NEXT: v_min_u16_e64 v1, v2, v1
-; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1
+; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 clamp
 ; GFX10-NEXT: v_lshrrev_b16_e64 v0, 8, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
@@ -180,31 +155,23 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX8-LABEL: s_uaddsat_i8:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000
-; GFX8-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2
-; GFX8-NEXT: s_xor_b32 s3, s0, -1
-; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000
-; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX8-NEXT: s_cmp_lt_u32 s3, s1
-; GFX8-NEXT: s_cselect_b32 s1, s3, s1
-; GFX8-NEXT: s_add_i32 s0, s0, s1
-; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
-; GFX8-NEXT: s_lshr_b32 s0, s0, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
+; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uaddsat_i8:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000
-; GFX9-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2
-; GFX9-NEXT: s_xor_b32 s3, s0, -1
-; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000
-; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX9-NEXT: s_cmp_lt_u32 s3, s1
-; GFX9-NEXT: s_cselect_b32 s1, s3, s1
-; GFX9-NEXT: s_add_i32 s0, s0, s1
-; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000
-; GFX9-NEXT: s_lshr_b32 s0, s0, s2
+; GFX9-NEXT: s_lshl_b32 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp
+; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_uaddsat_i8:
@@ -213,14 +180,9 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2
-; GFX10-NEXT: s_xor_b32 s3, s0, -1
-; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000
-; GFX10-NEXT: s_cmp_lt_u32 s3, s1
-; GFX10-NEXT: s_cselect_b32 s1, s3, s1
-; GFX10-NEXT: s_add_i32 s0, s0, s1
-; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000
-; GFX10-NEXT: s_lshr_b32 s0, s0, s2
+; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, s1 clamp
+; GFX10-NEXT: v_lshrrev_b16_e64 v0, 8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
 ret i8 %result
@@ -256,16 +218,12 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v2, 8
 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT: v_xor_b32_e32 v4, -1, v0
-; GFX8-NEXT: v_min_u16_e32 v1, v4, v1
-; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
-; GFX8-NEXT: v_xor_b32_e32 v1, -1, v3
-; GFX8-NEXT: v_min_u16_e32 v1, v1, v2
+; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp
+; GFX8-NEXT: v_add_u16_e64 v1, v3, v2 clamp
 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xff
-; GFX8-NEXT: v_add_u16_e32 v1, v3, v1
 ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
@@ -276,16 +234,12 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: s_mov_b32 s4, 8
 ; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_xor_b32_e32 v4, -1, v0
-; GFX9-NEXT: v_min_u16_e32 v1, v4, v1
-; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
-; GFX9-NEXT: v_xor_b32_e32 v1, -1, v2
-; GFX9-NEXT: v_min_u16_e32 v1, v1, v3
+; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp
 ; GFX9-NEXT: s_movk_i32 s4, 0xff
-; GFX9-NEXT: v_add_u16_e32 v1, v2, v1
+; GFX9-NEXT: v_add_u16_e64 v1, v2, v3 clamp
 ; GFX9-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX9-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
@@ -296,21 +250,17 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: s_mov_b32 s4, 8
-; GFX10-NEXT: v_lshlrev_b16_e64 v2, 8, v0
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1
+; GFX10-NEXT: v_lshlrev_b16_e64 v3, 8, v1
+; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0
 ; GFX10-NEXT: s_movk_i32 s4, 0xff
-; GFX10-NEXT: v_xor_b32_e32 v4, -1, v2
-; GFX10-NEXT: v_xor_b32_e32 v5, -1, v0
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_min_u16_e64 v1, v4, v1
-; GFX10-NEXT: v_min_u16_e64 v3, v5, v3
-; GFX10-NEXT: v_add_nc_u16_e64 v1, v2, v1
-; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v3
-; GFX10-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT: v_add_nc_u16_e64 v1, v2, v1 clamp
+; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v3 clamp
+; GFX10-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %lhs = bitcast i16 %lhs.arg to <2 x i8>
 %rhs = bitcast i16 %rhs.arg to <2 x i8>
@@ -348,98 +298,60 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-LABEL: s_uaddsat_v2i8:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000
-; GFX8-NEXT: s_lshr_b32 s2, s0, 8
-; GFX8-NEXT: s_lshl_b32 s0, s0, s4
-; GFX8-NEXT: s_xor_b32 s5, s0, -1
 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8
 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4
-; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000
-; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX8-NEXT: s_cmp_lt_u32 s5, s1
-; GFX8-NEXT: s_cselect_b32 s1, s5, s1
-; GFX8-NEXT: s_add_i32 s0, s0, s1
-; GFX8-NEXT: s_lshl_b32 s1, s2, s4
-; GFX8-NEXT: s_lshl_b32 s2, s3, s4
-; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
-; GFX8-NEXT: s_xor_b32 s3, s1, -1
-; GFX8-NEXT: s_lshr_b32 s0, s0, s4
-; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000
-; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
-; GFX8-NEXT: s_cmp_lt_u32 s3, s2
-; GFX8-NEXT: s_cselect_b32 s2, s3, s2
-; GFX8-NEXT: s_add_i32 s1, s1, s2
-; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX8-NEXT: s_movk_i32 s2, 0xff
-; GFX8-NEXT: s_lshr_b32 s1, s1, s4
-; GFX8-NEXT: s_and_b32 s1, s1, s2
-; GFX8-NEXT: s_and_b32 s0, s0, s2
-; GFX8-NEXT: s_lshl_b32 s1, s1, s4
-; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: s_lshr_b32 s2, s0, 8
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: s_lshl_b32 s0, s0, s4
+; GFX8-NEXT: s_lshl_b32 s1, s3, s4
+; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
+; GFX8-NEXT: s_lshl_b32 s0, s2, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0xff
+; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp
+; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uaddsat_v2i8:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_bfe_u32 s4, 8, 0x100000
-; GFX9-NEXT: s_lshr_b32 s2, s0, 8
-; GFX9-NEXT: s_lshl_b32 s0, s0, s4
-; GFX9-NEXT: s_xor_b32 s5, s0, -1
 ; GFX9-NEXT: s_lshr_b32 s3, s1, 8
 ; GFX9-NEXT: s_lshl_b32 s1, s1, s4
-; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000
-; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX9-NEXT: s_cmp_lt_u32 s5, s1
-; GFX9-NEXT: s_cselect_b32 s1, s5, s1
-; GFX9-NEXT: s_add_i32 s0, s0, s1
-; GFX9-NEXT: s_lshl_b32 s1, s2, s4
-; GFX9-NEXT: s_lshl_b32 s2, s3, s4
-; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000
-; GFX9-NEXT: s_xor_b32 s3, s1, -1
-; GFX9-NEXT: s_lshr_b32 s0, s0, s4
-; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000
-; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000
-; GFX9-NEXT: s_cmp_lt_u32 s3, s2
-; GFX9-NEXT: s_cselect_b32 s2, s3, s2
-; GFX9-NEXT: s_add_i32 s1, s1, s2
-; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX9-NEXT: s_movk_i32 s2, 0xff
-; GFX9-NEXT: s_lshr_b32 s1, s1, s4
-; GFX9-NEXT: s_and_b32 s1, s1, s2
-; GFX9-NEXT: s_and_b32 s0, s0, s2
-; GFX9-NEXT: s_lshl_b32 s1, s1, s4
-; GFX9-NEXT: s_or_b32 s0, s0, s1
+; GFX9-NEXT: s_lshr_b32 s2, s0, 8
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: s_lshl_b32 s0, s0, s4
+; GFX9-NEXT: s_lshl_b32 s1, s3, s4
+; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp
+; GFX9-NEXT: s_lshl_b32 s0, s2, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_add_u16_e64 v1, s0, v1 clamp
+; GFX9-NEXT: s_movk_i32 s0, 0xff
+; GFX9-NEXT: v_and_b32_sdwa v0, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_uaddsat_v2i8:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000
+; GFX10-NEXT: s_lshr_b32 s2, s0, 8
+; GFX10-NEXT: s_bfe_u32 s3, 8, 0x100000
+; GFX10-NEXT: s_lshr_b32 s4, s1, 8
+; GFX10-NEXT: s_lshl_b32 s0, s0, s3
+; GFX10-NEXT: s_lshl_b32 s1, s1, s3
+; GFX10-NEXT: s_lshl_b32 s2, s2, s3
+; GFX10-NEXT: s_lshl_b32 s3, s4, s3
+; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, s1 clamp
+; GFX10-NEXT: v_add_nc_u16_e64 v1, s2, s3 clamp
+; GFX10-NEXT: s_movk_i32 s0, 0xff
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_lshl_b32 s3, s0, s2
-; GFX10-NEXT: s_lshl_b32 s5, s1, s2
-; GFX10-NEXT: s_xor_b32 s4, s3, -1
-; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000
-; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000
-; GFX10-NEXT: s_lshr_b32 s0, s0, 8
-; GFX10-NEXT: s_lshr_b32 s1, s1, 8
-; GFX10-NEXT: s_cmp_lt_u32 s4, s5
-; GFX10-NEXT: s_cselect_b32 s4, s4, s5
-; GFX10-NEXT: s_lshl_b32 s0, s0, s2
-; GFX10-NEXT: s_add_i32 s3, s3, s4
-; GFX10-NEXT: s_xor_b32 s4, s0, -1
-; GFX10-NEXT: s_lshl_b32 s1, s1, s2
-; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000
-; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000
-; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX10-NEXT: s_lshr_b32 s3, s3, s2
-; GFX10-NEXT: s_cmp_lt_u32 s4, s1
-; GFX10-NEXT: s_cselect_b32 s1, s4, s1
-; GFX10-NEXT: s_add_i32 s0, s0, s1
-; GFX10-NEXT: s_movk_i32 s1, 0xff
-; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000
-; GFX10-NEXT: s_lshr_b32 s0, s0, s2
-; GFX10-NEXT: s_and_b32 s0, s0, s1
-; GFX10-NEXT: s_and_b32 s1, s3, s1
-; GFX10-NEXT: s_lshl_b32 s0, s0, s2
-; GFX10-NEXT: s_or_b32 s0, s1, s0
+; GFX10-NEXT: v_and_b32_sdwa v0, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
 %lhs = bitcast i16 %lhs.arg to <2 x i8>
 %rhs = bitcast i16 %rhs.arg to <2 x i8>
@@ -502,27 +414,19 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1
 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT: v_xor_b32_e32 v8, -1, v0
-; GFX8-NEXT: v_min_u16_e32 v1, v8, v1
-; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
-; GFX8-NEXT: v_xor_b32_e32 v1, -1, v3
-; GFX8-NEXT: v_min_u16_e32 v1, v1, v2
+; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp
+; GFX8-NEXT: v_add_u16_e64 v1, v3, v2 clamp
 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4
-; GFX8-NEXT: v_add_u16_e32 v1, v3, v1
 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6
-; GFX8-NEXT: v_xor_b32_e32 v4, -1, v2
-; GFX8-NEXT: v_min_u16_e32 v3, v4, v3
-; GFX8-NEXT: v_add_u16_e32 v2, v2, v3
+; GFX8-NEXT: v_add_u16_e64 v2, v2, v3 clamp
 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5
 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7
-; GFX8-NEXT: v_xor_b32_e32 v5, -1, v3
-; GFX8-NEXT: v_min_u16_e32 v4, v5, v4
-; GFX8-NEXT: v_add_u16_e32 v3, v3, v4
+; GFX8-NEXT: v_add_u16_e64 v3, v3, v4 clamp
 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
 ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
@@ -539,33 +443,25 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: s_mov_b32 s4, 8
 ; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1
 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_xor_b32_e32 v8, -1, v0
-; GFX9-NEXT: v_min_u16_e32 v1, v8, v1
-; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
-; GFX9-NEXT: v_xor_b32_e32 v1, -1, v2
-; GFX9-NEXT: v_min_u16_e32 v1, v1, v5
-; GFX9-NEXT: v_add_u16_e32 v1, v2, v1
+; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp
+; GFX9-NEXT: v_add_u16_e64 v1, v2, v5 clamp
 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v3
 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6
-; GFX9-NEXT: v_xor_b32_e32 v5, -1, v2
-; GFX9-NEXT: v_min_u16_e32 v3, v5, v3
-; GFX9-NEXT: v_add_u16_e32 v2, v2, v3
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v4
 ; GFX9-NEXT: s_movk_i32 s4, 0xff
-; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v7
-; GFX9-NEXT: v_xor_b32_e32 v5, -1, v3
 ; GFX9-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX9-NEXT: v_min_u16_e32 v4, v5, v4
+; GFX9-NEXT: v_add_u16_e64 v2, v2, v3 clamp
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v4
+; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v7
 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT: v_add_u16_e32 v3, v3, v4
+; GFX9-NEXT: v_add_u16_e64 v3, v3, v4 clamp
 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT: v_and_b32_sdwa v1, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX9-NEXT: v_and_b32_sdwa v2, v3, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
@@ -577,36 +473,28 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: s_mov_b32 s4, 8
-; GFX10-NEXT: v_lshlrev_b16_e64 v4, 8, v0
+; GFX10-NEXT: v_lshlrev_b16_e64 v5, 8, v0
 ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16_e64 v6, 8, v1
 ; GFX10-NEXT: s_mov_b32 s5, 16
-; GFX10-NEXT: s_mov_b32 s6, 24
-; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_xor_b32_e32 v5, -1, v2
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s6, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_lshlrev_b16_e64 v7, 8, v1
-; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4
-; GFX10-NEXT: v_xor_b32_e32 v11, -1, v6
-; GFX10-NEXT: v_min_u16_e64 v3, v5, v3
-; GFX10-NEXT: v_lshrrev_b32_sdwa v5, s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s6, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_min_u16_e64 v7, v8, v7
-; GFX10-NEXT: s_movk_i32 s4, 0xff
-; GFX10-NEXT: v_add_nc_u16_e64 v2, v2, v3
-; GFX10-NEXT: v_xor_b32_e32 v3, -1, v0
-; GFX10-NEXT: v_min_u16_e64 v5, v11, v5
-; GFX10-NEXT: v_add_nc_u16_e64 v4, v4, v7
+; GFX10-NEXT: s_mov_b32 s4, 24
+; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_nc_u16_e64 v2, v2, v3 clamp
+; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: s_movk_i32 s5, 0xff
+; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_add_nc_u16_e64 v5, v5, v6 clamp
+; GFX10-NEXT: v_add_nc_u16_e64 v3, v4, v3 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_and_b32_sdwa v2, v2, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT: v_min_u16_e64 v1, v3, v1
-; GFX10-NEXT: v_add_nc_u16_e64 v3, v6, v5
-; GFX10-NEXT: v_lshrrev_b16_e64 v4, 8, v4
+; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 clamp
 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1
-; GFX10-NEXT: v_and_b32_sdwa v1, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT: v_and_or_b32 v2, v4, s4, v2
-; GFX10-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b16_e64 v4, 8, v5
+; GFX10-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_or_b32 v2, v4, s5, v2
 ; GFX10-NEXT: v_or3_b32 v0, v2, v1, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %lhs = bitcast i32 %lhs.arg to <4 x i8>
@@ -669,188 +557,107 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-LABEL: s_uaddsat_v4i8:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_bfe_u32 s8, 8, 0x100000
-; GFX8-NEXT: s_lshr_b32 s2, s0, 8
-; GFX8-NEXT: s_lshr_b32 s3, s0, 16
-; GFX8-NEXT: s_lshr_b32 s4, s0, 24
-; GFX8-NEXT: s_lshl_b32 s0, s0, s8
-; GFX8-NEXT: s_xor_b32 s9, s0, -1
 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8
 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16
 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24
 ; GFX8-NEXT: s_lshl_b32 s1, s1, s8
-; GFX8-NEXT: s_bfe_u32 s9,
s9, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s9, s1 -; GFX8-NEXT: s_cselect_b32 s1, s9, s1 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, s8 -; GFX8-NEXT: s_lshl_b32 s2, s5, s8 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_xor_b32 s5, s1, -1 -; GFX8-NEXT: s_lshr_b32 s0, s0, s8 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s2 -; GFX8-NEXT: s_cselect_b32 s2, s5, s2 -; GFX8-NEXT: s_add_i32 s1, s1, s2 -; GFX8-NEXT: s_lshl_b32 s2, s3, s8 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s6, s8 -; GFX8-NEXT: s_xor_b32 s5, s2, -1 -; GFX8-NEXT: s_lshr_b32 s1, s1, s8 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s3 -; GFX8-NEXT: s_cselect_b32 s3, s5, s3 -; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: s_lshl_b32 s3, s4, s8 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s7, s8 -; GFX8-NEXT: s_xor_b32 s5, s3, -1 -; GFX8-NEXT: s_lshr_b32 s2, s2, s8 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s4 -; GFX8-NEXT: s_cselect_b32 s4, s5, s4 -; GFX8-NEXT: s_add_i32 s3, s3, s4 -; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_and_b32 s1, s1, s4 -; GFX8-NEXT: s_and_b32 s0, s0, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_lshr_b32 s3, s3, s8 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 -; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: s_lshl_b32 s1, s5, s8 +; GFX8-NEXT: s_lshr_b32 s2, s0, 8 +; GFX8-NEXT: s_lshr_b32 s3, s0, 16 +; GFX8-NEXT: s_lshr_b32 s4, s0, 24 +; GFX8-NEXT: s_lshl_b32 s0, s0, s8 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshl_b32 s0, s2, s8 +; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, 0xff +; GFX8-NEXT: s_lshl_b32 s1, s6, s8 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: s_lshl_b32 s0, s3, s8 +; GFX8-NEXT: s_lshl_b32 s1, s7, s8 +; GFX8-NEXT: v_add_u16_e64 v2, s0, v2 clamp +; GFX8-NEXT: s_lshl_b32 s0, s4, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_bfe_u32 s8, 8, 0x100000 -; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshr_b32 s4, s0, 24 -; GFX9-NEXT: s_lshl_b32 s0, s0, s8 -; GFX9-NEXT: s_xor_b32 s9, s0, -1 ; GFX9-NEXT: s_lshr_b32 s5, s1, 8 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, s8 -; GFX9-NEXT: 
s_bfe_u32 s9, s9, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s9, s1 -; GFX9-NEXT: s_cselect_b32 s1, s9, s1 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_lshl_b32 s1, s2, s8 -; GFX9-NEXT: s_lshl_b32 s2, s5, s8 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_xor_b32 s5, s1, -1 -; GFX9-NEXT: s_lshr_b32 s0, s0, s8 -; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s5, s2 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_add_i32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s2, s3, s8 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_lshl_b32 s3, s6, s8 -; GFX9-NEXT: s_xor_b32 s5, s2, -1 -; GFX9-NEXT: s_lshr_b32 s1, s1, s8 -; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s5, s3 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s3, s4, s8 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX9-NEXT: s_lshl_b32 s4, s7, s8 -; GFX9-NEXT: s_xor_b32 s5, s3, -1 -; GFX9-NEXT: s_lshr_b32 s2, s2, s8 -; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: s_and_b32 s1, s1, s4 -; GFX9-NEXT: s_and_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s2, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 16 -; GFX9-NEXT: s_lshr_b32 s3, s3, s8 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 -; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshl_b32 s1, s5, s8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-NEXT: s_lshl_b32 s0, s0, s8 +; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshl_b32 s0, s2, s8 +; GFX9-NEXT: s_lshl_b32 s1, s6, s8 +; GFX9-NEXT: v_add_u16_e64 v1, s0, v1 clamp +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_lshl_b32 s0, s3, s8 +; GFX9-NEXT: s_lshl_b32 s1, s7, s8 +; GFX9-NEXT: v_add_u16_e64 v2, s0, v2 clamp +; GFX9-NEXT: s_lshl_b32 s0, s4, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_u16_e64 v3, s0, v3 clamp +; GFX9-NEXT: s_movk_i32 s0, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v2, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_bfe_u32 s5, 8, 0x100000 ; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_lshr_b32 s6, s1, 8 +; GFX10-NEXT: s_lshl_b32 s2, s2, s5 +; GFX10-NEXT: s_lshl_b32 s6, s6, s5 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: v_add_nc_u16_e64 v1, s2, s6 clamp ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 +; GFX10-NEXT: s_movk_i32 s2, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, s5 -; GFX10-NEXT: s_lshl_b32 s9, s1, s5 -; GFX10-NEXT: s_xor_b32 s8, s0, -1 -; 
GFX10-NEXT: s_bfe_u32 s9, s9, 0x100000 -; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: s_lshr_b32 s7, s1, 16 +; GFX10-NEXT: s_lshl_b32 s7, s1, s5 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, s7 clamp +; GFX10-NEXT: s_lshr_b32 s0, s1, 16 ; GFX10-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-NEXT: s_cmp_lt_u32 s8, s9 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s8, s8, s9 -; GFX10-NEXT: s_lshl_b32 s2, s2, s5 -; GFX10-NEXT: s_add_i32 s0, s0, s8 -; GFX10-NEXT: s_xor_b32 s8, s2, -1 -; GFX10-NEXT: s_lshl_b32 s6, s6, s5 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX10-NEXT: s_lshr_b32 s0, s0, s5 -; GFX10-NEXT: s_cmp_lt_u32 s8, s6 -; GFX10-NEXT: s_cselect_b32 s6, s8, s6 ; GFX10-NEXT: s_lshl_b32 s3, s3, s5 -; GFX10-NEXT: s_add_i32 s2, s2, s6 -; GFX10-NEXT: s_xor_b32 s6, s3, -1 -; GFX10-NEXT: s_lshl_b32 s7, s7, s5 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX10-NEXT: s_lshr_b32 s2, s2, s5 -; GFX10-NEXT: s_cmp_lt_u32 s6, s7 -; GFX10-NEXT: s_cselect_b32 s6, s6, s7 +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 ; GFX10-NEXT: s_lshl_b32 s4, s4, s5 -; GFX10-NEXT: s_add_i32 s3, s3, s6 -; GFX10-NEXT: s_xor_b32 s6, s4, -1 ; GFX10-NEXT: s_lshl_b32 s1, s1, s5 -; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_lshr_b32 s3, s3, s5 -; GFX10-NEXT: s_cmp_lt_u32 s6, s1 -; GFX10-NEXT: s_cselect_b32 s1, s6, s1 -; GFX10-NEXT: s_add_i32 s4, s4, s1 -; GFX10-NEXT: s_bfe_u32 s1, s4, 0x100000 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: s_lshr_b32 s1, s1, s5 -; GFX10-NEXT: s_and_b32 s2, s2, s4 -; GFX10-NEXT: s_and_b32 s0, s0, s4 -; GFX10-NEXT: s_lshl_b32 s2, s2, 8 -; GFX10-NEXT: s_and_b32 s3, s3, s4 -; GFX10-NEXT: s_and_b32 s1, s1, s4 -; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: s_lshl_b32 s2, s3, 16 -; GFX10-NEXT: s_lshl_b32 s1, s1, 24 -; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_lshrrev_b16_e64 v0, 8, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX10-NEXT: v_add_nc_u16_e64 v2, s3, s0 clamp +; GFX10-NEXT: v_add_nc_u16_e64 v3, s4, s1 clamp +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v1 +; GFX10-NEXT: v_and_b32_sdwa v1, v2, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, v3, s2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> @@ -876,9 +683,7 @@ define i24 @v_uaddsat_i24(i24 %lhs, i24 %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v1 clamp ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -887,9 +692,7 @@ define i24 @v_uaddsat_i24(i24 %lhs, i24 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: 
v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u32_e64 v0, v0, v1 clamp ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -900,9 +703,7 @@ define i24 @v_uaddsat_i24(i24 %lhs, i24 %rhs) { ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX10-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v1 clamp ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i24 @llvm.uadd.sat.i24(i24 %lhs, i24 %rhs) @@ -923,36 +724,32 @@ define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; ; GFX8-LABEL: s_uaddsat_i24: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_not_b32 s2, s0 -; GFX8-NEXT: s_cmp_lt_u32 s2, s1 -; GFX8-NEXT: s_cselect_b32 s1, s2, s1 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_lshr_b32 s0, s0, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 clamp +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_i24: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 8 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_not_b32 s2, s0 -; GFX9-NEXT: s_cmp_lt_u32 s2, s1 -; GFX9-NEXT: s_cselect_b32 s1, s2, s1 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_i24: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshl_b32 s0, s0, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_not_b32 s2, s0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cmp_lt_u32 s2, s1 -; GFX10-NEXT: s_cselect_b32 s1, s2, s1 -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: s_lshr_b32 s0, s0, 8 +; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i24 @llvm.uadd.sat.i24(i24 %lhs, i24 %rhs) ret i24 %result @@ -970,27 +767,21 @@ define i32 @v_uaddsat_i32(i32 %lhs, i32 %rhs) { ; GFX8-LABEL: v_uaddsat_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v1 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u32_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uaddsat_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v0 +; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; 
GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result @@ -1007,27 +798,23 @@ define amdgpu_ps i32 @s_uaddsat_i32(i32 inreg %lhs, i32 inreg %rhs) { ; ; GFX8-LABEL: s_uaddsat_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_not_b32 s2, s0 -; GFX8-NEXT: s_cmp_lt_u32 s2, s1 -; GFX8-NEXT: s_cselect_b32 s1, s2, s1 -; GFX8-NEXT: s_add_i32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_not_b32 s2, s0 -; GFX9-NEXT: s_cmp_lt_u32 s2, s1 -; GFX9-NEXT: s_cselect_b32 s1, s2, s1 -; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_not_b32 s2, s0 +; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cmp_lt_u32 s2, s1 -; GFX10-NEXT: s_cselect_b32 s1, s2, s1 -; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result @@ -1043,24 +830,18 @@ define amdgpu_ps float @uaddsat_i32_sv(i32 inreg %lhs, i32 %rhs) { ; ; GFX8-LABEL: uaddsat_i32_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_not_b32 s1, s0 -; GFX8-NEXT: v_min_u32_e32 v0, s1, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_i32_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_not_b32 s1, s0 -; GFX9-NEXT: v_min_u32_e32 v0, s1, v0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: uaddsat_i32_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_not_b32 s1, s0 +; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, v0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u32_e32 v0, s1, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs) %cast = bitcast i32 %result to float @@ -1077,24 +858,18 @@ define amdgpu_ps float @uaddsat_i32_vs(i32 %lhs, i32 inreg %rhs) { ; ; GFX8-LABEL: uaddsat_i32_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v1, s0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], v0, s0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_i32_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v1, s0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u32_e64 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: uaddsat_i32_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, s0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u32_e32 v1, s0, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs) %cast = bitcast i32 %result to float @@ -1116,36 +891,24 @@ define <2 x i32> @v_uaddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX8-LABEL: v_uaddsat_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX8-NEXT: v_xor_b32_e32 v4, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v2, v4, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v1 -; GFX8-NEXT: v_min_u32_e32 v2, v2, v3 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v2 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v3 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v2, v4, v2 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v1 -; GFX9-NEXT: v_min_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u32_e64 v0, v0, v2 clamp +; GFX9-NEXT: v_add_u32_e64 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uaddsat_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v0 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v1 +; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v2 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u32_e32 v2, v4, v2 -; GFX10-NEXT: v_min_u32_e32 v3, v5, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ret <2 x i32> %result @@ -1166,39 +929,31 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre ; ; GFX8-LABEL: s_uaddsat_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_not_b32 s4, s0 -; GFX8-NEXT: s_cmp_lt_u32 s4, s2 -; GFX8-NEXT: s_cselect_b32 s2, s4, s2 -; GFX8-NEXT: s_add_i32 s0, s0, s2 -; GFX8-NEXT: s_not_b32 s2, s1 -; GFX8-NEXT: s_cmp_lt_u32 s2, s3 -; GFX8-NEXT: s_cselect_b32 s2, s2, s3 -; GFX8-NEXT: s_add_i32 s1, s1, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], s0, v0 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_not_b32 s4, s0 -; GFX9-NEXT: s_cmp_lt_u32 s4, s2 -; GFX9-NEXT: s_cselect_b32 s2, s4, s2 -; GFX9-NEXT: s_add_i32 s0, s0, s2 -; GFX9-NEXT: s_not_b32 s2, s1 -; GFX9-NEXT: s_cmp_lt_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s2, s3 -; GFX9-NEXT: s_add_i32 s1, s1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_not_b32 s4, s0 +; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s2 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v1, s1, s3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cmp_lt_u32 s4, s2 -; GFX10-NEXT: s_cselect_b32 s2, s4, s2 -; GFX10-NEXT: s_not_b32 s4, s1 -; GFX10-NEXT: s_add_i32 s0, s0, s2 -; GFX10-NEXT: s_cmp_lt_u32 s4, s3 -; GFX10-NEXT: s_cselect_b32 s2, s4, s3 -; GFX10-NEXT: s_add_i32 s1, s1, s2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ret <2 x i32> 
%result @@ -1222,45 +977,27 @@ define <3 x i32> @v_uaddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX8-LABEL: v_uaddsat_v3i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v3, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 -; GFX8-NEXT: v_xor_b32_e32 v3, -1, v1 -; GFX8-NEXT: v_min_u32_e32 v3, v3, v4 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; GFX8-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX8-NEXT: v_min_u32_e32 v3, v3, v5 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v3 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v4 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v5 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v3i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v6, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v3, v6, v3 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v1 -; GFX9-NEXT: v_min_u32_e32 v3, v3, v4 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX9-NEXT: v_min_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u32_e64 v0, v0, v3 clamp +; GFX9-NEXT: v_add_u32_e64 v1, v1, v4 clamp +; GFX9-NEXT: v_add_u32_e64 v2, v2, v5 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uaddsat_v3i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v0 -; GFX10-NEXT: v_xor_b32_e32 v7, -1, v1 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 +; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v3 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v4 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u32_e32 v3, v6, v3 -; GFX10-NEXT: v_min_u32_e32 v4, v7, v4 -; GFX10-NEXT: v_min_u32_e32 v5, v8, v5 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) ret <3 x i32> %result @@ -1285,51 +1022,39 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre ; ; GFX8-LABEL: s_uaddsat_v3i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_not_b32 s6, s0 -; GFX8-NEXT: s_cmp_lt_u32 s6, s3 -; GFX8-NEXT: s_cselect_b32 s3, s6, s3 -; GFX8-NEXT: s_add_i32 s0, s0, s3 -; GFX8-NEXT: s_not_b32 s3, s1 -; GFX8-NEXT: s_cmp_lt_u32 s3, s4 -; GFX8-NEXT: s_cselect_b32 s3, s3, s4 -; GFX8-NEXT: s_add_i32 s1, s1, s3 -; GFX8-NEXT: s_not_b32 s3, s2 -; GFX8-NEXT: s_cmp_lt_u32 s3, s5 -; GFX8-NEXT: s_cselect_b32 s3, s3, s5 -; GFX8-NEXT: s_add_i32 s2, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_add_u32_e64 v0, s[6:7], s0, v0 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], s2, v2 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_not_b32 s6, s0 -; GFX9-NEXT: s_cmp_lt_u32 s6, s3 -; GFX9-NEXT: s_cselect_b32 s3, s6, s3 -; GFX9-NEXT: s_add_i32 s0, s0, s3 -; GFX9-NEXT: s_not_b32 s3, s1 -; GFX9-NEXT: s_cmp_lt_u32 s3, s4 -; GFX9-NEXT: s_cselect_b32 s3, s3, s4 -; GFX9-NEXT: s_add_i32 s1, s1, s3 -; 
GFX9-NEXT: s_not_b32 s3, s2 -; GFX9-NEXT: s_cmp_lt_u32 s3, s5 -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_v3i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_not_b32 s6, s0 +; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s3 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v1, s1, s4 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v2, s2, s5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cmp_lt_u32 s6, s3 -; GFX10-NEXT: s_cselect_b32 s3, s6, s3 -; GFX10-NEXT: s_not_b32 s6, s1 -; GFX10-NEXT: s_add_i32 s0, s0, s3 -; GFX10-NEXT: s_cmp_lt_u32 s6, s4 -; GFX10-NEXT: s_cselect_b32 s3, s6, s4 -; GFX10-NEXT: s_not_b32 s4, s2 -; GFX10-NEXT: s_add_i32 s1, s1, s3 -; GFX10-NEXT: s_cmp_lt_u32 s4, s5 -; GFX10-NEXT: s_cselect_b32 s3, s4, s5 -; GFX10-NEXT: s_add_i32 s2, s2, s3 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: ; return to shader part epilog %result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) ret <3 x i32> %result @@ -1356,54 +1081,30 @@ define <4 x i32> @v_uaddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX8-LABEL: v_uaddsat_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v8, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v4, v8, v4 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v1 -; GFX8-NEXT: v_min_u32_e32 v4, v4, v5 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v2 -; GFX8-NEXT: v_min_u32_e32 v4, v4, v6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v3 -; GFX8-NEXT: v_min_u32_e32 v4, v4, v7 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v4 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v5 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v6 clamp +; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v7 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v8, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v4, v8, v4 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v1 -; GFX9-NEXT: v_min_u32_e32 v4, v4, v5 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v2 -; GFX9-NEXT: v_min_u32_e32 v4, v4, v6 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v3 -; GFX9-NEXT: v_min_u32_e32 v4, v4, v7 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-NEXT: v_add_u32_e64 v0, v0, v4 clamp +; GFX9-NEXT: v_add_u32_e64 v1, v1, v5 clamp +; GFX9-NEXT: v_add_u32_e64 v2, v2, v6 clamp +; GFX9-NEXT: v_add_u32_e64 v3, v3, v7 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uaddsat_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v15, -1, v0 -; GFX10-NEXT: v_xor_b32_e32 v19, -1, v1 -; GFX10-NEXT: v_xor_b32_e32 v23, -1, v2 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v3 +; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v4 clamp +; 
GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v5 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v6 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v3, v3, v7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u32_e32 v11, v15, v4 -; GFX10-NEXT: v_min_u32_e32 v15, v19, v5 -; GFX10-NEXT: v_min_u32_e32 v19, v23, v6 -; GFX10-NEXT: v_min_u32_e32 v6, v10, v7 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v11 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v15 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v19 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ret <4 x i32> %result @@ -1432,63 +1133,47 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre ; ; GFX8-LABEL: s_uaddsat_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_not_b32 s8, s0 -; GFX8-NEXT: s_cmp_lt_u32 s8, s4 -; GFX8-NEXT: s_cselect_b32 s4, s8, s4 -; GFX8-NEXT: s_add_i32 s0, s0, s4 -; GFX8-NEXT: s_not_b32 s4, s1 -; GFX8-NEXT: s_cmp_lt_u32 s4, s5 -; GFX8-NEXT: s_cselect_b32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s1, s1, s4 -; GFX8-NEXT: s_not_b32 s4, s2 -; GFX8-NEXT: s_cmp_lt_u32 s4, s6 -; GFX8-NEXT: s_cselect_b32 s4, s4, s6 -; GFX8-NEXT: s_add_i32 s2, s2, s4 -; GFX8-NEXT: s_not_b32 s4, s3 -; GFX8-NEXT: s_cmp_lt_u32 s4, s7 -; GFX8-NEXT: s_cselect_b32 s4, s4, s7 -; GFX8-NEXT: s_add_i32 s3, s3, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_add_u32_e64 v0, s[8:9], s0, v0 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], s2, v2 clamp +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], s3, v3 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_not_b32 s8, s0 -; GFX9-NEXT: s_cmp_lt_u32 s8, s4 -; GFX9-NEXT: s_cselect_b32 s4, s8, s4 -; GFX9-NEXT: s_add_i32 s0, s0, s4 -; GFX9-NEXT: s_not_b32 s4, s1 -; GFX9-NEXT: s_cmp_lt_u32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_not_b32 s4, s2 -; GFX9-NEXT: s_cmp_lt_u32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s4, s4, s6 -; GFX9-NEXT: s_add_i32 s2, s2, s4 -; GFX9-NEXT: s_not_b32 s4, s3 -; GFX9-NEXT: s_cmp_lt_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s4, s4, s7 -; GFX9-NEXT: s_add_i32 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_add_u32_e64 v3, s3, v3 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_not_b32 s8, s0 +; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s4 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v1, s1, s5 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v2, s2, s6 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v3, s3, s7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cmp_lt_u32 s8, s4 -; GFX10-NEXT: s_cselect_b32 s4, s8, s4 -; GFX10-NEXT: s_not_b32 s8, s1 -; GFX10-NEXT: s_add_i32 s0, s0, s4 -; GFX10-NEXT: s_cmp_lt_u32 
s8, s5 -; GFX10-NEXT: s_cselect_b32 s4, s8, s5 -; GFX10-NEXT: s_not_b32 s5, s2 -; GFX10-NEXT: s_add_i32 s1, s1, s4 -; GFX10-NEXT: s_cmp_lt_u32 s5, s6 -; GFX10-NEXT: s_cselect_b32 s4, s5, s6 -; GFX10-NEXT: s_not_b32 s5, s3 -; GFX10-NEXT: s_add_i32 s2, s2, s4 -; GFX10-NEXT: s_cmp_lt_u32 s5, s7 -; GFX10-NEXT: s_cselect_b32 s4, s5, s7 -; GFX10-NEXT: s_add_i32 s3, s3, s4 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog %result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ret <4 x i32> %result @@ -1518,62 +1203,32 @@ define <5 x i32> @v_uaddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX8-LABEL: v_uaddsat_v5i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v10, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v5, v10, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v1 -; GFX8-NEXT: v_min_u32_e32 v5, v5, v6 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v2 -; GFX8-NEXT: v_min_u32_e32 v5, v5, v7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v3 -; GFX8-NEXT: v_min_u32_e32 v5, v5, v8 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v4 -; GFX8-NEXT: v_min_u32_e32 v5, v5, v9 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v5 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v6 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v7 clamp +; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v8 clamp +; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v4, v9 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v5i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v10, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v5, v10, v5 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v5 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v1 -; GFX9-NEXT: v_min_u32_e32 v5, v5, v6 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v5 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v2 -; GFX9-NEXT: v_min_u32_e32 v5, v5, v7 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v3 -; GFX9-NEXT: v_min_u32_e32 v5, v5, v8 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v4 -; GFX9-NEXT: v_min_u32_e32 v5, v5, v9 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v5 +; GFX9-NEXT: v_add_u32_e64 v0, v0, v5 clamp +; GFX9-NEXT: v_add_u32_e64 v1, v1, v6 clamp +; GFX9-NEXT: v_add_u32_e64 v2, v2, v7 clamp +; GFX9-NEXT: v_add_u32_e64 v3, v3, v8 clamp +; GFX9-NEXT: v_add_u32_e64 v4, v4, v9 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uaddsat_v5i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v0 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v1 -; GFX10-NEXT: v_xor_b32_e32 v12, -1, v2 -; GFX10-NEXT: v_xor_b32_e32 v13, -1, v3 -; GFX10-NEXT: v_xor_b32_e32 v14, -1, v4 -; GFX10-NEXT: v_min_u32_e32 v5, v10, v5 -; GFX10-NEXT: v_min_u32_e32 v6, v11, v6 -; GFX10-NEXT: v_min_u32_e32 v7, v12, v7 -; GFX10-NEXT: v_min_u32_e32 v8, v13, v8 -; GFX10-NEXT: v_min_u32_e32 v9, v14, v9 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v5 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v6 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v7 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v9 +; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v5 clamp 
+; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v6 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v7 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v3, v3, v8 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v4, v4, v9 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <5 x i32> @llvm.uadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) @@ -1607,75 +1262,55 @@ define amdgpu_ps <5 x i32> @s_uaddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; ; GFX8-LABEL: s_uaddsat_v5i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_not_b32 s10, s0 -; GFX8-NEXT: s_cmp_lt_u32 s10, s5 -; GFX8-NEXT: s_cselect_b32 s5, s10, s5 -; GFX8-NEXT: s_add_i32 s0, s0, s5 -; GFX8-NEXT: s_not_b32 s5, s1 -; GFX8-NEXT: s_cmp_lt_u32 s5, s6 -; GFX8-NEXT: s_cselect_b32 s5, s5, s6 -; GFX8-NEXT: s_add_i32 s1, s1, s5 -; GFX8-NEXT: s_not_b32 s5, s2 -; GFX8-NEXT: s_cmp_lt_u32 s5, s7 -; GFX8-NEXT: s_cselect_b32 s5, s5, s7 -; GFX8-NEXT: s_add_i32 s2, s2, s5 -; GFX8-NEXT: s_not_b32 s5, s3 -; GFX8-NEXT: s_cmp_lt_u32 s5, s8 -; GFX8-NEXT: s_cselect_b32 s5, s5, s8 -; GFX8-NEXT: s_add_i32 s3, s3, s5 -; GFX8-NEXT: s_not_b32 s5, s4 -; GFX8-NEXT: s_cmp_lt_u32 s5, s9 -; GFX8-NEXT: s_cselect_b32 s5, s5, s9 -; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: v_add_u32_e64 v0, s[10:11], s0, v0 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], s2, v2 clamp +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], s3, v3 clamp +; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], s4, v4 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v5i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_not_b32 s10, s0 -; GFX9-NEXT: s_cmp_lt_u32 s10, s5 -; GFX9-NEXT: s_cselect_b32 s5, s10, s5 -; GFX9-NEXT: s_add_i32 s0, s0, s5 -; GFX9-NEXT: s_not_b32 s5, s1 -; GFX9-NEXT: s_cmp_lt_u32 s5, s6 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_add_i32 s1, s1, s5 -; GFX9-NEXT: s_not_b32 s5, s2 -; GFX9-NEXT: s_cmp_lt_u32 s5, s7 -; GFX9-NEXT: s_cselect_b32 s5, s5, s7 -; GFX9-NEXT: s_add_i32 s2, s2, s5 -; GFX9-NEXT: s_not_b32 s5, s3 -; GFX9-NEXT: s_cmp_lt_u32 s5, s8 -; GFX9-NEXT: s_cselect_b32 s5, s5, s8 -; GFX9-NEXT: s_add_i32 s3, s3, s5 -; GFX9-NEXT: s_not_b32 s5, s4 -; GFX9-NEXT: s_cmp_lt_u32 s5, s9 -; GFX9-NEXT: s_cselect_b32 s5, s5, s9 -; GFX9-NEXT: s_add_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_add_u32_e64 v3, s3, v3 clamp +; GFX9-NEXT: v_add_u32_e64 v4, s4, v4 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_v5i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_not_b32 s10, s0 +; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s5 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v1, s1, s6 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v2, s2, s7 clamp +; GFX10-NEXT: v_add_nc_u32_e64 
v3, s3, s8 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v4, s4, s9 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cmp_lt_u32 s10, s5 -; GFX10-NEXT: s_cselect_b32 s5, s10, s5 -; GFX10-NEXT: s_not_b32 s10, s1 -; GFX10-NEXT: s_add_i32 s0, s0, s5 -; GFX10-NEXT: s_cmp_lt_u32 s10, s6 -; GFX10-NEXT: s_cselect_b32 s5, s10, s6 -; GFX10-NEXT: s_not_b32 s6, s2 -; GFX10-NEXT: s_add_i32 s1, s1, s5 -; GFX10-NEXT: s_cmp_lt_u32 s6, s7 -; GFX10-NEXT: s_cselect_b32 s5, s6, s7 -; GFX10-NEXT: s_not_b32 s6, s3 -; GFX10-NEXT: s_add_i32 s2, s2, s5 -; GFX10-NEXT: s_cmp_lt_u32 s6, s8 -; GFX10-NEXT: s_cselect_b32 s5, s6, s8 -; GFX10-NEXT: s_not_b32 s6, s4 -; GFX10-NEXT: s_add_i32 s3, s3, s5 -; GFX10-NEXT: s_cmp_lt_u32 s6, s9 -; GFX10-NEXT: s_cselect_b32 s5, s6, s9 -; GFX10-NEXT: s_add_i32 s4, s4, s5 ; GFX10-NEXT: ; return to shader part epilog %result = call <5 x i32> @llvm.uadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) ret <5 x i32> %result @@ -1738,162 +1373,66 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-LABEL: v_uaddsat_v16i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v32, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v16, v32, v16 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v1 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v17 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v2 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v18 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v3 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v19 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v4 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v20 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v5 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v21 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v6 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v22 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v7 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v23 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v8 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v24 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v9 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v25 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v10 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v26 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v11 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v27 -; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v12 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v28 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v13 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v29 -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v14 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v30 -; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v15 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v31 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v16 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v16 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v17 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v18 
clamp +; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v19 clamp +; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v4, v20 clamp +; GFX8-NEXT: v_add_u32_e64 v5, s[4:5], v5, v21 clamp +; GFX8-NEXT: v_add_u32_e64 v6, s[4:5], v6, v22 clamp +; GFX8-NEXT: v_add_u32_e64 v7, s[4:5], v7, v23 clamp +; GFX8-NEXT: v_add_u32_e64 v8, s[4:5], v8, v24 clamp +; GFX8-NEXT: v_add_u32_e64 v9, s[4:5], v9, v25 clamp +; GFX8-NEXT: v_add_u32_e64 v10, s[4:5], v10, v26 clamp +; GFX8-NEXT: v_add_u32_e64 v11, s[4:5], v11, v27 clamp +; GFX8-NEXT: v_add_u32_e64 v12, s[4:5], v12, v28 clamp +; GFX8-NEXT: v_add_u32_e64 v13, s[4:5], v13, v29 clamp +; GFX8-NEXT: v_add_u32_e64 v14, s[4:5], v14, v30 clamp +; GFX8-NEXT: v_add_u32_e64 v15, s[4:5], v15, v31 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v16i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v32, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v16, v32, v16 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v1 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v17 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v2 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v18 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v3 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v19 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v4 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v20 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v5 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v21 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v6 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v22 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v7 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v23 -; GFX9-NEXT: v_add_u32_e32 v7, v7, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v8 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v24 -; GFX9-NEXT: v_add_u32_e32 v8, v8, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v9 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v25 -; GFX9-NEXT: v_add_u32_e32 v9, v9, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v10 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v26 -; GFX9-NEXT: v_add_u32_e32 v10, v10, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v11 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v27 -; GFX9-NEXT: v_add_u32_e32 v11, v11, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v12 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v28 -; GFX9-NEXT: v_add_u32_e32 v12, v12, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v13 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v29 -; GFX9-NEXT: v_add_u32_e32 v13, v13, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v14 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v30 -; GFX9-NEXT: v_add_u32_e32 v14, v14, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v15 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v31 -; GFX9-NEXT: v_add_u32_e32 v15, v15, v16 +; GFX9-NEXT: v_add_u32_e64 v0, v0, v16 clamp +; GFX9-NEXT: v_add_u32_e64 v1, v1, v17 clamp +; GFX9-NEXT: v_add_u32_e64 v2, v2, v18 clamp +; GFX9-NEXT: v_add_u32_e64 v3, v3, v19 clamp +; GFX9-NEXT: v_add_u32_e64 v4, v4, v20 clamp +; GFX9-NEXT: v_add_u32_e64 v5, v5, v21 clamp +; GFX9-NEXT: v_add_u32_e64 v6, v6, v22 clamp +; GFX9-NEXT: v_add_u32_e64 v7, v7, v23 clamp +; GFX9-NEXT: v_add_u32_e64 v8, v8, v24 clamp +; GFX9-NEXT: v_add_u32_e64 v9, v9, v25 clamp +; GFX9-NEXT: v_add_u32_e64 v10, v10, v26 clamp +; GFX9-NEXT: v_add_u32_e64 v11, v11, v27 clamp +; GFX9-NEXT: v_add_u32_e64 v12, v12, v28 clamp +; GFX9-NEXT: v_add_u32_e64 v13, v13, v29 clamp +; GFX9-NEXT: v_add_u32_e64 v14, v14, v30 clamp +; GFX9-NEXT: 
v_add_u32_e64 v15, v15, v31 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uaddsat_v16i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v35, -1, v0 -; GFX10-NEXT: v_xor_b32_e32 v32, -1, v2 -; GFX10-NEXT: v_xor_b32_e32 v33, -1, v3 -; GFX10-NEXT: v_xor_b32_e32 v34, -1, v4 +; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v16 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v17 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v18 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v3, v3, v19 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v4, v4, v20 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v5, v5, v21 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v6, v6, v22 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v7, v7, v23 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v8, v8, v24 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v9, v9, v25 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v10, v10, v26 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v11, v11, v27 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v12, v12, v28 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v13, v13, v29 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v14, v14, v30 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v15, v15, v31 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u32_e32 v35, v35, v16 -; GFX10-NEXT: v_xor_b32_e32 v16, -1, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v35 -; GFX10-NEXT: v_xor_b32_e32 v35, -1, v5 -; GFX10-NEXT: v_min_u32_e32 v16, v16, v17 -; GFX10-NEXT: v_min_u32_e32 v17, v32, v18 -; GFX10-NEXT: v_min_u32_e32 v18, v33, v19 -; GFX10-NEXT: v_min_u32_e32 v19, v34, v20 -; GFX10-NEXT: v_min_u32_e32 v20, v35, v21 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v16 -; GFX10-NEXT: v_xor_b32_e32 v16, -1, v6 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v17 -; GFX10-NEXT: v_xor_b32_e32 v17, -1, v7 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v18 -; GFX10-NEXT: v_xor_b32_e32 v18, -1, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v19 -; GFX10-NEXT: v_xor_b32_e32 v19, -1, v9 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v5, v20 -; GFX10-NEXT: v_xor_b32_e32 v20, -1, v10 -; GFX10-NEXT: v_min_u32_e32 v16, v16, v22 -; GFX10-NEXT: v_min_u32_e32 v17, v17, v23 -; GFX10-NEXT: v_min_u32_e32 v18, v18, v24 -; GFX10-NEXT: v_min_u32_e32 v19, v19, v25 -; GFX10-NEXT: v_min_u32_e32 v20, v20, v26 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v16 -; GFX10-NEXT: v_xor_b32_e32 v16, -1, v11 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v17 -; GFX10-NEXT: v_xor_b32_e32 v17, -1, v12 -; GFX10-NEXT: v_add_nc_u32_e32 v8, v8, v18 -; GFX10-NEXT: v_xor_b32_e32 v18, -1, v13 -; GFX10-NEXT: v_add_nc_u32_e32 v9, v9, v19 -; GFX10-NEXT: v_xor_b32_e32 v19, -1, v14 -; GFX10-NEXT: v_add_nc_u32_e32 v10, v10, v20 -; GFX10-NEXT: v_xor_b32_e32 v20, -1, v15 -; GFX10-NEXT: v_min_u32_e32 v16, v16, v27 -; GFX10-NEXT: v_min_u32_e32 v17, v17, v28 -; GFX10-NEXT: v_min_u32_e32 v18, v18, v29 -; GFX10-NEXT: v_min_u32_e32 v19, v19, v30 -; GFX10-NEXT: v_min_u32_e32 v20, v20, v31 -; GFX10-NEXT: v_add_nc_u32_e32 v11, v11, v16 -; GFX10-NEXT: v_add_nc_u32_e32 v12, v12, v17 -; GFX10-NEXT: v_add_nc_u32_e32 v13, v13, v18 -; GFX10-NEXT: v_add_nc_u32_e32 v14, v14, v19 -; GFX10-NEXT: v_add_nc_u32_e32 v15, v15, v20 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result @@ -1970,207 +1509,143 @@ define amdgpu_ps <16 x i32> @s_uaddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; ; GFX8-LABEL: s_uaddsat_v16i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_not_b32 s32, s0 -; GFX8-NEXT: s_cmp_lt_u32 s32, s16 -; GFX8-NEXT: s_cselect_b32 s16, s32, s16 -; GFX8-NEXT: 
s_add_i32 s0, s0, s16 -; GFX8-NEXT: s_not_b32 s16, s1 -; GFX8-NEXT: s_cmp_lt_u32 s16, s17 -; GFX8-NEXT: s_cselect_b32 s16, s16, s17 -; GFX8-NEXT: s_add_i32 s1, s1, s16 -; GFX8-NEXT: s_not_b32 s16, s2 -; GFX8-NEXT: s_cmp_lt_u32 s16, s18 -; GFX8-NEXT: s_cselect_b32 s16, s16, s18 -; GFX8-NEXT: s_add_i32 s2, s2, s16 -; GFX8-NEXT: s_not_b32 s16, s3 -; GFX8-NEXT: s_cmp_lt_u32 s16, s19 -; GFX8-NEXT: s_cselect_b32 s16, s16, s19 -; GFX8-NEXT: s_add_i32 s3, s3, s16 -; GFX8-NEXT: s_not_b32 s16, s4 -; GFX8-NEXT: s_cmp_lt_u32 s16, s20 -; GFX8-NEXT: s_cselect_b32 s16, s16, s20 -; GFX8-NEXT: s_add_i32 s4, s4, s16 -; GFX8-NEXT: s_not_b32 s16, s5 -; GFX8-NEXT: s_cmp_lt_u32 s16, s21 -; GFX8-NEXT: s_cselect_b32 s16, s16, s21 -; GFX8-NEXT: s_add_i32 s5, s5, s16 -; GFX8-NEXT: s_not_b32 s16, s6 -; GFX8-NEXT: s_cmp_lt_u32 s16, s22 -; GFX8-NEXT: s_cselect_b32 s16, s16, s22 -; GFX8-NEXT: s_add_i32 s6, s6, s16 -; GFX8-NEXT: s_not_b32 s16, s7 -; GFX8-NEXT: s_cmp_lt_u32 s16, s23 -; GFX8-NEXT: s_cselect_b32 s16, s16, s23 -; GFX8-NEXT: s_add_i32 s7, s7, s16 -; GFX8-NEXT: s_not_b32 s16, s8 -; GFX8-NEXT: s_cmp_lt_u32 s16, s24 -; GFX8-NEXT: s_cselect_b32 s16, s16, s24 -; GFX8-NEXT: s_add_i32 s8, s8, s16 -; GFX8-NEXT: s_not_b32 s16, s9 -; GFX8-NEXT: s_cmp_lt_u32 s16, s25 -; GFX8-NEXT: s_cselect_b32 s16, s16, s25 -; GFX8-NEXT: s_add_i32 s9, s9, s16 -; GFX8-NEXT: s_not_b32 s16, s10 -; GFX8-NEXT: s_cmp_lt_u32 s16, s26 -; GFX8-NEXT: s_cselect_b32 s16, s16, s26 -; GFX8-NEXT: s_add_i32 s10, s10, s16 -; GFX8-NEXT: s_not_b32 s16, s11 -; GFX8-NEXT: s_cmp_lt_u32 s16, s27 -; GFX8-NEXT: s_cselect_b32 s16, s16, s27 -; GFX8-NEXT: s_add_i32 s11, s11, s16 -; GFX8-NEXT: s_not_b32 s16, s12 -; GFX8-NEXT: s_cmp_lt_u32 s16, s28 -; GFX8-NEXT: s_cselect_b32 s16, s16, s28 -; GFX8-NEXT: s_add_i32 s12, s12, s16 -; GFX8-NEXT: s_not_b32 s16, s13 -; GFX8-NEXT: s_cmp_lt_u32 s16, s29 -; GFX8-NEXT: s_cselect_b32 s16, s16, s29 -; GFX8-NEXT: s_add_i32 s13, s13, s16 -; GFX8-NEXT: s_not_b32 s16, s14 -; GFX8-NEXT: s_cmp_lt_u32 s16, s30 -; GFX8-NEXT: s_cselect_b32 s16, s16, s30 -; GFX8-NEXT: s_add_i32 s14, s14, s16 -; GFX8-NEXT: s_not_b32 s16, s15 -; GFX8-NEXT: s_cmp_lt_u32 s16, s31 -; GFX8-NEXT: s_cselect_b32 s16, s16, s31 -; GFX8-NEXT: s_add_i32 s15, s15, s16 +; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NEXT: v_mov_b32_e32 v5, s21 +; GFX8-NEXT: v_mov_b32_e32 v6, s22 +; GFX8-NEXT: v_mov_b32_e32 v7, s23 +; GFX8-NEXT: v_mov_b32_e32 v8, s24 +; GFX8-NEXT: v_mov_b32_e32 v9, s25 +; GFX8-NEXT: v_mov_b32_e32 v10, s26 +; GFX8-NEXT: v_mov_b32_e32 v11, s27 +; GFX8-NEXT: v_mov_b32_e32 v12, s28 +; GFX8-NEXT: v_mov_b32_e32 v13, s29 +; GFX8-NEXT: v_mov_b32_e32 v14, s30 +; GFX8-NEXT: v_mov_b32_e32 v15, s31 +; GFX8-NEXT: v_add_u32_e64 v0, s[32:33], s0, v0 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[16:17], s1, v1 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[16:17], s2, v2 clamp +; GFX8-NEXT: v_add_u32_e64 v3, s[2:3], s3, v3 clamp +; GFX8-NEXT: v_add_u32_e64 v4, s[2:3], s4, v4 clamp +; GFX8-NEXT: v_add_u32_e64 v5, s[2:3], s5, v5 clamp +; GFX8-NEXT: v_add_u32_e64 v6, s[2:3], s6, v6 clamp +; GFX8-NEXT: v_add_u32_e64 v7, s[2:3], s7, v7 clamp +; GFX8-NEXT: v_add_u32_e64 v8, s[2:3], s8, v8 clamp +; GFX8-NEXT: v_add_u32_e64 v9, s[2:3], s9, v9 clamp +; GFX8-NEXT: v_add_u32_e64 v10, s[2:3], s10, v10 clamp +; GFX8-NEXT: v_add_u32_e64 v11, s[2:3], s11, v11 clamp +; GFX8-NEXT: v_add_u32_e64 v12, s[2:3], s12, v12 clamp +; GFX8-NEXT: v_add_u32_e64 v13, 
s[2:3], s13, v13 clamp +; GFX8-NEXT: v_add_u32_e64 v14, s[2:3], s14, v14 clamp +; GFX8-NEXT: v_add_u32_e64 v15, s[2:3], s15, v15 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8-NEXT: v_readfirstlane_b32 s5, v5 +; GFX8-NEXT: v_readfirstlane_b32 s6, v6 +; GFX8-NEXT: v_readfirstlane_b32 s7, v7 +; GFX8-NEXT: v_readfirstlane_b32 s8, v8 +; GFX8-NEXT: v_readfirstlane_b32 s9, v9 +; GFX8-NEXT: v_readfirstlane_b32 s10, v10 +; GFX8-NEXT: v_readfirstlane_b32 s11, v11 +; GFX8-NEXT: v_readfirstlane_b32 s12, v12 +; GFX8-NEXT: v_readfirstlane_b32 s13, v13 +; GFX8-NEXT: v_readfirstlane_b32 s14, v14 +; GFX8-NEXT: v_readfirstlane_b32 s15, v15 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v16i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_not_b32 s32, s0 -; GFX9-NEXT: s_cmp_lt_u32 s32, s16 -; GFX9-NEXT: s_cselect_b32 s16, s32, s16 -; GFX9-NEXT: s_add_i32 s0, s0, s16 -; GFX9-NEXT: s_not_b32 s16, s1 -; GFX9-NEXT: s_cmp_lt_u32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_add_i32 s1, s1, s16 -; GFX9-NEXT: s_not_b32 s16, s2 -; GFX9-NEXT: s_cmp_lt_u32 s16, s18 -; GFX9-NEXT: s_cselect_b32 s16, s16, s18 -; GFX9-NEXT: s_add_i32 s2, s2, s16 -; GFX9-NEXT: s_not_b32 s16, s3 -; GFX9-NEXT: s_cmp_lt_u32 s16, s19 -; GFX9-NEXT: s_cselect_b32 s16, s16, s19 -; GFX9-NEXT: s_add_i32 s3, s3, s16 -; GFX9-NEXT: s_not_b32 s16, s4 -; GFX9-NEXT: s_cmp_lt_u32 s16, s20 -; GFX9-NEXT: s_cselect_b32 s16, s16, s20 -; GFX9-NEXT: s_add_i32 s4, s4, s16 -; GFX9-NEXT: s_not_b32 s16, s5 -; GFX9-NEXT: s_cmp_lt_u32 s16, s21 -; GFX9-NEXT: s_cselect_b32 s16, s16, s21 -; GFX9-NEXT: s_add_i32 s5, s5, s16 -; GFX9-NEXT: s_not_b32 s16, s6 -; GFX9-NEXT: s_cmp_lt_u32 s16, s22 -; GFX9-NEXT: s_cselect_b32 s16, s16, s22 -; GFX9-NEXT: s_add_i32 s6, s6, s16 -; GFX9-NEXT: s_not_b32 s16, s7 -; GFX9-NEXT: s_cmp_lt_u32 s16, s23 -; GFX9-NEXT: s_cselect_b32 s16, s16, s23 -; GFX9-NEXT: s_add_i32 s7, s7, s16 -; GFX9-NEXT: s_not_b32 s16, s8 -; GFX9-NEXT: s_cmp_lt_u32 s16, s24 -; GFX9-NEXT: s_cselect_b32 s16, s16, s24 -; GFX9-NEXT: s_add_i32 s8, s8, s16 -; GFX9-NEXT: s_not_b32 s16, s9 -; GFX9-NEXT: s_cmp_lt_u32 s16, s25 -; GFX9-NEXT: s_cselect_b32 s16, s16, s25 -; GFX9-NEXT: s_add_i32 s9, s9, s16 -; GFX9-NEXT: s_not_b32 s16, s10 -; GFX9-NEXT: s_cmp_lt_u32 s16, s26 -; GFX9-NEXT: s_cselect_b32 s16, s16, s26 -; GFX9-NEXT: s_add_i32 s10, s10, s16 -; GFX9-NEXT: s_not_b32 s16, s11 -; GFX9-NEXT: s_cmp_lt_u32 s16, s27 -; GFX9-NEXT: s_cselect_b32 s16, s16, s27 -; GFX9-NEXT: s_add_i32 s11, s11, s16 -; GFX9-NEXT: s_not_b32 s16, s12 -; GFX9-NEXT: s_cmp_lt_u32 s16, s28 -; GFX9-NEXT: s_cselect_b32 s16, s16, s28 -; GFX9-NEXT: s_add_i32 s12, s12, s16 -; GFX9-NEXT: s_not_b32 s16, s13 -; GFX9-NEXT: s_cmp_lt_u32 s16, s29 -; GFX9-NEXT: s_cselect_b32 s16, s16, s29 -; GFX9-NEXT: s_add_i32 s13, s13, s16 -; GFX9-NEXT: s_not_b32 s16, s14 -; GFX9-NEXT: s_cmp_lt_u32 s16, s30 -; GFX9-NEXT: s_cselect_b32 s16, s16, s30 -; GFX9-NEXT: s_add_i32 s14, s14, s16 -; GFX9-NEXT: s_not_b32 s16, s15 -; GFX9-NEXT: s_cmp_lt_u32 s16, s31 -; GFX9-NEXT: s_cselect_b32 s16, s16, s31 -; GFX9-NEXT: s_add_i32 s15, s15, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: 
v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_add_u32_e64 v3, s3, v3 clamp +; GFX9-NEXT: v_add_u32_e64 v4, s4, v4 clamp +; GFX9-NEXT: v_add_u32_e64 v5, s5, v5 clamp +; GFX9-NEXT: v_add_u32_e64 v6, s6, v6 clamp +; GFX9-NEXT: v_add_u32_e64 v7, s7, v7 clamp +; GFX9-NEXT: v_add_u32_e64 v8, s8, v8 clamp +; GFX9-NEXT: v_add_u32_e64 v9, s9, v9 clamp +; GFX9-NEXT: v_add_u32_e64 v10, s10, v10 clamp +; GFX9-NEXT: v_add_u32_e64 v11, s11, v11 clamp +; GFX9-NEXT: v_add_u32_e64 v12, s12, v12 clamp +; GFX9-NEXT: v_add_u32_e64 v13, s13, v13 clamp +; GFX9-NEXT: v_add_u32_e64 v14, s14, v14 clamp +; GFX9-NEXT: v_add_u32_e64 v15, s15, v15 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9-NEXT: v_readfirstlane_b32 s5, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 +; GFX9-NEXT: v_readfirstlane_b32 s7, v7 +; GFX9-NEXT: v_readfirstlane_b32 s8, v8 +; GFX9-NEXT: v_readfirstlane_b32 s9, v9 +; GFX9-NEXT: v_readfirstlane_b32 s10, v10 +; GFX9-NEXT: v_readfirstlane_b32 s11, v11 +; GFX9-NEXT: v_readfirstlane_b32 s12, v12 +; GFX9-NEXT: v_readfirstlane_b32 s13, v13 +; GFX9-NEXT: v_readfirstlane_b32 s14, v14 +; GFX9-NEXT: v_readfirstlane_b32 s15, v15 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_v16i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_not_b32 s46, s0 +; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s16 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v1, s1, s17 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v2, s2, s18 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v3, s3, s19 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v4, s4, s20 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v5, s5, s21 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v6, s6, s22 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v7, s7, s23 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v8, s8, s24 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v9, s9, s25 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v10, s10, s26 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v11, s11, s27 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v12, s12, s28 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v13, s13, s29 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v14, s14, s30 clamp +; GFX10-NEXT: v_add_nc_u32_e64 v15, s15, s31 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-NEXT: v_readfirstlane_b32 s6, v6 +; GFX10-NEXT: v_readfirstlane_b32 s7, v7 +; GFX10-NEXT: v_readfirstlane_b32 s8, v8 +; GFX10-NEXT: v_readfirstlane_b32 s9, v9 +; GFX10-NEXT: v_readfirstlane_b32 s10, v10 +; GFX10-NEXT: v_readfirstlane_b32 s11, v11 +; GFX10-NEXT: v_readfirstlane_b32 s12, v12 +; GFX10-NEXT: v_readfirstlane_b32 s13, v13 +; GFX10-NEXT: v_readfirstlane_b32 s14, v14 +; GFX10-NEXT: v_readfirstlane_b32 s15, v15 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cmp_lt_u32 s46, s16 -; GFX10-NEXT: s_cselect_b32 s46, s46, s16 -; GFX10-NEXT: s_not_b32 s47, s1 -; GFX10-NEXT: s_add_i32 s0, s0, s46 -; GFX10-NEXT: s_cmp_lt_u32 s47, s17 -; GFX10-NEXT: 
s_cselect_b32 s46, s47, s17 -; GFX10-NEXT: s_not_b32 s17, s2 -; GFX10-NEXT: s_add_i32 s1, s1, s46 -; GFX10-NEXT: s_cmp_lt_u32 s17, s18 -; GFX10-NEXT: s_cselect_b32 s16, s17, s18 -; GFX10-NEXT: s_not_b32 s17, s3 -; GFX10-NEXT: s_add_i32 s2, s2, s16 -; GFX10-NEXT: s_cmp_lt_u32 s17, s19 -; GFX10-NEXT: s_cselect_b32 s16, s17, s19 -; GFX10-NEXT: s_not_b32 s17, s4 -; GFX10-NEXT: s_add_i32 s3, s3, s16 -; GFX10-NEXT: s_cmp_lt_u32 s17, s20 -; GFX10-NEXT: s_cselect_b32 s16, s17, s20 -; GFX10-NEXT: s_not_b32 s17, s5 -; GFX10-NEXT: s_add_i32 s4, s4, s16 -; GFX10-NEXT: s_cmp_lt_u32 s17, s21 -; GFX10-NEXT: s_cselect_b32 s16, s17, s21 -; GFX10-NEXT: s_not_b32 s17, s6 -; GFX10-NEXT: s_add_i32 s5, s5, s16 -; GFX10-NEXT: s_cmp_lt_u32 s17, s22 -; GFX10-NEXT: s_cselect_b32 s16, s17, s22 -; GFX10-NEXT: s_not_b32 s17, s7 -; GFX10-NEXT: s_add_i32 s6, s6, s16 -; GFX10-NEXT: s_cmp_lt_u32 s17, s23 -; GFX10-NEXT: s_cselect_b32 s16, s17, s23 -; GFX10-NEXT: s_not_b32 s17, s8 -; GFX10-NEXT: s_add_i32 s7, s7, s16 -; GFX10-NEXT: s_cmp_lt_u32 s17, s24 -; GFX10-NEXT: s_cselect_b32 s16, s17, s24 -; GFX10-NEXT: s_not_b32 s17, s9 -; GFX10-NEXT: s_add_i32 s8, s8, s16 -; GFX10-NEXT: s_cmp_lt_u32 s17, s25 -; GFX10-NEXT: s_cselect_b32 s16, s17, s25 -; GFX10-NEXT: s_not_b32 s17, s10 -; GFX10-NEXT: s_add_i32 s9, s9, s16 -; GFX10-NEXT: s_cmp_lt_u32 s17, s26 -; GFX10-NEXT: s_cselect_b32 s16, s17, s26 -; GFX10-NEXT: s_not_b32 s17, s11 -; GFX10-NEXT: s_add_i32 s10, s10, s16 -; GFX10-NEXT: s_cmp_lt_u32 s17, s27 -; GFX10-NEXT: s_cselect_b32 s16, s17, s27 -; GFX10-NEXT: s_not_b32 s17, s12 -; GFX10-NEXT: s_add_i32 s11, s11, s16 -; GFX10-NEXT: s_cmp_lt_u32 s17, s28 -; GFX10-NEXT: s_cselect_b32 s16, s17, s28 -; GFX10-NEXT: s_not_b32 s17, s13 -; GFX10-NEXT: s_add_i32 s12, s12, s16 -; GFX10-NEXT: s_cmp_lt_u32 s17, s29 -; GFX10-NEXT: s_cselect_b32 s16, s17, s29 -; GFX10-NEXT: s_not_b32 s17, s14 -; GFX10-NEXT: s_add_i32 s13, s13, s16 -; GFX10-NEXT: s_cmp_lt_u32 s17, s30 -; GFX10-NEXT: s_cselect_b32 s16, s17, s30 -; GFX10-NEXT: s_not_b32 s17, s15 -; GFX10-NEXT: s_add_i32 s14, s14, s16 -; GFX10-NEXT: s_cmp_lt_u32 s17, s31 -; GFX10-NEXT: s_cselect_b32 s16, s17, s31 -; GFX10-NEXT: s_add_i32 s15, s15, s16 ; GFX10-NEXT: ; return to shader part epilog %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result @@ -2191,27 +1666,21 @@ define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) { ; GFX8-LABEL: v_uaddsat_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX8-NEXT: v_min_u16_e32 v1, v2, v1 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX9-NEXT: v_min_u16_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uaddsat_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v0 +; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u16_e64 v1, v2, v1 -; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result @@ -2231,33 +1700,23 @@ define amdgpu_ps i16 @s_uaddsat_i16(i16 inreg %lhs, i16 inreg %rhs) { 
; ; GFX8-LABEL: s_uaddsat_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_xor_b32 s2, s0, -1 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s2, s1 -; GFX8-NEXT: s_cselect_b32 s1, s2, s1 -; GFX8-NEXT: s_add_i32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_xor_b32 s2, s0, -1 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s2, s1 -; GFX9-NEXT: s_cselect_b32 s1, s2, s1 -; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_xor_b32 s2, s0, -1 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cmp_lt_u32 s2, s1 -; GFX10-NEXT: s_cselect_b32 s1, s2, s1 -; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result @@ -2276,24 +1735,18 @@ define amdgpu_ps half @uaddsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; ; GFX8-LABEL: uaddsat_i16_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_xor_b32 s1, s0, -1 -; GFX8-NEXT: v_min_u16_e32 v0, s1, v0 -; GFX8-NEXT: v_add_u16_e32 v0, s0, v0 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_xor_b32 s1, s0, -1 -; GFX9-NEXT: v_min_u16_e32 v0, s1, v0 -; GFX9-NEXT: v_add_u16_e32 v0, s0, v0 +; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: uaddsat_i16_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_xor_b32 s1, s0, -1 +; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, v0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u16_e64 v0, s1, v0 -; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs) %cast = bitcast i16 %result to half @@ -2313,24 +1766,18 @@ define amdgpu_ps half @uaddsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; ; GFX8-LABEL: uaddsat_i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX8-NEXT: v_min_u16_e32 v1, s0, v1 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_add_u16_e64 v0, v0, s0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX9-NEXT: v_min_u16_e32 v1, s0, v1 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e64 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: uaddsat_i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, s0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u16_e64 v1, v1, s0 -; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs) %cast = bitcast i16 %result to half @@ -2358,32 +1805,25 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-LABEL: v_uaddsat_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v3, -1, v0 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v2 -; GFX8-NEXT: v_min_u16_e32 v3, v3, v1 -; GFX8-NEXT: v_min_u16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v3 -; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_u16_e64 v2, v0, v1 clamp +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v1, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX9-NEXT: v_pk_min_u16 v1, v2, v1 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uaddsat_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v0 +; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_min_u16 v1, v2, v1 -; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) ret <2 x i16> %result @@ -2415,65 +1855,30 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; ; GFX8-LABEL: s_uaddsat_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_xor_b32 s4, s0, -1 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s1 -; GFX8-NEXT: s_cselect_b32 s1, s4, s1 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_xor_b32 s1, s2, -1 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s1, s3 -; GFX8-NEXT: s_cselect_b32 s1, s1, s3 -; GFX8-NEXT: s_add_i32 s2, s2, s1 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_add_u16_e64 v1, s2, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_xor_b32 s2, s0, -1 -; GFX9-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NEXT: s_lshr_b32 s3, s2, 16 -; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s4 -; GFX9-NEXT: s_and_b32 s1, s1, s4 -; GFX9-NEXT: s_cmp_lt_u32 s2, s1 -; GFX9-NEXT: s_cselect_b32 s1, s2, s1 -; GFX9-NEXT: s_cmp_lt_u32 s3, s5 -; GFX9-NEXT: s_cselect_b32 s2, s3, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, 
s1, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_xor_b32 s2, s0, -1 -; GFX10-NEXT: s_mov_b32 s3, 0xffff -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s2, s3 -; GFX10-NEXT: s_and_b32 s3, s1, s3 -; GFX10-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-NEXT: s_cmp_lt_u32 s2, s3 +; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s2, s2, s3 -; GFX10-NEXT: s_cmp_lt_u32 s4, s1 -; GFX10-NEXT: s_cselect_b32 s1, s4, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s2, s1 -; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: s_lshr_b32 s3, s1, 16 -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: s_add_i32 s2, s2, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) %cast = bitcast <2 x i16> %result to i32 @@ -2504,31 +1909,24 @@ define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; ; GFX8-LABEL: uaddsat_v2i16_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_xor_b32 s2, s0, -1 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: v_min_u16_e32 v1, s2, v0 -; GFX8-NEXT: s_xor_b32 s2, s1, -1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_min_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_add_u16_e32 v1, s0, v1 -; GFX8-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_add_u16_e64 v1, s0, v0 clamp +; GFX8-NEXT: v_add_u16_sdwa v0, v2, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_v2i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_xor_b32 s1, s0, -1 -; GFX9-NEXT: v_pk_min_u16 v0, s1, v0 -; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 +; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: uaddsat_v2i16_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_xor_b32 s1, s0, -1 +; GFX10-NEXT: v_pk_add_u16 v0, s0, v0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_min_u16 v0, s1, v0 -; GFX10-NEXT: v_pk_add_u16 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) %cast = bitcast <2 x i16> %result to float @@ -2559,30 +1957,24 @@ define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX8-LABEL: uaddsat_v2i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: v_xor_b32_e32 v3, -1, v1 -; GFX8-NEXT: v_min_u16_e32 v2, s0, v2 -; GFX8-NEXT: v_min_u16_e32 v3, s1, v3 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v2 -; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; 
GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_add_u16_e64 v1, v0, s0 clamp +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_v2i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX9-NEXT: v_pk_min_u16 v1, v1, s0 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: uaddsat_v2i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX10-NEXT: v_pk_add_u16 v0, v0, s0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_min_u16 v1, v1, s0 -; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) %cast = bitcast <2 x i16> %result to float @@ -2642,46 +2034,31 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-LABEL: v_uaddsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_xor_b32_e32 v7, -1, v4 -; GFX8-NEXT: v_min_u16_e32 v6, v6, v2 -; GFX8-NEXT: v_min_u16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v7, -1, v1 -; GFX8-NEXT: v_xor_b32_e32 v8, -1, v5 -; GFX8-NEXT: v_min_u16_e32 v7, v7, v3 -; GFX8-NEXT: v_min_u16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v6 -; GFX8-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v7 -; GFX8-NEXT: v_add_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_add_u16_e64 v4, v0, v2 clamp +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e64 v2, v1, v3 clamp +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v0 -; GFX9-NEXT: v_pk_min_u16 v2, v4, v2 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v1 -; GFX9-NEXT: v_pk_min_u16 v2, v2, v3 -; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 clamp +; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 clamp ; GFX9-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uaddsat_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v0 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v1 +; GFX10-NEXT: v_pk_add_u16 v0, v0, v2 clamp +; GFX10-NEXT: v_pk_add_u16 v1, v1, v3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_min_u16 v2, v4, v2 -; GFX10-NEXT: v_pk_min_u16 v3, v5, v3 -; GFX10-NEXT: v_pk_add_u16 v0, v0, v2 -; GFX10-NEXT: v_pk_add_u16 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %cast = bitcast <4 x i16> %result to <2 x float> @@ -2732,113 +2109,44 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; ; GFX8-LABEL: s_uaddsat_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_xor_b32 s8, s0, -1 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_lshr_b32 s7, s3, 16 -; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s8, s2 -; GFX8-NEXT: s_cselect_b32 s2, s8, s2 -; GFX8-NEXT: s_add_i32 s0, s0, s2 -; GFX8-NEXT: s_xor_b32 s2, s4, -1 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s2, s6 -; GFX8-NEXT: s_cselect_b32 s2, s2, s6 -; GFX8-NEXT: s_add_i32 s4, s4, s2 -; GFX8-NEXT: s_xor_b32 s2, s1, -1 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s2, s3 -; GFX8-NEXT: s_cselect_b32 s2, s2, s3 -; GFX8-NEXT: s_add_i32 s1, s1, s2 -; GFX8-NEXT: s_xor_b32 s2, s5, -1 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s7, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s2, s3 -; GFX8-NEXT: s_cselect_b32 s2, s2, s3 -; GFX8-NEXT: s_add_i32 s5, s5, s2 -; GFX8-NEXT: s_bfe_u32 s2, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s2, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_add_u16_e64 v1, s4, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_add_u16_e64 v3, s5, v3 clamp +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_xor_b32 s4, s0, -1 -; GFX9-NEXT: s_mov_b32 s6, 0xffff -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_lshr_b32 s7, s2, 16 -; GFX9-NEXT: s_and_b32 s4, s4, s6 -; GFX9-NEXT: s_and_b32 s2, s2, s6 -; GFX9-NEXT: s_cmp_lt_u32 s4, s2 -; GFX9-NEXT: s_cselect_b32 s2, s4, s2 -; GFX9-NEXT: s_cmp_lt_u32 s5, s7 -; GFX9-NEXT: s_cselect_b32 s4, s5, s7 
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s2 -; GFX9-NEXT: s_add_i32 s4, s4, s5 -; GFX9-NEXT: s_xor_b32 s2, s1, -1 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s6 -; GFX9-NEXT: s_and_b32 s3, s3, s6 -; GFX9-NEXT: s_cmp_lt_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s2, s3 -; GFX9-NEXT: s_cmp_lt_u32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s3, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_add_i32 s1, s1, s2 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_xor_b32 s4, s0, -1 -; GFX10-NEXT: s_mov_b32 s5, 0xffff -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_and_b32 s7, s2, s5 -; GFX10-NEXT: s_and_b32 s4, s4, s5 -; GFX10-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-NEXT: s_cmp_lt_u32 s4, s7 +; GFX10-NEXT: v_pk_add_u16 v0, s0, s2 clamp +; GFX10-NEXT: v_pk_add_u16 v1, s1, s3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s4, s4, s7 -; GFX10-NEXT: s_cmp_lt_u32 s6, s2 -; GFX10-NEXT: s_cselect_b32 s2, s6, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s4, s2 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: s_lshr_b32 s6, s2, 16 -; GFX10-NEXT: s_add_i32 s0, s0, s2 -; GFX10-NEXT: s_xor_b32 s2, s1, -1 -; GFX10-NEXT: s_add_i32 s4, s4, s6 -; GFX10-NEXT: s_lshr_b32 s6, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s2, s5 -; GFX10-NEXT: s_and_b32 s5, s3, s5 -; GFX10-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-NEXT: s_cmp_lt_u32 s2, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_cselect_b32 s2, s2, s5 -; GFX10-NEXT: s_cmp_lt_u32 s6, s3 -; GFX10-NEXT: s_cselect_b32 s3, s6, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX10-NEXT: s_lshr_b32 s3, s1, 16 -; GFX10-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10-NEXT: s_add_i32 s1, s1, s2 -; GFX10-NEXT: s_add_i32 s3, s3, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %cast = bitcast <4 x i16> %result to <2 x i32> @@ -2914,60 +2222,38 @@ define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX8-LABEL: v_uaddsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v9, -1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX8-NEXT: v_xor_b32_e32 v10, -1, v6 -; GFX8-NEXT: v_min_u16_e32 v9, v9, v3 -; GFX8-NEXT: v_min_u16_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v10, -1, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: v_xor_b32_e32 v11, -1, v7 -; GFX8-NEXT: v_min_u16_e32 v10, v10, v4 -; GFX8-NEXT: v_min_u16_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v11, -1, v2 -; GFX8-NEXT: v_xor_b32_e32 v12, -1, v8 -; GFX8-NEXT: v_add_u16_e32 
v0, v0, v9 -; GFX8-NEXT: v_add_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_u16_e32 v11, v11, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_min_u16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v10 -; GFX8-NEXT: v_add_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v11 -; GFX8-NEXT: v_add_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_add_u16_e64 v6, v0, v3 clamp +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e64 v3, v1, v4 clamp +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e64 v4, v2, v5 clamp +; GFX8-NEXT: v_add_u16_sdwa v2, v2, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v6i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v6, -1, v0 -; GFX9-NEXT: v_pk_min_u16 v3, v6, v3 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v3 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v1 -; GFX9-NEXT: v_pk_min_u16 v3, v3, v4 -; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX9-NEXT: v_pk_min_u16 v3, v3, v5 -; GFX9-NEXT: v_pk_add_u16 v2, v2, v3 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v3 clamp +; GFX9-NEXT: v_pk_add_u16 v1, v1, v4 clamp +; GFX9-NEXT: v_pk_add_u16 v2, v2, v5 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uaddsat_v6i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v0 -; GFX10-NEXT: v_xor_b32_e32 v7, -1, v1 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 +; GFX10-NEXT: v_pk_add_u16 v0, v0, v3 clamp +; GFX10-NEXT: v_pk_add_u16 v1, v1, v4 clamp +; GFX10-NEXT: v_pk_add_u16 v2, v2, v5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_min_u16 v3, v6, v3 -; GFX10-NEXT: v_pk_min_u16 v4, v7, v4 -; GFX10-NEXT: v_pk_min_u16 v5, v8, v5 -; GFX10-NEXT: v_pk_add_u16 v0, v0, v3 -; GFX10-NEXT: v_pk_add_u16 v1, v1, v4 -; GFX10-NEXT: v_pk_add_u16 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <6 x i16> @llvm.uadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) %cast = bitcast <6 x i16> %result to <3 x float> @@ -3036,161 +2322,58 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; ; GFX8-LABEL: s_uaddsat_v6i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_xor_b32 s12, s0, -1 ; 
GFX8-NEXT: s_lshr_b32 s9, s3, 16 +; GFX8-NEXT: s_lshr_b32 s10, s4, 16 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: s_lshr_b32 s11, s5, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_add_u16_e64 v1, s6, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v6, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16 -; GFX8-NEXT: s_lshr_b32 s10, s4, 16 -; GFX8-NEXT: s_lshr_b32 s11, s5, 16 -; GFX8-NEXT: s_bfe_u32 s12, s12, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s12, s3 -; GFX8-NEXT: s_cselect_b32 s3, s12, s3 -; GFX8-NEXT: s_add_i32 s0, s0, s3 -; GFX8-NEXT: s_xor_b32 s3, s6, -1 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s9, s9, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s9 -; GFX8-NEXT: s_cselect_b32 s3, s3, s9 -; GFX8-NEXT: s_add_i32 s6, s6, s3 -; GFX8-NEXT: s_xor_b32 s3, s1, -1 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s4 -; GFX8-NEXT: s_cselect_b32 s3, s3, s4 -; GFX8-NEXT: s_add_i32 s1, s1, s3 -; GFX8-NEXT: s_xor_b32 s3, s7, -1 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s10, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s4 -; GFX8-NEXT: s_cselect_b32 s3, s3, s4 -; GFX8-NEXT: s_add_i32 s7, s7, s3 -; GFX8-NEXT: s_xor_b32 s3, s2, -1 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s4 -; GFX8-NEXT: s_cselect_b32 s3, s3, s4 -; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: s_xor_b32 s3, s8, -1 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s11, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s4 -; GFX8-NEXT: s_cselect_b32 s3, s3, s4 -; GFX8-NEXT: s_add_i32 s8, s8, s3 -; GFX8-NEXT: s_bfe_u32 s3, s6, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s3 -; GFX8-NEXT: s_bfe_u32 s3, s7, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_or_b32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s3, s8, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_or_b32 s2, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_add_u16_e64 v3, s7, v3 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_add_u16_e64 v5, s8, v5 clamp +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e64 v4, s2, v4 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v6i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_xor_b32 s6, s0, -1 -; GFX9-NEXT: s_mov_b32 s8, 0xffff -; GFX9-NEXT: s_lshr_b32 s7, s6, 16 
-; GFX9-NEXT: s_lshr_b32 s9, s3, 16 -; GFX9-NEXT: s_and_b32 s6, s6, s8 -; GFX9-NEXT: s_and_b32 s3, s3, s8 -; GFX9-NEXT: s_cmp_lt_u32 s6, s3 -; GFX9-NEXT: s_cselect_b32 s3, s6, s3 -; GFX9-NEXT: s_cmp_lt_u32 s7, s9 -; GFX9-NEXT: s_cselect_b32 s6, s7, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 -; GFX9-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s3 -; GFX9-NEXT: s_add_i32 s6, s6, s7 -; GFX9-NEXT: s_xor_b32 s3, s1, -1 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-NEXT: s_lshr_b32 s7, s4, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s8 -; GFX9-NEXT: s_and_b32 s4, s4, s8 -; GFX9-NEXT: s_cmp_lt_u32 s3, s4 -; GFX9-NEXT: s_cselect_b32 s3, s3, s4 -; GFX9-NEXT: s_cmp_lt_u32 s6, s7 -; GFX9-NEXT: s_cselect_b32 s4, s6, s7 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-NEXT: s_add_i32 s1, s1, s3 -; GFX9-NEXT: s_add_i32 s4, s4, s6 -; GFX9-NEXT: s_xor_b32 s3, s2, -1 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s8 -; GFX9-NEXT: s_and_b32 s5, s5, s8 -; GFX9-NEXT: s_cmp_lt_u32 s3, s5 -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_cmp_lt_u32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s4, s4, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_add_i32 s4, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 clamp +; GFX9-NEXT: v_pk_add_u16 v2, s2, v2 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_v6i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_xor_b32 s6, s0, -1 -; GFX10-NEXT: s_mov_b32 s7, 0xffff -; GFX10-NEXT: s_lshr_b32 s8, s6, 16 -; GFX10-NEXT: s_and_b32 s9, s3, s7 -; GFX10-NEXT: s_and_b32 s6, s6, s7 -; GFX10-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-NEXT: s_cmp_lt_u32 s6, s9 +; GFX10-NEXT: v_pk_add_u16 v0, s0, s3 clamp +; GFX10-NEXT: v_pk_add_u16 v1, s1, s4 clamp +; GFX10-NEXT: v_pk_add_u16 v2, s2, s5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s6, s6, s9 -; GFX10-NEXT: s_cmp_lt_u32 s8, s3 -; GFX10-NEXT: s_cselect_b32 s3, s8, s3 -; GFX10-NEXT: s_and_b32 s9, s4, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s3 -; GFX10-NEXT: s_lshr_b32 s6, s0, 16 -; GFX10-NEXT: s_lshr_b32 s8, s3, 16 -; GFX10-NEXT: s_add_i32 s0, s0, s3 -; GFX10-NEXT: s_xor_b32 s3, s1, -1 -; GFX10-NEXT: s_add_i32 s6, s6, s8 -; GFX10-NEXT: s_lshr_b32 s8, s3, 16 -; GFX10-NEXT: s_and_b32 s3, s3, s7 -; GFX10-NEXT: s_lshr_b32 s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_u32 s3, s9 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX10-NEXT: s_cselect_b32 s3, s3, s9 -; GFX10-NEXT: s_cmp_lt_u32 s8, s4 -; GFX10-NEXT: s_cselect_b32 s4, s8, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshr_b32 s8, s3, 16 -; GFX10-NEXT: s_add_i32 s1, s1, s3 -; GFX10-NEXT: s_xor_b32 s3, s2, -1 -; GFX10-NEXT: s_add_i32 s4, s4, s8 -; GFX10-NEXT: s_lshr_b32 s8, s3, 16 -; GFX10-NEXT: s_and_b32 s3, s3, s7 -; GFX10-NEXT: s_and_b32 s7, s5, s7 -; GFX10-NEXT: s_lshr_b32 s5, s5, 16 -; GFX10-NEXT: s_cmp_lt_u32 s3, s7 -; 
GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s7 -; GFX10-NEXT: s_cmp_lt_u32 s8, s5 -; GFX10-NEXT: s_cselect_b32 s5, s8, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 -; GFX10-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10-NEXT: s_lshr_b32 s6, s3, 16 -; GFX10-NEXT: s_add_i32 s2, s2, s3 -; GFX10-NEXT: s_add_i32 s5, s5, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: ; return to shader part epilog %result = call <6 x i16> @llvm.uadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) %cast = bitcast <6 x i16> %result to <3 x i32> @@ -3271,74 +2454,44 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX8-LABEL: v_uaddsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v12, -1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX8-NEXT: v_xor_b32_e32 v13, -1, v8 -; GFX8-NEXT: v_min_u16_e32 v12, v12, v4 -; GFX8-NEXT: v_min_u16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v13, -1, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_xor_b32_e32 v14, -1, v9 -; GFX8-NEXT: v_min_u16_e32 v13, v13, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX8-NEXT: v_min_u16_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v14, -1, v2 -; GFX8-NEXT: v_xor_b32_e32 v15, -1, v10 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v12 -; GFX8-NEXT: v_add_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_u16_e32 v14, v14, v6 -; GFX8-NEXT: v_min_u16_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v15, -1, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v11 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v13 -; GFX8-NEXT: v_add_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_u16_e32 v15, v15, v7 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_min_u16_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v14 -; GFX8-NEXT: v_add_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v15 -; GFX8-NEXT: v_add_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_u16_e64 v8, v0, v4 clamp +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e64 v4, v1, v5 clamp +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e64 v5, v2, v6 clamp +; GFX8-NEXT: v_add_u16_sdwa v2, v2, v6 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e64 v6, v3, v7 clamp +; GFX8-NEXT: v_add_u16_sdwa v3, v3, v7 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: 
v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v8i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v8, -1, v0 -; GFX9-NEXT: v_pk_min_u16 v4, v8, v4 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v1 -; GFX9-NEXT: v_pk_min_u16 v4, v4, v5 -; GFX9-NEXT: v_pk_add_u16 v1, v1, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v2 -; GFX9-NEXT: v_pk_min_u16 v4, v4, v6 -; GFX9-NEXT: v_pk_add_u16 v2, v2, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v3 -; GFX9-NEXT: v_pk_min_u16 v4, v4, v7 -; GFX9-NEXT: v_pk_add_u16 v3, v3, v4 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v4 clamp +; GFX9-NEXT: v_pk_add_u16 v1, v1, v5 clamp +; GFX9-NEXT: v_pk_add_u16 v2, v2, v6 clamp +; GFX9-NEXT: v_pk_add_u16 v3, v3, v7 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uaddsat_v8i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v15, -1, v0 -; GFX10-NEXT: v_xor_b32_e32 v19, -1, v1 -; GFX10-NEXT: v_xor_b32_e32 v23, -1, v2 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v3 +; GFX10-NEXT: v_pk_add_u16 v0, v0, v4 clamp +; GFX10-NEXT: v_pk_add_u16 v1, v1, v5 clamp +; GFX10-NEXT: v_pk_add_u16 v2, v2, v6 clamp +; GFX10-NEXT: v_pk_add_u16 v3, v3, v7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_min_u16 v11, v15, v4 -; GFX10-NEXT: v_pk_min_u16 v15, v19, v5 -; GFX10-NEXT: v_pk_min_u16 v19, v23, v6 -; GFX10-NEXT: v_pk_min_u16 v6, v10, v7 -; GFX10-NEXT: v_pk_add_u16 v0, v0, v11 -; GFX10-NEXT: v_pk_add_u16 v1, v1, v15 -; GFX10-NEXT: v_pk_add_u16 v2, v2, v19 -; GFX10-NEXT: v_pk_add_u16 v3, v3, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) %cast = bitcast <8 x i16> %result to <4 x float> @@ -3425,209 +2578,72 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; ; GFX8-LABEL: s_uaddsat_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_xor_b32 s16, s0, -1 ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshr_b32 s14, s6, 16 +; GFX8-NEXT: s_lshr_b32 s15, s7, 16 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NEXT: v_add_u16_e64 v1, s8, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: v_mov_b32_e32 v5, s14 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_lshr_b32 s13, s5, 16 -; GFX8-NEXT: s_lshr_b32 s14, s6, 16 -; GFX8-NEXT: s_lshr_b32 s15, s7, 16 -; GFX8-NEXT: s_bfe_u32 s16, s16, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s16, s4 -; GFX8-NEXT: 
s_cselect_b32 s4, s16, s4 -; GFX8-NEXT: s_add_i32 s0, s0, s4 -; GFX8-NEXT: s_xor_b32 s4, s8, -1 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s12, s12, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s12 -; GFX8-NEXT: s_cselect_b32 s4, s4, s12 -; GFX8-NEXT: s_add_i32 s8, s8, s4 -; GFX8-NEXT: s_xor_b32 s4, s1, -1 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s5 -; GFX8-NEXT: s_cselect_b32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s1, s1, s4 -; GFX8-NEXT: s_xor_b32 s4, s9, -1 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s13, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s5 -; GFX8-NEXT: s_cselect_b32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s9, s9, s4 -; GFX8-NEXT: s_xor_b32 s4, s2, -1 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s6, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s5 -; GFX8-NEXT: s_cselect_b32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s2, s2, s4 -; GFX8-NEXT: s_xor_b32 s4, s10, -1 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s14, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s5 -; GFX8-NEXT: s_cselect_b32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s10, s10, s4 -; GFX8-NEXT: s_xor_b32 s4, s3, -1 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s7, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s5 -; GFX8-NEXT: s_cselect_b32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s3, s3, s4 -; GFX8-NEXT: s_xor_b32 s4, s11, -1 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s15, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s5 -; GFX8-NEXT: s_cselect_b32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s11, s11, s4 -; GFX8-NEXT: s_bfe_u32 s4, s8, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s4, s9, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_bfe_u32 s4, s10, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_bfe_u32 s4, s11, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: v_mov_b32_e32 v7, s15 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_add_u16_e64 v3, s9, v3 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_add_u16_e64 v7, s11, v7 clamp +; GFX8-NEXT: v_add_u16_e64 v5, s10, v5 clamp +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e64 v4, s2, v4 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_add_u16_e64 v6, s3, v6 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_xor_b32 s8, s0, -1 -; GFX9-NEXT: s_mov_b32 s10, 0xffff -; GFX9-NEXT: s_lshr_b32 s9, s8, 16 -; GFX9-NEXT: s_lshr_b32 s11, s4, 16 -; GFX9-NEXT: s_and_b32 s8, s8, s10 -; GFX9-NEXT: s_and_b32 s4, s4, s10 -; GFX9-NEXT: s_cmp_lt_u32 s8, s4 -; GFX9-NEXT: s_cselect_b32 s4, s8, s4 -; GFX9-NEXT: s_cmp_lt_u32 s9, s11 -; GFX9-NEXT: s_cselect_b32 s8, s9, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s8 -; GFX9-NEXT: s_lshr_b32 s8, s0, 16 -; GFX9-NEXT: s_lshr_b32 s9, s4, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s4 -; GFX9-NEXT: s_add_i32 s8, s8, s9 -; GFX9-NEXT: s_xor_b32 s4, s1, -1 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s8 -; GFX9-NEXT: s_lshr_b32 s8, s4, 16 -; GFX9-NEXT: s_lshr_b32 s9, s5, 16 -; GFX9-NEXT: s_and_b32 s4, s4, s10 -; GFX9-NEXT: s_and_b32 s5, s5, s10 -; GFX9-NEXT: s_cmp_lt_u32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_cmp_lt_u32 s8, s9 -; GFX9-NEXT: s_cselect_b32 s5, s8, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_lshr_b32 s8, s4, 16 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s8 -; GFX9-NEXT: s_xor_b32 s4, s2, -1 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_and_b32 s4, s4, s10 -; GFX9-NEXT: s_and_b32 s6, s6, s10 -; GFX9-NEXT: s_cmp_lt_u32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s4, s4, s6 -; GFX9-NEXT: s_cmp_lt_u32 s5, s8 -; GFX9-NEXT: s_cselect_b32 s5, s5, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-NEXT: s_add_i32 s2, s2, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s6 -; GFX9-NEXT: s_xor_b32 s4, s3, -1 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5 -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_lshr_b32 s6, s7, 16 -; GFX9-NEXT: s_and_b32 s4, s4, s10 -; GFX9-NEXT: s_and_b32 s7, s7, s10 -; GFX9-NEXT: s_cmp_lt_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s4, s4, s7 -; GFX9-NEXT: s_cmp_lt_u32 s5, s6 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 clamp +; GFX9-NEXT: v_pk_add_u16 v2, s2, v2 clamp +; GFX9-NEXT: v_pk_add_u16 v3, s3, v3 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_v8i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_xor_b32 s8, s0, -1 -; GFX10-NEXT: s_mov_b32 s9, 0xffff -; GFX10-NEXT: s_lshr_b32 s10, s8, 16 -; GFX10-NEXT: s_and_b32 s11, s4, s9 -; GFX10-NEXT: s_and_b32 s8, s8, s9 -; GFX10-NEXT: s_lshr_b32 s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_u32 s8, s11 +; GFX10-NEXT: v_pk_add_u16 v0, s0, s4 clamp +; GFX10-NEXT: v_pk_add_u16 v1, s1, s5 clamp +; GFX10-NEXT: v_pk_add_u16 v2, s2, s6 clamp +; GFX10-NEXT: v_pk_add_u16 v3, s3, s7 clamp ; GFX10-NEXT: ; implicit-def: 
$vcc_hi -; GFX10-NEXT: s_cselect_b32 s8, s8, s11 -; GFX10-NEXT: s_cmp_lt_u32 s10, s4 -; GFX10-NEXT: s_cselect_b32 s4, s10, s4 -; GFX10-NEXT: s_and_b32 s11, s5, s9 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s8, s4 -; GFX10-NEXT: s_lshr_b32 s8, s0, 16 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 -; GFX10-NEXT: s_add_i32 s0, s0, s4 -; GFX10-NEXT: s_xor_b32 s4, s1, -1 -; GFX10-NEXT: s_add_i32 s8, s8, s10 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 -; GFX10-NEXT: s_and_b32 s4, s4, s9 -; GFX10-NEXT: s_lshr_b32 s5, s5, 16 -; GFX10-NEXT: s_cmp_lt_u32 s4, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s8 -; GFX10-NEXT: s_cselect_b32 s4, s4, s11 -; GFX10-NEXT: s_cmp_lt_u32 s10, s5 -; GFX10-NEXT: s_cselect_b32 s5, s10, s5 -; GFX10-NEXT: s_and_b32 s11, s6, s9 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 -; GFX10-NEXT: s_add_i32 s1, s1, s4 -; GFX10-NEXT: s_xor_b32 s4, s2, -1 -; GFX10-NEXT: s_add_i32 s5, s5, s10 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 -; GFX10-NEXT: s_and_b32 s4, s4, s9 -; GFX10-NEXT: s_lshr_b32 s6, s6, 16 -; GFX10-NEXT: s_cmp_lt_u32 s4, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_cselect_b32 s4, s4, s11 -; GFX10-NEXT: s_cmp_lt_u32 s10, s6 -; GFX10-NEXT: s_cselect_b32 s6, s10, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s6 -; GFX10-NEXT: s_lshr_b32 s6, s2, 16 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 -; GFX10-NEXT: s_add_i32 s2, s2, s4 -; GFX10-NEXT: s_xor_b32 s4, s3, -1 -; GFX10-NEXT: s_add_i32 s6, s6, s10 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 -; GFX10-NEXT: s_and_b32 s4, s4, s9 -; GFX10-NEXT: s_and_b32 s9, s7, s9 -; GFX10-NEXT: s_lshr_b32 s7, s7, 16 -; GFX10-NEXT: s_cmp_lt_u32 s4, s9 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s4, s4, s9 -; GFX10-NEXT: s_cmp_lt_u32 s10, s7 -; GFX10-NEXT: s_cselect_b32 s7, s10, s7 -; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s7 -; GFX10-NEXT: s_lshr_b32 s7, s4, 16 -; GFX10-NEXT: s_add_i32 s3, s3, s4 -; GFX10-NEXT: s_add_i32 s5, s5, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog %result = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) %cast = bitcast <8 x i16> %result to <4 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index b111fd31851cf..8553853ff00c2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -20,8 +20,7 @@ define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1 -; GFX8-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -30,8 +29,7 @@ define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1 -; GFX9-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -42,8 +40,7 @@ define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) { ; GFX10-NEXT: 
v_lshlrev_b16_e64 v0, 9, v0 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 9, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u16_e64 v1, v0, v1 -; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 clamp ; GFX10-NEXT: v_lshrrev_b16_e64 v0, 9, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs) @@ -66,13 +63,10 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX8-NEXT: s_bfe_u32 s2, 9, 0x100000 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s3, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s1 -; GFX8-NEXT: s_cselect_b32 s1, s3, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, s0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_i7: @@ -80,28 +74,21 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s3, s0, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s3, s1 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_lshr_b32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_i7: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_bfe_u32 s2, 9, 0x100000 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, s0, 0x100000 -; GFX10-NEXT: s_cmp_lt_u32 s3, s1 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_sub_i32 s0, s0, s1 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_lshr_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, s1 clamp +; GFX10-NEXT: v_lshrrev_b16_e64 v0, 9, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs) ret i7 %result @@ -123,8 +110,7 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -133,8 +119,7 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -145,8 +130,7 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) { ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u16_e64 v1, v0, v1 -; GFX10-NEXT: v_sub_nc_u16_e64 
v0, v0, v1 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 clamp ; GFX10-NEXT: v_lshrrev_b16_e64 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs) @@ -169,13 +153,10 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s3, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s1 -; GFX8-NEXT: s_cselect_b32 s1, s3, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, s0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_i8: @@ -183,28 +164,21 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s3, s0, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s3, s1 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_lshr_b32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, s0, 0x100000 -; GFX10-NEXT: s_cmp_lt_u32 s3, s1 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_sub_i32 s0, s0, s1 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_lshr_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, s1 clamp +; GFX10-NEXT: v_lshrrev_b16_e64 v0, 8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result @@ -241,11 +215,9 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 -; GFX8-NEXT: v_min_u16_e32 v1, v3, v2 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp +; GFX8-NEXT: v_sub_u16_e64 v1, v3, v2 clamp ; GFX8-NEXT: v_mov_b32_e32 v2, 0xff -; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -259,11 +231,9 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_min_u16_e32 v1, v2, v3 +; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 
clamp ; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_sub_u16_e32 v1, v2, v1 +; GFX9-NEXT: v_sub_u16_e64 v1, v2, v3 clamp ; GFX9-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 @@ -280,10 +250,8 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0 ; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u16_e64 v1, v2, v1 -; GFX10-NEXT: v_min_u16_e64 v3, v0, v3 -; GFX10-NEXT: v_sub_nc_u16_e64 v1, v2, v1 -; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v3 +; GFX10-NEXT: v_sub_nc_u16_e64 v1, v2, v1 clamp +; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v3 clamp ; GFX10-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -322,92 +290,60 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-LABEL: s_usubsat_v2i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000 -; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 +; GFX8-NEXT: s_lshr_b32 s2, s0, 8 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s1 -; GFX8-NEXT: s_cselect_b32 s1, s5, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s1, s2, s4 -; GFX8-NEXT: s_lshl_b32 s2, s3, s4 -; GFX8-NEXT: s_lshr_b32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s3, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s2 -; GFX8-NEXT: s_cselect_b32 s2, s3, s2 -; GFX8-NEXT: s_sub_i32 s1, s1, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_movk_i32 s2, 0xff -; GFX8-NEXT: s_lshr_b32 s1, s1, s4 -; GFX8-NEXT: s_and_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s1, s3, s4 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: s_lshl_b32 s0, s2, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0xff +; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_bfe_u32 s4, 8, 0x100000 -; GFX9-NEXT: s_lshr_b32 s2, s0, 8 ; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_lshl_b32 s1, s1, s4 +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s5, s1 -; GFX9-NEXT: s_cselect_b32 s1, s5, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_lshl_b32 s1, s2, s4 -; GFX9-NEXT: s_lshl_b32 s2, s3, s4 -; GFX9-NEXT: s_lshr_b32 s0, s0, s4 -; GFX9-NEXT: s_bfe_u32 s3, s1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 -; 
GFX9-NEXT: s_cmp_lt_u32 s3, s2 -; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_sub_i32 s1, s1, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_movk_i32 s2, 0xff -; GFX9-NEXT: s_lshr_b32 s1, s1, s4 -; GFX9-NEXT: s_and_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_lshl_b32 s1, s3, s4 +; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: s_lshl_b32 s0, s2, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_sub_u16_e64 v1, s0, v1 clamp +; GFX9-NEXT: s_movk_i32 s0, 0xff +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshl_b32 s4, s1, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX10-NEXT: s_bfe_u32 s5, s0, 0x100000 -; GFX10-NEXT: s_lshr_b32 s1, s1, 8 -; GFX10-NEXT: s_cmp_lt_u32 s5, s4 +; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_bfe_u32 s3, 8, 0x100000 +; GFX10-NEXT: s_lshr_b32 s4, s1, 8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, s3 +; GFX10-NEXT: s_lshl_b32 s3, s4, s3 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, s1 clamp +; GFX10-NEXT: v_sub_nc_u16_e64 v1, s2, s3 clamp +; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s4, s5, s4 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_sub_i32 s0, s0, s4 -; GFX10-NEXT: s_lshl_b32 s3, s3, s2 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_bfe_u32 s4, s3, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_lshr_b32 s0, s0, s2 -; GFX10-NEXT: s_cmp_lt_u32 s4, s1 -; GFX10-NEXT: s_cselect_b32 s1, s4, s1 -; GFX10-NEXT: s_sub_i32 s1, s3, s1 -; GFX10-NEXT: s_movk_i32 s3, 0xff -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_and_b32 s0, s0, s3 -; GFX10-NEXT: s_lshr_b32 s1, s1, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s3 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_and_b32_sdwa v0, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %lhs = bitcast i16 %lhs.arg to <2 x i8> %rhs = bitcast i16 %rhs.arg to <2 x i8> @@ -471,18 +407,14 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 -; GFX8-NEXT: v_min_u16_e32 v1, v3, v2 -; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp +; GFX8-NEXT: v_sub_u16_e64 v1, v3, v2 clamp ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX8-NEXT: v_min_u16_e32 v3, v2, v3 -; GFX8-NEXT: v_sub_u16_e32 v2, v2, v3 +; GFX8-NEXT: v_sub_u16_e64 v2, v2, v3 clamp ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7 
-; GFX8-NEXT: v_min_u16_e32 v4, v3, v4 -; GFX8-NEXT: v_sub_u16_e32 v3, v3, v4 +; GFX8-NEXT: v_sub_u16_e64 v3, v3, v4 clamp ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD @@ -506,22 +438,18 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_min_u16_e32 v1, v2, v5 -; GFX9-NEXT: v_sub_u16_e32 v1, v2, v1 +; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp +; GFX9-NEXT: v_sub_u16_e64 v1, v2, v5 clamp ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX9-NEXT: v_min_u16_e32 v3, v2, v3 -; GFX9-NEXT: v_sub_u16_e32 v2, v2, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v4 -; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v7 ; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_min_u16_e32 v4, v3, v4 ; GFX9-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_sub_u16_e64 v2, v2, v3 clamp +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v7 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_sub_u16_e32 v3, v3, v4 +; GFX9-NEXT: v_sub_u16_e64 v3, v3, v4 clamp ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: v_and_b32_sdwa v1, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v2, v3, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD @@ -533,32 +461,28 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_lshlrev_b16_e64 v4, 8, v0 +; GFX10-NEXT: v_lshlrev_b16_e64 v5, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b16_e64 v5, 8, v1 -; GFX10-NEXT: s_mov_b32 s4, 16 -; GFX10-NEXT: s_mov_b32 s5, 24 -; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_min_u16_e64 v3, v2, v3 -; GFX10-NEXT: v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_min_u16_e64 v5, v4, v5 -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, v3 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_min_u16_e64 v3, v6, v7 -; GFX10-NEXT: v_sub_nc_u16_e64 v4, v4, v5 -; GFX10-NEXT: v_min_u16_e64 v1, v0, v1 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16_e64 v6, 8, v1 +; GFX10-NEXT: s_mov_b32 s5, 16 +; GFX10-NEXT: s_mov_b32 s4, 24 +; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, v3 clamp +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, 
s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: s_movk_i32 s5, 0xff +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_sub_nc_u16_e64 v5, v5, v6 clamp +; GFX10-NEXT: v_sub_nc_u16_e64 v3, v4, v3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u16_e64 v3, v6, v3 -; GFX10-NEXT: v_lshrrev_b16_e64 v4, 8, v4 -; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 clamp ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX10-NEXT: v_and_b32_sdwa v1, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v2, v4, s4, v2 +; GFX10-NEXT: v_lshrrev_b16_e64 v4, 8, v5 +; GFX10-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v2, v4, s5, v2 ; GFX10-NEXT: v_or3_b32 v0, v2, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> @@ -617,176 +541,107 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-LABEL: s_usubsat_v4i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_bfe_u32 s8, 8, 0x100000 -; GFX8-NEXT: s_lshr_b32 s2, s0, 8 -; GFX8-NEXT: s_lshr_b32 s3, s0, 16 -; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_lshl_b32 s0, s0, s8 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, s8 -; GFX8-NEXT: s_bfe_u32 s9, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s9, s1 -; GFX8-NEXT: s_cselect_b32 s1, s9, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s1, s2, s8 -; GFX8-NEXT: s_lshl_b32 s2, s5, s8 -; GFX8-NEXT: s_lshr_b32 s0, s0, s8 -; GFX8-NEXT: s_bfe_u32 s5, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s2 -; GFX8-NEXT: s_cselect_b32 s2, s5, s2 -; GFX8-NEXT: s_sub_i32 s1, s1, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s2, s3, s8 -; GFX8-NEXT: s_lshl_b32 s3, s6, s8 -; GFX8-NEXT: s_lshr_b32 s1, s1, s8 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s3 -; GFX8-NEXT: s_cselect_b32 s3, s5, s3 -; GFX8-NEXT: s_sub_i32 s2, s2, s3 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s4, s8 -; GFX8-NEXT: s_lshl_b32 s4, s7, s8 -; GFX8-NEXT: s_lshr_b32 s2, s2, s8 -; GFX8-NEXT: s_bfe_u32 s5, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s4 -; GFX8-NEXT: s_cselect_b32 s4, s5, s4 -; GFX8-NEXT: s_sub_i32 s3, s3, s4 -; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_and_b32 s1, s1, s4 -; GFX8-NEXT: s_and_b32 s0, s0, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_lshr_b32 s3, s3, s8 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s4 -; 
GFX8-NEXT: s_lshl_b32 s1, s1, 24 -; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: s_lshl_b32 s1, s5, s8 +; GFX8-NEXT: s_lshr_b32 s2, s0, 8 +; GFX8-NEXT: s_lshr_b32 s3, s0, 16 +; GFX8-NEXT: s_lshr_b32 s4, s0, 24 +; GFX8-NEXT: s_lshl_b32 s0, s0, s8 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshl_b32 s0, s2, s8 +; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, 0xff +; GFX8-NEXT: s_lshl_b32 s1, s6, s8 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: s_lshl_b32 s0, s3, s8 +; GFX8-NEXT: s_lshl_b32 s1, s7, s8 +; GFX8-NEXT: v_sub_u16_e64 v2, s0, v2 clamp +; GFX8-NEXT: s_lshl_b32 s0, s4, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_bfe_u32 s8, 8, 0x100000 -; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshr_b32 s4, s0, 24 -; GFX9-NEXT: s_lshl_b32 s0, s0, s8 ; GFX9-NEXT: s_lshr_b32 s5, s1, 8 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, s8 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s9, s1 -; GFX9-NEXT: s_cselect_b32 s1, s9, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_lshl_b32 s1, s2, s8 -; GFX9-NEXT: s_lshl_b32 s2, s5, s8 -; GFX9-NEXT: s_lshr_b32 s0, s0, s8 -; GFX9-NEXT: s_bfe_u32 s5, s1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s5, s2 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_sub_i32 s1, s1, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_lshl_b32 s2, s3, s8 -; GFX9-NEXT: s_lshl_b32 s3, s6, s8 -; GFX9-NEXT: s_lshr_b32 s1, s1, s8 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x100000 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s5, s3 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX9-NEXT: s_lshl_b32 s3, s4, s8 -; GFX9-NEXT: s_lshl_b32 s4, s7, s8 -; GFX9-NEXT: s_lshr_b32 s2, s2, s8 -; GFX9-NEXT: s_bfe_u32 s5, s3, 0x100000 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: s_and_b32 s1, s1, s4 -; GFX9-NEXT: s_and_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s2, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 16 -; GFX9-NEXT: s_lshr_b32 s3, s3, s8 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 -; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshl_b32 s1, 
s5, s8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-NEXT: s_lshl_b32 s0, s0, s8 +; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshl_b32 s0, s2, s8 +; GFX9-NEXT: s_lshl_b32 s1, s6, s8 +; GFX9-NEXT: v_sub_u16_e64 v1, s0, v1 clamp +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_lshl_b32 s0, s3, s8 +; GFX9-NEXT: s_lshl_b32 s1, s7, s8 +; GFX9-NEXT: v_sub_u16_e64 v2, s0, v2 clamp +; GFX9-NEXT: s_lshl_b32 s0, s4, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_sub_u16_e64 v3, s0, v3 clamp +; GFX9-NEXT: s_movk_i32 s0, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v2, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v4i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s6, 8, 0x100000 +; GFX10-NEXT: s_bfe_u32 s5, 8, 0x100000 ; GFX10-NEXT: s_lshr_b32 s2, s0, 8 -; GFX10-NEXT: s_lshl_b32 s8, s1, s6 +; GFX10-NEXT: s_lshr_b32 s6, s1, 8 +; GFX10-NEXT: s_lshl_b32 s2, s2, s5 +; GFX10-NEXT: s_lshl_b32 s6, s6, s5 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: v_sub_nc_u16_e64 v1, s2, s6 clamp ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s6 -; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX10-NEXT: s_bfe_u32 s9, s0, 0x100000 -; GFX10-NEXT: s_lshr_b32 s5, s1, 8 -; GFX10-NEXT: s_lshr_b32 s7, s1, 16 +; GFX10-NEXT: s_movk_i32 s2, 0xff +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s7, s1, s5 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, s7 clamp +; GFX10-NEXT: s_lshr_b32 s0, s1, 16 ; GFX10-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-NEXT: s_cmp_lt_u32 s9, s8 +; GFX10-NEXT: s_lshl_b32 s3, s3, s5 +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s4, s4, s5 +; GFX10-NEXT: s_lshl_b32 s1, s1, s5 +; GFX10-NEXT: v_lshrrev_b16_e64 v0, 8, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX10-NEXT: v_sub_nc_u16_e64 v2, s3, s0 clamp +; GFX10-NEXT: v_sub_nc_u16_e64 v3, s4, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s8, s9, s8 -; GFX10-NEXT: s_lshl_b32 s5, s5, s6 -; GFX10-NEXT: s_sub_i32 s0, s0, s8 -; GFX10-NEXT: s_lshl_b32 s2, s2, s6 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_bfe_u32 s8, s2, 0x100000 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX10-NEXT: s_lshr_b32 s0, s0, s6 -; GFX10-NEXT: s_cmp_lt_u32 s8, s5 -; GFX10-NEXT: s_cselect_b32 s5, s8, s5 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 -; GFX10-NEXT: s_sub_i32 s2, s2, s5 -; GFX10-NEXT: s_lshl_b32 s5, s7, s6 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX10-NEXT: s_bfe_u32 s7, s3, 0x100000 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX10-NEXT: s_lshr_b32 s2, s2, s6 -; GFX10-NEXT: s_cmp_lt_u32 s7, s5 -; GFX10-NEXT: s_cselect_b32 s5, s7, s5 -; GFX10-NEXT: s_lshl_b32 s1, s1, s6 -; GFX10-NEXT: s_sub_i32 s3, s3, s5 -; GFX10-NEXT: s_lshl_b32 s4, s4, s6 -; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX10-NEXT: s_bfe_u32 s5, s4, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, 
s1, 0x100000 -; GFX10-NEXT: s_lshr_b32 s3, s3, s6 -; GFX10-NEXT: s_cmp_lt_u32 s5, s1 -; GFX10-NEXT: s_cselect_b32 s1, s5, s1 -; GFX10-NEXT: s_sub_i32 s1, s4, s1 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_and_b32 s2, s2, s4 -; GFX10-NEXT: s_lshr_b32 s1, s1, s6 -; GFX10-NEXT: s_and_b32 s0, s0, s4 -; GFX10-NEXT: s_lshl_b32 s2, s2, 8 -; GFX10-NEXT: s_and_b32 s3, s3, s4 -; GFX10-NEXT: s_and_b32 s1, s1, s4 -; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: s_lshl_b32 s2, s3, 16 -; GFX10-NEXT: s_lshl_b32 s1, s1, 24 -; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v1 +; GFX10-NEXT: v_and_b32_sdwa v1, v2, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, v3, s2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> @@ -811,8 +666,7 @@ define i24 @v_usubsat_i24(i24 %lhs, i24 %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX8-NEXT: v_min_u32_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v1 clamp ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -821,8 +675,7 @@ define i24 @v_usubsat_i24(i24 %lhs, i24 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_min_u32_e32 v1, v0, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v1 clamp ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -833,8 +686,7 @@ define i24 @v_usubsat_i24(i24 %lhs, i24 %rhs) { ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u32_e32 v1, v0, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v1 clamp ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i24 @llvm.usub.sat.i24(i24 %lhs, i24 %rhs) @@ -854,22 +706,22 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; ; GFX8-LABEL: s_usubsat_i24: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_cmp_lt_u32 s0, s1 -; GFX8-NEXT: s_cselect_b32 s1, s0, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 -; GFX8-NEXT: s_lshr_b32 s0, s0, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_i24: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 8 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_cmp_lt_u32 s0, s1 -; GFX9-NEXT: s_cselect_b32 s1, s0, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_i24: @@ -877,10 +729,9 @@ define amdgpu_ps 
i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; GFX10-NEXT: s_lshl_b32 s0, s0, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cmp_lt_u32 s0, s1 -; GFX10-NEXT: s_cselect_b32 s1, s0, s1 -; GFX10-NEXT: s_sub_i32 s0, s0, s1 -; GFX10-NEXT: s_lshr_b32 s0, s0, 8 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s1 clamp +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i24 @llvm.usub.sat.i24(i24 %lhs, i24 %rhs) ret i24 %result @@ -897,24 +748,21 @@ define i32 @v_usubsat_i32(i32 %lhs, i32 %rhs) { ; GFX8-LABEL: v_usubsat_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u32_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v1 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v1, v0, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_u32_e32 v1, v0, v1 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result @@ -930,24 +778,23 @@ define amdgpu_ps i32 @s_usubsat_i32(i32 inreg %lhs, i32 inreg %rhs) { ; ; GFX8-LABEL: s_usubsat_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_cmp_lt_u32 s0, s1 -; GFX8-NEXT: s_cselect_b32 s1, s0, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_lt_u32 s0, s1 -; GFX9-NEXT: s_cselect_b32 s1, s0, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_lt_u32 s0, s1 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s1, s0, s1 -; GFX10-NEXT: s_sub_i32 s0, s0, s1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result @@ -962,21 +809,18 @@ define amdgpu_ps float @usubsat_i32_sv(i32 inreg %lhs, i32 %rhs) { ; ; GFX8-LABEL: usubsat_i32_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_min_u32_e32 v0, s0, v0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_i32_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_min_u32_e32 v0, s0, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: usubsat_i32_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, v0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i32 
@llvm.usub.sat.i32(i32 %lhs, i32 %rhs) %cast = bitcast i32 %result to float @@ -992,21 +836,18 @@ define amdgpu_ps float @usubsat_i32_vs(i32 %lhs, i32 inreg %rhs) { ; ; GFX8-LABEL: usubsat_i32_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_min_u32_e32 v1, s0, v0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], v0, s0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_i32_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_min_u32_e32 v1, s0, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: usubsat_i32_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_min_u32_e32 v1, s0, v0 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, s0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs) %cast = bitcast i32 %result to float @@ -1026,30 +867,24 @@ define <2 x i32> @v_usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX8-LABEL: v_usubsat_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u32_e32 v2, v0, v2 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_min_u32_e32 v2, v1, v3 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v2 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v3 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v2, v0, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_min_u32_e32 v2, v1, v3 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v2 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_u32_e32 v2, v0, v2 -; GFX10-NEXT: v_min_u32_e32 v3, v1, v3 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v2 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ret <2 x i32> %result @@ -1068,33 +903,31 @@ define amdgpu_ps <2 x i32> @s_usubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre ; ; GFX8-LABEL: s_usubsat_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_cmp_lt_u32 s0, s2 -; GFX8-NEXT: s_cselect_b32 s2, s0, s2 -; GFX8-NEXT: s_sub_i32 s0, s0, s2 -; GFX8-NEXT: s_cmp_lt_u32 s1, s3 -; GFX8-NEXT: s_cselect_b32 s2, s1, s3 -; GFX8-NEXT: s_sub_i32 s1, s1, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], s0, v0 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_lt_u32 s0, s2 -; GFX9-NEXT: s_cselect_b32 s2, s0, s2 -; GFX9-NEXT: s_sub_i32 s0, s0, s2 -; GFX9-NEXT: s_cmp_lt_u32 s1, s3 -; GFX9-NEXT: s_cselect_b32 s2, s1, s3 -; GFX9-NEXT: s_sub_i32 s1, s1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: 
v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_lt_u32 s0, s2 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s2 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, s1, s3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s2, s0, s2 -; GFX10-NEXT: s_sub_i32 s0, s0, s2 -; GFX10-NEXT: s_cmp_lt_u32 s1, s3 -; GFX10-NEXT: s_cselect_b32 s2, s1, s3 -; GFX10-NEXT: s_sub_i32 s1, s1, s2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ret <2 x i32> %result @@ -1115,36 +948,27 @@ define <3 x i32> @v_usubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX8-LABEL: v_usubsat_v3i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u32_e32 v3, v0, v3 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v3 -; GFX8-NEXT: v_min_u32_e32 v3, v1, v4 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v3 -; GFX8-NEXT: v_min_u32_e32 v3, v2, v5 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v3 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v4 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v5 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v3i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v3, v0, v3 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_min_u32_e32 v3, v1, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_min_u32_e32 v3, v2, v5 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v3 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v4 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, v2, v5 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v3i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_u32_e32 v3, v0, v3 -; GFX10-NEXT: v_min_u32_e32 v4, v1, v4 -; GFX10-NEXT: v_min_u32_e32 v5, v2, v5 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v3 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v4 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v3 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) ret <3 x i32> %result @@ -1166,42 +990,39 @@ define amdgpu_ps <3 x i32> @s_usubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre ; ; GFX8-LABEL: s_usubsat_v3i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_cmp_lt_u32 s0, s3 -; GFX8-NEXT: s_cselect_b32 s3, s0, s3 -; GFX8-NEXT: s_sub_i32 s0, s0, s3 -; GFX8-NEXT: s_cmp_lt_u32 s1, s4 -; GFX8-NEXT: s_cselect_b32 s3, s1, s4 -; GFX8-NEXT: s_sub_i32 s1, s1, s3 -; GFX8-NEXT: s_cmp_lt_u32 s2, s5 -; GFX8-NEXT: s_cselect_b32 s3, s2, s5 -; GFX8-NEXT: s_sub_i32 s2, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_sub_u32_e64 v0, s[6:7], s0, v0 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s2, v2 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v3i32: ; 
GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_lt_u32 s0, s3 -; GFX9-NEXT: s_cselect_b32 s3, s0, s3 -; GFX9-NEXT: s_sub_i32 s0, s0, s3 -; GFX9-NEXT: s_cmp_lt_u32 s1, s4 -; GFX9-NEXT: s_cselect_b32 s3, s1, s4 -; GFX9-NEXT: s_sub_i32 s1, s1, s3 -; GFX9-NEXT: s_cmp_lt_u32 s2, s5 -; GFX9-NEXT: s_cselect_b32 s3, s2, s5 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v3i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_lt_u32 s0, s3 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s3 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, s1, s4 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, s2, s5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s3, s0, s3 -; GFX10-NEXT: s_sub_i32 s0, s0, s3 -; GFX10-NEXT: s_cmp_lt_u32 s1, s4 -; GFX10-NEXT: s_cselect_b32 s3, s1, s4 -; GFX10-NEXT: s_sub_i32 s1, s1, s3 -; GFX10-NEXT: s_cmp_lt_u32 s2, s5 -; GFX10-NEXT: s_cselect_b32 s3, s2, s5 -; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: ; return to shader part epilog %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) ret <3 x i32> %result @@ -1224,42 +1045,30 @@ define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX8-LABEL: v_usubsat_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u32_e32 v4, v0, v4 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 -; GFX8-NEXT: v_min_u32_e32 v4, v1, v5 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v4 -; GFX8-NEXT: v_min_u32_e32 v4, v2, v6 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v4 -; GFX8-NEXT: v_min_u32_e32 v4, v3, v7 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v4 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v5 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v6 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v7 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v4, v0, v4 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4 -; GFX9-NEXT: v_min_u32_e32 v4, v1, v5 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v4 -; GFX9-NEXT: v_min_u32_e32 v4, v2, v6 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_min_u32_e32 v4, v3, v7 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v4 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v5 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, v2, v6 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, v3, v7 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_u32_e32 v19, v2, v6 -; GFX10-NEXT: v_min_u32_e32 v11, v0, v4 -; GFX10-NEXT: v_min_u32_e32 v15, v1, v5 -; GFX10-NEXT: v_min_u32_e32 v6, v3, v7 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v4 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v5 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v6 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: 
v_sub_nc_u32_e32 v2, v2, v19 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v11 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v15 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, v3, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ret <4 x i32> %result @@ -1284,51 +1093,47 @@ define amdgpu_ps <4 x i32> @s_usubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre ; ; GFX8-LABEL: s_usubsat_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_cmp_lt_u32 s0, s4 -; GFX8-NEXT: s_cselect_b32 s4, s0, s4 -; GFX8-NEXT: s_sub_i32 s0, s0, s4 -; GFX8-NEXT: s_cmp_lt_u32 s1, s5 -; GFX8-NEXT: s_cselect_b32 s4, s1, s5 -; GFX8-NEXT: s_sub_i32 s1, s1, s4 -; GFX8-NEXT: s_cmp_lt_u32 s2, s6 -; GFX8-NEXT: s_cselect_b32 s4, s2, s6 -; GFX8-NEXT: s_sub_i32 s2, s2, s4 -; GFX8-NEXT: s_cmp_lt_u32 s3, s7 -; GFX8-NEXT: s_cselect_b32 s4, s3, s7 -; GFX8-NEXT: s_sub_i32 s3, s3, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_sub_u32_e64 v0, s[8:9], s0, v0 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s2, v2 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s3, v3 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_lt_u32 s0, s4 -; GFX9-NEXT: s_cselect_b32 s4, s0, s4 -; GFX9-NEXT: s_sub_i32 s0, s0, s4 -; GFX9-NEXT: s_cmp_lt_u32 s1, s5 -; GFX9-NEXT: s_cselect_b32 s4, s1, s5 -; GFX9-NEXT: s_sub_i32 s1, s1, s4 -; GFX9-NEXT: s_cmp_lt_u32 s2, s6 -; GFX9-NEXT: s_cselect_b32 s4, s2, s6 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: s_cmp_lt_u32 s3, s7 -; GFX9-NEXT: s_cselect_b32 s4, s3, s7 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, s3, v3 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_lt_u32 s0, s4 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s4 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, s1, s5 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, s2, s6 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v3, s3, s7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s4, s0, s4 -; GFX10-NEXT: s_sub_i32 s0, s0, s4 -; GFX10-NEXT: s_cmp_lt_u32 s1, s5 -; GFX10-NEXT: s_cselect_b32 s4, s1, s5 -; GFX10-NEXT: s_sub_i32 s1, s1, s4 -; GFX10-NEXT: s_cmp_lt_u32 s2, s6 -; GFX10-NEXT: s_cselect_b32 s4, s2, s6 -; GFX10-NEXT: s_sub_i32 s2, s2, s4 -; GFX10-NEXT: s_cmp_lt_u32 s3, s7 -; GFX10-NEXT: s_cselect_b32 s4, s3, s7 -; GFX10-NEXT: s_sub_i32 s3, s3, s4 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ret <4 x i32> %result @@ -1353,47 +1158,32 @@ define <5 x i32> 
@v_usubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX8-LABEL: v_usubsat_v5i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u32_e32 v5, v0, v5 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_min_u32_e32 v5, v1, v6 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v5 -; GFX8-NEXT: v_min_u32_e32 v5, v2, v7 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_min_u32_e32 v5, v3, v8 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v5 -; GFX8-NEXT: v_min_u32_e32 v5, v4, v9 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v5 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v6 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v7 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v8 clamp +; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v9 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v5i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v5, v0, v5 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v5 -; GFX9-NEXT: v_min_u32_e32 v5, v1, v6 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v5 -; GFX9-NEXT: v_min_u32_e32 v5, v2, v7 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_min_u32_e32 v5, v3, v8 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_min_u32_e32 v5, v4, v9 -; GFX9-NEXT: v_sub_u32_e32 v4, v4, v5 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v5 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v6 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, v2, v7 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, v3, v8 clamp +; GFX9-NEXT: v_sub_u32_e64 v4, v4, v9 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v5i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_u32_e32 v5, v0, v5 -; GFX10-NEXT: v_min_u32_e32 v6, v1, v6 -; GFX10-NEXT: v_min_u32_e32 v7, v2, v7 -; GFX10-NEXT: v_min_u32_e32 v8, v3, v8 -; GFX10-NEXT: v_min_u32_e32 v9, v4, v9 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v5 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v7 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, v3, v8 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v9 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v5 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v6 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v7 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v8 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v4, v4, v9 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <5 x i32> @llvm.usub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) @@ -1422,60 +1212,55 @@ define amdgpu_ps <5 x i32> @s_usubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; ; GFX8-LABEL: s_usubsat_v5i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_cmp_lt_u32 s0, s5 -; GFX8-NEXT: s_cselect_b32 s5, s0, s5 -; GFX8-NEXT: s_sub_i32 s0, s0, s5 -; GFX8-NEXT: s_cmp_lt_u32 s1, s6 -; GFX8-NEXT: s_cselect_b32 s5, s1, s6 -; GFX8-NEXT: s_sub_i32 s1, s1, s5 -; GFX8-NEXT: s_cmp_lt_u32 s2, s7 -; GFX8-NEXT: s_cselect_b32 s5, s2, s7 -; GFX8-NEXT: s_sub_i32 s2, s2, s5 -; GFX8-NEXT: s_cmp_lt_u32 s3, s8 -; GFX8-NEXT: s_cselect_b32 s5, s3, s8 -; GFX8-NEXT: s_sub_i32 s3, s3, s5 -; GFX8-NEXT: s_cmp_lt_u32 s4, s9 -; GFX8-NEXT: s_cselect_b32 s5, s4, s9 -; GFX8-NEXT: s_sub_i32 s4, s4, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: v_sub_u32_e64 v0, s[10:11], s0, v0 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp 
+; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s2, v2 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s3, v3 clamp +; GFX8-NEXT: v_sub_u32_e64 v4, s[0:1], s4, v4 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v5i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_lt_u32 s0, s5 -; GFX9-NEXT: s_cselect_b32 s5, s0, s5 -; GFX9-NEXT: s_sub_i32 s0, s0, s5 -; GFX9-NEXT: s_cmp_lt_u32 s1, s6 -; GFX9-NEXT: s_cselect_b32 s5, s1, s6 -; GFX9-NEXT: s_sub_i32 s1, s1, s5 -; GFX9-NEXT: s_cmp_lt_u32 s2, s7 -; GFX9-NEXT: s_cselect_b32 s5, s2, s7 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_cmp_lt_u32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s5, s3, s8 -; GFX9-NEXT: s_sub_i32 s3, s3, s5 -; GFX9-NEXT: s_cmp_lt_u32 s4, s9 -; GFX9-NEXT: s_cselect_b32 s5, s4, s9 -; GFX9-NEXT: s_sub_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, s3, v3 clamp +; GFX9-NEXT: v_sub_u32_e64 v4, s4, v4 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v5i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_lt_u32 s0, s5 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s5 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, s1, s6 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, s2, s7 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v3, s3, s8 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v4, s4, s9 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s5, s0, s5 -; GFX10-NEXT: s_sub_i32 s0, s0, s5 -; GFX10-NEXT: s_cmp_lt_u32 s1, s6 -; GFX10-NEXT: s_cselect_b32 s5, s1, s6 -; GFX10-NEXT: s_sub_i32 s1, s1, s5 -; GFX10-NEXT: s_cmp_lt_u32 s2, s7 -; GFX10-NEXT: s_cselect_b32 s5, s2, s7 -; GFX10-NEXT: s_sub_i32 s2, s2, s5 -; GFX10-NEXT: s_cmp_lt_u32 s3, s8 -; GFX10-NEXT: s_cselect_b32 s5, s3, s8 -; GFX10-NEXT: s_sub_i32 s3, s3, s5 -; GFX10-NEXT: s_cmp_lt_u32 s4, s9 -; GFX10-NEXT: s_cselect_b32 s5, s4, s9 -; GFX10-NEXT: s_sub_i32 s4, s4, s5 ; GFX10-NEXT: ; return to shader part epilog %result = call <5 x i32> @llvm.usub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) ret <5 x i32> %result @@ -1522,113 +1307,65 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-LABEL: v_usubsat_v16i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u32_e32 v16, v0, v16 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v1, v17 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v2, v18 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v3, v19 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v4, v20 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v16 -; GFX8-NEXT: v_min_u32_e32 v16, 
v5, v21 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v6, v22 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v7, v23 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, v7, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v8, v24 -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v8, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v9, v25 -; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v9, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v10, v26 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v11, v27 -; GFX8-NEXT: v_sub_u32_e32 v11, vcc, v11, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v12, v28 -; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v12, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v13, v29 -; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v13, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v14, v30 -; GFX8-NEXT: v_sub_u32_e32 v14, vcc, v14, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v15, v31 -; GFX8-NEXT: v_sub_u32_e32 v15, vcc, v15, v16 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v16 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v17 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v18 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v19 clamp +; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v20 clamp +; GFX8-NEXT: v_sub_u32_e64 v5, s[4:5], v5, v21 clamp +; GFX8-NEXT: v_sub_u32_e64 v6, s[4:5], v6, v22 clamp +; GFX8-NEXT: v_sub_u32_e64 v7, s[4:5], v7, v23 clamp +; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v8, v24 clamp +; GFX8-NEXT: v_sub_u32_e64 v9, s[4:5], v9, v25 clamp +; GFX8-NEXT: v_sub_u32_e64 v10, s[4:5], v10, v26 clamp +; GFX8-NEXT: v_sub_u32_e64 v11, s[4:5], v11, v27 clamp +; GFX8-NEXT: v_sub_u32_e64 v12, s[4:5], v12, v28 clamp +; GFX8-NEXT: v_sub_u32_e64 v13, s[4:5], v13, v29 clamp +; GFX8-NEXT: v_sub_u32_e64 v14, s[4:5], v14, v30 clamp +; GFX8-NEXT: v_sub_u32_e64 v15, s[4:5], v15, v31 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v16i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v16, v0, v16 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v1, v17 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v2, v18 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v3, v19 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v4, v20 -; GFX9-NEXT: v_sub_u32_e32 v4, v4, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v5, v21 -; GFX9-NEXT: v_sub_u32_e32 v5, v5, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v6, v22 -; GFX9-NEXT: v_sub_u32_e32 v6, v6, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v7, v23 -; GFX9-NEXT: v_sub_u32_e32 v7, v7, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v8, v24 -; GFX9-NEXT: v_sub_u32_e32 v8, v8, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v9, v25 -; GFX9-NEXT: v_sub_u32_e32 v9, v9, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v10, v26 -; GFX9-NEXT: v_sub_u32_e32 v10, v10, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v11, v27 -; GFX9-NEXT: v_sub_u32_e32 v11, v11, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v12, v28 -; GFX9-NEXT: v_sub_u32_e32 v12, v12, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v13, v29 -; GFX9-NEXT: v_sub_u32_e32 v13, v13, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v14, v30 -; GFX9-NEXT: v_sub_u32_e32 v14, v14, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v15, v31 -; GFX9-NEXT: v_sub_u32_e32 v15, v15, v16 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v16 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v17 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, v2, v18 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, v3, v19 clamp +; GFX9-NEXT: v_sub_u32_e64 v4, v4, v20 clamp +; GFX9-NEXT: v_sub_u32_e64 v5, v5, v21 clamp +; GFX9-NEXT: 
v_sub_u32_e64 v6, v6, v22 clamp +; GFX9-NEXT: v_sub_u32_e64 v7, v7, v23 clamp +; GFX9-NEXT: v_sub_u32_e64 v8, v8, v24 clamp +; GFX9-NEXT: v_sub_u32_e64 v9, v9, v25 clamp +; GFX9-NEXT: v_sub_u32_e64 v10, v10, v26 clamp +; GFX9-NEXT: v_sub_u32_e64 v11, v11, v27 clamp +; GFX9-NEXT: v_sub_u32_e64 v12, v12, v28 clamp +; GFX9-NEXT: v_sub_u32_e64 v13, v13, v29 clamp +; GFX9-NEXT: v_sub_u32_e64 v14, v14, v30 clamp +; GFX9-NEXT: v_sub_u32_e64 v15, v15, v31 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v16i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_u32_e32 v35, v0, v16 -; GFX10-NEXT: v_min_u32_e32 v16, v1, v17 -; GFX10-NEXT: v_min_u32_e32 v17, v2, v18 -; GFX10-NEXT: v_min_u32_e32 v18, v3, v19 -; GFX10-NEXT: v_min_u32_e32 v19, v4, v20 -; GFX10-NEXT: v_min_u32_e32 v20, v5, v21 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v16 -; GFX10-NEXT: v_min_u32_e32 v16, v6, v22 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v17 -; GFX10-NEXT: v_min_u32_e32 v17, v7, v23 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, v3, v18 -; GFX10-NEXT: v_min_u32_e32 v18, v8, v24 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v19 -; GFX10-NEXT: v_min_u32_e32 v19, v9, v25 -; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v20 -; GFX10-NEXT: v_min_u32_e32 v20, v10, v26 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, v6, v16 -; GFX10-NEXT: v_min_u32_e32 v16, v11, v27 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, v7, v17 -; GFX10-NEXT: v_min_u32_e32 v17, v12, v28 -; GFX10-NEXT: v_sub_nc_u32_e32 v8, v8, v18 -; GFX10-NEXT: v_min_u32_e32 v18, v13, v29 -; GFX10-NEXT: v_sub_nc_u32_e32 v9, v9, v19 -; GFX10-NEXT: v_min_u32_e32 v19, v14, v30 -; GFX10-NEXT: v_sub_nc_u32_e32 v10, v10, v20 -; GFX10-NEXT: v_min_u32_e32 v20, v15, v31 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v35 -; GFX10-NEXT: v_sub_nc_u32_e32 v11, v11, v16 -; GFX10-NEXT: v_sub_nc_u32_e32 v12, v12, v17 -; GFX10-NEXT: v_sub_nc_u32_e32 v13, v13, v18 -; GFX10-NEXT: v_sub_nc_u32_e32 v14, v14, v19 -; GFX10-NEXT: v_sub_nc_u32_e32 v15, v15, v20 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v19 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v4, v4, v20 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v5, v5, v21 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v6, v6, v22 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v7, v7, v23 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v8, v8, v24 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v9, v9, v25 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v10, v10, v26 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v11, v11, v27 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v12, v12, v28 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) @@ -1690,159 +1427,143 @@ define amdgpu_ps <16 x i32> @s_usubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; ; GFX8-LABEL: s_usubsat_v16i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_cmp_lt_u32 s0, s16 -; GFX8-NEXT: s_cselect_b32 s16, s0, s16 -; GFX8-NEXT: s_sub_i32 s0, s0, s16 -; GFX8-NEXT: s_cmp_lt_u32 s1, s17 -; GFX8-NEXT: s_cselect_b32 s16, s1, s17 -; GFX8-NEXT: s_sub_i32 s1, s1, s16 -; GFX8-NEXT: s_cmp_lt_u32 s2, s18 -; GFX8-NEXT: s_cselect_b32 s16, s2, s18 -; GFX8-NEXT: s_sub_i32 s2, s2, s16 -; GFX8-NEXT: s_cmp_lt_u32 s3, s19 -; GFX8-NEXT: 
s_cselect_b32 s16, s3, s19 -; GFX8-NEXT: s_sub_i32 s3, s3, s16 -; GFX8-NEXT: s_cmp_lt_u32 s4, s20 -; GFX8-NEXT: s_cselect_b32 s16, s4, s20 -; GFX8-NEXT: s_sub_i32 s4, s4, s16 -; GFX8-NEXT: s_cmp_lt_u32 s5, s21 -; GFX8-NEXT: s_cselect_b32 s16, s5, s21 -; GFX8-NEXT: s_sub_i32 s5, s5, s16 -; GFX8-NEXT: s_cmp_lt_u32 s6, s22 -; GFX8-NEXT: s_cselect_b32 s16, s6, s22 -; GFX8-NEXT: s_sub_i32 s6, s6, s16 -; GFX8-NEXT: s_cmp_lt_u32 s7, s23 -; GFX8-NEXT: s_cselect_b32 s16, s7, s23 -; GFX8-NEXT: s_sub_i32 s7, s7, s16 -; GFX8-NEXT: s_cmp_lt_u32 s8, s24 -; GFX8-NEXT: s_cselect_b32 s16, s8, s24 -; GFX8-NEXT: s_sub_i32 s8, s8, s16 -; GFX8-NEXT: s_cmp_lt_u32 s9, s25 -; GFX8-NEXT: s_cselect_b32 s16, s9, s25 -; GFX8-NEXT: s_sub_i32 s9, s9, s16 -; GFX8-NEXT: s_cmp_lt_u32 s10, s26 -; GFX8-NEXT: s_cselect_b32 s16, s10, s26 -; GFX8-NEXT: s_sub_i32 s10, s10, s16 -; GFX8-NEXT: s_cmp_lt_u32 s11, s27 -; GFX8-NEXT: s_cselect_b32 s16, s11, s27 -; GFX8-NEXT: s_sub_i32 s11, s11, s16 -; GFX8-NEXT: s_cmp_lt_u32 s12, s28 -; GFX8-NEXT: s_cselect_b32 s16, s12, s28 -; GFX8-NEXT: s_sub_i32 s12, s12, s16 -; GFX8-NEXT: s_cmp_lt_u32 s13, s29 -; GFX8-NEXT: s_cselect_b32 s16, s13, s29 -; GFX8-NEXT: s_sub_i32 s13, s13, s16 -; GFX8-NEXT: s_cmp_lt_u32 s14, s30 -; GFX8-NEXT: s_cselect_b32 s16, s14, s30 -; GFX8-NEXT: s_sub_i32 s14, s14, s16 -; GFX8-NEXT: s_cmp_lt_u32 s15, s31 -; GFX8-NEXT: s_cselect_b32 s16, s15, s31 -; GFX8-NEXT: s_sub_i32 s15, s15, s16 +; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NEXT: v_mov_b32_e32 v5, s21 +; GFX8-NEXT: v_mov_b32_e32 v6, s22 +; GFX8-NEXT: v_mov_b32_e32 v7, s23 +; GFX8-NEXT: v_mov_b32_e32 v8, s24 +; GFX8-NEXT: v_mov_b32_e32 v9, s25 +; GFX8-NEXT: v_mov_b32_e32 v10, s26 +; GFX8-NEXT: v_mov_b32_e32 v11, s27 +; GFX8-NEXT: v_mov_b32_e32 v12, s28 +; GFX8-NEXT: v_mov_b32_e32 v13, s29 +; GFX8-NEXT: v_mov_b32_e32 v14, s30 +; GFX8-NEXT: v_mov_b32_e32 v15, s31 +; GFX8-NEXT: v_sub_u32_e64 v0, s[32:33], s0, v0 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[16:17], s1, v1 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[16:17], s2, v2 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[2:3], s3, v3 clamp +; GFX8-NEXT: v_sub_u32_e64 v4, s[2:3], s4, v4 clamp +; GFX8-NEXT: v_sub_u32_e64 v5, s[2:3], s5, v5 clamp +; GFX8-NEXT: v_sub_u32_e64 v6, s[2:3], s6, v6 clamp +; GFX8-NEXT: v_sub_u32_e64 v7, s[2:3], s7, v7 clamp +; GFX8-NEXT: v_sub_u32_e64 v8, s[2:3], s8, v8 clamp +; GFX8-NEXT: v_sub_u32_e64 v9, s[2:3], s9, v9 clamp +; GFX8-NEXT: v_sub_u32_e64 v10, s[2:3], s10, v10 clamp +; GFX8-NEXT: v_sub_u32_e64 v11, s[2:3], s11, v11 clamp +; GFX8-NEXT: v_sub_u32_e64 v12, s[2:3], s12, v12 clamp +; GFX8-NEXT: v_sub_u32_e64 v13, s[2:3], s13, v13 clamp +; GFX8-NEXT: v_sub_u32_e64 v14, s[2:3], s14, v14 clamp +; GFX8-NEXT: v_sub_u32_e64 v15, s[2:3], s15, v15 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8-NEXT: v_readfirstlane_b32 s5, v5 +; GFX8-NEXT: v_readfirstlane_b32 s6, v6 +; GFX8-NEXT: v_readfirstlane_b32 s7, v7 +; GFX8-NEXT: v_readfirstlane_b32 s8, v8 +; GFX8-NEXT: v_readfirstlane_b32 s9, v9 +; GFX8-NEXT: v_readfirstlane_b32 s10, v10 +; GFX8-NEXT: v_readfirstlane_b32 s11, v11 +; GFX8-NEXT: v_readfirstlane_b32 s12, v12 +; GFX8-NEXT: v_readfirstlane_b32 s13, v13 +; GFX8-NEXT: v_readfirstlane_b32 s14, v14 +; GFX8-NEXT: v_readfirstlane_b32 s15, v15 ; 
GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v16i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_lt_u32 s0, s16 -; GFX9-NEXT: s_cselect_b32 s16, s0, s16 -; GFX9-NEXT: s_sub_i32 s0, s0, s16 -; GFX9-NEXT: s_cmp_lt_u32 s1, s17 -; GFX9-NEXT: s_cselect_b32 s16, s1, s17 -; GFX9-NEXT: s_sub_i32 s1, s1, s16 -; GFX9-NEXT: s_cmp_lt_u32 s2, s18 -; GFX9-NEXT: s_cselect_b32 s16, s2, s18 -; GFX9-NEXT: s_sub_i32 s2, s2, s16 -; GFX9-NEXT: s_cmp_lt_u32 s3, s19 -; GFX9-NEXT: s_cselect_b32 s16, s3, s19 -; GFX9-NEXT: s_sub_i32 s3, s3, s16 -; GFX9-NEXT: s_cmp_lt_u32 s4, s20 -; GFX9-NEXT: s_cselect_b32 s16, s4, s20 -; GFX9-NEXT: s_sub_i32 s4, s4, s16 -; GFX9-NEXT: s_cmp_lt_u32 s5, s21 -; GFX9-NEXT: s_cselect_b32 s16, s5, s21 -; GFX9-NEXT: s_sub_i32 s5, s5, s16 -; GFX9-NEXT: s_cmp_lt_u32 s6, s22 -; GFX9-NEXT: s_cselect_b32 s16, s6, s22 -; GFX9-NEXT: s_sub_i32 s6, s6, s16 -; GFX9-NEXT: s_cmp_lt_u32 s7, s23 -; GFX9-NEXT: s_cselect_b32 s16, s7, s23 -; GFX9-NEXT: s_sub_i32 s7, s7, s16 -; GFX9-NEXT: s_cmp_lt_u32 s8, s24 -; GFX9-NEXT: s_cselect_b32 s16, s8, s24 -; GFX9-NEXT: s_sub_i32 s8, s8, s16 -; GFX9-NEXT: s_cmp_lt_u32 s9, s25 -; GFX9-NEXT: s_cselect_b32 s16, s9, s25 -; GFX9-NEXT: s_sub_i32 s9, s9, s16 -; GFX9-NEXT: s_cmp_lt_u32 s10, s26 -; GFX9-NEXT: s_cselect_b32 s16, s10, s26 -; GFX9-NEXT: s_sub_i32 s10, s10, s16 -; GFX9-NEXT: s_cmp_lt_u32 s11, s27 -; GFX9-NEXT: s_cselect_b32 s16, s11, s27 -; GFX9-NEXT: s_sub_i32 s11, s11, s16 -; GFX9-NEXT: s_cmp_lt_u32 s12, s28 -; GFX9-NEXT: s_cselect_b32 s16, s12, s28 -; GFX9-NEXT: s_sub_i32 s12, s12, s16 -; GFX9-NEXT: s_cmp_lt_u32 s13, s29 -; GFX9-NEXT: s_cselect_b32 s16, s13, s29 -; GFX9-NEXT: s_sub_i32 s13, s13, s16 -; GFX9-NEXT: s_cmp_lt_u32 s14, s30 -; GFX9-NEXT: s_cselect_b32 s16, s14, s30 -; GFX9-NEXT: s_sub_i32 s14, s14, s16 -; GFX9-NEXT: s_cmp_lt_u32 s15, s31 -; GFX9-NEXT: s_cselect_b32 s16, s15, s31 -; GFX9-NEXT: s_sub_i32 s15, s15, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, s3, v3 clamp +; GFX9-NEXT: v_sub_u32_e64 v4, s4, v4 clamp +; GFX9-NEXT: v_sub_u32_e64 v5, s5, v5 clamp +; GFX9-NEXT: v_sub_u32_e64 v6, s6, v6 clamp +; GFX9-NEXT: v_sub_u32_e64 v7, s7, v7 clamp +; GFX9-NEXT: v_sub_u32_e64 v8, s8, v8 clamp +; GFX9-NEXT: v_sub_u32_e64 v9, s9, v9 clamp +; GFX9-NEXT: v_sub_u32_e64 v10, s10, v10 clamp +; GFX9-NEXT: v_sub_u32_e64 v11, s11, v11 clamp +; GFX9-NEXT: v_sub_u32_e64 v12, s12, v12 clamp +; GFX9-NEXT: v_sub_u32_e64 v13, s13, v13 clamp +; GFX9-NEXT: v_sub_u32_e64 v14, s14, v14 clamp +; GFX9-NEXT: v_sub_u32_e64 v15, s15, v15 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9-NEXT: v_readfirstlane_b32 s5, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 +; GFX9-NEXT: v_readfirstlane_b32 s7, 
v7 +; GFX9-NEXT: v_readfirstlane_b32 s8, v8 +; GFX9-NEXT: v_readfirstlane_b32 s9, v9 +; GFX9-NEXT: v_readfirstlane_b32 s10, v10 +; GFX9-NEXT: v_readfirstlane_b32 s11, v11 +; GFX9-NEXT: v_readfirstlane_b32 s12, v12 +; GFX9-NEXT: v_readfirstlane_b32 s13, v13 +; GFX9-NEXT: v_readfirstlane_b32 s14, v14 +; GFX9-NEXT: v_readfirstlane_b32 s15, v15 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v16i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_lt_u32 s0, s16 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s16 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, s1, s17 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, s2, s18 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v3, s3, s19 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v4, s4, s20 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v5, s5, s21 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v6, s6, s22 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v7, s7, s23 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v8, s8, s24 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v9, s9, s25 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v10, s10, s26 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v11, s11, s27 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v12, s12, s28 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v13, s13, s29 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v14, s14, s30 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v15, s15, s31 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-NEXT: v_readfirstlane_b32 s6, v6 +; GFX10-NEXT: v_readfirstlane_b32 s7, v7 +; GFX10-NEXT: v_readfirstlane_b32 s8, v8 +; GFX10-NEXT: v_readfirstlane_b32 s9, v9 +; GFX10-NEXT: v_readfirstlane_b32 s10, v10 +; GFX10-NEXT: v_readfirstlane_b32 s11, v11 +; GFX10-NEXT: v_readfirstlane_b32 s12, v12 +; GFX10-NEXT: v_readfirstlane_b32 s13, v13 +; GFX10-NEXT: v_readfirstlane_b32 s14, v14 +; GFX10-NEXT: v_readfirstlane_b32 s15, v15 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s46, s0, s16 -; GFX10-NEXT: s_sub_i32 s0, s0, s46 -; GFX10-NEXT: s_cmp_lt_u32 s1, s17 -; GFX10-NEXT: s_cselect_b32 s46, s1, s17 -; GFX10-NEXT: s_sub_i32 s1, s1, s46 -; GFX10-NEXT: s_cmp_lt_u32 s2, s18 -; GFX10-NEXT: s_cselect_b32 s16, s2, s18 -; GFX10-NEXT: s_sub_i32 s2, s2, s16 -; GFX10-NEXT: s_cmp_lt_u32 s3, s19 -; GFX10-NEXT: s_cselect_b32 s16, s3, s19 -; GFX10-NEXT: s_sub_i32 s3, s3, s16 -; GFX10-NEXT: s_cmp_lt_u32 s4, s20 -; GFX10-NEXT: s_cselect_b32 s16, s4, s20 -; GFX10-NEXT: s_sub_i32 s4, s4, s16 -; GFX10-NEXT: s_cmp_lt_u32 s5, s21 -; GFX10-NEXT: s_cselect_b32 s16, s5, s21 -; GFX10-NEXT: s_sub_i32 s5, s5, s16 -; GFX10-NEXT: s_cmp_lt_u32 s6, s22 -; GFX10-NEXT: s_cselect_b32 s16, s6, s22 -; GFX10-NEXT: s_sub_i32 s6, s6, s16 -; GFX10-NEXT: s_cmp_lt_u32 s7, s23 -; GFX10-NEXT: s_cselect_b32 s16, s7, s23 -; GFX10-NEXT: s_sub_i32 s7, s7, s16 -; GFX10-NEXT: s_cmp_lt_u32 s8, s24 -; GFX10-NEXT: s_cselect_b32 s16, s8, s24 -; GFX10-NEXT: s_sub_i32 s8, s8, s16 -; GFX10-NEXT: s_cmp_lt_u32 s9, s25 -; GFX10-NEXT: s_cselect_b32 s16, s9, s25 -; GFX10-NEXT: s_sub_i32 s9, s9, s16 -; GFX10-NEXT: s_cmp_lt_u32 s10, s26 -; GFX10-NEXT: s_cselect_b32 s16, s10, s26 -; GFX10-NEXT: s_sub_i32 s10, s10, s16 -; GFX10-NEXT: s_cmp_lt_u32 s11, s27 -; GFX10-NEXT: s_cselect_b32 s16, s11, s27 -; GFX10-NEXT: s_sub_i32 s11, s11, s16 -; GFX10-NEXT: s_cmp_lt_u32 s12, s28 -; GFX10-NEXT: s_cselect_b32 s16, s12, s28 -; GFX10-NEXT: s_sub_i32 s12, s12, s16 -; GFX10-NEXT: s_cmp_lt_u32 s13, s29 -; GFX10-NEXT: s_cselect_b32 s16, 
s13, s29 -; GFX10-NEXT: s_sub_i32 s13, s13, s16 -; GFX10-NEXT: s_cmp_lt_u32 s14, s30 -; GFX10-NEXT: s_cselect_b32 s16, s14, s30 -; GFX10-NEXT: s_sub_i32 s14, s14, s16 -; GFX10-NEXT: s_cmp_lt_u32 s15, s31 -; GFX10-NEXT: s_cselect_b32 s16, s15, s31 -; GFX10-NEXT: s_sub_i32 s15, s15, s16 ; GFX10-NEXT: ; return to shader part epilog %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result @@ -1862,24 +1583,21 @@ define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) { ; GFX8-LABEL: v_usubsat_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_u16_e64 v1, v0, v1 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result @@ -1898,30 +1616,23 @@ define amdgpu_ps i16 @s_usubsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; ; GFX8-LABEL: s_usubsat_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s2, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s2, s1 -; GFX8-NEXT: s_cselect_b32 s1, s2, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s2, s0, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s2, s1 -; GFX9-NEXT: s_cselect_b32 s1, s2, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s2, s0, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cmp_lt_u32 s2, s1 -; GFX10-NEXT: s_cselect_b32 s1, s2, s1 -; GFX10-NEXT: s_sub_i32 s0, s0, s1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result @@ -1939,21 +1650,18 @@ define amdgpu_ps half @usubsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; ; GFX8-LABEL: usubsat_i16_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_min_u16_e32 v0, s0, v0 -; GFX8-NEXT: v_sub_u16_e32 v0, s0, v0 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_min_u16_e32 v0, s0, v0 -; GFX9-NEXT: v_sub_u16_e32 v0, s0, v0 +; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: usubsat_i16_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_min_u16_e64 v0, s0, v0 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, v0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: 
v_sub_nc_u16_e64 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) %cast = bitcast i16 %result to half @@ -1972,21 +1680,18 @@ define amdgpu_ps half @usubsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; ; GFX8-LABEL: usubsat_i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_min_u16_e32 v1, s0, v0 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, s0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_min_u16_e32 v1, s0, v0 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u16_e64 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: usubsat_i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_min_u16_e64 v1, v0, s0 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, s0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) %cast = bitcast i16 %result to half @@ -2012,28 +1717,25 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-LABEL: v_usubsat_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_min_u16_e32 v3, v0, v1 -; GFX8-NEXT: v_min_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v3 -; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_u16_e64 v2, v0, v1 clamp +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v1, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_u16 v1, v0, v1 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_u16 v1, v0, v1 +; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) ret <2 x i16> %result @@ -2065,57 +1767,28 @@ define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s1 -; GFX8-NEXT: s_cselect_b32 s1, s4, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s1, s3 -; GFX8-NEXT: s_cselect_b32 s1, s1, s3 -; GFX8-NEXT: s_sub_i32 s1, s2, s1 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 
+; GFX8-NEXT: v_sub_u16_e64 v1, s2, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xffff -; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_and_b32 s4, s0, s3 -; GFX9-NEXT: s_and_b32 s1, s1, s3 -; GFX9-NEXT: s_cmp_lt_u32 s4, s1 -; GFX9-NEXT: s_cselect_b32 s1, s4, s1 -; GFX9-NEXT: s_cmp_lt_u32 s2, s5 -; GFX9-NEXT: s_cselect_b32 s3, s2, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: s_sub_i32 s1, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0xffff -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_and_b32 s4, s0, s2 -; GFX10-NEXT: s_and_b32 s2, s1, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-NEXT: s_cmp_lt_u32 s4, s2 +; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s2, s4, s2 -; GFX10-NEXT: s_cmp_lt_u32 s3, s1 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s2, s1 -; GFX10-NEXT: s_lshr_b32 s2, s1, 16 -; GFX10-NEXT: s_sub_i32 s0, s0, s1 -; GFX10-NEXT: s_sub_i32 s1, s3, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) %cast = bitcast <2 x i16> %result to i32 @@ -2146,24 +1819,22 @@ define amdgpu_ps float @usubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_min_u16_e32 v1, s0, v0 -; GFX8-NEXT: v_min_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v1, s0, v1 -; GFX8-NEXT: v_sub_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_sub_u16_e64 v1, s0, v0 clamp +; GFX8-NEXT: v_sub_u16_sdwa v0, v2, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_v2i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_pk_min_u16 v0, s0, v0 -; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 +; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: usubsat_v2i16_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_pk_min_u16 v0, s0, v0 +; GFX10-NEXT: v_pk_sub_u16 v0, s0, v0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 
%cast = bitcast <2 x i16> %result to float @@ -2192,26 +1863,24 @@ define amdgpu_ps float @usubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX8-LABEL: usubsat_v2i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: v_min_u16_e32 v2, s0, v0 -; GFX8-NEXT: v_min_u16_e32 v3, s1, v1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v2 -; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_sub_u16_e64 v1, v0, s0 clamp +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_v2i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_pk_min_u16 v1, v0, s0 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_u16 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: usubsat_v2i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_pk_min_u16 v1, v0, s0 +; GFX10-NEXT: v_pk_sub_u16 v0, v0, s0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) %cast = bitcast <2 x i16> %result to float @@ -2267,38 +1936,31 @@ define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-LABEL: v_usubsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX8-NEXT: v_min_u16_e32 v6, v0, v2 -; GFX8-NEXT: v_min_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_min_u16_e32 v7, v1, v3 -; GFX8-NEXT: v_min_u16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v6 -; GFX8-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v7 -; GFX8-NEXT: v_sub_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_sub_u16_e64 v4, v0, v2 clamp +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v2, v1, v3 clamp +; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_u16 v2, v0, v2 -; GFX9-NEXT: v_pk_sub_i16 
v0, v0, v2 -; GFX9-NEXT: v_pk_min_u16 v2, v1, v3 -; GFX9-NEXT: v_pk_sub_i16 v1, v1, v2 +; GFX9-NEXT: v_pk_sub_u16 v0, v0, v2 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_u16 v2, v0, v2 -; GFX10-NEXT: v_pk_min_u16 v3, v1, v3 +; GFX10-NEXT: v_pk_sub_u16 v0, v0, v2 clamp +; GFX10-NEXT: v_pk_sub_u16 v1, v1, v3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v0, v0, v2 -; GFX10-NEXT: v_pk_sub_i16 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %cast = bitcast <4 x i16> %result to <2 x float> @@ -2346,100 +2008,43 @@ define amdgpu_ps <2 x i32> @s_usubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX8-LABEL: s_usubsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_lshr_b32 s7, s3, 16 -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s8, s2 -; GFX8-NEXT: s_cselect_b32 s2, s8, s2 -; GFX8-NEXT: s_sub_i32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s2, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s2, s6 -; GFX8-NEXT: s_cselect_b32 s2, s2, s6 -; GFX8-NEXT: s_sub_i32 s2, s4, s2 -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s3 -; GFX8-NEXT: s_cselect_b32 s3, s4, s3 -; GFX8-NEXT: s_sub_i32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s3, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s7, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s4 -; GFX8-NEXT: s_cselect_b32 s3, s3, s4 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_sub_i32 s3, s5, s3 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s2, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_sub_u16_e64 v1, s4, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v3, s5, v3 clamp +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s5, 0xffff -; GFX9-NEXT: s_lshr_b32 s7, s2, 16 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_and_b32 s6, s0, s5 -; GFX9-NEXT: s_and_b32 s2, s2, s5 -; GFX9-NEXT: s_cmp_lt_u32 s6, s2 -; GFX9-NEXT: s_cselect_b32 s2, s6, s2 -; GFX9-NEXT: s_cmp_lt_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s6, s4, s7 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX9-NEXT: s_lshr_b32 
s6, s2, 16 -; GFX9-NEXT: s_sub_i32 s0, s0, s2 -; GFX9-NEXT: s_sub_i32 s2, s4, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-NEXT: s_lshr_b32 s2, s1, 16 -; GFX9-NEXT: s_and_b32 s4, s1, s5 -; GFX9-NEXT: s_and_b32 s3, s3, s5 -; GFX9-NEXT: s_cmp_lt_u32 s4, s3 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_cmp_lt_u32 s2, s6 -; GFX9-NEXT: s_cselect_b32 s4, s2, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_sub_i32 s1, s1, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, s1, v1 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s4, 0xffff -; GFX10-NEXT: s_lshr_b32 s5, s0, 16 -; GFX10-NEXT: s_and_b32 s7, s2, s4 -; GFX10-NEXT: s_and_b32 s6, s0, s4 -; GFX10-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-NEXT: s_cmp_lt_u32 s6, s7 +; GFX10-NEXT: v_pk_sub_u16 v0, s0, s2 clamp +; GFX10-NEXT: v_pk_sub_u16 v1, s1, s3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s6, s6, s7 -; GFX10-NEXT: s_cmp_lt_u32 s5, s2 -; GFX10-NEXT: s_cselect_b32 s2, s5, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s6, s2 -; GFX10-NEXT: s_lshr_b32 s6, s2, 16 -; GFX10-NEXT: s_sub_i32 s0, s0, s2 -; GFX10-NEXT: s_sub_i32 s2, s5, s6 -; GFX10-NEXT: s_and_b32 s6, s1, s4 -; GFX10-NEXT: s_and_b32 s4, s3, s4 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-NEXT: s_cmp_lt_u32 s6, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX10-NEXT: s_cselect_b32 s4, s6, s4 -; GFX10-NEXT: s_cmp_lt_u32 s5, s3 -; GFX10-NEXT: s_cselect_b32 s3, s5, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s4, s3 -; GFX10-NEXT: s_lshr_b32 s4, s3, 16 -; GFX10-NEXT: s_sub_i32 s1, s1, s3 -; GFX10-NEXT: s_sub_i32 s3, s5, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %cast = bitcast <4 x i16> %result to <2 x i32> @@ -2509,48 +2114,38 @@ define <3 x float> @v_usubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX8-LABEL: v_usubsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_min_u16_e32 v9, v0, v3 -; GFX8-NEXT: v_min_u16_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX8-NEXT: v_min_u16_e32 v10, v1, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: v_min_u16_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v9 -; GFX8-NEXT: v_sub_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_u16_e32 v11, v2, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_min_u16_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v10 -; GFX8-NEXT: v_sub_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_sub_u16_e32 v2, v2, v11 -; GFX8-NEXT: v_sub_u16_sdwa v3, v8, 
v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_sub_u16_e64 v6, v0, v3 clamp +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v3, v1, v4 clamp +; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v4, v2, v5 clamp +; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v6i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_u16 v3, v0, v3 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v3 -; GFX9-NEXT: v_pk_min_u16 v3, v1, v4 -; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3 -; GFX9-NEXT: v_pk_min_u16 v3, v2, v5 -; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3 +; GFX9-NEXT: v_pk_sub_u16 v0, v0, v3 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, v1, v4 clamp +; GFX9-NEXT: v_pk_sub_u16 v2, v2, v5 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v6i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_u16 v3, v0, v3 -; GFX10-NEXT: v_pk_min_u16 v4, v1, v4 -; GFX10-NEXT: v_pk_min_u16 v5, v2, v5 +; GFX10-NEXT: v_pk_sub_u16 v0, v0, v3 clamp +; GFX10-NEXT: v_pk_sub_u16 v1, v1, v4 clamp +; GFX10-NEXT: v_pk_sub_u16 v2, v2, v5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v0, v0, v3 -; GFX10-NEXT: v_pk_sub_i16 v1, v1, v4 -; GFX10-NEXT: v_pk_sub_i16 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <6 x i16> @llvm.usub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) %cast = bitcast <6 x i16> %result to <3 x float> @@ -2614,142 +2209,57 @@ define amdgpu_ps <3 x i32> @s_usubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-LABEL: s_usubsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 +; GFX8-NEXT: s_lshr_b32 s10, s4, 16 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: s_lshr_b32 s11, s5, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_sub_u16_e64 v1, s6, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v6, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16 -; GFX8-NEXT: s_lshr_b32 s10, s4, 16 -; GFX8-NEXT: s_lshr_b32 s11, s5, 16 -; GFX8-NEXT: s_bfe_u32 s12, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s12, s3 -; GFX8-NEXT: s_cselect_b32 s3, s12, s3 -; GFX8-NEXT: s_sub_i32 s0, s0, s3 -; GFX8-NEXT: s_bfe_u32 s3, s6, 0x100000 -; GFX8-NEXT: s_bfe_u32 s9, s9, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s9 -; GFX8-NEXT: s_cselect_b32 
s3, s3, s9 -; GFX8-NEXT: s_sub_i32 s3, s6, s3 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s6, s4 -; GFX8-NEXT: s_cselect_b32 s4, s6, s4 -; GFX8-NEXT: s_sub_i32 s1, s1, s4 -; GFX8-NEXT: s_bfe_u32 s4, s7, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, s10, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s6 -; GFX8-NEXT: s_cselect_b32 s4, s4, s6 -; GFX8-NEXT: s_sub_i32 s4, s7, s4 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s6, s5 -; GFX8-NEXT: s_cselect_b32 s5, s6, s5 -; GFX8-NEXT: s_sub_i32 s2, s2, s5 -; GFX8-NEXT: s_bfe_u32 s5, s8, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, s11, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s6 -; GFX8-NEXT: s_cselect_b32 s5, s5, s6 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s3 -; GFX8-NEXT: s_bfe_u32 s3, s4, 0x100000 -; GFX8-NEXT: s_sub_i32 s5, s8, s5 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_or_b32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s3, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_or_b32 s2, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v3, s7, v3 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v5, s8, v5 clamp +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v6i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s7, 0xffff -; GFX9-NEXT: s_lshr_b32 s9, s3, 16 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 -; GFX9-NEXT: s_and_b32 s8, s0, s7 -; GFX9-NEXT: s_and_b32 s3, s3, s7 -; GFX9-NEXT: s_cmp_lt_u32 s8, s3 -; GFX9-NEXT: s_cselect_b32 s3, s8, s3 -; GFX9-NEXT: s_cmp_lt_u32 s6, s9 -; GFX9-NEXT: s_cselect_b32 s8, s6, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s8 -; GFX9-NEXT: s_lshr_b32 s8, s3, 16 -; GFX9-NEXT: s_sub_i32 s0, s0, s3 -; GFX9-NEXT: s_sub_i32 s3, s6, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s8, s4, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_and_b32 s6, s1, s7 -; GFX9-NEXT: s_and_b32 s4, s4, s7 -; GFX9-NEXT: s_cmp_lt_u32 s6, s4 -; GFX9-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-NEXT: s_cmp_lt_u32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s6, s3, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 -; GFX9-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-NEXT: s_sub_i32 s1, s1, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: s_lshr_b32 s3, s2, 16 -; GFX9-NEXT: s_and_b32 s4, s2, s7 -; GFX9-NEXT: s_and_b32 
s5, s5, s7 -; GFX9-NEXT: s_cmp_lt_u32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_cmp_lt_u32 s3, s6 -; GFX9-NEXT: s_cselect_b32 s5, s3, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, s1, v1 clamp +; GFX9-NEXT: v_pk_sub_u16 v2, s2, v2 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v6i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s6, 0xffff -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 -; GFX10-NEXT: s_and_b32 s9, s3, s6 -; GFX10-NEXT: s_and_b32 s8, s0, s6 -; GFX10-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-NEXT: s_cmp_lt_u32 s8, s9 +; GFX10-NEXT: v_pk_sub_u16 v0, s0, s3 clamp +; GFX10-NEXT: v_pk_sub_u16 v1, s1, s4 clamp +; GFX10-NEXT: v_pk_sub_u16 v2, s2, s5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s8, s8, s9 -; GFX10-NEXT: s_cmp_lt_u32 s7, s3 -; GFX10-NEXT: s_cselect_b32 s3, s7, s3 -; GFX10-NEXT: s_and_b32 s9, s4, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s8, s3 -; GFX10-NEXT: s_lshr_b32 s4, s4, 16 -; GFX10-NEXT: s_lshr_b32 s8, s3, 16 -; GFX10-NEXT: s_sub_i32 s0, s0, s3 -; GFX10-NEXT: s_sub_i32 s3, s7, s8 -; GFX10-NEXT: s_and_b32 s8, s1, s6 -; GFX10-NEXT: s_lshr_b32 s7, s1, 16 -; GFX10-NEXT: s_cmp_lt_u32 s8, s9 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: s_cselect_b32 s8, s8, s9 -; GFX10-NEXT: s_cmp_lt_u32 s7, s4 -; GFX10-NEXT: s_cselect_b32 s4, s7, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s8, s4 -; GFX10-NEXT: s_lshr_b32 s8, s4, 16 -; GFX10-NEXT: s_sub_i32 s1, s1, s4 -; GFX10-NEXT: s_sub_i32 s4, s7, s8 -; GFX10-NEXT: s_and_b32 s8, s2, s6 -; GFX10-NEXT: s_and_b32 s6, s5, s6 -; GFX10-NEXT: s_lshr_b32 s7, s2, 16 -; GFX10-NEXT: s_lshr_b32 s5, s5, 16 -; GFX10-NEXT: s_cmp_lt_u32 s8, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX10-NEXT: s_cselect_b32 s6, s8, s6 -; GFX10-NEXT: s_cmp_lt_u32 s7, s5 -; GFX10-NEXT: s_cselect_b32 s5, s7, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s6, s5 -; GFX10-NEXT: s_lshr_b32 s3, s5, 16 -; GFX10-NEXT: s_sub_i32 s2, s2, s5 -; GFX10-NEXT: s_sub_i32 s3, s7, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: ; return to shader part epilog %result = call <6 x i16> @llvm.usub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) %cast = bitcast <6 x i16> %result to <3 x i32> @@ -2822,58 +2332,44 @@ define <4 x float> @v_usubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX8-LABEL: v_usubsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX8-NEXT: v_min_u16_e32 v12, v0, v4 -; GFX8-NEXT: v_min_u16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX8-NEXT: v_min_u16_e32 v13, v1, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_min_u16_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v12 -; GFX8-NEXT: v_sub_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 
-; GFX8-NEXT: v_min_u16_e32 v14, v2, v6 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX8-NEXT: v_min_u16_sdwa v6, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v13 -; GFX8-NEXT: v_sub_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_u16_e32 v15, v3, v7 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_min_u16_sdwa v7, v11, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v2, v2, v14 -; GFX8-NEXT: v_sub_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_u16_e32 v3, v3, v15 -; GFX8-NEXT: v_sub_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_sub_u16_e64 v8, v0, v4 clamp +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v4, v1, v5 clamp +; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v5, v2, v6 clamp +; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v6 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v6, v3, v7 clamp +; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v7 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v8i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_u16 v4, v0, v4 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v4 -; GFX9-NEXT: v_pk_min_u16 v4, v1, v5 -; GFX9-NEXT: v_pk_sub_i16 v1, v1, v4 -; GFX9-NEXT: v_pk_min_u16 v4, v2, v6 -; GFX9-NEXT: v_pk_sub_i16 v2, v2, v4 -; GFX9-NEXT: v_pk_min_u16 v4, v3, v7 -; GFX9-NEXT: v_pk_sub_i16 v3, v3, v4 +; GFX9-NEXT: v_pk_sub_u16 v0, v0, v4 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, v1, v5 clamp +; GFX9-NEXT: v_pk_sub_u16 v2, v2, v6 clamp +; GFX9-NEXT: v_pk_sub_u16 v3, v3, v7 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v8i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_u16 v19, v2, v6 -; GFX10-NEXT: v_pk_min_u16 v11, v0, v4 -; GFX10-NEXT: v_pk_min_u16 v15, v1, v5 -; GFX10-NEXT: v_pk_min_u16 v6, v3, v7 +; GFX10-NEXT: v_pk_sub_u16 v0, v0, v4 clamp +; GFX10-NEXT: v_pk_sub_u16 v1, v1, v5 clamp 
+; GFX10-NEXT: v_pk_sub_u16 v2, v2, v6 clamp +; GFX10-NEXT: v_pk_sub_u16 v3, v3, v7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v2, v2, v19 -; GFX10-NEXT: v_pk_sub_i16 v0, v0, v11 -; GFX10-NEXT: v_pk_sub_i16 v1, v1, v15 -; GFX10-NEXT: v_pk_sub_i16 v3, v3, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) %cast = bitcast <8 x i16> %result to <4 x float> @@ -2953,184 +2449,71 @@ define amdgpu_ps <4 x i32> @s_usubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-LABEL: s_usubsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshr_b32 s14, s6, 16 +; GFX8-NEXT: s_lshr_b32 s15, s7, 16 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NEXT: v_sub_u16_e64 v1, s8, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: v_mov_b32_e32 v5, s14 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_lshr_b32 s13, s5, 16 -; GFX8-NEXT: s_lshr_b32 s14, s6, 16 -; GFX8-NEXT: s_lshr_b32 s15, s7, 16 -; GFX8-NEXT: s_bfe_u32 s16, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s16, s4 -; GFX8-NEXT: s_cselect_b32 s4, s16, s4 -; GFX8-NEXT: s_sub_i32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s4, s8, 0x100000 -; GFX8-NEXT: s_bfe_u32 s12, s12, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s12 -; GFX8-NEXT: s_cselect_b32 s4, s4, s12 -; GFX8-NEXT: s_sub_i32 s4, s8, s4 -; GFX8-NEXT: s_bfe_u32 s8, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s8, s5 -; GFX8-NEXT: s_cselect_b32 s5, s8, s5 -; GFX8-NEXT: s_sub_i32 s1, s1, s5 -; GFX8-NEXT: s_bfe_u32 s5, s9, 0x100000 -; GFX8-NEXT: s_bfe_u32 s8, s13, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s8 -; GFX8-NEXT: s_cselect_b32 s5, s5, s8 -; GFX8-NEXT: s_sub_i32 s5, s9, s5 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s8, s6 -; GFX8-NEXT: s_cselect_b32 s6, s8, s6 -; GFX8-NEXT: s_sub_i32 s2, s2, s6 -; GFX8-NEXT: s_bfe_u32 s6, s10, 0x100000 -; GFX8-NEXT: s_bfe_u32 s8, s14, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s6, s8 -; GFX8-NEXT: s_cselect_b32 s6, s6, s8 -; GFX8-NEXT: s_sub_i32 s6, s10, s6 -; GFX8-NEXT: s_bfe_u32 s8, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s8, s7 -; GFX8-NEXT: s_cselect_b32 s7, s8, s7 -; GFX8-NEXT: s_sub_i32 s3, s3, s7 -; GFX8-NEXT: s_bfe_u32 s7, s11, 0x100000 -; GFX8-NEXT: s_bfe_u32 s8, s15, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s7, s8 -; GFX8-NEXT: s_cselect_b32 s7, s7, s8 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_bfe_u32 s4, s6, 0x100000 -; GFX8-NEXT: s_sub_i32 s7, s11, s7 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_bfe_u32 s4, s7, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: v_mov_b32_e32 v7, s15 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v3, s9, v3 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v7, s11, v7 clamp +; GFX8-NEXT: v_sub_u16_e64 v5, s10, v5 clamp +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v6, s3, v6 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s9, 0xffff -; GFX9-NEXT: s_lshr_b32 s11, s4, 16 -; GFX9-NEXT: s_lshr_b32 s8, s0, 16 -; GFX9-NEXT: s_and_b32 s10, s0, s9 -; GFX9-NEXT: s_and_b32 s4, s4, s9 -; GFX9-NEXT: s_cmp_lt_u32 s10, s4 -; GFX9-NEXT: s_cselect_b32 s4, s10, s4 -; GFX9-NEXT: s_cmp_lt_u32 s8, s11 -; GFX9-NEXT: s_cselect_b32 s10, s8, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s10 -; GFX9-NEXT: s_lshr_b32 s10, s4, 16 -; GFX9-NEXT: s_sub_i32 s0, s0, s4 -; GFX9-NEXT: s_sub_i32 s4, s8, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s10, s5, 16 -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_and_b32 s8, s1, s9 -; GFX9-NEXT: s_and_b32 s5, s5, s9 -; GFX9-NEXT: s_cmp_lt_u32 s8, s5 -; GFX9-NEXT: s_cselect_b32 s5, s8, s5 -; GFX9-NEXT: s_cmp_lt_u32 s4, s10 -; GFX9-NEXT: s_cselect_b32 s8, s4, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s8 -; GFX9-NEXT: s_lshr_b32 s8, s5, 16 -; GFX9-NEXT: s_sub_i32 s1, s1, s5 -; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_and_b32 s5, s2, s9 -; GFX9-NEXT: s_and_b32 s6, s6, s9 -; GFX9-NEXT: s_cmp_lt_u32 s5, s6 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_cmp_lt_u32 s4, s8 -; GFX9-NEXT: s_cselect_b32 s6, s4, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_sub_i32 s4, s4, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s6, s7, 16 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_and_b32 s5, s3, s9 -; GFX9-NEXT: s_and_b32 s7, s7, s9 -; GFX9-NEXT: s_cmp_lt_u32 s5, s7 -; GFX9-NEXT: s_cselect_b32 s5, s5, s7 -; GFX9-NEXT: s_cmp_lt_u32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s6, s4, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: s_sub_i32 s3, s3, s5 -; GFX9-NEXT: s_sub_i32 s4, s4, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_pk_sub_u16 v0, s0, 
v0 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, s1, v1 clamp +; GFX9-NEXT: v_pk_sub_u16 v2, s2, v2 clamp +; GFX9-NEXT: v_pk_sub_u16 v3, s3, v3 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v8i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s8, 0xffff -; GFX10-NEXT: s_lshr_b32 s9, s0, 16 -; GFX10-NEXT: s_and_b32 s11, s4, s8 -; GFX10-NEXT: s_and_b32 s10, s0, s8 -; GFX10-NEXT: s_lshr_b32 s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_u32 s10, s11 +; GFX10-NEXT: v_pk_sub_u16 v0, s0, s4 clamp +; GFX10-NEXT: v_pk_sub_u16 v1, s1, s5 clamp +; GFX10-NEXT: v_pk_sub_u16 v2, s2, s6 clamp +; GFX10-NEXT: v_pk_sub_u16 v3, s3, s7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s10, s10, s11 -; GFX10-NEXT: s_cmp_lt_u32 s9, s4 -; GFX10-NEXT: s_cselect_b32 s4, s9, s4 -; GFX10-NEXT: s_and_b32 s11, s5, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s10, s4 -; GFX10-NEXT: s_lshr_b32 s5, s5, 16 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 -; GFX10-NEXT: s_sub_i32 s0, s0, s4 -; GFX10-NEXT: s_sub_i32 s4, s9, s10 -; GFX10-NEXT: s_and_b32 s10, s1, s8 -; GFX10-NEXT: s_lshr_b32 s9, s1, 16 -; GFX10-NEXT: s_cmp_lt_u32 s10, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_cselect_b32 s10, s10, s11 -; GFX10-NEXT: s_cmp_lt_u32 s9, s5 -; GFX10-NEXT: s_cselect_b32 s5, s9, s5 -; GFX10-NEXT: s_and_b32 s11, s6, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s10, s5 -; GFX10-NEXT: s_lshr_b32 s6, s6, 16 -; GFX10-NEXT: s_lshr_b32 s10, s5, 16 -; GFX10-NEXT: s_sub_i32 s1, s1, s5 -; GFX10-NEXT: s_sub_i32 s5, s9, s10 -; GFX10-NEXT: s_and_b32 s10, s2, s8 -; GFX10-NEXT: s_lshr_b32 s9, s2, 16 -; GFX10-NEXT: s_cmp_lt_u32 s10, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_cselect_b32 s10, s10, s11 -; GFX10-NEXT: s_cmp_lt_u32 s9, s6 -; GFX10-NEXT: s_cselect_b32 s6, s9, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s10, s6 -; GFX10-NEXT: s_lshr_b32 s10, s6, 16 -; GFX10-NEXT: s_sub_i32 s2, s2, s6 -; GFX10-NEXT: s_sub_i32 s6, s9, s10 -; GFX10-NEXT: s_and_b32 s10, s3, s8 -; GFX10-NEXT: s_and_b32 s8, s7, s8 -; GFX10-NEXT: s_lshr_b32 s9, s3, 16 -; GFX10-NEXT: s_lshr_b32 s7, s7, 16 -; GFX10-NEXT: s_cmp_lt_u32 s10, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s8, s10, s8 -; GFX10-NEXT: s_cmp_lt_u32 s9, s7 -; GFX10-NEXT: s_cselect_b32 s7, s9, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s8, s7 -; GFX10-NEXT: s_lshr_b32 s5, s4, 16 -; GFX10-NEXT: s_sub_i32 s3, s3, s4 -; GFX10-NEXT: s_sub_i32 s4, s9, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog %result = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) %cast = bitcast <8 x i16> %result to <4 x i32> From dfcc09890a91b1085139fee175936b0e67824e47 Mon Sep 17 00:00:00 2001 From: Frederik Gossen Date: Tue, 28 Jul 2020 15:39:49 +0000 Subject: [PATCH 0312/1035] [MLIR][Shape] Lower `shape.const_shape` to `tensor_from_elements` Differential Revision: https://reviews.llvm.org/D82848 --- .../ShapeToStandard/ShapeToStandard.cpp | 34 +++++++++++++++++++ .../ShapeToStandard/shape-to-standard.mlir | 16 +++++++++ 2 files changed, 50 insertions(+) diff --git a/mlir/lib/Conversion/ShapeToStandard/ShapeToStandard.cpp 
b/mlir/lib/Conversion/ShapeToStandard/ShapeToStandard.cpp
index f239d1cfb4f0b..b84b6ba3b5d67 100644
--- a/mlir/lib/Conversion/ShapeToStandard/ShapeToStandard.cpp
+++ b/mlir/lib/Conversion/ShapeToStandard/ShapeToStandard.cpp
@@ -103,6 +103,39 @@ LogicalResult ShapeOfOpConversion::matchAndRewrite(
   return success();
 }
 
+namespace {
+class ConstShapeOpConverter : public OpConversionPattern<ConstShapeOp> {
+public:
+  using OpConversionPattern<ConstShapeOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(ConstShapeOp op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+} // namespace
+
+LogicalResult ConstShapeOpConverter::matchAndRewrite(
+    ConstShapeOp op, ArrayRef<Value> operands,
+    ConversionPatternRewriter &rewriter) const {
+
+  // For now, this lowering supports only extent tensors, not `shape.shape`
+  // types.
+  if (op.getType().isa<ShapeType>())
+    return failure();
+
+  auto loc = op.getLoc();
+  SmallVector<Value, 4> extentOperands;
+  for (auto extent : op.shape()) {
+    extentOperands.push_back(
+        rewriter.create<ConstantIndexOp>(loc, extent.getLimitedValue()));
+  }
+  Value tensor = rewriter.create<TensorFromElementsOp>(loc, extentOperands);
+  Type indexTy = rewriter.getIndexType();
+  Type resultTy = RankedTensorType::get({ShapedType::kDynamicSize}, indexTy);
+  rewriter.replaceOpWithNewOp<TensorCastOp>(op, tensor, resultTy);
+  return success();
+}
+
 namespace {
 class GetExtentOpConverter : public OpConversionPattern<GetExtentOp> {
   using OpConversionPattern<GetExtentOp>::OpConversionPattern;
@@ -209,6 +242,7 @@ void mlir::populateShapeToStandardConversionPatterns(
   patterns.insert<
       AnyOpConversion,
       BinaryOpConversion<AddOp, AddIOp>,
+      ConstShapeOpConverter,
       BinaryOpConversion<MulOp, MulIOp>,
       GetExtentOpConverter,
       RankOpConverter,
diff --git a/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir b/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir
index 9336402d86da4..7f875f3bb19f9 100644
--- a/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir
+++ b/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir
@@ -111,6 +111,22 @@ func @get_extent_from_extent_tensor(%extents : tensor<?xindex>, %idx : index)
 
 // -----
 
+// Lower `const_shape` to `tensor_from_elements`.
+// CHECK-LABEL: @const_shape
+// CHECK-SAME: () -> tensor<?xindex>
+func @const_shape() -> tensor<?xindex> {
+  // CHECK: %[[C1:.*]] = constant 1 : index
+  // CHECK: %[[C2:.*]] = constant 2 : index
+  // CHECK: %[[C3:.*]] = constant 3 : index
+  // CHECK: %[[TENSOR3:.*]] = tensor_from_elements(%[[C1]], %[[C2]], %[[C3]])
+  // CHECK: %[[RESULT:.*]] = tensor_cast %[[TENSOR3]] : tensor<3xindex> to tensor<?xindex>
+  // CHECK: return %[[RESULT]] : tensor<?xindex>
+  %shape = shape.const_shape [1, 2, 3] : tensor<?xindex>
+  return %shape : tensor<?xindex>
+}
+
+// -----
+
 // Lower `any` to its first operand.
// CHECK-LABEL: @any_of_three
// CHECK-SAME: (%[[A:.*]]: tensor<?xindex>, %[[B:.*]]: tensor<?xindex>, %[[C:.*]]: tensor<?xindex>) -> tensor<?xindex>

From bb23b5cfe0c4391576541ffe8a4f966155d2b608 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Sun, 26 Jul 2020 12:20:29 -0400
Subject: [PATCH 0313/1035] AMDGPU/GlobalISel: Merge identical select cases

---
 llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 777c8c6c2ee69..ff83e1ef404cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2846,6 +2846,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
   case TargetOpcode::G_ATOMICRMW_UMIN:
   case TargetOpcode::G_ATOMICRMW_UMAX:
   case TargetOpcode::G_ATOMICRMW_FADD:
+  case AMDGPU::G_AMDGPU_ATOMIC_INC:
+  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
     return selectG_LOAD_ATOMICRMW(I);
   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
     return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
@@ -2875,10 +2877,6 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
     return selectG_INSERT_VECTOR_ELT(I);
   case TargetOpcode::G_SHUFFLE_VECTOR:
     return selectG_SHUFFLE_VECTOR(I);
-  case AMDGPU::G_AMDGPU_ATOMIC_INC:
-  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
-    initM0(I);
-    return selectImpl(I, *CoverageInfo);
   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
     const AMDGPU::ImageDimIntrinsicInfo *Intr

From 16bcd54570328c90182edb56d4593ff19207cbc7 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Sun, 26 Jul 2020 10:52:51 -0400
Subject: [PATCH 0314/1035] AMDGPU/GlobalISel: Mark GlobalISel classes as final

---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h            | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h     | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h           | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp  | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp       | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h        | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index d11b9801b9716..4d78a4f063209 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -22,7 +22,7 @@ namespace llvm {
 class AMDGPUTargetLowering;
 class MachineInstrBuilder;
 
-class AMDGPUCallLowering: public CallLowering {
+class AMDGPUCallLowering final : public CallLowering {
   void lowerParameterPtr(Register DstReg, MachineIRBuilder &B, Type *ParamTy,
                          uint64_t Offset) const;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 1fe80958917d6..1fb64442347c9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -47,7 +47,7 @@ class SIInstrInfo;
 class SIMachineFunctionInfo;
 class SIRegisterInfo;
 
-class AMDGPUInstructionSelector : public InstructionSelector {
+class AMDGPUInstructionSelector final : public InstructionSelector {
 private:
   MachineRegisterInfo *MRI;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index d932cab4659ab..43be02c661ebe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -25,7 +25,7 @@ class LLVMContext;
class
GCNSubtarget; /// This class provides the information for the target register banks. -class AMDGPULegalizerInfo : public LegalizerInfo { +class AMDGPULegalizerInfo final : public LegalizerInfo { const GCNSubtarget &ST; public: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index 098b0e9938861..8a48ea5bd30c5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -234,7 +234,7 @@ namespace { #include "AMDGPUGenPostLegalizeGICombiner.inc" #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H -class AMDGPUPostLegalizerCombinerInfo : public CombinerInfo { +class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo { GISelKnownBits *KB; MachineDominatorTree *MDT; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index 800ad2039f0e9..aea148e910710 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -37,7 +37,7 @@ namespace { #include "AMDGPUGenPreLegalizeGICombiner.inc" #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H -class AMDGPUPreLegalizerCombinerInfo : public CombinerInfo { +class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo { GISelKnownBits *KB; MachineDominatorTree *MDT; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index 71d82679b3ff1..a99dbeebfbf8c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -39,7 +39,7 @@ namespace { #include "AMDGPUGenRegBankGICombiner.inc" #undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_H -class AMDGPURegBankCombinerInfo : public CombinerInfo { +class AMDGPURegBankCombinerInfo final : public CombinerInfo { GISelKnownBits *KB; MachineDominatorTree *MDT; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index 23a466e9bd85b..2cfb32201a6a3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -39,7 +39,7 @@ class AMDGPUGenRegisterBankInfo : public RegisterBankInfo { #define GET_TARGET_REGBANK_CLASS #include "AMDGPUGenRegisterBank.inc" }; -class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { +class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo { public: const GCNSubtarget &Subtarget; const SIRegisterInfo *TRI; From 97b5fb78d137a44bec104ba073dd620008ed7abb Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 25 Jul 2020 16:58:15 -0400 Subject: [PATCH 0315/1035] GlobalISel: Translate llvm.convert.{to|from}.fp16 intrinsics I think these were added as a workaround for SelectionDAG lacking half legalization support in the past. I think they should probably be removed from the IR, but clang does still have a target control to emit these instead of the native half fpext/fptrunc. 
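For reference, the equivalence this translation relies on, written out as native IR: `llvm.convert.to.fp16` behaves like an fptrunc to half followed by a bitcast of the result bits to i16, and `llvm.convert.from.fp16` is the reverse. A minimal sketch (the function names `@to_fp16_equiv`/`@from_fp16_equiv` are illustrative, not from this patch; the intrinsic declarations match the test below):

declare i16 @llvm.convert.to.fp16.f32(float)
declare float @llvm.convert.from.fp16.f32(i16)

; IR-level equivalent of llvm.convert.to.fp16: truncate to half,
; then return the raw half bits as an i16.
define i16 @to_fp16_equiv(float %src) {
  %h = fptrunc float %src to half
  %bits = bitcast half %h to i16
  ret i16 %bits
}

; IR-level equivalent of llvm.convert.from.fp16: reinterpret the
; i16 bits as half, then extend to float.
define float @from_fp16_equiv(i16 %bits) {
  %h = bitcast i16 %bits to half
  %ext = fpext half %h to float
  ret float %ext
}

At the MIR level the bitcasts fold away into the s16 register type, which is presumably why the translation below is just buildFPTrunc/buildFPExt plus the usual ABI extension.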
--- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 12 +++++++ .../irtranslator-convert-fp16-intrinsics.ll | 31 +++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-convert-fp16-intrinsics.ll diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 6433c13e990bf..dba341660a550 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1543,6 +1543,18 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, } return true; } + case Intrinsic::convert_from_fp16: + // FIXME: This intrinsic should probably be removed from the IR. + MIRBuilder.buildFPExt(getOrCreateVReg(CI), + getOrCreateVReg(*CI.getArgOperand(0)), + MachineInstr::copyFlagsFromInstruction(CI)); + return true; + case Intrinsic::convert_to_fp16: + // FIXME: This intrinsic should probably be removed from the IR. + MIRBuilder.buildFPTrunc(getOrCreateVReg(CI), + getOrCreateVReg(*CI.getArgOperand(0)), + MachineInstr::copyFlagsFromInstruction(CI)); + return true; case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset: diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-convert-fp16-intrinsics.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-convert-fp16-intrinsics.ll new file mode 100644 index 0000000000000..065a3d8e4dd25 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-convert-fp16-intrinsics.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -mtriple=aarch64-- -mcpu=falkor -mattr=+lse -O0 -aarch64-enable-atomic-cfg-tidy=0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - | FileCheck %s + +define i16 @convert_to_fp16(float %src) { + ; CHECK-LABEL: name: convert_to_fp16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $s0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[COPY]](s32) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %cvt = call i16 @llvm.convert.to.fp16.f32(float %src) + ret i16 %cvt +} + +define float @convert_from_fp16(i16 %src) { + ; CHECK-LABEL: name: convert_from_fp16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; CHECK: $s0 = COPY [[FPEXT]](s32) + ; CHECK: RET_ReallyLR implicit $s0 + %cvt = call float @llvm.convert.from.fp16.f32(i16 %src) + ret float %cvt +} + +declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone +declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone From 736423af53d707e097a174c3a91b75132b8dc6b1 Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Mon, 27 Jul 2020 12:34:36 -0600 Subject: [PATCH 0316/1035] [OldPM] Print out a bit more when passes lie about changing IR https://reviews.llvm.org/D84686 --- llvm/lib/IR/LegacyPassManager.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp index c01696e4e575e..96434ae3306b3 100644 --- a/llvm/lib/IR/LegacyPassManager.cpp +++ b/llvm/lib/IR/LegacyPassManager.cpp @@ -1586,9 +1586,12 @@ bool FPPassManager::runOnFunction(Function &F) { #endif LocalChanged |= FP->runOnFunction(F); -#ifdef 
EXPENSIVE_CHECKS - assert((LocalChanged || (RefHash == StructuralHash(F))) && - "Pass modifies its input and doesn't report it."); +#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG) + if (!LocalChanged && (RefHash != StructuralHash(F))) { + llvm::errs() << "Pass modifies its input and doesn't report it: " + << FP->getPassName() << "\n"; + assert(false && "Pass modifies its input and doesn't report it."); + } #endif if (EmitICRemark) { From 2ca6c422d2d025821390260232307567191a7deb Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Tue, 28 Jul 2020 09:08:08 -0700 Subject: [PATCH 0317/1035] [FunctionAttrs] Rename functionattrs -> function-attrs To match NewPM pass name, and also for readability. Also rename rpo-functionattrs -> rpo-function-attrs while we're here. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D84694 --- llvm/docs/Passes.rst | 6 +++--- llvm/lib/Passes/PassRegistry.def | 2 +- llvm/lib/Target/README.txt | 6 +++--- llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 16 +++++++++------- llvm/test/Analysis/MemorySSA/pr39197.ll | 2 +- .../TypeBasedAliasAnalysis/functionattrs.ll | 2 +- llvm/test/Analysis/alias-analysis-uses.ll | 2 +- .../check-debugify-preserves-analyses.ll | 4 ++-- .../Feature/OperandBundles/function-attrs.ll | 2 +- llvm/test/Feature/OperandBundles/pr26510.ll | 2 +- .../FunctionAttrs/2008-09-03-Mutual.ll | 2 +- .../FunctionAttrs/2008-09-03-ReadNone.ll | 2 +- .../FunctionAttrs/2008-09-03-ReadOnly.ll | 2 +- .../FunctionAttrs/2008-09-13-VolatileRead.ll | 2 +- .../FunctionAttrs/2008-12-29-Constant.ll | 2 +- .../FunctionAttrs/2009-01-02-LocalStores.ll | 2 +- .../FunctionAttrs/2010-10-30-volatile.ll | 2 +- .../Transforms/FunctionAttrs/arg_returned.ll | 2 +- llvm/test/Transforms/FunctionAttrs/assume.ll | 2 +- llvm/test/Transforms/FunctionAttrs/atomic.ll | 2 +- llvm/test/Transforms/FunctionAttrs/comdat-ipo.ll | 2 +- llvm/test/Transforms/FunctionAttrs/convergent.ll | 2 +- .../FunctionAttrs/incompatible_fn_attrs.ll | 2 +- .../Transforms/FunctionAttrs/int_sideeffect.ll | 2 +- .../Transforms/FunctionAttrs/naked_functions.ll | 2 +- llvm/test/Transforms/FunctionAttrs/nocapture.ll | 2 +- .../FunctionAttrs/nofree-attributor.ll | 2 +- llvm/test/Transforms/FunctionAttrs/nofree.ll | 2 +- .../Transforms/FunctionAttrs/nonnull-global.ll | 2 +- llvm/test/Transforms/FunctionAttrs/nonnull.ll | 2 +- llvm/test/Transforms/FunctionAttrs/norecurse.ll | 4 ++-- llvm/test/Transforms/FunctionAttrs/nounwind.ll | 2 +- .../FunctionAttrs/operand-bundles-scc.ll | 2 +- llvm/test/Transforms/FunctionAttrs/optnone.ll | 2 +- .../FunctionAttrs/out-of-bounds-iterator-bug.ll | 2 +- .../Transforms/FunctionAttrs/read-write-scc.ll | 2 +- llvm/test/Transforms/FunctionAttrs/readattrs.ll | 4 ++-- llvm/test/Transforms/FunctionAttrs/readnone.ll | 2 +- llvm/test/Transforms/FunctionAttrs/returned.ll | 2 +- llvm/test/Transforms/FunctionAttrs/writeonly.ll | 2 +- .../Transforms/GlobalDCE/crash-assertingvh.ll | 2 +- llvm/test/Transforms/IndVarSimplify/pr38855.ll | 2 +- llvm/test/Transforms/Inline/delete-call.ll | 2 +- .../InstCombine/2009-02-11-NotInitialized.ll | 2 +- .../2012-04-30-LoopUnswitch-LPad-Crash.ll | 2 +- .../Reassociate/reassociate-deadinst.ll | 2 +- .../2012-04-30-LoopUnswitch-LPad-Crash.ll | 2 +- 47 files changed, 62 insertions(+), 60 deletions(-) diff --git a/llvm/docs/Passes.rst b/llvm/docs/Passes.rst index 9a6c6944b96ef..e45adad98c157 100644 --- a/llvm/docs/Passes.rst +++ b/llvm/docs/Passes.rst @@ -522,9 +522,9 @@ instructions that are obviously dead. 
A trivial dead store elimination that only considers basic-block local redundant stores. -.. _passes-functionattrs: +.. _passes-function-attrs: -``-functionattrs``: Deduce function attributes +``-function-attrs``: Deduce function attributes ---------------------------------------------- A simple interprocedural pass which walks the call-graph, looking for functions @@ -651,7 +651,7 @@ This pass can also simplify calls to specific well-known function calls (e.g. runtime library functions). For example, a call ``exit(3)`` that occurs within the ``main()`` function can be transformed into simply ``return 3``. Whether or not library calls are simplified is controlled by the -:ref:`-functionattrs ` pass and LLVM's knowledge of +:ref:`-function-attrs ` pass and LLVM's knowledge of library calls on different targets. .. _passes-aggressive-instcombine: diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index edaca9ebf6090..11154686fa77a 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -81,7 +81,7 @@ MODULE_PASS("print-lcg-dot", LazyCallGraphDOTPrinterPass(dbgs())) MODULE_PASS("print-stack-safety", StackSafetyGlobalPrinterPass(dbgs())) MODULE_PASS("rewrite-statepoints-for-gc", RewriteStatepointsForGC()) MODULE_PASS("rewrite-symbols", RewriteSymbolPass()) -MODULE_PASS("rpo-functionattrs", ReversePostOrderFunctionAttrsPass()) +MODULE_PASS("rpo-function-attrs", ReversePostOrderFunctionAttrsPass()) MODULE_PASS("sample-profile", SampleProfileLoaderPass()) MODULE_PASS("scc-oz-module-inliner", buildInlinerPipeline(OptimizationLevel::Oz, ThinLTOPhase::None, DebugLogging)) diff --git a/llvm/lib/Target/README.txt b/llvm/lib/Target/README.txt index 563aee9e1a780..a4876f715c64b 100644 --- a/llvm/lib/Target/README.txt +++ b/llvm/lib/Target/README.txt @@ -1541,9 +1541,9 @@ int bar() { return foo("abcd"); } //===---------------------------------------------------------------------===// -functionattrs doesn't know much about memcpy/memset. This function should be +function-attrs doesn't know much about memcpy/memset. 
This function should be marked readnone rather than readonly, since it only twiddles local memory, but -functionattrs doesn't handle memset/memcpy/memmove aggressively: +function-attrs doesn't handle memset/memcpy/memmove aggressively: struct X { int *p; int *q; }; int foo() { @@ -1557,7 +1557,7 @@ int foo() { } This can be seen at: -$ clang t.c -S -o - -mkernel -O0 -emit-llvm | opt -functionattrs -S +$ clang t.c -S -o - -mkernel -O0 -emit-llvm | opt -function-attrs -S //===---------------------------------------------------------------------===// diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 4baeaa6e16304..5f70f8eaebb65 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -63,7 +63,7 @@ using namespace llvm; -#define DEBUG_TYPE "functionattrs" +#define DEBUG_TYPE "function-attrs" STATISTIC(NumReadNone, "Number of functions marked readnone"); STATISTIC(NumReadOnly, "Number of functions marked readonly"); @@ -1477,11 +1477,11 @@ struct PostOrderFunctionAttrsLegacyPass : public CallGraphSCCPass { } // end anonymous namespace char PostOrderFunctionAttrsLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN(PostOrderFunctionAttrsLegacyPass, "functionattrs", +INITIALIZE_PASS_BEGIN(PostOrderFunctionAttrsLegacyPass, "function-attrs", "Deduce function attributes", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_END(PostOrderFunctionAttrsLegacyPass, "functionattrs", +INITIALIZE_PASS_END(PostOrderFunctionAttrsLegacyPass, "function-attrs", "Deduce function attributes", false, false) Pass *llvm::createPostOrderFunctionAttrsLegacyPass() { @@ -1542,11 +1542,13 @@ struct ReversePostOrderFunctionAttrsLegacyPass : public ModulePass { char ReversePostOrderFunctionAttrsLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN(ReversePostOrderFunctionAttrsLegacyPass, "rpo-functionattrs", - "Deduce function attributes in RPO", false, false) +INITIALIZE_PASS_BEGIN(ReversePostOrderFunctionAttrsLegacyPass, + "rpo-function-attrs", "Deduce function attributes in RPO", + false, false) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_END(ReversePostOrderFunctionAttrsLegacyPass, "rpo-functionattrs", - "Deduce function attributes in RPO", false, false) +INITIALIZE_PASS_END(ReversePostOrderFunctionAttrsLegacyPass, + "rpo-function-attrs", "Deduce function attributes in RPO", + false, false) Pass *llvm::createReversePostOrderFunctionAttrsPass() { return new ReversePostOrderFunctionAttrsLegacyPass(); diff --git a/llvm/test/Analysis/MemorySSA/pr39197.ll b/llvm/test/Analysis/MemorySSA/pr39197.ll index 16a321a8108b2..717d92471406a 100644 --- a/llvm/test/Analysis/MemorySSA/pr39197.ll +++ b/llvm/test/Analysis/MemorySSA/pr39197.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=s390x-linux-gnu -mcpu=z13 -enable-mssa-loop-dependency -verify-memoryssa -sroa -globalopt -functionattrs -simplifycfg -licm -loop-unswitch %s -S | FileCheck %s +; RUN: opt -mtriple=s390x-linux-gnu -mcpu=z13 -enable-mssa-loop-dependency -verify-memoryssa -sroa -globalopt -function-attrs -simplifycfg -licm -loop-unswitch %s -S | FileCheck %s ; REQUIRES: asserts target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" diff --git a/llvm/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll b/llvm/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll index ba893fedee269..fca330a1029f3 100644 --- a/llvm/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll +++ 
b/llvm/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -tbaa -basic-aa -functionattrs -S | FileCheck %s +; RUN: opt < %s -tbaa -basic-aa -function-attrs -S | FileCheck %s ; FunctionAttrs should make use of TBAA. diff --git a/llvm/test/Analysis/alias-analysis-uses.ll b/llvm/test/Analysis/alias-analysis-uses.ll index 4163ec25584c6..8f13148b20117 100644 --- a/llvm/test/Analysis/alias-analysis-uses.ll +++ b/llvm/test/Analysis/alias-analysis-uses.ll @@ -1,4 +1,4 @@ -; RUN: opt -debug-pass=Executions -globals-aa -functionattrs -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt -debug-pass=Executions -globals-aa -function-attrs -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Executing Pass 'Globals Alias Analysis' ; CHECK-NOT: Freeing Pass 'Globals Alias Analysis' diff --git a/llvm/test/DebugInfo/check-debugify-preserves-analyses.ll b/llvm/test/DebugInfo/check-debugify-preserves-analyses.ll index 08195da75e853..74f564356d2dd 100644 --- a/llvm/test/DebugInfo/check-debugify-preserves-analyses.ll +++ b/llvm/test/DebugInfo/check-debugify-preserves-analyses.ll @@ -1,7 +1,7 @@ -; RUN: opt < %s -globals-aa -functionattrs | \ +; RUN: opt < %s -globals-aa -function-attrs | \ ; RUN: opt -S -strip -strip-dead-prototypes -strip-named-metadata > %t.no_dbg -; RUN: opt < %s -debugify-each -globals-aa -functionattrs | \ +; RUN: opt < %s -debugify-each -globals-aa -function-attrs | \ ; RUN: opt -S -strip -strip-dead-prototypes -strip-named-metadata > %t.with_dbg ; RUN: diff %t.no_dbg %t.with_dbg diff --git a/llvm/test/Feature/OperandBundles/function-attrs.ll b/llvm/test/Feature/OperandBundles/function-attrs.ll index 6e1b255039928..94f21bdffb244 100644 --- a/llvm/test/Feature/OperandBundles/function-attrs.ll +++ b/llvm/test/Feature/OperandBundles/function-attrs.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -functionattrs < %s | FileCheck %s +; RUN: opt -S -function-attrs < %s | FileCheck %s declare void @f_readonly() readonly declare void @f_readnone() readnone diff --git a/llvm/test/Feature/OperandBundles/pr26510.ll b/llvm/test/Feature/OperandBundles/pr26510.ll index 08bd92aa6fa30..1877f35a40067 100644 --- a/llvm/test/Feature/OperandBundles/pr26510.ll +++ b/llvm/test/Feature/OperandBundles/pr26510.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -globals-aa -functionattrs < %s | FileCheck %s +; RUN: opt -S -globals-aa -function-attrs < %s | FileCheck %s ; RUN: opt -S -O3 < %s | FileCheck %s ; Apart from checking for the direct cause of the bug, we also check diff --git a/llvm/test/Transforms/FunctionAttrs/2008-09-03-Mutual.ll b/llvm/test/Transforms/FunctionAttrs/2008-09-03-Mutual.ll index 6bbd99951adfc..1d75248f41f8b 100644 --- a/llvm/test/Transforms/FunctionAttrs/2008-09-03-Mutual.ll +++ b/llvm/test/Transforms/FunctionAttrs/2008-09-03-Mutual.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -functionattrs -S | FileCheck %s +; RUN: opt < %s -function-attrs -S | FileCheck %s ; RUN: opt < %s -passes=function-attrs -S | FileCheck %s ; CHECK: Function Attrs diff --git a/llvm/test/Transforms/FunctionAttrs/2008-09-03-ReadNone.ll b/llvm/test/Transforms/FunctionAttrs/2008-09-03-ReadNone.ll index b3035a67abaca..51a9a04eff366 100644 --- a/llvm/test/Transforms/FunctionAttrs/2008-09-03-ReadNone.ll +++ b/llvm/test/Transforms/FunctionAttrs/2008-09-03-ReadNone.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -basic-aa -functionattrs -S | FileCheck %s +; RUN: opt < %s -basic-aa -function-attrs -S | FileCheck %s ; RUN: opt < %s -aa-pipeline=basic-aa -passes=function-attrs -S | FileCheck %s @x = global i32 0 diff --git 
a/llvm/test/Transforms/FunctionAttrs/2008-09-03-ReadOnly.ll b/llvm/test/Transforms/FunctionAttrs/2008-09-03-ReadOnly.ll index 1df26459e89f4..4ea6fdc87dfa3 100644 --- a/llvm/test/Transforms/FunctionAttrs/2008-09-03-ReadOnly.ll +++ b/llvm/test/Transforms/FunctionAttrs/2008-09-03-ReadOnly.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -basic-aa -functionattrs -S | FileCheck %s +; RUN: opt < %s -basic-aa -function-attrs -S | FileCheck %s ; RUN: opt < %s -aa-pipeline=basic-aa -passes=function-attrs -S | FileCheck %s ; CHECK: define i32 @f() #0 diff --git a/llvm/test/Transforms/FunctionAttrs/2008-09-13-VolatileRead.ll b/llvm/test/Transforms/FunctionAttrs/2008-09-13-VolatileRead.ll index 8212e8945ec6f..a296c78570f84 100644 --- a/llvm/test/Transforms/FunctionAttrs/2008-09-13-VolatileRead.ll +++ b/llvm/test/Transforms/FunctionAttrs/2008-09-13-VolatileRead.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -functionattrs -S | FileCheck %s +; RUN: opt < %s -function-attrs -S | FileCheck %s ; RUN: opt < %s -passes=function-attrs -S | FileCheck %s ; PR2792 diff --git a/llvm/test/Transforms/FunctionAttrs/2008-12-29-Constant.ll b/llvm/test/Transforms/FunctionAttrs/2008-12-29-Constant.ll index 69ff6cc0a6eb0..d814666b1b64e 100644 --- a/llvm/test/Transforms/FunctionAttrs/2008-12-29-Constant.ll +++ b/llvm/test/Transforms/FunctionAttrs/2008-12-29-Constant.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -basic-aa -functionattrs -S | FileCheck %s +; RUN: opt < %s -basic-aa -function-attrs -S | FileCheck %s ; RUN: opt < %s -aa-pipeline=basic-aa -passes=function-attrs -S | FileCheck %s @s = external constant i8 ; [#uses=1] diff --git a/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll b/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll index ce72c41656333..435f7810fbde6 100644 --- a/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll +++ b/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -functionattrs -S | FileCheck %s +; RUN: opt < %s -function-attrs -S | FileCheck %s ; RUN: opt < %s -passes=function-attrs -S | FileCheck %s ; CHECK: define i32* @a(i32** nocapture readonly %p) diff --git a/llvm/test/Transforms/FunctionAttrs/2010-10-30-volatile.ll b/llvm/test/Transforms/FunctionAttrs/2010-10-30-volatile.ll index b9536dce8a489..c5361b995482a 100644 --- a/llvm/test/Transforms/FunctionAttrs/2010-10-30-volatile.ll +++ b/llvm/test/Transforms/FunctionAttrs/2010-10-30-volatile.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -functionattrs -S | FileCheck %s +; RUN: opt < %s -function-attrs -S | FileCheck %s ; RUN: opt < %s -passes=function-attrs -S | FileCheck %s ; PR8279 diff --git a/llvm/test/Transforms/FunctionAttrs/arg_returned.ll b/llvm/test/Transforms/FunctionAttrs/arg_returned.ll index 0adf91cd9aa1d..9ee90f9719e79 100644 --- a/llvm/test/Transforms/FunctionAttrs/arg_returned.ll +++ b/llvm/test/Transforms/FunctionAttrs/arg_returned.ll @@ -1,4 +1,4 @@ -; RUN: opt -functionattrs -S < %s | FileCheck %s --check-prefix=FNATTR +; RUN: opt -function-attrs -S < %s | FileCheck %s --check-prefix=FNATTR ; ; Test cases specifically designed for the "returned" argument attribute. ; We use FIXME's to indicate problems and missing attributes. 
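As a quick illustration of the attribute the tests above exercise (a hand-written sketch, not taken from arg_returned.ll): `returned` asserts that the function always returns the annotated argument, so callers may reuse the incoming value in place of the call's result.

; Marking %p `returned` lets optimizers forward %p past the call.
define i8* @identity(i8* returned %p) {
  ret i8* %p
}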
diff --git a/llvm/test/Transforms/FunctionAttrs/assume.ll b/llvm/test/Transforms/FunctionAttrs/assume.ll index d6296624a2d2f..d94c3705a88e7 100644 --- a/llvm/test/Transforms/FunctionAttrs/assume.ll +++ b/llvm/test/Transforms/FunctionAttrs/assume.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -o - -functionattrs %s | FileCheck %s +; RUN: opt -S -o - -function-attrs %s | FileCheck %s ; RUN: opt -S -o - -passes=function-attrs %s | FileCheck %s ; CHECK-NOT: readnone diff --git a/llvm/test/Transforms/FunctionAttrs/atomic.ll b/llvm/test/Transforms/FunctionAttrs/atomic.ll index a0a08890cb65a..8112996404a54 100644 --- a/llvm/test/Transforms/FunctionAttrs/atomic.ll +++ b/llvm/test/Transforms/FunctionAttrs/atomic.ll @@ -1,4 +1,4 @@ -; RUN: opt -basic-aa -functionattrs -S < %s | FileCheck %s +; RUN: opt -basic-aa -function-attrs -S < %s | FileCheck %s ; RUN: opt -aa-pipeline=basic-aa -passes=function-attrs -S < %s | FileCheck %s ; Atomic load/store to local doesn't affect whether a function is diff --git a/llvm/test/Transforms/FunctionAttrs/comdat-ipo.ll b/llvm/test/Transforms/FunctionAttrs/comdat-ipo.ll index d2f194facba8b..4ad080bff425f 100644 --- a/llvm/test/Transforms/FunctionAttrs/comdat-ipo.ll +++ b/llvm/test/Transforms/FunctionAttrs/comdat-ipo.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -functionattrs -S | FileCheck %s +; RUN: opt < %s -function-attrs -S | FileCheck %s ; RUN: opt < %s -passes=function-attrs -S | FileCheck %s ; See PR26774 diff --git a/llvm/test/Transforms/FunctionAttrs/convergent.ll b/llvm/test/Transforms/FunctionAttrs/convergent.ll index 0e4b7515d0187..8b764f502307f 100644 --- a/llvm/test/Transforms/FunctionAttrs/convergent.ll +++ b/llvm/test/Transforms/FunctionAttrs/convergent.ll @@ -1,7 +1,7 @@ ; FIXME: convert CHECK-INDIRECT into CHECK (and remove -check-prefixes) as soon ; FIXME: as new-pass-manager's handling of indirect_non_convergent_call is fixed ; -; RUN: opt -functionattrs -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INDIRECT +; RUN: opt -function-attrs -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INDIRECT ; RUN: opt -passes=function-attrs -S < %s | FileCheck %s ; CHECK: Function Attrs diff --git a/llvm/test/Transforms/FunctionAttrs/incompatible_fn_attrs.ll b/llvm/test/Transforms/FunctionAttrs/incompatible_fn_attrs.ll index 79af817ff0351..906ae01422c1f 100644 --- a/llvm/test/Transforms/FunctionAttrs/incompatible_fn_attrs.ll +++ b/llvm/test/Transforms/FunctionAttrs/incompatible_fn_attrs.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -o - -functionattrs %s | FileCheck %s +; RUN: opt -S -o - -function-attrs %s | FileCheck %s ; RUN: opt -S -o - -passes=function-attrs %s | FileCheck %s ; Verify we remove argmemonly/inaccessiblememonly/inaccessiblemem_or_argmemonly diff --git a/llvm/test/Transforms/FunctionAttrs/int_sideeffect.ll b/llvm/test/Transforms/FunctionAttrs/int_sideeffect.ll index 24a145908e00c..4c33116b57c98 100644 --- a/llvm/test/Transforms/FunctionAttrs/int_sideeffect.ll +++ b/llvm/test/Transforms/FunctionAttrs/int_sideeffect.ll @@ -1,4 +1,4 @@ -; RUN: opt -S < %s -functionattrs | FileCheck %s +; RUN: opt -S < %s -function-attrs | FileCheck %s ; RUN: opt -S < %s -passes=function-attrs | FileCheck %s ; CHECK: Function Attrs diff --git a/llvm/test/Transforms/FunctionAttrs/naked_functions.ll b/llvm/test/Transforms/FunctionAttrs/naked_functions.ll index c4996d4e7e909..20df047ce0f46 100644 --- a/llvm/test/Transforms/FunctionAttrs/naked_functions.ll +++ b/llvm/test/Transforms/FunctionAttrs/naked_functions.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -functionattrs %s | FileCheck %s +; RUN: opt -S 
-function-attrs %s | FileCheck %s ; RUN: opt -S -passes='function-attrs' %s | FileCheck %s ; Don't change the attributes of parameters of naked functions, in particular diff --git a/llvm/test/Transforms/FunctionAttrs/nocapture.ll b/llvm/test/Transforms/FunctionAttrs/nocapture.ll index ba43f9637b885..370a74d469600 100644 --- a/llvm/test/Transforms/FunctionAttrs/nocapture.ll +++ b/llvm/test/Transforms/FunctionAttrs/nocapture.ll @@ -1,4 +1,4 @@ -; RUN: opt -functionattrs -S < %s | FileCheck %s --check-prefixes=FNATTR +; RUN: opt -function-attrs -S < %s | FileCheck %s --check-prefixes=FNATTR ; RUN: opt -passes=function-attrs -S < %s | FileCheck %s --check-prefixes=FNATTR @g = global i32* null ; [#uses=1] diff --git a/llvm/test/Transforms/FunctionAttrs/nofree-attributor.ll b/llvm/test/Transforms/FunctionAttrs/nofree-attributor.ll index 8ac037e5cd8dc..ef9d086f8f17a 100644 --- a/llvm/test/Transforms/FunctionAttrs/nofree-attributor.ll +++ b/llvm/test/Transforms/FunctionAttrs/nofree-attributor.ll @@ -1,4 +1,4 @@ -; RUN: opt -functionattrs --disable-nofree-inference=false -S < %s | FileCheck %s --check-prefix=FNATTR +; RUN: opt -function-attrs --disable-nofree-inference=false -S < %s | FileCheck %s --check-prefix=FNATTR target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/FunctionAttrs/nofree.ll b/llvm/test/Transforms/FunctionAttrs/nofree.ll index e72ff2f532543..4d36cc82bae2d 100644 --- a/llvm/test/Transforms/FunctionAttrs/nofree.ll +++ b/llvm/test/Transforms/FunctionAttrs/nofree.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -functionattrs -S | FileCheck %s +; RUN: opt < %s -function-attrs -S | FileCheck %s ; RUN: opt < %s -passes=function-attrs -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/FunctionAttrs/nonnull-global.ll b/llvm/test/Transforms/FunctionAttrs/nonnull-global.ll index d79a7ae290a97..e0c8ae465935d 100644 --- a/llvm/test/Transforms/FunctionAttrs/nonnull-global.ll +++ b/llvm/test/Transforms/FunctionAttrs/nonnull-global.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -functionattrs %s | FileCheck %s +; RUN: opt -S -function-attrs %s | FileCheck %s ; RUN: opt -S -passes=function-attrs %s | FileCheck %s @a = external global i8, !absolute_symbol !0 diff --git a/llvm/test/Transforms/FunctionAttrs/nonnull.ll b/llvm/test/Transforms/FunctionAttrs/nonnull.ll index a71d656882041..9e39580229950 100644 --- a/llvm/test/Transforms/FunctionAttrs/nonnull.ll +++ b/llvm/test/Transforms/FunctionAttrs/nonnull.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -functionattrs -enable-nonnull-arg-prop %s | FileCheck %s --check-prefixes=FNATTR +; RUN: opt -S -function-attrs -enable-nonnull-arg-prop %s | FileCheck %s --check-prefixes=FNATTR ; RUN: opt -S -passes=function-attrs -enable-nonnull-arg-prop %s | FileCheck %s --check-prefixes=FNATTR target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/FunctionAttrs/norecurse.ll b/llvm/test/Transforms/FunctionAttrs/norecurse.ll index ad03694c32850..cc48dda663c5e 100644 --- a/llvm/test/Transforms/FunctionAttrs/norecurse.ll +++ b/llvm/test/Transforms/FunctionAttrs/norecurse.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -basic-aa -functionattrs -rpo-functionattrs -S | FileCheck %s -; RUN: opt < %s -aa-pipeline=basic-aa -passes='cgscc(function-attrs),rpo-functionattrs' -S | FileCheck %s +; RUN: opt < %s -basic-aa -function-attrs -rpo-function-attrs -S | FileCheck %s +; RUN: opt < %s -aa-pipeline=basic-aa 
-passes='cgscc(function-attrs),rpo-function-attrs' -S | FileCheck %s ; CHECK: Function Attrs ; CHECK-SAME: norecurse nounwind readnone diff --git a/llvm/test/Transforms/FunctionAttrs/nounwind.ll b/llvm/test/Transforms/FunctionAttrs/nounwind.ll index 6d5e3a2ea5b24..57518f4870cc6 100644 --- a/llvm/test/Transforms/FunctionAttrs/nounwind.ll +++ b/llvm/test/Transforms/FunctionAttrs/nounwind.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -functionattrs -S | FileCheck %s +; RUN: opt < %s -function-attrs -S | FileCheck %s ; TEST 1 ; CHECK: Function Attrs: norecurse nounwind readnone diff --git a/llvm/test/Transforms/FunctionAttrs/operand-bundles-scc.ll b/llvm/test/Transforms/FunctionAttrs/operand-bundles-scc.ll index 4ad195ea2b5ff..78cfd03d5ac5c 100644 --- a/llvm/test/Transforms/FunctionAttrs/operand-bundles-scc.ll +++ b/llvm/test/Transforms/FunctionAttrs/operand-bundles-scc.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -functionattrs < %s | FileCheck %s +; RUN: opt -S -function-attrs < %s | FileCheck %s ; RUN: opt -S -passes=function-attrs < %s | FileCheck %s define void @f() { diff --git a/llvm/test/Transforms/FunctionAttrs/optnone.ll b/llvm/test/Transforms/FunctionAttrs/optnone.ll index 586a6d4a081f9..b7e9ea3636c3b 100644 --- a/llvm/test/Transforms/FunctionAttrs/optnone.ll +++ b/llvm/test/Transforms/FunctionAttrs/optnone.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -functionattrs -S | FileCheck %s +; RUN: opt < %s -function-attrs -S | FileCheck %s ; RUN: opt < %s -passes=function-attrs -S | FileCheck %s @x = global i32 0 diff --git a/llvm/test/Transforms/FunctionAttrs/out-of-bounds-iterator-bug.ll b/llvm/test/Transforms/FunctionAttrs/out-of-bounds-iterator-bug.ll index f2294fe22ef48..c87e591898952 100644 --- a/llvm/test/Transforms/FunctionAttrs/out-of-bounds-iterator-bug.ll +++ b/llvm/test/Transforms/FunctionAttrs/out-of-bounds-iterator-bug.ll @@ -1,4 +1,4 @@ -; RUN: opt -functionattrs -S < %s | FileCheck %s +; RUN: opt -function-attrs -S < %s | FileCheck %s ; RUN: opt -passes=function-attrs -S < %s | FileCheck %s ; This checks for an iterator wraparound bug in FunctionAttrs. The previous diff --git a/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll b/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll index 319aaf0136233..968a290b988c5 100644 --- a/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll +++ b/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -functionattrs < %s | FileCheck %s +; RUN: opt -S -function-attrs < %s | FileCheck %s ; RUN: opt -S -passes=function-attrs < %s | FileCheck %s @i = global i32 0 diff --git a/llvm/test/Transforms/FunctionAttrs/readattrs.ll b/llvm/test/Transforms/FunctionAttrs/readattrs.ll index e566c96d42b00..ae34219bd011e 100644 --- a/llvm/test/Transforms/FunctionAttrs/readattrs.ll +++ b/llvm/test/Transforms/FunctionAttrs/readattrs.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -functionattrs -S | FileCheck %s +; RUN: opt < %s -function-attrs -S | FileCheck %s ; RUN: opt < %s -aa-pipeline=basic-aa -passes='cgscc(function-attrs)' -S | FileCheck %s @x = global i32 0 @@ -130,7 +130,7 @@ declare void @escape_readonly_ptr(i8** %addr, i8* readonly %ptr) ; is marked as readnone/only. However, the functions can write the pointer into ; %addr, causing the store to write to %escaped_then_written. ; -; FIXME: This test currently exposes a bug in functionattrs! +; FIXME: This test currently exposes a bug in function-attrs! 
; ; CHECK: define void @unsound_readnone(i8* nocapture readnone %ignored, i8* readnone %escaped_then_written) ; CHECK: define void @unsound_readonly(i8* nocapture readnone %ignored, i8* readonly %escaped_then_written) diff --git a/llvm/test/Transforms/FunctionAttrs/readnone.ll b/llvm/test/Transforms/FunctionAttrs/readnone.ll index b18aab539b62b..aa074b3da11fa 100644 --- a/llvm/test/Transforms/FunctionAttrs/readnone.ll +++ b/llvm/test/Transforms/FunctionAttrs/readnone.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -functionattrs -S | FileCheck %s +; RUN: opt < %s -function-attrs -S | FileCheck %s ; RUN: opt < %s -passes=function-attrs -S | FileCheck %s ; CHECK: define void @bar(i8* nocapture readnone %0) diff --git a/llvm/test/Transforms/FunctionAttrs/returned.ll b/llvm/test/Transforms/FunctionAttrs/returned.ll index 04ddb7b5ac03a..451b95074b630 100644 --- a/llvm/test/Transforms/FunctionAttrs/returned.ll +++ b/llvm/test/Transforms/FunctionAttrs/returned.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -functionattrs -S | FileCheck %s +; RUN: opt < %s -function-attrs -S | FileCheck %s ; RUN: opt < %s -passes=function-attrs -S | FileCheck %s ; CHECK: define i32 @test1(i32 %p, i32 %q) diff --git a/llvm/test/Transforms/FunctionAttrs/writeonly.ll b/llvm/test/Transforms/FunctionAttrs/writeonly.ll index 6514cd9d10641..9be998787466a 100644 --- a/llvm/test/Transforms/FunctionAttrs/writeonly.ll +++ b/llvm/test/Transforms/FunctionAttrs/writeonly.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -functionattrs -S | FileCheck %s +; RUN: opt < %s -function-attrs -S | FileCheck %s ; RUN: opt < %s -passes=function-attrs -S | FileCheck %s ; CHECK: define void @nouses-argworn-funrn(i32* nocapture readnone %.aaa) #0 { diff --git a/llvm/test/Transforms/GlobalDCE/crash-assertingvh.ll b/llvm/test/Transforms/GlobalDCE/crash-assertingvh.ll index 2919999d5e288..08230ef55fa73 100644 --- a/llvm/test/Transforms/GlobalDCE/crash-assertingvh.ll +++ b/llvm/test/Transforms/GlobalDCE/crash-assertingvh.ll @@ -3,7 +3,7 @@ ; to assert when global DCE deletes the body of the function. ; ; RUN: opt -disable-output < %s -passes='module(function(jump-threading),globaldce)' -; RUN: opt -disable-output < %s -passes='module(rpo-functionattrs,globaldce)' +; RUN: opt -disable-output < %s -passes='module(rpo-function-attrs,globaldce)' target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/IndVarSimplify/pr38855.ll b/llvm/test/Transforms/IndVarSimplify/pr38855.ll index 67887f5146c85..4088584f6f0af 100644 --- a/llvm/test/Transforms/IndVarSimplify/pr38855.ll +++ b/llvm/test/Transforms/IndVarSimplify/pr38855.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -disable-nounwind-inference=false -inline -functionattrs -indvars < %s | FileCheck %s +; RUN: opt -S -disable-nounwind-inference=false -inline -function-attrs -indvars < %s | FileCheck %s ; Check that the invalidation happens correctly and the test does not crash. 
define void @f2() { diff --git a/llvm/test/Transforms/Inline/delete-call.ll b/llvm/test/Transforms/Inline/delete-call.ll index 7f30ffb306b41..9d6140a3f78e5 100644 --- a/llvm/test/Transforms/Inline/delete-call.ll +++ b/llvm/test/Transforms/Inline/delete-call.ll @@ -2,7 +2,7 @@ ; RUN: opt -S -inline -stats < %s 2>&1 | FileCheck %s ; CHECK: Number of functions inlined -; RUN: opt -S -inline -functionattrs -stats < %s 2>&1 | FileCheck -check-prefix=CHECK-FUNCTIONATTRS %s +; RUN: opt -S -inline -function-attrs -stats < %s 2>&1 | FileCheck -check-prefix=CHECK-FUNCTIONATTRS %s ; CHECK-FUNCTIONATTRS: Number of call sites deleted, not inlined target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" diff --git a/llvm/test/Transforms/InstCombine/2009-02-11-NotInitialized.ll b/llvm/test/Transforms/InstCombine/2009-02-11-NotInitialized.ll index b66495d9cbaa3..ceffb8941de86 100644 --- a/llvm/test/Transforms/InstCombine/2009-02-11-NotInitialized.ll +++ b/llvm/test/Transforms/InstCombine/2009-02-11-NotInitialized.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -inline -instcombine -functionattrs | llvm-dis +; RUN: opt < %s -inline -instcombine -function-attrs | llvm-dis ; ; Check that nocapture attributes are added when run after an SCC pass. ; PR3520 diff --git a/llvm/test/Transforms/LoopUnswitch/2012-04-30-LoopUnswitch-LPad-Crash.ll b/llvm/test/Transforms/LoopUnswitch/2012-04-30-LoopUnswitch-LPad-Crash.ll index 67742fcab46ae..14b6484618f17 100644 --- a/llvm/test/Transforms/LoopUnswitch/2012-04-30-LoopUnswitch-LPad-Crash.ll +++ b/llvm/test/Transforms/LoopUnswitch/2012-04-30-LoopUnswitch-LPad-Crash.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -basic-aa -instcombine -inline -functionattrs -licm -loop-unswitch -gvn -verify +; RUN: opt < %s -basic-aa -instcombine -inline -function-attrs -licm -loop-unswitch -gvn -verify ; PR12573 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.7.0" diff --git a/llvm/test/Transforms/Reassociate/reassociate-deadinst.ll b/llvm/test/Transforms/Reassociate/reassociate-deadinst.ll index df314d571d37a..9266fce48e8ff 100644 --- a/llvm/test/Transforms/Reassociate/reassociate-deadinst.ll +++ b/llvm/test/Transforms/Reassociate/reassociate-deadinst.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -inline -functionattrs -reassociate -S | FileCheck %s +; RUN: opt < %s -inline -function-attrs -reassociate -S | FileCheck %s ; CHECK-NOT: func1 ; CHECK-LABEL: main diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/2012-04-30-LoopUnswitch-LPad-Crash.ll b/llvm/test/Transforms/SimpleLoopUnswitch/2012-04-30-LoopUnswitch-LPad-Crash.ll index c9837c5cc141d..66176ed717532 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/2012-04-30-LoopUnswitch-LPad-Crash.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/2012-04-30-LoopUnswitch-LPad-Crash.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -basic-aa -instcombine -inline -functionattrs -licm -simple-loop-unswitch -gvn -verify +; RUN: opt < %s -basic-aa -instcombine -inline -function-attrs -licm -simple-loop-unswitch -gvn -verify ; PR12573 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.7.0" From 9b1539be406da714ab7cbb379575f67d82b874b2 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Mon, 
27 Jul 2020 11:31:34 -0700 Subject: [PATCH 0318/1035] [NewPM][Sancov] Pin RUN lines with -sancov to legacy PM Since the NPM pass is named sancov-module, not sancov. This makes all tests under Instrumentation/SanitizerCoverage pass when -enable-new-pm is on by default. Reviewed By: vitalybuka Differential Revision: https://reviews.llvm.org/D84687 --- .../SanitizerCoverage/abort-in-entry-block.ll | 2 +- .../Instrumentation/SanitizerCoverage/backedge-pruning.ll | 4 ++-- llvm/test/Instrumentation/SanitizerCoverage/chains.ll | 2 +- .../SanitizerCoverage/cmp-tracing-api-x86_32.ll | 2 +- .../SanitizerCoverage/cmp-tracing-api-x86_64.ll | 2 +- llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing.ll | 2 +- llvm/test/Instrumentation/SanitizerCoverage/coff-comdat.ll | 2 +- .../SanitizerCoverage/coff-pc-table-inline-8bit-counters.ll | 2 +- .../SanitizerCoverage/coff-pc-table-inline-bool-flag.ll | 2 +- .../Instrumentation/SanitizerCoverage/coff-used-ctor.ll | 2 +- .../Instrumentation/SanitizerCoverage/const-cmp-tracing.ll | 2 +- llvm/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll | 2 +- llvm/test/Instrumentation/SanitizerCoverage/coverage.ll | 4 ++-- .../test/Instrumentation/SanitizerCoverage/coverage2-dbg.ll | 2 +- llvm/test/Instrumentation/SanitizerCoverage/div-tracing.ll | 2 +- llvm/test/Instrumentation/SanitizerCoverage/gep-tracing.ll | 2 +- .../SanitizerCoverage/inline-8bit-counters.ll | 2 +- .../Instrumentation/SanitizerCoverage/inline-bool-flag.ll | 2 +- .../SanitizerCoverage/interposable-symbol-nocomdat.ll | 4 ++-- llvm/test/Instrumentation/SanitizerCoverage/no-func.ll | 2 +- llvm/test/Instrumentation/SanitizerCoverage/pc-table.ll | 6 +++--- .../SanitizerCoverage/postdominator_check.ll | 4 ++-- llvm/test/Instrumentation/SanitizerCoverage/seh.ll | 6 +++--- .../stack-depth-variable-declared-by-user.ll | 2 +- llvm/test/Instrumentation/SanitizerCoverage/stack-depth.ll | 4 ++-- .../Instrumentation/SanitizerCoverage/switch-tracing.ll | 2 +- .../SanitizerCoverage/trace-pc-guard-comdat.ll | 2 +- .../trace-pc-guard-inline-8bit-counters.ll | 2 +- .../SanitizerCoverage/trace-pc-guard-inline-bool-flag.ll | 2 +- .../SanitizerCoverage/trace-pc-guard-nocomdat.ll | 2 +- .../Instrumentation/SanitizerCoverage/tracing-comdat.ll | 4 ++-- llvm/test/Instrumentation/SanitizerCoverage/tracing.ll | 6 +++--- .../SanitizerCoverage/unreachable-critedge.ll | 2 +- llvm/test/Instrumentation/SanitizerCoverage/wineh.ll | 2 +- 34 files changed, 46 insertions(+), 46 deletions(-) diff --git a/llvm/test/Instrumentation/SanitizerCoverage/abort-in-entry-block.ll b/llvm/test/Instrumentation/SanitizerCoverage/abort-in-entry-block.ll index 1a44e9b76f553..79a46d09e447f 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/abort-in-entry-block.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/abort-in-entry-block.ll @@ -1,5 +1,5 @@ ; Checks that a function with no-return in the entry block is not instrumented. 
-; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -S | FileCheck %s ; CHECK-NOT: call void @__sanitizer_cov_trace_pc_guard diff --git a/llvm/test/Instrumentation/SanitizerCoverage/backedge-pruning.ll b/llvm/test/Instrumentation/SanitizerCoverage/backedge-pruning.ll index c3a13b090dc0e..6b5ba8bc2cf17 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/backedge-pruning.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/backedge-pruning.ll @@ -1,6 +1,6 @@ ; Test -sanitizer-coverage-trace-compares=1 and how it prunes backedge compares. -; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -sanitizer-coverage-prune-blocks=1 -S | FileCheck %s --check-prefix=PRUNE -; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -sanitizer-coverage-prune-blocks=0 -S | FileCheck %s --check-prefix=NOPRUNE +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -sanitizer-coverage-prune-blocks=1 -S -enable-new-pm=0 | FileCheck %s --check-prefix=PRUNE +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -sanitizer-coverage-prune-blocks=0 -S -enable-new-pm=0 | FileCheck %s --check-prefix=NOPRUNE ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -sanitizer-coverage-prune-blocks=1 -S | FileCheck %s --check-prefix=PRUNE ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -sanitizer-coverage-prune-blocks=0 -S | FileCheck %s --check-prefix=NOPRUNE diff --git a/llvm/test/Instrumentation/SanitizerCoverage/chains.ll b/llvm/test/Instrumentation/SanitizerCoverage/chains.ll index ba83c85c83611..ffb46264b226e 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/chains.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/chains.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc -sanitizer-coverage-prune-blocks=1 -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc -sanitizer-coverage-prune-blocks=1 -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc -sanitizer-coverage-prune-blocks=1 -S | FileCheck %s define i32 @blah(i32) #0 { diff --git a/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll b/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll index a0406f833a623..25e7a6ece8444 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll @@ -1,5 +1,5 @@ ; Test -sanitizer-coverage-trace-compares=1 API declarations on a non-x86_64 arch -; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S | FileCheck %s target triple = "i386-unknown-linux-gnu" diff --git 
a/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_64.ll b/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_64.ll index e2645d19a340d..cd18aeed28a18 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_64.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_64.ll @@ -1,5 +1,5 @@ ; Test -sanitizer-coverage-trace-compares=1 API declarations on x86_64 -; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S | FileCheck %s target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing.ll b/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing.ll index df329fbe85eca..79c682630e65a 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing.ll @@ -1,5 +1,5 @@ ; Test -sanitizer-coverage-trace-compares=1 -; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/llvm/test/Instrumentation/SanitizerCoverage/coff-comdat.ll b/llvm/test/Instrumentation/SanitizerCoverage/coff-comdat.ll index 240c6c50ecd8d..b104796a2ebba 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/coff-comdat.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/coff-comdat.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-inline-8bit-counters=1 -sanitizer-coverage-pc-table=1 -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-inline-8bit-counters=1 -sanitizer-coverage-pc-table=1 -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-inline-8bit-counters=1 -sanitizer-coverage-pc-table=1 -S | FileCheck %s ; Make sure we use the right comdat groups for COFF to avoid relocations diff --git a/llvm/test/Instrumentation/SanitizerCoverage/coff-pc-table-inline-8bit-counters.ll b/llvm/test/Instrumentation/SanitizerCoverage/coff-pc-table-inline-8bit-counters.ll index 3c279ebeaef34..0134c7ea92c14 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/coff-pc-table-inline-8bit-counters.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/coff-pc-table-inline-8bit-counters.ll @@ -1,5 +1,5 @@ ; Checks that the PC and 8-bit Counter Arrays are placed in their own sections in COFF binaries. 
-; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-inline-8bit-counters=1 -sanitizer-coverage-pc-table=1 -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-inline-8bit-counters=1 -sanitizer-coverage-pc-table=1 -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-inline-8bit-counters=1 -sanitizer-coverage-pc-table=1 -S | FileCheck %s target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-windows-msvc19.14.26433" diff --git a/llvm/test/Instrumentation/SanitizerCoverage/coff-pc-table-inline-bool-flag.ll b/llvm/test/Instrumentation/SanitizerCoverage/coff-pc-table-inline-bool-flag.ll index 9fcafa9a8b9a3..d91714ba8fcf6 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/coff-pc-table-inline-bool-flag.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/coff-pc-table-inline-bool-flag.ll @@ -1,5 +1,5 @@ ; Checks that the PC and 8-bit Counter Arrays are placed in their own sections in COFF binaries. -; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-inline-bool-flag=1 -sanitizer-coverage-pc-table=1 -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-inline-bool-flag=1 -sanitizer-coverage-pc-table=1 -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-inline-bool-flag=1 -sanitizer-coverage-pc-table=1 -S | FileCheck %s target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-windows-msvc19.14.26433" diff --git a/llvm/test/Instrumentation/SanitizerCoverage/coff-used-ctor.ll b/llvm/test/Instrumentation/SanitizerCoverage/coff-used-ctor.ll index d89375af982f2..2484756458860 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/coff-used-ctor.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/coff-used-ctor.ll @@ -1,5 +1,5 @@ ; Checks that sancov.module_ctor is marked used. 
-; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-inline-8bit-counters=1 -sanitizer-coverage-pc-table=1 -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-inline-8bit-counters=1 -sanitizer-coverage-pc-table=1 -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-inline-8bit-counters=1 -sanitizer-coverage-pc-table=1 -S | FileCheck %s target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-windows-msvc19.14.26433" diff --git a/llvm/test/Instrumentation/SanitizerCoverage/const-cmp-tracing.ll b/llvm/test/Instrumentation/SanitizerCoverage/const-cmp-tracing.ll index c5fbea3d0a9b7..36919275488b3 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/const-cmp-tracing.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/const-cmp-tracing.ll @@ -1,5 +1,5 @@ ; Test -sanitizer-coverage-trace-compares=1 -; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/llvm/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll b/llvm/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll index f870b6eaba36a..7c7932d430d33 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll @@ -1,6 +1,6 @@ ; Test that coverage instrumentation does not lose debug location. 
-; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -S | FileCheck %s ; C++ source: diff --git a/llvm/test/Instrumentation/SanitizerCoverage/coverage.ll b/llvm/test/Instrumentation/SanitizerCoverage/coverage.ll index bb0bed06a47ef..3b3d8d9f04c48 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/coverage.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/coverage.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc -S | FileCheck %s --check-prefix=CHECK_TRACE_PC -; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-prune-blocks=1 -S | FileCheck %s --check-prefix=CHECKPRUNE +; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc -S -enable-new-pm=0 | FileCheck %s --check-prefix=CHECK_TRACE_PC +; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-prune-blocks=1 -S -enable-new-pm=0 | FileCheck %s --check-prefix=CHECKPRUNE ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc -S | FileCheck %s --check-prefix=CHECK_TRACE_PC ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=3 -sanitizer-coverage-prune-blocks=1 -S | FileCheck %s --check-prefix=CHECKPRUNE diff --git a/llvm/test/Instrumentation/SanitizerCoverage/coverage2-dbg.ll b/llvm/test/Instrumentation/SanitizerCoverage/coverage2-dbg.ll index 11ec286ce1ee0..59db418a6f81a 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/coverage2-dbg.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/coverage2-dbg.ll @@ -1,6 +1,6 @@ ; Test that coverage instrumentation does not lose debug location. 
-; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=2 -S | FileCheck %s ; C++ source: diff --git a/llvm/test/Instrumentation/SanitizerCoverage/div-tracing.ll b/llvm/test/Instrumentation/SanitizerCoverage/div-tracing.ll index 35162bc279450..12394159510a9 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/div-tracing.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/div-tracing.ll @@ -1,5 +1,5 @@ ; Test -sanitizer-coverage-trace-divs=1 -; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-divs=1 -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-divs=1 -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-trace-divs=1 -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/llvm/test/Instrumentation/SanitizerCoverage/gep-tracing.ll b/llvm/test/Instrumentation/SanitizerCoverage/gep-tracing.ll index 04ff415c235a1..ad8544ad8a6f2 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/gep-tracing.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/gep-tracing.ll @@ -1,5 +1,5 @@ ; Test -sanitizer-coverage-trace-geps=1 -; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-geps=1 -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-geps=1 -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-trace-geps=1 -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/SanitizerCoverage/inline-8bit-counters.ll b/llvm/test/Instrumentation/SanitizerCoverage/inline-8bit-counters.ll index b3269b8799884..775ce4fd772d3 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/inline-8bit-counters.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/inline-8bit-counters.ll @@ -1,5 +1,5 @@ ; Test -sanitizer-coverage-inline-8bit-counters=1 -; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-inline-8bit-counters=1 -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-inline-8bit-counters=1 -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-inline-8bit-counters=1 -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll b/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll index 65bee698e23dd..8b05aac813c6a 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; Test -sanitizer-coverage-inline-bool-flag=1 -; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-inline-bool-flag=1 -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-inline-bool-flag=1 -S -enable-new-pm=0 | 
FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-inline-bool-flag=1 -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/llvm/test/Instrumentation/SanitizerCoverage/interposable-symbol-nocomdat.ll b/llvm/test/Instrumentation/SanitizerCoverage/interposable-symbol-nocomdat.ll index fcdea5535d4fb..d6a1593c7b3be 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/interposable-symbol-nocomdat.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/interposable-symbol-nocomdat.ll @@ -1,6 +1,6 @@ ; Test that interposable symbols do not get put in comdats. -; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -mtriple x86_64-linux-gnu -S | FileCheck %s -; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -mtriple x86_64-windows-msvc -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -mtriple x86_64-linux-gnu -S -enable-new-pm=0 | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -mtriple x86_64-windows-msvc -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -mtriple x86_64-linux-gnu -S | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -mtriple x86_64-windows-msvc -S | FileCheck %s diff --git a/llvm/test/Instrumentation/SanitizerCoverage/no-func.ll b/llvm/test/Instrumentation/SanitizerCoverage/no-func.ll index 6dfecb87d62c8..219b82c9d364b 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/no-func.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/no-func.ll @@ -1,6 +1,6 @@ ; Tests that we don't insert __sanitizer_cov_trace_pc_guard_init or some such ; when there is no instrumentation. 
-; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/SanitizerCoverage/pc-table.ll b/llvm/test/Instrumentation/SanitizerCoverage/pc-table.ll index 509abab591873..eeeb56bfb2abd 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/pc-table.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/pc-table.ll @@ -1,7 +1,7 @@ ; Test -sanitizer-coverage-pc-table=1 -; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -sanitizer-coverage-pc-table=1 -S | FileCheck %s -; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-inline-8bit-counters -sanitizer-coverage-pc-table=1 -S | FileCheck %s -; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-inline-bool-flag -sanitizer-coverage-pc-table=1 -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -sanitizer-coverage-pc-table=1 -S -enable-new-pm=0 | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-inline-8bit-counters -sanitizer-coverage-pc-table=1 -S -enable-new-pm=0 | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-inline-bool-flag -sanitizer-coverage-pc-table=1 -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -sanitizer-coverage-pc-table=1 -S | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=3 -sanitizer-coverage-inline-8bit-counters -sanitizer-coverage-pc-table=1 -S | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=3 -sanitizer-coverage-inline-bool-flag -sanitizer-coverage-pc-table=1 -S | FileCheck %s diff --git a/llvm/test/Instrumentation/SanitizerCoverage/postdominator_check.ll b/llvm/test/Instrumentation/SanitizerCoverage/postdominator_check.ll index 50b411de9395a..32479d5f1037d 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/postdominator_check.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/postdominator_check.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc -sanitizer-coverage-prune-blocks=1 -S | FileCheck %s -; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc -sanitizer-coverage-prune-blocks=0 -S | FileCheck %s --check-prefix=CHECK_NO_PRUNE +; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc -sanitizer-coverage-prune-blocks=1 -S -enable-new-pm=0 | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc -sanitizer-coverage-prune-blocks=0 -S -enable-new-pm=0 | FileCheck %s --check-prefix=CHECK_NO_PRUNE ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc -sanitizer-coverage-prune-blocks=1 -S | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc -sanitizer-coverage-prune-blocks=0 -S | FileCheck %s --check-prefix=CHECK_NO_PRUNE diff --git 
a/llvm/test/Instrumentation/SanitizerCoverage/seh.ll b/llvm/test/Instrumentation/SanitizerCoverage/seh.ll index f88ccbe2d5312..da91f394d2e6c 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/seh.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/seh.ll @@ -1,6 +1,6 @@ -; RUN: opt < %s -sancov -sanitizer-coverage-level=0 -S | FileCheck %s -; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -S | FileCheck %s -; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=0 -S -enable-new-pm=0 | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -S -enable-new-pm=0 | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=0 -S | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -S | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=2 -S | FileCheck %s diff --git a/llvm/test/Instrumentation/SanitizerCoverage/stack-depth-variable-declared-by-user.ll b/llvm/test/Instrumentation/SanitizerCoverage/stack-depth-variable-declared-by-user.ll index b93d6e91a2eb5..f44c3e0c458f5 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/stack-depth-variable-declared-by-user.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/stack-depth-variable-declared-by-user.ll @@ -1,7 +1,7 @@ ; Ensure that we terminate with a useful error message (instead of crash) if the ; user declares `__sancov_lowest_stack` with an unexpected type. ; RUN: not opt < %s -sancov -sanitizer-coverage-level=1 \ -; RUN: -sanitizer-coverage-stack-depth -S 2>&1 | FileCheck %s +; RUN: -sanitizer-coverage-stack-depth -S 2>&1 -enable-new-pm=0 | FileCheck %s ; RUN: not opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 \ ; RUN: -sanitizer-coverage-stack-depth -S 2>&1 | FileCheck %s diff --git a/llvm/test/Instrumentation/SanitizerCoverage/stack-depth.ll b/llvm/test/Instrumentation/SanitizerCoverage/stack-depth.ll index bba3e3b7d2805..cc696779795e7 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/stack-depth.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/stack-depth.ll @@ -1,9 +1,9 @@ ; This check verifies that stack depth instrumentation works correctly. 
; RUN: opt < %s -sancov -sanitizer-coverage-level=1 \ -; RUN: -sanitizer-coverage-stack-depth -S | FileCheck %s +; RUN: -sanitizer-coverage-stack-depth -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -sancov -sanitizer-coverage-level=3 \ ; RUN: -sanitizer-coverage-stack-depth -sanitizer-coverage-trace-pc-guard \ -; RUN: -S | FileCheck %s +; RUN: -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 \ ; RUN: -sanitizer-coverage-stack-depth -S | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=3 \ diff --git a/llvm/test/Instrumentation/SanitizerCoverage/switch-tracing.ll b/llvm/test/Instrumentation/SanitizerCoverage/switch-tracing.ll index 2d4eb4ca1514b..3b5e762956205 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/switch-tracing.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/switch-tracing.ll @@ -1,5 +1,5 @@ ; Test -sanitizer-coverage-trace-compares=1 (instrumenting a switch) -; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard-comdat.ll b/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard-comdat.ll index a278dbcfdcab4..0765d737f143f 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard-comdat.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard-comdat.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc-guard -S | FileCheck %s --check-prefix=CHECK_TRACE_PC_GUARD +; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc-guard -S -enable-new-pm=0 | FileCheck %s --check-prefix=CHECK_TRACE_PC_GUARD ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc-guard -S | FileCheck %s --check-prefix=CHECK_TRACE_PC_GUARD target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard-inline-8bit-counters.ll b/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard-inline-8bit-counters.ll index b3b2cc18916fd..2cce6e76f4f38 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard-inline-8bit-counters.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard-inline-8bit-counters.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-pc-guard -sanitizer-coverage-inline-8bit-counters -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-pc-guard -sanitizer-coverage-inline-8bit-counters -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-trace-pc-guard -sanitizer-coverage-inline-8bit-counters -S | FileCheck %s ; Module ctors should have stable names across modules, not something like diff --git 
a/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard-inline-bool-flag.ll b/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard-inline-bool-flag.ll index 76f9dc9c1ccbf..14198c77d90b2 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard-inline-bool-flag.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard-inline-bool-flag.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-pc-guard -sanitizer-coverage-inline-bool-flag -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-pc-guard -sanitizer-coverage-inline-bool-flag -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-trace-pc-guard -sanitizer-coverage-inline-bool-flag -S | FileCheck %s ; Module ctors should have stable names across modules, not something like diff --git a/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard-nocomdat.ll b/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard-nocomdat.ll index 1cfbfebdf2689..a1aeb7e5b41c2 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard-nocomdat.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard-nocomdat.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc-guard -S | FileCheck %s --check-prefix=CHECK_TRACE_PC_GUARD +; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc-guard -S -enable-new-pm=0 | FileCheck %s --check-prefix=CHECK_TRACE_PC_GUARD ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc-guard -S | FileCheck %s --check-prefix=CHECK_TRACE_PC_GUARD target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/llvm/test/Instrumentation/SanitizerCoverage/tracing-comdat.ll b/llvm/test/Instrumentation/SanitizerCoverage/tracing-comdat.ll index 709108434975d..2f7d468406412 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/tracing-comdat.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/tracing-comdat.ll @@ -1,7 +1,7 @@ ; Test that the coverage guards have proper comdat -; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -S | FileCheck %s +; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -S -enable-new-pm=0 | FileCheck %s ; Make sure asan does not instrument __sancov_gen_ -; RUN: opt < %s -sancov -asan -asan-module -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -S | FileCheck %s +; RUN: opt < %s -sancov -asan -asan-module -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -S | FileCheck %s ; RUN: opt < %s -passes='module(require,sancov-module,asan-module),function(asan)' -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -S | FileCheck %s diff --git a/llvm/test/Instrumentation/SanitizerCoverage/tracing.ll b/llvm/test/Instrumentation/SanitizerCoverage/tracing.ll index ddc0e354ec59d..75a30d6b2b2b2 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/tracing.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/tracing.ll @@ -1,7 +1,7 @@ ; Test -sanitizer-coverage-experimental-tracing -; RUN: opt < %s -sancov 
-sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc -S | FileCheck %s --check-prefix=CHECK_PC -; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -S | FileCheck %s --check-prefix=CHECK_PC_GUARD -; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -S -mtriple=x86_64-apple-macosx | FileCheck %s --check-prefix=CHECK_PC_GUARD_DARWIN +; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc -S -enable-new-pm=0 | FileCheck %s --check-prefix=CHECK_PC +; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -S -enable-new-pm=0 | FileCheck %s --check-prefix=CHECK_PC_GUARD +; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -S -mtriple=x86_64-apple-macosx -enable-new-pm=0 | FileCheck %s --check-prefix=CHECK_PC_GUARD_DARWIN ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc -S | FileCheck %s --check-prefix=CHECK_PC ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard -S | FileCheck %s --check-prefix=CHECK_PC_GUARD diff --git a/llvm/test/Instrumentation/SanitizerCoverage/unreachable-critedge.ll b/llvm/test/Instrumentation/SanitizerCoverage/unreachable-critedge.ll index 99e3383db9a7f..bc9083fcf6cac 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/unreachable-critedge.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/unreachable-critedge.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -S -sancov -sanitizer-coverage-level=3 | FileCheck %s +; RUN: opt < %s -S -sancov -sanitizer-coverage-level=3 -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -S -passes='module(sancov-module)' -sanitizer-coverage-level=3 | FileCheck %s ; The critical edges to unreachable_bb should not be split. 
diff --git a/llvm/test/Instrumentation/SanitizerCoverage/wineh.ll b/llvm/test/Instrumentation/SanitizerCoverage/wineh.ll index fbe9d89f085ee..63120cc29d65f 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/wineh.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/wineh.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc -S | FileCheck %s --check-prefix=CHECK +; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc -S -enable-new-pm=0 | FileCheck %s --check-prefix=CHECK ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc -S | FileCheck %s --check-prefix=CHECK ; Generated from this C++ source: From 46ebb619bf0fb98d94175dca2a06ead27318002f Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 28 Jul 2020 09:12:31 -0700 Subject: [PATCH 0319/1035] [FIX] Resolve test failure in polly/test/ScopInfo/memcpy-raw-source.ll scoped-noalias -> scoped-noalias-aa reference: https://reviews.llvm.org/D84542 Reviewed By: aeubanks Differential Revision: https://reviews.llvm.org/D84720 --- polly/test/ScopInfo/memcpy-raw-source.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polly/test/ScopInfo/memcpy-raw-source.ll b/polly/test/ScopInfo/memcpy-raw-source.ll index 69a12f5cb8346..f558bfe997276 100644 --- a/polly/test/ScopInfo/memcpy-raw-source.ll +++ b/polly/test/ScopInfo/memcpy-raw-source.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -basic-aa -scoped-noalias -tbaa -polly-scops -analyze < %s +; RUN: opt %loadPolly -basic-aa -scoped-noalias-aa -tbaa -polly-scops -analyze < %s ; ; Ensure that ScopInfo's alias analysis llvm.memcpy for, ; like the AliasSetTracker, preserves bitcasts. From 8b2fcc42b895575d46dbd9252df566938cf68a69 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 23 Jul 2020 14:17:37 -0700 Subject: [PATCH 0320/1035] [CompilerRT] Don't pass global compile test flags in non-standalone build In a build with -DLLVM_ENABLE_LTO=Thin: $ ninja TSanitizer-x86_64-Test-Nolibc [1/1] Generating Sanitizer-x86_64-Test-Nolibc FAILED: projects/compiler-rt/lib/sanitizer_common/tests/Sanitizer-x86_64-Test-Nolibc sanitizer_nolibc_test_main.x86_64.o: file not recognized: file format not recognized because -flto=thin is getting passed to the clang_compile step. For non-standalone builds, global compilation flags shouldn't be passed to compiler-rt tests, only the flags the test specifies. Reviewed By: vitalybuka Differential Revision: https://reviews.llvm.org/D84466 --- .../cmake/Modules/CompilerRTCompile.cmake | 45 ++++++++++--------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/compiler-rt/cmake/Modules/CompilerRTCompile.cmake b/compiler-rt/cmake/Modules/CompilerRTCompile.cmake index 07b589beb2d10..3330038f80688 100644 --- a/compiler-rt/cmake/Modules/CompilerRTCompile.cmake +++ b/compiler-rt/cmake/Modules/CompilerRTCompile.cmake @@ -70,29 +70,34 @@ function(clang_compile object_file source) if (TARGET CompilerRTUnitTestCheckCxx) list(APPEND SOURCE_DEPS CompilerRTUnitTestCheckCxx) endif() - string(REGEX MATCH "[.](cc|cpp)$" is_cxx ${source_rpath}) - string(REGEX MATCH "[.](m|mm)$" is_objc ${source_rpath}) - if(is_cxx) - string(REPLACE " " ";" global_flags "${CMAKE_CXX_FLAGS}") - else() - string(REPLACE " " ";" global_flags "${CMAKE_C_FLAGS}") - endif() + if(COMPILER_RT_STANDALONE_BUILD) + # Only add global flags in standalone build. 
+ string(REGEX MATCH "[.](cc|cpp)$" is_cxx ${source_rpath}) + string(REGEX MATCH "[.](m|mm)$" is_objc ${source_rpath}) + if(is_cxx) + string(REPLACE " " ";" global_flags "${CMAKE_CXX_FLAGS}") + else() + string(REPLACE " " ";" global_flags "${CMAKE_C_FLAGS}") + endif() - if (MSVC) - translate_msvc_cflags(global_flags "${global_flags}") - endif() + if (MSVC) + translate_msvc_cflags(global_flags "${global_flags}") + endif() - if (APPLE) - set(global_flags ${OSX_SYSROOT_FLAG} ${global_flags}) - endif() - if (is_objc) - list(APPEND global_flags -ObjC) - endif() + if (APPLE) + set(global_flags ${OSX_SYSROOT_FLAG} ${global_flags}) + endif() + if (is_objc) + list(APPEND global_flags -ObjC) + endif() - # Ignore unknown warnings. CMAKE_CXX_FLAGS may contain GCC-specific options - # which are not supported by Clang. - list(APPEND global_flags -Wno-unknown-warning-option) - set(compile_flags ${global_flags} ${SOURCE_CFLAGS}) + # Ignore unknown warnings. CMAKE_CXX_FLAGS may contain GCC-specific options + # which are not supported by Clang. + list(APPEND global_flags -Wno-unknown-warning-option) + set(compile_flags ${global_flags} ${SOURCE_CFLAGS}) + else() + set(compile_flags ${SOURCE_CFLAGS}) + endif() add_custom_command( OUTPUT ${object_file} COMMAND ${COMPILER_RT_TEST_COMPILER} ${compile_flags} -c From 3fb0de820796cc6e322c8378713d375d9870a353 Mon Sep 17 00:00:00 2001 From: Julian Lettner Date: Fri, 24 Jul 2020 11:24:34 -0700 Subject: [PATCH 0321/1035] [Darwin] Fix OS version checks inside simulators compiler-rt checks OS versions by querying the Darwin kernel version. This is not necessarily correct inside the simulators if the simulator runtime is not aligned with the host macOS. Let's instead check the `SIMULATOR_RUNTIME_VERSION` env var. rdar://63031937 Reviewed By: delcypher Differential Revision: https://reviews.llvm.org/D83977 --- .../lib/sanitizer_common/sanitizer_mac.cpp | 89 +++++++++++++------ .../tests/sanitizer_mac_test.cpp | 42 +++++++-- 2 files changed, 95 insertions(+), 36 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp index db8a09e6f0de3..522a909e95285 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp @@ -606,21 +606,65 @@ HandleSignalMode GetHandleSignalMode(int signum) { return result; } -// This corresponds to Triple::getMacOSXVersion() in the Clang driver. 
-static MacosVersion GetMacosAlignedVersionInternal() { - u16 kernel_major = GetDarwinKernelVersion().major; - // Darwin 0-3 -> unsupported - // Darwin 4-19 -> macOS 10.x - // Darwin 20+ -> macOS 11+ - CHECK_GE(kernel_major, 4); - u16 major, minor; - if (kernel_major < 20) { - major = 10; - minor = kernel_major - 4; +using VersStr = char[64]; + +static void GetOSVersion(VersStr vers) { + uptr len = sizeof(VersStr); + if (SANITIZER_IOSSIM) { + const char *vers_env = GetEnv("SIMULATOR_RUNTIME_VERSION"); + if (!vers_env) { + Report("ERROR: Running in simulator but SIMULATOR_RUNTIME_VERSION env " + "var is not set.\n"); + Die(); + } + len = internal_strlcpy(vers, vers_env, len); } else { - major = 11 + kernel_major - 20; - minor = 0; + int res = + internal_sysctlbyname("kern.osproductversion", vers, &len, nullptr, 0); + CHECK_EQ(res, 0); + } + CHECK_LT(len, sizeof(VersStr)); +} + +void ParseVersion(const char *vers, u16 *major, u16 *minor) { + // Format: <major>.<minor>[.<patch>]\0 + CHECK_GE(internal_strlen(vers), 3); + const char *p = vers; + *major = internal_simple_strtoll(p, &p, /*base=*/10); + CHECK_EQ(*p, '.'); + p += 1; + *minor = internal_simple_strtoll(p, &p, /*base=*/10); +} + +// Aligned versions example: +// macOS 10.15 -- iOS 13 -- tvOS 13 -- watchOS 6 +static void MapToMacos(u16 *major, u16 *minor) { + if (TARGET_OS_OSX) + return; + + if (SANITIZER_IOS || SANITIZER_TVOS) + *major += 2; + else if (SANITIZER_WATCHOS) + *major += 9; + else + UNREACHABLE("unsupported platform"); + + if (*major >= 16) { // macOS 11+ + *major -= 5; + } else { // macOS 10.15 and below + *minor = *major; + *major = 10; } +} + +static MacosVersion GetMacosAlignedVersionInternal() { + VersStr vers; + GetOSVersion(vers); + + u16 major, minor; + ParseVersion(vers, &major, &minor); + MapToMacos(&major, &minor); + return MacosVersion(major, minor); } @@ -639,24 +683,15 @@ MacosVersion GetMacosAlignedVersion() { return *reinterpret_cast<MacosVersion *>(&result); } -void ParseVersion(const char *vers, u16 *major, u16 *minor) { - // Format: <major>.<minor>.<patch>\0 - CHECK_GE(internal_strlen(vers), 5); - const char *p = vers; - *major = internal_simple_strtoll(p, &p, /*base=*/10); - CHECK_EQ(*p, '.'); - p += 1; - *minor = internal_simple_strtoll(p, &p, /*base=*/10); -} - DarwinKernelVersion GetDarwinKernelVersion() { - char buf[100]; - size_t len = sizeof(buf); - int res = internal_sysctlbyname("kern.osrelease", buf, &len, nullptr, 0); + VersStr vers; + uptr len = sizeof(VersStr); + int res = internal_sysctlbyname("kern.osrelease", vers, &len, nullptr, 0); CHECK_EQ(res, 0); + CHECK_LT(len, sizeof(VersStr)); u16 major, minor; - ParseVersion(buf, &major, &minor); + ParseVersion(vers, &major, &minor); return DarwinKernelVersion(major, minor); } diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_mac_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_mac_test.cpp index c8658ea55d034..090947eceb4a7 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_mac_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_mac_test.cpp @@ -22,6 +22,38 @@ namespace __sanitizer { +void ParseVersion(const char *vers, u16 *major, u16 *minor); + +TEST(SanitizerMac, ParseVersion) { + u16 major, minor; + + ParseVersion("11.22.33", &major, &minor); + EXPECT_EQ(major, 11); + EXPECT_EQ(minor, 22); + + ParseVersion("1.2", &major, &minor); + EXPECT_EQ(major, 1); + EXPECT_EQ(minor, 2); +} + +// TODO(yln): Run sanitizer unit tests for the simulators (rdar://65680742) +#if SANITIZER_IOSSIM +TEST(SanitizerMac, GetMacosAlignedVersion) { + const char *vers_str; + if
(SANITIZER_IOS || SANITIZER_TVOS) { + vers_str = "13.0"; + } else if (SANITIZER_WATCHOS) { + vers_str = "6.5"; + } else { + FAIL() << "unsupported simulator runtime"; + } + setenv("SIMULATOR_RUNTIME_VERSION", vers_str, /*overwrite=*/1); + + MacosVersion vers = GetMacosAlignedVersion(); + EXPECT_EQ(vers.major, 10); + EXPECT_EQ(vers.minor, 15); +} +#else TEST(SanitizerMac, GetMacosAlignedVersion) { MacosVersion vers = GetMacosAlignedVersion(); u16 kernel_major = GetDarwinKernelVersion().major; @@ -31,15 +63,7 @@ TEST(SanitizerMac, GetMacosAlignedVersion) { EXPECT_EQ(vers.major, expected_major); EXPECT_EQ(vers.minor, expected_minor); } - -void ParseVersion(const char *vers, u16 *major, u16 *minor); - -TEST(SanitizerMac, ParseVersion) { - u16 major, minor; - ParseVersion("11.22.33", &major, &minor); - EXPECT_EQ(major, 11); - EXPECT_EQ(minor, 22); -} +#endif TEST(SanitizerMac, GetDarwinKernelVersion) { DarwinKernelVersion vers = GetDarwinKernelVersion(); From 0c64233bb7ae760d36a0af3a22324810787fa4ca Mon Sep 17 00:00:00 2001 From: Anna Welker Date: Tue, 28 Jul 2020 17:28:19 +0100 Subject: [PATCH 0322/1035] [ARM][MVE] Teach MVEGatherScatterLowering to merge successive getelementpointers A patch following up on the introduction of pointer induction variables, adding a preprocessing step to the address optimisation in the MVEGatherScatterLowering pass. If the getelementpointer that is the address is itself using a getelementpointer as base, they will be merged into one by summing up the offsets, after checking that this will not cause an overflow (this can be repeated recursively). Differential Revision: https://reviews.llvm.org/D84027 --- .../Target/ARM/MVEGatherScatterLowering.cpp | 243 +++- .../CodeGen/Thumb2/mve-gather-ind16-scaled.ll | 424 +++++++ .../CodeGen/Thumb2/mve-gather-ind32-scaled.ll | 39 + .../Thumb2/mve-gather-ind8-unscaled.ll | 806 +++++++++++++ .../Thumb2/mve-gather-scatter-ptr-address.ll | 1012 +++++++++++++++++ .../Thumb2/mve-scatter-ind16-scaled.ll | 70 ++ .../Thumb2/mve-scatter-ind32-scaled.ll | 52 + .../Thumb2/mve-scatter-ind8-unscaled.ll | 109 ++ 8 files changed, 2718 insertions(+), 37 deletions(-) create mode 100644 llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp index e0ba7f88db951..2746d4d456e4f 100644 --- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp +++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp @@ -84,7 +84,7 @@ class MVEGatherScatterLowering : public FunctionPass { // Check for a getelementptr and deduce base and offsets from it, on success // returning the base directly and the offsets indirectly using the Offsets // argument - Value *checkGEP(Value *&Offsets, Type *Ty, GetElementPtrInst *GEP, + Value *checkGEP(Value *&Offsets, FixedVectorType *Ty, GetElementPtrInst *GEP, IRBuilder<> &Builder); // Compute the scale of this gather/scatter instruction int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize); @@ -132,6 +132,11 @@ class MVEGatherScatterLowering : public FunctionPass { Value *tryCreateIncrementingWBGatScat(IntrinsicInst *I, Value *BasePtr, Value *Ptr, unsigned TypeScale, IRBuilder<> &Builder); + + // Optimise the base and offsets of the given address + bool optimiseAddress(Value *Address, BasicBlock *BB, LoopInfo *LI); + // Try to fold consecutive geps together into one + Value *foldGEP(GetElementPtrInst *GEP, Value *&Offsets, IRBuilder<> &Builder); // Check whether these offsets could be moved out of the loop 
they're in bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI); // Pushes the given add out of the loop @@ -167,7 +172,49 @@ bool MVEGatherScatterLowering::isLegalTypeAndAlignment(unsigned NumElements, return false; } -Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty, +bool checkOffsetSize(Value *Offsets, unsigned TargetElemCount) { + // Offsets that are not of type <v x i32> are sign extended by the + // getelementptr instruction, and MVE gathers/scatters treat the offset as + // unsigned. Thus, if the element size is smaller than 32, we can only allow + // positive offsets - i.e., the offsets are not allowed to be variables we + // can't look into. + // Additionally, <v x i32> offsets have to either originate from a zext of a + // vector with element types smaller or equal the type of the gather we're + // looking at, or consist of constants that we can check are small enough + // to fit into the gather type. + // Thus we check that 0 < value < 2^TargetElemSize. + unsigned TargetElemSize = 128 / TargetElemCount; + unsigned OffsetElemSize = cast<FixedVectorType>(Offsets->getType()) + ->getElementType() + ->getScalarSizeInBits(); + if (OffsetElemSize != TargetElemSize || OffsetElemSize != 32) { + Constant *ConstOff = dyn_cast<Constant>(Offsets); + if (!ConstOff) + return false; + int64_t TargetElemMaxSize = (1ULL << TargetElemSize); + auto CheckValueSize = [TargetElemMaxSize](Value *OffsetElem) { + ConstantInt *OConst = dyn_cast<ConstantInt>(OffsetElem); + if (!OConst) + return false; + int SExtValue = OConst->getSExtValue(); + if (SExtValue >= TargetElemMaxSize || SExtValue < 0) + return false; + return true; + }; + if (isa<FixedVectorType>(ConstOff->getType())) { + for (unsigned i = 0; i < TargetElemCount; i++) { + if (!CheckValueSize(ConstOff->getAggregateElement(i))) + return false; + } + } else { + if (!CheckValueSize(ConstOff)) + return false; + } + } + return true; +} + +Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, FixedVectorType *Ty, GetElementPtrInst *GEP, IRBuilder<> &Builder) { if (!GEP) { @@ -178,40 +225,43 @@ Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty, LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementpointer found." << " Looking at intrinsic for base + vector of offsets\n"); Value *GEPPtr = GEP->getPointerOperand(); - if (GEPPtr->getType()->isVectorTy()) { + Offsets = GEP->getOperand(1); + if (GEPPtr->getType()->isVectorTy() || + !isa<FixedVectorType>(Offsets->getType())) return nullptr; - } + if (GEP->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementptr with too many" << " operands. Expanding.\n"); return nullptr; } Offsets = GEP->getOperand(1); + unsigned OffsetsElemCount = + cast<FixedVectorType>(Offsets->getType())->getNumElements(); // Paranoid check whether the number of parallel lanes is the same - assert(cast<FixedVectorType>(Ty)->getNumElements() == - cast<FixedVectorType>(Offsets->getType())->getNumElements()); - // Only <N x i32> offsets can be integrated into an arm gather, any smaller - // type would have to be sign extended by the gep - and arm gathers can only - // zero extend.
Additionally, the offsets do have to originate from a zext of - // a vector with element types smaller or equal the type of the gather we're - // looking at - if (Offsets->getType()->getScalarSizeInBits() != 32) - return nullptr; - if (ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets)) + assert(Ty->getNumElements() == OffsetsElemCount); + + ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets); + if (ZextOffs) Offsets = ZextOffs->getOperand(0); - else if (!(cast<FixedVectorType>(Offsets->getType())->getNumElements() == 4 && - Offsets->getType()->getScalarSizeInBits() == 32)) - return nullptr; + FixedVectorType *OffsetType = cast<FixedVectorType>(Offsets->getType()); + + // If the offsets are already being zext-ed to <i32>, that relieves us of + // having to make sure that they won't overflow. + if (!ZextOffs || cast<FixedVectorType>(ZextOffs->getDestTy()) + ->getElementType() + ->getScalarSizeInBits() != 32) + if (!checkOffsetSize(Offsets, OffsetsElemCount)) + return nullptr; + // The offset sizes have been checked; if any truncating or zext-ing is + // required to fix them, do that now if (Ty != Offsets->getType()) { - if ((Ty->getScalarSizeInBits() < - Offsets->getType()->getScalarSizeInBits())) { - LLVM_DEBUG(dbgs() << "masked gathers/scatters: no correct offset type." - << " Can't create intrinsic.\n"); - return nullptr; + if ((Ty->getElementType()->getScalarSizeInBits() < + OffsetType->getElementType()->getScalarSizeInBits())) { + Offsets = Builder.CreateTrunc(Offsets, Ty); } else { - Offsets = Builder.CreateZExt( - Offsets, VectorType::getInteger(cast<FixedVectorType>(Ty))); + Offsets = Builder.CreateZExt(Offsets, VectorType::getInteger(Ty)); } } // If none of the checks failed, return the gep's base pointer @@ -426,7 +476,8 @@ Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset( GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); Value *Offsets; - Value *BasePtr = checkGEP(Offsets, ResultTy, GEP, Builder); + Value *BasePtr = + checkGEP(Offsets, cast<FixedVectorType>(ResultTy), GEP, Builder); if (!BasePtr) return nullptr; // Check whether the offset is a constant increment that could be merged into @@ -566,7 +617,8 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset( GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); Value *Offsets; - Value *BasePtr = checkGEP(Offsets, InputTy, GEP, Builder); + Value *BasePtr = + checkGEP(Offsets, cast<FixedVectorType>(InputTy), GEP, Builder); if (!BasePtr) return nullptr; // Check whether the offset is a constant increment that could be merged into @@ -978,6 +1030,128 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB, return true; } +Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP, + IRBuilder<> &Builder) { + + // Splat the non-vector value to a vector of the given type - if the value is + // a constant (and its value isn't too big), we can even use this opportunity + // to scale it to the size of the vector elements + auto FixSummands = [&Builder](FixedVectorType *&VT, Value *&NonVectorVal) { + ConstantInt *Const; + if ((Const = dyn_cast<ConstantInt>(NonVectorVal)) && + VT->getElementType() != NonVectorVal->getType()) { + unsigned TargetElemSize = VT->getElementType()->getPrimitiveSizeInBits(); + uint64_t N = Const->getZExtValue(); + if (N < (unsigned)(1 << (TargetElemSize - 1))) { + NonVectorVal = Builder.CreateVectorSplat( + VT->getNumElements(), Builder.getIntN(TargetElemSize, N)); + return; + } + } + NonVectorVal = + Builder.CreateVectorSplat(VT->getNumElements(), NonVectorVal); + }; + + FixedVectorType *XElType = dyn_cast<FixedVectorType>(X->getType()); + FixedVectorType *YElType = dyn_cast<FixedVectorType>(Y->getType()); + // If one of X, Y is not a vector, we have to splat
it in order + // to add the two of them. + if (XElType && !YElType) { + FixSummands(XElType, Y); + YElType = cast<FixedVectorType>(Y->getType()); + } else if (YElType && !XElType) { + FixSummands(YElType, X); + XElType = cast<FixedVectorType>(X->getType()); + } + // Check that the summands are of compatible types + if (XElType != YElType) { + LLVM_DEBUG(dbgs() << "masked gathers/scatters: incompatible gep offsets\n"); + return nullptr; + } + + if (XElType->getElementType()->getScalarSizeInBits() != 32) { + // Check that by adding the vectors we do not accidentally + // create an overflow + Constant *ConstX = dyn_cast<Constant>(X); + Constant *ConstY = dyn_cast<Constant>(Y); + if (!ConstX || !ConstY) + return nullptr; + unsigned TargetElemSize = 128 / XElType->getNumElements(); + for (unsigned i = 0; i < XElType->getNumElements(); i++) { + ConstantInt *ConstXEl = + dyn_cast<ConstantInt>(ConstX->getAggregateElement(i)); + ConstantInt *ConstYEl = + dyn_cast<ConstantInt>(ConstY->getAggregateElement(i)); + if (!ConstXEl || !ConstYEl || + ConstXEl->getZExtValue() + ConstYEl->getZExtValue() >= + (unsigned)(1 << (TargetElemSize - 1))) + return nullptr; + } + } + + Value *Add = Builder.CreateAdd(X, Y); + + FixedVectorType *GEPType = cast<FixedVectorType>(GEP->getType()); + if (checkOffsetSize(Add, GEPType->getNumElements())) + return Add; + else + return nullptr; +} + +Value *MVEGatherScatterLowering::foldGEP(GetElementPtrInst *GEP, + Value *&Offsets, + IRBuilder<> &Builder) { + Value *GEPPtr = GEP->getPointerOperand(); + Offsets = GEP->getOperand(1); + // We only merge geps with constant offsets, because only for those + // we can make sure that we do not cause an overflow + if (!isa<Constant>(Offsets)) + return nullptr; + GetElementPtrInst *BaseGEP; + if ((BaseGEP = dyn_cast<GetElementPtrInst>(GEPPtr))) { + // Merge the two geps into one + Value *BaseBasePtr = foldGEP(BaseGEP, Offsets, Builder); + if (!BaseBasePtr) + return nullptr; + Offsets = + CheckAndCreateOffsetAdd(Offsets, GEP->getOperand(1), GEP, Builder); + if (Offsets == nullptr) + return nullptr; + return BaseBasePtr; + } + return GEPPtr; +} + +bool MVEGatherScatterLowering::optimiseAddress(Value *Address, BasicBlock *BB, + LoopInfo *LI) { + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Address); + if (!GEP) + return false; + bool Changed = false; + if (GEP->hasOneUse() && + dyn_cast<GetElementPtrInst>(GEP->getPointerOperand())) { + IRBuilder<> Builder(GEP->getContext()); + Builder.SetInsertPoint(GEP); + Builder.SetCurrentDebugLocation(GEP->getDebugLoc()); + Value *Offsets; + Value *Base = foldGEP(GEP, Offsets, Builder); + // We only want to merge the geps if there is a real chance that they can be + // used by an MVE gather; thus the offset has to have the correct size + // (always i32 if it is not of vector type) and the base has to be a + // pointer.
+ if (Offsets && Base && Base != GEP) { + PointerType *BaseType = cast<PointerType>(Base->getType()); + GetElementPtrInst *NewAddress = GetElementPtrInst::Create( + BaseType->getPointerElementType(), Base, Offsets, "gep.merged", GEP); + GEP->replaceAllUsesWith(NewAddress); + GEP = NewAddress; + Changed = true; + } + } + Changed |= optimiseOffsets(GEP->getOperand(1), GEP->getParent(), LI); + return Changed; +} + bool MVEGatherScatterLowering::runOnFunction(Function &F) { if (!EnableMaskedGatherScatters) return false; @@ -995,22 +1169,17 @@ bool MVEGatherScatterLowering::runOnFunction(Function &F) { for (BasicBlock &BB : F) { for (Instruction &I : BB) { IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); - if (II && II->getIntrinsicID() == Intrinsic::masked_gather) { + if (II && II->getIntrinsicID() == Intrinsic::masked_gather && + isa<FixedVectorType>(II->getType())) { Gathers.push_back(II); - if (isa<GetElementPtrInst>(II->getArgOperand(0))) - Changed |= optimiseOffsets( - cast<GetElementPtrInst>(II->getArgOperand(0))->getOperand(1), - II->getParent(), LI); + Changed |= optimiseAddress(II->getArgOperand(0), II->getParent(), LI); - } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter) { + } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter && + isa<FixedVectorType>(II->getArgOperand(0)->getType())) { Scatters.push_back(II); - if (isa<GetElementPtrInst>(II->getArgOperand(1))) - Changed |= optimiseOffsets( - cast<GetElementPtrInst>(II->getArgOperand(1))->getOperand(1), - II->getParent(), LI); + Changed |= optimiseAddress(II->getArgOperand(1), II->getParent(), LI); } } } - for (unsigned i = 0; i < Gathers.size(); i++) { IntrinsicInst *I = Gathers[i]; Value *L = lowerGather(I); diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll index 30ce13b850736..63990baf7fc84 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll @@ -266,6 +266,430 @@ entry: ret <8 x i16> %gather } +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_2gep(i16* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8i16_i16_2gep: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vldrh.s32 q0, [r1] +; CHECK-NEXT: vmov.i32 q1, #0x28 +; CHECK-NEXT: vldrh.s32 q2, [r1, #8] +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vshl.i32 q2, q2, #1 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vadd.i32 q1, q2, q1 +; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w r12, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrh.w lr, [r3] +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: vmov.16 q0[2], r12 +; CHECK-NEXT: vmov.16 q0[3], lr +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r7, pc} +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %offs + %ptrs2 = getelementptr inbounds i16, <8 x i16*> %ptrs, i16 20 + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef) + ret <8 x i16> %gather
+} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_2gep2(i16* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8i16_i16_2gep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI14_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI14_0: +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .short 26 @ 0x1a +; CHECK-NEXT: .short 29 @ 0x1d +; CHECK-NEXT: .short 32 @ 0x20 +; CHECK-NEXT: .short 35 @ 0x23 +; CHECK-NEXT: .short 38 @ 0x26 +; CHECK-NEXT: .short 41 @ 0x29 +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> <i16 0, i16 3, i16 6, i16 9, i16 12, i16 15, i16 18, i16 21> + %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i16 20 + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep(i16* %base) { +; CHECK-LABEL: scaled_v8i16_i16_biggep: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI15_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI15_0: +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .short 26 @ 0x1a +; CHECK-NEXT: .short 29 @ 0x1d +; CHECK-NEXT: .short 32 @ 0x20 +; CHECK-NEXT: .short 35 @ 0x23 +; CHECK-NEXT: .short 38 @ 0x26 +; CHECK-NEXT: .short 41 @ 0x29 +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> + %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 20 + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep2(i16* %base) { +; CHECK-LABEL: scaled_v8i16_i16_biggep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI16_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI16_0: +; CHECK-NEXT: .short 0 @ 0x0 +; CHECK-NEXT: .short 3 @ 0x3 +; CHECK-NEXT: .short 6 @ 0x6 +; CHECK-NEXT: .short 9 @ 0x9 +; CHECK-NEXT: .short 12 @ 0xc +; CHECK-NEXT: .short 15 @ 0xf +; CHECK-NEXT: .short 18 @ 0x12 +; CHECK-NEXT: .short 21 @ 0x15 +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep3(i16* %base) { +; CHECK-LABEL: scaled_v8i16_i16_biggep3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI17_0 +; CHECK-NEXT: adr.w r12, .LCPI17_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w lr, [r1] +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: ldrh r6, [r2] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q0[2], lr +; CHECK-NEXT: vmov.16 q0[3], r6
+; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI17_0: +; CHECK-NEXT: .long 131072 @ 0x20000 +; CHECK-NEXT: .long 131078 @ 0x20006 +; CHECK-NEXT: .long 131084 @ 0x2000c +; CHECK-NEXT: .long 131090 @ 0x20012 +; CHECK-NEXT: .LCPI17_1: +; CHECK-NEXT: .long 131096 @ 0x20018 +; CHECK-NEXT: .long 131102 @ 0x2001e +; CHECK-NEXT: .long 131108 @ 0x20024 +; CHECK-NEXT: .long 131114 @ 0x2002a +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> + %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 65536 + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep4(i16* %base) { +; CHECK-LABEL: scaled_v8i16_i16_biggep4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI18_0 +; CHECK-NEXT: adr.w r12, .LCPI18_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w lr, [r1] +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: ldrh r6, [r2] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q0[2], lr +; CHECK-NEXT: vmov.16 q0[3], r6 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI18_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 12 @ 0xc +; CHECK-NEXT: .long 18 @ 0x12 +; CHECK-NEXT: .LCPI18_1: +; CHECK-NEXT: .long 24 @ 0x18 +; CHECK-NEXT: .long 131072 @ 0x20000 +; CHECK-NEXT: .long 36 @ 0x24 +; CHECK-NEXT: .long 42 @ 0x2a +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 65536, i32 18, i32 21> + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep5(<8 x i16*> %base) { +; CHECK-LABEL: scaled_v8i16_i16_biggep5: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vmov.i32 q2, #0x20000 +; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w r12, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: ldrh.w lr, [r1] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q0[2], r12 +; CHECK-NEXT: vmov.16 q0[3], lr +; CHECK-NEXT: vmov.16 q0[4], r3 +;
CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.16 q0[6], r1 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r7, pc} +entry: + %ptrs2 = getelementptr inbounds i16,<8 x i16*> %base, i32 65536 + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep6(i16* %base) { +; CHECK-LABEL: scaled_v8i16_i16_biggep6: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI20_0 +; CHECK-NEXT: adr.w r12, .LCPI20_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w lr, [r1] +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: ldrh r6, [r2] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q0[2], lr +; CHECK-NEXT: vmov.16 q0[3], r6 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI20_0: +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 14 @ 0xe +; CHECK-NEXT: .long 20 @ 0x14 +; CHECK-NEXT: .LCPI20_1: +; CHECK-NEXT: .long 131074 @ 0x20002 +; CHECK-NEXT: .long 32 @ 0x20 +; CHECK-NEXT: .long 38 @ 0x26 +; CHECK-NEXT: .long 44 @ 0x2c +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> + %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 1 + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep7(i16* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8i16_i16_biggep7: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI21_0 +; CHECK-NEXT: adr.w r12, .LCPI21_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w lr, [r1] +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: ldrh r6, [r2] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q0[2], lr +; CHECK-NEXT: vmov.16 q0[3], r6 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI21_0: +; CHECK-NEXT: .long 128 @ 0x80 +; CHECK-NEXT: .long 1206 @ 0x4b6 +; 
CHECK-NEXT:    .long 1212 @ 0x4bc
+; CHECK-NEXT:    .long 1218 @ 0x4c2
+; CHECK-NEXT:  .LCPI21_1:
+; CHECK-NEXT:    .long 1224 @ 0x4c8
+; CHECK-NEXT:    .long 1230 @ 0x4ce
+; CHECK-NEXT:    .long 1236 @ 0x4d4
+; CHECK-NEXT:    .long 1242 @ 0x4da
+entry:
+  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> <i16 -536, i16 3, i16 6, i16 9, i16 12, i16 15, i16 18, i16 21>
+  %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i16 600
+  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
+  ret <8 x i16> %gather
+}
+
+define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_basei32(i32* %base, <8 x i16>* %offptr) {
+; CHECK-LABEL: scaled_v8i16_i16_basei32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vldrh.u32 q0, [r1]
+; CHECK-NEXT:    vldrh.u32 q1, [r1, #8]
+; CHECK-NEXT:    vshl.i32 q0, q0, #2
+; CHECK-NEXT:    vshl.i32 q1, q1, #2
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vmov r5, s1
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r4, s7
+; CHECK-NEXT:    ldrh.w r12, [r2]
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    ldrh.w lr, [r3]
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    ldrh r0, [r0]
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.16 q0[1], r5
+; CHECK-NEXT:    vmov.16 q0[2], r12
+; CHECK-NEXT:    vmov.16 q0[3], lr
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.16 q0[5], r1
+; CHECK-NEXT:    vmov.16 q0[6], r3
+; CHECK-NEXT:    vmov.16 q0[7], r4
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
+entry:
+  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
+  %offs.zext = zext <8 x i16> %offs to <8 x i32>
+  %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %offs.zext
+  %ptrs.cast = bitcast <8 x i32*> %ptrs to <8 x i16*>
+  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs.cast, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
+  ret <8 x i16> %gather
+}
+
 declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) #1
 declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>) #1
 declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>) #1
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll
index c674ffbf51bd3..4c32200ee4a53 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll
@@ -294,6 +294,45 @@ entry:
   ret <4 x i32> %gather.sext
 }
 
+define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep(i32* %base, <4 x i32>* %offptr) {
+; CHECK-LABEL: scaled_i32_i32_2gep:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vmov.i32 q0, #0x14
+; CHECK-NEXT:    vshl.i32 q1, q1, #2
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vadd.i32 q1, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [q1]
+; CHECK-NEXT:    bx lr
+entry:
+  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
+  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs
+  %ptrs2 = getelementptr inbounds i32, <4 x i32*> %ptrs, i32 5
+  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %gather
+}
+
+define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep2(i32* %base, <4 x i32>* %offptr) {
+; CHECK-LABEL: scaled_i32_i32_2gep2:
+;
CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI21_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI21_0: +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 11 @ 0xb +; CHECK-NEXT: .long 14 @ 0xe +entry: + %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> + %ptrs2 = getelementptr inbounds i32, <4 x i32*> %ptrs, i32 5 + %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs2, i32 4, <4 x i1> , <4 x i32> undef) + ret <4 x i32> %gather +} + declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>) declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>) declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll index 60bffc5a31dbd..c7d29af67b5b1 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll @@ -365,6 +365,812 @@ entry: ret <16 x i8> %gather } +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2gep(i8* %base, <16 x i8>* %offptr) { +; CHECK-LABEL: unscaled_v16i8_i8_2gep: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: vldrb.s32 q0, [r1, #12] +; CHECK-NEXT: vmov.i32 q2, #0x5 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrb r3, [r2] +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: ldrb.w lr, [r2] +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vldrb.s32 q0, [r1, #8] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q0, q2 +; CHECK-NEXT: vldrb.s32 q0, [r1] +; CHECK-NEXT: vmov r6, s4 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vadd.i32 q3, q0, q2 +; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: vmov r7, s15 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov.8 q0[0], r5 +; CHECK-NEXT: vmov r5, s13 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: vldrb.s32 q3, [r1, #4] +; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: vadd.i32 q2, q3, q2 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[2], r5 +; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: vmov.8 q0[3], r7 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r6 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r4 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], r3 +; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[14], r12 +; CHECK-NEXT: vmov.8 q0[15], lr +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs + %ptrs2 = getelementptr 
inbounds i8, <16 x i8*> %ptrs, i8 5 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2gep2(i8* %base, <16 x i8>* %offptr) { +; CHECK-LABEL: unscaled_v16i8_i8_2gep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI8_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrb.u8 q0, [r0, q1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI8_0: +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 14 @ 0xe +; CHECK-NEXT: .byte 17 @ 0x11 +; CHECK-NEXT: .byte 20 @ 0x14 +; CHECK-NEXT: .byte 23 @ 0x17 +; CHECK-NEXT: .byte 26 @ 0x1a +; CHECK-NEXT: .byte 29 @ 0x1d +; CHECK-NEXT: .byte 32 @ 0x20 +; CHECK-NEXT: .byte 35 @ 0x23 +; CHECK-NEXT: .byte 38 @ 0x26 +; CHECK-NEXT: .byte 41 @ 0x29 +; CHECK-NEXT: .byte 44 @ 0x2c +; CHECK-NEXT: .byte 47 @ 0x2f +; CHECK-NEXT: .byte 50 @ 0x32 +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep(i8* %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI9_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrb.u8 q0, [r0, q1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI9_0: +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 14 @ 0xe +; CHECK-NEXT: .byte 17 @ 0x11 +; CHECK-NEXT: .byte 20 @ 0x14 +; CHECK-NEXT: .byte 23 @ 0x17 +; CHECK-NEXT: .byte 26 @ 0x1a +; CHECK-NEXT: .byte 29 @ 0x1d +; CHECK-NEXT: .byte 32 @ 0x20 +; CHECK-NEXT: .byte 35 @ 0x23 +; CHECK-NEXT: .byte 38 @ 0x26 +; CHECK-NEXT: .byte 41 @ 0x29 +; CHECK-NEXT: .byte 44 @ 0x2c +; CHECK-NEXT: .byte 47 @ 0x2f +; CHECK-NEXT: .byte 50 @ 0x32 +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 5 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep2(i8* %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI10_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrb.u8 q0, [r0, q1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI10_0: +; CHECK-NEXT: .byte 0 @ 0x0 +; CHECK-NEXT: .byte 3 @ 0x3 +; CHECK-NEXT: .byte 6 @ 0x6 +; CHECK-NEXT: .byte 9 @ 0x9 +; CHECK-NEXT: .byte 12 @ 0xc +; CHECK-NEXT: .byte 15 @ 0xf +; CHECK-NEXT: .byte 18 @ 0x12 +; CHECK-NEXT: .byte 21 @ 0x15 +; CHECK-NEXT: .byte 24 @ 0x18 +; CHECK-NEXT: .byte 27 @ 0x1b +; CHECK-NEXT: .byte 30 @ 0x1e +; CHECK-NEXT: .byte 33 @ 0x21 +; CHECK-NEXT: .byte 36 @ 0x24 +; CHECK-NEXT: .byte 39 @ 0x27 +; CHECK-NEXT: .byte 42 @ 0x2a +; CHECK-NEXT: .byte 45 @ 0x2d +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep3(i8* %base) { +; CHECK-LABEL: 
unscaled_v16i8_i8_biggep3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI11_0 +; CHECK-NEXT: adr r2, .LCPI11_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r6, .LCPI11_2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrb.w r12, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ldrb r3, [r1] +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: adr r6, .LCPI11_3 +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r5 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r3 +; CHECK-NEXT: vmov.8 q0[14], r1 +; CHECK-NEXT: vmov.8 q0[15], r2 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI11_0: +; CHECK-NEXT: .long 280 @ 0x118 +; CHECK-NEXT: .long 283 @ 0x11b +; CHECK-NEXT: .long 286 @ 0x11e +; CHECK-NEXT: .long 289 @ 0x121 +; CHECK-NEXT: .LCPI11_1: +; CHECK-NEXT: .long 292 @ 0x124 +; CHECK-NEXT: .long 295 @ 0x127 +; CHECK-NEXT: .long 298 @ 0x12a +; CHECK-NEXT: .long 301 @ 0x12d +; CHECK-NEXT: .LCPI11_2: +; CHECK-NEXT: .long 256 @ 0x100 +; CHECK-NEXT: .long 259 @ 0x103 +; CHECK-NEXT: .long 262 @ 0x106 +; CHECK-NEXT: .long 265 @ 0x109 +; CHECK-NEXT: .LCPI11_3: +; CHECK-NEXT: .long 268 @ 0x10c +; CHECK-NEXT: .long 271 @ 0x10f +; CHECK-NEXT: .long 274 @ 0x112 +; CHECK-NEXT: .long 277 @ 0x115 +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 256 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep4(i8* %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI12_0 +; CHECK-NEXT: adr r2, .LCPI12_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r6, .LCPI12_2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrb.w r12, 
[r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ldrb r3, [r1] +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: adr r6, .LCPI12_3 +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r5 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r3 +; CHECK-NEXT: vmov.8 q0[14], r1 +; CHECK-NEXT: vmov.8 q0[15], r2 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI12_0: +; CHECK-NEXT: .long 256 @ 0x100 +; CHECK-NEXT: .long 27 @ 0x1b +; CHECK-NEXT: .long 30 @ 0x1e +; CHECK-NEXT: .long 33 @ 0x21 +; CHECK-NEXT: .LCPI12_1: +; CHECK-NEXT: .long 36 @ 0x24 +; CHECK-NEXT: .long 39 @ 0x27 +; CHECK-NEXT: .long 42 @ 0x2a +; CHECK-NEXT: .long 45 @ 0x2d +; CHECK-NEXT: .LCPI12_2: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 9 @ 0x9 +; CHECK-NEXT: .LCPI12_3: +; CHECK-NEXT: .long 12 @ 0xc +; CHECK-NEXT: .long 15 @ 0xf +; CHECK-NEXT: .long 18 @ 0x12 +; CHECK-NEXT: .long 21 @ 0x15 +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep5(<16 x i8*> %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep5: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.i32 q4, #0x100 +; CHECK-NEXT: vadd.i32 q2, q2, q4 +; CHECK-NEXT: vadd.i32 q3, q3, q4 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vadd.i32 q1, q1, q4 +; CHECK-NEXT: vmov r1, s15 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: ldrb.w r12, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r0] +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r2, [r0] +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r3, [r0] +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vadd.i32 q3, q0, q4 +; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[0], r5 +; CHECK-NEXT: vmov r5, s13 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[2], r5 +; CHECK-NEXT: vmov r5, s15 +; CHECK-NEXT: ldrb r5, [r5] +; 
CHECK-NEXT: vmov.8 q0[3], r5 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[4], r5 +; CHECK-NEXT: vmov r5, s5 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[5], r5 +; CHECK-NEXT: vmov r5, s6 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[6], r5 +; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[7], r5 +; CHECK-NEXT: vmov r5, s9 +; CHECK-NEXT: vmov.8 q0[8], r4 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[9], r5 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], lr +; CHECK-NEXT: vmov.8 q0[12], r2 +; CHECK-NEXT: vmov.8 q0[13], r3 +; CHECK-NEXT: vmov.8 q0[14], r0 +; CHECK-NEXT: vmov.8 q0[15], r1 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r4, r5, r7, pc} +entry: + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %base, i32 256 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep6(i8* %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep6: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI14_0 +; CHECK-NEXT: adr r2, .LCPI14_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r6, .LCPI14_2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrb.w r12, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ldrb r3, [r1] +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: adr r6, .LCPI14_3 +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r5 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r3 +; CHECK-NEXT: vmov.8 q0[14], r1 +; CHECK-NEXT: vmov.8 q0[15], r2 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI14_0: +; CHECK-NEXT: .long 257 @ 0x101 +; CHECK-NEXT: .long 28 @ 0x1c +; CHECK-NEXT: .long 31 @ 0x1f +; CHECK-NEXT: .long 34 @ 0x22 +; CHECK-NEXT: .LCPI14_1: +; CHECK-NEXT: .long 37 @ 0x25 +; CHECK-NEXT: .long 40 @ 0x28 +; CHECK-NEXT: .long 43 @ 0x2b +; CHECK-NEXT: .long 46 @ 0x2e +; CHECK-NEXT: .LCPI14_2: +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 4 @ 0x4 +; CHECK-NEXT: .long 7 @ 0x7 +; CHECK-NEXT: 
.long 10 @ 0xa +; CHECK-NEXT: .LCPI14_3: +; CHECK-NEXT: .long 13 @ 0xd +; CHECK-NEXT: .long 16 @ 0x10 +; CHECK-NEXT: .long 19 @ 0x13 +; CHECK-NEXT: .long 22 @ 0x16 +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 1 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep7(i8* %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep7: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI15_0 +; CHECK-NEXT: adr r2, .LCPI15_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r6, .LCPI15_2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrb.w r12, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ldrb r3, [r1] +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: adr r6, .LCPI15_3 +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r5 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r3 +; CHECK-NEXT: vmov.8 q0[14], r1 +; CHECK-NEXT: vmov.8 q0[15], r2 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI15_0: +; CHECK-NEXT: .long 224 @ 0xe0 +; CHECK-NEXT: .long 227 @ 0xe3 +; CHECK-NEXT: .long 230 @ 0xe6 +; CHECK-NEXT: .long 233 @ 0xe9 +; CHECK-NEXT: .LCPI15_1: +; CHECK-NEXT: .long 236 @ 0xec +; CHECK-NEXT: .long 239 @ 0xef +; CHECK-NEXT: .long 242 @ 0xf2 +; CHECK-NEXT: .long 245 @ 0xf5 +; CHECK-NEXT: .LCPI15_2: +; CHECK-NEXT: .long 300 @ 0x12c +; CHECK-NEXT: .long 203 @ 0xcb +; CHECK-NEXT: .long 206 @ 0xce +; CHECK-NEXT: .long 209 @ 0xd1 +; CHECK-NEXT: .LCPI15_3: +; CHECK-NEXT: .long 212 @ 0xd4 +; CHECK-NEXT: .long 215 @ 0xd7 +; CHECK-NEXT: .long 218 @ 0xda +; CHECK-NEXT: .long 221 @ 0xdd +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 200 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2(i8* %base, <16 x i8>* 
%offptr) { +; CHECK-LABEL: unscaled_v16i8_i8_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrb.s32 q0, [r1, #8] +; CHECK-NEXT: vldrb.s32 q2, [r1, #4] +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrb.s32 q0, [r1, #12] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r6, s4 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb.w lr, [r2] +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vldrb.s32 q0, [r1] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r6 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[14], r3 +; CHECK-NEXT: vmov.8 q0[15], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_3(i8* %base, <16 x i8>* %offptr) { +; CHECK-LABEL: unscaled_v16i8_i8_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI17_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrb.u8 q0, [r0, q1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI17_0: +; CHECK-NEXT: .byte 0 @ 0x0 +; CHECK-NEXT: .byte 3 @ 0x3 +; CHECK-NEXT: .byte 6 @ 0x6 +; CHECK-NEXT: .byte 9 @ 0x9 +; CHECK-NEXT: .byte 12 @ 0xc +; CHECK-NEXT: .byte 15 @ 0xf +; CHECK-NEXT: .byte 18 @ 0x12 +; CHECK-NEXT: .byte 21 @ 0x15 +; CHECK-NEXT: .byte 24 @ 0x18 +; CHECK-NEXT: .byte 27 @ 0x1b +; CHECK-NEXT: .byte 30 @ 0x1e +; CHECK-NEXT: .byte 33 @ 0x21 +; CHECK-NEXT: .byte 36 @ 0x24 +; CHECK-NEXT: .byte 39 @ 0x27 +; CHECK-NEXT: .byte 42 @ 0x2a +; CHECK-NEXT: .byte 45 @ 0x2d +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_basei16(i16* %base, <16 x i8>* %offptr) { +; CHECK-LABEL: unscaled_v16i8_basei16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrb.u32 q0, [r1, #8] +; CHECK-NEXT: vldrb.u32 q2, [r1, #4] +; CHECK-NEXT: 
vshl.i32 q0, q0, #1 +; CHECK-NEXT: vshl.i32 q2, q2, #1 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrb.u32 q0, [r1, #12] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vmov r6, s4 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r2] +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r6 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[14], r3 +; CHECK-NEXT: vmov.8 q0[15], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 + %offs.zext = zext <16 x i8> %offs to <16 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <16 x i32> %offs.zext + %ptrs.cast = bitcast <16 x i16*> %ptrs to <16 x i8*> + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs.cast, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>) declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) declare <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*>, i32, <2 x i1>, <2 x i8>) diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll new file mode 100644 index 0000000000000..7e01d24d006aa --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll @@ -0,0 +1,1012 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py + +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o 2>/dev/null - | FileCheck %s + +define void @ptr_iv_v4i32(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i32 %y) { +; CHECK-LABEL: ptr_iv_v4i32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr r3, .LCPI0_0 +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.i32 q1, q1, r2 +; CHECK-NEXT: vstrw.32 q1, [r1, q0, uxtw #2] +; CHECK-NEXT: 
adds r1, #64
+; CHECK-NEXT:    le lr, .LBB0_1
+; CHECK-NEXT:  @ %bb.2: @ %end
+; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:  .LCPI0_0:
+; CHECK-NEXT:    .long 0 @ 0x0
+; CHECK-NEXT:    .long 4 @ 0x4
+; CHECK-NEXT:    .long 8 @ 0x8
+; CHECK-NEXT:    .long 12 @ 0xc
+vector.ph:
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %y, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:
+  %pointer.phi = phi i32* [ %A, %vector.ph ], [ %0, %vector.body ]
+  %pointer.phi13 = phi i32* [ %B, %vector.ph ], [ %2, %vector.body ]
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr i32, i32* %pointer.phi, i32 16
+  %1 = getelementptr i32, i32* %pointer.phi, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %2 = getelementptr i32, i32* %pointer.phi13, i32 16
+  %3 = getelementptr i32, i32* %pointer.phi13, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  %4 = add nsw <4 x i32> %wide.masked.gather, %broadcast.splat
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %4, <4 x i32*> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %index.next = add i32 %index, 4
+  %5 = icmp eq i32 %index.next, 996
+  br i1 %5, label %end, label %vector.body
+
+end:
+  ret void
+}
+
+define void @ptr_iv_v4i32_mult(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i32 %y) {
+; CHECK-LABEL: ptr_iv_v4i32_mult:
+; CHECK:       @ %bb.0: @ %vector.ph
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    adr r1, .LCPI1_0
+; CHECK-NEXT:    adr r3, .LCPI1_1
+; CHECK-NEXT:    vldrw.u32 q0, [r3]
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    mov.w lr, #249
+; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:  .LBB1_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q2, [r0, q0, uxtw #2]
+; CHECK-NEXT:    vadd.i32 q2, q2, r2
+; CHECK-NEXT:    vstrw.32 q2, [r0, q1, uxtw #2]
+; CHECK-NEXT:    adds r0, #64
+; CHECK-NEXT:    le lr, .LBB1_1
+; CHECK-NEXT:  @ %bb.2: @ %end
+; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:  .LCPI1_0:
+; CHECK-NEXT:    .long 5 @ 0x5
+; CHECK-NEXT:    .long 9 @ 0x9
+; CHECK-NEXT:    .long 13 @ 0xd
+; CHECK-NEXT:    .long 17 @ 0x11
+; CHECK-NEXT:  .LCPI1_1:
+; CHECK-NEXT:    .long 3 @ 0x3
+; CHECK-NEXT:    .long 7 @ 0x7
+; CHECK-NEXT:    .long 11 @ 0xb
+; CHECK-NEXT:    .long 15 @ 0xf
+vector.ph:
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %y, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:
+  %pointer.phi = phi i32* [ %A, %vector.ph ], [ %0, %vector.body ]
+  %pointer.phi13 = phi i32* [ %B, %vector.ph ], [ %2, %vector.body ]
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr i32, i32* %pointer.phi, i32 16
+  %1 = getelementptr i32, i32* %pointer.phi, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %gather.address = getelementptr i32, <4 x i32*> %1, i32 3
+  %2 = getelementptr i32, i32* %pointer.phi13, i32 16
+  %3 = getelementptr i32, i32* %pointer.phi13, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %scatter.address = getelementptr i32, <4 x i32*> %1, i32 5
+  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gather.address, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  %4 = add nsw <4 x i32> %wide.masked.gather, %broadcast.splat
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %4, <4 x i32*> %scatter.address, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+
%index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + +define void @ptr_iv_v8i16(i16* noalias nocapture readonly %A, i16* noalias nocapture %B, i16 %y) { +; CHECK-LABEL: ptr_iv_v8i16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr r3, .LCPI2_0 +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.i16 q1, q1, r2 +; CHECK-NEXT: vstrh.16 q1, [r1, q0, uxtw #1] +; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: le lr, .LBB2_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI2_0: +; CHECK-NEXT: .short 0 @ 0x0 +; CHECK-NEXT: .short 4 @ 0x4 +; CHECK-NEXT: .short 8 @ 0x8 +; CHECK-NEXT: .short 12 @ 0xc +; CHECK-NEXT: .short 16 @ 0x10 +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 24 @ 0x18 +; CHECK-NEXT: .short 28 @ 0x1c +vector.ph: + %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %y, i32 0 + %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi i16* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi i16* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr i16, i16* %pointer.phi, i32 32 + %1 = getelementptr i16, i16* %pointer.phi, <8 x i16> + %2 = getelementptr i16, i16* %pointer.phi13, i32 32 + %3 = getelementptr i16, i16* %pointer.phi13, <8 x i16> + %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %1, i32 4, <8 x i1> , <8 x i16> undef) + %4 = add nsw <8 x i16> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %4, <8 x i16*> %3, i32 4, <8 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + + +define void @ptr_iv_v8i16_mult(i16* noalias nocapture readonly %A, i16* noalias nocapture %B, i16 %y) { +; CHECK-LABEL: ptr_iv_v8i16_mult: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr.w r12, .LCPI3_0 +; CHECK-NEXT: adr r3, .LCPI3_1 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u16 q2, [r0, q0, uxtw #1] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.i16 q2, q2, r2 +; CHECK-NEXT: vstrh.16 q2, [r1, q1, uxtw #1] +; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: le lr, .LBB3_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI3_0: +; CHECK-NEXT: .short 5 @ 0x5 +; CHECK-NEXT: .short 9 @ 0x9 +; CHECK-NEXT: .short 13 @ 0xd +; CHECK-NEXT: .short 17 @ 0x11 +; CHECK-NEXT: .short 21 @ 0x15 +; CHECK-NEXT: .short 25 @ 0x19 +; CHECK-NEXT: .short 29 @ 0x1d +; CHECK-NEXT: .short 33 @ 0x21 +; CHECK-NEXT: .LCPI3_1: +; CHECK-NEXT: .short 3 @ 0x3 +; CHECK-NEXT: .short 7 @ 0x7 +; CHECK-NEXT: .short 11 @ 0xb +; CHECK-NEXT: .short 15 @ 0xf +; CHECK-NEXT: .short 19 @ 0x13 +; CHECK-NEXT: .short 
23 @ 0x17 +; CHECK-NEXT: .short 27 @ 0x1b +; CHECK-NEXT: .short 31 @ 0x1f +vector.ph: + %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %y, i32 0 + %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi i16* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi i16* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr i16, i16* %pointer.phi, i32 32 + %1 = getelementptr i16, i16* %pointer.phi, <8 x i16> + %gather.address = getelementptr i16, <8 x i16*> %1, i16 3 + %2 = getelementptr i16, i16* %pointer.phi13, i32 32 + %3 = getelementptr i16, i16* %pointer.phi13, <8 x i16> + %scatter.address = getelementptr i16, <8 x i16*> %3, i16 5 + %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gather.address, i32 4, <8 x i1> , <8 x i16> undef) + %4 = add nsw <8 x i16> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %4, <8 x i16*> %scatter.address, i32 4, <8 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + +define void @ptr_iv_v16i8(i8* noalias nocapture readonly %A, i8* noalias nocapture %B, i8 %y) { +; CHECK-LABEL: ptr_iv_v16i8: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr r3, .LCPI4_0 +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q1, [r0, q0] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.i8 q1, q1, r2 +; CHECK-NEXT: vstrb.8 q1, [r1, q0] +; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: le lr, .LBB4_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI4_0: +; CHECK-NEXT: .byte 0 @ 0x0 +; CHECK-NEXT: .byte 4 @ 0x4 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 12 @ 0xc +; CHECK-NEXT: .byte 16 @ 0x10 +; CHECK-NEXT: .byte 20 @ 0x14 +; CHECK-NEXT: .byte 24 @ 0x18 +; CHECK-NEXT: .byte 28 @ 0x1c +; CHECK-NEXT: .byte 32 @ 0x20 +; CHECK-NEXT: .byte 36 @ 0x24 +; CHECK-NEXT: .byte 40 @ 0x28 +; CHECK-NEXT: .byte 44 @ 0x2c +; CHECK-NEXT: .byte 48 @ 0x30 +; CHECK-NEXT: .byte 52 @ 0x34 +; CHECK-NEXT: .byte 56 @ 0x38 +; CHECK-NEXT: .byte 60 @ 0x3c +vector.ph: ; preds = %entry + %broadcast.splatinsert = insertelement <16 x i8> undef, i8 %y, i32 0 + %broadcast.splat = shufflevector <16 x i8> %broadcast.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi i8* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi i8* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr i8, i8* %pointer.phi, i32 64 + %1 = getelementptr i8, i8* %pointer.phi, <16 x i8> + %2 = getelementptr i8, i8* %pointer.phi13, i32 64 + %3 = getelementptr i8, i8* %pointer.phi13, <16 x i8> + %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %1, i32 4, <16 x i1> , <16 x i8> undef) + %4 = add nsw <16 x i8> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %4, <16 x i8*> %3, i32 4, <16 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 
%index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + + +define void @ptr_iv_v16i8_mult(i8* noalias nocapture readonly %A, i8* noalias nocapture %B, i8 %y) { +; CHECK-LABEL: ptr_iv_v16i8_mult: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr.w r12, .LCPI5_0 +; CHECK-NEXT: adr r3, .LCPI5_1 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB5_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q2, [r0, q0] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.i8 q2, q2, r2 +; CHECK-NEXT: vstrb.8 q2, [r1, q1] +; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: le lr, .LBB5_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI5_0: +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 9 @ 0x9 +; CHECK-NEXT: .byte 13 @ 0xd +; CHECK-NEXT: .byte 17 @ 0x11 +; CHECK-NEXT: .byte 21 @ 0x15 +; CHECK-NEXT: .byte 25 @ 0x19 +; CHECK-NEXT: .byte 29 @ 0x1d +; CHECK-NEXT: .byte 33 @ 0x21 +; CHECK-NEXT: .byte 37 @ 0x25 +; CHECK-NEXT: .byte 41 @ 0x29 +; CHECK-NEXT: .byte 45 @ 0x2d +; CHECK-NEXT: .byte 49 @ 0x31 +; CHECK-NEXT: .byte 53 @ 0x35 +; CHECK-NEXT: .byte 57 @ 0x39 +; CHECK-NEXT: .byte 61 @ 0x3d +; CHECK-NEXT: .byte 65 @ 0x41 +; CHECK-NEXT: .LCPI5_1: +; CHECK-NEXT: .byte 3 @ 0x3 +; CHECK-NEXT: .byte 7 @ 0x7 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 15 @ 0xf +; CHECK-NEXT: .byte 19 @ 0x13 +; CHECK-NEXT: .byte 23 @ 0x17 +; CHECK-NEXT: .byte 27 @ 0x1b +; CHECK-NEXT: .byte 31 @ 0x1f +; CHECK-NEXT: .byte 35 @ 0x23 +; CHECK-NEXT: .byte 39 @ 0x27 +; CHECK-NEXT: .byte 43 @ 0x2b +; CHECK-NEXT: .byte 47 @ 0x2f +; CHECK-NEXT: .byte 51 @ 0x33 +; CHECK-NEXT: .byte 55 @ 0x37 +; CHECK-NEXT: .byte 59 @ 0x3b +; CHECK-NEXT: .byte 63 @ 0x3f +vector.ph: ; preds = %entry + %broadcast.splatinsert = insertelement <16 x i8> undef, i8 %y, i32 0 + %broadcast.splat = shufflevector <16 x i8> %broadcast.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi i8* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi i8* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr i8, i8* %pointer.phi, i32 64 + %1 = getelementptr i8, i8* %pointer.phi, <16 x i8> + %gather.address = getelementptr i8, <16 x i8*> %1, i8 3 + %2 = getelementptr i8, i8* %pointer.phi13, i32 64 + %3 = getelementptr i8, i8* %pointer.phi13, <16 x i8> + %scatter.address = getelementptr i8, <16 x i8*> %3, i8 5 + %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gather.address, i32 4, <16 x i1> , <16 x i8> undef) + %4 = add nsw <16 x i8> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %4, <16 x i8*> %scatter.address, i32 4, <16 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + +define void @ptr_iv_v4f32(float* noalias nocapture readonly %A, float* noalias nocapture %B, float %y) { +; CHECK-LABEL: ptr_iv_v4f32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr r3, .LCPI6_0 +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB6_1: @ %vector.body +; 
CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.f32 q1, q1, r2 +; CHECK-NEXT: vstrw.32 q1, [r1, q0, uxtw #2] +; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: le lr, .LBB6_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI6_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 4 @ 0x4 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 12 @ 0xc +vector.ph: ; preds = %entry + %broadcast.splatinsert = insertelement <4 x float> undef, float %y, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi float* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi float* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr float, float* %pointer.phi, i32 16 + %1 = getelementptr float, float* %pointer.phi, <4 x i32> + %2 = getelementptr float, float* %pointer.phi13, i32 16 + %3 = getelementptr float, float* %pointer.phi13, <4 x i32> + %wide.masked.gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %1, i32 4, <4 x i1> , <4 x float> undef) + %4 = fadd <4 x float> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %4, <4 x float*> %3, i32 4, <4 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + +define void @ptr_iv_v4f32_mult(float* noalias nocapture readonly %A, float* noalias nocapture %B, float %y) { +; CHECK-LABEL: ptr_iv_v4f32_mult: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr r1, .LCPI7_0 +; CHECK-NEXT: adr r3, .LCPI7_1 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB7_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q2, [r0, q0, uxtw #2] +; CHECK-NEXT: vadd.f32 q2, q2, r2 +; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: le lr, .LBB7_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI7_0: +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 9 @ 0x9 +; CHECK-NEXT: .long 13 @ 0xd +; CHECK-NEXT: .long 17 @ 0x11 +; CHECK-NEXT: .LCPI7_1: +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 7 @ 0x7 +; CHECK-NEXT: .long 11 @ 0xb +; CHECK-NEXT: .long 15 @ 0xf +vector.ph: ; preds = %entry + %broadcast.splatinsert = insertelement <4 x float> undef, float %y, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi float* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi float* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr float, float* %pointer.phi, i32 16 + %1 = getelementptr float, float* %pointer.phi, <4 x i32> + %gather.address = getelementptr float, <4 x float*> %1, i32 3 + %2 = getelementptr float, float* %pointer.phi13, i32 16 + %3 = getelementptr float, float* %pointer.phi13, <4 x i32> + %scatter.address = getelementptr float, <4 x 
float*> %1, i32 5 + %wide.masked.gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gather.address, i32 4, <4 x i1> , <4 x float> undef) + %4 = fadd <4 x float> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %4, <4 x float*> %scatter.address, i32 4, <4 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + +define void @ptr_iv_v8f16(half* noalias nocapture readonly %A, half* noalias nocapture %B, float %y) { +; CHECK-LABEL: ptr_iv_v8f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov s0, r2 +; CHECK-NEXT: adr r3, .LCPI8_0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB8_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.f16 q1, q1, r2 +; CHECK-NEXT: vstrh.16 q1, [r1, q0, uxtw #1] +; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: le lr, .LBB8_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI8_0: +; CHECK-NEXT: .short 0 @ 0x0 +; CHECK-NEXT: .short 4 @ 0x4 +; CHECK-NEXT: .short 8 @ 0x8 +; CHECK-NEXT: .short 12 @ 0xc +; CHECK-NEXT: .short 16 @ 0x10 +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 24 @ 0x18 +; CHECK-NEXT: .short 28 @ 0x1c +vector.ph: + %y.trunc = fptrunc float %y to half + %broadcast.splatinsert = insertelement <8 x half> undef, half %y.trunc, i32 0 + %broadcast.splat = shufflevector <8 x half> %broadcast.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi half* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi half* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr half, half* %pointer.phi, i32 32 + %1 = getelementptr half, half* %pointer.phi, <8 x i16> + %2 = getelementptr half, half* %pointer.phi13, i32 32 + %3 = getelementptr half, half* %pointer.phi13, <8 x i16> + %wide.masked.gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %1, i32 4, <8 x i1> , <8 x half> undef) + %4 = fadd <8 x half> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %4, <8 x half*> %3, i32 4, <8 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + +define void @ptr_iv_v8f16_mult(half* noalias nocapture readonly %A, half* noalias nocapture %B, float %y) { +; CHECK-LABEL: ptr_iv_v8f16_mult: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov s0, r2 +; CHECK-NEXT: adr r2, .LCPI9_0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: vmov.f16 r1, s0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: adr r2, .LCPI9_1 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: .LBB9_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u16 q2, [r0, q0, uxtw #1] +; CHECK-NEXT: vadd.f16 q2, q2, r1 +; CHECK-NEXT: vstrh.16 q2, [r0, q1, uxtw #1] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: le lr, .LBB9_1 +; CHECK-NEXT: @ %bb.2: @ 
%end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI9_0: +; CHECK-NEXT: .short 3 @ 0x3 +; CHECK-NEXT: .short 7 @ 0x7 +; CHECK-NEXT: .short 11 @ 0xb +; CHECK-NEXT: .short 15 @ 0xf +; CHECK-NEXT: .short 19 @ 0x13 +; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .short 27 @ 0x1b +; CHECK-NEXT: .short 31 @ 0x1f +; CHECK-NEXT: .LCPI9_1: +; CHECK-NEXT: .short 5 @ 0x5 +; CHECK-NEXT: .short 9 @ 0x9 +; CHECK-NEXT: .short 13 @ 0xd +; CHECK-NEXT: .short 17 @ 0x11 +; CHECK-NEXT: .short 21 @ 0x15 +; CHECK-NEXT: .short 25 @ 0x19 +; CHECK-NEXT: .short 29 @ 0x1d +; CHECK-NEXT: .short 33 @ 0x21 +vector.ph: + %y.trunc = fptrunc float %y to half + %broadcast.splatinsert = insertelement <8 x half> undef, half %y.trunc, i32 0 + %broadcast.splat = shufflevector <8 x half> %broadcast.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi half* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi half* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr half, half* %pointer.phi, i32 32 + %1 = getelementptr half, half* %pointer.phi, <8 x i16> + %gather.address = getelementptr half, <8 x half*> %1, i32 3 + %2 = getelementptr half, half* %pointer.phi13, i32 32 + %3 = getelementptr half, half* %pointer.phi13, <8 x i16> + %scatter.address = getelementptr half, <8 x half*> %1, i32 5 + %wide.masked.gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gather.address, i32 4, <8 x i1> , <8 x half> undef) + %4 = fadd <8 x half> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %4, <8 x half*> %scatter.address, i32 4, <8 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + + +define arm_aapcs_vfpcc void @three_pointer_iv_v4i32(i32* nocapture readonly %x, i32* nocapture %z, i32 %n) { +; CHECK-LABEL: three_pointer_iv_v4i32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: adr.w r12, .LCPI10_0 +; CHECK-NEXT: adr.w lr, .LCPI10_1 +; CHECK-NEXT: adr r3, .LCPI10_2 +; CHECK-NEXT: vldrw.u32 q2, [lr] +; CHECK-NEXT: vldrw.u32 q1, [r3] +; CHECK-NEXT: vldrw.u32 q3, [r12] +; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: .LBB10_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q4, [r0, q1, uxtw #2] +; CHECK-NEXT: vldrw.u32 q5, [r0, q2, uxtw #2] +; CHECK-NEXT: vldrw.u32 q6, [r0, q3, uxtw #2] +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vmul.i32 q4, q5, q4 +; CHECK-NEXT: add.w r0, r0, #48 +; CHECK-NEXT: vmul.i32 q6, q5, q6 +; CHECK-NEXT: vmul.i32 q5, q5, q0 +; CHECK-NEXT: vstrw.32 q5, [r1, q2, uxtw #2] +; CHECK-NEXT: vstrw.32 q6, [r1, q3, uxtw #2] +; CHECK-NEXT: vstrw.32 q4, [r1, q1, uxtw #2] +; CHECK-NEXT: add.w r1, r1, #48 +; CHECK-NEXT: bne .LBB10_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI10_0: +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 4 @ 0x4 +; CHECK-NEXT: .long 7 @ 0x7 +; CHECK-NEXT: .long 10 @ 0xa +; CHECK-NEXT: .LCPI10_1: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 9 @ 0x9 +; 
CHECK-NEXT: .LCPI10_2: +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 11 @ 0xb +vector.ph: + br label %vector.body + +vector.body: + %pointer.phi = phi i32* [ %x, %vector.ph ], [ %v3, %vector.body ] + %pointer.phi55 = phi i32* [ %z, %vector.ph ], [ %v4, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vector.gep = getelementptr i32, i32* %pointer.phi, <4 x i32> + %v3 = getelementptr i32, i32* %pointer.phi, i32 12 + %vector.gep56 = getelementptr i32, i32* %pointer.phi55, <4 x i32> + %v4 = getelementptr i32, i32* %pointer.phi55, i32 12 + %v5 = add i32 %index, 0 + %v6 = getelementptr inbounds i32, <4 x i32*> %vector.gep, i32 1 + %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %vector.gep, i32 4, <4 x i1> , <4 x i32> undef) + %v7 = getelementptr inbounds i32, <4 x i32*> %vector.gep, i32 2 + %wide.masked.gather57 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %v6, i32 4, <4 x i1> , <4 x i32> undef) + %wide.masked.gather58 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %v7, i32 4, <4 x i1> , <4 x i32> undef) + %v11 = mul nuw nsw <4 x i32> %wide.masked.gather, + %v13 = mul nuw nsw <4 x i32> %wide.masked.gather, %wide.masked.gather57 + %v15 = mul nuw nsw <4 x i32> %wide.masked.gather, %wide.masked.gather58 + %v17 = getelementptr inbounds i32, <4 x i32*> %vector.gep56, i32 1 + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %v11, <4 x i32*> %vector.gep56, i32 4, <4 x i1> ) + %v18 = getelementptr inbounds i32, <4 x i32*> %vector.gep56, i32 2 + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %v13, <4 x i32*> %v17, i32 4, <4 x i1> ) + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %v15, <4 x i32*> %v18, i32 4, <4 x i1> ) + %index.next = add i32 %index, 4 + %v37 = icmp eq i32 %index.next, %n + br i1 %v37, label %end, label %vector.body + +end: + ret void; +} + +define arm_aapcs_vfpcc void @three_pointer_iv_v4i8(i8* nocapture readonly %x, i8* nocapture %z, i32 %n) { +; CHECK-LABEL: three_pointer_iv_v4i8: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: adr.w r12, .LCPI11_0 +; CHECK-NEXT: adr.w lr, .LCPI11_1 +; CHECK-NEXT: adr r3, .LCPI11_2 +; CHECK-NEXT: vldrw.u32 q2, [lr] +; CHECK-NEXT: vldrw.u32 q1, [r3] +; CHECK-NEXT: vldrw.u32 q3, [r12] +; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: .LBB11_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u32 q4, [r0, q1] +; CHECK-NEXT: vldrb.u32 q5, [r0, q2] +; CHECK-NEXT: vldrb.u32 q6, [r0, q3] +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vmul.i32 q4, q5, q4 +; CHECK-NEXT: add.w r0, r0, #12 +; CHECK-NEXT: vmul.i32 q6, q5, q6 +; CHECK-NEXT: vmul.i32 q5, q5, q0 +; CHECK-NEXT: vstrb.32 q5, [r1, q2] +; CHECK-NEXT: vstrb.32 q6, [r1, q3] +; CHECK-NEXT: vstrb.32 q4, [r1, q1] +; CHECK-NEXT: add.w r1, r1, #12 +; CHECK-NEXT: bne .LBB11_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI11_0: +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 4 @ 0x4 +; CHECK-NEXT: .long 7 @ 0x7 +; CHECK-NEXT: .long 10 @ 0xa +; CHECK-NEXT: .LCPI11_1: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 9 @ 0x9 +; CHECK-NEXT: .LCPI11_2: +; 
CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 11 @ 0xb +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %pointer.phi = phi i8* [ %x, %vector.ph ], [ %v3, %vector.body ] + %pointer.phi55 = phi i8* [ %z, %vector.ph ], [ %v4, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vector.gep = getelementptr i8, i8* %pointer.phi, <4 x i32> + %v3 = getelementptr i8, i8* %pointer.phi, i32 12 + %vector.gep56 = getelementptr i8, i8* %pointer.phi55, <4 x i32> + %v4 = getelementptr i8, i8* %pointer.phi55, i32 12 + %v5 = add i32 %index, 0 + %v6 = getelementptr inbounds i8, <4 x i8*> %vector.gep, i32 1 + %wide.masked.gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %vector.gep, i32 1, <4 x i1> , <4 x i8> undef) + %v7 = getelementptr inbounds i8, <4 x i8*> %vector.gep, i32 2 + %wide.masked.gather57 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %v6, i32 1, <4 x i1> , <4 x i8> undef) + %wide.masked.gather58 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %v7, i32 1, <4 x i1> , <4 x i8> undef) + %v8 = zext <4 x i8> %wide.masked.gather to <4 x i32> + %v9 = zext <4 x i8> %wide.masked.gather57 to <4 x i32> + %v10 = zext <4 x i8> %wide.masked.gather58 to <4 x i32> + %v11 = mul nuw nsw <4 x i32> %v8, + %v12 = trunc <4 x i32> %v11 to <4 x i8> + %v13 = mul nuw nsw <4 x i32> %v8, %v9 + %v14 = trunc <4 x i32> %v13 to <4 x i8> + %v15 = mul nuw nsw <4 x i32> %v8, %v10 + %v16 = trunc <4 x i32> %v15 to <4 x i8> + %v17 = getelementptr inbounds i8, <4 x i8*> %vector.gep56, i32 1 + call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %v12, <4 x i8*> %vector.gep56, i32 1, <4 x i1> ) + %v18 = getelementptr inbounds i8, <4 x i8*> %vector.gep56, i32 2 + call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %v14, <4 x i8*> %v17, i32 1, <4 x i1> ) + call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %v16, <4 x i8*> %v18, i32 1, <4 x i1> ) + %index.next = add i32 %index, 4 + %v37 = icmp eq i32 %index.next, %n + br i1 %v37, label %end, label %vector.body + +end: + ret void; +} + +define arm_aapcs_vfpcc void @three_pointer_iv_v8i16(i16* nocapture readonly %x, i16* nocapture %z, i32 %n) { +; CHECK-LABEL: three_pointer_iv_v8i16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: adr.w r12, .LCPI12_0 +; CHECK-NEXT: adr.w lr, .LCPI12_1 +; CHECK-NEXT: adr r3, .LCPI12_2 +; CHECK-NEXT: vldrw.u32 q2, [lr] +; CHECK-NEXT: vldrw.u32 q1, [r3] +; CHECK-NEXT: vldrw.u32 q3, [r12] +; CHECK-NEXT: vmov.i16 q0, #0xa +; CHECK-NEXT: .LBB12_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u16 q4, [r0, q1, uxtw #1] +; CHECK-NEXT: vldrh.u16 q5, [r0, q2, uxtw #1] +; CHECK-NEXT: vldrh.u16 q6, [r0, q3, uxtw #1] +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vmul.i16 q4, q5, q4 +; CHECK-NEXT: add.w r0, r0, #48 +; CHECK-NEXT: vmul.i16 q6, q5, q6 +; CHECK-NEXT: vmul.i16 q5, q5, q0 +; CHECK-NEXT: vstrh.16 q5, [r1, q2, uxtw #1] +; CHECK-NEXT: vstrh.16 q6, [r1, q3, uxtw #1] +; CHECK-NEXT: vstrh.16 q4, [r1, q1, uxtw #1] +; CHECK-NEXT: add.w r1, r1, #48 +; CHECK-NEXT: bne .LBB12_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI12_0: +; CHECK-NEXT: .short 1 @ 0x1 +; CHECK-NEXT: .short 4 
@ 0x4 +; CHECK-NEXT: .short 7 @ 0x7 +; CHECK-NEXT: .short 10 @ 0xa +; CHECK-NEXT: .short 13 @ 0xd +; CHECK-NEXT: .short 16 @ 0x10 +; CHECK-NEXT: .short 19 @ 0x13 +; CHECK-NEXT: .short 22 @ 0x16 +; CHECK-NEXT: .LCPI12_1: +; CHECK-NEXT: .short 0 @ 0x0 +; CHECK-NEXT: .short 3 @ 0x3 +; CHECK-NEXT: .short 6 @ 0x6 +; CHECK-NEXT: .short 9 @ 0x9 +; CHECK-NEXT: .short 12 @ 0xc +; CHECK-NEXT: .short 15 @ 0xf +; CHECK-NEXT: .short 18 @ 0x12 +; CHECK-NEXT: .short 21 @ 0x15 +; CHECK-NEXT: .LCPI12_2: +; CHECK-NEXT: .short 2 @ 0x2 +; CHECK-NEXT: .short 5 @ 0x5 +; CHECK-NEXT: .short 8 @ 0x8 +; CHECK-NEXT: .short 11 @ 0xb +; CHECK-NEXT: .short 14 @ 0xe +; CHECK-NEXT: .short 17 @ 0x11 +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 23 @ 0x17 +vector.ph: + br label %vector.body + +vector.body: + %pointer.phi = phi i16* [ %x, %vector.ph ], [ %v3, %vector.body ] + %pointer.phi55 = phi i16* [ %z, %vector.ph ], [ %v4, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vector.gep = getelementptr i16, i16* %pointer.phi, <8 x i16> + %v3 = getelementptr i16, i16* %pointer.phi, i32 24 + %vector.gep56 = getelementptr i16, i16* %pointer.phi55, <8 x i16> + %v4 = getelementptr i16, i16* %pointer.phi55, i32 24 + %v5 = add i32 %index, 0 + %v6 = getelementptr inbounds i16, <8 x i16*> %vector.gep, i16 1 + %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %vector.gep, i32 2, <8 x i1> , <8 x i16> undef) + %v7 = getelementptr inbounds i16, <8 x i16*> %vector.gep, i16 2 + %wide.masked.gather57 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %v6, i32 2, <8 x i1> , <8 x i16> undef) + %wide.masked.gather58 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %v7, i32 2, <8 x i1> , <8 x i16> undef) + %v11 = mul nuw nsw <8 x i16> %wide.masked.gather, + %v13 = mul nuw nsw <8 x i16> %wide.masked.gather, %wide.masked.gather57 + %v15 = mul nuw nsw <8 x i16> %wide.masked.gather, %wide.masked.gather58 + %v17 = getelementptr inbounds i16, <8 x i16*> %vector.gep56, i32 1 + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %v11, <8 x i16*> %vector.gep56, i32 2, <8 x i1> ) + %v18 = getelementptr inbounds i16, <8 x i16*> %vector.gep56, i32 2 + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %v13, <8 x i16*> %v17, i32 2, <8 x i1> ) + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %v15, <8 x i16*> %v18, i32 2, <8 x i1> ) + %index.next = add i32 %index, 4 + %v37 = icmp eq i32 %index.next, %n + br i1 %v37, label %end, label %vector.body + +end: + ret void; +} + +define arm_aapcs_vfpcc void @three_pointer_iv_v16i8(i8* nocapture readonly %x, i8* nocapture %z, i32 %n) { +; CHECK-LABEL: three_pointer_iv_v16i8: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: adr.w r12, .LCPI13_0 +; CHECK-NEXT: adr.w lr, .LCPI13_1 +; CHECK-NEXT: adr r3, .LCPI13_2 +; CHECK-NEXT: vldrw.u32 q2, [lr] +; CHECK-NEXT: vldrw.u32 q1, [r3] +; CHECK-NEXT: vldrw.u32 q3, [r12] +; CHECK-NEXT: vmov.i8 q0, #0xa +; CHECK-NEXT: .LBB13_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q4, [r0, q1] +; CHECK-NEXT: vldrb.u8 q5, [r0, q2] +; CHECK-NEXT: vldrb.u8 q6, [r0, q3] +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vmul.i8 q4, q5, q4 +; CHECK-NEXT: add.w r0, r0, #48 +; CHECK-NEXT: vmul.i8 q6, q5, q6 +; CHECK-NEXT: vmul.i8 q5, q5, q0 +; CHECK-NEXT: vstrb.8 q5, [r1, q2] +; 
CHECK-NEXT: vstrb.8 q6, [r1, q3] +; CHECK-NEXT: vstrb.8 q4, [r1, q1] +; CHECK-NEXT: add.w r1, r1, #48 +; CHECK-NEXT: bne .LBB13_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI13_0: +; CHECK-NEXT: .byte 1 @ 0x1 +; CHECK-NEXT: .byte 4 @ 0x4 +; CHECK-NEXT: .byte 7 @ 0x7 +; CHECK-NEXT: .byte 10 @ 0xa +; CHECK-NEXT: .byte 13 @ 0xd +; CHECK-NEXT: .byte 16 @ 0x10 +; CHECK-NEXT: .byte 19 @ 0x13 +; CHECK-NEXT: .byte 22 @ 0x16 +; CHECK-NEXT: .byte 25 @ 0x19 +; CHECK-NEXT: .byte 28 @ 0x1c +; CHECK-NEXT: .byte 31 @ 0x1f +; CHECK-NEXT: .byte 34 @ 0x22 +; CHECK-NEXT: .byte 37 @ 0x25 +; CHECK-NEXT: .byte 40 @ 0x28 +; CHECK-NEXT: .byte 43 @ 0x2b +; CHECK-NEXT: .byte 46 @ 0x2e +; CHECK-NEXT: .LCPI13_1: +; CHECK-NEXT: .byte 0 @ 0x0 +; CHECK-NEXT: .byte 3 @ 0x3 +; CHECK-NEXT: .byte 6 @ 0x6 +; CHECK-NEXT: .byte 9 @ 0x9 +; CHECK-NEXT: .byte 12 @ 0xc +; CHECK-NEXT: .byte 15 @ 0xf +; CHECK-NEXT: .byte 18 @ 0x12 +; CHECK-NEXT: .byte 21 @ 0x15 +; CHECK-NEXT: .byte 24 @ 0x18 +; CHECK-NEXT: .byte 27 @ 0x1b +; CHECK-NEXT: .byte 30 @ 0x1e +; CHECK-NEXT: .byte 33 @ 0x21 +; CHECK-NEXT: .byte 36 @ 0x24 +; CHECK-NEXT: .byte 39 @ 0x27 +; CHECK-NEXT: .byte 42 @ 0x2a +; CHECK-NEXT: .byte 45 @ 0x2d +; CHECK-NEXT: .LCPI13_2: +; CHECK-NEXT: .byte 2 @ 0x2 +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 14 @ 0xe +; CHECK-NEXT: .byte 17 @ 0x11 +; CHECK-NEXT: .byte 20 @ 0x14 +; CHECK-NEXT: .byte 23 @ 0x17 +; CHECK-NEXT: .byte 26 @ 0x1a +; CHECK-NEXT: .byte 29 @ 0x1d +; CHECK-NEXT: .byte 32 @ 0x20 +; CHECK-NEXT: .byte 35 @ 0x23 +; CHECK-NEXT: .byte 38 @ 0x26 +; CHECK-NEXT: .byte 41 @ 0x29 +; CHECK-NEXT: .byte 44 @ 0x2c +; CHECK-NEXT: .byte 47 @ 0x2f +vector.ph: + br label %vector.body + +vector.body: + %pointer.phi = phi i8* [ %x, %vector.ph ], [ %v3, %vector.body ] + %pointer.phi55 = phi i8* [ %z, %vector.ph ], [ %v4, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vector.gep = getelementptr i8, i8* %pointer.phi, <16 x i8> + %v3 = getelementptr i8, i8* %pointer.phi, i32 48 + %vector.gep56 = getelementptr i8, i8* %pointer.phi55, <16 x i8> + %v4 = getelementptr i8, i8* %pointer.phi55, i32 48 + %v5 = add i32 %index, 0 + %v6 = getelementptr inbounds i8, <16 x i8*> %vector.gep, i8 1 + %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %vector.gep, i32 1, <16 x i1> , <16 x i8> undef) + %v7 = getelementptr inbounds i8, <16 x i8*> %vector.gep, i8 2 + %wide.masked.gather57 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %v6, i32 1, <16 x i1> , <16 x i8> undef) + %wide.masked.gather58 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %v7, i32 1, <16 x i1> , <16 x i8> undef) + %v11 = mul nuw nsw <16 x i8> %wide.masked.gather, + %v13 = mul nuw nsw <16 x i8> %wide.masked.gather, %wide.masked.gather57 + %v15 = mul nuw nsw <16 x i8> %wide.masked.gather, %wide.masked.gather58 + %v17 = getelementptr inbounds i8, <16 x i8*> %vector.gep56, i32 1 + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %v11, <16 x i8*> %vector.gep56, i32 1, <16 x i1> ) + %v18 = getelementptr inbounds i8, <16 x i8*> %vector.gep56, i32 2 + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %v13, <16 x i8*> %v17, i32 1, <16 x i1> ) + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %v15, <16 x i8*> %v18, i32 1, <16 x i1> ) + %index.next = add i32 %index, 4 + %v37 = icmp eq i32 
%index.next, %n + br i1 %v37, label %end, label %vector.body + +end: + ret void; +} + +declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>) +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) +declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>) +declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>) +declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>) +declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>) + +declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll index 2d9e2047e08cb..ac615a1d57a1c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll @@ -177,5 +177,75 @@ entry: ret void } +define arm_aapcs_vfpcc void @scaled_v8i16_i16_2gep(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) { +; CHECK-LABEL: scaled_v8i16_i16_2gep: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q1, [r1, #8] +; CHECK-NEXT: vldrh.s32 q3, [r1] +; CHECK-NEXT: vmov.i32 q2, #0x28 +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vshl.i32 q3, q3, #1 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vadd.i32 q2, q3, q2 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %offs + %ptrs2 = getelementptr inbounds i16, <8 x i16*> %ptrs, i16 20 + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %input, <8 x i16*> %ptrs2, i32 2, <8 x i1> ) + ret void +} + +define arm_aapcs_vfpcc void @scaled_v8i16_i16_2gep2(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) { +; CHECK-LABEL: scaled_v8i16_i16_2gep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI9_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI9_0: +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .short 26 @ 0x1a +; CHECK-NEXT: .short 29 @ 0x1d +; CHECK-NEXT: .short 
32 @ 0x20 +; CHECK-NEXT: .short 35 @ 0x23 +; CHECK-NEXT: .short 38 @ 0x26 +; CHECK-NEXT: .short 41 @ 0x29 +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> + %ptrs2 = getelementptr inbounds i16, <8 x i16*> %ptrs, i16 20 + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %input, <8 x i16*> %ptrs2, i32 2, <8 x i1> ) + ret void +} + declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>) declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll index 25a9cea5f5be9..1c9871e54b10d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll @@ -233,6 +233,58 @@ entry: ret void } +define arm_aapcs_vfpcc void @ext_scaled_i16_i32_2gep(i16* %base, <4 x i32>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: ext_scaled_i16_i32_2gep: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vmov.i32 q1, #0xa +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vshl.i32 q2, q2, #1 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q1, q2, q1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 + %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs + %ptrs2 = getelementptr inbounds i16, <4 x i16*> %ptrs, i16 5 + %t = trunc <4 x i32> %input to <4 x i16> + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs2, i32 2, <4 x i1> ) + ret void +} + +define arm_aapcs_vfpcc void @ext_scaled_i16_i32_2gep2(i16* %base, <4 x i32>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: ext_scaled_i16_i32_2gep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI16_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI16_0: +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 11 @ 0xb +; CHECK-NEXT: .long 14 @ 0xe +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <4 x i16> + %ptrs2 = getelementptr inbounds i16, <4 x i16*> %ptrs, i16 5 + %t = trunc <4 x i32> %input to <4 x i16> + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs2, i32 2, <4 x i1> ) + ret void +} + declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>) declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>) declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll index d84066cdc5bdd..7fde69ef836c8 100644 --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll @@ -534,6 +534,115 @@ entry: ret void } +define arm_aapcs_vfpcc void @unscaled_v16i8_i8_2gep(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) { +; CHECK-LABEL: unscaled_v16i8_i8_2gep: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vldrb.s32 q1, 
[r1, #12] +; CHECK-NEXT: vldrb.s32 q2, [r1, #8] +; CHECK-NEXT: vldrb.s32 q3, [r1, #4] +; CHECK-NEXT: vldrb.s32 q5, [r1] +; CHECK-NEXT: vmov.i32 q4, #0x5 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: vadd.i32 q5, q5, r0 +; CHECK-NEXT: vadd.i32 q1, q1, q4 +; CHECK-NEXT: vadd.i32 q2, q2, q4 +; CHECK-NEXT: vadd.i32 q3, q3, q4 +; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vmov.u8 r1, q0[0] +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov.u8 r1, q0[2] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: vmov.u8 r1, q0[3] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov.u8 r1, q0[5] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.u8 r1, q0[6] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vmov.u8 r1, q0[7] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.u8 r1, q0[8] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.u8 r1, q0[9] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.u8 r1, q0[10] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.u8 r1, q0[11] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.u8 r1, q0[13] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.u8 r1, q0[14] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5 + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs2, i32 1, <16 x i1> ) + ret void +} + +define arm_aapcs_vfpcc void @unscaled_v16i8_i8_2gep2(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) { +; CHECK-LABEL: unscaled_v16i8_i8_2gep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI11_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vstrb.8 q0, [r0, q1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI11_0: +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 14 @ 0xe +; CHECK-NEXT: .byte 17 @ 0x11 +; CHECK-NEXT: .byte 20 @ 0x14 +; CHECK-NEXT: .byte 23 @ 0x17 +; CHECK-NEXT: .byte 26 @ 0x1a +; CHECK-NEXT: .byte 29 @ 0x1d +; CHECK-NEXT: .byte 32 @ 0x20 +; CHECK-NEXT: .byte 35 @ 0x23 +; CHECK-NEXT: .byte 38 @ 0x26 +; CHECK-NEXT: .byte 41 @ 0x29 +; CHECK-NEXT: .byte 44 @ 0x2c +; CHECK-NEXT: .byte 47 @ 0x2f +; CHECK-NEXT: .byte 50 @ 0x32 +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5 + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs2, i32 1, <16 x i1> ) + ret void +} + declare void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8>, <2 x i8*>, i32, <2 x i1>) declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>) From 
bd4757cc4e40cb1b916c2c39f7bac8abc5adc294 Mon Sep 17 00:00:00 2001 From: Christy Lee Date: Tue, 28 Jul 2020 09:41:27 -0700 Subject: [PATCH 0323/1035] [ELF] --reproduce should include lto sample profile Reviewed By: MaskRay Differential Revision: https://reviews.llvm.org/D84569 --- lld/ELF/Driver.cpp | 3 +++ lld/test/ELF/reproduce-lto.s | 10 ++++++++++ 2 files changed, 13 insertions(+) create mode 100644 lld/test/ELF/reproduce-lto.s diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 30bff945f1a54..64a41ba77ba2f 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -495,6 +495,9 @@ void LinkerDriver::main(ArrayRef argsArr) { tar = std::move(*errOrWriter); tar->append("response.txt", createResponseFile(args)); tar->append("version.txt", getLLDVersion() + "\n"); + StringRef ltoSampleProfile = args.getLastArgValue(OPT_lto_sample_profile); + if (!ltoSampleProfile.empty()) + readFile(ltoSampleProfile); } else { error("--reproduce: " + toString(errOrWriter.takeError())); } diff --git a/lld/test/ELF/reproduce-lto.s b/lld/test/ELF/reproduce-lto.s new file mode 100644 index 0000000000000..e7c3ece61085d --- /dev/null +++ b/lld/test/ELF/reproduce-lto.s @@ -0,0 +1,10 @@ +# REQUIRES: x86 + +# RUN: rm -rf %t.dir +# RUN: mkdir -p %t.dir/build1 +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.dir/build1/foo.o +# RUN: echo > %t.dir/build1/empty_profile.txt +# RUN: cd %t.dir +# RUN: ld.lld build1/foo.o -o /dev/null --reproduce repro1.tar --lto-sample-profile=%t.dir/build1/empty_profile.txt +# RUN: tar tvf repro1.tar | FileCheck %s +# CHECK: repro1/{{.*}}/empty_profile.txt From 12f27fc4b505da848a06b37488c5717bf9e3b85d Mon Sep 17 00:00:00 2001 From: Julian Lettner Date: Tue, 28 Jul 2020 09:44:02 -0700 Subject: [PATCH 0324/1035] [Darwin] Cleanup code via improved GetMacosAlignedVersion() Checking the OS version via `GetMacosAlignedVersion()` now works in simulators [1]. Let's use it to simplify `DyldNeedsEnvVariable()`. [1] 3fb0de820796cc6e322c8378713d375d9870a353 Reviewed By: delcypher Differential Revision: https://reviews.llvm.org/D81197 --- .../lib/sanitizer_common/sanitizer_mac.cpp | 16 +++------------- compiler-rt/lib/sanitizer_common/sanitizer_mac.h | 1 + 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp index 522a909e95285..f96f187131972 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp @@ -879,20 +879,10 @@ bool ReexecDisabled() { return false; } -extern "C" SANITIZER_WEAK_ATTRIBUTE double dyldVersionNumber; -static const double kMinDyldVersionWithAutoInterposition = 360.0; - -bool DyldNeedsEnvVariable() { - // Although sanitizer support was added to LLVM on OS X 10.7+, GCC users - // still may want use them on older systems. On older Darwin platforms, dyld - // doesn't export dyldVersionNumber symbol and we simply return true. - if (!&dyldVersionNumber) return true; +static bool DyldNeedsEnvVariable() { // If running on OS X 10.11+ or iOS 9.0+, dyld will interpose even if - // DYLD_INSERT_LIBRARIES is not set. However, checking OS version via - // GetMacosAlignedVersion() doesn't work for the simulator. Let's instead - // check `dyldVersionNumber`, which is exported by dyld, against a known - // version number from the first OS release where this appeared. - return dyldVersionNumber < kMinDyldVersionWithAutoInterposition; + // DYLD_INSERT_LIBRARIES is not set. 
+  return GetMacosAlignedVersion() < MacosVersion(10, 11);
 }

 void MaybeReexec() {
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.h b/compiler-rt/lib/sanitizer_common/sanitizer_mac.h
index 90ecff4815c2e..f61ebe2566e5f 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.h
@@ -44,6 +44,7 @@ struct VersionBase {
     return major > other.major ||
            (major == other.major && minor >= other.minor);
   }
+  bool operator<(const VersionType &other) const { return !(*this >= other); }
 };

 struct MacosVersion : VersionBase {

From f761acfb1a737d8a631a5e55b58cdb7c2215baad Mon Sep 17 00:00:00 2001
From: Vince Bridgers
Date: Thu, 16 Jul 2020 17:46:48 -0500
Subject: [PATCH 0325/1035] [ASTImporter] Add Visitor for TypedefNameDecls

We found a case where Typedef Name Declarations were not being added
correctly when importing builtin types. This exposed the need for a
TypedefNameDecl visitor so that the underlying RecordDecl and its fields
can be added as well.

This code is covered by the ASTImporterTest cases that use the implicit
struct __NSConstantString_tag definitions.

Thanks to @martong for the debugging assist!

Depends on D83970.

Reviewed By: martong

Differential Revision: https://reviews.llvm.org/D83992
---
 clang/lib/AST/ASTImporterLookupTable.cpp | 14 +++++++++++++
 clang/test/Analysis/Inputs/ctu-import.c | 15 ++++++++++++++
 .../ctu-import.c.externalDefMap.ast-dump.txt | 1 +
 clang/test/Analysis/ctu-implicit.c | 20 +++++++++++++++++++
 4 files changed, 50 insertions(+)
 create mode 100644 clang/test/Analysis/Inputs/ctu-import.c
 create mode 100644 clang/test/Analysis/Inputs/ctu-import.c.externalDefMap.ast-dump.txt
 create mode 100644 clang/test/Analysis/ctu-implicit.c

diff --git a/clang/lib/AST/ASTImporterLookupTable.cpp b/clang/lib/AST/ASTImporterLookupTable.cpp
index 4d6fff8f34191..e17d6082dcdcc 100644
--- a/clang/lib/AST/ASTImporterLookupTable.cpp
+++ b/clang/lib/AST/ASTImporterLookupTable.cpp
@@ -22,6 +22,20 @@ namespace {
 struct Builder : RecursiveASTVisitor<Builder> {
   ASTImporterLookupTable &LT;
   Builder(ASTImporterLookupTable &LT) : LT(LT) {}
+
+  bool VisitTypedefNameDecl(TypedefNameDecl *D) {
+    QualType Ty = D->getUnderlyingType();
+    Ty = Ty.getCanonicalType();
+    if (const auto *RTy = dyn_cast<RecordType>(Ty)) {
+      LT.add(RTy->getAsRecordDecl());
+      // Iterate over the field decls, adding them.
+      for (auto *it : RTy->getAsRecordDecl()->fields()) {
+        LT.add(it);
+      }
+    }
+    return true;
+  }
+
   bool VisitNamedDecl(NamedDecl *D) {
     LT.add(D);
     return true;
diff --git a/clang/test/Analysis/Inputs/ctu-import.c b/clang/test/Analysis/Inputs/ctu-import.c
new file mode 100644
index 0000000000000..6c99a36427978
--- /dev/null
+++ b/clang/test/Analysis/Inputs/ctu-import.c
@@ -0,0 +1,15 @@
+
+// Use an internal, implicitly defined type, called by
+// a function imported for CTU. This should not crash.
+int foo(void);
+int foobar(int skip) {
+  __NSConstantString str = {.flags = 1};
+
+  if (str.flags >= 0)
+    str.flags = 0;
+  return 4;
+}
+
+int testStaticImplicit(void) {
+  return foobar(3);
+}
diff --git a/clang/test/Analysis/Inputs/ctu-import.c.externalDefMap.ast-dump.txt b/clang/test/Analysis/Inputs/ctu-import.c.externalDefMap.ast-dump.txt
new file mode 100644
index 0000000000000..83d3b4ca451e8
--- /dev/null
+++ b/clang/test/Analysis/Inputs/ctu-import.c.externalDefMap.ast-dump.txt
@@ -0,0 +1 @@
+c:@F@testStaticImplicit ctu-import.c.ast
diff --git a/clang/test/Analysis/ctu-implicit.c b/clang/test/Analysis/ctu-implicit.c
new file mode 100644
index 0000000000000..925044845e090
--- /dev/null
+++ b/clang/test/Analysis/ctu-implicit.c
@@ -0,0 +1,20 @@
+// RUN: rm -rf %t && mkdir %t
+// RUN: mkdir -p %t/ctudir2
+// RUN: %clang_cc1 \
+// RUN:   -emit-pch -o %t/ctudir2/ctu-import.c.ast %S/Inputs/ctu-import.c
+// RUN: cp %S/Inputs/ctu-import.c.externalDefMap.ast-dump.txt %t/ctudir2/externalDefMap.txt
+// RUN: %clang_cc1 -analyze \
+// RUN:   -analyzer-checker=core,debug.ExprInspection \
+// RUN:   -analyzer-config experimental-enable-naive-ctu-analysis=true \
+// RUN:   -analyzer-config display-ctu-progress=true \
+// RUN:   -analyzer-config ctu-dir=%t/ctudir2 \
+// RUN:   -verify %s
+
+void clang_analyzer_eval(int);
+
+int testStaticImplicit(void);
+int func(void) {
+  int ret = testStaticImplicit();
+  clang_analyzer_eval(ret == 4); // expected-warning{{TRUE}}
+  return testStaticImplicit();
+}

From 4853a86022feee1b16249cc7a0ea12c842fa5986 Mon Sep 17 00:00:00 2001
From: Jez Ng
Date: Tue, 28 Jul 2020 09:56:50 -0700
Subject: [PATCH 0326/1035] [lld-macho] Support -filelist

Xcode passes input files to the linker using this flag.

Reviewed By: #lld-macho, compnerd

Differential Revision: https://reviews.llvm.org/D84486
---
 lld/MachO/Driver.cpp | 12 +++++++++
 lld/test/MachO/filelist.s | 40 ++++++++++++++++++++++++++++
 lld/test/MachO/invalid/no-filelist.s | 9 +++++++
 3 files changed, 61 insertions(+)
 create mode 100644 lld/test/MachO/filelist.s
 create mode 100644 lld/test/MachO/invalid/no-filelist.s

diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index ee794129e1fc6..d76e0115d10fc 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -213,6 +213,15 @@ static void addFile(StringRef path) {
   }
 }

+static void addFileList(StringRef path) {
+  Optional<MemoryBufferRef> buffer = readFile(path);
+  if (!buffer)
+    return;
+  MemoryBufferRef mbref = *buffer;
+  for (StringRef path : args::getLines(mbref))
+    addFile(path);
+}
+
 static std::array<StringRef, 6> archNames{"arm", "arm64", "i386", "x86_64", "ppc", "ppc64"};
 static bool isArchString(StringRef s) {
@@ -411,6 +420,9 @@ bool macho::link(llvm::ArrayRef<const char *> argsArr, bool canExitEarly,
   case OPT_INPUT:
     addFile(arg->getValue());
     break;
+  case OPT_filelist:
+    addFileList(arg->getValue());
+    break;
   case OPT_l: {
     StringRef name = arg->getValue();
     if (Optional<std::string> path = findLibrary(name)) {
diff --git a/lld/test/MachO/filelist.s b/lld/test/MachO/filelist.s
new file mode 100644
index 0000000000000..673740ebf06df
--- /dev/null
+++ b/lld/test/MachO/filelist.s
@@ -0,0 +1,40 @@
+# REQUIRES: x86
+
+## This test verifies that the paths in -filelist get processed in command-line
+## order.
+ +# RUN: mkdir -p %t +# RUN: echo ".globl _foo; .weak_definition _foo; .section __TEXT,first; _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/first.o +# RUN: echo ".globl _foo; .weak_definition _foo; .section __TEXT,second; _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/second.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/test.o + +# FIRST: __TEXT,first _foo +# SECOND: __TEXT,second _foo + +# RUN: echo "%t/first.o" > filelist +# RUN: echo "%t/second.o" >> filelist +# RUN: lld -flavor darwinnew -Z -filelist filelist %t/test.o -o %t/test +# RUN: llvm-objdump --syms %t/test | FileCheck %s --check-prefix=FIRST + +# RUN: echo "%t/second.o" > filelist +# RUN: echo "%t/first.o" >> filelist +# RUN: lld -flavor darwinnew -Z -filelist filelist %t/test.o -o %t/test +# RUN: llvm-objdump --syms %t/test | FileCheck %s --check-prefix=SECOND + +# RUN: echo "%t/first.o" > filelist +# RUN: lld -flavor darwinnew -Z -filelist filelist %t/second.o %t/test.o -o %t/test +# RUN: llvm-objdump --syms %t/test | FileCheck %s --check-prefix=FIRST +# RUN: lld -flavor darwinnew -Z %t/second.o -filelist filelist %t/test.o -o %t/test +# RUN: llvm-objdump --syms %t/test | FileCheck %s --check-prefix=SECOND + +# RUN: echo "%t/first.o" > filelist-1 +# RUN: echo "%t/second.o" > filelist-2 +# RUN: lld -flavor darwinnew -Z -filelist filelist-1 -filelist filelist-2 %t/test.o -o %t/test +# RUN: llvm-objdump --syms %t/test | FileCheck %s --check-prefix=FIRST +# RUN: lld -flavor darwinnew -Z -filelist filelist-2 -filelist filelist-1 %t/test.o -o %t/test +# RUN: llvm-objdump --syms %t/test | FileCheck %s --check-prefix=SECOND + +.globl _main + +_main: + ret diff --git a/lld/test/MachO/invalid/no-filelist.s b/lld/test/MachO/invalid/no-filelist.s new file mode 100644 index 0000000000000..fb80185c66aef --- /dev/null +++ b/lld/test/MachO/invalid/no-filelist.s @@ -0,0 +1,9 @@ +# REQUIRES: x86 +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o +# RUN: not lld -flavor darwinnew -Z -filelist nonexistent %t.o -o %t 2>&1 | FileCheck %s +# CHECK: cannot open nonexistent: No such file or directory + +.globl _main + +_main: + ret From d32e32500f92602ccedcf967df2915da6f3803d2 Mon Sep 17 00:00:00 2001 From: Jez Ng Date: Tue, 28 Jul 2020 09:56:55 -0700 Subject: [PATCH 0327/1035] [lld-macho] Fix segment filesize calculation The previous approach of adding up the file sizes of the component sections ignored the fact that the sections did not have to be contiguous in the file. As such, it was underestimating the true size. I discovered this issue because `codesign` checks whether `__LINKEDIT` extends to the end of the file. Since we were underestimating segment sizes, this check failed. 
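To see why summing the component sections' file sizes can underestimate, consider a segment whose second section is aligned up in the file, leaving a gap. The following standalone C++ sketch (hypothetical offsets, not lld's actual code) contrasts the old sum-based computation with the end-based one used in this patch:

  #include <algorithm>
  #include <cassert>
  #include <cstdint>
  #include <vector>

  struct Section {
    uint64_t fileOff;  // absolute file offset of the section
    uint64_t fileSize; // bytes the section occupies in the file
  };

  int main() {
    // Hypothetical segment at file offset 0x1000; the second section is
    // aligned up, leaving a 0x100-byte gap after the first one.
    uint64_t segFileOff = 0x1000;
    std::vector<Section> secs = {{0x1000, 0x200}, {0x1300, 0x80}};

    uint64_t sumOfSizes = 0, endBased = 0;
    for (const Section &s : secs) {
      sumOfSizes += s.fileSize; // old approach: ignores inter-section gaps
      endBased = std::max(endBased, s.fileOff + s.fileSize - segFileOff);
    }
    assert(sumOfSizes == 0x280); // misses the 0x100 alignment gap
    assert(endBased == 0x380);   // reaches the segment's true last byte
    return 0;
  }

The end-based form is what lets __LINKEDIT's filesize extend to the last byte of the output, which is exactly the property codesign verifies.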
Reviewed By: #lld-macho, compnerd Differential Revision: https://reviews.llvm.org/D84574 --- lld/MachO/Writer.cpp | 8 +++- lld/test/MachO/section-headers.s | 16 ++++++-- lld/test/MachO/segments.s | 69 ++++++++++++++++++-------------- 3 files changed, 59 insertions(+), 34 deletions(-) diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp index 03000a7f437e0..c9070e90f97e0 100644 --- a/lld/MachO/Writer.cpp +++ b/lld/MachO/Writer.cpp @@ -134,7 +134,11 @@ class LCSegment : public LoadCommand { c->nsects = seg->numNonHiddenSections(); for (OutputSection *osec : seg->getSections()) { - c->filesize += osec->getFileSize(); + if (!isZeroFill(osec->flags)) { + assert(osec->fileOff >= seg->fileOff); + c->filesize = std::max( + c->filesize, osec->fileOff + osec->getFileSize() - seg->fileOff); + } if (osec->isHidden()) continue; @@ -454,6 +458,8 @@ void Writer::assignAddresses(OutputSegment *seg) { seg->fileOff = fileOff; for (auto *osec : seg->getSections()) { + if (!osec->isNeeded()) + continue; addr = alignTo(addr, osec->align); fileOff = alignTo(fileOff, osec->align); osec->addr = addr; diff --git a/lld/test/MachO/section-headers.s b/lld/test/MachO/section-headers.s index 9fafc5a912b0f..fdfdbed632450 100644 --- a/lld/test/MachO/section-headers.s +++ b/lld/test/MachO/section-headers.s @@ -1,7 +1,7 @@ # REQUIRES: x86 # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o # RUN: lld -flavor darwinnew -o %t %t.o -# RUN: llvm-readobj --section-headers %t | FileCheck %s +# RUN: llvm-readobj --section-headers --macho-segment %t | FileCheck %s # CHECK: Name: __text # CHECK-NEXT: Segment: __TEXT @@ -25,11 +25,21 @@ # CHECK: Name: maxlen_16ch_name # CHECK-NEXT: Segment: __TEXT -# CHECK-NOT: } -# CHECK: Alignment: 3 +# CHECK-NEXT: Address: +# CHECK-NEXT: Size: [[#%x, LAST_SEC_SIZE:]] +# CHECK-NEXT: Offset: [[#%u, LAST_SEC_OFF:]] +# CHECK-NEXT: Alignment: 3 # CHECK-NOT: } # CHECK: Type: Regular (0x0) +# CHECK-LABEL: Segment { +# CHECK: Name: __TEXT +# CHECK-NEXT: Size: +# CHECK-NEXT: vmaddr: +# CHECK-NEXT: vmsize: +# CHECK-NEXT: fileoff: 0 +# CHECK-NEXT: filesize: [[#%u, LAST_SEC_SIZE + LAST_SEC_OFF]] + .text .align 1 .global _main diff --git a/lld/test/MachO/segments.s b/lld/test/MachO/segments.s index acb0f1e90101a..e0f127fabe55f 100644 --- a/lld/test/MachO/segments.s +++ b/lld/test/MachO/segments.s @@ -1,49 +1,58 @@ -# REQUIRES: x86 +# REQUIRES: x86, shell # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o # RUN: lld -flavor darwinnew -o %t %t.o -# RUN: llvm-readobj --macho-segment %t | FileCheck %s +# RUN: (llvm-readobj --macho-segment %t; echo "Total file size"; wc -c %t) | FileCheck %s ## These two segments must always be present at the start of an executable. # CHECK-NOT: Segment { # CHECK: Segment { -# CHECK: Cmd: LC_SEGMENT_64 -# CHECK: Name: __PAGEZERO -# CHECK: Size: 72 -# CHECK: vmaddr: 0x0 -# CHECK: vmsize: 0x100000000 -# CHECK: fileoff: 0 -# CHECK: filesize: 0 +# CHECK-NEXT: Cmd: LC_SEGMENT_64 +# CHECK-NEXT: Name: __PAGEZERO +# CHECK-NEXT: Size: 72 +# CHECK-NEXT: vmaddr: 0x0 +# CHECK-NEXT: vmsize: 0x100000000 +# CHECK-NEXT: fileoff: 0 +# CHECK-NEXT: filesize: 0 ## The kernel won't execute a binary with the wrong protections for __PAGEZERO. 
-# CHECK: maxprot: --- -# CHECK: initprot: --- -# CHECK: nsects: 0 -# CHECK: flags: 0x0 -# CHECK: } -# CHECK: Segment { -# CHECK: Cmd: LC_SEGMENT_64 -# CHECK: Name: __TEXT -# CHECK: Size: 152 -# CHECK: vmaddr: 0x100000000 -# CHECK: vmsize: +# CHECK-NEXT: maxprot: --- +# CHECK-NEXT: initprot: --- +# CHECK-NEXT: nsects: 0 +# CHECK-NEXT: flags: 0x0 +# CHECK-NEXT: } +# CHECK-NEXT: Segment { +# CHECK-NEXT: Cmd: LC_SEGMENT_64 +# CHECK-NEXT: Name: __TEXT +# CHECK-NEXT: Size: 152 +# CHECK-NEXT: vmaddr: 0x100000000 +# CHECK-NEXT: vmsize: ## dyld3 assumes that the __TEXT segment starts from the file header -# CHECK: fileoff: 0 -# CHECK: filesize: -# CHECK: maxprot: rwx -# CHECK: initprot: r-x -# CHECK: nsects: 1 -# CHECK: flags: 0x0 -# CHECK: } +# CHECK-NEXT: fileoff: 0 +# CHECK-NEXT: filesize: +# CHECK-NEXT: maxprot: rwx +# CHECK-NEXT: initprot: r-x +# CHECK-NEXT: nsects: 1 +# CHECK-NEXT: flags: 0x0 +# CHECK-NEXT: } ## Check that we handle max-length names correctly. # CHECK: Cmd: LC_SEGMENT_64 # CHECK-NEXT: Name: maxlen_16ch_name -## This segment must always be present at the end of an executable. +## This segment must always be present at the end of an executable, and cover +## its last byte. # CHECK: Name: __LINKEDIT -# CHECK: maxprot: rwx -# CHECK: initprot: r-- +# CHECK-NEXT: Size: +# CHECK-NEXT: vmaddr: +# CHECK-NEXT: vmsize: +# CHECK-NEXT: fileoff: [[#%u, LINKEDIT_OFF:]] +# CHECK-NEXT: filesize: [[#%u, LINKEDIT_SIZE:]] +# CHECK-NEXT: maxprot: rwx +# CHECK-NEXT: initprot: r-- # CHECK-NOT: Cmd: LC_SEGMENT_64 +# CHECK-LABEL: Total file size +# CHECK-NEXT: [[#%u, LINKEDIT_OFF + LINKEDIT_SIZE]] + .text .global _main _main: From 4b56238b136ba88b9e7794deca86bab7ccbe3ae3 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 28 Jul 2020 13:09:32 -0400 Subject: [PATCH 0328/1035] NFC: Add whitespace changing revisions to .git-blame-ignore-revs --- .git-blame-ignore-revs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 71d0488b32790..7c759a1adc950 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -22,3 +22,12 @@ b9c1b51e45b845debb76d8658edabca70ca56079 # r302496: That is the revert of r302421 ff63090b0e1072bd398b8efef8ae2291613a6ec9 + +# Fix more line endings changed in r320089. NFC. +d8f0e6caa91e230a486c948ab643174e40bdf215 + +# Correct line endings that got mixed up in r320089; NFC. +29dc5ded45447915d96ef7ca3f02acf2232282e0 + +# Remove line-endings added by r320089. NFC. +100a0eedc00b2bf48bcdc6c209c000745a4a0e48 From 64cdd5b3da1abce63838bfc5bd32398c834a32e3 Mon Sep 17 00:00:00 2001 From: Nicolas Vasilache Date: Tue, 28 Jul 2020 09:58:44 -0400 Subject: [PATCH 0329/1035] [mlir][Vector] Drop declarative transforms For the purpose of vector transforms, the Tablegen-based infra is subsumed by simple C++ pattern application. Deprecate declarative transforms whose complexity does not pay for itself. 
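The C++ side that replaces the TableGen file is visible in the updated test pass later in this patch; as a quick orientation, the whole registration reduces to a body like the following sketch (template arguments are inferred from that diff, so treat them as an approximation rather than the exact API):

  void runOnFunction() override {
    MLIRContext *ctx = &getContext();
    OwningRewritePatternList patterns;
    // Unroll elementwise addf ops to 2x2 tiles, and contraction ops to
    // 2x2x2 tiles; these registrations subsume the deleted TableGen rules.
    patterns.insert<UnrollVectorPattern<AddFOp>>(ArrayRef<int64_t>{2, 2}, ctx);
    patterns.insert<UnrollVectorPattern<vector::ContractionOp>>(
        ArrayRef<int64_t>{2, 2, 2}, ctx);
    populateVectorToVectorCanonicalizationPatterns(patterns, ctx);
    populateVectorToVectorTransformationPatterns(patterns, ctx);
    applyPatternsAndFoldGreedily(getFunction(), patterns);
  }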
Differential Revision: https://reviews.llvm.org/D84753 --- .../mlir/Dialect/Vector/CMakeLists.txt | 4 --- .../Dialect/Vector/VectorTransformPatterns.td | 26 -------------- mlir/lib/Dialect/Vector/CMakeLists.txt | 1 - mlir/test/lib/CMakeLists.txt | 1 - .../lib/DeclarativeTransforms/CMakeLists.txt | 3 -- .../TestVectorTransformPatterns.td | 34 ------------------- .../lib/DeclarativeTransforms/lit.local.cfg | 1 - mlir/test/lib/Transforms/CMakeLists.txt | 3 -- .../lib/Transforms/TestVectorTransforms.cpp | 12 +++---- 9 files changed, 6 insertions(+), 79 deletions(-) delete mode 100644 mlir/include/mlir/Dialect/Vector/VectorTransformPatterns.td delete mode 100644 mlir/test/lib/DeclarativeTransforms/CMakeLists.txt delete mode 100644 mlir/test/lib/DeclarativeTransforms/TestVectorTransformPatterns.td delete mode 100644 mlir/test/lib/DeclarativeTransforms/lit.local.cfg diff --git a/mlir/include/mlir/Dialect/Vector/CMakeLists.txt b/mlir/include/mlir/Dialect/Vector/CMakeLists.txt index a27eef693a288..23ad74e0cb72f 100644 --- a/mlir/include/mlir/Dialect/Vector/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Vector/CMakeLists.txt @@ -1,6 +1,2 @@ add_mlir_dialect(VectorOps vector) add_mlir_doc(VectorOps -gen-op-doc VectorOps Dialects/) - -set(LLVM_TARGET_DEFINITIONS VectorTransformPatterns.td) -mlir_tablegen(VectorTransformPatterns.h.inc -gen-rewriters) -add_public_tablegen_target(MLIRVectorTransformPatternsIncGen) diff --git a/mlir/include/mlir/Dialect/Vector/VectorTransformPatterns.td b/mlir/include/mlir/Dialect/Vector/VectorTransformPatterns.td deleted file mode 100644 index ef8118ec64704..0000000000000 --- a/mlir/include/mlir/Dialect/Vector/VectorTransformPatterns.td +++ /dev/null @@ -1,26 +0,0 @@ -//===- VectorTransformPatterns.td - Vector-Vector patterns -*- tablegen -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This is the pattern definition file for declarative Vector transformations. 
-// -//===----------------------------------------------------------------------===// - -#ifndef VECTOR_TRANSFORM_PATTERNS -#define VECTOR_TRANSFORM_PATTERNS - -include "mlir/IR/OpBase.td" - -class HasShape shape> : - CPred<"$0.getType().cast().hasStaticShape({" # - StrJoinInt.result # "})">; - -class UnrollVectorOp factors> : NativeCodeCall< - "unrollSingleResultVectorOp($_builder, $0.getDefiningOp(), " # - "{" # StrJoinInt.result # "})">; - -#endif // VECTOR_TRANSFORM_PATTERNS diff --git a/mlir/lib/Dialect/Vector/CMakeLists.txt b/mlir/lib/Dialect/Vector/CMakeLists.txt index d6ba987e6622f..7b34f1933c42c 100644 --- a/mlir/lib/Dialect/Vector/CMakeLists.txt +++ b/mlir/lib/Dialect/Vector/CMakeLists.txt @@ -9,7 +9,6 @@ add_mlir_dialect_library(MLIRVector DEPENDS MLIRVectorOpsIncGen - MLIRVectorTransformPatternsIncGen LINK_LIBS PUBLIC MLIREDSC diff --git a/mlir/test/lib/CMakeLists.txt b/mlir/test/lib/CMakeLists.txt index 641a6218d1ccd..0df357c8c355e 100644 --- a/mlir/test/lib/CMakeLists.txt +++ b/mlir/test/lib/CMakeLists.txt @@ -1,4 +1,3 @@ -add_subdirectory(DeclarativeTransforms) add_subdirectory(Dialect) add_subdirectory(IR) add_subdirectory(Pass) diff --git a/mlir/test/lib/DeclarativeTransforms/CMakeLists.txt b/mlir/test/lib/DeclarativeTransforms/CMakeLists.txt deleted file mode 100644 index 67d194ff868a5..0000000000000 --- a/mlir/test/lib/DeclarativeTransforms/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -set(LLVM_TARGET_DEFINITIONS TestVectorTransformPatterns.td) -mlir_tablegen(TestVectorTransformPatterns.h.inc -gen-rewriters) -add_public_tablegen_target(MLIRTestVectorTransformPatternsIncGen) diff --git a/mlir/test/lib/DeclarativeTransforms/TestVectorTransformPatterns.td b/mlir/test/lib/DeclarativeTransforms/TestVectorTransformPatterns.td deleted file mode 100644 index 2c6ca1a05733e..0000000000000 --- a/mlir/test/lib/DeclarativeTransforms/TestVectorTransformPatterns.td +++ /dev/null @@ -1,34 +0,0 @@ -//===- TestVectorTransformPatterns.td - Test patterns ---*- tablegen ----*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This is the pattern definition file for declarative Vector transformations -// tests. -// -//===----------------------------------------------------------------------===// - -#ifndef TEST_VECTOR_TRANSFORMS_PATTERNS -#define TEST_VECTOR_TRANSFORMS_PATTERNS - -include "mlir/Dialect/StandardOps/IR/Ops.td" -include "mlir/Dialect/Vector/VectorOps.td" -include "mlir/Dialect/Vector/VectorTransformPatterns.td" - -def : Pat<(AddFOp:$op_results $a, $b), - (UnrollVectorOp<[2, 2]> $op_results, $a, $b), - [(Constraint> $a)]>; - -def : Pat<(AddFOp:$op_results $a, $b), - (UnrollVectorOp<[2, 2]> $op_results, $a, $b), - [(Constraint> $a)]>; - -// TODO: Add Constraints on lhs/rhs shapes. 
-def : Pat<(Vector_ContractionOp:$op_results $a, $b, $c, $masks, $attr0, $attr1), - (UnrollVectorOp<[2, 2, 2]> $op_results, $a, $b, $c), - [(Constraint> $c)]>; - -#endif // TEST_VECTOR_TRANSFORMS_PATTERNS diff --git a/mlir/test/lib/DeclarativeTransforms/lit.local.cfg b/mlir/test/lib/DeclarativeTransforms/lit.local.cfg deleted file mode 100644 index edb5b44b2e2fe..0000000000000 --- a/mlir/test/lib/DeclarativeTransforms/lit.local.cfg +++ /dev/null @@ -1 +0,0 @@ -config.suffixes.remove('.td') \ No newline at end of file diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt index cdfd4e8a815b0..c3318316c5085 100644 --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -32,7 +32,6 @@ add_mlir_library(MLIRTestTransforms DEPENDS MLIRStandardOpsIncGen - MLIRTestVectorTransformPatternsIncGen LINK_LIBS PUBLIC MLIRAffineOps @@ -59,5 +58,3 @@ add_mlir_library(MLIRTestTransforms include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../Dialect/Test) include_directories(${CMAKE_CURRENT_BINARY_DIR}/../Dialect/Test) -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../DeclarativeTransforms) -include_directories(${CMAKE_CURRENT_BINARY_DIR}/../DeclarativeTransforms) diff --git a/mlir/test/lib/Transforms/TestVectorTransforms.cpp b/mlir/test/lib/Transforms/TestVectorTransforms.cpp index 2dffd88ed7093..2058706dcbdd3 100644 --- a/mlir/test/lib/Transforms/TestVectorTransforms.cpp +++ b/mlir/test/lib/Transforms/TestVectorTransforms.cpp @@ -18,16 +18,16 @@ using namespace mlir; using namespace mlir::vector; namespace { -#include "TestVectorTransformPatterns.h.inc" - struct TestVectorToVectorConversion : public PassWrapper { void runOnFunction() override { OwningRewritePatternList patterns; - auto *context = &getContext(); - populateWithGenerated(context, &patterns); - populateVectorToVectorCanonicalizationPatterns(patterns, context); - populateVectorToVectorTransformationPatterns(patterns, context); + auto *ctx = &getContext(); + patterns.insert>(ArrayRef{2, 2}, ctx); + patterns.insert>( + ArrayRef{2, 2, 2}, ctx); + populateVectorToVectorCanonicalizationPatterns(patterns, ctx); + populateVectorToVectorTransformationPatterns(patterns, ctx); applyPatternsAndFoldGreedily(getFunction(), patterns); } }; From 4887495a3e0b97de4e38232e9f65b9454434a818 Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Wed, 29 Jul 2020 02:16:34 +0900 Subject: [PATCH 0330/1035] [JumpThreading] Add tests that have a cast of freeze and vice versa --- llvm/test/Transforms/JumpThreading/freeze.ll | 92 ++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/llvm/test/Transforms/JumpThreading/freeze.ll b/llvm/test/Transforms/JumpThreading/freeze.ll index 3c6aa98c32b84..e08aabf5be170 100644 --- a/llvm/test/Transforms/JumpThreading/freeze.ll +++ b/llvm/test/Transforms/JumpThreading/freeze.ll @@ -49,6 +49,98 @@ F2: ret i32 %B } +define i32 @test1_cast(i1 %cond) { +; CHECK-LABEL: @test1_cast( +; CHECK-NEXT: br i1 [[COND:%.*]], label [[T1:%.*]], label [[F1:%.*]] +; CHECK: T1: +; CHECK-NEXT: [[V1:%.*]] = call i32 @f1() +; CHECK-NEXT: br label [[MERGE:%.*]] +; CHECK: F1: +; CHECK-NEXT: [[V2:%.*]] = call i32 @f2() +; CHECK-NEXT: br label [[MERGE]] +; CHECK: Merge: +; CHECK-NEXT: [[A0:%.*]] = phi i32 [ 1, [[T1]] ], [ 0, [[F1]] ] +; CHECK-NEXT: [[B:%.*]] = phi i32 [ [[V1]], [[T1]] ], [ [[V2]], [[F1]] ] +; CHECK-NEXT: [[A:%.*]] = trunc i32 [[A0]] to i1 +; CHECK-NEXT: [[A_FR:%.*]] = freeze i1 [[A]] +; CHECK-NEXT: br i1 [[A_FR]], label [[T2:%.*]], label [[F2:%.*]] +; CHECK: 
T2: +; CHECK-NEXT: call void @f3() +; CHECK-NEXT: ret i32 [[B]] +; CHECK: F2: +; CHECK-NEXT: ret i32 [[B]] +; + br i1 %cond, label %T1, label %F1 + +T1: + %v1 = call i32 @f1() + br label %Merge + +F1: + %v2 = call i32 @f2() + br label %Merge + +Merge: + %A0 = phi i32 [1, %T1], [0, %F1] + %B = phi i32 [%v1, %T1], [%v2, %F1] + %A = trunc i32 %A0 to i1 + %A.fr = freeze i1 %A + br i1 %A.fr, label %T2, label %F2 + +T2: + call void @f3() + ret i32 %B + +F2: + ret i32 %B +} + +define i32 @test1_cast2(i1 %cond) { +; CHECK-LABEL: @test1_cast2( +; CHECK-NEXT: br i1 [[COND:%.*]], label [[T1:%.*]], label [[F1:%.*]] +; CHECK: T1: +; CHECK-NEXT: [[V1:%.*]] = call i32 @f1() +; CHECK-NEXT: br label [[MERGE:%.*]] +; CHECK: F1: +; CHECK-NEXT: [[V2:%.*]] = call i32 @f2() +; CHECK-NEXT: br label [[MERGE]] +; CHECK: Merge: +; CHECK-NEXT: [[A0:%.*]] = phi i32 [ 1, [[T1]] ], [ 0, [[F1]] ] +; CHECK-NEXT: [[B:%.*]] = phi i32 [ [[V1]], [[T1]] ], [ [[V2]], [[F1]] ] +; CHECK-NEXT: [[A0_FR:%.*]] = freeze i32 [[A0]] +; CHECK-NEXT: [[A_FR:%.*]] = trunc i32 [[A0_FR]] to i1 +; CHECK-NEXT: br i1 [[A_FR]], label [[T2:%.*]], label [[F2:%.*]] +; CHECK: T2: +; CHECK-NEXT: call void @f3() +; CHECK-NEXT: ret i32 [[B]] +; CHECK: F2: +; CHECK-NEXT: ret i32 [[B]] +; + br i1 %cond, label %T1, label %F1 + +T1: + %v1 = call i32 @f1() + br label %Merge + +F1: + %v2 = call i32 @f2() + br label %Merge + +Merge: + %A0 = phi i32 [1, %T1], [0, %F1] + %B = phi i32 [%v1, %T1], [%v2, %F1] + %A0.fr = freeze i32 %A0 + %A.fr = trunc i32 %A0.fr to i1 + br i1 %A.fr, label %T2, label %F2 + +T2: + call void @f3() + ret i32 %B + +F2: + ret i32 %B +} + define i32 @test1_undef(i1 %cond) { ; CHECK-LABEL: @test1_undef( ; CHECK-NEXT: br i1 [[COND:%.*]], label [[T1:%.*]], label [[F1:%.*]] From 04a21318b55756d50836f6e40f2d209f18cce417 Mon Sep 17 00:00:00 2001 From: Yitzhak Mandelbaum Date: Tue, 28 Jul 2020 17:26:12 +0000 Subject: [PATCH 0331/1035] [libTooling] Add a `between` range-selector combinator. Adds the `between` combinator and registers it with the parser. As a driveby, updates some deprecated names to their current versions. Reviewed By: gribozavr2 Differential Revision: https://reviews.llvm.org/D84315 --- .../clang/Tooling/Transformer/RangeSelector.h | 5 +++ clang/lib/Tooling/Transformer/Parsing.cpp | 4 +-- clang/unittests/Tooling/RangeSelectorTest.cpp | 33 ++++++++++++++++--- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/clang/include/clang/Tooling/Transformer/RangeSelector.h b/clang/include/clang/Tooling/Transformer/RangeSelector.h index 2807037bc208f..e070c0e7e2e6b 100644 --- a/clang/include/clang/Tooling/Transformer/RangeSelector.h +++ b/clang/include/clang/Tooling/Transformer/RangeSelector.h @@ -56,6 +56,11 @@ RangeSelector before(RangeSelector Selector); /// * the TokenRange [B,E'] where the token at E' spans the range [E',E). RangeSelector after(RangeSelector Selector); +/// Selects the range between `R1` and `R2. +inline RangeSelector between(RangeSelector R1, RangeSelector R2) { + return enclose(after(std::move(R1)), before(std::move(R2))); +} + /// Selects a node, including trailing semicolon (for non-expression /// statements). \p ID is the node's binding in the match result. 
RangeSelector node(std::string ID); diff --git a/clang/lib/Tooling/Transformer/Parsing.cpp b/clang/lib/Tooling/Transformer/Parsing.cpp index 1579115b93138..fb5fd4a800bbb 100644 --- a/clang/lib/Tooling/Transformer/Parsing.cpp +++ b/clang/lib/Tooling/Transformer/Parsing.cpp @@ -109,14 +109,14 @@ getUnaryRangeSelectors() { static const llvm::StringMap> & getBinaryStringSelectors() { static const llvm::StringMap> M = { - {"encloseNodes", range}}; + {"encloseNodes", encloseNodes}}; return M; } static const llvm::StringMap> & getBinaryRangeSelectors() { static const llvm::StringMap> - M = {{"enclose", range}}; + M = {{"enclose", enclose}, {"between", between}}; return M; } diff --git a/clang/unittests/Tooling/RangeSelectorTest.cpp b/clang/unittests/Tooling/RangeSelectorTest.cpp index e2d7723eab116..64ddee7894eb9 100644 --- a/clang/unittests/Tooling/RangeSelectorTest.cpp +++ b/clang/unittests/Tooling/RangeSelectorTest.cpp @@ -193,8 +193,33 @@ TEST(RangeSelectorTest, AfterOp) { HasValue(EqualsCharSourceRange(ExpectedAfter))); } +TEST(RangeSelectorTest, BetweenOp) { + StringRef Code = R"cc( + int f(int x, int y, int z) { return 3; } + int g() { return f(3, /* comment */ 7 /* comment */, 9); } + )cc"; + auto Matcher = callExpr(hasArgument(0, expr().bind("a0")), + hasArgument(1, expr().bind("a1"))); + RangeSelector R = between(node("a0"), node("a1")); + TestMatch Match = matchCode(Code, Matcher); + EXPECT_THAT_EXPECTED(select(R, Match), HasValue(", /* comment */ ")); +} + +TEST(RangeSelectorTest, BetweenOpParsed) { + StringRef Code = R"cc( + int f(int x, int y, int z) { return 3; } + int g() { return f(3, /* comment */ 7 /* comment */, 9); } + )cc"; + auto Matcher = callExpr(hasArgument(0, expr().bind("a0")), + hasArgument(1, expr().bind("a1"))); + auto R = parseRangeSelector(R"rs(between(node("a0"), node("a1")))rs"); + ASSERT_THAT_EXPECTED(R, llvm::Succeeded()); + TestMatch Match = matchCode(Code, Matcher); + EXPECT_THAT_EXPECTED(select(*R, Match), HasValue(", /* comment */ ")); +} + // Node-id specific version. -TEST(RangeSelectorTest, RangeOpNodes) { +TEST(RangeSelectorTest, EncloseOpNodes) { StringRef Code = R"cc( int f(int x, int y, int z) { return 3; } int g() { return f(/* comment */ 3, 7 /* comment */, 9); } @@ -206,7 +231,7 @@ TEST(RangeSelectorTest, RangeOpNodes) { EXPECT_THAT_EXPECTED(select(R, Match), HasValue("3, 7")); } -TEST(RangeSelectorTest, RangeOpGeneral) { +TEST(RangeSelectorTest, EncloseOpGeneral) { StringRef Code = R"cc( int f(int x, int y, int z) { return 3; } int g() { return f(/* comment */ 3, 7 /* comment */, 9); } @@ -218,7 +243,7 @@ TEST(RangeSelectorTest, RangeOpGeneral) { EXPECT_THAT_EXPECTED(select(R, Match), HasValue("3, 7")); } -TEST(RangeSelectorTest, RangeOpNodesParsed) { +TEST(RangeSelectorTest, EncloseOpNodesParsed) { StringRef Code = R"cc( int f(int x, int y, int z) { return 3; } int g() { return f(/* comment */ 3, 7 /* comment */, 9); } @@ -231,7 +256,7 @@ TEST(RangeSelectorTest, RangeOpNodesParsed) { EXPECT_THAT_EXPECTED(select(*R, Match), HasValue("3, 7")); } -TEST(RangeSelectorTest, RangeOpGeneralParsed) { +TEST(RangeSelectorTest, EncloseOpGeneralParsed) { StringRef Code = R"cc( int f(int x, int y, int z) { return 3; } int g() { return f(/* comment */ 3, 7 /* comment */, 9); } From 6c3dc6e1d57d15568ebcb3b725fa65a7ba4462de Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 28 Jul 2020 09:53:42 -0700 Subject: [PATCH 0332/1035] [X86] Merge disp8 and cdisp8 handling into a single helper function to reduce some code. 
We currently handle EVEX and non-EVEX separately in two places. By sinking the EVEX check into the existing helper for CDisp8 we can simplify these two places. Differential Revision: https://reviews.llvm.org/D84730 --- .../X86/MCTargetDesc/X86MCCodeEmitter.cpp | 63 +++++++------------ 1 file changed, 22 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 5f1b5b5e2b96e..900b4d78b6f27 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -113,33 +113,28 @@ static void emitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) { } } -/// \returns true if this signed displacement fits in a 8-bit sign-extended -/// field. -static bool isDisp8(int Value) { return Value == (int8_t)Value; } - -/// \returns true if this signed displacement fits in a 8-bit compressed -/// dispacement field. -static bool isCDisp8(uint64_t TSFlags, int Value, int &CValue) { - assert(((TSFlags & X86II::EncodingMask) == X86II::EVEX) && - "Compressed 8-bit displacement is only valid for EVEX inst."); +/// Determine if this immediate can fit in a disp8 or a compressed disp8 for +/// EVEX instructions. \p ImmOffset will be set to the value to pass to the +/// ImmOffset parameter of emitImmediate. +static bool isDispOrCDisp8(uint64_t TSFlags, int Value, int &ImmOffset) { + bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX; - unsigned CD8_Scale = + int CD8_Scale = (TSFlags & X86II::CD8_Scale_Mask) >> X86II::CD8_Scale_Shift; - if (CD8_Scale == 0) { - CValue = Value; - return isDisp8(Value); - } + if (!HasEVEX || CD8_Scale == 0) + return isInt<8>(Value); - unsigned Mask = CD8_Scale - 1; - assert((CD8_Scale & Mask) == 0 && "Invalid memory object size."); - if (Value & Mask) // Unaligned offset + assert(isPowerOf2_32(CD8_Scale) && "Unexpected CD8 scale!"); + if (Value & (CD8_Scale - 1)) // Unaligned offset return false; - Value /= (int)CD8_Scale; - bool Ret = (Value == (int8_t)Value); - if (Ret) - CValue = Value; - return Ret; + int CDisp8 = Value / CD8_Scale; + if (!isInt<8>(CDisp8)) + return false; + + // ImmOffset will be added to Value in emitImmediate leaving just CDisp8. + ImmOffset = CDisp8 - Value; + return true; } /// \returns the appropriate fixup kind to use for an immediate in an @@ -393,7 +388,6 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, const MCOperand &Scale = MI.getOperand(Op + X86::AddrScaleAmt); const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg); unsigned BaseReg = Base.getReg(); - bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX; // Handle %rip relative addressing. if (BaseReg == X86::RIP || @@ -487,7 +481,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, RMfield = (IndexReg16 & 1) | ((7 - RMfield) << 1); } - if (Disp.isImm() && isDisp8(Disp.getImm())) { + if (Disp.isImm() && isInt<8>(Disp.getImm())) { if (Disp.getImm() == 0 && RMfield != 6) { // There is no displacement; just the register. emitByte(modRMByte(0, RegOpcodeField, RMfield), OS); @@ -557,18 +551,11 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
if (Disp.isImm()) { - if (!HasEVEX && isDisp8(Disp.getImm())) { - emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), OS); - emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups); - return; - } - // Try EVEX compressed 8-bit displacement first; if failed, fall back to - // 32-bit displacement. - int CDisp8 = 0; - if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { + int ImmOffset = 0; + if (isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) { emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), OS); emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups, - CDisp8 - Disp.getImm()); + ImmOffset); return; } } @@ -589,7 +576,6 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, bool ForceDisp32 = false; bool ForceDisp8 = false; - int CDisp8 = 0; int ImmOffset = 0; if (BaseReg == 0) { // If there is no base register, we emit the special case SIB byte with @@ -606,15 +592,10 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, BaseRegNo != N86::EBP) { // Emit no displacement ModR/M byte emitByte(modRMByte(0, RegOpcodeField, 4), OS); - } else if (!HasEVEX && isDisp8(Disp.getImm())) { - // Emit the disp8 encoding. - emitByte(modRMByte(1, RegOpcodeField, 4), OS); - ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP - } else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { + } else if (isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) { // Emit the disp8 encoding. emitByte(modRMByte(1, RegOpcodeField, 4), OS); ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP - ImmOffset = CDisp8 - Disp.getImm(); } else { // Emit the normal disp32 encoding. emitByte(modRMByte(2, RegOpcodeField, 4), OS); From 91b8c1fd0f31a548b75acb3de4dbee09847e19e5 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 28 Jul 2020 10:46:04 -0700 Subject: [PATCH 0333/1035] [X86] Simplify some code in emitMemModRMByte. NFCI --- .../X86/MCTargetDesc/X86MCCodeEmitter.cpp | 25 ++++++------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 900b4d78b6f27..25f1089912639 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -605,23 +605,14 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, static const unsigned SSTable[] = {~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3}; unsigned SS = SSTable[Scale.getImm()]; - if (BaseReg == 0) { - // Handle the SIB byte for the case where there is no base, see Intel - // Manual 2A, table 2-7. The displacement has already been output. - unsigned IndexRegNo; - if (IndexReg.getReg()) - IndexRegNo = getX86RegNum(IndexReg); - else // Examples: [ESP+1*+4] or [scaled idx]+disp32 (MOD=0,BASE=5) - IndexRegNo = 4; - emitSIBByte(SS, IndexRegNo, 5, OS); - } else { - unsigned IndexRegNo; - if (IndexReg.getReg()) - IndexRegNo = getX86RegNum(IndexReg); - else - IndexRegNo = 4; // For example [ESP+1*+4] - emitSIBByte(SS, IndexRegNo, getX86RegNum(Base), OS); - } + unsigned IndexRegNo = IndexReg.getReg() ? getX86RegNum(IndexReg) : 4; + + // Handle the SIB byte for the case where there is no base, see Intel + // Manual 2A, table 2-7. The displacement has already been output. + if (BaseReg == 0) + BaseRegNo = 5; + + emitSIBByte(SS, IndexRegNo, BaseRegNo, OS); // Do we need to output a displacement? 
if (ForceDisp8) From a5b89c285329dbf160d501bf81fcc3765fde219d Mon Sep 17 00:00:00 2001 From: Jez Ng Date: Tue, 28 Jul 2020 11:04:43 -0700 Subject: [PATCH 0334/1035] [lld-macho] Fix no-filelist test on Windows --- lld/test/MachO/invalid/no-filelist.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/test/MachO/invalid/no-filelist.s b/lld/test/MachO/invalid/no-filelist.s index fb80185c66aef..980814cf6eeb1 100644 --- a/lld/test/MachO/invalid/no-filelist.s +++ b/lld/test/MachO/invalid/no-filelist.s @@ -1,7 +1,7 @@ # REQUIRES: x86 # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o # RUN: not lld -flavor darwinnew -Z -filelist nonexistent %t.o -o %t 2>&1 | FileCheck %s -# CHECK: cannot open nonexistent: No such file or directory +# CHECK: cannot open nonexistent: {{N|n}}o such file or directory .globl _main From acca2980a33e182dd6f4c71554ff2130f260463e Mon Sep 17 00:00:00 2001 From: Siva Chandra Reddy Date: Tue, 28 Jul 2020 10:57:00 -0700 Subject: [PATCH 0335/1035] [libc][obvious] Move ErrnoSetterMatcher to test/ErrnoSetterMatcher.h. --- libc/test/CMakeLists.txt | 6 ++++++ libc/{utils/UnitTest => test}/ErrnoSetterMatcher.h | 12 ++++-------- libc/test/src/signal/CMakeLists.txt | 6 ++++++ libc/test/src/signal/sigaction_test.cpp | 2 +- libc/test/src/signal/sigaddset_test.cpp | 2 +- libc/test/src/signal/sigdelset_test.cpp | 2 +- libc/test/src/signal/sigfillset_test.cpp | 2 +- libc/test/src/signal/signal_test.cpp | 2 +- libc/test/src/signal/sigprocmask_test.cpp | 2 +- libc/test/src/sys/mman/linux/CMakeLists.txt | 1 + libc/test/src/sys/mman/linux/mmap_test.cpp | 2 +- libc/test/src/unistd/CMakeLists.txt | 3 ++- libc/test/src/unistd/write_test.cpp | 2 +- libc/utils/UnitTest/CMakeLists.txt | 1 - 14 files changed, 27 insertions(+), 18 deletions(-) rename libc/{utils/UnitTest => test}/ErrnoSetterMatcher.h (84%) diff --git a/libc/test/CMakeLists.txt b/libc/test/CMakeLists.txt index 500b294ad3866..edce4bbf78cc0 100644 --- a/libc/test/CMakeLists.txt +++ b/libc/test/CMakeLists.txt @@ -1,3 +1,9 @@ +add_header_library( + errno_setter_matcher + HDRS + ErrnoSetterMatcher.h +) + add_custom_target(check-libc) add_subdirectory(config) diff --git a/libc/utils/UnitTest/ErrnoSetterMatcher.h b/libc/test/ErrnoSetterMatcher.h similarity index 84% rename from libc/utils/UnitTest/ErrnoSetterMatcher.h rename to libc/test/ErrnoSetterMatcher.h index d676ee9cbc917..7f8311bfd5e6c 100644 --- a/libc/utils/UnitTest/ErrnoSetterMatcher.h +++ b/libc/test/ErrnoSetterMatcher.h @@ -6,15 +6,11 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_UTILS_UNITTEST_ERRNOSETTERMATCHER_H -#define LLVM_LIBC_UTILS_UNITTEST_ERRNOSETTERMATCHER_H +#ifndef LLVM_LIBC_TEST_ERRNOSETTERMATCHER_H +#define LLVM_LIBC_TEST_ERRNOSETTERMATCHER_H -#include "Test.h" - -// Using LLVM libc headers in UnitTest is not ideal however we also want the -// test/ directory to have the same layout as libc/ so there is no clean place -// to put this file except for in utils/UnitTest/.
#include "src/errno/llvmlibc_errno.h" +#include "utils/UnitTest/Test.h" namespace __llvm_libc { namespace testing { @@ -73,4 +69,4 @@ static internal::ErrnoSetterMatcher Fails(int ExpectedErrno, } // namespace testing } // namespace __llvm_libc -#endif // LLVM_LIBC_UTILS_UNITTEST_ERRNOSETTERMATCHER_H +#endif // LLVM_LIBC_TEST_ERRNOSETTERMATCHER_H diff --git a/libc/test/src/signal/CMakeLists.txt b/libc/test/src/signal/CMakeLists.txt index b5c1281b81023..c5baaf7d9d550 100644 --- a/libc/test/src/signal/CMakeLists.txt +++ b/libc/test/src/signal/CMakeLists.txt @@ -22,6 +22,7 @@ add_libc_unittest( libc.include.signal libc.src.signal.raise libc.src.signal.sigaction + libc.test.errno_setter_matcher ) add_libc_unittest( @@ -37,6 +38,7 @@ add_libc_unittest( libc.src.signal.sigaddset libc.src.signal.sigemptyset libc.src.signal.sigprocmask + libc.test.errno_setter_matcher ) add_libc_unittest( @@ -49,6 +51,7 @@ add_libc_unittest( libc.include.errno libc.include.signal libc.src.signal.sigaddset + libc.test.errno_setter_matcher ) add_libc_unittest( @@ -63,6 +66,7 @@ add_libc_unittest( libc.src.errno.__errno_location libc.src.signal.raise libc.src.signal.signal + libc.test.errno_setter_matcher ) add_libc_unittest( @@ -77,6 +81,7 @@ add_libc_unittest( libc.src.signal.raise libc.src.signal.sigfillset libc.src.signal.sigprocmask + libc.test.errno_setter_matcher ) add_libc_unittest( @@ -92,4 +97,5 @@ add_libc_unittest( libc.src.signal.sigdelset libc.src.signal.sigfillset libc.src.signal.sigprocmask + libc.test.errno_setter_matcher ) diff --git a/libc/test/src/signal/sigaction_test.cpp b/libc/test/src/signal/sigaction_test.cpp index 726d76b3795de..a473b646072f6 100644 --- a/libc/test/src/signal/sigaction_test.cpp +++ b/libc/test/src/signal/sigaction_test.cpp @@ -12,7 +12,7 @@ #include "src/signal/raise.h" #include "src/signal/sigaction.h" -#include "utils/UnitTest/ErrnoSetterMatcher.h" +#include "test/ErrnoSetterMatcher.h" #include "utils/UnitTest/Test.h" using __llvm_libc::testing::ErrnoSetterMatcher::Fails; diff --git a/libc/test/src/signal/sigaddset_test.cpp b/libc/test/src/signal/sigaddset_test.cpp index f106edb57f905..922110b397b6c 100644 --- a/libc/test/src/signal/sigaddset_test.cpp +++ b/libc/test/src/signal/sigaddset_test.cpp @@ -10,7 +10,7 @@ #include "include/signal.h" #include "src/signal/sigaddset.h" -#include "utils/UnitTest/ErrnoSetterMatcher.h" +#include "test/ErrnoSetterMatcher.h" #include "utils/UnitTest/Test.h" // This tests invalid inputs and ensures errno is properly set. 
diff --git a/libc/test/src/signal/sigdelset_test.cpp b/libc/test/src/signal/sigdelset_test.cpp index 48e0e6f3f5c4a..42ba9335e219d 100644 --- a/libc/test/src/signal/sigdelset_test.cpp +++ b/libc/test/src/signal/sigdelset_test.cpp @@ -13,7 +13,7 @@ #include "src/signal/sigfillset.h" #include "src/signal/sigprocmask.h" -#include "utils/UnitTest/ErrnoSetterMatcher.h" +#include "test/ErrnoSetterMatcher.h" #include "utils/UnitTest/Test.h" TEST(Sigdelset, Invalid) { diff --git a/libc/test/src/signal/sigfillset_test.cpp b/libc/test/src/signal/sigfillset_test.cpp index c21bf137d2836..16f87e641fbe5 100644 --- a/libc/test/src/signal/sigfillset_test.cpp +++ b/libc/test/src/signal/sigfillset_test.cpp @@ -12,7 +12,7 @@ #include "src/signal/sigfillset.h" #include "src/signal/sigprocmask.h" -#include "utils/UnitTest/ErrnoSetterMatcher.h" +#include "test/ErrnoSetterMatcher.h" #include "utils/UnitTest/Test.h" TEST(Sigfillset, Invalid) { diff --git a/libc/test/src/signal/signal_test.cpp b/libc/test/src/signal/signal_test.cpp index 8db26e91d670b..03bf22a5f3eb9 100644 --- a/libc/test/src/signal/signal_test.cpp +++ b/libc/test/src/signal/signal_test.cpp @@ -12,7 +12,7 @@ #include "src/signal/raise.h" #include "src/signal/signal.h" -#include "utils/UnitTest/ErrnoSetterMatcher.h" +#include "test/ErrnoSetterMatcher.h" #include "utils/UnitTest/Test.h" using __llvm_libc::testing::ErrnoSetterMatcher::Fails; diff --git a/libc/test/src/signal/sigprocmask_test.cpp b/libc/test/src/signal/sigprocmask_test.cpp index 824415b7267cd..90b57e9e0f2f0 100644 --- a/libc/test/src/signal/sigprocmask_test.cpp +++ b/libc/test/src/signal/sigprocmask_test.cpp @@ -14,7 +14,7 @@ #include "src/signal/sigemptyset.h" #include "src/signal/sigprocmask.h" -#include "utils/UnitTest/ErrnoSetterMatcher.h" +#include "test/ErrnoSetterMatcher.h" #include "utils/UnitTest/Test.h" class SignalTest : public __llvm_libc::testing::Test { diff --git a/libc/test/src/sys/mman/linux/CMakeLists.txt b/libc/test/src/sys/mman/linux/CMakeLists.txt index 3c53e4b60107e..c7d3dde7d9751 100644 --- a/libc/test/src/sys/mman/linux/CMakeLists.txt +++ b/libc/test/src/sys/mman/linux/CMakeLists.txt @@ -12,4 +12,5 @@ add_libc_unittest( libc.src.errno.__errno_location libc.src.sys.mman.mmap libc.src.sys.mman.munmap + libc.test.errno_setter_matcher ) diff --git a/libc/test/src/sys/mman/linux/mmap_test.cpp b/libc/test/src/sys/mman/linux/mmap_test.cpp index 38478c77b18bd..8f91c2b548e01 100644 --- a/libc/test/src/sys/mman/linux/mmap_test.cpp +++ b/libc/test/src/sys/mman/linux/mmap_test.cpp @@ -11,7 +11,7 @@ #include "src/errno/llvmlibc_errno.h" #include "src/sys/mman/mmap.h" #include "src/sys/mman/munmap.h" -#include "utils/UnitTest/ErrnoSetterMatcher.h" +#include "test/ErrnoSetterMatcher.h" #include "utils/UnitTest/Test.h" using __llvm_libc::testing::ErrnoSetterMatcher::Fails; diff --git a/libc/test/src/unistd/CMakeLists.txt b/libc/test/src/unistd/CMakeLists.txt index 5eae9c8b478c0..54e513504b3bf 100644 --- a/libc/test/src/unistd/CMakeLists.txt +++ b/libc/test/src/unistd/CMakeLists.txt @@ -7,7 +7,8 @@ add_libc_unittest( SRCS write_test.cpp DEPENDS - libc.src.unistd.write libc.include.errno libc.include.unistd + libc.src.unistd.write + libc.test.errno_setter_matcher ) diff --git a/libc/test/src/unistd/write_test.cpp b/libc/test/src/unistd/write_test.cpp index 2a91ef6fc277f..a2c4cfca05c75 100644 --- a/libc/test/src/unistd/write_test.cpp +++ b/libc/test/src/unistd/write_test.cpp @@ -8,7 +8,7 @@ #include "include/errno.h" #include "src/unistd/write.h" -#include 
"utils/UnitTest/ErrnoSetterMatcher.h" +#include "test/ErrnoSetterMatcher.h" #include "utils/UnitTest/Test.h" #include "utils/testutils/FDReader.h" diff --git a/libc/utils/UnitTest/CMakeLists.txt b/libc/utils/UnitTest/CMakeLists.txt index 0837f3d8895fc..9c7b48aea4f94 100644 --- a/libc/utils/UnitTest/CMakeLists.txt +++ b/libc/utils/UnitTest/CMakeLists.txt @@ -2,7 +2,6 @@ add_llvm_library( LibcUnitTest Test.cpp Test.h - ErrnoSetterMatcher.h LINK_COMPONENTS Support ) target_include_directories(LibcUnitTest PUBLIC ${LIBC_SOURCE_DIR}) From 5608f28f552793d115a7f8682559ab053f961924 Mon Sep 17 00:00:00 2001 From: Shu Anzai Date: Tue, 28 Jul 2020 11:05:44 -0700 Subject: [PATCH 0336/1035] [lldb] Change the definition of ANSI_UNFAINT Change the definition of ANSI_UNFAINT in Editline.cpp. Differential revision: https://reviews.llvm.org/D84695 --- lldb/source/Host/common/Editline.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp index 226e638aba250..49b7a38d8dae4 100644 --- a/lldb/source/Host/common/Editline.cpp +++ b/lldb/source/Host/common/Editline.cpp @@ -48,9 +48,12 @@ int setupterm(char *term, int fildes, int *errret); // understand the relationship between DisplayInput(), MoveCursor(), // SetCurrentLine(), and SaveEditedLine() before making changes. +/// https://www.ecma-international.org/publications/files/ECMA-ST/Ecma-048.pdf #define ESCAPE "\x1b" +/// Faint, decreased intensity or second colour. #define ANSI_FAINT ESCAPE "[2m" -#define ANSI_UNFAINT ESCAPE "[22m" +/// Normal colour or normal intensity (neither bold nor faint). +#define ANSI_UNFAINT ESCAPE "[0m" #define ANSI_CLEAR_BELOW ESCAPE "[J" #define ANSI_CLEAR_RIGHT ESCAPE "[K" #define ANSI_SET_COLUMN_N ESCAPE "[%dG" From 745eb02496b515cc8292dd7f9d7f0db43e162013 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Mon, 27 Jul 2020 23:44:41 +0300 Subject: [PATCH 0337/1035] [LLD] [MinGW] Implement the --no-seh flag Previously this flag was just ignored. If set, set the IMAGE_DLL_CHARACTERISTICS_NO_SEH bit, regardless of the normal safeSEH machinery. In mingw configurations, the safeSEH bit might not be set in e.g. object files built from handwritten assembly, making it impossible to use the normal safeseh flag. As mingw setups don't generally use SEH on 32 bit x86 at all, it should be fine to set that flag bit though - hook up the existing GNU ld flag for controlling that. Differential Revision: https://reviews.llvm.org/D84701 --- lld/COFF/Config.h | 1 + lld/COFF/Driver.cpp | 7 ++++--- lld/COFF/Options.td | 1 + lld/COFF/Writer.cpp | 2 +- lld/MinGW/Driver.cpp | 2 ++ lld/MinGW/Options.td | 2 +- lld/test/COFF/noseh.s | 19 +++++++++++++++++++ lld/test/MinGW/driver.test | 4 ++++ 8 files changed, 33 insertions(+), 5 deletions(-) create mode 100644 lld/test/COFF/noseh.s diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h index 72d826b8bd173..7c439176f3a45 100644 --- a/lld/COFF/Config.h +++ b/lld/COFF/Config.h @@ -140,6 +140,7 @@ struct Configuration { bool safeSEH = false; Symbol *sehTable = nullptr; Symbol *sehCount = nullptr; + bool noSEH = false; // Used for /opt:lldlto=N unsigned ltoo = 2; diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 7372505bb6161..9ceccef867797 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -1700,9 +1700,10 @@ void LinkerDriver::link(ArrayRef argsArr) { config->wordsize = config->is64() ? 8 : 4; // Handle /safeseh, x86 only, on by default, except for mingw. 
- if (config->machine == I386 && - args.hasFlag(OPT_safeseh, OPT_safeseh_no, !config->mingw)) - config->safeSEH = true; + if (config->machine == I386) { + config->safeSEH = args.hasFlag(OPT_safeseh, OPT_safeseh_no, !config->mingw); + config->noSEH = args.hasArg(OPT_noseh); + } // Handle /functionpadmin for (auto *arg : args.filtered(OPT_functionpadmin, OPT_functionpadmin_opt)) diff --git a/lld/COFF/Options.td b/lld/COFF/Options.td index 212879e1d60bd..087d53b5d2ddd 100644 --- a/lld/COFF/Options.td +++ b/lld/COFF/Options.td @@ -204,6 +204,7 @@ def include_optional : Joined<["/", "-", "/?", "-?"], "includeoptional:">, HelpText<"Add symbol as undefined, but allow it to remain undefined">; def kill_at : F<"kill-at">; def lldmingw : F<"lldmingw">; +def noseh : F<"noseh">; def output_def : Joined<["/", "-", "/?", "-?"], "output-def:">; def pdb_source_path : P<"pdbsourcepath", "Base path used to make relative source file path absolute in PDB">; diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index 3bcc1777f7ac8..082de5b8c1d62 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -1393,7 +1393,7 @@ template void Writer::writeHeader() { pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_GUARD_CF; if (config->integrityCheck) pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_FORCE_INTEGRITY; - if (setNoSEHCharacteristic) + if (setNoSEHCharacteristic || config->noSEH) pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_NO_SEH; if (config->terminalServerAware) pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE; diff --git a/lld/MinGW/Driver.cpp b/lld/MinGW/Driver.cpp index f33b5e19502c6..d60765c70c095 100644 --- a/lld/MinGW/Driver.cpp +++ b/lld/MinGW/Driver.cpp @@ -288,6 +288,8 @@ bool mingw::link(ArrayRef argsArr, bool canExitEarly, add("-kill-at"); if (args.hasArg(OPT_appcontainer)) add("-appcontainer"); + if (args.hasArg(OPT_no_seh)) + add("-noseh"); if (args.getLastArgValue(OPT_m) != "thumb2pe" && args.getLastArgValue(OPT_m) != "arm64pe" && !args.hasArg(OPT_dynamicbase)) diff --git a/lld/MinGW/Options.td b/lld/MinGW/Options.td index 3281951dc89dd..fe44166600509 100644 --- a/lld/MinGW/Options.td +++ b/lld/MinGW/Options.td @@ -56,6 +56,7 @@ defm minor_subsystem_version: EqLong<"minor-subsystem-version", "Set the OS and subsystem minor version">; def no_insert_timestamp: F<"no-insert-timestamp">, HelpText<"Don't include PE header timestamp">; +def no_seh: F<"no-seh">, HelpText<"Set the 'no SEH' flag in the executable">; def no_whole_archive: F<"no-whole-archive">, HelpText<"No longer include all object files for following archives">; def large_address_aware: Flag<["--"], "large-address-aware">, @@ -111,7 +112,6 @@ def: Flag<["--"], "full-shutdown">; def: F<"high-entropy-va">; def: S<"major-image-version">; def: S<"minor-image-version">; -def: F<"no-seh">; def: F<"nxcompat">; def: F<"pic-executable">; def: S<"plugin">; diff --git a/lld/test/COFF/noseh.s b/lld/test/COFF/noseh.s new file mode 100644 index 0000000000000..442952286229c --- /dev/null +++ b/lld/test/COFF/noseh.s @@ -0,0 +1,19 @@ +# REQUIRES: x86 +# RUN: llvm-mc -triple i686-w64-mingw32 %s -filetype=obj -o %t.obj +# RUN: lld-link -lldmingw %t.obj -out:%t.exe -entry:main +# RUN: llvm-readobj --file-headers %t.exe | FileCheck %s --check-prefix=DEFAULT +# RUN: lld-link -lldmingw %t.obj -out:%t.noseh.exe -entry:main -noseh +# RUN: llvm-readobj --file-headers %t.noseh.exe | FileCheck %s --check-prefix=NOSEH + +# DEFAULT: Characteristics [ +# DEFAULT-NOT: IMAGE_DLL_CHARACTERISTICS_NO_SEH +# DEFAULT: ] + +# 
NOSEH: Characteristics [ # NOSEH: IMAGE_DLL_CHARACTERISTICS_NO_SEH # NOSEH: ] + + .text + .globl _main +_main: + ret diff --git a/lld/test/MinGW/driver.test b/lld/test/MinGW/driver.test index 385822c7e1f70..faac3a0be57d0 100644 --- a/lld/test/MinGW/driver.test +++ b/lld/test/MinGW/driver.test @@ -256,3 +256,7 @@ RUN: ld.lld -### -m i386pep foo.o -section-alignment 0x2000 | FileCheck -check-p RUN: ld.lld -### -m i386pep foo.o --section-alignment=0x2000 | FileCheck -check-prefix ALIGN %s RUN: ld.lld -### -m i386pep foo.o -section-alignment=0x2000 | FileCheck -check-prefix ALIGN %s ALIGN: -align:0x2000 + +RUN: ld.lld -### -m i386pe foo.o -no-seh | FileCheck -check-prefix NOSEH %s +RUN: ld.lld -### -m i386pe foo.o --no-seh | FileCheck -check-prefix NOSEH %s +NOSEH: -noseh From 4c9af6d0e001bf76007527899df7a9d8860c9a5a Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Wed, 29 Jul 2020 03:11:57 +0900 Subject: [PATCH 0338/1035] [JumpThreading] Add basic support for freeze instruction This patch adds basic support for the freeze instruction to JumpThreading by making ComputeValueKnownInPredecessorsImpl look into its operand. Reviewed By: efriedma, nikic Differential Revision: https://reviews.llvm.org/D84598 --- llvm/lib/Transforms/Scalar/JumpThreading.cpp | 21 +++- llvm/test/Transforms/JumpThreading/freeze.ll | 103 ++++++++----------- 2 files changed, 60 insertions(+), 64 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 2f379b7f61608..7399c7ab609e5 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -675,10 +675,11 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl( } // Handle Cast instructions. Only see through Cast when the source operand is - // PHI or Cmp to save the compilation time. + // PHI, Cmp, or Freeze to save the compilation time. if (CastInst *CI = dyn_cast<CastInst>(I)) { Value *Source = CI->getOperand(0); - if (!isa<PHINode>(Source) && !isa<CmpInst>(Source)) + if (!isa<PHINode>(Source) && !isa<CmpInst>(Source) && + !isa<FreezeInst>(Source)) return false; ComputeValueKnownInPredecessorsImpl(Source, BB, Result, Preference, RecursionSet, CxtI); @@ -692,6 +693,22 @@ return true; } + // Handle Freeze instructions, in a manner similar to Cast. + if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) { + Value *Source = FI->getOperand(0); + if (!isa<PHINode>(Source) && !isa<CmpInst>(Source) && + !isa<FreezeInst>(Source)) + return false; + ComputeValueKnownInPredecessorsImpl(Source, BB, Result, Preference, + RecursionSet, CxtI); + + erase_if(Result, [](auto &Pair) { + return !isGuaranteedNotToBeUndefOrPoison(Pair.first); + }); + + return !Result.empty(); + } + // Handle some boolean conditions.
if (I->getType()->getPrimitiveSizeInBits() == 1) { assert(Preference == WantInteger && "One-bit non-integer type?"); diff --git a/llvm/test/Transforms/JumpThreading/freeze.ll b/llvm/test/Transforms/JumpThreading/freeze.ll index e08aabf5be170..8f8f3f5566568 100644 --- a/llvm/test/Transforms/JumpThreading/freeze.ll +++ b/llvm/test/Transforms/JumpThreading/freeze.ll @@ -7,23 +7,14 @@ declare void @f3() define i32 @test1(i1 %cond) { ; CHECK-LABEL: @test1( -; CHECK-NEXT: br i1 [[COND:%.*]], label [[T1:%.*]], label [[F1:%.*]] -; CHECK: T1: -; CHECK-NEXT: [[V1:%.*]] = call i32 @f1() -; CHECK-NEXT: br label [[MERGE:%.*]] -; CHECK: F1: -; CHECK-NEXT: [[V2:%.*]] = call i32 @f2() -; CHECK-NEXT: br label [[MERGE]] -; CHECK: Merge: -; CHECK-NEXT: [[A:%.*]] = phi i1 [ true, [[T1]] ], [ false, [[F1]] ] -; CHECK-NEXT: [[B:%.*]] = phi i32 [ [[V1]], [[T1]] ], [ [[V2]], [[F1]] ] -; CHECK-NEXT: [[A_FR:%.*]] = freeze i1 [[A]] -; CHECK-NEXT: br i1 [[A_FR]], label [[T2:%.*]], label [[F2:%.*]] +; CHECK-NEXT: br i1 [[COND:%.*]], label [[T2:%.*]], label [[F2:%.*]] ; CHECK: T2: +; CHECK-NEXT: [[V1:%.*]] = call i32 @f1() ; CHECK-NEXT: call void @f3() -; CHECK-NEXT: ret i32 [[B]] +; CHECK-NEXT: ret i32 [[V1]] ; CHECK: F2: -; CHECK-NEXT: ret i32 [[B]] +; CHECK-NEXT: [[V2:%.*]] = call i32 @f2() +; CHECK-NEXT: ret i32 [[V2]] ; br i1 %cond, label %T1, label %F1 @@ -51,24 +42,21 @@ F2: define i32 @test1_cast(i1 %cond) { ; CHECK-LABEL: @test1_cast( -; CHECK-NEXT: br i1 [[COND:%.*]], label [[T1:%.*]], label [[F1:%.*]] -; CHECK: T1: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[MERGE_THREAD:%.*]], label [[MERGE:%.*]] +; CHECK: Merge.thread: ; CHECK-NEXT: [[V1:%.*]] = call i32 @f1() -; CHECK-NEXT: br label [[MERGE:%.*]] -; CHECK: F1: -; CHECK-NEXT: [[V2:%.*]] = call i32 @f2() -; CHECK-NEXT: br label [[MERGE]] +; CHECK-NEXT: br label [[T2:%.*]] ; CHECK: Merge: -; CHECK-NEXT: [[A0:%.*]] = phi i32 [ 1, [[T1]] ], [ 0, [[F1]] ] -; CHECK-NEXT: [[B:%.*]] = phi i32 [ [[V1]], [[T1]] ], [ [[V2]], [[F1]] ] -; CHECK-NEXT: [[A:%.*]] = trunc i32 [[A0]] to i1 +; CHECK-NEXT: [[V2:%.*]] = call i32 @f2() +; CHECK-NEXT: [[A:%.*]] = trunc i32 0 to i1 ; CHECK-NEXT: [[A_FR:%.*]] = freeze i1 [[A]] -; CHECK-NEXT: br i1 [[A_FR]], label [[T2:%.*]], label [[F2:%.*]] +; CHECK-NEXT: br i1 [[A_FR]], label [[T2]], label [[F2:%.*]] ; CHECK: T2: +; CHECK-NEXT: [[B5:%.*]] = phi i32 [ [[V1]], [[MERGE_THREAD]] ], [ [[V2]], [[MERGE]] ] ; CHECK-NEXT: call void @f3() -; CHECK-NEXT: ret i32 [[B]] +; CHECK-NEXT: ret i32 [[B5]] ; CHECK: F2: -; CHECK-NEXT: ret i32 [[B]] +; CHECK-NEXT: ret i32 [[V2]] ; br i1 %cond, label %T1, label %F1 @@ -97,24 +85,21 @@ F2: define i32 @test1_cast2(i1 %cond) { ; CHECK-LABEL: @test1_cast2( -; CHECK-NEXT: br i1 [[COND:%.*]], label [[T1:%.*]], label [[F1:%.*]] -; CHECK: T1: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[MERGE_THREAD:%.*]], label [[MERGE:%.*]] +; CHECK: Merge.thread: ; CHECK-NEXT: [[V1:%.*]] = call i32 @f1() -; CHECK-NEXT: br label [[MERGE:%.*]] -; CHECK: F1: -; CHECK-NEXT: [[V2:%.*]] = call i32 @f2() -; CHECK-NEXT: br label [[MERGE]] +; CHECK-NEXT: br label [[T2:%.*]] ; CHECK: Merge: -; CHECK-NEXT: [[A0:%.*]] = phi i32 [ 1, [[T1]] ], [ 0, [[F1]] ] -; CHECK-NEXT: [[B:%.*]] = phi i32 [ [[V1]], [[T1]] ], [ [[V2]], [[F1]] ] -; CHECK-NEXT: [[A0_FR:%.*]] = freeze i32 [[A0]] +; CHECK-NEXT: [[V2:%.*]] = call i32 @f2() +; CHECK-NEXT: [[A0_FR:%.*]] = freeze i32 0 ; CHECK-NEXT: [[A_FR:%.*]] = trunc i32 [[A0_FR]] to i1 -; CHECK-NEXT: br i1 [[A_FR]], label [[T2:%.*]], label [[F2:%.*]] +; CHECK-NEXT: br i1 [[A_FR]], label [[T2]], label 
[[F2:%.*]] ; CHECK: T2: +; CHECK-NEXT: [[B5:%.*]] = phi i32 [ [[V1]], [[MERGE_THREAD]] ], [ [[V2]], [[MERGE]] ] ; CHECK-NEXT: call void @f3() -; CHECK-NEXT: ret i32 [[B]] +; CHECK-NEXT: ret i32 [[B5]] ; CHECK: F2: -; CHECK-NEXT: ret i32 [[B]] +; CHECK-NEXT: ret i32 [[V2]] ; br i1 %cond, label %T1, label %F1 @@ -143,23 +128,20 @@ F2: define i32 @test1_undef(i1 %cond) { ; CHECK-LABEL: @test1_undef( -; CHECK-NEXT: br i1 [[COND:%.*]], label [[T1:%.*]], label [[F1:%.*]] -; CHECK: T1: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[MERGE_THREAD:%.*]], label [[MERGE:%.*]] +; CHECK: Merge.thread: ; CHECK-NEXT: [[V1:%.*]] = call i32 @f1() -; CHECK-NEXT: br label [[MERGE:%.*]] -; CHECK: F1: -; CHECK-NEXT: [[V2:%.*]] = call i32 @f2() -; CHECK-NEXT: br label [[MERGE]] +; CHECK-NEXT: br label [[T2:%.*]] ; CHECK: Merge: -; CHECK-NEXT: [[A:%.*]] = phi i1 [ true, [[T1]] ], [ undef, [[F1]] ] -; CHECK-NEXT: [[B:%.*]] = phi i32 [ [[V1]], [[T1]] ], [ [[V2]], [[F1]] ] -; CHECK-NEXT: [[A_FR:%.*]] = freeze i1 [[A]] -; CHECK-NEXT: br i1 [[A_FR]], label [[T2:%.*]], label [[F2:%.*]] +; CHECK-NEXT: [[V2:%.*]] = call i32 @f2() +; CHECK-NEXT: [[A_FR:%.*]] = freeze i1 undef +; CHECK-NEXT: br i1 [[A_FR]], label [[T2]], label [[F2:%.*]] ; CHECK: T2: +; CHECK-NEXT: [[B4:%.*]] = phi i32 [ [[V1]], [[MERGE_THREAD]] ], [ [[V2]], [[MERGE]] ] ; CHECK-NEXT: call void @f3() -; CHECK-NEXT: ret i32 [[B]] +; CHECK-NEXT: ret i32 [[B4]] ; CHECK: F2: -; CHECK-NEXT: ret i32 [[B]] +; CHECK-NEXT: ret i32 [[V2]] ; br i1 %cond, label %T1, label %F1 @@ -187,23 +169,20 @@ F2: define i32 @test2(i1 %cond, i1 %cond2) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: br i1 [[COND:%.*]], label [[T1:%.*]], label [[F1:%.*]] -; CHECK: T1: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[MERGE_THREAD:%.*]], label [[MERGE:%.*]] +; CHECK: Merge.thread: ; CHECK-NEXT: [[V1:%.*]] = call i32 @f1() -; CHECK-NEXT: br label [[MERGE:%.*]] -; CHECK: F1: -; CHECK-NEXT: [[V2:%.*]] = call i32 @f2() -; CHECK-NEXT: br label [[MERGE]] +; CHECK-NEXT: br label [[T2:%.*]] ; CHECK: Merge: -; CHECK-NEXT: [[A:%.*]] = phi i1 [ true, [[T1]] ], [ [[COND2:%.*]], [[F1]] ] -; CHECK-NEXT: [[B:%.*]] = phi i32 [ [[V1]], [[T1]] ], [ [[V2]], [[F1]] ] -; CHECK-NEXT: [[A_FR:%.*]] = freeze i1 [[A]] -; CHECK-NEXT: br i1 [[A_FR]], label [[T2:%.*]], label [[F2:%.*]] +; CHECK-NEXT: [[V2:%.*]] = call i32 @f2() +; CHECK-NEXT: [[A_FR:%.*]] = freeze i1 [[COND2:%.*]] +; CHECK-NEXT: br i1 [[A_FR]], label [[T2]], label [[F2:%.*]] ; CHECK: T2: +; CHECK-NEXT: [[B4:%.*]] = phi i32 [ [[V1]], [[MERGE_THREAD]] ], [ [[V2]], [[MERGE]] ] ; CHECK-NEXT: call void @f3() -; CHECK-NEXT: ret i32 [[B]] +; CHECK-NEXT: ret i32 [[B4]] ; CHECK: F2: -; CHECK-NEXT: ret i32 [[B]] +; CHECK-NEXT: ret i32 [[V2]] ; br i1 %cond, label %T1, label %F1 From 394db2259575ef3cac8d3d37836b11eb2373c435 Mon Sep 17 00:00:00 2001 From: Amy Huang Date: Tue, 28 Jul 2020 11:23:59 -0700 Subject: [PATCH 0339/1035] Revert "Switch to using -debug-info-kind=constructor as default (from =limited)" This reverts commit 227db86a1b7dd6f96f7df14890fcd071bc4fe1f5. Causing debug info errors in google3 LTO builds; also causes a debuginfo-test failure. 
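For anyone tracking down related debug-info differences, one way to observe the default the driver passes after this revert (an illustrative invocation mirroring the updated driver tests below; any C input file works):

    $ clang -g -### foo.c 2>&1 | grep -o '"-debug-info-kind=[a-z]*"'
    "-debug-info-kind=limited"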
--- clang/lib/Driver/ToolChains/Clang.cpp | 16 +++++++--------- clang/test/Driver/cl-options.c | 6 +++--- clang/test/Driver/clang-g-opts.c | 2 +- clang/test/Driver/cuda-dwarf-2.cu | 2 +- clang/test/Driver/debug-options-as.c | 2 +- clang/test/Driver/debug-options.c | 8 ++++---- clang/test/Driver/integrated-as.s | 10 +++++----- clang/test/Driver/myriad-toolchain.c | 2 +- clang/test/Driver/openmp-offload-gpu.c | 2 +- clang/test/Driver/split-debug.c | 10 +++++----- .../SymbolFile/PDB/Inputs/ClassLayoutTest.cpp | 1 - 11 files changed, 29 insertions(+), 32 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index b0de225f8abf5..68e4eb0eedda1 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -498,7 +498,7 @@ static codegenoptions::DebugInfoKind DebugLevelToInfoKind(const Arg &A) { return codegenoptions::DebugLineTablesOnly; if (A.getOption().matches(options::OPT_gline_directives_only)) return codegenoptions::DebugDirectivesOnly; - return codegenoptions::DebugInfoConstructor; + return codegenoptions::LimitedDebugInfo; } static bool mustUseNonLeafFramePointerForTarget(const llvm::Triple &Triple) { @@ -2383,7 +2383,7 @@ static void CollectArgsForIntegratedAssembler(Compilation &C, CmdArgs.push_back(Value.data()); } else { RenderDebugEnablingArgs(Args, CmdArgs, - codegenoptions::DebugInfoConstructor, + codegenoptions::LimitedDebugInfo, DwarfVersion, llvm::DebuggerKind::Default); } } else if (Value.startswith("-mcpu") || Value.startswith("-mfpu") || @@ -3656,7 +3656,7 @@ static void RenderDebugOptions(const ToolChain &TC, const Driver &D, if (const Arg *A = Args.getLastArg(options::OPT_g_Group, options::OPT_gsplit_dwarf, options::OPT_gsplit_dwarf_EQ)) { - DebugInfoKind = codegenoptions::DebugInfoConstructor; + DebugInfoKind = codegenoptions::LimitedDebugInfo; // If the last option explicitly specified a debug-info level, use it. 
if (checkDebugInfoOption(A, Args, D, TC) && @@ -3761,7 +3761,7 @@ static void RenderDebugOptions(const ToolChain &TC, const Driver &D, if (checkDebugInfoOption(A, Args, D, TC)) { if (DebugInfoKind != codegenoptions::DebugLineTablesOnly && DebugInfoKind != codegenoptions::DebugDirectivesOnly) { - DebugInfoKind = codegenoptions::DebugInfoConstructor; + DebugInfoKind = codegenoptions::LimitedDebugInfo; CmdArgs.push_back("-dwarf-ext-refs"); CmdArgs.push_back("-fmodule-format=obj"); } @@ -3781,9 +3781,7 @@ static void RenderDebugOptions(const ToolChain &TC, const Driver &D, TC.GetDefaultStandaloneDebug()); if (const Arg *A = Args.getLastArg(options::OPT_fstandalone_debug)) (void)checkDebugInfoOption(A, Args, D, TC); - if ((DebugInfoKind == codegenoptions::LimitedDebugInfo || - DebugInfoKind == codegenoptions::DebugInfoConstructor) && - NeedFullDebug) + if (DebugInfoKind == codegenoptions::LimitedDebugInfo && NeedFullDebug) DebugInfoKind = codegenoptions::FullDebugInfo; if (Args.hasFlag(options::OPT_gembed_source, options::OPT_gno_embed_source, @@ -6569,7 +6567,7 @@ void Clang::AddClangCLArgs(const ArgList &Args, types::ID InputType, options::OPT_gline_tables_only)) { *EmitCodeView = true; if (DebugInfoArg->getOption().matches(options::OPT__SLASH_Z7)) - *DebugInfoKind = codegenoptions::DebugInfoConstructor; + *DebugInfoKind = codegenoptions::LimitedDebugInfo; else *DebugInfoKind = codegenoptions::DebugLineTablesOnly; } else { @@ -6866,7 +6864,7 @@ void ClangAs::ConstructJob(Compilation &C, const JobAction &JA, // the guard for source type, however there is a test which asserts // that some assembler invocation receives no -debug-info-kind, // and it's not clear whether that test is just overly restrictive. - DebugInfoKind = (WantDebug ? codegenoptions::DebugInfoConstructor + DebugInfoKind = (WantDebug ? codegenoptions::LimitedDebugInfo : codegenoptions::NoDebugInfo); // Add the -fdebug-compilation-dir flag if needed. addDebugCompDirArg(Args, CmdArgs, C.getDriver().getVFS()); diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c index 0dcaf61088069..d0c48ae41d9a2 100644 --- a/clang/test/Driver/cl-options.c +++ b/clang/test/Driver/cl-options.c @@ -524,11 +524,11 @@ // RUN: %clang_cl /Zi /c -### -- %s 2>&1 | FileCheck -check-prefix=Zi %s // Zi: "-gcodeview" -// Zi: "-debug-info-kind=constructor" +// Zi: "-debug-info-kind=limited" // RUN: %clang_cl /Z7 /c -### -- %s 2>&1 | FileCheck -check-prefix=Z7 %s // Z7: "-gcodeview" -// Z7: "-debug-info-kind=constructor" +// Z7: "-debug-info-kind=limited" // RUN: %clang_cl /Zd /c -### -- %s 2>&1 | FileCheck -check-prefix=Z7GMLT %s // Z7GMLT: "-gcodeview" @@ -557,7 +557,7 @@ // which made it "win". This test could not detect that bug. 
// RUN: %clang_cl /Z7 -gdwarf /c -### -- %s 2>&1 | FileCheck -check-prefix=Z7_gdwarf %s // Z7_gdwarf: "-gcodeview" -// Z7_gdwarf: "-debug-info-kind=constructor" +// Z7_gdwarf: "-debug-info-kind=limited" // Z7_gdwarf: "-dwarf-version=4" // RUN: %clang_cl -fmsc-version=1800 -TP -### -- %s 2>&1 | FileCheck -check-prefix=CXX11 %s diff --git a/clang/test/Driver/clang-g-opts.c b/clang/test/Driver/clang-g-opts.c index 60c97790b7dae..bc714b6c93791 100644 --- a/clang/test/Driver/clang-g-opts.c +++ b/clang/test/Driver/clang-g-opts.c @@ -31,7 +31,7 @@ // RUN: | FileCheck --check-prefix=CHECK-WITH-G-DWARF2 %s // CHECK-WITHOUT-G-NOT: -debug-info-kind -// CHECK-WITH-G: "-debug-info-kind=constructor" +// CHECK-WITH-G: "-debug-info-kind=limited" // CHECK-WITH-G: "-dwarf-version=4" // CHECK-WITH-G-DWARF2: "-dwarf-version=2" diff --git a/clang/test/Driver/cuda-dwarf-2.cu b/clang/test/Driver/cuda-dwarf-2.cu index 92b8919729fc4..bcfb2444bc516 100644 --- a/clang/test/Driver/cuda-dwarf-2.cu +++ b/clang/test/Driver/cuda-dwarf-2.cu @@ -49,7 +49,7 @@ // HAS_DEBUG-NOT: warning: debug // HAS_DEBUG: "-fcuda-is-device" -// HAS_DEBUG-SAME: "-debug-info-kind={{constructor|line-tables-only}}" +// HAS_DEBUG-SAME: "-debug-info-kind={{limited|line-tables-only}}" // HAS_DEBUG-SAME: "-dwarf-version=2" // HAS_DEBUG: ptxas // HAS_DEBUG-SAME: "-g" diff --git a/clang/test/Driver/debug-options-as.c b/clang/test/Driver/debug-options-as.c index 4808219702e76..51475680e9b18 100644 --- a/clang/test/Driver/debug-options-as.c +++ b/clang/test/Driver/debug-options-as.c @@ -23,7 +23,7 @@ // RUN: | FileCheck %s // // CHECK: "-cc1as" -// CHECK: "-debug-info-kind=constructor" +// CHECK: "-debug-info-kind=limited" // Check to make sure clang with -g on a .s file gets passed -dwarf-debug-producer. // rdar://12955296 diff --git a/clang/test/Driver/debug-options.c b/clang/test/Driver/debug-options.c index 2d1a0b2d5cd8f..189c1f9addeb9 100644 --- a/clang/test/Driver/debug-options.c +++ b/clang/test/Driver/debug-options.c @@ -274,18 +274,18 @@ // GLIO_ONLY_DWARF2: "-dwarf-version=2" // // G_ONLY: "-cc1" -// G_ONLY: "-debug-info-kind=constructor" +// G_ONLY: "-debug-info-kind=limited" // // These tests assert that "-gline-tables-only" "-g" uses the latter, // but otherwise not caring about the DebugInfoKind. 
// G_ONLY_DWARF2: "-cc1" -// G_ONLY_DWARF2: "-debug-info-kind={{standalone|constructor}}" +// G_ONLY_DWARF2: "-debug-info-kind={{standalone|limited}}" // G_ONLY_DWARF2: "-dwarf-version=2" // // G_STANDALONE: "-cc1" // G_STANDALONE: "-debug-info-kind=standalone" // G_LIMITED: "-cc1" -// G_LIMITED: "-debug-info-kind=constructor" +// G_LIMITED: "-debug-info-kind=limited" // G_DWARF2: "-dwarf-version=2" // G_DWARF4: "-dwarf-version=4" // @@ -339,7 +339,7 @@ // NOCI: "-gno-column-info" // // GEXTREFS: "-dwarf-ext-refs" "-fmodule-format=obj" -// GEXTREFS: "-debug-info-kind={{standalone|constructor}}" +// GEXTREFS: "-debug-info-kind={{standalone|limited}}" // RUN: not %clang -cc1 -debug-info-kind=watkind 2>&1 | FileCheck -check-prefix=BADSTRING1 %s // BADSTRING1: error: invalid value 'watkind' in '-debug-info-kind=watkind' diff --git a/clang/test/Driver/integrated-as.s b/clang/test/Driver/integrated-as.s index 05999cfe002b5..0194a3d5a4382 100644 --- a/clang/test/Driver/integrated-as.s +++ b/clang/test/Driver/integrated-as.s @@ -27,19 +27,19 @@ // XA_INCLUDE2: "-Ifoo_dir" // RUN: %clang -### -target x86_64--- -c -integrated-as %s -gdwarf-4 -gdwarf-2 2>&1 | FileCheck --check-prefix=DWARF2 %s -// DWARF2: "-debug-info-kind=constructor" "-dwarf-version=2" +// DWARF2: "-debug-info-kind=limited" "-dwarf-version=2" // RUN: %clang -### -target x86_64--- -c -integrated-as %s -gdwarf-3 2>&1 | FileCheck --check-prefix=DWARF3 %s -// DWARF3: "-debug-info-kind=constructor" "-dwarf-version=3" +// DWARF3: "-debug-info-kind=limited" "-dwarf-version=3" // RUN: %clang -### -target x86_64--- -c -integrated-as %s -gdwarf-4 2>&1 | FileCheck --check-prefix=DWARF4 %s -// DWARF4: "-debug-info-kind=constructor" "-dwarf-version=4" +// DWARF4: "-debug-info-kind=limited" "-dwarf-version=4" // RUN: %clang -### -target x86_64--- -c -integrated-as %s -Xassembler -gdwarf-2 2>&1 | FileCheck --check-prefix=DWARF2XASSEMBLER %s -// DWARF2XASSEMBLER: "-debug-info-kind=constructor" "-dwarf-version=2" +// DWARF2XASSEMBLER: "-debug-info-kind=limited" "-dwarf-version=2" // RUN: %clang -### -target x86_64--- -c -integrated-as %s -Wa,-gdwarf-2 2>&1 | FileCheck --check-prefix=DWARF2WA %s -// DWARF2WA: "-debug-info-kind=constructor" "-dwarf-version=2" +// DWARF2WA: "-debug-info-kind=limited" "-dwarf-version=2" // A dwarf version number that driver can't parse is just stuffed in. 
// RUN: %clang -### -target x86_64--- -c -integrated-as %s -Wa,-gdwarf-huh 2>&1 | FileCheck --check-prefix=BOGODWARF %s diff --git a/clang/test/Driver/myriad-toolchain.c b/clang/test/Driver/myriad-toolchain.c index a4bd260a14986..215a02fd0dec1 100644 --- a/clang/test/Driver/myriad-toolchain.c +++ b/clang/test/Driver/myriad-toolchain.c @@ -83,7 +83,7 @@ // NOSTDLIB-NOT: "-lc" // RUN: %clang -### -c -g %s -target sparc-myriad 2>&1 | FileCheck -check-prefix=G_SPARC %s -// G_SPARC: "-debug-info-kind=constructor" "-dwarf-version=2" +// G_SPARC: "-debug-info-kind=limited" "-dwarf-version=2" // RUN: %clang -### -c %s -target sparc-myriad-rtems -fuse-init-array 2>&1 \ // RUN: | FileCheck -check-prefix=USE-INIT-ARRAY %s diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c index 3ddd6446d1176..6415f1d61b720 100644 --- a/clang/test/Driver/openmp-offload-gpu.c +++ b/clang/test/Driver/openmp-offload-gpu.c @@ -241,7 +241,7 @@ // HAS_DEBUG-NOT: warning: debug // HAS_DEBUG: "-triple" "nvptx64-nvidia-cuda" -// HAS_DEBUG-SAME: "-debug-info-kind={{constructor|line-tables-only}}" +// HAS_DEBUG-SAME: "-debug-info-kind={{limited|line-tables-only}}" // HAS_DEBUG-SAME: "-dwarf-version=2" // HAS_DEBUG-SAME: "-fopenmp-is-device" // HAS_DEBUG: ptxas diff --git a/clang/test/Driver/split-debug.c b/clang/test/Driver/split-debug.c index 70f8d91d48e01..d40207d5ae3b6 100644 --- a/clang/test/Driver/split-debug.c +++ b/clang/test/Driver/split-debug.c @@ -68,18 +68,18 @@ // RUN: FileCheck -check-prefix=CHECK-NOINLINE-WITHOUT-SPLIT < %t %s // // CHECK-NOINLINE-WITHOUT-SPLIT: "-fno-split-dwarf-inlining" -// CHECK-NOINLINE-WITHOUT-SPLIT: "-debug-info-kind=constructor" +// CHECK-NOINLINE-WITHOUT-SPLIT: "-debug-info-kind=limited" // RUN: %clang -target x86_64-unknown-linux-gnu -gmlt -gsplit-dwarf -fno-split-dwarf-inlining -S -### %s 2> %t // RUN: FileCheck -check-prefix=CHECK-SPLIT-WITH-GMLT < %t %s // -// CHECK-SPLIT-WITH-GMLT: "-debug-info-kind=constructor" +// CHECK-SPLIT-WITH-GMLT: "-debug-info-kind=limited" // CHECK-SPLIT-WITH-GMLT: "-split-dwarf-output" // RUN: %clang -target x86_64-unknown-linux-gnu -gsplit-dwarf -fno-split-dwarf-inlining -S -### %s 2> %t // RUN: FileCheck -check-prefix=CHECK-SPLIT-WITH-NOINL < %t %s // -// CHECK-SPLIT-WITH-NOINL: "-debug-info-kind=constructor" +// CHECK-SPLIT-WITH-NOINL: "-debug-info-kind=limited" // CHECK-SPLIT-WITH-NOINL: "-split-dwarf-output" // RUN: %clang -target x86_64-unknown-linux-gnu -gsplit-dwarf -gmlt -fsplit-dwarf-inlining -S -### %s 2> %t @@ -92,7 +92,7 @@ // RUN: %clang -target x86_64-unknown-linux-gnu -gmlt -gsplit-dwarf -S -### %s 2> %t // RUN: FileCheck -check-prefix=CHECK-SPLIT-OVER-GMLT < %t %s // -// CHECK-SPLIT-OVER-GMLT: "-debug-info-kind=constructor" +// CHECK-SPLIT-OVER-GMLT: "-debug-info-kind=limited" // CHECK-SPLIT-OVER-GMLT: "-split-dwarf-file" // CHECK-SPLIT-OVER-GMLT: "-split-dwarf-output" @@ -117,6 +117,6 @@ // RUN: %clang -target x86_64-unknown-linux-gnu -g0 -gsplit-dwarf=split -S -### %s 2> %t // RUN: FileCheck -check-prefix=CHECK-SPLIT-OVER-G0 < %t %s // -// CHECK-SPLIT-OVER-G0: "-debug-info-kind=constructor" +// CHECK-SPLIT-OVER-G0: "-debug-info-kind=limited" // CHECK-SPLIT-OVER-G0: "-split-dwarf-file" // CHECK-SPLIT-OVER-G0: "-split-dwarf-output" diff --git a/lldb/test/Shell/SymbolFile/PDB/Inputs/ClassLayoutTest.cpp b/lldb/test/Shell/SymbolFile/PDB/Inputs/ClassLayoutTest.cpp index 503939680c500..3c4b005cdf1be 100644 --- a/lldb/test/Shell/SymbolFile/PDB/Inputs/ClassLayoutTest.cpp +++ 
b/lldb/test/Shell/SymbolFile/PDB/Inputs/ClassLayoutTest.cpp @@ -106,7 +106,6 @@ class Class : public Base { // Test base class. int main() { MemberTest::Base B1; B1.Get(); - MemberTest::Class C1; MemberTest::Class::StaticMemberFunc(1, 10, 2); return 0; } From 834133c950fce120d0378d09718d32a320cbcd72 Mon Sep 17 00:00:00 2001 From: Anand Kodnani Date: Tue, 28 Jul 2020 10:37:16 -0700 Subject: [PATCH 0340/1035] [MLIR] Vector store to load forwarding The MemRefDataFlow pass does store to load forwarding only for affine store/loads. This patch updates the pass to use affine read/write interface which enables vector forwarding. Reviewed By: dcaballe, bondhugula, ftynse Differential Revision: https://reviews.llvm.org/D84302 --- .../Affine/IR/AffineMemoryOpInterfaces.td | 21 +++++++++++++++++++ .../mlir/Dialect/Affine/IR/AffineOps.td | 4 ++-- mlir/lib/Transforms/MemRefDataFlowOpt.cpp | 15 ++++++------- mlir/test/Transforms/memref-dataflow-opt.mlir | 20 ++++++++++++++++++ 4 files changed, 51 insertions(+), 9 deletions(-) diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.td b/mlir/include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.td index a093cb9fd4be4..1f25073f07e36 100644 --- a/mlir/include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.td +++ b/mlir/include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.td @@ -81,6 +81,16 @@ def AffineReadOpInterface : OpInterface<"AffineReadOpInterface"> { op.getAffineMapAttr()}; }] >, + InterfaceMethod< + /*desc=*/"Returns the value read by this operation.", + /*retTy=*/"Value", + /*methodName=*/"getValue", + /*args=*/(ins), + /*methodBody=*/[{}], + /*defaultImplementation=*/[{ + return cast(this->getOperation()); + }] + >, ]; } @@ -150,6 +160,17 @@ def AffineWriteOpInterface : OpInterface<"AffineWriteOpInterface"> { op.getAffineMapAttr()}; }] >, + InterfaceMethod< + /*desc=*/"Returns the value to store.", + /*retTy=*/"Value", + /*methodName=*/"getValueToStore", + /*args=*/(ins), + /*methodBody=*/[{}], + /*defaultImplementation=*/[{ + ConcreteOp op = cast(this->getOperation()); + return op.getOperand(op.getStoredValOperandIndex()); + }] + >, ]; } diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td index ab7d96f7cafa3..95e17aa1eec87 100644 --- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td +++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td @@ -725,8 +725,8 @@ class AffineStoreOpBase traits = []> : Affine_Op])> { code extraClassDeclarationBase = [{ - /// Get value to be stored by store operation. - Value getValueToStore() { return getOperand(0); } + /// Returns the operand index of the value to be stored. + unsigned getStoredValOperandIndex() { return 0; } /// Returns the operand index of the memref. unsigned getMemRefOperandIndex() { return 1; } diff --git a/mlir/lib/Transforms/MemRefDataFlowOpt.cpp b/mlir/lib/Transforms/MemRefDataFlowOpt.cpp index 7220fd1f3dc6a..7924b46a83d58 100644 --- a/mlir/lib/Transforms/MemRefDataFlowOpt.cpp +++ b/mlir/lib/Transforms/MemRefDataFlowOpt.cpp @@ -63,7 +63,7 @@ namespace { struct MemRefDataFlowOpt : public MemRefDataFlowOptBase { void runOnFunction() override; - void forwardStoreToLoad(AffineLoadOp loadOp); + void forwardStoreToLoad(AffineReadOpInterface loadOp); // A list of memref's that are potentially dead / could be eliminated. SmallPtrSet memrefsToErase; @@ -84,14 +84,14 @@ std::unique_ptr> mlir::createMemRefDataFlowOptPass() { // This is a straightforward implementation not optimized for speed. Optimize // if needed. 
-void MemRefDataFlowOpt::forwardStoreToLoad(AffineLoadOp loadOp) { +void MemRefDataFlowOpt::forwardStoreToLoad(AffineReadOpInterface loadOp) { // First pass over the use list to get the minimum number of surrounding // loops common between the load op and the store op, with min taken across // all store ops. SmallVector storeOps; unsigned minSurroundingLoops = getNestingDepth(loadOp); for (auto *user : loadOp.getMemRef().getUsers()) { - auto storeOp = dyn_cast(user); + auto storeOp = dyn_cast(user); if (!storeOp) continue; unsigned nsLoops = getNumCommonSurroundingLoops(*loadOp, *storeOp); @@ -167,8 +167,9 @@ void MemRefDataFlowOpt::forwardStoreToLoad(AffineLoadOp loadOp) { return; // Perform the actual store to load forwarding. - Value storeVal = cast(lastWriteStoreOp).getValueToStore(); - loadOp.replaceAllUsesWith(storeVal); + Value storeVal = + cast(lastWriteStoreOp).getValueToStore(); + loadOp.getValue().replaceAllUsesWith(storeVal); // Record the memref for a later sweep to optimize away. memrefsToErase.insert(loadOp.getMemRef()); // Record this to erase later. @@ -190,7 +191,7 @@ void MemRefDataFlowOpt::runOnFunction() { memrefsToErase.clear(); // Walk all load's and perform store to load forwarding. - f.walk([&](AffineLoadOp loadOp) { forwardStoreToLoad(loadOp); }); + f.walk([&](AffineReadOpInterface loadOp) { forwardStoreToLoad(loadOp); }); // Erase all load op's whose results were replaced with store fwd'ed ones. for (auto *loadOp : loadOpsToErase) @@ -207,7 +208,7 @@ void MemRefDataFlowOpt::runOnFunction() { // could still erase it if the call had no side-effects. continue; if (llvm::any_of(memref.getUsers(), [&](Operation *ownerOp) { - return !isa(ownerOp); + return !isa(ownerOp); })) continue; diff --git a/mlir/test/Transforms/memref-dataflow-opt.mlir b/mlir/test/Transforms/memref-dataflow-opt.mlir index 6d5288c775c7f..dfda19372813a 100644 --- a/mlir/test/Transforms/memref-dataflow-opt.mlir +++ b/mlir/test/Transforms/memref-dataflow-opt.mlir @@ -280,3 +280,23 @@ func @refs_not_known_to_be_equal(%A : memref<100 x 100 x f32>, %M : index) { } return } + +// The test checks for value forwarding from vector stores to vector loads. +// The value loaded from %in can directly be stored to %out by eliminating +// store and load from %tmp. +func @vector_forwarding(%in : memref<512xf32>, %out : memref<512xf32>) { + %tmp = alloc() : memref<512xf32> + affine.for %i = 0 to 16 { + %ld0 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32> + affine.vector_store %ld0, %tmp[32*%i] : memref<512xf32>, vector<32xf32> + %ld1 = affine.vector_load %tmp[32*%i] : memref<512xf32>, vector<32xf32> + affine.vector_store %ld1, %out[32*%i] : memref<512xf32>, vector<32xf32> + } + return +} + +// CHECK-LABEL: func @vector_forwarding +// CHECK: affine.for %{{.*}} = 0 to 16 { +// CHECK-NEXT: %[[LDVAL:.*]] = affine.vector_load +// CHECK-NEXT: affine.vector_store %[[LDVAL]],{{.*}} +// CHECK-NEXT: } From adeeac9d5a322a751011d4624152317cb43419d9 Mon Sep 17 00:00:00 2001 From: Austin Kerbow Date: Mon, 13 Jul 2020 15:38:29 -0700 Subject: [PATCH 0341/1035] [AMDGPU] Spill CSR VGPR which is reserved for SGPR spills Update the logic for reserving a VGPR for SGPR spills. A CSR VGPR reserved for SGPR spills could be clobbered if there were no free lower VGPRs available. Create a stack object so that it will be spilled in the prologue. Also add more tests.
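For illustration, the prologue/epilogue expected when the reserved VGPR is a callee-saved register now looks like this (taken from the v255 checks in the tests below; the lane numbers are simply what those tests produce):

    buffer_store_dword v255, off, s[0:3], s32   ; spill the reserved CSR VGPR itself
    v_writelane_b32 v255, s33, 2                ; then spill SGPRs into its lanes
    v_writelane_b32 v255, s30, 0
    v_writelane_b32 v255, s31, 1
    s_swappc_b64 s[30:31], s[4:5]
    v_readlane_b32 s4, v255, 0                  ; restore the spilled SGPRs
    v_readlane_b32 s5, v255, 1
    v_readlane_b32 s33, v255, 2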
Differential Revision: https://reviews.llvm.org/D83730 --- llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 49 +++--- .../AMDGPU/reserve-vgpr-for-sgpr-spill.ll | 158 +++++++++++++++++- 2 files changed, 179 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 1349d3b6bf3f6..a7e963e043f46 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -233,10 +233,18 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { // Find lowest available VGPR and use it as VGPR reserved for SGPR spills. static bool lowerShiftReservedVGPR(MachineFunction &MF, const GCNSubtarget &ST) { + SIMachineFunctionInfo *FuncInfo = MF.getInfo(); + const Register PreReservedVGPR = FuncInfo->VGPRReservedForSGPRSpill; + // Early out if pre-reservation of a VGPR for SGPR spilling is disabled. + if (!PreReservedVGPR) + return false; + + // If there are no free lower VGPRs available, default to using the + // pre-reserved register instead. + Register LowestAvailableVGPR = PreReservedVGPR; + MachineRegisterInfo &MRI = MF.getRegInfo(); MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - SIMachineFunctionInfo *FuncInfo = MF.getInfo(); - Register LowestAvailableVGPR, ReservedVGPR; ArrayRef AllVGPR32s = ST.getRegisterInfo()->getAllVGPR32(MF); for (MCPhysReg Reg : AllVGPR32s) { if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) { @@ -245,26 +253,29 @@ static bool lowerShiftReservedVGPR(MachineFunction &MF, } } - if (!LowestAvailableVGPR) - return false; - - ReservedVGPR = FuncInfo->VGPRReservedForSGPRSpill; const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs(); - int i = 0; + Optional FI; + // Check if we are reserving a CSR. Create a stack object for a possible spill + // in the function prologue. + if (FuncInfo->isCalleeSavedReg(CSRegs, LowestAvailableVGPR)) + FI = FrameInfo.CreateSpillStackObject(4, Align(4)); + + // Find saved info about the pre-reserved register. 
+ const auto *ReservedVGPRInfoItr = + std::find_if(FuncInfo->getSGPRSpillVGPRs().begin(), + FuncInfo->getSGPRSpillVGPRs().end(), + [PreReservedVGPR](const auto &SpillRegInfo) { + return SpillRegInfo.VGPR == PreReservedVGPR; + }); + + assert(ReservedVGPRInfoItr != FuncInfo->getSGPRSpillVGPRs().end()); + auto Index = + std::distance(FuncInfo->getSGPRSpillVGPRs().begin(), ReservedVGPRInfoItr); + + FuncInfo->setSGPRSpillVGPRs(LowestAvailableVGPR, FI, Index); for (MachineBasicBlock &MBB : MF) { - for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) { - if (Reg.VGPR == ReservedVGPR) { - MBB.removeLiveIn(ReservedVGPR); - MBB.addLiveIn(LowestAvailableVGPR); - Optional FI; - if (FuncInfo->isCalleeSavedReg(CSRegs, LowestAvailableVGPR)) - FI = FrameInfo.CreateSpillStackObject(4, Align(4)); - - FuncInfo->setSGPRSpillVGPRs(LowestAvailableVGPR, FI, i); - } - ++i; - } + MBB.addLiveIn(LowestAvailableVGPR); MBB.sortUniqueLiveIns(); } diff --git a/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll index 9df99aae15d49..73d837efa9f44 100644 --- a/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll @@ -5,17 +5,21 @@ define void @child_function() #0 { ret void } -; GCN-LABEL: {{^}}parent_func: -; CHECK: v_writelane_b32 v255, s33, 2 -; CHECK: v_writelane_b32 v255, s30, 0 -; CHECK: v_writelane_b32 v255, s31, 1 -; CHECK: s_swappc_b64 s[30:31], s[4:5] -; CHECK: v_readlane_b32 s4, v255, 0 -; CHECK: v_readlane_b32 s5, v255, 1 -; CHECK: v_readlane_b32 s33, v255, 2 +; GCN-LABEL: {{^}}reserve_vgpr_with_no_lower_vgpr_available: +; GCN: buffer_store_dword v255, off, s[0:3], s32 +; GCN: v_writelane_b32 v255, s33, 2 +; GCN: v_writelane_b32 v255, s30, 0 +; GCN: v_writelane_b32 v255, s31, 1 +; GCN: s_swappc_b64 s[30:31], s[4:5] +; GCN: v_readlane_b32 s4, v255, 0 +; GCN: v_readlane_b32 s5, v255, 1 +; GCN: v_readlane_b32 s33, v255, 2 ; GCN: ; NumVgprs: 256 -define void @parent_func() #0 { +define void @reserve_vgpr_with_no_lower_vgpr_available() #0 { + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} @@ -47,4 +51,140 @@ define void @parent_func() #0 { ret void } +; GCN-LABEL: {{^}}reserve_lowest_available_vgpr: +; GCN: buffer_store_dword v254, off, s[0:3], s32 +; GCN: v_writelane_b32 v254, s33, 2 +; GCN: v_writelane_b32 v254, s30, 0 +; GCN: v_writelane_b32 v254, s31, 1 +; GCN: s_swappc_b64 s[30:31], s[4:5] +; GCN: v_readlane_b32 s4, v254, 0 +; GCN: v_readlane_b32 s5, v254, 1 +; GCN: v_readlane_b32 s33, v254, 2 + +define void @reserve_lowest_available_vgpr() #0 { + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} + ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} + ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} + ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39} + ,~{v40},~{v41},~{v42},~{v43},~{v44},~{v45},~{v46},~{v47},~{v48},~{v49} + ,~{v50},~{v51},~{v52},~{v53},~{v54},~{v55},~{v56},~{v57},~{v58},~{v59} + ,~{v60},~{v61},~{v62},~{v63},~{v64},~{v65},~{v66},~{v67},~{v68},~{v69} + ,~{v70},~{v71},~{v72},~{v73},~{v74},~{v75},~{v76},~{v77},~{v78},~{v79} + 
,~{v80},~{v81},~{v82},~{v83},~{v84},~{v85},~{v86},~{v87},~{v88},~{v89} + ,~{v90},~{v91},~{v92},~{v93},~{v94},~{v95},~{v96},~{v97},~{v98},~{v99} + ,~{v100},~{v101},~{v102},~{v103},~{v104},~{v105},~{v106},~{v107},~{v108},~{v109} + ,~{v110},~{v111},~{v112},~{v113},~{v114},~{v115},~{v116},~{v117},~{v118},~{v119} + ,~{v120},~{v121},~{v122},~{v123},~{v124},~{v125},~{v126},~{v127},~{v128},~{v129} + ,~{v130},~{v131},~{v132},~{v133},~{v134},~{v135},~{v136},~{v137},~{v138},~{v139} + ,~{v140},~{v141},~{v142},~{v143},~{v144},~{v145},~{v146},~{v147},~{v148},~{v149} + ,~{v150},~{v151},~{v152},~{v153},~{v154},~{v155},~{v156},~{v157},~{v158},~{v159} + ,~{v160},~{v161},~{v162},~{v163},~{v164},~{v165},~{v166},~{v167},~{v168},~{v169} + ,~{v170},~{v171},~{v172},~{v173},~{v174},~{v175},~{v176},~{v177},~{v178},~{v179} + ,~{v180},~{v181},~{v182},~{v183},~{v184},~{v185},~{v186},~{v187},~{v188},~{v189} + ,~{v190},~{v191},~{v192},~{v193},~{v194},~{v195},~{v196},~{v197},~{v198},~{v199} + ,~{v200},~{v201},~{v202},~{v203},~{v204},~{v205},~{v206},~{v207},~{v208},~{v209} + ,~{v210},~{v211},~{v212},~{v213},~{v214},~{v215},~{v216},~{v217},~{v218},~{v219} + ,~{v220},~{v221},~{v222},~{v223},~{v224},~{v225},~{v226},~{v227},~{v228},~{v229} + ,~{v230},~{v231},~{v232},~{v233},~{v234},~{v235},~{v236},~{v237},~{v238},~{v239} + ,~{v240},~{v241},~{v242},~{v243},~{v244},~{v245},~{v246},~{v247},~{v248},~{v249} + ,~{v250},~{v251},~{v252},~{v253}" () #0 + call void @child_function() + ret void +} + +; GCN-LABEL: {{^}}reserve_vgpr_with_sgpr_spills: +; GCN-NOT: buffer_store_dword v255, off, s[0:3], s32 +; GCN: ; def s4 +; GCN: v_writelane_b32 v254, s4, 2 +; GCN: v_readlane_b32 s4, v254, 2 +; GCN: ; use s4 + +define void @reserve_vgpr_with_sgpr_spills() #0 { + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} + ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} + ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} + ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39} + ,~{v40},~{v41},~{v42},~{v43},~{v44},~{v45},~{v46},~{v47},~{v48},~{v49} + ,~{v50},~{v51},~{v52},~{v53},~{v54},~{v55},~{v56},~{v57},~{v58},~{v59} + ,~{v60},~{v61},~{v62},~{v63},~{v64},~{v65},~{v66},~{v67},~{v68},~{v69} + ,~{v70},~{v71},~{v72},~{v73},~{v74},~{v75},~{v76},~{v77},~{v78},~{v79} + ,~{v80},~{v81},~{v82},~{v83},~{v84},~{v85},~{v86},~{v87},~{v88},~{v89} + ,~{v90},~{v91},~{v92},~{v93},~{v94},~{v95},~{v96},~{v97},~{v98},~{v99} + ,~{v100},~{v101},~{v102},~{v103},~{v104},~{v105},~{v106},~{v107},~{v108},~{v109} + ,~{v110},~{v111},~{v112},~{v113},~{v114},~{v115},~{v116},~{v117},~{v118},~{v119} + ,~{v120},~{v121},~{v122},~{v123},~{v124},~{v125},~{v126},~{v127},~{v128},~{v129} + ,~{v130},~{v131},~{v132},~{v133},~{v134},~{v135},~{v136},~{v137},~{v138},~{v139} + ,~{v140},~{v141},~{v142},~{v143},~{v144},~{v145},~{v146},~{v147},~{v148},~{v149} + ,~{v150},~{v151},~{v152},~{v153},~{v154},~{v155},~{v156},~{v157},~{v158},~{v159} + ,~{v160},~{v161},~{v162},~{v163},~{v164},~{v165},~{v166},~{v167},~{v168},~{v169} + ,~{v170},~{v171},~{v172},~{v173},~{v174},~{v175},~{v176},~{v177},~{v178},~{v179} + ,~{v180},~{v181},~{v182},~{v183},~{v184},~{v185},~{v186},~{v187},~{v188},~{v189} + ,~{v190},~{v191},~{v192},~{v193},~{v194},~{v195},~{v196},~{v197},~{v198},~{v199} + ,~{v200},~{v201},~{v202},~{v203},~{v204},~{v205},~{v206},~{v207},~{v208},~{v209} + 
,~{v210},~{v211},~{v212},~{v213},~{v214},~{v215},~{v216},~{v217},~{v218},~{v219} + ,~{v220},~{v221},~{v222},~{v223},~{v224},~{v225},~{v226},~{v227},~{v228},~{v229} + ,~{v230},~{v231},~{v232},~{v233},~{v234},~{v235},~{v236},~{v237},~{v238},~{v239} + ,~{v240},~{v241},~{v242},~{v243},~{v244},~{v245},~{v246},~{v247},~{v248},~{v249} + ,~{v250},~{v251},~{v252},~{v253}" () #0 + + %sgpr = call i32 asm sideeffect "; def $0", "=s" () #0 + %cmp = icmp eq i32 undef, 0 + br i1 %cmp, label %bb0, label %ret + +bb0: + call void asm sideeffect "; use $0", "s"(i32 %sgpr) #0 + br label %ret + +ret: + ret void +} + +; GCN-LABEL: {{^}}reserve_vgpr_with_tail_call +; GCN-NOT: buffer_store_dword v255, off, s[0:3], s32 +; GCN-NOT: v_writelane +; GCN: s_setpc_b64 s[4:5] + +define void @reserve_vgpr_with_tail_call() #0 { + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} + ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} + ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} + ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39} + ,~{v40},~{v41},~{v42},~{v43},~{v44},~{v45},~{v46},~{v47},~{v48},~{v49} + ,~{v50},~{v51},~{v52},~{v53},~{v54},~{v55},~{v56},~{v57},~{v58},~{v59} + ,~{v60},~{v61},~{v62},~{v63},~{v64},~{v65},~{v66},~{v67},~{v68},~{v69} + ,~{v70},~{v71},~{v72},~{v73},~{v74},~{v75},~{v76},~{v77},~{v78},~{v79} + ,~{v80},~{v81},~{v82},~{v83},~{v84},~{v85},~{v86},~{v87},~{v88},~{v89} + ,~{v90},~{v91},~{v92},~{v93},~{v94},~{v95},~{v96},~{v97},~{v98},~{v99} + ,~{v100},~{v101},~{v102},~{v103},~{v104},~{v105},~{v106},~{v107},~{v108},~{v109} + ,~{v110},~{v111},~{v112},~{v113},~{v114},~{v115},~{v116},~{v117},~{v118},~{v119} + ,~{v120},~{v121},~{v122},~{v123},~{v124},~{v125},~{v126},~{v127},~{v128},~{v129} + ,~{v130},~{v131},~{v132},~{v133},~{v134},~{v135},~{v136},~{v137},~{v138},~{v139} + ,~{v140},~{v141},~{v142},~{v143},~{v144},~{v145},~{v146},~{v147},~{v148},~{v149} + ,~{v150},~{v151},~{v152},~{v153},~{v154},~{v155},~{v156},~{v157},~{v158},~{v159} + ,~{v160},~{v161},~{v162},~{v163},~{v164},~{v165},~{v166},~{v167},~{v168},~{v169} + ,~{v170},~{v171},~{v172},~{v173},~{v174},~{v175},~{v176},~{v177},~{v178},~{v179} + ,~{v180},~{v181},~{v182},~{v183},~{v184},~{v185},~{v186},~{v187},~{v188},~{v189} + ,~{v190},~{v191},~{v192},~{v193},~{v194},~{v195},~{v196},~{v197},~{v198},~{v199} + ,~{v200},~{v201},~{v202},~{v203},~{v204},~{v205},~{v206},~{v207},~{v208},~{v209} + ,~{v210},~{v211},~{v212},~{v213},~{v214},~{v215},~{v216},~{v217},~{v218},~{v219} + ,~{v220},~{v221},~{v222},~{v223},~{v224},~{v225},~{v226},~{v227},~{v228},~{v229} + ,~{v230},~{v231},~{v232},~{v233},~{v234},~{v235},~{v236},~{v237},~{v238},~{v239} + ,~{v240},~{v241},~{v242},~{v243},~{v244},~{v245},~{v246},~{v247},~{v248},~{v249} + ,~{v250},~{v251},~{v252},~{v253},~{v254}" () #0 + musttail call void @child_function() + ret void +} + attributes #0 = { nounwind noinline norecurse "amdgpu-flat-work-group-size"="1,256" } From 4838cd46a90931af3905233a86a7ceda6df7ac69 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 28 Jul 2020 13:08:22 +0100 Subject: [PATCH 0342/1035] [X86][XOP] Shuffle v16i8 using VPPERM(X,Y) instead of OR(PSHUFB(X),PSHUFB(Y)) --- llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +++++ llvm/test/CodeGen/X86/oddshuffles.ll | 24 ++++++------------- .../CodeGen/X86/vector-shuffle-128-v16.ll | 24 ++++++++++++++----- .../CodeGen/X86/vector-shuffle-256-v32.ll | 20 
++++----------
 llvm/test/CodeGen/X86/vector-shuffle-v48.ll | 10 +++-----
 5 files changed, 41 insertions(+), 43 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5eadd9c287c79..9dd4dfdfea49a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15057,6 +15057,12 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
+ // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
+ if (Subtarget.hasXOP()) {
+ SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
+ return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
+ }
+
 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
 // PALIGNR will be cheaper than the second PSHUFB+OR.
 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index d24fd3f024d49..abec83fec594d 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -775,17 +775,11 @@ define void @interleave_24i8_out(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
 ;
 ; XOP-LABEL: interleave_24i8_out:
 ; XOP: # %bb.0:
-; XOP-NEXT: vmovdqu (%rdi), %xmm0
-; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; XOP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpor %xmm2, %xmm3, %xmm2
-; XOP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpor %xmm3, %xmm4, %xmm3
-; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; XOP-NEXT: vmovdqu (%rdi), %xmm1
+; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm1[0,3,6,9,12,15],xmm0[2,5],xmm1[u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm1[1,4,7,10,13],xmm0[0,3,6],xmm1[u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[2,5,8,11,14],xmm0[1,4,7],xmm1[u,u,u,u,u,u,u,u]
 ; XOP-NEXT: vmovq %xmm2, (%rsi)
 ; XOP-NEXT: vmovq %xmm3, (%rdx)
 ; XOP-NEXT: vmovq %xmm0, (%rcx)
@@ -883,12 +877,8 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
 ; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
 ; XOP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; XOP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
-; XOP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
-; XOP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm0[0,8],xmm1[0],xmm0[1,9],xmm1[1],xmm0[2,10],xmm1[2],xmm0[3,11],xmm1[3],xmm0[4,12],xmm1[4],xmm0[5]
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = 
xmm0[13],xmm1[5],xmm0[6,14],xmm1[6],xmm0[7,15],xmm1[7],xmm0[u,u,u,u,u,u,u,u]
 ; XOP-NEXT: vmovq %xmm0, 16(%rdi)
 ; XOP-NEXT: vmovdqu %xmm2, (%rdi)
 ; XOP-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 5ba0efdf7f50b..8e9b991c34bf5 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1577,12 +1577,19 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(
 ; SSE41-NEXT: por %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX1OR2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
-; AVX1OR2: # %bb.0: # %entry
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
-; AVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
 ; AVX512VLBW: # %bb.0: # %entry
@@ -1596,6 +1603,11 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(
 ; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = <u,10,2,7,22,14,7,2,18,3,1,14,18,9,11,0>
 ; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
 ; AVX512VLVBMI-NEXT: retq
+;
+; XOP-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; XOP: # %bb.0: # %entry
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[u,10,2,7],xmm1[6],xmm0[14,7,2],xmm1[2],xmm0[3,1,14],xmm1[2],xmm0[9,11,0]
+; XOP-NEXT: retq
 entry:
 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 0508c6bac2bb1..ff2654e800e23 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -3407,20 +3407,14 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_
 ;
 ; XOPAVX1-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
 ; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vpperm {{.*#+}} xmm3 = xmm2[4,3,u,3,u,u,u,u,u,u,u],xmm0[7],xmm2[u,u,u,u]
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,4,u,1,6],zero,zero,xmm4[0],zero,xmm4[11,u],zero,zero,zero,zero
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7]
-; XOPAVX1-NEXT: vpor %xmm5, %xmm6, %xmm5
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm3 = 
xmm1[u,u],xmm2[4],xmm1[u],xmm2[1,6],xmm1[5,0],xmm2[0],xmm1[10],xmm2[11],xmm1[u,4,2,4,7]
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm5 = xmm4[4,3,u,3,u,u,u,u,u,u,u],xmm0[7],xmm4[u,u,u,u]
 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255]
-; XOPAVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[u,u],zero,zero,xmm4[12],zero,xmm4[u,u,u],zero,zero,xmm4[u,0,3]
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero
-; XOPAVX1-NEXT: vpor %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[u,u,u,u,1,6,13,u,u],zero,xmm2[u,u]
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u]
-; XOPAVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],xmm2[12],xmm1[8,u,u,u,12,1,u],xmm2[0,3]
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],xmm4[1,6,13],xmm0[u,u,12,u,u]
 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255]
 ; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
index e854923258257..1cf66bb08bb3c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
@@ -26,13 +26,9 @@ define <32 x i8> @foo(<48 x i8>* %x0) {
 ; XOP-NEXT: vmovdqu (%rdi), %xmm0
 ; XOP-NEXT: vmovdqu 16(%rdi), %xmm1
 ; XOP-NEXT: vmovdqu 32(%rdi), %xmm2
-; XOP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[8,9,11,12,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; XOP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[1,2,4,5,7,8,10,11,13,14]
-; XOP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,2,3,5,6]
-; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,3,4,6,7,9,10,12,13,15],zero,zero,zero,zero,zero
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1,3,4,6,7,9,10,12,13,15],xmm1[0,2,3,5,6]
+; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[8,9,11,12,14,15],xmm2[1,2,4,5,7,8,10,11,13,14]
+; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; XOP-NEXT: retq
 ;
 ; AVX2-LABEL: foo:

From c6920081a8b97a8a911803d2bac50fa1db7a8123 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 28 Jul 2020 19:37:40 +0100
Subject: [PATCH 0343/1035] [CostModel][X86] Add abs intrinsics cost model tests

abs costs currently fall back to scalar generic intrinsic calls

---
 llvm/test/Analysis/CostModel/X86/abs.ll | 354 ++++++++++++++++++++++++
 1 file changed, 354 insertions(+)
 create mode 100644 llvm/test/Analysis/CostModel/X86/abs.ll

diff --git a/llvm/test/Analysis/CostModel/X86/abs.ll b/llvm/test/Analysis/CostModel/X86/abs.ll
new file mode 100644
index 0000000000000..ee61f5c07a393
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/abs.ll
@@ -0,0 +1,354 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < 
%s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+ssse3 | FileCheck %s -check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse4.2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx | FileCheck %s -check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx2 | FileCheck %s -check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512vl,+avx512dq | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512DQ
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512vl,+avx512bw | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512BW
+
+;
+; abs(X, not_poison)
+;
+
+define void @cost_abs_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) {
+; SSE2-LABEL: 'cost_abs_i64'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSSE3-LABEL: 'cost_abs_i64'
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'cost_abs_i64'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX-LABEL: 'cost_abs_i64'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false)
+; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false)
+; AVX-NEXT: Cost Model: Found an estimated cost of 30 for 
instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'cost_abs_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 0) + %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 0) + %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 0) + %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 0) + ret void +} + +define void @cost_abs_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) { +; SSE2-LABEL: 'cost_abs_i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSSE3-LABEL: 'cost_abs_i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'cost_abs_i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX-LABEL: 'cost_abs_i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false) +; AVX-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V4I32 = call <8 x i32> 
@llvm.abs.v8i32(<8 x i32> %a256, i1 false) +; AVX-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'cost_abs_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 0) + %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 0) + %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 0) + %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 0) + ret void +} + +define void @cost_abs_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) { +; SSE-LABEL: 'cost_abs_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) +; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false) +; SSE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false) +; SSE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX-LABEL: 'cost_abs_i16' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false) +; AVX-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false) +; AVX-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'cost_abs_i16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 123 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 0) + %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 0) + %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 0) + %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 0) + ret void +} + 
+define void @cost_abs_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) { +; SSE2-LABEL: 'cost_abs_i8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSSE3-LABEL: 'cost_abs_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'cost_abs_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX-LABEL: 'cost_abs_i8' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) +; AVX-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) +; AVX-NEXT: Cost Model: Found an estimated cost of 226 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'cost_abs_i8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 243 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 0) + %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 0) + %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 0) + %V64I8 = call 
<64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 0)
+ ret void
+}
+
+;
+; abs(X, poison)
+;
+
+define void @cost_abs_i64_poison(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) {
+; SSE2-LABEL: 'cost_abs_i64_poison'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSSE3-LABEL: 'cost_abs_i64_poison'
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'cost_abs_i64_poison'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX-LABEL: 'cost_abs_i64_poison'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true)
+; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true)
+; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true)
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'cost_abs_i64_poison'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 -1)
+ %V2I64 
= call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 -1) + %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 -1) + %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 -1) + ret void +} + +define void @cost_abs_i32_poison(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) { +; SSE2-LABEL: 'cost_abs_i32_poison' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSSE3-LABEL: 'cost_abs_i32_poison' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'cost_abs_i32_poison' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX-LABEL: 'cost_abs_i32_poison' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true) +; AVX-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true) +; AVX-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'cost_abs_i32_poison' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true) +; AVX512-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true) +; AVX512-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true) +; 
AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 -1) + %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 -1) + %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 -1) + %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 -1) + ret void +} + +define void @cost_abs_i16_poison(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) { +; SSE-LABEL: 'cost_abs_i16_poison' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) +; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true) +; SSE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true) +; SSE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX-LABEL: 'cost_abs_i16_poison' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true) +; AVX-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true) +; AVX-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'cost_abs_i16_poison' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) +; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true) +; AVX512-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true) +; AVX512-NEXT: Cost Model: Found an estimated cost of 123 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 -1) + %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 -1) + %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 -1) + %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 -1) + ret void +} + +define void @cost_abs_i8_poison(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) { +; SSE2-LABEL: 'cost_abs_i8_poison' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSSE3-LABEL: 'cost_abs_i8_poison' +; SSSE3-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'cost_abs_i8_poison' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX-LABEL: 'cost_abs_i8_poison' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) +; AVX-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) +; AVX-NEXT: Cost Model: Found an estimated cost of 226 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'cost_abs_i8_poison' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) +; AVX512-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) +; AVX512-NEXT: Cost Model: Found an estimated cost of 243 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 -1) + %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 -1) + %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 -1) + %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 -1) + ret void +} + +declare i64 @llvm.abs.i64(i64, i1) +declare i32 @llvm.abs.i32(i32, i1) +declare i16 @llvm.abs.i16(i16, i1) +declare i8 @llvm.abs.i8(i8, i1) + +declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1) +declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1) +declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1) +declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1) + +declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1) +declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) +declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) +declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) + +declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1) +declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1) +declare <32 x i16> @llvm.abs.v32i16(<32 x i16>, i1) +declare <64 x i8> 
@llvm.abs.v64i8(<64 x i8>, i1) From b4b6e77454b6822e1ba7bdaa1b5bde8654c3e87f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 28 Jul 2020 19:48:35 +0100 Subject: [PATCH 0344/1035] [DAG] isSplatValue - add support for TRUNCATE/SIGN_EXTEND/ZERO_EXTEND These are just pass-throughs to the source operand - we can't assume that ANY_EXTEND(splat) will still be a splat though. --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 4 + llvm/test/CodeGen/X86/vector-fshl-128.ll | 96 ++++++++--------- llvm/test/CodeGen/X86/vector-fshl-256.ll | 48 ++++----- llvm/test/CodeGen/X86/vector-fshl-rot-128.ll | 38 +++---- llvm/test/CodeGen/X86/vector-fshl-rot-256.ll | 36 +++---- llvm/test/CodeGen/X86/vector-fshl-rot-512.ll | 64 +++++------ llvm/test/CodeGen/X86/vector-fshr-128.ll | 100 +++++++++--------- llvm/test/CodeGen/X86/vector-fshr-256.ll | 48 ++++----- llvm/test/CodeGen/X86/vector-fshr-rot-128.ll | 38 +++---- llvm/test/CodeGen/X86/vector-fshr-rot-256.ll | 36 +++---- llvm/test/CodeGen/X86/vector-fshr-rot-512.ll | 64 +++++------ llvm/test/CodeGen/X86/vector-rotate-128.ll | 36 +++---- llvm/test/CodeGen/X86/vector-rotate-256.ll | 32 +++--- .../test/CodeGen/X86/vector-shift-ashr-128.ll | 20 ++-- .../test/CodeGen/X86/vector-shift-ashr-256.ll | 10 +- .../CodeGen/X86/vector-shift-ashr-sub128.ll | 60 +++++------ .../test/CodeGen/X86/vector-shift-lshr-128.ll | 20 ++-- .../test/CodeGen/X86/vector-shift-lshr-256.ll | 10 +- .../CodeGen/X86/vector-shift-lshr-sub128.ll | 60 +++++------ llvm/test/CodeGen/X86/vector-shift-shl-128.ll | 20 ++-- llvm/test/CodeGen/X86/vector-shift-shl-256.ll | 10 +- .../CodeGen/X86/vector-shift-shl-sub128.ll | 60 +++++------ 22 files changed, 424 insertions(+), 486 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 592c09c10fb08..87d2fa15d0377 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2323,6 +2323,10 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, } break; } + case ISD::TRUNCATE: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + return isSplatValue(V.getOperand(0), DemandedElts, UndefElts); } // We don't support other cases than those above for scalable vectors at diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll index 0192d1e8137c1..81c291fa602b8 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -1947,16 +1947,16 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; ; AVX512F-LABEL: splatvar_funnnel_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero -; AVX512F-NEXT: vpsllvd %zmm4, %zmm3, %zmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vpsubb %xmm2, %xmm3, %xmm3 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512F-NEXT: vpsrld %xmm3, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpslld %xmm4, %zmm3, %zmm3 ; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 @@ -1967,16 +1967,16 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; ; AVX512VL-LABEL: splatvar_funnnel_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero -; AVX512VL-NEXT: vpsllvd %zmm4, %zmm3, %zmm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4 -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm3 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512VL-NEXT: vpsrld %xmm3, %zmm1, %zmm1 +; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpslld %xmm4, %zmm3, %zmm3 ; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1 ; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 @@ -1988,16 +1988,16 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512BW-LABEL: splatvar_funnnel_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %xmm2, %xmm3, %xmm3 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = 
xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlw %xmm3, %ymm1, %ymm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsllw %xmm4, %ymm3, %ymm3 ; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 @@ -2009,16 +2009,16 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm3, %xmm3 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpsrlw %xmm3, %ymm1, %ymm1 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpsllw %xmm4, %ymm3, %ymm3 ; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 @@ -2029,16 +2029,16 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm3, %ymm3 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm3, %xmm3 +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLBW-NEXT: vpsllw %xmm4, %ymm3, %ymm3 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512VLBW-NEXT: vpmovwb %ymm1, %xmm1 ; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1 @@ -2049,16 +2049,16 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8: ; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512VLVBMI2-NEXT: 
vpbroadcastb %xmm2, %xmm2 -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm3, %ymm3 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm3, %xmm3 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1 +; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %ymm1, %ymm1 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %ymm3, %ymm3 ; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm1, %xmm1 ; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index 0cf4c172412a6..afe7716a36e49 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -1526,13 +1526,13 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero -; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; 
AVX512BW-NEXT: vpsubb %ymm2, %ymm4, %ymm4 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsllw %xmm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 @@ -1546,13 +1546,13 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero -; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} 
xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 ; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 @@ -1565,13 +1565,13 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero -; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm4, %ymm4 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm3, %zmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512VLBW-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1 @@ -1584,13 +1584,13 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero -; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4 -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 +; 
AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 ; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll index 59bef3a97b1fa..b1eb2fdbe7af8 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -1056,17 +1056,16 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; ; AVX512F-LABEL: splatvar_funnnel_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512F-NEXT: vpsllvd %zmm3, %zmm0, %zmm3 +; AVX512F-NEXT: vpslld %xmm3, %zmm0, %zmm3 ; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512F-NEXT: vpsubb %xmm1, %xmm4, %xmm1 ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpsrld %xmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -1074,17 +1073,16 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; ; AVX512VL-LABEL: splatvar_funnnel_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = 
xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512VL-NEXT: vpsllvd %zmm3, %zmm0, %zmm3 +; AVX512VL-NEXT: vpslld %xmm3, %zmm0, %zmm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm4, %xmm1 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsrld %xmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper @@ -1092,17 +1090,16 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; ; AVX512BW-LABEL: splatvar_funnnel_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero -; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsllw %xmm3, %ymm0, %ymm3 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ 
-1111,17 +1108,16 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512VLBW-NEXT: vpsllvw %ymm3, %ymm0, %ymm3 +; AVX512VLBW-NEXT: vpsllw %xmm3, %ymm0, %ymm3 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLBW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512VLBW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll index fd0e1c7e2f3ab..7db474fc39da9 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -841,34 +841,34 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; ; AVX512BW-LABEL: splatvar_funnnel_v32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm3 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512BW-NEXT: vpsubb %ymm1, %ymm4, %ymm1 -; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %ymm1 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero +; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512VLBW-NEXT: vpsllvw %zmm3, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm3 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm4, %ymm1 -; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512VLBW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll index ff177678bed6c..2481b8ebfe25d 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -366,50 +366,50 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounw define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm2 -; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm4 -; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512F-NEXT: vpsubw %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512F-NEXT: vpsllw %xmm3, %ymm4, %ymm5 +; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512F-NEXT: vpsubw %xmm1, %xmm5, %xmm1 +; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpsrld %xmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512F-NEXT: vpsrlvd %zmm1, %zmm3, %zmm1 +; AVX512F-NEXT: vpmovzxwd 
{{.*#+}} zmm2 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero +; AVX512F-NEXT: vpsrld %xmm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 +; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm1, %ymm1 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm2 -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm4 -; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm2 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpsubw %ymm1, %ymm4, %ymm1 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512VL-NEXT: vpsllw %xmm3, %ymm4, %ymm5 +; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512VL-NEXT: vpsubw %xmm1, %xmm5, %xmm1 +; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpsrld %xmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm3, %zmm1 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero +; AVX512VL-NEXT: vpsrld %xmm1, %zmm2, %zmm1 ; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0 +; AVX512VL-NEXT: vporq %zmm0, %zmm3, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index 817bca051e0ae..db76d3eef282d 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -1967,18 +1967,18 @@ define <16 x i8> 
@splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero -; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero +; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm4 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpsrld %xmm5, %zmm3, %zmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vpsubb %xmm2, %xmm5, %xmm2 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpslld %xmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm4, %xmm2 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1987,18 +1987,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2 
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero -; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4 -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero +; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm4 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsrld %xmm5, %zmm3, %zmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-NEXT: vpsubb %xmm2, %xmm5, %xmm2 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0 +; AVX512VL-NEXT: vpslld %xmm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpcmpeqb %xmm2, %xmm4, %xmm2 ; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -2008,17 +2008,17 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm4 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsrlw %xmm5, %ymm3, %ymm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %xmm2, %xmm5, %xmm2 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = 
xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 +; AVX512BW-NEXT: vptestnmb %zmm4, %zmm4, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -2029,17 +2029,17 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm4 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpsrlw %xmm5, %ymm3, %ymm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm5, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 +; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm4, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -2049,17 +2049,17 @@ define <16 x 
i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm4 +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLBW-NEXT: vpsrlw %xmm5, %ymm3, %ymm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm5, %xmm2 +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1 +; AVX512VLBW-NEXT: vptestnmb %xmm4, %xmm4, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} ; AVX512VLBW-NEXT: vzeroupper ; AVX512VLBW-NEXT: retq @@ -2068,17 +2068,17 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm4 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} 
xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm5, %ymm3, %ymm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm5, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
 ; AVX512VLVBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
+; AVX512VLVBMI2-NEXT: vptestnmb %xmm4, %xmm4, %k1
 ; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
 ; AVX512VLVBMI2-NEXT: vzeroupper
 ; AVX512VLVBMI2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index f0848cfd2e49a..6449d58fa38c5 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -1528,13 +1528,13 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
-; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm4, %zmm3, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
@@ -1548,13 +1548,13 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
-; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0
 ; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
@@ -1567,13 +1567,13 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
-; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm3, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1
@@ -1585,13 +1585,13 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
-; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0
 ; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index 8fe7ba9e471a9..6cda97e1ebd39 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -1132,17 +1132,16 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v16i8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512F-NEXT: vpsrlvd %zmm3, %zmm0, %zmm3
+; AVX512F-NEXT: vpsrld %xmm3, %zmm0, %zmm3
 ; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
 ; AVX512F-NEXT: vpsubb %xmm1, %xmm4, %xmm1
 ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpslld %xmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT: vzeroupper
@@ -1150,17 +1149,16 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm0, %zmm3
+; AVX512VL-NEXT: vpsrld %xmm3, %zmm0, %zmm3
 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm4, %xmm1
 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpslld %xmm1, %zmm0, %zmm0
 ; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
@@ -1168,17 +1166,16 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v16i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -1187,17 +1184,16 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT: vpsrlvw %ymm3, %ymm0, %ymm3
+; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
 ; AVX512VLBW-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index 4e92bfc4f9136..fffa286b79196 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -916,34 +916,34 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v32i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3
 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512BW-NEXT: vpsubb %ymm1, %ymm4, %ymm1
-; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %ymm1
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
+; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLBW-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3
 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm4, %ymm1
-; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512VLBW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
index fda0dacedf85b..04a883171a7ce 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
@@ -366,50 +366,50 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounw
 define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
 ; AVX512F-LABEL: splatvar_funnnel_v32i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm2
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpsrlw %xmm2, %ymm3, %ymm4
-; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm2
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
-; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vpsubw %ymm1, %ymm4, %ymm1
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm5
+; AVX512F-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3
+; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT: vpsubw %xmm1, %xmm5, %xmm1
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpslld %xmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512F-NEXT: vpsllvd %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512F-NEXT: vpslld %xmm1, %zmm2, %zmm1
 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v32i16:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastw %xmm1, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm2
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm3, %ymm4
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm2
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
-; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vpsubw %ymm1, %ymm4, %ymm1
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm5
+; AVX512VL-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3
+; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512VL-NEXT: vpsubw %xmm1, %xmm5, %xmm1
+; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpslld %xmm1, %zmm0, %zmm0
 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512VL-NEXT: vpslld %xmm1, %zmm2, %zmm1
 ; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v32i16:
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index d140fb5c09295..6f11c5cbd7567 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -1032,14 +1032,13 @@ define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 ; AVX512F-LABEL: splatvar_rotate_v16i8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512F-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm1
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
-; AVX512F-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpslld %xmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpsrld %xmm2, %zmm0, %zmm0
 ; AVX512F-NEXT: vpord %zmm0, %zmm1, %zmm0
 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT: vzeroupper
@@ -1047,14 +1046,13 @@ define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 ; AVX512VL-LABEL: splatvar_rotate_v16i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm1
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
-; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpslld %xmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsrld %xmm2, %zmm0, %zmm0
 ; AVX512VL-NEXT: vpord %zmm0, %zmm1, %zmm0
 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
@@ -1062,14 +1060,13 @@ define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: splatvar_rotate_v16i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm1
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -1078,14 +1075,13 @@ define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 ; AVX512VLBW-LABEL: splatvar_rotate_v16i8:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm1
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512VLBW-NEXT: vpsrlvw %ymm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm1
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0
 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
 ; AVX512VLBW-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index a850ab5ba7822..f54aea8d86ddf 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -831,29 +831,29 @@ define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: splatvar_rotate_v32i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %ymm1, %ymm2, %ymm2
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
-; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: splatvar_rotate_v32i8:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %ymm1
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm2
 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
-; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm2
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512VLBW-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index 9b1fb29cb0297..b4bcca54b1f73 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -863,20 +863,18 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 ; AVX512DQ-LABEL: splatvar_shift_v16i8:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpsrad %xmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512BW-LABEL: splatvar_shift_v16i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsraw %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512BW-NEXT: vzeroupper
@@ -884,20 +882,18 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 ; AVX512DQVL-LABEL: splatvar_shift_v16i8:
 ; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpsrad %xmm1, %zmm0, %zmm0
 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512DQVL-NEXT: vzeroupper
 ; AVX512DQVL-NEXT: retq
 ;
 ; AVX512BWVL-LABEL: splatvar_shift_v16i8:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpsraw %xmm1, %ymm0, %ymm0
 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
index ae9c375eec254..141b68db31173 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -933,10 +933,9 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: splatvar_shift_v32i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1
 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: retq
 ;
@@ -956,10 +955,9 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ;
 ; AVX512BWVL-LABEL: splatvar_shift_v32i8:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %ymm1
 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpsraw %xmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BWVL-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index a994d6610d73d..bac8f615116e7 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -1385,20 +1385,18 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
 ;
 ; AVX512DQ-LABEL: splatvar_shift_v8i8:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpsrad %xmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512BW-LABEL: splatvar_shift_v8i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsraw %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512BW-NEXT: vzeroupper
@@ -1406,20 +1404,18 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
 ;
 ; AVX512DQVL-LABEL: splatvar_shift_v8i8:
 ; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpsrad %xmm1, %zmm0, %zmm0
 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512DQVL-NEXT: vzeroupper
 ; AVX512DQVL-NEXT: retq
 ;
 ; AVX512BWVL-LABEL: splatvar_shift_v8i8:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpsraw %xmm1, %ymm0, %ymm0
 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
@@ -1527,20 +1523,18 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
 ;
 ; AVX512DQ-LABEL: splatvar_shift_v4i8:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpsrad %xmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512BW-LABEL: splatvar_shift_v4i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsraw %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512BW-NEXT: vzeroupper
@@ -1548,20 +1542,18 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
 ;
 ; AVX512DQVL-LABEL: splatvar_shift_v4i8:
 ; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpsrad %xmm1, %zmm0, %zmm0
 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512DQVL-NEXT: vzeroupper
 ; AVX512DQVL-NEXT: retq
 ;
 ; AVX512BWVL-LABEL: splatvar_shift_v4i8:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpsraw %xmm1, %ymm0, %ymm0
 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
@@ -1660,20 +1652,18 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
 ;
 ; AVX512DQ-LABEL: splatvar_shift_v2i8:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpsrad %xmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512BW-LABEL: splatvar_shift_v2i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsraw %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512BW-NEXT: vzeroupper
@@ -1681,20 +1671,18 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
 ;
 ; AVX512DQVL-LABEL: splatvar_shift_v2i8:
 ; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpsrad %xmm1, %zmm0, %zmm0
 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512DQVL-NEXT: vzeroupper
 ; AVX512DQVL-NEXT: retq
 ;
 ; AVX512BWVL-LABEL: splatvar_shift_v2i8:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpsraw %xmm1, %ymm0, %ymm0
 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
index 2e19f753722d3..7dab12f3f03e8 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -713,20 +713,18 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 ; AVX512DQ-LABEL: splatvar_shift_v16i8:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpsrld %xmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512BW-LABEL: splatvar_shift_v16i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512BW-NEXT: vzeroupper
@@ -734,20 +732,18 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 ; AVX512DQVL-LABEL: splatvar_shift_v16i8:
 ; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpsrld %xmm1, %zmm0, %zmm0
 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512DQVL-NEXT: vzeroupper
 ; AVX512DQVL-NEXT: retq
 ;
 ; AVX512BWVL-LABEL: splatvar_shift_v16i8:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
index 9119e32bda375..9ba3e4a451ecf 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -762,10 +762,9 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: splatvar_shift_v32i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: retq
 ;
@@ -782,10 +781,9 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ;
 ; AVX512BWVL-LABEL: splatvar_shift_v32i8:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %ymm1
 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BWVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BWVL-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index 8a843ef652e7f..6b9ac09658e9e 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -1135,20 +1135,18 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
 ;
 ; AVX512DQ-LABEL: splatvar_shift_v8i8:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpsrld %xmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512BW-LABEL: splatvar_shift_v8i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 =
xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper @@ -1156,20 +1154,18 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; ; AVX512DQVL-LABEL: splatvar_shift_v8i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpsrld %xmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq ; ; AVX512BWVL-LABEL: splatvar_shift_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -1257,20 +1253,18 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; ; AVX512DQ-LABEL: splatvar_shift_v4i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpsrld %xmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v4i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper @@ -1278,20 +1272,18 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; ; AVX512DQVL-LABEL: splatvar_shift_v4i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpsrld %xmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq ; ; AVX512BWVL-LABEL: splatvar_shift_v4i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -1370,20 +1362,18 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; ; AVX512DQ-LABEL: splatvar_shift_v2i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpsrld %xmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v2i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper @@ -1391,20 +1381,18 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; ; AVX512DQVL-LABEL: splatvar_shift_v2i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpsrld %xmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq ; ; AVX512BWVL-LABEL: splatvar_shift_v2i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll index 5f1325aacb4d1..4b83700e3a85f 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -623,20 +623,18 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; ; AVX512DQ-LABEL: splatvar_shift_v16i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpslld %xmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; 
AVX512BW-LABEL: splatvar_shift_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper @@ -644,20 +642,18 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; ; AVX512DQVL-LABEL: splatvar_shift_v16i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpslld %xmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq ; ; AVX512BWVL-LABEL: splatvar_shift_v16i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll index 1296fcf8bb902..8ed95bae2ebd2 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ 
-688,10 +688,9 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; ; AVX512BW-LABEL: splatvar_shift_v32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq ; @@ -707,10 +706,9 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; ; AVX512BWVL-LABEL: splatvar_shift_v32i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BWVL-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll index 808033e0fb7ce..1c688089af494 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll @@ -988,20 +988,18 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; ; AVX512DQ-LABEL: splatvar_shift_v8i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} 
zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpslld %xmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper @@ -1009,20 +1007,18 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; ; AVX512DQVL-LABEL: splatvar_shift_v8i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpslld %xmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq ; ; AVX512BWVL-LABEL: splatvar_shift_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -1105,20 +1101,18 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; ; AVX512DQ-LABEL: splatvar_shift_v4i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpslld %xmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v4i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper @@ -1126,20 +1120,18 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; ; AVX512DQVL-LABEL: splatvar_shift_v4i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpslld %xmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq ; ; AVX512BWVL-LABEL: splatvar_shift_v4i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -1215,20 +1207,18 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; ; AVX512DQ-LABEL: splatvar_shift_v2i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpslld %xmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v2i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} 
ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
@@ -1236,20 +1226,18 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
;
; AVX512DQVL-LABEL: splatvar_shift_v2i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpslld %xmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: splatvar_shift_v2i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq

From 3f7249046a87e08272957d12bff73295fc4f0e8c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 28 Jul 2020 19:55:55 +0100
Subject: [PATCH 0345/1035] [CostModel][X86] Add smax/smin/umin/umax intrinsics cost model tests

Costs currently fall back to scalar generic intrinsic calls

---
 .../Analysis/CostModel/X86/arith-sminmax.ll | 287 ++++++++++++++++++
 .../Analysis/CostModel/X86/arith-uminmax.ll | 287 ++++++++++++++++++
 2 files changed, 574 insertions(+)
 create mode 100644 llvm/test/Analysis/CostModel/X86/arith-sminmax.ll
 create mode 100644 llvm/test/Analysis/CostModel/X86/arith-uminmax.ll

diff --git a/llvm/test/Analysis/CostModel/X86/arith-sminmax.ll b/llvm/test/Analysis/CostModel/X86/arith-sminmax.ll
new file mode 100644
index 0000000000000..96180d01aae09
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/arith-sminmax.ll
@@ -0,0 +1,287 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+ssse3 | FileCheck %s -check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse4.2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx | FileCheck %s -check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx2 | FileCheck %s -check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512vl,+avx512dq | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512DQ
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512vl,+avx512bw | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512BW
+
+declare i64 @llvm.smax.i64(i64, i64)
+declare <2 x i64> @llvm.smax.v2i64(<2 x i64>, <2 x i64>)
+declare <4 x i64> @llvm.smax.v4i64(<4 x i64>, <4 x i64>)
+declare <8 x i64> @llvm.smax.v8i64(<8 x i64>, <8 x i64>)
+
+declare i32 @llvm.smax.i32(i32, i32)
+declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>)
+declare <16 x i32> @llvm.smax.v16i32(<16 x i32>, <16 x i32>)
+
+declare i16 @llvm.smax.i16(i16, i16)
+declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>)
+declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>)
+declare <32 x i16> @llvm.smax.v32i16(<32 x i16>, <32 x i16>)
+
+declare i8 @llvm.smax.i8(i8, i8)
+declare <16 x i8> @llvm.smax.v16i8(<16 x i8>, <16 x i8>)
+declare <32 x i8> @llvm.smax.v32i8(<32 x i8>, <32 x i8>)
+declare <64 x i8> @llvm.smax.v64i8(<64 x i8>, <64 x i8>)
+
+define i32 @smax(i32 %arg) {
+; SSE2-LABEL: 'smax'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32>
undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'smax' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x 
i8> undef, <64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'smax' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'smax' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX-NEXT: Cost 
Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'smax' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) +; 
AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef) + %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) + %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) + %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) + + %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) + %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) + %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) + %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) + + %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) + %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) + %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) + %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) + + %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) + %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) + %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) + %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) + + ret i32 undef +} + +declare i64 @llvm.smin.i64(i64, i64) +declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>) +declare <4 x i64> @llvm.smin.v4i64(<4 x i64>, <4 x i64>) +declare <8 x i64> @llvm.smin.v8i64(<8 x i64>, <8 x i64>) + +declare i32 @llvm.smin.i32(i32, i32) +declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>) +declare <16 x i32> @llvm.smin.v16i32(<16 x i32>, <16 x i32>) + +declare i16 @llvm.smin.i16(i16, i16) +declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>) +declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>) +declare <32 x i16> @llvm.smin.v32i16(<32 x i16>, <32 x i16>) + +declare i8 @llvm.smin.i8(i8, i8) +declare <16 x i8> @llvm.smin.v16i8(<16 x i8>, <16 x i8>) +declare <32 x i8> @llvm.smin.v32i8(<32 x i8>, <32 x i8>) +declare <64 x i8> @llvm.smin.v64i8(<64 x i8>, <64 x i8>) + +define i32 @smin(i32 %arg) { +; SSE2-LABEL: 'smin' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'smin' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: ret i32 undef +; +; SSE42-LABEL: 'smin' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'smin' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> 
@llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'smin' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = 
call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) + %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) + %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) + %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) + + %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) + %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) + %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) + %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) + + %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) + %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) + %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) + %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) + + %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) + %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) + %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) + %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) + + ret i32 undef +} diff --git a/llvm/test/Analysis/CostModel/X86/arith-uminmax.ll b/llvm/test/Analysis/CostModel/X86/arith-uminmax.ll new file mode 100644 index 0000000000000..74aaa3b03a49e --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/arith-uminmax.ll @@ -0,0 +1,287 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+ssse3 | FileCheck %s -check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse4.2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx | FileCheck %s -check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx2 | FileCheck %s -check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512vl,+avx512dq | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512DQ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512vl,+avx512bw | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512BW + +declare i64 @llvm.umax.i64(i64, i64) +declare <2 x i64> @llvm.umax.v2i64(<2 x i64>, <2 x i64>) +declare <4 x i64> @llvm.umax.v4i64(<4 x i64>, <4 x i64>) +declare <8 x i64> @llvm.umax.v8i64(<8 x i64>, <8 x i64>) + +declare i32 @llvm.umax.i32(i32, i32) +declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>) +declare <16 x i32> @llvm.umax.v16i32(<16 x i32>, <16 x i32>) + +declare i16 @llvm.umax.i16(i16, i16) +declare <8 x i16> @llvm.umax.v8i16(<8 x i16>, <8 x i16>) +declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>) +declare <32 x i16> @llvm.umax.v32i16(<32 x i16>, <32 x i16>) + +declare i8 @llvm.umax.i8(i8, i8) +declare <16 x i8> @llvm.umax.v16i8(<16 x i8>, <16 x i8>) +declare <32 x i8> 
@llvm.umax.v32i8(<32 x i8>, <32 x i8>) +declare <64 x i8> @llvm.umax.v64i8(<64 x i8>, <64 x i8>) + +define i32 @umax(i32 %arg) { +; SSE2-LABEL: 'umax' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'umax' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) +; 
SSSE3-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'umax' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x 
i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'umax' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'umax' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost 
of 17 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) + %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) + %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) + %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) + + %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) + %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) + %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) + %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) + + %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) + %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) + %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) + %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) + + %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) + %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) + %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) + %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) + + ret i32 undef +} + +declare i64 @llvm.umin.i64(i64, i64) +declare <2 x i64> @llvm.umin.v2i64(<2 x i64>, <2 x i64>) +declare <4 x i64> @llvm.umin.v4i64(<4 x i64>, <4 x i64>) +declare <8 x i64> @llvm.umin.v8i64(<8 x i64>, <8 x i64>) + +declare i32 @llvm.umin.i32(i32, i32) +declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>) +declare <16 x i32> @llvm.umin.v16i32(<16 x i32>, <16 x i32>) + +declare i16 @llvm.umin.i16(i16, i16) +declare <8 x i16> @llvm.umin.v8i16(<8 x i16>, <8 x i16>) +declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>) +declare <32 x i16> @llvm.umin.v32i16(<32 x i16>, <32 x i16>) + +declare i8 @llvm.umin.i8(i8, i8) +declare <16 x i8> @llvm.umin.v16i8(<16 x i8>, <16 x i8>) +declare <32 x i8> @llvm.umin.v32i8(<32 x i8>, <32 x i8>) +declare <64 x i8> @llvm.umin.v64i8(<64 x i8>, <64 x i8>) + 
+define i32 @umin(i32 %arg) { +; SSE2-LABEL: 'umin' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'umin' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> 
@llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'umin' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = 
call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'umin' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'umin' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> 
undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) + %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) + %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) + %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) + + %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) + %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) + %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) + %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) + + %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) + %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) + %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) + %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) + + %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) + %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) + %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) + %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) + + ret i32 undef +} From 6f00f3b56e5a13286142facd929be15ab7b17aa3 Mon Sep 17 00:00:00 2001 From: Kostya Kortchinsky Date: Mon, 27 Jul 2020 09:13:42 -0700 Subject: [PATCH 0346/1035] [scudo][standalone] mallopt runtime configuration options Summary: Partners have requested the ability to configure more parts of Scudo at runtime, notably the Secondary cache options (maximum number of blocks cached, maximum size) as well as the TSD registry options (the maximum number of TSDs in use). This CL adds a few more Scudo-specific `mallopt` parameters that are passed down to the various subcomponents of the Combined allocator. - `M_CACHE_COUNT_MAX`: sets the maximum number of Secondary cached items - `M_CACHE_SIZE_MAX`: sets the maximum size of a cacheable item in the Secondary - `M_TSDS_COUNT_MAX`: sets the maximum number of TSDs that can be used (Shared Registry only) Regarding the TSDs maximum count, this is a one-way option, only allowing the count to be increased.
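As a usage sketch (illustrative only, not part of the patch): in a process whose allocator is Scudo, these limits would be driven through the standard `mallopt` interface, using the parameter constants this change adds to scudo/interface.h; the values below are arbitrary.

  #include <malloc.h>            // mallopt()
  #include "scudo/interface.h"   // M_CACHE_COUNT_MAX, M_CACHE_SIZE_MAX, M_TSDS_COUNT_MAX

  int main(void) {
    mallopt(M_CACHE_COUNT_MAX, 16);     // keep at most 16 cached Secondary blocks
    mallopt(M_CACHE_SIZE_MAX, 1 << 20); // only cache blocks up to 1 MiB
    mallopt(M_TSDS_COUNT_MAX, 4);       // raise the usable TSD count (one-way)
    return 0;
  }

A request that cannot be honored (e.g. a cache count larger than the backing array) is expected to report failure through mallopt's return value.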
In order to allow for this, I rearranged the code to add a `setOption` member function to the relevant classes, using the `scudo::Option` class enum to determine what is to be set. This also fixes an issue where a static variable (`Ready`) was used in templated functions without being set back to `false` every time. Reviewers: pcc, eugenis, hctim, cferris Subscribers: jfb, llvm-commits, #sanitizers Tags: #sanitizers Differential Revision: https://reviews.llvm.org/D84667 --- .../lib/scudo/standalone/allocator_config.h | 11 +- compiler-rt/lib/scudo/standalone/combined.h | 46 ++++---- compiler-rt/lib/scudo/standalone/common.h | 8 ++ .../standalone/include/scudo/interface.h | 12 ++ compiler-rt/lib/scudo/standalone/primary32.h | 24 ++-- compiler-rt/lib/scudo/standalone/primary64.h | 24 ++-- compiler-rt/lib/scudo/standalone/secondary.h | 89 +++++++++------ .../scudo/standalone/tests/combined_test.cpp | 5 +- .../scudo/standalone/tests/primary_test.cpp | 3 +- .../scudo/standalone/tests/secondary_test.cpp | 40 +++++-- .../lib/scudo/standalone/tests/tsd_test.cpp | 78 ++++++++++++- .../standalone/tests/wrappers_c_test.cpp | 1 + .../standalone/tests/wrappers_cpp_test.cpp | 3 +- .../lib/scudo/standalone/tsd_exclusive.h | 6 + compiler-rt/lib/scudo/standalone/tsd_shared.h | 105 ++++++++++++------ 15 files changed, 318 insertions(+), 137 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.h b/compiler-rt/lib/scudo/standalone/allocator_config.h index ad2a17ef7014a..cf362da4e5be5 100644 --- a/compiler-rt/lib/scudo/standalone/allocator_config.h +++ b/compiler-rt/lib/scudo/standalone/allocator_config.h @@ -48,9 +48,10 @@ struct AndroidConfig { typedef SizeClassAllocator32 Primary; #endif // Cache blocks up to 2MB - typedef MapAllocator> Secondary; + typedef MapAllocator> + Secondary; template - using TSDRegistryT = TSDRegistrySharedT; // Shared, max 2 TSDs. + using TSDRegistryT = TSDRegistrySharedT; // Shared, max 8 TSDs. }; struct AndroidSvelteConfig { @@ -62,9 +63,9 @@ struct AndroidSvelteConfig { // 64KB regions typedef SizeClassAllocator32 Primary; #endif - typedef MapAllocator> Secondary; + typedef MapAllocator> Secondary; template - using TSDRegistryT = TSDRegistrySharedT; // Shared, only 1 TSD. + using TSDRegistryT = TSDRegistrySharedT; // Shared, max 2 TSDs. }; #if SCUDO_CAN_USE_PRIMARY64 @@ -73,7 +74,7 @@ struct FuchsiaConfig { typedef SizeClassAllocator64 Primary; typedef MapAllocator Secondary; template - using TSDRegistryT = TSDRegistrySharedT; // Shared, max 8 TSDs. + using TSDRegistryT = TSDRegistrySharedT; // Shared, max 8 TSDs. }; #endif diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index 582178ee36b40..6ca00c29ab732 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -41,8 +41,6 @@ extern "C" size_t android_unsafe_frame_pointer_chase(scudo::uptr *buf, namespace scudo { -enum class Option { ReleaseInterval, MemtagTuning }; - template class Allocator { public: @@ -277,7 +275,7 @@ class Allocator { } #endif // GWP_ASAN_HOOKS - FillContentsMode FillContents = 
ZeroFill : Options.FillContents; if (UNLIKELY(Alignment > MaxAlignment)) { @@ -285,7 +283,7 @@ class Allocator { return nullptr; reportAlignmentTooBig(Alignment, MaxAlignment); } - if (Alignment < MinAlignment) + if (UNLIKELY(Alignment < MinAlignment)) Alignment = MinAlignment; // If the requested size happens to be 0 (more common than you might think), @@ -322,13 +320,11 @@ class Allocator { if (UNLIKELY(!Block)) { while (ClassId < SizeClassMap::LargestClassId) { Block = TSD->Cache.allocate(++ClassId); - if (LIKELY(Block)) { + if (LIKELY(Block)) break; - } } - if (UNLIKELY(!Block)) { + if (UNLIKELY(!Block)) ClassId = 0; - } } if (UnlockRequired) TSD->unlock(); @@ -349,7 +345,7 @@ class Allocator { void *Ptr = reinterpret_cast(UserPtr); void *TaggedPtr = Ptr; - if (ClassId) { + if (LIKELY(ClassId)) { // We only need to zero or tag the contents for Primary backed // allocations. We only set tags for primary allocations in order to avoid // faulting potentially large numbers of pages for large secondary @@ -692,11 +688,7 @@ class Allocator { } bool setOption(Option O, sptr Value) { - if (O == Option::ReleaseInterval) { - Primary.setReleaseToOsIntervalMs(static_cast(Value)); - Secondary.setReleaseToOsIntervalMs(static_cast(Value)); - return true; - } + initThreadMaybe(); if (O == Option::MemtagTuning) { // Enabling odd/even tags involves a tradeoff between use-after-free // detection and buffer overflow detection. Odd/even tags make it more @@ -705,14 +697,19 @@ class Allocator { // use-after-free is less likely to be detected because the tag space for // any particular chunk is cut in half. Therefore we use this tuning // setting to control whether odd/even tags are enabled. - if (Value == M_MEMTAG_TUNING_BUFFER_OVERFLOW) { + if (Value == M_MEMTAG_TUNING_BUFFER_OVERFLOW) Options.UseOddEvenTags = true; - return true; - } - if (Value == M_MEMTAG_TUNING_UAF) { + else if (Value == M_MEMTAG_TUNING_UAF) Options.UseOddEvenTags = false; - return true; - } + return true; + } else { + // We leave it to the various sub-components to decide whether or not they + // want to handle the option, but we do not want to short-circuit + // execution if one of the setOption was to return false. 
+ const bool PrimaryResult = Primary.setOption(O, Value); + const bool SecondaryResult = Secondary.setOption(O, Value); + const bool RegistryResult = TSDRegistry.setOption(O, Value); + return PrimaryResult && SecondaryResult && RegistryResult; } return false; } @@ -805,8 +802,7 @@ class Allocator { PrimaryT::findNearestBlock(RegionInfoPtr, UntaggedFaultAddr); auto GetGranule = [&](uptr Addr, const char **Data, uint8_t *Tag) -> bool { - if (Addr < MemoryAddr || - Addr + archMemoryTagGranuleSize() < Addr || + if (Addr < MemoryAddr || Addr + archMemoryTagGranuleSize() < Addr || Addr + archMemoryTagGranuleSize() > MemoryAddr + MemorySize) return false; *Data = &Memory[Addr - MemoryAddr]; @@ -950,10 +946,10 @@ class Allocator { u32 Cookie; struct { - u8 MayReturnNull : 1; // may_return_null + u8 MayReturnNull : 1; // may_return_null FillContentsMode FillContents : 2; // zero_contents, pattern_fill_contents - u8 DeallocTypeMismatch : 1; // dealloc_type_mismatch - u8 DeleteSizeMismatch : 1; // delete_size_mismatch + u8 DeallocTypeMismatch : 1; // dealloc_type_mismatch + u8 DeleteSizeMismatch : 1; // delete_size_mismatch u8 TrackAllocationStacks : 1; u8 UseOddEvenTags : 1; u32 QuarantineMaxChunkSize; // quarantine_max_chunk_size diff --git a/compiler-rt/lib/scudo/standalone/common.h b/compiler-rt/lib/scudo/standalone/common.h index 9037f92b4976c..b3bce6ee291aa 100644 --- a/compiler-rt/lib/scudo/standalone/common.h +++ b/compiler-rt/lib/scudo/standalone/common.h @@ -182,6 +182,14 @@ struct BlockInfo { uptr RegionEnd; }; +enum class Option : u8 { + ReleaseInterval, // Release to OS interval in milliseconds. + MemtagTuning, // Whether to tune tagging for UAF or overflow. + MaxCacheEntriesCount, // Maximum number of blocks that can be cached. + MaxCacheEntrySize, // Maximum size of a block that can be cached. + MaxTSDsCount, // Number of usable TSDs for the shared registry. +}; + constexpr unsigned char PatternFillByte = 0xAB; enum FillContentsMode { diff --git a/compiler-rt/lib/scudo/standalone/include/scudo/interface.h b/compiler-rt/lib/scudo/standalone/include/scudo/interface.h index 27c1684596c44..7e65b68ab36d0 100644 --- a/compiler-rt/lib/scudo/standalone/include/scudo/interface.h +++ b/compiler-rt/lib/scudo/standalone/include/scudo/interface.h @@ -121,6 +121,18 @@ size_t __scudo_get_region_info_size(); #define M_MEMTAG_TUNING -102 #endif +#ifndef M_CACHE_COUNT_MAX +#define M_CACHE_COUNT_MAX -200 +#endif + +#ifndef M_CACHE_SIZE_MAX +#define M_CACHE_SIZE_MAX -201 +#endif + +#ifndef M_TSDS_COUNT_MAX +#define M_TSDS_COUNT_MAX -202 +#endif + enum scudo_memtag_tuning { // Tune for buffer overflows. 
M_MEMTAG_TUNING_BUFFER_OVERFLOW, diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h index 321cf92fae30e..e41b949d8d0e6 100644 --- a/compiler-rt/lib/scudo/standalone/primary32.h +++ b/compiler-rt/lib/scudo/standalone/primary32.h @@ -86,7 +86,7 @@ class SizeClassAllocator32 { if (Sci->CanRelease) Sci->ReleaseInfo.LastReleaseAtNs = Time; } - setReleaseToOsIntervalMs(ReleaseToOsInterval); + setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval)); } void init(s32 ReleaseToOsInterval) { memset(this, 0, sizeof(*this)); @@ -184,13 +184,16 @@ class SizeClassAllocator32 { getStats(Str, I, 0); } - void setReleaseToOsIntervalMs(s32 Interval) { - if (Interval >= MaxReleaseToOsIntervalMs) { - Interval = MaxReleaseToOsIntervalMs; - } else if (Interval <= MinReleaseToOsIntervalMs) { - Interval = MinReleaseToOsIntervalMs; + bool setOption(Option O, sptr Value) { + if (O == Option::ReleaseInterval) { + const s32 Interval = + Max(Min(static_cast<s32>(Value), MaxReleaseToOsIntervalMs), + MinReleaseToOsIntervalMs); + atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + return true; } - atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + // Not supported by the Primary, but not an error either. + return true; } uptr releaseToOS() { @@ -423,10 +426,6 @@ class SizeClassAllocator32 { AvailableChunks, Rss >> 10, Sci->ReleaseInfo.RangesReleased); } - s32 getReleaseToOsIntervalMs() { - return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); - } - NOINLINE uptr releaseToOSMaybe(SizeClassInfo *Sci, uptr ClassId, bool Force = false) { const uptr BlockSize = getSizeByClassId(ClassId); @@ -457,7 +456,8 @@ class SizeClassAllocator32 { } if (!Force) { - const s32 IntervalMs = getReleaseToOsIntervalMs(); + const s32 IntervalMs = + atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); if (IntervalMs < 0) return 0; if (Sci->ReleaseInfo.LastReleaseAtNs + diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index e37dc4951f238..ad92ae250e1f4 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -91,7 +91,7 @@ class SizeClassAllocator64 { if (Region->CanRelease) Region->ReleaseInfo.LastReleaseAtNs = Time; } - setReleaseToOsIntervalMs(ReleaseToOsInterval); + setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval)); if (SupportsMemoryTagging) UseMemoryTagging = systemSupportsMemoryTagging(); @@ -185,13 +185,16 @@ class SizeClassAllocator64 { getStats(Str, I, 0); } - void setReleaseToOsIntervalMs(s32 Interval) { - if (Interval >= MaxReleaseToOsIntervalMs) { - Interval = MaxReleaseToOsIntervalMs; - } else if (Interval <= MinReleaseToOsIntervalMs) { - Interval = MinReleaseToOsIntervalMs; + bool setOption(Option O, sptr Value) { + if (O == Option::ReleaseInterval) { + const s32 Interval = + Max(Min(static_cast<s32>(Value), MaxReleaseToOsIntervalMs), + MinReleaseToOsIntervalMs); + atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + return true; } - atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + // Not supported by the Primary, but not an error either. 
+ return true; } uptr releaseToOS() { @@ -435,10 +438,6 @@ class SizeClassAllocator64 { getRegionBaseByClassId(ClassId)); } - s32 getReleaseToOsIntervalMs() { - return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); - } - NOINLINE uptr releaseToOSMaybe(RegionInfo *Region, uptr ClassId, bool Force = false) { const uptr BlockSize = getSizeByClassId(ClassId); @@ -469,7 +468,8 @@ class SizeClassAllocator64 { } if (!Force) { - const s32 IntervalMs = getReleaseToOsIntervalMs(); + const s32 IntervalMs = + atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); if (IntervalMs < 0) return 0; if (Region->ReleaseInfo.LastReleaseAtNs + diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h index 84eaa5091b434..b5bb53ddcf2d9 100644 --- a/compiler-rt/lib/scudo/standalone/secondary.h +++ b/compiler-rt/lib/scudo/standalone/secondary.h @@ -56,14 +56,21 @@ class MapAllocatorNoCache { return false; } bool store(UNUSED LargeBlock::Header *H) { return false; } - static bool canCache(UNUSED uptr Size) { return false; } + bool canCache(UNUSED uptr Size) { return false; } void disable() {} void enable() {} void releaseToOS() {} - void setReleaseToOsIntervalMs(UNUSED s32 Interval) {} + bool setOption(Option O, UNUSED sptr Value) { + if (O == Option::ReleaseInterval || O == Option::MaxCacheEntriesCount || + O == Option::MaxCacheEntrySize) + return false; + // Not supported by the Secondary Cache, but not an error either. + return true; + } }; -template class MapAllocatorCache { @@ -71,10 +78,17 @@ class MapAllocatorCache { // Fuchsia doesn't allow releasing Secondary blocks yet. Note that 0 length // arrays are an extension for some compilers. // FIXME(kostyak): support (partially) the cache on Fuchsia. - static_assert(!SCUDO_FUCHSIA || MaxEntriesCount == 0U, ""); + static_assert(!SCUDO_FUCHSIA || EntriesArraySize == 0U, ""); + + // Ensure the default maximum specified fits the array. 
+ static_assert(DefaultMaxEntriesCount <= EntriesArraySize, ""); void initLinkerInitialized(s32 ReleaseToOsInterval) { - setReleaseToOsIntervalMs(ReleaseToOsInterval); + setOption(Option::MaxCacheEntriesCount, + static_cast<sptr>(DefaultMaxEntriesCount)); + setOption(Option::MaxCacheEntrySize, + static_cast<sptr>(DefaultMaxEntrySize)); + setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval)); } void init(s32 ReleaseToOsInterval) { memset(this, 0, sizeof(*this)); @@ -85,13 +99,14 @@ class MapAllocatorCache { bool EntryCached = false; bool EmptyCache = false; const u64 Time = getMonotonicTime(); + const u32 MaxCount = atomic_load(&MaxEntriesCount, memory_order_relaxed); { ScopedLock L(Mutex); - if (EntriesCount == MaxEntriesCount) { + if (EntriesCount >= MaxCount) { if (IsFullEvents++ == 4U) EmptyCache = true; } else { - for (uptr I = 0; I < MaxEntriesCount; I++) { + for (u32 I = 0; I < MaxCount; I++) { if (Entries[I].Block) continue; if (I != 0) @@ -111,17 +126,19 @@ class MapAllocatorCache { s32 Interval; if (EmptyCache) empty(); - else if ((Interval = getReleaseToOsIntervalMs()) >= 0) + else if ((Interval = atomic_load(&ReleaseToOsIntervalMs, + memory_order_relaxed)) >= 0) releaseOlderThan(Time - static_cast<u64>(Interval) * 1000000); return EntryCached; } bool retrieve(uptr Size, LargeBlock::Header **H) { const uptr PageSize = getPageSizeCached(); + const u32 MaxCount = atomic_load(&MaxEntriesCount, memory_order_relaxed); ScopedLock L(Mutex); if (EntriesCount == 0) return false; - for (uptr I = 0; I < MaxEntriesCount; I++) { + for (u32 I = 0; I < MaxCount; I++) { if (!Entries[I].Block) continue; const uptr BlockSize = Entries[I].BlockEnd - Entries[I].Block; @@ -141,17 +158,31 @@ class MapAllocatorCache { return false; } - static bool canCache(uptr Size) { - return MaxEntriesCount != 0U && Size <= MaxEntrySize; + bool canCache(uptr Size) { + return atomic_load(&MaxEntriesCount, memory_order_relaxed) != 0U && + Size <= atomic_load(&MaxEntrySize, memory_order_relaxed); } - void setReleaseToOsIntervalMs(s32 Interval) { - if (Interval >= MaxReleaseToOsIntervalMs) { - Interval = MaxReleaseToOsIntervalMs; - } else if (Interval <= MinReleaseToOsIntervalMs) { - Interval = MinReleaseToOsIntervalMs; + bool setOption(Option O, sptr Value) { + if (O == Option::ReleaseInterval) { + const s32 Interval = + Max(Min(static_cast<s32>(Value), MaxReleaseToOsIntervalMs), + MinReleaseToOsIntervalMs); + atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + return true; + } else if (O == Option::MaxCacheEntriesCount) { + const u32 MaxCount = static_cast<u32>(Value); + if (MaxCount > EntriesArraySize) + return false; + atomic_store(&MaxEntriesCount, MaxCount, memory_order_relaxed); + return true; + } else if (O == Option::MaxCacheEntrySize) { + atomic_store(&MaxEntrySize, static_cast<uptr>(Value), + memory_order_relaxed); + return true; } - atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + // Not supported by the Secondary Cache, but not an error either. 
+ return true; } void releaseToOS() { releaseOlderThan(UINT64_MAX); } @@ -166,11 +197,11 @@ class MapAllocatorCache { void *MapBase; uptr MapSize; MapPlatformData Data; - } MapInfo[MaxEntriesCount]; + } MapInfo[EntriesArraySize]; uptr N = 0; { ScopedLock L(Mutex); - for (uptr I = 0; I < MaxEntriesCount; I++) { + for (uptr I = 0; I < EntriesArraySize; I++) { if (!Entries[I].Block) continue; MapInfo[N].MapBase = reinterpret_cast(Entries[I].MapBase); @@ -191,7 +222,7 @@ class MapAllocatorCache { ScopedLock L(Mutex); if (!EntriesCount) return; - for (uptr I = 0; I < MaxEntriesCount; I++) { + for (uptr I = 0; I < EntriesArraySize; I++) { if (!Entries[I].Block || !Entries[I].Time || Entries[I].Time > Time) continue; releasePagesToOS(Entries[I].Block, 0, @@ -201,10 +232,6 @@ class MapAllocatorCache { } } - s32 getReleaseToOsIntervalMs() { - return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); - } - struct CachedBlock { uptr Block; uptr BlockEnd; @@ -215,8 +242,10 @@ class MapAllocatorCache { }; HybridMutex Mutex; - CachedBlock Entries[MaxEntriesCount]; + CachedBlock Entries[EntriesArraySize]; u32 EntriesCount; + atomic_u32 MaxEntriesCount; + atomic_uptr MaxEntrySize; uptr LargestSize; u32 IsFullEvents; atomic_s32 ReleaseToOsIntervalMs; @@ -265,11 +294,9 @@ template class MapAllocator { Callback(reinterpret_cast(&H) + LargeBlock::getHeaderSize()); } - static uptr canCache(uptr Size) { return CacheT::canCache(Size); } + uptr canCache(uptr Size) { return Cache.canCache(Size); } - void setReleaseToOsIntervalMs(s32 Interval) { - Cache.setReleaseToOsIntervalMs(Interval); - } + bool setOption(Option O, sptr Value) { return Cache.setOption(O, Value); } void releaseToOS() { Cache.releaseToOS(); } @@ -306,7 +333,7 @@ void *MapAllocator::allocate(uptr Size, uptr AlignmentHint, const uptr RoundedSize = roundUpTo(Size + LargeBlock::getHeaderSize(), PageSize); - if (AlignmentHint < PageSize && CacheT::canCache(RoundedSize)) { + if (AlignmentHint < PageSize && Cache.canCache(RoundedSize)) { LargeBlock::Header *H; if (Cache.retrieve(RoundedSize, &H)) { if (BlockEnd) @@ -400,7 +427,7 @@ template void MapAllocator::deallocate(void *Ptr) { Stats.sub(StatAllocated, CommitSize); Stats.sub(StatMapped, H->MapSize); } - if (CacheT::canCache(CommitSize) && Cache.store(H)) + if (Cache.canCache(CommitSize) && Cache.store(H)) return; void *Addr = reinterpret_cast(H->MapBase); const uptr Size = H->MapSize; diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp index 6cefe18b8f15c..9689c4265e06c 100644 --- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp @@ -19,7 +19,7 @@ static std::mutex Mutex; static std::condition_variable Cv; -static bool Ready = false; +static bool Ready; static constexpr scudo::Chunk::Origin Origin = scudo::Chunk::Origin::Malloc; @@ -351,6 +351,7 @@ template static void stressAllocator(AllocatorT *A) { } template static void testAllocatorThreaded() { + Ready = false; using AllocatorT = TestAllocator; auto Allocator = std::unique_ptr(new AllocatorT()); std::thread Threads[32]; @@ -394,7 +395,7 @@ struct DeathConfig { typedef scudo::SizeClassAllocator64 Primary; typedef scudo::MapAllocator Secondary; - template using TSDRegistryT = scudo::TSDRegistrySharedT; + template using TSDRegistryT = scudo::TSDRegistrySharedT; }; TEST(ScudoCombinedTest, DeathCombined) { diff --git a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp 
b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp index 010bf84490e0e..a7a2b3160611e 100644 --- a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp @@ -149,7 +149,7 @@ TEST(ScudoPrimaryTest, PrimaryIterate) { static std::mutex Mutex; static std::condition_variable Cv; -static bool Ready = false; +static bool Ready; template static void performAllocations(Primary *Allocator) { static THREADLOCAL typename Primary::CacheT Cache; @@ -176,6 +176,7 @@ template static void performAllocations(Primary *Allocator) { } template static void testPrimaryThreaded() { + Ready = false; auto Deleter = [](Primary *P) { P->unmapTestOnly(); delete P; diff --git a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp index d2260b9c15b07..29efdb3060128 100644 --- a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp @@ -21,7 +21,7 @@ template static void testSecondaryBasic(void) { scudo::GlobalStats S; S.init(); - SecondaryT *L = new SecondaryT; + std::unique_ptr L(new SecondaryT); L->init(&S); const scudo::uptr Size = 1U << 16; void *P = L->allocate(Size); @@ -30,7 +30,7 @@ template static void testSecondaryBasic(void) { EXPECT_GE(SecondaryT::getBlockSize(P), Size); L->deallocate(P); // If the Secondary can't cache that pointer, it will be unmapped. - if (!SecondaryT::canCache(Size)) + if (!L->canCache(Size)) EXPECT_DEATH(memset(P, 'A', Size), ""); const scudo::uptr Align = 1U << 16; @@ -59,7 +59,7 @@ TEST(ScudoSecondaryTest, SecondaryBasic) { #if !SCUDO_FUCHSIA testSecondaryBasic>>(); testSecondaryBasic< - scudo::MapAllocator>>(); + scudo::MapAllocator>>(); #endif } @@ -75,7 +75,7 @@ using LargeAllocator = scudo::MapAllocator>; TEST(ScudoSecondaryTest, SecondaryCombinations) { constexpr scudo::uptr MinAlign = FIRST_32_SECOND_64(8, 16); constexpr scudo::uptr HeaderSize = scudo::roundUpTo(8, MinAlign); - LargeAllocator *L = new LargeAllocator; + std::unique_ptr L(new LargeAllocator); L->init(nullptr); for (scudo::uptr SizeLog = 0; SizeLog <= 20; SizeLog++) { for (scudo::uptr AlignLog = FIRST_32_SECOND_64(3, 4); AlignLog <= 16; @@ -103,7 +103,7 @@ TEST(ScudoSecondaryTest, SecondaryCombinations) { } TEST(ScudoSecondaryTest, SecondaryIterate) { - LargeAllocator *L = new LargeAllocator; + std::unique_ptr L(new LargeAllocator); L->init(nullptr); std::vector V; const scudo::uptr PageSize = scudo::getPageSizeCached(); @@ -125,9 +125,32 @@ TEST(ScudoSecondaryTest, SecondaryIterate) { Str.output(); } +TEST(ScudoSecondaryTest, SecondaryOptions) { + std::unique_ptr L(new LargeAllocator); + L->init(nullptr); + // Attempt to set a maximum number of entries higher than the array size. + EXPECT_FALSE(L->setOption(scudo::Option::MaxCacheEntriesCount, 4096U)); + // A negative number will be cast to a scudo::u32, and fail. + EXPECT_FALSE(L->setOption(scudo::Option::MaxCacheEntriesCount, -1)); + if (L->canCache(0U)) { + // Various valid combinations. 
+ EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntriesCount, 4U)); + EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 20)); + EXPECT_TRUE(L->canCache(1UL << 18)); + EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 17)); + EXPECT_FALSE(L->canCache(1UL << 18)); + EXPECT_TRUE(L->canCache(1UL << 16)); + EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntriesCount, 0U)); + EXPECT_FALSE(L->canCache(1UL << 16)); + EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntriesCount, 4U)); + EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 20)); + EXPECT_TRUE(L->canCache(1UL << 16)); + } +} + static std::mutex Mutex; static std::condition_variable Cv; -static bool Ready = false; +static bool Ready; static void performAllocations(LargeAllocator *L) { std::vector V; @@ -153,11 +176,12 @@ static void performAllocations(LargeAllocator *L) { } TEST(ScudoSecondaryTest, SecondaryThreadsRace) { - LargeAllocator *L = new LargeAllocator; + Ready = false; + std::unique_ptr L(new LargeAllocator); L->init(nullptr, /*ReleaseToOsInterval=*/0); std::thread Threads[16]; for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++) - Threads[I] = std::thread(performAllocations, L); + Threads[I] = std::thread(performAllocations, L.get()); { std::unique_lock Lock(Mutex); Ready = true; diff --git a/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp b/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp index 4a3cf1cd0fc31..561bda47e24c8 100644 --- a/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp @@ -13,6 +13,7 @@ #include #include +#include #include // We mock out an allocator with a TSD registry, mostly using empty stubs. The @@ -47,12 +48,12 @@ template class MockAllocator { struct OneCache { template - using TSDRegistryT = scudo::TSDRegistrySharedT; + using TSDRegistryT = scudo::TSDRegistrySharedT; }; struct SharedCaches { template - using TSDRegistryT = scudo::TSDRegistrySharedT; + using TSDRegistryT = scudo::TSDRegistrySharedT; }; struct ExclusiveCaches { @@ -116,7 +117,7 @@ TEST(ScudoTSDTest, TSDRegistryBasic) { static std::mutex Mutex; static std::condition_variable Cv; -static bool Ready = false; +static bool Ready; template static void stressCache(AllocatorT *Allocator) { auto Registry = Allocator->getTSDRegistry(); @@ -145,6 +146,7 @@ template static void stressCache(AllocatorT *Allocator) { } template static void testRegistryThreaded() { + Ready = false; auto Deleter = [](AllocatorT *A) { A->unmapTestOnly(); delete A; @@ -171,3 +173,73 @@ TEST(ScudoTSDTest, TSDRegistryThreaded) { testRegistryThreaded>(); #endif } + +static std::set Pointers; + +static void stressSharedRegistry(MockAllocator *Allocator) { + std::set Set; + auto Registry = Allocator->getTSDRegistry(); + { + std::unique_lock Lock(Mutex); + while (!Ready) + Cv.wait(Lock); + } + Registry->initThreadMaybe(Allocator, /*MinimalInit=*/false); + bool UnlockRequired; + for (scudo::uptr I = 0; I < 4096U; I++) { + auto TSD = Registry->getTSDAndLock(&UnlockRequired); + EXPECT_NE(TSD, nullptr); + Set.insert(reinterpret_cast(TSD)); + if (UnlockRequired) + TSD->unlock(); + } + { + std::unique_lock Lock(Mutex); + Pointers.insert(Set.begin(), Set.end()); + } +} + +TEST(ScudoTSDTest, TSDRegistryTSDsCount) { + Ready = false; + using AllocatorT = MockAllocator; + auto Deleter = [](AllocatorT *A) { + A->unmapTestOnly(); + delete A; + }; + std::unique_ptr Allocator(new AllocatorT, + Deleter); + Allocator->reset(); + // We attempt to use as many TSDs as the shared 
cache offers by creating a + // decent amount of threads that will be run concurrently and attempt to get + // and lock TSDs. We put them all in a set and count the number of entries + // after we are done. + std::thread Threads[32]; + for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++) + Threads[I] = std::thread(stressSharedRegistry, Allocator.get()); + { + std::unique_lock Lock(Mutex); + Ready = true; + Cv.notify_all(); + } + for (auto &T : Threads) + T.join(); + // The initial number of TSDs we get will be the minimum of the default count + // and the number of CPUs. + EXPECT_LE(Pointers.size(), 8U); + Pointers.clear(); + auto Registry = Allocator->getTSDRegistry(); + // Increase the number of TSDs to 16. + Registry->setOption(scudo::Option::MaxTSDsCount, 16); + Ready = false; + for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++) + Threads[I] = std::thread(stressSharedRegistry, Allocator.get()); + { + std::unique_lock Lock(Mutex); + Ready = true; + Cv.notify_all(); + } + for (auto &T : Threads) + T.join(); + // We should get 16 distinct TSDs back. + EXPECT_EQ(Pointers.size(), 16U); +} diff --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp index 4b851a7af181f..ed5b64c203364 100644 --- a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp @@ -389,6 +389,7 @@ static void *enableMalloc(void *Unused) { TEST(ScudoWrappersCTest, DisableForkEnable) { pthread_t ThreadId; + Ready = false; EXPECT_EQ(pthread_create(&ThreadId, nullptr, &enableMalloc, nullptr), 0); // Wait for the thread to be warmed up. diff --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp index 4ccef5bb0deec..d24b6651d95e3 100644 --- a/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp @@ -79,7 +79,7 @@ TEST(ScudoWrappersCppTest, New) { static std::mutex Mutex; static std::condition_variable Cv; -static bool Ready = false; +static bool Ready; static void stressNew() { std::vector V; @@ -103,6 +103,7 @@ static void stressNew() { } TEST(ScudoWrappersCppTest, ThreadedNew) { + Ready = false; std::thread Threads[32]; for (size_t I = 0U; I < sizeof(Threads) / sizeof(Threads[0]); I++) Threads[I] = std::thread(stressNew); diff --git a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h index 3492509b5a8eb..ac5a22c970701 100644 --- a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h +++ b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h @@ -66,6 +66,12 @@ template struct TSDRegistryExT { Mutex.unlock(); } + bool setOption(Option O, UNUSED sptr Value) { + if (O == Option::MaxTSDsCount) + return false; + return true; + } + private: void initOnceMaybe(Allocator *Instance) { ScopedLock L(Mutex); diff --git a/compiler-rt/lib/scudo/standalone/tsd_shared.h b/compiler-rt/lib/scudo/standalone/tsd_shared.h index 038a5905ff489..25ba191826c3f 100644 --- a/compiler-rt/lib/scudo/standalone/tsd_shared.h +++ b/compiler-rt/lib/scudo/standalone/tsd_shared.h @@ -14,31 +14,16 @@ namespace scudo { -template struct TSDRegistrySharedT { +template +struct TSDRegistrySharedT { void initLinkerInitialized(Allocator *Instance) { Instance->initLinkerInitialized(); CHECK_EQ(pthread_key_create(&PThreadKey, nullptr), 0); // For non-TLS - const u32 NumberOfCPUs = getNumberOfCPUs(); - NumberOfTSDs = (SCUDO_ANDROID || 
NumberOfCPUs == 0) -                         ? MaxTSDCount -                         : Min(NumberOfCPUs, MaxTSDCount); -    for (u32 I = 0; I < NumberOfTSDs; I++) +    for (u32 I = 0; I < TSDsArraySize; I++) TSDs[I].initLinkerInitialized(Instance); -    // Compute all the coprimes of NumberOfTSDs. This will be used to walk the -    // array of TSDs in a random order. For details, see: -    // https://lemire.me/blog/2017/09/18/visiting-all-values-in-an-array-exactly-once-in-random-order/ -    for (u32 I = 0; I < NumberOfTSDs; I++) { -      u32 A = I + 1; -      u32 B = NumberOfTSDs; -      // Find the GCD between I + 1 and NumberOfTSDs. If 1, they are coprimes. -      while (B != 0) { -        const u32 T = A; -        A = B; -        B = T % B; -      } -      if (A == 1) -        CoPrimes[NumberOfCoPrimes++] = I + 1; -    } +    const u32 NumberOfCPUs = getNumberOfCPUs(); +    setNumberOfTSDs((NumberOfCPUs == 0) ? DefaultTSDCount +                                        : Min(NumberOfCPUs, DefaultTSDCount)); Initialized = true; } void init(Allocator *Instance) { @@ -66,21 +51,34 @@ template <class Allocator, u32 MaxTSDCount> struct TSDRegistrySharedT { if (TSD->tryLock()) return TSD; // If that fails, go down the slow path. +    if (TSDsArraySize == 1U) { +      // Only 1 TSD, no need to go any further. +      // The compiler will optimize this one way or the other. +      TSD->lock(); +      return TSD; +    } return getTSDAndLockSlow(TSD); } void disable() { Mutex.lock(); -    for (u32 I = 0; I < NumberOfTSDs; I++) +    for (u32 I = 0; I < TSDsArraySize; I++) TSDs[I].lock(); } void enable() { -    for (s32 I = static_cast<s32>(NumberOfTSDs - 1); I >= 0; I--) +    for (s32 I = static_cast<s32>(TSDsArraySize - 1); I >= 0; I--) TSDs[I].unlock(); Mutex.unlock(); } +  bool setOption(Option O, sptr Value) { +    if (O == Option::MaxTSDsCount) +      return setNumberOfTSDs(static_cast<u32>(Value)); +    // Not supported by the TSD Registry, but not an error either. +    return true; +  } + private: ALWAYS_INLINE void setCurrentTSD(TSD<Allocator> *CurrentTSD) { #if _BIONIC @@ -104,6 +102,32 @@ template <class Allocator, u32 MaxTSDCount> struct TSDRegistrySharedT { #endif } +  bool setNumberOfTSDs(u32 N) { +    ScopedLock L(MutexTSDs); +    if (N < NumberOfTSDs) +      return false; +    if (N > TSDsArraySize) +      N = TSDsArraySize; +    NumberOfTSDs = N; +    NumberOfCoPrimes = 0; +    // Compute all the coprimes of NumberOfTSDs. This will be used to walk the +    // array of TSDs in a random order. For details, see: +    // https://lemire.me/blog/2017/09/18/visiting-all-values-in-an-array-exactly-once-in-random-order/ +    for (u32 I = 0; I < N; I++) { +      u32 A = I + 1; +      u32 B = N; +      // Find the GCD between I + 1 and N. If 1, they are coprimes. +      while (B != 0) { +        const u32 T = A; +        A = B; +        B = T % B; +      } +      if (A == 1) +        CoPrimes[NumberOfCoPrimes++] = I + 1; +    } +    return true; +  } + void initOnceMaybe(Allocator *Instance) { ScopedLock L(Mutex); if (LIKELY(Initialized)) @@ -120,17 +144,23 @@ template <class Allocator, u32 MaxTSDCount> struct TSDRegistrySharedT { } NOINLINE TSD<Allocator> *getTSDAndLockSlow(TSD<Allocator> *CurrentTSD) { -    if (MaxTSDCount > 1U && NumberOfTSDs > 1U) { -      // Use the Precedence of the current TSD as our random seed. Since we are -      // in the slow path, it means that tryLock failed, and as a result it's -      // very likely that said Precedence is non-zero. -      const u32 R = static_cast<u32>(CurrentTSD->getPrecedence()); -      const u32 Inc = CoPrimes[R % NumberOfCoPrimes]; -      u32 Index = R % NumberOfTSDs; +    // Use the Precedence of the current TSD as our random seed. Since we are +    // in the slow path, it means that tryLock failed, and as a result it's +    // very likely that said Precedence is non-zero.
+ const u32 R = static_cast(CurrentTSD->getPrecedence()); + u32 N, Inc; + { + ScopedLock L(MutexTSDs); + N = NumberOfTSDs; + DCHECK_NE(NumberOfCoPrimes, 0U); + Inc = CoPrimes[R % NumberOfCoPrimes]; + } + if (N > 1U) { + u32 Index = R % N; uptr LowestPrecedence = UINTPTR_MAX; TSD *CandidateTSD = nullptr; // Go randomly through at most 4 contexts and find a candidate. - for (u32 I = 0; I < Min(4U, NumberOfTSDs); I++) { + for (u32 I = 0; I < Min(4U, N); I++) { if (TSDs[Index].tryLock()) { setCurrentTSD(&TSDs[Index]); return &TSDs[Index]; @@ -142,8 +172,8 @@ template struct TSDRegistrySharedT { LowestPrecedence = Precedence; } Index += Inc; - if (Index >= NumberOfTSDs) - Index -= NumberOfTSDs; + if (Index >= N) + Index -= N; } if (CandidateTSD) { CandidateTSD->lock(); @@ -160,19 +190,20 @@ template struct TSDRegistrySharedT { atomic_u32 CurrentIndex; u32 NumberOfTSDs; u32 NumberOfCoPrimes; - u32 CoPrimes[MaxTSDCount]; + u32 CoPrimes[TSDsArraySize]; bool Initialized; HybridMutex Mutex; - TSD TSDs[MaxTSDCount]; + HybridMutex MutexTSDs; + TSD TSDs[TSDsArraySize]; #if SCUDO_LINUX && !_BIONIC static THREADLOCAL TSD *ThreadTSD; #endif }; #if SCUDO_LINUX && !_BIONIC -template +template THREADLOCAL TSD - *TSDRegistrySharedT::ThreadTSD; + *TSDRegistrySharedT::ThreadTSD; #endif } // namespace scudo From 9cc70e047c2892a318ade3afacab7faffa4f49cc Mon Sep 17 00:00:00 2001 From: Siva Chandra Reddy Date: Tue, 28 Jul 2020 11:43:04 -0700 Subject: [PATCH 0347/1035] [libc][NFC] Zero out padding bits in the uint form of x86 FPBits. --- libc/utils/FPUtil/LongDoubleBitsX86.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/libc/utils/FPUtil/LongDoubleBitsX86.h b/libc/utils/FPUtil/LongDoubleBitsX86.h index 5438e0b2b6edb..4d64490c5a11d 100644 --- a/libc/utils/FPUtil/LongDoubleBitsX86.h +++ b/libc/utils/FPUtil/LongDoubleBitsX86.h @@ -23,10 +23,10 @@ template <> struct MantissaWidth { template struct Padding; // i386 padding. -template <> struct Padding<4> { static constexpr unsigned Value = 16; }; +template <> struct Padding<4> { static constexpr unsigned value = 16; }; // x86_64 padding. -template <> struct Padding<8> { static constexpr unsigned Value = 48; }; +template <> struct Padding<8> { static constexpr unsigned value = 48; }; template <> struct __attribute__((packed)) FPBits { using UIntType = __uint128_t; @@ -38,7 +38,7 @@ template <> struct __attribute__((packed)) FPBits { uint8_t implicitBit : 1; uint16_t exponent : ExponentWidth::value; uint8_t sign : 1; - uint64_t padding : Padding::Value; + uint64_t padding : Padding::value; template ::Value, int> = 0> @@ -91,7 +91,15 @@ template <> struct __attribute__((packed)) FPBits { // zero in case i386. UIntType result = UIntType(0); *reinterpret_cast *>(&result) = *this; - return result; + + // Even though we zero out |result| before copying the long double value, + // there can be garbage bits in the padding. So, we zero the padding bits + // in |result|. 
+    static constexpr UIntType mask = +        (UIntType(1) << (sizeof(long double) * 8 - +                         Padding<sizeof(uintptr_t)>::value)) - +        1; +    return result & mask; } static FPBits<long double> zero() { return FPBits<long double>(0.0l); } From f8ab66538c4c8dc60cd458024639772a164ef9c5 Mon Sep 17 00:00:00 2001 From: jasonliu Date: Tue, 28 Jul 2020 17:56:13 +0000 Subject: [PATCH 0348/1035] [NFC][XCOFF] Use getFunctionEntryPointSymbol from TLOF to simplify logic Reviewed By: Xiangling_L Differential Revision: https://reviews.llvm.org/D84693 --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 87 ++++++++++----------- 1 file changed, 41 insertions(+), 46 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index db3833d595797..29a6d54d3296c 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -74,6 +74,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSectionXCOFF.h" #include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/BranchProbability.h" @@ -5120,50 +5121,38 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG, Subtarget.is32BitELFABI() && !isLocalCallee() && Subtarget.getTargetMachine().getRelocationModel() == Reloc::PIC_; -  // On AIX, direct function calls reference the symbol for the function's -  // entry point, which is named by prepending a "." before the function's -  // C-linkage name. -  const auto getFunctionEntryPointSymbol = [&](StringRef SymName) { -    auto &Context = DAG.getMachineFunction().getMMI().getContext(); -    return cast<MCSymbolXCOFF>( -        Context.getOrCreateSymbol(Twine(".") + Twine(SymName))); +  const auto getAIXFuncEntryPointSymbolSDNode = [&](const GlobalValue *GV) { +    const TargetMachine &TM = Subtarget.getTargetMachine(); +    const TargetLoweringObjectFile *TLOF = TM.getObjFileLowering(); +    MCSymbolXCOFF *S = +        cast<MCSymbolXCOFF>(TLOF->getFunctionEntryPointSymbol(GV, TM)); + +    if (GV->isDeclaration() && !S->hasRepresentedCsectSet()) { +      // On AIX, an undefined symbol needs to be associated with a +      // MCSectionXCOFF to get the correct storage mapping class. +      // In this case, XCOFF::XMC_PR.
- MCSectionXCOFF *Sec = Context.getXCOFFSection( - S->getSymbolTableName(), XCOFF::XMC_PR, XCOFF::XTY_ER, SC, - SectionKind::getMetadata()); - S->setRepresentedCsect(Sec); - } - - MVT PtrVT = - DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); - return DAG.getMCSymbol(S, PtrVT); - }; - if (isFunctionGlobalAddress(Callee)) { - const GlobalAddressSDNode *G = cast(Callee); - const GlobalValue *GV = G->getGlobal(); - - if (!Subtarget.isAIXABI()) - return DAG.getTargetGlobalAddress(GV, dl, Callee.getValueType(), 0, - UsePlt ? PPCII::MO_PLT : 0); + const GlobalValue *GV = cast(Callee)->getGlobal(); - assert(!isa(GV) && "IFunc is not supported on AIX."); - const XCOFF::StorageClass SC = - TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GV); - return getAIXFuncEntryPointSymbolSDNode(GV->getName(), GV->isDeclaration(), - SC); + if (Subtarget.isAIXABI()) { + assert(!isa(GV) && "IFunc is not supported on AIX."); + return getAIXFuncEntryPointSymbolSDNode(GV); + } + return DAG.getTargetGlobalAddress(GV, dl, Callee.getValueType(), 0, + UsePlt ? PPCII::MO_PLT : 0); } if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { @@ -5173,12 +5162,18 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG, // ExternalSymbol's, then we pick up the user-declared version. const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); if (const Function *F = - dyn_cast_or_null(Mod->getNamedValue(SymName))) { - const XCOFF::StorageClass SC = - TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(F); - return getAIXFuncEntryPointSymbolSDNode(F->getName(), - F->isDeclaration(), SC); - } + dyn_cast_or_null(Mod->getNamedValue(SymName))) + return getAIXFuncEntryPointSymbolSDNode(F); + + // On AIX, direct function calls reference the symbol for the function's + // entry point, which is named by prepending a "." before the function's + // C-linkage name. 
+ const auto getFunctionEntryPointSymbol = [&](StringRef SymName) { + auto &Context = DAG.getMachineFunction().getMMI().getContext(); + return cast( + Context.getOrCreateSymbol(Twine(".") + Twine(SymName))); + }; + SymName = getFunctionEntryPointSymbol(SymName)->getName().data(); } return DAG.getTargetExternalSymbol(SymName, Callee.getValueType(), From 496fc3f196bca77d48804ab4bc9343c3d82b97bf Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Tue, 28 Jul 2020 11:13:05 -0400 Subject: [PATCH 0349/1035] [InstSimplify] add tests for icmp with partial undef constant; NFC --- .../Transforms/InstSimplify/icmp-constant.ll | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/llvm/test/Transforms/InstSimplify/icmp-constant.ll b/llvm/test/Transforms/InstSimplify/icmp-constant.ll index 3ebaca7c8aafb..8f51ba3c4c0e5 100644 --- a/llvm/test/Transforms/InstSimplify/icmp-constant.ll +++ b/llvm/test/Transforms/InstSimplify/icmp-constant.ll @@ -19,6 +19,15 @@ define <2 x i1> @tautological_ule_vec(<2 x i8> %x) { ret <2 x i1> %cmp } +define <2 x i1> @tautological_ule_vec_partial_undef(<2 x i8> %x) { +; CHECK-LABEL: @tautological_ule_vec_partial_undef( +; CHECK-NEXT: [[CMP:%.*]] = icmp ule <2 x i8> [[X:%.*]], +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %cmp = icmp ule <2 x i8> %x, + ret <2 x i1> %cmp +} + define i1 @tautological_ugt(i8 %x) { ; CHECK-LABEL: @tautological_ugt( ; CHECK-NEXT: ret i1 false @@ -35,6 +44,15 @@ define <2 x i1> @tautological_ugt_vec(<2 x i8> %x) { ret <2 x i1> %cmp } +define <2 x i1> @tautological_ugt_vec_partial_undef(<2 x i8> %x) { +; CHECK-LABEL: @tautological_ugt_vec_partial_undef( +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt <2 x i8> [[X:%.*]], +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %cmp = icmp ugt <2 x i8> %x, + ret <2 x i1> %cmp +} + ; 'urem x, C2' produces [0, C2) define i1 @urem3(i32 %X) { ; CHECK-LABEL: @urem3( @@ -54,6 +72,17 @@ define <2 x i1> @urem3_vec(<2 x i32> %X) { ret <2 x i1> %B } +define <2 x i1> @urem3_vec_partial_undef(<2 x i32> %X) { +; CHECK-LABEL: @urem3_vec_partial_undef( +; CHECK-NEXT: [[A:%.*]] = urem <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[B:%.*]] = icmp ult <2 x i32> [[A]], +; CHECK-NEXT: ret <2 x i1> [[B]] +; + %A = urem <2 x i32> %X, + %B = icmp ult <2 x i32> %A, + ret <2 x i1> %B +} + ;'srem x, C2' produces (-|C2|, |C2|) define i1 @srem1(i32 %X) { ; CHECK-LABEL: @srem1( @@ -73,6 +102,17 @@ define <2 x i1> @srem1_vec(<2 x i32> %X) { ret <2 x i1> %B } +define <2 x i1> @srem1_vec_partial_undef(<2 x i32> %X) { +; CHECK-LABEL: @srem1_vec_partial_undef( +; CHECK-NEXT: [[A:%.*]] = srem <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[B:%.*]] = icmp sgt <2 x i32> [[A]], +; CHECK-NEXT: ret <2 x i1> [[B]] +; + %A = srem <2 x i32> %X, + %B = icmp sgt <2 x i32> %A, + ret <2 x i1> %B +} + ;'udiv C2, x' produces [0, C2] define i1 @udiv5(i32 %X) { ; CHECK-LABEL: @udiv5( @@ -169,6 +209,17 @@ define <2 x i1> @shl5_vec(<2 x i32> %X) { ret <2 x i1> %cmp } +define <2 x i1> @shl5_vec_partial_undef(<2 x i32> %X) { +; CHECK-LABEL: @shl5_vec_partial_undef( +; CHECK-NEXT: [[SUB:%.*]] = shl nuw <2 x i32> , [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt <2 x i32> [[SUB]], +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %sub = shl nuw <2 x i32> , %X + %cmp = icmp ugt <2 x i32> %sub, + ret <2 x i1> %cmp +} + ; 'shl nsw C2, x' produces [C2 << CLO(C2)-1, C2] define i1 @shl2(i32 %X) { ; CHECK-LABEL: @shl2( @@ -378,6 +429,17 @@ define <2 x i1> @or1_vec(<2 x i32> %X) { ret <2 x i1> %B } +define <2 x i1> @or1_vec_partial_undef(<2 x i32> %X) { +; CHECK-LABEL: @or1_vec_partial_undef( +; CHECK-NEXT: 
[[A:%.*]] = or <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[B:%.*]] = icmp ult <2 x i32> [[A]], +; CHECK-NEXT: ret <2 x i1> [[B]] +; + %A = or <2 x i32> %X, + %B = icmp ult <2 x i32> %A, + ret <2 x i1> %B +} + ; Single bit OR. define i1 @or2_true(i8 %x) { ; CHECK-LABEL: @or2_true( From f75cf240d6ed528e1ce7770bbe09b417338b40ef Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Tue, 28 Jul 2020 15:02:36 -0400 Subject: [PATCH 0350/1035] [InstCombine] avoid crashing on vector constant expression (PR46872) --- .../InstCombine/InstructionCombining.cpp | 2 +- llvm/test/Transforms/InstCombine/vec_shuffle.ll | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 965a72d3a6500..ceeddda05fd9a 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1599,7 +1599,7 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) { Constant *C; if (match(&Inst, m_c_BinOp(m_OneUse(m_Shuffle(m_Value(V1), m_Undef(), m_Mask(Mask))), -                      m_Constant(C))) && +                      m_Constant(C))) && !isa<ConstantExpr>(C) && cast<FixedVectorType>(V1->getType())->getNumElements() <= NumElts) { assert(Inst.getType()->getScalarType() == V1->getType()->getScalarType() && "Shuffle should not change scalar type"); diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle.ll b/llvm/test/Transforms/InstCombine/vec_shuffle.ll index e7e55b07b7cd1..3f3431c5d9045 100644 --- a/llvm/test/Transforms/InstCombine/vec_shuffle.ll +++ b/llvm/test/Transforms/InstCombine/vec_shuffle.ll @@ -1745,3 +1745,18 @@ define <4 x i32> @splat_assoc_add_mul(<4 x i32> %x, <4 x i32> %y) { %r = mul <4 x i32> %splatx, %a ret <4 x i32> %r } + + +; Do not crash on constant expressions. + +define <4 x i32> @PR46872(<4 x i32> %x) { +; CHECK-LABEL: @PR46872( +; CHECK-NEXT: [[S:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[A:%.*]] = and <4 x i32> [[S]], bitcast (<2 x i64> <i64 ptrtoint (<4 x i32> (<4 x i32>)* @PR46872 to i64), i64 ptrtoint (<4 x i32> (<4 x i32>)* @PR46872 to i64)> to <4 x i32>) +; CHECK-NEXT: ret <4 x i32> [[A]] +; + %s = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> + %a = and <4 x i32> %s, bitcast (<2 x i64> <i64 ptrtoint (<4 x i32> (<4 x i32>)* @PR46872 to i64), i64 ptrtoint (<4 x i32> (<4 x i32>)* @PR46872 to i64)> to <4 x i32>) + ret <4 x i32> %a +} + From e5608cacfd60bb28685206ca96a8f3ceeee1e8a6 Mon Sep 17 00:00:00 2001 From: MaheshRavishankar Date: Tue, 28 Jul 2020 11:47:30 -0700 Subject: [PATCH 0351/1035] [mlir][GPUToSPIRV] Add a test pass to set workgroup size for kernel functions. This allows using command line flags to lower from GPU to SPIR-V. The pass added is only for testing/example purposes. Most use cases will need more fine-grained control on setting workgroup sizes for kernel functions.
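A hedged illustration of that last point (this sketch is not part of the patch): a production pipeline would typically compute a per-kernel workgroup size before calling the same attribute utilities the test pass uses. Here, chooseWorkgroupSizeFor is a hypothetical target-specific helper, and the MLIR APIs are assumed to be those current as of this patch.

#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/SPIRV/TargetAndABI.h"

// Hypothetical helper: returns the {x, y, z} size chosen for one kernel.
llvm::SmallVector<int32_t, 3> chooseWorkgroupSizeFor(mlir::gpu::GPUFuncOp func);

static void setPerKernelEntryPointABI(mlir::gpu::GPUModuleOp module,
                                      mlir::MLIRContext *context) {
  llvm::StringRef attrName = mlir::spirv::getEntryPointABIAttrName();
  for (mlir::gpu::GPUFuncOp func : module.getOps<mlir::gpu::GPUFuncOp>()) {
    // Skip non-kernels and functions that already carry an ABI attribute.
    if (!mlir::gpu::GPUDialect::isKernel(func) || func.getAttr(attrName))
      continue;
    func.setAttr(attrName, mlir::spirv::getEntryPointABIAttr(
                               chooseWorkgroupSizeFor(func), context));
  }
}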
Differential Revision: https://reviews.llvm.org/D84619 --- .../GPUToSPIRV/test_spirv_entry_point.mlir | 14 ++++ mlir/test/lib/Dialect/SPIRV/CMakeLists.txt | 2 + .../lib/Dialect/SPIRV/TestEntryPointAbi.cpp | 64 +++++++++++++++++++ mlir/tools/mlir-opt/mlir-opt.cpp | 2 + 4 files changed, 82 insertions(+) create mode 100644 mlir/test/Conversion/GPUToSPIRV/test_spirv_entry_point.mlir create mode 100644 mlir/test/lib/Dialect/SPIRV/TestEntryPointAbi.cpp diff --git a/mlir/test/Conversion/GPUToSPIRV/test_spirv_entry_point.mlir b/mlir/test/Conversion/GPUToSPIRV/test_spirv_entry_point.mlir new file mode 100644 index 0000000000000..26556a41c9be1 --- /dev/null +++ b/mlir/test/Conversion/GPUToSPIRV/test_spirv_entry_point.mlir @@ -0,0 +1,14 @@ +// RUN: mlir-opt -test-spirv-entry-point-abi %s | FileCheck %s -check-prefix=DEFAULT +// RUN: mlir-opt -test-spirv-entry-point-abi="workgroup-size=32" %s | FileCheck %s -check-prefix=WG32 + +// DEFAULT: gpu.func @foo() +// DEFAULT-SAME: spv.entry_point_abi = {local_size = dense<1> : vector<3xi32>} + +// WG32: gpu.func @foo() +// WG32-SAME: spv.entry_point_abi = {local_size = dense<[32, 1, 1]> : vector<3xi32>} + +gpu.module @kernels { + gpu.func @foo() kernel { + gpu.return + } +} diff --git a/mlir/test/lib/Dialect/SPIRV/CMakeLists.txt b/mlir/test/lib/Dialect/SPIRV/CMakeLists.txt index 15d4673f381f3..204a633377307 100644 --- a/mlir/test/lib/Dialect/SPIRV/CMakeLists.txt +++ b/mlir/test/lib/Dialect/SPIRV/CMakeLists.txt @@ -1,6 +1,7 @@ # Exclude tests from libMLIR.so add_mlir_library(MLIRSPIRVTestPasses TestAvailability.cpp + TestEntryPointAbi.cpp EXCLUDE_FROM_LIBMLIR @@ -9,6 +10,7 @@ add_mlir_library(MLIRSPIRVTestPasses ${MLIR_MAIN_INCLUDE_DIR}/mlir/IR LINK_LIBS PUBLIC + MLIRGPU MLIRIR MLIRPass MLIRSPIRV diff --git a/mlir/test/lib/Dialect/SPIRV/TestEntryPointAbi.cpp b/mlir/test/lib/Dialect/SPIRV/TestEntryPointAbi.cpp new file mode 100644 index 0000000000000..bcbdb828523d4 --- /dev/null +++ b/mlir/test/lib/Dialect/SPIRV/TestEntryPointAbi.cpp @@ -0,0 +1,64 @@ +//===- TestAvailability.cpp - Test pass for setting Entry point ABI info --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass that sets the spv.entry_point_abi attribute on +// functions that are to be lowered as entry point functions. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/GPUDialect.h" +#include "mlir/Dialect/SPIRV/SPIRVDialect.h" +#include "mlir/Dialect/SPIRV/TargetAndABI.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; + +namespace { +/// Pass to set the spv.entry_point_abi +class TestSpirvEntryPointABIPass + : public PassWrapper> { +public: + TestSpirvEntryPointABIPass() = default; + TestSpirvEntryPointABIPass(const TestSpirvEntryPointABIPass &) {} + void runOnOperation() override; + +private: + Pass::ListOption workgroupSize{ + *this, "workgroup-size", + llvm::cl::desc( + "Workgroup size to use for all gpu.func kernels in the module, " + "specified with x-dimension first, y-dimension next and z-dimension " + "last. 
Unspecified dimensions will be set to 1"), + llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated}; +}; +} // namespace + +void TestSpirvEntryPointABIPass::runOnOperation() { + gpu::GPUModuleOp gpuModule = getOperation(); + MLIRContext *context = &getContext(); + StringRef attrName = spirv::getEntryPointABIAttrName(); + for (gpu::GPUFuncOp gpuFunc : gpuModule.getOps<gpu::GPUFuncOp>()) { + if (!gpu::GPUDialect::isKernel(gpuFunc) || gpuFunc.getAttr(attrName)) + continue; + SmallVector<int32_t, 3> workgroupSizeVec(workgroupSize.begin(), + workgroupSize.end()); + workgroupSizeVec.resize(3, 1); + gpuFunc.setAttr(attrName, + spirv::getEntryPointABIAttr(workgroupSizeVec, context)); + } +} + +namespace mlir { +void registerTestSpirvEntryPointABIPass() { + PassRegistration<TestSpirvEntryPointABIPass> registration( + "test-spirv-entry-point-abi", + "Set the spv.entry_point_abi attribute on GPU kernel function within the " + "module, intended for testing only"); +} +} // namespace mlir diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index f60864a6a371b..620c5871a420c 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -66,6 +66,7 @@ void registerTestPreparationPassWithAllowedMemrefResults(); void registerTestRecursiveTypesPass(); void registerTestReducer(); void registerTestGpuParallelLoopMappingPass(); +void registerTestSpirvEntryPointABIPass(); void registerTestSCFUtilsPass(); void registerTestVectorConversions(); void registerVectorizerTestPass(); @@ -142,6 +143,7 @@ void registerTestPasses() { registerTestRecursiveTypesPass(); registerTestReducer(); registerTestGpuParallelLoopMappingPass(); + registerTestSpirvEntryPointABIPass(); registerTestSCFUtilsPass(); registerTestVectorConversions(); registerVectorizerTestPass(); From 69152a11cf181d9c1859947f0f2c2f1554a891f2 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 28 Jul 2020 12:03:54 -0700 Subject: [PATCH 0352/1035] [X86] Merge the two 'Emit the normal disp32 encoding' cases in SIB byte handling in emitMemModRMByte. NFCI By repeating the Disp.isImm() check in a couple spots we can make the normal case for immediate and for expression the same. And then always rely on the ForceDisp32 flag to remove a later non-zero immediate check. This should make {disp32} pseudo prefix handling slightly easier as we need the normal disp32 handler to handle an immediate of 0. --- .../lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 25f1089912639..abdc0f156b9f9 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -582,23 +582,21 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // MOD=0, BASE=5, to JUST get the index, scale, and displacement. emitByte(modRMByte(0, RegOpcodeField, 4), OS); ForceDisp32 = true; -  } else if (!Disp.isImm()) { -    // Emit the normal disp32 encoding. -    emitByte(modRMByte(2, RegOpcodeField, 4), OS); -    ForceDisp32 = true; -  } else if (Disp.getImm() == 0 && +  } else if (Disp.isImm() && Disp.getImm() == 0 && // Base reg can't be anything that ends up with '5' as the base // reg, it is the magic [*] nomenclature that indicates no base.
BaseRegNo != N86::EBP) { // Emit no displacement ModR/M byte emitByte(modRMByte(0, RegOpcodeField, 4), OS); - } else if (isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) { + } else if (Disp.isImm() && + isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) { // Emit the disp8 encoding. emitByte(modRMByte(1, RegOpcodeField, 4), OS); ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP } else { // Emit the normal disp32 encoding. emitByte(modRMByte(2, RegOpcodeField, 4), OS); + ForceDisp32 = true; } // Calculate what the SS field value should be... @@ -618,7 +616,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, if (ForceDisp8) emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups, ImmOffset); - else if (ForceDisp32 || Disp.getImm() != 0) + else if (ForceDisp32) emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), StartByte, OS, Fixups); } From 1bd7c02233969b430b2d49e95345f507fdcc9f30 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 28 Jul 2020 13:41:12 -0400 Subject: [PATCH 0353/1035] [libc++] Clean up tests for "optional" C11 features First, add a TEST_HAS_QUICK_EXIT macro to mirror other C11 features like TEST_HAS_ALIGNED_ALLOC, and update the tests for that. Second, get rid of TEST_HAS_C11_FEATURES and _LIBCPP_HAS_C11_FEATURES, which were only used to ensure that feature macros don't get out of sync between <__config> and "test_macros.h". This is not necessary anymore, since we have tests for each individual macro now. --- libcxx/include/__config | 5 --- .../has_aligned_alloc.compile.pass.cpp | 19 +++++++++++ .../has_c11_features.pass.cpp | 33 ------------------- .../has_quick_exit.compile.pass.cpp | 19 +++++++++++ .../has_timespec_get.compile.pass.cpp | 19 +++++++++++ .../std/depr/depr.c.headers/float_h.pass.cpp | 6 ++-- .../support.limits/c.limits/cfloat.pass.cpp | 6 ++-- ...ail.cpp => at_quick_exit.compile.fail.cpp} | 18 +++++----- ...e.fail.cpp => quick_exit.compile.fail.cpp} | 15 ++++----- .../support.start.term/quick_exit.pass.cpp | 2 +- .../test/support/msvc_stdlib_force_include.h | 3 -- libcxx/test/support/test_macros.h | 16 +++++---- 12 files changed, 90 insertions(+), 71 deletions(-) create mode 100644 libcxx/test/libcxx/language.support/has_aligned_alloc.compile.pass.cpp delete mode 100644 libcxx/test/libcxx/language.support/has_c11_features.pass.cpp create mode 100644 libcxx/test/libcxx/language.support/has_quick_exit.compile.pass.cpp create mode 100644 libcxx/test/libcxx/language.support/has_timespec_get.compile.pass.cpp rename libcxx/test/std/language.support/support.start.term/{quick_exit_check1.compile.fail.cpp => at_quick_exit.compile.fail.cpp} (70%) rename libcxx/test/std/language.support/support.start.term/{quick_exit_check2.compile.fail.cpp => quick_exit.compile.fail.cpp} (72%) diff --git a/libcxx/include/__config b/libcxx/include/__config index ebdd64ed4292d..a9eca04959bf4 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -348,13 +348,11 @@ # if defined(__FreeBSD__) # define _LIBCPP_HAS_ALIGNED_ALLOC # define _LIBCPP_HAS_QUICK_EXIT -# define _LIBCPP_HAS_C11_FEATURES # if __FreeBSD_version >= 1300064 || \ (__FreeBSD_version >= 1201504 && __FreeBSD_version < 1300000) # define _LIBCPP_HAS_TIMESPEC_GET # endif # elif defined(__BIONIC__) -# define _LIBCPP_HAS_C11_FEATURES # if __ANDROID_API__ >= 21 # define _LIBCPP_HAS_QUICK_EXIT # endif @@ -368,7 +366,6 @@ # define _LIBCPP_HAS_ALIGNED_ALLOC # define _LIBCPP_HAS_QUICK_EXIT # define _LIBCPP_HAS_TIMESPEC_GET -# define 
_LIBCPP_HAS_C11_FEATURES # elif defined(__linux__) # if !defined(_LIBCPP_HAS_MUSL_LIBC) # if _LIBCPP_GLIBC_PREREQ(2, 15) || defined(__BIONIC__) @@ -376,14 +373,12 @@ # endif # if _LIBCPP_GLIBC_PREREQ(2, 17) # define _LIBCPP_HAS_ALIGNED_ALLOC -# define _LIBCPP_HAS_C11_FEATURES # define _LIBCPP_HAS_TIMESPEC_GET # endif # else // defined(_LIBCPP_HAS_MUSL_LIBC) # define _LIBCPP_HAS_ALIGNED_ALLOC # define _LIBCPP_HAS_QUICK_EXIT # define _LIBCPP_HAS_TIMESPEC_GET -# define _LIBCPP_HAS_C11_FEATURES # endif # endif // __linux__ #endif diff --git a/libcxx/test/libcxx/language.support/has_aligned_alloc.compile.pass.cpp b/libcxx/test/libcxx/language.support/has_aligned_alloc.compile.pass.cpp new file mode 100644 index 0000000000000..d1b41de5dc1a9 --- /dev/null +++ b/libcxx/test/libcxx/language.support/has_aligned_alloc.compile.pass.cpp @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 + +// Make sure TEST_HAS_ALIGNED_ALLOC (defined by the test suite) and +// _LIBCPP_HAS_ALIGNED_ALLOC (defined by libc++) stay in sync. + +#include <__config> +#include "test_macros.h" + +#if defined(TEST_HAS_ALIGNED_ALLOC) != defined(_LIBCPP_HAS_ALIGNED_ALLOC) +# error "TEST_HAS_ALIGNED_ALLOC and _LIBCPP_HAS_ALIGNED_ALLOC are out of sync" +#endif diff --git a/libcxx/test/libcxx/language.support/has_c11_features.pass.cpp b/libcxx/test/libcxx/language.support/has_c11_features.pass.cpp deleted file mode 100644 index ab1e6ea619252..0000000000000 --- a/libcxx/test/libcxx/language.support/has_c11_features.pass.cpp +++ /dev/null @@ -1,33 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 - -// We have two macros for checking whether or not the underlying C library -// has C11 features: -// TEST_HAS_C11_FEATURES - which is defined in "test_macros.h" -// _LIBCPP_HAS_C11_FEATURES - which is defined in <__config> -// They should always be the same - -#include <__config> -#include "test_macros.h" - -#ifdef TEST_HAS_C11_FEATURES -# ifndef _LIBCPP_HAS_C11_FEATURES -# error "TEST_HAS_C11_FEATURES is defined, but _LIBCPP_HAS_C11_FEATURES is not" -# endif -#endif - -#ifdef _LIBCPP_HAS_C11_FEATURES -# ifndef TEST_HAS_C11_FEATURES -# error "_LIBCPP_HAS_C11_FEATURES is defined, but TEST_HAS_C11_FEATURES is not" -# endif -#endif - -int main(int, char**) { - return 0; -} diff --git a/libcxx/test/libcxx/language.support/has_quick_exit.compile.pass.cpp b/libcxx/test/libcxx/language.support/has_quick_exit.compile.pass.cpp new file mode 100644 index 0000000000000..ad0fe6fb18fd1 --- /dev/null +++ b/libcxx/test/libcxx/language.support/has_quick_exit.compile.pass.cpp @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 + +// Make sure TEST_HAS_QUICK_EXIT (defined by the test suite) and +// _LIBCPP_HAS_QUICK_EXIT (defined by libc++) stay in sync. + +#include <__config> +#include "test_macros.h" + +#if defined(TEST_HAS_QUICK_EXIT) != defined(_LIBCPP_HAS_QUICK_EXIT) +# error "TEST_HAS_QUICK_EXIT and _LIBCPP_HAS_QUICK_EXIT are out of sync" +#endif diff --git a/libcxx/test/libcxx/language.support/has_timespec_get.compile.pass.cpp b/libcxx/test/libcxx/language.support/has_timespec_get.compile.pass.cpp new file mode 100644 index 0000000000000..8b86a5ef97195 --- /dev/null +++ b/libcxx/test/libcxx/language.support/has_timespec_get.compile.pass.cpp @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 + +// Make sure TEST_HAS_TIMESPEC_GET (defined by the test suite) and +// _LIBCPP_HAS_TIMESPEC_GET (defined by libc++) stay in sync. + +#include <__config> +#include "test_macros.h" + +#if defined(TEST_HAS_TIMESPEC_GET) != defined(_LIBCPP_HAS_TIMESPEC_GET) +# error "TEST_HAS_TIMESPEC_GET and _LIBCPP_HAS_TIMESPEC_GET are out of sync" +#endif diff --git a/libcxx/test/std/depr/depr.c.headers/float_h.pass.cpp b/libcxx/test/std/depr/depr.c.headers/float_h.pass.cpp index 779fbc66f5d02..1069a679263fb 100644 --- a/libcxx/test/std/depr/depr.c.headers/float_h.pass.cpp +++ b/libcxx/test/std/depr/depr.c.headers/float_h.pass.cpp @@ -24,7 +24,7 @@ #error FLT_RADIX not defined #endif -#if TEST_STD_VER > 14 && defined(TEST_HAS_C11_FEATURES) && 0 +#if TEST_STD_VER > 14 && 0 #ifndef FLT_HAS_SUBNORM #error FLT_HAS_SUBNORM not defined #endif @@ -54,7 +54,7 @@ #error DECIMAL_DIG not defined #endif -#if TEST_STD_VER > 14 && defined(TEST_HAS_C11_FEATURES) && 0 +#if TEST_STD_VER > 14 && 0 #ifndef FLT_DECIMAL_DIG #error FLT_DECIMAL_DIG not defined #endif @@ -164,7 +164,7 @@ #error LDBL_MIN not defined #endif -#if TEST_STD_VER > 14 && defined(TEST_HAS_C11_FEATURES) && 0 +#if TEST_STD_VER > 14 && 0 #ifndef FLT_TRUE_MIN #error FLT_TRUE_MIN not defined #endif diff --git a/libcxx/test/std/language.support/support.limits/c.limits/cfloat.pass.cpp b/libcxx/test/std/language.support/support.limits/c.limits/cfloat.pass.cpp index 12b80adf01169..6ddd8c60834b0 100644 --- a/libcxx/test/std/language.support/support.limits/c.limits/cfloat.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/c.limits/cfloat.pass.cpp @@ -24,7 +24,7 @@ #error FLT_RADIX not defined #endif -#if TEST_STD_VER > 14 && defined(TEST_HAS_C11_FEATURES) && 0 +#if TEST_STD_VER > 14 && 0 #ifndef FLT_HAS_SUBNORM #error FLT_HAS_SUBNORM not defined #endif @@ -54,7 +54,7 @@ #error DECIMAL_DIG not defined #endif -#if TEST_STD_VER > 14 && defined(TEST_HAS_C11_FEATURES) && 0 +#if TEST_STD_VER > 14 && 0 #ifndef FLT_DECIMAL_DIG #error FLT_DECIMAL_DIG not defined #endif @@ -164,7 +164,7 @@ #error LDBL_MIN not defined #endif -#if TEST_STD_VER > 14 && defined(TEST_HAS_C11_FEATURES) && 0 +#if TEST_STD_VER > 14 && 0 #ifndef FLT_TRUE_MIN #error FLT_TRUE_MIN not defined #endif diff --git 
a/libcxx/test/std/language.support/support.start.term/quick_exit_check1.compile.fail.cpp b/libcxx/test/std/language.support/support.start.term/at_quick_exit.compile.fail.cpp similarity index 70% rename from libcxx/test/std/language.support/support.start.term/quick_exit_check1.compile.fail.cpp rename to libcxx/test/std/language.support/support.start.term/at_quick_exit.compile.fail.cpp index 10bd5767de0e8..79051fb5167d4 100644 --- a/libcxx/test/std/language.support/support.start.term/quick_exit_check1.compile.fail.cpp +++ b/libcxx/test/std/language.support/support.start.term/at_quick_exit.compile.fail.cpp @@ -5,23 +5,23 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// + // UNSUPPORTED: c++03 -// test that referencing at_quick_exit when _LIBCPP_HAS_QUICK_EXIT is not defined +// test that referencing at_quick_exit when TEST_HAS_QUICK_EXIT is not defined // results in a compile error. #include -void f() {} +#include "test_macros.h" -int main(int, char**) -{ -#ifndef _LIBCPP_HAS_QUICK_EXIT +void f() { } + +int main(int, char**) { +#if !defined(TEST_HAS_QUICK_EXIT) std::at_quick_exit(f); #else -#error +# error #endif - - return 0; + return 0; } diff --git a/libcxx/test/std/language.support/support.start.term/quick_exit_check2.compile.fail.cpp b/libcxx/test/std/language.support/support.start.term/quick_exit.compile.fail.cpp similarity index 72% rename from libcxx/test/std/language.support/support.start.term/quick_exit_check2.compile.fail.cpp rename to libcxx/test/std/language.support/support.start.term/quick_exit.compile.fail.cpp index cb0dbd35c5575..f9c5bdc835694 100644 --- a/libcxx/test/std/language.support/support.start.term/quick_exit_check2.compile.fail.cpp +++ b/libcxx/test/std/language.support/support.start.term/quick_exit.compile.fail.cpp @@ -5,22 +5,21 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// + // UNSUPPORTED: c++03 -// test that referencing quick_exit when _LIBCPP_HAS_QUICK_EXIT is not defined +// test that referencing quick_exit when TEST_HAS_QUICK_EXIT is not defined // results in a compile error. #include -void f() {} +#include "test_macros.h" -int main(int, char**) -{ -#ifndef _LIBCPP_HAS_QUICK_EXIT +int main(int, char**) { +#if !defined(TEST_HAS_QUICK_EXIT) std::quick_exit(0); #else -#error +# error #endif - - return 0; + return 0; } diff --git a/libcxx/test/std/language.support/support.start.term/quick_exit.pass.cpp b/libcxx/test/std/language.support/support.start.term/quick_exit.pass.cpp index b45b3391ba5c3..16f68d435640c 100644 --- a/libcxx/test/std/language.support/support.start.term/quick_exit.pass.cpp +++ b/libcxx/test/std/language.support/support.start.term/quick_exit.pass.cpp @@ -17,7 +17,7 @@ void f() {} int main(int, char**) { -#ifdef _LIBCPP_HAS_QUICK_EXIT +#ifdef TEST_HAS_QUICK_EXIT std::at_quick_exit(f); std::quick_exit(0); #endif diff --git a/libcxx/test/support/msvc_stdlib_force_include.h b/libcxx/test/support/msvc_stdlib_force_include.h index cf8696af5588a..0869be349e23d 100644 --- a/libcxx/test/support/msvc_stdlib_force_include.h +++ b/libcxx/test/support/msvc_stdlib_force_include.h @@ -65,9 +65,6 @@ const AssertionDialogAvoider assertion_dialog_avoider{}; // MSVC doesn't have __int128_t. #define _LIBCPP_HAS_NO_INT128 -// MSVC has quick_exit() and at_quick_exit(). 
-#define _LIBCPP_HAS_QUICK_EXIT - #ifndef _LIBCXX_IN_DEVCRT // atomic_is_lock_free.pass.cpp needs this VS 2015 Update 2 fix. #define _ENABLE_ATOMIC_ALIGNMENT_FIX diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h index 63aa4f47a70f0..15c6a280dc192 100644 --- a/libcxx/test/support/test_macros.h +++ b/libcxx/test/support/test_macros.h @@ -169,9 +169,11 @@ // Specifically, FreeBSD does NOT have timespec_get, even though they have all // the rest of C11 - this is PR#38495 # define TEST_HAS_ALIGNED_ALLOC -# define TEST_HAS_C11_FEATURES +# define TEST_HAS_QUICK_EXIT # elif defined(__BIONIC__) -# define TEST_HAS_C11_FEATURES +# if __ANDROID_API__ >= 21 +# define TEST_HAS_QUICK_EXIT +# endif # if __ANDROID_API__ >= 28 # define TEST_HAS_ALIGNED_ALLOC # endif @@ -179,8 +181,8 @@ # define TEST_HAS_TIMESPEC_GET # endif # elif defined(__Fuchsia__) || defined(__wasi__) || defined(__NetBSD__) +# define TEST_HAS_QUICK_EXIT # define TEST_HAS_ALIGNED_ALLOC -# define TEST_HAS_C11_FEATURES # define TEST_HAS_TIMESPEC_GET # elif defined(__linux__) // This block preserves the old behavior used by include/__config: @@ -188,20 +190,22 @@ // available. The configuration here may be too vague though, as Bionic, uClibc, // newlib, etc may all support these features but need to be configured. # if defined(TEST_GLIBC_PREREQ) +# if TEST_GLIBC_PREREQ(2, 15) +# define TEST_HAS_QUICK_EXIT +# endif # if TEST_GLIBC_PREREQ(2, 17) # define TEST_HAS_ALIGNED_ALLOC # define TEST_HAS_TIMESPEC_GET -# define TEST_HAS_C11_FEATURES # endif # elif defined(_LIBCPP_HAS_MUSL_LIBC) +# define TEST_HAS_QUICK_EXIT # define TEST_HAS_ALIGNED_ALLOC -# define TEST_HAS_C11_FEATURES # define TEST_HAS_TIMESPEC_GET # endif # elif defined(_WIN32) # if defined(_MSC_VER) && !defined(__MINGW32__) +# define TEST_HAS_QUICK_EXIT # define TEST_HAS_ALIGNED_ALLOC -# define TEST_HAS_C11_FEATURES // Using Microsoft's C Runtime library # define TEST_HAS_TIMESPEC_GET # endif # endif From 67dfba96296b37f7bac9b4a68572288bc44b63b2 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 27 Jul 2020 14:25:03 -0400 Subject: [PATCH 0354/1035] [libc++] Provide std::aligned_alloc and std::timespec_get on Apple platforms rdar://66113878 --- libcxx/include/__config | 12 +++++++++++- libcxx/test/support/test_macros.h | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/libcxx/include/__config b/libcxx/include/__config index a9eca04959bf4..8c22323c2babe 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -380,7 +380,17 @@ # define _LIBCPP_HAS_QUICK_EXIT # define _LIBCPP_HAS_TIMESPEC_GET # endif -# endif // __linux__ +# elif defined(__APPLE__) + // timespec_get and aligned_alloc were introduced in macOS 10.15 and + // aligned releases +# if (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ >= 101500 || \ + __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ >= 130000 || \ + __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ >= 130000 || \ + __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ >= 60000) +# define _LIBCPP_HAS_ALIGNED_ALLOC +# define _LIBCPP_HAS_TIMESPEC_GET +# endif +# endif // __APPLE__ #endif #ifndef _LIBCPP_CXX03_LANG diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h index 15c6a280dc192..6fe1c56a5588e 100644 --- a/libcxx/test/support/test_macros.h +++ b/libcxx/test/support/test_macros.h @@ -208,7 +208,17 @@ # define TEST_HAS_ALIGNED_ALLOC # define TEST_HAS_TIMESPEC_GET # endif -# endif +# elif defined(__APPLE__) + // timespec_get and aligned_alloc were introduced in macOS 
10.15 and + // aligned releases +# if (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ >= 101500 || \ + __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ >= 130000 || \ + __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ >= 130000 || \ + __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ >= 60000) +# define TEST_HAS_ALIGNED_ALLOC +# define TEST_HAS_TIMESPEC_GET +# endif +# endif // __APPLE__ #endif /* Features that were introduced in C++14 */ From 3fb13b8484dcbec085da047879bf89ccb1b65b12 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Tue, 28 Jul 2020 15:13:19 -0400 Subject: [PATCH 0355/1035] [InstSimplify] allow undefs in icmp with vector constant folds This is the main icmp simplification shortcoming seen in D84655. Alive2 agrees that the basic examples are correct at least: define <2 x i1> @src(<2 x i8> %x) { %0: %r = icmp sle <2 x i8> { undef, 128 }, %x ret <2 x i1> %r } => define <2 x i1> @tgt(<2 x i8> %x) { %0: ret <2 x i1> { 1, 1 } } Transformation seems to be correct! define <2 x i1> @src(<2 x i32> %X) { %0: %A = or <2 x i32> %X, { 63, 63 } %B = icmp ult <2 x i32> %A, { undef, 50 } ret <2 x i1> %B } => define <2 x i1> @tgt(<2 x i32> %X) { %0: ret <2 x i1> { 0, 0 } } Transformation seems to be correct! https://alive2.llvm.org/ce/z/omt2ee https://alive2.llvm.org/ce/z/GW4nP_ Differential Revision: https://reviews.llvm.org/D84762 --- llvm/lib/Analysis/InstructionSimplify.cpp | 2 +- .../Transforms/InstSimplify/icmp-constant.ll | 22 +++++-------------- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index c920fb3f52bea..01b2eb14a5b49 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2732,7 +2732,7 @@ static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS, } const APInt *C; - if (!match(RHS, m_APInt(C))) + if (!match(RHS, m_APIntAllowUndef(C))) return nullptr; // Rule out tautological comparisons (eg., ult 0 or uge 0). 
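For readers following the matcher change above: m_APInt binds only when every vector lane is the same defined constant, whereas m_APIntAllowUndef also accepts splats in which some lanes are undef. A minimal, self-contained model of that difference (a sketch of the semantics only, not the actual PatternMatch implementation) is:

#include <optional>
#include <vector>

// Toy model: a vector constant is a list of lanes; nullopt marks an undef lane.
using Lane = std::optional<long>;

// Returns true and binds Out if Lanes is a splat under the given policy,
// mirroring m_APInt (AllowUndef = false) vs. m_APIntAllowUndef (true).
static bool matchSplat(const std::vector<Lane> &Lanes, bool AllowUndef,
                       long &Out) {
  std::optional<long> Seen;
  for (const Lane &L : Lanes) {
    if (!L) {
      if (!AllowUndef)
        return false; // m_APInt rejects any undef lane.
      continue;       // m_APIntAllowUndef skips undef lanes.
    }
    if (Seen && *Seen != *L)
      return false;   // Not a splat.
    Seen = L;
  }
  if (!Seen)
    return false;     // All lanes undef: nothing to bind.
  Out = *Seen;
  return true;
}

Under this model a constant like {7, undef} binds only when AllowUndef is set, which is why the partial-undef vector tests in this patch now simplify.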
diff --git a/llvm/test/Transforms/InstSimplify/icmp-constant.ll b/llvm/test/Transforms/InstSimplify/icmp-constant.ll index 8f51ba3c4c0e5..94126bf158487 100644 --- a/llvm/test/Transforms/InstSimplify/icmp-constant.ll +++ b/llvm/test/Transforms/InstSimplify/icmp-constant.ll @@ -21,8 +21,7 @@ define <2 x i1> @tautological_ule_vec(<2 x i8> %x) { define <2 x i1> @tautological_ule_vec_partial_undef(<2 x i8> %x) { ; CHECK-LABEL: @tautological_ule_vec_partial_undef( -; CHECK-NEXT: [[CMP:%.*]] = icmp ule <2 x i8> [[X:%.*]], -; CHECK-NEXT: ret <2 x i1> [[CMP]] +; CHECK-NEXT: ret <2 x i1> ; %cmp = icmp ule <2 x i8> %x, ret <2 x i1> %cmp @@ -46,8 +45,7 @@ define <2 x i1> @tautological_ugt_vec(<2 x i8> %x) { define <2 x i1> @tautological_ugt_vec_partial_undef(<2 x i8> %x) { ; CHECK-LABEL: @tautological_ugt_vec_partial_undef( -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt <2 x i8> [[X:%.*]], -; CHECK-NEXT: ret <2 x i1> [[CMP]] +; CHECK-NEXT: ret <2 x i1> zeroinitializer ; %cmp = icmp ugt <2 x i8> %x, ret <2 x i1> %cmp @@ -74,9 +72,7 @@ define <2 x i1> @urem3_vec(<2 x i32> %X) { define <2 x i1> @urem3_vec_partial_undef(<2 x i32> %X) { ; CHECK-LABEL: @urem3_vec_partial_undef( -; CHECK-NEXT: [[A:%.*]] = urem <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[B:%.*]] = icmp ult <2 x i32> [[A]], -; CHECK-NEXT: ret <2 x i1> [[B]] +; CHECK-NEXT: ret <2 x i1> ; %A = urem <2 x i32> %X, %B = icmp ult <2 x i32> %A, @@ -104,9 +100,7 @@ define <2 x i1> @srem1_vec(<2 x i32> %X) { define <2 x i1> @srem1_vec_partial_undef(<2 x i32> %X) { ; CHECK-LABEL: @srem1_vec_partial_undef( -; CHECK-NEXT: [[A:%.*]] = srem <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[B:%.*]] = icmp sgt <2 x i32> [[A]], -; CHECK-NEXT: ret <2 x i1> [[B]] +; CHECK-NEXT: ret <2 x i1> zeroinitializer ; %A = srem <2 x i32> %X, %B = icmp sgt <2 x i32> %A, @@ -211,9 +205,7 @@ define <2 x i1> @shl5_vec(<2 x i32> %X) { define <2 x i1> @shl5_vec_partial_undef(<2 x i32> %X) { ; CHECK-LABEL: @shl5_vec_partial_undef( -; CHECK-NEXT: [[SUB:%.*]] = shl nuw <2 x i32> , [[X:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt <2 x i32> [[SUB]], -; CHECK-NEXT: ret <2 x i1> [[CMP]] +; CHECK-NEXT: ret <2 x i1> ; %sub = shl nuw <2 x i32> , %X %cmp = icmp ugt <2 x i32> %sub, @@ -431,9 +423,7 @@ define <2 x i1> @or1_vec(<2 x i32> %X) { define <2 x i1> @or1_vec_partial_undef(<2 x i32> %X) { ; CHECK-LABEL: @or1_vec_partial_undef( -; CHECK-NEXT: [[A:%.*]] = or <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[B:%.*]] = icmp ult <2 x i32> [[A]], -; CHECK-NEXT: ret <2 x i1> [[B]] +; CHECK-NEXT: ret <2 x i1> zeroinitializer ; %A = or <2 x i32> %X, %B = icmp ult <2 x i32> %A, From 811828a01d64868329de279a7117e12d10e9da2a Mon Sep 17 00:00:00 2001 From: Ahsan Saghir Date: Tue, 28 Jul 2020 13:29:25 -0500 Subject: [PATCH 0356/1035] [PowerPC] Mark allocator_oom_test.cpp unsupported on PowerPC This patch marks compiler-rt/test/asan/TestCases/Linux/allocator_oom_test.cpp unsupported on PowerPC 64bit-LE architecture since this test fails when run on a machine with larger system memory. 
Reviewed By: #powerpc, nemanjai Differential Revision: https://reviews.llvm.org/D84786 --- compiler-rt/test/asan/TestCases/Linux/allocator_oom_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/test/asan/TestCases/Linux/allocator_oom_test.cpp b/compiler-rt/test/asan/TestCases/Linux/allocator_oom_test.cpp index c450ae5bb8e91..c294ca53b6be5 100644 --- a/compiler-rt/test/asan/TestCases/Linux/allocator_oom_test.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/allocator_oom_test.cpp @@ -32,7 +32,7 @@ // AArch64 bots fail on this test. // TODO(alekseys): Android lit do not run ulimit on device. // REQUIRES: shadow-scale-3 -// UNSUPPORTED: s390,android,aarch64 +// UNSUPPORTED: s390,android,aarch64,powerpc64le #include #include From 3fac05e49fe3eb4d6becf1761b4df01dbd871aa4 Mon Sep 17 00:00:00 2001 From: clementval Date: Tue, 28 Jul 2020 15:26:27 -0400 Subject: [PATCH 0357/1035] [openacc] Add missing newline at end of file --- llvm/include/llvm/Frontend/OpenACC/ACC.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/Frontend/OpenACC/ACC.td b/llvm/include/llvm/Frontend/OpenACC/ACC.td index e96b7e8466628..4dbf4bbc0fb0b 100644 --- a/llvm/include/llvm/Frontend/OpenACC/ACC.td +++ b/llvm/include/llvm/Frontend/OpenACC/ACC.td @@ -601,4 +601,4 @@ def ACC_SerialLoop : Directive<"serial loop"> { def ACC_Unknown : Directive<"unknown"> { let isDefault = 1; -} \ No newline at end of file +} From 8e67982384d4a11892c04d16c2d10d7533e56094 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 28 Jul 2020 20:31:57 +0100 Subject: [PATCH 0358/1035] [NewGVN] Add test cases for remaining known issues. This patch adds IR tests for the known NewGVN issues. The intention is that adding them now will make it easier to keep track of fixes. --- .../NewGVN/todo-pr33165-distribute-undef.ll | 17 ++++ .../NewGVN/todo-pr35074-phi-of-ops.ll | 35 ++++++++ .../NewGVN/todo-pr36335-phi-undef.ll | 32 +++++++ .../todo-pr37121-seens-this-value-a-lot.ll | 25 ++++++ .../NewGVN/todo-pr42422-phi-of-ops.ll | 84 +++++++++++++++++++ 5 files changed, 193 insertions(+) create mode 100644 llvm/test/Transforms/NewGVN/todo-pr33165-distribute-undef.ll create mode 100644 llvm/test/Transforms/NewGVN/todo-pr35074-phi-of-ops.ll create mode 100644 llvm/test/Transforms/NewGVN/todo-pr36335-phi-undef.ll create mode 100644 llvm/test/Transforms/NewGVN/todo-pr37121-seens-this-value-a-lot.ll create mode 100644 llvm/test/Transforms/NewGVN/todo-pr42422-phi-of-ops.ll diff --git a/llvm/test/Transforms/NewGVN/todo-pr33165-distribute-undef.ll b/llvm/test/Transforms/NewGVN/todo-pr33165-distribute-undef.ll new file mode 100644 index 0000000000000..0a9255c448e56 --- /dev/null +++ b/llvm/test/Transforms/NewGVN/todo-pr33165-distribute-undef.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -newgvn -S %s | FileCheck %s + +; Test for PR33165. + +; TODO: Currently NewGVN miscompiles the function. 
+define i2 @f(i2, i1) { +; CHECK-LABEL: @f( +; CHECK-NEXT: [[A:%.*]] = xor i2 [[TMP0:%.*]], -1 +; CHECK-NEXT: [[B:%.*]] = select i1 [[TMP1:%.*]], i2 [[A]], i2 undef +; CHECK-NEXT: ret i2 [[B]] +; + %a = xor i2 %0, -1 + %b = select i1 %1, i2 %a, i2 undef + %c = and i2 %a, %b + ret i2 %c +} diff --git a/llvm/test/Transforms/NewGVN/todo-pr35074-phi-of-ops.ll b/llvm/test/Transforms/NewGVN/todo-pr35074-phi-of-ops.ll new file mode 100644 index 0000000000000..e80292e94fd1c --- /dev/null +++ b/llvm/test/Transforms/NewGVN/todo-pr35074-phi-of-ops.ll @@ -0,0 +1,35 @@ +; RUN: opt -newgvn -S %s | FileCheck %s + +; XFAIL: * + +; TODO: Test case for PR35074. Crashes caused by phi-of-ops. +define void @crash1_pr35074(i32 %this, i1 %c) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc6, %entry + %y.0 = phi i32 [ 1, %entry ], [ %inc7, %for.inc6 ] + br i1 %c, label %for.inc6, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %for.cond + %sub = add nsw i32 %y.0, -1 + br label %for.body4 + +for.body4: ; preds = %for.body.lr.ph + %cmp = icmp ugt i32 %sub, %y.0 + br i1 %cmp, label %for.end, label %for.body4.1 + +for.end: ; preds = %for.body4.1, %for.body4 + ret void + +for.inc6: ; preds = %for.cond + %inc7 = add nuw nsw i32 %y.0, 1 + br label %for.cond + +for.body4.1: ; preds = %for.body4 + %inc.1 = add nuw nsw i32 %y.0, 1 + tail call void @_blah(i32 %inc.1) + br label %for.end +} + +declare void @_blah(i32) diff --git a/llvm/test/Transforms/NewGVN/todo-pr36335-phi-undef.ll b/llvm/test/Transforms/NewGVN/todo-pr36335-phi-undef.ll new file mode 100644 index 0000000000000..6208f4eb19346 --- /dev/null +++ b/llvm/test/Transforms/NewGVN/todo-pr36335-phi-undef.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -newgvn -S %s | FileCheck %s + +; TODO: NewGVN currently miscompiles the function below. PR36335. + +declare void @foo(i32) + +define void @main(i1 %c1, i1 %c2, i32 %x) { +; CHECK-LABEL: @main( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C1:%.*]], label [[L:%.*]], label [[END:%.*]] +; CHECK: L: +; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X:%.*]], -1 +; CHECK-NEXT: call void @foo(i32 [[XOR]]) +; CHECK-NEXT: br label [[L]] +; CHECK: end: +; CHECK-NEXT: ret void +; +entry: + br i1 %c1, label %L, label %end + +L: + %d.1 = phi i8 [ undef, %entry ], [ -1, %L ] + %conv = sext i8 %d.1 to i32 + %xor = xor i32 %x, %conv + %neg = xor i32 %xor, -1 + call void @foo(i32 %neg) + br label %L + +end: + ret void +} diff --git a/llvm/test/Transforms/NewGVN/todo-pr37121-seens-this-value-a-lot.ll b/llvm/test/Transforms/NewGVN/todo-pr37121-seens-this-value-a-lot.ll new file mode 100644 index 0000000000000..b4961fa32ef00 --- /dev/null +++ b/llvm/test/Transforms/NewGVN/todo-pr37121-seens-this-value-a-lot.ll @@ -0,0 +1,25 @@ +; RUN: opt -newgvn -S %s | FileCheck %s + +; XFAIL: * + +; TODO: Currently NewGVN crashes on the function below. See PR37121. + +define hidden void @foo() { +top: + %.promoted = load i8, i8* undef, align 8 + br label %if + +;; This is really a multi-valued phi, because the phi is defined by an expression of the phi. +;; This means that we can't propagate the value over the backedge, because we'll just cycle +;; through every value.
+ +if: ; preds = %if, %top + %0 = phi i8 [ %1, %if ], [ %.promoted, %top ] + %1 = xor i8 %0, undef + br i1 false, label %L50, label %if + +L50: ; preds = %if + %.lcssa = phi i8 [ %1, %if ] + store i8 %.lcssa, i8* undef, align 8 + ret void +} diff --git a/llvm/test/Transforms/NewGVN/todo-pr42422-phi-of-ops.ll b/llvm/test/Transforms/NewGVN/todo-pr42422-phi-of-ops.ll new file mode 100644 index 0000000000000..6c4cf5f82314b --- /dev/null +++ b/llvm/test/Transforms/NewGVN/todo-pr42422-phi-of-ops.ll @@ -0,0 +1,84 @@ +; RUN: opt -newgvn -S %s | FileCheck %s + +; XFAIL: * + +; TODO: Currently NewGVN crashes on the function below, see PR42422. + +define void @d() { +entry: + br label %for.cond + +for.cond: ; preds = %cleanup20, %entry + br label %for.cond1 + +for.cond1: ; preds = %for.inc17, %for.cond + %0 = phi i32 [ %inc18, %for.inc17 ], [ 0, %for.cond ] + %cmp = icmp sle i32 %0, 1 + br i1 %cmp, label %for.body, label %for.end19 + +for.body: ; preds = %for.cond1 + br i1 undef, label %for.body3, label %for.body.for.cond4_crit_edge + +for.body.for.cond4_crit_edge: ; preds = %for.body + br label %for.cond4 + +for.body3: ; preds = %for.body + br label %cleanup14 + +for.cond4: ; preds = %cleanup, %for.body.for.cond4_crit_edge + br i1 undef, label %if.then, label %if.end + +if.then: ; preds = %for.cond4 + br label %cleanup + +if.end: ; preds = %for.cond4 + br label %for.cond6 + +for.cond6: ; preds = %for.inc, %if.end + %1 = phi i64 [ %inc, %for.inc ], [ 0, %if.end ] + %cmp7 = icmp sle i64 %1, 1 + br i1 %cmp7, label %for.inc, label %for.end9 + +for.inc: ; preds = %for.cond6 + %inc = add nsw i64 %1, 1 + br label %for.cond6 + +for.end9: ; preds = %for.cond6 + br i1 true, label %if.then11, label %if.end12 + +if.then11: ; preds = %for.end9 + br label %cleanup + +if.end12: ; preds = %for.end9 + br label %cleanup + +cleanup: ; preds = %if.end12, %if.then11, %if.then + %cleanup.dest = phi i32 [ undef, %if.end12 ], [ 1, %if.then11 ], [ 9, %if.then ] + switch i32 %cleanup.dest, label %cleanup14 [ + i32 0, label %for.cond4 + i32 9, label %for.end13 + ] + +for.end13: ; preds = %cleanup + br label %cleanup14 + +cleanup14: ; preds = %for.end13, %cleanup, %for.body3 + %cleanup.dest15 = phi i32 [ 0, %for.end13 ], [ %cleanup.dest, %cleanup ], [ 1, %for.body3 ] + %cond1 = icmp eq i32 %cleanup.dest15, 0 + br i1 %cond1, label %for.inc17, label %cleanup20 + +for.inc17: ; preds = %cleanup14 + %inc18 = add nsw i32 %0, 1 + br label %for.cond1 + +for.end19: ; preds = %for.cond1 + br label %cleanup20 + +cleanup20: ; preds = %for.end19, %cleanup14 + %cleanup.dest21 = phi i32 [ %cleanup.dest15, %cleanup14 ], [ 0, %for.end19 ] + %cond = icmp eq i32 %cleanup.dest21, 0 + br i1 %cond, label %for.cond, label %cleanup23 + +cleanup23: ; preds = %cleanup20 + ret void +} From b8943e7cea7cafebe26dd96e1a0d98e1f68a1386 Mon Sep 17 00:00:00 2001 From: Vincent Zhao Date: Tue, 28 Jul 2020 19:40:55 +0000 Subject: [PATCH 0359/1035] [MLIR][Linalg] Fixed obsolete examples in the MLIR Linalg Dialect doc This diff fixes some obsolete examples in the Linalg dialect documentation: https://mlir.llvm.org/docs/Dialects/Linalg/ These examples are used to explain the basic properties of the Linalg dialect; they are not automatically generated from TableGen and were using out-of-date MLIR/Linalg syntax. This diff extends each example by adding essential attributes and changing its syntax to make it processable by `mlir-opt`. There is also a command attached to each example that says how the example can be processed.
Reviewed By: ftynse Differential Revision: https://reviews.llvm.org/D84229 --- mlir/docs/Dialects/Linalg.md | 247 ++++++++++++++++++++++------------- 1 file changed, 157 insertions(+), 90 deletions(-) diff --git a/mlir/docs/Dialects/Linalg.md b/mlir/docs/Dialects/Linalg.md index b383294ffd741..edf5eb217799a 100644 --- a/mlir/docs/Dialects/Linalg.md +++ b/mlir/docs/Dialects/Linalg.md @@ -60,32 +60,55 @@ needed to synthesize the control-flow required to iterate over its operands, according to their type. This notion of IR localization bears some resemblance to [URUK](http://icps.u-strasbg.fr/~bastoul/research/papers/GVBCPST06-IJPP.pdf). -Consider the following, partially specified, `linalg.generic` example: ``` -#attrs = {args_in: 1, args_out: 1} -func @example(%A: memref, - %B: memref>) { - linalg.generic #attrs (%2, %3): memref, - memref> +Consider the following fully specified `linalg.generic` example. +Here, the first operand is a `memref` of `f32` scalar elements that +has an ordinary identity layout, and the second one is a `memref` of +4-element vectors with a 2-strided, 1-offset layout. + +```mlir +// File name: example1.mlir +#accesses = [ + affine_map<(m) -> (m)>, + affine_map<(m) -> (m)> ] +#attrs = { + args_in = 1, + args_out = 1, + indexing_maps = #accesses, + iterator_types = ["parallel"] } +// memory layouts +#identity = affine_map<(d0) -> (d0)> + +func @example(%A: memref, + %B: memref, offset: 1, strides: [2]>) { + linalg.generic #attrs %A, %B { + ^bb0(%a: f32, %b: vector<4xf32>): + %c = "some_compute"(%a, %b): (f32, vector<4xf32>) -> (vector<4xf32>) + linalg.yield %c: vector<4xf32> + } : memref, memref, offset: 1, strides: [2]> return } ``` The property "*Input and Output Operands Define The Iteration Space*" is materialized by a lowering into a form that will resemble: -``` -func @example(%A: memref, - %B: memref>) { - %M = "dim" %A, 0: index - %N = "dim" %B, 0: index - %eq = eq %M, %N: i1 // iteration space is consistent with data - assert(%eq): (i1) -> () - for %i = 0 to %M { - %a = load %A[%i]: memref - %b = load %B[%i]: memref, layout2> - // compute arg types match elemental tensor types - %c = "some_compute"(%a, %b): (f32, vector<4xf32>) -> (vector<4xf32>) - store %c, %B[%i]: memref, layout2> + +```mlir +// Run: mlir-opt example1.mlir -allow-unregistered-dialect -convert-linalg-to-loops +// This converted representation is in the `scf` dialect. +// Its syntax can be found here: https://mlir.llvm.org/docs/Dialects/SCFDialect/ +#map0 = affine_map<(d0) -> (d0 * 2 + 1)> + +func @example(%arg0: memref, %arg1: memref, #map0>) { + %c0 = constant 0 : index + %c1 = constant 1 : index + %0 = dim %arg0, %c0 : memref + scf.for %arg2 = %c0 to %0 step %c1 { + %1 = load %arg0[%arg2] : memref + %2 = load %arg1[%arg2] : memref, #map0> + %3 = "some_compute"(%1, %2) : (f32, vector<4xf32>) -> vector<4xf32> + store %3, %arg1[%arg2] : memref, #map0> } return } @@ -123,17 +146,30 @@ as well as [TACO](http://tensor-compiler.org/), has shown. A `linalg.generic` *defines* the mapping between the iteration space (i.e. the loops) and the data. -Consider the following, partially specified, `linalg.generic` example: +Consider the following fully specified `linalg.generic` example. +Here, the first `memref` is a 2-strided one on both of its dimensions, +and the second `memref` uses an identity layout.
+ ``` -#indexing_maps = { - (i, j) -> (j, i), - (i, j) -> (j) +// File name: example2.mlir +#indexing_maps = [ + affine_map<(i, j) -> (j, i)>, + affine_map<(i, j) -> (j)> +] +#attrs = { + args_in = 1, + args_out = 1, + indexing_maps = #indexing_maps, + iterator_types = ["parallel", "parallel"] } -#attrs = {args_in: 1, args_out: 1, indexings: indexing_maps} -func @example(%A: memref<8x?xf32, layout1>, - %B: memref>) { - linalg.generic #attrs (%A, %B): memref<8x?xf32, layout1>, - memref> + +func @example(%A: memref<8x?xf32, offset: 0, strides: [2, 2]>, + %B: memref>) { + linalg.generic #attrs %A, %B { + ^bb0(%a: f32, %b: vector<4xf32>): + %c = "some_compute"(%a, %b): (f32, vector<4xf32>) -> (vector<4xf32>) + linalg.yield %c: vector<4xf32> + }: memref<8x?xf32 , offset: 0, strides: [2, 2]>, memref> return } ``` @@ -141,22 +177,20 @@ func @example(%A: memref<8x?xf32, layout1>, The property "*Reversible Mappings Between Control and Data Structures*" is materialized by a lowering into a form that will resemble: ``` -#attrs = {args_in: 1, args_out: 1, indexings: indexing_maps} -func @example(%A: memref<8x?xf32, layout1>, - %B: memref>) { - // loop bounds determined from data sizes by “inverting the map” - %J = "dim" %A, 0: index - %I = "dim" %A, 1: index - %J2 = "dim" %B, 0: index - // iteration space is consistent with data + mapping inference - %eq = "eq" %J, %J2: i1 - "assert" %eq: (i1) -> () - for %i = 0 to %I { // loop order is fully defined by indexing maps - for %j = 0 to %J { // arbitrary permutations are possible - %a = "load" %A, %j, %i: memref<8x?xf32> - %b = "load" %B, %j: memref> - %c = "some_compute"(%a, %b): (f32, vector<4xf32>) -> (vector<4xf32>) - "store" %c, %B, %j: memref> +// Run: mlir-opt example2.mlir -allow-unregistered-dialect -convert-linalg-to-loops +#map0 = affine_map<(d0, d1) -> (d0 * 2 + d1 * 2)> + +func @example(%arg0: memref<8x?xf32, #map0>, %arg1: memref>) { + %c8 = constant 8 : index + %c0 = constant 0 : index + %c1 = constant 1 : index + %0 = dim %arg0, %c1 : memref<8x?xf32, #map0> + scf.for %arg2 = %c0 to %0 step %c1 { + scf.for %arg3 = %c0 to %c8 step %c1 { + %1 = load %arg0[%arg3, %arg2] : memref<8x?xf32, #map0> + %2 = load %arg1[%arg3] : memref> + %3 = "some_compute"(%1, %2) : (f32, vector<4xf32>) -> vector<4xf32> + store %3, %arg1[%arg3] : memref> } } return @@ -174,7 +208,7 @@ Answering these `2` questions is one of the main analyses that Linalg uses to implement transformations such as tiling, tiled producer-consumer fusion, and promotion to temporary buffers in fast memory. -In the current implementation, `linalg.generic` uses a list of [AffineMaps](). +In the current implementation, `linalg.generic` uses a list of [AffineMaps](https://mlir.llvm.org/docs/LangRef/#affinemap-attribute) (see the `#indexing_maps` attribute in the previous examples). This is a pragmatic short-term solution, but in the longer term note that this property could be even evaluated dynamically, similarly to inspector-executor algorithms. @@ -234,38 +268,53 @@ to correspond to the operations inside the region: the region can capture buffers arbitrarily and write into them. If this conflicts with some parallel iterator requirement, this is undefined behavior. -Concretely, consider the following, partially specified, `linalg.generic` -example: +Previous examples already elaborate compute payloads with an unregistered function `"some_compute"`.
The following code snippet shows what the result will be when using a concrete operation `addf`: ``` -#indexing_maps = { - (i, j) -> (i, j), - (i, j) -> (i, j) +// File name: example3.mlir +#indexing_maps = [ + affine_map<(i, j) -> (i, j)>, + affine_map<(i, j) -> (i, j)>, + affine_map<(i, j) -> (i, j)> +] +#attrs = { + args_in = 2, + args_out = 1, + indexing_maps = #indexing_maps, + iterator_types = ["parallel", "parallel"] } -#attrs = {args_in: 2, args_out: 1, indexings: #indexing_maps} func @example(%A: memref, %B: memref, %C: memref) { - linalg.generic #attrs (%A, %B, %C) { - ^bb0(%a: f32, %b: f32): - %c = addf %a, %b : f32 - return %c : f32 + linalg.generic #attrs %A, %B, %C { + ^bb0(%a: f32, %b: f32, %c: f32): + %d = addf %a, %b : f32 + linalg.yield %d : f32 }: memref, memref, memref return } ``` +This function basically element-wise adds up two matrices (`%A` and `%B`) and stores the result into another one (`%C`). + The property "*The Compute Payload is Specified With a Region*" is materialized by a lowering into a form that will resemble: ``` +// Run: mlir-opt example3.mlir -convert-linalg-to-loops +#indexing_maps = [ + affine_map<(i, j) -> (i, j)>, + affine_map<(i, j) -> (i, j)>, + affine_map<(i, j) -> (i, j)> +] +#attrs = { + args_in = 2, + args_out = 1, + indexing_maps = #indexing_maps, + iterator_types = ["parallel", "parallel"] +} func @example(%A: memref, %B: memref, %C: memref) { - %M = dim %A, 0: index - %N = dim %B, 1: index - for %i = 0 to %M { - for %j = 0 to %N { - %a = load %A[%i, %j]: memref - %b = load %B[%i, %j]: memref> - %c = addf %a, %b : f32 - store %c, %C[%i, %j]: memref - } - } + linalg.generic #attrs %A, %B, %C { + ^bb0(%a: f32, %b: f32, %c: f32): + %d = addf %a, %b : f32 + linalg.yield %d : f32 + }: memref, memref, memref return } ``` @@ -287,20 +336,27 @@ and integration at the ABI level. Regardless of whether one wants to use external library calls or a custom ISA, the problem for codegen is similar: preservation of a fixed granularity. -Consider the following, partially specified, `linalg.generic` -example: +Consider the following example that adds an additional attribute `library_call="pointwise_add"` +that specifies the name of an external library call we intend to use: ``` -#fun_attr = "pointwise_add" -#indexing_maps = { - (i, j) -> (i, j), - (i, j) -> (i, j) +// File name: example4.mlir +#indexing_maps = [ + affine_map<(i, j) -> (i, j)>, + affine_map<(i, j) -> (i, j)>, + affine_map<(i, j) -> (i, j)> +] +#attrs = { + args_in = 2, + args_out = 1, + indexing_maps = #indexing_maps, + iterator_types = ["parallel", "parallel"], + library_call = "pointwise_add" } -#attrs = {args_in: 2, args_out: 1, indexings: #indexing_maps, fun: #fun_attr} func @example(%A: memref, %B: memref, %C: memref) { - linalg.generic #attrs (%A, %B, %C) { - ^bb0(%a: f32, %b: f32): - %c = addf %a, %b : f32 - return %c : f32 + linalg.generic #attrs %A, %B, %C { + ^bb0(%a: f32, %b: f32, %c: f32): + %d = addf %a, %b : f32 + linalg.yield %d : f32 }: memref, memref, memref return } @@ -310,28 +366,39 @@ The property "*Map To an External Library Call*" is materialized by a lowering into a form that will resemble: ``` -func @pointwise_add_sxsxf32_sxsxf32(memref, memref, memref) -> () +// Run: mlir-opt example4.mlir -convert-linalg-to-std +// Note that we lower the Linalg dialect directly to the Standard dialect. 
+// See this doc: https://mlir.llvm.org/docs/Dialects/Standard/ -func @example(%A: memref, %B: memref, %C: memref) { - call @pointwise_add_sxsxf32_sxsxf32 (%A, %B, %C): - (memref, memref, memref) -> () +#map0 = affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)> + +func @example(%arg0: memref, %arg1: memref, %arg2: memref) { + %0 = memref_cast %arg0 : memref to memref + %1 = memref_cast %arg1 : memref to memref + %2 = memref_cast %arg2 : memref to memref + call @pointwise_add(%0, %1, %2) : (memref, memref, memref) -> () return } +func @pointwise_add(memref, memref, memref) attributes {llvm.emit_c_interface} ``` Which, after lowering to LLVM resembles: ``` -func @pointwise_add_sxsxf32_sxsxf32(!llvm<"{ float*, i64, [2 x i64], [3 x i64] }*">, - !llvm<"{ float*, i64, [2 x i64], [3 x i64] }*">, - !llvm<"{ float*, i64, [2 x i64], [3 x i64] }*">) -> () - -func @example(%A: !llvm<"{ float*, i64, [2 x i64], [3 x i64] }*">, - %B: !llvm<"{ float*, i64, [2 x i64], [3 x i64] }*">, - %C: !llvm<"{ float*, i64, [2 x i64], [3 x i64] }*">) { - llvm.call @pointwise_add_sxsxf32_sxsxf32 (%A, %B, %C): - (!llvm<"{ float*, i64, [2 x i64], [3 x i64] }*">...) -> () +// Run: mlir-opt example4.mlir -convert-linalg-to-std | mlir-opt -convert-std-to-llvm +// Some generated code is omitted here. +func @example(%arg0: !llvm<"float*">, ...) { + ... + llvm.call @pointwise_add(...) : (!llvm<"float*">, ...) -> () return } + +llvm.func @pointwise_add(%arg0: !llvm<"float*">, ...) attributes {llvm.emit_c_interface} { + ... + llvm.call @_mlir_ciface_pointwise_add(%9, %19, %29) : (!llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }*">, !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }*">, !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] } +*">) -> () + llvm.return +} +llvm.func @_mlir_ciface_pointwise_add(!llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }*">, !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }*">, !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }*">) attributes {llvm.emit_c_interface} ``` ##### Convention For External Library Interoperability From 1e027b77f056fb50e50d9f743e21eb41151c32da Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Tue, 28 Jul 2020 13:04:11 -0700 Subject: [PATCH 0360/1035] [llvm][NFC] refactor setBlockFrequency for clarity. The refactoring encapsulates frequency calculation in MachineBlockFrequencyInfo, and renames the API to clarify its motivation. It should clarify that frequencies may not be reset 'freely' by users of the analysis, as the API serves as a partial update to avoid a full analysis recomputation. Differential Revision: https://reviews.llvm.org/D84427 --- llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h | 6 +++++- llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp | 11 ++++++++--- llvm/lib/CodeGen/MachineSink.cpp | 8 +++----- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h b/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h index 7ce11c784b08f..6c442d3d07bdb 100644 --- a/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h +++ b/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h @@ -74,7 +74,11 @@ class MachineBlockFrequencyInfo : public MachineFunctionPass { bool isIrrLoopHeader(const MachineBasicBlock *MBB) const; - void setBlockFreq(const MachineBasicBlock *MBB, uint64_t Freq); + /// Incrementally calculate block frequencies when we split edges, to avoid + /// full CFG traversal.
+ void onEdgeSplit(const MachineBasicBlock &NewPredecessor, + const MachineBasicBlock &NewSuccessor, + const MachineBranchProbabilityInfo &MBPI); const MachineFunction *getFunction() const; const MachineBranchProbabilityInfo *getMBPI() const; diff --git a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp index c836997060149..54e0a14e05557 100644 --- a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp +++ b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp @@ -247,10 +247,15 @@ bool MachineBlockFrequencyInfo::isIrrLoopHeader( return MBFI->isIrrLoopHeader(MBB); } -void MachineBlockFrequencyInfo::setBlockFreq(const MachineBasicBlock *MBB, - uint64_t Freq) { +void MachineBlockFrequencyInfo::onEdgeSplit( + const MachineBasicBlock &NewPredecessor, + const MachineBasicBlock &NewSuccessor, + const MachineBranchProbabilityInfo &MBPI) { assert(MBFI && "Expected analysis to be available"); - MBFI->setBlockFreq(MBB, Freq); + auto NewSuccFreq = MBFI->getBlockFreq(&NewPredecessor) * + MBPI.getEdgeProbability(&NewPredecessor, &NewSuccessor); + + MBFI->setBlockFreq(&NewSuccessor, NewSuccFreq.getFrequency()); } const MachineFunction *MachineBlockFrequencyInfo::getFunction() const { diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index 5f958bbc31b7a..dfb88f6bf5170 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -347,11 +347,9 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) { << printMBBReference(*Pair.first) << " -- " << printMBBReference(*NewSucc) << " -- " << printMBBReference(*Pair.second) << '\n'); - if (MBFI) { - auto NewSuccFreq = MBFI->getBlockFreq(Pair.first) * - MBPI->getEdgeProbability(Pair.first, NewSucc); - MBFI->setBlockFreq(NewSucc, NewSuccFreq.getFrequency()); - } + if (MBFI) + MBFI->onEdgeSplit(*Pair.first, *NewSucc, *MBPI); + MadeChange = true; ++NumSplit; } else From 1f166edeb47ea3584f4f6a267a9054af994af45c Mon Sep 17 00:00:00 2001 From: Hafiz Abid Qadeer Date: Thu, 16 Jul 2020 21:40:31 +0100 Subject: [PATCH 0361/1035] [lld][linkerscript] Fix handling of DEFINED. The current implementation did not check that the symbol is actually defined; it only checked for presence. GNU ld documentation says, "Return 1 if symbol is in the linker global symbol table and is defined before the statement using DEFINED in the script, otherwise return 0." https://sourceware.org/binutils/docs/ld/Builtin-Functions.html#Builtin-Functions Reviewed By: MaskRay Differential Revision: https://reviews.llvm.org/D83758 --- lld/ELF/ScriptParser.cpp | 5 ++++- lld/test/ELF/linkerscript/Inputs/define.s | 3 +++ lld/test/ELF/linkerscript/define.test | 6 +++++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index 17ac7ff6d5f4a..eae1d17b2f43e 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -1311,7 +1311,10 @@ Expr ScriptParser::readPrimary() { } if (tok == "DEFINED") { StringRef name = readParenLiteral(); - return [=] { return symtab->find(name) ? 1 : 0; }; + return [=] { + Symbol *b = symtab->find(name); + return (b && b->isDefined()) ?
1 : 0; + }; } if (tok == "LENGTH") { StringRef name = readParenLiteral(); diff --git a/lld/test/ELF/linkerscript/Inputs/define.s b/lld/test/ELF/linkerscript/Inputs/define.s index bc60a233dcb4b..69f47a8803164 100644 --- a/lld/test/ELF/linkerscript/Inputs/define.s +++ b/lld/test/ELF/linkerscript/Inputs/define.s @@ -6,3 +6,6 @@ defined = 0 .section .bar,"a" .quad 1 + +.section .test,"a" +.quad 1 diff --git a/lld/test/ELF/linkerscript/define.test b/lld/test/ELF/linkerscript/define.test index 689476ba32ad0..3ecaa11cc5b69 100644 --- a/lld/test/ELF/linkerscript/define.test +++ b/lld/test/ELF/linkerscript/define.test @@ -3,13 +3,17 @@ # RUN: ld.lld -o %t --script %s %t.o # RUN: llvm-objdump --section-headers %t | FileCheck %s +EXTERN(extern_defined) SECTIONS { . = DEFINED(defined) ? 0x11000 : .; .foo : { *(.foo*) } . = DEFINED(notdefined) ? 0x12000 : 0x13000; .bar : { *(.bar*) } + . = DEFINED(extern_defined) ? 0x14000 : 0x15000; + .test : { *(.test*) } } # CHECK: 1 .foo 00000008 0000000000011000 DATA # CHECK: 2 .bar 00000008 0000000000013000 DATA -# CHECK: 3 .text 00000000 0000000000013008 TEXT +# CHECK: 3 .test 00000008 0000000000015000 DATA +# CHECK: 4 .text 00000000 0000000000015008 TEXT From 811828a01d64868329de279a7117e12d10e9da2a Mon Sep 17 00:00:00 2001 From: Zequan Wu Date: Wed, 22 Jul 2020 19:04:59 -0700 Subject: [PATCH 0362/1035] Reland [Coverage] Add comment to skipped regions Bug filed here: https://bugs.llvm.org/show_bug.cgi?id=45757. Add comment to skipped regions so we don't track execution count for lines containing only comments. Differential Revision: https://reviews.llvm.org/D83592 --- clang/include/clang/Lex/Preprocessor.h | 5 ++ clang/lib/CodeGen/CodeGenAction.cpp | 8 +-- clang/lib/CodeGen/CoverageMappingGen.cpp | 69 ++++++++++++++++-- clang/lib/CodeGen/CoverageMappingGen.h | 34 ++++++++- clang/lib/Lex/Preprocessor.cpp | 6 +- clang/test/CoverageMapping/break.c | 3 +- clang/test/CoverageMapping/builtinmacro.c | 3 +- clang/test/CoverageMapping/classtemplate.cpp | 3 +- clang/test/CoverageMapping/comment-in-macro.c | 7 +- clang/test/CoverageMapping/continue.c | 3 +- clang/test/CoverageMapping/coroutine.cpp | 3 +- .../test/CoverageMapping/deferred-region.cpp | 3 +- clang/test/CoverageMapping/if.cpp | 3 +- clang/test/CoverageMapping/includehell.cpp | 2 + clang/test/CoverageMapping/label.cpp | 5 +- clang/test/CoverageMapping/logical.cpp | 3 +- clang/test/CoverageMapping/loops.cpp | 5 +- .../CoverageMapping/macro-expressions.cpp | 4 +- clang/test/CoverageMapping/macroparams2.c | 4 +- clang/test/CoverageMapping/macros.c | 4 +- clang/test/CoverageMapping/macroscopes.cpp | 4 +- clang/test/CoverageMapping/moremacros.c | 4 +- clang/test/CoverageMapping/objc.m | 3 +- clang/test/CoverageMapping/pr32679.cpp | 5 +- clang/test/CoverageMapping/preprocessor.c | 11 +-- clang/test/CoverageMapping/return.c | 3 +- clang/test/CoverageMapping/switch.cpp | 5 +- clang/test/CoverageMapping/switchmacro.c | 3 +- clang/test/CoverageMapping/test.c | 3 +- clang/test/CoverageMapping/trycatch.cpp | 3 +- .../test/CoverageMapping/unreachable-macro.c | 4 +- clang/test/CoverageMapping/while.c | 5 +- clang/test/lit.cfg.py | 5 ++ .../test/profile/Inputs/instrprof-comdat.h | 2 +- .../test/profile/coverage_comments.cpp | 71 +++++++++++++++++++ .../instrprof-set-file-object-merging.c | 2 +- 36 files changed, 250 insertions(+), 60 deletions(-) create mode 100644 compiler-rt/test/profile/coverage_comments.cpp diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index
5cd017fa925fd..b0dd363555ab1 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -419,6 +419,9 @@ class Preprocessor { /// The number of (LexLevel 0) preprocessor tokens. unsigned TokenCount = 0; + /// Preprocess every token regardless of LexLevel. + bool PreprocessToken = false; + /// The maximum number of (LexLevel 0) tokens before issuing a -Wmax-tokens /// warning, or zero for unlimited. unsigned MaxTokens = 0; @@ -1038,6 +1041,8 @@ class Preprocessor { OnToken = std::move(F); } + void setPreprocessToken(bool Preprocess) { PreprocessToken = Preprocess; } + bool isMacroDefined(StringRef Id) { return isMacroDefined(&Identifiers.get(Id)); } diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp index 55925110708ec..5a6ce0f5dbd50 100644 --- a/clang/lib/CodeGen/CodeGenAction.cpp +++ b/clang/lib/CodeGen/CodeGenAction.cpp @@ -990,11 +990,9 @@ CodeGenAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) { CoverageSourceInfo *CoverageInfo = nullptr; // Add the preprocessor callback only when the coverage mapping is generated. - if (CI.getCodeGenOpts().CoverageMapping) { - CoverageInfo = new CoverageSourceInfo; - CI.getPreprocessor().addPPCallbacks( - std::unique_ptr(CoverageInfo)); - } + if (CI.getCodeGenOpts().CoverageMapping) + CoverageInfo = CodeGen::CoverageMappingModuleGen::setUpCoverageCallbacks( + CI.getPreprocessor()); std::unique_ptr Result(new BackendConsumer( BA, CI.getDiagnostics(), CI.getHeaderSearchOpts(), diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp index 78b268f423cbf..9a7096b8d1d02 100644 --- a/clang/lib/CodeGen/CoverageMappingGen.cpp +++ b/clang/lib/CodeGen/CoverageMappingGen.cpp @@ -35,8 +35,35 @@ using namespace clang; using namespace CodeGen; using namespace llvm::coverage; +CoverageSourceInfo * +CoverageMappingModuleGen::setUpCoverageCallbacks(Preprocessor &PP) { + CoverageSourceInfo *CoverageInfo = new CoverageSourceInfo(); + PP.addPPCallbacks(std::unique_ptr(CoverageInfo)); + PP.addCommentHandler(CoverageInfo); + PP.setPreprocessToken(true); + PP.setTokenWatcher([CoverageInfo](clang::Token Tok) { + // Update previous token location. + CoverageInfo->PrevTokLoc = Tok.getLocation(); + CoverageInfo->updateNextTokLoc(Tok.getLocation()); + }); + return CoverageInfo; +} + void CoverageSourceInfo::SourceRangeSkipped(SourceRange Range, SourceLocation) { - SkippedRanges.push_back(Range); + SkippedRanges.push_back({Range}); +} + +bool CoverageSourceInfo::HandleComment(Preprocessor &PP, SourceRange Range) { + SkippedRanges.push_back({Range, PrevTokLoc}); + AfterComment = true; + return false; +} + +void CoverageSourceInfo::updateNextTokLoc(SourceLocation Loc) { + if (AfterComment) { + SkippedRanges.back().NextTokLoc = Loc; + AfterComment = false; + } } namespace { @@ -274,8 +301,34 @@ class CoverageMappingBuilder { return None; } + /// This shrinks the skipped range if it spans a line that contains a + /// non-comment token. If shrinking the skipped range would make it empty, + /// this returns None. + Optional adjustSkippedRange(SourceManager &SM, + SpellingRegion SR, + SourceLocation PrevTokLoc, + SourceLocation NextTokLoc) { + // If Range begin location is invalid, it's not a comment region. 
+ if (PrevTokLoc.isInvalid()) + return SR; + unsigned PrevTokLine = SM.getSpellingLineNumber(PrevTokLoc); + unsigned NextTokLine = SM.getSpellingLineNumber(NextTokLoc); + SpellingRegion newSR(SR); + if (SR.LineStart == PrevTokLine) { + newSR.LineStart = SR.LineStart + 1; + newSR.ColumnStart = 1; + } + if (SR.LineEnd == NextTokLine) { + newSR.LineEnd = SR.LineEnd - 1; + newSR.ColumnEnd = SR.ColumnStart + 1; + } + if (newSR.isInSourceOrder()) + return newSR; + return None; + } + /// Gather all the regions that were skipped by the preprocessor - /// using the constructs like #if. + /// using the constructs like #if or comments. void gatherSkippedRegions() { /// An array of the minimum lineStarts and the maximum lineEnds /// for mapping regions from the appropriate source files. @@ -291,9 +344,10 @@ class CoverageMappingBuilder { } auto SkippedRanges = CVM.getSourceInfo().getSkippedRanges(); - for (const auto &I : SkippedRanges) { - auto LocStart = I.getBegin(); - auto LocEnd = I.getEnd(); + for (auto &I : SkippedRanges) { + SourceRange Range = I.Range; + auto LocStart = Range.getBegin(); + auto LocEnd = Range.getEnd(); assert(SM.isWrittenInSameFile(LocStart, LocEnd) && "region spans multiple files"); @@ -301,6 +355,11 @@ class CoverageMappingBuilder { if (!CovFileID) continue; SpellingRegion SR{SM, LocStart, LocEnd}; + if (Optional res = + adjustSkippedRange(SM, SR, I.PrevTokLoc, I.NextTokLoc)) + SR = res.getValue(); + else + continue; auto Region = CounterMappingRegion::makeSkipped( *CovFileID, SR.LineStart, SR.ColumnStart, SR.LineEnd, SR.ColumnEnd); // Make sure that we only collect the regions that are inside diff --git a/clang/lib/CodeGen/CoverageMappingGen.h b/clang/lib/CodeGen/CoverageMappingGen.h index 5d79d1e656703..d3eec226d67ca 100644 --- a/clang/lib/CodeGen/CoverageMappingGen.h +++ b/clang/lib/CodeGen/CoverageMappingGen.h @@ -16,6 +16,7 @@ #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" #include "clang/Lex/PPCallbacks.h" +#include "clang/Lex/Preprocessor.h" #include "llvm/ADT/DenseMap.h" #include "llvm/IR/GlobalValue.h" #include "llvm/Support/raw_ostream.h" @@ -29,15 +30,40 @@ class Preprocessor; class Decl; class Stmt; +struct SkippedRange { + SourceRange Range; + // The location of token before the skipped source range. + SourceLocation PrevTokLoc; + // The location of token after the skipped source range. + SourceLocation NextTokLoc; + + SkippedRange(SourceRange Range, SourceLocation PrevTokLoc = SourceLocation(), + SourceLocation NextTokLoc = SourceLocation()) + : Range(Range), PrevTokLoc(PrevTokLoc), NextTokLoc(NextTokLoc) {} +}; + /// Stores additional source code information like skipped ranges which /// is required by the coverage mapping generator and is obtained from /// the preprocessor. -class CoverageSourceInfo : public PPCallbacks { - std::vector SkippedRanges; +class CoverageSourceInfo : public PPCallbacks, public CommentHandler { + // A vector of skipped source ranges and PrevTokLoc with NextTokLoc. + std::vector SkippedRanges; + bool AfterComment = false; + public: - ArrayRef getSkippedRanges() const { return SkippedRanges; } + // Location of the token parsed before HandleComment is called. This is + // updated every time Preprocessor::Lex lexes a new token. + SourceLocation PrevTokLoc; + // The location of token before comment. 
+ SourceLocation BeforeCommentLoc; + + std::vector &getSkippedRanges() { return SkippedRanges; } void SourceRangeSkipped(SourceRange Range, SourceLocation EndifLoc) override; + + bool HandleComment(Preprocessor &PP, SourceRange Range) override; + + void updateNextTokLoc(SourceLocation Loc); }; namespace CodeGen { @@ -66,6 +92,8 @@ class CoverageMappingModuleGen { uint64_t FilenamesRef); public: + static CoverageSourceInfo *setUpCoverageCallbacks(Preprocessor &PP); + CoverageMappingModuleGen(CodeGenModule &CGM, CoverageSourceInfo &SourceInfo) : CGM(CGM), SourceInfo(SourceInfo) {} diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index 160e2b6ed8846..58c28cea30a18 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -969,8 +969,10 @@ void Preprocessor::Lex(Token &Result) { LastTokenWasAt = Result.is(tok::at); --LexLevel; - if (LexLevel == 0 && !Result.getFlag(Token::IsReinjected)) { - ++TokenCount; + if ((LexLevel == 0 || PreprocessToken) && + !Result.getFlag(Token::IsReinjected)) { + if (LexLevel == 0) + ++TokenCount; if (OnToken) OnToken(Result); } diff --git a/clang/test/CoverageMapping/break.c b/clang/test/CoverageMapping/break.c index 08461d7ed2de2..191f34765a65b 100644 --- a/clang/test/CoverageMapping/break.c +++ b/clang/test/CoverageMapping/break.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name break.c %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name break.c %t.stripped.c | FileCheck %s int main() { // CHECK: File 0, [[@LINE]]:12 -> {{[0-9]+}}:2 = #0 int cnt = 0; // CHECK-NEXT: File 0, [[@LINE+1]]:9 -> [[@LINE+1]]:18 = #0 diff --git a/clang/test/CoverageMapping/builtinmacro.c b/clang/test/CoverageMapping/builtinmacro.c index 63f5584d40c2c..0de0e8e6a1252 100644 --- a/clang/test/CoverageMapping/builtinmacro.c +++ b/clang/test/CoverageMapping/builtinmacro.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name builtinmacro.c %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name builtinmacro.c %t.stripped.c | FileCheck %s // Test the coverage mapping generation for built-in macroes. 
diff --git a/clang/test/CoverageMapping/classtemplate.cpp b/clang/test/CoverageMapping/classtemplate.cpp index 0dbb0c0ede841..9250069e04f65 100644 --- a/clang/test/CoverageMapping/classtemplate.cpp +++ b/clang/test/CoverageMapping/classtemplate.cpp @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -triple %itanium_abi_triple -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name classtemplate.cpp %s > %tmapping +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -triple %itanium_abi_triple -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name classtemplate.cpp %t.stripped.cpp > %tmapping // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-CONSTRUCTOR // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-GETTER // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-SETTER diff --git a/clang/test/CoverageMapping/comment-in-macro.c b/clang/test/CoverageMapping/comment-in-macro.c index 06e8adbc41ee2..86b554f8bd50c 100644 --- a/clang/test/CoverageMapping/comment-in-macro.c +++ b/clang/test/CoverageMapping/comment-in-macro.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %t.stripped.c | FileCheck %s #define x1 "" // ... #define x2 return 0 @@ -7,5 +8,5 @@ int main() { // CHECK-NEXT: File 0, [[@LINE]]:12 -> [[@LINE+3]]:2 = #0 x1; // CHECK-NEXT: Expansion,File 0, [[@LINE]]:3 -> [[@LINE]]:5 = #0 x2; // CHECK-NEXT: Expansion,File 0, [[@LINE]]:3 -> [[@LINE]]:5 = #0 } -// CHECK-NEXT: File 1, 3:12 -> 3:14 = #0 -// CHECK-NEXT: File 2, 4:12 -> 4:20 = #0 +// CHECK-NEXT: File 1, 4:12 -> 4:14 = #0 +// CHECK-NEXT: File 2, 5:12 -> 5:20 = #0 diff --git a/clang/test/CoverageMapping/continue.c b/clang/test/CoverageMapping/continue.c index 9864c912f239c..774fe2ee6fffd 100644 --- a/clang/test/CoverageMapping/continue.c +++ b/clang/test/CoverageMapping/continue.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name continue.c %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name continue.c %t.stripped.c | FileCheck %s int main() { // CHECK: File 0, [[@LINE]]:12 -> [[@LINE+21]]:2 = #0 int j = 0; // CHECK-NEXT: File 0, [[@LINE+2]]:18 -> [[@LINE+2]]:24 = (#0 + #1) diff --git a/clang/test/CoverageMapping/coroutine.cpp b/clang/test/CoverageMapping/coroutine.cpp index dc9473348fc90..a614e6e8ea402 100644 --- a/clang/test/CoverageMapping/coroutine.cpp +++ b/clang/test/CoverageMapping/coroutine.cpp @@ -1,6 +1,7 @@ // fixme: the following line is added to cleanup bots, will be removed in weeks. 
// RUN: rm -f %S/coroutine.ll -// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fcoroutines-ts -std=c++14 -emit-llvm -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping %s -o - | FileCheck %s +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fcoroutines-ts -std=c++14 -emit-llvm -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping %t.stripped.cpp -o - | FileCheck %s namespace std::experimental { template diff --git a/clang/test/CoverageMapping/deferred-region.cpp b/clang/test/CoverageMapping/deferred-region.cpp index 45113e46f02b2..8db6119472dbc 100644 --- a/clang/test/CoverageMapping/deferred-region.cpp +++ b/clang/test/CoverageMapping/deferred-region.cpp @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -fexceptions -fcxx-exceptions -emit-llvm-only -triple %itanium_abi_triple -main-file-name deferred-region.cpp -I %S/Inputs %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -fexceptions -fcxx-exceptions -emit-llvm-only -triple %itanium_abi_triple -main-file-name deferred-region.cpp -I %S/Inputs %t.stripped.cpp | FileCheck %s #define IF if #define STMT(S) S diff --git a/clang/test/CoverageMapping/if.cpp b/clang/test/CoverageMapping/if.cpp index e3d6f4e25e573..8ffc09d29a3c7 100644 --- a/clang/test/CoverageMapping/if.cpp +++ b/clang/test/CoverageMapping/if.cpp @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -std=c++1z -triple %itanium_abi_triple -main-file-name if.cpp %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -std=c++1z -triple %itanium_abi_triple -main-file-name if.cpp %t.stripped.cpp | FileCheck %s int nop() { return 0; } diff --git a/clang/test/CoverageMapping/includehell.cpp b/clang/test/CoverageMapping/includehell.cpp index fd08d6af7f3da..c92f12e5e80d0 100644 --- a/clang/test/CoverageMapping/includehell.cpp +++ b/clang/test/CoverageMapping/includehell.cpp @@ -51,6 +51,7 @@ int main() { // CHECK-START: File [[START3]], 4:29 -> 5:1 = #9 // CHECK-CODE: File [[CODE1:[0-9]]], 1:1 -> 14:1 = #1 +// CHECK-CODE: Skipped,File [[CODE1]], 1:1 -> 1:41 = 0 // CHECK-CODE-NEXT: File [[CODE1]], 4:5 -> 4:11 = #1 // CHECK-CODE: File [[CODE1]], 4:13 -> 6:2 = #2 // CHECK-CODE: File [[CODE1]], 6:8 -> 8:2 = (#1 - #2) @@ -58,6 +59,7 @@ int main() { // CHECK-CODE: File [[CODE1]], 9:11 -> 11:2 = #3 // CHECK-CODE: File [[CODE1]], 11:8 -> 13:2 = (#1 - #3) // CHECK-CODE: File [[CODE2:[0-9]]], 1:1 -> 14:1 = #5 +// CHECK-CODE: Skipped,File [[CODE2]], 1:1 -> 1:41 = 0 // CHECK-CODE-NEXT: File [[CODE2]], 4:5 -> 4:11 = #5 // CHECK-CODE: File [[CODE2]], 4:13 -> 6:2 = #6 // CHECK-CODE: File [[CODE2]], 6:8 -> 8:2 = (#5 - #6) diff --git a/clang/test/CoverageMapping/label.cpp b/clang/test/CoverageMapping/label.cpp index 995835ad6b2ff..e77372d325e5a 100644 --- a/clang/test/CoverageMapping/label.cpp +++ b/clang/test/CoverageMapping/label.cpp @@ -1,6 +1,7 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name label.cpp %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name label.cpp 
%t.stripped.cpp | FileCheck %s - // CHECK: func +// CHECK: func void func() { // CHECK-NEXT: File 0, [[@LINE]]:13 -> {{[0-9]+}}:2 = #0 int i = 0; // CHECK-NEXT: File 0, [[@LINE+2]]:14 -> [[@LINE+2]]:20 = (#0 + #3) // CHECK-NEXT: File 0, [[@LINE+1]]:22 -> [[@LINE+1]]:25 = #3 diff --git a/clang/test/CoverageMapping/logical.cpp b/clang/test/CoverageMapping/logical.cpp index bc7c785b7b550..cca03377c98a4 100644 --- a/clang/test/CoverageMapping/logical.cpp +++ b/clang/test/CoverageMapping/logical.cpp @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name logical.cpp %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name logical.cpp %t.stripped.cpp | FileCheck %s int main() { // CHECK: File 0, [[@LINE]]:12 -> [[@LINE+15]]:2 = #0 bool bt = true; diff --git a/clang/test/CoverageMapping/loops.cpp b/clang/test/CoverageMapping/loops.cpp index ff7aafd66d944..498d214e69d9c 100644 --- a/clang/test/CoverageMapping/loops.cpp +++ b/clang/test/CoverageMapping/loops.cpp @@ -1,6 +1,7 @@ -// RUN: %clang_cc1 -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loops.cpp %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loops.cpp %t.stripped.cpp | FileCheck %s - // CHECK: rangedFor +// CHECK: rangedFor void rangedFor() { // CHECK-NEXT: File 0, [[@LINE]]:18 -> {{[0-9]+}}:2 = #0 int arr[] = { 1, 2, 3, 4, 5 }; int sum = 0; // CHECK: Gap,File 0, [[@LINE+1]]:20 -> [[@LINE+1]]:21 = #1 diff --git a/clang/test/CoverageMapping/macro-expressions.cpp b/clang/test/CoverageMapping/macro-expressions.cpp index 26d70c67fca08..60afc5238b9eb 100644 --- a/clang/test/CoverageMapping/macro-expressions.cpp +++ b/clang/test/CoverageMapping/macro-expressions.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expressions.cpp -w %s | FileCheck %s - +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expressions.cpp -w %t.stripped.cpp | FileCheck %s #define EXPR(x) (x) #define NEXPR(x) (!x) #define DECL(T, x) T x diff --git a/clang/test/CoverageMapping/macroparams2.c b/clang/test/CoverageMapping/macroparams2.c index 4e04581b725e8..30ce25d9accda 100644 --- a/clang/test/CoverageMapping/macroparams2.c +++ b/clang/test/CoverageMapping/macroparams2.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroparams2.c %s | FileCheck %s - +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroparams2.c %t.stripped.c | FileCheck %s #define MACRO(REFS, CALLS) (4 * (CALLS) < (REFS)) struct S { diff --git a/clang/test/CoverageMapping/macros.c b/clang/test/CoverageMapping/macros.c index 39cd190b2a882..83e2029be5612 100644 --- a/clang/test/CoverageMapping/macros.c +++ b/clang/test/CoverageMapping/macros.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only 
-main-file-name macros.c %s | FileCheck %s - +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macros.c %t.stripped.c | FileCheck %s #define MACRO return; bar() #define MACRO_2 bar() #define MACRO_1 return; MACRO_2 diff --git a/clang/test/CoverageMapping/macroscopes.cpp b/clang/test/CoverageMapping/macroscopes.cpp index 3f5f65e5ad7b5..62f5dbe77981f 100644 --- a/clang/test/CoverageMapping/macroscopes.cpp +++ b/clang/test/CoverageMapping/macroscopes.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroscopes.cpp %s | FileCheck %s - +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroscopes.cpp %t.stripped.cpp | FileCheck %s #define starts_a_scope for (int i = 0; i < 2; ++i) { #define ends_a_scope \ diff --git a/clang/test/CoverageMapping/moremacros.c b/clang/test/CoverageMapping/moremacros.c index 5e0d4a6abf257..ed89dcafd6723 100644 --- a/clang/test/CoverageMapping/moremacros.c +++ b/clang/test/CoverageMapping/moremacros.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expansion.c %s | FileCheck %s - +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expansion.c %t.stripped.c | FileCheck %s #define LBRAC { #define RBRAC } diff --git a/clang/test/CoverageMapping/objc.m b/clang/test/CoverageMapping/objc.m index 4e4c184f0a887..008d291aee31a 100644 --- a/clang/test/CoverageMapping/objc.m +++ b/clang/test/CoverageMapping/objc.m @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name objc.m -triple x86_64-apple-darwin -fobjc-runtime=macosx-fragile-10.5 -w %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.m +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name objc.m -triple x86_64-apple-darwin -fobjc-runtime=macosx-fragile-10.5 -w %t.stripped.m | FileCheck %s @interface A - (void)bork:(int)msg; diff --git a/clang/test/CoverageMapping/pr32679.cpp b/clang/test/CoverageMapping/pr32679.cpp index eac3afb6efb6c..c93133f8c2f2d 100644 --- a/clang/test/CoverageMapping/pr32679.cpp +++ b/clang/test/CoverageMapping/pr32679.cpp @@ -1,5 +1,6 @@ -// RUN: %clang_cc1 -cc1 -triple i686-pc-windows-msvc19.0.0 -emit-obj -fprofile-instrument=clang -std=c++14 -fdelayed-template-parsing -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name pr32679.cpp -o - %s | FileCheck %s -check-prefix=MSABI -implicit-check-not=f2 -// RUN: %clang_cc1 -cc1 -triple %itanium_abi_triple -emit-obj -fprofile-instrument=clang -std=c++14 -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name pr32679.cpp -o - %s | FileCheck %s -check-prefix=ITANIUM -implicit-check-not=f2 +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -cc1 -triple i686-pc-windows-msvc19.0.0 -emit-obj -fprofile-instrument=clang -std=c++14 -fdelayed-template-parsing -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name pr32679.cpp -o - %t.stripped.cpp | FileCheck %s -check-prefix=MSABI -implicit-check-not=f2 +// RUN: %clang_cc1 
-cc1 -triple %itanium_abi_triple -emit-obj -fprofile-instrument=clang -std=c++14 -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name pr32679.cpp -o - %t.stripped.cpp | FileCheck %s -check-prefix=ITANIUM -implicit-check-not=f2 template struct CreateSpecialization; diff --git a/clang/test/CoverageMapping/preprocessor.c b/clang/test/CoverageMapping/preprocessor.c index 9225c9f162a20..b68aa2e0e93c6 100644 --- a/clang/test/CoverageMapping/preprocessor.c +++ b/clang/test/CoverageMapping/preprocessor.c @@ -1,6 +1,7 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name preprocessor.c %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name preprocessor.c %t.stripped.c | FileCheck %s - // CHECK: func +// CHECK: func void func() { // CHECK: File 0, [[@LINE]]:13 -> [[@LINE+5]]:2 = #0 int i = 0; #ifdef MACRO // CHECK-NEXT: Skipped,File 0, [[@LINE]]:1 -> [[@LINE+2]]:7 = 0 @@ -11,7 +12,7 @@ void func() { // CHECK: File 0, [[@LINE]]:13 -> [[@LINE+5]]:2 = #0 // CHECK: main int main() { // CHECK-NEXT: File 0, [[@LINE]]:12 -> {{[0-9]+}}:2 = #0 int i = 0; -# if 0 // CHECK-NEXT: Skipped,File 0, [[@LINE]]:1 -> [[@LINE+4]]:29 = 0 +#if 0 // CHECK-NEXT: Skipped,File 0, [[@LINE]]:1 -> [[@LINE+4]]:9 = 0 if(i == 0) { i = 1; } @@ -29,7 +30,7 @@ int main() { // CHECK-NEXT: File 0, [[@LINE]]:12 -> {{[0-9]+}}:2 = #0 } #endif - // CHECK-NEXT: Skipped,File 0, [[@LINE+1]]:1 -> [[@LINE+4]]:24 + // CHECK-NEXT: Skipped,File 0, [[@LINE+1]]:1 -> [[@LINE+4]]:8 #\ if 0 #\ @@ -59,7 +60,7 @@ int main() { // CHECK-NEXT: File 0, [[@LINE]]:12 -> {{[0-9]+}}:2 = #0 #\ endif - // CHECK-NEXT: Skipped,File 0, [[@LINE+1]]:1 -> [[@LINE+6]]:26 + // CHECK-NEXT: Skipped,File 0, [[@LINE+1]]:1 -> [[@LINE+6]]:10 #\ ifdef NOT_DEFINED #\ diff --git a/clang/test/CoverageMapping/return.c b/clang/test/CoverageMapping/return.c index 440acb569b8fa..fae2e9f761b7a 100644 --- a/clang/test/CoverageMapping/return.c +++ b/clang/test/CoverageMapping/return.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name return.c %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name return.c %t.stripped.c | FileCheck %s // CHECK: func void func() { // CHECK: File 0, [[@LINE]]:13 -> [[@LINE+3]]:2 = #0 diff --git a/clang/test/CoverageMapping/switch.cpp b/clang/test/CoverageMapping/switch.cpp index 25ea4053f4e2c..1a8db09b3ff14 100644 --- a/clang/test/CoverageMapping/switch.cpp +++ b/clang/test/CoverageMapping/switch.cpp @@ -1,6 +1,7 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -std=c++1z -triple %itanium_abi_triple -main-file-name switch.cpp %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -std=c++1z -triple %itanium_abi_triple -main-file-name switch.cpp %t.stripped.cpp | FileCheck %s - // CHECK: foo +// CHECK: foo void foo(int i) { // CHECK-NEXT: File 0, [[@LINE]]:17 -> [[@LINE+8]]:2 = #0 switch(i) { // CHECK-NEXT: Gap,File 0, [[@LINE]]:13 -> [[@LINE+4]]:10 = 0 case 1: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:11 = #2 diff --git a/clang/test/CoverageMapping/switchmacro.c 
b/clang/test/CoverageMapping/switchmacro.c index fc0392fb91e53..5c6a37e77f944 100644 --- a/clang/test/CoverageMapping/switchmacro.c +++ b/clang/test/CoverageMapping/switchmacro.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name switchmacro.c %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name switchmacro.c %t.stripped.c | FileCheck %s #define FOO(x) (void)x diff --git a/clang/test/CoverageMapping/test.c b/clang/test/CoverageMapping/test.c index ae73fcb3bbab2..559036a3be9be 100644 --- a/clang/test/CoverageMapping/test.c +++ b/clang/test/CoverageMapping/test.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name test.c %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name test.c %t.stripped.c | FileCheck %s void bar(); static void static_func(); diff --git a/clang/test/CoverageMapping/trycatch.cpp b/clang/test/CoverageMapping/trycatch.cpp index 5d284daaca017..ba1b26b7acee7 100644 --- a/clang/test/CoverageMapping/trycatch.cpp +++ b/clang/test/CoverageMapping/trycatch.cpp @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fexceptions -fcxx-exceptions -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name trycatch.cpp %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fexceptions -fcxx-exceptions -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name trycatch.cpp %t.stripped.cpp | FileCheck %s class Error { }; diff --git a/clang/test/CoverageMapping/unreachable-macro.c b/clang/test/CoverageMapping/unreachable-macro.c index b9d4f3616ffa5..b84acca33f938 100644 --- a/clang/test/CoverageMapping/unreachable-macro.c +++ b/clang/test/CoverageMapping/unreachable-macro.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s - +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %t.stripped.c | FileCheck %s #define WHILE while (0) {} // CHECK: counters_in_macro_following_unreachable diff --git a/clang/test/CoverageMapping/while.c b/clang/test/CoverageMapping/while.c index 616ecf69020d7..e0c3ca35537f8 100644 --- a/clang/test/CoverageMapping/while.c +++ b/clang/test/CoverageMapping/while.c @@ -1,6 +1,7 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loops.cpp %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loops.cpp %t.stripped.c | FileCheck %s - // CHECK: main +// CHECK: main int main() { // CHECK-NEXT: File 0, [[@LINE]]:12 -> [[@LINE+8]]:2 = #0 int j = 0; // CHECK-NEXT: File 0, [[@LINE+1]]:9 -> [[@LINE+1]]:14 = (#0 + #1) while(j < 5) ++j; // CHECK-NEXT: File 0, [[@LINE]]:15 -> [[@LINE]]:16 = #1 diff --git a/clang/test/lit.cfg.py b/clang/test/lit.cfg.py index dacda6894a045..6c677eda9a093 100644 --- a/clang/test/lit.cfg.py +++ 
b/clang/test/lit.cfg.py @@ -91,6 +91,11 @@ ('%hmaptool', "'%s' %s" % (config.python_executable, os.path.join(config.clang_tools_dir, 'hmaptool')))) +# Strip C++ comments "//"" from tests +config.substitutions.append( + ('%strip_comments', "sed 's/[ \t]*\/\/.*//' %s") +) + # Plugins (loadable modules) if config.has_plugins and config.llvm_plugin_ext: config.available_features.add('plugins') diff --git a/compiler-rt/test/profile/Inputs/instrprof-comdat.h b/compiler-rt/test/profile/Inputs/instrprof-comdat.h index 61e283cc878ed..956496ee1361a 100644 --- a/compiler-rt/test/profile/Inputs/instrprof-comdat.h +++ b/compiler-rt/test/profile/Inputs/instrprof-comdat.h @@ -18,6 +18,6 @@ template T FOO::DoIt(T ti) { // HEADER: [[@LINE]]| 2|template if (I > ti / 2) // HEADER: [[@LINE]]| 20| if (I > ti t -= 1; // HEADER: [[@LINE]]| 8| t -= 1; } // HEADER: [[@LINE]]| 10| } - // HEADER: [[@LINE]]| 1| + // HEADER: [[@LINE]]| | return t; // HEADER: [[@LINE]]| 1| return t; } diff --git a/compiler-rt/test/profile/coverage_comments.cpp b/compiler-rt/test/profile/coverage_comments.cpp new file mode 100644 index 0000000000000..0cf78add7913b --- /dev/null +++ b/compiler-rt/test/profile/coverage_comments.cpp @@ -0,0 +1,71 @@ +// RUN: %clangxx_profgen -fcoverage-mapping -Wno-comment -o %t %s +// RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t +// RUN: llvm-profdata merge -o %t.profdata %t.profraw +// RUN: llvm-cov show %t -instr-profile %t.profdata -path-equivalence=/tmp,%S 2>&1 | FileCheck %s + +int main() { // CHECK: [[# @LINE]]| 1|int main() { + /* comment */ int x = 0; // CHECK-NEXT: [[# @LINE]]| 1| + int y = 0; /* comment */ // CHECK-NEXT: [[# @LINE]]| 1| + int z = 0; // comment // CHECK-NEXT: [[# @LINE]]| 1| + // comment // CHECK-NEXT: [[# @LINE]]| | + // CHECK-NEXT: [[# @LINE]]| | + x = 0; /* // CHECK-NEXT: [[# @LINE]]| 1| + comment // CHECK-NEXT: [[# @LINE]]| | + */ // CHECK-NEXT: [[# @LINE]]| | + // CHECK-NEXT: [[# @LINE]]| | + /* // CHECK-NEXT: [[# @LINE]]| | + comment // CHECK-NEXT: [[# @LINE]]| | + */ x = 0; // CHECK-NEXT: [[# @LINE]]| 1| + // CHECK-NEXT: [[# @LINE]]| | + /* comment */ // CHECK-NEXT: [[# @LINE]]| | + // comment // CHECK-NEXT: [[# @LINE]]| | + /* comment */ // CHECK-NEXT: [[# @LINE]]| | + z = // CHECK-NEXT: [[# @LINE]]| 1| + x // comment // CHECK-NEXT: [[# @LINE]]| 1| + // comment // CHECK-NEXT: [[# @LINE]]| | + + /* // CHECK-NEXT: [[# @LINE]]| 1| + comment // CHECK-NEXT: [[# @LINE]]| | + */ // CHECK-NEXT: [[# @LINE]]| | + /* // CHECK-NEXT: [[# @LINE]]| | + comment // CHECK-NEXT: [[# @LINE]]| | + */y; // CHECK-NEXT: [[# @LINE]]| 1| + // CHECK-NEXT: [[# @LINE]]| | + // Comments inside directives. 
// CHECK-NEXT: [[# @LINE]]| | + #if 0 //comment // CHECK-NEXT: [[# @LINE]]| | + /* comment */ x = 0; // CHECK-NEXT: [[# @LINE]]| | + y = 0; /* comment */ // CHECK-NEXT: [[# @LINE]]| | + z = 0; // comment // CHECK-NEXT: [[# @LINE]]| | + // comment // CHECK-NEXT: [[# @LINE]]| | + // CHECK-NEXT: [[# @LINE]]| | + x = 0; /* // CHECK-NEXT: [[# @LINE]]| | + comment // CHECK-NEXT: [[# @LINE]]| | + */ // CHECK-NEXT: [[# @LINE]]| | + // CHECK-NEXT: [[# @LINE]]| | + /* // CHECK-NEXT: [[# @LINE]]| | + comment // CHECK-NEXT: [[# @LINE]]| | + */ x = 0; // CHECK-NEXT: [[# @LINE]]| | + // CHECK-NEXT: [[# @LINE]]| | + /* comment */ // CHECK-NEXT: [[# @LINE]]| | + // comment // CHECK-NEXT: [[# @LINE]]| | + /* comment */ // CHECK-NEXT: [[# @LINE]]| | + #endif // comment // CHECK-NEXT: [[# @LINE]]| | + #if 1 // comment // CHECK-NEXT: [[# @LINE]]| 1| + /* comment */ x = 0; // CHECK-NEXT: [[# @LINE]]| 1| + y = 0; /* comment */ // CHECK-NEXT: [[# @LINE]]| 1| + z = 0; // comment // CHECK-NEXT: [[# @LINE]]| 1| + // comment // CHECK-NEXT: [[# @LINE]]| | + // CHECK-NEXT: [[# @LINE]]| | + x = 0; /* // CHECK-NEXT: [[# @LINE]]| 1| + comment // CHECK-NEXT: [[# @LINE]]| | + */ // CHECK-NEXT: [[# @LINE]]| | + // CHECK-NEXT: [[# @LINE]]| | + /* // CHECK-NEXT: [[# @LINE]]| | + comment // CHECK-NEXT: [[# @LINE]]| | + */ x = 0; // CHECK-NEXT: [[# @LINE]]| 1| + // CHECK-NEXT: [[# @LINE]]| | + /* comment */ // CHECK-NEXT: [[# @LINE]]| | + // comment // CHECK-NEXT: [[# @LINE]]| | + /* comment */ // CHECK-NEXT: [[# @LINE]]| | + #endif //comment // CHECK-NEXT: [[# @LINE]]| 1| + return 0; // CHECK-NEXT: [[# @LINE]]| 1| +} // CHECK-NEXT: [[# @LINE]]| 1| diff --git a/compiler-rt/test/profile/instrprof-set-file-object-merging.c b/compiler-rt/test/profile/instrprof-set-file-object-merging.c index 0ca5f6ff9ed95..35e9becf228f0 100644 --- a/compiler-rt/test/profile/instrprof-set-file-object-merging.c +++ b/compiler-rt/test/profile/instrprof-set-file-object-merging.c @@ -34,7 +34,7 @@ int main(int argc, const char *argv[]) { // CHECK: 17| 2| // CHECK: 18| 2| FILE *F = fopen(argv[1], "r+b"); // CHECK: 19| 2| if (!F) { -// CHECK: 20| 1| // File might not exist, try opening with truncation +// CHECK: 20| | // File might not exist, try opening with truncation // CHECK: 21| 1| F = fopen(argv[1], "w+b"); // CHECK: 22| 1| } // CHECK: 23| 2| __llvm_profile_set_file_object(F, 1); From 740a164dec483225cbd02ab6c82199e2747ffacb Mon Sep 17 00:00:00 2001 From: Richard Smith Date: Tue, 28 Jul 2020 12:09:16 -0700 Subject: [PATCH 0363/1035] PR46377: Fix dependence calculation for function types and typedef types. We previously did not treat a function type as dependent if it had a parameter pack with a non-dependent type -- such a function type depends on the arity of the pack so is dependent even though none of the parameter types is dependent. In order to properly handle this, we now treat pack expansion types as always being dependent types (depending on at least the pack arity), and always canonically being pack expansion types, even in the unusual case when the pattern is not a dependent type. This does mean that we can have canonical types that are pack expansions that contain no unexpanded packs, which is unfortunate but not inaccurate. We also previously did not treat a typedef type as instantiation-dependent if its canonical type was not instantiation-dependent. That's wrong because instantiation-dependence is a property of the type sugar, not of the type; an instantiation-dependent type can have a non-instantiation-dependent canonical type. 
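To make the function-type point concrete, here is a minimal sketch (illustrative only, not part of the patch; it mirrors the non_dependent_alias case in the test added below). Every parameter type is the non-dependent 'int *', yet the function type itself is dependent, because its arity depends on the pack Ts:

  template <typename> using IntPtr = int *;

  template <typename... Ts> auto non_dependent_alias() {
    using P = int (*)(IntPtr<Ts>...); // no parameter type is dependent,
    return P();                       // but the parameter count depends on Ts
  }

  // With Ts = {int, int}, P is int (*)(int *, int *).
  using Two = decltype(non_dependent_alias<int, int>());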
--- clang/include/clang/AST/ASTContext.h | 10 +++++- clang/include/clang/AST/Type.h | 13 +++---- clang/include/clang/Basic/TypeNodes.td | 2 +- clang/lib/AST/ASTContext.cpp | 34 +++++++------------ clang/lib/AST/Type.cpp | 9 +++-- clang/lib/CodeGen/CGDebugInfo.cpp | 1 - clang/lib/CodeGen/CodeGenFunction.cpp | 1 - clang/lib/Sema/SemaExpr.cpp | 1 - clang/lib/Sema/SemaLambda.cpp | 3 +- clang/lib/Sema/SemaTemplateVariadic.cpp | 3 +- clang/lib/Sema/SemaType.cpp | 2 +- .../alias-template-nondependent.cpp | 24 +++++++++++++ 12 files changed, 62 insertions(+), 41 deletions(-) create mode 100644 clang/test/SemaTemplate/alias-template-nondependent.cpp diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 59e2679ddded7..6c00fe86f282d 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -1459,8 +1459,16 @@ class ASTContext : public RefCountedBase { void getInjectedTemplateArgs(const TemplateParameterList *Params, SmallVectorImpl &Args); + /// Form a pack expansion type with the given pattern. + /// \param NumExpansions The number of expansions for the pack, if known. + /// \param ExpectPackInType If \c false, we should not expect \p Pattern to + /// contain an unexpanded pack. This only makes sense if the pack + /// expansion is used in a context where the arity is inferred from + /// elsewhere, such as if the pattern contains a placeholder type or + /// if this is the canonical type of another pack expansion type. QualType getPackExpansionType(QualType Pattern, - Optional NumExpansions); + Optional NumExpansions, + bool ExpectPackInType = true); QualType getObjCInterfaceType(const ObjCInterfaceDecl *Decl, ObjCInterfaceDecl *PrevDecl = nullptr) const; diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 9a745ef20fac3..7fe652492b0e0 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -4383,11 +4383,7 @@ class TypedefType : public Type { protected: friend class ASTContext; // ASTContext creates these. - TypedefType(TypeClass tc, const TypedefNameDecl *D, QualType can) - : Type(tc, can, can->getDependence() & ~TypeDependence::UnexpandedPack), - Decl(const_cast(D)) { - assert(!isa(can) && "Invalid canonical type"); - } + TypedefType(TypeClass tc, const TypedefNameDecl *D, QualType can); public: TypedefNameDecl *getDecl() const { return Decl; } @@ -5624,7 +5620,8 @@ class PackExpansionType : public Type, public llvm::FoldingSetNode { PackExpansionType(QualType Pattern, QualType Canon, Optional NumExpansions) : Type(PackExpansion, Canon, - (Pattern->getDependence() | TypeDependence::Instantiation) & + (Pattern->getDependence() | TypeDependence::Dependent | + TypeDependence::Instantiation) & ~TypeDependence::UnexpandedPack), Pattern(Pattern) { PackExpansionTypeBits.NumExpansions = @@ -5645,8 +5642,8 @@ class PackExpansionType : public Type, public llvm::FoldingSetNode { return None; } - bool isSugared() const { return !Pattern->isDependentType(); } - QualType desugar() const { return isSugared() ? 
Pattern : QualType(this, 0); } + bool isSugared() const { return false; } + QualType desugar() const { return QualType(this, 0); } void Profile(llvm::FoldingSetNodeID &ID) { Profile(ID, getPattern(), getNumExpansions()); diff --git a/clang/include/clang/Basic/TypeNodes.td b/clang/include/clang/Basic/TypeNodes.td index a4e3002b90753..011394c3ef455 100644 --- a/clang/include/clang/Basic/TypeNodes.td +++ b/clang/include/clang/Basic/TypeNodes.td @@ -100,7 +100,7 @@ def DeducedTemplateSpecializationType : TypeNode; def InjectedClassNameType : TypeNode, AlwaysDependent, LeafType; def DependentNameType : TypeNode, AlwaysDependent; def DependentTemplateSpecializationType : TypeNode, AlwaysDependent; -def PackExpansionType : TypeNode, NeverCanonicalUnlessDependent; +def PackExpansionType : TypeNode, AlwaysDependent; def ObjCTypeParamType : TypeNode, NeverCanonical; def ObjCObjectType : TypeNode; def ObjCInterfaceType : TypeNode, LeafType; diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index e7518a538fe67..25bf71519d1c3 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -4817,37 +4817,27 @@ ASTContext::getInjectedTemplateArgs(const TemplateParameterList *Params, } QualType ASTContext::getPackExpansionType(QualType Pattern, - Optional NumExpansions) { + Optional NumExpansions, + bool ExpectPackInType) { + assert((!ExpectPackInType || Pattern->containsUnexpandedParameterPack()) && + "Pack expansions must expand one or more parameter packs"); + llvm::FoldingSetNodeID ID; PackExpansionType::Profile(ID, Pattern, NumExpansions); - // A deduced type can deduce to a pack, eg - // auto ...x = some_pack; - // That declaration isn't (yet) valid, but is created as part of building an - // init-capture pack: - // [...x = some_pack] {} - assert((Pattern->containsUnexpandedParameterPack() || - Pattern->getContainedDeducedType()) && - "Pack expansions must expand one or more parameter packs"); void *InsertPos = nullptr; - PackExpansionType *T - = PackExpansionTypes.FindNodeOrInsertPos(ID, InsertPos); + PackExpansionType *T = PackExpansionTypes.FindNodeOrInsertPos(ID, InsertPos); if (T) return QualType(T, 0); QualType Canon; if (!Pattern.isCanonical()) { - Canon = getCanonicalType(Pattern); - // The canonical type might not contain an unexpanded parameter pack, if it - // contains an alias template specialization which ignores one of its - // parameters. - if (Canon->containsUnexpandedParameterPack()) { - Canon = getPackExpansionType(Canon, NumExpansions); - - // Find the insert position again, in case we inserted an element into - // PackExpansionTypes and invalidated our insert position. - PackExpansionTypes.FindNodeOrInsertPos(ID, InsertPos); - } + Canon = getPackExpansionType(getCanonicalType(Pattern), NumExpansions, + /*ExpectPackInType=*/false); + + // Find the insert position again, in case we inserted an element into + // PackExpansionTypes and invalidated our insert position. 
+ PackExpansionTypes.FindNodeOrInsertPos(ID, InsertPos); } T = new (*this, TypeAlignment) diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 0122d2e7de52d..d40ba4c648c4c 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -1187,9 +1187,6 @@ struct SimpleTransformVisitor : public TypeVisitor { T->getTypeConstraintArguments()); } - // FIXME: Non-trivial to implement, but important for C++ - SUGARED_TYPE_CLASS(PackExpansion) - QualType VisitObjCObjectType(const ObjCObjectType *T) { QualType baseType = recurse(T->getBaseType()); if (baseType.isNull()) @@ -3348,6 +3345,12 @@ void FunctionProtoType::Profile(llvm::FoldingSetNodeID &ID, getExtProtoInfo(), Ctx, isCanonicalUnqualified()); } +TypedefType::TypedefType(TypeClass tc, const TypedefNameDecl *D, QualType can) + : Type(tc, can, D->getUnderlyingType()->getDependence()), + Decl(const_cast(D)) { + assert(!isa(can) && "Invalid canonical type"); +} + QualType TypedefType::desugar() const { return getDecl()->getUnderlyingType(); } diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 6965c4a1209c2..780e0c692c051 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -3252,7 +3252,6 @@ llvm::DIType *CGDebugInfo::CreateTypeNode(QualType Ty, llvm::DIFile *Unit) { case Type::TypeOf: case Type::Decltype: case Type::UnaryTransform: - case Type::PackExpansion: break; } diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 8ce488f35dd32..8f79cc77f0e64 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -2075,7 +2075,6 @@ void CodeGenFunction::EmitVariablyModifiedType(QualType type) { case Type::UnaryTransform: case Type::Attributed: case Type::SubstTemplateTypeParm: - case Type::PackExpansion: case Type::MacroQualified: // Keep walking after single level desugaring. type = type.getSingleStepDesugaredType(getContext()); diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 21d3bbf419a9a..bb0b1fa49851d 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -4345,7 +4345,6 @@ static void captureVariablyModifiedType(ASTContext &Context, QualType T, case Type::UnaryTransform: case Type::Attributed: case Type::SubstTemplateTypeParm: - case Type::PackExpansion: case Type::MacroQualified: // Keep walking after single level desugaring. T = T.getSingleStepDesugaredType(Context); diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp index 657ed13f207ad..dc74f6e2f7dc5 100644 --- a/clang/lib/Sema/SemaLambda.cpp +++ b/clang/lib/Sema/SemaLambda.cpp @@ -803,7 +803,8 @@ QualType Sema::buildLambdaInitCaptureInitialization( Diag(EllipsisLoc, getLangOpts().CPlusPlus20 ? diag::warn_cxx17_compat_init_capture_pack : diag::ext_init_capture_pack); - DeductType = Context.getPackExpansionType(DeductType, NumExpansions); + DeductType = Context.getPackExpansionType(DeductType, NumExpansions, + /*ExpectPackInType=*/false); TLB.push(DeductType).setEllipsisLoc(EllipsisLoc); } else { // Just ignore the ellipsis for now and form a non-pack variable. 
We'll diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp index 7b77d1cb482ae..259cc51657763 100644 --- a/clang/lib/Sema/SemaTemplateVariadic.cpp +++ b/clang/lib/Sema/SemaTemplateVariadic.cpp @@ -614,7 +614,8 @@ QualType Sema::CheckPackExpansion(QualType Pattern, SourceRange PatternRange, return QualType(); } - return Context.getPackExpansionType(Pattern, NumExpansions); + return Context.getPackExpansionType(Pattern, NumExpansions, + /*ExpectPackInType=*/false); } ExprResult Sema::ActOnPackExpansion(Expr *Pattern, SourceLocation EllipsisLoc) { diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 3eabe7ca6ffe7..4c7eece68bca3 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -5516,7 +5516,7 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state, << T << D.getSourceRange(); D.setEllipsisLoc(SourceLocation()); } else { - T = Context.getPackExpansionType(T, None); + T = Context.getPackExpansionType(T, None, /*ExpectPackInType=*/false); } break; case DeclaratorContext::TemplateParamContext: diff --git a/clang/test/SemaTemplate/alias-template-nondependent.cpp b/clang/test/SemaTemplate/alias-template-nondependent.cpp new file mode 100644 index 0000000000000..e8ea16483a09f --- /dev/null +++ b/clang/test/SemaTemplate/alias-template-nondependent.cpp @@ -0,0 +1,24 @@ +// RUN: %clang_cc1 -std=c++20 -verify %s + +namespace PR46377 { + template using IntPtr = int*; + template auto non_dependent_typedef() { + typedef int(*P)(IntPtr...); + return P(); + } + template auto non_dependent_alias() { + using P = int(*)(IntPtr...); + return P(); + } + template auto non_dependent_via_sizeof() { + using P = int(*)(int(...pack)[sizeof(sizeof(T))]); // expected-error {{invalid application of 'sizeof'}} + return P(); + } + + using a = int (*)(int*, int*); + using a = decltype(non_dependent_typedef()); + using a = decltype(non_dependent_alias()); + using a = decltype(non_dependent_via_sizeof()); + + using b = decltype(non_dependent_via_sizeof()); // expected-note {{instantiation of}} +} From dd405f1a5397d3e7595458144ccb0bcec192a3bf Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 28 Jul 2020 13:26:09 -0700 Subject: [PATCH 0364/1035] Revert D83834 "Add test utility 'extract'" This reverts commit d054c7ee2e9f4f98af7f22a5b00a941eb919bd59. There are discussions about the utility name, its functionality and user interface. Revert before we reach consensus. 
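For orientation, the utility being reverted split a single test file into named parts at separator lines matching '^(.|//)--- '; a sketch of the removed idiom (adapted from the TestingGuide text deleted below; the part names are illustrative):

  ; RUN: extract b %s -o %tb.ll
  ; RUN: extract a %s | llvm-link - %tb.ll -S | FileCheck %s
  ;--- a
  ; ... IR for part a ...
  ;--- b
  ; ... IR for part b ...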
--- lld/test/CMakeLists.txt | 2 +- lld/test/ELF/linkerscript/noload.s | 19 ++- lld/test/lit.cfg.py | 8 +- llvm/docs/TestingGuide.rst | 23 +--- llvm/test/CMakeLists.txt | 1 - llvm/test/lit.cfg.py | 1 - llvm/test/tools/extract/Inputs/basic-aa.txt | 6 - llvm/test/tools/extract/Inputs/basic-bb.txt | 10 -- llvm/test/tools/extract/basic.test | 32 ----- llvm/test/tools/extract/help.test | 5 - llvm/test/tools/extract/no-leading-lines.test | 10 -- llvm/test/tools/gold/X86/multiple-sections.ll | 14 +-- .../tools/llvm-objcopy/ELF/strip-symbol.test | 19 ++- llvm/test/tools/llvm-strings/radix.test | 23 ++-- llvm/tools/extract/.clang-tidy | 19 --- llvm/tools/extract/CMakeLists.txt | 7 -- llvm/tools/extract/extract.cpp | 113 ------------------ 17 files changed, 36 insertions(+), 276 deletions(-) delete mode 100644 llvm/test/tools/extract/Inputs/basic-aa.txt delete mode 100644 llvm/test/tools/extract/Inputs/basic-bb.txt delete mode 100644 llvm/test/tools/extract/basic.test delete mode 100644 llvm/test/tools/extract/help.test delete mode 100644 llvm/test/tools/extract/no-leading-lines.test delete mode 100644 llvm/tools/extract/.clang-tidy delete mode 100644 llvm/tools/extract/CMakeLists.txt delete mode 100644 llvm/tools/extract/extract.cpp diff --git a/lld/test/CMakeLists.txt b/lld/test/CMakeLists.txt index 753eacf4d255b..e7d1133307393 100644 --- a/lld/test/CMakeLists.txt +++ b/lld/test/CMakeLists.txt @@ -25,7 +25,7 @@ configure_lit_site_cfg( set(LLD_TEST_DEPS lld) if (NOT LLD_BUILT_STANDALONE) list(APPEND LLD_TEST_DEPS - FileCheck count extract llc llvm-ar llvm-as llvm-bcanalyzer llvm-config llvm-cvtres + FileCheck count llc llvm-ar llvm-as llvm-bcanalyzer llvm-config llvm-cvtres llvm-dis llvm-dwarfdump llvm-lib llvm-lipo llvm-mc llvm-nm llvm-objcopy llvm-objdump llvm-pdbutil llvm-readelf llvm-readobj llvm-strip not obj2yaml opt yaml2obj diff --git a/lld/test/ELF/linkerscript/noload.s b/lld/test/ELF/linkerscript/noload.s index c2014722985d3..2f52b465854e2 100644 --- a/lld/test/ELF/linkerscript/noload.s +++ b/lld/test/ELF/linkerscript/noload.s @@ -1,7 +1,11 @@ # REQUIRES: x86 -# RUN: extract asm %s -o %t.s && extract lds %s -o %t.lds -# RUN: llvm-mc -filetype=obj -triple=x86_64 %t.s -o %t.o -# RUN: ld.lld -o %t --script %t.lds %t.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o +# RUN: echo "SECTIONS { \ +# RUN: .data_noload_a (NOLOAD) : { *(.data_noload_a) } \ +# RUN: .data_noload_b (0x10000) (NOLOAD) : { *(.data_noload_b) } \ +# RUN: .no_input_sec_noload (NOLOAD) : { . += 1; } \ +# RUN: .text (0x20000) : { *(.text) } };" > %t.script +# RUN: ld.lld -o %t --script %t.script %t.o # RUN: llvm-readelf -S -l %t | FileCheck %s # CHECK: Name Type Address Off Size @@ -12,7 +16,6 @@ # CHECK: Type Offset VirtAddr PhysAddr # CHECK-NEXT: LOAD 0x001000 0x0000000000020000 0x0000000000020000 -#--- asm .section .text,"ax",@progbits nop @@ -21,11 +24,3 @@ .section .data_noload_b,"aw",@progbits .zero 4096 - -#--- lds -SECTIONS { - .data_noload_a (NOLOAD) : { *(.data_noload_a) } - .data_noload_b (0x10000) (NOLOAD) : { *(.data_noload_b) } - .no_input_sec_noload (NOLOAD) : { . 
+= 1; } - .text (0x20000) : { *(.text) } -} diff --git a/lld/test/lit.cfg.py b/lld/test/lit.cfg.py index 0fa9b48c3c792..267f8c5178584 100644 --- a/lld/test/lit.cfg.py +++ b/lld/test/lit.cfg.py @@ -39,9 +39,9 @@ llvm_config.use_lld() tool_patterns = [ - 'extract', 'llc', 'llvm-as', 'llvm-mc', 'llvm-nm', 'llvm-objdump', - 'llvm-pdbutil', 'llvm-dwarfdump', 'llvm-readelf', 'llvm-readobj', - 'obj2yaml', 'yaml2obj', 'opt', 'llvm-dis'] + 'llc', 'llvm-as', 'llvm-mc', 'llvm-nm', 'llvm-objdump', 'llvm-pdbutil', + 'llvm-dwarfdump', 'llvm-readelf', 'llvm-readobj', 'obj2yaml', 'yaml2obj', + 'opt', 'llvm-dis'] llvm_config.add_tool_substitutions(tool_patterns) @@ -87,7 +87,7 @@ # Indirectly check if the mt.exe Microsoft utility exists by searching for # cvtres, which always accompanies it. Alternatively, check if we can use # libxml2 to merge manifests. -if (lit.util.which('cvtres', config.environment['PATH']) or +if (lit.util.which('cvtres', config.environment['PATH']) or config.llvm_libxml2_enabled): config.available_features.add('manifest_tool') diff --git a/llvm/docs/TestingGuide.rst b/llvm/docs/TestingGuide.rst index 6fd9ab2d24ca4..2e937f0006272 100644 --- a/llvm/docs/TestingGuide.rst +++ b/llvm/docs/TestingGuide.rst @@ -271,27 +271,8 @@ adding your code there instead of creating a new file. Extra files ----------- -If your test requires extra files besides the file containing the ``RUN:`` lines -and the extra files are small, consider specifying them in the same file and -using ``extract`` to extract them. For example, - -.. code-block:: llvm - - ; RUN: extract b %s -o %tb.ll - ; RUN: extract a %s | llvm-link - %tb.ll -S | FileCheck %s - - ; CHECK: ... - - ;--- a - ... - ;--- b - ... - -The parts are separated by the regex ``^(.|//)--- ``. By default the -extracted content has leading empty lines to preserve line numbers. Specify -``--no-leading-lines`` to drop leading lines. - -If the extra files are large, the idiomatic place to put them is in a subdirectory ``Inputs``. +If your test requires extra files besides the file containing the ``RUN:`` +lines, the idiomatic place to put them is in a subdirectory ``Inputs``. You can then refer to the extra files as ``%S/Inputs/foo.bar``. For example, consider ``test/Linker/ident.ll``. 
The directory structure is diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index 7ec0453fb131e..91215b3ca0ef0 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -52,7 +52,6 @@ set(LLVM_TEST_DEPENDS UnitTests bugpoint count - extract llc lli lli-child-target diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 49bd8ddfb2dc5..0a3289fcc4ad4 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -130,7 +130,6 @@ def get_asan_rtlib(): config.llvm_locstats_used = os.path.exists(llvm_locstats_tool) tools = [ - ToolSubst('%extract', FindTool('extract')), ToolSubst('%lli', FindTool('lli'), post='.', extra_args=lli_args), ToolSubst('%llc_dwarf', FindTool('llc'), extra_args=llc_args), ToolSubst('%go', config.go_executable, unresolved='ignore'), diff --git a/llvm/test/tools/extract/Inputs/basic-aa.txt b/llvm/test/tools/extract/Inputs/basic-aa.txt deleted file mode 100644 index 9eac3fdccbee4..0000000000000 --- a/llvm/test/tools/extract/Inputs/basic-aa.txt +++ /dev/null @@ -1,6 +0,0 @@ - - - -aa -; BB-NOT: {{.}} -; BB: {{^}}bb{{$}} diff --git a/llvm/test/tools/extract/Inputs/basic-bb.txt b/llvm/test/tools/extract/Inputs/basic-bb.txt deleted file mode 100644 index de17efab6fb6b..0000000000000 --- a/llvm/test/tools/extract/Inputs/basic-bb.txt +++ /dev/null @@ -1,10 +0,0 @@ - - - - - - - -bb - -// CC: // Comments are preserved. diff --git a/llvm/test/tools/extract/basic.test b/llvm/test/tools/extract/basic.test deleted file mode 100644 index 9f9413106cc75..0000000000000 --- a/llvm/test/tools/extract/basic.test +++ /dev/null @@ -1,32 +0,0 @@ -# AA-NOT: {{.}} -# AA: {{^}}aa{{$}} -#--- aa -aa -; BB-NOT: {{.}} -; BB: {{^}}bb{{$}} -;--- bb -bb - -// CC: // Comments are preserved. -//--- cc -cc -// Comments are preserved. -;--- dup -;--- dup - -# RUN: extract aa %s | diff %S/Inputs/basic-aa.txt - -# RUN: extract bb - < %s | diff %S/Inputs/basic-bb.txt - -# RUN: extract cc %s -o %t -# RUN: FileCheck %s --check-prefix=CC < %t - -# RUN: not %extract aa 2>&1 | FileCheck %s --check-prefix=NO_INPUT - -# NO_INPUT: extract: error: input filename is not specified - -# RUN: not %extract dup %s 2>&1 | FileCheck %s --check-prefix=DUP - -# DUP: extract: error: {{.*}}.test: ';--- dup' occurs more than once - -# RUN: not %extract not_exist %s 2>&1 | FileCheck %s --check-prefix=NOT_EXIST - -# NOT_EXIST: extract: error: {{.*}}.test: ';--- not_exist' was not found diff --git a/llvm/test/tools/extract/help.test b/llvm/test/tools/extract/help.test deleted file mode 100644 index 282052869116c..0000000000000 --- a/llvm/test/tools/extract/help.test +++ /dev/null @@ -1,5 +0,0 @@ -RUN: extract --help 2>&1 | FileCheck --implicit-check-not='General Options:' %s -CHECK: OVERVIEW: Split input {{.*}} -CHECK: Generic Options: -CHECK: extract Options: -CHECK: -o diff --git a/llvm/test/tools/extract/no-leading-lines.test b/llvm/test/tools/extract/no-leading-lines.test deleted file mode 100644 index f0efff5475afb..0000000000000 --- a/llvm/test/tools/extract/no-leading-lines.test +++ /dev/null @@ -1,10 +0,0 @@ -## With --no-leading-lines, don't add leading lines (which is used to preserve line numbers). 
- -# RUN: extract --no-leading-lines input %s -o %t -# RUN: count 1 < %t -# RUN: FileCheck %s < %t - -# CHECK: input - -#--- input -input diff --git a/llvm/test/tools/gold/X86/multiple-sections.ll b/llvm/test/tools/gold/X86/multiple-sections.ll index 31a89a9d3b484..facbd8d992ed7 100644 --- a/llvm/test/tools/gold/X86/multiple-sections.ll +++ b/llvm/test/tools/gold/X86/multiple-sections.ll @@ -1,8 +1,10 @@ -; RUN: extract order %s -o %t.order -; RUN: extract ir %s | llvm-as -o %t.o +; RUN: echo ".text.tin" > %t_order_lto.txt +; RUN: echo ".text._start" >> %t_order_lto.txt +; RUN: echo ".text.pat" >> %t_order_lto.txt +; RUN: llvm-as %s -o %t.o ; RUN: %gold -plugin %llvmshlibdir/LLVMgold%shlibext \ ; RUN: -m elf_x86_64 -o %t.exe %t.o \ -; RUN: --section-ordering-file=%t.order +; RUN: --section-ordering-file=%t_order_lto.txt ; RUN: llvm-readelf -s %t.exe | FileCheck %s ; Check that the order of the sections is tin -> _start -> pat. @@ -11,12 +13,6 @@ ; CHECK: 00000000004000b0 1 FUNC LOCAL DEFAULT 1 tin ; CHECK: 00000000004000c0 15 FUNC GLOBAL DEFAULT 1 _start -;--- order -.text.tin -.text._start -.text.pat - -;--- ir target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/tools/llvm-objcopy/ELF/strip-symbol.test b/llvm/test/tools/llvm-objcopy/ELF/strip-symbol.test index ad71e81eab830..78de46cc47b5d 100644 --- a/llvm/test/tools/llvm-objcopy/ELF/strip-symbol.test +++ b/llvm/test/tools/llvm-objcopy/ELF/strip-symbol.test @@ -1,24 +1,19 @@ -# RUN: extract yaml %s | yaml2obj - -o %t +# RUN: yaml2obj %s -o %t # RUN: llvm-objcopy --strip-symbol baz -N bar %t %t2 # RUN: llvm-readobj --symbols --sections %t2 | FileCheck %s # RUN: llvm-strip --strip-symbol baz -N bar %t -o %t3 # RUN: cmp %t2 %t3 # RUN: llvm-strip --regex --strip-symbol '^b.*' -N bar %t -o %t4 # RUN: cmp %t3 %t4 -# RUN: extract list1 %s -o %t-list.txt && llvm-objcopy --strip-symbols %t-list.txt %t %t5 +# RUN: echo " bar # bar" > %t-list.txt +# RUN: echo " baz # baz" >> %t-list.txt +# RUN: echo " # no symbol" >> %t-list.txt +# RUN: llvm-objcopy --strip-symbols %t-list.txt %t %t5 # RUN: cmp %t3 %t5 -# RUN: extract list2 %s -o %t-list2.txt && llvm-objcopy --regex --strip-symbols %t-list2.txt %t %t6 +# RUN: echo "b.* # bar & baz" > %t-list2.txt +# RUN: llvm-objcopy --regex --strip-symbols %t-list2.txt %t %t6 # RUN: cmp %t3 %t6 -#--- list1 -bar # bar -baz # baz -# no symbol - -#--- list2 -b.* # bar & baz - -#--- yaml !ELF FileHeader: Class: ELFCLASS64 diff --git a/llvm/test/tools/llvm-strings/radix.test b/llvm/test/tools/llvm-strings/radix.test index d9796a937d905..d23fb3cddc8f8 100644 --- a/llvm/test/tools/llvm-strings/radix.test +++ b/llvm/test/tools/llvm-strings/radix.test @@ -1,18 +1,15 @@ ## Show that llvm-strings can handle the -t/--radix switch properly. 
-RUN: extract --no-leading-lines input %s -o %t -#--- input -one -two -three -four -five -six -seven -eight -nine -ten -#--- end +RUN: echo one > %t +RUN: echo two >> %t +RUN: echo three >> %t +RUN: echo four >> %t +RUN: echo five >> %t +RUN: echo six >> %t +RUN: echo seven >> %t +RUN: echo eight >> %t +RUN: echo nine >> %t +RUN: echo ten >> %t RUN: llvm-strings %t | FileCheck %s -check-prefix CHECK-NONE --implicit-check-not={{.}} RUN: llvm-strings -t d %t | FileCheck %s -check-prefix CHECK-DEC --strict-whitespace --implicit-check-not={{.}} diff --git a/llvm/tools/extract/.clang-tidy b/llvm/tools/extract/.clang-tidy deleted file mode 100644 index 87ec2ff53af6e..0000000000000 --- a/llvm/tools/extract/.clang-tidy +++ /dev/null @@ -1,19 +0,0 @@ -# Almost identical to the top-level .clang-tidy, except that {Member,Parameter,Variable}Case use camelBack. -Checks: '-*,clang-diagnostic-*,llvm-*,misc-*,-misc-unused-parameters,-misc-non-private-member-variables-in-classes,readability-identifier-naming' -CheckOptions: - - key: readability-identifier-naming.ClassCase - value: CamelCase - - key: readability-identifier-naming.EnumCase - value: CamelCase - - key: readability-identifier-naming.FunctionCase - value: camelBack - - key: readability-identifier-naming.MemberCase - value: camelBack - - key: readability-identifier-naming.ParameterCase - value: camelBack - - key: readability-identifier-naming.UnionCase - value: CamelCase - - key: readability-identifier-naming.VariableCase - value: camelBack - - key: readability-identifier-naming.IgnoreMainLikeFunctions - value: 1 diff --git a/llvm/tools/extract/CMakeLists.txt b/llvm/tools/extract/CMakeLists.txt deleted file mode 100644 index dae1f463f0666..0000000000000 --- a/llvm/tools/extract/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -set(LLVM_LINK_COMPONENTS - Support - ) - -add_llvm_tool(extract - extract.cpp - ) diff --git a/llvm/tools/extract/extract.cpp b/llvm/tools/extract/extract.cpp deleted file mode 100644 index 8ccb539156145..0000000000000 --- a/llvm/tools/extract/extract.cpp +++ /dev/null @@ -1,113 +0,0 @@ -//===- extract.cpp - Input splitting utility ------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Split input into multipe parts separated by regex '^(.|//)--- ' and extract -// the specified part. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/FileOutputBuffer.h" -#include "llvm/Support/LineIterator.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Path.h" -#include "llvm/Support/WithColor.h" -#include - -using namespace llvm; - -static cl::OptionCategory cat("extract Options"); - -static cl::opt part(cl::Positional, cl::desc("part"), - cl::cat(cat)); - -static cl::opt input(cl::Positional, cl::desc("filename"), - cl::cat(cat)); - -static cl::opt output("o", cl::desc("Output filename"), - cl::value_desc("filename"), cl::init("-"), - cl::cat(cat)); - -static cl::opt noLeadingLines("no-leading-lines", - cl::desc("Don't preserve line numbers"), - cl::cat(cat)); - -static StringRef toolName; - -LLVM_ATTRIBUTE_NORETURN static void error(StringRef filename, - const Twine &message) { - if (filename.empty()) - WithColor::error(errs(), toolName) << message << '\n'; - else - WithColor::error(errs(), toolName) << filename << ": " << message << '\n'; - exit(1); -} - -static void handle(MemoryBuffer &inputBuf, StringRef input) { - const char *partBegin = nullptr, *partEnd = nullptr; - int numEmptyLines = 0; - StringRef separator; - for (line_iterator i(inputBuf, /*SkipBlanks=*/false, '\0'); !i.is_at_eof();) { - StringRef line = *i++; - size_t markerLen = line.startswith("//") ? 6 : 5; - if (!(line.size() > markerLen && - line.substr(markerLen - 4).startswith("--- "))) - continue; - separator = line.substr(0, markerLen); - StringRef cur = line.substr(markerLen); - if (cur == part) { - if (partBegin) - error(input, "'" + separator + cur + "' occurs more than once"); - if (!noLeadingLines) - numEmptyLines = i.line_number() - 1; - if (i.is_at_eof()) - break; - partBegin = i->data(); - } else if (partBegin && !partEnd) { - partEnd = line.data(); - } - } - if (!partBegin) - error(input, "'" + separator + part + "' was not found"); - if (!partEnd) - partEnd = inputBuf.getBufferEnd(); - - Expected> outputBuf = - FileOutputBuffer::create(output, numEmptyLines + (partEnd - partBegin)); - if (!outputBuf) - error(input, toString(outputBuf.takeError())); - uint8_t *buf = (*outputBuf)->getBufferStart(); - - // If --no-leading-lines is not specified, numEmptyLines is 0. Append newlines - // so that the extracted part preserves line numbers. - std::fill_n(buf, numEmptyLines, '\n'); - std::copy(partBegin, partEnd, buf + numEmptyLines); - if (Error e = (*outputBuf)->commit()) - error(input, toString(std::move(e))); -} - -int main(int argc, const char **argv) { - toolName = sys::path::stem(argv[0]); - cl::HideUnrelatedOptions({&cat}); - cl::ParseCommandLineOptions( - argc, argv, - "Split input into multiple parts separated by regex '^(.|//)--- ' and " - "extract the part specified by '^(.|//)--- '\n", - nullptr, - /*EnvVar=*/nullptr, - /*LongOptionsUseDoubleDash=*/true); - - if (input.empty()) - error("", "input filename is not specified"); - ErrorOr> bufferOrErr = - MemoryBuffer::getFileOrSTDIN(input); - if (std::error_code ec = bufferOrErr.getError()) - error(input, ec.message()); - handle(**bufferOrErr, input); -} From 26fc91eb5d26fb8bc3ab5328e698c9c1d9546b44 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 28 Jul 2020 13:30:29 -0700 Subject: [PATCH 0365/1035] Revert "[gn build] (manually) merge d054c7ee2e9" This reverts commit ab73b6da95750164daac4cfbd351ca96e1084117. 
--- llvm/utils/gn/secondary/lld/test/BUILD.gn | 1 - llvm/utils/gn/secondary/llvm/test/BUILD.gn | 1 - llvm/utils/gn/secondary/llvm/tools/extract/BUILD.gn | 4 ---- 3 files changed, 6 deletions(-) delete mode 100644 llvm/utils/gn/secondary/llvm/tools/extract/BUILD.gn diff --git a/llvm/utils/gn/secondary/lld/test/BUILD.gn b/llvm/utils/gn/secondary/lld/test/BUILD.gn index a6fb457cff6ac..581cc5482578c 100644 --- a/llvm/utils/gn/secondary/lld/test/BUILD.gn +++ b/llvm/utils/gn/secondary/lld/test/BUILD.gn @@ -78,7 +78,6 @@ group("test") { ":lit_unit_site_cfg", "//lld/tools/lld:symlinks", "//lld/unittests", - "//llvm/tools/extract", "//llvm/tools/llc", "//llvm/tools/llvm-ar:symlinks", "//llvm/tools/llvm-as", diff --git a/llvm/utils/gn/secondary/llvm/test/BUILD.gn b/llvm/utils/gn/secondary/llvm/test/BUILD.gn index 3ad1d0ba4f443..2c4a23ffbaacb 100644 --- a/llvm/utils/gn/secondary/llvm/test/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/test/BUILD.gn @@ -203,7 +203,6 @@ group("test") { "//llvm/lib/Testing/Support", "//llvm/tools/bugpoint", "//llvm/tools/dsymutil", - "//llvm/tools/extract", "//llvm/tools/llc", "//llvm/tools/lli", "//llvm/tools/lli/ChildTarget:lli-child-target", diff --git a/llvm/utils/gn/secondary/llvm/tools/extract/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/extract/BUILD.gn deleted file mode 100644 index f4553476f1e8b..0000000000000 --- a/llvm/utils/gn/secondary/llvm/tools/extract/BUILD.gn +++ /dev/null @@ -1,4 +0,0 @@ -executable("extract") { - deps = [ "//llvm/lib/Support" ] - sources = [ "extract.cpp" ] -} From e1dd212c874c2ff01b72e9e60db6dbded9e2e6d1 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Tue, 28 Jul 2020 23:35:04 +0300 Subject: [PATCH 0366/1035] [X86] Remove disabled miscompiling X86CondBrFolding pass As briefly discussed on IRC with @craig.topper, the pass has been disabled essentially since its original introduction (Nov 2018) due to known correctness issues (miscompilations), and there hasn't been much work done to fix that. While I won't promise that I will "fix" the pass, I have looked at it previously, and I'm sure I won't try to fix it if that requires actually fixing this existing code.
Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D84775 --- llvm/lib/Target/X86/CMakeLists.txt | 1 - llvm/lib/Target/X86/X86.h | 4 - llvm/lib/Target/X86/X86.td | 7 - llvm/lib/Target/X86/X86CondBrFolding.cpp | 579 ------------------ llvm/lib/Target/X86/X86Subtarget.h | 4 - llvm/lib/Target/X86/X86TargetMachine.cpp | 8 - llvm/lib/Target/X86/X86TargetTransformInfo.h | 1 - llvm/test/CodeGen/X86/condbr_if.ll | 178 ------ llvm/test/CodeGen/X86/condbr_switch.ll | 167 ----- .../CodeGen/X86/test_x86condbr_globaladdr.mir | 30 - .../gn/secondary/llvm/lib/Target/X86/BUILD.gn | 1 - 11 files changed, 980 deletions(-) delete mode 100644 llvm/lib/Target/X86/X86CondBrFolding.cpp delete mode 100644 llvm/test/CodeGen/X86/condbr_if.ll delete mode 100644 llvm/test/CodeGen/X86/condbr_switch.ll delete mode 100644 llvm/test/CodeGen/X86/test_x86condbr_globaladdr.mir diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt index 2b0cbb676c129..677d0a55336a1 100644 --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -28,7 +28,6 @@ set(sources X86CallingConv.cpp X86CallLowering.cpp X86CmovConversion.cpp - X86CondBrFolding.cpp X86DomainReassignment.cpp X86DiscriminateMemOps.cpp X86ExpandPseudo.cpp diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 91ba4e3d091e8..d4ad10d79bab2 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -67,9 +67,6 @@ FunctionPass *createX86OptimizeLEAs(); /// Return a pass that transforms setcc + movzx pairs into xor + setcc. FunctionPass *createX86FixupSetCC(); -/// Return a pass that folds conditional branch jumps. -FunctionPass *createX86CondBrFolding(); - /// Return a pass that avoids creating store forward block issues in the hardware. FunctionPass *createX86AvoidStoreForwardingBlocks(); @@ -154,7 +151,6 @@ void initializeX86AvoidSFBPassPass(PassRegistry &); void initializeX86AvoidTrailingCallPassPass(PassRegistry &); void initializeX86CallFrameOptimizationPass(PassRegistry &); void initializeX86CmovConverterPassPass(PassRegistry &); -void initializeX86CondBrFoldingPassPass(PassRegistry &); void initializeX86DomainReassignmentPass(PassRegistry &); void initializeX86ExecutionDomainFixPass(PassRegistry &); void initializeX86ExpandPseudoPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 69d83ab1528cb..bc8927899d687 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -499,12 +499,6 @@ def FeatureUseGLMDivSqrtCosts : SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true", "Use Goldmont specific floating point div/sqrt costs">; -// Merge branches using three-way conditional code. -def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch", - "ThreewayBranchProfitable", "true", - "Merge branches to a three-way " - "conditional branch">; - // Enable use of alias analysis during code generation. 
def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true", "Use alias analysis during codegen">; @@ -586,7 +580,6 @@ def ProcessorFeatures { FeatureSlow3OpsLEA, FeatureFastScalarFSQRT, FeatureFastSHLDRotate, - FeatureMergeToThreeWayBranch, FeatureFast15ByteNOP]; list SNBSpecificFeatures = [FeatureSlowUAMem32, FeaturePOPCNTFalseDeps]; diff --git a/llvm/lib/Target/X86/X86CondBrFolding.cpp b/llvm/lib/Target/X86/X86CondBrFolding.cpp deleted file mode 100644 index 7ede94664bf62..0000000000000 --- a/llvm/lib/Target/X86/X86CondBrFolding.cpp +++ /dev/null @@ -1,579 +0,0 @@ -//===---- X86CondBrFolding.cpp - optimize conditional branches ------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// This file defines a pass that optimizes condition branches on x86 by taking -// advantage of the three-way conditional code generated by compare -// instructions. -// Currently, it tries to hoisting EQ and NE conditional branch to a dominant -// conditional branch condition where the same EQ/NE conditional code is -// computed. An example: -// bb_0: -// cmp %0, 19 -// jg bb_1 -// jmp bb_2 -// bb_1: -// cmp %0, 40 -// jg bb_3 -// jmp bb_4 -// bb_4: -// cmp %0, 20 -// je bb_5 -// jmp bb_6 -// Here we could combine the two compares in bb_0 and bb_4 and have the -// following code: -// bb_0: -// cmp %0, 20 -// jg bb_1 -// jl bb_2 -// jmp bb_5 -// bb_1: -// cmp %0, 40 -// jg bb_3 -// jmp bb_6 -// For the case of %0 == 20 (bb_5), we eliminate two jumps, and the control -// height for bb_6 is also reduced. bb_4 is gone after the optimization. -// -// There are plenty of this code patterns, especially from the switch case -// lowing where we generate compare of "pivot-1" for the inner nodes in the -// binary search tree. -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86InstrInfo.h" -#include "X86Subtarget.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/BranchProbability.h" - -using namespace llvm; - -#define DEBUG_TYPE "x86-condbr-folding" - -STATISTIC(NumFixedCondBrs, "Number of x86 condbr folded"); - -namespace { -class X86CondBrFoldingPass : public MachineFunctionPass { -public: - X86CondBrFoldingPass() : MachineFunctionPass(ID) { } - StringRef getPassName() const override { return "X86 CondBr Folding"; } - - bool runOnMachineFunction(MachineFunction &MF) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired(); - } - -public: - static char ID; -}; -} // namespace - -char X86CondBrFoldingPass::ID = 0; -INITIALIZE_PASS(X86CondBrFoldingPass, "X86CondBrFolding", "X86CondBrFolding", false, false) - -FunctionPass *llvm::createX86CondBrFolding() { - return new X86CondBrFoldingPass(); -} - -namespace { -// A class the stores the auxiliary information for each MBB. 
-struct TargetMBBInfo { - MachineBasicBlock *TBB; - MachineBasicBlock *FBB; - MachineInstr *BrInstr; - MachineInstr *CmpInstr; - X86::CondCode BranchCode; - unsigned SrcReg; - int CmpValue; - bool Modified; - bool CmpBrOnly; -}; - -// A class that optimizes the conditional branch by hoisting and merge CondCode. -class X86CondBrFolding { -public: - X86CondBrFolding(const X86InstrInfo *TII, - const MachineBranchProbabilityInfo *MBPI, - MachineFunction &MF) - : TII(TII), MBPI(MBPI), MF(MF) {} - bool optimize(); - -private: - const X86InstrInfo *TII; - const MachineBranchProbabilityInfo *MBPI; - MachineFunction &MF; - std::vector> MBBInfos; - SmallVector RemoveList; - - void optimizeCondBr(MachineBasicBlock &MBB, - SmallVectorImpl &BranchPath); - void replaceBrDest(MachineBasicBlock *MBB, MachineBasicBlock *OrigDest, - MachineBasicBlock *NewDest); - void fixupModifiedCond(MachineBasicBlock *MBB); - std::unique_ptr analyzeMBB(MachineBasicBlock &MBB); - static bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, - int &CmpValue); - bool findPath(MachineBasicBlock *MBB, - SmallVectorImpl &BranchPath); - TargetMBBInfo *getMBBInfo(MachineBasicBlock *MBB) const { - return MBBInfos[MBB->getNumber()].get(); - } -}; -} // namespace - -// Find a valid path that we can reuse the CondCode. -// The resulted path (if return true) is stored in BranchPath. -// Return value: -// false: is no valid path is found. -// true: a valid path is found and the targetBB can be reached. -bool X86CondBrFolding::findPath( - MachineBasicBlock *MBB, SmallVectorImpl &BranchPath) { - TargetMBBInfo *MBBInfo = getMBBInfo(MBB); - assert(MBBInfo && "Expecting a candidate MBB"); - int CmpValue = MBBInfo->CmpValue; - - MachineBasicBlock *PredMBB = *MBB->pred_begin(); - MachineBasicBlock *SaveMBB = MBB; - while (PredMBB) { - TargetMBBInfo *PredMBBInfo = getMBBInfo(PredMBB); - if (!PredMBBInfo || PredMBBInfo->SrcReg != MBBInfo->SrcReg) - return false; - - assert(SaveMBB == PredMBBInfo->TBB || SaveMBB == PredMBBInfo->FBB); - bool IsFalseBranch = (SaveMBB == PredMBBInfo->FBB); - - X86::CondCode CC = PredMBBInfo->BranchCode; - assert(CC == X86::COND_L || CC == X86::COND_G || CC == X86::COND_E); - int PredCmpValue = PredMBBInfo->CmpValue; - bool ValueCmpTrue = ((CmpValue < PredCmpValue && CC == X86::COND_L) || - (CmpValue > PredCmpValue && CC == X86::COND_G) || - (CmpValue == PredCmpValue && CC == X86::COND_E)); - // Check if both the result of value compare and the branch target match. - if (!(ValueCmpTrue ^ IsFalseBranch)) { - LLVM_DEBUG(dbgs() << "Dead BB detected!\n"); - return false; - } - - BranchPath.push_back(PredMBB); - // These are the conditions on which we could combine the compares. - if ((CmpValue == PredCmpValue) || - (CmpValue == PredCmpValue - 1 && CC == X86::COND_L) || - (CmpValue == PredCmpValue + 1 && CC == X86::COND_G)) - return true; - - // If PredMBB has more than on preds, or not a pure cmp and br, we bailout. - if (PredMBB->pred_size() != 1 || !PredMBBInfo->CmpBrOnly) - return false; - - SaveMBB = PredMBB; - PredMBB = *PredMBB->pred_begin(); - } - return false; -} - -// Fix up any PHI node in the successor of MBB. 
-static void fixPHIsInSucc(MachineBasicBlock *MBB, MachineBasicBlock *OldMBB, - MachineBasicBlock *NewMBB) { - if (NewMBB == OldMBB) - return; - for (auto MI = MBB->instr_begin(), ME = MBB->instr_end(); - MI != ME && MI->isPHI(); ++MI) - for (unsigned i = 2, e = MI->getNumOperands() + 1; i != e; i += 2) { - MachineOperand &MO = MI->getOperand(i); - if (MO.getMBB() == OldMBB) - MO.setMBB(NewMBB); - } -} - -// Utility function to set branch probability for edge MBB->SuccMBB. -static inline bool setBranchProb(MachineBasicBlock *MBB, - MachineBasicBlock *SuccMBB, - BranchProbability Prob) { - auto MBBI = std::find(MBB->succ_begin(), MBB->succ_end(), SuccMBB); - if (MBBI == MBB->succ_end()) - return false; - MBB->setSuccProbability(MBBI, Prob); - return true; -} - -// Utility function to find the unconditional br instruction in MBB. -static inline MachineBasicBlock::iterator -findUncondBrI(MachineBasicBlock *MBB) { - return std::find_if(MBB->begin(), MBB->end(), [](MachineInstr &MI) -> bool { - return MI.getOpcode() == X86::JMP_1; - }); -} - -// Replace MBB's original successor, OrigDest, with NewDest. -// Also update the MBBInfo for MBB. -void X86CondBrFolding::replaceBrDest(MachineBasicBlock *MBB, - MachineBasicBlock *OrigDest, - MachineBasicBlock *NewDest) { - TargetMBBInfo *MBBInfo = getMBBInfo(MBB); - MachineInstr *BrMI; - if (MBBInfo->TBB == OrigDest) { - BrMI = MBBInfo->BrInstr; - MachineInstrBuilder MIB = - BuildMI(*MBB, BrMI, MBB->findDebugLoc(BrMI), TII->get(X86::JCC_1)) - .addMBB(NewDest).addImm(MBBInfo->BranchCode); - MBBInfo->TBB = NewDest; - MBBInfo->BrInstr = MIB.getInstr(); - } else { // Should be the unconditional jump stmt. - MachineBasicBlock::iterator UncondBrI = findUncondBrI(MBB); - BuildMI(*MBB, UncondBrI, MBB->findDebugLoc(UncondBrI), TII->get(X86::JMP_1)) - .addMBB(NewDest); - MBBInfo->FBB = NewDest; - BrMI = &*UncondBrI; - } - fixPHIsInSucc(NewDest, OrigDest, MBB); - BrMI->eraseFromParent(); - MBB->addSuccessor(NewDest); - setBranchProb(MBB, NewDest, MBPI->getEdgeProbability(MBB, OrigDest)); - MBB->removeSuccessor(OrigDest); -} - -// Change the CondCode and BrInstr according to MBBInfo. -void X86CondBrFolding::fixupModifiedCond(MachineBasicBlock *MBB) { - TargetMBBInfo *MBBInfo = getMBBInfo(MBB); - if (!MBBInfo->Modified) - return; - - MachineInstr *BrMI = MBBInfo->BrInstr; - X86::CondCode CC = MBBInfo->BranchCode; - MachineInstrBuilder MIB = BuildMI(*MBB, BrMI, MBB->findDebugLoc(BrMI), - TII->get(X86::JCC_1)) - .addMBB(MBBInfo->TBB).addImm(CC); - BrMI->eraseFromParent(); - MBBInfo->BrInstr = MIB.getInstr(); - - MachineBasicBlock::iterator UncondBrI = findUncondBrI(MBB); - BuildMI(*MBB, UncondBrI, MBB->findDebugLoc(UncondBrI), TII->get(X86::JMP_1)) - .addMBB(MBBInfo->FBB); - MBB->erase(UncondBrI); - MBBInfo->Modified = false; -} - -// -// Apply the transformation: -// RootMBB -1-> ... PredMBB -3-> MBB -5-> TargetMBB -// \-2-> \-4-> \-6-> FalseMBB -// ==> -// RootMBB -1-> ... PredMBB -7-> FalseMBB -// TargetMBB <-8-/ \-2-> \-4-> -// -// Note that PredMBB and RootMBB could be the same. -// And in the case of dead TargetMBB, we will not have TargetMBB and edge 8. -// -// There are some special handling where the RootMBB is COND_E in which case -// we directly short-cycle the brinstr. 
-// -void X86CondBrFolding::optimizeCondBr( - MachineBasicBlock &MBB, SmallVectorImpl &BranchPath) { - - X86::CondCode CC; - TargetMBBInfo *MBBInfo = getMBBInfo(&MBB); - assert(MBBInfo && "Expecting a candidate MBB"); - MachineBasicBlock *TargetMBB = MBBInfo->TBB; - BranchProbability TargetProb = MBPI->getEdgeProbability(&MBB, MBBInfo->TBB); - - // Forward the jump from MBB's predecessor to MBB's false target. - MachineBasicBlock *PredMBB = BranchPath.front(); - TargetMBBInfo *PredMBBInfo = getMBBInfo(PredMBB); - assert(PredMBBInfo && "Expecting a candidate MBB"); - if (PredMBBInfo->Modified) - fixupModifiedCond(PredMBB); - CC = PredMBBInfo->BranchCode; - // Don't do this if depth of BranchPath is 1 and PredMBB is of COND_E. - // We will short-cycle directly for this case. - if (!(CC == X86::COND_E && BranchPath.size() == 1)) - replaceBrDest(PredMBB, &MBB, MBBInfo->FBB); - - MachineBasicBlock *RootMBB = BranchPath.back(); - TargetMBBInfo *RootMBBInfo = getMBBInfo(RootMBB); - assert(RootMBBInfo && "Expecting a candidate MBB"); - if (RootMBBInfo->Modified) - fixupModifiedCond(RootMBB); - CC = RootMBBInfo->BranchCode; - - if (CC != X86::COND_E) { - MachineBasicBlock::iterator UncondBrI = findUncondBrI(RootMBB); - // RootMBB: Cond jump to the original not-taken MBB. - X86::CondCode NewCC; - switch (CC) { - case X86::COND_L: - NewCC = X86::COND_G; - break; - case X86::COND_G: - NewCC = X86::COND_L; - break; - default: - llvm_unreachable("unexpected condtional code."); - } - BuildMI(*RootMBB, UncondBrI, RootMBB->findDebugLoc(UncondBrI), - TII->get(X86::JCC_1)) - .addMBB(RootMBBInfo->FBB).addImm(NewCC); - - // RootMBB: Jump to TargetMBB - BuildMI(*RootMBB, UncondBrI, RootMBB->findDebugLoc(UncondBrI), - TII->get(X86::JMP_1)) - .addMBB(TargetMBB); - RootMBB->addSuccessor(TargetMBB); - fixPHIsInSucc(TargetMBB, &MBB, RootMBB); - RootMBB->erase(UncondBrI); - } else { - replaceBrDest(RootMBB, RootMBBInfo->TBB, TargetMBB); - } - - // Fix RootMBB's CmpValue to MBB's CmpValue to TargetMBB. Don't set Imm - // directly. Move MBB's stmt to here as the opcode might be different. - if (RootMBBInfo->CmpValue != MBBInfo->CmpValue) { - MachineInstr *NewCmp = MBBInfo->CmpInstr; - NewCmp->removeFromParent(); - RootMBB->insert(RootMBBInfo->CmpInstr, NewCmp); - RootMBBInfo->CmpInstr->eraseFromParent(); - } - - // Fix branch Probabilities. - auto fixBranchProb = [&](MachineBasicBlock *NextMBB) { - BranchProbability Prob; - for (auto &I : BranchPath) { - MachineBasicBlock *ThisMBB = I; - if (!ThisMBB->hasSuccessorProbabilities() || - !ThisMBB->isSuccessor(NextMBB)) - break; - Prob = MBPI->getEdgeProbability(ThisMBB, NextMBB); - if (Prob.isUnknown()) - break; - TargetProb = Prob * TargetProb; - Prob = Prob - TargetProb; - setBranchProb(ThisMBB, NextMBB, Prob); - if (ThisMBB == RootMBB) { - setBranchProb(ThisMBB, TargetMBB, TargetProb); - } - ThisMBB->normalizeSuccProbs(); - if (ThisMBB == RootMBB) - break; - NextMBB = ThisMBB; - } - return true; - }; - if (CC != X86::COND_E && !TargetProb.isUnknown()) - fixBranchProb(MBBInfo->FBB); - - if (CC != X86::COND_E) - RemoveList.push_back(&MBB); - - // Invalidate MBBInfo just in case. - MBBInfos[MBB.getNumber()] = nullptr; - MBBInfos[RootMBB->getNumber()] = nullptr; - - LLVM_DEBUG(dbgs() << "After optimization:\nRootMBB is: " << *RootMBB << "\n"); - if (BranchPath.size() > 1) - LLVM_DEBUG(dbgs() << "PredMBB is: " << *(BranchPath[0]) << "\n"); -} - -// Driver function for optimization: find the valid candidate and apply -// the transformation. 
-
-// Driver function for the optimization: find a valid candidate and apply
-// the transformation.
-bool X86CondBrFolding::optimize() {
-  bool Changed = false;
-  LLVM_DEBUG(dbgs() << "***** X86CondBr Folding on Function: " << MF.getName()
-                    << " *****\n");
-  // Set up data structures.
-  MBBInfos.resize(MF.getNumBlockIDs());
-  for (auto &MBB : MF)
-    MBBInfos[MBB.getNumber()] = analyzeMBB(MBB);
-
-  for (auto &MBB : MF) {
-    TargetMBBInfo *MBBInfo = getMBBInfo(&MBB);
-    if (!MBBInfo || !MBBInfo->CmpBrOnly)
-      continue;
-    if (MBB.pred_size() != 1)
-      continue;
-    LLVM_DEBUG(dbgs() << "Work on MBB." << MBB.getNumber()
-                      << " CmpValue: " << MBBInfo->CmpValue << "\n");
-    SmallVector<MachineBasicBlock *, 16> BranchPath;
-    if (!findPath(&MBB, BranchPath))
-      continue;
-
-#ifndef NDEBUG
-    LLVM_DEBUG(dbgs() << "Found one path (len=" << BranchPath.size() << "):\n");
-    int Index = 1;
-    LLVM_DEBUG(dbgs() << "Target MBB is: " << MBB << "\n");
-    for (auto I = BranchPath.rbegin(); I != BranchPath.rend(); ++I, ++Index) {
-      MachineBasicBlock *PMBB = *I;
-      TargetMBBInfo *PMBBInfo = getMBBInfo(PMBB);
-      LLVM_DEBUG(dbgs() << "Path MBB (" << Index << " of " << BranchPath.size()
-                        << ") is " << *PMBB);
-      LLVM_DEBUG(dbgs() << "CC=" << PMBBInfo->BranchCode
-                        << " Val=" << PMBBInfo->CmpValue
-                        << " CmpBrOnly=" << PMBBInfo->CmpBrOnly << "\n\n");
-    }
-#endif
-    optimizeCondBr(MBB, BranchPath);
-    Changed = true;
-  }
-  NumFixedCondBrs += RemoveList.size();
-  for (auto MBBI : RemoveList) {
-    while (!MBBI->succ_empty())
-      MBBI->removeSuccessor(MBBI->succ_end() - 1);
-
-    MBBI->eraseFromParent();
-  }
-
-  return Changed;
-}
-
-// Analyze instructions that generate the CondCode and extract information.
-bool X86CondBrFolding::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
-                                      int &CmpValue) {
-  unsigned SrcRegIndex = 0;
-  unsigned ValueIndex = 0;
-  switch (MI.getOpcode()) {
-  // TODO: handle test instructions.
-  default:
-    return false;
-  case X86::CMP64ri32:
-  case X86::CMP64ri8:
-  case X86::CMP32ri:
-  case X86::CMP32ri8:
-  case X86::CMP16ri:
-  case X86::CMP16ri8:
-  case X86::CMP8ri:
-    SrcRegIndex = 0;
-    ValueIndex = 1;
-    break;
-  case X86::SUB64ri32:
-  case X86::SUB64ri8:
-  case X86::SUB32ri:
-  case X86::SUB32ri8:
-  case X86::SUB16ri:
-  case X86::SUB16ri8:
-  case X86::SUB8ri:
-    SrcRegIndex = 1;
-    ValueIndex = 2;
-    break;
-  }
-  SrcReg = MI.getOperand(SrcRegIndex).getReg();
-  if (!MI.getOperand(ValueIndex).isImm())
-    return false;
-  CmpValue = MI.getOperand(ValueIndex).getImm();
-  return true;
-}
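One detail worth spelling out in analyzeCompare: the SUBri forms are matched alongside the CMPri forms because CMP is in effect a SUB that discards its result, so both set EFLAGS identically. The differing operand indices follow from SUB also defining a destination register; a sketch with illustrative operand lists:

    //   CMP32ri %src, imm          ; src at index 0, imm at index 1
    //   %dst = SUB32ri %src, imm   ; dst at 0, src at 1, imm at 2
    // hence SrcRegIndex/ValueIndex of 0/1 for the CMP forms and 1/2 for SUB.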
-
-// Analyze a candidate MBB and extract all the information needed.
-// A valid candidate has exactly two successors and ends with the sequence
-//   Cmp_instr,
-//   CondBr,
-//   UnCondBr.
-// Return a TargetMBBInfo if MBB is a valid candidate and nullptr otherwise.
-std::unique_ptr<TargetMBBInfo>
-X86CondBrFolding::analyzeMBB(MachineBasicBlock &MBB) {
-  MachineBasicBlock *TBB;
-  MachineBasicBlock *FBB;
-  MachineInstr *BrInstr;
-  MachineInstr *CmpInstr;
-  X86::CondCode CC;
-  unsigned SrcReg;
-  int CmpValue;
-  bool Modified;
-  bool CmpBrOnly;
-
-  if (MBB.succ_size() != 2)
-    return nullptr;
-
-  CmpBrOnly = true;
-  FBB = TBB = nullptr;
-  CmpInstr = nullptr;
-  MachineBasicBlock::iterator I = MBB.end();
-  while (I != MBB.begin()) {
-    --I;
-    if (I->isDebugValue())
-      continue;
-    if (I->getOpcode() == X86::JMP_1) {
-      if (FBB)
-        return nullptr;
-      FBB = I->getOperand(0).getMBB();
-      continue;
-    }
-    if (I->isBranch()) {
-      if (TBB)
-        return nullptr;
-      CC = X86::getCondFromBranch(*I);
-      switch (CC) {
-      default:
-        return nullptr;
-      case X86::COND_E:
-      case X86::COND_L:
-      case X86::COND_G:
-      case X86::COND_NE:
-      case X86::COND_LE:
-      case X86::COND_GE:
-        break;
-      }
-      TBB = I->getOperand(0).getMBB();
-      BrInstr = &*I;
-      continue;
-    }
-    if (analyzeCompare(*I, SrcReg, CmpValue)) {
-      if (CmpInstr)
-        return nullptr;
-      CmpInstr = &*I;
-      continue;
-    }
-    CmpBrOnly = false;
-    break;
-  }
-
-  if (!TBB || !FBB || !CmpInstr)
-    return nullptr;
-
-  // Simplify the CondCode. Note this only simplifies the findPath logic;
-  // it does not change the instruction here.
-  switch (CC) {
-  case X86::COND_NE:
-    CC = X86::COND_E;
-    std::swap(TBB, FBB);
-    Modified = true;
-    break;
-  case X86::COND_LE:
-    // x <= C is canonicalized to x < C + 1.
-    if (CmpValue == INT_MAX)
-      return nullptr;
-    CC = X86::COND_L;
-    CmpValue += 1;
-    Modified = true;
-    break;
-  case X86::COND_GE:
-    // x >= C is canonicalized to x > C - 1.
-    if (CmpValue == INT_MIN)
-      return nullptr;
-    CC = X86::COND_G;
-    CmpValue -= 1;
-    Modified = true;
-    break;
-  default:
-    Modified = false;
-    break;
-  }
-  return std::make_unique<TargetMBBInfo>(TargetMBBInfo{
-      TBB, FBB, BrInstr, CmpInstr, CC, SrcReg, CmpValue, Modified, CmpBrOnly});
-}
-
-bool X86CondBrFoldingPass::runOnMachineFunction(MachineFunction &MF) {
-  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
-  if (!ST.threewayBranchProfitable())
-    return false;
-  const X86InstrInfo *TII = ST.getInstrInfo();
-  const MachineBranchProbabilityInfo *MBPI =
-      &getAnalysis<MachineBranchProbabilityInfo>();
-
-  X86CondBrFolding CondBr(TII, MBPI, MF);
-  return CondBr.optimize();
-}
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index e555dfdd638ac..c93fa082eeb60 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -468,9 +468,6 @@ class X86Subtarget final : public X86GenSubtargetInfo {
   /// Indicates target prefers AVX512 mask registers.
   bool PreferMaskRegisters = false;
 
-  /// Threeway branch is profitable in this subtarget.
-  bool ThreewayBranchProfitable = false;
-
   /// Use Goldmont specific floating point div/sqrt costs.
   bool UseGLMDivSqrtCosts = false;
@@ -723,7 +720,6 @@
   bool hasWAITPKG() const { return HasWAITPKG; }
   bool hasPCONFIG() const { return HasPCONFIG; }
   bool hasSGX() const { return HasSGX; }
-  bool threewayBranchProfitable() const { return ThreewayBranchProfitable; }
   bool hasINVPCID() const { return HasINVPCID; }
   bool hasENQCMD() const { return HasENQCMD; }
   bool hasSERIALIZE() const { return HasSERIALIZE; }
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 9a9ea245f7027..f660b99a4511d 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -56,11 +56,6 @@ static cl::opt<bool>
     EnableMachineCombinerPass("x86-machine-combiner",
                               cl::desc("Enable the machine combiner pass"),
                               cl::init(true), cl::Hidden);
-static cl::opt<bool> EnableCondBrFoldingPass("x86-condbr-folding",
-                                             cl::desc("Enable the conditional branch "
-                                                      "folding pass"),
-                                             cl::init(false), cl::Hidden);
-
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
   // Register the target.
   RegisterTargetMachine<X86TargetMachine> X(getTheX86_32Target());
@@ -84,7 +79,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
   initializeX86SpeculativeLoadHardeningPassPass(PR);
   initializeX86SpeculativeExecutionSideEffectSuppressionPass(PR);
   initializeX86FlagsCopyLoweringPassPass(PR);
-  initializeX86CondBrFoldingPassPass(PR);
   initializeX86LoadValueInjectionLoadHardeningPassPass(PR);
   initializeX86LoadValueInjectionRetHardeningPassPass(PR);
   initializeX86OptimizeLEAPassPass(PR);
@@ -456,8 +450,6 @@ bool X86PassConfig::addGlobalInstructionSelect() {
 }
 
 bool X86PassConfig::addILPOpts() {
-  if (EnableCondBrFoldingPass)
-    addPass(createX86CondBrFolding());
   addPass(&EarlyIfConverterID);
   if (EnableMachineCombinerPass)
     addPass(&MachineCombinerID);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 5ccaf409790db..fb4719064613e 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -62,7 +62,6 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
       X86::FeatureLZCNTFalseDeps,
       X86::FeatureBranchFusion,
      X86::FeatureMacroFusion,
-      X86::FeatureMergeToThreeWayBranch,
       X86::FeaturePadShortFunctions,
       X86::FeaturePOPCNTFalseDeps,
       X86::FeatureSSEUnalignedMem,
diff --git a/llvm/test/CodeGen/X86/condbr_if.ll b/llvm/test/CodeGen/X86/condbr_if.ll
deleted file mode 100644
index 7b92f712be6dd..0000000000000
--- a/llvm/test/CodeGen/X86/condbr_if.ll
+++ /dev/null
@@ -1,178 +0,0 @@
-; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=sandybridge %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
-; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=ivybridge %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
-; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=haswell %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
-; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=broadwell %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
-; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=skylake %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
-; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=skx %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE
-; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=NOTMERGE
-
-define i32 @length2_1(i32) {
-  %2 = icmp slt i32 %0, 3
-  br i1 %2, label %3, label %5
-
-;