From fe7c50ae94d49e69e7844882b79ec47afcde72d4 Mon Sep 17 00:00:00 2001 From: Tianqing Wang Date: Thu, 11 Jul 2024 23:16:09 +0800 Subject: [PATCH 1/6] [SimplifyCFG] Increase budget for FoldTwoEntryPHINode() if the branch is unpredictable. The `!unpredictable` metadata has been present for a long time, but its usage in optimizations is still limited. This patch teaches `FoldTwoEntryPHINode()` to be more aggressive with an unpredictable branch to reduce mispredictions. A TTI interface `getBranchMispredictPenalty()` is added to distinguish between different hardware to ensure we don't go too far for simpler cores. For simplicity, only a naive x86 implementation is included for the time being. --- .../llvm/Analysis/TargetTransformInfo.h | 9 ++ .../llvm/Analysis/TargetTransformInfoImpl.h | 2 + llvm/lib/Analysis/TargetTransformInfo.cpp | 4 + .../lib/Target/X86/X86TargetTransformInfo.cpp | 5 + llvm/lib/Target/X86/X86TargetTransformInfo.h | 2 + llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 9 +- .../two-entry-phi-fold-unpredictable.ll | 96 +++++++++++++++++++ 7 files changed, 124 insertions(+), 3 deletions(-) create mode 100644 llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index cf378008e4c7c..bdcaeab033791 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -419,6 +419,11 @@ class TargetTransformInfo { /// this factor, it is very likely to be predicted correctly. BranchProbability getPredictableBranchThreshold() const; + // Returns an integer indicating how aggressive the target wants for + // eliminating unpredictable branches. A zero return value means extra + // optimization applied to them should be minimal. + unsigned getBranchMispredictPenalty() const; + /// Return true if branch divergence exists. 
/// /// Branch divergence has a significantly negative impact on GPU performance @@ -1832,6 +1837,7 @@ class TargetTransformInfo::Concept { ArrayRef Operands, TargetCostKind CostKind) = 0; virtual BranchProbability getPredictableBranchThreshold() = 0; + virtual unsigned getBranchMispredictPenalty() = 0; virtual bool hasBranchDivergence(const Function *F = nullptr) = 0; virtual bool isSourceOfDivergence(const Value *V) = 0; virtual bool isAlwaysUniform(const Value *V) = 0; @@ -2243,6 +2249,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { BranchProbability getPredictableBranchThreshold() override { return Impl.getPredictableBranchThreshold(); } + unsigned getBranchMispredictPenalty() override { + return Impl.getBranchMispredictPenalty(); + } bool hasBranchDivergence(const Function *F = nullptr) override { return Impl.hasBranchDivergence(F); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 47fde08735c0c..a87470c6446c0 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -99,6 +99,8 @@ class TargetTransformInfoImplBase { return BranchProbability(99, 100); } + unsigned getBranchMispredictPenalty() const { return 0; } + bool hasBranchDivergence(const Function *F = nullptr) const { return false; } bool isSourceOfDivergence(const Value *V) const { return false; } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 693f7a5bb7af5..a1fcf5482d334 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -279,6 +279,10 @@ BranchProbability TargetTransformInfo::getPredictableBranchThreshold() const { : TTIImpl->getPredictableBranchThreshold(); } +unsigned TargetTransformInfo::getBranchMispredictPenalty() const { + return TTIImpl->getBranchMispredictPenalty(); +} + bool 
TargetTransformInfo::hasBranchDivergence(const Function *F) const { return TTIImpl->hasBranchDivergence(F); } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 32a3683355b72..984586f4ae5f6 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -6756,3 +6756,8 @@ InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, return AM.Scale != 0; return -1; } + +unsigned X86TTIImpl::getBranchMispredictPenalty() const { + // TODO: Hook MispredictPenalty of SchedMachineModel into this. + return 14; +} diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 5eccb1aea308d..d2b5c093e7003 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -294,6 +294,8 @@ class X86TTIImpl : public BasicTTIImplBase { bool supportsEfficientVectorElementLoadStore() const; bool enableInterleavedAccessVectorization(); + unsigned getBranchMispredictPenalty() const; + private: bool supportsGather() const; InstructionCost getGSVectorCost(unsigned Opcode, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 8f717cb43bcb4..73687b5c31c64 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -3508,7 +3508,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, // jump to one specific 'then' block (if we have two of them). // It isn't beneficial to speculatively execute the code // from the block that we know is predictably not entered. 
- if (!DomBI->getMetadata(LLVMContext::MD_unpredictable)) { + bool IsUnpredictable = DomBI->getMetadata(LLVMContext::MD_unpredictable); + if (!IsUnpredictable) { uint64_t TWeight, FWeight; if (extractBranchWeights(*DomBI, TWeight, FWeight) && (TWeight + FWeight) != 0) { @@ -3549,8 +3550,10 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, // that need to be moved to the dominating block. SmallPtrSet AggressiveInsts; InstructionCost Cost = 0; - InstructionCost Budget = - TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic; + unsigned Threshold = TwoEntryPHINodeFoldingThreshold; + if (IsUnpredictable) + Threshold += TTI.getBranchMispredictPenalty(); + InstructionCost Budget = Threshold * TargetTransformInfo::TCC_Basic; bool Changed = false; for (BasicBlock::iterator II = BB->begin(); isa(II);) { diff --git a/llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll b/llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll new file mode 100644 index 0000000000000..88aa8a619207d --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll @@ -0,0 +1,96 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 +; Two-entry phi nodes with unpredictable conditions may get increased budget for folding. 
+; RUN: opt < %s -S -passes=simplifycfg | FileCheck --check-prefix=CHECK-NOFOLD %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s -S -passes=simplifycfg | FileCheck --check-prefix=CHECK-FOLD %s + +define { <2 x float>, <2 x float> } @foo(float %speed, <2 x float> %velocity.coerce0, <2 x float> %velocity.coerce1) { +; CHECK-NOFOLD-LABEL: define { <2 x float>, <2 x float> } @foo( +; CHECK-NOFOLD-SAME: float [[SPEED:%.*]], <2 x float> [[VELOCITY_COERCE0:%.*]], <2 x float> [[VELOCITY_COERCE1:%.*]]) { +; CHECK-NOFOLD-NEXT: [[ENTRY:.*]]: +; CHECK-NOFOLD-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[SPEED]], 0x3F747AE140000000 +; CHECK-NOFOLD-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]], !unpredictable [[META0:![0-9]+]] +; CHECK-NOFOLD: [[IF_THEN]]: +; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE0]], i64 0 +; CHECK-NOFOLD-NEXT: [[MUL_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_0_VEC_EXTRACT]], [[VELOCITY_SROA_0_0_VEC_EXTRACT]] +; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE0]], i64 1 +; CHECK-NOFOLD-NEXT: [[MUL8_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_4_VEC_EXTRACT]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]] +; CHECK-NOFOLD-NEXT: [[ADD_I_I_I_I:%.*]] = fadd fast float [[MUL8_I_I_I_I]], [[MUL_I_I_I_I]] +; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_14_8_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE1]], i64 0 +; CHECK-NOFOLD-NEXT: [[MUL13_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_14_8_VEC_EXTRACT]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]] +; CHECK-NOFOLD-NEXT: [[ADD14_I_I_I_I:%.*]] = fadd fast float [[ADD_I_I_I_I]], [[MUL13_I_I_I_I]] +; CHECK-NOFOLD-NEXT: [[TMP0:%.*]] = tail call fast noundef float @llvm.sqrt.f32(float [[ADD14_I_I_I_I]]) +; CHECK-NOFOLD-NEXT: [[MUL_I_I_I:%.*]] = fdiv fast float 0x3FEFD70A40000000, [[TMP0]] +; CHECK-NOFOLD-NEXT: [[SUB_I:%.*]] = fmul fast float [[MUL_I_I_I]], 
[[VELOCITY_SROA_0_0_VEC_EXTRACT]] +; CHECK-NOFOLD-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[SUB_I]], i64 0 +; CHECK-NOFOLD-NEXT: [[SUB8_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]] +; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_0_4_VEC_INSERT25:%.*]] = insertelement <2 x float> [[TMP1]], float [[SUB8_I]], i64 1 +; CHECK-NOFOLD-NEXT: [[SUB13_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]] +; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_14_8_VEC_INSERT35:%.*]] = insertelement <2 x float> [[VELOCITY_COERCE1]], float [[SUB13_I]], i64 0 +; CHECK-NOFOLD-NEXT: br label %[[IF_END]] +; CHECK-NOFOLD: [[IF_END]]: +; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_0_0:%.*]] = phi nsz <2 x float> [ [[VELOCITY_SROA_0_4_VEC_INSERT25]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ] +; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_14_0:%.*]] = phi nsz <2 x float> [ [[VELOCITY_SROA_14_8_VEC_INSERT35]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ] +; CHECK-NOFOLD-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VELOCITY_SROA_0_0]], 0 +; CHECK-NOFOLD-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[DOTFCA_0_INSERT]], <2 x float> [[VELOCITY_SROA_14_0]], 1 +; CHECK-NOFOLD-NEXT: ret { <2 x float>, <2 x float> } [[DOTFCA_1_INSERT]] +; +; CHECK-FOLD-LABEL: define { <2 x float>, <2 x float> } @foo( +; CHECK-FOLD-SAME: float [[SPEED:%.*]], <2 x float> [[VELOCITY_COERCE0:%.*]], <2 x float> [[VELOCITY_COERCE1:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-FOLD-NEXT: [[ENTRY:.*:]] +; CHECK-FOLD-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[SPEED]], 0x3F747AE140000000 +; CHECK-FOLD-NEXT: [[VELOCITY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE0]], i64 0 +; CHECK-FOLD-NEXT: [[MUL_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_0_VEC_EXTRACT]], [[VELOCITY_SROA_0_0_VEC_EXTRACT]] +; CHECK-FOLD-NEXT: [[VELOCITY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> 
[[VELOCITY_COERCE0]], i64 1 +; CHECK-FOLD-NEXT: [[MUL8_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_4_VEC_EXTRACT]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]] +; CHECK-FOLD-NEXT: [[ADD_I_I_I_I:%.*]] = fadd fast float [[MUL8_I_I_I_I]], [[MUL_I_I_I_I]] +; CHECK-FOLD-NEXT: [[VELOCITY_SROA_14_8_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE1]], i64 0 +; CHECK-FOLD-NEXT: [[MUL13_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_14_8_VEC_EXTRACT]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]] +; CHECK-FOLD-NEXT: [[ADD14_I_I_I_I:%.*]] = fadd fast float [[ADD_I_I_I_I]], [[MUL13_I_I_I_I]] +; CHECK-FOLD-NEXT: [[TMP0:%.*]] = tail call fast float @llvm.sqrt.f32(float [[ADD14_I_I_I_I]]) +; CHECK-FOLD-NEXT: [[MUL_I_I_I:%.*]] = fdiv fast float 0x3FEFD70A40000000, [[TMP0]] +; CHECK-FOLD-NEXT: [[SUB_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_0_0_VEC_EXTRACT]] +; CHECK-FOLD-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[SUB_I]], i64 0 +; CHECK-FOLD-NEXT: [[SUB8_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]] +; CHECK-FOLD-NEXT: [[VELOCITY_SROA_0_4_VEC_INSERT25:%.*]] = insertelement <2 x float> [[TMP1]], float [[SUB8_I]], i64 1 +; CHECK-FOLD-NEXT: [[SUB13_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]] +; CHECK-FOLD-NEXT: [[VELOCITY_SROA_14_8_VEC_INSERT35:%.*]] = insertelement <2 x float> [[VELOCITY_COERCE1]], float [[SUB13_I]], i64 0 +; CHECK-FOLD-NEXT: [[VELOCITY_SROA_0_0:%.*]] = select nsz i1 [[CMP]], <2 x float> [[VELOCITY_SROA_0_4_VEC_INSERT25]], <2 x float> zeroinitializer, !unpredictable [[META0:![0-9]+]] +; CHECK-FOLD-NEXT: [[VELOCITY_SROA_14_0:%.*]] = select nsz i1 [[CMP]], <2 x float> [[VELOCITY_SROA_14_8_VEC_INSERT35]], <2 x float> zeroinitializer, !unpredictable [[META0]] +; CHECK-FOLD-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VELOCITY_SROA_0_0]], 0 +; CHECK-FOLD-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> 
} [[DOTFCA_0_INSERT]], <2 x float> [[VELOCITY_SROA_14_0]], 1 +; CHECK-FOLD-NEXT: ret { <2 x float>, <2 x float> } [[DOTFCA_1_INSERT]] +; +entry: + %cmp = fcmp fast ogt float %speed, 0x3F747AE140000000 + br i1 %cmp, label %if.then, label %if.end, !unpredictable !0 + +if.then: + %velocity.sroa.0.0.vec.extract = extractelement <2 x float> %velocity.coerce0, i64 0 + %mul.i.i.i.i = fmul fast float %velocity.sroa.0.0.vec.extract, %velocity.sroa.0.0.vec.extract + %velocity.sroa.0.4.vec.extract = extractelement <2 x float> %velocity.coerce0, i64 1 + %mul8.i.i.i.i = fmul fast float %velocity.sroa.0.4.vec.extract, %velocity.sroa.0.4.vec.extract + %add.i.i.i.i = fadd fast float %mul8.i.i.i.i, %mul.i.i.i.i + %velocity.sroa.14.8.vec.extract = extractelement <2 x float> %velocity.coerce1, i64 0 + %mul13.i.i.i.i = fmul fast float %velocity.sroa.14.8.vec.extract, %velocity.sroa.14.8.vec.extract + %add14.i.i.i.i = fadd fast float %add.i.i.i.i, %mul13.i.i.i.i + %0 = tail call fast noundef float @llvm.sqrt.f32(float %add14.i.i.i.i) + %mul.i.i.i = fdiv fast float 0x3FEFD70A40000000, %0 + %sub.i = fmul fast float %mul.i.i.i, %velocity.sroa.0.0.vec.extract + %1 = insertelement <2 x float> poison, float %sub.i, i64 0 + %sub8.i = fmul fast float %mul.i.i.i, %velocity.sroa.0.4.vec.extract + %velocity.sroa.0.4.vec.insert25 = insertelement <2 x float> %1, float %sub8.i, i64 1 + %sub13.i = fmul fast float %mul.i.i.i, %velocity.sroa.14.8.vec.extract + %velocity.sroa.14.8.vec.insert35 = insertelement <2 x float> %velocity.coerce1, float %sub13.i, i64 0 + br label %if.end + +if.end: + %velocity.sroa.0.0 = phi nsz <2 x float> [ %velocity.sroa.0.4.vec.insert25, %if.then ], [ zeroinitializer, %entry ] + %velocity.sroa.14.0 = phi nsz <2 x float> [ %velocity.sroa.14.8.vec.insert35, %if.then ], [ zeroinitializer, %entry ] + %.fca.0.insert = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> %velocity.sroa.0.0, 0 + %.fca.1.insert = insertvalue { <2 x float>, <2 x float> } %.fca.0.insert, <2 x 
float> %velocity.sroa.14.0, 1 + ret { <2 x float>, <2 x float> } %.fca.1.insert +} + +declare float @llvm.sqrt.f32(float) + +!0 = !{} From 645ba1fe958cc10fdd6e2c69f3f0f80d67a72e17 Mon Sep 17 00:00:00 2001 From: Tianqing Wang Date: Wed, 17 Jul 2024 15:58:29 +0800 Subject: [PATCH 2/6] Add debug log. --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 73687b5c31c64..049226af047c5 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -3623,8 +3623,9 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, [](BasicBlock *IfBlock) { return IfBlock->hasAddressTaken(); })) return Changed; - LLVM_DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond - << " T: " << IfTrue->getName() + LLVM_DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond; + if (IsUnpredictable) dbgs() << " (unpredictable)"; + dbgs() << " T: " << IfTrue->getName() << " F: " << IfFalse->getName() << "\n"); // If we can still promote the PHI nodes after this gauntlet of tests, From da55feb74075271d0cd7a36575b9ee5c36400dda Mon Sep 17 00:00:00 2001 From: Tianqing Wang Date: Fri, 19 Jul 2024 02:06:56 +0800 Subject: [PATCH 3/6] Returns InstructionCost for getBranchMispredictPenalty(). 
--- llvm/include/llvm/Analysis/TargetTransformInfo.h | 13 +++++++------ .../include/llvm/Analysis/TargetTransformInfoImpl.h | 2 +- llvm/lib/Analysis/TargetTransformInfo.cpp | 2 +- llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 2 +- llvm/lib/Target/X86/X86TargetTransformInfo.h | 2 +- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 5 ++--- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index bdcaeab033791..2411b2b31d293 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -419,10 +419,11 @@ class TargetTransformInfo { /// this factor, it is very likely to be predicted correctly. BranchProbability getPredictableBranchThreshold() const; - // Returns an integer indicating how aggressive the target wants for - // eliminating unpredictable branches. A zero return value means extra - // optimization applied to them should be minimal. - unsigned getBranchMispredictPenalty() const; + /// Returns estimated penalty of a branch misprediction in latency. Indicates + /// how aggressive the target wants for eliminating unpredictable branches. A + /// zero return value means extra optimization applied to them should be + /// minimal. + InstructionCost getBranchMispredictPenalty() const; /// Return true if branch divergence exists. 
/// @@ -1837,7 +1838,7 @@ class TargetTransformInfo::Concept { ArrayRef Operands, TargetCostKind CostKind) = 0; virtual BranchProbability getPredictableBranchThreshold() = 0; - virtual unsigned getBranchMispredictPenalty() = 0; + virtual InstructionCost getBranchMispredictPenalty() = 0; virtual bool hasBranchDivergence(const Function *F = nullptr) = 0; virtual bool isSourceOfDivergence(const Value *V) = 0; virtual bool isAlwaysUniform(const Value *V) = 0; @@ -2249,7 +2250,7 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { BranchProbability getPredictableBranchThreshold() override { return Impl.getPredictableBranchThreshold(); } - unsigned getBranchMispredictPenalty() override { + InstructionCost getBranchMispredictPenalty() override { return Impl.getBranchMispredictPenalty(); } bool hasBranchDivergence(const Function *F = nullptr) override { diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index a87470c6446c0..00efa474a91b5 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -99,7 +99,7 @@ class TargetTransformInfoImplBase { return BranchProbability(99, 100); } - unsigned getBranchMispredictPenalty() const { return 0; } + InstructionCost getBranchMispredictPenalty() const { return 0; } bool hasBranchDivergence(const Function *F = nullptr) const { return false; } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index a1fcf5482d334..6a0fa98089ba5 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -279,7 +279,7 @@ BranchProbability TargetTransformInfo::getPredictableBranchThreshold() const { : TTIImpl->getPredictableBranchThreshold(); } -unsigned TargetTransformInfo::getBranchMispredictPenalty() const { +InstructionCost TargetTransformInfo::getBranchMispredictPenalty() const { return 
TTIImpl->getBranchMispredictPenalty(); } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 984586f4ae5f6..dc3ac80bdf5cf 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -6757,7 +6757,7 @@ InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, return -1; } -unsigned X86TTIImpl::getBranchMispredictPenalty() const { +InstructionCost X86TTIImpl::getBranchMispredictPenalty() const { // TODO: Hook MispredictPenalty of SchedMachineModel into this. return 14; } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index d2b5c093e7003..b619090e8e1e0 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -294,7 +294,7 @@ class X86TTIImpl : public BasicTTIImplBase { bool supportsEfficientVectorElementLoadStore() const; bool enableInterleavedAccessVectorization(); - unsigned getBranchMispredictPenalty() const; + InstructionCost getBranchMispredictPenalty() const; private: bool supportsGather() const; diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 049226af047c5..aa0cf7dd4fa6a 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -3550,10 +3550,9 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, // that need to be moved to the dominating block. 
SmallPtrSet AggressiveInsts; InstructionCost Cost = 0; - unsigned Threshold = TwoEntryPHINodeFoldingThreshold; + InstructionCost Budget = TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic; if (IsUnpredictable) - Threshold += TTI.getBranchMispredictPenalty(); - InstructionCost Budget = Threshold * TargetTransformInfo::TCC_Basic; + Budget += TTI.getBranchMispredictPenalty(); bool Changed = false; for (BasicBlock::iterator II = BB->begin(); isa(II);) { From b008d5f9be21d53d6f65f6749bfaa09675682b50 Mon Sep 17 00:00:00 2001 From: Tianqing Wang Date: Fri, 19 Jul 2024 02:12:23 +0800 Subject: [PATCH 4/6] clang-format. --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index aa0cf7dd4fa6a..3ab1564e93e49 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -3550,7 +3550,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, // that need to be moved to the dominating block. SmallPtrSet AggressiveInsts; InstructionCost Cost = 0; - InstructionCost Budget = TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic; + InstructionCost Budget = + TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic; if (IsUnpredictable) Budget += TTI.getBranchMispredictPenalty(); From fa967f83a252aa6c983a518a59c0cbeba767bb10 Mon Sep 17 00:00:00 2001 From: Tianqing Wang Date: Mon, 22 Jul 2024 13:33:01 +0800 Subject: [PATCH 5/6] Canonicalize the test. 
--- .../two-entry-phi-fold-unpredictable.ll | 164 +++++++++--------- 1 file changed, 84 insertions(+), 80 deletions(-) diff --git a/llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll b/llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll index 88aa8a619207d..0bce8e3ed7dd3 100644 --- a/llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll +++ b/llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll @@ -3,92 +3,96 @@ ; RUN: opt < %s -S -passes=simplifycfg | FileCheck --check-prefix=CHECK-NOFOLD %s ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s -S -passes=simplifycfg | FileCheck --check-prefix=CHECK-FOLD %s -define { <2 x float>, <2 x float> } @foo(float %speed, <2 x float> %velocity.coerce0, <2 x float> %velocity.coerce1) { +define { <2 x float>, <2 x float> } @foo(float %arg, <2 x float> %arg1, <2 x float> %arg2) { ; CHECK-NOFOLD-LABEL: define { <2 x float>, <2 x float> } @foo( -; CHECK-NOFOLD-SAME: float [[SPEED:%.*]], <2 x float> [[VELOCITY_COERCE0:%.*]], <2 x float> [[VELOCITY_COERCE1:%.*]]) { -; CHECK-NOFOLD-NEXT: [[ENTRY:.*]]: -; CHECK-NOFOLD-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[SPEED]], 0x3F747AE140000000 -; CHECK-NOFOLD-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]], !unpredictable [[META0:![0-9]+]] -; CHECK-NOFOLD: [[IF_THEN]]: -; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE0]], i64 0 -; CHECK-NOFOLD-NEXT: [[MUL_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_0_VEC_EXTRACT]], [[VELOCITY_SROA_0_0_VEC_EXTRACT]] -; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE0]], i64 1 -; CHECK-NOFOLD-NEXT: [[MUL8_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_4_VEC_EXTRACT]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]] -; CHECK-NOFOLD-NEXT: [[ADD_I_I_I_I:%.*]] = fadd fast float [[MUL8_I_I_I_I]], [[MUL_I_I_I_I]] -; CHECK-NOFOLD-NEXT: 
[[VELOCITY_SROA_14_8_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE1]], i64 0 -; CHECK-NOFOLD-NEXT: [[MUL13_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_14_8_VEC_EXTRACT]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]] -; CHECK-NOFOLD-NEXT: [[ADD14_I_I_I_I:%.*]] = fadd fast float [[ADD_I_I_I_I]], [[MUL13_I_I_I_I]] -; CHECK-NOFOLD-NEXT: [[TMP0:%.*]] = tail call fast noundef float @llvm.sqrt.f32(float [[ADD14_I_I_I_I]]) -; CHECK-NOFOLD-NEXT: [[MUL_I_I_I:%.*]] = fdiv fast float 0x3FEFD70A40000000, [[TMP0]] -; CHECK-NOFOLD-NEXT: [[SUB_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_0_0_VEC_EXTRACT]] -; CHECK-NOFOLD-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[SUB_I]], i64 0 -; CHECK-NOFOLD-NEXT: [[SUB8_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]] -; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_0_4_VEC_INSERT25:%.*]] = insertelement <2 x float> [[TMP1]], float [[SUB8_I]], i64 1 -; CHECK-NOFOLD-NEXT: [[SUB13_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]] -; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_14_8_VEC_INSERT35:%.*]] = insertelement <2 x float> [[VELOCITY_COERCE1]], float [[SUB13_I]], i64 0 -; CHECK-NOFOLD-NEXT: br label %[[IF_END]] -; CHECK-NOFOLD: [[IF_END]]: -; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_0_0:%.*]] = phi nsz <2 x float> [ [[VELOCITY_SROA_0_4_VEC_INSERT25]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ] -; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_14_0:%.*]] = phi nsz <2 x float> [ [[VELOCITY_SROA_14_8_VEC_INSERT35]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ] -; CHECK-NOFOLD-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VELOCITY_SROA_0_0]], 0 -; CHECK-NOFOLD-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[DOTFCA_0_INSERT]], <2 x float> [[VELOCITY_SROA_14_0]], 1 -; CHECK-NOFOLD-NEXT: ret { <2 x float>, <2 x float> } [[DOTFCA_1_INSERT]] +; CHECK-NOFOLD-SAME: float [[ARG:%.*]], <2 x float> [[ARG1:%.*]], <2 
x float> [[ARG2:%.*]]) { +; CHECK-NOFOLD-NEXT: [[BB:.*]]: +; CHECK-NOFOLD-NEXT: [[I:%.*]] = fcmp fast ogt float [[ARG]], 0x3F747AE140000000 +; CHECK-NOFOLD-NEXT: br i1 [[I]], label %[[BB3:.*]], label %[[BB20:.*]], !unpredictable [[META0:![0-9]+]] +; CHECK-NOFOLD: [[BB3]]: +; CHECK-NOFOLD-NEXT: [[I4:%.*]] = extractelement <2 x float> [[ARG1]], i64 0 +; CHECK-NOFOLD-NEXT: [[I5:%.*]] = fmul fast float [[I4]], [[I4]] +; CHECK-NOFOLD-NEXT: [[I6:%.*]] = extractelement <2 x float> [[ARG1]], i64 1 +; CHECK-NOFOLD-NEXT: [[I7:%.*]] = fmul fast float [[I6]], [[I6]] +; CHECK-NOFOLD-NEXT: [[I8:%.*]] = fadd fast float [[I7]], [[I5]] +; CHECK-NOFOLD-NEXT: [[I9:%.*]] = extractelement <2 x float> [[ARG2]], i64 0 +; CHECK-NOFOLD-NEXT: [[I10:%.*]] = fmul fast float [[I9]], [[I9]] +; CHECK-NOFOLD-NEXT: [[I11:%.*]] = fadd fast float [[I8]], [[I10]] +; CHECK-NOFOLD-NEXT: [[I12:%.*]] = tail call fast noundef float @llvm.sqrt.f32(float [[I11]]) +; CHECK-NOFOLD-NEXT: [[I13:%.*]] = fdiv fast float 0x3FEFD70A40000000, [[I12]] +; CHECK-NOFOLD-NEXT: [[I14:%.*]] = fmul fast float [[I13]], [[I4]] +; CHECK-NOFOLD-NEXT: [[I15:%.*]] = insertelement <2 x float> poison, float [[I14]], i64 0 +; CHECK-NOFOLD-NEXT: [[I16:%.*]] = fmul fast float [[I13]], [[I6]] +; CHECK-NOFOLD-NEXT: [[I17:%.*]] = insertelement <2 x float> [[I15]], float [[I16]], i64 1 +; CHECK-NOFOLD-NEXT: [[I18:%.*]] = fmul fast float [[I13]], [[I9]] +; CHECK-NOFOLD-NEXT: [[I19:%.*]] = insertelement <2 x float> [[ARG2]], float [[I18]], i64 0 +; CHECK-NOFOLD-NEXT: br label %[[BB20]] +; CHECK-NOFOLD: [[BB20]]: +; CHECK-NOFOLD-NEXT: [[I21:%.*]] = phi nsz <2 x float> [ [[I17]], %[[BB3]] ], [ zeroinitializer, %[[BB]] ] +; CHECK-NOFOLD-NEXT: [[I22:%.*]] = phi nsz <2 x float> [ [[I19]], %[[BB3]] ], [ zeroinitializer, %[[BB]] ] +; CHECK-NOFOLD-NEXT: [[I23:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[I21]], 0 +; CHECK-NOFOLD-NEXT: [[I24:%.*]] = insertvalue { <2 x float>, <2 x float> } [[I23]], <2 x float> [[I22]], 1 +; 
CHECK-NOFOLD-NEXT: ret { <2 x float>, <2 x float> } [[I24]] ; ; CHECK-FOLD-LABEL: define { <2 x float>, <2 x float> } @foo( -; CHECK-FOLD-SAME: float [[SPEED:%.*]], <2 x float> [[VELOCITY_COERCE0:%.*]], <2 x float> [[VELOCITY_COERCE1:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-FOLD-NEXT: [[ENTRY:.*:]] -; CHECK-FOLD-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[SPEED]], 0x3F747AE140000000 -; CHECK-FOLD-NEXT: [[VELOCITY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE0]], i64 0 -; CHECK-FOLD-NEXT: [[MUL_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_0_VEC_EXTRACT]], [[VELOCITY_SROA_0_0_VEC_EXTRACT]] -; CHECK-FOLD-NEXT: [[VELOCITY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE0]], i64 1 -; CHECK-FOLD-NEXT: [[MUL8_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_4_VEC_EXTRACT]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]] -; CHECK-FOLD-NEXT: [[ADD_I_I_I_I:%.*]] = fadd fast float [[MUL8_I_I_I_I]], [[MUL_I_I_I_I]] -; CHECK-FOLD-NEXT: [[VELOCITY_SROA_14_8_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE1]], i64 0 -; CHECK-FOLD-NEXT: [[MUL13_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_14_8_VEC_EXTRACT]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]] -; CHECK-FOLD-NEXT: [[ADD14_I_I_I_I:%.*]] = fadd fast float [[ADD_I_I_I_I]], [[MUL13_I_I_I_I]] -; CHECK-FOLD-NEXT: [[TMP0:%.*]] = tail call fast float @llvm.sqrt.f32(float [[ADD14_I_I_I_I]]) -; CHECK-FOLD-NEXT: [[MUL_I_I_I:%.*]] = fdiv fast float 0x3FEFD70A40000000, [[TMP0]] -; CHECK-FOLD-NEXT: [[SUB_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_0_0_VEC_EXTRACT]] -; CHECK-FOLD-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[SUB_I]], i64 0 -; CHECK-FOLD-NEXT: [[SUB8_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]] -; CHECK-FOLD-NEXT: [[VELOCITY_SROA_0_4_VEC_INSERT25:%.*]] = insertelement <2 x float> [[TMP1]], float [[SUB8_I]], i64 1 -; CHECK-FOLD-NEXT: [[SUB13_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]] 
-; CHECK-FOLD-NEXT: [[VELOCITY_SROA_14_8_VEC_INSERT35:%.*]] = insertelement <2 x float> [[VELOCITY_COERCE1]], float [[SUB13_I]], i64 0 -; CHECK-FOLD-NEXT: [[VELOCITY_SROA_0_0:%.*]] = select nsz i1 [[CMP]], <2 x float> [[VELOCITY_SROA_0_4_VEC_INSERT25]], <2 x float> zeroinitializer, !unpredictable [[META0:![0-9]+]] -; CHECK-FOLD-NEXT: [[VELOCITY_SROA_14_0:%.*]] = select nsz i1 [[CMP]], <2 x float> [[VELOCITY_SROA_14_8_VEC_INSERT35]], <2 x float> zeroinitializer, !unpredictable [[META0]] -; CHECK-FOLD-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VELOCITY_SROA_0_0]], 0 -; CHECK-FOLD-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[DOTFCA_0_INSERT]], <2 x float> [[VELOCITY_SROA_14_0]], 1 -; CHECK-FOLD-NEXT: ret { <2 x float>, <2 x float> } [[DOTFCA_1_INSERT]] +; CHECK-FOLD-SAME: float [[ARG:%.*]], <2 x float> [[ARG1:%.*]], <2 x float> [[ARG2:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-FOLD-NEXT: [[BB:.*]]: +; CHECK-FOLD-NEXT: [[I:%.*]] = fcmp fast ogt float [[ARG]], 0x3F747AE140000000 +; CHECK-FOLD-NEXT: br i1 [[I]], label %[[BB3:.*]], label %[[BB20:.*]], !unpredictable [[META0:![0-9]+]] +; CHECK-FOLD: [[BB3]]: +; CHECK-FOLD-NEXT: [[I4:%.*]] = extractelement <2 x float> [[ARG1]], i64 0 +; CHECK-FOLD-NEXT: [[I5:%.*]] = fmul fast float [[I4]], [[I4]] +; CHECK-FOLD-NEXT: [[I6:%.*]] = extractelement <2 x float> [[ARG1]], i64 1 +; CHECK-FOLD-NEXT: [[I7:%.*]] = fmul fast float [[I6]], [[I6]] +; CHECK-FOLD-NEXT: [[I8:%.*]] = fadd fast float [[I7]], [[I5]] +; CHECK-FOLD-NEXT: [[I9:%.*]] = extractelement <2 x float> [[ARG2]], i64 0 +; CHECK-FOLD-NEXT: [[I10:%.*]] = fmul fast float [[I9]], [[I9]] +; CHECK-FOLD-NEXT: [[I11:%.*]] = fadd fast float [[I8]], [[I10]] +; CHECK-FOLD-NEXT: [[I12:%.*]] = tail call fast noundef float @llvm.sqrt.f32(float [[I11]]) +; CHECK-FOLD-NEXT: [[I13:%.*]] = fdiv fast float 0x3FEFD70A40000000, [[I12]] +; CHECK-FOLD-NEXT: [[I14:%.*]] = fmul fast float [[I13]], [[I4]] +; 
CHECK-FOLD-NEXT: [[I15:%.*]] = insertelement <2 x float> poison, float [[I14]], i64 0 +; CHECK-FOLD-NEXT: [[I16:%.*]] = fmul fast float [[I13]], [[I6]] +; CHECK-FOLD-NEXT: [[I17:%.*]] = insertelement <2 x float> [[I15]], float [[I16]], i64 1 +; CHECK-FOLD-NEXT: [[I18:%.*]] = fmul fast float [[I13]], [[I9]] +; CHECK-FOLD-NEXT: [[I19:%.*]] = insertelement <2 x float> [[ARG2]], float [[I18]], i64 0 +; CHECK-FOLD-NEXT: br label %[[BB20]] +; CHECK-FOLD: [[BB20]]: +; CHECK-FOLD-NEXT: [[I21:%.*]] = phi nsz <2 x float> [ [[I17]], %[[BB3]] ], [ zeroinitializer, %[[BB]] ] +; CHECK-FOLD-NEXT: [[I22:%.*]] = phi nsz <2 x float> [ [[I19]], %[[BB3]] ], [ zeroinitializer, %[[BB]] ] +; CHECK-FOLD-NEXT: [[I23:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[I21]], 0 +; CHECK-FOLD-NEXT: [[I24:%.*]] = insertvalue { <2 x float>, <2 x float> } [[I23]], <2 x float> [[I22]], 1 +; CHECK-FOLD-NEXT: ret { <2 x float>, <2 x float> } [[I24]] ; -entry: - %cmp = fcmp fast ogt float %speed, 0x3F747AE140000000 - br i1 %cmp, label %if.then, label %if.end, !unpredictable !0 +bb: + %i = fcmp fast ogt float %arg, 0x3F747AE140000000 + br i1 %i, label %bb3, label %bb20, !unpredictable !0 -if.then: - %velocity.sroa.0.0.vec.extract = extractelement <2 x float> %velocity.coerce0, i64 0 - %mul.i.i.i.i = fmul fast float %velocity.sroa.0.0.vec.extract, %velocity.sroa.0.0.vec.extract - %velocity.sroa.0.4.vec.extract = extractelement <2 x float> %velocity.coerce0, i64 1 - %mul8.i.i.i.i = fmul fast float %velocity.sroa.0.4.vec.extract, %velocity.sroa.0.4.vec.extract - %add.i.i.i.i = fadd fast float %mul8.i.i.i.i, %mul.i.i.i.i - %velocity.sroa.14.8.vec.extract = extractelement <2 x float> %velocity.coerce1, i64 0 - %mul13.i.i.i.i = fmul fast float %velocity.sroa.14.8.vec.extract, %velocity.sroa.14.8.vec.extract - %add14.i.i.i.i = fadd fast float %add.i.i.i.i, %mul13.i.i.i.i - %0 = tail call fast noundef float @llvm.sqrt.f32(float %add14.i.i.i.i) - %mul.i.i.i = fdiv fast float 
0x3FEFD70A40000000, %0 - %sub.i = fmul fast float %mul.i.i.i, %velocity.sroa.0.0.vec.extract - %1 = insertelement <2 x float> poison, float %sub.i, i64 0 - %sub8.i = fmul fast float %mul.i.i.i, %velocity.sroa.0.4.vec.extract - %velocity.sroa.0.4.vec.insert25 = insertelement <2 x float> %1, float %sub8.i, i64 1 - %sub13.i = fmul fast float %mul.i.i.i, %velocity.sroa.14.8.vec.extract - %velocity.sroa.14.8.vec.insert35 = insertelement <2 x float> %velocity.coerce1, float %sub13.i, i64 0 - br label %if.end +bb3: ; preds = %bb + %i4 = extractelement <2 x float> %arg1, i64 0 + %i5 = fmul fast float %i4, %i4 + %i6 = extractelement <2 x float> %arg1, i64 1 + %i7 = fmul fast float %i6, %i6 + %i8 = fadd fast float %i7, %i5 + %i9 = extractelement <2 x float> %arg2, i64 0 + %i10 = fmul fast float %i9, %i9 + %i11 = fadd fast float %i8, %i10 + %i12 = tail call fast noundef float @llvm.sqrt.f32(float %i11) + %i13 = fdiv fast float 0x3FEFD70A40000000, %i12 + %i14 = fmul fast float %i13, %i4 + %i15 = insertelement <2 x float> poison, float %i14, i64 0 + %i16 = fmul fast float %i13, %i6 + %i17 = insertelement <2 x float> %i15, float %i16, i64 1 + %i18 = fmul fast float %i13, %i9 + %i19 = insertelement <2 x float> %arg2, float %i18, i64 0 + br label %bb20 -if.end: - %velocity.sroa.0.0 = phi nsz <2 x float> [ %velocity.sroa.0.4.vec.insert25, %if.then ], [ zeroinitializer, %entry ] - %velocity.sroa.14.0 = phi nsz <2 x float> [ %velocity.sroa.14.8.vec.insert35, %if.then ], [ zeroinitializer, %entry ] - %.fca.0.insert = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> %velocity.sroa.0.0, 0 - %.fca.1.insert = insertvalue { <2 x float>, <2 x float> } %.fca.0.insert, <2 x float> %velocity.sroa.14.0, 1 - ret { <2 x float>, <2 x float> } %.fca.1.insert +bb20: ; preds = %bb3, %bb + %i21 = phi nsz <2 x float> [ %i17, %bb3 ], [ zeroinitializer, %bb ] + %i22 = phi nsz <2 x float> [ %i19, %bb3 ], [ zeroinitializer, %bb ] + %i23 = insertvalue { <2 x float>, <2 x float> } poison, <2 x 
float> %i21, 0 + %i24 = insertvalue { <2 x float>, <2 x float> } %i23, <2 x float> %i22, 1 + ret { <2 x float>, <2 x float> } %i24 } declare float @llvm.sqrt.f32(float) From 7a8e32baab9fa05d7b8121d0178195ef59ddd470 Mon Sep 17 00:00:00 2001 From: Tianqing Wang Date: Mon, 22 Jul 2024 14:53:57 +0800 Subject: [PATCH 6/6] Add "speculate-unpredictables" SimplifyCFGOptions. --- .../Transforms/Utils/SimplifyCFGOptions.h | 5 +++++ llvm/lib/Passes/PassBuilder.cpp | 2 ++ llvm/lib/Passes/PassBuilderPipelines.cpp | 12 +++++----- .../lib/Transforms/Scalar/SimplifyCFGPass.cpp | 9 +++++++- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 8 ++++--- llvm/test/Other/new-pm-print-pipeline.ll | 4 ++-- .../two-entry-phi-fold-unpredictable.ll | 22 +++++++++---------- 7 files changed, 40 insertions(+), 22 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h index 8008fc6e8422d..2ea9d64f03cb6 100644 --- a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h +++ b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h @@ -30,6 +30,7 @@ struct SimplifyCFGOptions { bool SinkCommonInsts = false; bool SimplifyCondBranch = true; bool SpeculateBlocks = true; + bool SpeculateUnpredictables = false; AssumptionCache *AC = nullptr; @@ -75,6 +76,10 @@ struct SimplifyCFGOptions { SpeculateBlocks = B; return *this; } + SimplifyCFGOptions &speculateUnpredictables(bool B) { + SpeculateUnpredictables = B; + return *this; + } }; } // namespace llvm diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index a9d3f8ec3a4ec..ade331166f994 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -845,6 +845,8 @@ Expected parseSimplifyCFGOptions(StringRef Params) { Result.hoistCommonInsts(Enable); } else if (ParamName == "sink-common-insts") { Result.sinkCommonInsts(Enable); + } else if (ParamName == "speculate-unpredictables") { + Result.speculateUnpredictables(Enable); } 
else if (Enable && ParamName.consume_front("bonus-inst-threshold=")) { APInt BonusInstThreshold; if (ParamName.getAsInteger(0, BonusInstThreshold)) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 4fd5ee1946bb7..010f7247d96fd 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1505,8 +1505,9 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // LoopSink (and other loop passes since the last simplifyCFG) might have // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. - OptimizePM.addPass( - SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); + OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions() + .convertSwitchRangeToICmp(true) + .speculateUnpredictables(true))); // Add the core optimizing pipeline. MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM), @@ -2024,9 +2025,10 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, LateFPM.addPass(DivRemPairsPass()); // Delete basic blocks, which optimization passes may have killed. - LateFPM.addPass(SimplifyCFGPass( - SimplifyCFGOptions().convertSwitchRangeToICmp(true).hoistCommonInsts( - true))); + LateFPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() + .convertSwitchRangeToICmp(true) + .hoistCommonInsts(true) + .speculateUnpredictables(true))); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(LateFPM))); // Drop bodies of available eternally objects to improve GlobalDCE. 
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index b7baf34f27c21..11de37f7a7c10 100644 --- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -77,6 +77,9 @@ static cl::opt<bool> UserSinkCommonInsts( "sink-common-insts", cl::Hidden, cl::init(false), cl::desc("Sink common instructions (default = false)")); +static cl::opt<bool> UserSpeculateUnpredictables( + "speculate-unpredictables", cl::Hidden, cl::init(false), + cl::desc("Speculate unpredictable branches (default = false)")); STATISTIC(NumSimpl, "Number of blocks simplified"); @@ -325,6 +328,8 @@ static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) { Options.HoistCommonInsts = UserHoistCommonInsts; if (UserSinkCommonInsts.getNumOccurrences()) Options.SinkCommonInsts = UserSinkCommonInsts; + if (UserSpeculateUnpredictables.getNumOccurrences()) + Options.SpeculateUnpredictables = UserSpeculateUnpredictables; } SimplifyCFGPass::SimplifyCFGPass() { @@ -351,7 +356,9 @@ void SimplifyCFGPass::printPipeline( OS << (Options.HoistCommonInsts ? "" : "no-") << "hoist-common-insts;"; OS << (Options.SinkCommonInsts ? "" : "no-") << "sink-common-insts;"; OS << (Options.SpeculateBlocks ? "" : "no-") << "speculate-blocks;"; - OS << (Options.SimplifyCondBranch ? "" : "no-") << "simplify-cond-branch"; + OS << (Options.SimplifyCondBranch ? "" : "no-") << "simplify-cond-branch;"; + OS << (Options.SpeculateUnpredictables ? "" : "no-") + << "speculate-unpredictables"; OS << '>'; } diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 3ab1564e93e49..f23e28888931d 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -3476,7 +3476,8 @@ static bool FoldCondBranchOnValueKnownInPredecessor(BranchInst *BI, /// Given a BB that starts with the specified two-entry PHI node, /// see if we can eliminate it.
static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, - DomTreeUpdater *DTU, const DataLayout &DL) { + DomTreeUpdater *DTU, const DataLayout &DL, + bool SpeculateUnpredictables) { // Ok, this is a two entry PHI node. Check to see if this is a simple "if // statement", which has a very simple dominance structure. Basically, we // are trying to find the condition that is being branched on, which @@ -3552,7 +3553,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, InstructionCost Cost = 0; InstructionCost Budget = TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic; - if (IsUnpredictable) + if (SpeculateUnpredictables && IsUnpredictable) Budget += TTI.getBranchMispredictPenalty(); bool Changed = false; @@ -7818,7 +7819,8 @@ bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) { // eliminate it, do so now. if (auto *PN = dyn_cast<PHINode>(BB->begin())) if (PN->getNumIncomingValues() == 2) - if (FoldTwoEntryPHINode(PN, TTI, DTU, DL)) + if (FoldTwoEntryPHINode(PN, TTI, DTU, DL, + Options.SpeculateUnpredictables)) return true; } diff --git a/llvm/test/Other/new-pm-print-pipeline.ll b/llvm/test/Other/new-pm-print-pipeline.ll index a524c9991f1bf..f2e80814f347a 100644 --- a/llvm/test/Other/new-pm-print-pipeline.ll +++ b/llvm/test/Other/new-pm-print-pipeline.ll @@ -49,8 +49,8 @@ ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(print,print)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-17 ; CHECK-17: function(print,print) -; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(simplifycfg,simplifycfg)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-18 -; CHECK-18: function(simplifycfg,simplifycfg) +; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(simplifycfg,simplifycfg)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-18 +; CHECK-18: function(simplifycfg,simplifycfg) ; RUN: opt
-disable-output -disable-verify -print-pipeline-passes -passes='function(loop-vectorize,loop-vectorize)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-19 ; CHECK-19: function(loop-vectorize,loop-vectorize) diff --git a/llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll b/llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll index 0bce8e3ed7dd3..82566d47b0328 100644 --- a/llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll +++ b/llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll @@ -1,11 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 ; Two-entry phi nodes with unpredictable conditions may get increased budget for folding. ; RUN: opt < %s -S -passes=simplifycfg | FileCheck --check-prefix=CHECK-NOFOLD %s -; RUN: opt -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s -S -passes=simplifycfg | FileCheck --check-prefix=CHECK-FOLD %s +; RUN: opt < %s -S -passes='simplifycfg<speculate-unpredictables>' | FileCheck --check-prefix=CHECK-NOFOLD %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s -S -passes=simplifycfg | FileCheck --check-prefix=CHECK-NOFOLD %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s -S -passes='simplifycfg<speculate-unpredictables>' | FileCheck --check-prefix=CHECK-FOLD %s -define { <2 x float>, <2 x float> } @foo(float %arg, <2 x float> %arg1, <2 x float> %arg2) { +define { <2 x float>, <2 x float> } @foo(float %arg, <2 x float> %arg1, <2 x float> %arg2) #0 { ; CHECK-NOFOLD-LABEL: define { <2 x float>, <2 x float> } @foo( -; CHECK-NOFOLD-SAME: float [[ARG:%.*]], <2 x float> [[ARG1:%.*]], <2 x float> [[ARG2:%.*]]) { +; CHECK-NOFOLD-SAME: float [[ARG:%.*]], <2 x float> [[ARG1:%.*]], <2 x float> [[ARG2:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NOFOLD-NEXT: [[BB:.*]]: ; CHECK-NOFOLD-NEXT: [[I:%.*]] = fcmp fast ogt float [[ARG]], 0x3F747AE140000000 ; CHECK-NOFOLD-NEXT: br i1 [[I]], label %[[BB3:.*]], label %[[BB20:.*]],
!unpredictable [[META0:![0-9]+]] @@ -36,10 +38,8 @@ define { <2 x float>, <2 x float> } @foo(float %arg, <2 x float> %arg1, <2 x flo ; ; CHECK-FOLD-LABEL: define { <2 x float>, <2 x float> } @foo( ; CHECK-FOLD-SAME: float [[ARG:%.*]], <2 x float> [[ARG1:%.*]], <2 x float> [[ARG2:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-FOLD-NEXT: [[BB:.*]]: +; CHECK-FOLD-NEXT: [[BB:.*:]] ; CHECK-FOLD-NEXT: [[I:%.*]] = fcmp fast ogt float [[ARG]], 0x3F747AE140000000 -; CHECK-FOLD-NEXT: br i1 [[I]], label %[[BB3:.*]], label %[[BB20:.*]], !unpredictable [[META0:![0-9]+]] -; CHECK-FOLD: [[BB3]]: ; CHECK-FOLD-NEXT: [[I4:%.*]] = extractelement <2 x float> [[ARG1]], i64 0 ; CHECK-FOLD-NEXT: [[I5:%.*]] = fmul fast float [[I4]], [[I4]] ; CHECK-FOLD-NEXT: [[I6:%.*]] = extractelement <2 x float> [[ARG1]], i64 1 @@ -48,7 +48,7 @@ define { <2 x float>, <2 x float> } @foo(float %arg, <2 x float> %arg1, <2 x flo ; CHECK-FOLD-NEXT: [[I9:%.*]] = extractelement <2 x float> [[ARG2]], i64 0 ; CHECK-FOLD-NEXT: [[I10:%.*]] = fmul fast float [[I9]], [[I9]] ; CHECK-FOLD-NEXT: [[I11:%.*]] = fadd fast float [[I8]], [[I10]] -; CHECK-FOLD-NEXT: [[I12:%.*]] = tail call fast noundef float @llvm.sqrt.f32(float [[I11]]) +; CHECK-FOLD-NEXT: [[I12:%.*]] = tail call fast float @llvm.sqrt.f32(float [[I11]]) ; CHECK-FOLD-NEXT: [[I13:%.*]] = fdiv fast float 0x3FEFD70A40000000, [[I12]] ; CHECK-FOLD-NEXT: [[I14:%.*]] = fmul fast float [[I13]], [[I4]] ; CHECK-FOLD-NEXT: [[I15:%.*]] = insertelement <2 x float> poison, float [[I14]], i64 0 @@ -56,10 +56,8 @@ define { <2 x float>, <2 x float> } @foo(float %arg, <2 x float> %arg1, <2 x flo ; CHECK-FOLD-NEXT: [[I17:%.*]] = insertelement <2 x float> [[I15]], float [[I16]], i64 1 ; CHECK-FOLD-NEXT: [[I18:%.*]] = fmul fast float [[I13]], [[I9]] ; CHECK-FOLD-NEXT: [[I19:%.*]] = insertelement <2 x float> [[ARG2]], float [[I18]], i64 0 -; CHECK-FOLD-NEXT: br label %[[BB20]] -; CHECK-FOLD: [[BB20]]: -; CHECK-FOLD-NEXT: [[I21:%.*]] = phi nsz <2 x float> [ [[I17]], %[[BB3]] ], [ 
zeroinitializer, %[[BB]] ] -; CHECK-FOLD-NEXT: [[I22:%.*]] = phi nsz <2 x float> [ [[I19]], %[[BB3]] ], [ zeroinitializer, %[[BB]] ] +; CHECK-FOLD-NEXT: [[I21:%.*]] = select nsz i1 [[I]], <2 x float> [[I17]], <2 x float> zeroinitializer, !unpredictable [[META0:![0-9]+]] +; CHECK-FOLD-NEXT: [[I22:%.*]] = select nsz i1 [[I]], <2 x float> [[I19]], <2 x float> zeroinitializer, !unpredictable [[META0]] ; CHECK-FOLD-NEXT: [[I23:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[I21]], 0 ; CHECK-FOLD-NEXT: [[I24:%.*]] = insertvalue { <2 x float>, <2 x float> } [[I23]], <2 x float> [[I22]], 1 ; CHECK-FOLD-NEXT: ret { <2 x float>, <2 x float> } [[I24]] @@ -97,4 +95,6 @@ bb20: ; preds = %bb3, %bb declare float @llvm.sqrt.f32(float) +attributes #0 = { nounwind } + !0 = !{}