diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 5c32e251588fd..9c664114b82cb 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -411,46 +411,50 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createReassociatePass()); // Reassociate expressions - // Begin the loop pass pipeline. - if (EnableSimpleLoopUnswitch) { - // The simple loop unswitch pass relies on separate cleanup passes. Schedule - // them first so when we re-process a loop they run before other loop - // passes. - MPM.add(createLoopInstSimplifyPass()); - MPM.add(createLoopSimplifyCFGPass()); + // Do not run loop pass pipeline in "SYCL Optimization Mode". Loop + // optimizations rely on TTI, which is not accurate for SPIR target. + if (!SYCLOptimizationMode) { + // Begin the loop pass pipeline. + if (EnableSimpleLoopUnswitch) { + // The simple loop unswitch pass relies on separate cleanup passes. + // Schedule them first so when we re-process a loop they run before other + // loop passes. + MPM.add(createLoopInstSimplifyPass()); + MPM.add(createLoopSimplifyCFGPass()); + } + // Rotate Loop - disable header duplication at -Oz + MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); + // TODO: Investigate promotion cap for O1. + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + if (EnableSimpleLoopUnswitch) + MPM.add(createSimpleLoopUnswitchLegacyPass()); + else + MPM.add( + createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); + // FIXME: We break the loop pass pipeline here in order to do full + // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace + // the need for this. + MPM.add(createCFGSimplificationPass()); + MPM.add(createInstructionCombiningPass()); + // We resume loop passes creating a second loop pipeline here. 
+ // TODO: IndVarSimplify hurts SPIR performance by promoting induction + // variables from 32-bit to 64-bit values; presumably the pass sees a + // virtual target with unlimited registers and ignores the cost on real + // HW. The enclosing !SYCLOptimizationMode check skips it in SYCL mode. + MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars + MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. + addExtensionsToPM(EP_LateLoopOptimizations, MPM); + MPM.add(createLoopDeletionPass()); // Delete dead loops + + if (EnableLoopInterchange) + MPM.add(createLoopInterchangePass()); // Interchange loops + + // Unroll small loops + MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); + addExtensionsToPM(EP_LoopOptimizerEnd, MPM); + // This ends the loop pass pipelines. } - // Rotate Loop - disable header duplication at -Oz - MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); - // TODO: Investigate promotion cap for O1. - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); - if (EnableSimpleLoopUnswitch) - MPM.add(createSimpleLoopUnswitchLegacyPass()); - else - MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); - // FIXME: We break the loop pass pipeline here in order to do full - // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace the - // need for this. - MPM.add(createCFGSimplificationPass()); - MPM.add(createInstructionCombiningPass()); - // We resume loop passes creating a second loop pipeline here. - // TODO: this pass hurts performance due to promotions of induction variables - // from 32-bit value to 64-bit values. I assume it's because SPIR is a virtual - // target with unlimited # of registers and pass doesn't take into account - // that on real HW this promotion is not beneficial. 
- if (!SYCLOptimizationMode) - MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars - MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. - addExtensionsToPM(EP_LateLoopOptimizations, MPM); - MPM.add(createLoopDeletionPass()); // Delete dead loops - - if (EnableLoopInterchange) - MPM.add(createLoopInterchangePass()); // Interchange loops - - // Unroll small loops - MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); - addExtensionsToPM(EP_LoopOptimizerEnd, MPM); - // This ends the loop pass pipelines. if (OptLevel > 1) { MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds @@ -819,19 +823,21 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createLoopUnrollAndJamPass(OptLevel)); } - // Unroll small loops - MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); + if (!SYCLOptimizationMode) { + // Unroll small loops + MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); - if (!DisableUnrollLoops) { - // LoopUnroll may generate some redundency to cleanup. - MPM.add(createInstructionCombiningPass()); + if (!DisableUnrollLoops) { + // LoopUnroll may generate some redundancy to clean up. + MPM.add(createInstructionCombiningPass()); - // Runtime unrolling will introduce runtime check in loop prologue. If the - // unrolled loop is a inner loop, then the prologue will be inside the - // outer loop. LICM pass can help to promote the runtime check out if the - // checked value is loop invariant. - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + // Runtime unrolling will introduce runtime check in loop prologue. If the + // unrolled loop is an inner loop, then the prologue will be inside the + // outer loop. LICM pass can help to promote the runtime check out if the + // checked value is loop invariant. 
+ MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + } } MPM.add(createWarnMissedTransformationsPass()); @@ -1034,13 +1040,16 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { if (EnableLoopInterchange) PM.add(createLoopInterchangePass()); - // Unroll small loops - PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); - PM.add(createLoopVectorizePass(true, !LoopVectorize)); - // The vectorizer may have significantly shortened a loop body; unroll again. - PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); + if (!SYCLOptimizationMode) { + // Unroll small loops + PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); + PM.add(createLoopVectorizePass(true, !LoopVectorize)); + // The vectorizer may have significantly shortened a loop body; unroll + // again. + PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); + } PM.add(createWarnMissedTransformationsPass()); diff --git a/sycl/test/sub_group/broadcast.cpp b/sycl/test/sub_group/broadcast.cpp index 3dbba78387b2d..49df849c1baad 100644 --- a/sycl/test/sub_group/broadcast.cpp +++ b/sycl/test/sub_group/broadcast.cpp @@ -1,4 +1,3 @@ -// XFAIL: cpu // UNSUPPORTED: cuda // CUDA compilation and runtime do not yet support sub-groups. diff --git a/sycl/test/sub_group/broadcast_fp64.cpp b/sycl/test/sub_group/broadcast_fp64.cpp index 9652fa6b73f46..f9f87e8f95fd9 100644 --- a/sycl/test/sub_group/broadcast_fp64.cpp +++ b/sycl/test/sub_group/broadcast_fp64.cpp @@ -1,4 +1,3 @@ -// XFAIL: cpu // UNSUPPORTED: cuda // CUDA compilation and runtime do not yet support sub-groups.