@@ -429,54 +429,54 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
429429  MPM.add (createCFGSimplificationPass ());      //  Merge & remove BBs
430430  MPM.add (createReassociatePass ());           //  Reassociate expressions
431431
432-   //  Begin the loop pass pipeline.
433-   if  (EnableSimpleLoopUnswitch) {
434-     //  The simple loop unswitch pass relies on separate cleanup passes. Schedule
435-     //  them first so when we re-process a loop they run before other loop
436-     //  passes.
437-     MPM.add (createLoopInstSimplifyPass ());
438-     MPM.add (createLoopSimplifyCFGPass ());
439-   }
440-   //  Try to remove as much code from the loop header as possible,
441-   //  to reduce amount of IR that will have to be duplicated.
442-   //  TODO: Investigate promotion cap for O1.
443-   MPM.add (createLICMPass (LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
444-   //  Rotate Loop - disable header duplication at -Oz
445-   MPM.add (createLoopRotatePass (SizeLevel == 2  ? 0  : -1 , PrepareForLTO));
446-   //  TODO: Investigate promotion cap for O1.
447-   MPM.add (createLICMPass (LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
448-   if  (EnableSimpleLoopUnswitch)
449-     MPM.add (createSimpleLoopUnswitchLegacyPass ());
450-   else 
451-     MPM.add (createLoopUnswitchPass (SizeLevel || OptLevel < 3 , DivergentTarget));
452-   //  FIXME: We break the loop pass pipeline here in order to do full
453-   //  simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace the
454-   //  need for this.
455-   MPM.add (createCFGSimplificationPass ());
456-   MPM.add (createInstructionCombiningPass ());
457-   //  We resume loop passes creating a second loop pipeline here.
458-   if  (EnableLoopFlatten) {
459-     MPM.add (createLoopFlattenPass ()); //  Flatten loops
460-     MPM.add (createLoopSimplifyCFGPass ());
432+   //  Do not run loop pass pipeline in "SYCL Optimization Mode". Loop
433+   //  optimizations rely on TTI, which is not accurate for the SPIR target.
434+   if  (!SYCLOptimizationMode) {
435+     //  Begin the loop pass pipeline.
436+     if  (EnableSimpleLoopUnswitch) {
437+       //  The simple loop unswitch pass relies on separate cleanup passes.
438+       //  Schedule them first so when we re-process a loop they run before other
439+       //  loop passes.
440+       MPM.add (createLoopInstSimplifyPass ());
441+       MPM.add (createLoopSimplifyCFGPass ());
442+     }
443+     //  Try to remove as much code from the loop header as possible,
444+     //  to reduce amount of IR that will have to be duplicated.
445+     //  TODO: Investigate promotion cap for O1.
446+     MPM.add (createLICMPass (LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
447+     //  Rotate Loop - disable header duplication at -Oz
448+     MPM.add (createLoopRotatePass (SizeLevel == 2  ? 0  : -1 , PrepareForLTO));
449+     //  TODO: Investigate promotion cap for O1.
450+     MPM.add (createLICMPass (LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
451+     if  (EnableSimpleLoopUnswitch)
452+       MPM.add (createSimpleLoopUnswitchLegacyPass ());
453+     else 
454+       MPM.add (
455+           createLoopUnswitchPass (SizeLevel || OptLevel < 3 , DivergentTarget));
456+     //  FIXME: We break the loop pass pipeline here in order to do full
457+     //  simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace
458+     //  the need for this.
459+     MPM.add (createCFGSimplificationPass ());
460+     MPM.add (createInstructionCombiningPass ());
461+     //  We resume loop passes creating a second loop pipeline here.
462+     if  (EnableLoopFlatten) {
463+       MPM.add (createLoopFlattenPass ()); //  Flatten loops
464+       MPM.add (createLoopSimplifyCFGPass ());
465+     }
466+     MPM.add (createLoopIdiomPass ());      //  Recognize idioms like memset.
467+     MPM.add (createIndVarSimplifyPass ()); //  Canonicalize indvars
468+     addExtensionsToPM (EP_LateLoopOptimizations, MPM);
469+     MPM.add (createLoopDeletionPass ()); //  Delete dead loops
470+ 
471+     if  (EnableLoopInterchange)
472+       MPM.add (createLoopInterchangePass ()); //  Interchange loops
473+ 
474+     //  Unroll small loops and perform peeling.
475+     MPM.add (createSimpleLoopUnrollPass (OptLevel, DisableUnrollLoops,
476+                                        ForgetAllSCEVInLoopUnroll));
477+     addExtensionsToPM (EP_LoopOptimizerEnd, MPM);
478+     //  This ends the loop pass pipelines.
461479  }
462-   MPM.add (createLoopIdiomPass ());             //  Recognize idioms like memset.
463-   //  TODO: this pass hurts performance due to promotions of induction variables
464-   //  from 32-bit value to 64-bit values. I assume it's because SPIR is a virtual
465-   //  target with unlimited # of registers and pass doesn't take into account
466-   //  that on real HW this promotion is not beneficial.
467-   if  (!SYCLOptimizationMode)
468-     MPM.add (createIndVarSimplifyPass ());      //  Canonicalize indvars
469-   addExtensionsToPM (EP_LateLoopOptimizations, MPM);
470-   MPM.add (createLoopDeletionPass ());          //  Delete dead loops
471- 
472-   if  (EnableLoopInterchange)
473-     MPM.add (createLoopInterchangePass ()); //  Interchange loops
474- 
475-   //  Unroll small loops and perform peeling.
476-   MPM.add (createSimpleLoopUnrollPass (OptLevel, DisableUnrollLoops,
477-                                      ForgetAllSCEVInLoopUnroll));
478-   addExtensionsToPM (EP_LoopOptimizerEnd, MPM);
479-   //  This ends the loop pass pipelines.
480480
481481  //  Break up allocas that may now be splittable after loop unrolling.
482482  MPM.add (createSROAPass ());
@@ -788,68 +788,74 @@ void PassManagerBuilder::populateModulePassManager(
788788
789789  addExtensionsToPM (EP_VectorizerStart, MPM);
790790
791-   //  Re-rotate loops in all our loop nests. These may have fallout out of
792-   //  rotated form due to GVN or other transformations, and the vectorizer relies
793-   //  on the rotated form. Disable header duplication at -Oz.
794-   MPM.add (createLoopRotatePass (SizeLevel == 2  ? 0  : -1 , PrepareForLTO));
795- 
796-   //  Distribute loops to allow partial vectorization.  I.e. isolate dependences
797-   //  into separate loop that would otherwise inhibit vectorization.  This is
798-   //  currently only performed for loops marked with the metadata
799-   //  llvm.loop.distribute=true or when -enable-loop-distribute is specified.
800-   MPM.add (createLoopDistributePass ());
801- 
802-   MPM.add (createLoopVectorizePass (!LoopsInterleaved, !LoopVectorize));
803- 
804-   //  Eliminate loads by forwarding stores from the previous iteration to loads
805-   //  of the current iteration.
806-   MPM.add (createLoopLoadEliminationPass ());
807- 
808-   //  FIXME: Because of #pragma vectorize enable, the passes below are always
809-   //  inserted in the pipeline, even when the vectorizer doesn't run (ex. when
810-   //  on -O1 and no #pragma is found). Would be good to have these two passes
811-   //  as function calls, so that we can only pass them when the vectorizer
812-   //  changed the code.
813-   MPM.add (createInstructionCombiningPass ());
814-   if  (OptLevel > 1  && ExtraVectorizerPasses) {
815-     //  At higher optimization levels, try to clean up any runtime overlap and
816-     //  alignment checks inserted by the vectorizer. We want to track correllated
817-     //  runtime checks for two inner loops in the same outer loop, fold any
818-     //  common computations, hoist loop-invariant aspects out of any outer loop,
819-     //  and unswitch the runtime checks if possible. Once hoisted, we may have
820-     //  dead (or speculatable) control flows or more combining opportunities.
821-     MPM.add (createEarlyCSEPass ());
822-     MPM.add (createCorrelatedValuePropagationPass ());
823-     MPM.add (createInstructionCombiningPass ());
824-     MPM.add (createLICMPass (LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
825-     MPM.add (createLoopUnswitchPass (SizeLevel || OptLevel < 3 , DivergentTarget));
826-     MPM.add (createCFGSimplificationPass ());
791+   if  (!SYCLOptimizationMode) {
792+     //  Re-rotate loops in all our loop nests. These may have fallen out of
793+     //  rotated form due to GVN or other transformations, and the vectorizer
794+     //  relies on the rotated form. Disable header duplication at -Oz.
795+     MPM.add (createLoopRotatePass (SizeLevel == 2  ? 0  : -1 , PrepareForLTO));
796+ 
797+     //  Distribute loops to allow partial vectorization.  I.e. isolate
798+     //  dependences into separate loop that would otherwise inhibit
799+     //  vectorization.  This is currently only performed for loops marked with
800+     //  the metadata llvm.loop.distribute=true or when -enable-loop-distribute is
801+     //  specified.
802+     MPM.add (createLoopDistributePass ());
803+ 
804+     MPM.add (createLoopVectorizePass (!LoopsInterleaved, !LoopVectorize));
805+ 
806+     //  Eliminate loads by forwarding stores from the previous iteration to loads
807+     //  of the current iteration.
808+     MPM.add (createLoopLoadEliminationPass ());
809+ 
810+     //  FIXME: Because of #pragma vectorize enable, the passes below are always
811+     //  inserted in the pipeline, even when the vectorizer doesn't run (ex. when
812+     //  on -O1 and no #pragma is found). Would be good to have these two passes
813+     //  as function calls, so that we can only pass them when the vectorizer
814+     //  changed the code.
827815    MPM.add (createInstructionCombiningPass ());
828-   }
829- 
830-   //  Cleanup after loop vectorization, etc. Simplification passes like CVP and
831-   //  GVN, loop transforms, and others have already run, so it's now better to
832-   //  convert to more optimized IR using more aggressive simplify CFG options.
833-   //  The extra sinking transform can create larger basic blocks, so do this
834-   //  before SLP vectorization.
835-   //  FIXME: study whether hoisting and/or sinking of common instructions should
836-   //         be delayed until after SLP vectorizer.
837-   MPM.add (createCFGSimplificationPass (SimplifyCFGOptions ()
838-                                           .forwardSwitchCondToPhi (true )
839-                                           .convertSwitchToLookupTable (true )
840-                                           .needCanonicalLoops (false )
841-                                           .hoistCommonInsts (true )
842-                                           .sinkCommonInsts (true )));
843- 
844-   if  (SLPVectorize) {
845-     MPM.add (createSLPVectorizerPass ()); //  Vectorize parallel scalar chains.
846816    if  (OptLevel > 1  && ExtraVectorizerPasses) {
817+       //  At higher optimization levels, try to clean up any runtime overlap and
818+       //  alignment checks inserted by the vectorizer. We want to track
819+       //  correlated runtime checks for two inner loops in the same outer loop,
820+       //  fold any common computations, hoist loop-invariant aspects out of any
821+       //  outer loop, and unswitch the runtime checks if possible. Once hoisted,
822+       //  we may have dead (or speculatable) control flows or more combining
823+       //  opportunities.
847824      MPM.add (createEarlyCSEPass ());
825+       MPM.add (createCorrelatedValuePropagationPass ());
826+       MPM.add (createInstructionCombiningPass ());
827+       MPM.add (createLICMPass (LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
828+       MPM.add (
829+           createLoopUnswitchPass (SizeLevel || OptLevel < 3 , DivergentTarget));
830+       MPM.add (createCFGSimplificationPass ());
831+       MPM.add (createInstructionCombiningPass ());
848832    }
849-   }
850833
851-   //  Enhance/cleanup vector code.
852-   MPM.add (createVectorCombinePass ());
834+     //  Cleanup after loop vectorization, etc. Simplification passes like CVP and
835+     //  GVN, loop transforms, and others have already run, so it's now better to
836+     //  convert to more optimized IR using more aggressive simplify CFG options.
837+     //  The extra sinking transform can create larger basic blocks, so do this
838+     //  before SLP vectorization.
839+     //  FIXME: study whether hoisting and/or sinking of common instructions
840+     //  should be delayed until after the
841+     //  SLP vectorizer.
842+     MPM.add (createCFGSimplificationPass (SimplifyCFGOptions ()
843+                                             .forwardSwitchCondToPhi (true )
844+                                             .convertSwitchToLookupTable (true )
845+                                             .needCanonicalLoops (false )
846+                                             .hoistCommonInsts (true )
847+                                             .sinkCommonInsts (true )));
848+ 
849+     if  (SLPVectorize) {
850+       MPM.add (createSLPVectorizerPass ()); //  Vectorize parallel scalar chains.
851+       if  (OptLevel > 1  && ExtraVectorizerPasses) {
852+         MPM.add (createEarlyCSEPass ());
853+       }
854+     }
855+ 
856+     //  Enhance/cleanup vector code.
857+     MPM.add (createVectorCombinePass ());
858+   }
853859
854860  addExtensionsToPM (EP_Peephole, MPM);
855861  MPM.add (createInstructionCombiningPass ());
@@ -861,22 +867,24 @@ void PassManagerBuilder::populateModulePassManager(
861867    MPM.add (createLoopUnrollAndJamPass (OptLevel));
862868  }
863869
864-   //  Unroll small loops
865-   MPM.add (createLoopUnrollPass (OptLevel, DisableUnrollLoops,
866-                                ForgetAllSCEVInLoopUnroll));
870+   if  (!SYCLOptimizationMode) {
871+     //  Unroll small loops
872+     MPM.add (createLoopUnrollPass (OptLevel, DisableUnrollLoops,
873+                                  ForgetAllSCEVInLoopUnroll));
867874
868-   if  (!DisableUnrollLoops) {
869-     //  LoopUnroll may generate some redundency to cleanup.
870-     MPM.add (createInstructionCombiningPass ());
875+      if  (!DisableUnrollLoops) {
876+        //  LoopUnroll may generate some redundancy to clean up.
877+        MPM.add (createInstructionCombiningPass ());
871878
872-     //  Runtime unrolling will introduce runtime check in loop prologue. If the
873-     //  unrolled loop is a inner loop, then the prologue will be inside the
874-     //  outer loop. LICM pass can help to promote the runtime check out if the
875-     //  checked value is loop invariant.
876-     MPM.add (createLICMPass (LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
877-   }
879+        //  Runtime unrolling will introduce runtime check in loop prologue. If the
880+        //  unrolled loop is an inner loop, then the prologue will be inside the
881+        //  outer loop. LICM pass can help to promote the runtime check out if the
882+        //  checked value is loop invariant.
883+        MPM.add (createLICMPass (LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
884+      }
878885
879-   MPM.add (createWarnMissedTransformationsPass ());
886+     MPM.add (createWarnMissedTransformationsPass ());
887+   }
880888
881889  //  After vectorization and unrolling, assume intrinsics may tell us more
882890  //  about pointer alignments.
0 commit comments