diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index a2360a600bfc2..318a51d11666c 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6634,17 +6634,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
                     options::OPT_fno_gnu_inline_asm, true))
     CmdArgs.push_back("-fno-gnu-inline-asm");
 
-  bool EnableSYCLEarlyOptimizations =
-      Args.hasFlag(options::OPT_fsycl_early_optimizations,
-                   options::OPT_fno_sycl_early_optimizations,
-                   Triple.getSubArch() != llvm::Triple::SPIRSubArch_fpga);
-
   // Enable vectorization per default according to the optimization level
   // selected. For optimization levels that want vectorization we use the alias
   // option to simplify the hasFlag logic.
   bool EnableVec = shouldEnableVectorizerAtOLevel(Args, false);
-  if (RawTriple.isSPIR() && EnableSYCLEarlyOptimizations)
-    EnableVec = false; // But disable vectorization for SYCL device code
   OptSpecifier VectorizeAliasOption =
       EnableVec ? options::OPT_O_Group : options::OPT_fvectorize;
   if (Args.hasFlag(options::OPT_fvectorize, VectorizeAliasOption,
@@ -6653,8 +6646,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
 
   // -fslp-vectorize is enabled based on the optimization level selected.
   bool EnableSLPVec = shouldEnableVectorizerAtOLevel(Args, true);
-  if (RawTriple.isSPIR() && EnableSYCLEarlyOptimizations)
-    EnableSLPVec = false; // But disable vectorization for SYCL device code
   OptSpecifier SLPVectAliasOption =
       EnableSLPVec ? options::OPT_O_Group : options::OPT_fslp_vectorize;
   if (Args.hasFlag(options::OPT_fslp_vectorize, SLPVectAliasOption,
diff --git a/clang/test/Driver/sycl-device-optimizations.cpp b/clang/test/Driver/sycl-device-optimizations.cpp
index 7399454a9b3ad..71e1f345df652 100644
--- a/clang/test/Driver/sycl-device-optimizations.cpp
+++ b/clang/test/Driver/sycl-device-optimizations.cpp
@@ -36,13 +36,3 @@
 // RUN:   | FileCheck -check-prefix=CHECK-DAE %s
 // CHECK-DAE: clang{{.*}} "-fenable-sycl-dae"
 // CHECK-DAE: sycl-post-link{{.*}} "-emit-param-info"
-
-/// Check that vectorizers are disabled by default:
-// RUN: %clang -### -fsycl %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=CHECK-VEC-DEFAULT %s
-// CHECK-VEC-DEFAULT-NOT: clang{{.*}} "-fsycl-is-device"{{.*}} "-vectorize-loops"
-// CHECK-VEC-DEFAULT-NOT: clang{{.*}} "-fsycl-is-device"{{.*}} "-vectorize-slp"
-/// Check that vectorizers can still be enabled manually:
-// RUN: %clang -### -fsycl -fvectorize -fslp-vectorize %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=CHECK-VEC-ENABLE %s
-// CHECK-VEC-ENABLE: clang{{.*}} "-fsycl-is-device"{{.*}}"-vectorize-loops"{{.*}}"-vectorize-slp"
diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
index 477901e330840..cf699523c2e6f 100644
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -429,54 +429,54 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
   MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
   MPM.add(createReassociatePass());       // Reassociate expressions
 
-  // Begin the loop pass pipeline.
-  if (EnableSimpleLoopUnswitch) {
-    // The simple loop unswitch pass relies on separate cleanup passes. Schedule
-    // them first so when we re-process a loop they run before other loop
-    // passes.
-    MPM.add(createLoopInstSimplifyPass());
-    MPM.add(createLoopSimplifyCFGPass());
-  }
-  // Try to remove as much code from the loop header as possible,
-  // to reduce amount of IR that will have to be duplicated.
-  // TODO: Investigate promotion cap for O1.
-  MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
-  // Rotate Loop - disable header duplication at -Oz
-  MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
-  // TODO: Investigate promotion cap for O1.
-  MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
-  if (EnableSimpleLoopUnswitch)
-    MPM.add(createSimpleLoopUnswitchLegacyPass());
-  else
-    MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
-  // FIXME: We break the loop pass pipeline here in order to do full
-  // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace the
-  // need for this.
-  MPM.add(createCFGSimplificationPass());
-  MPM.add(createInstructionCombiningPass());
-  // We resume loop passes creating a second loop pipeline here.
-  if (EnableLoopFlatten) {
-    MPM.add(createLoopFlattenPass()); // Flatten loops
-    MPM.add(createLoopSimplifyCFGPass());
+  // Do not run loop pass pipeline in "SYCL Optimization Mode". Loop
+  // optimizations rely on TTI, which is not accurate for SPIR target.
+  if (!SYCLOptimizationMode) {
+    // Begin the loop pass pipeline.
+    if (EnableSimpleLoopUnswitch) {
+      // The simple loop unswitch pass relies on separate cleanup passes.
+      // Schedule them first so when we re-process a loop they run before other
+      // loop passes.
+      MPM.add(createLoopInstSimplifyPass());
+      MPM.add(createLoopSimplifyCFGPass());
+    }
+    // Try to remove as much code from the loop header as possible,
+    // to reduce amount of IR that will have to be duplicated.
+    // TODO: Investigate promotion cap for O1.
+    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+    // Rotate Loop - disable header duplication at -Oz
+    MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
+    // TODO: Investigate promotion cap for O1.
+    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+    if (EnableSimpleLoopUnswitch)
+      MPM.add(createSimpleLoopUnswitchLegacyPass());
+    else
+      MPM.add(
+          createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
+    // FIXME: We break the loop pass pipeline here in order to do full
+    // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace
+    // the need for this.
+    MPM.add(createCFGSimplificationPass());
+    MPM.add(createInstructionCombiningPass());
+    // We resume loop passes creating a second loop pipeline here.
+    if (EnableLoopFlatten) {
+      MPM.add(createLoopFlattenPass()); // Flatten loops
+      MPM.add(createLoopSimplifyCFGPass());
+    }
+    MPM.add(createLoopIdiomPass());      // Recognize idioms like memset.
+    MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
+    addExtensionsToPM(EP_LateLoopOptimizations, MPM);
+    MPM.add(createLoopDeletionPass()); // Delete dead loops
+
+    if (EnableLoopInterchange)
+      MPM.add(createLoopInterchangePass()); // Interchange loops
+
+    // Unroll small loops and perform peeling.
+    MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
+                                       ForgetAllSCEVInLoopUnroll));
+    addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
+    // This ends the loop pass pipelines.
   }
-  MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
-  // TODO: this pass hurts performance due to promotions of induction variables
-  // from 32-bit value to 64-bit values. I assume it's because SPIR is a virtual
-  // target with unlimited # of registers and pass doesn't take into account
-  // that on real HW this promotion is not beneficial.
-  if (!SYCLOptimizationMode)
-    MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
-  addExtensionsToPM(EP_LateLoopOptimizations, MPM);
-  MPM.add(createLoopDeletionPass()); // Delete dead loops
-
-  if (EnableLoopInterchange)
-    MPM.add(createLoopInterchangePass()); // Interchange loops
-
-  // Unroll small loops and perform peeling.
-  MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
-                                     ForgetAllSCEVInLoopUnroll));
-  addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
-  // This ends the loop pass pipelines.
 
   // Break up allocas that may now be splittable after loop unrolling.
   MPM.add(createSROAPass());
@@ -788,68 +788,74 @@ void PassManagerBuilder::populateModulePassManager(
 
   addExtensionsToPM(EP_VectorizerStart, MPM);
 
-  // Re-rotate loops in all our loop nests. These may have fallout out of
-  // rotated form due to GVN or other transformations, and the vectorizer relies
-  // on the rotated form. Disable header duplication at -Oz.
-  MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
-
-  // Distribute loops to allow partial vectorization. I.e. isolate dependences
-  // into separate loop that would otherwise inhibit vectorization. This is
-  // currently only performed for loops marked with the metadata
-  // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
-  MPM.add(createLoopDistributePass());
-
-  MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
-
-  // Eliminate loads by forwarding stores from the previous iteration to loads
-  // of the current iteration.
-  MPM.add(createLoopLoadEliminationPass());
-
-  // FIXME: Because of #pragma vectorize enable, the passes below are always
-  // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
-  // on -O1 and no #pragma is found). Would be good to have these two passes
-  // as function calls, so that we can only pass them when the vectorizer
-  // changed the code.
-  MPM.add(createInstructionCombiningPass());
-  if (OptLevel > 1 && ExtraVectorizerPasses) {
-    // At higher optimization levels, try to clean up any runtime overlap and
-    // alignment checks inserted by the vectorizer. We want to track correllated
-    // runtime checks for two inner loops in the same outer loop, fold any
-    // common computations, hoist loop-invariant aspects out of any outer loop,
-    // and unswitch the runtime checks if possible. Once hoisted, we may have
-    // dead (or speculatable) control flows or more combining opportunities.
-    MPM.add(createEarlyCSEPass());
-    MPM.add(createCorrelatedValuePropagationPass());
-    MPM.add(createInstructionCombiningPass());
-    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
-    MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
-    MPM.add(createCFGSimplificationPass());
+  if (!SYCLOptimizationMode) {
+    // Re-rotate loops in all our loop nests. These may have fallout out of
+    // rotated form due to GVN or other transformations, and the vectorizer
+    // relies on the rotated form. Disable header duplication at -Oz.
+    MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
+
+    // Distribute loops to allow partial vectorization. I.e. isolate
+    // dependences into separate loop that would otherwise inhibit
+    // vectorization. This is currently only performed for loops marked with
+    // the metadata llvm.loop.distribute=true or when -enable-loop-distribute is
+    // specified.
+    MPM.add(createLoopDistributePass());
+
+    MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
+
+    // Eliminate loads by forwarding stores from the previous iteration to loads
+    // of the current iteration.
+    MPM.add(createLoopLoadEliminationPass());
+
+    // FIXME: Because of #pragma vectorize enable, the passes below are always
+    // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
+    // on -O1 and no #pragma is found). Would be good to have these two passes
+    // as function calls, so that we can only pass them when the vectorizer
+    // changed the code.
     MPM.add(createInstructionCombiningPass());
-  }
-
-  // Cleanup after loop vectorization, etc. Simplification passes like CVP and
-  // GVN, loop transforms, and others have already run, so it's now better to
-  // convert to more optimized IR using more aggressive simplify CFG options.
-  // The extra sinking transform can create larger basic blocks, so do this
-  // before SLP vectorization.
-  // FIXME: study whether hoisting and/or sinking of common instructions should
-  // be delayed until after SLP vectorizer.
-  MPM.add(createCFGSimplificationPass(SimplifyCFGOptions()
-                                          .forwardSwitchCondToPhi(true)
-                                          .convertSwitchToLookupTable(true)
-                                          .needCanonicalLoops(false)
-                                          .hoistCommonInsts(true)
-                                          .sinkCommonInsts(true)));
-
-  if (SLPVectorize) {
-    MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
     if (OptLevel > 1 && ExtraVectorizerPasses) {
+      // At higher optimization levels, try to clean up any runtime overlap and
+      // alignment checks inserted by the vectorizer. We want to track
+      // correllated runtime checks for two inner loops in the same outer loop,
+      // fold any common computations, hoist loop-invariant aspects out of any
+      // outer loop, and unswitch the runtime checks if possible. Once hoisted,
+      // we may have dead (or speculatable) control flows or more combining
+      // opportunities.
       MPM.add(createEarlyCSEPass());
+      MPM.add(createCorrelatedValuePropagationPass());
+      MPM.add(createInstructionCombiningPass());
+      MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+      MPM.add(
+          createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
+      MPM.add(createCFGSimplificationPass());
+      MPM.add(createInstructionCombiningPass());
     }
-  }
 
-  // Enhance/cleanup vector code.
-  MPM.add(createVectorCombinePass());
+    // Cleanup after loop vectorization, etc. Simplification passes like CVP and
+    // GVN, loop transforms, and others have already run, so it's now better to
+    // convert to more optimized IR using more aggressive simplify CFG options.
+    // The extra sinking transform can create larger basic blocks, so do this
+    // before SLP vectorization.
+    // FIXME: study whether hoisting and/or sinking of common instructions
+    // should
+    // be delayed until after SLP vectorizer.
+    MPM.add(createCFGSimplificationPass(SimplifyCFGOptions()
+                                            .forwardSwitchCondToPhi(true)
+                                            .convertSwitchToLookupTable(true)
+                                            .needCanonicalLoops(false)
+                                            .hoistCommonInsts(true)
+                                            .sinkCommonInsts(true)));
+
+    if (SLPVectorize) {
+      MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+      if (OptLevel > 1 && ExtraVectorizerPasses) {
+        MPM.add(createEarlyCSEPass());
+      }
+    }
+
+    // Enhance/cleanup vector code.
+    MPM.add(createVectorCombinePass());
+  }
 
   addExtensionsToPM(EP_Peephole, MPM);
   MPM.add(createInstructionCombiningPass());
@@ -861,22 +867,24 @@ void PassManagerBuilder::populateModulePassManager(
     MPM.add(createLoopUnrollAndJamPass(OptLevel));
   }
 
-  // Unroll small loops
-  MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
-                               ForgetAllSCEVInLoopUnroll));
+  if (!SYCLOptimizationMode) {
+    // Unroll small loops
+    MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
+                                 ForgetAllSCEVInLoopUnroll));
 
-  if (!DisableUnrollLoops) {
-    // LoopUnroll may generate some redundency to cleanup.
-    MPM.add(createInstructionCombiningPass());
+    if (!DisableUnrollLoops) {
+      // LoopUnroll may generate some redundency to cleanup.
+      MPM.add(createInstructionCombiningPass());
 
-    // Runtime unrolling will introduce runtime check in loop prologue. If the
-    // unrolled loop is a inner loop, then the prologue will be inside the
-    // outer loop. LICM pass can help to promote the runtime check out if the
-    // checked value is loop invariant.
-    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
-  }
+      // Runtime unrolling will introduce runtime check in loop prologue. If the
+      // unrolled loop is a inner loop, then the prologue will be inside the
+      // outer loop. LICM pass can help to promote the runtime check out if the
+      // checked value is loop invariant.
+      MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+    }
 
-  MPM.add(createWarnMissedTransformationsPass());
+    MPM.add(createWarnMissedTransformationsPass());
+  }
 
   // After vectorization and unrolling, assume intrinsics may tell us more
   // about pointer alignments.
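
Note on the mechanics (not part of the patch): the SPIR-specific veto of -fvectorize/-fslp-vectorize moves out of the Clang driver and into PassManagerBuilder, which now skips the loop-optimization and vectorization pipelines wholesale when SYCLOptimizationMode is set. The sketch below shows how a pipeline consumer would observe the new behavior; it assumes SYCLOptimizationMode is a public flag on PassManagerBuilder alongside OptLevel/LoopVectorize/SLPVectorize, which is specific to this fork and not upstream LLVM API.

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("sycl_device_module", Ctx);

  PassManagerBuilder Builder;
  Builder.OptLevel = 2;
  // After this patch the driver no longer clears the vectorizer defaults for
  // SPIR device code, so at -O2 both stay enabled here...
  Builder.LoopVectorize = true;
  Builder.SLPVectorize = true;
  // ...and the builder itself suppresses the loop and vectorizer pipelines.
  Builder.SYCLOptimizationMode = true; // assumed public member (fork-specific)

  legacy::PassManager MPM;
  // With SYCLOptimizationMode set, populateModulePassManager() adds no
  // LoopRotate/LoopDistribute/LoopVectorize/SLPVectorizer/unroll passes.
  Builder.populateModulePassManager(MPM);
  MPM.run(M);
  return 0;
}

One consequence of gating in the middle end rather than the driver: -fvectorize and -fslp-vectorize alone no longer re-enable vectorization for SYCL device code, which appears to be why the CHECK-VEC-DEFAULT/CHECK-VEC-ENABLE driver tests are deleted above rather than updated.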