From 587514e3b6dd9324c51c2f1d0efba16d53fe6599 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Wed, 2 Sep 2020 21:51:04 +0300 Subject: [PATCH 01/21] [SYCL] Disable loop unrolling and vectorization Loop unrolling in "SYCL optimization mode" uses the default heuristic, which is tuned for CPU and might not be profitable for other devices. In this mode, skip the loop unroll passes and, in the LTO pipeline, the loop vectorizer. --- .../lib/Transforms/IPO/PassManagerBuilder.cpp | 46 +++++++++++-------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 5c32e251588fd..372acb48e0cfc 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -447,8 +447,9 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createLoopInterchangePass()); // Interchange loops // Unroll small loops - MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); + if (!SYCLOptimizationMode) // TODO: disable the whole loop pass pipeline? + MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); addExtensionsToPM(EP_LoopOptimizerEnd, MPM); // This ends the loop pass pipelines. @@ -819,19 +820,21 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createLoopUnrollAndJamPass(OptLevel)); } - // Unroll small loops - MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); + if (!SYCLOptimizationMode) { + // Unroll small loops + MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); - if (!DisableUnrollLoops) { - // LoopUnroll may generate some redundency to cleanup. - MPM.add(createInstructionCombiningPass()); + if (!DisableUnrollLoops) { + // LoopUnroll may generate some redundency to cleanup. + MPM.add(createInstructionCombiningPass()); - // Runtime unrolling will introduce runtime check in loop prologue. 
If the - // unrolled loop is a inner loop, then the prologue will be inside the - // outer loop. LICM pass can help to promote the runtime check out if the - // checked value is loop invariant. - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + // Runtime unrolling will introduce runtime check in loop prologue. If the + // unrolled loop is a inner loop, then the prologue will be inside the + // outer loop. LICM pass can help to promote the runtime check out if the + // checked value is loop invariant. + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + } } MPM.add(createWarnMissedTransformationsPass()); @@ -1034,13 +1037,16 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { if (EnableLoopInterchange) PM.add(createLoopInterchangePass()); - // Unroll small loops - PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); - PM.add(createLoopVectorizePass(true, !LoopVectorize)); - // The vectorizer may have significantly shortened a loop body; unroll again. - PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); + if (!SYCLOptimizationMode) { + // Unroll small loops + PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); + PM.add(createLoopVectorizePass(true, !LoopVectorize)); + // The vectorizer may have significantly shortened a loop body; unroll + // again. + PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); + } PM.add(createWarnMissedTransformationsPass()); From ab30c864b48f509d2a58b997af287ce090beb052 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Thu, 10 Sep 2020 13:22:00 +0300 Subject: [PATCH 02/21] Disable loop pass pipeline in SYCL optimization mode. This change seems to hide issues with broadcast tests on CPU. 
--- .../lib/Transforms/IPO/PassManagerBuilder.cpp | 79 ++++++++++--------- sycl/test/sub_group/broadcast.cpp | 1 - sycl/test/sub_group/broadcast_fp64.cpp | 1 - 3 files changed, 41 insertions(+), 40 deletions(-) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 372acb48e0cfc..9c664114b82cb 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -411,47 +411,50 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createReassociatePass()); // Reassociate expressions - // Begin the loop pass pipeline. - if (EnableSimpleLoopUnswitch) { - // The simple loop unswitch pass relies on separate cleanup passes. Schedule - // them first so when we re-process a loop they run before other loop - // passes. - MPM.add(createLoopInstSimplifyPass()); - MPM.add(createLoopSimplifyCFGPass()); - } - // Rotate Loop - disable header duplication at -Oz - MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); - // TODO: Investigate promotion cap for O1. - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); - if (EnableSimpleLoopUnswitch) - MPM.add(createSimpleLoopUnswitchLegacyPass()); - else - MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); - // FIXME: We break the loop pass pipeline here in order to do full - // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace the - // need for this. - MPM.add(createCFGSimplificationPass()); - MPM.add(createInstructionCombiningPass()); - // We resume loop passes creating a second loop pipeline here. - // TODO: this pass hurts performance due to promotions of induction variables - // from 32-bit value to 64-bit values. I assume it's because SPIR is a virtual - // target with unlimited # of registers and pass doesn't take into account - // that on real HW this promotion is not beneficial. 
- if (!SYCLOptimizationMode) - MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars - MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. - addExtensionsToPM(EP_LateLoopOptimizations, MPM); - MPM.add(createLoopDeletionPass()); // Delete dead loops - - if (EnableLoopInterchange) - MPM.add(createLoopInterchangePass()); // Interchange loops + // Do not run loop pass pipeline in "SYCL Optimization Mode". Loop + // optimizations rely on TTI, which is not accurate for SPIR target. + if (!SYCLOptimizationMode) { + // Begin the loop pass pipeline. + if (EnableSimpleLoopUnswitch) { + // The simple loop unswitch pass relies on separate cleanup passes. + // Schedule them first so when we re-process a loop they run before other + // loop passes. + MPM.add(createLoopInstSimplifyPass()); + MPM.add(createLoopSimplifyCFGPass()); + } + // Rotate Loop - disable header duplication at -Oz + MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); + // TODO: Investigate promotion cap for O1. + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + if (EnableSimpleLoopUnswitch) + MPM.add(createSimpleLoopUnswitchLegacyPass()); + else + MPM.add( + createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); + // FIXME: We break the loop pass pipeline here in order to do full + // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace + // the need for this. + MPM.add(createCFGSimplificationPass()); + MPM.add(createInstructionCombiningPass()); + // We resume loop passes creating a second loop pipeline here. + // TODO: this pass hurts performance due to promotions of induction + // variables from 32-bit value to 64-bit values. I assume it's because SPIR + // is a virtual target with unlimited # of registers and pass doesn't take + // into account that on real HW this promotion is not beneficial. + MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars + MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. 
+ addExtensionsToPM(EP_LateLoopOptimizations, MPM); + MPM.add(createLoopDeletionPass()); // Delete dead loops + + if (EnableLoopInterchange) + MPM.add(createLoopInterchangePass()); // Interchange loops - // Unroll small loops - if (!SYCLOptimizationMode) // TODO: disable the whole loop pass pipeline? + // Unroll small loops MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, ForgetAllSCEVInLoopUnroll)); - addExtensionsToPM(EP_LoopOptimizerEnd, MPM); - // This ends the loop pass pipelines. + addExtensionsToPM(EP_LoopOptimizerEnd, MPM); + // This ends the loop pass pipelines. + } if (OptLevel > 1) { MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds diff --git a/sycl/test/sub_group/broadcast.cpp b/sycl/test/sub_group/broadcast.cpp index 3dbba78387b2d..49df849c1baad 100644 --- a/sycl/test/sub_group/broadcast.cpp +++ b/sycl/test/sub_group/broadcast.cpp @@ -1,4 +1,3 @@ -// XFAIL: cpu // UNSUPPORTED: cuda // CUDA compilation and runtime do not yet support sub-groups. diff --git a/sycl/test/sub_group/broadcast_fp64.cpp b/sycl/test/sub_group/broadcast_fp64.cpp index 9652fa6b73f46..f9f87e8f95fd9 100644 --- a/sycl/test/sub_group/broadcast_fp64.cpp +++ b/sycl/test/sub_group/broadcast_fp64.cpp @@ -1,4 +1,3 @@ -// XFAIL: cpu // UNSUPPORTED: cuda // CUDA compilation and runtime do not yet support sub-groups. 
From dec884d192d388f96abd5158fe53bb062965661f Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Thu, 10 Sep 2020 17:06:02 +0300 Subject: [PATCH 03/21] Disable more vectorization passes --- .../lib/Transforms/IPO/PassManagerBuilder.cpp | 113 +++++++++--------- 1 file changed, 59 insertions(+), 54 deletions(-) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 9c664114b82cb..48f826d736fca 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -753,65 +753,70 @@ void PassManagerBuilder::populateModulePassManager( addExtensionsToPM(EP_VectorizerStart, MPM); - // Re-rotate loops in all our loop nests. These may have fallout out of - // rotated form due to GVN or other transformations, and the vectorizer relies - // on the rotated form. Disable header duplication at -Oz. - MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); - - // Distribute loops to allow partial vectorization. I.e. isolate dependences - // into separate loop that would otherwise inhibit vectorization. This is - // currently only performed for loops marked with the metadata - // llvm.loop.distribute=true or when -enable-loop-distribute is specified. - MPM.add(createLoopDistributePass()); - - MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize)); - - // Eliminate loads by forwarding stores from the previous iteration to loads - // of the current iteration. - MPM.add(createLoopLoadEliminationPass()); - - // FIXME: Because of #pragma vectorize enable, the passes below are always - // inserted in the pipeline, even when the vectorizer doesn't run (ex. when - // on -O1 and no #pragma is found). Would be good to have these two passes - // as function calls, so that we can only pass them when the vectorizer - // changed the code. 
- MPM.add(createInstructionCombiningPass()); - if (OptLevel > 1 && ExtraVectorizerPasses) { - // At higher optimization levels, try to clean up any runtime overlap and - // alignment checks inserted by the vectorizer. We want to track correllated - // runtime checks for two inner loops in the same outer loop, fold any - // common computations, hoist loop-invariant aspects out of any outer loop, - // and unswitch the runtime checks if possible. Once hoisted, we may have - // dead (or speculatable) control flows or more combining opportunities. - MPM.add(createEarlyCSEPass()); - MPM.add(createCorrelatedValuePropagationPass()); - MPM.add(createInstructionCombiningPass()); - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); - MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); - MPM.add(createCFGSimplificationPass()); - MPM.add(createInstructionCombiningPass()); - } + if (!SYCLOptimizationMode) { + // Re-rotate loops in all our loop nests. These may have fallout out of + // rotated form due to GVN or other transformations, and the vectorizer + // relies on the rotated form. Disable header duplication at -Oz. + MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); + + // Distribute loops to allow partial vectorization. I.e. isolate + // dependences into separate loop that would otherwise inhibit + // vectorization. This is currently only performed for loops marked with + // the metadata llvm.loop.distribute=true or when -enable-loop-distribute is + // specified. + MPM.add(createLoopDistributePass()); + + MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize)); + + // Eliminate loads by forwarding stores from the previous iteration to loads + // of the current iteration. + MPM.add(createLoopLoadEliminationPass()); - // Cleanup after loop vectorization, etc. 
Simplification passes like CVP and - // GVN, loop transforms, and others have already run, so it's now better to - // convert to more optimized IR using more aggressive simplify CFG options. - // The extra sinking transform can create larger basic blocks, so do this - // before SLP vectorization. - MPM.add(createCFGSimplificationPass(SimplifyCFGOptions() - .forwardSwitchCondToPhi(true) - .convertSwitchToLookupTable(true) - .needCanonicalLoops(false) - .sinkCommonInsts(true))); - - if (SLPVectorize) { - MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. + // FIXME: Because of #pragma vectorize enable, the passes below are always + // inserted in the pipeline, even when the vectorizer doesn't run (ex. when + // on -O1 and no #pragma is found). Would be good to have these two passes + // as function calls, so that we can only pass them when the vectorizer + // changed the code. + MPM.add(createInstructionCombiningPass()); if (OptLevel > 1 && ExtraVectorizerPasses) { + // At higher optimization levels, try to clean up any runtime overlap and + // alignment checks inserted by the vectorizer. We want to track + // correllated runtime checks for two inner loops in the same outer loop, + // fold any common computations, hoist loop-invariant aspects out of any + // outer loop, and unswitch the runtime checks if possible. Once hoisted, + // we may have dead (or speculatable) control flows or more combining + // opportunities. MPM.add(createEarlyCSEPass()); + MPM.add(createCorrelatedValuePropagationPass()); + MPM.add(createInstructionCombiningPass()); + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + MPM.add( + createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); + MPM.add(createCFGSimplificationPass()); + MPM.add(createInstructionCombiningPass()); } - } - // Enhance/cleanup vector code. - MPM.add(createVectorCombinePass()); + // Cleanup after loop vectorization, etc. 
Simplification passes like CVP and + // GVN, loop transforms, and others have already run, so it's now better to + // convert to more optimized IR using more aggressive simplify CFG options. + // The extra sinking transform can create larger basic blocks, so do this + // before SLP vectorization. + MPM.add(createCFGSimplificationPass(SimplifyCFGOptions() + .forwardSwitchCondToPhi(true) + .convertSwitchToLookupTable(true) + .needCanonicalLoops(false) + .sinkCommonInsts(true))); + + if (SLPVectorize) { + MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. + if (OptLevel > 1 && ExtraVectorizerPasses) { + MPM.add(createEarlyCSEPass()); + } + } + + // Enhance/cleanup vector code. + MPM.add(createVectorCombinePass()); + } addExtensionsToPM(EP_Peephole, MPM); MPM.add(createInstructionCombiningPass()); From 41f88ab47e2f4a5738a286dd323676f72f5aab74 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Thu, 10 Sep 2020 18:45:20 +0300 Subject: [PATCH 04/21] Update LIT tests status. --- sycl/test/basic_tests/boolean.cpp | 4 ---- sycl/test/basic_tests/stream/stream.cpp | 4 ---- sycl/test/sub_group/broadcast.cpp | 1 + sycl/test/sub_group/broadcast_fp64.cpp | 1 + 4 files changed, 2 insertions(+), 8 deletions(-) diff --git a/sycl/test/basic_tests/boolean.cpp b/sycl/test/basic_tests/boolean.cpp index 031a0afafda01..cac65ddaa80bd 100644 --- a/sycl/test/basic_tests/boolean.cpp +++ b/sycl/test/basic_tests/boolean.cpp @@ -1,7 +1,3 @@ -// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. -// See https://github.com/intel/llvm/issues/2264 for more details. 
-// XFAIL: gpu && (level_zero || opencl) - // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: env SYCL_DEVICE_TYPE=HOST %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out diff --git a/sycl/test/basic_tests/stream/stream.cpp b/sycl/test/basic_tests/stream/stream.cpp index 441320da38f50..6b5e6925298d1 100644 --- a/sycl/test/basic_tests/stream/stream.cpp +++ b/sycl/test/basic_tests/stream/stream.cpp @@ -1,7 +1,3 @@ -// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. -// See https://github.com/intel/llvm/issues/2264 for more details. -// XFAIL: gpu && (level_zero || opencl) && linux - // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: env SYCL_DEVICE_TYPE=HOST %t.out | FileCheck %s // RUN: %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER diff --git a/sycl/test/sub_group/broadcast.cpp b/sycl/test/sub_group/broadcast.cpp index 49df849c1baad..3dbba78387b2d 100644 --- a/sycl/test/sub_group/broadcast.cpp +++ b/sycl/test/sub_group/broadcast.cpp @@ -1,3 +1,4 @@ +// XFAIL: cpu // UNSUPPORTED: cuda // CUDA compilation and runtime do not yet support sub-groups. diff --git a/sycl/test/sub_group/broadcast_fp64.cpp b/sycl/test/sub_group/broadcast_fp64.cpp index f9f87e8f95fd9..9652fa6b73f46 100644 --- a/sycl/test/sub_group/broadcast_fp64.cpp +++ b/sycl/test/sub_group/broadcast_fp64.cpp @@ -1,3 +1,4 @@ +// XFAIL: cpu // UNSUPPORTED: cuda // CUDA compilation and runtime do not yet support sub-groups. From 700bac400f25a0a09c5cfef1ffe5d3798366021b Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Fri, 25 Sep 2020 18:30:22 +0300 Subject: [PATCH 05/21] Revert LTO pipeline changes. 
--- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 48f826d736fca..513aa7977d9b9 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -1045,16 +1045,13 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { if (EnableLoopInterchange) PM.add(createLoopInterchangePass()); - if (!SYCLOptimizationMode) { - // Unroll small loops - PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); - PM.add(createLoopVectorizePass(true, !LoopVectorize)); - // The vectorizer may have significantly shortened a loop body; unroll - // again. - PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); - } + // Unroll small loops + PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); + PM.add(createLoopVectorizePass(true, !LoopVectorize)); + // The vectorizer may have significantly shortened a loop body; unroll again. + PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); PM.add(createWarnMissedTransformationsPass()); From f4dbb09a5e0b49077e644155959cf85c99a91dba Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Tue, 29 Sep 2020 11:28:30 +0300 Subject: [PATCH 06/21] Revert "Revert LTO pipeline changes." This reverts commit 700bac400f25a0a09c5cfef1ffe5d3798366021b. 
--- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 8d910f31eaecb..239859273ccf6 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -1045,13 +1045,16 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { if (EnableLoopInterchange) PM.add(createLoopInterchangePass()); - // Unroll small loops - PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); - PM.add(createLoopVectorizePass(true, !LoopVectorize)); - // The vectorizer may have significantly shortened a loop body; unroll again. - PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); + if (!SYCLOptimizationMode) { + // Unroll small loops + PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); + PM.add(createLoopVectorizePass(true, !LoopVectorize)); + // The vectorizer may have significantly shortened a loop body; unroll + // again. + PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); + } PM.add(createWarnMissedTransformationsPass()); From 5f54a0abb6d275542eea6bc2d872dc9456820577 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Tue, 29 Sep 2020 20:42:21 +0300 Subject: [PATCH 07/21] Revert "Revert "Revert LTO pipeline changes."" This reverts commit f4dbb09a5e0b49077e644155959cf85c99a91dba. 
--- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 41b7c20ce2128..fd97bdd73b8f5 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -1057,16 +1057,13 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { if (EnableLoopInterchange) PM.add(createLoopInterchangePass()); - if (!SYCLOptimizationMode) { - // Unroll small loops - PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); - PM.add(createLoopVectorizePass(true, !LoopVectorize)); - // The vectorizer may have significantly shortened a loop body; unroll - // again. - PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); - } + // Unroll small loops + PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); + PM.add(createLoopVectorizePass(true, !LoopVectorize)); + // The vectorizer may have significantly shortened a loop body; unroll again. 
+ PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); PM.add(createWarnMissedTransformationsPass()); From 8a931aa3cf0eb5f42dd0d101d76cf8e2359a1af9 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Wed, 30 Sep 2020 13:33:25 +0300 Subject: [PATCH 08/21] Disable broken tests --- sycl/test/basic_tests/scalar_vec_access.cpp | 4 ++++ sycl/test/basic_tests/stream/stream.cpp | 4 ++++ sycl/test/reduction/reduction_nd_conditional.cpp | 4 ++++ sycl/test/reduction/reduction_nd_ext_half.cpp | 4 ++++ sycl/test/reduction/reduction_nd_s0_dw.cpp | 4 ++++ sycl/test/reduction/reduction_nd_s0_rw.cpp | 4 ++++ sycl/test/reduction/reduction_placeholder.cpp | 4 ++++ 7 files changed, 28 insertions(+) diff --git a/sycl/test/basic_tests/scalar_vec_access.cpp b/sycl/test/basic_tests/scalar_vec_access.cpp index e0e793d93d174..738e00dc532c8 100644 --- a/sycl/test/basic_tests/scalar_vec_access.cpp +++ b/sycl/test/basic_tests/scalar_vec_access.cpp @@ -1,3 +1,7 @@ +// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. +// See https://github.com/intel/llvm/issues/2264 for more details. +// XFAIL: gpu && (level_zero || opencl) + // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: env SYCL_DEVICE_TYPE=HOST %t.out | FileCheck %s // RUN: %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER diff --git a/sycl/test/basic_tests/stream/stream.cpp b/sycl/test/basic_tests/stream/stream.cpp index 6b5e6925298d1..441320da38f50 100644 --- a/sycl/test/basic_tests/stream/stream.cpp +++ b/sycl/test/basic_tests/stream/stream.cpp @@ -1,3 +1,7 @@ +// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. +// See https://github.com/intel/llvm/issues/2264 for more details. 
+// XFAIL: gpu && (level_zero || opencl) && linux + // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: env SYCL_DEVICE_TYPE=HOST %t.out | FileCheck %s // RUN: %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER diff --git a/sycl/test/reduction/reduction_nd_conditional.cpp b/sycl/test/reduction/reduction_nd_conditional.cpp index 9db6fac910809..dd7da58bc4d34 100644 --- a/sycl/test/reduction/reduction_nd_conditional.cpp +++ b/sycl/test/reduction/reduction_nd_conditional.cpp @@ -1,3 +1,7 @@ +// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. +// See https://github.com/intel/llvm/issues/2264 for more details. +// XFAIL: gpu && (level_zero || opencl) + // UNSUPPORTED: cuda // Reductions use work-group builtins not yet supported by CUDA. // diff --git a/sycl/test/reduction/reduction_nd_ext_half.cpp b/sycl/test/reduction/reduction_nd_ext_half.cpp index 4a939e9ebad4f..587fbdaaa85f4 100644 --- a/sycl/test/reduction/reduction_nd_ext_half.cpp +++ b/sycl/test/reduction/reduction_nd_ext_half.cpp @@ -1,3 +1,7 @@ +// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. +// See https://github.com/intel/llvm/issues/2264 for more details. +// XFAIL: gpu && (level_zero || opencl) + // UNSUPPORTED: cuda // OpenCL C 2.x alike work-group functions not yet supported by CUDA. // diff --git a/sycl/test/reduction/reduction_nd_s0_dw.cpp b/sycl/test/reduction/reduction_nd_s0_dw.cpp index 0a4a9032a3b30..c2d5b9b7f4394 100644 --- a/sycl/test/reduction/reduction_nd_s0_dw.cpp +++ b/sycl/test/reduction/reduction_nd_s0_dw.cpp @@ -1,3 +1,7 @@ +// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. +// See https://github.com/intel/llvm/issues/2264 for more details. +// XFAIL: gpu && (level_zero || opencl) + // UNSUPPORTED: cuda // Reductions use work-group builtins not yet supported by CUDA. 
// diff --git a/sycl/test/reduction/reduction_nd_s0_rw.cpp b/sycl/test/reduction/reduction_nd_s0_rw.cpp index d346016ae4bfd..14e3d6b204c17 100644 --- a/sycl/test/reduction/reduction_nd_s0_rw.cpp +++ b/sycl/test/reduction/reduction_nd_s0_rw.cpp @@ -1,3 +1,7 @@ +// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. +// See https://github.com/intel/llvm/issues/2264 for more details. +// XFAIL: gpu && (level_zero || opencl) + // UNSUPPORTED: cuda // Reductions use work-group builtins not yet supported by CUDA. // diff --git a/sycl/test/reduction/reduction_placeholder.cpp b/sycl/test/reduction/reduction_placeholder.cpp index b0ce9ddfa88b8..8eddfdbb5e612 100644 --- a/sycl/test/reduction/reduction_placeholder.cpp +++ b/sycl/test/reduction/reduction_placeholder.cpp @@ -1,3 +1,7 @@ +// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. +// See https://github.com/intel/llvm/issues/2264 for more details. +// XFAIL: gpu && (level_zero || opencl) + // UNSUPPORTED: cuda // Reductions use work-group builtins not yet supported by CUDA. From 46b41d95d50500737ff6911c50ab58a6ffacbd83 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Wed, 30 Sep 2020 13:34:05 +0300 Subject: [PATCH 09/21] [NOT-FOR-MERGE]Revert "[BuildBot] Uplift GPU RT version to 20.37.17906 (#2504)" This reverts commit 0c8d46e642eef8f7688f16834d62abca920c1345. Just to check if new regressions are caused by the driver update. 
--- buildbot/dependency.conf | 6 +++--- sycl/test/esimd/on-device/matrix_transpose_glb.cpp | 2 -- sycl/test/hier_par/hier_par_wgscope.cpp | 3 +++ sycl/test/sub_group/generic_reduce.cpp | 3 +++ sycl/test/sub_group/load_store.cpp | 3 +++ 5 files changed, 12 insertions(+), 5 deletions(-) diff --git a/buildbot/dependency.conf b/buildbot/dependency.conf index 671fcf1328429..360edd747e86f 100644 --- a/buildbot/dependency.conf +++ b/buildbot/dependency.conf @@ -4,8 +4,8 @@ ocl_cpu_rt_ver=2020.11.8.0.27 # https://github.com/intel/llvm/releases/download/2020-WW36/win-oclcpuexp-2020.11.8.0.27_rel.zip ocl_cpu_rt_ver_win=2020.11.8.0.27 # Same GPU driver supports Level Zero and OpenCL: -# https://github.com/intel/compute-runtime/releases/tag/20.37.17906 -ocl_gpu_rt_ver=20.37.17906 +# https://github.com/intel/compute-runtime/releases/tag/20.35.17767 +ocl_gpu_rt_ver=20.35.17767 # Same GPU driver supports Level Zero and OpenCL: # https://downloadmirror.intel.com/29879/a08/igfx_win10_100.8778.zip ocl_gpu_rt_ver_win=27.20.100.8778 @@ -24,7 +24,7 @@ fpga_ver_win=20200811_000006 [DRIVER VERSIONS] cpu_driver_lin=2020.11.8.0.27 cpu_driver_win=2020.11.8.0.27 -gpu_driver_lin=20.37.17906 +gpu_driver_lin=20.35.17767 gpu_driver_win=27.20.100.8778 fpga_driver_lin=2020.11.8.0.27 fpga_driver_win=2020.11.8.0.27 diff --git a/sycl/test/esimd/on-device/matrix_transpose_glb.cpp b/sycl/test/esimd/on-device/matrix_transpose_glb.cpp index 5310a035924af..24f2bb1942a8b 100644 --- a/sycl/test/esimd/on-device/matrix_transpose_glb.cpp +++ b/sycl/test/esimd/on-device/matrix_transpose_glb.cpp @@ -10,8 +10,6 @@ // REQUIRES: gpu // RUN: %clangxx-esimd -fsycl %s -o %t.out // RUN: %ESIMD_RUN_PLACEHOLDER %t.out -// XFAIL: linux -// UNSUPPORTED: cuda #include "esimd_test_utils.hpp" diff --git a/sycl/test/hier_par/hier_par_wgscope.cpp b/sycl/test/hier_par/hier_par_wgscope.cpp index 5dfbb61a187bd..2167f3bfba8b4 100644 --- a/sycl/test/hier_par/hier_par_wgscope.cpp +++ b/sycl/test/hier_par/hier_par_wgscope.cpp @@ -1,3 
+1,6 @@ +// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. +// See https://github.com/intel/llvm/issues/2264 for more details. +// XFAIL: gpu && (level_zero || opencl) && linux // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: env SYCL_DEVICE_TYPE=HOST %t.out diff --git a/sycl/test/sub_group/generic_reduce.cpp b/sycl/test/sub_group/generic_reduce.cpp index a66de34b3cac8..3ef504343af48 100644 --- a/sycl/test/sub_group/generic_reduce.cpp +++ b/sycl/test/sub_group/generic_reduce.cpp @@ -1,3 +1,6 @@ +// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. +// See https://github.com/intel/llvm/issues/2264 for more details. +// XFAIL: gpu && linux // UNSUPPORTED: cuda // CUDA compilation and runtime do not yet support sub-groups. diff --git a/sycl/test/sub_group/load_store.cpp b/sycl/test/sub_group/load_store.cpp index d92f0735dc1e7..8a598b82525d9 100644 --- a/sycl/test/sub_group/load_store.cpp +++ b/sycl/test/sub_group/load_store.cpp @@ -1,3 +1,6 @@ +// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. +// See https://github.com/intel/llvm/issues/2264 for more details. +// XFAIL: gpu && linux // UNSUPPORTED: cuda || cpu // CUDA compilation and runtime do not yet support sub-groups. From 2179514f23eb7827244dbef120586123cee29bee Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Thu, 1 Oct 2020 10:12:43 +0300 Subject: [PATCH 10/21] Revert "[NOT-FOR-MERGE]Revert "[BuildBot] Uplift GPU RT version to 20.37.17906 (#2504)"" This reverts commit 46b41d95d50500737ff6911c50ab58a6ffacbd83. 
--- buildbot/dependency.conf | 6 +++--- sycl/test/esimd/on-device/matrix_transpose_glb.cpp | 2 ++ sycl/test/hier_par/hier_par_wgscope.cpp | 3 --- sycl/test/sub_group/generic_reduce.cpp | 3 --- sycl/test/sub_group/load_store.cpp | 3 --- 5 files changed, 5 insertions(+), 12 deletions(-) diff --git a/buildbot/dependency.conf b/buildbot/dependency.conf index 360edd747e86f..671fcf1328429 100644 --- a/buildbot/dependency.conf +++ b/buildbot/dependency.conf @@ -4,8 +4,8 @@ ocl_cpu_rt_ver=2020.11.8.0.27 # https://github.com/intel/llvm/releases/download/2020-WW36/win-oclcpuexp-2020.11.8.0.27_rel.zip ocl_cpu_rt_ver_win=2020.11.8.0.27 # Same GPU driver supports Level Zero and OpenCL: -# https://github.com/intel/compute-runtime/releases/tag/20.35.17767 -ocl_gpu_rt_ver=20.35.17767 +# https://github.com/intel/compute-runtime/releases/tag/20.37.17906 +ocl_gpu_rt_ver=20.37.17906 # Same GPU driver supports Level Zero and OpenCL: # https://downloadmirror.intel.com/29879/a08/igfx_win10_100.8778.zip ocl_gpu_rt_ver_win=27.20.100.8778 @@ -24,7 +24,7 @@ fpga_ver_win=20200811_000006 [DRIVER VERSIONS] cpu_driver_lin=2020.11.8.0.27 cpu_driver_win=2020.11.8.0.27 -gpu_driver_lin=20.35.17767 +gpu_driver_lin=20.37.17906 gpu_driver_win=27.20.100.8778 fpga_driver_lin=2020.11.8.0.27 fpga_driver_win=2020.11.8.0.27 diff --git a/sycl/test/esimd/on-device/matrix_transpose_glb.cpp b/sycl/test/esimd/on-device/matrix_transpose_glb.cpp index 24f2bb1942a8b..5310a035924af 100644 --- a/sycl/test/esimd/on-device/matrix_transpose_glb.cpp +++ b/sycl/test/esimd/on-device/matrix_transpose_glb.cpp @@ -10,6 +10,8 @@ // REQUIRES: gpu // RUN: %clangxx-esimd -fsycl %s -o %t.out // RUN: %ESIMD_RUN_PLACEHOLDER %t.out +// XFAIL: linux +// UNSUPPORTED: cuda #include "esimd_test_utils.hpp" diff --git a/sycl/test/hier_par/hier_par_wgscope.cpp b/sycl/test/hier_par/hier_par_wgscope.cpp index 2167f3bfba8b4..5dfbb61a187bd 100644 --- a/sycl/test/hier_par/hier_par_wgscope.cpp +++ b/sycl/test/hier_par/hier_par_wgscope.cpp @@ -1,6 
+1,3 @@ -// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. -// See https://github.com/intel/llvm/issues/2264 for more details. -// XFAIL: gpu && (level_zero || opencl) && linux // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: env SYCL_DEVICE_TYPE=HOST %t.out diff --git a/sycl/test/sub_group/generic_reduce.cpp b/sycl/test/sub_group/generic_reduce.cpp index 3ef504343af48..a66de34b3cac8 100644 --- a/sycl/test/sub_group/generic_reduce.cpp +++ b/sycl/test/sub_group/generic_reduce.cpp @@ -1,6 +1,3 @@ -// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. -// See https://github.com/intel/llvm/issues/2264 for more details. -// XFAIL: gpu && linux // UNSUPPORTED: cuda // CUDA compilation and runtime do not yet support sub-groups. diff --git a/sycl/test/sub_group/load_store.cpp b/sycl/test/sub_group/load_store.cpp index 8a598b82525d9..d92f0735dc1e7 100644 --- a/sycl/test/sub_group/load_store.cpp +++ b/sycl/test/sub_group/load_store.cpp @@ -1,6 +1,3 @@ -// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. -// See https://github.com/intel/llvm/issues/2264 for more details. -// XFAIL: gpu && linux // UNSUPPORTED: cuda || cpu // CUDA compilation and runtime do not yet support sub-groups. From 79662d3ce244fb75c5284bc97592046b71f492e2 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Tue, 6 Oct 2020 19:18:02 +0300 Subject: [PATCH 11/21] Revert "Disable broken tests" This reverts commit 8a931aa3cf0eb5f42dd0d101d76cf8e2359a1af9. 
--- sycl/test/basic_tests/scalar_vec_access.cpp | 4 ---- sycl/test/basic_tests/stream/stream.cpp | 4 ---- sycl/test/reduction/reduction_nd_conditional.cpp | 4 ---- sycl/test/reduction/reduction_nd_ext_half.cpp | 4 ---- sycl/test/reduction/reduction_nd_s0_dw.cpp | 4 ---- sycl/test/reduction/reduction_nd_s0_rw.cpp | 4 ---- sycl/test/reduction/reduction_placeholder.cpp | 4 ---- 7 files changed, 28 deletions(-) diff --git a/sycl/test/basic_tests/scalar_vec_access.cpp b/sycl/test/basic_tests/scalar_vec_access.cpp index 738e00dc532c8..e0e793d93d174 100644 --- a/sycl/test/basic_tests/scalar_vec_access.cpp +++ b/sycl/test/basic_tests/scalar_vec_access.cpp @@ -1,7 +1,3 @@ -// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. -// See https://github.com/intel/llvm/issues/2264 for more details. -// XFAIL: gpu && (level_zero || opencl) - // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: env SYCL_DEVICE_TYPE=HOST %t.out | FileCheck %s // RUN: %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER diff --git a/sycl/test/basic_tests/stream/stream.cpp b/sycl/test/basic_tests/stream/stream.cpp index 441320da38f50..6b5e6925298d1 100644 --- a/sycl/test/basic_tests/stream/stream.cpp +++ b/sycl/test/basic_tests/stream/stream.cpp @@ -1,7 +1,3 @@ -// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. -// See https://github.com/intel/llvm/issues/2264 for more details. 
-// XFAIL: gpu && (level_zero || opencl) && linux - // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: env SYCL_DEVICE_TYPE=HOST %t.out | FileCheck %s // RUN: %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER diff --git a/sycl/test/reduction/reduction_nd_conditional.cpp b/sycl/test/reduction/reduction_nd_conditional.cpp index dd7da58bc4d34..9db6fac910809 100644 --- a/sycl/test/reduction/reduction_nd_conditional.cpp +++ b/sycl/test/reduction/reduction_nd_conditional.cpp @@ -1,7 +1,3 @@ -// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. -// See https://github.com/intel/llvm/issues/2264 for more details. -// XFAIL: gpu && (level_zero || opencl) - // UNSUPPORTED: cuda // Reductions use work-group builtins not yet supported by CUDA. // diff --git a/sycl/test/reduction/reduction_nd_ext_half.cpp b/sycl/test/reduction/reduction_nd_ext_half.cpp index 587fbdaaa85f4..4a939e9ebad4f 100644 --- a/sycl/test/reduction/reduction_nd_ext_half.cpp +++ b/sycl/test/reduction/reduction_nd_ext_half.cpp @@ -1,7 +1,3 @@ -// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. -// See https://github.com/intel/llvm/issues/2264 for more details. -// XFAIL: gpu && (level_zero || opencl) - // UNSUPPORTED: cuda // OpenCL C 2.x alike work-group functions not yet supported by CUDA. // diff --git a/sycl/test/reduction/reduction_nd_s0_dw.cpp b/sycl/test/reduction/reduction_nd_s0_dw.cpp index c2d5b9b7f4394..0a4a9032a3b30 100644 --- a/sycl/test/reduction/reduction_nd_s0_dw.cpp +++ b/sycl/test/reduction/reduction_nd_s0_dw.cpp @@ -1,7 +1,3 @@ -// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. -// See https://github.com/intel/llvm/issues/2264 for more details. -// XFAIL: gpu && (level_zero || opencl) - // UNSUPPORTED: cuda // Reductions use work-group builtins not yet supported by CUDA. 
// diff --git a/sycl/test/reduction/reduction_nd_s0_rw.cpp b/sycl/test/reduction/reduction_nd_s0_rw.cpp index 14e3d6b204c17..d346016ae4bfd 100644 --- a/sycl/test/reduction/reduction_nd_s0_rw.cpp +++ b/sycl/test/reduction/reduction_nd_s0_rw.cpp @@ -1,7 +1,3 @@ -// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. -// See https://github.com/intel/llvm/issues/2264 for more details. -// XFAIL: gpu && (level_zero || opencl) - // UNSUPPORTED: cuda // Reductions use work-group builtins not yet supported by CUDA. // diff --git a/sycl/test/reduction/reduction_placeholder.cpp b/sycl/test/reduction/reduction_placeholder.cpp index 8eddfdbb5e612..b0ce9ddfa88b8 100644 --- a/sycl/test/reduction/reduction_placeholder.cpp +++ b/sycl/test/reduction/reduction_placeholder.cpp @@ -1,7 +1,3 @@ -// TODO: Enable compilation w/o -fno-sycl-early-optimizations option. -// See https://github.com/intel/llvm/issues/2264 for more details. -// XFAIL: gpu && (level_zero || opencl) - // UNSUPPORTED: cuda // Reductions use work-group builtins not yet supported by CUDA. From 9e9d053bd7e34763eb15c1f480fa8f9293b6643e Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Mon, 12 Oct 2020 13:54:10 +0300 Subject: [PATCH 12/21] Revert "[SYCL] Disable vectorizers in early optimizations (#2402)" This reverts commit 20921b10e722e87b4d83665b8bf6e525c932a0ea. Previous changes make this commit unnecessary. 
--- clang/lib/Driver/ToolChains/Clang.cpp | 9 --------- clang/test/Driver/sycl-device-optimizations.cpp | 10 ---------- 2 files changed, 19 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index fc0510ad1b773..323e71051970d 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6030,17 +6030,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_gnu_inline_asm, true)) CmdArgs.push_back("-fno-gnu-inline-asm"); - bool EnableSYCLEarlyOptimizations = - Args.hasFlag(options::OPT_fsycl_early_optimizations, - options::OPT_fno_sycl_early_optimizations, - Triple.getSubArch() != llvm::Triple::SPIRSubArch_fpga); - // Enable vectorization per default according to the optimization level // selected. For optimization levels that want vectorization we use the alias // option to simplify the hasFlag logic. bool EnableVec = shouldEnableVectorizerAtOLevel(Args, false); - if (UseSYCLTriple && EnableSYCLEarlyOptimizations) - EnableVec = false; // But disable vectorization for SYCL device code OptSpecifier VectorizeAliasOption = EnableVec ? options::OPT_O_Group : options::OPT_fvectorize; if (Args.hasFlag(options::OPT_fvectorize, VectorizeAliasOption, @@ -6049,8 +6042,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, // -fslp-vectorize is enabled based on the optimization level selected. bool EnableSLPVec = shouldEnableVectorizerAtOLevel(Args, true); - if (UseSYCLTriple && EnableSYCLEarlyOptimizations) - EnableSLPVec = false; // But disable vectorization for SYCL device code OptSpecifier SLPVectAliasOption = EnableSLPVec ? 
options::OPT_O_Group : options::OPT_fslp_vectorize; if (Args.hasFlag(options::OPT_fslp_vectorize, SLPVectAliasOption, diff --git a/clang/test/Driver/sycl-device-optimizations.cpp b/clang/test/Driver/sycl-device-optimizations.cpp index 7399454a9b3ad..71e1f345df652 100644 --- a/clang/test/Driver/sycl-device-optimizations.cpp +++ b/clang/test/Driver/sycl-device-optimizations.cpp @@ -36,13 +36,3 @@ // RUN: | FileCheck -check-prefix=CHECK-DAE %s // CHECK-DAE: clang{{.*}} "-fenable-sycl-dae" // CHECK-DAE: sycl-post-link{{.*}} "-emit-param-info" - -/// Check that vectorizers are disabled by default: -// RUN: %clang -### -fsycl %s 2>&1 \ -// RUN: | FileCheck -check-prefix=CHECK-VEC-DEFAULT %s -// CHECK-VEC-DEFAULT-NOT: clang{{.*}} "-fsycl-is-device"{{.*}} "-vectorize-loops" -// CHECK-VEC-DEFAULT-NOT: clang{{.*}} "-fsycl-is-device"{{.*}} "-vectorize-slp" -/// Check that vectorizers can still be enabled manually: -// RUN: %clang -### -fsycl -fvectorize -fslp-vectorize %s 2>&1 \ -// RUN: | FileCheck -check-prefix=CHECK-VEC-ENABLE %s -// CHECK-VEC-ENABLE: clang{{.*}} "-fsycl-is-device"{{.*}}"-vectorize-loops"{{.*}}"-vectorize-slp" From dedc5e7c81cc630049c0a435ad96b475888a29e3 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Fri, 16 Oct 2020 21:49:38 +0300 Subject: [PATCH 13/21] Re-enable LICM. --- .../lib/Transforms/IPO/PassManagerBuilder.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 9b6af7295fb8a..da95c7f4e2a26 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -855,17 +855,16 @@ void PassManagerBuilder::populateModulePassManager( // Unroll small loops MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, ForgetAllSCEVInLoopUnroll)); + } + if (!DisableUnrollLoops) { + // LoopUnroll may generate some redundency to cleanup. 
+ MPM.add(createInstructionCombiningPass()); - if (!DisableUnrollLoops) { - // LoopUnroll may generate some redundency to cleanup. - MPM.add(createInstructionCombiningPass()); - - // Runtime unrolling will introduce runtime check in loop prologue. If the - // unrolled loop is a inner loop, then the prologue will be inside the - // outer loop. LICM pass can help to promote the runtime check out if the - // checked value is loop invariant. - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); - } + // Runtime unrolling will introduce runtime check in loop prologue. If the + // unrolled loop is a inner loop, then the prologue will be inside the + // outer loop. LICM pass can help to promote the runtime check out if the + // checked value is loop invariant. + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); } MPM.add(createWarnMissedTransformationsPass()); From 636934fe8f306ee9d13b4f63d43a65147d0549d1 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Wed, 11 Nov 2020 13:37:44 +0300 Subject: [PATCH 14/21] [SYCL] Skip optimization pipeline customization in ESIMD mode --- clang/lib/Driver/ToolChains/Clang.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index b9bd952b93345..ec2f80b003e6d 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4246,7 +4246,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_sycl_early_optimizations, Triple.getSubArch() != llvm::Triple::SPIRSubArch_fpga)) CmdArgs.push_back("-fno-sycl-early-optimizations"); - else if (IsSYCLDevice) { + else if (IsSYCLDevice && + !Args.hasFlag(options::OPT_fsycl_esimd, + options::OPT_fno_sycl_esimd, false)) { // Set `sycl-opt` option to configure LLVM passes for SPIR target CmdArgs.push_back("-mllvm"); CmdArgs.push_back("-sycl-opt"); From c9445a9b81703a372b0cf06c1d56ed08a75be42a Mon Sep 17 
00:00:00 2001 From: Alexey Bader Date: Sun, 15 Nov 2020 19:57:55 +0300 Subject: [PATCH 15/21] Revert "Re-enable LICM." This reverts commit dedc5e7c81cc630049c0a435ad96b475888a29e3. --- .../lib/Transforms/IPO/PassManagerBuilder.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 194d16225bc12..5bc797796de59 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -858,16 +858,17 @@ void PassManagerBuilder::populateModulePassManager( // Unroll small loops MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, ForgetAllSCEVInLoopUnroll)); - } - if (!DisableUnrollLoops) { - // LoopUnroll may generate some redundency to cleanup. - MPM.add(createInstructionCombiningPass()); - // Runtime unrolling will introduce runtime check in loop prologue. If the - // unrolled loop is a inner loop, then the prologue will be inside the - // outer loop. LICM pass can help to promote the runtime check out if the - // checked value is loop invariant. - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + if (!DisableUnrollLoops) { + // LoopUnroll may generate some redundency to cleanup. + MPM.add(createInstructionCombiningPass()); + + // Runtime unrolling will introduce runtime check in loop prologue. If the + // unrolled loop is a inner loop, then the prologue will be inside the + // outer loop. LICM pass can help to promote the runtime check out if the + // checked value is loop invariant. + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + } } MPM.add(createWarnMissedTransformationsPass()); From 98b9cbab71bffd431b161558098f8db15620a140 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Tue, 20 Apr 2021 18:00:50 +0300 Subject: [PATCH 16/21] Remove fsycl-esimd option usage. 
--- clang/lib/Driver/ToolChains/Clang.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 320fe80536329..21cf7a85fca3c 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4350,9 +4350,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_sycl_early_optimizations, Triple.getSubArch() != llvm::Triple::SPIRSubArch_fpga)) CmdArgs.push_back("-fno-sycl-early-optimizations"); - else if (RawTriple.isSPIR() && - !Args.hasFlag(options::OPT_fsycl_esimd, - options::OPT_fno_sycl_esimd, false)) { + else if (RawTriple.isSPIR()) { // Set `sycl-opt` option to configure LLVM passes for SPIR target CmdArgs.push_back("-mllvm"); CmdArgs.push_back("-sycl-opt"); From 6b579838b6d347627983a085ee9197d7d6b23ee2 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Mon, 26 Apr 2021 10:04:57 +0300 Subject: [PATCH 17/21] Restore passes removed by a bad merge conflict resolution. --- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 4c1639c0d01e9..ca74354c10ad9 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -463,6 +463,19 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createLoopFlattenPass()); // Flatten loops MPM.add(createLoopSimplifyCFGPass()); } + MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. + MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars + addExtensionsToPM(EP_LateLoopOptimizations, MPM); + MPM.add(createLoopDeletionPass()); // Delete dead loops + + if (EnableLoopInterchange) + MPM.add(createLoopInterchangePass()); // Interchange loops + + // Unroll small loops and perform peeling.
+ MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); + addExtensionsToPM(EP_LoopOptimizerEnd, MPM); + // This ends the loop pass pipelines. } // Break up allocas that may now be splittable after loop unrolling. From ec18cc2803ce530169da408f32c0bab6a0b9fe6e Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Mon, 26 Apr 2021 10:10:08 +0300 Subject: [PATCH 18/21] Apply clang-format to the previous patch. --- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index ca74354c10ad9..114f9ba42f3ec 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -463,17 +463,17 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createLoopFlattenPass()); // Flatten loops MPM.add(createLoopSimplifyCFGPass()); } - MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. - MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars + MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. + MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars addExtensionsToPM(EP_LateLoopOptimizations, MPM); - MPM.add(createLoopDeletionPass()); // Delete dead loops + MPM.add(createLoopDeletionPass()); // Delete dead loops if (EnableLoopInterchange) MPM.add(createLoopInterchangePass()); // Interchange loops // Unroll small loops and perform peeling. MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); + ForgetAllSCEVInLoopUnroll)); addExtensionsToPM(EP_LoopOptimizerEnd, MPM); // This ends the loop pass pipelines. } From 968271a3d68b9fc1227f4ab17ad59a06323cd950 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Thu, 29 Apr 2021 09:22:12 +0300 Subject: [PATCH 19/21] Disable diagnostics for disabled loop optimizations. 
--- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 114f9ba42f3ec..57a46d1109c7b 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -882,9 +882,9 @@ void PassManagerBuilder::populateModulePassManager( // checked value is loop invariant. MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); } - } - MPM.add(createWarnMissedTransformationsPass()); + MPM.add(createWarnMissedTransformationsPass()); + } // After vectorization and unrolling, assume intrinsics may tell us more // about pointer alignments. From 4c37555cdb18600207d82a3a225251077dc22025 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Thu, 29 Apr 2021 11:48:48 +0300 Subject: [PATCH 20/21] Disable reassociation pass to check performance impact. --- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 57a46d1109c7b..645b003da10b2 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -427,11 +427,12 @@ void PassManagerBuilder::addFunctionSimplificationPasses( if (OptLevel > 1) MPM.add(createTailCallEliminationPass()); // Eliminate tail calls MPM.add(createCFGSimplificationPass()); // Merge & remove BBs - MPM.add(createReassociatePass()); // Reassociate expressions // Do not run loop pass pipeline in "SYCL Optimization Mode". Loop // optimizations rely on TTI, which is not accurate for SPIR target. if (!SYCLOptimizationMode) { + MPM.add(createReassociatePass()); // Reassociate expressions + // Begin the loop pass pipeline. if (EnableSimpleLoopUnswitch) { // The simple loop unswitch pass relies on separate cleanup passes. 
From b9a2c3de73e1b9d665c3e29346d4eccbd95ed875 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Wed, 12 May 2021 12:02:49 +0300 Subject: [PATCH 21/21] Revert "Disable reassociation pass to check performance impact." This reverts commit 4c37555cdb18600207d82a3a225251077dc22025. --- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index e545aa7c1bbc8..cf699523c2e6f 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -427,12 +427,11 @@ void PassManagerBuilder::addFunctionSimplificationPasses( if (OptLevel > 1) MPM.add(createTailCallEliminationPass()); // Eliminate tail calls MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + MPM.add(createReassociatePass()); // Reassociate expressions // Do not run loop pass pipeline in "SYCL Optimization Mode". Loop // optimizations rely on TTI, which is not accurate for SPIR target. if (!SYCLOptimizationMode) { - MPM.add(createReassociatePass()); // Reassociate expressions - // Begin the loop pass pipeline. if (EnableSimpleLoopUnswitch) { // The simple loop unswitch pass relies on separate cleanup passes.