From 6da307686056aa8f51ec831194c0e2e163842380 Mon Sep 17 00:00:00 2001 From: a00917109 Date: Thu, 18 Sep 2025 18:46:06 -0400 Subject: [PATCH] [LoopFusion] Detecting legal dependencies for fusion using DA info Loop fusion pass will use the information provided by the recent DA patch to fuse additional legal loops, including those with forward loop-carried dependencies. --- llvm/lib/Transforms/Scalar/LoopFuse.cpp | 42 ++++ .../LoopFusion/da_separate_loops.ll | 182 ++++++++++++++++++ llvm/test/Transforms/LoopFusion/simple.ll | 45 +++-- 3 files changed, 253 insertions(+), 16 deletions(-) create mode 100644 llvm/test/Transforms/LoopFusion/da_separate_loops.ll diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index b5eb647a042b9..2073303237f69 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -100,6 +100,7 @@ STATISTIC(OnlySecondCandidateIsGuarded, "The second candidate is guarded while the first one is not"); STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions."); STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions."); +STATISTIC(NumDA, "DA checks passed"); enum FusionDependenceAnalysisChoice { FUSION_DEPENDENCE_ANALYSIS_SCEV, @@ -1371,6 +1372,47 @@ struct LoopFuser { << "\n"); } #endif + unsigned Levels = DepResult->getLevels(); + unsigned SameSDLevels = DepResult->getSameSDLevels(); + unsigned CurLoopLevel = FC0.L->getLoopDepth(); + + // Check if DA is missing info regarding the current loop level + if (CurLoopLevel > Levels + SameSDLevels) + return false; + + // Iterating over the outer levels. + for (unsigned Level = 1; Level <= std::min(CurLoopLevel - 1, Levels); + ++Level) { + unsigned Direction = DepResult->getDirection(Level, false); + + // Check if the direction vector does not include equality. If an outer + // loop has a non-equal direction, outer indices are different and it + // is safe to fuse. 
+ if (!(Direction & Dependence::DVEntry::EQ)) { + LLVM_DEBUG(dbgs() << "Safe to fuse due to non-equal acceses in the " + "outer loops\n"); + NumDA++; + return true; + } + } + + assert(CurLoopLevel > Levels && "Fusion candidates are not separated"); + + unsigned CurDir = DepResult->getDirection(CurLoopLevel, true); + + // Check if the direction vector does not include the greater-than direction. In + // that case, the dependency is not backward loop-carried and fusion is + // legal. For example, here we have a forward dependency: + // for (int i = 0; i < n; i++) + // A[i] = ...; + // for (int i = 0; i < n; i++) + // ... = A[i-1]; + if (!(CurDir & Dependence::DVEntry::GT)) { + LLVM_DEBUG(dbgs() << "Safe to fuse with no backward loop-carried " + "dependency\n"); + NumDA++; + return true; + } if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor()) LLVM_DEBUG( diff --git a/llvm/test/Transforms/LoopFusion/da_separate_loops.ll b/llvm/test/Transforms/LoopFusion/da_separate_loops.ll new file mode 100644 index 0000000000000..6359f48199290 --- /dev/null +++ b/llvm/test/Transforms/LoopFusion/da_separate_loops.ll @@ -0,0 +1,182 @@ +; REQUIRES: asserts + +; RUN: opt -passes=loop-fusion -da-disable-delinearization-checks -disable-output -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s +; STAT: 2 loop-fusion - DA checks passed + +; The two inner loops have no dependency and are allowed to be fused because +; their accesses differ in the outer-loop subscripts. 
+ +; C Code +; +;; for (long int i = 0; i < n; i++) { +;; for (long int j = 0; j < n; j++) { +;; for (long int k = 0; k < n; k++) +;; A[i][j][k] = i; +;; for (long int k = 0; k < n; k++) +;; temp = A[i + 3][j + 2][k + 1]; +;; } +;; } + +define void @nonequal_outer_access(i64 %n, ptr %A) nounwind uwtable ssp { +entry: + %cmp10 = icmp sgt i64 %n, 0 + br i1 %cmp10, label %for.cond1.preheader.preheader, label %for.end26 + +for.cond1.preheader.preheader: ; preds = %entry + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc24 + %i.011 = phi i64 [ %inc25, %for.inc24 ], [ 0, %for.cond1.preheader.preheader ] + %cmp26 = icmp sgt i64 %n, 0 + br i1 %cmp26, label %for.cond4.preheader.preheader, label %for.inc24 + +for.cond4.preheader.preheader: ; preds = %for.cond1.preheader + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.cond4.preheader.preheader, %for.inc21 + %j.07 = phi i64 [ %inc22, %for.inc21 ], [ 0, %for.cond4.preheader.preheader ] + %cmp51 = icmp sgt i64 %n, 0 + br i1 %cmp51, label %for.body6.preheader, label %for.cond10.loopexit + +for.body6.preheader: ; preds = %for.cond4.preheader + br label %for.body6 + +for.body6: ; preds = %for.body6.preheader, %for.body6 + %k.02 = phi i64 [ %inc, %for.body6 ], [ 0, %for.body6.preheader ] + %arrayidx8 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %k.02 + store i64 %i.011, ptr %arrayidx8, align 8 + %inc = add nsw i64 %k.02, 1 + %exitcond13 = icmp ne i64 %inc, %n + br i1 %exitcond13, label %for.body6, label %for.cond10.loopexit.loopexit + +for.cond10.loopexit.loopexit: ; preds = %for.body6 + br label %for.cond10.loopexit + +for.cond10.loopexit: ; preds = %for.cond10.loopexit.loopexit, %for.cond4.preheader + %cmp113 = icmp sgt i64 %n, 0 + br i1 %cmp113, label %for.body12.preheader, label %for.inc21 + +for.body12.preheader: ; preds = %for.cond10.loopexit + br label %for.body12 + +for.body12: ; preds = %for.body12.preheader, 
%for.body12 + %k9.05 = phi i64 [ %inc19, %for.body12 ], [ 0, %for.body12.preheader ] + %add = add nsw i64 %k9.05, 1 + %add13 = add nsw i64 %j.07, 2 + %add14 = add nsw i64 %i.011, 3 + %arrayidx17 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %add14, i64 %add13, i64 %add + %0 = load i64, ptr %arrayidx17, align 8 + %inc19 = add nsw i64 %k9.05, 1 + %exitcond = icmp ne i64 %inc19, %n + br i1 %exitcond, label %for.body12, label %for.inc21.loopexit + +for.inc21.loopexit: ; preds = %for.body12 + br label %for.inc21 + +for.inc21: ; preds = %for.inc21.loopexit, %for.cond10.loopexit + %inc22 = add nsw i64 %j.07, 1 + %exitcond14 = icmp ne i64 %inc22, %n + br i1 %exitcond14, label %for.cond4.preheader, label %for.inc24.loopexit + +for.inc24.loopexit: ; preds = %for.inc21 + br label %for.inc24 + +for.inc24: ; preds = %for.inc24.loopexit, %for.cond1.preheader + %inc25 = add nsw i64 %i.011, 1 + %exitcond15 = icmp ne i64 %inc25, %n + br i1 %exitcond15, label %for.cond1.preheader, label %for.end26.loopexit + +for.end26.loopexit: ; preds = %for.inc24 + br label %for.end26 + +for.end26: ; preds = %for.end26.loopexit, %entry + ret void +} + +; The two inner loops have a forward loop-carried dependency, allowing them +; to be fused. 
+ +; C Code +; +;; for (long int i = 0; i < n; i++) { +;; for (long int j = 0; j < n; j++) { +;; for (long int k = 0; k < n; k++) +;; A[i][j][k] = i; +;; for (long int k = 0; k < n; k++) +;; temp = A[i][j][k - 1]; +;; } +;; } + +define void @forward_dep(i64 %n, ptr %A) nounwind uwtable ssp { +entry: + %cmp10 = icmp sgt i64 %n, 0 + br i1 %cmp10, label %for.cond1.preheader.preheader, label %for.end26 + +for.cond1.preheader.preheader: ; preds = %entry + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc24 + %i.011 = phi i64 [ %inc25, %for.inc24 ], [ 0, %for.cond1.preheader.preheader ] + %cmp26 = icmp sgt i64 %n, 0 + br i1 %cmp26, label %for.cond4.preheader.preheader, label %for.inc24 + +for.cond4.preheader.preheader: ; preds = %for.cond1.preheader + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.cond4.preheader.preheader, %for.inc21 + %j.07 = phi i64 [ %inc22, %for.inc21 ], [ 0, %for.cond4.preheader.preheader ] + %cmp51 = icmp sgt i64 %n, 0 + br i1 %cmp51, label %for.body6.preheader, label %for.cond10.loopexit + +for.body6.preheader: ; preds = %for.cond4.preheader + br label %for.body6 + +for.body6: ; preds = %for.body6.preheader, %for.body6 + %k.02 = phi i64 [ %inc, %for.body6 ], [ 0, %for.body6.preheader ] + %arrayidx8 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %k.02 + store i64 %i.011, ptr %arrayidx8, align 8 + %inc = add nsw i64 %k.02, 1 + %exitcond13 = icmp ne i64 %inc, %n + br i1 %exitcond13, label %for.body6, label %for.cond10.loopexit.loopexit + +for.cond10.loopexit.loopexit: ; preds = %for.body6 + br label %for.cond10.loopexit + +for.cond10.loopexit: ; preds = %for.cond10.loopexit.loopexit, %for.cond4.preheader + %cmp113 = icmp sgt i64 %n, 0 + br i1 %cmp113, label %for.body12.preheader, label %for.inc21 + +for.body12.preheader: ; preds = %for.cond10.loopexit + br label %for.body12 + +for.body12: ; preds = %for.body12.preheader, %for.body12 + 
%k9.05 = phi i64 [ %inc19, %for.body12 ], [ 0, %for.body12.preheader ] + %add = add nsw i64 %k9.05, -1 + %arrayidx17 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %add + %0 = load i64, ptr %arrayidx17, align 8 + %inc19 = add nsw i64 %k9.05, 1 + %exitcond = icmp ne i64 %inc19, %n + br i1 %exitcond, label %for.body12, label %for.inc21.loopexit + +for.inc21.loopexit: ; preds = %for.body12 + br label %for.inc21 + +for.inc21: ; preds = %for.inc21.loopexit, %for.cond10.loopexit + %inc22 = add nsw i64 %j.07, 1 + %exitcond14 = icmp ne i64 %inc22, %n + br i1 %exitcond14, label %for.cond4.preheader, label %for.inc24.loopexit + +for.inc24.loopexit: ; preds = %for.inc21 + br label %for.inc24 + +for.inc24: ; preds = %for.inc24.loopexit, %for.cond1.preheader + %inc25 = add nsw i64 %i.011, 1 + %exitcond15 = icmp ne i64 %inc25, %n + br i1 %exitcond15, label %for.cond1.preheader, label %for.end26.loopexit + +for.end26.loopexit: ; preds = %for.inc24 + br label %for.end26 + +for.end26: ; preds = %for.end26.loopexit, %entry + ret void +} \ No newline at end of file diff --git a/llvm/test/Transforms/LoopFusion/simple.ll b/llvm/test/Transforms/LoopFusion/simple.ll index d63890df14461..f3cd5877bd4aa 100644 --- a/llvm/test/Transforms/LoopFusion/simple.ll +++ b/llvm/test/Transforms/LoopFusion/simple.ll @@ -298,42 +298,55 @@ bb23: ; preds = %bb17, %bb ret void } +; The following IR is a representation of the provided code below. With PR +; #146383, loop fusion is able to utilize the information from dependence +; analysis, enabling the loops in the function to be fused. 
+; +; void forward_dep(int *arg) { +; for (int i = 0; i < 100; i++) { +; int tmp = i - 3; +; int val = tmp * (i + 3) % i; +; arg[i] = val; +; } +; +; for (int j = 0; j < 100; j++) { +; int val = arg[j - 3]; +; arg[j] = val * 3; +; } +; } +; define void @forward_dep(ptr noalias %arg) { ; CHECK-LABEL: @forward_dep( -; CHECK-NEXT: bb: -; CHECK-NEXT: br label [[BB7:%.*]] +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: br label %[[BB7:.*]] ; CHECK: bb7: -; CHECK-NEXT: [[DOT013:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[TMP15:%.*]], [[BB14:%.*]] ] -; CHECK-NEXT: [[INDVARS_IV22:%.*]] = phi i64 [ 0, [[BB]] ], [ [[INDVARS_IV_NEXT3:%.*]], [[BB14]] ] +; CHECK-NEXT: [[DOT013:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP15:%.*]], %[[BB25:.*]] ] +; CHECK-NEXT: [[INDVARS_IV22:%.*]] = phi i64 [ 0, %[[BB]] ], [ [[INDVARS_IV_NEXT3:%.*]], %[[BB25]] ] +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[BB25]] ], [ 0, %[[BB]] ] ; CHECK-NEXT: [[TMP:%.*]] = add nsw i32 [[DOT013]], -3 ; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[INDVARS_IV22]], 3 ; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 ; CHECK-NEXT: [[TMP10:%.*]] = mul nsw i32 [[TMP]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV22]] to i32 ; CHECK-NEXT: [[TMP12:%.*]] = srem i32 [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARG:%.*]], i64 [[INDVARS_IV22]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[INDVARS_IV22]] ; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP13]], align 4 -; CHECK-NEXT: br label [[BB14]] +; CHECK-NEXT: br label %[[BB14:.*]] ; CHECK: bb14: -; CHECK-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV22]], 1 -; CHECK-NEXT: [[TMP15]] = add nuw nsw i32 [[DOT013]], 1 -; CHECK-NEXT: [[EXITCOND4:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], 100 -; CHECK-NEXT: br i1 [[EXITCOND4]], label [[BB7]], label [[BB19_PREHEADER:%.*]] -; CHECK: bb19.preheader: -; CHECK-NEXT: br label [[BB19:%.*]] -; CHECK: bb19: -; CHECK-NEXT: 
[[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[BB25:%.*]] ], [ 0, [[BB19_PREHEADER]] ] ; CHECK-NEXT: [[TMP20:%.*]] = add nsw i64 [[INDVARS_IV1]], -3 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[TMP20]] ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = mul nsw i32 [[TMP22]], 3 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[INDVARS_IV1]] ; CHECK-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 -; CHECK-NEXT: br label [[BB25]] +; CHECK-NEXT: br label %[[BB25]] ; CHECK: bb25: +; CHECK-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV22]], 1 +; CHECK-NEXT: [[TMP15]] = add nuw nsw i32 [[DOT013]], 1 +; CHECK-NEXT: [[EXITCOND4:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], 100 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[BB19]], label [[BB26:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[BB7]], label %[[BB26:.*]] ; CHECK: bb26: ; CHECK-NEXT: ret void ;