Skip to content

Conversation

@madhur13490
Copy link
Contributor

If we have instructions in second loop's preheader which can be sunk, we should also be adjusting
PHI nodes to receive values from the new loop's latch block.

Fixes #128600

@llvmbot
Copy link
Member

llvmbot commented Jul 8, 2025

@llvm/pr-subscribers-llvm-transforms

Author: Madhur Amilkanthwar (madhur13490)

Changes

If we have instructions in second loop's preheader which can be sunk, we should also be adjusting
PHI nodes to receive values from the new loop's latch block.

Fixes #128600


Full diff: https://github.com/llvm/llvm-project/pull/147501.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Scalar/LoopFuse.cpp (+29-5)
  • (added) llvm/test/Transforms/LoopFusion/sunk-phi-nodes.ll (+86)
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index d6bd92d520e28..6e1556a4d90b4 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -988,8 +988,8 @@ struct LoopFuser {
 
             // If it is not safe to hoist/sink all instructions in the
             // pre-header, we cannot fuse these loops.
-            if (!collectMovablePreheaderInsts(*FC0, *FC1, SafeToHoist,
-                                              SafeToSink)) {
+            if (!collectAndFixMovablePreheaderInsts(*FC0, *FC1, SafeToHoist,
+                                                    SafeToSink)) {
               LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in "
                                    "Fusion Candidate Pre-header.\n"
                                 << "Not Fusing.\n");
@@ -1033,8 +1033,8 @@ struct LoopFuser {
                                                FuseCounter);
 
           FusionCandidate FusedCand(
-              performFusion((Peel ? FC0Copy : *FC0), *FC1), DT, &PDT, ORE,
-              FC0Copy.PP);
+              performFusion((Peel ? FC0Copy : *FC0), *FC1, SafeToSink), DT,
+              &PDT, ORE, FC0Copy.PP);
           FusedCand.verify();
           assert(FusedCand.isEligibleForFusion(SE) &&
                  "Fused candidate should be eligible for fusion!");
@@ -1176,9 +1176,31 @@ struct LoopFuser {
     return true;
   }
 
+  void fixPHINodes(SmallVector<Instruction *, 4> &SafeToSink,
+                   const FusionCandidate &FC0,
+                   const FusionCandidate &FC1) const {
+    // Iterate over SafeToSink instructions and update PHI nodes
+    // to take values from the latch block of FC0 if they are taking
+    // from the latch block of FC1.
+    for (Instruction *Inst : SafeToSink) {
+      LLVM_DEBUG(dbgs() << "UPDATING: Instruction: " << *Inst << "\n");
+      // Continue if the instruction is not a PHI node.
+      if (!isa<PHINode>(Inst))
+        continue;
+      PHINode *Phi = dyn_cast<PHINode>(Inst);
+      LLVM_DEBUG(dbgs() << "UPDATING: PHI node: " << *Phi << "\n");
+      for (unsigned I = 0; I < Phi->getNumIncomingValues(); I++) {
+        if (Phi->getIncomingBlock(I) != FC0.Latch)
+          continue;
+        assert(FC1.Latch && "FC1 latch is not set");
+        Phi->setIncomingBlock(I, FC1.Latch);
+      }
+    }
+  }
+
   /// Collect instructions in the \p FC1 Preheader that can be hoisted
   /// to the \p FC0 Preheader or sunk into the \p FC1 Body
-  bool collectMovablePreheaderInsts(
+  bool collectAndFixMovablePreheaderInsts(
       const FusionCandidate &FC0, const FusionCandidate &FC1,
       SmallVector<Instruction *, 4> &SafeToHoist,
       SmallVector<Instruction *, 4> &SafeToSink) const {
@@ -1226,6 +1248,8 @@ struct LoopFuser {
     }
     LLVM_DEBUG(
         dbgs() << "All preheader instructions could be sunk or hoisted!\n");
+
+    fixPHINodes(SafeToSink, FC0, FC1);
     return true;
   }
 
diff --git a/llvm/test/Transforms/LoopFusion/sunk-phi-nodes.ll b/llvm/test/Transforms/LoopFusion/sunk-phi-nodes.ll
new file mode 100644
index 0000000000000..3c72df8ae19fb
--- /dev/null
+++ b/llvm/test/Transforms/LoopFusion/sunk-phi-nodes.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=mem2reg,loop-rotate,loop-fusion < %s 2>&1 | FileCheck %s
+define i32 @main() {
+; CHECK-LABEL: define i32 @main() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[SUM1_02:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD:%.*]], %[[FOR_INC6:.*]] ]
+; CHECK-NEXT:    [[I_01:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_INC6]] ]
+; CHECK-NEXT:    [[I1_04:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC7:%.*]], %[[FOR_INC6]] ]
+; CHECK-NEXT:    [[SUM2_03:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD5:%.*]], %[[FOR_INC6]] ]
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[SUM1_02]], [[I_01]]
+; CHECK-NEXT:    br label %[[FOR_INC:.*]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[I1_04]], [[I1_04]]
+; CHECK-NEXT:    [[ADD5]] = add nsw i32 [[SUM2_03]], [[MUL]]
+; CHECK-NEXT:    br label %[[FOR_INC6]]
+; CHECK:       [[FOR_INC6]]:
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_01]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], 10
+; CHECK-NEXT:    [[INC7]] = add nsw i32 [[I1_04]], 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp slt i32 [[INC7]], 10
+; CHECK-NEXT:    br i1 [[CMP3]], label %[[FOR_BODY]], label %[[FOR_END8:.*]]
+; CHECK:       [[FOR_END8]]:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %retval = alloca i32, align 4
+  %sum1 = alloca i32, align 4
+  %sum2 = alloca i32, align 4
+  %i = alloca i32, align 4
+  %i1 = alloca i32, align 4
+  store i32 0, ptr %retval, align 4
+  store i32 0, ptr %sum1, align 4
+  store i32 0, ptr %sum2, align 4
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:
+  %0 = load i32, ptr %i, align 4
+  %cmp = icmp slt i32 %0, 10
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  %1 = load i32, ptr %i, align 4
+  %2 = load i32, ptr %sum1, align 4
+  %add = add nsw i32 %2, %1
+  store i32 %add, ptr %sum1, align 4
+  br label %for.inc
+
+for.inc:
+  %3 = load i32, ptr %i, align 4
+  %inc = add nsw i32 %3, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond
+
+for.end:
+  store i32 0, ptr %i1, align 4
+  br label %for.cond2
+
+for.cond2:
+  %4 = load i32, ptr %i1, align 4
+  %cmp3 = icmp slt i32 %4, 10
+  br i1 %cmp3, label %for.body4, label %for.end8
+
+for.body4:
+  %5 = load i32, ptr %i1, align 4
+  %6 = load i32, ptr %i1, align 4
+  %mul = mul nsw i32 %5, %6
+  %7 = load i32, ptr %sum2, align 4
+  %add5 = add nsw i32 %7, %mul
+  store i32 %add5, ptr %sum2, align 4
+  br label %for.inc6
+
+for.inc6:
+  %8 = load i32, ptr %i1, align 4
+  %inc7 = add nsw i32 %8, 1
+  store i32 %inc7, ptr %i1, align 4
+  br label %for.cond2
+
+for.end8:
+  %9 = load i32, ptr %sum1, align 4
+  %10 = load i32, ptr %sum2, align 4
+  ret i32 0
+}
+

Copy link
Contributor

@kasuga-fj kasuga-fj left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a few drive-by comments.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests should be as concise as possible. In this case, this test should not rely on passes other than loop-fusion. It would be better to replace the input IR with the result of applying mem2reg and loop-rotate and run only loop-fuse.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, the test was taken from the bug report and wanted to keep it the same way but I see no issue for the bug if I just run loop-fusion.

Copy link
Contributor

@kasuga-fj kasuga-fj Jul 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I meant is, replacing the IR with the output of the following:

$ opt -S --passes=mem2reg,loop-rotate sunk-phi-nodes.ll

The current test doesn't contain any PHI nodes, so it seems that the new logic you added isn't triggered.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops..missed it. Done.

Comment on lines 991 to 992
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At this point, it seems that whether fusion will be performed has not yet been determined. At the very least, it could potentially be rejected at the later beneficial check (although it currently always returns true). The IR should not be modified until we actually decide to perform the fusion. Maybe fixPHINodes should be executed in performFusion.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I understand. Had this in performFusion() initially, but optimized to move to collect* routine, which is an not right. Point taken.

Copy link
Contributor

@kasuga-fj kasuga-fj Jul 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I didn’t fully check the surrounding context, but considering what the each function is supposed to do, it might make more sense to call fixPHINodes inside movePreheaderInsts instead (I'm not familiar enough to determine which is better...).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to confirm, have you noticed this comment?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I was thinking. Initially, I wanted to change the IR after the fusion is done but I think this is better because movePreheaderInsts is changing the IR before the fusion so might as well also update the PHI nodes. Done.

Comment on lines 1186 to 1191
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are the debug messages duplicated?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are different.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If Inst is a PHI node, the following messages will be printed, which seem redundant to me.

UPDATING: Instruction: phi ...
UPDATING: PHI node: phi ...

If it is not a PHI node, the following message will be displayed, but it is not actually updated.

UPDATING: Instruction: ...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I'm removing them in the recent version. After seeing this, I feel they are overkill.

@madhur13490 madhur13490 force-pushed the features/madhura/lf_fix_sink branch 3 times, most recently from e6ab678 to 6cb9fdf Compare July 11, 2025 18:51
Copy link
Contributor

@kasuga-fj kasuga-fj left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It appears to be mostly fine at a glance. But since I’m not familiar with this, it’s probably best to wait for input from others.

@madhur13490 madhur13490 requested a review from Meinersbur July 18, 2025 04:41
@madhur13490
Copy link
Contributor Author

@Meinersbur Can you please have a look too?

Copy link
Member

@Meinersbur Meinersbur left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There also is SSAUpdater for such jobs, but would be overblown if updating manually is that simple.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
void fixPHINodes(SmallVector<Instruction *, 4> &SafeToSink,
void fixPHINodes(ArrayRef<Instruction *> SafeToSink,

Pass an ArrayRef it itr is read-only, makes it independent of the underlaying implementation

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

Comment on lines 1187 to 1189
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Continue if the instruction is not a PHI node.
if (!isa<PHINode>(Inst))
continue;
PHINode *Phi = dyn_cast<PHINode>(Inst);
PHINode *Phi = dyn_cast<PHINode>(Inst);
if (!Phi)
continue;

You don't need to dyn_cast if you already checked it's the right type. cast is sufficient. It's also an idiom to check the result of dyn_cast for nullptr.

The comment isn't helpful
1

Footnotes

  1. I think this is my new favorite meme

Copy link
Contributor Author

@madhur13490 madhur13490 Jul 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. Added some more description.

Comment on lines 1184 to 1185
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"if they are taking from the latch block of FC1." ???

I don't understant this comment, but I think it should be subsumed by a description of this helper function.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added more description.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// This function fixes sunk PHI nodes after fusion.
/// This function fixes sunk PHI nodes after fusion.

Could you explain a bit more? E.g.:

Update the the PHI nodes that previously had incoming blocks from FC0 to the equivalent block fused in the fused loop (which is FC1 being reused, therefore the equivalent is not necessary for FC1).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
void fixPHINodes(SmallVector<Instruction *, 4> &SafeToSink,
static void fixPHINodes(SmallVector<Instruction *, 4> &SafeToSink,

Copy link
Contributor Author

@madhur13490 madhur13490 Jul 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why should it be static? Do you expect it to be called without objects of LoopFuser?

Copy link
Member

@Meinersbur Meinersbur Jul 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The other way around: The function does not use this, so why should it be tied to an object? It's like an unused (implicit) function argument.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a non-member function, so adding static here affects its visibility. It is recommended to use static in this case (see Restrict Visibility).

(It may still be preferable to add it even if it is a member function)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is a private member function of struct LoopFuser. Thus, the visibility is already restricted. Users can't call it from outside even without object.

I can either have static or const qualifier (which I already have) but not both. So, I should be removing const qualifier...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is a private member function of struct LoopFuser.

Sorry, I misread the indentation...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, I should be removing const qualifier...

const refers to the qualifiers of this, which with static does not exist anymore, so indeed, the consequence would be to remove const.

I don't care that much, so I LGTM it already. Keep it non-static if you think it makes more sense.

If we have instructions in second loop's preheader
which can be sunk, we should also be adjusting
PHI nodes to receive values from the new loop's latch block.

Fixes llvm#128600
@madhur13490 madhur13490 force-pushed the features/madhura/lf_fix_sink branch 4 times, most recently from 4d34434 to 8ba5c5d Compare July 22, 2025 04:42
@github-actions
Copy link

github-actions bot commented Jul 22, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@madhur13490 madhur13490 force-pushed the features/madhura/lf_fix_sink branch from 8ba5c5d to b6262e0 Compare July 22, 2025 04:45
Copy link
Member

@Meinersbur Meinersbur left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@madhur13490 madhur13490 merged commit 90de4a4 into llvm:main Jul 28, 2025
9 checks passed
mahesh-attarde pushed a commit to mahesh-attarde/llvm-project that referenced this pull request Jul 28, 2025
If we have instructions in second loop's preheader which can be sunk, we
should also be adjusting PHI nodes to receive values from the fused loop's latch block.

Fixes llvm#128600
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

[Clang] Optimization LoopFuse.cpp crashes on valid LLVM IR. Assertion `!verifyFunction(*FC0.Header->getParent(), &errs())' failed.

4 participants