-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[LoopFusion] Fix sink instructions #147501
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[LoopFusion] Fix sink instructions #147501
Conversation
|
@llvm/pr-subscribers-llvm-transforms Author: Madhur Amilkanthwar (madhur13490) ChangesIf we have instructions in second loop's preheader which can be sunk, we should also be adjusting Fixes #128600 Full diff: https://github.com/llvm/llvm-project/pull/147501.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index d6bd92d520e28..6e1556a4d90b4 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -988,8 +988,8 @@ struct LoopFuser {
// If it is not safe to hoist/sink all instructions in the
// pre-header, we cannot fuse these loops.
- if (!collectMovablePreheaderInsts(*FC0, *FC1, SafeToHoist,
- SafeToSink)) {
+ if (!collectAndFixMovablePreheaderInsts(*FC0, *FC1, SafeToHoist,
+ SafeToSink)) {
LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in "
"Fusion Candidate Pre-header.\n"
<< "Not Fusing.\n");
@@ -1033,8 +1033,8 @@ struct LoopFuser {
FuseCounter);
FusionCandidate FusedCand(
- performFusion((Peel ? FC0Copy : *FC0), *FC1), DT, &PDT, ORE,
- FC0Copy.PP);
+ performFusion((Peel ? FC0Copy : *FC0), *FC1, SafeToSink), DT,
+ &PDT, ORE, FC0Copy.PP);
FusedCand.verify();
assert(FusedCand.isEligibleForFusion(SE) &&
"Fused candidate should be eligible for fusion!");
@@ -1176,9 +1176,31 @@ struct LoopFuser {
return true;
}
+ void fixPHINodes(SmallVector<Instruction *, 4> &SafeToSink,
+ const FusionCandidate &FC0,
+ const FusionCandidate &FC1) const {
+ // Iterate over SafeToSink instructions and update PHI nodes
+ // to take values from the latch block of FC0 if they are taking
+ // from the latch block of FC1.
+ for (Instruction *Inst : SafeToSink) {
+ LLVM_DEBUG(dbgs() << "UPDATING: Instruction: " << *Inst << "\n");
+ // Continue if the instruction is not a PHI node.
+ if (!isa<PHINode>(Inst))
+ continue;
+ PHINode *Phi = dyn_cast<PHINode>(Inst);
+ LLVM_DEBUG(dbgs() << "UPDATING: PHI node: " << *Phi << "\n");
+ for (unsigned I = 0; I < Phi->getNumIncomingValues(); I++) {
+ if (Phi->getIncomingBlock(I) != FC0.Latch)
+ continue;
+ assert(FC1.Latch && "FC1 latch is not set");
+ Phi->setIncomingBlock(I, FC1.Latch);
+ }
+ }
+ }
+
/// Collect instructions in the \p FC1 Preheader that can be hoisted
/// to the \p FC0 Preheader or sunk into the \p FC1 Body
- bool collectMovablePreheaderInsts(
+ bool collectAndFixMovablePreheaderInsts(
const FusionCandidate &FC0, const FusionCandidate &FC1,
SmallVector<Instruction *, 4> &SafeToHoist,
SmallVector<Instruction *, 4> &SafeToSink) const {
@@ -1226,6 +1248,8 @@ struct LoopFuser {
}
LLVM_DEBUG(
dbgs() << "All preheader instructions could be sunk or hoisted!\n");
+
+ fixPHINodes(SafeToSink, FC0, FC1);
return true;
}
diff --git a/llvm/test/Transforms/LoopFusion/sunk-phi-nodes.ll b/llvm/test/Transforms/LoopFusion/sunk-phi-nodes.ll
new file mode 100644
index 0000000000000..3c72df8ae19fb
--- /dev/null
+++ b/llvm/test/Transforms/LoopFusion/sunk-phi-nodes.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=mem2reg,loop-rotate,loop-fusion < %s 2>&1 | FileCheck %s
+define i32 @main() {
+; CHECK-LABEL: define i32 @main() {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[SUM1_02:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD:%.*]], %[[FOR_INC6:.*]] ]
+; CHECK-NEXT: [[I_01:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_INC6]] ]
+; CHECK-NEXT: [[I1_04:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC7:%.*]], %[[FOR_INC6]] ]
+; CHECK-NEXT: [[SUM2_03:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD5:%.*]], %[[FOR_INC6]] ]
+; CHECK-NEXT: [[ADD]] = add nsw i32 [[SUM1_02]], [[I_01]]
+; CHECK-NEXT: br label %[[FOR_INC:.*]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I1_04]], [[I1_04]]
+; CHECK-NEXT: [[ADD5]] = add nsw i32 [[SUM2_03]], [[MUL]]
+; CHECK-NEXT: br label %[[FOR_INC6]]
+; CHECK: [[FOR_INC6]]:
+; CHECK-NEXT: [[INC]] = add nsw i32 [[I_01]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], 10
+; CHECK-NEXT: [[INC7]] = add nsw i32 [[I1_04]], 1
+; CHECK-NEXT: [[CMP3:%.*]] = icmp slt i32 [[INC7]], 10
+; CHECK-NEXT: br i1 [[CMP3]], label %[[FOR_BODY]], label %[[FOR_END8:.*]]
+; CHECK: [[FOR_END8]]:
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ %retval = alloca i32, align 4
+ %sum1 = alloca i32, align 4
+ %sum2 = alloca i32, align 4
+ %i = alloca i32, align 4
+ %i1 = alloca i32, align 4
+ store i32 0, ptr %retval, align 4
+ store i32 0, ptr %sum1, align 4
+ store i32 0, ptr %sum2, align 4
+ store i32 0, ptr %i, align 4
+ br label %for.cond
+
+for.cond:
+ %0 = load i32, ptr %i, align 4
+ %cmp = icmp slt i32 %0, 10
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ %1 = load i32, ptr %i, align 4
+ %2 = load i32, ptr %sum1, align 4
+ %add = add nsw i32 %2, %1
+ store i32 %add, ptr %sum1, align 4
+ br label %for.inc
+
+for.inc:
+ %3 = load i32, ptr %i, align 4
+ %inc = add nsw i32 %3, 1
+ store i32 %inc, ptr %i, align 4
+ br label %for.cond
+
+for.end:
+ store i32 0, ptr %i1, align 4
+ br label %for.cond2
+
+for.cond2:
+ %4 = load i32, ptr %i1, align 4
+ %cmp3 = icmp slt i32 %4, 10
+ br i1 %cmp3, label %for.body4, label %for.end8
+
+for.body4:
+ %5 = load i32, ptr %i1, align 4
+ %6 = load i32, ptr %i1, align 4
+ %mul = mul nsw i32 %5, %6
+ %7 = load i32, ptr %sum2, align 4
+ %add5 = add nsw i32 %7, %mul
+ store i32 %add5, ptr %sum2, align 4
+ br label %for.inc6
+
+for.inc6:
+ %8 = load i32, ptr %i1, align 4
+ %inc7 = add nsw i32 %8, 1
+ store i32 %inc7, ptr %i1, align 4
+ br label %for.cond2
+
+for.end8:
+ %9 = load i32, ptr %sum1, align 4
+ %10 = load i32, ptr %sum2, align 4
+ ret i32 0
+}
+
|
kasuga-fj
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just a few drive-by comments.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Tests should be as concise as possible. In this case, this test should not rely on passes other than loop-fusion. It would be better to replace the input IR with the result of applying mem2reg and loop-rotate and run only loop-fuse.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, the test was taken from the bug report and wanted to keep it the same way but I see no issue for the bug if I just run loop-fusion.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What I meant is, replacing the IR with the output of the following:
$ opt -S --passes=mem2reg,loop-rotate sunk-phi-nodes.llThe current test doesn't contain any PHI nodes, so it seems that the new logic you added isn't triggered.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oops..missed it. Done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
At this point, it seems that whether fusion will be performed has not yet been determined. At the very least, it could potentially be rejected at the later beneficial check (although it currently always returns true). The IR should not be modified until we actually decide to perform the fusion. Maybe fixPHINodes should be executed in performFusion.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I understand. Had this in performFusion() initially, but optimized to move to collect* routine, which is an not right. Point taken.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, I didn’t fully check the surrounding context, but considering what the each function is supposed to do, it might make more sense to call fixPHINodes inside movePreheaderInsts instead (I'm not familiar enough to determine which is better...).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just to confirm, have you noticed this comment?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I was thinking. Initially, I wanted to change the IR after the fusion is done but I think this is better because movePreheaderInsts is changing the IR before the fusion so might as well also update the PHI nodes. Done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are the debug messages duplicated?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
They are different.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If Inst is a PHI node, the following messages will be printed, which seem redundant to me.
UPDATING: Instruction: phi ...
UPDATING: PHI node: phi ...
If it is not a PHI node, the following message will be displayed, but it is not actually updated.
UPDATING: Instruction: ...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, I'm removing them in the recent version. After seeing this, I feel they are overkill.
e6ab678 to
6cb9fdf
Compare
kasuga-fj
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It appears to be mostly fine at a glance. But since I’m not familiar with this, it’s probably best to wait for input from others.
|
@Meinersbur Can you please have a look too? |
Meinersbur
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There also is SSAUpdater for such jobs, but would be overblown if updating manually is that simple.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| void fixPHINodes(SmallVector<Instruction *, 4> &SafeToSink, | |
| void fixPHINodes(ArrayRef<Instruction *> SafeToSink, |
Pass an ArrayRef it itr is read-only, makes it independent of the underlaying implementation
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| // Continue if the instruction is not a PHI node. | |
| if (!isa<PHINode>(Inst)) | |
| continue; | |
| PHINode *Phi = dyn_cast<PHINode>(Inst); | |
| PHINode *Phi = dyn_cast<PHINode>(Inst); | |
| if (!Phi) | |
| continue; |
You don't need to dyn_cast if you already checked it's the right type. cast is sufficient. It's also an idiom to check the result of dyn_cast for nullptr.
The comment isn't helpful
1
Footnotes
-
I think this is my new favorite meme ↩
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done. Added some more description.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"if they are taking from the latch block of FC1." ???
I don't understant this comment, but I think it should be subsumed by a description of this helper function.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added more description.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| // This function fixes sunk PHI nodes after fusion. | |
| /// This function fixes sunk PHI nodes after fusion. |
Could you explain a bit more? E.g.:
Update the the PHI nodes that previously had incoming blocks from FC0 to the equivalent block fused in the fused loop (which is FC1 being reused, therefore the equivalent is not necessary for FC1).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| void fixPHINodes(SmallVector<Instruction *, 4> &SafeToSink, | |
| static void fixPHINodes(SmallVector<Instruction *, 4> &SafeToSink, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why should it be static? Do you expect it to be called without objects of LoopFuser?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The other way around: The function does not use this, so why should it be tied to an object? It's like an unused (implicit) function argument.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a non-member function, so adding static here affects its visibility. It is recommended to use static in this case (see Restrict Visibility).
(It may still be preferable to add it even if it is a member function)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is a private member function of struct LoopFuser. Thus, the visibility is already restricted. Users can't call it from outside even without object.
I can either have static or const qualifier (which I already have) but not both. So, I should be removing const qualifier...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is a private member function of struct
LoopFuser.
Sorry, I misread the indentation...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So, I should be removing
constqualifier...
const refers to the qualifiers of this, which with static does not exist anymore, so indeed, the consequence would be to remove const.
I don't care that much, so I LGTM it already. Keep it non-static if you think it makes more sense.
If we have instructions in second loop's preheader which can be sunk, we should also be adjusting PHI nodes to receive values from the new loop's latch block. Fixes llvm#128600
4d34434 to
8ba5c5d
Compare
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
8ba5c5d to
b6262e0
Compare
Meinersbur
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
If we have instructions in second loop's preheader which can be sunk, we should also be adjusting PHI nodes to receive values from the fused loop's latch block. Fixes llvm#128600
If we have instructions in second loop's preheader which can be sunk, we should also be adjusting
PHI nodes to receive values from the new loop's latch block.
Fixes #128600