-
Notifications
You must be signed in to change notification settings - Fork 14.5k
[DAGISel] Keep flags when converting FP load/store to integer #111679
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This DAG combine replaces a floating-point load/store pair which has no other uses with an integer one, but did not copy the memory operand flags to the new instructions, resulting in it dropping the volatile flag. This optimisation is still valid if one or both of the instructions is volatile, so we can copy over the whole MachineMemOperand to generate volatile integer loads and stores where needed.
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-selectiondag Author: Oliver Stannard (ostannard) ChangesThis DAG combine replaces a floating-point load/store pair which has no other uses with an integer one, but did not copy the memory operand flags to the new instructions, resulting in it dropping the volatile flag. This optimisation is still valid if one or both of the instructions is volatile, so we can copy over the whole MachineMemOperand to generate volatile integer loads and stores where needed. Full diff: https://github.com/llvm/llvm-project/pull/111679.diff 4 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 348db9ca7c430e..810ca458bc8787 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20322,13 +20322,11 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
!FastLD || !FastST)
return SDValue();
- SDValue NewLD =
- DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(),
- LD->getPointerInfo(), LD->getAlign());
+ SDValue NewLD = DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(),
+ LD->getBasePtr(), LD->getMemOperand());
- SDValue NewST =
- DAG.getStore(ST->getChain(), SDLoc(N), NewLD, ST->getBasePtr(),
- ST->getPointerInfo(), ST->getAlign());
+ SDValue NewST = DAG.getStore(ST->getChain(), SDLoc(N), NewLD,
+ ST->getBasePtr(), ST->getMemOperand());
AddToWorklist(NewLD.getNode());
AddToWorklist(NewST.getNode());
diff --git a/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll b/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll
index 782894976c711c..b5f0b2ff9ef4cf 100644
--- a/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll
@@ -9,7 +9,7 @@ define void @gep_noflags_alloca(i32 %idx, i32 %val) #0 {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -36,7 +36,7 @@ define void @gep_inbounds_alloca(i32 %idx, i32 %val) #0 {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -63,7 +63,7 @@ define void @gep_nuw_alloca(i32 %idx, i32 %val) #0 {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -90,7 +90,7 @@ define void @gep_nusw_alloca(i32 %idx, i32 %val) #0 {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -117,7 +117,7 @@ define void @gep_inbounds_nuw_alloca(i32 %idx, i32 %val) #0 {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -144,7 +144,7 @@ define void @gep_nusw_nuw_alloca(i32 %idx, i32 %val) #0 {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -172,7 +172,7 @@ define void @gep_inbounds_nuw_alloca_nonpow2_scale(i32 %idx, i32 %val) #0 {
; GFX8-NEXT: s_movk_i32 s4, 0x84
; GFX8-NEXT: v_mul_lo_u32 v0, v0, s4
; GFX8-NEXT: v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -184,7 +184,7 @@ define void @gep_inbounds_nuw_alloca_nonpow2_scale(i32 %idx, i32 %val) #0 {
; GFX9-NEXT: s_movk_i32 s4, 0x84
; GFX9-NEXT: v_mul_lo_u32 v0, v0, s4
; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32
-; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
+; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/ARM/load-store-pair-volatile.ll b/llvm/test/CodeGen/ARM/load-store-pair-volatile.ll
new file mode 100644
index 00000000000000..6278672d9e2397
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/load-store-pair-volatile.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=arm-none-eabi -stop-after=finalize-isel < %s | FileCheck %s
+
+define void @test(ptr %vol_one, ptr %p_in, ptr %p_out, i32 %n) {
+ ; CHECK-LABEL: name: test
+ ; CHECK: bb.0.entry:
+ ; CHECK-NEXT: liveins: $r0, $r1, $r2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $r2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $r1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $r0
+ ; CHECK-NEXT: [[LDRi12_:%[0-9]+]]:gpr = LDRi12 [[COPY1]], 0, 14 /* CC::al */, $noreg :: (load (s32) from %ir.p_in)
+ ; CHECK-NEXT: STRi12 killed [[LDRi12_]], [[COPY2]], 0, 14 /* CC::al */, $noreg :: (volatile store (s32) into %ir.vol_one)
+ ; CHECK-NEXT: [[LDRi12_1:%[0-9]+]]:gpr = LDRi12 [[COPY2]], 4, 14 /* CC::al */, $noreg :: (volatile load (s32) from %ir.vol_two)
+ ; CHECK-NEXT: STRi12 killed [[LDRi12_1]], [[COPY]], 0, 14 /* CC::al */, $noreg :: (store (s32) into %ir.p_out)
+ ; CHECK-NEXT: MOVPCLR 14 /* CC::al */, $noreg
+entry:
+ %vol_two = getelementptr inbounds i8, ptr %vol_one, i32 4
+ %a = load float, ptr %p_in, align 4
+ store volatile float %a, ptr %vol_one, align 4
+ %b = load volatile float, ptr %vol_two, align 4
+ store float %b, ptr %p_out, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll
index ccc36530c7957b..150fa91524ab0c 100644
--- a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll
@@ -1442,8 +1442,8 @@ define void @caller_fpr_stack() {
; 32BIT-NEXT: renamable $r4 = LWZtoc @f14, $r2 :: (load (s32) from got)
; 32BIT-NEXT: renamable $f0 = LFD 0, killed renamable $r3 :: (dereferenceable load (s64) from @d15)
; 32BIT-NEXT: renamable $r5 = LWZtoc @f16, $r2 :: (load (s32) from got)
- ; 32BIT-NEXT: renamable $r3 = LWZ 0, killed renamable $r4 :: (load (s32) from @f14)
- ; 32BIT-NEXT: renamable $r4 = LWZ 0, killed renamable $r5 :: (load (s32) from @f16)
+ ; 32BIT-NEXT: renamable $r3 = LWZ 0, killed renamable $r4 :: (dereferenceable load (s32) from @f14)
+ ; 32BIT-NEXT: renamable $r4 = LWZ 0, killed renamable $r5 :: (dereferenceable load (s32) from @f16)
; 32BIT-NEXT: ADJCALLSTACKDOWN 144, 0, implicit-def dead $r1, implicit $r1
; 32BIT-NEXT: renamable $r5 = LI 0
; 32BIT-NEXT: renamable $r6 = LIS 16352
@@ -1532,9 +1532,9 @@ define void @caller_fpr_stack() {
; 64BIT-NEXT: renamable $x3 = LDtoc @f14, $x2 :: (load (s64) from got)
; 64BIT-NEXT: renamable $x4 = LDtoc @d15, $x2 :: (load (s64) from got)
; 64BIT-NEXT: renamable $x5 = LDtoc @f16, $x2 :: (load (s64) from got)
- ; 64BIT-NEXT: renamable $r3 = LWZ 0, killed renamable $x3 :: (load (s32) from @f14)
- ; 64BIT-NEXT: renamable $x4 = LD 0, killed renamable $x4 :: (load (s64) from @d15)
- ; 64BIT-NEXT: renamable $r5 = LWZ 0, killed renamable $x5 :: (load (s32) from @f16)
+ ; 64BIT-NEXT: renamable $r3 = LWZ 0, killed renamable $x3 :: (dereferenceable load (s32) from @f14)
+ ; 64BIT-NEXT: renamable $x4 = LD 0, killed renamable $x4 :: (dereferenceable load (s64) from @d15)
+ ; 64BIT-NEXT: renamable $r5 = LWZ 0, killed renamable $x5 :: (dereferenceable load (s32) from @f16)
; 64BIT-NEXT: ADJCALLSTACKDOWN 176, 0, implicit-def dead $r1, implicit $r1
; 64BIT-NEXT: renamable $x6 = LDtocCPT %const.0, $x2 :: (load (s64) from got)
; 64BIT-NEXT: STW killed renamable $r5, 168, $x1 :: (store (s32))
|
@@ -9,7 +9,7 @@ define void @gep_noflags_alloca(i32 %idx, i32 %val) #0 { | |||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | |||
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 | |||
; GFX8-NEXT: v_lshrrev_b32_e64 v2, 6, s32 | |||
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 | |||
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should disappear on update
SDValue NewLD = | ||
DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(), | ||
LD->getPointerInfo(), LD->getAlign()); | ||
SDValue NewLD = DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We ought to preserve the in-memory type, but we have very lax treatment of it today
…11679) This DAG combine replaces a floating-point load/store pair which has no other uses with an integer one, but did not copy the memory operand flags to the new instructions, resulting in it dropping the volatile flag. This optimisation is still valid if one or both of the instructions is volatile, so we can copy over the whole MachineMemOperand to generate volatile integer loads and stores where needed.
We have a customer affected by this bug, which was fixed after LLVM 19, and we'd like to do a release of LLVM-ET 19 with the fix. LLVM PR: llvm/llvm-project#111679 Original commit message: [DAGISel] Keep flags when converting FP load/store to integer (#111679) This DAG combine replaces a floating-point load/store pair which has no other uses with an integer one, but did not copy the memory operand flags to the new instructions, resulting in it dropping the volatile flag. This optimisation is still valid if one or both of the instructions is volatile, so we can copy over the whole MachineMemOperand to generate volatile integer loads and stores where needed.
We have a customer affected by this bug, which was fixed after LLVM 19, and we'd like to do a release of LLVM-ET 19 with the fix. LLVM PR: llvm/llvm-project#111679 Original commit message: [DAGISel] Keep flags when converting FP load/store to integer (#111679) This DAG combine replaces a floating-point load/store pair which has no other uses with an integer one, but did not copy the memory operand flags to the new instructions, resulting in it dropping the volatile flag. This optimisation is still valid if one or both of the instructions is volatile, so we can copy over the whole MachineMemOperand to generate volatile integer loads and stores where needed.
This DAG combine replaces a floating-point load/store pair which has no other uses with an integer one, but did not copy the memory operand flags to the new instructions, resulting in it dropping the volatile flag. This optimisation is still valid if one or both of the instructions is volatile, so we can copy over the whole MachineMemOperand to generate volatile integer loads and stores where needed.