[DAGISel] Keep flags when converting FP load/store to integer #111679

Merged
ostannard merged 3 commits into llvm:main on Oct 10, 2024

Conversation

ostannard
Collaborator

This DAG combine replaces a floating-point load/store pair which has no other uses with an integer one, but did not copy the memory operand flags to the new instructions, resulting in it dropping the volatile flag. This optimisation is still valid if one or both of the instructions is volatile, so we can copy over the whole MachineMemOperand to generate volatile integer loads and stores where needed.
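To illustrate the pattern (a minimal sketch with a hypothetical function, not code taken from this PR): the combine fires when a floating-point value is loaded and then stored with no other uses, and the volatile flag matters when one of the accesses targets something like a device register.

```cpp
// Minimal sketch (hypothetical example): the float loaded from src has no use
// other than the store, so DAGCombiner::TransformFPLoadStorePair may rewrite
// the pair as an integer load/store. Before this patch the rewritten store
// lost the volatile flag, even though the access to device_reg must remain
// volatile.
void copy_sample(volatile float *device_reg, const float *src) {
  *device_reg = *src; // volatile store: must not be dropped or reordered
}
```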

@llvmbot
Member

llvmbot commented Oct 9, 2024

@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-backend-arm

@llvm/pr-subscribers-llvm-selectiondag

Author: Oliver Stannard (ostannard)

Changes

This DAG combine replaces a floating-point load/store pair which has no other uses with an integer one, but did not copy the memory operand flags to the new instructions, resulting in it dropping the volatile flag. This optimisation is still valid if one or both of the instructions is volatile, so we can copy over the whole MachineMemOperand to generate volatile integer loads and stores where needed.


Full diff: https://github.com/llvm/llvm-project/pull/111679.diff

4 Files Affected:

  • (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+4-6)
  • (modified) llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll (+8-8)
  • (added) llvm/test/CodeGen/ARM/load-store-pair-volatile.ll (+24)
  • (modified) llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll (+5-5)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 348db9ca7c430e..810ca458bc8787 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20322,13 +20322,11 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
         !FastLD || !FastST)
       return SDValue();
 
-    SDValue NewLD =
-        DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(),
-                    LD->getPointerInfo(), LD->getAlign());
+    SDValue NewLD = DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(),
+                                LD->getBasePtr(), LD->getMemOperand());
 
-    SDValue NewST =
-        DAG.getStore(ST->getChain(), SDLoc(N), NewLD, ST->getBasePtr(),
-                     ST->getPointerInfo(), ST->getAlign());
+    SDValue NewST = DAG.getStore(ST->getChain(), SDLoc(N), NewLD,
+                                 ST->getBasePtr(), ST->getMemOperand());
 
     AddToWorklist(NewLD.getNode());
     AddToWorklist(NewST.getNode());
diff --git a/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll b/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll
index 782894976c711c..b5f0b2ff9ef4cf 100644
--- a/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll
@@ -9,7 +9,7 @@ define void @gep_noflags_alloca(i32 %idx, i32 %val) #0 {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -36,7 +36,7 @@ define void @gep_inbounds_alloca(i32 %idx, i32 %val) #0 {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -63,7 +63,7 @@ define void @gep_nuw_alloca(i32 %idx, i32 %val) #0 {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -90,7 +90,7 @@ define void @gep_nusw_alloca(i32 %idx, i32 %val) #0 {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -117,7 +117,7 @@ define void @gep_inbounds_nuw_alloca(i32 %idx, i32 %val) #0 {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -144,7 +144,7 @@ define void @gep_nusw_nuw_alloca(i32 %idx, i32 %val) #0 {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -172,7 +172,7 @@ define void @gep_inbounds_nuw_alloca_nonpow2_scale(i32 %idx, i32 %val) #0 {
 ; GFX8-NEXT:    s_movk_i32 s4, 0x84
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -184,7 +184,7 @@ define void @gep_inbounds_nuw_alloca_nonpow2_scale(i32 %idx, i32 %val) #0 {
 ; GFX9-NEXT:    s_movk_i32 s4, 0x84
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v2, 6, s32
-; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/ARM/load-store-pair-volatile.ll b/llvm/test/CodeGen/ARM/load-store-pair-volatile.ll
new file mode 100644
index 00000000000000..6278672d9e2397
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/load-store-pair-volatile.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=arm-none-eabi -stop-after=finalize-isel < %s | FileCheck %s
+
+define void @test(ptr %vol_one, ptr %p_in, ptr %p_out, i32 %n) {
+  ; CHECK-LABEL: name: test
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   liveins: $r0, $r1, $r2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr = COPY $r2
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY $r1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $r0
+  ; CHECK-NEXT:   [[LDRi12_:%[0-9]+]]:gpr = LDRi12 [[COPY1]], 0, 14 /* CC::al */, $noreg :: (load (s32) from %ir.p_in)
+  ; CHECK-NEXT:   STRi12 killed [[LDRi12_]], [[COPY2]], 0, 14 /* CC::al */, $noreg :: (volatile store (s32) into %ir.vol_one)
+  ; CHECK-NEXT:   [[LDRi12_1:%[0-9]+]]:gpr = LDRi12 [[COPY2]], 4, 14 /* CC::al */, $noreg :: (volatile load (s32) from %ir.vol_two)
+  ; CHECK-NEXT:   STRi12 killed [[LDRi12_1]], [[COPY]], 0, 14 /* CC::al */, $noreg :: (store (s32) into %ir.p_out)
+  ; CHECK-NEXT:   MOVPCLR 14 /* CC::al */, $noreg
+entry:
+  %vol_two = getelementptr inbounds i8, ptr %vol_one, i32 4
+  %a = load float, ptr %p_in, align 4
+  store volatile float %a, ptr %vol_one, align 4
+  %b = load volatile float, ptr %vol_two, align 4
+  store float %b, ptr %p_out, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll
index ccc36530c7957b..150fa91524ab0c 100644
--- a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll
@@ -1442,8 +1442,8 @@ define void @caller_fpr_stack() {
   ; 32BIT-NEXT:   renamable $r4 = LWZtoc @f14, $r2 :: (load (s32) from got)
   ; 32BIT-NEXT:   renamable $f0 = LFD 0, killed renamable $r3 :: (dereferenceable load (s64) from @d15)
   ; 32BIT-NEXT:   renamable $r5 = LWZtoc @f16, $r2 :: (load (s32) from got)
-  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r4 :: (load (s32) from @f14)
-  ; 32BIT-NEXT:   renamable $r4 = LWZ 0, killed renamable $r5 :: (load (s32) from @f16)
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r4 :: (dereferenceable load (s32) from @f14)
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 0, killed renamable $r5 :: (dereferenceable load (s32) from @f16)
   ; 32BIT-NEXT:   ADJCALLSTACKDOWN 144, 0, implicit-def dead $r1, implicit $r1
   ; 32BIT-NEXT:   renamable $r5 = LI 0
   ; 32BIT-NEXT:   renamable $r6 = LIS 16352
@@ -1532,9 +1532,9 @@ define void @caller_fpr_stack() {
   ; 64BIT-NEXT:   renamable $x3 = LDtoc @f14, $x2 :: (load (s64) from got)
   ; 64BIT-NEXT:   renamable $x4 = LDtoc @d15, $x2 :: (load (s64) from got)
   ; 64BIT-NEXT:   renamable $x5 = LDtoc @f16, $x2 :: (load (s64) from got)
-  ; 64BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $x3 :: (load (s32) from @f14)
-  ; 64BIT-NEXT:   renamable $x4 = LD 0, killed renamable $x4 :: (load (s64) from @d15)
-  ; 64BIT-NEXT:   renamable $r5 = LWZ 0, killed renamable $x5 :: (load (s32) from @f16)
+  ; 64BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $x3 :: (dereferenceable load (s32) from @f14)
+  ; 64BIT-NEXT:   renamable $x4 = LD 0, killed renamable $x4 :: (dereferenceable load (s64) from @d15)
+  ; 64BIT-NEXT:   renamable $r5 = LWZ 0, killed renamable $x5 :: (dereferenceable load (s32) from @f16)
   ; 64BIT-NEXT:   ADJCALLSTACKDOWN 176, 0, implicit-def dead $r1, implicit $r1
   ; 64BIT-NEXT:   renamable $x6 = LDtocCPT %const.0, $x2 :: (load (s64) from got)
   ; 64BIT-NEXT:   STW killed renamable $r5, 168, $x1 :: (store (s32))

llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll
@@ -9,7 +9,7 @@ define void @gep_noflags_alloca(i32 %idx, i32 %val) #0 {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v2, 6, s32
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
Contributor

Should disappear on update

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
-    SDValue NewLD =
-        DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(),
-                    LD->getPointerInfo(), LD->getAlign());
+    SDValue NewLD = DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(),
+                                LD->getBasePtr(), LD->getMemOperand());
Contributor

We ought to preserve the in-memory type, but we have very lax treatment of it today
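For illustration only, one possible way to keep the flags while also recording the integer in-memory type would be to rebuild the MachineMemOperand before creating the new load. This is a rough sketch using the variables from TransformFPLoadStorePair and the getMachineMemOperand overload that takes an existing MMO plus an LLT; it is not part of this PR.

```cpp
// Rough sketch, not part of this PR: copy all flags (volatile, nontemporal,
// etc.) from the original FP access, but record the new integer type in the
// memory operand instead of inheriting the floating-point one.
MachineFunction &MF = DAG.getMachineFunction();
MachineMemOperand *LoadMMO = MF.getMachineMemOperand(
    LD->getMemOperand(), /*Offset=*/0,
    LLT::scalar(IntVT.getFixedSizeInBits()));
SDValue NewLD = DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(),
                            LD->getBasePtr(), LoadMMO);
```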

ostannard merged commit 1e49670 into llvm:main on Oct 10, 2024
9 checks passed
DanielCChen pushed a commit to DanielCChen/llvm-project that referenced this pull request Oct 16, 2024
[DAGISel] Keep flags when converting FP load/store to integer (#111679)

This DAG combine replaces a floating-point load/store pair which has no
other uses with an integer one, but did not copy the memory operand
flags to the new instructions, resulting in it dropping the volatile
flag. This optimisation is still valid if one or both of the
instructions is volatile, so we can copy over the whole
MachineMemOperand to generate volatile integer loads and stores where
needed.
ostannard added a commit to ostannard/LLVM-embedded-toolchain-for-Arm that referenced this pull request Dec 11, 2024
We have a customer affected by this bug, which was fixed after LLVM 19,
and we'd like to do a release of LLVM-ET 19 with the fix.

LLVM PR: llvm/llvm-project#111679

Original commit message:

[DAGISel] Keep flags when converting FP load/store to integer (#111679)

This DAG combine replaces a floating-point load/store pair which has no
other uses with an integer one, but did not copy the memory operand
flags to the new instructions, resulting in it dropping the volatile
flag. This optimisation is still valid if one or both of the
instructions is volatile, so we can copy over the whole
MachineMemOperand to generate volatile integer loads and stores where
needed.
ostannard added a commit to ARM-software/LLVM-embedded-toolchain-for-Arm that referenced this pull request Dec 11, 2024
We have a customer affected by this bug, which was fixed after LLVM 19,
and we'd like to do a release of LLVM-ET 19 with the fix.

LLVM PR: llvm/llvm-project#111679

Original commit message:

[DAGISel] Keep flags when converting FP load/store to integer (#111679)

This DAG combine replaces a floating-point load/store pair which has no
other uses with an integer one, but did not copy the memory operand
flags to the new instructions, resulting in it dropping the volatile
flag. This optimisation is still valid if one or both of the
instructions is volatile, so we can copy over the whole
MachineMemOperand to generate volatile integer loads and stores where
needed.