Conversation


@harrisonGPU harrisonGPU commented Jul 26, 2025

Add support for no-return variants of image atomic operations
(e.g. IMAGE_ATOMIC_ADD_NORTN, IMAGE_ATOMIC_CMPSWAP_NORTN).
These variants are generated when the return value of the intrinsic is
unused, allowing the backend to select no-return instructions.
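
A minimal sketch of the pattern this targets (illustrative only; the function name is made up and the intrinsic call is patterned on the tests in this patch): the i32 result of the atomic is never used, so selection can pick the _NORTN form.

declare i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32, i32, <8 x i32>, i32, i32)

; The atomic's return value is dead, so the backend may select
; IMAGE_ATOMIC_ADD_NORTN instead of the returning variant.
define amdgpu_ps void @atomic_add_1d_nortn_sketch(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
  %unused = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
  ret void
}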


llvmbot commented Jul 26, 2025

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: Harrison Hao (harrisonGPU)

Changes

Select glc=0 (no-return) for image atomics when result is unused

When the result of an image atomic instruction isn't used, we can safely set glc=0 so that no value is returned.
This avoids unnecessary waits on vmcnt and can improve performance.


Patch is 25.78 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/150742.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+9-3)
  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+4-2)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll (+12-12)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir (+26-10)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll (+10-10)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 8975486caa770..eb41f9806d860 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2123,8 +2123,10 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
 
   unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
-  if (BaseOpcode->Atomic)
-    CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
+  // Keep GLC only when the atomic's result is actually used.
+  if (BaseOpcode->Atomic && VDataOut && !MRI->use_empty(VDataOut))
+    CPol |= AMDGPU::CPol::GLC;
+
   if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
                AMDGPU::CPol::VOLATILE))
     return false;
@@ -2214,7 +2216,11 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
       }
 
     } else {
-      MIB.addDef(VDataOut); // vdata output
+      // If VDataOut is unused, mark it as dead to avoid unnecessary VGPR usage.
+      if (BaseOpcode->Atomic && MRI->use_empty(VDataOut))
+        MIB.addDef(VDataOut, RegState::Dead);
+      else
+        MIB.addDef(VDataOut);
     }
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index bc0fd8d4e814b..ab9f941378ded 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8780,8 +8780,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   }
 
   unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
-  if (BaseOpcode->Atomic)
-    CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
+  // Keep GLC only when the atomic's result is actually used.
+  if (BaseOpcode->Atomic && !Op.getValue(0).use_empty())
+    CPol |= AMDGPU::CPol::GLC;
+
   if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
                AMDGPU::CPol::VOLATILE))
     return Op;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
index 221e2fd4f00f7..09e1fca3f2677 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
@@ -1200,7 +1200,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX6-NEXT:    s_mov_b32 s5, s7
 ; GFX6-NEXT:    s_mov_b32 s6, s8
 ; GFX6-NEXT:    s_mov_b32 s7, s9
-; GFX6-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX6-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1213,7 +1213,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX8-NEXT:    s_mov_b32 s5, s7
 ; GFX8-NEXT:    s_mov_b32 s6, s8
 ; GFX8-NEXT:    s_mov_b32 s7, s9
-; GFX8-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX8-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX900-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1226,7 +1226,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX900-NEXT:    s_mov_b32 s5, s7
 ; GFX900-NEXT:    s_mov_b32 s6, s8
 ; GFX900-NEXT:    s_mov_b32 s7, s9
-; GFX900-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX900-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1239,7 +1239,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX90A-NEXT:    s_mov_b32 s5, s7
 ; GFX90A-NEXT:    s_mov_b32 s6, s8
 ; GFX90A-NEXT:    s_mov_b32 s7, s9
-; GFX90A-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX90A-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX10PLUS-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1252,7 +1252,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX10PLUS-NEXT:    s_mov_b32 s5, s7
 ; GFX10PLUS-NEXT:    s_mov_b32 s6, s8
 ; GFX10PLUS-NEXT:    s_mov_b32 s7, s9
-; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc
+; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm
 ; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1265,7 +1265,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3194,7 +3194,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX6-NEXT:    s_mov_b32 s5, s7
 ; GFX6-NEXT:    s_mov_b32 s6, s8
 ; GFX6-NEXT:    s_mov_b32 s7, s9
-; GFX6-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX6-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3207,7 +3207,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX8-NEXT:    s_mov_b32 s5, s7
 ; GFX8-NEXT:    s_mov_b32 s6, s8
 ; GFX8-NEXT:    s_mov_b32 s7, s9
-; GFX8-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX8-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX900-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3220,7 +3220,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX900-NEXT:    s_mov_b32 s5, s7
 ; GFX900-NEXT:    s_mov_b32 s6, s8
 ; GFX900-NEXT:    s_mov_b32 s7, s9
-; GFX900-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX900-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3233,7 +3233,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX90A-NEXT:    s_mov_b32 s5, s7
 ; GFX90A-NEXT:    s_mov_b32 s6, s8
 ; GFX90A-NEXT:    s_mov_b32 s7, s9
-; GFX90A-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX90A-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX10PLUS-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3246,7 +3246,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX10PLUS-NEXT:    s_mov_b32 s5, s7
 ; GFX10PLUS-NEXT:    s_mov_b32 s6, s8
 ; GFX10PLUS-NEXT:    s_mov_b32 s7, s9
-; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc
+; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
 ; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3259,7 +3259,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir
index 292fa4be1ca1d..a95b0ee434956 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir
@@ -25,6 +25,7 @@ body: |
     ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_si]].sub0
     ; GFX6-NEXT: $vgpr0 = COPY [[COPY3]]
     ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+    ;
     ; GFX8-LABEL: name: atomic_cmpswap_i32_1d
     ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
@@ -35,6 +36,7 @@ body: |
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_vi]].sub0
     ; GFX8-NEXT: $vgpr0 = COPY [[COPY3]]
     ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+    ;
     ; GFX10-LABEL: name: atomic_cmpswap_i32_1d
     ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
@@ -45,6 +47,7 @@ body: |
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10_]].sub0
     ; GFX10-NEXT: $vgpr0 = COPY [[COPY3]]
     ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+    ;
     ; GFX11-LABEL: name: atomic_cmpswap_i32_1d
     ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -55,6 +58,7 @@ body: |
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11_]].sub0
     ; GFX11-NEXT: $vgpr0 = COPY [[COPY3]]
     ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+    ;
     ; GFX12-LABEL: name: atomic_cmpswap_i32_1d
     ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
     ; GFX12-NEXT: {{  $}}
@@ -89,39 +93,43 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX6-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_si:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_si [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
+    ; GFX6-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_si:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_si [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
     ; GFX6-NEXT: S_ENDPGM 0
+    ;
     ; GFX8-LABEL: name: atomic_cmpswap_i32_1d_no_return
     ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX8-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V1_V1_vi:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
+    ; GFX8-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_vi:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
     ; GFX8-NEXT: S_ENDPGM 0
+    ;
     ; GFX10-LABEL: name: atomic_cmpswap_i32_1d_no_return
     ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
+    ; GFX10-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
     ; GFX10-NEXT: S_ENDPGM 0
+    ;
     ; GFX11-LABEL: name: atomic_cmpswap_i32_1d_no_return
     ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
+    ; GFX11-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
     ; GFX11-NEXT: S_ENDPGM 0
+    ;
     ; GFX12-LABEL: name: atomic_cmpswap_i32_1d_no_return
     ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX12-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx12_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
+    ; GFX12-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx12_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
     ; GFX12-NEXT: S_ENDPGM 0
     %0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     %1:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1
@@ -150,6 +158,7 @@ body: |
     ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_si]].sub0_sub1
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]]
     ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1
+    ;
     ; GFX8-LABEL: name: atomic_cmpswap_i64_1d
     ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
     ; GFX8-NEXT: {{  $}}
@@ -160,6 +169,7 @@ body: |
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_vi]].sub0_sub1
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]]
     ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1
+    ;
     ; GFX10-LABEL: name: atomic_cmpswap_i64_1d
     ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
     ; GFX10-NEXT: {{  $}}
@@ -170,6 +180,7 @@ body: |
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10_]].sub0_sub1
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]]
     ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1
+    ;
     ; GFX11-LABEL: name: atomic_cmpswap_i64_1d
     ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
     ; GFX11-NEXT: {{  $}}
@@ -180,6 +191,7 @@ body: |
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11_]].sub0_sub1
     ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]]
     ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1
+    ;
     ; GFX12-LABEL: name: atomic_cmpswap_i64_1d
     ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
     ; GFX12-NEXT: {{  $}}
@@ -214,39 +226,43 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
     ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX6-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_si:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_si [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
+    ; GFX6-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_si:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_si [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
     ; GFX6-NEXT: S_ENDPGM 0
+    ;
     ; GFX8-LABEL: name: atomic_cmpswap_i64_1d_no_return
     ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
     ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX8-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_vi:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
+    ; GFX8-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_vi:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
     ; GFX8-NEXT: S_ENDPGM 0
+    ;
     ; GFX10-LABEL: name: atomic_cmpswap_i64_1d_no_return
     ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX10-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
+    ; GFX10-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
     ; GFX10-NEXT: S_ENDPGM 0
+    ;
     ; GFX11-LABEL: name: atomic_cmpswap_i64_1d_no_return
     ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX11-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
+    ; GFX11-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
     ; GFX11-NEXT: S_ENDPGM 0
+    ;
     ; GFX12-LABEL: name: atomic_cmpswap_i64_1d_no_return
     ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
     ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX12-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
+    ; GFX12-NEXT: [[...
[truncated]

// Keep GLC only when the atomic's result is actually used.
if (BaseOpcode->Atomic && !Op.getValue(0).use_empty())
  CPol |= AMDGPU::CPol::GLC;

Contributor

Don't you need to also change the opcode to do this? The asm changes only change the bit and don't drop the output register.

Contributor Author

@harrisonGPU harrisonGPU Jul 27, 2025

Do you mean I need to change the opcode to an image atomic no-return type?
I haven’t seen such an instruction; I searched the programming guide and didn’t find one.
I’ve updated the patch to not drop the output register.
I believe we only need to set GLC to 0 when the result of the image atomic is unused, as the shader programming guide says:

Group Level Coherent - controls behavior of L0 cache. Atomics: 1 = return the memory value before the
atomic operation is performed. 0 = do not return anything.

I also noticed that flat atomics support GLC=0 without requiring an opcode change, so I assumed the same applies here.
What do you think? :-)
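
As an aside, a minimal sketch of the flat-atomic pattern referred to above (illustrative only, not part of this patch; the function name is made up): the atomicrmw result is unused, and the backend can already pick a no-return encoding for it.

; The result of the atomicrmw is never read, so a no-return flat/global
; atomic encoding can be selected.
define void @flat_atomic_add_nortn_sketch(ptr %p, i32 %v) {
  %old = atomicrmw add ptr %p, i32 %v syncscope("agent") monotonic
  ret void
}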

Contributor

@arsenm arsenm Jul 28, 2025

It wouldn't be in the programming guide; it would be in the tablegen instruction definitions corresponding to setting glc=0/1. If we want to modify the operand structure, there should be a separate MachineInstr opcode.

We do have separate RTN and no-RTN variants of the flat atomic pseudos

Contributor Author

Thanks, I will try to investigate. :-)


jayfoad commented Aug 19, 2025

Agree with Matt. Although the ISA opcode is the same, you should define separate MachineInstrs for the no-return forms which do not have the def operand for the returned value.

The current patch has two problems:

  1. The register allocator will still allocate a register for the unused result value, which is a bit wasteful, and could actually increase vgpr usage in some cases.
  2. These MachineInstrs still satisfy SIInstrInfo::isAtomicRet so SIInsertWaitcnts thinks that they increment VMCNT (aka LOADCNT) which will cause it to insert incorrect waitcnts in some cases.

@harrisonGPU
Contributor Author

Agree with Matt. Although the ISA opcode is the same, you should define separate MachineInstrs for the no-return forms which do not have the def operand for the returned value.

The current patch has two problems:

  1. The register allocator will still allocate a register for the unused result value, which is a bit wasteful, and could actually increase vgpr usage in some cases.
  2. These MachineInstrs still satisfy SIInstrInfo::isAtomicRet so SIInsertWaitcnts thinks that they increment VMCNT (aka LOADCNT) which will cause it to insert incorrect waitcnts in some cases.

Thanks, Jay. I agree with your point. I'm working on implementing no-return type image atomic intrinsics and instructions.

Even though current image atomics return a value, we usually avoid setting GLC when the result is unused, to avoid unnecessary cache traffic. Maybe we should think about merging it?

I'm happy to work on adding proper no-return variants, this is my first time writing tablegen. :-)


jayfoad commented Aug 19, 2025

I'm working on implementing no-return type image atomic intrinsics and instructions.

We don't need new intrinsics, only new MachineInstrs.

I'm happy to work on adding proper no-return variants, this is my first time writing tablegen. :-)

@sstipano has done a lot of work in MIMGInstructions.td recently and might be able to help if you need some help.


harrisonGPU commented Aug 19, 2025

I'm working on implementing no-return type image atomic intrinsics and instructions.

We don't need new intrinsics, only new MachineInstrs.

I'm happy to work on adding proper no-return variants, this is my first time writing tablegen. :-)

@sstipano has done a lot of work in MIMGInstructions.td recently and might be able to help if you need some help.

I think we may need a new intrinsic? For example, I’ve already implemented a draft that lowers:

call void @llvm.amdgcn.image.atomic.add.1d.nortn.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)

to

image_atomic_add off, v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm


arsenm commented Aug 19, 2025

I think we may need a new intrinsic? For example, I’ve already implemented a draft that lowers:

No, do not add a new intrinsic. Rely purely on use_empty of the result


harrisonGPU commented Aug 19, 2025

I think we may need a new intrinsic? For example, I’ve already implemented a draft that lowers:

No, do not add a new intrinsic. Rely purely on use_empty of the result

Thanks, Matt. So I need to create new MachineInstrs for image atomics if the result is unused? For example:

define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i32 %cmp, i32 %swap, i32 %s) {
  %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
  ret void
}

should be lowered to:

image_atomic_cmpswap v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm

Is that right?


arsenm commented Aug 19, 2025

Yes, this is how all of the flat and global cases are handled already

@harrisonGPU
Contributor Author

Yes, this is how all of the flat and global cases are handled already

Thanks, Matt. I will try to implement it! Do you think it makes sense to merge this PR? My thinking is that if the image atomic result is unused, we shouldn’t set GLC; that seems to better match the recommendations in the Shader Programmer’s Guide. :-)


arsenm commented Aug 19, 2025

Thanks, Matt. I will try to implement it! Do you think it makes sense to merge this PR?

Not without mutating the opcode to the no-return variant

@harrisonGPU
Contributor Author

Thanks, Matt. I will try to implement it! Do you think it makes sense to merge this PR?

Not without mutating the opcode to the no-return variant

Okay, thanks. :-)

@harrisonGPU harrisonGPU force-pushed the amdgpu/image-atomic-glc branch from 52cf45b to b21ad06 on September 9, 2025 12:59
@harrisonGPU harrisonGPU changed the title from "[AMDGPU] Avoid setting GLC for image atomics when result is unused" to "[AMDGPU] Support image atomic no return instructions" on Sep 9, 2025

github-actions bot commented Sep 9, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@harrisonGPU
Contributor Author

I’ve updated the patch to create new MachineInstrs for image atomics. If the result is unused, it now selects the no-return variant.
Please review it again when you have time, thanks! :-)

Contributor

@jayfoad jayfoad left a comment

The C++ parts look mostly OK to me.

I'd prefer if someone else could comment on the MIMGInstructions.td changes. It looks like they introduce a lot of duplication. @sstipano @mbrkusanin

Collaborator

@mbrkusanin mbrkusanin left a comment

While the entries in the new column for atomic instructions in ImageDimIntrinsicInfo are fine:

{ Intrinsic::amdgcn_image_atomic_add_1d, IMAGE_ATOMIC_ADD, IMAGE_ATOMIC_ADD_NORTN, ...

this now creates strange entries for existing sample opcodes that have _nortn versions

{ Intrinsic::amdgcn_image_sample_1d,       IMAGE_SAMPLE,       IMAGE_SAMPLE, ...
{ Intrinsic::amdgcn_image_sample_1d_nortn, IMAGE_SAMPLE_nortn, IMAGE_SAMPLE_nortn, ...

I understand that we are not introducing void @llvm.amdgcn.atomic.nortn.* intrinsics, so the way these are used is different and sample is not supposed to rely on it, but it can cause confusion.
Would it make more sense to add another field to MIMGInfoTable instead for whether it is a noreturn version or not? It would then be picked with getMIMGOpcode just like any other variant.


mbrkusanin commented Sep 19, 2025

The no-return image sample entries aren’t introduced by this PR. I already generate the tables separately:

foreach intr = AMDGPUImageDimIntrinsics in {
  def : ImageDimIntrinsicInfo<intr>;
}

foreach intr = AMDGPUImageDimAtomicIntrinsics in {
  def : ImageDimAtomicIntrinsicInfo<intr>;
}

This means image sample intrinsics and image atomic intrinsics are generated from different lists.

Right, but the ImageDimIntrinsicTable table is generated using the same base class, which is ImageDimIntrinsicInfo. The NoRetBaseOpcode column is added for all entries.

An argument for keeping the NoRetBaseOpcode field here would be that ImageDimIntrinsicTable is much shorter than MIMGInfoTable. In that case I would at least rename NoRetBaseOpcode to something like AtomicNoRetBaseOpcode to clarify that it is only meant for image_atomic_* opcodes.

Other tablegen changes look fine to me.

@harrisonGPU
Contributor Author

The no-return image sample entries aren’t introduced by this PR. I already generate the tables separately:

foreach intr = AMDGPUImageDimIntrinsics in {
  def : ImageDimIntrinsicInfo<intr>;
}

foreach intr = AMDGPUImageDimAtomicIntrinsics in {
  def : ImageDimAtomicIntrinsicInfo<intr>;
}

This means image sample intrinsics and image atomic intrinsics are generated from different lists.

Right, but the ImageDimIntrinsicTable table is generated using the same base class, which is ImageDimIntrinsicInfo. The NoRetBaseOpcode column is added for all entries.

An argument for keeping the NoRetBaseOpcode field here would be that ImageDimIntrinsicTable is much shorter than MIMGInfoTable. In that case I would at least rename NoRetBaseOpcode to something like AtomicNoRetBaseOpcode to clarify that it is only meant for image_atomic_* opcodes.

Other tablegen changes look fine to me.

Thanks, I have renamed it. :-)

Collaborator

@mbrkusanin mbrkusanin left a comment

LGTM for tablegen parts. Maybe @jayfoad has some further comments.

@harrisonGPU harrisonGPU force-pushed the amdgpu/image-atomic-glc branch from 213ff48 to 49ce884 on October 10, 2025 09:47
// that use the unpacked register layout, or need to repack the TFE result.

unsigned IntrOpcode = Intr->BaseOpcode;
// For image atomic: use no-return opcode if result is unused.
Contributor

You are going to do this later in AMDGPUInstructionSelector so I don't understand why you need to do it in AMDGPULegalizerInfo as well. I would expect it to be in one place or the other.

Contributor Author

So do you think it’s not necessary to do this? I just noticed there’s some code related to packed 16-bit image atomic instructions.

Contributor

My suggestion would be to do it in AMDGPUInstructionSelector, not in AMDGPULegalizerInfo. But I do not have a deep understanding of this code.

Contributor Author

Okay, my understanding is that SIISelLowering.cpp and AMDGPULegalizerInfo.cpp have the same logic for IsAtomicPacked16Bit. If I change SIISelLowering.cpp, I’ll need to update AMDGPULegalizerInfo.cpp at the same time. What do you think?

Collaborator

The only effect this has is that it changes the opcode from G_AMDGPU_INTRIN_IMAGE_LOAD to G_AMDGPU_INTRIN_IMAGE_LOAD_NORET, which is not appropriate since we do not have "noret" image atomic intrinsics (this patch does not add them) and the MachineInstr still has a return value at this point.

If you are not removing the def operand here, then you should not make any other change either. The real transformation happens in instruction selection (where we pick the _noret opcode, set the cpol bit, and remove the def operand).

Even the check below for the AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN and AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN opcodes is unnecessary, because *ATOMIC*_NORTN opcodes will not show up at this point in the legalizer.

Your SelectionDAG changes are only in SITargetLowering::lowerImage, which is instruction selection. So you can remove all changes from AMDGPULegalizerInfo.cpp to keep it consistent between SDag and GlobalISel.

Contributor Author

Thanks! I‘ve removed the related code. :-)

@harrisonGPU harrisonGPU force-pushed the amdgpu/image-atomic-glc branch from 8ef7769 to 4fcd071 on October 28, 2025 09:22
Collaborator

@mbrkusanin mbrkusanin left a comment

LGTM. Thanks

@harrisonGPU harrisonGPU merged commit d604ab6 into llvm:main Oct 29, 2025
10 checks passed
@harrisonGPU harrisonGPU deleted the amdgpu/image-atomic-glc branch October 29, 2025 02:42
aokblast pushed a commit to aokblast/llvm-project that referenced this pull request Oct 30, 2025
Add support for no-return variants of image atomic operations
(e.g. IMAGE_ATOMIC_ADD_NORTN, IMAGE_ATOMIC_CMPSWAP_NORTN). 
These variants are generated when the return value of the intrinsic is
unused, allowing the backend to select no return type instructions.
DEBADRIBASAK pushed a commit to DEBADRIBASAK/llvm-project that referenced this pull request Nov 3, 2025
Add support for no-return variants of image atomic operations
(e.g. IMAGE_ATOMIC_ADD_NORTN, IMAGE_ATOMIC_CMPSWAP_NORTN). 
These variants are generated when the return value of the intrinsic is
unused, allowing the backend to select no return type instructions.