1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -50,6 +50,7 @@ const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr);
 struct ImageDimIntrinsicInfo {
   unsigned Intr;
   unsigned BaseOpcode;
+  unsigned AtomicNoRetBaseOpcode;
   MIMGDim Dim;
 
   uint8_t NumOffsetArgs;
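Note on the new field: it gives each image-dim intrinsic a parallel base opcode to select when an atomic's result is dead. A minimal sketch of the convention the selector changes below rely on (InfoSketch and pickBaseOpcode are illustrative stand-ins, not LLVM code; the real table is TableGen-generated):

// Stand-in for the ImageDimIntrinsicInfo rows above. The assumed invariant:
// AtomicNoRetBaseOpcode equals BaseOpcode whenever no separate no-return
// form exists, so an inequality test doubles as "has a no-return twin".
struct InfoSketch {
  unsigned BaseOpcode;
  unsigned AtomicNoRetBaseOpcode;
};

// Hypothetical helper mirroring the logic both selectors add below.
unsigned pickBaseOpcode(const InfoSketch &Info, bool ResultIsDead) {
  if (Info.AtomicNoRetBaseOpcode != Info.BaseOpcode && ResultIsDead)
    return Info.AtomicNoRetBaseOpcode; // e.g. IMAGE_ATOMIC_ADD -> *_NORTN
  return Info.BaseOpcode;
}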
22 changes: 16 additions & 6 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2006,19 +2006,27 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
     MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
   MachineBasicBlock *MBB = MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
+  unsigned IntrOpcode = Intr->BaseOpcode;
+
+  // For image atomic: use no-return opcode if result is unused.
+  if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
+    Register ResultDef = MI.getOperand(0).getReg();
+    if (MRI->use_nodbg_empty(ResultDef))
+      IntrOpcode = Intr->AtomicNoRetBaseOpcode;
+  }
 
   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
-      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+      AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
 
   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
-  unsigned IntrOpcode = Intr->BaseOpcode;
   const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
   const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
 
   const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
 
-  Register VDataIn, VDataOut;
+  Register VDataIn = AMDGPU::NoRegister;
+  Register VDataOut = AMDGPU::NoRegister;
   LLT VDataTy;
   int NumVDataDwords = -1;
   bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
@@ -2049,7 +2057,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   unsigned DMaskLanes = 0;
 
   if (BaseOpcode->Atomic) {
-    VDataOut = MI.getOperand(0).getReg();
+    if (!BaseOpcode->NoReturn)
+      VDataOut = MI.getOperand(0).getReg();
     VDataIn = MI.getOperand(2).getReg();
     LLT Ty = MRI->getType(VDataIn);
 
@@ -2099,8 +2108,9 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
 
   unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
-  if (BaseOpcode->Atomic)
-    CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
+  // Keep GLC only when the atomic's result is actually used.
+  if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
+    CPol |= AMDGPU::CPol::GLC;
   if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
                AMDGPU::CPol::VOLATILE))
     return false;
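Note on the GLC change above: on these targets the GLC bit of an MIMG atomic is what requests the pre-operation value back from memory, so it only belongs on the returning form. A condensed sketch of the rule (the bit position and helper are illustrative; the real constants live in AMDGPU::CPol):

#include <cstdint>

constexpr uint32_t GLC = 1u << 0; // illustrative bit position only

// Mirrors the CPol handling above: request write-back of the old value
// only when the atomic actually defines a result.
uint32_t applyAtomicCPol(uint32_t CPol, bool IsAtomic, bool NoReturn) {
  if (IsAtomic && !NoReturn)
    return CPol | GLC; // returning atomic: hardware sends the old value back
  return CPol;         // no-return atomic (or non-atomic): GLC stays clear
}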
185 changes: 102 additions & 83 deletions llvm/lib/Target/AMDGPU/MIMGInstructions.td

Large diffs are not rendered by default.

37 changes: 28 additions & 9 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9134,16 +9134,23 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   SDLoc DL(Op);
   MachineFunction &MF = DAG.getMachineFunction();
   const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+  unsigned IntrOpcode = Intr->BaseOpcode;
+  // For image atomic: use no-return opcode if result is unused.
+  if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
+      !Op.getNode()->hasAnyUseOfValue(0))
+    IntrOpcode = Intr->AtomicNoRetBaseOpcode;
   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
-      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+      AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
-  unsigned IntrOpcode = Intr->BaseOpcode;
   bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
   bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
   bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
 
   SmallVector<EVT, 3> ResultTypes(Op->values());
+  SmallVector<EVT, 3> OrigResultTypes(Op->values());
+  if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
+    ResultTypes.erase(&ResultTypes[0]);
 
   bool IsD16 = false;
   bool IsG16 = false;
   bool IsA16 = false;
@@ -9162,8 +9169,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
     VData = Op.getOperand(2);
 
     IsAtomicPacked16Bit =
-        (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
-         Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
+        (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
+         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
+         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
+         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
 
     bool Is64Bit = VData.getValueSizeInBits() == 64;
     if (BaseOpcode->AtomicX2) {
@@ -9173,7 +9182,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
       if (Is64Bit)
        VData = DAG.getBitcast(MVT::v4i32, VData);
 
-      ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
+      if (!BaseOpcode->NoReturn)
+        ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
+
       DMask = Is64Bit ? 0xf : 0x3;
       NumVDataDwords = Is64Bit ? 4 : 2;
     } else {
@@ -9399,8 +9410,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   }
 
   unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
-  if (BaseOpcode->Atomic)
-    CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
+  // Keep GLC only when the atomic's result is actually used.
+  if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
+    CPol |= AMDGPU::CPol::GLC;
   if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
                AMDGPU::CPol::VOLATILE))
     return Op;
@@ -9512,13 +9524,20 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
     DAG.setNodeMemRefs(NewNode, {MemRef});
   }
 
+  if (BaseOpcode->NoReturn) {
+    if (BaseOpcode->Atomic)
+      return DAG.getMergeValues(
+          {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);
+
+    return SDValue(NewNode, 0);
+  }
+
   if (BaseOpcode->AtomicX2) {
     SmallVector<SDValue, 1> Elt;
     DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
     return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
   }
-  if (BaseOpcode->NoReturn)
-    return SDValue(NewNode, 0);
+
   return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
                            Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
                            NumVDataDwords, IsAtomicPacked16Bit, DL);
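Note on the result rewiring above: the original intrinsic node still advertises two results, (value, chain), while the selected no-return machine node produces only a chain, so lowering rebuilds the pair with poison in the dead value slot. A sketch of that shape as a standalone helper (a hypothetical refactoring of the code above, assuming the lowerImage context):

// Rebuild the results users of the original node expect once the machine
// node no longer defines a data value. Poison is legal in the value slot
// precisely because the no-return form is only chosen when nothing reads it.
static SDValue rebuildNoReturnResults(SelectionDAG &DAG, SDNode *NewNode,
                                      EVT OrigVT, const SDLoc &DL,
                                      bool IsAtomic) {
  if (IsAtomic) {
    SDValue DeadValue = DAG.getPOISON(OrigVT); // never read by any user
    SDValue Chain = SDValue(NewNode, 0);       // chain is now result 0
    return DAG.getMergeValues({DeadValue, Chain}, DL);
  }
  // Non-atomic no-return ops (e.g. image stores) never had a data result.
  return SDValue(NewNode, 0);
}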
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
@@ -1200,7 +1200,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX6-NEXT:    s_mov_b32 s5, s7
 ; GFX6-NEXT:    s_mov_b32 s6, s8
 ; GFX6-NEXT:    s_mov_b32 s7, s9
-; GFX6-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX6-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1213,7 +1213,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX8-NEXT:    s_mov_b32 s5, s7
 ; GFX8-NEXT:    s_mov_b32 s6, s8
 ; GFX8-NEXT:    s_mov_b32 s7, s9
-; GFX8-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX8-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX900-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1226,7 +1226,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX900-NEXT:    s_mov_b32 s5, s7
 ; GFX900-NEXT:    s_mov_b32 s6, s8
 ; GFX900-NEXT:    s_mov_b32 s7, s9
-; GFX900-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX900-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1239,7 +1239,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX90A-NEXT:    s_mov_b32 s5, s7
 ; GFX90A-NEXT:    s_mov_b32 s6, s8
 ; GFX90A-NEXT:    s_mov_b32 s7, s9
-; GFX90A-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX90A-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX10PLUS-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1252,7 +1252,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX10PLUS-NEXT:    s_mov_b32 s5, s7
 ; GFX10PLUS-NEXT:    s_mov_b32 s6, s8
 ; GFX10PLUS-NEXT:    s_mov_b32 s7, s9
-; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc
+; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm
 ; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1265,7 +1265,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3194,7 +3194,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX6-NEXT:    s_mov_b32 s5, s7
 ; GFX6-NEXT:    s_mov_b32 s6, s8
 ; GFX6-NEXT:    s_mov_b32 s7, s9
-; GFX6-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX6-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3207,7 +3207,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX8-NEXT:    s_mov_b32 s5, s7
 ; GFX8-NEXT:    s_mov_b32 s6, s8
 ; GFX8-NEXT:    s_mov_b32 s7, s9
-; GFX8-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX8-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX900-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3220,7 +3220,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX900-NEXT:    s_mov_b32 s5, s7
 ; GFX900-NEXT:    s_mov_b32 s6, s8
 ; GFX900-NEXT:    s_mov_b32 s7, s9
-; GFX900-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX900-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3233,7 +3233,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX90A-NEXT:    s_mov_b32 s5, s7
 ; GFX90A-NEXT:    s_mov_b32 s6, s8
 ; GFX90A-NEXT:    s_mov_b32 s7, s9
-; GFX90A-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX90A-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX10PLUS-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3246,7 +3246,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX10PLUS-NEXT:    s_mov_b32 s5, s7
 ; GFX10PLUS-NEXT:    s_mov_b32 s6, s8
 ; GFX10PLUS-NEXT:    s_mov_b32 s7, s9
-; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc
+; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
 ; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3259,7 +3259,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)