Skip to content

Commit 5f5b586

Browse files
committed
AMDGPU: Handle non-temporal loads and stores
Differential Revision: https://reviews.llvm.org/D36862 llvm-svn: 312729
1 parent 257132a commit 5f5b586

File tree

6 files changed

+586
-32
lines changed

6 files changed

+586
-32
lines changed

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 59 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -52,13 +52,15 @@ class SIMemOpInfo final {
5252
SyncScope::ID SSID = SyncScope::System;
5353
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
5454
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
55+
bool IsNonTemporal = false;
5556

5657
SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering)
5758
: SSID(SSID), Ordering(Ordering) {}
5859

5960
SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering,
60-
AtomicOrdering FailureOrdering)
61-
: SSID(SSID), Ordering(Ordering), FailureOrdering(FailureOrdering) {}
61+
AtomicOrdering FailureOrdering, bool IsNonTemporal = false)
62+
: SSID(SSID), Ordering(Ordering), FailureOrdering(FailureOrdering),
63+
IsNonTemporal(IsNonTemporal) {}
6264

6365
/// \returns Info constructed from \p MI, which has at least one machine
6466
/// memory operand.
@@ -81,6 +83,11 @@ class SIMemOpInfo final {
8183
AtomicOrdering getFailureOrdering() const {
8284
return FailureOrdering;
8385
}
86+
/// \returns True if memory access of the machine instruction used to
87+
/// create this SIMemOpInfo is non-temporal, false otherwise.
///
/// This is a plain accessor for the IsNonTemporal member. In the visible part
/// of constructFromMIWithMMO the flag starts out true and is cleared when an
/// inspected memory operand lacks MachineMemOperand::MONonTemporal; the other
/// constructors leave it at its default of false.
88+
bool isNonTemporal() const {
89+
return IsNonTemporal;
90+
}
8491

8592
/// \returns True if ordering constraint of the machine instruction used to
8693
/// create this SIMemOpInfo is unordered or higher, false otherwise.
@@ -130,6 +137,34 @@ class SIMemoryLegalizer final : public MachineFunctionPass {
130137
/// \brief List of atomic pseudo instructions.
131138
std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
132139

140+
/// \brief Sets named bit (BitName) to "true" if present in \p MI. Returns
141+
/// true if \p MI is modified, false otherwise.
///
/// \tparam BitName Operand name (e.g. AMDGPU::OpName::glc) that is looked up
/// on the opcode of \p MI via AMDGPU::getNamedOperandIdx.
/// \param MI Iterator to the instruction whose operand is updated in place.
/// \returns False when the lookup yields -1 or when the operand's immediate is
/// already non-zero; true only when the immediate was changed from 0 to 1.
142+
template <uint16_t BitName>
143+
bool enableNamedBit(const MachineBasicBlock::iterator &MI) const {
144+
int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
145+
// An index of -1 is treated as "this opcode has no such operand": nothing to
// change.
if (BitIdx == -1)
146+
return false;
147+
148+
MachineOperand &Bit = MI->getOperand(BitIdx);
149+
// The bit is already enabled; report "not modified" so callers do not count
// this as a change.
if (Bit.getImm() != 0)
150+
return false;
151+
152+
Bit.setImm(1);
153+
return true;
154+
}
155+
156+
/// \brief Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
157+
/// is modified, false otherwise.
///
/// Thin wrapper that forwards to enableNamedBit with AMDGPU::OpName::glc, so
/// an instruction whose glc operand is already set is reported as unmodified.
158+
bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
159+
return enableNamedBit<AMDGPU::OpName::glc>(MI);
160+
}
161+
162+
/// \brief Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
163+
/// is modified, false otherwise.
///
/// Thin wrapper that forwards to enableNamedBit with AMDGPU::OpName::slc, so
/// an instruction whose slc operand is already set is reported as unmodified.
164+
bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
165+
return enableNamedBit<AMDGPU::OpName::slc>(MI);
166+
}
167+
133168
/// \brief Inserts "buffer_wbinvl1_vol" instruction \p Before or after \p MI.
134169
/// Always returns true.
135170
bool insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI,
@@ -139,10 +174,6 @@ class SIMemoryLegalizer final : public MachineFunctionPass {
139174
bool insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
140175
bool Before = true) const;
141176

142-
/// \brief Sets GLC bit if present in \p MI. Returns true if \p MI is
143-
/// modified, false otherwise.
144-
bool setGLC(const MachineBasicBlock::iterator &MI) const;
145-
146177
/// \brief Removes all processed atomic pseudo instructions from the current
147178
/// function. Returns true if current function is modified, false otherwise.
148179
bool removeAtomicPseudoMIs();
@@ -199,6 +230,7 @@ Optional<SIMemOpInfo> SIMemOpInfo::constructFromMIWithMMO(
199230
SyncScope::ID SSID = SyncScope::SingleThread;
200231
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
201232
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
233+
bool IsNonTemporal = true;
202234

203235
// Validator should check whether or not MMOs cover the entire set of
204236
// locations accessed by the memory instruction.
@@ -217,9 +249,12 @@ Optional<SIMemOpInfo> SIMemOpInfo::constructFromMIWithMMO(
217249
FailureOrdering =
218250
isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
219251
FailureOrdering : MMO->getFailureOrdering();
252+
253+
if (!(MMO->getFlags() & MachineMemOperand::MONonTemporal))
254+
IsNonTemporal = false;
220255
}
221256

222-
return SIMemOpInfo(SSID, Ordering, FailureOrdering);
257+
return SIMemOpInfo(SSID, Ordering, FailureOrdering, IsNonTemporal);
223258
}
224259

225260
/* static */
@@ -343,19 +378,6 @@ bool SIMemoryLegalizer::insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
343378
return true;
344379
}
345380

346-
bool SIMemoryLegalizer::setGLC(const MachineBasicBlock::iterator &MI) const {
347-
int GLCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::glc);
348-
if (GLCIdx == -1)
349-
return false;
350-
351-
MachineOperand &GLC = MI->getOperand(GLCIdx);
352-
if (GLC.getImm() == 1)
353-
return false;
354-
355-
GLC.setImm(1);
356-
return true;
357-
}
358-
359381
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
360382
if (AtomicPseudoMIs.empty())
361383
return false;
@@ -378,7 +400,7 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
378400
MOI.getSSID() == MMI->getAgentSSID()) {
379401
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
380402
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
381-
Changed |= setGLC(MI);
403+
Changed |= enableGLCBit(MI);
382404

383405
if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
384406
Changed |= insertWaitcntVmcnt0(MI);
@@ -401,6 +423,13 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
401423
llvm_unreachable("Unsupported synchronization scope");
402424
}
403425

426+
// Atomic instructions do not have the nontemporal attribute.
427+
if (MOI.isNonTemporal()) {
428+
Changed |= enableGLCBit(MI);
429+
Changed |= enableSLCBit(MI);
430+
return Changed;
431+
}
432+
404433
return Changed;
405434
}
406435

@@ -429,6 +458,13 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
429458
llvm_unreachable("Unsupported synchronization scope");
430459
}
431460

461+
// Atomic instructions do not have the nontemporal attribute.
462+
if (MOI.isNonTemporal()) {
463+
Changed |= enableGLCBit(MI);
464+
Changed |= enableSLCBit(MI);
465+
return Changed;
466+
}
467+
432468
return Changed;
433469
}
434470

@@ -499,7 +535,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchg(const SIMemOpInfo &MOI,
499535
if (MOI.getSSID() == SyncScope::SingleThread ||
500536
MOI.getSSID() == MMI->getWorkgroupSSID() ||
501537
MOI.getSSID() == MMI->getWavefrontSSID()) {
502-
Changed |= setGLC(MI);
538+
Changed |= enableGLCBit(MI);
503539
return Changed;
504540
}
505541

@@ -536,7 +572,7 @@ bool SIMemoryLegalizer::expandAtomicRmw(const SIMemOpInfo &MOI,
536572
if (MOI.getSSID() == SyncScope::SingleThread ||
537573
MOI.getSSID() == MMI->getWorkgroupSSID() ||
538574
MOI.getSSID() == MMI->getWavefrontSSID()) {
539-
Changed |= setGLC(MI);
575+
Changed |= enableGLCBit(MI);
540576
return Changed;
541577
}
542578

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s
3+
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s
4+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s
5+
6+
declare i32 @llvm.amdgcn.workitem.id.x()
7+
8+
; GCN-LABEL: {{^}}nontemporal_load_private_0
9+
; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
10+
define amdgpu_kernel void @nontemporal_load_private_0(
11+
i32* %in, i32 addrspace(4)* %out) {
12+
entry:
13+
%val = load i32, i32* %in, align 4, !nontemporal !0
14+
store i32 %val, i32 addrspace(4)* %out
15+
ret void
16+
}
17+
18+
; GCN-LABEL: {{^}}nontemporal_load_private_1
19+
; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
20+
define amdgpu_kernel void @nontemporal_load_private_1(
21+
i32* %in, i32 addrspace(4)* %out) {
22+
entry:
23+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
24+
%val.gep = getelementptr inbounds i32, i32* %in, i32 %tid
25+
%val = load i32, i32* %val.gep, align 4, !nontemporal !0
26+
store i32 %val, i32 addrspace(4)* %out
27+
ret void
28+
}
29+
30+
; GCN-LABEL: {{^}}nontemporal_load_global_0
31+
; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0x0{{$}}
32+
define amdgpu_kernel void @nontemporal_load_global_0(
33+
i32 addrspace(1)* %in, i32 addrspace(4)* %out) {
34+
entry:
35+
%val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0
36+
store i32 %val, i32 addrspace(4)* %out
37+
ret void
38+
}
39+
40+
; GCN-LABEL: {{^}}nontemporal_load_global_1
41+
; GFX8: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
42+
; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off glc slc{{$}}
43+
define amdgpu_kernel void @nontemporal_load_global_1(
44+
i32 addrspace(1)* %in, i32 addrspace(4)* %out) {
45+
entry:
46+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
47+
%val.gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
48+
%val = load i32, i32 addrspace(1)* %val.gep, align 4, !nontemporal !0
49+
store i32 %val, i32 addrspace(4)* %out
50+
ret void
51+
}
52+
53+
; GCN-LABEL: {{^}}nontemporal_load_local_0
54+
; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
55+
define amdgpu_kernel void @nontemporal_load_local_0(
56+
i32 addrspace(3)* %in, i32 addrspace(4)* %out) {
57+
entry:
58+
%val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0
59+
store i32 %val, i32 addrspace(4)* %out
60+
ret void
61+
}
62+
63+
; GCN-LABEL: {{^}}nontemporal_load_local_1
64+
; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
65+
define amdgpu_kernel void @nontemporal_load_local_1(
66+
i32 addrspace(3)* %in, i32 addrspace(4)* %out) {
67+
entry:
68+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
69+
%val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid
70+
%val = load i32, i32 addrspace(3)* %val.gep, align 4, !nontemporal !0
71+
store i32 %val, i32 addrspace(4)* %out
72+
ret void
73+
}
74+
75+
; GCN-LABEL: {{^}}nontemporal_load_flat_0
76+
; GCN: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
77+
define amdgpu_kernel void @nontemporal_load_flat_0(
78+
i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
79+
entry:
80+
%val = load i32, i32 addrspace(4)* %in, align 4, !nontemporal !0
81+
store i32 %val, i32 addrspace(4)* %out
82+
ret void
83+
}
84+
85+
; GCN-LABEL: {{^}}nontemporal_load_flat_1
86+
; GCN: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
87+
define amdgpu_kernel void @nontemporal_load_flat_1(
88+
i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
89+
entry:
90+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
91+
%val.gep = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tid
92+
%val = load i32, i32 addrspace(4)* %val.gep, align 4, !nontemporal !0
93+
store i32 %val, i32 addrspace(4)* %out
94+
ret void
95+
}
96+
97+
!0 = !{i32 1}
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s
3+
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s
4+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s
5+
6+
declare i32 @llvm.amdgcn.workitem.id.x()
7+
8+
; GCN-LABEL: {{^}}nontemporal_store_private_0
9+
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
10+
define amdgpu_kernel void @nontemporal_store_private_0(
11+
i32 addrspace(4)* %in, i32* %out) {
12+
entry:
13+
%val = load i32, i32 addrspace(4)* %in, align 4
14+
store i32 %val, i32* %out, !nontemporal !0
15+
ret void
16+
}
17+
18+
; GCN-LABEL: {{^}}nontemporal_store_private_1
19+
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
20+
define amdgpu_kernel void @nontemporal_store_private_1(
21+
i32 addrspace(4)* %in, i32* %out) {
22+
entry:
23+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
24+
%val = load i32, i32 addrspace(4)* %in, align 4
25+
%out.gep = getelementptr inbounds i32, i32* %out, i32 %tid
26+
store i32 %val, i32* %out.gep, !nontemporal !0
27+
ret void
28+
}
29+
30+
; GCN-LABEL: {{^}}nontemporal_store_global_0
31+
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc slc{{$}}
32+
define amdgpu_kernel void @nontemporal_store_global_0(
33+
i32 addrspace(4)* %in, i32 addrspace(1)* %out) {
34+
entry:
35+
%val = load i32, i32 addrspace(4)* %in, align 4
36+
store i32 %val, i32 addrspace(1)* %out, !nontemporal !0
37+
ret void
38+
}
39+
40+
; GCN-LABEL: {{^}}nontemporal_store_global_1
41+
; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
42+
; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}}
43+
define amdgpu_kernel void @nontemporal_store_global_1(
44+
i32 addrspace(4)* %in, i32 addrspace(1)* %out) {
45+
entry:
46+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
47+
%val = load i32, i32 addrspace(4)* %in, align 4
48+
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
49+
store i32 %val, i32 addrspace(1)* %out.gep, !nontemporal !0
50+
ret void
51+
}
52+
53+
; GCN-LABEL: {{^}}nontemporal_store_local_0
54+
; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
55+
define amdgpu_kernel void @nontemporal_store_local_0(
56+
i32 addrspace(4)* %in, i32 addrspace(3)* %out) {
57+
entry:
58+
%val = load i32, i32 addrspace(4)* %in, align 4
59+
store i32 %val, i32 addrspace(3)* %out, !nontemporal !0
60+
ret void
61+
}
62+
63+
; GCN-LABEL: {{^}}nontemporal_store_local_1
64+
; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
65+
define amdgpu_kernel void @nontemporal_store_local_1(
66+
i32 addrspace(4)* %in, i32 addrspace(3)* %out) {
67+
entry:
68+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
69+
%val = load i32, i32 addrspace(4)* %in, align 4
70+
%out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid
71+
store i32 %val, i32 addrspace(3)* %out.gep, !nontemporal !0
72+
ret void
73+
}
74+
75+
; GCN-LABEL: {{^}}nontemporal_store_flat_0
76+
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
77+
define amdgpu_kernel void @nontemporal_store_flat_0(
78+
i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
79+
entry:
80+
%val = load i32, i32 addrspace(4)* %in, align 4
81+
store i32 %val, i32 addrspace(4)* %out, !nontemporal !0
82+
ret void
83+
}
84+
85+
; GCN-LABEL: {{^}}nontemporal_store_flat_1
86+
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
87+
define amdgpu_kernel void @nontemporal_store_flat_1(
88+
i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
89+
entry:
90+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
91+
%val = load i32, i32 addrspace(4)* %in, align 4
92+
%out.gep = getelementptr inbounds i32, i32 addrspace(4)* %out, i32 %tid
93+
store i32 %val, i32 addrspace(4)* %out.gep, !nontemporal !0
94+
ret void
95+
}
96+
97+
!0 = !{i32 1}

0 commit comments

Comments
 (0)