Skip to content

Commit b838f0e

Browse files
Shahab Layeghiagainull
authored andcommitted
[SYCL] Use a global flush buffer in stream
Change stream implementation to use a global buffer for all the work-items to flush their outputs into. Previously a set of local flush buffers were used for this purpose. This change is made to avoid the issue of some devices running out of local memory when available space is less than statement size times the number of work-items. Signed-off-by: Shahab Layeghi <[email protected]>
1 parent 57bad9e commit b838f0e

File tree

11 files changed

+129
-134
lines changed

11 files changed

+129
-134
lines changed

sycl/include/CL/sycl/detail/accessor_impl.hpp

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,8 @@ class __SYCL_EXPORT AccessorImplHost {
7979
MElemSize(Other.MElemSize), MOffsetInBytes(Other.MOffsetInBytes),
8080
MIsSubBuffer(Other.MIsSubBuffer) {}
8181

82+
void resize(size_t GlobalSize);
83+
8284
id<3> MOffset;
8385
// The size of accessing region.
8486
range<3> MAccessRange;
@@ -96,6 +98,8 @@ class __SYCL_EXPORT AccessorImplHost {
9698
void *MData = nullptr;
9799

98100
Command *MBlockedCmd = nullptr;
101+
102+
bool PerWI = false;
99103
};
100104

101105
using AccessorImplPtr = shared_ptr_class<AccessorImplHost>;
@@ -139,23 +143,6 @@ class __SYCL_EXPORT LocalAccessorImplHost {
139143
int MDims;
140144
int MElemSize;
141145
std::vector<char> MMem;
142-
143-
bool PerWI = false;
144-
size_t LocalMemSize;
145-
size_t MaxWGSize;
146-
void resize(size_t LocalSize, size_t GlobalSize) {
147-
if (GlobalSize != 1 && LocalSize != 1) {
148-
// If local size is not specified then work group size is chosen by
149-
// runtime. That is why try to allocate based on max work group size or
150-
// global size. In the worst case allocate 80% of local memory.
151-
size_t MinEstWGSize = LocalSize ? LocalSize : GlobalSize;
152-
MinEstWGSize = MinEstWGSize > MaxWGSize ? MaxWGSize : MinEstWGSize;
153-
size_t NewSize = MinEstWGSize * MSize[0];
154-
MSize[0] =
155-
NewSize > 8 * LocalMemSize / 10 ? 8 * LocalMemSize / 10 : NewSize;
156-
MMem.resize(NewSize * MElemSize);
157-
}
158-
}
159146
};
160147

161148
using LocalAccessorImplPtr = shared_ptr_class<LocalAccessorImplHost>;

sycl/include/CL/sycl/detail/buffer_impl.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ class __SYCL_EXPORT buffer_impl final : public SYCLMemObjT {
113113
} catch (...) {
114114
}
115115
}
116+
117+
void resize(size_t size) { BaseT::MSizeInBytes = size; }
116118
};
117119

118120
} // namespace detail

sycl/include/CL/sycl/detail/stream_impl.hpp

Lines changed: 35 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -89,16 +89,6 @@ class __SYCL_EXPORT stream_impl {
8989
cl::sycl::access::target::global_buffer,
9090
cl::sycl::access::placeholder::false_t>;
9191

92-
using FlushBufAccessorT =
93-
accessor<char, 1, cl::sycl::access::mode::read_write,
94-
cl::sycl::access::target::local,
95-
cl::sycl::access::placeholder::false_t>;
96-
97-
using LocalOffsetAccessorT =
98-
accessor<unsigned, 1, cl::sycl::access::mode::atomic,
99-
cl::sycl::access::target::local,
100-
cl::sycl::access::placeholder::false_t>;
101-
10292
stream_impl(size_t BufferSize, size_t MaxStatementSize, handler &CGH);
10393

10494
// Method to provide an access to the global stream buffer
@@ -107,6 +97,12 @@ class __SYCL_EXPORT stream_impl {
10797
CGH, range<1>(BufferSize_), id<1>(OffsetSize));
10898
}
10999

100+
// Method to provide an accessor to the global flush buffer
101+
GlobalBufAccessorT accessGlobalFlushBuf(handler &CGH) {
102+
return FlushBuf.get_access<cl::sycl::access::mode::read_write>(
103+
CGH, range<1>(MaxStatementSize_), id<1>(0));
104+
}
105+
110106
// Method to provide an atomic access to the offset in the global stream
111107
// buffer
112108
GlobalOffsetAccessorT accessGlobalOffset(handler &CGH) {
@@ -116,12 +112,6 @@ class __SYCL_EXPORT stream_impl {
116112
CGH, range<1>(1), id<1>(0));
117113
}
118114

119-
// Method to provide an atomic access to the flush buffer size
120-
GlobalOffsetAccessorT accessFlushBufferSize(handler &CGH) {
121-
return FlushBufferSize.get_access<cl::sycl::access::mode::atomic>(
122-
CGH, range<1>(1), id<1>(0));
123-
}
124-
125115
// Copy stream buffer to the host and print the contents
126116
void flush();
127117

@@ -148,8 +138,8 @@ class __SYCL_EXPORT stream_impl {
148138
// Stream buffer
149139
buffer<char, 1> Buf;
150140

151-
// Buffer for flush buffer size
152-
buffer<unsigned, 1> FlushBufferSize;
141+
// Global flush buffer
142+
buffer<char, 1> FlushBuf;
153143
};
154144

155145
template <typename T>
@@ -309,35 +299,34 @@ inline bool updateOffset(stream_impl::GlobalOffsetAccessorT &GlobalOffset,
309299

310300
inline void flushBuffer(stream_impl::GlobalOffsetAccessorT &GlobalOffset,
311301
stream_impl::GlobalBufAccessorT &GlobalBuf,
312-
stream_impl::FlushBufAccessorT &FlushBufs,
302+
stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
313303
unsigned &WIOffset, unsigned &Offset) {
314-
// Copy data from flush buffer (local memory) to global buffer (global
315-
// memory)
304+
316305
unsigned Cur = 0;
317306
if (!updateOffset(GlobalOffset, GlobalBuf, Offset, Cur))
318307
return;
319308

320309
for (unsigned I = WIOffset; I < WIOffset + Offset; I++) {
321-
GlobalBuf[Cur++] = FlushBufs[I];
310+
GlobalBuf[Cur++] = GlobalFlushBuf[I];
322311
}
323312
// Reset the offset in the flush buffer
324313
Offset = 0;
325314
}
326315

327-
inline void write(stream_impl::FlushBufAccessorT &FlushBufs,
316+
inline void write(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
328317
size_t FlushBufferSize, unsigned WIOffset, unsigned &Offset,
329318
const char *Str, unsigned Len, unsigned Padding = 0) {
330319
if ((FlushBufferSize - Offset < Len + Padding) ||
331-
(WIOffset + Offset + Len + Padding > FlushBufs.get_count()))
320+
(WIOffset + Offset + Len + Padding > GlobalFlushBuf.get_count()))
332321
// TODO: flush here
333322
return;
334323

335324
// Write padding
336325
for (size_t I = 0; I < Padding; ++I, ++Offset)
337-
FlushBufs[WIOffset + Offset] = ' ';
326+
GlobalFlushBuf[WIOffset + Offset] = ' ';
338327

339328
for (size_t I = 0; I < Len; ++I, ++Offset) {
340-
FlushBufs[WIOffset + Offset] = Str[I];
329+
GlobalFlushBuf[WIOffset + Offset] = Str[I];
341330
}
342331
}
343332

@@ -475,25 +464,25 @@ ScalarToStr(const T &Val, char *Buf, unsigned Flags, int, int Precision = -1) {
475464

476465
template <typename T>
477466
inline typename std::enable_if<std::is_integral<T>::value>::type
478-
writeIntegral(stream_impl::FlushBufAccessorT &FlushBufs, size_t FlushBufferSize,
479-
unsigned WIOffset, unsigned &Offset, unsigned Flags, int Width,
480-
const T &Val) {
467+
writeIntegral(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
468+
size_t FlushBufferSize, unsigned WIOffset, unsigned &Offset,
469+
unsigned Flags, int Width, const T &Val) {
481470
char Digits[MAX_INTEGRAL_DIGITS] = {0};
482471
unsigned Len = ScalarToStr(Val, Digits, Flags, Width);
483-
write(FlushBufs, FlushBufferSize, WIOffset, Offset, Digits, Len,
472+
write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Digits, Len,
484473
(Width > 0 && static_cast<unsigned>(Width) > Len)
485474
? static_cast<unsigned>(Width) - Len
486475
: 0);
487476
}
488477

489478
template <typename T>
490479
inline EnableIfFP<T>
491-
writeFloatingPoint(stream_impl::FlushBufAccessorT &FlushBufs,
480+
writeFloatingPoint(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
492481
size_t FlushBufferSize, unsigned WIOffset, unsigned &Offset,
493482
unsigned Flags, int Width, int Precision, const T &Val) {
494483
char Digits[MAX_FLOATING_POINT_DIGITS] = {0};
495484
unsigned Len = ScalarToStr(Val, Digits, Flags, Width, Precision);
496-
write(FlushBufs, FlushBufferSize, WIOffset, Offset, Digits, Len,
485+
write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Digits, Len,
497486
(Width > 0 && static_cast<unsigned>(Width) > Len)
498487
? static_cast<unsigned>(Width) - Len
499488
: 0);
@@ -531,7 +520,7 @@ VecToStr(const vec<T, VecLength> &Vec, char *VecStr, unsigned Flags, int Width,
531520
}
532521

533522
template <typename T, int VecLength>
534-
inline void writeVec(stream_impl::FlushBufAccessorT &FlushBufs,
523+
inline void writeVec(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
535524
size_t FlushBufferSize, unsigned WIOffset,
536525
unsigned &Offset, unsigned Flags, int Width, int Precision,
537526
const vec<T, VecLength> &Vec) {
@@ -540,7 +529,7 @@ inline void writeVec(stream_impl::FlushBufAccessorT &FlushBufs,
540529
MAX_FLOATING_POINT_DIGITS * VecLength + (VecLength - 1) * 2;
541530
char VecStr[MAX_VEC_SIZE] = {0};
542531
unsigned Len = VecToStr<T, VecLength>(Vec, VecStr, Flags, Width, Precision);
543-
write(FlushBufs, FlushBufferSize, WIOffset, Offset, VecStr, Len,
532+
write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, VecStr, Len,
544533
(Width > 0 && Width > Len) ? Width - Len : 0);
545534
}
546535

@@ -561,16 +550,16 @@ inline unsigned ArrayToStr(char *Buf, const array<ArrayLength> &Arr) {
561550
}
562551

563552
template <int ArrayLength>
564-
inline void writeArray(stream_impl::FlushBufAccessorT &FlushBufs,
553+
inline void writeArray(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
565554
size_t FlushBufferSize, unsigned WIOffset,
566555
unsigned &Offset, const array<ArrayLength> &Arr) {
567556
char Buf[MAX_ARRAY_SIZE];
568557
unsigned Len = ArrayToStr(Buf, Arr);
569-
write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len);
558+
write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
570559
}
571560

572561
template <int Dimensions>
573-
inline void writeItem(stream_impl::FlushBufAccessorT &FlushBufs,
562+
inline void writeItem(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
574563
size_t FlushBufferSize, unsigned WIOffset,
575564
unsigned &Offset, const item<Dimensions> &Item) {
576565
// Reserve space for 3 arrays and additional place (40 symbols) for printing
@@ -585,11 +574,11 @@ inline void writeItem(stream_impl::FlushBufAccessorT &FlushBufs,
585574
Len += append(Buf + Len, ", offset: ");
586575
Len += ArrayToStr(Buf + Len, Item.get_offset());
587576
Buf[Len++] = ')';
588-
write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len);
577+
write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
589578
}
590579

591580
template <int Dimensions>
592-
inline void writeNDRange(stream_impl::FlushBufAccessorT &FlushBufs,
581+
inline void writeNDRange(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
593582
size_t FlushBufferSize, unsigned WIOffset,
594583
unsigned &Offset,
595584
const nd_range<Dimensions> &ND_Range) {
@@ -605,11 +594,11 @@ inline void writeNDRange(stream_impl::FlushBufAccessorT &FlushBufs,
605594
Len += append(Buf + Len, ", offset: ");
606595
Len += ArrayToStr(Buf + Len, ND_Range.get_offset());
607596
Buf[Len++] = ')';
608-
write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len);
597+
write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
609598
}
610599

611600
template <int Dimensions>
612-
inline void writeNDItem(stream_impl::FlushBufAccessorT &FlushBufs,
601+
inline void writeNDItem(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
613602
size_t FlushBufferSize, unsigned WIOffset,
614603
unsigned &Offset, const nd_item<Dimensions> &ND_Item) {
615604
// Reserve space for 2 arrays and additional place (40 symbols) for printing
@@ -622,11 +611,11 @@ inline void writeNDItem(stream_impl::FlushBufAccessorT &FlushBufs,
622611
Len += append(Buf + Len, ", local_id: ");
623612
Len += ArrayToStr(Buf + Len, ND_Item.get_local_id());
624613
Buf[Len++] = ')';
625-
write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len);
614+
write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
626615
}
627616

628617
template <int Dimensions>
629-
inline void writeGroup(stream_impl::FlushBufAccessorT &FlushBufs,
618+
inline void writeGroup(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
630619
size_t FlushBufferSize, unsigned WIOffset,
631620
unsigned &Offset, const group<Dimensions> &Group) {
632621
// Reserve space for 4 arrays and additional place (60 symbols) for printing
@@ -643,7 +632,7 @@ inline void writeGroup(stream_impl::FlushBufAccessorT &FlushBufs,
643632
Len += append(Buf + Len, ", group_range: ");
644633
Len += ArrayToStr(Buf + Len, Group.get_group_range());
645634
Buf[Len++] = ')';
646-
write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len);
635+
write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
647636
}
648637

649638
// Space for 2 arrays and additional place (20 symbols) for printing
@@ -663,7 +652,7 @@ inline unsigned ItemToStr(char *Buf, const item<Dimensions, false> &Item) {
663652
}
664653

665654
template <int Dimensions>
666-
inline void writeHItem(stream_impl::FlushBufAccessorT &FlushBufs,
655+
inline void writeHItem(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
667656
size_t FlushBufferSize, unsigned WIOffset,
668657
unsigned &Offset, const h_item<Dimensions> &HItem) {
669658
// Reserve space for 3 items and additional place (60 symbols) for printing
@@ -680,7 +669,7 @@ inline void writeHItem(stream_impl::FlushBufAccessorT &FlushBufs,
680669
: HItem.get_physical_local());
681670
}
682671
Len += append(Buf + Len, "\n)");
683-
write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len);
672+
write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
684673
}
685674

686675
} // namespace detail

0 commit comments

Comments
 (0)