
Commit 14a56cc

cuda: fix bug in histogram kernels when source memory is not aligned to 4 bytes
1 parent 298fc7b commit 14a56cc
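
The affected path is hit whenever the CUDA histogram functions are given a view into a larger GpuMat whose left edge does not fall on a 4-byte boundary. A minimal sketch of such a call, assuming a single-channel 8-bit image; the names and sizes are illustrative and not taken from this commit:

// Illustrative repro sketch: calcHist on a ROI whose x offset is not a
// multiple of 4, so the row pointers handed to the kernels are misaligned.
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaimgproc.hpp>

int main()
{
    cv::cuda::GpuMat image(480, 640, CV_8UC1, cv::Scalar(7));
    // A view starting at x = 1: its data pointer is 1 byte past a 4-byte boundary.
    cv::cuda::GpuMat roi = image(cv::Rect(1, 0, 320, 240));

    cv::cuda::GpuMat hist;
    cv::cuda::calcHist(roi, hist); // the unaligned case this commit fixes
    return 0;
}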

File tree

3 files changed: 157 additions, 96 deletions

modules/cudaimgproc/src/cuda/hist.cu

Lines changed: 91 additions & 70 deletions
@@ -52,38 +52,42 @@ using namespace cv::cuda::device;
 
 namespace hist
 {
-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
+    template<bool fourByteAligned>
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist, const int offsetX = 0)
     {
         __shared__ int shist[256];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         shist[tid] = 0;
         __syncthreads();
 
-        if (y < rows)
-        {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
-
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
+        if (y < rows) {
+            const uchar* rowPtr = &src[y * step];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++)
+                    Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
+            }
 
-                Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
-                Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
+                Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
                 Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
                 Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
             }
 
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    Emulation::smem::atomicAdd(&shist[data], 1);
-                }
+            // load uncoalesced tail
+            //const bool tail = fourByteAligned ? cols % 4 : (cols + offsetX) % 4;
+            if (threadIdx.x == 0) {//}&& tail) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++)
+                    Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
             }
         }
 
@@ -94,61 +98,70 @@ namespace hist
         ::atomicAdd(hist + tid, histVal);
     }
 
-    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
+    void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
-
-        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
+        if(offsetX)
+            histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
+        else
+            histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
        cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
 
-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist)
+    template<bool fourByteAligned>
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist, const int offsetX = 0)
     {
         __shared__ int shist[256];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         shist[tid] = 0;
         __syncthreads();
 
         if (y < rows)
         {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * srcStep);
-            const unsigned int* maskRowPtr = (const unsigned int*) (mask + y * maskStep);
+            const uchar* rowPtr = &src[y * srcStep];
+            const uchar* maskRowPtr = &mask[y * maskStep];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++) {
+                    if (maskRowPtr[x])
+                        Emulation::smem::atomicAdd(&shist[rowPtr[x]], 1);
+                }
+            }
 
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
-                unsigned int m = maskRowPtr[x];
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * srcStep] : &src[alignedOffset + y * maskStep]);
+            const unsigned int* maskRowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &mask[y * maskStep] : &mask[alignedOffset + y * maskStep]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                const unsigned int m = maskRowPtrIntAligned[x];
 
-                if ((m >> 0) & 0xFFU)
-                    Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
+                if ((m >> 0) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
 
-                if ((m >> 8) & 0xFFU)
-                    Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
+                if ((m >> 8) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
 
-                if ((m >> 16) & 0xFFU)
+                if ((m >> 16) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
 
-                if ((m >> 24) & 0xFFU)
+                if ((m >> 24) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
             }
 
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    unsigned int m = ((const uchar*)maskRowPtr)[x];
-
-                    if (m)
-                        Emulation::smem::atomicAdd(&shist[data], 1);
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++) {
+                    if (maskRowPtr[x])
+                        Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
                 }
             }
         }
@@ -160,12 +173,15 @@ namespace hist
         ::atomicAdd(hist + tid, histVal);
     }
 
-    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream)
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
 
-        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist);
+        if(offsetX)
+            histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
+        else
+            histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
@@ -186,42 +202,44 @@ namespace hist
         }
     }
 
-    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols,
-                               int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
+    template<bool fourByteAligned>
+    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols, int* hist, const int binCount, const int binSize,
+        const int lowerLevel, const int upperLevel, const int offsetX)
     {
         extern __shared__ int shist[];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         if (tid < binCount)
             shist[tid] = 0;
-
         __syncthreads();
 
         if (y < rows)
         {
-            const uchar* rowPtr = src + y * step;
-            const uint* rowPtr4 = (uint*) rowPtr;
-
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                const uint data = rowPtr4[x];
+            const uchar* rowPtr = &src[y * step];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++)
+                    histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
+            }
 
-                histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
-                histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
                 histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
                 histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
             }
 
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    const uchar data = rowPtr[x];
-                    histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
-                }
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++)
+                    histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
             }
         }
 
@@ -236,7 +254,7 @@ namespace hist
        }
     }
 
-    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
@@ -245,7 +263,10 @@ namespace hist
 
         const size_t smem_size = binCount * sizeof(int);
 
-        histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
+        if(offsetX)
+            histEven8u<false><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
+        else
+            histEven8u<true><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
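
The kernels above now split each row into three ranges: an uncoalesced head of up to 3 bytes before the first 4-byte boundary, a coalesced body of whole 32-bit words, and an uncoalesced tail. The same index arithmetic, rewritten as a host-side sketch for clarity (hypothetical helper, not part of the patch; offsetX here is assumed to be the sub-word misalignment in the range 0..3):

// Host-side sketch mirroring the kernels' head/body/tail split of one row.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

static void histRow(const uint8_t* rowPtr, int cols, int offsetX, int hist[256])
{
    // Bytes before the first 4-byte boundary (0 when the row start is aligned).
    const int alignedOffset = (offsetX == 0) ? 0 : 4 - offsetX;

    // Head: processed one byte at a time.
    for (int x = 0; x < std::min(alignedOffset, cols); ++x)
        ++hist[rowPtr[x]];

    // Body: whole 4-byte words (read byte-wise here for simplicity).
    const int cols_4 = (cols - alignedOffset) / 4;
    for (int x = alignedOffset; x < alignedOffset + cols_4 * 4; ++x)
        ++hist[rowPtr[x]];

    // Tail: bytes after the last whole word.
    for (int x = cols_4 * 4 + alignedOffset; x < cols; ++x)
        ++hist[rowPtr[x]];
}

int main()
{
    std::vector<uint8_t> row(37, 1);
    int hist[256] = {};
    histRow(row.data(), static_cast<int>(row.size()), 3, hist); // e.g. a ROI starting at x = 3
    assert(hist[1] == 37); // every pixel counted exactly once
    return 0;
}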

modules/cudaimgproc/src/histogram.cpp

Lines changed: 10 additions & 6 deletions
@@ -68,8 +68,8 @@ void cv::cuda::histRange(InputArray, GpuMat*, const GpuMat*, Stream&) { throw_no
 
 namespace hist
 {
-    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream);
-    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream);
+    void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream);
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream);
 }
 
 void cv::cuda::calcHist(InputArray _src, OutputArray _hist, Stream& stream)
@@ -91,10 +91,12 @@ void cv::cuda::calcHist(InputArray _src, InputArray _mask, OutputArray _hist, St
 
     hist.setTo(Scalar::all(0), stream);
 
+    Point ofs; Size wholeSize;
+    src.locateROI(wholeSize, ofs);
     if (mask.empty())
-        hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
+        hist::histogram256(src, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
     else
-        hist::histogram256(src, mask, hist.ptr<int>(), StreamAccessor::getStream(stream));
+        hist::histogram256(src, mask, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -494,16 +496,18 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int
 
 namespace hist
 {
-    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream);
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream);
 }
 
 namespace
 {
     void histEven8u(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
     {
+        Point ofs; Size wholeSize;
+        src.locateROI(wholeSize, ofs);
         hist.create(1, histSize, CV_32S);
         cudaSafeCall( cudaMemsetAsync(hist.data, 0, histSize * sizeof(int), stream) );
-        hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, stream);
+        hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, ofs.x, stream);
     }
 }
 

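On the host side, the patch recovers the ROI origin with GpuMat::locateROI and forwards ofs.x to the kernels. A small sketch of what locateROI reports for a view (illustrative only, not taken from the commit):

// locateROI returns the parent allocation's size and the view's origin in it,
// so ofs.x is the horizontal offset the kernels must compensate for (CV_8UC1).
#include <opencv2/core/cuda.hpp>

int main()
{
    cv::cuda::GpuMat whole(64, 64, CV_8UC1);
    cv::cuda::GpuMat view = whole(cv::Rect(3, 0, 32, 32));

    cv::Size wholeSize;
    cv::Point ofs;
    view.locateROI(wholeSize, ofs);
    // wholeSize == 64x64, ofs == (3, 0): view.data is 3 bytes past an aligned row start
    return 0;
}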