@@ -52,38 +52,42 @@ using namespace cv::cuda::device;
 
 namespace hist
 {
-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
+    template<bool fourByteAligned>
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist, const int offsetX = 0)
     {
         __shared__ int shist[256];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         shist[tid] = 0;
         __syncthreads();
 
-        if (y < rows)
-        {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
-
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
+        if (y < rows) {
+            const uchar* rowPtr = &src[y * step];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++)
+                    Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
+            }
 
-                Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
-                Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
+                Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
                 Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
                 Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
             }
 
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    Emulation::smem::atomicAdd(&shist[data], 1);
-                }
+            // load uncoalesced tail
+            // const bool tail = fourByteAligned ? cols % 4 : (cols + offsetX) % 4;
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++)
+                    Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
             }
         }
 
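Note on the new index arithmetic (explanatory aside, not part of the patch): each row is split into an unaligned head, a 4-byte-aligned body, and a tail. For example, with cols = 10 and offsetX = 3, alignedOffset = 4 - 3 = 1, so thread 0 histograms the single head byte, the coalesced loop reads cols_4 = (10 - 1) / 4 = 2 uint words (bytes 1 through 8), and the tail loop starts at iTailStart = 2 * 4 + 1 = 9 and covers the last byte.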
@@ -94,61 +98,70 @@ namespace hist
             ::atomicAdd(hist + tid, histVal);
     }
 
-    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
+    void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
-
-        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
+        if (offsetX)
+            histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
+        else
+            histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
 
-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist)
+    template<bool fourByteAligned>
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist, const int offsetX = 0)
     {
         __shared__ int shist[256];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         shist[tid] = 0;
         __syncthreads();
 
         if (y < rows)
         {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * srcStep);
-            const unsigned int* maskRowPtr = (const unsigned int*) (mask + y * maskStep);
+            const uchar* rowPtr = &src[y * srcStep];
+            const uchar* maskRowPtr = &mask[y * maskStep];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++) {
+                    if (maskRowPtr[x])
+                        Emulation::smem::atomicAdd(&shist[rowPtr[x]], 1);
+                }
+            }
 
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
-                unsigned int m = maskRowPtr[x];
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * srcStep] : &src[alignedOffset + y * srcStep]);
+            const unsigned int* maskRowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &mask[y * maskStep] : &mask[alignedOffset + y * maskStep]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                const unsigned int m = maskRowPtrIntAligned[x];
 
-                if ((m >> 0) & 0xFFU)
-                    Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
+                if ((m >> 0) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
 
-                if ((m >> 8) & 0xFFU)
-                    Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
+                if ((m >> 8) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
 
-                if ((m >> 16) & 0xFFU)
+                if ((m >> 16) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
 
-                if ((m >> 24) & 0xFFU)
+                if ((m >> 24) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
             }
 
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    unsigned int m = ((const uchar*)maskRowPtr)[x];
-
-                    if (m)
-                        Emulation::smem::atomicAdd(&shist[data], 1);
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++) {
+                    if (maskRowPtr[x])
+                        Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
                 }
             }
         }
@@ -160,12 +173,15 @@ namespace hist
             ::atomicAdd(hist + tid, histVal);
     }
 
-    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream)
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
 
-        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist);
+        if (offsetX)
+            histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
+        else
+            histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
        if (stream == 0)
@@ -186,42 +202,44 @@ namespace hist
         }
     }
 
-    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols,
-                               int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
+    template<bool fourByteAligned>
+    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols, int* hist, const int binCount, const int binSize,
+                               const int lowerLevel, const int upperLevel, const int offsetX)
     {
         extern __shared__ int shist[];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         if (tid < binCount)
             shist[tid] = 0;
-
         __syncthreads();
 
         if (y < rows)
         {
-            const uchar* rowPtr = src + y * step;
-            const uint* rowPtr4 = (uint*) rowPtr;
-
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                const uint data = rowPtr4[x];
+            const uchar* rowPtr = &src[y * step];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++)
+                    histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
+            }
 
-                histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
-                histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
                 histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
                 histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
             }
 
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    const uchar data = rowPtr[x];
-                    histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
-                }
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++)
+                    histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
             }
         }
 
@@ -236,7 +254,7 @@ namespace hist
         }
     }
 
-    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
@@ -245,7 +263,10 @@ namespace hist
 
         const size_t smem_size = binCount * sizeof(int);
 
-        histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
+        if (offsetX)
+            histEven8u<false><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
+        else
+            histEven8u<true><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
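All three host wrappers now dispatch on offsetX, the misalignment in bytes (0 to 3) of the ROI's first pixel relative to a 4-byte boundary; offsetX == 0 selects the fully coalesced specialization. Below is a minimal caller-side sketch of how offsetX could be derived and passed in. The histogram256Roi and misalignment helpers, the forward declaration, and the pointer-based derivation of offsetX are illustrative assumptions, not part of this commit; it also assumes the row step is a multiple of 4 (true for pitched GPU allocations), so every row shares the same misalignment.

// Sketch only: deriving offsetX for the updated wrappers (helper names are hypothetical).
#include <cuda_runtime.h>
#include <opencv2/core/cuda.hpp>

namespace hist {
    // Matches the updated wrapper signature in this patch.
    void histogram256(cv::cuda::PtrStepSzb src, int* hist, int offsetX, cudaStream_t stream);
}

// Bytes by which the first ROI pixel falls past a 4-byte boundary (0..3).
static int misalignment(const cv::cuda::GpuMat& roi)
{
    return static_cast<int>(reinterpret_cast<size_t>(roi.data) % 4);
}

// d_hist must point to 256 device ints that have already been zeroed.
void histogram256Roi(const cv::cuda::GpuMat& roi, int* d_hist, cudaStream_t stream)
{
    const int offsetX = misalignment(roi);
    hist::histogram256(roi, d_hist, offsetX, stream);  // offsetX == 0 takes the <true> path
}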