Merged
160 changes: 90 additions & 70 deletions modules/cudaimgproc/src/cuda/hist.cu
@@ -52,38 +52,41 @@ using namespace cv::cuda::device;

namespace hist
{
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
template<bool fourByteAligned>
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist, const int offsetX = 0)
{
__shared__ int shist[256];

const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;

const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
shist[tid] = 0;
__syncthreads();

if (y < rows)
{
const unsigned int* rowPtr = (const unsigned int*) (src + y * step);

const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
unsigned int data = rowPtr[x];
if (y < rows) {
const uchar* rowPtr = &src[y * step];
// load uncoalesced head
if (!fourByteAligned && threadIdx.x == 0) {
for (int x = 0; x < min(alignedOffset, cols); x++)
Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
}

Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
// coalesced loads
const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
const unsigned int data = rowPtrIntAligned[x];
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
}

if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
unsigned int data = ((const uchar*)rowPtr)[x];
Emulation::smem::atomicAdd(&shist[data], 1);
}
// load uncoalesced tail
if (threadIdx.x == 0) {
const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
for (int x = iTailStart; x < cols; x++)
Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
}
}

@@ -94,61 +97,70 @@ namespace hist
::atomicAdd(hist + tid, histVal);
}
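
A note for readers tracing the index arithmetic: when the ROI's first pixel is not on a 4-byte boundary, the kernel consumes a short head byte by byte (thread 0 only), the realigned middle as coalesced 32-bit words, and the remainder as a byte-wise tail. Below is a minimal host-side sketch of that partition, assuming offsetX is the row pointer's misalignment reduced modulo 4 (1..3 when non-zero); partitionRow is a hypothetical helper, not library code.

```cpp
#include <algorithm>
#include <cstdio>

// Mirrors the kernel's head/words/tail split for one row of `cols` bytes,
// given a misalignment of `offsetX` bytes (0..3). Hypothetical helper.
void partitionRow(int cols, int offsetX)
{
    const bool fourByteAligned = (offsetX == 0);
    const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;  // bytes to the next 4-byte boundary
    const int headEnd = std::min(alignedOffset, cols);            // handled one byte at a time
    const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;  // whole 32-bit words
    const int tailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
    std::printf("head [0,%d)  words [%d,%d)  tail [%d,%d)\n",
                headEnd, alignedOffset, alignedOffset + cols_4 * 4, tailStart, cols);
}

int main()
{
    partitionRow(10, 3);  // head [0,1)  words [1,9)  tail [9,10)
    return 0;
}
```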

void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));

histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
if(offsetX)
histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
else
histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
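
The kernels in this file follow the usual two-level histogram pattern: each block accumulates into a shared-memory histogram with cheap shared atomics, then pushes its per-bin totals to the global histogram once. A stripped-down sketch of the same pattern in plain CUDA, using the builtin atomicAdd rather than OpenCV's Emulation wrappers:

```cpp
__global__ void blockHistogram256(const unsigned char* data, int n, int* hist)
{
    __shared__ int shist[256];
    const int tid = threadIdx.x;

    for (int i = tid; i < 256; i += blockDim.x)  // zero the block-local bins
        shist[i] = 0;
    __syncthreads();

    for (int i = blockIdx.x * blockDim.x + tid; i < n; i += gridDim.x * blockDim.x)
        atomicAdd(&shist[data[i]], 1);           // contention stays in shared memory
    __syncthreads();

    for (int i = tid; i < 256; i += blockDim.x)  // at most 256 global adds per block
        if (shist[i] > 0)
            atomicAdd(&hist[i], shist[i]);
}
```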

__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist)
template<bool fourByteAligned>
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist, const int offsetX = 0)
{
__shared__ int shist[256];

const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;

const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
shist[tid] = 0;
__syncthreads();

if (y < rows)
{
const unsigned int* rowPtr = (const unsigned int*) (src + y * srcStep);
const unsigned int* maskRowPtr = (const unsigned int*) (mask + y * maskStep);
const uchar* rowPtr = &src[y * srcStep];
const uchar* maskRowPtr = &mask[y * maskStep];
// load uncoalesced head
if (!fourByteAligned && threadIdx.x == 0) {
for (int x = 0; x < min(alignedOffset, cols); x++) {
if (maskRowPtr[x])
Emulation::smem::atomicAdd(&shist[rowPtr[x]], 1);
}
}

const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
unsigned int data = rowPtr[x];
unsigned int m = maskRowPtr[x];
// coalesced loads
const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * srcStep] : &src[alignedOffset + y * srcStep]);
const unsigned int* maskRowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &mask[y * maskStep] : &mask[alignedOffset + y * maskStep]);
const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
const unsigned int data = rowPtrIntAligned[x];
const unsigned int m = maskRowPtrIntAligned[x];

if ((m >> 0) & 0xFFU)
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);

if ((m >> 8) & 0xFFU)
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);

if ((m >> 16) & 0xFFU)
Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);

if ((m >> 24) & 0xFFU)
Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
}

if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
unsigned int data = ((const uchar*)rowPtr)[x];
unsigned int m = ((const uchar*)maskRowPtr)[x];

if (m)
Emulation::smem::atomicAdd(&shist[data], 1);
// load uncoalesced tail
if (threadIdx.x == 0) {
const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
for (int x = iTailStart; x < cols; x++) {
if (maskRowPtr[x])
Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
}
}
}
@@ -160,12 +172,15 @@ namespace hist
::atomicAdd(hist + tid, histVal);
}

void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream)
void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));

histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist);
if(offsetX)
histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
else
histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
@@ -186,42 +201,44 @@
}
}
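
The histEvenInc helper used by the next kernel is collapsed in this view (only its closing braces survive above). Judging from the call sites, it presumably range-checks the byte against the level bounds and increments the matching even-width bin; the following is a hedged reconstruction, not the library source.

```cpp
// Assumed shape of histEvenInc, inferred from its call sites below.
__device__ __forceinline__ void histEvenInc(int* shist, unsigned int data,
                                            int binSize, int lowerLevel, int upperLevel)
{
    if (data >= (unsigned int)lowerLevel && data <= (unsigned int)upperLevel)
    {
        const unsigned int ind = (data - (unsigned int)lowerLevel) / (unsigned int)binSize;
        Emulation::smem::atomicAdd(shist + ind, 1);
    }
}
```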

__global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols,
int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
template<bool fourByteAligned>
__global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols, int* hist, const int binCount, const int binSize,
const int lowerLevel, const int upperLevel, const int offsetX)
{
extern __shared__ int shist[];

const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;

const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
if (tid < binCount)
shist[tid] = 0;

__syncthreads();

if (y < rows)
{
const uchar* rowPtr = src + y * step;
const uint* rowPtr4 = (uint*) rowPtr;

const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
const uint data = rowPtr4[x];
const uchar* rowPtr = &src[y * step];
// load uncoalesced head
if (!fourByteAligned && threadIdx.x == 0) {
for (int x = 0; x < min(alignedOffset, cols); x++)
histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
}

histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
// coalesced loads
const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
const unsigned int data = rowPtrIntAligned[x];
histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
}

if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
const uchar data = rowPtr[x];
histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
}
// load uncoalesced tail
if (threadIdx.x == 0) {
const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
for (int x = iTailStart; x < cols; x++)
histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
}
}

@@ -236,7 +253,7 @@
}
}

void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
@@ -245,7 +262,10 @@

const size_t smem_size = binCount * sizeof(int);

histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
if(offsetX)
histEven8u<false><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
else
histEven8u<true><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
cudaSafeCall( cudaGetLastError() );

if (stream == 0)
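
End-to-end, this 8-bit path is what cv::cuda::histEven exercises. An illustrative call, with sizes and bin layout made up for the example:

```cpp
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaimgproc.hpp>

void evenHistogramDemo()
{
    cv::cuda::GpuMat img(480, 640, CV_8UC1);    // assumed already filled on the device
    cv::cuda::GpuMat hist;                      // becomes 1 x 32, CV_32S
    cv::cuda::histEven(img, hist, 32, 0, 256);  // 32 even bins over [0, 256)
}
```
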
16 changes: 10 additions & 6 deletions modules/cudaimgproc/src/histogram.cpp
@@ -68,8 +68,8 @@ void cv::cuda::histRange(InputArray, GpuMat*, const GpuMat*, Stream&) { throw_no

namespace hist
{
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream);
void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream);
void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream);
void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream);
}

void cv::cuda::calcHist(InputArray _src, OutputArray _hist, Stream& stream)
@@ -91,10 +91,12 @@ void cv::cuda::calcHist(InputArray _src, InputArray _mask, OutputArray _hist, St

hist.setTo(Scalar::all(0), stream);

Point ofs; Size wholeSize;
src.locateROI(wholeSize, ofs);
if (mask.empty())
hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
hist::histogram256(src, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
else
hist::histogram256(src, mask, hist.ptr<int>(), StreamAccessor::getStream(stream));
hist::histogram256(src, mask, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
}
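
The locateROI plumbing above only matters when the input is a view into a larger allocation; for a full image ofs.x is 0 and the four-byte-aligned kernel is taken. An illustrative use of calcHist on a deliberately misaligned ROI:

```cpp
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaimgproc.hpp>

void roiHistogramDemo(const cv::cuda::GpuMat& frame)  // CV_8UC1 parent image
{
    const cv::Rect roi(101, 50, 320, 240);  // x = 101: rows start off a 4-byte boundary
    cv::cuda::GpuMat view = frame(roi);     // no copy; locateROI() recovers ofs = (101, 50)
    cv::cuda::GpuMat hist;                  // 1 x 256, CV_32S on return
    cv::cuda::calcHist(view, hist);
}
```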

////////////////////////////////////////////////////////////////////////
@@ -494,16 +496,18 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int

namespace hist
{
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream);
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream);
}

namespace
{
void histEven8u(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
{
Point ofs; Size wholeSize;
src.locateROI(wholeSize, ofs);
hist.create(1, histSize, CV_32S);
cudaSafeCall( cudaMemsetAsync(hist.data, 0, histSize * sizeof(int), stream) );
hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, stream);
hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, ofs.x, stream);
}
}
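
For reference, locateROI reports a submatrix's origin inside the allocation it was carved from; for CV_8UC1 data the x component is also the per-row byte offset the kernels above need for alignment. A small illustration with made-up sizes:

```cpp
#include <opencv2/core/cuda.hpp>

void locateRoiDemo()
{
    cv::cuda::GpuMat parent(480, 640, CV_8UC1);
    cv::cuda::GpuMat view = parent(cv::Rect(3, 2, 100, 100));

    cv::Size wholeSize;
    cv::Point ofs;
    view.locateROI(wholeSize, ofs);  // wholeSize = 640 x 480, ofs = (3, 2)
}
```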
