@@ -403,23 +403,25 @@ namespace cv { namespace cuda { namespace device
403403 callers[winsz2](left, right, disp, maxdisp, stream);
404404 }
405405
406+ __device__ inline int clamp (int x, int a, int b)
407+ {
408+ return ::max (a, ::min (b, x));
409+ }
410+
406411 // ////////////////////////////////////////////////////////////////////////////////////////////////
407412 // ///////////////////////////////////// Sobel Prefiler ///////////////////////////////////////////
408413 // ////////////////////////////////////////////////////////////////////////////////////////////////
409414
410- texture<unsigned char , 2 , cudaReadModeElementType> texForSobel;
411-
412- __global__ void prefilter_kernel (PtrStepSzb output, int prefilterCap)
415+ __global__ void prefilter_kernel_xsobel (PtrStepSzb input, PtrStepSzb output, int prefilterCap)
413416 {
414417 int x = blockDim .x * blockIdx .x + threadIdx .x ;
415418 int y = blockDim .y * blockIdx .y + threadIdx .y ;
416419
417420 if (x < output.cols && y < output.rows )
418421 {
419- int conv = (int )tex2D (texForSobel, x - 1 , y - 1 ) * (-1 ) + (int )tex2D (texForSobel, x + 1 , y - 1 ) * (1 ) +
420- (int )tex2D (texForSobel, x - 1 , y ) * (-2 ) + (int )tex2D (texForSobel, x + 1 , y ) * (2 ) +
421- (int )tex2D (texForSobel, x - 1 , y + 1 ) * (-1 ) + (int )tex2D (texForSobel, x + 1 , y + 1 ) * (1 );
422-
422+ int conv = input.ptr (::max (0 ,y-1 ))[::max (0 ,x-1 )] * (-1 ) + input.ptr (::max (0 , y-1 ))[::min (x+1 , input.cols -1 )] * (1 ) +
423+ input.ptr (y )[::max (0 ,x-1 )] * (-2 ) + input.ptr (y )[::min (x+1 , input.cols -1 )] * (2 ) +
424+ input.ptr (::min (y+1 , input.rows -1 ))[::max (0 ,x-1 )] * (-1 ) + input.ptr (::min (y+1 , input.rows -1 ))[::min (x+1 ,input.cols -1 )] * (1 );
423425
424426 conv = ::min (::min (::max (-prefilterCap, conv), prefilterCap) + prefilterCap, 255 );
425427 output.ptr (y)[x] = conv & 0xFF ;
@@ -428,22 +430,65 @@ namespace cv { namespace cuda { namespace device
428430
429431 void prefilter_xsobel (const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap, cudaStream_t & stream)
430432 {
431- cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char >();
432- cudaSafeCall ( cudaBindTexture2D ( 0 , texForSobel, input.data , desc, input.cols , input.rows , input.step ) );
433-
434433 dim3 threads (16 , 16 , 1 );
435434 dim3 grid (1 , 1 , 1 );
436435
437436 grid.x = divUp (input.cols , threads.x );
438437 grid.y = divUp (input.rows , threads.y );
439438
440- prefilter_kernel <<<grid, threads, 0 , stream>>> (output, prefilterCap);
439+ prefilter_kernel_xsobel <<<grid, threads, 0 , stream>>> (input, output, prefilterCap);
441440 cudaSafeCall ( cudaGetLastError () );
442441
443442 if (stream == 0 )
444443 cudaSafeCall ( cudaDeviceSynchronize () );
444+ }
445445
446- cudaSafeCall ( cudaUnbindTexture (texForSobel ) );
446+ // ////////////////////////////////////////////////////////////////////////////////////////////////
447+ // ///////////////////////////////////// Norm Prefiler ///////////////////////////////////////////
448+ // ////////////////////////////////////////////////////////////////////////////////////////////////
449+
450+ __global__ void prefilter_kernel_norm (PtrStepSzb input, PtrStepSzb output, int prefilterCap, int scale_g, int scale_s, int winsize)
451+ {
452+ // prefilterCap in range 1..63, checked in StereoBMImpl::compute
453+ int x = blockDim .x * blockIdx .x + threadIdx .x ;
454+ int y = blockDim .y * blockIdx .y + threadIdx .y ;
455+ int cols = input.cols ;
456+ int rows = input.rows ;
457+ int WSZ2 = winsize / 2 ;
458+
459+ if (x < cols && y < rows)
460+ {
461+ int cov1 = input.ptr (::max (y-1 , 0 ))[x] * 1 +
462+ input.ptr (y)[::min (x+1 , cols-1 )] * 1 + input.ptr (y )[x] * 4 + input.ptr (y)[::min (x+1 , cols-1 )] * 1 +
463+ input.ptr (::min (y+1 , rows-1 ))[x] * 1 ;
464+
465+ int cov2 = 0 ;
466+ for (int i = -WSZ2; i < WSZ2+1 ; i++)
467+ for (int j = -WSZ2; j < WSZ2+1 ; j++)
468+ cov2 += input.ptr (clamp (y+i, 0 , rows-1 ))[clamp (x+j, 0 , cols-1 )];
469+
470+ int res = (cov1*scale_g - cov2*scale_s)>>10 ;
471+ res = clamp (res, -prefilterCap, prefilterCap) + prefilterCap;
472+ output.ptr (y)[x] = res;
473+ }
474+ }
475+
476+ void prefilter_norm (const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap, int winsize, cudaStream_t & stream)
477+ {
478+ dim3 threads (16 , 16 , 1 );
479+ dim3 grid (1 , 1 , 1 );
480+
481+ grid.x = divUp (input.cols , threads.x );
482+ grid.y = divUp (input.rows , threads.y );
483+
484+ int scale_g = winsize*winsize/8 , scale_s = (1024 + scale_g)/(scale_g*2 );
485+ scale_g *= scale_s;
486+
487+ prefilter_kernel_norm<<<grid, threads, 0 , stream>>> (input, output, prefilterCap, scale_g, scale_s, winsize);
488+ cudaSafeCall ( cudaGetLastError () );
489+
490+ if (stream == 0 )
491+ cudaSafeCall ( cudaDeviceSynchronize () );
447492 }
448493
449494
0 commit comments