Merge pull request #3132 from asmorkalov:as/stereo_prefilter

alalek · alalek · commit 80ff29a6539f · 2021-12-14T11:40:07.000Z
diff --git a/modules/cudastereo/src/cuda/stereobm.cu b/modules/cudastereo/src/cuda/stereobm.cu
@@ -403,23 +403,25 @@ namespace cv { namespace cuda { namespace device
             callers[winsz2](left, right, disp, maxdisp, stream);
         }
 
+        __device__ inline int clamp(int x, int a, int b) // limits x to [a, b]; yields a whenever a > b
+        {
+            return (x > b) ? ::max(a, b) : ::max(a, x);
+        }
+
         //////////////////////////////////////////////////////////////////////////////////////////////////
         /////////////////////////////////////// Sobel Prefiler ///////////////////////////////////////////
         //////////////////////////////////////////////////////////////////////////////////////////////////
 
-        texture<unsigned char, 2, cudaReadModeElementType> texForSobel;
-
-        __global__ void prefilter_kernel(PtrStepSzb output, int prefilterCap)
+        __global__ void prefilter_kernel_xsobel(PtrStepSzb input, PtrStepSzb output, int prefilterCap)
         {
             int x = blockDim.x * blockIdx.x + threadIdx.x;
             int y = blockDim.y * blockIdx.y + threadIdx.y;
 
             if (x < output.cols && y < output.rows)
             {
-                int conv = (int)tex2D(texForSobel, x - 1, y - 1) * (-1) + (int)tex2D(texForSobel, x + 1, y - 1) * (1) +
-                           (int)tex2D(texForSobel, x - 1, y    ) * (-2) + (int)tex2D(texForSobel, x + 1, y    ) * (2) +
-                           (int)tex2D(texForSobel, x - 1, y + 1) * (-1) + (int)tex2D(texForSobel, x + 1, y + 1) * (1);
-
+                int conv = input.ptr(::max(0,y-1))[::max(0,x-1)] * (-1) + input.ptr(::max(0, y-1))[::min(x+1, input.cols-1)] * (1) +
+                           input.ptr(y  )[::max(0,x-1)] * (-2) + input.ptr(y  )[::min(x+1, input.cols-1)] * (2) +
+                           input.ptr(::min(y+1, input.rows-1))[::max(0,x-1)] * (-1) + input.ptr(::min(y+1, input.rows-1))[::min(x+1,input.cols-1)] * (1);
 
                 conv = ::min(::min(::max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);
                 output.ptr(y)[x] = conv & 0xFF;
@@ -428,22 +430,65 @@ namespace cv { namespace cuda { namespace device
 
         void prefilter_xsobel(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap, cudaStream_t & stream)
         {
-            cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
-            cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, desc, input.cols, input.rows, input.step ) );
-
             dim3 threads(16, 16, 1);
             dim3 grid(1, 1, 1);
 
             grid.x = divUp(input.cols, threads.x);
             grid.y = divUp(input.rows, threads.y);
 
-            prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);
+            prefilter_kernel_xsobel<<<grid, threads, 0, stream>>>(input, output, prefilterCap); // input goes in as a kernel argument; no texture bind/unbind around the launch
             cudaSafeCall( cudaGetLastError() );
 
             if (stream == 0)
                 cudaSafeCall( cudaDeviceSynchronize() );
+        }
 
-            cudaSafeCall( cudaUnbindTexture (texForSobel ) );
+        //////////////////////////////////////////////////////////////////////////////////////////////////
+        ///////////////////////////////////////  Norm Prefilter //////////////////////////////////////////
+        //////////////////////////////////////////////////////////////////////////////////////////////////
+
+        // Normalized-response prefilter, one thread per pixel; taps that fall outside the image are clamped to the nearest edge pixel.
+        __global__ void prefilter_kernel_norm(PtrStepSzb input, PtrStepSzb output, int prefilterCap, int scale_g, int scale_s, int winsize)
+        {
+            // prefilterCap in range 1..63, checked in StereoBMImpl::compute
+            int x = blockDim.x * blockIdx.x + threadIdx.x;
+            int y = blockDim.y * blockIdx.y + threadIdx.y;
+            int cols = input.cols, rows = input.rows;
+            int WSZ2 = winsize / 2; // half-width of the summation window
+            if(x < cols && y < rows)
+            {
+                // Cross-shaped weighted sum: centre * 4 plus the up, left, right and down neighbours.
+                int cov1 =                               input.ptr(::max(y-1, 0))[x] * 1 +
+                    input.ptr(y)[::max(x-1, 0)] * 1 + input.ptr(y  )[x] * 4 + input.ptr(y)[::min(x+1, cols-1)] * 1 +
+                                                         input.ptr(::min(y+1, rows-1))[x] * 1;
+
+                int cov2 = 0; // unweighted sum over the (2*WSZ2+1) x (2*WSZ2+1) window centred on (x, y)
+                for(int i = -WSZ2; i < WSZ2+1; i++)
+                    for(int j = -WSZ2; j < WSZ2+1; j++)
+                        cov2 += input.ptr(clamp(y+i, 0, rows-1))[clamp(x+j, 0, cols-1)];
+
+                int res = (cov1*scale_g - cov2*scale_s)>>10; // >>10 scales the weighted difference down by 1024
+                res = clamp(res, -prefilterCap, prefilterCap) + prefilterCap; // cap, then shift into 0..2*prefilterCap
+                output.ptr(y)[x] = res;
+            }
+        }
+
+        // Host launcher for prefilter_kernel_norm: derives the two integer scales from winsize and covers the image with 16x16 thread blocks.
+        void prefilter_norm(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap, int winsize, cudaStream_t & stream)
+        {
+            dim3 threads(16, 16, 1);
+            dim3 grid(divUp(input.cols, threads.x), divUp(input.rows, threads.y), 1);
+
+            // winsize*winsize/8 is 0 for winsize in 0..2; keep scale_g >= 1 so the division below never divides by zero.
+            int scale_g = (winsize*winsize/8 > 0) ? winsize*winsize/8 : 1;
+            int scale_s = (1024 + scale_g)/(scale_g*2);
+            scale_g *= scale_s;
+
+            prefilter_kernel_norm<<<grid, threads, 0, stream>>>(input, output, prefilterCap, scale_g, scale_s, winsize);
+            cudaSafeCall( cudaGetLastError() ); // reports launch-configuration errors
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() ); // default stream: wait for the kernel before returning
         }
 
 
diff --git a/modules/cudastereo/src/stereobm.cpp b/modules/cudastereo/src/stereobm.cpp
@@ -57,6 +57,7 @@ namespace cv { namespace cuda { namespace device
     {
         void stereoBM_CUDA(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int ndisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t & stream);
         void prefilter_xsobel(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap /*= 31*/, cudaStream_t & stream);
+        void prefilter_norm(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap, int winsize, cudaStream_t & stream);
         void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream);
     }
 }}}
@@ -92,8 +93,8 @@ namespace
         int getPreFilterType() const { return preset_; }
         void setPreFilterType(int preFilterType) { preset_ = preFilterType; }
 
-        int getPreFilterSize() const { return 0; }
-        void setPreFilterSize(int /*preFilterSize*/) {}
+        int getPreFilterSize() const { return preFilterSize_; } // value that compute() hands to prefilter_norm as its winsize argument
+        void setPreFilterSize(int preFilterSize) { preFilterSize_ = preFilterSize; } // stored as-is; this setter performs no range check
 
         int getPreFilterCap() const { return preFilterCap_; }
         void setPreFilterCap(int preFilterCap) { preFilterCap_ = preFilterCap; }
@@ -119,12 +120,13 @@ namespace
         int winSize_;
         int preFilterCap_;
         float avergeTexThreshold_;
+        int preFilterSize_;
 
         GpuMat minSSD_, leBuf_, riBuf_;
     };
 
     StereoBMImpl::StereoBMImpl(int numDisparities, int blockSize)
-        : preset_(0), ndisp_(numDisparities), winSize_(blockSize), preFilterCap_(31), avergeTexThreshold_(3)
+        : preset_(-1), ndisp_(numDisparities), winSize_(blockSize), preFilterCap_(31), avergeTexThreshold_(3), preFilterSize_(9) // NOTE(review): preset_ == -1 presumably matches no cv::StereoBM prefilter type, i.e. no prefilter by default - confirm against the enum values
     {
     }
 
@@ -169,6 +171,17 @@ namespace
             le_for_bm = leBuf_;
             ri_for_bm = riBuf_;
         }
+        else if(preset_ == cv::StereoBM::PREFILTER_NORMALIZED_RESPONSE)
+        {
+            cuda::ensureSizeIsEnough(left.size(), left.type(), leBuf_);
+            cuda::ensureSizeIsEnough(right.size(), right.type(), riBuf_);
+
+            prefilter_norm( left, leBuf_, preFilterCap_, preFilterSize_, stream);
+            prefilter_norm(right, riBuf_, preFilterCap_, preFilterSize_, stream);
+
+            le_for_bm = leBuf_;
+            ri_for_bm = riBuf_;
+        }
 
         stereoBM_CUDA(le_for_bm, ri_for_bm, disparity, ndisp_, winSize_, minSSD_, stream);
 
diff --git a/modules/cudastereo/test/test_stereo.cpp b/modules/cudastereo/test/test_stereo.cpp
@@ -79,6 +79,49 @@ CUDA_TEST_P(StereoBM, Regression)
     EXPECT_MAT_NEAR(disp_gold, disp, 0.0);
 }
 
+CUDA_TEST_P(StereoBM, PrefilterXSobelRegression)
+{
+    // Stereo pair and the stored reference disparity, all read as grayscale.
+    cv::Mat imgL     = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
+    cv::Mat imgR     = readImage("stereobm/aloe-R.png", cv::IMREAD_GRAYSCALE);
+    cv::Mat expected = readImage("stereobm/aloe-disp-prefilter-xsobel.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(imgL.empty());
+    ASSERT_FALSE(imgR.empty());
+    ASSERT_FALSE(expected.empty());
+
+    // numDisparities = 128, blockSize = 19; only the prefilter type is set explicitly.
+    cv::Ptr<cv::StereoBM> matcher = cv::cuda::createStereoBM(128, 19);
+    matcher->setPreFilterType(cv::StereoBM::PREFILTER_XSOBEL);
+
+    cv::cuda::GpuMat result;
+    matcher->compute(loadMat(imgL), loadMat(imgR), result);
+    EXPECT_MAT_NEAR(expected, result, 0.0);
+}
+
+// Regression test for PREFILTER_NORMALIZED_RESPONSE: the GPU disparity must equal the stored reference with zero tolerance.
+CUDA_TEST_P(StereoBM, PrefilterNormRegression)
+{
+    cv::Mat left_image  = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
+    cv::Mat right_image = readImage("stereobm/aloe-R.png", cv::IMREAD_GRAYSCALE);
+    cv::Mat disp_gold   = readImage("stereobm/aloe-disp-prefilter-norm.png", cv::IMREAD_GRAYSCALE);
+
+    ASSERT_FALSE(left_image.empty());
+    ASSERT_FALSE(right_image.empty());
+    ASSERT_FALSE(disp_gold.empty());
+
+    cv::Ptr<cv::StereoBM> bm = cv::cuda::createStereoBM(128, 19);
+    cv::cuda::GpuMat disp;
+
+    bm->setPreFilterType(cv::StereoBM::PREFILTER_NORMALIZED_RESPONSE);
+    bm->setPreFilterSize(9);
+    bm->compute(loadMat(left_image), loadMat(right_image), disp);
+
+    // The result is compared directly against disp_gold.
+    // The test deliberately writes no image files: a regression test must not
+    // leave artifacts in the working directory of whoever runs it.
+    EXPECT_MAT_NEAR(disp_gold, disp, 0.0);
+}
+
 INSTANTIATE_TEST_CASE_P(CUDA_Stereo, StereoBM, ALL_DEVICES);
 
 //////////////////////////////////////////////////////////////////////////