From 10e29b268389a51155f2685eb1e48726308b9b32 Mon Sep 17 00:00:00 2001
From: cudawarped <12133430+cudawarped@users.noreply.github.com>
Date: Wed, 19 Apr 2023 16:41:33 +0300
Subject: [PATCH 01/11] cuda: fix bug in histogram kernels when source memory
 is not aligned to 4 bytes

---
 modules/cudaimgproc/src/cuda/hist.cu        | 160 +++++++++++---------
 modules/cudaimgproc/src/histogram.cpp       |  16 +-
 modules/cudaimgproc/test/test_histogram.cpp |  76 +++++++---
 3 files changed, 156 insertions(+), 96 deletions(-)
diff --git a/modules/cudaimgproc/src/cuda/hist.cu b/modules/cudaimgproc/src/cuda/hist.cu
index 6bc5f15e6c..a6d0ce7e98 100644
--- a/modules/cudaimgproc/src/cuda/hist.cu
+++ b/modules/cudaimgproc/src/cuda/hist.cu
@@ -52,38 +52,41 @@ using namespace cv::cuda::device;
 
 namespace hist
 {
-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
+    template<bool fourByteAligned>
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist, const int offsetX = 0)
     {
         __shared__ int shist[256];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         shist[tid] = 0;
         __syncthreads();
 
-        if (y < rows)
-        {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
-
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
+        if (y < rows) {
+            const uchar* rowPtr = &src[y * step];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++)
+                    Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
+            }
 
-                Emulation::smem::atomicAdd(&shist[(data >>  0) & 0xFFU], 1);
-                Emulation::smem::atomicAdd(&shist[(data >>  8) & 0xFFU], 1);
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
+                Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
                 Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
                 Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
             }
 
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    Emulation::smem::atomicAdd(&shist[data], 1);
-                }
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++)
+                    Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
             }
         }
 
@@ -94,61 +97,70 @@ namespace hist
             ::atomicAdd(hist + tid, histVal);
     }
 
-    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
+    void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
-
-        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
+        if(offsetX)
+            histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
+        else
+            histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
 
-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist)
+    template<bool fourByteAligned>
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist, const int offsetX = 0)
     {
         __shared__ int shist[256];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         shist[tid] = 0;
         __syncthreads();
 
         if (y < rows)
         {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * srcStep);
-            const unsigned int* maskRowPtr = (const unsigned int*) (mask + y * maskStep);
+            const uchar* rowPtr = &src[y * srcStep];
+            const uchar* maskRowPtr = &mask[y * maskStep];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++) {
+                    if (maskRowPtr[x])
+                        Emulation::smem::atomicAdd(&shist[rowPtr[x]], 1);
+                }
+            }
 
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
-                unsigned int m = maskRowPtr[x];
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * srcStep] : &src[alignedOffset + y * maskStep]);
+            const unsigned int* maskRowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &mask[y * maskStep] : &mask[alignedOffset + y * maskStep]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                const unsigned int m = maskRowPtrIntAligned[x];
 
-                if ((m >>  0) & 0xFFU)
-                    Emulation::smem::atomicAdd(&shist[(data >>  0) & 0xFFU], 1);
+                if ((m >> 0) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
 
-                if ((m >>  8) & 0xFFU)
-                    Emulation::smem::atomicAdd(&shist[(data >>  8) & 0xFFU], 1);
+                if ((m >> 8) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
 
-                if ((m >>  16) & 0xFFU)
+                if ((m >> 16) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
 
-                if ((m >>  24) & 0xFFU)
+                if ((m >> 24) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
             }
 
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    unsigned int m = ((const uchar*)maskRowPtr)[x];
-
-                    if (m)
-                        Emulation::smem::atomicAdd(&shist[data], 1);
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++) {
+                    if (maskRowPtr[x])
+                        Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
                 }
             }
         }
@@ -160,12 +172,15 @@ namespace hist
             ::atomicAdd(hist + tid, histVal);
     }
 
-    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream)
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
 
-        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist);
+        if(offsetX)
+            histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
+        else
+            histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
@@ -186,42 +201,44 @@ namespace hist
         }
     }
 
-    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols,
-                               int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
+    template<bool fourByteAligned>
+    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols, int* hist, const int binCount, const int binSize,
+        const int lowerLevel, const int upperLevel, const int offsetX)
     {
         extern __shared__ int shist[];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         if (tid < binCount)
             shist[tid] = 0;
-
         __syncthreads();
 
         if (y < rows)
         {
-            const uchar* rowPtr = src + y * step;
-            const uint* rowPtr4 = (uint*) rowPtr;
-
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                const uint data = rowPtr4[x];
+            const uchar* rowPtr = &src[y * step];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++)
+                    histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
+            }
 
-                histEvenInc(shist, (data >>  0) & 0xFFU, binSize, lowerLevel, upperLevel);
-                histEvenInc(shist, (data >>  8) & 0xFFU, binSize, lowerLevel, upperLevel);
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
                 histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
                 histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
             }
 
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    const uchar data = rowPtr[x];
-                    histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
-                }
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++)
+                    histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
             }
         }
 
@@ -236,7 +253,7 @@ namespace hist
         }
     }
 
-    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
@@ -245,7 +262,10 @@ namespace hist
 
         const size_t smem_size = binCount * sizeof(int);
 
-        histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
+        if(offsetX)
+            histEven8u<false><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
+        else
+            histEven8u<true><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp
index c252abc451..177bf75b1a 100644
--- a/modules/cudaimgproc/src/histogram.cpp
+++ b/modules/cudaimgproc/src/histogram.cpp
@@ -68,8 +68,8 @@ void cv::cuda::histRange(InputArray, GpuMat*, const GpuMat*, Stream&) { throw_no
 
 namespace hist
 {
-    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream);
-    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream);
+    void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream);
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream);
 }
 
 void cv::cuda::calcHist(InputArray _src, OutputArray _hist, Stream& stream)
@@ -91,10 +91,12 @@ void cv::cuda::calcHist(InputArray _src, InputArray _mask, OutputArray _hist, St
 
     hist.setTo(Scalar::all(0), stream);
 
+    Point ofs; Size wholeSize;
+    src.locateROI(wholeSize, ofs);
     if (mask.empty())
-        hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
+        hist::histogram256(src, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
     else
-        hist::histogram256(src, mask, hist.ptr<int>(), StreamAccessor::getStream(stream));
+        hist::histogram256(src, mask, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -494,16 +496,18 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int
 
 namespace hist
 {
-    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream);
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream);
 }
 
 namespace
 {
     void histEven8u(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
     {
+        Point ofs; Size wholeSize;
+        src.locateROI(wholeSize, ofs);
         hist.create(1, histSize, CV_32S);
         cudaSafeCall( cudaMemsetAsync(hist.data, 0, histSize * sizeof(int), stream) );
-        hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, stream);
+        hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, ofs.x, stream);
     }
 }
 
diff --git a/modules/cudaimgproc/test/test_histogram.cpp b/modules/cudaimgproc/test/test_histogram.cpp
index a92eefde8c..0be400ab6a 100644
--- a/modules/cudaimgproc/test/test_histogram.cpp
+++ b/modules/cudaimgproc/test/test_histogram.cpp
@@ -49,15 +49,40 @@ namespace opencv_test { namespace {
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // HistEven
 
-PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, cv::Size)
+typedef tuple<Size, int> hist_size_to_roi_offset_params_t;
+const hist_size_to_roi_offset_params_t hist_size_to_roi_offset_params[] =
+{
+    // uchar reads only
+    hist_size_to_roi_offset_params_t(Size(1,32), 0),
+    hist_size_to_roi_offset_params_t(Size(2,32), 0),
+    hist_size_to_roi_offset_params_t(Size(2,32), 1),
+    hist_size_to_roi_offset_params_t(Size(3,32), 0),
+    hist_size_to_roi_offset_params_t(Size(3,32), 1),
+    hist_size_to_roi_offset_params_t(Size(3,32), 2),
+    hist_size_to_roi_offset_params_t(Size(4,32), 0),
+    hist_size_to_roi_offset_params_t(Size(4,32), 1),
+    hist_size_to_roi_offset_params_t(Size(4,32), 2),
+    hist_size_to_roi_offset_params_t(Size(4,32), 3),
+    // uchar and int reads
+    hist_size_to_roi_offset_params_t(Size(129,32), 0),
+    hist_size_to_roi_offset_params_t(Size(129,32), 1),
+    hist_size_to_roi_offset_params_t(Size(129,32), 2),
+    hist_size_to_roi_offset_params_t(Size(129,32), 3),
+    // int reads only
+    hist_size_to_roi_offset_params_t(Size(128,32), 0)
+};
+
+PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
 {
     cv::cuda::DeviceInfo devInfo;
     cv::Size size;
+    int roiOffsetX;
 
     virtual void SetUp()
     {
         devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
+        size = get<0>(GET_PARAM(1));
+        roiOffsetX = get<1>(GET_PARAM(1));
 
         cv::cuda::setDevice(devInfo.deviceID());
     }
@@ -66,19 +91,21 @@ PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, cv::Size)
 CUDA_TEST_P(HistEven, Accuracy)
 {
     cv::Mat src = randomMat(size, CV_8UC1);
-
+    const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
     int hbins = 30;
     float hranges[] = {50.0f, 200.0f};
 
     cv::cuda::GpuMat hist;
-    cv::cuda::histEven(loadMat(src), hist, hbins, (int) hranges[0], (int) hranges[1]);
+    cv::cuda::GpuMat srcDevice = loadMat(src);
+    cv::cuda::histEven(srcDevice(roi), hist, hbins, (int)hranges[0], (int)hranges[1]);
 
     cv::Mat hist_gold;
 
     int histSize[] = {hbins};
     const float* ranges[] = {hranges};
     int channels[] = {0};
-    cv::calcHist(&src, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
+    Mat srcRoi = src(roi);
+    cv::calcHist(&srcRoi, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
 
     hist_gold = hist_gold.t();
     hist_gold.convertTo(hist_gold, CV_32S);
@@ -87,22 +114,24 @@ CUDA_TEST_P(HistEven, Accuracy)
 }
 
 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, HistEven, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES));
+    ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // CalcHist
 
-PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, cv::Size)
+PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
 {
     cv::cuda::DeviceInfo devInfo;
 
     cv::Size size;
 
+    int roiOffsetX;
+
     virtual void SetUp()
     {
         devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
+        size = get<0>(GET_PARAM(1));
+        roiOffsetX = get<1>(GET_PARAM(1));
 
         cv::cuda::setDevice(devInfo.deviceID());
     }
@@ -111,9 +140,10 @@ PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, cv::Size)
 CUDA_TEST_P(CalcHist, Accuracy)
 {
     cv::Mat src = randomMat(size, CV_8UC1);
-
+    const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
     cv::cuda::GpuMat hist;
-    cv::cuda::calcHist(loadMat(src), hist);
+    GpuMat srcDevice = loadMat(src);
+    cv::cuda::calcHist(srcDevice(roi), hist);
 
     cv::Mat hist_gold;
 
@@ -123,7 +153,8 @@ CUDA_TEST_P(CalcHist, Accuracy)
     const float* ranges[] = {hranges};
     const int channels[] = {0};
 
-    cv::calcHist(&src, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
+    const Mat srcRoi = src(roi);
+    cv::calcHist(&srcRoi, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
     hist_gold = hist_gold.reshape(1, 1);
     hist_gold.convertTo(hist_gold, CV_32S);
 
@@ -131,19 +162,21 @@ CUDA_TEST_P(CalcHist, Accuracy)
 }
 
 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHist, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES));
+    ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));
 
-PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, cv::Size)
+PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
 {
     cv::cuda::DeviceInfo devInfo;
 
     cv::Size size;
 
+    int roiOffsetX;
+
     virtual void SetUp()
     {
         devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
+        size = get<0>(GET_PARAM(1));
+        roiOffsetX = get<1>(GET_PARAM(1));
 
         cv::cuda::setDevice(devInfo.deviceID());
     }
@@ -152,11 +185,14 @@ PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, cv::Size)
 CUDA_TEST_P(CalcHistWithMask, Accuracy)
 {
     cv::Mat src = randomMat(size, CV_8UC1);
+    const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
     cv::Mat mask = randomMat(size, CV_8UC1);
     cv::Mat(mask, cv::Rect(0, 0, size.width / 2, size.height / 2)).setTo(0);
 
     cv::cuda::GpuMat hist;
-    cv::cuda::calcHist(loadMat(src), loadMat(mask), hist);
+    GpuMat srcDevice = loadMat(src);
+    GpuMat maskDevice = loadMat(mask);
+    cv::cuda::calcHist(srcDevice(roi), maskDevice(roi), hist);
 
     cv::Mat hist_gold;
 
@@ -166,7 +202,8 @@ CUDA_TEST_P(CalcHistWithMask, Accuracy)
     const float* ranges[] = {hranges};
     const int channels[] = {0};
 
-    cv::calcHist(&src, 1, channels, mask, hist_gold, 1, histSize, ranges);
+    const Mat srcRoi = src(roi);
+    cv::calcHist(&srcRoi, 1, channels, mask(roi), hist_gold, 1, histSize, ranges);
     hist_gold = hist_gold.reshape(1, 1);
     hist_gold.convertTo(hist_gold, CV_32S);
 
@@ -174,8 +211,7 @@ CUDA_TEST_P(CalcHistWithMask, Accuracy)
 }
 
 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHistWithMask, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES));
+    ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // EqualizeHist

From 72080b8957cd2bce529c0c0fb38285e4616fbbc7 Mon Sep 17 00:00:00 2001
From: cudawarped <12133430+cudawarped@users.noreply.github.com>
Date: Tue, 25 Apr 2023 08:02:29 +0300
Subject: [PATCH 02/11] cudacodec::VideoReader - amend FFmpeg codec resolution

---
 modules/cudacodec/src/ffmpeg_video_source.cpp | 32 +++++++++++++++--
 modules/cudacodec/test/test_video.cpp         | 35 +++++++++++--------
 2 files changed, 49 insertions(+), 18 deletions(-)

diff --git a/modules/cudacodec/src/ffmpeg_video_source.cpp b/modules/cudacodec/src/ffmpeg_video_source.cpp
index de0f349c4d..20a02f84b5 100644
--- a/modules/cudacodec/src/ffmpeg_video_source.cpp
+++ b/modules/cudacodec/src/ffmpeg_video_source.cpp
@@ -66,8 +66,10 @@ static std::string fourccToString(int fourcc)
         (i32_c.c[3] >= ' ' && i32_c.c[3] < 128) ? i32_c.c[3] : '?');
 }
 
+// handle old FFmpeg backend - remove when windows shared library is updated
+#ifdef _WIN32
 static
-Codec FourccToCodec(int codec)
+Codec FourccToCodecWin32Old(int codec)
 {
     switch (codec)
     {
@@ -100,9 +102,34 @@ Codec FourccToCodec(int codec)
     case CV_FOURCC_MACRO('a', 'v', '0', '1'): // fallthru
     case CV_FOURCC_MACRO('A', 'V', '0', '1'): return AV1;
     default:
-        break;
+        return NumCodecs;
     }
+}
+#endif
 
+static
+Codec FourccToCodec(int codec)
+{
+#ifdef _WIN32 // handle old FFmpeg backend - remove when windows shared library is updated
+    Codec win32OldCodec = FourccToCodecWin32Old(codec);
+    if(win32OldCodec != NumCodecs)
+        return win32OldCodec;
+#endif
+    switch (codec)
+    {
+    case CV_FOURCC_MACRO('m', 'p', 'g', '1'): return MPEG1;
+    case CV_FOURCC_MACRO('m', 'p', 'g', '2'): return MPEG2;
+    case CV_FOURCC_MACRO('F', 'M', 'P', '4'): return MPEG4;
+    case CV_FOURCC_MACRO('W', 'V', 'C', '1'): return VC1;
+    case CV_FOURCC_MACRO('h', '2', '6', '4'): return H264;
+    case CV_FOURCC_MACRO('h', 'e', 'v', 'c'): return HEVC;
+    case CV_FOURCC_MACRO('M', 'J', 'P', 'G'): return JPEG;
+    case CV_FOURCC_MACRO('V', 'P', '8', '0'): return VP8;
+    case CV_FOURCC_MACRO('V', 'P', '9', '0'): return VP9;
+    case CV_FOURCC_MACRO('A', 'V', '0', '1'): return AV1;
+    default:
+        break;
+    }
     std::string msg = cv::format("Unknown codec FOURCC: 0x%08X (%s)", codec, fourccToString(codec).c_str());
     CV_LOG_WARNING(NULL, msg);
     CV_Error(Error::StsUnsupportedFormat, msg);
@@ -163,7 +190,6 @@ cv::cudacodec::detail::FFmpegVideoSource::FFmpegVideoSource(const String& fname,
 
     int codec = (int)cap.get(CAP_PROP_FOURCC);
     int pixelFormat = (int)cap.get(CAP_PROP_CODEC_PIXEL_FORMAT);
-
     format_.codec = FourccToCodec(codec);
     format_.height = cap.get(CAP_PROP_FRAME_HEIGHT);
     format_.width = cap.get(CAP_PROP_FRAME_WIDTH);
diff --git a/modules/cudacodec/test/test_video.cpp b/modules/cudacodec/test/test_video.cpp
index 720f952075..49ec7d742e 100644
--- a/modules/cudacodec/test/test_video.cpp
+++ b/modules/cudacodec/test/test_video.cpp
@@ -281,15 +281,11 @@ CUDA_TEST_P(DisplayResolution, Reader)
 CUDA_TEST_P(Video, Reader)
 {
     cv::cuda::setDevice(GET_PARAM(0).deviceID());
+    const std::string relativeFilePath = GET_PARAM(1);
 
     // CUDA demuxer has to fall back to ffmpeg to process "cv/video/768x576.avi"
-    if (GET_PARAM(1) == "cv/video/768x576.avi" && !videoio_registry::hasBackend(CAP_FFMPEG))
-        throw SkipTestException("FFmpeg backend not found");
-
-#ifdef _WIN32  // handle old FFmpeg backend
-    if (GET_PARAM(1) == "/cv/tracking/faceocc2/data/faceocc2.webm")
-        throw SkipTestException("Feature not yet supported by Windows FFmpeg shared library!");
-#endif
+    if (relativeFilePath == "cv/video/768x576.avi" && !videoio_registry::hasBackend(CAP_FFMPEG))
+        throw SkipTestException("FFmpeg backend not found  - SKIP");
 
     const std::vector<std::pair< cudacodec::ColorFormat, int>> formatsToChannels = {
         {cudacodec::ColorFormat::GRAY,1},
@@ -298,7 +294,7 @@ CUDA_TEST_P(Video, Reader)
         {cudacodec::ColorFormat::NV_NV12,1}
     };
 
-    std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../" + GET_PARAM(1);
+    std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../" + relativeFilePath;
     cv::Ptr<cv::cudacodec::VideoReader> reader = cv::cudacodec::createVideoReader(inputFile);
     ASSERT_FALSE(reader->set(cudacodec::ColorFormat::RGB));
     cv::cudacodec::FormatInfo fmt = reader->format();
@@ -818,13 +814,20 @@ INSTANTIATE_TEST_CASE_P(CUDA_Codec, Scaling, testing::Combine(
 
 INSTANTIATE_TEST_CASE_P(CUDA_Codec, DisplayResolution, ALL_DEVICES);
 
-#define VIDEO_SRC_R "highgui/video/big_buck_bunny.mp4", "cv/video/768x576.avi", "cv/video/1920x1080.avi", "highgui/video/big_buck_bunny.avi", \
+#ifdef _WIN32 // handle old FFmpeg backend - remove when windows shared library is updated
+#define VIDEO_SRC_R testing::Values("highgui/video/big_buck_bunny.mp4", "cv/video/768x576.avi", "cv/video/1920x1080.avi", "highgui/video/big_buck_bunny.avi", \
     "highgui/video/big_buck_bunny.h264", "highgui/video/big_buck_bunny.h265", "highgui/video/big_buck_bunny.mpg", \
-    "highgui/video/sample_322x242_15frames.yuv420p.libvpx-vp9.mp4", "highgui/video/sample_322x242_15frames.yuv420p.libaom-av1.mp4", \
-    "cv/tracking/faceocc2/data/faceocc2.webm"
-INSTANTIATE_TEST_CASE_P(CUDA_Codec, Video, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(VIDEO_SRC_R)));
+    "highgui/video/sample_322x242_15frames.yuv420p.libvpx-vp9.mp4")
+    //, "highgui/video/sample_322x242_15frames.yuv420p.libaom-av1.mp4", \
+    "cv/tracking/faceocc2/data/faceocc2.webm")
+#else
+#define VIDEO_SRC_R testing::Values("highgui/video/big_buck_bunny.mp4", "cv/video/768x576.avi", "cv/video/1920x1080.avi", "highgui/video/big_buck_bunny.avi", \
+    "highgui/video/big_buck_bunny.h264", "highgui/video/big_buck_bunny.h265", "highgui/video/big_buck_bunny.mpg", \
+    "highgui/video/sample_322x242_15frames.yuv420p.libvpx-vp9.mp4")
+    //, "highgui/video/sample_322x242_15frames.yuv420p.libaom-av1.mp4", \
+    "cv/tracking/faceocc2/data/faceocc2.webm", "highgui/video/sample_322x242_15frames.yuv420p.mpeg2video.mp4", "highgui/video/sample_322x242_15frames.yuv420p.mjpeg.mp4")
+#endif
+INSTANTIATE_TEST_CASE_P(CUDA_Codec, Video, testing::Combine(ALL_DEVICES,VIDEO_SRC_R));
 
 const color_conversion_params_t color_conversion_params[] =
 {
@@ -859,9 +862,11 @@ INSTANTIATE_TEST_CASE_P(CUDA_Codec, CheckExtraData, testing::Combine(
     ALL_DEVICES,
     testing::ValuesIn(check_extra_data_params)));
 
+#define VIDEO_SRC_KEY "highgui/video/big_buck_bunny.mp4", "cv/video/768x576.avi", "cv/video/1920x1080.avi", "highgui/video/big_buck_bunny.avi", \
+    "highgui/video/big_buck_bunny.h264", "highgui/video/big_buck_bunny.h265", "highgui/video/big_buck_bunny.mpg"
 INSTANTIATE_TEST_CASE_P(CUDA_Codec, CheckKeyFrame, testing::Combine(
     ALL_DEVICES,
-    testing::Values(VIDEO_SRC_R)));
+    testing::Values(VIDEO_SRC_KEY)));
 
 INSTANTIATE_TEST_CASE_P(CUDA_Codec, CheckParams, ALL_DEVICES);
 

From cc5c2760081b856374229fdf8ea32904edcf698b Mon Sep 17 00:00:00 2001
From: cudawarped <12133430+cudawarped@users.noreply.github.com>
Date: Thu, 22 Jun 2023 11:23:58 +0300
Subject: [PATCH 03/11] cudacodec::VideoReader - fix cudart defines

---
 modules/cudacodec/src/video_decoder.cpp | 4 ++--
 modules/cudacodec/src/video_reader.cpp  | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/cudacodec/src/video_decoder.cpp b/modules/cudacodec/src/video_decoder.cpp
index 470c402108..23a349c3b2 100644
--- a/modules/cudacodec/src/video_decoder.cpp
+++ b/modules/cudacodec/src/video_decoder.cpp
@@ -97,10 +97,10 @@ void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat)
                             cudaVideoCodec_UYVY     == _codec;
 
 #if defined (HAVE_CUDA)
-#if (CUDART_VERSION >= 6500)
+#if (CUDART_VERSION >= 6050)
     codecSupported |=       cudaVideoCodec_HEVC     == _codec;
 #endif
-#if  ((CUDART_VERSION == 7500) || (CUDART_VERSION >= 9000))
+#if  ((CUDART_VERSION == 7050) || (CUDART_VERSION >= 9000))
     codecSupported |=       cudaVideoCodec_VP8      == _codec ||
                             cudaVideoCodec_VP9      == _codec ||
                             cudaVideoCodec_AV1      == _codec ||
diff --git a/modules/cudacodec/src/video_reader.cpp b/modules/cudacodec/src/video_reader.cpp
index 5b231f2294..cf47d8399a 100644
--- a/modules/cudacodec/src/video_reader.cpp
+++ b/modules/cudacodec/src/video_reader.cpp
@@ -67,14 +67,14 @@ void cvtFromNv12(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int he
         outFrame.create(height, width, CV_8UC3);
         Npp8u* pSrc[2] = { decodedFrame.data, &decodedFrame.data[decodedFrame.step * height] };
         NppiSize oSizeROI = { width,height };
-#if (CUDART_VERSION < 10100)
+#if (CUDART_VERSION < 10010)
         cv::cuda::NppStreamHandler h(stream);
         if (videoFullRangeFlag)
             nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI));
         else {
             nppSafeCall(nppiNV12ToBGR_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI));
         }
-#elif (CUDART_VERSION >= 10100)
+#elif (CUDART_VERSION >= 10010)
         NppStreamContext nppStreamCtx;
         nppSafeCall(nppGetStreamContext(&nppStreamCtx));
         nppStreamCtx.hStream = StreamAccessor::getStream(stream);
@@ -316,7 +316,7 @@ namespace
     bool VideoReaderImpl::set(const ColorFormat colorFormat_) {
         if (!ValidColorFormat(colorFormat_)) return false;
         if (colorFormat_ == ColorFormat::BGR) {
-#if (CUDART_VERSION < 9200)
+#if (CUDART_VERSION < 9020)
             CV_LOG_DEBUG(NULL, "ColorFormat::BGR is not supported until CUDA 9.2, use default ColorFormat::BGRA.");
             return false;
 #elif (CUDART_VERSION < 11000)

From b677ac8910c3c99b56b0c0bdd25584638bd323db Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <alexander.smorkalov@xperience.ai>
Date: Thu, 22 Jun 2023 13:52:37 +0300
Subject: [PATCH 04/11] FFmpeg/4.x: update FFmpeg wrapper 2023.6

---
 modules/cudacodec/test/test_video.cpp | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/modules/cudacodec/test/test_video.cpp b/modules/cudacodec/test/test_video.cpp
index 49ec7d742e..7ecc2924bb 100644
--- a/modules/cudacodec/test/test_video.cpp
+++ b/modules/cudacodec/test/test_video.cpp
@@ -814,19 +814,12 @@ INSTANTIATE_TEST_CASE_P(CUDA_Codec, Scaling, testing::Combine(
 
 INSTANTIATE_TEST_CASE_P(CUDA_Codec, DisplayResolution, ALL_DEVICES);
 
-#ifdef _WIN32 // handle old FFmpeg backend - remove when windows shared library is updated
-#define VIDEO_SRC_R testing::Values("highgui/video/big_buck_bunny.mp4", "cv/video/768x576.avi", "cv/video/1920x1080.avi", "highgui/video/big_buck_bunny.avi", \
-    "highgui/video/big_buck_bunny.h264", "highgui/video/big_buck_bunny.h265", "highgui/video/big_buck_bunny.mpg", \
-    "highgui/video/sample_322x242_15frames.yuv420p.libvpx-vp9.mp4")
-    //, "highgui/video/sample_322x242_15frames.yuv420p.libaom-av1.mp4", \
-    "cv/tracking/faceocc2/data/faceocc2.webm")
-#else
 #define VIDEO_SRC_R testing::Values("highgui/video/big_buck_bunny.mp4", "cv/video/768x576.avi", "cv/video/1920x1080.avi", "highgui/video/big_buck_bunny.avi", \
     "highgui/video/big_buck_bunny.h264", "highgui/video/big_buck_bunny.h265", "highgui/video/big_buck_bunny.mpg", \
     "highgui/video/sample_322x242_15frames.yuv420p.libvpx-vp9.mp4")
     //, "highgui/video/sample_322x242_15frames.yuv420p.libaom-av1.mp4", \
     "cv/tracking/faceocc2/data/faceocc2.webm", "highgui/video/sample_322x242_15frames.yuv420p.mpeg2video.mp4", "highgui/video/sample_322x242_15frames.yuv420p.mjpeg.mp4")
-#endif
+
 INSTANTIATE_TEST_CASE_P(CUDA_Codec, Video, testing::Combine(ALL_DEVICES,VIDEO_SRC_R));
 
 const color_conversion_params_t color_conversion_params[] =

From d545bab37844341e543847d17c6e1708dea97b60 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <alexander.smorkalov@xperience.ai>
Date: Tue, 27 Jun 2023 22:20:24 +0300
Subject: [PATCH 05/11] Fixed memory leak in EllipseDetector and Mat
 addressing.

---
 modules/ximgproc/src/find_ellipses.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/modules/ximgproc/src/find_ellipses.cpp b/modules/ximgproc/src/find_ellipses.cpp
index 6b52be0eb3..66c61d7759 100644
--- a/modules/ximgproc/src/find_ellipses.cpp
+++ b/modules/ximgproc/src/find_ellipses.cpp
@@ -1272,9 +1272,9 @@ void EllipseDetectorImpl::preProcessing(Mat1b &image, Mat1b &dp, Mat1b &dn) {
 
     // buffer
     int *magBuffer[3];
-    void *buffer = malloc((imgSize.width + 2) * (imgSize.height + 2) +
-                          (imgSize.width + 2) * 3 * sizeof(int));
-    magBuffer[0] = (int *) buffer;
+    AutoBuffer<int> buffer((imgSize.width + 2) * (imgSize.height + 2) +
+                          (imgSize.width + 2) * 3);
+    magBuffer[0] = buffer.data();
     magBuffer[1] = magBuffer[0] + imgSize.width + 2;
     magBuffer[2] = magBuffer[1] + imgSize.width + 2;
     uchar *map = (uchar *) (magBuffer[2] + imgSize.width + 2);
@@ -1300,8 +1300,8 @@ void EllipseDetectorImpl::preProcessing(Mat1b &image, Mat1b &dp, Mat1b &dn) {
     // 2 - the pixel does belong to an edge
     for (int i = 0; i <= imgSize.height; i++) {
         int *tmpMag = magBuffer[(i > 0) + 1] + 1;
-        const short *tmpDx = (short *) (dx[i]);
-        const short *tmpDy = (short *) (dy[i]);
+        const short *tmpDx = dx.ptr<short>(i);
+        const short *tmpDy = dy.ptr<short>(i);
         uchar *tmpMap;
         int prevFlag = 0;
 
@@ -1980,4 +1980,4 @@ void findEllipses(
     Mat(_ellipses).copyTo(ellipses);
 }
 } // namespace ximgproc
-} // namespace cv
\ No newline at end of file
+} // namespace cv

From e963ec2e100cb93ab13656924369a09c8d510e22 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <alexander.smorkalov@xperience.ai>
Date: Mon, 10 Jul 2023 17:08:58 +0300
Subject: [PATCH 06/11] Wstringop-overflow warning fix in bio inspired module.

---
 .../src/transientareassegmentationmodule.cpp         | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/modules/bioinspired/src/transientareassegmentationmodule.cpp b/modules/bioinspired/src/transientareassegmentationmodule.cpp
index 0f36d6dd47..01360dc68c 100644
--- a/modules/bioinspired/src/transientareassegmentationmodule.cpp
+++ b/modules/bioinspired/src/transientareassegmentationmodule.cpp
@@ -136,7 +136,7 @@ class TransientAreasSegmentationModuleImpl : protected BasicRetinaFilter
     /**
      * @return the current parameters setup
      */
-    struct SegmentationParameters getParameters();
+    SegmentationParameters getParameters();
 
     /**
      * parameters setup display method
@@ -202,7 +202,7 @@ class TransientAreasSegmentationModuleImpl : protected BasicRetinaFilter
      */
     inline const std::valarray<float> &getMotionContextPicture() const {return _contextMotionEnergy;}
 
-    struct cv::bioinspired::SegmentationParameters _segmentationParameters;
+    cv::bioinspired::SegmentationParameters _segmentationParameters;
     // template buffers and related acess pointers
     std::valarray<float> _inputToSegment;
     std::valarray<float> _contextMotionEnergy;
@@ -233,7 +233,7 @@ class TransientAreasSegmentationModuleImpl_: public  TransientAreasSegmentationM
     inline virtual void setup(cv::FileStorage &fs, const bool applyDefaultSetupOnFailure) CV_OVERRIDE { _segmTool.setup(fs, applyDefaultSetupOnFailure); }
     inline virtual void setup(SegmentationParameters newParameters) CV_OVERRIDE { _segmTool.setup(newParameters); }
     inline virtual String printSetup() CV_OVERRIDE { return _segmTool.printSetup(); }
-    inline virtual struct SegmentationParameters getParameters() CV_OVERRIDE { return _segmTool.getParameters(); }
+    inline virtual SegmentationParameters getParameters() CV_OVERRIDE { return _segmTool.getParameters(); }
     inline virtual void write( String fs ) const CV_OVERRIDE { _segmTool.write(fs); }
     inline virtual void run(InputArray inputToSegment, const int channelIndex) CV_OVERRIDE { _segmTool.run(inputToSegment, channelIndex); }
     inline virtual void getSegmentationPicture(OutputArray transientAreas) CV_OVERRIDE { return _segmTool.getSegmentationPicture(transientAreas); }
@@ -285,7 +285,7 @@ void TransientAreasSegmentationModuleImpl::clearAllBuffers()
     _segmentedAreas=0;
 }
 
-struct SegmentationParameters TransientAreasSegmentationModuleImpl::getParameters()
+SegmentationParameters TransientAreasSegmentationModuleImpl::getParameters()
 {
     return _segmentationParameters;
 }
@@ -343,7 +343,7 @@ void TransientAreasSegmentationModuleImpl::setup(cv::FileStorage &fs, const bool
         std::cout<<"Retina::setup: resetting retina with default parameters"<<std::endl;
         if (applyDefaultSetupOnFailure)
         {
-            struct cv::bioinspired::SegmentationParameters defaults;
+            cv::bioinspired::SegmentationParameters defaults;
             setup(defaults);
         }
         std::cout<<"SegmentationModule::setup: wrong/unappropriate xml parameter file : error report :`n=>"<<e.what()<<std::endl;
@@ -356,7 +356,7 @@ void TransientAreasSegmentationModuleImpl::setup(cv::bioinspired::SegmentationPa
 {
 
     // copy structure contents
-    memcpy(&_segmentationParameters, &newParameters, sizeof(cv::bioinspired::SegmentationParameters));
+    _segmentationParameters = newParameters;
     // apply setup
     // init local motion energy extraction low pass filter
     BasicRetinaFilter::setLPfilterParameters(0, newParameters.localEnergy_temporalConstant, newParameters.localEnergy_spatialConstant);

From c944ec5f52dc20e929d5b778d838e1d4ada2f6ba Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <alexander.smorkalov@xperience.ai>
Date: Tue, 11 Jul 2023 18:41:04 +0300
Subject: [PATCH 07/11] Add Ubuntu 22.04 to CI.

---
 .github/workflows/PR-4.x.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/PR-4.x.yaml b/.github/workflows/PR-4.x.yaml
index 91e5b9b08b..250b5e157a 100644
--- a/.github/workflows/PR-4.x.yaml
+++ b/.github/workflows/PR-4.x.yaml
@@ -12,6 +12,9 @@ jobs:
   Ubuntu2004-x64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-4.x-U20.yaml@main
 
+  Ubuntu2204-x64:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-4.x-U22.yaml@main
+
   Ubuntu2004-x64-CUDA:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-4.x-U20-Cuda.yaml@main
 

From 0fe30d84d00ea32a2c052e1a7e27ec1b64aa5a78 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <alexander.smorkalov@xperience.ai>
Date: Mon, 17 Jul 2023 12:26:44 +0300
Subject: [PATCH 08/11] Added Java bindings for dnn_superres module.

---
 modules/dnn_superres/CMakeLists.txt                |  4 ++--
 .../misc/java/test/DnnSuperresTest.java            | 14 ++++++++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)
 create mode 100644 modules/dnn_superres/misc/java/test/DnnSuperresTest.java

diff --git a/modules/dnn_superres/CMakeLists.txt b/modules/dnn_superres/CMakeLists.txt
index e16f0b6d41..a3bba2f6de 100644
--- a/modules/dnn_superres/CMakeLists.txt
+++ b/modules/dnn_superres/CMakeLists.txt
@@ -6,5 +6,5 @@ ocv_define_module(
     opencv_imgproc
     opencv_dnn
     OPTIONAL opencv_quality
-    WRAP python
-)
\ No newline at end of file
+    WRAP python java
+)
diff --git a/modules/dnn_superres/misc/java/test/DnnSuperresTest.java b/modules/dnn_superres/misc/java/test/DnnSuperresTest.java
new file mode 100644
index 0000000000..710f063b5d
--- /dev/null
+++ b/modules/dnn_superres/misc/java/test/DnnSuperresTest.java
@@ -0,0 +1,14 @@
+package org.opencv.test.tracking;
+
+import org.opencv.core.Core;
+import org.opencv.core.CvException;
+import org.opencv.test.OpenCVTestCase;
+import org.opencv.dnn_superres.DnnSuperResImpl;
+
+public class DnnSuperresTest extends OpenCVTestCase {
+
+    public void testCreateSuperres() {
+        DnnSuperResImpl sr = DnnSuperResImpl.create();
+    }
+
+}

From c49a4202e0bcaa7f72e9dab5f18fcf082ec30d7e Mon Sep 17 00:00:00 2001
From: cudawarped <12133430+cudawarped@users.noreply.github.com>
Date: Sat, 10 Jun 2023 08:10:25 +0300
Subject: [PATCH 09/11] cudacodec: return luma hist from VideoReader::nextFrame
 if requested

---
 .../cudacodec/include/opencv2/cudacodec.hpp   | 37 +++++++++--
 .../misc/python/test/test_cudacodec.py        | 41 ++++++++----
 modules/cudacodec/src/video_decoder.cpp       | 55 +++++++++++-----
 modules/cudacodec/src/video_decoder.hpp       |  6 +-
 modules/cudacodec/src/video_parser.cpp        | 39 +++++------
 modules/cudacodec/src/video_reader.cpp        | 65 ++++++++++++++-----
 modules/cudacodec/src/video_source.cpp        |  3 +-
 modules/cudacodec/test/test_video.cpp         | 54 +++++++++++++++
 8 files changed, 231 insertions(+), 69 deletions(-)

diff --git a/modules/cudacodec/include/opencv2/cudacodec.hpp b/modules/cudacodec/include/opencv2/cudacodec.hpp
index af8f169c19..bb664386e8 100644
--- a/modules/cudacodec/include/opencv2/cudacodec.hpp
+++ b/modules/cudacodec/include/opencv2/cudacodec.hpp
@@ -295,7 +295,7 @@ enum ChromaFormat
 
 /** @brief Deinterlacing mode used by decoder.
 * @param Weave Weave both fields (no deinterlacing). For progressive content and for content that doesn't need deinterlacing.
-* Bob Drop one field.
+* @param Bob Drop one field.
 * @param Adaptive Adaptive deinterlacing needs more video memory than other deinterlacing modes.
 * */
 enum DeinterlaceMode
@@ -305,12 +305,22 @@ enum DeinterlaceMode
     Adaptive = 2
 };
 
+/** @brief Utility function demonstrating how to map the luma histogram when FormatInfo::videoFullRangeFlag == false
+    @param hist Luma histogram \a hist returned from VideoReader::nextFrame(GpuMat& frame, GpuMat& hist, Stream& stream).
+    @param histFull Host histogram equivelent to downloading \a hist after calling cuda::calcHist(InputArray frame, OutputArray hist, Stream& stream).
+
+    @note
+    -   This function demonstrates how to map the luma histogram back so that it is equivalent to the result obtained from cuda::calcHist()
+    if the returned frame was colorFormat::GRAY.
+ */
+CV_EXPORTS_W void MapHist(const GpuMat& hist, CV_OUT Mat& histFull);
+
 /** @brief Struct providing information about video file format. :
  */
 struct CV_EXPORTS_W_SIMPLE FormatInfo
 {
-    CV_WRAP FormatInfo() : nBitDepthMinus8(-1), nBitDepthChromaMinus8(-1), ulWidth(0), ulHeight(0), width(0), height(0), ulMaxWidth(0), ulMaxHeight(0), valid(false),
-        fps(0), ulNumDecodeSurfaces(0), videoFullRangeFlag(false) {};
+    CV_WRAP FormatInfo() : nBitDepthMinus8(-1), ulWidth(0), ulHeight(0), width(0), height(0), ulMaxWidth(0), ulMaxHeight(0), valid(false),
+        fps(0), ulNumDecodeSurfaces(0), videoFullRangeFlag(false), enableHistogram(false), nCounterBitDepth(0), nMaxHistogramBins(0){};
 
     CV_PROP_RW Codec codec;
     CV_PROP_RW ChromaFormat chromaFormat;
@@ -331,6 +341,9 @@ struct CV_EXPORTS_W_SIMPLE FormatInfo
     CV_PROP_RW cv::Rect srcRoi;//!< Region of interest decoded from video source.
     CV_PROP_RW cv::Rect targetRoi;//!< Region of interest in the output frame containing the decoded frame.
     CV_PROP_RW bool videoFullRangeFlag;//!< Output value indicating if the black level, luma and chroma of the source are represented using the full or limited range (AKA TV or "analogue" range) of values as defined in Annex E of the ITU-T Specification.  Internally the conversion from NV12 to BGR obeys ITU 709.
+    CV_PROP_RW bool enableHistogram;//!< Flag requesting histogram output if supported. Exception will be thrown when requested but not supported.
+    CV_PROP_RW int nCounterBitDepth;//!< Bit depth of histogram bins if histogram output is requested and supported.
+    CV_PROP_RW int nMaxHistogramBins;//!< Max number of histogram bins if histogram output is requested and supported.
 };
 
 /** @brief cv::cudacodec::VideoReader generic properties identifier.
@@ -376,6 +389,20 @@ class CV_EXPORTS_W VideoReader
      */
     CV_WRAP virtual bool nextFrame(CV_OUT GpuMat& frame, Stream &stream = Stream::Null()) = 0;
 
+    /** @brief Grabs, decodes and returns the next video frame and frame luma histogram.
+
+    @param [out] frame The video frame.
+    @param [out] histogram Histogram of the luma component of the encoded frame, see note.
+    @param stream Stream for the asynchronous version.
+    @return `false` if no frames have been grabbed.
+
+    If no frames have been grabbed (there are no more frames in video file), the methods return false.
+    The method throws an Exception if error occurs.
+
+    @note Histogram data is collected by NVDEC during the decoding process resulting in zero performance penalty. NVDEC computes the histogram data for only the luma component of decoded output, not on post-processed frame(i.e. when scaling, cropping, etc. applied).  If the source is encoded using a limited range of luma values (FormatInfo::videoFullRangeFlag == false) then the histogram bin values will correspond to to this limited range of values and will need to be mapped to contain the same output as cuda::calcHist().  The MapHist() utility function can be used to perform this mapping on the host if required.
+     */
+    CV_WRAP_AS(nextFrameWithHist) virtual bool nextFrame(CV_OUT GpuMat& frame, CV_OUT GpuMat& histogram, Stream& stream = Stream::Null()) = 0;
+
     /** @brief Returns information about video file format.
     */
     CV_WRAP virtual FormatInfo format() const = 0;
@@ -535,9 +562,10 @@ but it cannot go below the number determined by NVDEC.
 @param srcRoi Region of interest (x/width should be multiples of 4 and y/height multiples of 2) decoded from video source, defaults to the full frame.
 @param targetRoi Region of interest (x/width should be multiples of 4 and y/height multiples of 2) within the output frame to copy and resize the decoded frame to,
 defaults to the full frame.
+@param enableHistogram Request output of decoded luma histogram \a hist from VideoReader::nextFrame(GpuMat& frame, GpuMat& hist, Stream& stream), if hardware supported.
 */
 struct CV_EXPORTS_W_SIMPLE VideoReaderInitParams {
-    CV_WRAP VideoReaderInitParams() : udpSource(false), allowFrameDrop(false), minNumDecodeSurfaces(0), rawMode(0) {};
+    CV_WRAP VideoReaderInitParams() : udpSource(false), allowFrameDrop(false), minNumDecodeSurfaces(0), rawMode(0), enableHistogram(false){};
     CV_PROP_RW bool udpSource;
     CV_PROP_RW bool allowFrameDrop;
     CV_PROP_RW int minNumDecodeSurfaces;
@@ -545,6 +573,7 @@ struct CV_EXPORTS_W_SIMPLE VideoReaderInitParams {
     CV_PROP_RW cv::Size targetSz;
     CV_PROP_RW cv::Rect srcRoi;
     CV_PROP_RW cv::Rect targetRoi;
+    CV_PROP_RW bool enableHistogram;
 };
 
 /** @brief Creates video reader.
diff --git a/modules/cudacodec/misc/python/test/test_cudacodec.py b/modules/cudacodec/misc/python/test/test_cudacodec.py
index 3f41c3bbed..1e5d3755c8 100644
--- a/modules/cudacodec/misc/python/test/test_cudacodec.py
+++ b/modules/cudacodec/misc/python/test/test_cudacodec.py
@@ -14,36 +14,53 @@ def setUp(self):
     @unittest.skipIf('OPENCV_TEST_DATA_PATH' not in os.environ,
                      "OPENCV_TEST_DATA_PATH is not defined")
     def test_reader(self):
-        #Test the functionality but not the results of the video reader
+        # Test the functionality but not the results of the VideoReader
 
-        vid_path = os.environ['OPENCV_TEST_DATA_PATH'] + '/cv/video/1920x1080.avi'
+        vid_path = os.environ['OPENCV_TEST_DATA_PATH'] + '/highgui/video/big_buck_bunny.h264'
         try:
             reader = cv.cudacodec.createVideoReader(vid_path)
             format_info = reader.format()
             ret, gpu_mat = reader.nextFrame()
             self.assertTrue(ret)
-            self.assertTrue('GpuMat' in str(type(gpu_mat)), msg=type(gpu_mat))
+            self.assertTrue(isinstance(gpu_mat, cv.cuda.GpuMat), msg=type(gpu_mat))
             #TODO: print(cv.utils.dumpInputArray(gpu_mat)) # - no support for GpuMat
 
+            # Retrieve format info
             if(not format_info.valid):
               format_info = reader.format()
             sz = gpu_mat.size()
             self.assertTrue(sz[0] == format_info.width and sz[1] == format_info.height)
 
             # not checking output, therefore sepearate tests for different signatures is unecessary
-            ret, _gpu_mat2 = reader.nextFrame(gpu_mat)
-            #TODO: self.assertTrue(gpu_mat == gpu_mat2)
-            self.assertTrue(ret)
+            ret, gpu_mat_ = reader.nextFrame(gpu_mat)
+            self.assertTrue(ret and gpu_mat_.cudaPtr() == gpu_mat.cudaPtr())
 
+            # Pass VideoReaderInitParams to the decoder and initialization params to the source (cv::VideoCapture)
             params = cv.cudacodec.VideoReaderInitParams()
             params.rawMode = True
+            params.enableHistogramOutput = True
             ms_gs = 1234
+            post_processed_sz = (gpu_mat.size()[0]*2, gpu_mat.size()[1]*2)
+            params.targetSz = post_processed_sz
             reader = cv.cudacodec.createVideoReader(vid_path,[cv.CAP_PROP_OPEN_TIMEOUT_MSEC, ms_gs], params)
             ret, ms = reader.get(cv.CAP_PROP_OPEN_TIMEOUT_MSEC)
             self.assertTrue(ret and ms == ms_gs)
             ret, raw_mode = reader.getVideoReaderProps(cv.cudacodec.VideoReaderProps_PROP_RAW_MODE)
             self.assertTrue(ret and raw_mode)
 
+            # Retrieve image histogram
+            ret, gpu_mat, hist = reader.nextFrameWithHist()
+            self.assertTrue(ret and not gpu_mat.empty() and hist.size() == (256,1))
+            ret, gpu_mat_, hist_ = reader.nextFrameWithHist(gpu_mat, hist)
+            self.assertTrue(ret and not gpu_mat.empty() and hist.size() == (256,1))
+            self.assertTrue(gpu_mat_.cudaPtr() == gpu_mat.cudaPtr() and hist_.cudaPtr() == hist.cudaPtr())
+            hist_host = cv.cudacodec.MapHist(hist)
+            self.assertTrue(hist_host.shape == (1,256) and isinstance(hist_host, np.ndarray))
+
+            # Check post processing applied
+            self.assertTrue(gpu_mat.size() == post_processed_sz)
+
+            # Change color format
             ret, colour_code = reader.getVideoReaderProps(cv.cudacodec.VideoReaderProps_PROP_COLOR_FORMAT)
             self.assertTrue(ret and colour_code == cv.cudacodec.ColorFormat_BGRA)
             colour_code_gs = cv.cudacodec.ColorFormat_GRAY
@@ -51,6 +68,7 @@ def test_reader(self):
             ret, colour_code = reader.getVideoReaderProps(cv.cudacodec.VideoReaderProps_PROP_COLOR_FORMAT)
             self.assertTrue(ret and colour_code == colour_code_gs)
 
+            # Read raw encoded bitstream
             ret, i_base = reader.getVideoReaderProps(cv.cudacodec.VideoReaderProps_PROP_RAW_PACKAGES_BASE_INDEX)
             self.assertTrue(ret and i_base == 2.0)
             self.assertTrue(reader.grab())
@@ -75,8 +93,8 @@ def test_reader(self):
             else:
                 self.skipTest(e.err)
 
-    def test_writer_existence(self):
-        #Test at least the existence of wrapped functions for now
+    def test_writer(self):
+        # Test the functionality but not the results of the VideoWriter
 
         try:
             fd, fname = tempfile.mkstemp(suffix=".h264")
@@ -91,11 +109,12 @@ def test_writer_existence(self):
             writer.write(blankFrameIn)
             writer.release()
             encoder_params_out = writer.getEncoderParams()
-            self.assert_true(encoder_params_in.gopLength == encoder_params_out.gopLength)
+            self.assertTrue(encoder_params_in.gopLength == encoder_params_out.gopLength)
             cap = cv.VideoCapture(fname,cv.CAP_FFMPEG)
-            self.assert_true(cap.isOpened())
+            self.assertTrue(cap.isOpened())
             ret, blankFrameOut = cap.read()
-            self.assert_true(ret and blankFrameOut.shape == blankFrameIn.download().shape)
+            self.assertTrue(ret and blankFrameOut.shape == blankFrameIn.download().shape)
+            cap.release()
         except cv.error as e:
             self.assertEqual(e.code, cv.Error.StsNotImplemented)
             self.skipTest("Either NVCUVENC or a GPU hardware encoder is missing or the encoding profile is not supported.")
diff --git a/modules/cudacodec/src/video_decoder.cpp b/modules/cudacodec/src/video_decoder.cpp
index 23a349c3b2..10008d9b03 100644
--- a/modules/cudacodec/src/video_decoder.cpp
+++ b/modules/cudacodec/src/video_decoder.cpp
@@ -96,18 +96,18 @@ void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat)
                             cudaVideoCodec_YUYV     == _codec ||
                             cudaVideoCodec_UYVY     == _codec;
 
-#if defined (HAVE_CUDA)
 #if (CUDART_VERSION >= 6050)
-    codecSupported |=       cudaVideoCodec_HEVC     == _codec;
+    codecSupported |= cudaVideoCodec_HEVC == _codec;
+#endif
+#if (CUDART_VERSION >= 7050)
+    codecSupported |= cudaVideoCodec_YUV420 == _codec;
 #endif
 #if  ((CUDART_VERSION == 7050) || (CUDART_VERSION >= 9000))
-    codecSupported |=       cudaVideoCodec_VP8      == _codec ||
-                            cudaVideoCodec_VP9      == _codec ||
-                            cudaVideoCodec_AV1      == _codec ||
-                            cudaVideoCodec_YUV420   == _codec;
+    codecSupported |= cudaVideoCodec_VP8 == _codec || cudaVideoCodec_VP9 == _codec;
 #endif
+#if (CUDART_VERSION >= 9000)
+    codecSupported |= cudaVideoCodec_AV1;
 #endif
-
     CV_Assert(codecSupported);
     CV_Assert(  cudaVideoChromaFormat_Monochrome == _chromaFormat ||
                 cudaVideoChromaFormat_420        == _chromaFormat ||
@@ -123,31 +123,55 @@ void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat)
     cuSafeCall(cuCtxPushCurrent(ctx_));
     cuSafeCall(cuvidGetDecoderCaps(&decodeCaps));
     cuSafeCall(cuCtxPopCurrent(NULL));
-    if (!(decodeCaps.bIsSupported && (decodeCaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12)))){
-        CV_LOG_ERROR(NULL, "Video source is not supported by hardware video decoder.");
-        CV_Error(Error::StsUnsupportedFormat, "Video source is not supported by hardware video decoder");
+    if (!(decodeCaps.bIsSupported && (decodeCaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12)))) {
+        CV_Error(Error::StsUnsupportedFormat, "Video source is not supported by hardware video decoder refer to Nvidia's GPU Support Matrix to confirm your GPU supports hardware decoding of the video source's codec.");
+    }
+
+    if (videoFormat.enableHistogram) {
+        if (!decodeCaps.bIsHistogramSupported) {
+            CV_Error(Error::StsBadArg, "Luma histogram output is not supported for current codec and/or on current device.");
+        }
+
+        if (decodeCaps.nCounterBitDepth != 32) {
+            std::ostringstream error;
+            error << "Luma histogram output disabled due to current device using " << decodeCaps.nCounterBitDepth << " bit bins. Histogram output only supports 32 bit bins.";
+            CV_Error(Error::StsBadArg, error.str());
+        }
+        else {
+            videoFormat_.nCounterBitDepth = decodeCaps.nCounterBitDepth;
+            videoFormat_.nMaxHistogramBins = decodeCaps.nMaxHistogramBins;
+        }
     }
+
     CV_Assert(videoFormat.ulWidth >= decodeCaps.nMinWidth &&
         videoFormat.ulHeight >= decodeCaps.nMinHeight &&
         videoFormat.ulWidth <= decodeCaps.nMaxWidth &&
         videoFormat.ulHeight <= decodeCaps.nMaxHeight);
 
-    CV_Assert((videoFormat.width >> 4)* (videoFormat.height >> 4) <= decodeCaps.nMaxMBCount);
+    CV_Assert((videoFormat.width >> 4) * (videoFormat.height >> 4) <= decodeCaps.nMaxMBCount);
+#else
+    if (videoFormat.enableHistogram) {
+        CV_Error(Error::StsBadArg, "Luma histogram output is not supported when CUDA Toolkit version <= 9.0.");
+    }
 #endif
+
     // Create video decoder
     CUVIDDECODECREATEINFO createInfo_ = {};
+#if (CUDART_VERSION >= 9000)
+    createInfo_.enableHistogram = videoFormat.enableHistogram;
+    createInfo_.bitDepthMinus8 = videoFormat.nBitDepthMinus8;
+    createInfo_.ulMaxWidth = videoFormat.ulMaxWidth;
+    createInfo_.ulMaxHeight = videoFormat.ulMaxHeight;
+#endif
     createInfo_.CodecType           = _codec;
     createInfo_.ulWidth             = videoFormat.ulWidth;
     createInfo_.ulHeight            = videoFormat.ulHeight;
     createInfo_.ulNumDecodeSurfaces = videoFormat.ulNumDecodeSurfaces;
     createInfo_.ChromaFormat    = _chromaFormat;
-    createInfo_.bitDepthMinus8 = videoFormat.nBitDepthMinus8;
     createInfo_.OutputFormat    = cudaVideoSurfaceFormat_NV12;
     createInfo_.DeinterlaceMode = static_cast<cudaVideoDeinterlaceMode>(videoFormat.deinterlaceMode);
     createInfo_.ulTargetWidth       = videoFormat.width;
     createInfo_.ulTargetHeight      = videoFormat.height;
-    createInfo_.ulMaxWidth          = videoFormat.ulMaxWidth;
-    createInfo_.ulMaxHeight         = videoFormat.ulMaxHeight;
     createInfo_.display_area.left   = videoFormat.displayArea.x;
     createInfo_.display_area.right  = videoFormat.displayArea.x + videoFormat.displayArea.width;
     createInfo_.display_area.top    = videoFormat.displayArea.y;
@@ -169,12 +193,10 @@ void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat)
 
 int cv::cudacodec::detail::VideoDecoder::reconfigure(const FormatInfo& videoFormat) {
     if (videoFormat.nBitDepthMinus8 != videoFormat_.nBitDepthMinus8 || videoFormat.nBitDepthChromaMinus8 != videoFormat_.nBitDepthChromaMinus8) {
-        CV_LOG_ERROR(NULL, "Reconfigure Not supported for bit depth change");
         CV_Error(Error::StsUnsupportedFormat, "Reconfigure Not supported for bit depth change");
     }
 
     if (videoFormat.chromaFormat != videoFormat_.chromaFormat) {
-        CV_LOG_ERROR(NULL, "Reconfigure Not supported for chroma format change");
         CV_Error(Error::StsUnsupportedFormat, "Reconfigure Not supported for chroma format change");
     }
 
@@ -183,7 +205,6 @@ int cv::cudacodec::detail::VideoDecoder::reconfigure(const FormatInfo& videoForm
     if ((videoFormat.ulWidth > videoFormat_.ulMaxWidth) || (videoFormat.ulHeight > videoFormat_.ulMaxHeight)) {
         // For VP9, let driver  handle the change if new width/height > maxwidth/maxheight
         if (videoFormat.codec != Codec::VP9) {
-            CV_LOG_ERROR(NULL, "Reconfigure Not supported when width/height > maxwidth/maxheight");
             CV_Error(Error::StsUnsupportedFormat, "Reconfigure Not supported when width/height > maxwidth/maxheight");
         }
     }
diff --git a/modules/cudacodec/src/video_decoder.hpp b/modules/cudacodec/src/video_decoder.hpp
index 96338d7e4b..a32bf1bb0a 100644
--- a/modules/cudacodec/src/video_decoder.hpp
+++ b/modules/cudacodec/src/video_decoder.hpp
@@ -49,11 +49,12 @@ namespace cv { namespace cudacodec { namespace detail {
 class VideoDecoder
 {
 public:
-    VideoDecoder(const Codec& codec, const int minNumDecodeSurfaces, cv::Size targetSz, cv::Rect srcRoi, cv::Rect targetRoi, CUcontext ctx, CUvideoctxlock lock) :
+    VideoDecoder(const Codec& codec, const int minNumDecodeSurfaces, cv::Size targetSz, cv::Rect srcRoi, cv::Rect targetRoi, const bool enableHistogram, CUcontext ctx, CUvideoctxlock lock) :
         ctx_(ctx), lock_(lock), decoder_(0)
     {
         videoFormat_.codec = codec;
         videoFormat_.ulNumDecodeSurfaces = minNumDecodeSurfaces;
+        videoFormat_.enableHistogram = enableHistogram;
         // alignment enforced by nvcuvid, likely due to chroma subsampling
         videoFormat_.targetSz.width = targetSz.width - targetSz.width % 2; videoFormat_.targetSz.height = targetSz.height - targetSz.height % 2;
         videoFormat_.srcRoi.x = srcRoi.x - srcRoi.x % 4; videoFormat_.srcRoi.width = srcRoi.width - srcRoi.width % 4;
@@ -88,13 +89,14 @@ class VideoDecoder
 
     cudaVideoChromaFormat chromaFormat() const { return static_cast<cudaVideoChromaFormat>(videoFormat_.chromaFormat); }
     int nBitDepthMinus8() const { return videoFormat_.nBitDepthMinus8; }
+    bool enableHistogram() const { return videoFormat_.enableHistogram; }
 
     bool decodePicture(CUVIDPICPARAMS* picParams)
     {
         return cuvidDecodePicture(decoder_, picParams) == CUDA_SUCCESS;
     }
 
-    cuda::GpuMat mapFrame(int picIdx, CUVIDPROCPARAMS& videoProcParams)
+    GpuMat mapFrame(int picIdx, CUVIDPROCPARAMS& videoProcParams)
     {
         CUdeviceptr ptr;
         unsigned int pitch;
diff --git a/modules/cudacodec/src/video_parser.cpp b/modules/cudacodec/src/video_parser.cpp
index 459db17da9..1aba16d585 100644
--- a/modules/cudacodec/src/video_parser.cpp
+++ b/modules/cudacodec/src/video_parser.cpp
@@ -78,9 +78,18 @@ bool cv::cudacodec::detail::VideoParser::parseVideoData(const unsigned char* dat
     if (rawMode)
         currentFramePackets.push_back(RawPacket(data, size, containsKeyFrame));
 
-    if (cuvidParseVideoData(parser_, &packet) != CUDA_SUCCESS)
-    {
-        CV_LOG_ERROR(NULL, "Call to cuvidParseVideoData failed!");
+    CUresult retVal = CUDA_SUCCESS;
+    try {
+        retVal = cuvidParseVideoData(parser_, &packet);
+    }
+    catch(const cv::Exception& e) {
+        CV_LOG_ERROR(NULL, e.msg);
+        hasError_ = true;
+        frameQueue_->endDecode();
+        return false;
+    }
+
+    if (retVal != CUDA_SUCCESS) {
         hasError_ = true;
         frameQueue_->endDecode();
         return false;
@@ -149,26 +158,18 @@ int CUDAAPI cv::cudacodec::detail::VideoParser::HandleVideoSequence(void* userDa
         maxH = format->coded_height;
     newFormat.ulMaxWidth = maxW;
     newFormat.ulMaxHeight = maxH;
+    newFormat.enableHistogram = thiz->videoDecoder_->enableHistogram();
 
     thiz->frameQueue_->waitUntilEmpty();
     int retVal = newFormat.ulNumDecodeSurfaces;
-    try
-    {
-        if (thiz->videoDecoder_->inited()) {
-            retVal = thiz->videoDecoder_->reconfigure(newFormat);
-            if (retVal > 1 && newFormat.ulNumDecodeSurfaces != thiz->frameQueue_->getMaxSz())
-                thiz->frameQueue_->resize(newFormat.ulNumDecodeSurfaces);
-        }
-        else {
-            thiz->frameQueue_->init(newFormat.ulNumDecodeSurfaces);
-            thiz->videoDecoder_->create(newFormat);
-        }
+    if (thiz->videoDecoder_->inited()) {
+        retVal = thiz->videoDecoder_->reconfigure(newFormat);
+        if (retVal > 1 && newFormat.ulNumDecodeSurfaces != thiz->frameQueue_->getMaxSz())
+            thiz->frameQueue_->resize(newFormat.ulNumDecodeSurfaces);
     }
-    catch (const cv::Exception&)
-    {
-        CV_LOG_ERROR(NULL, "Attempt to configure Nvidia decoder failed!");
-        thiz->hasError_ = true;
-        retVal = 0;
+    else {
+        thiz->frameQueue_->init(newFormat.ulNumDecodeSurfaces);
+        thiz->videoDecoder_->create(newFormat);
     }
     return retVal;
 }
diff --git a/modules/cudacodec/src/video_reader.cpp b/modules/cudacodec/src/video_reader.cpp
index cf47d8399a..b6ef2ca537 100644
--- a/modules/cudacodec/src/video_reader.cpp
+++ b/modules/cudacodec/src/video_reader.cpp
@@ -50,6 +50,7 @@ using namespace cv::cudacodec;
 
 Ptr<VideoReader> cv::cudacodec::createVideoReader(const String&, const std::vector<int>&, const VideoReaderInitParams) { throw_no_cuda(); return Ptr<VideoReader>(); }
 Ptr<VideoReader> cv::cudacodec::createVideoReader(const Ptr<RawVideoSource>&, const VideoReaderInitParams) { throw_no_cuda(); return Ptr<VideoReader>(); }
+void cv::cudacodec::MapHist(const GpuMat&, Mat&) { throw_no_cuda(); }
 
 #else // HAVE_NVCUVID
 
@@ -111,11 +112,13 @@ namespace
     {
     public:
         explicit VideoReaderImpl(const Ptr<VideoSource>& source, const int minNumDecodeSurfaces, const bool allowFrameDrop = false , const bool udpSource = false,
-            const Size targetSz = Size(), const Rect srcRoi = Rect(), const Rect targetRoi = Rect());
+            const Size targetSz = Size(), const Rect srcRoi = Rect(), const Rect targetRoi = Rect(), const bool enableHistogram = false);
         ~VideoReaderImpl();
 
         bool nextFrame(GpuMat& frame, Stream& stream) CV_OVERRIDE;
 
+        bool nextFrame(GpuMat& frame, GpuMat& histogram, Stream& stream) CV_OVERRIDE;
+
         FormatInfo format() const CV_OVERRIDE;
 
         bool grab(Stream& stream) CV_OVERRIDE;
@@ -132,7 +135,7 @@ namespace
         bool get(const int propertyId, double& propertyVal) const CV_OVERRIDE;
 
     private:
-        bool internalGrab(GpuMat& frame, Stream& stream);
+        bool internalGrab(GpuMat & frame, GpuMat & histogram, Stream & stream);
         void waitForDecoderInit();
 
         Ptr<VideoSource> videoSource_;
@@ -145,13 +148,16 @@ namespace
 
         std::deque< std::pair<CUVIDPARSERDISPINFO, CUVIDPROCPARAMS> > frames_;
         std::vector<RawPacket> rawPackets;
-        GpuMat lastFrame;
+        GpuMat lastFrame, lastHistogram;
         static const int decodedFrameIdx = 0;
         static const int extraDataIdx = 1;
         static const int rawPacketsBaseIdx = 2;
         ColorFormat colorFormat = ColorFormat::BGRA;
+        static const String errorMsg;
     };
 
+    const String VideoReaderImpl::errorMsg = "Parsing/Decoding video source failed, check GPU memory is available and GPU supports requested functionality.";
+
     FormatInfo VideoReaderImpl::format() const
     {
         return videoSource_->format();
@@ -161,13 +167,13 @@ namespace
         for (;;) {
             if (videoDecoder_->inited()) break;
             if (videoParser_->hasError() || frameQueue_->isEndOfDecode())
-                CV_Error(Error::StsError, "Parsing/Decoding video source failed, check GPU memory is available and GPU supports hardware decoding.");
+                CV_Error(Error::StsError, errorMsg);
             Thread::sleep(1);
         }
     }
 
     VideoReaderImpl::VideoReaderImpl(const Ptr<VideoSource>& source, const int minNumDecodeSurfaces, const bool allowFrameDrop, const bool udpSource,
-        const Size targetSz, const Rect srcRoi, const Rect targetRoi) :
+        const Size targetSz, const Rect srcRoi, const Rect targetRoi, const bool enableHistogram) :
         videoSource_(source),
         lock_(0)
     {
@@ -179,7 +185,7 @@ namespace
         cuSafeCall( cuCtxGetCurrent(&ctx) );
         cuSafeCall( cuvidCtxLockCreate(&lock_, ctx) );
         frameQueue_.reset(new FrameQueue());
-        videoDecoder_.reset(new VideoDecoder(videoSource_->format().codec, minNumDecodeSurfaces, targetSz, srcRoi, targetRoi, ctx, lock_));
+        videoDecoder_.reset(new VideoDecoder(videoSource_->format().codec, minNumDecodeSurfaces, targetSz, srcRoi, targetRoi, enableHistogram, ctx, lock_));
         videoParser_.reset(new VideoParser(videoDecoder_, frameQueue_, allowFrameDrop, udpSource));
         videoSource_->setVideoParser(videoParser_);
         videoSource_->start();
@@ -203,10 +209,10 @@ namespace
         CUvideoctxlock m_lock;
     };
 
-    bool VideoReaderImpl::internalGrab(GpuMat& frame, Stream& stream) {
+    bool VideoReaderImpl::internalGrab(GpuMat& frame, GpuMat& histogram, Stream& stream) {
         if (videoParser_->hasError())
-            CV_Error(Error::StsError, "Parsing/Decoding video source failed, check GPU memory is available and GPU supports hardware decoding.");
-
+            CV_Error(Error::StsError, errorMsg);
+        cudacodec::FormatInfo fmt;
         if (frames_.empty())
         {
             CUVIDPARSERDISPINFO displayInfo;
@@ -217,7 +223,7 @@ namespace
                     break;
 
                 if (videoParser_->hasError())
-                    CV_Error(Error::StsError, "Parsing/Decoding video source failed, check GPU memory is available and GPU supports hardware decoding.");
+                    CV_Error(Error::StsError, errorMsg);
 
                 if (frameQueue_->isEndOfDecode())
                     return false;
@@ -228,7 +234,8 @@ namespace
 
             bool isProgressive = displayInfo.progressive_frame != 0;
             const int num_fields = isProgressive ? 1 : 2 + displayInfo.repeat_first_field;
-            videoSource_->updateFormat(videoDecoder_->format());
+            fmt = videoDecoder_->format();
+            videoSource_->updateFormat(fmt);
 
             for (int active_field = 0; active_field < num_fields; ++active_field)
             {
@@ -254,11 +261,21 @@ namespace
         {
             VideoCtxAutoLock autoLock(lock_);
 
+            unsigned long long cuHistogramPtr = 0;
+            if (fmt.enableHistogram)
+                frameInfo.second.histogram_dptr = &cuHistogramPtr;
+
             // map decoded video frame to CUDA surface
             GpuMat decodedFrame = videoDecoder_->mapFrame(frameInfo.first.picture_index, frameInfo.second);
 
             cvtFromNv12(decodedFrame, frame, videoDecoder_->targetWidth(), videoDecoder_->targetHeight(), colorFormat, videoDecoder_->format().videoFullRangeFlag, stream);
 
+            if (fmt.enableHistogram) {
+                const size_t histogramSz = 4 * fmt.nMaxHistogramBins;
+                histogram.create(1, fmt.nMaxHistogramBins, CV_32S);
+                cuSafeCall(cuMemcpyDtoDAsync((CUdeviceptr)(histogram.data), cuHistogramPtr, histogramSz, StreamAccessor::getStream(stream)));
+            }
+
             // unmap video frame
             // unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding)
             videoDecoder_->unmapFrame(decodedFrame);
@@ -272,7 +289,7 @@ namespace
     }
 
     bool VideoReaderImpl::grab(Stream& stream) {
-        return internalGrab(lastFrame, stream);
+        return internalGrab(lastFrame, lastHistogram, stream);
     }
 
     bool VideoReaderImpl::retrieve(OutputArray frame, const size_t idx) const {
@@ -387,7 +404,13 @@ namespace
 
     bool VideoReaderImpl::nextFrame(GpuMat& frame, Stream& stream)
     {
-        if (!internalGrab(frame, stream))
+        GpuMat tmp;
+        return nextFrame(frame, tmp, stream);
+    }
+
+    bool VideoReaderImpl::nextFrame(GpuMat& frame, GpuMat& histogram, Stream& stream)
+    {
+        if (!internalGrab(frame, histogram, stream))
             return false;
         return true;
     }
@@ -412,14 +435,26 @@ Ptr<VideoReader> cv::cudacodec::createVideoReader(const String& filename, const
     }
 
     return makePtr<VideoReaderImpl>(videoSource, params.minNumDecodeSurfaces, params.allowFrameDrop, params.udpSource, params.targetSz,
-        params.srcRoi, params.targetRoi);
+        params.srcRoi, params.targetRoi, params.enableHistogram);
 }
 
 Ptr<VideoReader> cv::cudacodec::createVideoReader(const Ptr<RawVideoSource>& source, const VideoReaderInitParams params)
 {
     Ptr<VideoSource> videoSource(new RawVideoSourceWrapper(source, params.rawMode));
     return makePtr<VideoReaderImpl>(videoSource, params.minNumDecodeSurfaces, params.allowFrameDrop, params.udpSource, params.targetSz,
-        params.srcRoi, params.targetRoi);
+        params.srcRoi, params.targetRoi, params.enableHistogram);
+}
+
+void cv::cudacodec::MapHist(const GpuMat& hist, Mat& histFull) {
+    Mat histHost; hist.download(histHost);
+    histFull.create(histHost.size(), histHost.type());
+    histFull = 0;
+    const float scale = 255.0f / 219.0f;
+    const int offset = 16;
+    for (int iScaled = 0; iScaled < histHost.cols; iScaled++) {
+        const int iHistFull = std::min(std::max(0, static_cast<int>(std::round((iScaled - offset) * scale))), static_cast<int>(histFull.total()) - 1);
+        histFull.at<int>(iHistFull) += histHost.at<int>(iScaled);
+    }
 }
 
 #endif // HAVE_NVCUVID
diff --git a/modules/cudacodec/src/video_source.cpp b/modules/cudacodec/src/video_source.cpp
index b58d753f74..a81b75e366 100644
--- a/modules/cudacodec/src/video_source.cpp
+++ b/modules/cudacodec/src/video_source.cpp
@@ -137,7 +137,8 @@ void cv::cudacodec::detail::RawVideoSourceWrapper::readLoop(void* userData)
             break;
     }
 
-    thiz->parseVideoData(0, 0, false, false, true);
+    if(!thiz->hasError_)
+        thiz->parseVideoData(0, 0, false, false, true);
 }
 
 #endif // HAVE_NVCUVID
diff --git a/modules/cudacodec/test/test_video.cpp b/modules/cudacodec/test/test_video.cpp
index 7ecc2924bb..ead5fa944c 100644
--- a/modules/cudacodec/test/test_video.cpp
+++ b/modules/cudacodec/test/test_video.cpp
@@ -92,6 +92,11 @@ PARAM_TEST_CASE(VideoReadRaw, cv::cuda::DeviceInfo, std::string)
 {
 };
 
+typedef tuple<std::string, bool> histogram_params_t;
+PARAM_TEST_CASE(Histogram, cv::cuda::DeviceInfo, histogram_params_t)
+{
+};
+
 PARAM_TEST_CASE(CheckKeyFrame, cv::cuda::DeviceInfo, std::string)
 {
 };
@@ -480,6 +485,46 @@ CUDA_TEST_P(VideoReadRaw, Reader)
     ASSERT_EQ(0, remove(fileNameOut.c_str()));
 }
 
+CUDA_TEST_P(Histogram, Reader)
+{
+    cuda::setDevice(GET_PARAM(0).deviceID());
+    const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../" + get<0>(GET_PARAM(1));
+    const bool histAvailable = get<1>(GET_PARAM(1));
+    cudacodec::VideoReaderInitParams params;
+    params.enableHistogram = histAvailable;
+    Ptr<cudacodec::VideoReader> reader;
+    try {
+        reader = cudacodec::createVideoReader(inputFile, {}, params);
+    }
+    catch (const cv::Exception& e) {
+        throw SkipTestException(e.msg);
+    }
+    const cudacodec::FormatInfo fmt = reader->format();
+    ASSERT_EQ(histAvailable, fmt.enableHistogram);
+    reader->set(cudacodec::ColorFormat::GRAY);
+    GpuMat frame, hist;
+    reader->nextFrame(frame, hist);
+    if (histAvailable) {
+        ASSERT_TRUE(!hist.empty());
+        Mat frameHost, histGsHostFloat, histGs, histHost;
+        frame.download(frameHost);
+        const int histSize = 256;
+        const float range[] = { 0, 256 };
+        const float* histRange[] = { range };
+        cv::calcHist(&frameHost, 1, 0, Mat(), histGsHostFloat, 1, &histSize, histRange);
+        histGsHostFloat.convertTo(histGs, CV_32S);
+        if (fmt.videoFullRangeFlag)
+            hist.download(histHost);
+        else
+            cudacodec::MapHist(hist, histHost);
+        const double err = cv::norm(histGs.t(), histHost, NORM_INF);
+        ASSERT_EQ(err, 0);
+    }
+    else {
+        ASSERT_TRUE(hist.empty());
+    }
+}
+
 CUDA_TEST_P(CheckParams, Reader)
 {
     std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../highgui/video/big_buck_bunny.mp4";
@@ -844,6 +889,15 @@ INSTANTIATE_TEST_CASE_P(CUDA_Codec, VideoReadRaw, testing::Combine(
     ALL_DEVICES,
     testing::Values(VIDEO_SRC_RW)));
 
+const histogram_params_t histogram_params[] =
+{
+    histogram_params_t("highgui/video/big_buck_bunny.mp4", false),
+    histogram_params_t("highgui/video/big_buck_bunny.h264", true),
+    histogram_params_t("highgui/video/big_buck_bunny_full_color_range.h264", true),
+};
+
+INSTANTIATE_TEST_CASE_P(CUDA_Codec, Histogram, testing::Combine(ALL_DEVICES,testing::ValuesIn(histogram_params)));
+
 const check_extra_data_params_t check_extra_data_params[] =
 {
     check_extra_data_params_t("highgui/video/big_buck_bunny.mp4", 45),

From a06aecea955b1fa571cfa0302ffc8faeda85c48f Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <alexander.smorkalov@xperience.ai>
Date: Thu, 20 Jul 2023 10:58:38 +0300
Subject: [PATCH 10/11] Fixed several test failures in Python tests for CUDA
 modules.

---
 .../misc/python/test/test_cudacodec.py        | 20 +++++++++++--------
 .../python/test/test_nvidiaopticalflow.py     | 12 +++++------
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/modules/cudacodec/misc/python/test/test_cudacodec.py b/modules/cudacodec/misc/python/test/test_cudacodec.py
index 1e5d3755c8..cf20daeb18 100644
--- a/modules/cudacodec/misc/python/test/test_cudacodec.py
+++ b/modules/cudacodec/misc/python/test/test_cudacodec.py
@@ -38,7 +38,7 @@ def test_reader(self):
             # Pass VideoReaderInitParams to the decoder and initialization params to the source (cv::VideoCapture)
             params = cv.cudacodec.VideoReaderInitParams()
             params.rawMode = True
-            params.enableHistogramOutput = True
+            params.enableHistogram = False
             ms_gs = 1234
             post_processed_sz = (gpu_mat.size()[0]*2, gpu_mat.size()[1]*2)
             params.targetSz = post_processed_sz
@@ -48,14 +48,12 @@ def test_reader(self):
             ret, raw_mode = reader.getVideoReaderProps(cv.cudacodec.VideoReaderProps_PROP_RAW_MODE)
             self.assertTrue(ret and raw_mode)
 
-            # Retrieve image histogram
+            # Retrieve image histogram. Not all GPUs support histogram. Just check the method is called correctly
             ret, gpu_mat, hist = reader.nextFrameWithHist()
-            self.assertTrue(ret and not gpu_mat.empty() and hist.size() == (256,1))
+            self.assertTrue(ret and not gpu_mat.empty())
             ret, gpu_mat_, hist_ = reader.nextFrameWithHist(gpu_mat, hist)
-            self.assertTrue(ret and not gpu_mat.empty() and hist.size() == (256,1))
-            self.assertTrue(gpu_mat_.cudaPtr() == gpu_mat.cudaPtr() and hist_.cudaPtr() == hist.cudaPtr())
-            hist_host = cv.cudacodec.MapHist(hist)
-            self.assertTrue(hist_host.shape == (1,256) and isinstance(hist_host, np.ndarray))
+            self.assertTrue(ret and not gpu_mat.empty())
+            self.assertTrue(gpu_mat_.cudaPtr() == gpu_mat.cudaPtr())
 
             # Check post processing applied
             self.assertTrue(gpu_mat.size() == post_processed_sz)
@@ -93,6 +91,12 @@ def test_reader(self):
             else:
                 self.skipTest(e.err)
 
+    def test_map_histogram(self):
+        hist = cv.cuda_GpuMat((1,256), cv.CV_8UC1)
+        hist.setTo(1)
+        hist_host = cv.cudacodec.MapHist(hist)
+        self.assertTrue(hist_host.shape == (256, 1) and isinstance(hist_host, np.ndarray))
+
     def test_writer(self):
         # Test the functionality but not the results of the VideoWriter
 
@@ -122,4 +126,4 @@ def test_writer(self):
         os.remove(fname)
 
 if __name__ == '__main__':
-    NewOpenCVTests.bootstrap()
\ No newline at end of file
+    NewOpenCVTests.bootstrap()
diff --git a/modules/cudaoptflow/misc/python/test/test_nvidiaopticalflow.py b/modules/cudaoptflow/misc/python/test/test_nvidiaopticalflow.py
index 94822c4081..7ba6abd013 100644
--- a/modules/cudaoptflow/misc/python/test/test_nvidiaopticalflow.py
+++ b/modules/cudaoptflow/misc/python/test/test_nvidiaopticalflow.py
@@ -22,15 +22,15 @@ def test_calc(self):
         cuMat1 = cv.cuda_GpuMat(npMat1)
         cuMat2 = cv.cuda_GpuMat(npMat2)
         try:
-            nvof = cv.cuda_NvidiaOpticalFlow_1_0.create(cuMat1.shape[1], cuMat1.shape[0], 5, False, False, False, 0)
-            flow = nvof.calc(cuMat1, cuMat2, None)
-            self.assertTrue(flow.shape[1] > 0 and flow.shape[0] > 0)
-            flowUpSampled = nvof.upSampler(flow[0], cuMat1.shape[1], cuMat1.shape[0], nvof.getGridSize(), None)
+            nvof = cv.cuda_NvidiaOpticalFlow_1_0.create((npMat1.shape[1], npMat1.shape[0]), 5, False, False, False, 0)
+            flow, cost = nvof.calc(cuMat1, cuMat2, None)
+            self.assertTrue(flow.size()[1] > 0 and flow.size()[0] > 0)
+            flowUpSampled = nvof.upSampler(flow, (npMat1.shape[1], npMat1.shape[0]), nvof.getGridSize(), None)
             nvof.collectGarbage()
+            self.assertTrue(flowUpSampled.size()[1] > 0 and flowUpSampled.size()[0] > 0)
         except cv.error as e:
             if e.code == cv.Error.StsBadFunc or e.code == cv.Error.StsBadArg or e.code == cv.Error.StsNullPtr:
                 self.skipTest("Algorithm is not supported in the current environment")
-        self.assertTrue(flowUpSampled.shape[1] > 0 and flowUpSampled.shape[0] > 0)
 
 if __name__ == '__main__':
-    NewOpenCVTests.bootstrap()
\ No newline at end of file
+    NewOpenCVTests.bootstrap()

From 68c708d833db30b9c455da7aede189be8523b5b6 Mon Sep 17 00:00:00 2001
From: Vadim Levin <vadim.levin@opencv.ai>
Date: Wed, 19 Jul 2023 21:56:27 +0300
Subject: [PATCH 11/11] fix: remove using namespace cuda from public header

---
 modules/cudacodec/include/opencv2/cudacodec.hpp | 16 +++++++---------
 modules/cudacodec/src/video_decoder.hpp         |  2 +-
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/modules/cudacodec/include/opencv2/cudacodec.hpp b/modules/cudacodec/include/opencv2/cudacodec.hpp
index bb664386e8..42325c6461 100644
--- a/modules/cudacodec/include/opencv2/cudacodec.hpp
+++ b/modules/cudacodec/include/opencv2/cudacodec.hpp
@@ -59,8 +59,6 @@
 
 namespace cv { namespace cudacodec {
 
-using namespace cuda;  // Stream
-
 //! @addtogroup cudacodec
 //! @{
 
@@ -264,7 +262,7 @@ class CV_EXPORTS_W VideoWriter
 @param stream Stream for frame pre-processing.
 */
 CV_EXPORTS_W Ptr<cudacodec::VideoWriter> createVideoWriter(const String& fileName, const Size frameSize, const Codec codec = Codec::H264, const double fps = 25.0,
-    const ColorFormat colorFormat = ColorFormat::BGR, Ptr<EncoderCallback> encoderCallback = 0, const Stream& stream = Stream::Null());
+    const ColorFormat colorFormat = ColorFormat::BGR, Ptr<EncoderCallback> encoderCallback = 0, const cuda::Stream& stream = cuda::Stream::Null());
 
 /** @brief Creates video writer.
 
@@ -278,7 +276,7 @@ CV_EXPORTS_W Ptr<cudacodec::VideoWriter> createVideoWriter(const String& fileNam
 @param stream Stream for frame pre-processing.
 */
 CV_EXPORTS_W Ptr<cudacodec::VideoWriter> createVideoWriter(const String& fileName, const Size frameSize, const Codec codec, const double fps,  const ColorFormat colorFormat,
-    const EncoderParams& params, Ptr<EncoderCallback> encoderCallback = 0, const Stream& stream = Stream::Null());
+    const EncoderParams& params, Ptr<EncoderCallback> encoderCallback = 0, const cuda::Stream& stream = cuda::Stream::Null());
 
 ////////////////////////////////// Video Decoding //////////////////////////////////////////
 
@@ -313,7 +311,7 @@ enum DeinterlaceMode
     -   This function demonstrates how to map the luma histogram back so that it is equivalent to the result obtained from cuda::calcHist()
     if the returned frame was colorFormat::GRAY.
  */
-CV_EXPORTS_W void MapHist(const GpuMat& hist, CV_OUT Mat& histFull);
+CV_EXPORTS_W void MapHist(const cuda::GpuMat& hist, CV_OUT Mat& histFull);
 
 /** @brief Struct providing information about video file format. :
  */
@@ -387,7 +385,7 @@ class CV_EXPORTS_W VideoReader
     If no frames have been grabbed (there are no more frames in video file), the methods return false.
     The method throws an Exception if error occurs.
      */
-    CV_WRAP virtual bool nextFrame(CV_OUT GpuMat& frame, Stream &stream = Stream::Null()) = 0;
+    CV_WRAP virtual bool nextFrame(CV_OUT cuda::GpuMat& frame, cuda::Stream &stream = cuda::Stream::Null()) = 0;
 
     /** @brief Grabs, decodes and returns the next video frame and frame luma histogram.
 
@@ -401,7 +399,7 @@ class CV_EXPORTS_W VideoReader
 
     @note Histogram data is collected by NVDEC during the decoding process resulting in zero performance penalty. NVDEC computes the histogram data for only the luma component of decoded output, not on post-processed frame(i.e. when scaling, cropping, etc. applied).  If the source is encoded using a limited range of luma values (FormatInfo::videoFullRangeFlag == false) then the histogram bin values will correspond to to this limited range of values and will need to be mapped to contain the same output as cuda::calcHist().  The MapHist() utility function can be used to perform this mapping on the host if required.
      */
-    CV_WRAP_AS(nextFrameWithHist) virtual bool nextFrame(CV_OUT GpuMat& frame, CV_OUT GpuMat& histogram, Stream& stream = Stream::Null()) = 0;
+    CV_WRAP_AS(nextFrameWithHist) virtual bool nextFrame(CV_OUT cuda::GpuMat& frame, CV_OUT cuda::GpuMat& histogram, cuda::Stream& stream = cuda::Stream::Null()) = 0;
 
     /** @brief Returns information about video file format.
     */
@@ -418,7 +416,7 @@ class CV_EXPORTS_W VideoReader
     The primary use of the function is for reading both the encoded and decoded video data when rawMode is enabled.  With rawMode enabled
     retrieve() can be called following grab() to retrieve all the data associated with the current video source since the last call to grab() or the creation of the VideoReader.
      */
-    CV_WRAP virtual bool grab(Stream& stream = Stream::Null()) = 0;
+    CV_WRAP virtual bool grab(cuda::Stream& stream = cuda::Stream::Null()) = 0;
 
     /** @brief Returns previously grabbed video data.
 
@@ -457,7 +455,7 @@ class CV_EXPORTS_W VideoReader
     The method returns data associated with the current video source since the last call to grab(). If no data is present
     the method returns false and the function returns an empty image.
      */
-    CV_WRAP inline bool retrieve(CV_OUT GpuMat& frame) const {
+    CV_WRAP inline bool retrieve(CV_OUT cuda::GpuMat& frame) const {
         return retrieve(OutputArray(frame));
     }
 
diff --git a/modules/cudacodec/src/video_decoder.hpp b/modules/cudacodec/src/video_decoder.hpp
index a32bf1bb0a..bea1536901 100644
--- a/modules/cudacodec/src/video_decoder.hpp
+++ b/modules/cudacodec/src/video_decoder.hpp
@@ -96,7 +96,7 @@ class VideoDecoder
         return cuvidDecodePicture(decoder_, picParams) == CUDA_SUCCESS;
     }
 
-    GpuMat mapFrame(int picIdx, CUVIDPROCPARAMS& videoProcParams)
+    cuda::GpuMat mapFrame(int picIdx, CUVIDPROCPARAMS& videoProcParams)
     {
         CUdeviceptr ptr;
         unsigned int pitch;