Skip to content

Commit 24384b4

Browse files
committed
Code review fixes.
1 parent 5d55c31 commit 24384b4

File tree

1 file changed

+53
-75
lines changed

1 file changed

+53
-75
lines changed

modules/cudastereo/src/cuda/stereobm.cu

Lines changed: 53 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,38 @@ namespace cv { namespace cuda { namespace device
9494
return col_ssd[0] + cache + cache2;
9595
}
9696

97+
template<int RADIUS>
98+
__device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X, int cwidth, unsigned int* ssd)
99+
{
100+
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
101+
ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X, cwidth);
102+
__syncthreads();
103+
ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X, cwidth);
104+
__syncthreads();
105+
ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X, cwidth);
106+
__syncthreads();
107+
ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X, cwidth);
108+
__syncthreads();
109+
ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X, cwidth);
110+
__syncthreads();
111+
ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X, cwidth);
112+
__syncthreads();
113+
ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X, cwidth);
114+
__syncthreads();
115+
ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X, cwidth);
116+
117+
int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));
118+
119+
int bestIdx = 0;
120+
for (int i = 0; i < N_DISPARITIES; i++)
121+
{
122+
if (mssd == ssd[i])
123+
bestIdx = i;
124+
}
125+
126+
return make_uint2(mssd, bestIdx);
127+
}
128+
97129
template<int RADIUS>
98130
__device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
99131
{
@@ -254,42 +286,17 @@ namespace cv { namespace cuda { namespace device
254286
if (x_tex + BLOCK_W < cwidth)
255287
InitColSSD<RADIUS>(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);
256288

257-
__syncthreads(); //before CalcSSD function
289+
__syncthreads(); //before MinSSD function
258290

259291
if (Y < cheight - RADIUS)
260292
{
261-
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
262-
batch_ssds[0] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X, cwidth);
263-
__syncthreads();
264-
batch_ssds[1] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X, cwidth);
265-
__syncthreads();
266-
batch_ssds[2] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X, cwidth);
267-
__syncthreads();
268-
batch_ssds[3] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X, cwidth);
269-
__syncthreads();
270-
batch_ssds[4] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X, cwidth);
271-
__syncthreads();
272-
batch_ssds[5] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X, cwidth);
273-
__syncthreads();
274-
batch_ssds[6] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X, cwidth);
275-
__syncthreads();
276-
batch_ssds[7] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X, cwidth);
277-
278-
uint mssd = ::min(::min(::min(batch_ssds[0], batch_ssds[1]), ::min(batch_ssds[4], batch_ssds[5])),
279-
::min(::min(batch_ssds[2], batch_ssds[3]), ::min(batch_ssds[6], batch_ssds[7])));
280-
281-
int bestIdx = 0;
282-
for (int i = 0; i < N_DISPARITIES; i++)
283-
{
284-
if (mssd == batch_ssds[i])
285-
bestIdx = i;
286-
}
293+
uint2 batch_opt = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X, cwidth, batch_ssds);
287294

288295
// For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously
289-
// computed "minSSD" value is not used at all.
296+
// computed "batch_opt" value, which is the result of "MinSSD" function call, is not used at all.
290297
//
291-
// However, since the batch_ssds computation has "__syncthreads" call in its body, those threads
292-
// must also call "CalcSSD" to avoid deadlock. (#13850)
298+
// However, since the "MinSSD" function has "__syncthreads" call in its body, those threads
299+
// must also call "MinSSD" to avoid deadlock. (#13850)
293300
//
294301
// From CUDA 9, using "__syncwarp" with proper mask value instead of using "__syncthreads"
295302
// could be an option, but the shared memory access pattern does not allow this option,
@@ -298,7 +305,7 @@ namespace cv { namespace cuda { namespace device
298305
if (X < cwidth - RADIUS)
299306
{
300307
unsigned int last_opt = line_ssd_tails[3*0 + 0];
301-
unsigned int opt = ::min(last_opt, mssd);
308+
unsigned int opt = ::min(last_opt, batch_opt.x);
302309

303310
if (uniquenessRatio > 0)
304311
{
@@ -308,10 +315,10 @@ namespace cv { namespace cuda { namespace device
308315
float thresh = thresh_scale * opt;
309316
int dtest = local_disparity[0];
310317

311-
if(mssd < last_opt)
318+
if(batch_opt.x < last_opt)
312319
{
313320
uniqueness_approved[0] = 1;
314-
dtest = d + bestIdx;
321+
dtest = d + batch_opt.y;
315322
if ((local_disparity[0] < dtest-1 || local_disparity[0] > dtest+1) && (last_opt <= thresh))
316323
{
317324
uniqueness_approved[0] = 0;
@@ -338,9 +345,9 @@ namespace cv { namespace cuda { namespace device
338345
}
339346

340347
line_ssd_tails[3*0 + 0] = opt;
341-
if (mssd < last_opt)
348+
if (batch_opt.x < last_opt)
342349
{
343-
local_disparity[0] = (unsigned char)(d + bestIdx);
350+
local_disparity[0] = (unsigned char)(d + batch_opt.y);
344351
}
345352
}
346353
}
@@ -364,38 +371,12 @@ namespace cv { namespace cuda { namespace device
364371

365372
if (row < cheight - RADIUS - Y)
366373
{
367-
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
368-
batch_ssds[0] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X, cwidth);
369-
__syncthreads();
370-
batch_ssds[1] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X, cwidth);
371-
__syncthreads();
372-
batch_ssds[2] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X, cwidth);
373-
__syncthreads();
374-
batch_ssds[3] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X, cwidth);
375-
__syncthreads();
376-
batch_ssds[4] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X, cwidth);
377-
__syncthreads();
378-
batch_ssds[5] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X, cwidth);
379-
__syncthreads();
380-
batch_ssds[6] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X, cwidth);
381-
__syncthreads();
382-
batch_ssds[7] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X, cwidth);
383-
384-
uint mssd = ::min(::min(::min(batch_ssds[0], batch_ssds[1]), ::min(batch_ssds[4], batch_ssds[5])),
385-
::min(::min(batch_ssds[2], batch_ssds[3]), ::min(batch_ssds[6], batch_ssds[7])));
386-
387-
int bestIdx = 0;
388-
for (int i = 0; i < N_DISPARITIES; i++)
389-
{
390-
if (mssd == batch_ssds[i])
391-
bestIdx = i;
392-
}
393-
374+
uint2 batch_opt = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X, cwidth, batch_ssds);
394375
// For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously
395-
// computed "minSSD" value, which is the result of "CalcSSDVector" function call, is not used at all.
376+
// computed "batch_opt" value, which is the result of "MinSSD" function call, is not used at all.
396377
//
397-
// However, since the "CalcSSDVector" function has "__syncthreads" call in its body, those threads
398-
// must also call "CalcSSDVector" to avoid deadlock. (#13850)
378+
// However, since the "MinSSD" function has "__syncthreads" call in its body, those threads
379+
// must also call "MinSSD" to avoid deadlock. (#13850)
399380
//
400381
// From CUDA 9, using "__syncwarp" with proper mask value instead of using "__syncthreads"
401382
// could be an option, but the shared memory access pattern does not allow this option,
@@ -404,7 +385,7 @@ namespace cv { namespace cuda { namespace device
404385
if (X < cwidth - RADIUS)
405386
{
406387
unsigned int last_opt = line_ssd_tails[3*row + 0];
407-
unsigned int opt = ::min(last_opt, mssd);
388+
unsigned int opt = ::min(last_opt, batch_opt.x);
408389
if (uniquenessRatio > 0)
409390
{
410391
line_ssds[0] = line_ssd_tails[3*row + 1];
@@ -413,10 +394,10 @@ namespace cv { namespace cuda { namespace device
413394
float thresh = thresh_scale * opt;
414395
int dtest = local_disparity[row];
415396

416-
if(mssd < last_opt)
397+
if(batch_opt.x < last_opt)
417398
{
418399
uniqueness_approved[row] = 1;
419-
dtest = d + bestIdx;
400+
dtest = d + batch_opt.y;
420401
if ((local_disparity[row] < dtest-1 || local_disparity[row] > dtest+1) && (last_opt <= thresh))
421402
{
422403
uniqueness_approved[row] = 0;
@@ -441,9 +422,9 @@ namespace cv { namespace cuda { namespace device
441422

442423
line_ssd_tails[3*row + 0] = opt;
443424

444-
if (mssd < last_opt)
425+
if (batch_opt.x < last_opt)
445426
{
446-
local_disparity[row] = (unsigned char)(d + bestIdx);
427+
local_disparity[row] = (unsigned char)(d + batch_opt.y);
447428
}
448429
}
449430
}
@@ -521,11 +502,8 @@ namespace cv { namespace cuda { namespace device
521502
if (winsz2 == 0 || winsz2 >= calles_num)
522503
CV_Error(cv::Error::StsBadArg, "Unsupported window size");
523504

524-
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
525-
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
526-
527-
cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );
528-
cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
505+
cudaSafeCall( cudaMemset2DAsync(disp.data, disp.step, 0, disp.cols, disp.rows, stream) );
506+
cudaSafeCall( cudaMemset2DAsync(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows, stream) );
529507

530508
size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
531509
callers[winsz2](left, right, disp, maxdisp, uniquenessRatio, minSSD_buf.data, minssd_step, left.cols, left.rows, stream);

0 commit comments

Comments
 (0)