@@ -94,6 +94,38 @@ namespace cv { namespace cuda { namespace device
9494 return col_ssd[0 ] + cache + cache2;
9595 }
9696
template <int RADIUS>
__device__ uint2 MinSSD (volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X, int cwidth, unsigned int * ssd)
{
    // Evaluates the SSD for each of the N_DISPARITIES candidate disparities of
    // the current batch and selects the best one.
    //
    // Parameters:
    //   col_ssd_cache - per-thread slot of the shared reduction cache handed to CalcSSD
    //   col_ssd       - column-SSD buffer; one stripe of (BLOCK_W + 2 * RADIUS)
    //                   entries per candidate disparity
    //   X, cwidth     - current image column and image width, forwarded to CalcSSD
    //   ssd           - output array receiving all N_DISPARITIES batch SSD values
    //
    // Returns make_uint2(min SSD value, index 0..N_DISPARITIES-1 attaining it).
    // On ties the LARGEST matching index wins (the scan below overwrites), which
    // preserves the historical behavior of the inlined code this replaces.
    //
    // NOTE: this function executes __syncthreads() between the per-disparity
    // reductions, so it MUST be reached by every thread of the block — including
    // threads whose result the caller discards (see #13850).

    // See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
    ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X, cwidth);
    __syncthreads();
    ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X, cwidth);
    __syncthreads();
    ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X, cwidth);
    __syncthreads();
    ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X, cwidth);
    __syncthreads();
    ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X, cwidth);
    __syncthreads();
    ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X, cwidth);
    __syncthreads();
    ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X, cwidth);
    __syncthreads();
    ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X, cwidth);

    // SSD values are unsigned; keep the minimum unsigned as well so the
    // equality scan below and the conversion into the uint2 result are exact.
    // (The previous 'int mssd' narrowed the unsigned result of ::min, which is
    // implementation-defined for SSDs above INT_MAX and forced mixed-sign
    // comparisons against ssd[i].)
    unsigned int mssd = ::min (::min (::min (ssd[0], ssd[1]), ::min (ssd[4], ssd[5])),
                               ::min (::min (ssd[2], ssd[3]), ::min (ssd[6], ssd[7])));

    int bestIdx = 0;
    for (int i = 0; i < N_DISPARITIES; i++)
    {
        if (mssd == ssd[i])
            bestIdx = i;
    }

    return make_uint2 (mssd, bestIdx);
}
128+
97129 template <int RADIUS>
98130 __device__ void StepDown (int idx1, int idx2, unsigned char * imageL, unsigned char * imageR, int d, volatile unsigned int *col_ssd)
99131 {
@@ -254,42 +286,17 @@ namespace cv { namespace cuda { namespace device
254286 if (x_tex + BLOCK_W < cwidth)
255287 InitColSSD<RADIUS>(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);
256288
257- __syncthreads (); // before CalcSSD function
289+ __syncthreads (); // before MinSSD function
258290
259291 if (Y < cheight - RADIUS)
260292 {
261- // See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
262- batch_ssds[0 ] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx .x , col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X, cwidth);
263- __syncthreads ();
264- batch_ssds[1 ] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx .x , col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X, cwidth);
265- __syncthreads ();
266- batch_ssds[2 ] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx .x , col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X, cwidth);
267- __syncthreads ();
268- batch_ssds[3 ] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx .x , col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X, cwidth);
269- __syncthreads ();
270- batch_ssds[4 ] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx .x , col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X, cwidth);
271- __syncthreads ();
272- batch_ssds[5 ] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx .x , col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X, cwidth);
273- __syncthreads ();
274- batch_ssds[6 ] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx .x , col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X, cwidth);
275- __syncthreads ();
276- batch_ssds[7 ] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx .x , col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X, cwidth);
277-
278- uint mssd = ::min (::min (::min (batch_ssds[0 ], batch_ssds[1 ]), ::min (batch_ssds[4 ], batch_ssds[5 ])),
279- ::min (::min(batch_ssds[2 ], batch_ssds[3 ]), ::min(batch_ssds[6 ], batch_ssds[7 ])));
280-
281- int bestIdx = 0 ;
282- for (int i = 0 ; i < N_DISPARITIES; i++)
283- {
284- if (mssd == batch_ssds[i])
285- bestIdx = i;
286- }
293+ uint2 batch_opt = MinSSD<RADIUS>(col_ssd_cache + threadIdx .x , col_ssd, X, cwidth, batch_ssds);
287294
288295 // For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously
289- // computed "minSSD " value is not used at all.
296+ // computed "batch_opt " value is not used at all.
290297 //
291298 // However, since the batch_ssds computation has "__syncthreads" call in its body, those threads
292- // must also call "CalcSSD " to avoid deadlock. (#13850)
299+ // must also call "MinSSD " to avoid deadlock. (#13850)
293300 //
294301 // From CUDA 9, using "__syncwarp" with proper mask value instead of using "__syncthreads"
295302 // could be an option, but the shared memory access pattern does not allow this option,
@@ -298,7 +305,7 @@ namespace cv { namespace cuda { namespace device
298305 if (X < cwidth - RADIUS)
299306 {
300307 unsigned int last_opt = line_ssd_tails[3 *0 + 0 ];
301- unsigned int opt = ::min (last_opt, mssd );
308+ unsigned int opt = ::min (last_opt, batch_opt. x );
302309
303310 if (uniquenessRatio > 0 )
304311 {
@@ -308,10 +315,10 @@ namespace cv { namespace cuda { namespace device
308315 float thresh = thresh_scale * opt;
309316 int dtest = local_disparity[0 ];
310317
311- if (mssd < last_opt)
318+ if (batch_opt. x < last_opt)
312319 {
313320 uniqueness_approved[0 ] = 1 ;
314- dtest = d + bestIdx ;
321+ dtest = d + batch_opt. y ;
315322 if ((local_disparity[0 ] < dtest-1 || local_disparity[0 ] > dtest+1 ) && (last_opt <= thresh))
316323 {
317324 uniqueness_approved[0 ] = 0 ;
@@ -338,9 +345,9 @@ namespace cv { namespace cuda { namespace device
338345 }
339346
340347 line_ssd_tails[3 *0 + 0 ] = opt;
341- if (mssd < last_opt)
348+ if (batch_opt. x < last_opt)
342349 {
343- local_disparity[0 ] = (unsigned char )(d + bestIdx );
350+ local_disparity[0 ] = (unsigned char )(d + batch_opt. y );
344351 }
345352 }
346353 }
@@ -364,38 +371,12 @@ namespace cv { namespace cuda { namespace device
364371
365372 if (row < cheight - RADIUS - Y)
366373 {
367- // See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
368- batch_ssds[0 ] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx .x , col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X, cwidth);
369- __syncthreads ();
370- batch_ssds[1 ] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx .x , col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X, cwidth);
371- __syncthreads ();
372- batch_ssds[2 ] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx .x , col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X, cwidth);
373- __syncthreads ();
374- batch_ssds[3 ] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx .x , col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X, cwidth);
375- __syncthreads ();
376- batch_ssds[4 ] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx .x , col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X, cwidth);
377- __syncthreads ();
378- batch_ssds[5 ] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx .x , col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X, cwidth);
379- __syncthreads ();
380- batch_ssds[6 ] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx .x , col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X, cwidth);
381- __syncthreads ();
382- batch_ssds[7 ] = CalcSSD<RADIUS>(col_ssd_cache + threadIdx .x , col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X, cwidth);
383-
384- uint mssd = ::min (::min (::min (batch_ssds[0 ], batch_ssds[1 ]), ::min (batch_ssds[4 ], batch_ssds[5 ])),
385- ::min (::min(batch_ssds[2 ], batch_ssds[3 ]), ::min(batch_ssds[6 ], batch_ssds[7 ])));
386-
387- int bestIdx = 0 ;
388- for (int i = 0 ; i < N_DISPARITIES; i++)
389- {
390- if (mssd == batch_ssds[i])
391- bestIdx = i;
392- }
393-
374+ uint2 batch_opt = MinSSD<RADIUS>(col_ssd_cache + threadIdx .x , col_ssd, X, cwidth, batch_ssds);
394375 // For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously
395- // computed "minSSD " value, which is the result of "CalcSSDVector " function call, is not used at all.
376+ // computed "batch_opt " value, which is the result of "MinSSD " function call, is not used at all.
396377 //
397- // However, since the "CalcSSDVector " function has "__syncthreads" call in its body, those threads
398- // must also call "CalcSSDVector " to avoid deadlock. (#13850)
378+ // However, since the "MinSSD " function has "__syncthreads" call in its body, those threads
379+ // must also call "MinSSD " to avoid deadlock. (#13850)
399380 //
400381 // From CUDA 9, using "__syncwarp" with proper mask value instead of using "__syncthreads"
401382 // could be an option, but the shared memory access pattern does not allow this option,
@@ -404,7 +385,7 @@ namespace cv { namespace cuda { namespace device
404385 if (X < cwidth - RADIUS)
405386 {
406387 unsigned int last_opt = line_ssd_tails[3 *row + 0 ];
407- unsigned int opt = ::min (last_opt, mssd );
388+ unsigned int opt = ::min (last_opt, batch_opt. x );
408389 if (uniquenessRatio > 0 )
409390 {
410391 line_ssds[0 ] = line_ssd_tails[3 *row + 1 ];
@@ -413,10 +394,10 @@ namespace cv { namespace cuda { namespace device
413394 float thresh = thresh_scale * opt;
414395 int dtest = local_disparity[row];
415396
416- if (mssd < last_opt)
397+ if (batch_opt. x < last_opt)
417398 {
418399 uniqueness_approved[row] = 1 ;
419- dtest = d + bestIdx ;
400+ dtest = d + batch_opt. y ;
420401 if ((local_disparity[row] < dtest-1 || local_disparity[row] > dtest+1 ) && (last_opt <= thresh))
421402 {
422403 uniqueness_approved[row] = 0 ;
@@ -441,9 +422,9 @@ namespace cv { namespace cuda { namespace device
441422
442423 line_ssd_tails[3 *row + 0 ] = opt;
443424
444- if (mssd < last_opt)
425+ if (batch_opt. x < last_opt)
445426 {
446- local_disparity[row] = (unsigned char )(d + bestIdx );
427+ local_disparity[row] = (unsigned char )(d + batch_opt. y );
447428 }
448429 }
449430 }
0 commit comments