Skip to content

Commit 47f13d7

Browse files
committed
Merge pull request #3091 from asmorkalov:as/async_warp
2 parents ca6f422 + 4607b7d commit 47f13d7

File tree

1 file changed

+54
-42
lines changed
  • modules/cudawarping/src/cuda

1 file changed

+54
-42
lines changed

modules/cudawarping/src/cuda/warp.cu

Lines changed: 54 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -53,55 +53,75 @@ namespace cv { namespace cuda { namespace device
5353
{
5454
namespace imgproc
5555
{
56-
__constant__ float c_warpMat[3 * 3];
57-
5856
struct AffineTransform
5957
{
60-
static __device__ __forceinline__ float2 calcCoord(int x, int y)
58+
static const int rows = 2;
59+
static __device__ __forceinline__ float2 calcCoord(const float warpMat[AffineTransform::rows * 3], int x, int y)
6160
{
62-
const float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
63-
const float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];
61+
const float xcoo = warpMat[0] * x + warpMat[1] * y + warpMat[2];
62+
const float ycoo = warpMat[3] * x + warpMat[4] * y + warpMat[5];
6463

6564
return make_float2(xcoo, ycoo);
6665
}
66+
67+
struct Coefficients
68+
{
69+
Coefficients(const float* c_)
70+
{
71+
for(int i = 0; i < AffineTransform::rows * 3; i++)
72+
c[i] = c_[i];
73+
}
74+
float c[AffineTransform::rows * 3];
75+
};
6776
};
6877

6978
struct PerspectiveTransform
7079
{
71-
static __device__ __forceinline__ float2 calcCoord(int x, int y)
80+
static const int rows = 3;
81+
static __device__ __forceinline__ float2 calcCoord(const float warpMat[PerspectiveTransform::rows * 3], int x, int y)
7282
{
73-
const float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);
83+
const float coeff = 1.0f / (warpMat[6] * x + warpMat[7] * y + warpMat[8]);
7484

75-
const float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
76-
const float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);
85+
const float xcoo = coeff * (warpMat[0] * x + warpMat[1] * y + warpMat[2]);
86+
const float ycoo = coeff * (warpMat[3] * x + warpMat[4] * y + warpMat[5]);
7787

7888
return make_float2(xcoo, ycoo);
7989
}
90+
struct Coefficients
91+
{
92+
Coefficients(const float* c_)
93+
{
94+
for(int i = 0; i < PerspectiveTransform::rows * 3; i++)
95+
c[i] = c_[i];
96+
}
97+
98+
float c[PerspectiveTransform::rows * 3];
99+
};
80100
};
81101

82102
///////////////////////////////////////////////////////////////////
83103
// Build Maps
84104

85-
template <class Transform> __global__ void buildWarpMaps(PtrStepSzf xmap, PtrStepf ymap)
105+
template <class Transform> __global__ void buildWarpMaps(PtrStepSzf xmap, PtrStepf ymap, const typename Transform::Coefficients warpMat)
86106
{
87107
const int x = blockDim.x * blockIdx.x + threadIdx.x;
88108
const int y = blockDim.y * blockIdx.y + threadIdx.y;
89109

90110
if (x < xmap.cols && y < xmap.rows)
91111
{
92-
const float2 coord = Transform::calcCoord(x, y);
112+
const float2 coord = Transform::calcCoord(warpMat.c, x, y);
93113

94114
xmap(y, x) = coord.x;
95115
ymap(y, x) = coord.y;
96116
}
97117
}
98118

99-
template <class Transform> void buildWarpMaps_caller(PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
119+
template <class Transform> void buildWarpMaps_caller(PtrStepSzf xmap, PtrStepSzf ymap, const float warpMat[Transform::rows * 3], cudaStream_t stream)
100120
{
101121
dim3 block(32, 8);
102122
dim3 grid(divUp(xmap.cols, block.x), divUp(xmap.rows, block.y));
103123

104-
buildWarpMaps<Transform><<<grid, block, 0, stream>>>(xmap, ymap);
124+
buildWarpMaps<Transform><<<grid, block, 0, stream>>>(xmap, ymap, warpMat);
105125
cudaSafeCall( cudaGetLastError() );
106126

107127
if (stream == 0)
@@ -110,37 +130,33 @@ namespace cv { namespace cuda { namespace device
110130

111131
void buildWarpAffineMaps_gpu(float coeffs[2 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
112132
{
113-
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
114-
115-
buildWarpMaps_caller<AffineTransform>(xmap, ymap, stream);
133+
buildWarpMaps_caller<AffineTransform>(xmap, ymap, coeffs, stream);
116134
}
117135

118136
void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
119137
{
120-
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
121-
122-
buildWarpMaps_caller<PerspectiveTransform>(xmap, ymap, stream);
138+
buildWarpMaps_caller<PerspectiveTransform>(xmap, ymap, coeffs, stream);
123139
}
124140

125141
///////////////////////////////////////////////////////////////////
126142
// Warp
127143

128-
template <class Transform, class Ptr2D, typename T> __global__ void warp(const Ptr2D src, PtrStepSz<T> dst)
144+
template <class Transform, class Ptr2D, typename T> __global__ void warp(const Ptr2D src, PtrStepSz<T> dst, const typename Transform::Coefficients warpMat)
129145
{
130146
const int x = blockDim.x * blockIdx.x + threadIdx.x;
131147
const int y = blockDim.y * blockIdx.y + threadIdx.y;
132148

133149
if (x < dst.cols && y < dst.rows)
134150
{
135-
const float2 coord = Transform::calcCoord(x, y);
151+
const float2 coord = Transform::calcCoord(warpMat.c, x, y);
136152

137153
dst.ptr(y)[x] = saturate_cast<T>(src(coord.y, coord.x));
138154
}
139155
}
140156

141157
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherStream
142158
{
143-
static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
159+
static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, const float warpMat[Transform::rows*3], cudaStream_t stream, bool)
144160
{
145161
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
146162

@@ -151,14 +167,14 @@ namespace cv { namespace cuda { namespace device
151167
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
152168
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
153169

154-
warp<Transform><<<grid, block, 0, stream>>>(filter_src, dst);
170+
warp<Transform><<<grid, block, 0, stream>>>(filter_src, dst, warpMat);
155171
cudaSafeCall( cudaGetLastError() );
156172
}
157173
};
158174

159175
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStream
160176
{
161-
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, bool)
177+
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, const float warpMat[Transform::rows*3], bool)
162178
{
163179
CV_UNUSED(xoff);
164180
CV_UNUSED(yoff);
@@ -173,7 +189,7 @@ namespace cv { namespace cuda { namespace device
173189
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
174190
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
175191

176-
warp<Transform><<<grid, block>>>(filter_src, dst);
192+
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat);
177193
cudaSafeCall( cudaGetLastError() );
178194

179195
cudaSafeCall( cudaDeviceSynchronize() );
@@ -195,7 +211,7 @@ namespace cv { namespace cuda { namespace device
195211
}; \
196212
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \
197213
{ \
198-
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, bool cc20) \
214+
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, const float warpMat[Transform::rows*3], bool cc20) \
199215
{ \
200216
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
201217
dim3 block(32, cc20 ? 8 : 4); \
@@ -205,14 +221,14 @@ namespace cv { namespace cuda { namespace device
205221
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
206222
BorderReader< tex_warp_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
207223
Filter< BorderReader< tex_warp_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
208-
warp<Transform><<<grid, block>>>(filter_src, dst); \
224+
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
209225
cudaSafeCall( cudaGetLastError() ); \
210226
cudaSafeCall( cudaDeviceSynchronize() ); \
211227
} \
212228
}; \
213229
template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
214230
{ \
215-
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, bool) \
231+
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, const float warpMat[Transform::rows*3], bool) \
216232
{ \
217233
dim3 block(32, 8); \
218234
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
@@ -221,14 +237,14 @@ namespace cv { namespace cuda { namespace device
221237
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
222238
{ \
223239
Filter< tex_warp_ ## type ##_reader > filter_src(texSrc); \
224-
warp<Transform><<<grid, block>>>(filter_src, dst); \
240+
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
225241
} \
226242
else \
227243
{ \
228244
BrdReplicate<type> brd(src.rows, src.cols); \
229245
BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
230246
Filter< BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
231-
warp<Transform><<<grid, block>>>(filter_src, dst); \
247+
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
232248
} \
233249
cudaSafeCall( cudaGetLastError() ); \
234250
cudaSafeCall( cudaDeviceSynchronize() ); \
@@ -263,20 +279,20 @@ namespace cv { namespace cuda { namespace device
263279

264280
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
265281
{
266-
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
282+
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, const float warpMat[Transform::rows*3], cudaStream_t stream, bool cc20)
267283
{
268284
if (stream == 0)
269-
WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc20);
285+
WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, warpMat, cc20);
270286
else
271-
WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc20);
287+
WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, warpMat, stream, cc20);
272288
}
273289
};
274290

275291
template <class Transform, typename T>
276292
void warp_caller(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzb dst, int interpolation,
277-
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
293+
int borderMode, const float* borderValue, const float warpMat[Transform::rows*3], cudaStream_t stream, bool cc20)
278294
{
279-
typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
295+
typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, const float warpMat[Transform::rows*3], cudaStream_t stream, bool cc20);
280296

281297
static const func_t funcs[3][5] =
282298
{
@@ -304,15 +320,13 @@ namespace cv { namespace cuda { namespace device
304320
};
305321

306322
funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
307-
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
323+
static_cast< PtrStepSz<T> >(dst), borderValue, warpMat, stream, cc20);
308324
}
309325

310326
template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
311327
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
312328
{
313-
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
314-
315-
warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
329+
warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, coeffs, stream, cc20);
316330
}
317331

318332
template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
@@ -348,9 +362,7 @@ namespace cv { namespace cuda { namespace device
348362
template <typename T> void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
349363
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
350364
{
351-
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
352-
353-
warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
365+
warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, coeffs, stream, cc20);
354366
}
355367

356368
template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

0 commit comments

Comments
 (0)