Skip to content

Commit 51dbf82

Browse files
committed
shared memory optimization
1 parent 595cde3 commit 51dbf82

File tree

2 files changed

+271
-42
lines changed

2 files changed

+271
-42
lines changed

modules/cudawarping/src/cuda/resize.cu

Lines changed: 263 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454

5555
namespace cv { namespace cuda { namespace device
5656
{
57-
__device__ float lanczos_weight(float x_)
57+
__device__ __forceinline__ float lanczos_weight(float x_)
5858
{
5959
float x = fabsf(x_);
6060
if (x == 0.0f)
@@ -69,75 +69,154 @@ namespace cv { namespace cuda { namespace device
6969
template <typename T>
7070
__global__ void resize_lanczos4(const PtrStepSz<T> src, PtrStepSz<T> dst, const float fy, const float fx)
7171
{
72-
const int x = blockIdx.x * blockDim.x + threadIdx.x;
73-
const int y = blockIdx.y * blockDim.y + threadIdx.y;
72+
const int tx = threadIdx.x;
73+
const int ty = threadIdx.y;
74+
const int bx = blockIdx.x;
75+
const int by = blockIdx.y;
7476

75-
if (x >= dst.cols || y >= dst.rows)
76-
return;
77-
78-
const float src_x = static_cast<float>(x) * fx;
79-
const float src_y = static_cast<float>(y) * fy;
77+
const int x = bx * blockDim.x + tx;
78+
const int y = by * blockDim.y + ty;
8079

8180
const int in_height = src.rows;
8281
const int in_width = src.cols;
8382

83+
constexpr int R = 4;
84+
constexpr int BASE_W = 32;
85+
constexpr int BASE_H = 8;
86+
constexpr int SHARED_WIDTH_MAX = BASE_W + R + R;
87+
constexpr int SHARED_HEIGHT_MAX = BASE_H + R + R;
88+
89+
__shared__ T shared_src[SHARED_HEIGHT_MAX][SHARED_WIDTH_MAX];
90+
8491
typedef typename VecTraits<T>::elem_type elem_type;
8592
constexpr int cn = VecTraits<T>::cn;
86-
float results[cn] = {0.0f};
8793

88-
for (int c = 0; c < cn; ++c)
89-
{
90-
float acc_val = 0.0f;
91-
float acc_weight = 0.0f;
94+
const int out_x0 = bx * blockDim.x;
95+
const int out_x1 = ::min(out_x0 + blockDim.x - 1, dst.cols - 1);
96+
const int out_y0 = by * blockDim.y;
97+
const int out_y1 = ::min(out_y0 + blockDim.y - 1, dst.rows - 1);
9298

99+
const float src_x0_f = (static_cast<float>(out_x0) + 0.5f) * fx - 0.5f;
100+
const float src_x1_f = (static_cast<float>(out_x1) + 0.5f) * fx - 0.5f;
101+
const float src_y0_f = (static_cast<float>(out_y0) + 0.5f) * fy - 0.5f;
102+
const float src_y1_f = (static_cast<float>(out_y1) + 0.5f) * fy - 0.5f;
93103

94-
const int xmin = int(floorf(src_x)) - 3;
95-
const int xmax = int(floorf(src_x)) + 4;
96-
const int ymin = int(floorf(src_y)) - 3;
97-
const int ymax = int(floorf(src_y)) + 4;
104+
int in_x_min = int(floorf(src_x0_f)) - R;
105+
int in_x_max = int(floorf(src_x1_f)) + R;
106+
int in_y_min = int(floorf(src_y0_f)) - R;
107+
int in_y_max = int(floorf(src_y1_f)) + R;
98108

99-
for (int cy = ymin; cy <= ymax; ++cy)
100-
{
101-
float wy = lanczos_weight(src_y - static_cast<float>(cy));
102-
if (wy == 0.0f)
103-
continue;
109+
if (in_x_min < 0) in_x_min = 0;
110+
if (in_y_min < 0) in_y_min = 0;
111+
if (in_x_max >= in_width) in_x_max = in_width - 1;
112+
if (in_y_max >= in_height) in_y_max = in_height - 1;
104113

105-
for (int cx = xmin; cx <= xmax; ++cx)
114+
const int W_needed = in_x_max - in_x_min + 1;
115+
const int H_needed = in_y_max - in_y_min + 1;
116+
117+
// for fx <= 1 and fy <= 1
118+
const bool use_shared = (W_needed <= SHARED_WIDTH_MAX) && (H_needed <= SHARED_HEIGHT_MAX);
119+
120+
if (use_shared)
121+
{
122+
for (int sy = ty; sy < H_needed; sy += blockDim.y)
123+
{
124+
int iy = in_y_min + sy;
125+
for (int sx = tx; sx < W_needed; sx += blockDim.x)
106126
{
107-
float wx = lanczos_weight(src_x - static_cast<float>(cx));
108-
if (wx == 0.0f)
109-
continue;
127+
int ix = in_x_min + sx;
128+
shared_src[sy][sx] = src(iy, ix);
129+
}
130+
}
131+
__syncthreads();
132+
}
110133

111-
float w = wy * wx;
134+
if (x >= dst.cols || y >= dst.rows)
135+
{
136+
if (use_shared) { __syncthreads(); }
137+
return;
138+
}
112139

113-
int iy = ::max(0, ::min(cy, in_height - 1));
114-
int ix = ::max(0, ::min(cx, in_width - 1));
140+
const float src_x = (static_cast<float>(x) + 0.5f) * fx - 0.5f;
141+
const float src_y = (static_cast<float>(y) + 0.5f) * fy - 0.5f;
115142

116-
T val = src(iy, ix);
117-
118-
const elem_type* val_ptr = reinterpret_cast<const elem_type*>(&val);
119-
elem_type elem_val = val_ptr[c];
120-
float channel_val = static_cast<float>(elem_val);
143+
const int xmin = int(floorf(src_x)) - 3;
144+
const int xmax = int(floorf(src_x)) + 4;
145+
const int ymin = int(floorf(src_y)) - 3;
146+
const int ymax = int(floorf(src_y)) + 4;
121147

122-
acc_val += channel_val * w;
123-
acc_weight += w;
148+
float results[cn];
149+
float acc_weights[cn];
150+
#pragma unroll
151+
for (int c = 0; c < cn; ++c) { results[c] = 0.0f; acc_weights[c] = 0.0f; }
152+
153+
for (int cy = ymin; cy <= ymax; ++cy)
154+
{
155+
float wy = lanczos_weight(src_y - static_cast<float>(cy));
156+
if (wy == 0.0f) continue;
157+
158+
for (int cx = xmin; cx <= xmax; ++cx)
159+
{
160+
float wx = lanczos_weight(src_x - static_cast<float>(cx));
161+
if (wx == 0.0f) continue;
162+
163+
float w = wy * wx;
164+
165+
if (use_shared)
166+
{
167+
int sx = cx - in_x_min;
168+
int sy = cy - in_y_min;
169+
if (sx < 0) sx = 0;
170+
else if (sx >= W_needed) sx = W_needed - 1;
171+
if (sy < 0) sy = 0;
172+
else if (sy >= H_needed) sy = H_needed - 1;
173+
174+
T val = shared_src[sy][sx];
175+
const elem_type* val_ptr = reinterpret_cast<const elem_type*>(&val);
176+
#pragma unroll
177+
for (int c = 0; c < cn; ++c)
178+
{
179+
elem_type elem_val = val_ptr[c];
180+
float channel_val = static_cast<float>(elem_val);
181+
results[c] += channel_val * w;
182+
acc_weights[c] += w;
183+
}
184+
}
185+
else
186+
{
187+
// fallback
188+
int iy_r = cy < 0 ? 0 : (cy >= in_height ? (in_height - 1) : cy);
189+
int ix_r = cx < 0 ? 0 : (cx >= in_width ? (in_width - 1) : cx);
190+
T val = src(iy_r, ix_r);
191+
const elem_type* val_ptr = reinterpret_cast<const elem_type*>(&val);
192+
#pragma unroll
193+
for (int c = 0; c < cn; ++c)
194+
{
195+
elem_type elem_val = val_ptr[c];
196+
float channel_val = static_cast<float>(elem_val);
197+
results[c] += channel_val * w;
198+
acc_weights[c] += w;
199+
}
124200
}
125201
}
126-
127-
float result = acc_weight > 0.0f ? (acc_val / acc_weight) : 0.0f;
128-
results[c] = result;
129202
}
130203

204+
#pragma unroll
205+
for (int c = 0; c < cn; ++c)
206+
results[c] = acc_weights[c] > 0.0f ? (results[c] / acc_weights[c]) : 0.0f;
207+
131208
T result_vec;
132209
elem_type* result_ptr = reinterpret_cast<elem_type*>(&result_vec);
210+
211+
#pragma unroll
133212
for (int c = 0; c < cn; ++c)
134-
{
135213
result_ptr[c] = saturate_cast<elem_type>(results[c]);
136-
}
214+
137215
dst(y, x) = result_vec;
138216
}
139217

140218

219+
141220
template <typename T> __global__ void resize_nearest(const PtrStep<T> src, PtrStepSz<T> dst, const float fy, const float fx)
142221
{
143222
const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
@@ -341,6 +420,92 @@ namespace cv { namespace cuda { namespace device
341420
cudaSafeCall( cudaDeviceSynchronize() );
342421
}
343422

423+
template <typename Ptr2D, typename T>
424+
__global__ void resize_lanczos4_tex(Ptr2D src, PtrStepSz<T> dst, const float fy, const float fx)
425+
{
426+
const int x = blockDim.x * blockIdx.x + threadIdx.x;
427+
const int y = blockDim.y * blockIdx.y + threadIdx.y;
428+
429+
if (x >= dst.cols || y >= dst.rows)
430+
return;
431+
432+
const float src_x = (static_cast<float>(x) + 0.5f) * fx - 0.5f;
433+
const float src_y = (static_cast<float>(y) + 0.5f) * fy - 0.5f;
434+
435+
typedef typename VecTraits<T>::elem_type elem_type;
436+
constexpr int cn = VecTraits<T>::cn;
437+
float results[cn] = {0.0f};
438+
439+
for (int c = 0; c < cn; ++c)
440+
{
441+
float acc_val = 0.0f;
442+
float acc_weight = 0.0f;
443+
444+
const int xmin = int(floorf(src_x)) - 3;
445+
const int xmax = int(floorf(src_x)) + 4;
446+
const int ymin = int(floorf(src_y)) - 3;
447+
const int ymax = int(floorf(src_y)) + 4;
448+
449+
for (int cy = ymin; cy <= ymax; ++cy)
450+
{
451+
float wy = lanczos_weight(src_y - static_cast<float>(cy));
452+
if (wy == 0.0f)
453+
continue;
454+
455+
for (int cx = xmin; cx <= xmax; ++cx)
456+
{
457+
float wx = lanczos_weight(src_x - static_cast<float>(cx));
458+
if (wx == 0.0f)
459+
continue;
460+
461+
float w = wy * wx;
462+
463+
// Use texture memory for sampling (handles boundary automatically)
464+
T val = src(static_cast<float>(cy), static_cast<float>(cx));
465+
466+
const elem_type* val_ptr = reinterpret_cast<const elem_type*>(&val);
467+
elem_type elem_val = val_ptr[c];
468+
float channel_val = static_cast<float>(elem_val);
469+
470+
acc_val += channel_val * w;
471+
acc_weight += w;
472+
}
473+
}
474+
475+
float result = acc_weight > 0.0f ? (acc_val / acc_weight) : 0.0f;
476+
results[c] = result;
477+
}
478+
479+
T result_vec;
480+
elem_type* result_ptr = reinterpret_cast<elem_type*>(&result_vec);
481+
for (int c = 0; c < cn; ++c)
482+
{
483+
result_ptr[c] = saturate_cast<elem_type>(results[c]);
484+
}
485+
dst(y, x) = result_vec;
486+
}
487+
488+
template <typename T>
489+
void call_resize_lanczos4_tex(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx)
490+
{
491+
const dim3 block(32, 8);
492+
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
493+
if (srcWhole.data == src.data)
494+
{
495+
cudev::Texture<T> texSrc(src);
496+
resize_lanczos4_tex<cudev::TexturePtr<T>><<<grid, block>>>(texSrc, dst, fy, fx);
497+
}
498+
else
499+
{
500+
cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
501+
BrdReplicate<T> brd(src.rows, src.cols);
502+
BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>> brdSrc(texSrcWhole, brd);
503+
resize_lanczos4_tex<BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>>><<<grid, block>>>(brdSrc, dst, fy, fx);
504+
}
505+
cudaSafeCall( cudaGetLastError() );
506+
cudaSafeCall( cudaDeviceSynchronize() );
507+
}
508+
344509
// ResizeNearestDispatcher
345510

346511
template <typename T> struct ResizeNearestDispatcher
@@ -460,6 +625,63 @@ namespace cv { namespace cuda { namespace device
460625
}
461626
};
462627

628+
template <typename T> struct SelectImplForLanczos
629+
{
630+
static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
631+
{
632+
if (stream)
633+
call_resize_lanczos4_glob(src, dst, fy, fx, stream);
634+
else
635+
{
636+
if (fx > 1 || fy > 1)
637+
call_resize_lanczos4_glob(src, dst, fy, fx, 0);
638+
else
639+
call_resize_lanczos4_tex(src, srcWhole, yoff, xoff, dst, fy, fx);
640+
}
641+
}
642+
};
643+
644+
// Texture memory doesn't support 3-channel types, so use glob for those
645+
template <> struct ResizeLanczosDispatcher<uchar> : SelectImplForLanczos<uchar> {};
646+
template <> struct ResizeLanczosDispatcher<uchar3>
647+
{
648+
static void call(const PtrStepSz<uchar3>& src, const PtrStepSz<uchar3>& /*srcWhole*/, int /*yoff*/, int /*xoff*/, const PtrStepSz<uchar3>& dst, float fy, float fx, cudaStream_t stream)
649+
{
650+
call_resize_lanczos4_glob(src, dst, fy, fx, stream);
651+
}
652+
};
653+
template <> struct ResizeLanczosDispatcher<uchar4> : SelectImplForLanczos<uchar4> {};
654+
655+
template <> struct ResizeLanczosDispatcher<ushort> : SelectImplForLanczos<ushort> {};
656+
template <> struct ResizeLanczosDispatcher<ushort3>
657+
{
658+
static void call(const PtrStepSz<ushort3>& src, const PtrStepSz<ushort3>& /*srcWhole*/, int /*yoff*/, int /*xoff*/, const PtrStepSz<ushort3>& dst, float fy, float fx, cudaStream_t stream)
659+
{
660+
call_resize_lanczos4_glob(src, dst, fy, fx, stream);
661+
}
662+
};
663+
template <> struct ResizeLanczosDispatcher<ushort4> : SelectImplForLanczos<ushort4> {};
664+
665+
template <> struct ResizeLanczosDispatcher<short> : SelectImplForLanczos<short> {};
666+
template <> struct ResizeLanczosDispatcher<short3>
667+
{
668+
static void call(const PtrStepSz<short3>& src, const PtrStepSz<short3>& /*srcWhole*/, int /*yoff*/, int /*xoff*/, const PtrStepSz<short3>& dst, float fy, float fx, cudaStream_t stream)
669+
{
670+
call_resize_lanczos4_glob(src, dst, fy, fx, stream);
671+
}
672+
};
673+
template <> struct ResizeLanczosDispatcher<short4> : SelectImplForLanczos<short4> {};
674+
675+
template <> struct ResizeLanczosDispatcher<float> : SelectImplForLanczos<float> {};
676+
template <> struct ResizeLanczosDispatcher<float3>
677+
{
678+
static void call(const PtrStepSz<float3>& src, const PtrStepSz<float3>& /*srcWhole*/, int /*yoff*/, int /*xoff*/, const PtrStepSz<float3>& dst, float fy, float fx, cudaStream_t stream)
679+
{
680+
call_resize_lanczos4_glob(src, dst, fy, fx, stream);
681+
}
682+
};
683+
template <> struct ResizeLanczosDispatcher<float4> : SelectImplForLanczos<float4> {};
684+
463685
// ResizeAreaDispatcher
464686

465687
template <typename T> struct ResizeAreaDispatcher

modules/cudawarping/test/test_lanczos.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,19 @@ namespace
2121
float ifx = static_cast<float>(1.0 / fx);
2222
float ify = static_cast<float>(1.0 / fy);
2323

24+
// OpenCV CPU resize uses center-aligned coordinate mapping: (x + 0.5) * fx - 0.5
25+
// Since fx and fy here are scale factors, and ifx = 1.0 / fx, ify = 1.0 / fy,
26+
// the center-aligned mapping becomes: (x + 0.5) / fx - 0.5 = (x + 0.5) * ifx - 0.5
2427
for (int y = 0; y < dsize.height; ++y)
2528
{
2629
for (int x = 0; x < dsize.width; ++x)
2730
{
2831
for (int c = 0; c < cn; ++c)
29-
dst.at<T>(y, x * cn + c) = LanczosInterpolator<T>::getValue(src, y * ify, x * ifx, c, cv::BORDER_REPLICATE);
32+
{
33+
float src_x = (static_cast<float>(x) + 0.5f) * ifx - 0.5f;
34+
float src_y = (static_cast<float>(y) + 0.5f) * ify - 0.5f;
35+
dst.at<T>(y, x * cn + c) = LanczosInterpolator<T>::getValue(src, src_y, src_x, c, cv::BORDER_REPLICATE);
36+
}
3037
}
3138
}
3239
}

0 commit comments

Comments
 (0)