
namespace cv { namespace cuda { namespace device
{
-    __device__ float lanczos_weight(float x_)
+    __device__ __forceinline__ float lanczos_weight(float x_)
    {
        float x = fabsf(x_);
        if (x == 0.0f)
@@ -69,75 +69,154 @@ namespace cv { namespace cuda { namespace device
    template <typename T>
    __global__ void resize_lanczos4(const PtrStepSz<T> src, PtrStepSz<T> dst, const float fy, const float fx)
    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        const int tx = threadIdx.x;
+        const int ty = threadIdx.y;
+        const int bx = blockIdx.x;
+        const int by = blockIdx.y;

-        if (x >= dst.cols || y >= dst.rows)
-            return;
-
-        const float src_x = static_cast<float>(x) * fx;
-        const float src_y = static_cast<float>(y) * fy;
+        const int x = bx * blockDim.x + tx;
+        const int y = by * blockDim.y + ty;

        const int in_height = src.rows;
        const int in_width = src.cols;

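+        // Tile geometry: a 32x8 output block stages its source footprint plus a 4-pixel (R) apron
+        // on each side, since the Lanczos4 kernel spans 8 source taps per axis.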
+        constexpr int R = 4;
+        constexpr int BASE_W = 32;
+        constexpr int BASE_H = 8;
+        constexpr int SHARED_WIDTH_MAX = BASE_W + R + R;
+        constexpr int SHARED_HEIGHT_MAX = BASE_H + R + R;
+
+        __shared__ T shared_src[SHARED_HEIGHT_MAX][SHARED_WIDTH_MAX];
+
        typedef typename VecTraits<T>::elem_type elem_type;
        constexpr int cn = VecTraits<T>::cn;
-        float results[cn] = {0.0f};

-        for (int c = 0; c < cn; ++c)
-        {
-            float acc_val = 0.0f;
-            float acc_weight = 0.0f;
+        const int out_x0 = bx * blockDim.x;
+        const int out_x1 = ::min(out_x0 + blockDim.x - 1, dst.cols - 1);
+        const int out_y0 = by * blockDim.y;
+        const int out_y1 = ::min(out_y0 + blockDim.y - 1, dst.rows - 1);

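+        // Map the block's first and last output pixels into source coordinates, using the same
+        // half-pixel-centre convention as the per-thread sampling further down.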
+        const float src_x0_f = (static_cast<float>(out_x0) + 0.5f) * fx - 0.5f;
+        const float src_x1_f = (static_cast<float>(out_x1) + 0.5f) * fx - 0.5f;
+        const float src_y0_f = (static_cast<float>(out_y0) + 0.5f) * fy - 0.5f;
+        const float src_y1_f = (static_cast<float>(out_y1) + 0.5f) * fy - 0.5f;

-            const int xmin = int(floorf(src_x)) - 3;
-            const int xmax = int(floorf(src_x)) + 4;
-            const int ymin = int(floorf(src_y)) - 3;
-            const int ymax = int(floorf(src_y)) + 4;
+        int in_x_min = int(floorf(src_x0_f)) - R;
+        int in_x_max = int(floorf(src_x1_f)) + R;
+        int in_y_min = int(floorf(src_y0_f)) - R;
+        int in_y_max = int(floorf(src_y1_f)) + R;

-            for (int cy = ymin; cy <= ymax; ++cy)
-            {
-                float wy = lanczos_weight(src_y - static_cast<float>(cy));
-                if (wy == 0.0f)
-                    continue;
+        if (in_x_min < 0) in_x_min = 0;
+        if (in_y_min < 0) in_y_min = 0;
+        if (in_x_max >= in_width) in_x_max = in_width - 1;
+        if (in_y_max >= in_height) in_y_max = in_height - 1;

-                for (int cx = xmin; cx <= xmax; ++cx)
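+        // Extent of the source region this block actually needs, after clamping to the image.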
+        const int W_needed = in_x_max - in_x_min + 1;
+        const int H_needed = in_y_max - in_y_min + 1;
+
+        // The block's footprint fits in the fixed tile only when fx <= 1 and fy <= 1
+        // (i.e. the output is at least as large as the input).
+        const bool use_shared = (W_needed <= SHARED_WIDTH_MAX) && (H_needed <= SHARED_HEIGHT_MAX);
+
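+        // Cooperatively stage the block's source footprint into shared memory; threads stride
+        // across the region so any W_needed x H_needed up to the tile size is covered.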
+        if (use_shared)
+        {
+            for (int sy = ty; sy < H_needed; sy += blockDim.y)
+            {
+                int iy = in_y_min + sy;
+                for (int sx = tx; sx < W_needed; sx += blockDim.x)
                {
-                    float wx = lanczos_weight(src_x - static_cast<float>(cx));
-                    if (wx == 0.0f)
-                        continue;
+                    int ix = in_x_min + sx;
+                    shared_src[sy][sx] = src(iy, ix);
+                }
+            }
+            __syncthreads();
+        }

-                    float w = wy * wx;
+        // Every thread in the block has already passed the barrier above, so out-of-range
+        // threads can simply exit; a second __syncthreads() here would be a divergent,
+        // unmatched barrier.
+        if (x >= dst.cols || y >= dst.rows)
+            return;

-                    int iy = ::max(0, ::min(cy, in_height - 1));
-                    int ix = ::max(0, ::min(cx, in_width - 1));
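+        // Per-pixel source position with half-pixel centring: (x + 0.5) * fx - 0.5.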
+        const float src_x = (static_cast<float>(x) + 0.5f) * fx - 0.5f;
+        const float src_y = (static_cast<float>(y) + 0.5f) * fy - 0.5f;

-                    T val = src(iy, ix);
-
-                    const elem_type* val_ptr = reinterpret_cast<const elem_type*>(&val);
-                    elem_type elem_val = val_ptr[c];
-                    float channel_val = static_cast<float>(elem_val);
+        const int xmin = int(floorf(src_x)) - 3;
+        const int xmax = int(floorf(src_x)) + 4;
+        const int ymin = int(floorf(src_y)) - 3;
+        const int ymax = int(floorf(src_y)) + 4;

-                    acc_val += channel_val * w;
-                    acc_weight += w;
+        float results[cn];
+        float acc_weights[cn];
+        #pragma unroll
+        for (int c = 0; c < cn; ++c) { results[c] = 0.0f; acc_weights[c] = 0.0f; }
+
+        for (int cy = ymin; cy <= ymax; ++cy)
+        {
+            float wy = lanczos_weight(src_y - static_cast<float>(cy));
+            if (wy == 0.0f) continue;
+
+            for (int cx = xmin; cx <= xmax; ++cx)
+            {
+                float wx = lanczos_weight(src_x - static_cast<float>(cx));
+                if (wx == 0.0f) continue;
+
+                float w = wy * wx;
+
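+                // Shared path: translate the tap to tile coordinates and clamp to the staged
+                // region, which replicates the border exactly like the global-memory fallback.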
+                if (use_shared)
+                {
+                    int sx = cx - in_x_min;
+                    int sy = cy - in_y_min;
+                    if (sx < 0) sx = 0;
+                    else if (sx >= W_needed) sx = W_needed - 1;
+                    if (sy < 0) sy = 0;
+                    else if (sy >= H_needed) sy = H_needed - 1;
+
+                    T val = shared_src[sy][sx];
+                    const elem_type* val_ptr = reinterpret_cast<const elem_type*>(&val);
+                    #pragma unroll
+                    for (int c = 0; c < cn; ++c)
+                    {
+                        elem_type elem_val = val_ptr[c];
+                        float channel_val = static_cast<float>(elem_val);
+                        results[c] += channel_val * w;
+                        acc_weights[c] += w;
+                    }
+                }
+                else
+                {
+                    // Fallback: read straight from global memory, clamping the tap to the image (border replicate).
+                    int iy_r = cy < 0 ? 0 : (cy >= in_height ? (in_height - 1) : cy);
+                    int ix_r = cx < 0 ? 0 : (cx >= in_width ? (in_width - 1) : cx);
+                    T val = src(iy_r, ix_r);
+                    const elem_type* val_ptr = reinterpret_cast<const elem_type*>(&val);
+                    #pragma unroll
+                    for (int c = 0; c < cn; ++c)
+                    {
+                        elem_type elem_val = val_ptr[c];
+                        float channel_val = static_cast<float>(elem_val);
+                        results[c] += channel_val * w;
+                        acc_weights[c] += w;
+                    }
                }
            }
-
-            float result = acc_weight > 0.0f ? (acc_val / acc_weight) : 0.0f;
-            results[c] = result;
        }

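+        // The visited Lanczos weights do not sum exactly to 1, so normalise by the accumulated
+        // weight (guarding against a zero denominator).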
+        #pragma unroll
+        for (int c = 0; c < cn; ++c)
+            results[c] = acc_weights[c] > 0.0f ? (results[c] / acc_weights[c]) : 0.0f;
+
        T result_vec;
        elem_type* result_ptr = reinterpret_cast<elem_type*>(&result_vec);
+
+        #pragma unroll
        for (int c = 0; c < cn; ++c)
-        {
            result_ptr[c] = saturate_cast<elem_type>(results[c]);
-        }
+
        dst(y, x) = result_vec;
    }


+
    template <typename T> __global__ void resize_nearest(const PtrStep<T> src, PtrStepSz<T> dst, const float fy, const float fx)
    {
        const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
@@ -341,6 +420,92 @@ namespace cv { namespace cuda { namespace device
        cudaSafeCall( cudaDeviceSynchronize() );
    }

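+    // Texture-memory variant: the same Lanczos4 accumulation, but source reads go through a
+    // texture object, so out-of-range taps are handled by the texture addressing / border reader.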
+    template <typename Ptr2D, typename T>
+    __global__ void resize_lanczos4_tex(Ptr2D src, PtrStepSz<T> dst, const float fy, const float fx)
+    {
+        const int x = blockDim.x * blockIdx.x + threadIdx.x;
+        const int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+        if (x >= dst.cols || y >= dst.rows)
+            return;
+
+        const float src_x = (static_cast<float>(x) + 0.5f) * fx - 0.5f;
+        const float src_y = (static_cast<float>(y) + 0.5f) * fy - 0.5f;
+
+        typedef typename VecTraits<T>::elem_type elem_type;
+        constexpr int cn = VecTraits<T>::cn;
+        float results[cn] = {0.0f};
+
+        for (int c = 0; c < cn; ++c)
+        {
+            float acc_val = 0.0f;
+            float acc_weight = 0.0f;
+
+            const int xmin = int(floorf(src_x)) - 3;
+            const int xmax = int(floorf(src_x)) + 4;
+            const int ymin = int(floorf(src_y)) - 3;
+            const int ymax = int(floorf(src_y)) + 4;
+
+            for (int cy = ymin; cy <= ymax; ++cy)
+            {
+                float wy = lanczos_weight(src_y - static_cast<float>(cy));
+                if (wy == 0.0f)
+                    continue;
+
+                for (int cx = xmin; cx <= xmax; ++cx)
+                {
+                    float wx = lanczos_weight(src_x - static_cast<float>(cx));
+                    if (wx == 0.0f)
+                        continue;
+
+                    float w = wy * wx;
+
+                    // Use texture memory for sampling (handles the boundary automatically)
+                    T val = src(static_cast<float>(cy), static_cast<float>(cx));
+
+                    const elem_type* val_ptr = reinterpret_cast<const elem_type*>(&val);
+                    elem_type elem_val = val_ptr[c];
+                    float channel_val = static_cast<float>(elem_val);
+
+                    acc_val += channel_val * w;
+                    acc_weight += w;
+                }
+            }
+
+            float result = acc_weight > 0.0f ? (acc_val / acc_weight) : 0.0f;
+            results[c] = result;
+        }
+
+        T result_vec;
+        elem_type* result_ptr = reinterpret_cast<elem_type*>(&result_vec);
+        for (int c = 0; c < cn; ++c)
+        {
+            result_ptr[c] = saturate_cast<elem_type>(results[c]);
+        }
+        dst(y, x) = result_vec;
+    }
+
+    template <typename T>
+    void call_resize_lanczos4_tex(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+        if (srcWhole.data == src.data)
+        {
+            cudev::Texture<T> texSrc(src);
+            resize_lanczos4_tex<cudev::TexturePtr<T>><<<grid, block>>>(texSrc, dst, fy, fx);
+        }
+        else
+        {
+            cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
+            BrdReplicate<T> brd(src.rows, src.cols);
+            BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>> brdSrc(texSrcWhole, brd);
+            resize_lanczos4_tex<BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>>><<<grid, block>>>(brdSrc, dst, fy, fx);
+        }
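+        // Check for launch errors, then block until completion so the temporary texture objects
+        // created above outlive the kernel.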
+        cudaSafeCall( cudaGetLastError() );
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
    // ResizeNearestDispatcher

    template <typename T> struct ResizeNearestDispatcher
@@ -460,6 +625,63 @@ namespace cv { namespace cuda { namespace device
        }
    };

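+    // Lanczos dispatch policy: stream (asynchronous) calls and shrinking resizes (fx > 1 or fy > 1)
+    // use the global-memory kernel; synchronous calls with fx <= 1 and fy <= 1 use the texture path.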
+    template <typename T> struct SelectImplForLanczos
+    {
+        static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            if (stream)
+                call_resize_lanczos4_glob(src, dst, fy, fx, stream);
+            else
+            {
+                if (fx > 1 || fy > 1)
+                    call_resize_lanczos4_glob(src, dst, fy, fx, 0);
+                else
+                    call_resize_lanczos4_tex(src, srcWhole, yoff, xoff, dst, fy, fx);
+            }
+        }
+    };
+
+    // Texture memory doesn't support 3-channel types, so use glob for those
+    template <> struct ResizeLanczosDispatcher<uchar> : SelectImplForLanczos<uchar> {};
+    template <> struct ResizeLanczosDispatcher<uchar3>
+    {
+        static void call(const PtrStepSz<uchar3>& src, const PtrStepSz<uchar3>& /*srcWhole*/, int /*yoff*/, int /*xoff*/, const PtrStepSz<uchar3>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            call_resize_lanczos4_glob(src, dst, fy, fx, stream);
+        }
+    };
+    template <> struct ResizeLanczosDispatcher<uchar4> : SelectImplForLanczos<uchar4> {};
+
+    template <> struct ResizeLanczosDispatcher<ushort> : SelectImplForLanczos<ushort> {};
+    template <> struct ResizeLanczosDispatcher<ushort3>
+    {
+        static void call(const PtrStepSz<ushort3>& src, const PtrStepSz<ushort3>& /*srcWhole*/, int /*yoff*/, int /*xoff*/, const PtrStepSz<ushort3>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            call_resize_lanczos4_glob(src, dst, fy, fx, stream);
+        }
+    };
+    template <> struct ResizeLanczosDispatcher<ushort4> : SelectImplForLanczos<ushort4> {};
+
+    template <> struct ResizeLanczosDispatcher<short> : SelectImplForLanczos<short> {};
+    template <> struct ResizeLanczosDispatcher<short3>
+    {
+        static void call(const PtrStepSz<short3>& src, const PtrStepSz<short3>& /*srcWhole*/, int /*yoff*/, int /*xoff*/, const PtrStepSz<short3>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            call_resize_lanczos4_glob(src, dst, fy, fx, stream);
+        }
+    };
+    template <> struct ResizeLanczosDispatcher<short4> : SelectImplForLanczos<short4> {};
+
+    template <> struct ResizeLanczosDispatcher<float> : SelectImplForLanczos<float> {};
+    template <> struct ResizeLanczosDispatcher<float3>
+    {
+        static void call(const PtrStepSz<float3>& src, const PtrStepSz<float3>& /*srcWhole*/, int /*yoff*/, int /*xoff*/, const PtrStepSz<float3>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            call_resize_lanczos4_glob(src, dst, fy, fx, stream);
+        }
+    };
+    template <> struct ResizeLanczosDispatcher<float4> : SelectImplForLanczos<float4> {};
+
    // ResizeAreaDispatcher

    template <typename T> struct ResizeAreaDispatcher