From 595cde3ff43c019a488f4e8093d20fb79d838cd5 Mon Sep 17 00:00:00 2001 From: yuxuewen Date: Tue, 9 Dec 2025 10:21:12 +0800 Subject: [PATCH 1/8] base lanczos4 --- modules/cudawarping/src/cuda/resize.cu | 117 ++++++++++- modules/cudawarping/src/resize.cpp | 2 +- modules/cudawarping/test/interpolation.hpp | 52 +++++ modules/cudawarping/test/test_lanczos.cpp | 230 +++++++++++++++++++++ 4 files changed, 398 insertions(+), 3 deletions(-) create mode 100644 modules/cudawarping/test/test_lanczos.cpp diff --git a/modules/cudawarping/src/cuda/resize.cu b/modules/cudawarping/src/cuda/resize.cu index 27cec5564ef..9c75e829828 100644 --- a/modules/cudawarping/src/cuda/resize.cu +++ b/modules/cudawarping/src/cuda/resize.cu @@ -43,6 +43,7 @@ #if !defined CUDA_DISABLER #include +#include #include "opencv2/core/cuda/common.hpp" #include "opencv2/core/cuda/border_interpolate.hpp" #include "opencv2/core/cuda/vec_traits.hpp" @@ -53,7 +54,89 @@ namespace cv { namespace cuda { namespace device { + __device__ float lanczos_weight(float x_) + { + float x = fabsf(x_); + if (x == 0.0f) + return 1.0f; + if (x >= 4.0f) + return 0.0f; + float pi_x = M_PI * x; + return sinf(pi_x) * sinf(pi_x / 4.0f) / (pi_x * pi_x / 4.0f); + } + // kernels + template + __global__ void resize_lanczos4(const PtrStepSz src, PtrStepSz dst, const float fy, const float fx) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst.cols || y >= dst.rows) + return; + + const float src_x = static_cast(x) * fx; + const float src_y = static_cast(y) * fy; + + const int in_height = src.rows; + const int in_width = src.cols; + + typedef typename VecTraits::elem_type elem_type; + constexpr int cn = VecTraits::cn; + float results[cn] = {0.0f}; + + for (int c = 0; c < cn; ++c) + { + float acc_val = 0.0f; + float acc_weight = 0.0f; + + + const int xmin = int(floorf(src_x)) - 3; + const int xmax = int(floorf(src_x)) + 4; + const int ymin = int(floorf(src_y)) - 3; + const int ymax = int(floorf(src_y)) + 4; + + for (int cy = ymin; cy <= ymax; ++cy) + { + float wy = lanczos_weight(src_y - static_cast(cy)); + if (wy == 0.0f) + continue; + + for (int cx = xmin; cx <= xmax; ++cx) + { + float wx = lanczos_weight(src_x - static_cast(cx)); + if (wx == 0.0f) + continue; + + float w = wy * wx; + + int iy = ::max(0, ::min(cy, in_height - 1)); + int ix = ::max(0, ::min(cx, in_width - 1)); + + T val = src(iy, ix); + + const elem_type* val_ptr = reinterpret_cast(&val); + elem_type elem_val = val_ptr[c]; + float channel_val = static_cast(elem_val); + + acc_val += channel_val * w; + acc_weight += w; + } + } + + float result = acc_weight > 0.0f ? (acc_val / acc_weight) : 0.0f; + results[c] = result; + } + + T result_vec; + elem_type* result_ptr = reinterpret_cast(&result_vec); + for (int c = 0; c < cn; ++c) + { + result_ptr[c] = saturate_cast(results[c]); + } + dst(y, x) = result_vec; + } + template __global__ void resize_nearest(const PtrStep src, PtrStepSz dst, const float fy, const float fx) { @@ -243,6 +326,21 @@ namespace cv { namespace cuda { namespace device cudaSafeCall( cudaDeviceSynchronize() ); } + // callers for lanczos interpolation + + template + void call_resize_lanczos4_glob(const PtrStepSz& src, const PtrStepSz& dst, float fy, float fx, cudaStream_t stream) + { + const dim3 block(32, 8); + const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + + resize_lanczos4<<>>(src, dst, fy, fx); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + // ResizeNearestDispatcher template struct ResizeNearestDispatcher @@ -352,6 +450,16 @@ namespace cv { namespace cuda { namespace device template <> struct ResizeCubicDispatcher : SelectImplForCubic {}; template <> struct ResizeCubicDispatcher : SelectImplForCubic {}; + // ResizeLanczosDispatcher + + template struct ResizeLanczosDispatcher + { + static void call(const PtrStepSz& src, const PtrStepSz& /*srcWhole*/, int /*yoff*/, int /*xoff*/, const PtrStepSz& dst, float fy, float fx, cudaStream_t stream) + { + call_resize_lanczos4_glob(src, dst, fy, fx, stream); + } + }; + // ResizeAreaDispatcher template struct ResizeAreaDispatcher @@ -393,18 +501,23 @@ namespace cv { namespace cuda { namespace device template void resize(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream) { typedef void (*func_t)(const PtrStepSz& src, const PtrStepSz& srcWhole, int yoff, int xoff, const PtrStepSz& dst, float fy, float fx, cudaStream_t stream); - static const func_t funcs[4] = + static const func_t funcs[5] = { ResizeNearestDispatcher::call, ResizeLinearDispatcher::call, ResizeCubicDispatcher::call, - ResizeAreaDispatcher::call + ResizeAreaDispatcher::call, + ResizeLanczosDispatcher::call }; // change to linear if area interpolation upscaling if (interpolation == 3 && (fx <= 1.f || fy <= 1.f)) interpolation = 1; + // Bounds check for interpolation mode + if (interpolation < 0 || interpolation >= 5) + interpolation = 1; // Default to linear + funcs[interpolation](static_cast< PtrStepSz >(src), static_cast< PtrStepSz >(srcWhole), yoff, xoff, static_cast< PtrStepSz >(dst), fy, fx, stream); } diff --git a/modules/cudawarping/src/resize.cpp b/modules/cudawarping/src/resize.cpp index 9943a6cdc6a..bb30cd1cfbe 100644 --- a/modules/cudawarping/src/resize.cpp +++ b/modules/cudawarping/src/resize.cpp @@ -70,7 +70,7 @@ void cv::cuda::resize(InputArray _src, OutputArray _dst, Size dsize, double fx, }; CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 ); - CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_AREA ); + CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_AREA || interpolation == INTER_LANCZOS4 ); CV_Assert( !(dsize == Size()) || (fx > 0 && fy > 0) ); if (dsize == Size()) diff --git a/modules/cudawarping/test/interpolation.hpp b/modules/cudawarping/test/interpolation.hpp index 7a00143e1d9..07fc5fa9a42 100644 --- a/modules/cudawarping/test/interpolation.hpp +++ b/modules/cudawarping/test/interpolation.hpp @@ -128,4 +128,56 @@ template struct CubicInterpolator } }; +template struct LanczosInterpolator +{ + static constexpr int A = 4; + + static float lanczosCoeff(float x_) + { + float x = fabsf(x_); + if (x == 0.0f) + return 1.0f; + if (x >= A) + return 0.0f; + + const float pi = 3.14159265358979323846f; + float pi_x = pi * x; + return sinf(pi_x) * sinf(pi_x / A) / (pi_x * pi_x / A); + } + + static T getValue(const cv::Mat& src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar()) + { + const int xmin = (int) floorf(x) - A + 1; + const int xmax = (int) floorf(x) + A; + + const int ymin = (int) floorf(y) - A + 1; + const int ymax = (int) floorf(y) + A; + + float sum = 0.0f; + float wsum = 0.0f; + + for (int cy = ymin; cy <= ymax; ++cy) + { + float wy = lanczosCoeff(y - cy); + if (wy == 0.0f) + continue; + + for (int cx = xmin; cx <= xmax; ++cx) + { + float wx = lanczosCoeff(x - cx); + if (wx == 0.0f) + continue; + + const float w = wy * wx; + sum += w * readVal(src, cy, cx, c, border_type, borderVal); + wsum += w; + } + } + + float res = (!wsum)? 0 : sum / wsum; + + return cv::saturate_cast(res); + } +}; + #endif // __OPENCV_TEST_INTERPOLATION_HPP__ diff --git a/modules/cudawarping/test/test_lanczos.cpp b/modules/cudawarping/test/test_lanczos.cpp new file mode 100644 index 00000000000..fd57aab2c02 --- /dev/null +++ b/modules/cudawarping/test/test_lanczos.cpp @@ -0,0 +1,230 @@ +#include "test_precomp.hpp" + +#ifdef HAVE_CUDA + +namespace opencv_test { namespace { + +/////////////////////////////////////////////////////////////////// +// Gold implementation + +namespace +{ + template + void resizeLanczosImpl(const cv::Mat& src, cv::Mat& dst, double fx, double fy) + { + const int cn = src.channels(); + + cv::Size dsize(cv::saturate_cast(src.cols * fx), cv::saturate_cast(src.rows * fy)); + + dst.create(dsize, src.type()); + + float ifx = static_cast(1.0 / fx); + float ify = static_cast(1.0 / fy); + + for (int y = 0; y < dsize.height; ++y) + { + for (int x = 0; x < dsize.width; ++x) + { + for (int c = 0; c < cn; ++c) + dst.at(y, x * cn + c) = LanczosInterpolator::getValue(src, y * ify, x * ifx, c, cv::BORDER_REPLICATE); + } + } + } + + void resizeLanczosGold(const cv::Mat& src, cv::Mat& dst, double fx, double fy) + { + typedef void (*func_t)(const cv::Mat& src, cv::Mat& dst, double fx, double fy); + + static const func_t lanczos_funcs[] = + { + resizeLanczosImpl, + resizeLanczosImpl, + resizeLanczosImpl, + resizeLanczosImpl, + resizeLanczosImpl, + resizeLanczosImpl + }; + + lanczos_funcs[src.depth()](src, dst, fx, fy); + } +} + +/////////////////////////////////////////////////////////////////// +// Test + +PARAM_TEST_CASE(ResizeLanczos, cv::cuda::DeviceInfo, cv::Size, MatType, double, UseRoi) +{ + cv::cuda::DeviceInfo devInfo; + cv::Size size; + double coeff; + int type; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + type = GET_PARAM(2); + coeff = GET_PARAM(3); + useRoi = GET_PARAM(4); + + cv::cuda::setDevice(devInfo.deviceID()); + } + + virtual void TearDown() + { + // GpuMat destructors will automatically clean up GPU memory + } +}; + +CUDA_TEST_P(ResizeLanczos, Accuracy) +{ + cv::Mat src = randomMat(size, type); + + cv::cuda::GpuMat dst = createMat(cv::Size(cv::saturate_cast(src.cols * coeff), cv::saturate_cast(src.rows * coeff)), type, useRoi); + cv::cuda::resize(loadMat(src, useRoi), dst, cv::Size(), coeff, coeff, cv::INTER_LANCZOS4); + + cv::Mat dst_gold; + resizeLanczosGold(src, dst_gold, coeff, coeff); + + EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-2 : 1.0); +} + +INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeLanczos, testing::Combine( + testing::Values(cv::cuda::DeviceInfo()), + DIFFERENT_SIZES, + testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)), + testing::Values(0.3, 0.5, 1.5, 2.0), + WHOLE_SUBMAT)); + +///////////////// + +PARAM_TEST_CASE(ResizeLanczosSameAsHost, cv::cuda::DeviceInfo, cv::Size, MatType, double, UseRoi) +{ + cv::cuda::DeviceInfo devInfo; + cv::Size size; + double coeff; + int type; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + type = GET_PARAM(2); + coeff = GET_PARAM(3); + useRoi = GET_PARAM(4); + + cv::cuda::setDevice(devInfo.deviceID()); + } + + virtual void TearDown() + { + // GpuMat destructors will automatically clean up GPU memory + } +}; + +CUDA_TEST_P(ResizeLanczosSameAsHost, Accuracy) +{ + cv::Mat src = randomMat(size, type); + + cv::cuda::GpuMat dst = createMat(cv::Size(cv::saturate_cast(src.cols * coeff), cv::saturate_cast(src.rows * coeff)), type, useRoi); + cv::cuda::resize(loadMat(src, useRoi), dst, cv::Size(), coeff, coeff, cv::INTER_LANCZOS4); + + cv::Mat dst_gold; + cv::resize(src, dst_gold, cv::Size(), coeff, coeff, cv::INTER_LANCZOS4); + + EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-2 : 1.0); +} + +INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeLanczosSameAsHost, testing::Combine( + testing::Values(cv::cuda::DeviceInfo()), + DIFFERENT_SIZES, + testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)), + testing::Values(0.3, 0.5, 1.5, 2.0), + WHOLE_SUBMAT)); + +///////////////// +// Performance Test + +PARAM_TEST_CASE(ResizeLanczosPerformance, cv::cuda::DeviceInfo, cv::Size, MatType, double) +{ + cv::cuda::DeviceInfo devInfo; + cv::Size size; + double coeff; + int type; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + type = GET_PARAM(2); + coeff = GET_PARAM(3); + + cv::cuda::setDevice(devInfo.deviceID()); + } +}; + +CUDA_TEST_P(ResizeLanczosPerformance, Performance) +{ + cv::Mat src = randomMat(size, type); + cv::Size dsize(cv::saturate_cast(src.cols * coeff), cv::saturate_cast(src.rows * coeff)); + + // Warm up + { + cv::cuda::GpuMat gpuSrc, gpuDst; + gpuSrc.upload(src); + gpuDst.create(dsize, type); + cv::cuda::resize(gpuSrc, gpuDst, cv::Size(), coeff, coeff, cv::INTER_LANCZOS4); + cv::Mat dummy; + gpuDst.download(dummy); // Synchronize + } + + // GPU performance + const int iterations = 100; + cv::TickMeter tm_gpu; + cv::cuda::GpuMat gpuSrc, gpuDst; + gpuSrc.upload(src); + gpuDst.create(dsize, type); + + tm_gpu.start(); + for (int i = 0; i < iterations; ++i) + { + cv::cuda::resize(gpuSrc, gpuDst, cv::Size(), coeff, coeff, cv::INTER_LANCZOS4); + } + cv::Mat dummy; + gpuDst.download(dummy); // Synchronize + tm_gpu.stop(); + + // CPU performance + cv::TickMeter tm_cpu; + cv::Mat dst_cpu; + + tm_cpu.start(); + for (int i = 0; i < iterations; ++i) + { + cv::resize(src, dst_cpu, cv::Size(), coeff, coeff, cv::INTER_LANCZOS4); + } + tm_cpu.stop(); + + double gpu_time = tm_gpu.getTimeMilli() / iterations; + double cpu_time = tm_cpu.getTimeMilli() / iterations; + double speedup = cpu_time / gpu_time; + + std::cout << "Size: " << size << " -> " << dsize + << ", Type: " << type + << ", Coeff: " << coeff << std::endl; + std::cout << " CPU: " << cpu_time << " ms" << std::endl; + std::cout << " GPU: " << gpu_time << " ms" << std::endl; + std::cout << " Speedup: " << speedup << "x" << std::endl; +} + +INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeLanczosPerformance, testing::Combine( + testing::Values(cv::cuda::DeviceInfo()), + testing::Values(cv::Size(512, 512), cv::Size(1024, 1024), cv::Size(2048, 2048)), + testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_32FC1), MatType(CV_32FC3)), + testing::Values(0.5, 1.5, 2.0))); + +}} // namespace +#endif // HAVE_CUDA + From 51dbf82e621e580bcccf2c8dff4340bdbf555dd7 Mon Sep 17 00:00:00 2001 From: yuxuewen Date: Wed, 10 Dec 2025 11:47:13 +0800 Subject: [PATCH 2/8] shared memory optimization --- modules/cudawarping/src/cuda/resize.cu | 304 +++++++++++++++++++--- modules/cudawarping/test/test_lanczos.cpp | 9 +- 2 files changed, 271 insertions(+), 42 deletions(-) diff --git a/modules/cudawarping/src/cuda/resize.cu b/modules/cudawarping/src/cuda/resize.cu index 9c75e829828..1b474731dda 100644 --- a/modules/cudawarping/src/cuda/resize.cu +++ b/modules/cudawarping/src/cuda/resize.cu @@ -54,7 +54,7 @@ namespace cv { namespace cuda { namespace device { - __device__ float lanczos_weight(float x_) + __device__ __forceinline__ float lanczos_weight(float x_) { float x = fabsf(x_); if (x == 0.0f) @@ -69,75 +69,154 @@ namespace cv { namespace cuda { namespace device template __global__ void resize_lanczos4(const PtrStepSz src, PtrStepSz dst, const float fy, const float fx) { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int bx = blockIdx.x; + const int by = blockIdx.y; - if (x >= dst.cols || y >= dst.rows) - return; - - const float src_x = static_cast(x) * fx; - const float src_y = static_cast(y) * fy; + const int x = bx * blockDim.x + tx; + const int y = by * blockDim.y + ty; const int in_height = src.rows; const int in_width = src.cols; + constexpr int R = 4; + constexpr int BASE_W = 32; + constexpr int BASE_H = 8; + constexpr int SHARED_WIDTH_MAX = BASE_W + R + R; + constexpr int SHARED_HEIGHT_MAX = BASE_H + R + R; + + __shared__ T shared_src[SHARED_HEIGHT_MAX][SHARED_WIDTH_MAX]; + typedef typename VecTraits::elem_type elem_type; constexpr int cn = VecTraits::cn; - float results[cn] = {0.0f}; - for (int c = 0; c < cn; ++c) - { - float acc_val = 0.0f; - float acc_weight = 0.0f; + const int out_x0 = bx * blockDim.x; + const int out_x1 = ::min(out_x0 + blockDim.x - 1, dst.cols - 1); + const int out_y0 = by * blockDim.y; + const int out_y1 = ::min(out_y0 + blockDim.y - 1, dst.rows - 1); + const float src_x0_f = (static_cast(out_x0) + 0.5f) * fx - 0.5f; + const float src_x1_f = (static_cast(out_x1) + 0.5f) * fx - 0.5f; + const float src_y0_f = (static_cast(out_y0) + 0.5f) * fy - 0.5f; + const float src_y1_f = (static_cast(out_y1) + 0.5f) * fy - 0.5f; - const int xmin = int(floorf(src_x)) - 3; - const int xmax = int(floorf(src_x)) + 4; - const int ymin = int(floorf(src_y)) - 3; - const int ymax = int(floorf(src_y)) + 4; + int in_x_min = int(floorf(src_x0_f)) - R; + int in_x_max = int(floorf(src_x1_f)) + R; + int in_y_min = int(floorf(src_y0_f)) - R; + int in_y_max = int(floorf(src_y1_f)) + R; - for (int cy = ymin; cy <= ymax; ++cy) - { - float wy = lanczos_weight(src_y - static_cast(cy)); - if (wy == 0.0f) - continue; + if (in_x_min < 0) in_x_min = 0; + if (in_y_min < 0) in_y_min = 0; + if (in_x_max >= in_width) in_x_max = in_width - 1; + if (in_y_max >= in_height) in_y_max = in_height - 1; - for (int cx = xmin; cx <= xmax; ++cx) + const int W_needed = in_x_max - in_x_min + 1; + const int H_needed = in_y_max - in_y_min + 1; + + // for fx <= 1 and fy <= 1 + const bool use_shared = (W_needed <= SHARED_WIDTH_MAX) && (H_needed <= SHARED_HEIGHT_MAX); + + if (use_shared) + { + for (int sy = ty; sy < H_needed; sy += blockDim.y) + { + int iy = in_y_min + sy; + for (int sx = tx; sx < W_needed; sx += blockDim.x) { - float wx = lanczos_weight(src_x - static_cast(cx)); - if (wx == 0.0f) - continue; + int ix = in_x_min + sx; + shared_src[sy][sx] = src(iy, ix); + } + } + __syncthreads(); + } - float w = wy * wx; + if (x >= dst.cols || y >= dst.rows) + { + if (use_shared) { __syncthreads(); } + return; + } - int iy = ::max(0, ::min(cy, in_height - 1)); - int ix = ::max(0, ::min(cx, in_width - 1)); + const float src_x = (static_cast(x) + 0.5f) * fx - 0.5f; + const float src_y = (static_cast(y) + 0.5f) * fy - 0.5f; - T val = src(iy, ix); - - const elem_type* val_ptr = reinterpret_cast(&val); - elem_type elem_val = val_ptr[c]; - float channel_val = static_cast(elem_val); + const int xmin = int(floorf(src_x)) - 3; + const int xmax = int(floorf(src_x)) + 4; + const int ymin = int(floorf(src_y)) - 3; + const int ymax = int(floorf(src_y)) + 4; - acc_val += channel_val * w; - acc_weight += w; + float results[cn]; + float acc_weights[cn]; + #pragma unroll + for (int c = 0; c < cn; ++c) { results[c] = 0.0f; acc_weights[c] = 0.0f; } + + for (int cy = ymin; cy <= ymax; ++cy) + { + float wy = lanczos_weight(src_y - static_cast(cy)); + if (wy == 0.0f) continue; + + for (int cx = xmin; cx <= xmax; ++cx) + { + float wx = lanczos_weight(src_x - static_cast(cx)); + if (wx == 0.0f) continue; + + float w = wy * wx; + + if (use_shared) + { + int sx = cx - in_x_min; + int sy = cy - in_y_min; + if (sx < 0) sx = 0; + else if (sx >= W_needed) sx = W_needed - 1; + if (sy < 0) sy = 0; + else if (sy >= H_needed) sy = H_needed - 1; + + T val = shared_src[sy][sx]; + const elem_type* val_ptr = reinterpret_cast(&val); + #pragma unroll + for (int c = 0; c < cn; ++c) + { + elem_type elem_val = val_ptr[c]; + float channel_val = static_cast(elem_val); + results[c] += channel_val * w; + acc_weights[c] += w; + } + } + else + { + // fallback + int iy_r = cy < 0 ? 0 : (cy >= in_height ? (in_height - 1) : cy); + int ix_r = cx < 0 ? 0 : (cx >= in_width ? (in_width - 1) : cx); + T val = src(iy_r, ix_r); + const elem_type* val_ptr = reinterpret_cast(&val); + #pragma unroll + for (int c = 0; c < cn; ++c) + { + elem_type elem_val = val_ptr[c]; + float channel_val = static_cast(elem_val); + results[c] += channel_val * w; + acc_weights[c] += w; + } } } - - float result = acc_weight > 0.0f ? (acc_val / acc_weight) : 0.0f; - results[c] = result; } + #pragma unroll + for (int c = 0; c < cn; ++c) + results[c] = acc_weights[c] > 0.0f ? (results[c] / acc_weights[c]) : 0.0f; + T result_vec; elem_type* result_ptr = reinterpret_cast(&result_vec); + + #pragma unroll for (int c = 0; c < cn; ++c) - { result_ptr[c] = saturate_cast(results[c]); - } + dst(y, x) = result_vec; } + template __global__ void resize_nearest(const PtrStep src, PtrStepSz dst, const float fy, const float fx) { const int dst_x = blockDim.x * blockIdx.x + threadIdx.x; @@ -341,6 +420,92 @@ namespace cv { namespace cuda { namespace device cudaSafeCall( cudaDeviceSynchronize() ); } + template + __global__ void resize_lanczos4_tex(Ptr2D src, PtrStepSz dst, const float fy, const float fx) + { + const int x = blockDim.x * blockIdx.x + threadIdx.x; + const int y = blockDim.y * blockIdx.y + threadIdx.y; + + if (x >= dst.cols || y >= dst.rows) + return; + + const float src_x = (static_cast(x) + 0.5f) * fx - 0.5f; + const float src_y = (static_cast(y) + 0.5f) * fy - 0.5f; + + typedef typename VecTraits::elem_type elem_type; + constexpr int cn = VecTraits::cn; + float results[cn] = {0.0f}; + + for (int c = 0; c < cn; ++c) + { + float acc_val = 0.0f; + float acc_weight = 0.0f; + + const int xmin = int(floorf(src_x)) - 3; + const int xmax = int(floorf(src_x)) + 4; + const int ymin = int(floorf(src_y)) - 3; + const int ymax = int(floorf(src_y)) + 4; + + for (int cy = ymin; cy <= ymax; ++cy) + { + float wy = lanczos_weight(src_y - static_cast(cy)); + if (wy == 0.0f) + continue; + + for (int cx = xmin; cx <= xmax; ++cx) + { + float wx = lanczos_weight(src_x - static_cast(cx)); + if (wx == 0.0f) + continue; + + float w = wy * wx; + + // Use texture memory for sampling (handles boundary automatically) + T val = src(static_cast(cy), static_cast(cx)); + + const elem_type* val_ptr = reinterpret_cast(&val); + elem_type elem_val = val_ptr[c]; + float channel_val = static_cast(elem_val); + + acc_val += channel_val * w; + acc_weight += w; + } + } + + float result = acc_weight > 0.0f ? (acc_val / acc_weight) : 0.0f; + results[c] = result; + } + + T result_vec; + elem_type* result_ptr = reinterpret_cast(&result_vec); + for (int c = 0; c < cn; ++c) + { + result_ptr[c] = saturate_cast(results[c]); + } + dst(y, x) = result_vec; + } + + template + void call_resize_lanczos4_tex(const PtrStepSz& src, const PtrStepSz& srcWhole, int yoff, int xoff, const PtrStepSz& dst, float fy, float fx) + { + const dim3 block(32, 8); + const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + if (srcWhole.data == src.data) + { + cudev::Texture texSrc(src); + resize_lanczos4_tex><<>>(texSrc, dst, fy, fx); + } + else + { + cudev::TextureOff texSrcWhole(srcWhole, yoff, xoff); + BrdReplicate brd(src.rows, src.cols); + BorderReader, BrdReplicate> brdSrc(texSrcWhole, brd); + resize_lanczos4_tex, BrdReplicate>><<>>(brdSrc, dst, fy, fx); + } + cudaSafeCall( cudaGetLastError() ); + cudaSafeCall( cudaDeviceSynchronize() ); + } + // ResizeNearestDispatcher template struct ResizeNearestDispatcher @@ -460,6 +625,63 @@ namespace cv { namespace cuda { namespace device } }; + template struct SelectImplForLanczos + { + static void call(const PtrStepSz& src, const PtrStepSz& srcWhole, int yoff, int xoff, const PtrStepSz& dst, float fy, float fx, cudaStream_t stream) + { + if (stream) + call_resize_lanczos4_glob(src, dst, fy, fx, stream); + else + { + if (fx > 1 || fy > 1) + call_resize_lanczos4_glob(src, dst, fy, fx, 0); + else + call_resize_lanczos4_tex(src, srcWhole, yoff, xoff, dst, fy, fx); + } + } + }; + + // Texture memory doesn't support 3-channel types, so use glob for those + template <> struct ResizeLanczosDispatcher : SelectImplForLanczos {}; + template <> struct ResizeLanczosDispatcher + { + static void call(const PtrStepSz& src, const PtrStepSz& /*srcWhole*/, int /*yoff*/, int /*xoff*/, const PtrStepSz& dst, float fy, float fx, cudaStream_t stream) + { + call_resize_lanczos4_glob(src, dst, fy, fx, stream); + } + }; + template <> struct ResizeLanczosDispatcher : SelectImplForLanczos {}; + + template <> struct ResizeLanczosDispatcher : SelectImplForLanczos {}; + template <> struct ResizeLanczosDispatcher + { + static void call(const PtrStepSz& src, const PtrStepSz& /*srcWhole*/, int /*yoff*/, int /*xoff*/, const PtrStepSz& dst, float fy, float fx, cudaStream_t stream) + { + call_resize_lanczos4_glob(src, dst, fy, fx, stream); + } + }; + template <> struct ResizeLanczosDispatcher : SelectImplForLanczos {}; + + template <> struct ResizeLanczosDispatcher : SelectImplForLanczos {}; + template <> struct ResizeLanczosDispatcher + { + static void call(const PtrStepSz& src, const PtrStepSz& /*srcWhole*/, int /*yoff*/, int /*xoff*/, const PtrStepSz& dst, float fy, float fx, cudaStream_t stream) + { + call_resize_lanczos4_glob(src, dst, fy, fx, stream); + } + }; + template <> struct ResizeLanczosDispatcher : SelectImplForLanczos {}; + + template <> struct ResizeLanczosDispatcher : SelectImplForLanczos {}; + template <> struct ResizeLanczosDispatcher + { + static void call(const PtrStepSz& src, const PtrStepSz& /*srcWhole*/, int /*yoff*/, int /*xoff*/, const PtrStepSz& dst, float fy, float fx, cudaStream_t stream) + { + call_resize_lanczos4_glob(src, dst, fy, fx, stream); + } + }; + template <> struct ResizeLanczosDispatcher : SelectImplForLanczos {}; + // ResizeAreaDispatcher template struct ResizeAreaDispatcher diff --git a/modules/cudawarping/test/test_lanczos.cpp b/modules/cudawarping/test/test_lanczos.cpp index fd57aab2c02..d78f0ef5d2d 100644 --- a/modules/cudawarping/test/test_lanczos.cpp +++ b/modules/cudawarping/test/test_lanczos.cpp @@ -21,12 +21,19 @@ namespace float ifx = static_cast(1.0 / fx); float ify = static_cast(1.0 / fy); + // OpenCV CPU resize uses center-aligned coordinate mapping: (x + 0.5) * fx - 0.5 + // Since fx and fy here are scale factors, and ifx = 1.0 / fx, ify = 1.0 / fy, + // the center-aligned mapping becomes: (x + 0.5) / fx - 0.5 = (x + 0.5) * ifx - 0.5 for (int y = 0; y < dsize.height; ++y) { for (int x = 0; x < dsize.width; ++x) { for (int c = 0; c < cn; ++c) - dst.at(y, x * cn + c) = LanczosInterpolator::getValue(src, y * ify, x * ifx, c, cv::BORDER_REPLICATE); + { + float src_x = (static_cast(x) + 0.5f) * ifx - 0.5f; + float src_y = (static_cast(y) + 0.5f) * ify - 0.5f; + dst.at(y, x * cn + c) = LanczosInterpolator::getValue(src, src_y, src_x, c, cv::BORDER_REPLICATE); + } } } } From 31c1978f8756ea67ae010dbc67eba70c8a563feb Mon Sep 17 00:00:00 2001 From: yuxuewen Date: Wed, 10 Dec 2025 15:41:39 +0800 Subject: [PATCH 3/8] Fix whitespace issues in test files --- modules/cudawarping/test/interpolation.hpp | 2 +- modules/cudawarping/test/test_lanczos.cpp | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/modules/cudawarping/test/interpolation.hpp b/modules/cudawarping/test/interpolation.hpp index 07fc5fa9a42..a008d3fd563 100644 --- a/modules/cudawarping/test/interpolation.hpp +++ b/modules/cudawarping/test/interpolation.hpp @@ -139,7 +139,7 @@ template struct LanczosInterpolator return 1.0f; if (x >= A) return 0.0f; - + const float pi = 3.14159265358979323846f; float pi_x = pi * x; return sinf(pi_x) * sinf(pi_x / A) / (pi_x * pi_x / A); diff --git a/modules/cudawarping/test/test_lanczos.cpp b/modules/cudawarping/test/test_lanczos.cpp index d78f0ef5d2d..bb06058635a 100644 --- a/modules/cudawarping/test/test_lanczos.cpp +++ b/modules/cudawarping/test/test_lanczos.cpp @@ -218,8 +218,8 @@ CUDA_TEST_P(ResizeLanczosPerformance, Performance) double cpu_time = tm_cpu.getTimeMilli() / iterations; double speedup = cpu_time / gpu_time; - std::cout << "Size: " << size << " -> " << dsize - << ", Type: " << type + std::cout << "Size: " << size << " -> " << dsize + << ", Type: " << type << ", Coeff: " << coeff << std::endl; std::cout << " CPU: " << cpu_time << " ms" << std::endl; std::cout << " GPU: " << gpu_time << " ms" << std::endl; @@ -234,4 +234,3 @@ INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeLanczosPerformance, testing::Combine }} // namespace #endif // HAVE_CUDA - From 3cd6cf7f43713fc8eeb166415409cf5328033fe7 Mon Sep 17 00:00:00 2001 From: yuxuewen Date: Wed, 10 Dec 2025 17:00:15 +0800 Subject: [PATCH 4/8] move performance tests code to modules//perf/ --- modules/cudawarping/perf/perf_warping.cpp | 42 ++++++++++++ modules/cudawarping/test/test_lanczos.cpp | 81 ----------------------- 2 files changed, 42 insertions(+), 81 deletions(-) diff --git a/modules/cudawarping/perf/perf_warping.cpp b/modules/cudawarping/perf/perf_warping.cpp index 3e7aa18f559..fb67363529f 100644 --- a/modules/cudawarping/perf/perf_warping.cpp +++ b/modules/cudawarping/perf/perf_warping.cpp @@ -180,6 +180,48 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, Resize, } } +////////////////////////////////////////////////////////////////////// +// ResizeLanczos + +PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, ResizeLanczos, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_32F), + CUDA_CHANNELS_1_3_4, + Values(Interpolation(cv::INTER_LANCZOS4)), + Values(0.5, 1.5, 2.0))) +{ + declare.time(20.0); + + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const int channels = GET_PARAM(2); + const int interpolation = GET_PARAM(3); + const double f = GET_PARAM(4); + + const int type = CV_MAKE_TYPE(depth, channels); + + cv::Mat src(size, type); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::resize(d_src, dst, cv::Size(), f, f, interpolation); + + CUDA_SANITY_CHECK(dst, 1e-3, ERROR_RELATIVE); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::resize(src, dst, cv::Size(), f, f, interpolation); + + CPU_SANITY_CHECK(dst); + } +} + ////////////////////////////////////////////////////////////////////// // ResizeArea diff --git a/modules/cudawarping/test/test_lanczos.cpp b/modules/cudawarping/test/test_lanczos.cpp index bb06058635a..32b2684c293 100644 --- a/modules/cudawarping/test/test_lanczos.cpp +++ b/modules/cudawarping/test/test_lanczos.cpp @@ -151,86 +151,5 @@ INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeLanczosSameAsHost, testing::Combine( testing::Values(0.3, 0.5, 1.5, 2.0), WHOLE_SUBMAT)); -///////////////// -// Performance Test - -PARAM_TEST_CASE(ResizeLanczosPerformance, cv::cuda::DeviceInfo, cv::Size, MatType, double) -{ - cv::cuda::DeviceInfo devInfo; - cv::Size size; - double coeff; - int type; - - virtual void SetUp() - { - devInfo = GET_PARAM(0); - size = GET_PARAM(1); - type = GET_PARAM(2); - coeff = GET_PARAM(3); - - cv::cuda::setDevice(devInfo.deviceID()); - } -}; - -CUDA_TEST_P(ResizeLanczosPerformance, Performance) -{ - cv::Mat src = randomMat(size, type); - cv::Size dsize(cv::saturate_cast(src.cols * coeff), cv::saturate_cast(src.rows * coeff)); - - // Warm up - { - cv::cuda::GpuMat gpuSrc, gpuDst; - gpuSrc.upload(src); - gpuDst.create(dsize, type); - cv::cuda::resize(gpuSrc, gpuDst, cv::Size(), coeff, coeff, cv::INTER_LANCZOS4); - cv::Mat dummy; - gpuDst.download(dummy); // Synchronize - } - - // GPU performance - const int iterations = 100; - cv::TickMeter tm_gpu; - cv::cuda::GpuMat gpuSrc, gpuDst; - gpuSrc.upload(src); - gpuDst.create(dsize, type); - - tm_gpu.start(); - for (int i = 0; i < iterations; ++i) - { - cv::cuda::resize(gpuSrc, gpuDst, cv::Size(), coeff, coeff, cv::INTER_LANCZOS4); - } - cv::Mat dummy; - gpuDst.download(dummy); // Synchronize - tm_gpu.stop(); - - // CPU performance - cv::TickMeter tm_cpu; - cv::Mat dst_cpu; - - tm_cpu.start(); - for (int i = 0; i < iterations; ++i) - { - cv::resize(src, dst_cpu, cv::Size(), coeff, coeff, cv::INTER_LANCZOS4); - } - tm_cpu.stop(); - - double gpu_time = tm_gpu.getTimeMilli() / iterations; - double cpu_time = tm_cpu.getTimeMilli() / iterations; - double speedup = cpu_time / gpu_time; - - std::cout << "Size: " << size << " -> " << dsize - << ", Type: " << type - << ", Coeff: " << coeff << std::endl; - std::cout << " CPU: " << cpu_time << " ms" << std::endl; - std::cout << " GPU: " << gpu_time << " ms" << std::endl; - std::cout << " Speedup: " << speedup << "x" << std::endl; -} - -INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeLanczosPerformance, testing::Combine( - testing::Values(cv::cuda::DeviceInfo()), - testing::Values(cv::Size(512, 512), cv::Size(1024, 1024), cv::Size(2048, 2048)), - testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_32FC1), MatType(CV_32FC3)), - testing::Values(0.5, 1.5, 2.0))); - }} // namespace #endif // HAVE_CUDA From c451a64e07646f802994fcded0c681b6bddc6dd2 Mon Sep 17 00:00:00 2001 From: yuxuewen Date: Wed, 10 Dec 2025 19:00:52 +0800 Subject: [PATCH 5/8] Fix uninitialized output parameter error --- modules/cudawarping/perf/perf_warping.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/modules/cudawarping/perf/perf_warping.cpp b/modules/cudawarping/perf/perf_warping.cpp index fb67363529f..1a37779c2c1 100644 --- a/modules/cudawarping/perf/perf_warping.cpp +++ b/modules/cudawarping/perf/perf_warping.cpp @@ -207,14 +207,23 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, ResizeLanczos, { const cv::cuda::GpuMat d_src(src); cv::cuda::GpuMat dst; + cv::Size dsize(cv::saturate_cast(src.cols * f), cv::saturate_cast(src.rows * f)); + cv::Mat host_dst(dsize, type); + + declare.out(host_dst); TEST_CYCLE() cv::cuda::resize(d_src, dst, cv::Size(), f, f, interpolation); + dst.download(host_dst); + CUDA_SANITY_CHECK(dst, 1e-3, ERROR_RELATIVE); } else { - cv::Mat dst; + cv::Size dsize(cv::saturate_cast(src.cols * f), cv::saturate_cast(src.rows * f)); + cv::Mat dst(dsize, type); + + declare.out(dst); TEST_CYCLE() cv::resize(src, dst, cv::Size(), f, f, interpolation); From 2c658849fad7687350f117f91e2cd4fb6dfbc890 Mon Sep 17 00:00:00 2001 From: yuxuewen Date: Thu, 11 Dec 2025 15:26:31 +0800 Subject: [PATCH 6/8] Trigger CI: opencv_extra regression data ready From 32bd21088796f0e35beadb4ca72fcbf1fe45197b Mon Sep 17 00:00:00 2001 From: yuxuewen Date: Tue, 16 Dec 2025 15:08:18 +0800 Subject: [PATCH 7/8] Replace error hiding with CV_Assert for interpolation bounds check --- modules/cudawarping/src/cuda/resize.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modules/cudawarping/src/cuda/resize.cu b/modules/cudawarping/src/cuda/resize.cu index 1b474731dda..3ac9a28eac1 100644 --- a/modules/cudawarping/src/cuda/resize.cu +++ b/modules/cudawarping/src/cuda/resize.cu @@ -737,8 +737,7 @@ namespace cv { namespace cuda { namespace device interpolation = 1; // Bounds check for interpolation mode - if (interpolation < 0 || interpolation >= 5) - interpolation = 1; // Default to linear + CV_Assert(interpolation >= 0 && interpolation < 5); funcs[interpolation](static_cast< PtrStepSz >(src), static_cast< PtrStepSz >(srcWhole), yoff, xoff, static_cast< PtrStepSz >(dst), fy, fx, stream); } From dba848003ff9779bf34d3cfbed6bc61af3f8b01e Mon Sep 17 00:00:00 2001 From: yuxuewen Date: Tue, 16 Dec 2025 15:20:01 +0800 Subject: [PATCH 8/8] Replace constant pi with CV_PI in LanczosInterpolator --- modules/cudawarping/test/interpolation.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modules/cudawarping/test/interpolation.hpp b/modules/cudawarping/test/interpolation.hpp index a008d3fd563..32703dc1065 100644 --- a/modules/cudawarping/test/interpolation.hpp +++ b/modules/cudawarping/test/interpolation.hpp @@ -140,8 +140,7 @@ template struct LanczosInterpolator if (x >= A) return 0.0f; - const float pi = 3.14159265358979323846f; - float pi_x = pi * x; + float pi_x = CV_PI * x; return sinf(pi_x) * sinf(pi_x / A) / (pi_x * pi_x / A); }