From 595cde3ff43c019a488f4e8093d20fb79d838cd5 Mon Sep 17 00:00:00 2001
From: yuxuewen <c8libur@gmail.com>
Date: Tue, 9 Dec 2025 10:21:12 +0800
Subject: [PATCH 1/8] base lanczos4

---
 modules/cudawarping/src/cuda/resize.cu     | 117 ++++++++++-
 modules/cudawarping/src/resize.cpp         |   2 +-
 modules/cudawarping/test/interpolation.hpp |  52 +++++
 modules/cudawarping/test/test_lanczos.cpp  | 230 +++++++++++++++++++++
 4 files changed, 398 insertions(+), 3 deletions(-)
 create mode 100644 modules/cudawarping/test/test_lanczos.cpp
diff --git a/modules/cudawarping/src/cuda/resize.cu b/modules/cudawarping/src/cuda/resize.cu
index 27cec5564ef..9c75e829828 100644
--- a/modules/cudawarping/src/cuda/resize.cu
+++ b/modules/cudawarping/src/cuda/resize.cu
@@ -43,6 +43,7 @@
 #if !defined CUDA_DISABLER
 
 #include <cfloat>
+#include <cmath>
 #include "opencv2/core/cuda/common.hpp"
 #include "opencv2/core/cuda/border_interpolate.hpp"
 #include "opencv2/core/cuda/vec_traits.hpp"
@@ -53,7 +54,89 @@
 
 namespace cv { namespace cuda { namespace device
 {
+    __device__ float lanczos_weight(float x_)
+    {
+        float x = fabsf(x_);
+        if (x == 0.0f)
+            return 1.0f;
+        if (x >= 4.0f)
+            return 0.0f;
+        float pi_x = M_PI * x;
+        return sinf(pi_x) * sinf(pi_x / 4.0f) / (pi_x * pi_x / 4.0f);
+    }
+
     // kernels
+    template <typename T>
+    __global__ void resize_lanczos4(const PtrStepSz<T> src, PtrStepSz<T> dst, const float fy, const float fx)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x >= dst.cols || y >= dst.rows)
+            return;
+
+        const float src_x = static_cast<float>(x) * fx;
+        const float src_y = static_cast<float>(y) * fy;
+
+        const int in_height = src.rows;
+        const int in_width = src.cols;
+
+        typedef typename VecTraits<T>::elem_type elem_type;
+        constexpr int cn = VecTraits<T>::cn;
+        float results[cn] = {0.0f};
+
+        for (int c = 0; c < cn; ++c)
+        {
+            float acc_val = 0.0f;
+            float acc_weight = 0.0f;
+
+
+            const int xmin = int(floorf(src_x)) - 3;
+            const int xmax = int(floorf(src_x)) + 4;
+            const int ymin = int(floorf(src_y)) - 3;
+            const int ymax = int(floorf(src_y)) + 4;
+
+            for (int cy = ymin; cy <= ymax; ++cy)
+            {
+                float wy = lanczos_weight(src_y - static_cast<float>(cy));
+                if (wy == 0.0f)
+                    continue;
+
+                for (int cx = xmin; cx <= xmax; ++cx)
+                {
+                    float wx = lanczos_weight(src_x - static_cast<float>(cx));
+                    if (wx == 0.0f)
+                        continue;
+
+                    float w = wy * wx;
+
+                    int iy = ::max(0, ::min(cy, in_height - 1));
+                    int ix = ::max(0, ::min(cx, in_width - 1));
+
+                    T val = src(iy, ix);
+                    
+                    const elem_type* val_ptr = reinterpret_cast<const elem_type*>(&val);
+                    elem_type elem_val = val_ptr[c];
+                    float channel_val = static_cast<float>(elem_val);
+
+                    acc_val += channel_val * w;
+                    acc_weight += w;
+                }
+            }
+
+            float result = acc_weight > 0.0f ? (acc_val / acc_weight) : 0.0f;
+            results[c] = result;
+        }
+
+        T result_vec;
+        elem_type* result_ptr = reinterpret_cast<elem_type*>(&result_vec);
+        for (int c = 0; c < cn; ++c)
+        {
+            result_ptr[c] = saturate_cast<elem_type>(results[c]);
+        }
+        dst(y, x) = result_vec;
+    }
+
 
     template <typename T> __global__ void resize_nearest(const PtrStep<T> src, PtrStepSz<T> dst, const float fy, const float fx)
     {
@@ -243,6 +326,21 @@ namespace cv { namespace cuda { namespace device
         cudaSafeCall( cudaDeviceSynchronize() );
     }
 
+    // callers for lanczos interpolation
+
+    template <typename T>
+    void call_resize_lanczos4_glob(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+        resize_lanczos4<<<grid, block, 0, stream>>>(src, dst, fy, fx);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
     // ResizeNearestDispatcher
 
     template <typename T> struct ResizeNearestDispatcher
@@ -352,6 +450,16 @@ namespace cv { namespace cuda { namespace device
     template <> struct ResizeCubicDispatcher<float> : SelectImplForCubic<float> {};
     template <> struct ResizeCubicDispatcher<float4> : SelectImplForCubic<float4> {};
 
+    // ResizeLanczosDispatcher
+
+    template <typename T> struct ResizeLanczosDispatcher
+    {
+        static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& /*srcWhole*/, int /*yoff*/, int /*xoff*/, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            call_resize_lanczos4_glob(src, dst, fy, fx, stream);
+        }
+    };
+
     // ResizeAreaDispatcher
 
     template <typename T> struct ResizeAreaDispatcher
@@ -393,18 +501,23 @@ namespace cv { namespace cuda { namespace device
     template <typename T> void resize(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream)
     {
         typedef void (*func_t)(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream);
-        static const func_t funcs[4] =
+        static const func_t funcs[5] =
         {
             ResizeNearestDispatcher<T>::call,
             ResizeLinearDispatcher<T>::call,
             ResizeCubicDispatcher<T>::call,
-            ResizeAreaDispatcher<T>::call
+            ResizeAreaDispatcher<T>::call,
+            ResizeLanczosDispatcher<T>::call
         };
 
         // change to linear if area interpolation upscaling
         if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
             interpolation = 1;
 
+        // Bounds check for interpolation mode
+        if (interpolation < 0 || interpolation >= 5)
+            interpolation = 1; // Default to linear
+
         funcs[interpolation](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), yoff, xoff, static_cast< PtrStepSz<T> >(dst), fy, fx, stream);
     }
 
diff --git a/modules/cudawarping/src/resize.cpp b/modules/cudawarping/src/resize.cpp
index 9943a6cdc6a..bb30cd1cfbe 100644
--- a/modules/cudawarping/src/resize.cpp
+++ b/modules/cudawarping/src/resize.cpp
@@ -70,7 +70,7 @@ void cv::cuda::resize(InputArray _src, OutputArray _dst, Size dsize, double fx,
     };
 
     CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
-    CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_AREA );
+    CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_AREA || interpolation == INTER_LANCZOS4 );
     CV_Assert( !(dsize == Size()) || (fx > 0 && fy > 0) );
 
     if (dsize == Size())
diff --git a/modules/cudawarping/test/interpolation.hpp b/modules/cudawarping/test/interpolation.hpp
index 7a00143e1d9..07fc5fa9a42 100644
--- a/modules/cudawarping/test/interpolation.hpp
+++ b/modules/cudawarping/test/interpolation.hpp
@@ -128,4 +128,56 @@ template <typename T> struct CubicInterpolator
     }
 };
 
+template <typename T> struct LanczosInterpolator
+{
+    static constexpr int A = 4;
+
+    static float lanczosCoeff(float x_)
+    {
+        float x = fabsf(x_);
+        if (x == 0.0f)
+            return 1.0f;
+        if (x >= A)
+            return 0.0f;
+        
+        const float pi = 3.14159265358979323846f;
+        float pi_x = pi * x;
+        return sinf(pi_x) * sinf(pi_x / A) / (pi_x * pi_x / A);
+    }
+
+    static T getValue(const cv::Mat& src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
+    {
+        const int xmin = (int) floorf(x) - A + 1;
+        const int xmax = (int) floorf(x) + A;
+
+        const int ymin = (int) floorf(y) - A + 1;
+        const int ymax = (int) floorf(y) + A;
+
+        float sum  = 0.0f;
+        float wsum = 0.0f;
+
+        for (int cy = ymin; cy <= ymax; ++cy)
+        {
+            float wy = lanczosCoeff(y - cy);
+            if (wy == 0.0f)
+                continue;
+
+            for (int cx = xmin; cx <= xmax; ++cx)
+            {
+                float wx = lanczosCoeff(x - cx);
+                if (wx == 0.0f)
+                    continue;
+
+                const float w = wy * wx;
+                sum += w * readVal<T>(src, cy, cx, c, border_type, borderVal);
+                wsum += w;
+            }
+        }
+
+        float res = (!wsum)? 0 : sum / wsum;
+
+        return cv::saturate_cast<T>(res);
+    }
+};
+
 #endif // __OPENCV_TEST_INTERPOLATION_HPP__
diff --git a/modules/cudawarping/test/test_lanczos.cpp b/modules/cudawarping/test/test_lanczos.cpp
new file mode 100644
index 00000000000..fd57aab2c02
--- /dev/null
+++ b/modules/cudawarping/test/test_lanczos.cpp
@@ -0,0 +1,230 @@
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+namespace opencv_test { namespace {
+
+///////////////////////////////////////////////////////////////////
+// Gold implementation
+
+namespace
+{
+    template <typename T>
+    void resizeLanczosImpl(const cv::Mat& src, cv::Mat& dst, double fx, double fy)
+    {
+        const int cn = src.channels();
+
+        cv::Size dsize(cv::saturate_cast<int>(src.cols * fx), cv::saturate_cast<int>(src.rows * fy));
+
+        dst.create(dsize, src.type());
+
+        float ifx = static_cast<float>(1.0 / fx);
+        float ify = static_cast<float>(1.0 / fy);
+
+        for (int y = 0; y < dsize.height; ++y)
+        {
+            for (int x = 0; x < dsize.width; ++x)
+            {
+                for (int c = 0; c < cn; ++c)
+                    dst.at<T>(y, x * cn + c) = LanczosInterpolator<T>::getValue(src, y * ify, x * ifx, c, cv::BORDER_REPLICATE);
+            }
+        }
+    }
+
+    void resizeLanczosGold(const cv::Mat& src, cv::Mat& dst, double fx, double fy)
+    {
+        typedef void (*func_t)(const cv::Mat& src, cv::Mat& dst, double fx, double fy);
+
+        static const func_t lanczos_funcs[] =
+        {
+            resizeLanczosImpl<unsigned char>,
+            resizeLanczosImpl<signed char>,
+            resizeLanczosImpl<unsigned short>,
+            resizeLanczosImpl<short>,
+            resizeLanczosImpl<int>,
+            resizeLanczosImpl<float>
+        };
+
+        lanczos_funcs[src.depth()](src, dst, fx, fy);
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+// Test
+
+PARAM_TEST_CASE(ResizeLanczos, cv::cuda::DeviceInfo, cv::Size, MatType, double, UseRoi)
+{
+    cv::cuda::DeviceInfo devInfo;
+    cv::Size size;
+    double coeff;
+    int type;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        coeff = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
+
+        cv::cuda::setDevice(devInfo.deviceID());
+    }
+
+    virtual void TearDown()
+    {
+        // GpuMat destructors will automatically clean up GPU memory
+    }
+};
+
+CUDA_TEST_P(ResizeLanczos, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+
+    cv::cuda::GpuMat dst = createMat(cv::Size(cv::saturate_cast<int>(src.cols * coeff), cv::saturate_cast<int>(src.rows * coeff)), type, useRoi);
+    cv::cuda::resize(loadMat(src, useRoi), dst, cv::Size(), coeff, coeff, cv::INTER_LANCZOS4);
+
+    cv::Mat dst_gold;
+    resizeLanczosGold(src, dst_gold, coeff, coeff);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-2 : 1.0);
+}
+
+INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeLanczos, testing::Combine(
+    testing::Values(cv::cuda::DeviceInfo()),
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    testing::Values(0.3, 0.5, 1.5, 2.0),
+    WHOLE_SUBMAT));
+
+/////////////////
+
+PARAM_TEST_CASE(ResizeLanczosSameAsHost, cv::cuda::DeviceInfo, cv::Size, MatType, double, UseRoi)
+{
+    cv::cuda::DeviceInfo devInfo;
+    cv::Size size;
+    double coeff;
+    int type;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        coeff = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
+
+        cv::cuda::setDevice(devInfo.deviceID());
+    }
+
+    virtual void TearDown()
+    {
+        // GpuMat destructors will automatically clean up GPU memory
+    }
+};
+
+CUDA_TEST_P(ResizeLanczosSameAsHost, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+
+    cv::cuda::GpuMat dst = createMat(cv::Size(cv::saturate_cast<int>(src.cols * coeff), cv::saturate_cast<int>(src.rows * coeff)), type, useRoi);
+    cv::cuda::resize(loadMat(src, useRoi), dst, cv::Size(), coeff, coeff, cv::INTER_LANCZOS4);
+
+    cv::Mat dst_gold;
+    cv::resize(src, dst_gold, cv::Size(), coeff, coeff, cv::INTER_LANCZOS4);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-2 : 1.0);
+}
+
+INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeLanczosSameAsHost, testing::Combine(
+    testing::Values(cv::cuda::DeviceInfo()),
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    testing::Values(0.3, 0.5, 1.5, 2.0),
+    WHOLE_SUBMAT));
+
+/////////////////
+// Performance Test
+
+PARAM_TEST_CASE(ResizeLanczosPerformance, cv::cuda::DeviceInfo, cv::Size, MatType, double)
+{
+    cv::cuda::DeviceInfo devInfo;
+    cv::Size size;
+    double coeff;
+    int type;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        coeff = GET_PARAM(3);
+
+        cv::cuda::setDevice(devInfo.deviceID());
+    }
+};
+
+CUDA_TEST_P(ResizeLanczosPerformance, Performance)
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Size dsize(cv::saturate_cast<int>(src.cols * coeff), cv::saturate_cast<int>(src.rows * coeff));
+
+    // Warm up
+    {
+        cv::cuda::GpuMat gpuSrc, gpuDst;
+        gpuSrc.upload(src);
+        gpuDst.create(dsize, type);
+        cv::cuda::resize(gpuSrc, gpuDst, cv::Size(), coeff, coeff, cv::INTER_LANCZOS4);
+        cv::Mat dummy;
+        gpuDst.download(dummy); // Synchronize
+    }
+
+    // GPU performance
+    const int iterations = 100;
+    cv::TickMeter tm_gpu;
+    cv::cuda::GpuMat gpuSrc, gpuDst;
+    gpuSrc.upload(src);
+    gpuDst.create(dsize, type);
+
+    tm_gpu.start();
+    for (int i = 0; i < iterations; ++i)
+    {
+        cv::cuda::resize(gpuSrc, gpuDst, cv::Size(), coeff, coeff, cv::INTER_LANCZOS4);
+    }
+    cv::Mat dummy;
+    gpuDst.download(dummy); // Synchronize
+    tm_gpu.stop();
+
+    // CPU performance
+    cv::TickMeter tm_cpu;
+    cv::Mat dst_cpu;
+
+    tm_cpu.start();
+    for (int i = 0; i < iterations; ++i)
+    {
+        cv::resize(src, dst_cpu, cv::Size(), coeff, coeff, cv::INTER_LANCZOS4);
+    }
+    tm_cpu.stop();
+
+    double gpu_time = tm_gpu.getTimeMilli() / iterations;
+    double cpu_time = tm_cpu.getTimeMilli() / iterations;
+    double speedup = cpu_time / gpu_time;
+
+    std::cout << "Size: " << size << " -> " << dsize 
+              << ", Type: " << type 
+              << ", Coeff: " << coeff << std::endl;
+    std::cout << "  CPU: " << cpu_time << " ms" << std::endl;
+    std::cout << "  GPU: " << gpu_time << " ms" << std::endl;
+    std::cout << "  Speedup: " << speedup << "x" << std::endl;
+}
+
+INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeLanczosPerformance, testing::Combine(
+    testing::Values(cv::cuda::DeviceInfo()),
+    testing::Values(cv::Size(512, 512), cv::Size(1024, 1024), cv::Size(2048, 2048)),
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_32FC1), MatType(CV_32FC3)),
+    testing::Values(0.5, 1.5, 2.0)));
+
+}} // namespace
+#endif // HAVE_CUDA
+

From 51dbf82e621e580bcccf2c8dff4340bdbf555dd7 Mon Sep 17 00:00:00 2001
From: yuxuewen <c8libur@gmail.com>
Date: Wed, 10 Dec 2025 11:47:13 +0800
Subject: [PATCH 2/8] shared memory optimization

---
 modules/cudawarping/src/cuda/resize.cu    | 304 +++++++++++++++++++---
 modules/cudawarping/test/test_lanczos.cpp |   9 +-
 2 files changed, 271 insertions(+), 42 deletions(-)

diff --git a/modules/cudawarping/src/cuda/resize.cu b/modules/cudawarping/src/cuda/resize.cu
index 9c75e829828..1b474731dda 100644
--- a/modules/cudawarping/src/cuda/resize.cu
+++ b/modules/cudawarping/src/cuda/resize.cu
@@ -54,7 +54,7 @@
 
 namespace cv { namespace cuda { namespace device
 {
-    __device__ float lanczos_weight(float x_)
+    __device__ __forceinline__ float lanczos_weight(float x_)
     {
         float x = fabsf(x_);
         if (x == 0.0f)
@@ -69,75 +69,154 @@ namespace cv { namespace cuda { namespace device
     template <typename T>
     __global__ void resize_lanczos4(const PtrStepSz<T> src, PtrStepSz<T> dst, const float fy, const float fx)
     {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        const int tx = threadIdx.x;
+        const int ty = threadIdx.y;
+        const int bx = blockIdx.x;
+        const int by = blockIdx.y;
 
-        if (x >= dst.cols || y >= dst.rows)
-            return;
-
-        const float src_x = static_cast<float>(x) * fx;
-        const float src_y = static_cast<float>(y) * fy;
+        const int x = bx * blockDim.x + tx;
+        const int y = by * blockDim.y + ty;
 
         const int in_height = src.rows;
         const int in_width = src.cols;
 
+        constexpr int R = 4;
+        constexpr int BASE_W = 32;
+        constexpr int BASE_H = 8;
+        constexpr int SHARED_WIDTH_MAX = BASE_W + R + R;
+        constexpr int SHARED_HEIGHT_MAX = BASE_H + R + R;
+
+        __shared__ T shared_src[SHARED_HEIGHT_MAX][SHARED_WIDTH_MAX];
+
         typedef typename VecTraits<T>::elem_type elem_type;
         constexpr int cn = VecTraits<T>::cn;
-        float results[cn] = {0.0f};
 
-        for (int c = 0; c < cn; ++c)
-        {
-            float acc_val = 0.0f;
-            float acc_weight = 0.0f;
+        const int out_x0 = bx * blockDim.x;
+        const int out_x1 = ::min(out_x0 + blockDim.x - 1, dst.cols - 1);
+        const int out_y0 = by * blockDim.y;
+        const int out_y1 = ::min(out_y0 + blockDim.y - 1, dst.rows - 1);
 
+        const float src_x0_f = (static_cast<float>(out_x0) + 0.5f) * fx - 0.5f;
+        const float src_x1_f = (static_cast<float>(out_x1) + 0.5f) * fx - 0.5f;
+        const float src_y0_f = (static_cast<float>(out_y0) + 0.5f) * fy - 0.5f;
+        const float src_y1_f = (static_cast<float>(out_y1) + 0.5f) * fy - 0.5f;
 
-            const int xmin = int(floorf(src_x)) - 3;
-            const int xmax = int(floorf(src_x)) + 4;
-            const int ymin = int(floorf(src_y)) - 3;
-            const int ymax = int(floorf(src_y)) + 4;
+        int in_x_min = int(floorf(src_x0_f)) - R;
+        int in_x_max = int(floorf(src_x1_f)) + R;
+        int in_y_min = int(floorf(src_y0_f)) - R;
+        int in_y_max = int(floorf(src_y1_f)) + R;
 
-            for (int cy = ymin; cy <= ymax; ++cy)
-            {
-                float wy = lanczos_weight(src_y - static_cast<float>(cy));
-                if (wy == 0.0f)
-                    continue;
+        if (in_x_min < 0) in_x_min = 0;
+        if (in_y_min < 0) in_y_min = 0;
+        if (in_x_max >= in_width) in_x_max = in_width - 1;
+        if (in_y_max >= in_height) in_y_max = in_height - 1;
 
-                for (int cx = xmin; cx <= xmax; ++cx)
+        const int W_needed = in_x_max - in_x_min + 1;
+        const int H_needed = in_y_max - in_y_min + 1;
+
+        // for fx <= 1 and fy <= 1
+        const bool use_shared = (W_needed <= SHARED_WIDTH_MAX) && (H_needed <= SHARED_HEIGHT_MAX);
+
+        if (use_shared)
+        {
+            for (int sy = ty; sy < H_needed; sy += blockDim.y)
+            {
+                int iy = in_y_min + sy;
+                for (int sx = tx; sx < W_needed; sx += blockDim.x)
                 {
-                    float wx = lanczos_weight(src_x - static_cast<float>(cx));
-                    if (wx == 0.0f)
-                        continue;
+                    int ix = in_x_min + sx;
+                    shared_src[sy][sx] = src(iy, ix);
+                }
+            }
+            __syncthreads();
+        }
 
-                    float w = wy * wx;
+        if (x >= dst.cols || y >= dst.rows)
+        {
+            if (use_shared) { __syncthreads(); }
+            return;
+        }
 
-                    int iy = ::max(0, ::min(cy, in_height - 1));
-                    int ix = ::max(0, ::min(cx, in_width - 1));
+        const float src_x = (static_cast<float>(x) + 0.5f) * fx - 0.5f;
+        const float src_y = (static_cast<float>(y) + 0.5f) * fy - 0.5f;
 
-                    T val = src(iy, ix);
-                    
-                    const elem_type* val_ptr = reinterpret_cast<const elem_type*>(&val);
-                    elem_type elem_val = val_ptr[c];
-                    float channel_val = static_cast<float>(elem_val);
+        const int xmin = int(floorf(src_x)) - 3;
+        const int xmax = int(floorf(src_x)) + 4;
+        const int ymin = int(floorf(src_y)) - 3;
+        const int ymax = int(floorf(src_y)) + 4;
 
-                    acc_val += channel_val * w;
-                    acc_weight += w;
+        float results[cn];
+        float acc_weights[cn];
+        #pragma unroll
+        for (int c = 0; c < cn; ++c) { results[c] = 0.0f; acc_weights[c] = 0.0f; }
+
+        for (int cy = ymin; cy <= ymax; ++cy)
+        {
+            float wy = lanczos_weight(src_y - static_cast<float>(cy));
+            if (wy == 0.0f) continue;
+
+            for (int cx = xmin; cx <= xmax; ++cx)
+            {
+                float wx = lanczos_weight(src_x - static_cast<float>(cx));
+                if (wx == 0.0f) continue;
+
+                float w = wy * wx;
+
+                if (use_shared)
+                {
+                    int sx = cx - in_x_min;
+                    int sy = cy - in_y_min;
+                    if (sx < 0) sx = 0;
+                    else if (sx >= W_needed) sx = W_needed - 1;
+                    if (sy < 0) sy = 0;
+                    else if (sy >= H_needed) sy = H_needed - 1;
+
+                    T val = shared_src[sy][sx];
+                    const elem_type* val_ptr = reinterpret_cast<const elem_type*>(&val);
+                    #pragma unroll
+                    for (int c = 0; c < cn; ++c)
+                    {
+                        elem_type elem_val = val_ptr[c];
+                        float channel_val = static_cast<float>(elem_val);
+                        results[c] += channel_val * w;
+                        acc_weights[c] += w;
+                    }
+                }
+                else
+                {
+                    // fallback
+                    int iy_r = cy < 0 ? 0 : (cy >= in_height ? (in_height - 1) : cy);
+                    int ix_r = cx < 0 ? 0 : (cx >= in_width ? (in_width - 1) : cx);
+                    T val = src(iy_r, ix_r);
+                    const elem_type* val_ptr = reinterpret_cast<const elem_type*>(&val);
+                    #pragma unroll
+                    for (int c = 0; c < cn; ++c)
+                    {
+                        elem_type elem_val = val_ptr[c];
+                        float channel_val = static_cast<float>(elem_val);
+                        results[c] += channel_val * w;
+                        acc_weights[c] += w;
+                    }
                 }
             }
-
-            float result = acc_weight > 0.0f ? (acc_val / acc_weight) : 0.0f;
-            results[c] = result;
         }
 
+        #pragma unroll
+        for (int c = 0; c < cn; ++c)
+            results[c] = acc_weights[c] > 0.0f ? (results[c] / acc_weights[c]) : 0.0f;
+
         T result_vec;
         elem_type* result_ptr = reinterpret_cast<elem_type*>(&result_vec);
+
+        #pragma unroll
         for (int c = 0; c < cn; ++c)
-        {
             result_ptr[c] = saturate_cast<elem_type>(results[c]);
-        }
+
         dst(y, x) = result_vec;
     }
 
 
+
     template <typename T> __global__ void resize_nearest(const PtrStep<T> src, PtrStepSz<T> dst, const float fy, const float fx)
     {
         const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
@@ -341,6 +420,92 @@ namespace cv { namespace cuda { namespace device
             cudaSafeCall( cudaDeviceSynchronize() );
     }
 
+    template <typename Ptr2D, typename T>
+    __global__ void resize_lanczos4_tex(Ptr2D src, PtrStepSz<T> dst, const float fy, const float fx)
+    {
+        const int x = blockDim.x * blockIdx.x + threadIdx.x;
+        const int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+        if (x >= dst.cols || y >= dst.rows)
+            return;
+
+        const float src_x = (static_cast<float>(x) + 0.5f) * fx - 0.5f;
+        const float src_y = (static_cast<float>(y) + 0.5f) * fy - 0.5f;
+
+        typedef typename VecTraits<T>::elem_type elem_type;
+        constexpr int cn = VecTraits<T>::cn;
+        float results[cn] = {0.0f};
+
+        for (int c = 0; c < cn; ++c)
+        {
+            float acc_val = 0.0f;
+            float acc_weight = 0.0f;
+
+            const int xmin = int(floorf(src_x)) - 3;
+            const int xmax = int(floorf(src_x)) + 4;
+            const int ymin = int(floorf(src_y)) - 3;
+            const int ymax = int(floorf(src_y)) + 4;
+
+            for (int cy = ymin; cy <= ymax; ++cy)
+            {
+                float wy = lanczos_weight(src_y - static_cast<float>(cy));
+                if (wy == 0.0f)
+                    continue;
+
+                for (int cx = xmin; cx <= xmax; ++cx)
+                {
+                    float wx = lanczos_weight(src_x - static_cast<float>(cx));
+                    if (wx == 0.0f)
+                        continue;
+
+                    float w = wy * wx;
+
+                    // Use texture memory for sampling (handles boundary automatically)
+                    T val = src(static_cast<float>(cy), static_cast<float>(cx));
+
+                    const elem_type* val_ptr = reinterpret_cast<const elem_type*>(&val);
+                    elem_type elem_val = val_ptr[c];
+                    float channel_val = static_cast<float>(elem_val);
+
+                    acc_val += channel_val * w;
+                    acc_weight += w;
+                }
+            }
+
+            float result = acc_weight > 0.0f ? (acc_val / acc_weight) : 0.0f;
+            results[c] = result;
+        }
+
+        T result_vec;
+        elem_type* result_ptr = reinterpret_cast<elem_type*>(&result_vec);
+        for (int c = 0; c < cn; ++c)
+        {
+            result_ptr[c] = saturate_cast<elem_type>(results[c]);
+        }
+        dst(y, x) = result_vec;
+    }
+
+    template <typename T>
+    void call_resize_lanczos4_tex(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+        if (srcWhole.data == src.data)
+        {
+            cudev::Texture<T> texSrc(src);
+            resize_lanczos4_tex<cudev::TexturePtr<T>><<<grid, block>>>(texSrc, dst, fy, fx);
+        }
+        else
+        {
+            cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
+            BrdReplicate<T> brd(src.rows, src.cols);
+            BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>> brdSrc(texSrcWhole, brd);
+            resize_lanczos4_tex<BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>>><<<grid, block>>>(brdSrc, dst, fy, fx);
+        }
+        cudaSafeCall( cudaGetLastError() );
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
     // ResizeNearestDispatcher
 
     template <typename T> struct ResizeNearestDispatcher
@@ -460,6 +625,63 @@ namespace cv { namespace cuda { namespace device
         }
     };
 
+    template <typename T> struct SelectImplForLanczos
+    {
+        static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            if (stream)
+                call_resize_lanczos4_glob(src, dst, fy, fx, stream);
+            else
+            {
+                if (fx > 1 || fy > 1)
+                    call_resize_lanczos4_glob(src, dst, fy, fx, 0);
+                else
+                    call_resize_lanczos4_tex(src, srcWhole, yoff, xoff, dst, fy, fx);
+            }
+        }
+    };
+
+    // Texture memory doesn't support 3-channel types, so use glob for those
+    template <> struct ResizeLanczosDispatcher<uchar> : SelectImplForLanczos<uchar> {};
+    template <> struct ResizeLanczosDispatcher<uchar3>
+    {
+        static void call(const PtrStepSz<uchar3>& src, const PtrStepSz<uchar3>& /*srcWhole*/, int /*yoff*/, int /*xoff*/, const PtrStepSz<uchar3>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            call_resize_lanczos4_glob(src, dst, fy, fx, stream);
+        }
+    };
+    template <> struct ResizeLanczosDispatcher<uchar4> : SelectImplForLanczos<uchar4> {};
+
+    template <> struct ResizeLanczosDispatcher<ushort> : SelectImplForLanczos<ushort> {};
+    template <> struct ResizeLanczosDispatcher<ushort3>
+    {
+        static void call(const PtrStepSz<ushort3>& src, const PtrStepSz<ushort3>& /*srcWhole*/, int /*yoff*/, int /*xoff*/, const PtrStepSz<ushort3>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            call_resize_lanczos4_glob(src, dst, fy, fx, stream);
+        }
+    };
+    template <> struct ResizeLanczosDispatcher<ushort4> : SelectImplForLanczos<ushort4> {};
+
+    template <> struct ResizeLanczosDispatcher<short> : SelectImplForLanczos<short> {};
+    template <> struct ResizeLanczosDispatcher<short3>
+    {
+        static void call(const PtrStepSz<short3>& src, const PtrStepSz<short3>& /*srcWhole*/, int /*yoff*/, int /*xoff*/, const PtrStepSz<short3>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            call_resize_lanczos4_glob(src, dst, fy, fx, stream);
+        }
+    };
+    template <> struct ResizeLanczosDispatcher<short4> : SelectImplForLanczos<short4> {};
+
+    template <> struct ResizeLanczosDispatcher<float> : SelectImplForLanczos<float> {};
+    template <> struct ResizeLanczosDispatcher<float3>
+    {
+        static void call(const PtrStepSz<float3>& src, const PtrStepSz<float3>& /*srcWhole*/, int /*yoff*/, int /*xoff*/, const PtrStepSz<float3>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            call_resize_lanczos4_glob(src, dst, fy, fx, stream);
+        }
+    };
+    template <> struct ResizeLanczosDispatcher<float4> : SelectImplForLanczos<float4> {};
+
     // ResizeAreaDispatcher
 
     template <typename T> struct ResizeAreaDispatcher
diff --git a/modules/cudawarping/test/test_lanczos.cpp b/modules/cudawarping/test/test_lanczos.cpp
index fd57aab2c02..d78f0ef5d2d 100644
--- a/modules/cudawarping/test/test_lanczos.cpp
+++ b/modules/cudawarping/test/test_lanczos.cpp
@@ -21,12 +21,19 @@ namespace
         float ifx = static_cast<float>(1.0 / fx);
         float ify = static_cast<float>(1.0 / fy);
 
+        // OpenCV CPU resize uses center-aligned coordinate mapping: (x + 0.5) * fx - 0.5
+        // Since fx and fy here are scale factors, and ifx = 1.0 / fx, ify = 1.0 / fy,
+        // the center-aligned mapping becomes: (x + 0.5) / fx - 0.5 = (x + 0.5) * ifx - 0.5
         for (int y = 0; y < dsize.height; ++y)
         {
             for (int x = 0; x < dsize.width; ++x)
             {
                 for (int c = 0; c < cn; ++c)
-                    dst.at<T>(y, x * cn + c) = LanczosInterpolator<T>::getValue(src, y * ify, x * ifx, c, cv::BORDER_REPLICATE);
+                {
+                    float src_x = (static_cast<float>(x) + 0.5f) * ifx - 0.5f;
+                    float src_y = (static_cast<float>(y) + 0.5f) * ify - 0.5f;
+                    dst.at<T>(y, x * cn + c) = LanczosInterpolator<T>::getValue(src, src_y, src_x, c, cv::BORDER_REPLICATE);
+                }
             }
         }
     }

From 31c1978f8756ea67ae010dbc67eba70c8a563feb Mon Sep 17 00:00:00 2001
From: yuxuewen <c8libur@gmail.com>
Date: Wed, 10 Dec 2025 15:41:39 +0800
Subject: [PATCH 3/8] Fix whitespace issues in test files

---
 modules/cudawarping/test/interpolation.hpp | 2 +-
 modules/cudawarping/test/test_lanczos.cpp  | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/modules/cudawarping/test/interpolation.hpp b/modules/cudawarping/test/interpolation.hpp
index 07fc5fa9a42..a008d3fd563 100644
--- a/modules/cudawarping/test/interpolation.hpp
+++ b/modules/cudawarping/test/interpolation.hpp
@@ -139,7 +139,7 @@ template <typename T> struct LanczosInterpolator
             return 1.0f;
         if (x >= A)
             return 0.0f;
-        
+
         const float pi = 3.14159265358979323846f;
         float pi_x = pi * x;
         return sinf(pi_x) * sinf(pi_x / A) / (pi_x * pi_x / A);
diff --git a/modules/cudawarping/test/test_lanczos.cpp b/modules/cudawarping/test/test_lanczos.cpp
index d78f0ef5d2d..bb06058635a 100644
--- a/modules/cudawarping/test/test_lanczos.cpp
+++ b/modules/cudawarping/test/test_lanczos.cpp
@@ -218,8 +218,8 @@ CUDA_TEST_P(ResizeLanczosPerformance, Performance)
     double cpu_time = tm_cpu.getTimeMilli() / iterations;
     double speedup = cpu_time / gpu_time;
 
-    std::cout << "Size: " << size << " -> " << dsize 
-              << ", Type: " << type 
+    std::cout << "Size: " << size << " -> " << dsize
+              << ", Type: " << type
               << ", Coeff: " << coeff << std::endl;
     std::cout << "  CPU: " << cpu_time << " ms" << std::endl;
     std::cout << "  GPU: " << gpu_time << " ms" << std::endl;
@@ -234,4 +234,3 @@ INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeLanczosPerformance, testing::Combine
 
 }} // namespace
 #endif // HAVE_CUDA
-

From 3cd6cf7f43713fc8eeb166415409cf5328033fe7 Mon Sep 17 00:00:00 2001
From: yuxuewen <c8libur@gmail.com>
Date: Wed, 10 Dec 2025 17:00:15 +0800
Subject: [PATCH 4/8] move performance tests code to
 modules/<module_name>/perf/

---
 modules/cudawarping/perf/perf_warping.cpp | 42 ++++++++++++
 modules/cudawarping/test/test_lanczos.cpp | 81 -----------------------
 2 files changed, 42 insertions(+), 81 deletions(-)

diff --git a/modules/cudawarping/perf/perf_warping.cpp b/modules/cudawarping/perf/perf_warping.cpp
index 3e7aa18f559..fb67363529f 100644
--- a/modules/cudawarping/perf/perf_warping.cpp
+++ b/modules/cudawarping/perf/perf_warping.cpp
@@ -180,6 +180,48 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, Resize,
     }
 }
 
+//////////////////////////////////////////////////////////////////////
+// ResizeLanczos
+
+PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, ResizeLanczos,
+            Combine(CUDA_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_32F),
+                    CUDA_CHANNELS_1_3_4,
+                    Values(Interpolation(cv::INTER_LANCZOS4)),
+                    Values(0.5, 1.5, 2.0)))
+{
+    declare.time(20.0);
+
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int channels = GET_PARAM(2);
+    const int interpolation = GET_PARAM(3);
+    const double f = GET_PARAM(4);
+
+    const int type = CV_MAKE_TYPE(depth, channels);
+
+    cv::Mat src(size, type);
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_CUDA())
+    {
+        const cv::cuda::GpuMat d_src(src);
+        cv::cuda::GpuMat dst;
+
+        TEST_CYCLE() cv::cuda::resize(d_src, dst, cv::Size(), f, f, interpolation);
+
+        CUDA_SANITY_CHECK(dst, 1e-3, ERROR_RELATIVE);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::resize(src, dst, cv::Size(), f, f, interpolation);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
 //////////////////////////////////////////////////////////////////////
 // ResizeArea
 
diff --git a/modules/cudawarping/test/test_lanczos.cpp b/modules/cudawarping/test/test_lanczos.cpp
index bb06058635a..32b2684c293 100644
--- a/modules/cudawarping/test/test_lanczos.cpp
+++ b/modules/cudawarping/test/test_lanczos.cpp
@@ -151,86 +151,5 @@ INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeLanczosSameAsHost, testing::Combine(
     testing::Values(0.3, 0.5, 1.5, 2.0),
     WHOLE_SUBMAT));
 
-/////////////////
-// Performance Test
-
-PARAM_TEST_CASE(ResizeLanczosPerformance, cv::cuda::DeviceInfo, cv::Size, MatType, double)
-{
-    cv::cuda::DeviceInfo devInfo;
-    cv::Size size;
-    double coeff;
-    int type;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        coeff = GET_PARAM(3);
-
-        cv::cuda::setDevice(devInfo.deviceID());
-    }
-};
-
-CUDA_TEST_P(ResizeLanczosPerformance, Performance)
-{
-    cv::Mat src = randomMat(size, type);
-    cv::Size dsize(cv::saturate_cast<int>(src.cols * coeff), cv::saturate_cast<int>(src.rows * coeff));
-
-    // Warm up
-    {
-        cv::cuda::GpuMat gpuSrc, gpuDst;
-        gpuSrc.upload(src);
-        gpuDst.create(dsize, type);
-        cv::cuda::resize(gpuSrc, gpuDst, cv::Size(), coeff, coeff, cv::INTER_LANCZOS4);
-        cv::Mat dummy;
-        gpuDst.download(dummy); // Synchronize
-    }
-
-    // GPU performance
-    const int iterations = 100;
-    cv::TickMeter tm_gpu;
-    cv::cuda::GpuMat gpuSrc, gpuDst;
-    gpuSrc.upload(src);
-    gpuDst.create(dsize, type);
-
-    tm_gpu.start();
-    for (int i = 0; i < iterations; ++i)
-    {
-        cv::cuda::resize(gpuSrc, gpuDst, cv::Size(), coeff, coeff, cv::INTER_LANCZOS4);
-    }
-    cv::Mat dummy;
-    gpuDst.download(dummy); // Synchronize
-    tm_gpu.stop();
-
-    // CPU performance
-    cv::TickMeter tm_cpu;
-    cv::Mat dst_cpu;
-
-    tm_cpu.start();
-    for (int i = 0; i < iterations; ++i)
-    {
-        cv::resize(src, dst_cpu, cv::Size(), coeff, coeff, cv::INTER_LANCZOS4);
-    }
-    tm_cpu.stop();
-
-    double gpu_time = tm_gpu.getTimeMilli() / iterations;
-    double cpu_time = tm_cpu.getTimeMilli() / iterations;
-    double speedup = cpu_time / gpu_time;
-
-    std::cout << "Size: " << size << " -> " << dsize
-              << ", Type: " << type
-              << ", Coeff: " << coeff << std::endl;
-    std::cout << "  CPU: " << cpu_time << " ms" << std::endl;
-    std::cout << "  GPU: " << gpu_time << " ms" << std::endl;
-    std::cout << "  Speedup: " << speedup << "x" << std::endl;
-}
-
-INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeLanczosPerformance, testing::Combine(
-    testing::Values(cv::cuda::DeviceInfo()),
-    testing::Values(cv::Size(512, 512), cv::Size(1024, 1024), cv::Size(2048, 2048)),
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_32FC1), MatType(CV_32FC3)),
-    testing::Values(0.5, 1.5, 2.0)));
-
 }} // namespace
 #endif // HAVE_CUDA

From c451a64e07646f802994fcded0c681b6bddc6dd2 Mon Sep 17 00:00:00 2001
From: yuxuewen <c8libur@gmail.com>
Date: Wed, 10 Dec 2025 19:00:52 +0800
Subject: [PATCH 5/8] Fix uninitialized output parameter error

---
 modules/cudawarping/perf/perf_warping.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/modules/cudawarping/perf/perf_warping.cpp b/modules/cudawarping/perf/perf_warping.cpp
index fb67363529f..1a37779c2c1 100644
--- a/modules/cudawarping/perf/perf_warping.cpp
+++ b/modules/cudawarping/perf/perf_warping.cpp
@@ -207,14 +207,23 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, ResizeLanczos,
     {
         const cv::cuda::GpuMat d_src(src);
         cv::cuda::GpuMat dst;
+        cv::Size dsize(cv::saturate_cast<int>(src.cols * f), cv::saturate_cast<int>(src.rows * f));
+        cv::Mat host_dst(dsize, type);
+
+        declare.out(host_dst);
 
         TEST_CYCLE() cv::cuda::resize(d_src, dst, cv::Size(), f, f, interpolation);
 
+        dst.download(host_dst);
+
         CUDA_SANITY_CHECK(dst, 1e-3, ERROR_RELATIVE);
     }
     else
     {
-        cv::Mat dst;
+        cv::Size dsize(cv::saturate_cast<int>(src.cols * f), cv::saturate_cast<int>(src.rows * f));
+        cv::Mat dst(dsize, type);
+
+        declare.out(dst);
 
         TEST_CYCLE() cv::resize(src, dst, cv::Size(), f, f, interpolation);
 

From 2c658849fad7687350f117f91e2cd4fb6dfbc890 Mon Sep 17 00:00:00 2001
From: yuxuewen <c8libur@gmail.com>
Date: Thu, 11 Dec 2025 15:26:31 +0800
Subject: [PATCH 6/8] Trigger CI: opencv_extra regression data ready


From 32bd21088796f0e35beadb4ca72fcbf1fe45197b Mon Sep 17 00:00:00 2001
From: yuxuewen <c8libur@gmail.com>
Date: Tue, 16 Dec 2025 15:08:18 +0800
Subject: [PATCH 7/8] Replace error hiding with CV_Assert for interpolation
 bounds check

---
 modules/cudawarping/src/cuda/resize.cu | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/modules/cudawarping/src/cuda/resize.cu b/modules/cudawarping/src/cuda/resize.cu
index 1b474731dda..3ac9a28eac1 100644
--- a/modules/cudawarping/src/cuda/resize.cu
+++ b/modules/cudawarping/src/cuda/resize.cu
@@ -737,8 +737,7 @@ namespace cv { namespace cuda { namespace device
             interpolation = 1;
 
         // Bounds check for interpolation mode
-        if (interpolation < 0 || interpolation >= 5)
-            interpolation = 1; // Default to linear
+        CV_Assert(interpolation >= 0 && interpolation < 5);
 
         funcs[interpolation](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), yoff, xoff, static_cast< PtrStepSz<T> >(dst), fy, fx, stream);
     }

From dba848003ff9779bf34d3cfbed6bc61af3f8b01e Mon Sep 17 00:00:00 2001
From: yuxuewen <c8libur@gmail.com>
Date: Tue, 16 Dec 2025 15:20:01 +0800
Subject: [PATCH 8/8] Replace constant pi with CV_PI in LanczosInterpolator

---
 modules/cudawarping/test/interpolation.hpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/modules/cudawarping/test/interpolation.hpp b/modules/cudawarping/test/interpolation.hpp
index a008d3fd563..32703dc1065 100644
--- a/modules/cudawarping/test/interpolation.hpp
+++ b/modules/cudawarping/test/interpolation.hpp
@@ -140,8 +140,7 @@ template <typename T> struct LanczosInterpolator
         if (x >= A)
             return 0.0f;
 
-        const float pi = 3.14159265358979323846f;
-        float pi_x = pi * x;
+        float pi_x = CV_PI * x;
         return sinf(pi_x) * sinf(pi_x / A) / (pi_x * pi_x / A);
     }