[~] Minor refactoring, clean-up

[+] Added 128-bit transpose

[~] Minor refactoring, clean-up
[+] Added 128-bit transpose
0c325cac · Anton Obukhov · e2caf4a3 · 0c325cac · 0c325cac · 0c325cac
Commit 0c325cac authored Apr 24, 2011 by Anton Obukhov
9 changed files
--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp
--- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
+++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
--- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.hpp
+++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.hpp
@@ -96,65 +96,65 @@ cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream);
 * \return NCV status code
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_32u_C1R(Ncv32u *d_src, Ncv32u srcStep,
-                                          Ncv32u *d_dst, Ncv32u dstStep,
-                                          NcvSize32u srcRoi, Ncv32u scale,
-                                          NcvBool readThruTexture);
+NCVStatus nppiStDecimate_32u_C1R(Ncv32u *d_src, Ncv32u srcStep,
+                                 Ncv32u *d_dst, Ncv32u dstStep,
+                                 NcvSize32u srcRoi, Ncv32u scale,
+                                 NcvBool readThruTexture);


 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel.
- * \see nppiStDownsampleNearest_32u_C1R
+ * \see nppiStDecimate_32u_C1R
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_32s_C1R(Ncv32s *d_src, Ncv32u srcStep,
-                                          Ncv32s *d_dst, Ncv32u dstStep,
-                                          NcvSize32u srcRoi, Ncv32u scale,
-                                          NcvBool readThruTexture);
+NCVStatus nppiStDecimate_32s_C1R(Ncv32s *d_src, Ncv32u srcStep,
+                                 Ncv32s *d_dst, Ncv32u dstStep,
+                                 NcvSize32u srcRoi, Ncv32u scale,
+                                 NcvBool readThruTexture);


 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel.
- * \see nppiStDownsampleNearest_32u_C1R
+ * \see nppiStDecimate_32u_C1R
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_32f_C1R(Ncv32f *d_src, Ncv32u srcStep,
-                                          Ncv32f *d_dst, Ncv32u dstStep,
-                                          NcvSize32u srcRoi, Ncv32u scale,
-                                          NcvBool readThruTexture);
+NCVStatus nppiStDecimate_32f_C1R(Ncv32f *d_src, Ncv32u srcStep,
+                                 Ncv32f *d_dst, Ncv32u dstStep,
+                                 NcvSize32u srcRoi, Ncv32u scale,
+                                 NcvBool readThruTexture);


 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel.
-* \see nppiStDownsampleNearest_32u_C1R
+ * \see nppiStDecimate_32u_C1R
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_64u_C1R(Ncv64u *d_src, Ncv32u srcStep,
-                                          Ncv64u *d_dst, Ncv32u dstStep,
-                                          NcvSize32u srcRoi, Ncv32u scale,
-                                          NcvBool readThruTexture);
+NCVStatus nppiStDecimate_64u_C1R(Ncv64u *d_src, Ncv32u srcStep,
+                                 Ncv64u *d_dst, Ncv32u dstStep,
+                                 NcvSize32u srcRoi, Ncv32u scale,
+                                 NcvBool readThruTexture);


 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel.
- * \see nppiStDownsampleNearest_32u_C1R
+ * \see nppiStDecimate_32u_C1R
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_64s_C1R(Ncv64s *d_src, Ncv32u srcStep,
-                                          Ncv64s *d_dst, Ncv32u dstStep,
-                                          NcvSize32u srcRoi, Ncv32u scale,
-                                          NcvBool readThruTexture);
+NCVStatus nppiStDecimate_64s_C1R(Ncv64s *d_src, Ncv32u srcStep,
+                                 Ncv64s *d_dst, Ncv32u dstStep,
+                                 NcvSize32u srcRoi, Ncv32u scale,
+                                 NcvBool readThruTexture);


 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel.
- * \see nppiStDownsampleNearest_32u_C1R
+ * \see nppiStDecimate_32u_C1R
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_64f_C1R(Ncv64f *d_src, Ncv32u srcStep,
-                                          Ncv64f *d_dst, Ncv32u dstStep,
-                                          NcvSize32u srcRoi, Ncv32u scale,
-                                          NcvBool readThruTexture);
+NCVStatus nppiStDecimate_64f_C1R(Ncv64f *d_src, Ncv32u srcStep,
+                                 Ncv64f *d_dst, Ncv32u dstStep,
+                                 NcvSize32u srcRoi, Ncv32u scale,
+                                 NcvBool readThruTexture);


 /**
@@ -170,59 +170,59 @@ NCVStatus nppiStDownsampleNearest_64f_C1R(Ncv64f *d_src, Ncv32u srcStep,
 * \return NCV status code
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_32u_C1R_host(Ncv32u *h_src, Ncv32u srcStep,
-                                               Ncv32u *h_dst, Ncv32u dstStep,
-                                               NcvSize32u srcRoi, Ncv32u scale);
+NCVStatus nppiStDecimate_32u_C1R_host(Ncv32u *h_src, Ncv32u srcStep,
+                                      Ncv32u *h_dst, Ncv32u dstStep,
+                                      NcvSize32u srcRoi, Ncv32u scale);


 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel. Host implementation.
- * \see nppiStDownsampleNearest_32u_C1R_host
+ * \see nppiStDecimate_32u_C1R_host
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_32s_C1R_host(Ncv32s *h_src, Ncv32u srcStep,
-                                               Ncv32s *h_dst, Ncv32u dstStep,
-                                               NcvSize32u srcRoi, Ncv32u scale);
+NCVStatus nppiStDecimate_32s_C1R_host(Ncv32s *h_src, Ncv32u srcStep,
+                                      Ncv32s *h_dst, Ncv32u dstStep,
+                                      NcvSize32u srcRoi, Ncv32u scale);


 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel. Host implementation.
- * \see nppiStDownsampleNearest_32u_C1R_host
+ * \see nppiStDecimate_32u_C1R_host
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_32f_C1R_host(Ncv32f *h_src, Ncv32u srcStep,
-                                               Ncv32f *h_dst, Ncv32u dstStep,
-                                               NcvSize32u srcRoi, Ncv32u scale);
+NCVStatus nppiStDecimate_32f_C1R_host(Ncv32f *h_src, Ncv32u srcStep,
+                                      Ncv32f *h_dst, Ncv32u dstStep,
+                                      NcvSize32u srcRoi, Ncv32u scale);


 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel. Host implementation.
- * \see nppiStDownsampleNearest_32u_C1R_host
+ * \see nppiStDecimate_32u_C1R_host
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_64u_C1R_host(Ncv64u *h_src, Ncv32u srcStep,
-                                               Ncv64u *h_dst, Ncv32u dstStep,
-                                               NcvSize32u srcRoi, Ncv32u scale);
+NCVStatus nppiStDecimate_64u_C1R_host(Ncv64u *h_src, Ncv32u srcStep,
+                                      Ncv64u *h_dst, Ncv32u dstStep,
+                                      NcvSize32u srcRoi, Ncv32u scale);


 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel. Host implementation.
- * \see nppiStDownsampleNearest_32u_C1R_host
+ * \see nppiStDecimate_32u_C1R_host
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_64s_C1R_host(Ncv64s *h_src, Ncv32u srcStep,
-                                               Ncv64s *h_dst, Ncv32u dstStep,
-                                               NcvSize32u srcRoi, Ncv32u scale);
+NCVStatus nppiStDecimate_64s_C1R_host(Ncv64s *h_src, Ncv32u srcStep,
+                                      Ncv64s *h_dst, Ncv32u dstStep,
+                                      NcvSize32u srcRoi, Ncv32u scale);


 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel. Host implementation.
- * \see nppiStDownsampleNearest_32u_C1R_host
+ * \see nppiStDecimate_32u_C1R_host
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStep,
-                                               Ncv64f *h_dst, Ncv32u dstStep,
-                                               NcvSize32u srcRoi, Ncv32u scale);
+NCVStatus nppiStDecimate_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStep,
+                                      Ncv64f *h_dst, Ncv32u dstStep,
+                                      NcvSize32u srcRoi, Ncv32u scale);


 /**
@@ -333,6 +333,15 @@ NCVStatus nppiStTranspose_64f_C1R(Ncv64f *d_src, Ncv32u srcStride,
                                  Ncv64f *d_dst, Ncv32u dstStride, NcvSize32u srcRoi);


+/**
+ * Transposes an image. 128-bit pixels of any type, single channel
+ * \see nppiStTranspose_32u_C1R
+ */
+NCV_EXPORTS
+NCVStatus nppiStTranspose_128_C1R(void *d_src, Ncv32u srcStep, // pixels are untyped, so the buffers are passed as void *
+                                  void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi); // NOTE(review): named srcStep/dstStep here but srcStride/dstStride in nppiStTranspose_64f_C1R - confirm both mean the same unit
+
+
 /**
 * Transposes an image. 32-bit unsigned pixels, single channel. Host implementation
 *
@@ -394,6 +403,15 @@ NCVStatus nppiStTranspose_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStride,
                                       Ncv64f *h_dst, Ncv32u dstStride, NcvSize32u srcRoi);


+/**
+ * Transposes an image. 128-bit pixels of any type, single channel. Host implementation. Buffers are passed as untyped void pointers
+ * \see nppiStTranspose_32u_C1R_host
+ */
+NCV_EXPORTS
+NCVStatus nppiStTranspose_128_C1R_host(void *h_src, Ncv32u srcStep, // h_ prefix matches the other *_host declarations in this header
+                                       void *h_dst, Ncv32u dstStep, NcvSize32u srcRoi);
+
+
 /**
 * Calculates the size of the temporary buffer for integral image creation
 *

--- a/modules/gpu/src/nvidia/core/NCV.cpp
+++ b/modules/gpu/src/nvidia/core/NCV.cpp
--- a/modules/gpu/src/nvidia/core/NCV.hpp
+++ b/modules/gpu/src/nvidia/core/NCV.hpp
@@ -129,8 +129,8 @@ struct NcvRect8u
    Ncv8u y;
    Ncv8u width;
    Ncv8u height;
-    NcvRect8u() : x(0), y(0), width(0), height(0) {};
-    NcvRect8u(Ncv8u x, Ncv8u y, Ncv8u width, Ncv8u height) : x(x), y(y), width(width), height(height) {}
+    __host__ __device__ NcvRect8u() : x(0), y(0), width(0), height(0) {};
+    __host__ __device__ NcvRect8u(Ncv8u x, Ncv8u y, Ncv8u width, Ncv8u height) : x(x), y(y), width(width), height(height) {}
 };


@@ -140,8 +140,8 @@ struct NcvRect32s
    Ncv32s y;          ///< y-coordinate of upper left corner.
    Ncv32s width;      ///< Rectangle width.
    Ncv32s height;     ///< Rectangle height.
-    NcvRect32s() : x(0), y(0), width(0), height(0) {};
-    NcvRect32s(Ncv32s x, Ncv32s y, Ncv32s width, Ncv32s height) : x(x), y(y), width(width), height(height) {}
+    __host__ __device__ NcvRect32s() : x(0), y(0), width(0), height(0) {};
+    __host__ __device__ NcvRect32s(Ncv32s x, Ncv32s y, Ncv32s width, Ncv32s height) : x(x), y(y), width(width), height(height) {}
 };


@@ -151,8 +151,8 @@ struct NcvRect32u
    Ncv32u y;          ///< y-coordinate of upper left corner.
    Ncv32u width;      ///< Rectangle width.
    Ncv32u height;     ///< Rectangle height.
-    NcvRect32u() : x(0), y(0), width(0), height(0) {};
-    NcvRect32u(Ncv32u x, Ncv32u y, Ncv32u width, Ncv32u height) : x(x), y(y), width(width), height(height) {}
+    __host__ __device__ NcvRect32u() : x(0), y(0), width(0), height(0) {};
+    __host__ __device__ NcvRect32u(Ncv32u x, Ncv32u y, Ncv32u width, Ncv32u height) : x(x), y(y), width(width), height(height) {}
 };


@@ -160,8 +160,8 @@ struct NcvSize32s
 {
    Ncv32s width;  ///< Rectangle width.
    Ncv32s height; ///< Rectangle height.
-    NcvSize32s() : width(0), height(0) {};
-    NcvSize32s(Ncv32s width, Ncv32s height) : width(width), height(height) {}
+    __host__ __device__ NcvSize32s() : width(0), height(0) {};
+    __host__ __device__ NcvSize32s(Ncv32s width, Ncv32s height) : width(width), height(height) {}
 };


@@ -169,8 +169,8 @@ struct NcvSize32u
 {
    Ncv32u width;  ///< Rectangle width.
    Ncv32u height; ///< Rectangle height.
-    NcvSize32u() : width(0), height(0) {};
-    NcvSize32u(Ncv32u width, Ncv32u height) : width(width), height(height) {}
+    __host__ __device__ NcvSize32u() : width(0), height(0) {};
+    __host__ __device__ NcvSize32u(Ncv32u width, Ncv32u height) : width(width), height(height) {}
 };


@@ -275,6 +275,7 @@ enum NCVStatus
 {
    //NCV statuses
    NCV_SUCCESS,
+    NCV_UNKNOWN_ERROR,

    NCV_CUDA_ERROR,
    NCV_NPP_ERROR,
@@ -501,13 +502,18 @@ private:


 /**
-* Copy dispatcher
+* Copy dispatchers
 */
 NCV_EXPORTS NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType,
                                       const void *src, NCVMemoryType srcType,
                                       size_t sz, cudaStream_t cuStream);


+NCV_EXPORTS NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
+                                         const void *src, Ncv32u srcPitch, NCVMemoryType srcType,
+                                         Ncv32u widthbytes, Ncv32u height, cudaStream_t cuStream);
+
+
 /**
 * NCVVector (1D)
 */
@@ -532,7 +538,7 @@ public:
        _memtype = NCVMemoryTypeNone;
    }

-    NCVStatus copySolid(NCVVector<T> &dst, cudaStream_t cuStream, size_t howMuch=0)
+    NCVStatus copySolid(NCVVector<T> &dst, cudaStream_t cuStream, size_t howMuch=0) const
    {
        if (howMuch == 0)
        {
@@ -600,7 +606,6 @@ public:
        this->_memtype = this->allocatedMem.begin.memtype;
    }

-
    ~NCVVectorAlloc()
    {
        NCVStatus ncvStat;
@@ -611,25 +616,22 @@ public:
        this->clear();
    }

-
    NcvBool isMemAllocated() const
    {
        return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting());
    }

-
    Ncv32u getAllocatorsAlignment() const
    {
        return allocator.alignment();
    }

-
    NCVMemSegment getSegment() const
    {
        return allocatedMem;
    }

-private:		
+private:
    INCVMemAllocator &allocator;
    NCVMemSegment allocatedMem;
 };
@@ -658,7 +660,6 @@ public:
        this->bReused = true;
    }

-
    NCVVectorReuse(const NCVMemSegment &memSegment, Ncv32u length)
    {
        this->bReused = false;
@@ -674,7 +675,6 @@ public:
        this->bReused = true;
    }

-
    NcvBool isMemReused() const
    {
        return this->bReused;
@@ -703,7 +703,6 @@ public:

    virtual ~NCVMatrix() {}

-
    void clear()
    {
        _ptr = NULL;
@@ -713,14 +712,13 @@ public:
        _memtype = NCVMemoryTypeNone;
    }

-
    Ncv32u stride() const
    {
        return _pitch / sizeof(T);
    }

-
-    NCVStatus copySolid(NCVMatrix<T> &dst, cudaStream_t cuStream, size_t howMuch=0)
+    //a side effect of this function is that it copies everything in a single chunk, so the "padding" will be overwritten
+    NCVStatus copySolid(NCVMatrix<T> &dst, cudaStream_t cuStream, size_t howMuch=0) const
    {
        if (howMuch == 0)
        {
@@ -748,6 +746,24 @@ public:
        return ncvStat;
    }

+    NCVStatus copy2D(NCVMatrix<T> &dst, NcvSize32u roi, cudaStream_t cuStream) const //copies a roi.width x roi.height region from the start of this matrix to the start of dst, each side using its own pitch
+    {
+        ncvAssertReturn(this->width() >= roi.width && this->height() >= roi.height &&
+                        dst.width() >= roi.width && dst.height() >= roi.height, NCV_MEM_COPY_ERROR); //roi has to fit inside both matrices
+        ncvAssertReturn((this->_ptr != NULL || this->_memtype == NCVMemoryTypeNone) && 
+                        (dst._ptr != NULL || dst._memtype == NCVMemoryTypeNone), NCV_NULL_PTR); //a NULL data pointer is accepted only together with NCVMemoryTypeNone
+
+        NCVStatus ncvStat = NCV_SUCCESS;
+        if (this->_memtype != NCVMemoryTypeNone) //a source of type NCVMemoryTypeNone skips the copy and NCV_SUCCESS is returned
+        {
+            ncvStat = memSegCopyHelper2D(dst._ptr, dst._pitch, dst._memtype,
+                                         this->_ptr, this->_pitch, this->_memtype,
+                                         roi.width * sizeof(T), roi.height, cuStream); //row width is converted from elements to bytes for the helper
+        }
+
+        return ncvStat; //status of memSegCopyHelper2D, or NCV_SUCCESS when nothing was copied
+    }
+
    T *ptr() const {return this->_ptr;}
    Ncv32u width() const {return this->_width;}
    Ncv32u height() const {return this->_height;}
@@ -817,19 +833,16 @@ public:
        this->clear();
    }

-
    NcvBool isMemAllocated() const
    {
        return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting());
    }

-
    Ncv32u getAllocatorsAlignment() const
    {
        return allocator.alignment();
    }

-
    NCVMemSegment getSegment() const
    {
        return allocatedMem;
@@ -888,6 +901,23 @@ public:
        this->bReused = true;
    }

+    NCVMatrixReuse(const NCVMatrix<T> &mat, NcvRect32u roi) //points this object at the roi sub-rectangle of mat (pointer arithmetic only, no data is copied here)
+    {
+        this->bReused = false;
+        this->clear();
+        //roi.x/roi.y are tested first, so the subtractions cannot underflow; comparing against the remaining extent avoids the 32-bit wrap-around of roi.x + roi.width
+        ncvAssertPrintReturn(roi.x < mat.width() && roi.y < mat.height() &&
+            roi.width <= mat.width() - roi.x && roi.height <= mat.height() - roi.y,
+            "NCVMatrixReuse ctor:: memory binding failed due to mismatching ROI and source matrix dims", );
+
+        this->_width = roi.width;
+        this->_height = roi.height;
+        this->_pitch = mat.pitch(); //rows keep the spacing of the source matrix
+        this->_ptr = mat.ptr() + roi.y * mat.stride() + roi.x; //first element of the roi inside mat
+        this->_memtype = mat.memType();
+
+        this->bReused = true; //stays false if the roi check above bailed out (assumes ncvAssertPrintReturn returns on failure)
+    }

    NcvBool isMemReused() const
    {
@@ -899,4 +929,27 @@ private:
    NcvBool bReused;
 };

+
+/**
+* Operations with rectangles
+*/
+NCV_EXPORTS NCVStatus ncvGroupRectangles_host(NCVVector<NcvRect32u> &hypotheses, Ncv32u &numHypotheses,
+                                              Ncv32u minNeighbors, Ncv32f intersectEps, NCVVector<Ncv32u> *hypothesesWeights);
+
+
+NCV_EXPORTS NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
+                                           NcvRect32u *h_rects, Ncv32u numRects, Ncv8u color);
+
+
+NCV_EXPORTS NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
+                                            NcvRect32u *h_rects, Ncv32u numRects, Ncv32u color);
+
+
+NCV_EXPORTS NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
+                                             NcvRect32u *d_rects, Ncv32u numRects, Ncv8u color, cudaStream_t cuStream);
+
+
+NCV_EXPORTS NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
+                                              NcvRect32u *d_rects, Ncv32u numRects, Ncv32u color, cudaStream_t cuStream);
+
 #endif // _ncv_hpp_
--- a/modules/gpu/src/nvidia/core/NCVRuntimeTemplates.hpp
+++ b/modules/gpu/src/nvidia/core/NCVRuntimeTemplates.hpp
@@ -150,14 +150,14 @@ namespace NCVRuntimeTemplateBool
    {
        //Convenience function used by the user
        //Takes a variable argument list, transforms it into a list
-        static void call(Func *functor, int dummy, ...)
+        static void call(Func *functor, ...)
        {
            //Vector used to collect arguments
            std::vector<int> templateParamList;

            //Variable argument list manipulation
            va_list listPointer;
-            va_start(listPointer, dummy);
+            va_start(listPointer, functor);
            //Collect parameters into the list
            for(int i=0; i<NumArguments; i++)
            {

--- a/modules/gpu/test/nvidia/TestHypothesesFilter.cpp
+++ b/modules/gpu/test/nvidia/TestHypothesesFilter.cpp
@@ -134,7 +134,7 @@ bool TestHypothesesFilter::process()

    Ncv32u numHypothesesSrc = h_vecSrc.length();
    NCV_SKIP_COND_BEGIN
-    ncvStat = ncvFilterHypotheses_host(h_vecSrc, numHypothesesSrc, this->minNeighbors, this->eps, NULL);
+    ncvStat = ncvGroupRectangles_host(h_vecSrc, numHypothesesSrc, this->minNeighbors, this->eps, NULL);
    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
    NCV_SKIP_COND_END


--- a/modules/gpu/test/nvidia/TestResize.cpp
+++ b/modules/gpu/test/nvidia/TestResize.cpp
@@ -83,17 +83,17 @@ bool TestResize<T>::process()
    NCV_SKIP_COND_BEGIN
    if (sizeof(T) == sizeof(Ncv32u))
    {
-        ncvStat = nppiStDownsampleNearest_32u_C1R((Ncv32u *)d_img.ptr(), d_img.pitch(),
-                                                  (Ncv32u *)d_small.ptr(), d_small.pitch(),
-                                                  srcSize, this->scaleFactor,
-                                                  this->bTextureCache);
+        ncvStat = nppiStDecimate_32u_C1R((Ncv32u *)d_img.ptr(), d_img.pitch(),
+                                         (Ncv32u *)d_small.ptr(), d_small.pitch(),
+                                         srcSize, this->scaleFactor,
+                                         this->bTextureCache);
    }
    else if (sizeof(T) == sizeof(Ncv64u))
    {
-        ncvStat = nppiStDownsampleNearest_64u_C1R((Ncv64u *)d_img.ptr(), d_img.pitch(),
-                                                  (Ncv64u *)d_small.ptr(), d_small.pitch(),
-                                                  srcSize, this->scaleFactor,
-                                                  this->bTextureCache);
+        ncvStat = nppiStDecimate_64u_C1R((Ncv64u *)d_img.ptr(), d_img.pitch(),
+                                         (Ncv64u *)d_small.ptr(), d_small.pitch(),
+                                         srcSize, this->scaleFactor,
+                                         this->bTextureCache);
    }
    else
    {
@@ -107,15 +107,15 @@ bool TestResize<T>::process()
    NCV_SKIP_COND_BEGIN
    if (sizeof(T) == sizeof(Ncv32u))
    {
-        ncvStat = nppiStDownsampleNearest_32u_C1R_host((Ncv32u *)h_img.ptr(), h_img.pitch(),
-                                                       (Ncv32u *)h_small.ptr(), h_small.pitch(),
-                                                       srcSize, this->scaleFactor);
+        ncvStat = nppiStDecimate_32u_C1R_host((Ncv32u *)h_img.ptr(), h_img.pitch(),
+                                              (Ncv32u *)h_small.ptr(), h_small.pitch(),
+                                              srcSize, this->scaleFactor);
    }
    else if (sizeof(T) == sizeof(Ncv64u))
    {
-        ncvStat = nppiStDownsampleNearest_64u_C1R_host((Ncv64u *)h_img.ptr(), h_img.pitch(),
-                                                       (Ncv64u *)h_small.ptr(), h_small.pitch(),
-                                                       srcSize, this->scaleFactor);
+        ncvStat = nppiStDecimate_64u_C1R_host((Ncv64u *)h_img.ptr(), h_img.pitch(),
+                                              (Ncv64u *)h_small.ptr(), h_small.pitch(),
+                                              srcSize, this->scaleFactor);
    }
    else
    {