[~] Minor refactoring, clean-up

[+] Added 128-bit transpose

[~] Minor refactoring, clean-up
[+] Added 128-bit transpose
0c325cac · Anton Obukhov · e2caf4a3 · 0c325cac · 0c325cac · 0c325cac
Commit 0c325cac authored Apr 24, 2011 by Anton Obukhov
9 changed files
--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
@@ -63,8 +63,6 @@
 #include "NCVRuntimeTemplates.hpp"
 #include "NCVHaarObjectDetection.hpp"
-void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights);
 //==============================================================================
 //
@@ -785,7 +783,6 @@ void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively,
    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>
        ::call( &functor,
-                0xC001C0DE, //this is dummy int for the va_args C compatibility
                tbInitMaskPositively,
                tbCacheTextureIImg,
                tbCacheTextureCascade,
@@ -890,7 +887,6 @@ void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg
    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>
        ::call( &functor,
-                0xC001C0DE, //this is dummy int for the va_args C compatibility
                tbCacheTextureIImg,
                tbCacheTextureCascade,
                tbDoAtomicCompaction);
@@ -957,7 +953,6 @@ void initializeMaskVectorDynTemplate(NcvBool tbMaskByInmask,
    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, initializeMaskVectorFunctor>
        ::call( &functor,
-                0xC001C0DE, //this is dummy int for the va_args C compatibility
                tbMaskByInmask,
                tbDoAtomicCompaction);
 }
@@ -1554,172 +1549,6 @@ NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
 }
-//==============================================================================
-//
-// Visualize file
-//
-//==============================================================================
-const Ncv32u NUMTHREADS_DRAWRECTS = 32;
-const Ncv32u NUMTHREADS_DRAWRECTS_LOG2 = 5;
-template <class T>
-__global__ void drawRects(T *d_dst,
-                          Ncv32u dstStride,
-                          Ncv32u dstWidth,
-                          Ncv32u dstHeight,
-                          NcvRect32u *d_rects,
-                          Ncv32u numRects,
-                          T color)
-{
-    Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
-    if (blockId > numRects * 4)
-    {
-        return;
-    }
-    NcvRect32u curRect = d_rects[blockId >> 2];
-    NcvBool bVertical = blockId & 0x1;
-    NcvBool bTopLeft = blockId & 0x2;
-    Ncv32u pt0x, pt0y;
-    if (bVertical)
-    {
-        Ncv32u numChunks = (curRect.height + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;
-        pt0x = bTopLeft ? curRect.x : curRect.x + curRect.width - 1;
-        pt0y = curRect.y;
-        if (pt0x < dstWidth)
-        {
-            for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
-            {
-                Ncv32u ptY = pt0y + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
-                if (ptY < pt0y + curRect.height && ptY < dstHeight)
-                {
-                    d_dst[ptY * dstStride + pt0x] = color;
-                }
-            }
-        }
-    }
-    else
-    {
-        Ncv32u numChunks = (curRect.width + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;
-        pt0x = curRect.x;
-        pt0y = bTopLeft ? curRect.y : curRect.y + curRect.height - 1;
-        if (pt0y < dstHeight)
-        {
-            for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
-            {
-                Ncv32u ptX = pt0x + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
-                if (ptX < pt0x + curRect.width && ptX < dstWidth)
-                {
-                    d_dst[pt0y * dstStride + ptX] = color;
-                }
-            }
-        }
-    }
-}
-template <class T>
-static NCVStatus drawRectsWrapperDevice(T *d_dst,
-                                        Ncv32u dstStride,
-                                        Ncv32u dstWidth,
-                                        Ncv32u dstHeight,
-                                        NcvRect32u *d_rects,
-                                        Ncv32u numRects,
-                                        T color,
-                                        cudaStream_t cuStream)
-{
-    ncvAssertReturn(d_dst != NULL && d_rects != NULL, NCV_NULL_PTR);
-    ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
-    ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
-    ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);
-    if (numRects == 0)
-    {
-        return NCV_SUCCESS;
-    }
-#if defined _SELF_TEST_
-    T *h_dst;
-    ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * dstHeight * sizeof(T)), NCV_CUDA_ERROR);
-    ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * dstHeight * sizeof(T), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
-    NcvRect32s *h_rects;
-    ncvAssertCUDAReturn(cudaMallocHost(&h_rects, numRects * sizeof(NcvRect32s)), NCV_CUDA_ERROR);
-    ncvAssertCUDAReturn(cudaMemcpy(h_rects, d_rects, numRects * sizeof(NcvRect32s), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
-    ncvAssertReturnNcvStat(drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color));
-#endif
-    dim3 grid(numRects * 4);
-    dim3 block(NUMTHREADS_DRAWRECTS);
-    if (grid.x > 65535)
-    {
-        grid.y = (grid.x + 65534) / 65535;
-        grid.x = 65535;
-    }
-    drawRects<T><<<grid, block>>>(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color);
-    ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
-#if defined _SELF_TEST_
-    T *h_dst_after;
-    ncvAssertCUDAReturn(cudaMallocHost(&h_dst_after, dstStride * dstHeight * sizeof(T)), NCV_CUDA_ERROR);
-    ncvAssertCUDAReturn(cudaMemcpy(h_dst_after, d_dst, dstStride * dstHeight * sizeof(T), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
-    bool bPass = true;
-    for (Ncv32u i=0; i<dstHeight && bPass; i++)
-    {
-        for (Ncv32u j=0; j<dstWidth && bPass; j++)
-        {
-            if (h_dst[i*dstStride+j] != h_dst_after[i*dstStride+j])
-            {
-                printf("::drawRectsWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, h_dst[i*dstStride+j], h_dst_after[i*dstStride+j]);
-                bPass = false;
-            }
-        }
-    }
-    ncvAssertCUDAReturn(cudaFreeHost(h_dst_after), NCV_CUDA_ERROR);
-    ncvAssertCUDAReturn(cudaFreeHost(h_dst), NCV_CUDA_ERROR);
-    ncvAssertCUDAReturn(cudaFreeHost(h_rects), NCV_CUDA_ERROR);
-    printf("::drawRectsWrapperDevice %s\n", bPass?"PASSED":"FAILED");
-#endif
-    return NCV_SUCCESS;
-}
-NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
-                                 Ncv32u dstStride,
-                                 Ncv32u dstWidth,
-                                 Ncv32u dstHeight,
-                                 NcvRect32u *d_rects,
-                                 Ncv32u numRects,
-                                 Ncv8u color,
-                                 cudaStream_t cuStream)
-{
-    return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);
-}
-NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
-                                  Ncv32u dstStride,
-                                  Ncv32u dstWidth,
-                                  Ncv32u dstHeight,
-                                  NcvRect32u *d_rects,
-                                  Ncv32u numRects,
-                                  Ncv32u color,
-                                  cudaStream_t cuStream)
-{
-    return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);
-}
 //==============================================================================
 //
 // Pipeline file
@@ -1901,13 +1730,13 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
        NCV_SKIP_COND_BEGIN
-        nppStat = nppiStDownsampleNearest_32u_C1R(
+        nppStat = nppiStDecimate_32u_C1R(
            d_integralImage.ptr(), d_integralImage.pitch(),
            d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
            srcIIRoi, scale, true);
        ncvAssertReturnNcvStat(nppStat);
-        nppStat = nppiStDownsampleNearest_64u_C1R(
+        nppStat = nppiStDecimate_64u_C1R(
            d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
            d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
            srcIIRoi, scale, true);
@@ -1969,7 +1798,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
            }
            Ncv32u numStrongHypothesesNow = dstNumRects;
-            ncvStat = ncvFilterHypotheses_host(
+            ncvStat = ncvGroupRectangles_host(
                h_hypothesesIntermediate,
                numStrongHypothesesNow,
                minNeighbors,
@@ -2031,7 +1860,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }
-        ncvStat = ncvFilterHypotheses_host(
+        ncvStat = ncvGroupRectangles_host(
            h_hypothesesIntermediate,
            dstNumRects,
            minNeighbors,
@@ -2285,133 +2114,6 @@ NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
 }
-NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,
-                                   Ncv32u &numHypotheses,
-                                   Ncv32u minNeighbors,
-                                   Ncv32f intersectEps,
-                                   NCVVector<Ncv32u> *hypothesesWeights)
-{
-    ncvAssertReturn(hypotheses.memType() == NCVMemoryTypeHostPageable ||
-                    hypotheses.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
-    if (hypothesesWeights != NULL)
-    {
-        ncvAssertReturn(hypothesesWeights->memType() == NCVMemoryTypeHostPageable ||
-                        hypothesesWeights->memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
-    }
-    if (numHypotheses == 0)
-    {
-        return NCV_SUCCESS;
-    }
-    std::vector<NcvRect32u> rects(numHypotheses);
-    memcpy(&rects[0], hypotheses.ptr(), numHypotheses * sizeof(NcvRect32u));
-    std::vector<Ncv32u> weights;
-    if (hypothesesWeights != NULL)
-    {
-        groupRectangles(rects, minNeighbors, intersectEps, &weights);
-    }
-    else
-    {
-        groupRectangles(rects, minNeighbors, intersectEps, NULL);
-    }
-    numHypotheses = (Ncv32u)rects.size();
-    if (numHypotheses > 0)
-    {
-        memcpy(hypotheses.ptr(), &rects[0], numHypotheses * sizeof(NcvRect32u));
-    }
-    if (hypothesesWeights != NULL)
-    {
-        memcpy(hypothesesWeights->ptr(), &weights[0], numHypotheses * sizeof(Ncv32u));
-    }
-    return NCV_SUCCESS;
-}
-template <class T>
-static NCVStatus drawRectsWrapperHost(T *h_dst,
-                                      Ncv32u dstStride,
-                                      Ncv32u dstWidth,
-                                      Ncv32u dstHeight,
-                                      NcvRect32u *h_rects,
-                                      Ncv32u numRects,
-                                      T color)
-{
-    ncvAssertReturn(h_dst != NULL && h_rects != NULL, NCV_NULL_PTR);
-    ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
-    ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
-    ncvAssertReturn(numRects != 0, NCV_SUCCESS);
-    ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);
-    for (Ncv32u i=0; i<numRects; i++)
-    {
-        NcvRect32u rect = h_rects[i];
-        if (rect.x < dstWidth)
-        {
-            for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)
-            {
-                h_dst[i*dstStride+rect.x] = color;
-            }
-        }
-        if (rect.x+rect.width-1 < dstWidth)
-        {
-            for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)
-            {
-                h_dst[i*dstStride+rect.x+rect.width-1] = color;
-            }
-        }
-        if (rect.y < dstHeight)
-        {
-            for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)
-            {
-                h_dst[rect.y*dstStride+j] = color;
-            }
-        }
-        if (rect.y + rect.height - 1 < dstHeight)
-        {
-            for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)
-            {
-                h_dst[(rect.y+rect.height-1)*dstStride+j] = color;
-            }
-        }
-    }
-    return NCV_SUCCESS;
-}
-NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
-                               Ncv32u dstStride,
-                               Ncv32u dstWidth,
-                               Ncv32u dstHeight,
-                               NcvRect32u *h_rects,
-                               Ncv32u numRects,
-                               Ncv8u color)
-{
-    return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);
-}
-NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
-                                Ncv32u dstStride,
-                                Ncv32u dstWidth,
-                                Ncv32u dstHeight,
-                                NcvRect32u *h_rects,
-                                Ncv32u numRects,
-                                Ncv32u color)
-{
-    return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);
-}
 NCVStatus loadFromXML(const std::string &filename,
                      HaarClassifierCascadeDescriptor &haar,
                      std::vector<HaarStage64> &haarStages,

--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp
@@ -346,153 +346,107 @@ enum
    NCVPipeObjDet_VisualizeInPlace      = 0x004,
 };
-NCV_EXPORTS
-NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
+NCV_EXPORTS NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
-                                            NcvSize32u srcRoi,
+                                                        NcvSize32u srcRoi,
-                                            NCVVector<NcvRect32u> &d_dstRects,
+                                                        NCVVector<NcvRect32u> &d_dstRects,
-                                            Ncv32u &dstNumRects,
+                                                        Ncv32u &dstNumRects,
-                                            HaarClassifierCascadeDescriptor &haar,
+                                                        HaarClassifierCascadeDescriptor &haar,
-                                            NCVVector<HaarStage64> &h_HaarStages,
+                                                        NCVVector<HaarStage64> &h_HaarStages,
-                                            NCVVector<HaarStage64> &d_HaarStages,
+                                                        NCVVector<HaarStage64> &d_HaarStages,
-                                            NCVVector<HaarClassifierNode128> &d_HaarNodes,
+                                                        NCVVector<HaarClassifierNode128> &d_HaarNodes,
-                                            NCVVector<HaarFeature64> &d_HaarFeatures,
+                                                        NCVVector<HaarFeature64> &d_HaarFeatures,
-                                            NcvSize32u minObjSize,
+                                                        NcvSize32u minObjSize,
-                                            Ncv32u minNeighbors,      //default 4
+                                                        Ncv32u minNeighbors,      //default 4
-                                            Ncv32f scaleStep,         //default 1.2f
+                                                        Ncv32f scaleStep,         //default 1.2f
-                                            Ncv32u pixelStep,         //default 1
+                                                        Ncv32u pixelStep,         //default 1
-                                            Ncv32u flags,             //default NCVPipeObjDet_Default
+                                                        Ncv32u flags,             //default NCVPipeObjDet_Default
-                                            INCVMemAllocator &gpuAllocator,
+                                                        INCVMemAllocator &gpuAllocator,
-                                            INCVMemAllocator &cpuAllocator,
+                                                        INCVMemAllocator &cpuAllocator,
-                                            cudaDeviceProp &devProp,
+                                                        cudaDeviceProp &devProp,
-                                            cudaStream_t cuStream);
+                                                        cudaStream_t cuStream);
 #define OBJDET_MASK_ELEMENT_INVALID_32U     0xFFFFFFFF
 #define HAAR_STDDEV_BORDER                  1
-NCV_EXPORTS
-NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImage,
+NCV_EXPORTS NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImage,
-                                               NCVMatrix<Ncv32f> &d_weights,
+                                                           NCVMatrix<Ncv32f> &d_weights,
-                                               NCVMatrixAlloc<Ncv32u> &d_pixelMask,
+                                                           NCVMatrixAlloc<Ncv32u> &d_pixelMask,
-                                               Ncv32u &numDetections,
+                                                           Ncv32u &numDetections,
-                                               HaarClassifierCascadeDescriptor &haar,
+                                                           HaarClassifierCascadeDescriptor &haar,
-                                               NCVVector<HaarStage64> &h_HaarStages,
+                                                           NCVVector<HaarStage64> &h_HaarStages,
-                                               NCVVector<HaarStage64> &d_HaarStages,
+                                                           NCVVector<HaarStage64> &d_HaarStages,
-                                               NCVVector<HaarClassifierNode128> &d_HaarNodes,
+                                                           NCVVector<HaarClassifierNode128> &d_HaarNodes,
-                                               NCVVector<HaarFeature64> &d_HaarFeatures,
+                                                           NCVVector<HaarFeature64> &d_HaarFeatures,
-                                               NcvBool bMaskElements,
+                                                           NcvBool bMaskElements,
-                                               NcvSize32u anchorsRoi,
+                                                           NcvSize32u anchorsRoi,
-                                               Ncv32u pixelStep,
+                                                           Ncv32u pixelStep,
-                                               Ncv32f scaleArea,
+                                                           Ncv32f scaleArea,
-                                               INCVMemAllocator &gpuAllocator,
+                                                           INCVMemAllocator &gpuAllocator,
-                                               INCVMemAllocator &cpuAllocator,
+                                                           INCVMemAllocator &cpuAllocator,
-                                               cudaDeviceProp &devProp,
+                                                           cudaDeviceProp &devProp,
-                                               cudaStream_t cuStream);
+                                                           cudaStream_t cuStream);
-NCV_EXPORTS
-NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
+NCV_EXPORTS NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
-                                             NCVMatrix<Ncv32f> &h_weights,
+                                                         NCVMatrix<Ncv32f> &h_weights,
-                                             NCVMatrixAlloc<Ncv32u> &h_pixelMask,
+                                                         NCVMatrixAlloc<Ncv32u> &h_pixelMask,
-                                             Ncv32u &numDetections,
+                                                         Ncv32u &numDetections,
-                                             HaarClassifierCascadeDescriptor &haar,
+                                                         HaarClassifierCascadeDescriptor &haar,
-                                             NCVVector<HaarStage64> &h_HaarStages,
+                                                         NCVVector<HaarStage64> &h_HaarStages,
-                                             NCVVector<HaarClassifierNode128> &h_HaarNodes,
+                                                         NCVVector<HaarClassifierNode128> &h_HaarNodes,
-                                             NCVVector<HaarFeature64> &h_HaarFeatures,
+                                                         NCVVector<HaarFeature64> &h_HaarFeatures,
-                                             NcvBool bMaskElements,
+                                                         NcvBool bMaskElements,
-                                             NcvSize32u anchorsRoi,
+                                                         NcvSize32u anchorsRoi,
-                                             Ncv32u pixelStep,
+                                                         Ncv32u pixelStep,
-                                             Ncv32f scaleArea);
+                                                         Ncv32f scaleArea);
-NCV_EXPORTS
-NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
-                                 Ncv32u dstStride,
-                                 Ncv32u dstWidth,
-                                 Ncv32u dstHeight,
-                                 NcvRect32u *d_rects,
-                                 Ncv32u numRects,
-                                 Ncv8u color,
-                                 cudaStream_t cuStream);
-NCV_EXPORTS
-NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
-                                  Ncv32u dstStride,
-                                  Ncv32u dstWidth,
-                                  Ncv32u dstHeight,
-                                  NcvRect32u *d_rects,
-                                  Ncv32u numRects,
-                                  Ncv32u color,
-                                  cudaStream_t cuStream);
-NCV_EXPORTS
-NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
-                               Ncv32u dstStride,
-                               Ncv32u dstWidth,
-                               Ncv32u dstHeight,
-                               NcvRect32u *h_rects,
-                               Ncv32u numRects,
-                               Ncv8u color);
-NCV_EXPORTS
-NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
-                                Ncv32u dstStride,
-                                Ncv32u dstWidth,
-                                Ncv32u dstHeight,
-                                NcvRect32u *h_rects,
-                                Ncv32u numRects,
-                                Ncv32u color);
 #define RECT_SIMILARITY_PROPORTION      0.2f
-NCV_EXPORTS
-NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
+NCV_EXPORTS NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
-                                         Ncv32u numPixelMaskDetections,
+                                                     Ncv32u numPixelMaskDetections,
-                                         NCVVector<NcvRect32u> &hypotheses,
+                                                     NCVVector<NcvRect32u> &hypotheses,
-                                         Ncv32u &totalDetections,
+                                                     Ncv32u &totalDetections,
-                                         Ncv32u totalMaxDetections,
+                                                     Ncv32u totalMaxDetections,
-                                         Ncv32u rectWidth,
+                                                     Ncv32u rectWidth,
-                                         Ncv32u rectHeight,
+                                                     Ncv32u rectHeight,
-                                         Ncv32f curScale,
+                                                     Ncv32f curScale,
-                                         cudaStream_t cuStream);
+                                                     cudaStream_t cuStream);
-NCV_EXPORTS
-NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
+NCV_EXPORTS NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
-                                       Ncv32u numPixelMaskDetections,
+                                                   Ncv32u numPixelMaskDetections,
-                                       NCVVector<NcvRect32u> &hypotheses,
+                                                   NCVVector<NcvRect32u> &hypotheses,
-                                       Ncv32u &totalDetections,
+                                                   Ncv32u &totalDetections,
-                                       Ncv32u totalMaxDetections,
+                                                   Ncv32u totalMaxDetections,
-                                       Ncv32u rectWidth,
+                                                   Ncv32u rectWidth,
-                                       Ncv32u rectHeight,
+                                                   Ncv32u rectHeight,
-                                       Ncv32f curScale);
+                                                   Ncv32f curScale);
-NCV_EXPORTS
-NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,
+NCV_EXPORTS NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,
-                                   Ncv32u &numHypotheses,
+                                               Ncv32u &numNodes, Ncv32u &numFeatures);
-                                   Ncv32u minNeighbors,
-                                   Ncv32f intersectEps,
-                                   NCVVector<Ncv32u> *hypothesesWeights);
+NCV_EXPORTS NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,
+                                               HaarClassifierCascadeDescriptor &haar,
-NCV_EXPORTS
+                                               NCVVector<HaarStage64> &h_HaarStages,
-NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,
+                                               NCVVector<HaarClassifierNode128> &h_HaarNodes,
-                                   Ncv32u &numNodes, Ncv32u &numFeatures);
+                                               NCVVector<HaarFeature64> &h_HaarFeatures);
-NCV_EXPORTS
-NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,
+NCV_EXPORTS NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,
-                                   HaarClassifierCascadeDescriptor &haar,
+                                             HaarClassifierCascadeDescriptor haar,
-                                   NCVVector<HaarStage64> &h_HaarStages,
+                                             NCVVector<HaarStage64> &h_HaarStages,
-                                   NCVVector<HaarClassifierNode128> &h_HaarNodes,
+                                             NCVVector<HaarClassifierNode128> &h_HaarNodes,
-                                   NCVVector<HaarFeature64> &h_HaarFeatures);
+                                             NCVVector<HaarFeature64> &h_HaarFeatures);
-NCV_EXPORTS
-NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,
-                                 HaarClassifierCascadeDescriptor haar,
-                                 NCVVector<HaarStage64> &h_HaarStages,
-                                 NCVVector<HaarClassifierNode128> &h_HaarNodes,
-                                 NCVVector<HaarFeature64> &h_HaarFeatures);

--- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
+++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
@@ -44,10 +44,6 @@
 #include <cuda_runtime.h>
 #include "NPP_staging.hpp"
-#if defined _SELF_TEST_
-#include <stdio.h>
-#endif
 texture<Ncv8u,  1, cudaReadModeElementType> tex8u;
 texture<Ncv32u, 1, cudaReadModeElementType> tex32u;
@@ -161,12 +157,6 @@ const Ncv32u NUM_SCAN_THREADS = 256;
 const Ncv32u LOG2_NUM_SCAN_THREADS = 8;
-struct T_true {};
-struct T_false {};
-template <typename T, typename U> struct is_same : T_false {};
-template <typename T> struct is_same<T, T> : T_true {};
 template<class T_in, class T_out>
 struct _scanElemOp
 {
@@ -175,13 +165,16 @@ struct _scanElemOp
    {
        return scanElemOp( elem, Int2Type<(int)tbDoSqr>() );
    }
 private:
    template <int v> struct Int2Type { enum { value = v }; };
    static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<0>)
    {
        return (T_out)elem;
    }
    static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<1>)
    {
        return (T_out)(elem*elem);
@@ -190,25 +183,25 @@ private:
 template<class T>
-inline __device__ T readElem(T *d_src, Ncv32u srcStride, Ncv32u curElemOffs);
+inline __device__ T readElem(T *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs);
 template<>
-inline __device__ Ncv8u readElem<Ncv8u>(Ncv8u *d_src, Ncv32u srcStride, Ncv32u curElemOffs)
+inline __device__ Ncv8u readElem<Ncv8u>(Ncv8u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
 {
-    return tex1Dfetch(tex8u, srcStride * blockIdx.x + curElemOffs);
+    return tex1Dfetch(tex8u, texOffs + srcStride * blockIdx.x + curElemOffs);
 }
 template<>
-inline __device__ Ncv32u readElem<Ncv32u>(Ncv32u *d_src, Ncv32u srcStride, Ncv32u curElemOffs)
+inline __device__ Ncv32u readElem<Ncv32u>(Ncv32u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
 {
    return d_src[curElemOffs];
 }
 template<>
-inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u srcStride, Ncv32u curElemOffs)
+inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
 {
    return d_src[curElemOffs];
 }
@@ -233,7 +226,7 @@ inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u srcStride, Ncv32
 * \return None
 */
 template <class T_in, class T_out, bool tbDoSqr>
-__global__ void scanRows(T_in *d_src, Ncv32u srcWidth, Ncv32u srcStride,
+__global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u srcStride,
                         T_out *d_II, Ncv32u IIstride)
 {
    //advance pointers to the current line
@@ -263,7 +256,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u srcWidth, Ncv32u srcStride,
        if (curElemOffs < srcWidth)
        {
            //load elements
-            curElem = readElem<T_in>(d_src, srcStride, curElemOffs);
+            curElem = readElem<T_in>(d_src, texOffs, srcStride, curElemOffs);
        }
        curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem);
@@ -298,55 +291,28 @@ NCVStatus scanRowsWrapperDevice(T_in *d_src, Ncv32u srcStride,
                                T_out *d_dst, Ncv32u dstStride, NcvSize32u roi)
 {
    cudaChannelFormatDesc cfdTex;
+    size_t alignmentOffset = 0;
    if (sizeof(T_in) == 1)
    {
        cfdTex = cudaCreateChannelDesc<Ncv8u>();
-        size_t alignmentOffset;
        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
-        ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
+        if (alignmentOffset > 0)
+        {
+            ncvAssertCUDAReturn(cudaUnbindTexture(tex8u), NCV_CUDA_ERROR);
+            ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, alignmentOffset + roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
+        }
    }
    scanRows
        <T_in, T_out, tbDoSqr>
        <<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>>
-        (d_src, roi.width, srcStride, d_dst, dstStride);
+        (d_src, (Ncv32u)alignmentOffset, roi.width, srcStride, d_dst, dstStride);
    ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
-#if defined _SELF_TEST_
-    T_in *h_src;
-    T_out *h_dst;
-    ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStride * roi.height * sizeof(T_in)), NPPST_MEM_ALLOC_ERR);
-    ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * roi.height * sizeof(T_out)), NPPST_MEM_ALLOC_ERR);
-    memset(h_src, 0, srcStride * roi.height * sizeof(T_in));
-    memset(h_dst, 0, dstStride * roi.height * sizeof(T_out));
-    ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStride * roi.height * sizeof(T_in), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
-    ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * roi.height * sizeof(T_out), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
-    NcvBool bPass = true;
-    for (Ncv32u i=0; i<roi.height && bPass; i++)
-    {
-        T_out curElem = 0;
-        for (Ncv32u j=0; j<roi.width+1 && bPass; j++)
-        {
-            if (curElem != h_dst[i * dstStride + j])
-            {
-                printf("CIntegralImage::scanRowsWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, curElem, h_dst[i * dstStride + j]);
-                bPass = false;
-            }
-            if (j < roi.width)
-            {
-                curElem += scanElemOp<T_op>(h_src[i*srcStride+j]);
-            }
-        }
-    }
-    ncvAssertCUDAReturn(cudaFreeHost(h_src), NPPST_MEMFREE_ERR);
-    ncvAssertCUDAReturn(cudaFreeHost(h_dst), NPPST_MEMFREE_ERR);
-    printf("CIntegralImage::scanRowsWrapperDevice %s\n", bPass?"PASSED":"FAILED");
-#endif
    return NPPST_SUCCESS;
 }
-Ncv32u getPaddedDimension(Ncv32u dim, Ncv32u elemTypeSize, Ncv32u allocatorAlignment)
+static Ncv32u getPaddedDimension(Ncv32u dim, Ncv32u elemTypeSize, Ncv32u allocatorAlignment)
 {
    Ncv32u alignMask = allocatorAlignment-1;
    Ncv32u inverseAlignMask = ~alignMask;
@@ -676,7 +642,7 @@ NCVStatus nppiStSqrIntegral_8u64u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
 //==============================================================================
 //
-// DownsampleNearest.cu
+// Decimate.cu
 //
 //==============================================================================
@@ -686,25 +652,25 @@ const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_Y = 8;
 template<class T, NcvBool tbCacheTexture>
-__device__ T getElem_DownsampleNearest(Ncv32u x, T *d_src);
+__device__ T getElem_Decimate(Ncv32u x, T *d_src);
 template<>
-__device__ Ncv32u getElem_DownsampleNearest<Ncv32u, true>(Ncv32u x, Ncv32u *d_src)
+__device__ Ncv32u getElem_Decimate<Ncv32u, true>(Ncv32u x, Ncv32u *d_src)
 {
    return tex1Dfetch(tex32u, x);
 }
 template<>
-__device__ Ncv32u getElem_DownsampleNearest<Ncv32u, false>(Ncv32u x, Ncv32u *d_src)
+__device__ Ncv32u getElem_Decimate<Ncv32u, false>(Ncv32u x, Ncv32u *d_src)
 {
    return d_src[x];
 }
 template<>
-__device__ Ncv64u getElem_DownsampleNearest<Ncv64u, true>(Ncv32u x, Ncv64u *d_src)
+__device__ Ncv64u getElem_Decimate<Ncv64u, true>(Ncv32u x, Ncv64u *d_src)
 {
    uint2 tmp = tex1Dfetch(tex64u, x);
    Ncv64u res = (Ncv64u)tmp.y;
@@ -715,14 +681,14 @@ __device__ Ncv64u getElem_DownsampleNearest<Ncv64u, true>(Ncv32u x, Ncv64u *d_sr
 template<>
-__device__ Ncv64u getElem_DownsampleNearest<Ncv64u, false>(Ncv32u x, Ncv64u *d_src)
+__device__ Ncv64u getElem_Decimate<Ncv64u, false>(Ncv32u x, Ncv64u *d_src)
 {
    return d_src[x];
 }
 template <class T, NcvBool tbCacheTexture>
-__global__ void downsampleNearest_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep,
+__global__ void decimate_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep,
                                      NcvSize32u dstRoi, Ncv32u scale)
 {
    int curX = blockIdx.x * blockDim.x + threadIdx.x;
@@ -733,12 +699,12 @@ __global__ void downsampleNearest_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u
        return;
    }
-    d_dst[curY * dstStep + curX] = getElem_DownsampleNearest<T, tbCacheTexture>((curY * srcStep + curX) * scale, d_src);
+    d_dst[curY * dstStep + curX] = getElem_Decimate<T, tbCacheTexture>((curY * srcStep + curX) * scale, d_src);
 }
 template <class T>
-static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
+static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep,
                                                T *d_dst, Ncv32u dstStep,
                                                NcvSize32u srcRoi, Ncv32u scale,
                                                NcvBool readThruTexture)
@@ -761,7 +727,7 @@ static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
    if (!readThruTexture)
    {
-        downsampleNearest_C1R
+        decimate_C1R
            <T, false>
            <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
            (d_src, srcStep, d_dst, dstStep, dstRoi, scale);
@@ -787,7 +753,7 @@ static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
            ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
        }
-        downsampleNearest_C1R
+        decimate_C1R
            <T, true>
            <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
            (d_src, srcStep, d_dst, dstStep, dstRoi, scale);
@@ -795,39 +761,12 @@ static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
    ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
-#if defined _SELF_TEST_
-    T *h_src;
-    T *h_dst;
-    ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStep * srcRoi.height * sizeof(T)), NPPST_MEM_ALLOC_ERR);
-    ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStep * dstRoi.height * sizeof(T)), NPPST_MEM_ALLOC_ERR);
-    ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStep * srcRoi.height * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
-    ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStep * dstRoi.height * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
-    bool bPass = true;
-    for (Ncv32u i=0; i<dstRoi.height && bPass; i++)
-    {
-        for (Ncv32u j=0; j<dstRoi.width && bPass; j++)
-        {
-            if (h_dst[i*dstStep+j] != h_src[i*scale*srcStep + j*scale])
-            {
-                printf("::downsampleNearestWrapperDevice self test failed: i=%d, j=%d, cpu=%ld, gpu=%ld\n", i, j, (long long)h_src[i*scale*srcStep + j*scale], (long long)h_dst[i*dstStep+j]);
-                bPass = false;
-            }
-        }
-    }
-    ncvAssertCUDAReturn(cudaFreeHost(h_src), NPPST_MEMFREE_ERR);
-    ncvAssertCUDAReturn(cudaFreeHost(h_dst), NPPST_MEMFREE_ERR);
-    printf("::downsampleNearestWrapperDevice %s\n", bPass?"PASSED":"FAILED");
-#endif
    return NPPST_SUCCESS;
 }
 template <class T>
-static NCVStatus downsampleNearestWrapperHost(T *h_src, Ncv32u srcStep,
+static NCVStatus decimateWrapperHost(T *h_src, Ncv32u srcStep,
                                              T *h_dst, Ncv32u dstStep,
                                              NcvSize32u srcRoi, Ncv32u scale)
 {
@@ -856,40 +795,40 @@ static NCVStatus downsampleNearestWrapperHost(T *h_src, Ncv32u srcStep,
 }
-#define implementNppDownsampleNearest(bit, typ) \
+#define implementNppDecimate(bit, typ) \
-    NCVStatus nppiStDownsampleNearest_##bit##typ##_C1R(Ncv##bit##typ *d_src, Ncv32u srcStep, \
+    NCVStatus nppiStDecimate_##bit##typ##_C1R(Ncv##bit##typ *d_src, Ncv32u srcStep, \
                                                     Ncv##bit##typ *d_dst, Ncv32u dstStep, \
                                                     NcvSize32u srcRoi, Ncv32u scale, NcvBool readThruTexture) \
    { \
-        return downsampleNearestWrapperDevice<Ncv##bit##u>((Ncv##bit##u *)d_src, srcStep, \
+        return decimateWrapperDevice<Ncv##bit##u>((Ncv##bit##u *)d_src, srcStep, \
                                                           (Ncv##bit##u *)d_dst, dstStep, \
                                                           srcRoi, scale, readThruTexture); \
    }
-#define implementNppDownsampleNearestHost(bit, typ) \
+#define implementNppDecimateHost(bit, typ) \
-    NCVStatus nppiStDownsampleNearest_##bit##typ##_C1R_host(Ncv##bit##typ *h_src, Ncv32u srcStep, \
+    NCVStatus nppiStDecimate_##bit##typ##_C1R_host(Ncv##bit##typ *h_src, Ncv32u srcStep, \
                                                          Ncv##bit##typ *h_dst, Ncv32u dstStep, \
                                                          NcvSize32u srcRoi, Ncv32u scale) \
    { \
-        return downsampleNearestWrapperHost<Ncv##bit##u>((Ncv##bit##u *)h_src, srcStep, \
+        return decimateWrapperHost<Ncv##bit##u>((Ncv##bit##u *)h_src, srcStep, \
                                                         (Ncv##bit##u *)h_dst, dstStep, \
                                                         srcRoi, scale); \
    }
-implementNppDownsampleNearest(32, u)
+implementNppDecimate(32, u)
-implementNppDownsampleNearest(32, s)
+implementNppDecimate(32, s)
-implementNppDownsampleNearest(32, f)
+implementNppDecimate(32, f)
-implementNppDownsampleNearest(64, u)
+implementNppDecimate(64, u)
-implementNppDownsampleNearest(64, s)
+implementNppDecimate(64, s)
-implementNppDownsampleNearest(64, f)
+implementNppDecimate(64, f)
-implementNppDownsampleNearestHost(32, u)
+implementNppDecimateHost(32, u)
-implementNppDownsampleNearestHost(32, s)
+implementNppDecimateHost(32, s)
-implementNppDownsampleNearestHost(32, f)
+implementNppDecimateHost(32, f)
-implementNppDownsampleNearestHost(64, u)
+implementNppDecimateHost(64, u)
-implementNppDownsampleNearestHost(64, s)
+implementNppDecimateHost(64, s)
-implementNppDownsampleNearestHost(64, f)
+implementNppDecimateHost(64, f)
 //==============================================================================
@@ -1051,46 +990,6 @@ NCVStatus nppiStRectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
    ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
-#if defined _SELF_TEST_
-    Ncv32u *h_sum;
-    Ncv64u *h_sqsum;
-    Ncv32f *h_norm_d;
-    Ncv32u ExtHeight = roi.height + rect.y + rect.height;
-    ncvAssertCUDAReturn(cudaMallocHost(&h_sum, sumStep * ExtHeight * sizeof(Ncv32u)), NPPST_MEM_ALLOC_ERR);
-    ncvAssertCUDAReturn(cudaMallocHost(&h_sqsum, sqsumStep * ExtHeight * sizeof(Ncv64u)), NPPST_MEM_ALLOC_ERR);
-    ncvAssertCUDAReturn(cudaMallocHost(&h_norm_d, normStep * roi.height * sizeof(Ncv32u)), NPPST_MEM_ALLOC_ERR);
-    ncvAssertCUDAReturn(cudaMemcpy(h_sum, d_sum, sumStep * ExtHeight * sizeof(Ncv32u), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
-    ncvAssertCUDAReturn(cudaMemcpy(h_sqsum, d_sqsum, sqsumStep * ExtHeight * sizeof(Ncv64u), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
-    ncvAssertCUDAReturn(cudaMemcpy(h_norm_d, d_norm, normStep * roi.height * sizeof(Ncv32f), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
-    Ncv32f *h_norm_h;
-    ncvAssertCUDAReturn(cudaMallocHost(&h_norm_h, normStep * roi.height * sizeof(Ncv32u)), NPPST_MEM_ALLOC_ERR);
-    ncvAssertReturnNcvStat(nppRectStdDev_32f_C1R_host(h_sum, sqsumStep, h_sqsum, sqsumStep, h_norm_h, normStep, roi, rect, scaleArea));
-    const Ncv64f relEPS = 0.005;
-    bool bPass = true;
-    for (Ncv32u i=0; i<roi.height && bPass; i++)
-    {
-        for (Ncv32u j=0; j<roi.width && bPass; j++)
-        {
-            Ncv64f absErr = fabs(h_norm_h[i * normStep + j] - h_norm_d[i * normStep + j]);
-            Ncv64f relErr = absErr / h_norm_h[i * normStep + j];
-            if (relErr > relEPS)
-            {
-                printf("::ncvRectStdDev_32f_C1R self test failed: i=%d, j=%d, cpu=%f, gpu=%f\n", i, j, h_norm_h[i * normStep + j], h_norm_d[i * normStep + j]);
-                bPass = false;
-            }
-        }
-    }
-    ncvAssertCUDAReturn(cudaFreeHost(h_sum), NPPST_MEMFREE_ERR);
-    ncvAssertCUDAReturn(cudaFreeHost(h_sqsum), NPPST_MEMFREE_ERR);
-    ncvAssertCUDAReturn(cudaFreeHost(h_norm_d), NPPST_MEMFREE_ERR);
-    ncvAssertCUDAReturn(cudaFreeHost(h_norm_h), NPPST_MEMFREE_ERR);
-    printf("::ncvRectStdDev_32f_C1R %s\n", bPass?"PASSED":"FAILED");
-#endif
    return NPPST_SUCCESS;
 }
@@ -1251,34 +1150,6 @@ NCVStatus transposeWrapperDevice(T *d_src, Ncv32u srcStride,
        (d_src, srcStride, d_dst, dstStride, srcRoi);
    ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
-#if defined _SELF_TEST_
-    Ncv32u widthExt = grid.x * TRANSPOSE_TILE_DIM;
-    Ncv32u heightExt = grid.y * TRANSPOSE_TILE_DIM;
-    T *h_src;
-    T *h_dst;
-    ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStride * heightExt * sizeof(T)), NPPST_MEM_ALLOC_ERR);
-    ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * widthExt * sizeof(T)), NPPST_MEM_ALLOC_ERR);
-    memset(h_src, 0, srcStride * heightExt * sizeof(T));
-    memset(h_dst, 0, dstStride * widthExt * sizeof(T));
-    ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStride * heightExt * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
-    ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * widthExt * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
-    NcvBool bPass = true;
-    for (Ncv32u i=0; i<srcRoi.height && bPass; i++)
-    {
-        for (Ncv32u j=0; j<srcRoi.width && bPass; j++)
-        {
-            if (h_src[i * srcStride + j] != h_dst[j * dstStride + i])
-            {
-                printf("CIntegralImage::transposeWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, h_src[j * srcStride + i], h_dst[i * dstStride + j]);
-                bPass = false;
-            }
-        }
-    }
-    ncvAssertCUDAReturn(cudaFreeHost(h_src), NPPST_MEMFREE_ERR);
-    ncvAssertCUDAReturn(cudaFreeHost(h_dst), NPPST_MEMFREE_ERR);
-    printf("CIntegralImage::transposeWrapperDevice %s\n", bPass?"PASSED":"FAILED");
-#endif
    return NPPST_SUCCESS;
 }
@@ -1341,6 +1212,20 @@ implementNppTransposeHost(64,s)
 implementNppTransposeHost(64,f)
+// Transpose for images with 128-bit pixels of any type: both untyped buffers are
+// viewed as uint4 elements and the work is forwarded to transposeWrapperDevice<uint4>.
+NCVStatus nppiStTranspose_128_C1R(void *d_src, Ncv32u srcStep,
+                                  void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi)
+{
+    uint4 *d_srcPix = (uint4 *)d_src;
+    uint4 *d_dstPix = (uint4 *)d_dst;
+    NCVStatus ncvStat = transposeWrapperDevice<uint4>(d_srcPix, srcStep, d_dstPix, dstStep, srcRoi);
+    return ncvStat;
+}
+// Host counterpart of the 128-bit transpose: both untyped buffers are viewed as
+// uint4 elements and the work is forwarded to transposeWrapperHost<uint4>.
+// NOTE(review): the parameters carry a d_ prefix although they are handed to the host wrapper.
+NCVStatus nppiStTranspose_128_C1R_host(void *d_src, Ncv32u srcStep,
+                                       void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi)
+{
+    uint4 *srcPix = (uint4 *)d_src;
+    uint4 *dstPix = (uint4 *)d_dst;
+    NCVStatus ncvStat = transposeWrapperHost<uint4>(srcPix, srcStep, dstPix, dstStep, srcRoi);
+    return ncvStat;
+}
 //==============================================================================
 //
 // Compact.cu

--- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.hpp
+++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.hpp
@@ -96,65 +96,65 @@ cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream);
 * \return NCV status code
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_32u_C1R(Ncv32u *d_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_32u_C1R(Ncv32u *d_src, Ncv32u srcStep,
-                                          Ncv32u *d_dst, Ncv32u dstStep,
+                                 Ncv32u *d_dst, Ncv32u dstStep,
-                                          NcvSize32u srcRoi, Ncv32u scale,
+                                 NcvSize32u srcRoi, Ncv32u scale,
-                                          NcvBool readThruTexture);
+                                 NcvBool readThruTexture);
 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel.
- * \see nppiStDownsampleNearest_32u_C1R
+ * \see nppiStDecimate_32u_C1R
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_32s_C1R(Ncv32s *d_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_32s_C1R(Ncv32s *d_src, Ncv32u srcStep,
-                                          Ncv32s *d_dst, Ncv32u dstStep,
+                                 Ncv32s *d_dst, Ncv32u dstStep,
-                                          NcvSize32u srcRoi, Ncv32u scale,
+                                 NcvSize32u srcRoi, Ncv32u scale,
-                                          NcvBool readThruTexture);
+                                 NcvBool readThruTexture);
 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel.
- * \see nppiStDownsampleNearest_32u_C1R
+ * \see nppiStDecimate_32u_C1R
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_32f_C1R(Ncv32f *d_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_32f_C1R(Ncv32f *d_src, Ncv32u srcStep,
-                                          Ncv32f *d_dst, Ncv32u dstStep,
+                                 Ncv32f *d_dst, Ncv32u dstStep,
-                                          NcvSize32u srcRoi, Ncv32u scale,
+                                 NcvSize32u srcRoi, Ncv32u scale,
-                                          NcvBool readThruTexture);
+                                 NcvBool readThruTexture);
 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel.
-* \see nppiStDownsampleNearest_32u_C1R
+* \see nppiStDecimate_32u_C1R
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_64u_C1R(Ncv64u *d_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_64u_C1R(Ncv64u *d_src, Ncv32u srcStep,
-                                          Ncv64u *d_dst, Ncv32u dstStep,
+                                 Ncv64u *d_dst, Ncv32u dstStep,
-                                          NcvSize32u srcRoi, Ncv32u scale,
+                                 NcvSize32u srcRoi, Ncv32u scale,
-                                          NcvBool readThruTexture);
+                                 NcvBool readThruTexture);
 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel.
- * \see nppiStDownsampleNearest_32u_C1R
+ * \see nppiStDecimate_32u_C1R
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_64s_C1R(Ncv64s *d_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_64s_C1R(Ncv64s *d_src, Ncv32u srcStep,
-                                          Ncv64s *d_dst, Ncv32u dstStep,
+                                 Ncv64s *d_dst, Ncv32u dstStep,
-                                          NcvSize32u srcRoi, Ncv32u scale,
+                                 NcvSize32u srcRoi, Ncv32u scale,
-                                          NcvBool readThruTexture);
+                                 NcvBool readThruTexture);
 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel.
- * \see nppiStDownsampleNearest_32u_C1R
+ * \see nppiStDecimate_32u_C1R
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_64f_C1R(Ncv64f *d_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_64f_C1R(Ncv64f *d_src, Ncv32u srcStep,
-                                          Ncv64f *d_dst, Ncv32u dstStep,
+                                 Ncv64f *d_dst, Ncv32u dstStep,
-                                          NcvSize32u srcRoi, Ncv32u scale,
+                                 NcvSize32u srcRoi, Ncv32u scale,
-                                          NcvBool readThruTexture);
+                                 NcvBool readThruTexture);
 /**
@@ -170,59 +170,59 @@ NCVStatus nppiStDownsampleNearest_64f_C1R(Ncv64f *d_src, Ncv32u srcStep,
 * \return NCV status code
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_32u_C1R_host(Ncv32u *h_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_32u_C1R_host(Ncv32u *h_src, Ncv32u srcStep,
-                                               Ncv32u *h_dst, Ncv32u dstStep,
+                                      Ncv32u *h_dst, Ncv32u dstStep,
-                                               NcvSize32u srcRoi, Ncv32u scale);
+                                      NcvSize32u srcRoi, Ncv32u scale);
 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel. Host implementation.
- * \see nppiStDownsampleNearest_32u_C1R_host
+ * \see nppiStDecimate_32u_C1R_host
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_32s_C1R_host(Ncv32s *h_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_32s_C1R_host(Ncv32s *h_src, Ncv32u srcStep,
-                                               Ncv32s *h_dst, Ncv32u dstStep,
+                                      Ncv32s *h_dst, Ncv32u dstStep,
-                                               NcvSize32u srcRoi, Ncv32u scale);
+                                      NcvSize32u srcRoi, Ncv32u scale);
 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel. Host implementation.
- * \see nppiStDownsampleNearest_32u_C1R_host
+ * \see nppiStDecimate_32u_C1R_host
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_32f_C1R_host(Ncv32f *h_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_32f_C1R_host(Ncv32f *h_src, Ncv32u srcStep,
-                                               Ncv32f *h_dst, Ncv32u dstStep,
+                                      Ncv32f *h_dst, Ncv32u dstStep,
-                                               NcvSize32u srcRoi, Ncv32u scale);
+                                      NcvSize32u srcRoi, Ncv32u scale);
 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel. Host implementation.
- * \see nppiStDownsampleNearest_32u_C1R_host
+ * \see nppiStDecimate_32u_C1R_host
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_64u_C1R_host(Ncv64u *h_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_64u_C1R_host(Ncv64u *h_src, Ncv32u srcStep,
-                                               Ncv64u *h_dst, Ncv32u dstStep,
+                                      Ncv64u *h_dst, Ncv32u dstStep,
-                                               NcvSize32u srcRoi, Ncv32u scale);
+                                      NcvSize32u srcRoi, Ncv32u scale);
 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel. Host implementation.
- * \see nppiStDownsampleNearest_32u_C1R_host
+ * \see nppiStDecimate_32u_C1R_host
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_64s_C1R_host(Ncv64s *h_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_64s_C1R_host(Ncv64s *h_src, Ncv32u srcStep,
-                                               Ncv64s *h_dst, Ncv32u dstStep,
+                                      Ncv64s *h_dst, Ncv32u dstStep,
-                                               NcvSize32u srcRoi, Ncv32u scale);
+                                      NcvSize32u srcRoi, Ncv32u scale);
 /**
 * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel. Host implementation.
- * \see nppiStDownsampleNearest_32u_C1R_host
+ * \see nppiStDecimate_32u_C1R_host
 */
 NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStep,
-                                               Ncv64f *h_dst, Ncv32u dstStep,
+                                      Ncv64f *h_dst, Ncv32u dstStep,
-                                               NcvSize32u srcRoi, Ncv32u scale);
+                                      NcvSize32u srcRoi, Ncv32u scale);
 /**
@@ -333,6 +333,15 @@ NCVStatus nppiStTranspose_64f_C1R(Ncv64f *d_src, Ncv32u srcStride,
                                  Ncv64f *d_dst, Ncv32u dstStride, NcvSize32u srcRoi);
+/**
+ * Transposes an image. 128-bit pixels of any type, single channel
+ * \see nppiStTranspose_32u_C1R
+ */
+NCV_EXPORTS
+NCVStatus nppiStTranspose_128_C1R(void *d_src, Ncv32u srcStep,
+                                  void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi);
 /**
 * Transposes an image. 32-bit unsigned pixels, single channel. Host implementation
 *
@@ -394,6 +403,15 @@ NCVStatus nppiStTranspose_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStride,
                                       Ncv64f *h_dst, Ncv32u dstStride, NcvSize32u srcRoi);
+/**
+ * Transposes an image. 128-bit pixels of any type, single channel. Host implementation
+ * \see nppiStTranspose_32u_C1R_host
+ */
+NCV_EXPORTS
+NCVStatus nppiStTranspose_128_C1R_host(void *d_src, Ncv32u srcStep,
+                                       void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi);
 /**
 * Calculates the size of the temporary buffer for integral image creation
 *

--- a/modules/gpu/src/nvidia/core/NCV.cpp
+++ b/modules/gpu/src/nvidia/core/NCV.cpp
@@ -40,14 +40,9 @@
 //M*/
-#if !defined (HAVE_CUDA)
-#else /* !defined (HAVE_CUDA) */
 #include <ios>
 #include <stdarg.h>
+#include <vector>
 #include "NCV.hpp"
@@ -182,6 +177,78 @@ NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType, const void *src, NC
 }
+/**
+ * Copies a 2D region (height rows of widthbytes bytes each) between two pitched buffers.
+ *
+ * Host-to-host copies are performed row by row with memcpy. Every combination that
+ * involves device memory is issued through cudaMemcpy2DAsync when cuStream is non-zero
+ * and through cudaMemcpy2D otherwise.
+ *
+ * \param dst         Destination buffer
+ * \param dstPitch    Distance between consecutive destination rows (used as a byte offset on the host path)
+ * \param dstType     Memory type of the destination buffer
+ * \param src         Source buffer
+ * \param srcPitch    Distance between consecutive source rows (used as a byte offset on the host path)
+ * \param srcType     Memory type of the source buffer
+ * \param widthbytes  Number of bytes copied per row
+ * \param height      Number of rows to copy
+ * \param cuStream    CUDA stream for the asynchronous path; 0 selects the synchronous calls
+ *
+ * \return NCV_SUCCESS when the copy was issued, NCV_MEM_RESIDENCE_ERROR for an unsupported
+ *         combination of memory types; failing CUDA calls are reported through
+ *         ncvAssertCUDAReturn with NCV_CUDA_ERROR
+ */
+NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
+                             const void *src, Ncv32u srcPitch, NCVMemoryType srcType,
+                             Ncv32u widthbytes, Ncv32u height, cudaStream_t cuStream)
+{
+    NCVStatus ncvStat;
+    switch (dstType)
+    {
+    case NCVMemoryTypeHostPageable:
+    case NCVMemoryTypeHostPinned:
+        switch (srcType)
+        {
+        case NCVMemoryTypeHostPageable:
+        case NCVMemoryTypeHostPinned:
+            for (Ncv32u i=0; i<height; i++)
+            {
+                //row offsets are computed in size_t: a 32-bit i * pitch product wraps
+                //around for buffers larger than 4GB and would address the wrong row
+                memcpy((char*)dst + (size_t)i * dstPitch, (const char*)src + (size_t)i * srcPitch, widthbytes);
+            }
+            ncvStat = NCV_SUCCESS;
+            break;
+        case NCVMemoryTypeDevice:
+            if (cuStream != 0)
+            {
+                ncvAssertCUDAReturn(cudaMemcpy2DAsync(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
+            }
+            else
+            {
+                ncvAssertCUDAReturn(cudaMemcpy2D(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
+            }
+            ncvStat = NCV_SUCCESS;
+            break;
+        default:
+            //source is neither host nor device memory
+            ncvStat = NCV_MEM_RESIDENCE_ERROR;
+        }
+        break;
+    case NCVMemoryTypeDevice:
+        switch (srcType)
+        {
+        case NCVMemoryTypeHostPageable:
+        case NCVMemoryTypeHostPinned:
+            if (cuStream != 0)
+            {
+                ncvAssertCUDAReturn(cudaMemcpy2DAsync(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
+            }
+            else
+            {
+                ncvAssertCUDAReturn(cudaMemcpy2D(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
+            }
+            ncvStat = NCV_SUCCESS;
+            break;
+        case NCVMemoryTypeDevice:
+            if (cuStream != 0)
+            {
+                ncvAssertCUDAReturn(cudaMemcpy2DAsync(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToDevice, cuStream), NCV_CUDA_ERROR);
+            }
+            else
+            {
+                ncvAssertCUDAReturn(cudaMemcpy2D(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToDevice), NCV_CUDA_ERROR);
+            }
+            ncvStat = NCV_SUCCESS;
+            break;
+        default:
+            //source is neither host nor device memory
+            ncvStat = NCV_MEM_RESIDENCE_ERROR;
+        }
+        break;
+    default:
+        //destination is neither host nor device memory
+        ncvStat = NCV_MEM_RESIDENCE_ERROR;
+    }
+    return ncvStat;
+}
 //===================================================================
 //
 // NCVMemStackAllocator class members implementation
@@ -195,8 +262,10 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
    _maxSize(0),
    allocBegin(NULL),
    begin(NULL),
+    end(NULL),
    _memType(NCVMemoryTypeNone),
-    _alignment(alignment)
+    _alignment(alignment),
+    bReusesMemory(false)
 {
    NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
    ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");
@@ -573,4 +642,264 @@ double ncvEndQueryTimerMs(NcvTimer t)
    return res;
 }
-#endif /* !defined (HAVE_CUDA) */
+//===================================================================
+//
+// Operations with rectangles
+//
+//===================================================================
+//from OpenCV
+void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights);
+// Groups rectangle hypotheses held in host memory by delegating to groupRectangles().
+//
+// hypotheses        - in/out host vector; its first numHypotheses entries are read and
+//                     then overwritten with the grouped rectangles
+// numHypotheses     - in: number of input rectangles; out: number of grouped rectangles
+// minNeighbors      - forwarded to groupRectangles() as its group threshold
+// intersectEps      - forwarded to groupRectangles() as its eps
+// hypothesesWeights - optional host vector receiving one weight per grouped rectangle;
+//                     may be NULL
+//
+// Returns NCV_MEM_RESIDENCE_ERROR if either vector is not host-resident, NCV_SUCCESS otherwise.
+// NOTE(review): the caller is assumed to provide vectors with at least numHypotheses
+// elements; this is not checked here.
+NCVStatus ncvGroupRectangles_host(NCVVector<NcvRect32u> &hypotheses,
+                                  Ncv32u &numHypotheses,
+                                  Ncv32u minNeighbors,
+                                  Ncv32f intersectEps,
+                                  NCVVector<Ncv32u> *hypothesesWeights)
+{
+    ncvAssertReturn(hypotheses.memType() == NCVMemoryTypeHostPageable ||
+                    hypotheses.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
+    if (hypothesesWeights != NULL)
+    {
+        ncvAssertReturn(hypothesesWeights->memType() == NCVMemoryTypeHostPageable ||
+                        hypothesesWeights->memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
+    }
+    if (numHypotheses == 0)
+    {
+        return NCV_SUCCESS;
+    }
+    //stage the input in std::vector, which is the container groupRectangles() works on
+    std::vector<NcvRect32u> rects(numHypotheses);
+    memcpy(&rects[0], hypotheses.ptr(), numHypotheses * sizeof(NcvRect32u));
+    std::vector<Ncv32u> weights;
+    std::vector<Ncv32u> *pWeights = (hypothesesWeights != NULL) ? &weights : NULL;
+    groupRectangles(rects, minNeighbors, intersectEps, pWeights);
+    numHypotheses = (Ncv32u)rects.size();
+    if (numHypotheses > 0)
+    {
+        memcpy(hypotheses.ptr(), &rects[0], numHypotheses * sizeof(NcvRect32u));
+    }
+    if (hypothesesWeights != NULL)
+    {
+        //grouping may leave nothing behind: never take &weights[0] of an empty vector,
+        //and never read more weights than were actually produced
+        size_t numWeights = weights.size();
+        if (numWeights > numHypotheses)
+        {
+            numWeights = numHypotheses;
+        }
+        if (numWeights > 0)
+        {
+            memcpy(hypothesesWeights->ptr(), &weights[0], numWeights * sizeof(Ncv32u));
+        }
+    }
+    return NCV_SUCCESS;
+}
+// Host reference implementation: draws the one-pixel outline of every rectangle in
+// h_rects into the image h_dst, clipping each edge against the image bounds.
+//
+// h_dst     - destination image, indexed as h_dst[row * dstStride + column]
+// dstStride - row stride of h_dst in elements (must be >= dstWidth)
+// dstWidth  - image width in elements
+// dstHeight - image height in rows
+// h_rects   - array of numRects rectangles
+// numRects  - number of rectangles; 0 is a valid no-op
+// color     - value written to every outline pixel
+//
+// Returns NCV_NULL_PTR, NCV_DIMENSIONS_INVALID or NCV_INVALID_STEP on bad arguments,
+// NCV_SUCCESS otherwise.
+template <class T>
+static NCVStatus drawRectsWrapperHost(T *h_dst,
+                                      Ncv32u dstStride,
+                                      Ncv32u dstWidth,
+                                      Ncv32u dstHeight,
+                                      NcvRect32u *h_rects,
+                                      Ncv32u numRects,
+                                      T color)
+{
+    ncvAssertReturn(h_dst != NULL && h_rects != NULL, NCV_NULL_PTR);
+    ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
+    ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
+    ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);
+    //an empty list is not an error: plain early return, same as drawRectsWrapperDevice
+    if (numRects == 0)
+    {
+        return NCV_SUCCESS;
+    }
+    for (Ncv32u rectIdx = 0; rectIdx < numRects; rectIdx++)
+    {
+        NcvRect32u rect = h_rects[rectIdx];
+        //exclusive end coordinates of the rectangle, clipped to the image
+        Ncv32u rowEnd = rect.y + rect.height;
+        if (rowEnd > dstHeight)
+        {
+            rowEnd = dstHeight;
+        }
+        Ncv32u colEnd = rect.x + rect.width;
+        if (colEnd > dstWidth)
+        {
+            colEnd = dstWidth;
+        }
+        //left edge
+        if (rect.x < dstWidth)
+        {
+            for (Ncv32u row = rect.y; row < rowEnd; row++)
+            {
+                h_dst[row*dstStride+rect.x] = color;
+            }
+        }
+        //right edge
+        if (rect.x+rect.width-1 < dstWidth)
+        {
+            for (Ncv32u row = rect.y; row < rowEnd; row++)
+            {
+                h_dst[row*dstStride+rect.x+rect.width-1] = color;
+            }
+        }
+        //top edge
+        if (rect.y < dstHeight)
+        {
+            for (Ncv32u col = rect.x; col < colEnd; col++)
+            {
+                h_dst[rect.y*dstStride+col] = color;
+            }
+        }
+        //bottom edge
+        if (rect.y + rect.height - 1 < dstHeight)
+        {
+            for (Ncv32u col = rect.x; col < colEnd; col++)
+            {
+                h_dst[(rect.y+rect.height-1)*dstStride+col] = color;
+            }
+        }
+    }
+    return NCV_SUCCESS;
+}
+// Public host entry point for 8-bit images: forwards all arguments unchanged to the
+// Ncv8u instantiation of drawRectsWrapperHost and returns its status.
+NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
+                               Ncv32u dstStride,
+                               Ncv32u dstWidth,
+                               Ncv32u dstHeight,
+                               NcvRect32u *h_rects,
+                               Ncv32u numRects,
+                               Ncv8u color)
+{
+    const NCVStatus ncvStat = drawRectsWrapperHost<Ncv8u>(h_dst, dstStride, dstWidth, dstHeight,
+                                                          h_rects, numRects, color);
+    return ncvStat;
+}
+// Public host entry point for 32-bit images: forwards all arguments unchanged to the
+// Ncv32u instantiation of drawRectsWrapperHost and returns its status.
+NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
+                                Ncv32u dstStride,
+                                Ncv32u dstWidth,
+                                Ncv32u dstHeight,
+                                NcvRect32u *h_rects,
+                                Ncv32u numRects,
+                                Ncv32u color)
+{
+    const NCVStatus ncvStat = drawRectsWrapperHost<Ncv32u>(h_dst, dstStride, dstWidth, dstHeight,
+                                                           h_rects, numRects, color);
+    return ncvStat;
+}
+//Threads per block used for drawRects, and its base-2 logarithm (keep the two in sync)
+const Ncv32u NUMTHREADS_DRAWRECTS = 32;
+const Ncv32u NUMTHREADS_DRAWRECTS_LOG2 = 5;
+// Kernel: each thread block draws ONE edge of ONE rectangle.
+//
+// Launch layout expected:
+//   - linear block id = blockIdx.y * 65535 + blockIdx.x, i.e. a 1D grid, or a 2D grid
+//     whose x extent is exactly 65535;
+//   - at least numRects * 4 blocks in total (surplus blocks exit immediately);
+//   - the edge loops advance in chunks of NUMTHREADS_DRAWRECTS indexed by threadIdx.x,
+//     so the block is expected to be 1D with NUMTHREADS_DRAWRECTS threads.
+// Linear block id decoding:
+//   - blockId >> 2  : index of the rectangle in d_rects
+//   - bit 0         : 1 = vertical edge, 0 = horizontal edge
+//   - bit 1         : 1 = left/top edge, 0 = right/bottom edge
+// Pixels falling outside dstWidth x dstHeight are skipped. No shared memory is used.
+template <class T>
+__global__ void drawRects(T *d_dst,
+                          Ncv32u dstStride,
+                          Ncv32u dstWidth,
+                          Ncv32u dstHeight,
+                          NcvRect32u *d_rects,
+                          Ncv32u numRects,
+                          T color)
+{
+    Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
+    //valid ids are [0, numRects*4); a 2D grid is rounded up, so surplus blocks exist and
+    //the id equal to numRects*4 must be rejected too, or d_rects[numRects] would be read
+    if (blockId >= numRects * 4)
+    {
+        return;
+    }
+    NcvRect32u curRect = d_rects[blockId >> 2];
+    NcvBool bVertical = blockId & 0x1;
+    NcvBool bTopLeft = blockId & 0x2;
+    Ncv32u pt0x, pt0y;
+    if (bVertical)
+    {
+        //number of NUMTHREADS_DRAWRECTS-sized chunks needed to cover the edge (ceil-div)
+        Ncv32u numChunks = (curRect.height + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;
+        pt0x = bTopLeft ? curRect.x : curRect.x + curRect.width - 1;
+        pt0y = curRect.y;
+        if (pt0x < dstWidth)
+        {
+            for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
+            {
+                Ncv32u ptY = pt0y + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
+                if (ptY < pt0y + curRect.height && ptY < dstHeight)
+                {
+                    d_dst[ptY * dstStride + pt0x] = color;
+                }
+            }
+        }
+    }
+    else
+    {
+        Ncv32u numChunks = (curRect.width + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;
+        pt0x = curRect.x;
+        pt0y = bTopLeft ? curRect.y : curRect.y + curRect.height - 1;
+        if (pt0y < dstHeight)
+        {
+            for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
+            {
+                Ncv32u ptX = pt0x + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
+                if (ptX < pt0x + curRect.width && ptX < dstWidth)
+                {
+                    d_dst[pt0y * dstStride + ptX] = color;
+                }
+            }
+        }
+    }
+}
+// Validates arguments and launches the drawRects kernel, which draws the outline of
+// every rectangle in d_rects into the image d_dst.
+//
+// d_dst     - destination image, indexed by the kernel as d_dst[row * dstStride + column]
+// dstStride - row stride of d_dst in elements (must be >= dstWidth)
+// dstWidth  - image width in elements
+// dstHeight - image height in rows
+// d_rects   - array of numRects rectangles
+// numRects  - number of rectangles; 0 is a valid no-op
+// color     - value written to every outline pixel
+// cuStream  - CUDA stream the kernel is enqueued on
+//
+// The launch is not followed by a synchronization; only errors already visible to
+// cudaGetLastError() are reported.
+// Returns NCV_NULL_PTR, NCV_DIMENSIONS_INVALID, NCV_INVALID_STEP or NCV_CUDA_ERROR on
+// failure, NCV_SUCCESS otherwise.
+template <class T>
+static NCVStatus drawRectsWrapperDevice(T *d_dst,
+                                        Ncv32u dstStride,
+                                        Ncv32u dstWidth,
+                                        Ncv32u dstHeight,
+                                        NcvRect32u *d_rects,
+                                        Ncv32u numRects,
+                                        T color,
+                                        cudaStream_t cuStream)
+{
+    ncvAssertReturn(d_dst != NULL && d_rects != NULL, NCV_NULL_PTR);
+    ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
+    ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
+    ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);
+    if (numRects == 0)
+    {
+        return NCV_SUCCESS;
+    }
+    //one block per rectangle edge
+    dim3 grid(numRects * 4);
+    dim3 block(NUMTHREADS_DRAWRECTS);
+    //fold an oversized 1D grid into 2D; the kernel rebuilds the linear id as
+    //blockIdx.y * 65535 + blockIdx.x and discards the surplus blocks of the last row
+    if (grid.x > 65535)
+    {
+        grid.y = (grid.x + 65534) / 65535;
+        grid.x = 65535;
+    }
+    //honour the caller's stream instead of always using the default one
+    drawRects<T><<<grid, block, 0, cuStream>>>(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color);
+    ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
+    return NCV_SUCCESS;
+}
+// Public device entry point for 8-bit images: forwards all arguments unchanged to the
+// Ncv8u instantiation of drawRectsWrapperDevice and returns its status.
+NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
+                                 Ncv32u dstStride,
+                                 Ncv32u dstWidth,
+                                 Ncv32u dstHeight,
+                                 NcvRect32u *d_rects,
+                                 Ncv32u numRects,
+                                 Ncv8u color,
+                                 cudaStream_t cuStream)
+{
+    const NCVStatus ncvStat = drawRectsWrapperDevice<Ncv8u>(d_dst, dstStride, dstWidth, dstHeight,
+                                                            d_rects, numRects, color, cuStream);
+    return ncvStat;
+}
+// Public device entry point for 32-bit images: forwards all arguments unchanged to the
+// Ncv32u instantiation of drawRectsWrapperDevice and returns its status.
+NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
+                                  Ncv32u dstStride,
+                                  Ncv32u dstWidth,
+                                  Ncv32u dstHeight,
+                                  NcvRect32u *d_rects,
+                                  Ncv32u numRects,
+                                  Ncv32u color,
+                                  cudaStream_t cuStream)
+{
+    const NCVStatus ncvStat = drawRectsWrapperDevice<Ncv32u>(d_dst, dstStride, dstWidth, dstHeight,
+                                                             d_rects, numRects, color, cuStream);
+    return ncvStat;
+}
--- a/modules/gpu/src/nvidia/core/NCV.hpp
+++ b/modules/gpu/src/nvidia/core/NCV.hpp
@@ -129,8 +129,8 @@ struct NcvRect8u
    Ncv8u y;
    Ncv8u width;
    Ncv8u height;
-    NcvRect8u() : x(0), y(0), width(0), height(0) {};
+    __host__ __device__ NcvRect8u() : x(0), y(0), width(0), height(0) {};
-    NcvRect8u(Ncv8u x, Ncv8u y, Ncv8u width, Ncv8u height) : x(x), y(y), width(width), height(height) {}
+    __host__ __device__ NcvRect8u(Ncv8u x, Ncv8u y, Ncv8u width, Ncv8u height) : x(x), y(y), width(width), height(height) {}
 };
@@ -140,8 +140,8 @@ struct NcvRect32s
    Ncv32s y;          ///< y-coordinate of upper left corner.
    Ncv32s width;      ///< Rectangle width.
    Ncv32s height;     ///< Rectangle height.
-    NcvRect32s() : x(0), y(0), width(0), height(0) {};
+    __host__ __device__ NcvRect32s() : x(0), y(0), width(0), height(0) {};
-    NcvRect32s(Ncv32s x, Ncv32s y, Ncv32s width, Ncv32s height) : x(x), y(y), width(width), height(height) {}
+    __host__ __device__ NcvRect32s(Ncv32s x, Ncv32s y, Ncv32s width, Ncv32s height) : x(x), y(y), width(width), height(height) {}
 };
@@ -151,8 +151,8 @@ struct NcvRect32u
    Ncv32u y;          ///< y-coordinate of upper left corner.
    Ncv32u width;      ///< Rectangle width.
    Ncv32u height;     ///< Rectangle height.
-    NcvRect32u() : x(0), y(0), width(0), height(0) {};
+    __host__ __device__ NcvRect32u() : x(0), y(0), width(0), height(0) {};
-    NcvRect32u(Ncv32u x, Ncv32u y, Ncv32u width, Ncv32u height) : x(x), y(y), width(width), height(height) {}
+    __host__ __device__ NcvRect32u(Ncv32u x, Ncv32u y, Ncv32u width, Ncv32u height) : x(x), y(y), width(width), height(height) {}
 };
@@ -160,8 +160,8 @@ struct NcvSize32s
 {
    Ncv32s width;  ///< Rectangle width.
    Ncv32s height; ///< Rectangle height.
-    NcvSize32s() : width(0), height(0) {};
+    __host__ __device__ NcvSize32s() : width(0), height(0) {};
-    NcvSize32s(Ncv32s width, Ncv32s height) : width(width), height(height) {}
+    __host__ __device__ NcvSize32s(Ncv32s width, Ncv32s height) : width(width), height(height) {}
 };
@@ -169,8 +169,8 @@ struct NcvSize32u
 {
    Ncv32u width;  ///< Rectangle width.
    Ncv32u height; ///< Rectangle height.
-    NcvSize32u() : width(0), height(0) {};
+    __host__ __device__ NcvSize32u() : width(0), height(0) {};
-    NcvSize32u(Ncv32u width, Ncv32u height) : width(width), height(height) {}
+    __host__ __device__ NcvSize32u(Ncv32u width, Ncv32u height) : width(width), height(height) {}
 };
@@ -275,6 +275,7 @@ enum NCVStatus
 {
    //NCV statuses
    NCV_SUCCESS,
+    NCV_UNKNOWN_ERROR,
    NCV_CUDA_ERROR,
    NCV_NPP_ERROR,
@@ -501,13 +502,18 @@ private:
 /**
-* Copy dispatcher
+* Copy dispatchers
 */
 NCV_EXPORTS NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType,
                                       const void *src, NCVMemoryType srcType,
                                       size_t sz, cudaStream_t cuStream);
+NCV_EXPORTS NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
+                                         const void *src, Ncv32u srcPitch, NCVMemoryType srcType,
+                                         Ncv32u widthbytes, Ncv32u height, cudaStream_t cuStream);
 /**
 * NCVVector (1D)
 */
@@ -532,7 +538,7 @@ public:
        _memtype = NCVMemoryTypeNone;
    }
-    NCVStatus copySolid(NCVVector<T> &dst, cudaStream_t cuStream, size_t howMuch=0)
+    NCVStatus copySolid(NCVVector<T> &dst, cudaStream_t cuStream, size_t howMuch=0) const
    {
        if (howMuch == 0)
        {
@@ -600,7 +606,6 @@ public:
        this->_memtype = this->allocatedMem.begin.memtype;
    }
    ~NCVVectorAlloc()
    {
        NCVStatus ncvStat;
@@ -611,25 +616,22 @@ public:
        this->clear();
    }
    NcvBool isMemAllocated() const
    {
        return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting());
    }
    Ncv32u getAllocatorsAlignment() const
    {
        return allocator.alignment();
    }
    NCVMemSegment getSegment() const
    {
        return allocatedMem;
    }
-private:		
+private:
    INCVMemAllocator &allocator;
    NCVMemSegment allocatedMem;
 };
@@ -658,7 +660,6 @@ public:
        this->bReused = true;
    }
    NCVVectorReuse(const NCVMemSegment &memSegment, Ncv32u length)
    {
        this->bReused = false;
@@ -674,7 +675,6 @@ public:
        this->bReused = true;
    }
    NcvBool isMemReused() const
    {
        return this->bReused;
@@ -703,7 +703,6 @@ public:
    virtual ~NCVMatrix() {}
    void clear()
    {
        _ptr = NULL;
@@ -713,14 +712,13 @@ public:
        _memtype = NCVMemoryTypeNone;
    }
    Ncv32u stride() const
    {
        return _pitch / sizeof(T);
    }
+    //a side effect of this function is that it copies everything in a single chunk, so the "padding" will be overwritten
-    NCVStatus copySolid(NCVMatrix<T> &dst, cudaStream_t cuStream, size_t howMuch=0)
+    NCVStatus copySolid(NCVMatrix<T> &dst, cudaStream_t cuStream, size_t howMuch=0) const
    {
        if (howMuch == 0)
        {
@@ -748,6 +746,24 @@ public:
        return ncvStat;
    }
+    //Copies the top-left roi.width x roi.height region of this matrix into dst through
+    //memSegCopyHelper2D, using each matrix's own pitch.
+    //Fails with NCV_MEM_COPY_ERROR when the ROI does not fit in either matrix and with
+    //NCV_NULL_PTR when a typed matrix has a NULL pointer; a source of type
+    //NCVMemoryTypeNone makes the call a successful no-op.
+    NCVStatus copy2D(NCVMatrix<T> &dst, NcvSize32u roi, cudaStream_t cuStream) const
+    {
+        ncvAssertReturn(this->width() >= roi.width && this->height() >= roi.height &&
+                        dst.width() >= roi.width && dst.height() >= roi.height, NCV_MEM_COPY_ERROR);
+        ncvAssertReturn((this->_ptr != NULL || this->_memtype == NCVMemoryTypeNone) &&
+                        (dst._ptr != NULL || dst._memtype == NCVMemoryTypeNone), NCV_NULL_PTR);
+        //nothing to transfer when the source has no backing memory
+        if (this->_memtype == NCVMemoryTypeNone)
+        {
+            return NCV_SUCCESS;
+        }
+        return memSegCopyHelper2D(dst._ptr, dst._pitch, dst._memtype,
+                                  this->_ptr, this->_pitch, this->_memtype,
+                                  roi.width * sizeof(T), roi.height, cuStream);
+    }
    T *ptr() const {return this->_ptr;}
    Ncv32u width() const {return this->_width;}
    Ncv32u height() const {return this->_height;}
@@ -817,19 +833,16 @@ public:
        this->clear();
    }
    NcvBool isMemAllocated() const
    {
        return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting());
    }
    Ncv32u getAllocatorsAlignment() const
    {
        return allocator.alignment();
    }
    NCVMemSegment getSegment() const
    {
        return allocatedMem;
@@ -888,6 +901,23 @@ public:
        this->bReused = true;
    }
+    //Binds this matrix to a rectangular sub-region (ROI) of an existing matrix without
+    //copying: the pitch and memory type are taken from mat, and the data pointer is
+    //offset to the ROI's top-left element.
+    //If the ROI does not lie inside mat, a message is printed and the object is left
+    //cleared with isMemReused() == false.
+    NCVMatrixReuse(const NCVMatrix<T> &mat, NcvRect32u roi)
+    {
+        this->bReused = false;
+        this->clear();
+        //the extents are compared against the remaining space (dim - origin) rather than
+        //via origin + extent, which could wrap around in 32-bit unsigned arithmetic
+        ncvAssertPrintReturn(roi.x < mat.width() && roi.y < mat.height() &&
+            roi.width <= mat.width() - roi.x && roi.height <= mat.height() - roi.y,
+            "NCVMatrixReuse ctor:: memory binding failed due to mismatching ROI and source matrix dims", );
+        this->_width = roi.width;
+        this->_height = roi.height;
+        this->_pitch = mat.pitch();
+        //element offset computed in size_t so that row * stride cannot overflow 32 bits
+        this->_ptr = mat.ptr() + ((size_t)roi.y * mat.stride() + roi.x);
+        this->_memtype = mat.memType();
+        this->bReused = true;
+    }
    NcvBool isMemReused() const
    {
@@ -899,4 +929,27 @@ private:
    NcvBool bReused;
 };
+/**
+* Operations with rectangles
+*/
+NCV_EXPORTS NCVStatus ncvGroupRectangles_host(NCVVector<NcvRect32u> &hypotheses, Ncv32u &numHypotheses,
+                                              Ncv32u minNeighbors, Ncv32f intersectEps, NCVVector<Ncv32u> *hypothesesWeights);
+NCV_EXPORTS NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
+                                           NcvRect32u *h_rects, Ncv32u numRects, Ncv8u color);
+NCV_EXPORTS NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
+                                            NcvRect32u *h_rects, Ncv32u numRects, Ncv32u color);
+NCV_EXPORTS NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
+                                             NcvRect32u *d_rects, Ncv32u numRects, Ncv8u color, cudaStream_t cuStream);
+NCV_EXPORTS NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
+                                              NcvRect32u *d_rects, Ncv32u numRects, Ncv32u color, cudaStream_t cuStream);
 #endif // _ncv_hpp_
--- a/modules/gpu/src/nvidia/core/NCVRuntimeTemplates.hpp
+++ b/modules/gpu/src/nvidia/core/NCVRuntimeTemplates.hpp
@@ -150,14 +150,14 @@ namespace NCVRuntimeTemplateBool
    {
        //Convenience function used by the user
        //Takes a variable argument list, transforms it into a list
-        static void call(Func *functor, int dummy, ...)
+        static void call(Func *functor, ...)
        {
            //Vector used to collect arguments
            std::vector<int> templateParamList;
            //Variable argument list manipulation
            va_list listPointer;
-            va_start(listPointer, dummy);
+            va_start(listPointer, functor);
            //Collect parameters into the list
            for(int i=0; i<NumArguments; i++)
            {

--- a/modules/gpu/test/nvidia/TestHypothesesFilter.cpp
+++ b/modules/gpu/test/nvidia/TestHypothesesFilter.cpp
@@ -134,7 +134,7 @@ bool TestHypothesesFilter::process()
    Ncv32u numHypothesesSrc = h_vecSrc.length();
    NCV_SKIP_COND_BEGIN
-    ncvStat = ncvFilterHypotheses_host(h_vecSrc, numHypothesesSrc, this->minNeighbors, this->eps, NULL);
+    ncvStat = ncvGroupRectangles_host(h_vecSrc, numHypothesesSrc, this->minNeighbors, this->eps, NULL);
    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
    NCV_SKIP_COND_END

--- a/modules/gpu/test/nvidia/TestResize.cpp
+++ b/modules/gpu/test/nvidia/TestResize.cpp
@@ -83,17 +83,17 @@ bool TestResize<T>::process()
    NCV_SKIP_COND_BEGIN
    if (sizeof(T) == sizeof(Ncv32u))
    {
-        ncvStat = nppiStDownsampleNearest_32u_C1R((Ncv32u *)d_img.ptr(), d_img.pitch(),
+        ncvStat = nppiStDecimate_32u_C1R((Ncv32u *)d_img.ptr(), d_img.pitch(),
-                                                  (Ncv32u *)d_small.ptr(), d_small.pitch(),
+                                         (Ncv32u *)d_small.ptr(), d_small.pitch(),
-                                                  srcSize, this->scaleFactor,
+                                         srcSize, this->scaleFactor,
-                                                  this->bTextureCache);
+                                         this->bTextureCache);
    }
    else if (sizeof(T) == sizeof(Ncv64u))
    {
-        ncvStat = nppiStDownsampleNearest_64u_C1R((Ncv64u *)d_img.ptr(), d_img.pitch(),
+        ncvStat = nppiStDecimate_64u_C1R((Ncv64u *)d_img.ptr(), d_img.pitch(),
-                                                  (Ncv64u *)d_small.ptr(), d_small.pitch(),
+                                         (Ncv64u *)d_small.ptr(), d_small.pitch(),
-                                                  srcSize, this->scaleFactor,
+                                         srcSize, this->scaleFactor,
-                                                  this->bTextureCache);
+                                         this->bTextureCache);
    }
    else
    {
@@ -107,15 +107,15 @@ bool TestResize<T>::process()
    NCV_SKIP_COND_BEGIN
    if (sizeof(T) == sizeof(Ncv32u))
    {
-        ncvStat = nppiStDownsampleNearest_32u_C1R_host((Ncv32u *)h_img.ptr(), h_img.pitch(),
+        ncvStat = nppiStDecimate_32u_C1R_host((Ncv32u *)h_img.ptr(), h_img.pitch(),
-                                                       (Ncv32u *)h_small.ptr(), h_small.pitch(),
+                                              (Ncv32u *)h_small.ptr(), h_small.pitch(),
-                                                       srcSize, this->scaleFactor);
+                                              srcSize, this->scaleFactor);
    }
    else if (sizeof(T) == sizeof(Ncv64u))
    {
-        ncvStat = nppiStDownsampleNearest_64u_C1R_host((Ncv64u *)h_img.ptr(), h_img.pitch(),
+        ncvStat = nppiStDecimate_64u_C1R_host((Ncv64u *)h_img.ptr(), h_img.pitch(),
-                                                       (Ncv64u *)h_small.ptr(), h_small.pitch(),
+                                              (Ncv64u *)h_small.ptr(), h_small.pitch(),
-                                                       srcSize, this->scaleFactor);
+                                              srcSize, this->scaleFactor);
    }
    else
    {