diff --git a/3rdparty/NPP_staging/NPP_staging_static_Windows_32_v1.lib b/3rdparty/NPP_staging/NPP_staging_static_Windows_32_v1.lib
deleted file mode 100644
index 98d5c227a1a540dd1ccd14a34215268b7a5ba011..0000000000000000000000000000000000000000
Binary files a/3rdparty/NPP_staging/NPP_staging_static_Windows_32_v1.lib and /dev/null differ
diff --git a/3rdparty/NPP_staging/NPP_staging_static_Windows_64_v1.lib b/3rdparty/NPP_staging/NPP_staging_static_Windows_64_v1.lib
deleted file mode 100644
index f8372bf1085e263c7086d4d0fded4f95d4e5806f..0000000000000000000000000000000000000000
Binary files a/3rdparty/NPP_staging/NPP_staging_static_Windows_64_v1.lib and /dev/null differ
diff --git a/3rdparty/NPP_staging/libNPP_staging_static_Darwin_64_v1.a b/3rdparty/NPP_staging/libNPP_staging_static_Darwin_64_v1.a
deleted file mode 100644
index ab0150ef0da7d76ce939b6b74e18297244cf665b..0000000000000000000000000000000000000000
Binary files a/3rdparty/NPP_staging/libNPP_staging_static_Darwin_64_v1.a and /dev/null differ
diff --git a/3rdparty/NPP_staging/libNPP_staging_static_Linux_32_v1.a b/3rdparty/NPP_staging/libNPP_staging_static_Linux_32_v1.a
deleted file mode 100644
index 6c16959f3e22421aadf6ab5ca0de6f84b4afc141..0000000000000000000000000000000000000000
Binary files a/3rdparty/NPP_staging/libNPP_staging_static_Linux_32_v1.a and /dev/null differ
diff --git a/3rdparty/NPP_staging/libNPP_staging_static_Linux_64_v1.a b/3rdparty/NPP_staging/libNPP_staging_static_Linux_64_v1.a
deleted file mode 100644
index 56c7bd56246a6704f64b2d97162f48841bdf6707..0000000000000000000000000000000000000000
Binary files a/3rdparty/NPP_staging/libNPP_staging_static_Linux_64_v1.a and /dev/null differ
diff --git a/3rdparty/NPP_staging/npp_staging.h b/3rdparty/NPP_staging/npp_staging.h
deleted file mode 100644
index 6bb1e67d594dfc96cb9ee5b54cd44862dd6c887b..0000000000000000000000000000000000000000
--- a/3rdparty/NPP_staging/npp_staging.h
+++ /dev/null
@@ -1,784 +0,0 @@
-/*
-* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
-*
-* NOTICE TO USER:
-*
-* This source code is subject to NVIDIA ownership rights under U.S. and
-* international Copyright laws.
-*
-* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
-* CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
-* IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
-* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
-* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
-* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
-* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
-* OR PERFORMANCE OF THIS SOURCE CODE.
-*
-* U.S. Government End Users.  This source code is a "commercial item" as
-* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
-* "commercial computer software" and "commercial computer software
-* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
-* and is provided to the U.S. Government only as a commercial end item.
-* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
-* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
-* source code with only those rights set forth herein.
-*/
-#ifndef _npp_staging_h_
-#define _npp_staging_h_
-
-
-/**
-* \file npp_staging.h
-* NPP Staging Library (will become part of the next NPP release)
-*/
-
-
-#ifdef __cplusplus
-
-
-/** \defgroup ctassert Compile-time assert functionality
-* @{
-*/
-
-
-    /**
-     * Compile-time assert namespace
-     */
-    namespace NppStCTprep
-    {
-        template <bool x>
-        struct CT_ASSERT_FAILURE;
-
-        template <>
-        struct CT_ASSERT_FAILURE<true> {};
-
-        template <int x>
-        struct assertTest{};
-    }
-
-
-    #define NPPST_CT_PREP_PASTE_AUX(a,b)      a##b                           ///< Concatenation indirection macro
-    #define NPPST_CT_PREP_PASTE(a,b)          NPPST_CT_PREP_PASTE_AUX(a, b)  ///< Concatenation macro
-
-
-    /**
- * Performs a compile-time assertion of a condition at file scope
-     */
-    #define NPPST_CT_ASSERT(X) \
-        typedef NppStCTprep::assertTest<sizeof(NppStCTprep::CT_ASSERT_FAILURE< (bool)(X) >)> \
-        NPPST_CT_PREP_PASTE(__ct_assert_typedef_, __LINE__)
-
-
-/*@}*/
-
-
-#endif
-
-
-/** \defgroup typedefs NPP Integral and compound types of guaranteed size
- * @{
- */
-
-
-typedef               bool NppStBool; ///< Boolean of size not greater than that of an int
-typedef          long long NppSt64s;  ///< 64-bit signed integer
-typedef unsigned long long NppSt64u;  ///< 64-bit unsigned integer
-typedef                int NppSt32s;  ///< 32-bit signed integer
-typedef       unsigned int NppSt32u;  ///< 32-bit unsigned integer
-typedef              short NppSt16s;  ///< 16-bit signed short
-typedef     unsigned short NppSt16u;  ///< 16-bit unsigned short
-typedef               char NppSt8s;   ///< 8-bit signed char
-typedef      unsigned char NppSt8u;   ///< 8-bit unsigned char
-typedef              float NppSt32f;  ///< 32-bit IEEE-754 (single precision) float
-typedef             double NppSt64f;  ///< 64-bit IEEE-754 (double precision) float
-
-
-/**
- * 2D Rectangle, 8-bit unsigned fields
- * This struct contains position and size information of a rectangle in 2D space
- */
-struct NppStRect8u
-{
-    NppSt8u x;          ///< x-coordinate of upper left corner
-    NppSt8u y;          ///< y-coordinate of upper left corner
-    NppSt8u width;      ///< Rectangle width
-    NppSt8u height;     ///< Rectangle height
-#ifdef __cplusplus
-    NppStRect8u() : x(0), y(0), width(0), height(0) {};
-    NppStRect8u(NppSt8u x, NppSt8u y, NppSt8u width, NppSt8u height) : x(x), y(y), width(width), height(height) {}
-#endif
-};
-
-
-/**
- * 2D Rectangle, 32-bit signed fields
- * This struct contains position and size information of a rectangle in 2D space
- */
-struct NppStRect32s
-{
-    NppSt32s x;          ///< x-coordinate of upper left corner
-    NppSt32s y;          ///< y-coordinate of upper left corner
-    NppSt32s width;      ///< Rectangle width
-    NppSt32s height;     ///< Rectangle height
-#ifdef __cplusplus
-    NppStRect32s() : x(0), y(0), width(0), height(0) {};
-    NppStRect32s(NppSt32s x, NppSt32s y, NppSt32s width, NppSt32s height) : x(x), y(y), width(width), height(height) {}
-#endif
-};
-
-
-/**
- * 2D Rectangle, 32-bit unsigned fields
- * This struct contains position and size information of a rectangle in 2D space
- */
-struct NppStRect32u
-{
-    NppSt32u x;          ///< x-coordinate of upper left corner
-    NppSt32u y;          ///< y-coordinate of upper left corner
-    NppSt32u width;      ///< Rectangle width
-    NppSt32u height;     ///< Rectangle height
-#ifdef __cplusplus
-    NppStRect32u() : x(0), y(0), width(0), height(0) {};
-    NppStRect32u(NppSt32u x, NppSt32u y, NppSt32u width, NppSt32u height) : x(x), y(y), width(width), height(height) {}
-#endif
-};
-
-
-/**
- * 2D Size, 32-bit signed fields
- * This struct typically represents the size of a rectangular region in 2D space
- */
-struct NppStSize32s
-{
-    NppSt32s width;  ///< Rectangle width
-    NppSt32s height; ///< Rectangle height
-#ifdef __cplusplus
-    NppStSize32s() : width(0), height(0) {};
-    NppStSize32s(NppSt32s width, NppSt32s height) : width(width), height(height) {}
-#endif
-};
-
-
-/**
- * 2D Size, 32-bit unsigned fields
- * This struct typically represents the size of a rectangular region in 2D space
- */
-struct NppStSize32u
-{
-    NppSt32u width;  ///< Rectangle width
-    NppSt32u height; ///< Rectangle height
-#ifdef __cplusplus
-    NppStSize32u() : width(0), height(0) {};
-    NppStSize32u(NppSt32u width, NppSt32u height) : width(width), height(height) {}
-#endif
-};
-
-
-/**
- * Error Status Codes
- *
- * Almost all NPP functions return error-status information using
- * these return codes.
- * Negative return codes indicate errors, positive return codes indicate
- * warnings, a return code of 0 indicates success.
- */
-enum NppStStatus
-{
-    //already present in NPP
-    __NPP_SUCCESS                      = 0,   ///< Successful operation (same as NPP_NO_ERROR)
-    __NPP_ERROR                        = -1,  ///< Unknown error
-    __NPP_CUDA_KERNEL_EXECUTION_ERROR  = -3,  ///< CUDA kernel execution error
-    __NPP_NULL_POINTER_ERROR           = -4,  ///< NULL pointer argument error
-    __NPP_TEXTURE_BIND_ERROR           = -24, ///< CUDA texture binding error or non-zero offset returned
-    __NPP_MEMCPY_ERROR                 = -13, ///< CUDA memory copy error
-    __NPP_MEM_ALLOC_ERR                = -12, ///< CUDA memory allocation error
-    __NPP_MEMFREE_ERR                  = -15, ///< CUDA memory deallocation error
-
-    //to be added
-    NPP_INVALID_ROI,                        ///< Invalid region of interest argument
-    NPP_INVALID_STEP,                       ///< Invalid image line step argument (check sign, alignment, relation to image width)
-    NPP_INVALID_SCALE,                      ///< Invalid scale parameter passed
-    NPP_MEM_INSUFFICIENT_BUFFER,            ///< Insufficient user-allocated buffer
-    NPP_MEM_RESIDENCE_ERROR,                ///< Memory residence error detected (check if pointers should be device or pinned)
-    NPP_MEM_INTERNAL_ERROR,                 ///< Internal memory management error
-};
-
-
-/*@}*/
-
-
-#ifdef __cplusplus
-
-
-/** \defgroup ct_typesize_checks Client-side sizeof types compile-time check
-* @{
-*/
-    NPPST_CT_ASSERT(sizeof(NppStBool) <= 4);
-    NPPST_CT_ASSERT(sizeof(NppSt64s) == 8);
-    NPPST_CT_ASSERT(sizeof(NppSt64u) == 8);
-    NPPST_CT_ASSERT(sizeof(NppSt32s) == 4);
-    NPPST_CT_ASSERT(sizeof(NppSt32u) == 4);
-    NPPST_CT_ASSERT(sizeof(NppSt16s) == 2);
-    NPPST_CT_ASSERT(sizeof(NppSt16u) == 2);
-    NPPST_CT_ASSERT(sizeof(NppSt8s) == 1);
-    NPPST_CT_ASSERT(sizeof(NppSt8u) == 1);
-    NPPST_CT_ASSERT(sizeof(NppSt32f) == 4);
-    NPPST_CT_ASSERT(sizeof(NppSt64f) == 8);
-    NPPST_CT_ASSERT(sizeof(NppStRect8u) == sizeof(NppSt32u));
-    NPPST_CT_ASSERT(sizeof(NppStRect32s) == 4 * sizeof(NppSt32s));
-    NPPST_CT_ASSERT(sizeof(NppStRect32u) == 4 * sizeof(NppSt32u));
-    NPPST_CT_ASSERT(sizeof(NppStSize32u) == 2 * sizeof(NppSt32u));
-/*@}*/
-
-
-#endif
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-/** \defgroup core_npp NPP Core
- * Basic functions for CUDA streams management.
- * WARNING: These functions cannot be exported from a DLL, so they are available only with the static version of NPP_staging
- * @{
- */
-
-
-/**
- * Gets an active CUDA stream used by NPP (Not an API yet!)
- * \return Current CUDA stream
- */
-cudaStream_t nppStGetActiveCUDAstream();
-
-
-/**
- * Sets an active CUDA stream used by NPP (Not an API yet!)
- * \param cudaStream        [IN] CUDA stream to become current
- * \return CUDA stream used before
- */
-cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream);
-
-
-/*@}*/
-
-
-/** \defgroup nppi NPP Image Processing
-* @{
-*/
-
-
-/**
- * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit unsigned pixels, single channel.
- *
- * \param d_src             [IN] Source image pointer (CUDA device memory)
- * \param srcStep           [IN] Source image line step
- * \param d_dst             [OUT] Destination image pointer (CUDA device memory)
- * \param dstStep           [IN] Destination image line step
- * \param srcRoi            [IN] Region of interest in the source image
- * \param scale             [IN] Downsampling scale factor (positive integer)
- * \param readThruTexture   [IN] Performance hint to cache source in texture (true) or read directly (false)
- *
- * \return NPP status code
- */
-NppStStatus nppiStDownsampleNearest_32u_C1R(NppSt32u *d_src, NppSt32u srcStep,
-                                            NppSt32u *d_dst, NppSt32u dstStep,
-                                            NppStSize32u srcRoi, NppSt32u scale,
-                                            NppStBool readThruTexture);
-
-
-/**
- * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel.
- * \see nppiStDownsampleNearest_32u_C1R
- */
-NppStStatus nppiStDownsampleNearest_32s_C1R(NppSt32s *d_src, NppSt32u srcStep,
-                                            NppSt32s *d_dst, NppSt32u dstStep,
-                                            NppStSize32u srcRoi, NppSt32u scale,
-                                            NppStBool readThruTexture);
-
-
-/**
- * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel.
- * \see nppiStDownsampleNearest_32u_C1R
- */
-NppStStatus nppiStDownsampleNearest_32f_C1R(NppSt32f *d_src, NppSt32u srcStep,
-                                            NppSt32f *d_dst, NppSt32u dstStep,
-                                            NppStSize32u srcRoi, NppSt32u scale,
-                                            NppStBool readThruTexture);
-
-
-/**
-* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel.
-* \see nppiStDownsampleNearest_32u_C1R
-*/
-NppStStatus nppiStDownsampleNearest_64u_C1R(NppSt64u *d_src, NppSt32u srcStep,
-                                            NppSt64u *d_dst, NppSt32u dstStep,
-                                            NppStSize32u srcRoi, NppSt32u scale,
-                                            NppStBool readThruTexture);
-
-
-/**
- * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel.
- * \see nppiStDownsampleNearest_32u_C1R
- */
-NppStStatus nppiStDownsampleNearest_64s_C1R(NppSt64s *d_src, NppSt32u srcStep,
-                                            NppSt64s *d_dst, NppSt32u dstStep,
-                                            NppStSize32u srcRoi, NppSt32u scale,
-                                            NppStBool readThruTexture);
-
-
-/**
- * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel.
- * \see nppiStDownsampleNearest_32u_C1R
- */
-NppStStatus nppiStDownsampleNearest_64f_C1R(NppSt64f *d_src, NppSt32u srcStep,
-                                            NppSt64f *d_dst, NppSt32u dstStep,
-                                            NppStSize32u srcRoi, NppSt32u scale,
-                                            NppStBool readThruTexture);
-
-
-/**
- * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit unsigned pixels, single channel. Host implementation.
- *
- * \param h_src             [IN] Source image pointer (Host or pinned memory)
- * \param srcStep           [IN] Source image line step
- * \param h_dst             [OUT] Destination image pointer (Host or pinned memory)
- * \param dstStep           [IN] Destination image line step
- * \param srcRoi            [IN] Region of interest in the source image
- * \param scale             [IN] Downsampling scale factor (positive integer)
- *
- * \return NPP status code
- */
-NppStStatus nppiStDownsampleNearest_32u_C1R_host(NppSt32u *h_src, NppSt32u srcStep,
-                                                 NppSt32u *h_dst, NppSt32u dstStep,
-                                                 NppStSize32u srcRoi, NppSt32u scale);
-
-
-/**
- * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel. Host implementation.
- * \see nppiStDownsampleNearest_32u_C1R_host
- */
-NppStStatus nppiStDownsampleNearest_32s_C1R_host(NppSt32s *h_src, NppSt32u srcStep,
-                                                 NppSt32s *h_dst, NppSt32u dstStep,
-                                                 NppStSize32u srcRoi, NppSt32u scale);
-
-
-/**
- * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel. Host implementation.
- * \see nppiStDownsampleNearest_32u_C1R_host
- */
-NppStStatus nppiStDownsampleNearest_32f_C1R_host(NppSt32f *h_src, NppSt32u srcStep,
-                                                 NppSt32f *h_dst, NppSt32u dstStep,
-                                                 NppStSize32u srcRoi, NppSt32u scale);
-
-
-/**
- * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel. Host implementation.
- * \see nppiStDownsampleNearest_32u_C1R_host
- */
-NppStStatus nppiStDownsampleNearest_64u_C1R_host(NppSt64u *h_src, NppSt32u srcStep,
-                                                 NppSt64u *h_dst, NppSt32u dstStep,
-                                                 NppStSize32u srcRoi, NppSt32u scale);
-
-
-/**
- * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel. Host implementation.
- * \see nppiStDownsampleNearest_32u_C1R_host
- */
-NppStStatus nppiStDownsampleNearest_64s_C1R_host(NppSt64s *h_src, NppSt32u srcStep,
-                                                 NppSt64s *h_dst, NppSt32u dstStep,
-                                                 NppStSize32u srcRoi, NppSt32u scale);
-
-
-/**
- * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel. Host implementation.
- * \see nppiStDownsampleNearest_32u_C1R_host
- */
-NppStStatus nppiStDownsampleNearest_64f_C1R_host(NppSt64f *h_src, NppSt32u srcStep,
-                                                 NppSt64f *h_dst, NppSt32u dstStep,
-                                                 NppStSize32u srcRoi, NppSt32u scale);
-
-
-/**
- * Computes standard deviation for each rectangular region of the input image using integral images.
- *
- * \param d_sum             [IN] Integral image pointer (CUDA device memory)
- * \param sumStep           [IN] Integral image line step
- * \param d_sqsum           [IN] Squared integral image pointer (CUDA device memory)
- * \param sqsumStep         [IN] Squared integral image line step
- * \param d_norm            [OUT] Stddev image pointer (CUDA device memory). Each pixel contains stddev of a rect with top-left corner at the original location in the image
- * \param normStep          [IN] Stddev image line step
- * \param roi               [IN] Region of interest in the source image
- * \param rect              [IN] Rectangular region to calculate stddev over
- * \param scaleArea         [IN] Multiplication factor to account for the decimated scale
- * \param readThruTexture   [IN] Performance hint to cache source in texture (true) or read directly (false)
- *
- * \return NPP status code
- */
-NppStStatus nppiStRectStdDev_32f_C1R(NppSt32u *d_sum, NppSt32u sumStep,
-                                     NppSt64u *d_sqsum, NppSt32u sqsumStep,
-                                     NppSt32f *d_norm, NppSt32u normStep,
-                                     NppStSize32u roi, NppStRect32u rect,
-                                     NppSt32f scaleArea, NppStBool readThruTexture);
-
-
-/**
- * Computes standard deviation for each rectangular region of the input image using integral images. Host implementation
- *
- * \param h_sum             [IN] Integral image pointer (Host or pinned memory)
- * \param sumStep           [IN] Integral image line step
- * \param h_sqsum           [IN] Squared integral image pointer (Host or pinned memory)
- * \param sqsumStep         [IN] Squared integral image line step
- * \param h_norm            [OUT] Stddev image pointer (Host or pinned memory). Each pixel contains stddev of a rect with top-left corner at the original location in the image
- * \param normStep          [IN] Stddev image line step
- * \param roi               [IN] Region of interest in the source image
- * \param rect              [IN] Rectangular region to calculate stddev over
- * \param scaleArea         [IN] Multiplication factor to account for the decimated scale
- *
- * \return NPP status code
- */
-NppStStatus nppiStRectStdDev_32f_C1R_host(NppSt32u *h_sum, NppSt32u sumStep,
-                                          NppSt64u *h_sqsum, NppSt32u sqsumStep,
-                                          NppSt32f *h_norm, NppSt32u normStep,
-                                          NppStSize32u roi, NppStRect32u rect,
-                                          NppSt32f scaleArea);
-
-
-/**
- * Transposes an image. 32-bit unsigned pixels, single channel
- *
- * \param d_src             [IN] Source image pointer (CUDA device memory)
- * \param srcStride         [IN] Source image line step
- * \param d_dst             [OUT] Destination image pointer (CUDA device memory)
- * \param dstStride         [IN] Destination image line step
- * \param srcRoi            [IN] Region of interest of the source image
- *
- * \return NPP status code
- */
-NppStStatus nppiStTranspose_32u_C1R(NppSt32u *d_src, NppSt32u srcStride,
-                                    NppSt32u *d_dst, NppSt32u dstStride, NppStSize32u srcRoi);
-
-
-/**
- * Transposes an image. 32-bit signed pixels, single channel
- * \see nppiStTranspose_32u_C1R
- */
-NppStStatus nppiStTranspose_32s_C1R(NppSt32s *d_src, NppSt32u srcStride,
-                                    NppSt32s *d_dst, NppSt32u dstStride, NppStSize32u srcRoi);
-
-
-/**
- * Transposes an image. 32-bit float pixels, single channel
- * \see nppiStTranspose_32u_C1R
- */
-NppStStatus nppiStTranspose_32f_C1R(NppSt32f *d_src, NppSt32u srcStride,
-                                    NppSt32f *d_dst, NppSt32u dstStride, NppStSize32u srcRoi);
-
-
-/**
- * Transposes an image. 64-bit unsigned pixels, single channel
- * \see nppiStTranspose_32u_C1R
- */
-NppStStatus nppiStTranspose_64u_C1R(NppSt64u *d_src, NppSt32u srcStride,
-                                    NppSt64u *d_dst, NppSt32u dstStride, NppStSize32u srcRoi);
-
-
-/**
- * Transposes an image. 64-bit signed pixels, single channel
- * \see nppiStTranspose_32u_C1R
- */
-NppStStatus nppiStTranspose_64s_C1R(NppSt64s *d_src, NppSt32u srcStride,
-                                    NppSt64s *d_dst, NppSt32u dstStride, NppStSize32u srcRoi);
-
-
-/**
- * Transposes an image. 64-bit float pixels, single channel
- * \see nppiStTranspose_32u_C1R
- */
-NppStStatus nppiStTranspose_64f_C1R(NppSt64f *d_src, NppSt32u srcStride,
-                                    NppSt64f *d_dst, NppSt32u dstStride, NppStSize32u srcRoi);
-
-
-/**
- * Transposes an image. 32-bit unsigned pixels, single channel. Host implementation
- *
- * \param h_src             [IN] Source image pointer (Host or pinned memory)
- * \param srcStride         [IN] Source image line step
- * \param h_dst             [OUT] Destination image pointer (Host or pinned memory)
- * \param dstStride         [IN] Destination image line step
- * \param srcRoi            [IN] Region of interest of the source image
- *
- * \return NPP status code
- */
-NppStStatus nppiStTranspose_32u_C1R_host(NppSt32u *h_src, NppSt32u srcStride,
-                                         NppSt32u *h_dst, NppSt32u dstStride, NppStSize32u srcRoi);
-
-
-/**
- * Transposes an image. 32-bit signed pixels, single channel. Host implementation
- * \see nppiStTranspose_32u_C1R_host
- */
-NppStStatus nppiStTranspose_32s_C1R_host(NppSt32s *h_src, NppSt32u srcStride,
-                                         NppSt32s *h_dst, NppSt32u dstStride, NppStSize32u srcRoi);
-
-
-/**
- * Transposes an image. 32-bit float pixels, single channel. Host implementation
- * \see nppiStTranspose_32u_C1R_host
- */
-NppStStatus nppiStTranspose_32f_C1R_host(NppSt32f *h_src, NppSt32u srcStride,
-                                         NppSt32f *h_dst, NppSt32u dstStride, NppStSize32u srcRoi);
-
-
-/**
- * Transposes an image. 64-bit unsigned pixels, single channel. Host implementation
- * \see nppiStTranspose_32u_C1R_host
- */
-NppStStatus nppiStTranspose_64u_C1R_host(NppSt64u *h_src, NppSt32u srcStride,
-                                         NppSt64u *h_dst, NppSt32u dstStride, NppStSize32u srcRoi);
-
-
-/**
- * Transposes an image. 64-bit signed pixels, single channel. Host implementation
- * \see nppiStTranspose_32u_C1R_host
- */
-NppStStatus nppiStTranspose_64s_C1R_host(NppSt64s *h_src, NppSt32u srcStride,
-                                         NppSt64s *h_dst, NppSt32u dstStride, NppStSize32u srcRoi);
-
-
-/**
- * Transposes an image. 64-bit float pixels, single channel. Host implementation
- * \see nppiStTranspose_32u_C1R_host
- */
-NppStStatus nppiStTranspose_64f_C1R_host(NppSt64f *h_src, NppSt32u srcStride,
-                                         NppSt64f *h_dst, NppSt32u dstStride, NppStSize32u srcRoi);
-
-
-/**
- * Calculates the size of the temporary buffer for integral image creation
- *
- * \param roiSize           [IN] Size of the input image
- * \param pBufsize          [OUT] Pointer to host variable that returns the size of the temporary buffer (in bytes)
- *
- * \return NPP status code
- */
-NppStStatus nppiStIntegralGetSize_8u32u(NppStSize32u roiSize, NppSt32u *pBufsize);
-
-
-/**
- * Calculates the size of the temporary buffer for integral image creation
- * \see nppiStIntegralGetSize_8u32u
- */
-NppStStatus nppiStIntegralGetSize_32f32f(NppStSize32u roiSize, NppSt32u *pBufsize);
-
-
-/**
- * Creates an integral image representation for the input image
- *
- * \param d_src             [IN] Source image pointer (CUDA device memory)
- * \param srcStep           [IN] Source image line step
- * \param d_dst             [OUT] Destination integral image pointer (CUDA device memory)
- * \param dstStep           [IN] Destination image line step
- * \param roiSize           [IN] Region of interest of the source image
- * \param pBuffer           [IN] Pointer to the pre-allocated temporary buffer (CUDA device memory)
- * \param bufSize           [IN] Size of the pBuffer in bytes
- *
- * \return NPP status code
- */
-NppStStatus nppiStIntegral_8u32u_C1R(NppSt8u *d_src, NppSt32u srcStep,
-                                     NppSt32u *d_dst, NppSt32u dstStep, NppStSize32u roiSize,
-                                     NppSt8u *pBuffer, NppSt32u bufSize);
-
-
-/**
- * Creates an integral image representation for the input image
- * \see nppiStIntegral_8u32u_C1R
- */
-NppStStatus nppiStIntegral_32f32f_C1R(NppSt32f *d_src, NppSt32u srcStep,
-                                      NppSt32f *d_dst, NppSt32u dstStep, NppStSize32u roiSize,
-                                      NppSt8u *pBuffer, NppSt32u bufSize);
-
-
-/**
- * Creates an integral image representation for the input image. Host implementation
- *
- * \param h_src             [IN] Source image pointer (Host or pinned memory)
- * \param srcStep           [IN] Source image line step
- * \param h_dst             [OUT] Destination integral image pointer (Host or pinned memory)
- * \param dstStep           [IN] Destination image line step
- * \param roiSize           [IN] Region of interest of the source image
- *
- * \return NPP status code
- */
-NppStStatus nppiStIntegral_8u32u_C1R_host(NppSt8u *h_src, NppSt32u srcStep,
-                                          NppSt32u *h_dst, NppSt32u dstStep, NppStSize32u roiSize);
-
-
-/**
- * Creates an integral image representation for the input image. Host implementation
- * \see nppiStIntegral_8u32u_C1R_host
- */
-NppStStatus nppiStIntegral_32f32f_C1R_host(NppSt32f *h_src, NppSt32u srcStep,
-                                           NppSt32f *h_dst, NppSt32u dstStep, NppStSize32u roiSize);
-
-
-/**
- * Calculates the size of the temporary buffer for squared integral image creation
- *
- * \param roiSize           [IN] Size of the input image
- * \param pBufsize          [OUT] Pointer to host variable that returns the size of the temporary buffer (in bytes)
- *
- * \return NPP status code
- */
-NppStStatus nppiStSqrIntegralGetSize_8u64u(NppStSize32u roiSize, NppSt32u *pBufsize);
-
-
-/**
- * Creates a squared integral image representation for the input image
- *
- * \param d_src             [IN] Source image pointer (CUDA device memory)
- * \param srcStep           [IN] Source image line step
- * \param d_dst             [OUT] Destination squared integral image pointer (CUDA device memory)
- * \param dstStep           [IN] Destination image line step
- * \param roiSize           [IN] Region of interest of the source image
- * \param pBuffer           [IN] Pointer to the pre-allocated temporary buffer (CUDA device memory)
- * \param bufSize           [IN] Size of the pBuffer in bytes
- *
- * \return NPP status code
- */
-NppStStatus nppiStSqrIntegral_8u64u_C1R(NppSt8u *d_src, NppSt32u srcStep,
-                                        NppSt64u *d_dst, NppSt32u dstStep, NppStSize32u roiSize,
-                                        NppSt8u *pBuffer, NppSt32u bufSize);
-
-
-/**
- * Creates a squared integral image representation for the input image. Host implementation
- *
- * \param h_src             [IN] Source image pointer (Host or pinned memory)
- * \param srcStep           [IN] Source image line step
- * \param h_dst             [OUT] Destination squared integral image pointer (Host or pinned memory)
- * \param dstStep           [IN] Destination image line step
- * \param roiSize           [IN] Region of interest of the source image
- *
- * \return NPP status code
- */
-NppStStatus nppiStSqrIntegral_8u64u_C1R_host(NppSt8u *h_src, NppSt32u srcStep,
-                                             NppSt64u *h_dst, NppSt32u dstStep, NppStSize32u roiSize);
-
-
-/*@}*/
-
-
-/** \defgroup npps NPP Signal Processing
-* @{
-*/
-
-
-/**
- * Calculates the size of the temporary buffer for vector compaction. 32-bit unsigned values
- *
- * \param srcLen            [IN] Length of the input vector in elements
- * \param pBufsize          [OUT] Pointer to host variable that returns the size of the temporary buffer (in bytes)
- *
- * \return NPP status code
- */
-NppStStatus nppsStCompactGetSize_32u(NppSt32u srcLen, NppSt32u *pBufsize);
-
-
-/**
- * Calculates the size of the temporary buffer for vector compaction. 32-bit signed values
- * \see nppsStCompactGetSize_32u
- */
-NppStStatus nppsStCompactGetSize_32s(NppSt32u srcLen, NppSt32u *pBufsize);
-
-
-/**
- * Calculates the size of the temporary buffer for vector compaction. 32-bit float values
- * \see nppsStCompactGetSize_32u
- */
-NppStStatus nppsStCompactGetSize_32f(NppSt32u srcLen, NppSt32u *pBufsize);
-
-
-/**
- * Compacts the input vector by removing elements equal to the specified value. 32-bit unsigned values
- *
- * \param d_src             [IN] Source vector pointer (CUDA device memory)
- * \param srcLen            [IN] Source vector length
- * \param d_dst             [OUT] Destination vector pointer (CUDA device memory)
- * \param p_dstLen          [OUT] Pointer to the destination vector length (Pinned memory or NULL)
- * \param elemRemove        [IN] The value to be removed
- * \param pBuffer           [IN] Pointer to the pre-allocated temporary buffer (CUDA device memory)
- * \param bufSize           [IN] Size of the pBuffer in bytes
- *
- * \return NPP status code
- */
-NppStStatus nppsStCompact_32u(NppSt32u *d_src, NppSt32u srcLen,
-                              NppSt32u *d_dst, NppSt32u *p_dstLen,
-                              NppSt32u elemRemove,
-                              NppSt8u *pBuffer, NppSt32u bufSize);
-
-
-/**
- * Compacts the input vector by removing elements equal to the specified value. 32-bit signed values
- * \see nppsStCompact_32u
- */
-NppStStatus nppsStCompact_32s(NppSt32s *d_src, NppSt32u srcLen,
-                              NppSt32s *d_dst, NppSt32u *p_dstLen,
-                              NppSt32s elemRemove,
-                              NppSt8u *pBuffer, NppSt32u bufSize);
-
-
-/**
- * Compacts the input vector by removing elements equal to the specified value. 32-bit float values
- * \see nppsStCompact_32u
- */
-NppStStatus nppsStCompact_32f(NppSt32f *d_src, NppSt32u srcLen,
-                              NppSt32f *d_dst, NppSt32u *p_dstLen,
-                              NppSt32f elemRemove,
-                              NppSt8u *pBuffer, NppSt32u bufSize);
-
-
-/**
- * Compacts the input vector by removing elements equal to the specified value. 32-bit unsigned values. Host implementation
- *
- * \param h_src             [IN] Source vector pointer (Host or pinned memory)
- * \param srcLen            [IN] Source vector length
- * \param h_dst             [OUT] Destination vector pointer (Host or pinned memory)
- * \param dstLen            [OUT] Pointer to the destination vector length (can be NULL)
- * \param elemRemove        [IN] The value to be removed
- *
- * \return NPP status code
- */
-NppStStatus nppsStCompact_32u_host(NppSt32u *h_src, NppSt32u srcLen,
-                                   NppSt32u *h_dst, NppSt32u *dstLen, NppSt32u elemRemove);
-
-
-/**
- * Compacts the input vector by removing elements equal to the specified value. 32-bit signed values. Host implementation
- * \see nppsStCompact_32u_host
- */
-NppStStatus nppsStCompact_32s_host(NppSt32s *h_src, NppSt32u srcLen,
-                                   NppSt32s *h_dst, NppSt32u *dstLen, NppSt32s elemRemove);
-
-
-/**
- * Compacts the input vector by removing elements equal to the specified value. 32-bit float values. Host implementation
- * \see nppsStCompact_32u_host
- */
-NppStStatus nppsStCompact_32f_host(NppSt32f *h_src, NppSt32u srcLen,
-                                   NppSt32f *h_dst, NppSt32u *dstLen, NppSt32f elemRemove);
-
-
-/*@}*/
-
-
-#ifdef __cplusplus
-}
-#endif
-
-
-#endif // _npp_staging_h_
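
The scratch-buffer entry points in the deleted header all follow a two-phase idiom: query the required temporary-buffer size, allocate that much device memory, then run the computation with the buffer. A minimal sketch of that calling pattern against the old npp_staging.h declarations, assuming the source and sum images are already allocated on the device:

#include <cuda_runtime.h>
#include "npp_staging.h"

// Returns false on any failure; real code would inspect the status codes.
bool integralSketch(NppSt8u *d_src, NppSt32u srcStep,
                    NppSt32u *d_sum, NppSt32u sumStep,
                    NppStSize32u roi)
{
    NppSt32u bufSize = 0;
    if (nppiStIntegralGetSize_8u32u(roi, &bufSize) != __NPP_SUCCESS)
        return false;

    NppSt8u *d_buf = 0;
    if (cudaMalloc((void**)&d_buf, bufSize) != cudaSuccess)
        return false;

    NppStStatus st = nppiStIntegral_8u32u_C1R(d_src, srcStep, d_sum, sumStep,
                                              roi, d_buf, bufSize);
    cudaFree(d_buf);
    return st == __NPP_SUCCESS;
}
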
diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt
index f25036ac242f3a07e85ae20e83f45541264362f5..e99cbc78fcd0fb58713d6aff4544c9567dd45337 100644
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -36,10 +36,12 @@ file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*")
 source_group("Device" FILES ${lib_device_hdrs})
 
 if (HAVE_CUDA AND MSVC)
-    file(GLOB ncv_srcs "src/nvidia/*.cpp")
-	file(GLOB ncv_hdrs "src/nvidia/*.h*")
-	file(GLOB ncv_cuda "src/nvidia/*.cu")
-	source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda})
+    file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp")
+    file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu")
+    file(GLOB_RECURSE ncv_hdr1 "src/nvidia/*.hpp")
+    file(GLOB_RECURSE ncv_hdr2 "src/nvidia/*.h")
+    source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdr1} ${ncv_hdr2} ${ncv_cuda})
+    include_directories("src/nvidia/core" "src/nvidia/NPP_staging")
 endif()
 
 if (HAVE_CUDA)		
@@ -74,17 +76,13 @@ if (HAVE_CUDA)
 		string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 		string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
 		string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
 	endif()
-   
-
-    include(FindNPP_staging.cmake)
-    include_directories(${NPPST_INC})   
     
 	CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda})
 	#CUDA_BUILD_CLEAN_TARGET()
 endif()
 
-add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
+add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdr1} ${ncv_hdr2} ${ncv_cuda} ${cuda_objs})
 
 if(PCHSupport_FOUND)
 	set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/src/precomp.hpp)
@@ -117,8 +115,7 @@ set_target_properties(${the_target} PROPERTIES
 target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} )
 
 if (HAVE_CUDA)
 	target_link_libraries(${the_target} ${CUDA_LIBRARIES} ${CUDA_NPP_LIBRARIES})
-    target_link_libraries(${the_target} ${NPPST_LIB})
     CUDA_ADD_CUFFT_TO_TARGET(${the_target})
 endif()
 
diff --git a/modules/gpu/FindNPP_staging.cmake b/modules/gpu/FindNPP_staging.cmake
deleted file mode 100644
index e478695c698285cee31ac6f8d1a7af3d99639c8f..0000000000000000000000000000000000000000
--- a/modules/gpu/FindNPP_staging.cmake
+++ /dev/null
@@ -1,24 +0,0 @@
-if(CMAKE_SIZEOF_VOID_P EQUAL 4)			
-    set(BIT_SUFF 32)
-else()
-    set(BIT_SUFF 64)
-endif()
-
-if (APPLE)
-    set(PLATFORM_SUFF Darwin)
-elseif (UNIX)
-    set(PLATFORM_SUFF Linux)
-else()
-    set(PLATFORM_SUFF Windows)
-endif()
-
-set(LIB_FILE NPP_staging_static_${PLATFORM_SUFF}_${BIT_SUFF}_v1)
-
-find_library(NPPST_LIB 
-    NAMES "${LIB_FILE}" "lib${LIB_FILE}" 
-    PATHS "${CMAKE_SOURCE_DIR}/3rdparty/NPP_staging" 
-    DOC "NPP staging library"
-    )	
-
-SET(NPPST_INC "${CMAKE_SOURCE_DIR}/3rdparty/NPP_staging")
- 
\ No newline at end of file
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index 5a54072a827eabc2ba456a95758d54d16577b689..c6a23debaab57898f0d90fcb460bd8ee29b6d069 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -83,25 +83,25 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst)
         sz.width  = src.cols;
         sz.height = src.rows;
 
         nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz) );
     }
     else if (src.elemSize() == 4)
     {
-        NppStSize32u sz;
+        NcvSize32u sz;
         sz.width  = src.cols;
         sz.height = src.rows;
 
-        nppSafeCall( nppiStTranspose_32u_C1R(const_cast<NppSt32u*>(src.ptr<NppSt32u>()), src.step, 
-            dst.ptr<NppSt32u>(), dst.step, sz) );
+        nppSafeCall( nppiStTranspose_32u_C1R(const_cast<Ncv32u*>(src.ptr<Ncv32u>()), src.step, 
+            dst.ptr<Ncv32u>(), dst.step, sz) );
     }
     else // if (src.elemSize() == 8)
     {
-        NppStSize32u sz;
+        NcvSize32u sz;
         sz.width  = src.cols;
         sz.height = src.rows;
 
-        nppSafeCall( nppiStTranspose_64u_C1R(const_cast<NppSt64u*>(src.ptr<NppSt64u>()), src.step, 
-            dst.ptr<NppSt64u>(), dst.step, sz) );
+        nppSafeCall( nppiStTranspose_64u_C1R(const_cast<Ncv64u*>(src.ptr<Ncv64u>()), src.step, 
+            dst.ptr<Ncv64u>(), dst.step, sz) );
     }
 
     cudaSafeCall( cudaThreadSynchronize() );
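
The dispatch above works because transposition only relocates pixels and never interprets them: any 4-byte element type can be routed through the Ncv32u entry point and any 8-byte type through the Ncv64u one. A sketch of the 4-byte case, where src and dst stand in for pre-allocated single-channel GpuMat variables with 4-byte elements, and nppSafeCall is the gpu module's internal status-check wrapper:

#include "NCV.hpp"              // Ncv32u, NcvSize32u (assumed include path)
#include "opencv2/gpu/gpu.hpp"  // cv::gpu::GpuMat

// Transpose any 4-byte single-channel GpuMat through the 32u entry point;
// a CV_32F matrix passes through unchanged since only bit patterns move.
void transpose4ByteSketch(const cv::gpu::GpuMat &src, cv::gpu::GpuMat &dst)
{
    NcvSize32u sz;
    sz.width  = src.cols;
    sz.height = src.rows;

    nppSafeCall( nppiStTranspose_32u_C1R(
        const_cast<Ncv32u*>(src.ptr<Ncv32u>()), src.step,
        dst.ptr<Ncv32u>(), dst.step, sz) );
}
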
diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index 15e5d7ffb85fa2ed621de2f3167cede0b3eb6755..37acc59bd1df2cf5d20d6a452e6439fc98f105ce 100644
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -126,7 +126,7 @@ struct cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
             minNeighbors,
             scaleStep, 1,
             flags,
-            *gpuAllocator, *cpuAllocator, devProp.major, devProp.minor,  0);
+            *gpuAllocator, *cpuAllocator, devProp, 0);
         ncvAssertReturnNcvStat(ncvStat);
         ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
                        
@@ -146,8 +146,8 @@ private:
         ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), NCV_CUDA_ERROR);
 
         // Load the classifier from file (assuming its size is about 1 mb) using a simple allocator
-        gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice);        
-        cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned);
+        gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice, devProp.textureAlignment);        
+        cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned, devProp.textureAlignment);
 
         ncvAssertPrintReturn(gpuCascadeAllocator->isInitialized(), "Error creating cascade GPU allocator", NCV_CUDA_ERROR);
         ncvAssertPrintReturn(cpuCascadeAllocator->isInitialized(), "Error creating cascade CPU allocator", NCV_CUDA_ERROR);
@@ -212,7 +212,7 @@ private:
         roi.height = d_src.height();
         Ncv32u numDetections;
         ncvStat = ncvDetectObjectsMultiScale_device(d_src, roi, d_rects, numDetections, haar, *h_haarStages,
-            *d_haarStages, *d_haarNodes, *d_haarFeatures, haar.ClassifierSize, 4, 1.2f, 1, 0, gpuCounter, cpuCounter, devProp.major, devProp.minor, 0);
+            *d_haarStages, *d_haarNodes, *d_haarFeatures, haar.ClassifierSize, 4, 1.2f, 1, 0, gpuCounter, cpuCounter, devProp, 0);
 
         ncvAssertReturnNcvStat(ncvStat);
         ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
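
With these changes the caller queries the device properties once and hands the whole cudaDeviceProp through, so the NCV layer can read major/minor for kernel selection and textureAlignment for the allocators from a single argument. A sketch of the new convention (device index and variable names are illustrative):

#include <cuda_runtime.h>
#include "NCV.hpp"  // NCVMemNativeAllocator, NCVMemoryType* (assumed include path)

void makeAllocatorsSketch()
{
    cudaDeviceProp devProp;
    cudaGetDeviceProperties(&devProp, 0);  // device 0 for illustration

    // Allocators take the texture alignment straight from the struct; the
    // detection entry points likewise now receive devProp instead of the
    // separate devPropMajor/devPropMinor pair.
    NCVMemNativeAllocator gpuAllocator(NCVMemoryTypeDevice, devProp.textureAlignment);
    NCVMemNativeAllocator cpuAllocator(NCVMemoryTypeHostPinned, devProp.textureAlignment);
}
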
diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp
index 1711c3c9eef4a79ace4b2ec1bfad4052615fae83..15a95a447976dcac6c3e4f85813d8371a9ed70d4 100644
--- a/modules/gpu/src/imgproc_gpu.cpp
+++ b/modules/gpu/src/imgproc_gpu.cpp
@@ -560,16 +560,19 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer)
 
     sum.create(src.rows + 1, src.cols + 1, CV_32S);
     
-    NppStSize32u roiSize;
+    NcvSize32u roiSize;
     roiSize.width = src.cols;
     roiSize.height = src.rows;
 
-    NppSt32u bufSize;
-    nppSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize) );
+    cudaDeviceProp prop;
+    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+
+    Ncv32u bufSize;
+    nppSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
     ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
 
-    nppSafeCall( nppiStIntegral_8u32u_C1R(const_cast<NppSt8u*>(src.ptr<NppSt8u>()), src.step, 
-        sum.ptr<NppSt32u>(), sum.step, roiSize, buffer.ptr<NppSt8u>(), bufSize) );
+    nppSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), src.step, 
+        sum.ptr<Ncv32u>(), sum.step, roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
 
     cudaSafeCall( cudaThreadSynchronize() );
 }
@@ -600,19 +603,20 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum)
 {
     CV_Assert(src.type() == CV_8U);
 
-    NppStSize32u roiSize;
+    NcvSize32u roiSize;
     roiSize.width = src.cols;
     roiSize.height = src.rows;
 
-    NppSt32u bufSize;
-    nppSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize));
+    cudaDeviceProp prop;
+    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+
+    Ncv32u bufSize;
+    nppSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));
     GpuMat buf(1, bufSize, CV_8U);
 
     sqsum.create(src.rows + 1, src.cols + 1, CV_64F);
-    nppSafeCall(nppiStSqrIntegral_8u64u_C1R(
-            const_cast<NppSt8u*>(src.ptr<NppSt8u>(0)), src.step, 
-            sqsum.ptr<NppSt64u>(0), sqsum.step, roiSize, 
-            buf.ptr<NppSt8u>(0), bufSize));
+    nppSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), src.step, 
+            sqsum.ptr<Ncv64u>(0), sqsum.step, roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));
 
     cudaSafeCall( cudaThreadSynchronize() );
 }
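
integralBuffered exists so the scratch allocation survives across calls: ensureSizeIsEnough only reallocates when the requested buffer outgrows the current one. A sketch of the intended reuse, where frames is a hypothetical sequence of 8-bit device images:

#include <vector>
#include "opencv2/gpu/gpu.hpp"

void integralPerFrameSketch(const std::vector<cv::gpu::GpuMat> &frames)
{
    cv::gpu::GpuMat sum, buffer;
    for (size_t i = 0; i < frames.size(); ++i)
    {
        // The first call sizes 'buffer'; subsequent calls reuse it, avoiding
        // a cudaMalloc/cudaFree pair per frame.
        cv::gpu::integralBuffered(frames[i], sum, buffer);
    }
}
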
diff --git a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
index a501d6525eae795357a9940320baa0e88a44fbeb..dc4796989a3ea51fb615068375912431e99bcd0b 100644
--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
@@ -57,8 +57,8 @@
 
 #include <algorithm>
 
-#include "npp.h"
 #include "NCV.hpp"
+#include "NPP_staging/NPP_staging.hpp"
 #include "NCVRuntimeTemplates.hpp"
 #include "NCVHaarObjectDetection.hpp"
 
@@ -970,8 +970,7 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
                                                Ncv32f scaleArea,
                                                INCVMemAllocator &gpuAllocator,
                                                INCVMemAllocator &cpuAllocator,
-                                               Ncv32u devPropMajor,
-                                               Ncv32u devPropMinor,
+                                               cudaDeviceProp &devProp,
                                                cudaStream_t cuStream)
 {
     ncvAssertReturn(d_integralImage.memType() == d_weights.memType() &&
@@ -1077,15 +1076,15 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
     Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *
                                           (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));
 
-    NcvBool bTexCacheCascade = devPropMajor < 2;
+    NcvBool bTexCacheCascade = devProp.major < 2;
     NcvBool bTexCacheIImg = true; //this works better even on Fermi so far
-    NcvBool bDoAtomicCompaction = devPropMajor >= 2 || (devPropMajor == 1 && devPropMinor >= 3);
+    NcvBool bDoAtomicCompaction = devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);
 
     NCVVector<Ncv32u> *d_ptrNowData = &d_vecPixelMask;
     NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp;
 
     Ncv32u szNppCompactTmpBuf;
-    nppsStCompactGetSize_32u(d_vecPixelMask.length(), &szNppCompactTmpBuf);
+    nppsStCompactGetSize_32u(d_vecPixelMask.length(), &szNppCompactTmpBuf, devProp);
     if (bDoAtomicCompaction)
     {
         szNppCompactTmpBuf = 0;
@@ -1185,11 +1184,11 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
         }
         else
         {
-            NppStStatus nppSt;
+            NCVStatus nppSt;
             nppSt = nppsStCompact_32u(d_ptrNowTmp->ptr(), d_vecPixelMask.length(),
                                       d_ptrNowData->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
-                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf);
-            ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR);
+                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
+            ncvAssertReturn(nppSt == NPPST_SUCCESS, NCV_NPP_ERROR);
         }
         numDetections = *hp_numDet;
     }
@@ -1240,11 +1239,11 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
         }
         else
         {
-            NppStStatus nppSt;
+            NCVStatus nppSt;
             nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), d_vecPixelMask.length(),
                                       d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
-                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf);
-            ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR);
+                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
+            ncvAssertReturnNcvStat(nppSt);
         }
 
         swap(d_ptrNowData, d_ptrNowTmp);
@@ -1310,11 +1309,11 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
         }
         else
         {
-            NppStStatus nppSt;
+            NCVStatus nppSt;
             nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,
                                       d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
-                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf);
-            ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR);
+                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
+            ncvAssertReturnNcvStat(nppSt);
         }
 
         swap(d_ptrNowData, d_ptrNowTmp);
@@ -1371,11 +1370,11 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
         }
         else
         {
-            NppStStatus nppSt;
+            NCVStatus nppSt;
             nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,
                                       d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
-                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf);
-            ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR);
+                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
+            ncvAssertReturnNcvStat(nppSt);
         }
 
         swap(d_ptrNowData, d_ptrNowTmp);
@@ -1715,8 +1714,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
 
                                             INCVMemAllocator &gpuAllocator,
                                             INCVMemAllocator &cpuAllocator,
-                                            Ncv32u devPropMajor,
-                                            Ncv32u devPropMinor,
+                                            cudaDeviceProp &devProp,
                                             cudaStream_t cuStream)
 {
     ncvAssertReturn(d_srcImg.memType() == d_dstRects.memType() &&
@@ -1773,12 +1771,12 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
     NCVVectorAlloc<NcvRect32u> h_hypothesesIntermediate(cpuAllocator, d_srcImg.width() * d_srcImg.height());
     ncvAssertReturn(h_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
 
-    NppStStatus nppStat;
+    NCVStatus nppStat;
     Ncv32u szTmpBufIntegral, szTmpBufSqIntegral;
-    nppStat = nppiStIntegralGetSize_8u32u(NppStSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufIntegral);
-    ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);
-    nppStat = nppiStSqrIntegralGetSize_8u64u(NppStSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufSqIntegral);
-    ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);
+    nppStat = nppiStIntegralGetSize_8u32u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufIntegral, devProp);
+    ncvAssertReturnNcvStat(nppStat);
+    nppStat = nppiStSqrIntegralGetSize_8u64u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufSqIntegral, devProp);
+    ncvAssertReturnNcvStat(nppStat);
     NCVVectorAlloc<Ncv8u> d_tmpIIbuf(gpuAllocator, std::max(szTmpBufIntegral, szTmpBufSqIntegral));
     ncvAssertReturn(d_tmpIIbuf.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
 
@@ -1786,15 +1784,15 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
 
     nppStat = nppiStIntegral_8u32u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),
                                        d_integralImage.ptr(), d_integralImage.pitch(),
-                                       NppStSize32u(d_srcImg.width(), d_srcImg.height()),
-                                       d_tmpIIbuf.ptr(), szTmpBufIntegral);
-    ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);
+                                       NcvSize32u(d_srcImg.width(), d_srcImg.height()),
+                                       d_tmpIIbuf.ptr(), szTmpBufIntegral, devProp);
+    ncvAssertReturnNcvStat(nppStat);
 
     nppStat = nppiStSqrIntegral_8u64u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),
                                           d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
-                                          NppStSize32u(d_srcImg.width(), d_srcImg.height()),
-                                          d_tmpIIbuf.ptr(), szTmpBufSqIntegral);
-    ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);
+                                          NcvSize32u(d_srcImg.width(), d_srcImg.height()),
+                                          d_tmpIIbuf.ptr(), szTmpBufSqIntegral, devProp);
+    ncvAssertReturnNcvStat(nppStat);
 
     NCV_SKIP_COND_END
 
@@ -1859,7 +1857,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
         Ncv32u scale = scalesVector[i];
 
         NcvSize32u srcRoi, scaledIIRoi, searchRoi;
-        NppStSize32u srcIIRoi;
+        NcvSize32u srcIIRoi;
         srcRoi.width = d_srcImg.width();
         srcRoi.height = d_srcImg.height();
         srcIIRoi.width = srcRoi.width + 1;
@@ -1875,15 +1873,15 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
             d_integralImage.ptr(), d_integralImage.pitch(),
             d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
             srcIIRoi, scale, true);
-        ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);
+        ncvAssertReturnNcvStat(nppStat);
 
         nppStat = nppiStDownsampleNearest_64u_C1R(
             d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
             d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
             srcIIRoi, scale, true);
-        ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);
+        ncvAssertReturnNcvStat(nppStat);
 
-        const NppStRect32u rect(
+        const NcvRect32u rect(
             HAAR_STDDEV_BORDER,
             HAAR_STDDEV_BORDER,
             haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER,
@@ -1892,9 +1890,9 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
             d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
             d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
             d_rectStdDev.ptr(), d_rectStdDev.pitch(),
-            NppStSize32u(searchRoi.width, searchRoi.height), rect,
+            NcvSize32u(searchRoi.width, searchRoi.height), rect,
             (Ncv32f)scale*scale, true);
-        ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);
+        ncvAssertReturnNcvStat(nppStat);
 
         NCV_SKIP_COND_END
 
@@ -1904,8 +1902,8 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
             detectionsOnThisScale,
             haar, h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures, false,
             searchRoi, pixelStep, (Ncv32f)scale*scale,
-            gpuAllocator, cpuAllocator, devPropMajor, devPropMinor, cuStream);
-        ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
+            gpuAllocator, cpuAllocator, devProp, cuStream);
+        ncvAssertReturnNcvStat(ncvStat);
 
         NCV_SKIP_COND_BEGIN
 
@@ -2250,6 +2248,10 @@ NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
     return ncvStat;
 }
 
+
+
+
+
 NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,
                                    Ncv32u &numHypotheses,
                                    Ncv32u minNeighbors,
@@ -2539,7 +2541,7 @@ NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,
 }
 
 
-NCVStatus ncvHaarStoreNVBIN_host(std::string &filename,
+NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,
                                  HaarClassifierCascadeDescriptor haar,
                                  NCVVector<HaarStage64> &h_HaarStages,
                                  NCVVector<HaarClassifierNode128> &h_HaarNodes,
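
The ncvAssertReturnNcvStat replacements above propagate the failing status code itself instead of collapsing every failure to NCV_NPP_ERROR. The macro is defined in NCV.hpp; a sketch of its assumed expansion, under the assumption that it simply checks and forwards the status:

// Assumed shape only; the real NCV.hpp definition may additionally log the failure.
#define ncvAssertReturnNcvStat_sketch(ncvOp) \
    do \
    { \
        NCVStatus _ncvStat = (ncvOp); \
        if (NCV_SUCCESS != _ncvStat) \
            return _ncvStat; \
    } while (0)
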
diff --git a/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp b/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp
index d9f500d7a0f48b2313fe2ac32cb892acea4ed6f3..bb463d8f4c7c37ca52f6112c8bdb5bc640ae7142 100644
--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp
@@ -75,13 +75,13 @@ struct HaarFeature64
 
 #define HaarFeature64_CreateCheck_MaxRectField                  0xFF
 
-    __host__ NCVStatus setRect(Ncv32u rectX, Ncv32u rectY, Ncv32u rectWidth, Ncv32u rectHeight, Ncv32u clsWidth, Ncv32u clsHeight)
+    __host__ NCVStatus setRect(Ncv32u rectX, Ncv32u rectY, Ncv32u rectWidth, Ncv32u rectHeight, Ncv32u /*clsWidth*/, Ncv32u /*clsHeight*/)
     {
         ncvAssertReturn(rectWidth <= HaarFeature64_CreateCheck_MaxRectField && rectHeight <= HaarFeature64_CreateCheck_MaxRectField, NCV_HAAR_TOO_LARGE_FEATURES);
-        ((NcvRect8u*)&(this->_ui2.x))->x = rectX;
-        ((NcvRect8u*)&(this->_ui2.x))->y = rectY;
-        ((NcvRect8u*)&(this->_ui2.x))->width = rectWidth;
-        ((NcvRect8u*)&(this->_ui2.x))->height = rectHeight;
+        ((NcvRect8u*)&(this->_ui2.x))->x = (Ncv8u)rectX;
+        ((NcvRect8u*)&(this->_ui2.x))->y = (Ncv8u)rectY;
+        ((NcvRect8u*)&(this->_ui2.x))->width = (Ncv8u)rectWidth;
+        ((NcvRect8u*)&(this->_ui2.x))->height = (Ncv8u)rectHeight;
         return NCV_SUCCESS;
     }
 
@@ -306,11 +306,11 @@ struct HaarStage64
 };
 
 
-NPPST_CT_ASSERT(sizeof(HaarFeature64) == 8);
-NPPST_CT_ASSERT(sizeof(HaarFeatureDescriptor32) == 4);
-NPPST_CT_ASSERT(sizeof(HaarClassifierNodeDescriptor32) == 4);
-NPPST_CT_ASSERT(sizeof(HaarClassifierNode128) == 16);
-NPPST_CT_ASSERT(sizeof(HaarStage64) == 8);
+NCV_CT_ASSERT(sizeof(HaarFeature64) == 8);
+NCV_CT_ASSERT(sizeof(HaarFeatureDescriptor32) == 4);
+NCV_CT_ASSERT(sizeof(HaarClassifierNodeDescriptor32) == 4);
+NCV_CT_ASSERT(sizeof(HaarClassifierNode128) == 16);
+NCV_CT_ASSERT(sizeof(HaarStage64) == 8);
 
 
 //==============================================================================
@@ -347,7 +347,7 @@ enum
     NCVPipeObjDet_VisualizeInPlace      = 0x004,
 };
 
-
+NCV_EXPORTS
 NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
                                             NcvSize32u srcRoi,
                                             NCVVector<NcvRect32u> &d_dstRects,
@@ -367,15 +367,14 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
 
                                             INCVMemAllocator &gpuAllocator,
                                             INCVMemAllocator &cpuAllocator,
-                                            Ncv32u devPropMajor,
-                                            Ncv32u devPropMinor,
+                                            cudaDeviceProp &devProp,
                                             cudaStream_t cuStream);
 
 
 #define OBJDET_MASK_ELEMENT_INVALID_32U     0xFFFFFFFF
 #define HAAR_STDDEV_BORDER                  1
 
-
+NCV_EXPORTS
 NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImage,
                                                NCVMatrix<Ncv32f> &d_weights,
                                                NCVMatrixAlloc<Ncv32u> &d_pixelMask,
@@ -391,11 +390,10 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
                                                Ncv32f scaleArea,
                                                INCVMemAllocator &gpuAllocator,
                                                INCVMemAllocator &cpuAllocator,
-                                               Ncv32u devPropMajor,
-                                               Ncv32u devPropMinor,
+                                               cudaDeviceProp &devProp,
                                                cudaStream_t cuStream);
 
-
+NCV_EXPORTS
 NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
                                              NCVMatrix<Ncv32f> &h_weights,
                                              NCVMatrixAlloc<Ncv32u> &h_pixelMask,
@@ -409,7 +407,7 @@ NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
                                              Ncv32u pixelStep,
                                              Ncv32f scaleArea);
 
-
+NCV_EXPORTS
 NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
                                  Ncv32u dstStride,
                                  Ncv32u dstWidth,
@@ -419,7 +417,7 @@ NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
                                  Ncv8u color,
                                  cudaStream_t cuStream);
 
-
+NCV_EXPORTS
 NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
                                   Ncv32u dstStride,
                                   Ncv32u dstWidth,
@@ -429,7 +427,7 @@ NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
                                   Ncv32u color,
                                   cudaStream_t cuStream);
 
-
+NCV_EXPORTS
 NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
                                Ncv32u dstStride,
                                Ncv32u dstWidth,
@@ -438,7 +436,7 @@ NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
                                Ncv32u numRects,
                                Ncv8u color);
 
-
+NCV_EXPORTS
 NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
                                 Ncv32u dstStride,
                                 Ncv32u dstWidth,
@@ -450,7 +448,7 @@ NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
 
 #define RECT_SIMILARITY_PROPORTION      0.2f
 
-
+NCV_EXPORTS
 NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
                                          Ncv32u numPixelMaskDetections,
                                          NCVVector<NcvRect32u> &hypotheses,
@@ -461,7 +459,7 @@ NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
                                          Ncv32f curScale,
                                          cudaStream_t cuStream);
 
-
+NCV_EXPORTS
 NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
                                        Ncv32u numPixelMaskDetections,
                                        NCVVector<NcvRect32u> &hypotheses,
@@ -471,18 +469,18 @@ NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
                                        Ncv32u rectHeight,
                                        Ncv32f curScale);
 
-
+NCV_EXPORTS
 NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,
                                    Ncv32u &numHypotheses,
                                    Ncv32u minNeighbors,
                                    Ncv32f intersectEps,
                                    NCVVector<Ncv32u> *hypothesesWeights);
 
-
+NCV_EXPORTS
 NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,
                                    Ncv32u &numNodes, Ncv32u &numFeatures);
 
-
+NCV_EXPORTS
 NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,
                                    HaarClassifierCascadeDescriptor &haar,
                                    NCVVector<HaarStage64> &h_HaarStages,
@@ -490,6 +488,7 @@ NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,
                                    NCVVector<HaarFeature64> &h_HaarFeatures);
 
 
+NCV_EXPORTS
 NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,
                                  HaarClassifierCascadeDescriptor haar,
                                  NCVVector<HaarStage64> &h_HaarStages,
diff --git a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
new file mode 100644
index 0000000000000000000000000000000000000000..66f9dcdbbd44667a2b447e0bae94455e93140411
--- /dev/null
+++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
@@ -0,0 +1,1704 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// 
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#include <vector>
+#include <cuda_runtime.h>
+#include "NPP_staging.hpp"
+
+#if defined _SELF_TEST_
+#include <stdio.h>
+#endif
+
+
+texture<Ncv8u,  1, cudaReadModeElementType> tex8u;
+texture<Ncv32u, 1, cudaReadModeElementType> tex32u;
+texture<uint2,  1, cudaReadModeElementType> tex64u;
+
+
+//==============================================================================
+//
+// CUDA streams handling
+//
+//==============================================================================
+
+
+static cudaStream_t nppStream = 0;
+
+
+cudaStream_t nppStGetActiveCUDAstream(void)
+{
+    return nppStream;
+}
+
+
+
+cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream)
+{
+    cudaStream_t tmp = nppStream;
+    nppStream = cudaStream;
+    return tmp;
+}
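+
+// Illustrative usage sketch (not part of the original file): every nppiSt*
+// kernel below launches on the stream returned by nppStGetActiveCUDAstream(),
+// so a caller can route the library onto its own stream and restore the
+// previous one afterwards ("myStream" is a hypothetical user stream):
+//
+//     cudaStream_t myStream;
+//     cudaStreamCreate(&myStream);
+//     cudaStream_t prev = nppStSetActiveCUDAstream(myStream);
+//     /* ... nppiSt* calls ... */
+//     nppStSetActiveCUDAstream(prev);
+//     cudaStreamDestroy(myStream);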
+
+
+//==============================================================================
+//
+// BlockScan.cuh
+//
+//==============================================================================
+
+
+//Almost the same as a naive scan1Inclusive, but doesn't need __syncthreads(),
+//assuming size <= WARP_SIZE and size is a power of 2
+template <class T>
+inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
+{
+    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
+    s_Data[pos] = 0;
+    pos += K_WARP_SIZE;
+    s_Data[pos] = idata;
+
+    for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
+    {
+        s_Data[pos] += s_Data[pos - offset];
+    }
+
+    return s_Data[pos];
+}
+
+
+template <class T>
+inline __device__ T warpScanExclusive(T idata, volatile T *s_Data)
+{
+    return warpScanInclusive(idata, s_Data) - idata;
+}
+
+
+template <class T, Ncv32u tiNumScanThreads>
+inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)
+{
+    if (tiNumScanThreads > K_WARP_SIZE)
+    {
+        //Bottom-level inclusive warp scan
+        T warpResult = warpScanInclusive(idata, s_Data);
+
+        //Save top elements of each warp for exclusive warp scan
+        //sync to wait for warp scans to complete (because s_Data is being overwritten)
+        __syncthreads();
+        if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
+        {
+            s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
+        }
+
+        //wait for warp scans to complete
+        __syncthreads();
+
+        if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
+        {
+            //grab top warp elements
+            T val = s_Data[threadIdx.x];
+            //calculate exclusive scan and write back to shared memory
+            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
+        }
+
+        //return updated warp scans with exclusive scan results
+        __syncthreads();
+        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
+    }
+    else
+    {
+        return warpScanInclusive(idata, s_Data);
+    }
+}
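+
+// Worked example (illustrative, assuming a warp size of 4 for brevity): with
+// per-thread inputs {3, 1, 4, 1 | 5, 9, 2, 6}, each warp scans locally to
+// {3, 4, 8, 9 | 5, 14, 16, 22}; the per-warp totals {9, 22} are scanned
+// exclusively to {0, 9}, and adding those offsets back yields the block-wide
+// inclusive scan {3, 4, 8, 9 | 14, 23, 25, 31}.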
+
+
+//==============================================================================
+//
+// IntegralImage.cu
+//
+//==============================================================================
+
+
+const Ncv32u NUM_SCAN_THREADS = 256;
+const Ncv32u LOG2_NUM_SCAN_THREADS = 8;
+
+
+template<class T_in, class T_out>
+struct _scanElemOp
+{
+    template<bool tbDoSqr>
+    static inline __host__ __device__ T_out scanElemOp(T_in elem);
+
+    template<>
+    static inline __host__ __device__ T_out scanElemOp<false>(T_in elem)
+    {
+        return (T_out)elem;
+    }
+
+    template<>
+    static inline __host__ __device__ T_out scanElemOp<true>(T_in elem)
+    {
+        return (T_out)(elem*elem);
+    }
+};
+
+
+template<class T>
+inline __device__ T readElem(T *d_src, Ncv32u srcStride, Ncv32u curElemOffs);
+
+
+template<>
+inline __device__ Ncv8u readElem<Ncv8u>(Ncv8u *d_src, Ncv32u srcStride, Ncv32u curElemOffs)
+{
+    return tex1Dfetch(tex8u, srcStride * blockIdx.x + curElemOffs);
+}
+
+
+template<>
+inline __device__ Ncv32u readElem<Ncv32u>(Ncv32u *d_src, Ncv32u srcStride, Ncv32u curElemOffs)
+{
+    return d_src[curElemOffs];
+}
+
+
+template<>
+inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u srcStride, Ncv32u curElemOffs)
+{
+    return d_src[curElemOffs];
+}
+
+
+/**
+* \brief Segmented scan kernel
+*
+* Calculates per-row prefix scans of the input image.
+* Out-of-bounds safe: reads 'size' elements, writes 'size+1' elements
+*
+* \tparam T_in      Type of input image elements
+* \tparam T_out     Type of output image elements
+* \tparam T_op      Defines an operation to be performed on the input image pixels
+*
+* \param d_src      [IN] Source image pointer
+* \param srcWidth   [IN] Source image width
+* \param srcStride  [IN] Source image stride
+* \param d_II       [OUT] Output image pointer
+* \param IIstride   [IN] Output image stride
+*
+* \return None
+*/
+template <class T_in, class T_out, bool tbDoSqr>
+__global__ void scanRows(T_in *d_src, Ncv32u srcWidth, Ncv32u srcStride,
+                         T_out *d_II, Ncv32u IIstride)
+{
+    //advance pointers to the current line
+    if (sizeof(T_in) != 1)
+    {
+        d_src += srcStride * blockIdx.x;
+    }
+    //for an 8-bit source image we read through the texture reference tex8u instead
+    d_II += IIstride * blockIdx.x;
+
+    Ncv32u numBuckets = (srcWidth + NUM_SCAN_THREADS - 1) >> LOG2_NUM_SCAN_THREADS;
+    Ncv32u offsetX = 0;
+
+    __shared__ T_out shmem[NUM_SCAN_THREADS * 2];
+    __shared__ T_out carryElem;
+    carryElem = 0;
+    __syncthreads();
+
+    while (numBuckets--)
+    {
+        Ncv32u curElemOffs = offsetX + threadIdx.x;
+        T_out curScanElem;
+
+        T_in curElem = 0; //zero-init keeps threads past srcWidth from feeding uninitialized data into the scan
+        T_out curElemMod;
+
+        if (curElemOffs < srcWidth)
+        {
+            //load elements
+            curElem = readElem<T_in>(d_src, srcStride, curElemOffs);
+        }
+        curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem);
+
+        //inclusive scan
+        curScanElem = blockScanInclusive<T_out, NUM_SCAN_THREADS>(curElemMod, shmem);
+
+        if (curElemOffs <= srcWidth)
+        {
+            //make scan exclusive and write the bucket to the output buffer
+            d_II[curElemOffs] = carryElem + curScanElem - curElemMod;
+            offsetX += NUM_SCAN_THREADS;
+        }
+
+        //remember last element for subsequent buckets adjustment
+        __syncthreads();
+        if (threadIdx.x == NUM_SCAN_THREADS-1)
+        {
+            carryElem += curScanElem;
+        }
+        __syncthreads();
+    }
+
+    if (offsetX == srcWidth && !threadIdx.x)
+    {
+        d_II[offsetX] = carryElem;
+    }
+}
+
+
+template <bool tbDoSqr, class T_in, class T_out>
+NCVStatus scanRowsWrapperDevice(T_in *d_src, Ncv32u srcStride,
+                                T_out *d_dst, Ncv32u dstStride, NcvSize32u roi)
+{
+    cudaChannelFormatDesc cfdTex;
+    if (sizeof(T_in) == 1)
+    {
+        cfdTex = cudaCreateChannelDesc<Ncv8u>();
+        size_t alignmentOffset;
+        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
+        ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
+    }
+    scanRows
+        <T_in, T_out, tbDoSqr>
+        <<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>>
+        (d_src, roi.width, srcStride, d_dst, dstStride);
+    ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
+
+#if defined _SELF_TEST_
+    T_in *h_src;
+    T_out *h_dst;
+    ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStride * roi.height * sizeof(T_in)), NPPST_MEM_ALLOC_ERR);
+    ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * roi.height * sizeof(T_out)), NPPST_MEM_ALLOC_ERR);
+    memset(h_src, 0, srcStride * roi.height * sizeof(T_in));
+    memset(h_dst, 0, dstStride * roi.height * sizeof(T_out));
+    ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStride * roi.height * sizeof(T_in), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
+    ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * roi.height * sizeof(T_out), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
+    NcvBool bPass = true;
+    for (Ncv32u i=0; i<roi.height && bPass; i++)
+    {
+        T_out curElem = 0;
+        for (Ncv32u j=0; j<roi.width+1 && bPass; j++)
+        {
+            if (curElem != h_dst[i * dstStride + j])
+            {
+                printf("CIntegralImage::scanRowsWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, curElem, h_dst[i * dstStride + j]);
+                bPass = false;
+            }
+            if (j < roi.width)
+            {
+                curElem += _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(h_src[i*srcStride+j]);
+            }
+        }
+    }
+    ncvAssertCUDAReturn(cudaFreeHost(h_src), NPPST_MEMFREE_ERR);
+    ncvAssertCUDAReturn(cudaFreeHost(h_dst), NPPST_MEMFREE_ERR);
+    printf("CIntegralImage::scanRowsWrapperDevice %s\n", bPass?"PASSED":"FAILED");
+#endif
+
+    return NPPST_SUCCESS;
+}
+
+
+Ncv32u getPaddedDimension(Ncv32u dim, Ncv32u elemTypeSize, Ncv32u allocatorAlignment)
+{
+    Ncv32u alignMask = allocatorAlignment-1;
+    Ncv32u inverseAlignMask = ~alignMask;
+    Ncv32u dimBytes = dim * elemTypeSize;
+    Ncv32u pitch = (dimBytes + alignMask) & inverseAlignMask;
+    Ncv32u PaddedDim = pitch / elemTypeSize;
+    return PaddedDim;
+}
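+
+// Worked example (illustrative): dim = 1001 elements of Ncv32u (4 bytes each)
+// with allocatorAlignment = 512 gives dimBytes = 4004, a pitch of 4096 bytes
+// (rounded up to the next 512-byte boundary), and a padded dimension of
+// 4096 / 4 = 1024 elements.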
+
+
+template <class T_in, class T_out>
+NCVStatus ncvIntegralImage_device(T_in *d_src, Ncv32u srcStep,
+                                  T_out *d_dst, Ncv32u dstStep, NcvSize32u roi,
+                                  INCVMemAllocator &gpuAllocator)
+{
+    ncvAssertReturn(sizeof(T_out) == sizeof(Ncv32u), NPPST_MEM_INTERNAL_ERROR);
+    ncvAssertReturn(gpuAllocator.memType() == NCVMemoryTypeDevice ||
+                      gpuAllocator.memType() == NCVMemoryTypeNone, NPPST_MEM_RESIDENCE_ERROR);
+    ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
+    ncvAssertReturn((d_src != NULL && d_dst != NULL) || gpuAllocator.isCounting(), NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(roi.width > 0 && roi.height > 0, NPPST_INVALID_ROI);
+    ncvAssertReturn(srcStep >= roi.width * sizeof(T_in) &&
+                      dstStep >= (roi.width + 1) * sizeof(T_out) &&
+                      srcStep % sizeof(T_in) == 0 &&
+                      dstStep % sizeof(T_out) == 0, NPPST_INVALID_STEP);
+    srcStep /= sizeof(T_in);
+    dstStep /= sizeof(T_out);
+
+    Ncv32u WidthII = roi.width + 1;
+    Ncv32u HeightII = roi.height + 1;
+    Ncv32u PaddedWidthII32 = getPaddedDimension(WidthII, sizeof(Ncv32u), gpuAllocator.alignment());
+    Ncv32u PaddedHeightII32 = getPaddedDimension(HeightII, sizeof(Ncv32u), gpuAllocator.alignment());
+
+    NCVMatrixAlloc<T_out> Tmp32_1(gpuAllocator, PaddedWidthII32, PaddedHeightII32);
+    ncvAssertReturn(gpuAllocator.isCounting() || Tmp32_1.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
+    NCVMatrixAlloc<T_out> Tmp32_2(gpuAllocator, PaddedHeightII32, PaddedWidthII32);
+    ncvAssertReturn(gpuAllocator.isCounting() || Tmp32_2.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
+    ncvAssertReturn(Tmp32_1.pitch() * Tmp32_1.height() == Tmp32_2.pitch() * Tmp32_2.height(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVStatus ncvStat;
+    NCV_SET_SKIP_COND(gpuAllocator.isCounting());
+
+    NCV_SKIP_COND_BEGIN
+
+    ncvStat = scanRowsWrapperDevice
+        <false>
+        (d_src, srcStep, Tmp32_1.ptr(), PaddedWidthII32, roi);
+    ncvAssertReturnNcvStat(ncvStat);
+
+    ncvStat = nppiStTranspose_32u_C1R((Ncv32u *)Tmp32_1.ptr(), PaddedWidthII32*sizeof(Ncv32u),
+                                      (Ncv32u *)Tmp32_2.ptr(), PaddedHeightII32*sizeof(Ncv32u), NcvSize32u(WidthII, roi.height));
+    ncvAssertReturnNcvStat(ncvStat);
+
+    ncvStat = scanRowsWrapperDevice
+        <false>
+        (Tmp32_2.ptr(), PaddedHeightII32, Tmp32_1.ptr(), PaddedHeightII32, NcvSize32u(roi.height, WidthII));
+    ncvAssertReturnNcvStat(ncvStat);
+
+    ncvStat = nppiStTranspose_32u_C1R((Ncv32u *)Tmp32_1.ptr(), PaddedHeightII32*sizeof(Ncv32u),
+                                      (Ncv32u *)d_dst, dstStep*sizeof(Ncv32u), NcvSize32u(HeightII, WidthII));
+    ncvAssertReturnNcvStat(ncvStat);
+
+    NCV_SKIP_COND_END
+
+    return NPPST_SUCCESS;
+}
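+
+// Overview (illustrative): the integral image is built from two row-scan
+// passes separated by transposes, conceptually
+//     II = transpose(scanRows(transpose(scanRows(src))))
+// so columns become rows for the second scan and no dedicated column-scan
+// kernel is needed.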
+
+
+NCVStatus ncvSquaredIntegralImage_device(Ncv8u *d_src, Ncv32u srcStep,
+                                         Ncv64u *d_dst, Ncv32u dstStep, NcvSize32u roi,
+                                         INCVMemAllocator &gpuAllocator)
+{
+    ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
+    ncvAssertReturn(gpuAllocator.memType() == NCVMemoryTypeDevice ||
+                      gpuAllocator.memType() == NCVMemoryTypeNone, NPPST_MEM_RESIDENCE_ERROR);
+    ncvAssertReturn((d_src != NULL && d_dst != NULL) || gpuAllocator.isCounting(), NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(roi.width > 0 && roi.height > 0, NPPST_INVALID_ROI);
+    ncvAssertReturn(srcStep >= roi.width &&
+                      dstStep >= (roi.width + 1) * sizeof(Ncv64u) &&
+                      dstStep % sizeof(Ncv64u) == 0, NPPST_INVALID_STEP);
+    dstStep /= sizeof(Ncv64u);
+
+    Ncv32u WidthII = roi.width + 1;
+    Ncv32u HeightII = roi.height + 1;
+    Ncv32u PaddedWidthII32 = getPaddedDimension(WidthII, sizeof(Ncv32u), gpuAllocator.alignment());
+    Ncv32u PaddedHeightII32 = getPaddedDimension(HeightII, sizeof(Ncv32u), gpuAllocator.alignment());
+    Ncv32u PaddedWidthII64 = getPaddedDimension(WidthII, sizeof(Ncv64u), gpuAllocator.alignment());
+    Ncv32u PaddedHeightII64 = getPaddedDimension(HeightII, sizeof(Ncv64u), gpuAllocator.alignment());
+    Ncv32u PaddedWidthMax = PaddedWidthII32 > PaddedWidthII64 ? PaddedWidthII32 : PaddedWidthII64;
+    Ncv32u PaddedHeightMax = PaddedHeightII32 > PaddedHeightII64 ? PaddedHeightII32 : PaddedHeightII64;
+
+    NCVMatrixAlloc<Ncv32u> Tmp32_1(gpuAllocator, PaddedWidthII32, PaddedHeightII32);
+    ncvAssertReturn(Tmp32_1.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
+    NCVMatrixAlloc<Ncv64u> Tmp64(gpuAllocator, PaddedWidthMax, PaddedHeightMax);
+    ncvAssertReturn(Tmp64.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVMatrixReuse<Ncv32u> Tmp32_2(Tmp64.getSegment(), gpuAllocator.alignment(), PaddedWidthII32, PaddedHeightII32);
+    ncvAssertReturn(Tmp32_2.isMemReused(), NPPST_MEM_INTERNAL_ERROR);
+    NCVMatrixReuse<Ncv64u> Tmp64_2(Tmp64.getSegment(), gpuAllocator.alignment(), PaddedWidthII64, PaddedHeightII64);
+    ncvAssertReturn(Tmp64_2.isMemReused(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVStatus ncvStat;
+    NCV_SET_SKIP_COND(gpuAllocator.isCounting());
+
+    NCV_SKIP_COND_BEGIN
+
+    ncvStat = scanRowsWrapperDevice
+        <true, Ncv8u, Ncv32u>
+        (d_src, srcStep, Tmp32_2.ptr(), PaddedWidthII32, roi);
+    ncvAssertReturnNcvStat(ncvStat);
+
+    ncvStat = nppiStTranspose_32u_C1R(Tmp32_2.ptr(), PaddedWidthII32*sizeof(Ncv32u),
+                                      Tmp32_1.ptr(), PaddedHeightII32*sizeof(Ncv32u), NcvSize32u(WidthII, roi.height));
+    ncvAssertReturnNcvStat(ncvStat);
+
+    ncvStat = scanRowsWrapperDevice
+        <false, Ncv32u, Ncv64u>
+        (Tmp32_1.ptr(), PaddedHeightII32, Tmp64_2.ptr(), PaddedHeightII64, NcvSize32u(roi.height, WidthII));
+    ncvAssertReturnNcvStat(ncvStat);
+
+    ncvStat = nppiStTranspose_64u_C1R(Tmp64_2.ptr(), PaddedHeightII64*sizeof(Ncv64u),
+                                      d_dst, dstStep*sizeof(Ncv64u), NcvSize32u(HeightII, WidthII));
+    ncvAssertReturnNcvStat(ncvStat);
+
+    NCV_SKIP_COND_END
+
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppiStIntegralGetSize_8u32u(NcvSize32u roiSize, Ncv32u *pBufsize, cudaDeviceProp &devProp)
+{
+    ncvAssertReturn(pBufsize != NULL, NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
+
+    NCVMemStackAllocator gpuCounter(devProp.textureAlignment);
+    ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVStatus ncvStat = ncvIntegralImage_device((Ncv8u*)NULL, roiSize.width,
+                                                  (Ncv32u*)NULL, (roiSize.width+1) * sizeof(Ncv32u),
+                                                  roiSize, gpuCounter);
+    ncvAssertReturnNcvStat(ncvStat);
+
+    *pBufsize = (Ncv32u)gpuCounter.maxSize();
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppiStIntegralGetSize_32f32f(NcvSize32u roiSize, Ncv32u *pBufsize, cudaDeviceProp &devProp)
+{
+    ncvAssertReturn(pBufsize != NULL, NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
+
+    NCVMemStackAllocator gpuCounter(devProp.textureAlignment);
+    ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVStatus ncvStat = ncvIntegralImage_device((Ncv32f*)NULL, roiSize.width * sizeof(Ncv32f),
+                                                  (Ncv32f*)NULL, (roiSize.width+1) * sizeof(Ncv32f),
+                                                  roiSize, gpuCounter);
+    ncvAssertReturnNcvStat(ncvStat);
+
+    *pBufsize = (Ncv32u)gpuCounter.maxSize();
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppiStSqrIntegralGetSize_8u64u(NcvSize32u roiSize, Ncv32u *pBufsize, cudaDeviceProp &devProp)
+{
+    ncvAssertReturn(pBufsize != NULL, NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
+
+    NCVMemStackAllocator gpuCounter(devProp.textureAlignment);
+    ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVStatus ncvStat = ncvSquaredIntegralImage_device(NULL, roiSize.width,
+                                                         NULL, (roiSize.width+1) * sizeof(Ncv64u),
+                                                         roiSize, gpuCounter);
+    ncvAssertReturnNcvStat(ncvStat);
+
+    *pBufsize = (Ncv32u)gpuCounter.maxSize();
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppiStIntegral_8u32u_C1R(Ncv8u *d_src, Ncv32u srcStep,
+                                   Ncv32u *d_dst, Ncv32u dstStep,
+                                   NcvSize32u roiSize, Ncv8u *pBuffer,
+                                   Ncv32u bufSize, cudaDeviceProp &devProp)
+{
+    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, devProp.textureAlignment, pBuffer);
+    ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVStatus ncvStat = ncvIntegralImage_device(d_src, srcStep, d_dst, dstStep, roiSize, gpuAllocator);
+    ncvAssertReturnNcvStat(ncvStat);
+
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppiStIntegral_32f32f_C1R(Ncv32f *d_src, Ncv32u srcStep,
+                                    Ncv32f *d_dst, Ncv32u dstStep,
+                                    NcvSize32u roiSize, Ncv8u *pBuffer,
+                                    Ncv32u bufSize, cudaDeviceProp &devProp)
+{
+    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, devProp.textureAlignment, pBuffer);
+    ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVStatus ncvStat = ncvIntegralImage_device(d_src, srcStep, d_dst, dstStep, roiSize, gpuAllocator);
+    ncvAssertReturnNcvStat(ncvStat);
+
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppiStSqrIntegral_8u64u_C1R(Ncv8u *d_src, Ncv32u srcStep,
+                                      Ncv64u *d_dst, Ncv32u dstStep,
+                                      NcvSize32u roiSize, Ncv8u *pBuffer,
+                                      Ncv32u bufSize, cudaDeviceProp &devProp)
+{
+    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, devProp.textureAlignment, pBuffer);
+    ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVStatus ncvStat = ncvSquaredIntegralImage_device(d_src, srcStep, d_dst, dstStep, roiSize, gpuAllocator);
+    ncvAssertReturnNcvStat(ncvStat);
+
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppiStIntegral_8u32u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
+                                        Ncv32u *h_dst, Ncv32u dstStep,
+                                        NcvSize32u roiSize)
+{
+    ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
+    ncvAssertReturn(srcStep >= roiSize.width &&
+                      dstStep >= (roiSize.width + 1) * sizeof(Ncv32u) &&
+                      dstStep % sizeof(Ncv32u) == 0, NPPST_INVALID_STEP);
+    dstStep /= sizeof(Ncv32u);
+
+    Ncv32u WidthII = roiSize.width + 1;
+    Ncv32u HeightII = roiSize.height + 1;
+
+    memset(h_dst, 0, WidthII * sizeof(Ncv32u));
+    for (Ncv32u i=1; i<HeightII; i++)
+    {
+        h_dst[i * dstStep] = 0;
+        for (Ncv32u j=1; j<WidthII; j++)
+        {
+            Ncv32u top = h_dst[(i-1) * dstStep + j];
+            Ncv32u left = h_dst[i * dstStep + (j - 1)];
+            Ncv32u topleft = h_dst[(i - 1) * dstStep + (j - 1)];
+            Ncv32u elem = h_src[(i - 1) * srcStep + (j - 1)];
+            h_dst[i * dstStep + j] = elem + left - topleft + top;
+        }
+    }
+
+    return NPPST_SUCCESS;
+}
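+
+// Worked example (illustrative): a 2x2 source {{1,2},{3,4}} produces the 3x3
+// integral image {{0,0,0},{0,1,3},{0,4,10}}; entry (i,j) holds the sum of all
+// source pixels above and to the left of (i,j).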
+
+
+NCVStatus nppiStIntegral_32f32f_C1R_host(Ncv32f *h_src, Ncv32u srcStep,
+                                         Ncv32f *h_dst, Ncv32u dstStep,
+                                         NcvSize32u roiSize)
+{
+    ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
+    ncvAssertReturn(srcStep >= roiSize.width * sizeof(Ncv32f) &&
+                      dstStep >= (roiSize.width + 1) * sizeof(Ncv32f) &&
+                      srcStep % sizeof(Ncv32f) == 0 &&
+                      dstStep % sizeof(Ncv32f) == 0, NPPST_INVALID_STEP);
+    srcStep /= sizeof(Ncv32f);
+    dstStep /= sizeof(Ncv32f);
+
+    Ncv32u WidthII = roiSize.width + 1;
+    Ncv32u HeightII = roiSize.height + 1;
+
+    memset(h_dst, 0, WidthII * sizeof(Ncv32f));
+    for (Ncv32u i=1; i<HeightII; i++)
+    {
+        h_dst[i * dstStep] = 0.0f;
+        for (Ncv32u j=1; j<WidthII; j++)
+        {
+            Ncv32f top = h_dst[(i-1) * dstStep + j];
+            Ncv32f left = h_dst[i * dstStep + (j - 1)];
+            Ncv32f topleft = h_dst[(i - 1) * dstStep + (j - 1)];
+            Ncv32f elem = h_src[(i - 1) * srcStep + (j - 1)];
+            h_dst[i * dstStep + j] = elem + left - topleft + top;
+        }
+    }
+
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppiStSqrIntegral_8u64u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
+                                           Ncv64u *h_dst, Ncv32u dstStep,
+                                           NcvSize32u roiSize)
+{
+    ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
+    ncvAssertReturn(srcStep >= roiSize.width &&
+                      dstStep >= (roiSize.width + 1) * sizeof(Ncv64u) &&
+                      dstStep % sizeof(Ncv64u) == 0, NPPST_INVALID_STEP);
+    dstStep /= sizeof(Ncv64u);
+
+    Ncv32u WidthII = roiSize.width + 1;
+    Ncv32u HeightII = roiSize.height + 1;
+
+    memset(h_dst, 0, WidthII * sizeof(Ncv64u));
+    for (Ncv32u i=1; i<HeightII; i++)
+    {
+        h_dst[i * dstStep] = 0;
+        for (Ncv32u j=1; j<WidthII; j++)
+        {
+            Ncv64u top = h_dst[(i-1) * dstStep + j];
+            Ncv64u left = h_dst[i * dstStep + (j - 1)];
+            Ncv64u topleft = h_dst[(i - 1) * dstStep + (j - 1)];
+            Ncv64u elem = h_src[(i - 1) * srcStep + (j - 1)];
+            h_dst[i * dstStep + j] = elem*elem + left - topleft + top;
+        }
+    }
+
+    return NPPST_SUCCESS;
+}
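+
+// Worked example (illustrative): the same 2x2 source {{1,2},{3,4}} produces
+// the squared integral image {{0,0,0},{0,1,5},{0,10,30}}, accumulating
+// elem*elem instead of elem (e.g. 1+4+9+16 = 30 in the bottom-right cell).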
+
+
+//==============================================================================
+//
+// DownsampleNearest.cu
+//
+//==============================================================================
+
+
+const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_X = 32;
+const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_Y = 8;
+
+
+template<class T, NcvBool tbCacheTexture>
+__device__ T getElem_DownsampleNearest(Ncv32u x, T *d_src);
+
+
+template<>
+__device__ Ncv32u getElem_DownsampleNearest<Ncv32u, true>(Ncv32u x, Ncv32u *d_src)
+{
+    return tex1Dfetch(tex32u, x);
+}
+
+
+template<>
+__device__ Ncv32u getElem_DownsampleNearest<Ncv32u, false>(Ncv32u x, Ncv32u *d_src)
+{
+    return d_src[x];
+}
+
+
+template<>
+__device__ Ncv64u getElem_DownsampleNearest<Ncv64u, true>(Ncv32u x, Ncv64u *d_src)
+{
+    uint2 tmp = tex1Dfetch(tex64u, x);
+    Ncv64u res = (Ncv64u)tmp.y;
+    res <<= 32;
+    res |= tmp.x;
+    return res;
+}
+
+
+template<>
+__device__ Ncv64u getElem_DownsampleNearest<Ncv64u, false>(Ncv32u x, Ncv64u *d_src)
+{
+    return d_src[x];
+}
+
+
+template <class T, NcvBool tbCacheTexture>
+__global__ void downsampleNearest_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep,
+                                      NcvSize32u dstRoi, Ncv32u scale)
+{
+    int curX = blockIdx.x * blockDim.x + threadIdx.x;
+    int curY = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (curX >= dstRoi.width || curY >= dstRoi.height)
+    {
+        return;
+    }
+
+    d_dst[curY * dstStep + curX] = getElem_DownsampleNearest<T, tbCacheTexture>((curY * srcStep + curX) * scale, d_src);
+}
+
+
+template <class T>
+static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
+                                                T *d_dst, Ncv32u dstStep,
+                                                NcvSize32u srcRoi, Ncv32u scale,
+                                                NcvBool readThruTexture)
+{
+    ncvAssertReturn(d_src != NULL && d_dst != NULL, NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0, NPPST_INVALID_ROI);
+    ncvAssertReturn(scale != 0, NPPST_INVALID_SCALE);
+    ncvAssertReturn(srcStep >= (Ncv32u)(srcRoi.width) * sizeof(T) &&
+                      dstStep >= (Ncv32u)(srcRoi.width * sizeof(T) / scale), NPPST_INVALID_STEP);
+    srcStep /= sizeof(T);
+    dstStep /= sizeof(T);
+
+    NcvSize32u dstRoi;
+    dstRoi.width = srcRoi.width / scale;
+    dstRoi.height = srcRoi.height / scale;
+
+    dim3 grid((dstRoi.width + NUM_DOWNSAMPLE_NEAREST_THREADS_X - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_X,
+              (dstRoi.height + NUM_DOWNSAMPLE_NEAREST_THREADS_Y - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_Y);
+    dim3 block(NUM_DOWNSAMPLE_NEAREST_THREADS_X, NUM_DOWNSAMPLE_NEAREST_THREADS_Y);
+
+    if (!readThruTexture)
+    {
+        downsampleNearest_C1R
+            <T, false>
+            <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
+            (d_src, srcStep, d_dst, dstStep, dstRoi, scale);
+    }
+    else
+    {
+        cudaChannelFormatDesc cfdTexSrc;
+
+        if (sizeof(T) == sizeof(Ncv32u))
+        {
+            cfdTexSrc = cudaCreateChannelDesc<Ncv32u>();
+
+            size_t alignmentOffset;
+            ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex32u, d_src, cfdTexSrc, srcRoi.height * srcStep * sizeof(T)), NPPST_TEXTURE_BIND_ERROR);
+            ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
+        }
+        else
+        {
+            cfdTexSrc = cudaCreateChannelDesc<uint2>();
+
+            size_t alignmentOffset;
+            ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex64u, d_src, cfdTexSrc, srcRoi.height * srcStep * sizeof(T)), NPPST_TEXTURE_BIND_ERROR);
+            ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
+        }
+
+        downsampleNearest_C1R
+            <T, true>
+            <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
+            (d_src, srcStep, d_dst, dstStep, dstRoi, scale);
+    }
+
+    ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
+
+#if defined _SELF_TEST_
+    T *h_src;
+    T *h_dst;
+    ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStep * srcRoi.height * sizeof(T)), NPPST_MEM_ALLOC_ERR);
+    ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStep * dstRoi.height * sizeof(T)), NPPST_MEM_ALLOC_ERR);
+    ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStep * srcRoi.height * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
+    ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStep * dstRoi.height * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
+
+    bool bPass = true;
+
+    for (Ncv32u i=0; i<dstRoi.height && bPass; i++)
+    {
+        for (Ncv32u j=0; j<dstRoi.width && bPass; j++)
+        {
+            if (h_dst[i*dstStep+j] != h_src[i*scale*srcStep + j*scale])
+            {
+                printf("::downsampleNearestWrapperDevice self test failed: i=%d, j=%d, cpu=%ld, gpu=%ld\n", i, j, (long long)h_src[i*scale*srcStep + j*scale], (long long)h_dst[i*dstStep+j]);
+                bPass = false;
+            }
+        }
+    }
+
+    ncvAssertCUDAReturn(cudaFreeHost(h_src), NPPST_MEMFREE_ERR);
+    ncvAssertCUDAReturn(cudaFreeHost(h_dst), NPPST_MEMFREE_ERR);
+    printf("::downsampleNearestWrapperDevice %s\n", bPass?"PASSED":"FAILED");
+#endif
+
+    return NPPST_SUCCESS;
+}
+
+
+template <class T>
+static NCVStatus downsampleNearestWrapperHost(T *h_src, Ncv32u srcStep,
+                                              T *h_dst, Ncv32u dstStep,
+                                              NcvSize32u srcRoi, Ncv32u scale)
+{
+    ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(srcRoi.width != 0 && srcRoi.height != 0, NPPST_INVALID_ROI);
+    ncvAssertReturn(scale != 0, NPPST_INVALID_SCALE);
+    ncvAssertReturn(srcStep >= (Ncv32u)(srcRoi.width) * sizeof(T) &&
+                      dstStep >= (Ncv32u)(srcRoi.width * sizeof(T) / scale) &&
+                      srcStep % sizeof(T) == 0 && dstStep % sizeof(T) == 0, NPPST_INVALID_STEP);
+    srcStep /= sizeof(T);
+    dstStep /= sizeof(T);
+
+    NcvSize32u dstRoi;
+    dstRoi.width = srcRoi.width / scale;
+    dstRoi.height = srcRoi.height / scale;
+
+    for (Ncv32u i=0; i<dstRoi.height; i++)
+    {
+        for (Ncv32u j=0; j<dstRoi.width; j++)
+        {
+            h_dst[i*dstStep+j] = h_src[i*scale*srcStep + j*scale];
+        }
+    }
+
+    return NPPST_SUCCESS;
+}
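+
+// Worked example (illustrative): with scale = 2 both wrappers keep every
+// second pixel in each dimension, i.e. dst[i][j] = src[2*i][2*j], so a 4x4
+// source yields a 2x2 result built from src[0][0], src[0][2], src[2][0] and
+// src[2][2].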
+
+
+#define implementNppDownsampleNearest(bit, typ) \
+    NCVStatus nppiStDownsampleNearest_##bit##typ##_C1R(Ncv##bit##typ *d_src, Ncv32u srcStep, \
+                                                     Ncv##bit##typ *d_dst, Ncv32u dstStep, \
+                                                     NcvSize32u srcRoi, Ncv32u scale, NcvBool readThruTexture) \
+    { \
+        return downsampleNearestWrapperDevice<Ncv##bit##u>((Ncv##bit##u *)d_src, srcStep, \
+                                                           (Ncv##bit##u *)d_dst, dstStep, \
+                                                           srcRoi, scale, readThruTexture); \
+    }
+
+
+#define implementNppDownsampleNearestHost(bit, typ) \
+    NCVStatus nppiStDownsampleNearest_##bit##typ##_C1R_host(Ncv##bit##typ *h_src, Ncv32u srcStep, \
+                                                          Ncv##bit##typ *h_dst, Ncv32u dstStep, \
+                                                          NcvSize32u srcRoi, Ncv32u scale) \
+    { \
+        return downsampleNearestWrapperHost<Ncv##bit##u>((Ncv##bit##u *)h_src, srcStep, \
+                                                         (Ncv##bit##u *)h_dst, dstStep, \
+                                                         srcRoi, scale); \
+    }
+
+
+implementNppDownsampleNearest(32, u)
+implementNppDownsampleNearest(32, s)
+implementNppDownsampleNearest(32, f)
+implementNppDownsampleNearest(64, u)
+implementNppDownsampleNearest(64, s)
+implementNppDownsampleNearest(64, f)
+implementNppDownsampleNearestHost(32, u)
+implementNppDownsampleNearestHost(32, s)
+implementNppDownsampleNearestHost(32, f)
+implementNppDownsampleNearestHost(64, u)
+implementNppDownsampleNearestHost(64, s)
+implementNppDownsampleNearestHost(64, f)
+
+
+//==============================================================================
+//
+// RectStdDev.cu
+//
+//==============================================================================
+
+
+const Ncv32u NUM_RECTSTDDEV_THREADS = 128;
+
+
+template <NcvBool tbCacheTexture>
+__device__ Ncv32u getElemSum(Ncv32u x, Ncv32u *d_sum)
+{
+    if (tbCacheTexture)
+    {
+        return tex1Dfetch(tex32u, x);
+    }
+    else
+    {
+        return d_sum[x];
+    }
+}
+
+
+template <NcvBool tbCacheTexture>
+__device__ Ncv64u getElemSqSum(Ncv32u x, Ncv64u *d_sqsum)
+{
+    if (tbCacheTexture)
+    {
+        uint2 tmp = tex1Dfetch(tex64u, x);
+        Ncv64u res = (Ncv64u)tmp.y;
+        res <<= 32;
+        res |= tmp.x;
+        return res;
+    }
+    else
+    {
+        return d_sqsum[x];
+    }
+}
+
+
+template <NcvBool tbCacheTexture>
+__global__ void rectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
+                                   Ncv64u *d_sqsum, Ncv32u sqsumStep,
+                                   Ncv32f *d_norm, Ncv32u normStep,
+                                   NcvSize32u roi, NcvRect32u rect, Ncv32f invRectArea)
+{
+    Ncv32u x_offs = blockIdx.x * NUM_RECTSTDDEV_THREADS + threadIdx.x;
+    if (x_offs >= roi.width)
+    {
+        return;
+    }
+
+    Ncv32u sum_offset = blockIdx.y * sumStep + x_offs;
+    Ncv32u sqsum_offset = blockIdx.y * sqsumStep + x_offs;
+
+    //OPT: try swapping order (could change cache hit/miss ratio)
+    Ncv32u sum_tl = getElemSum<tbCacheTexture>(sum_offset + rect.y * sumStep + rect.x, d_sum);
+    Ncv32u sum_bl = getElemSum<tbCacheTexture>(sum_offset + (rect.y + rect.height) * sumStep + rect.x, d_sum);
+    Ncv32u sum_tr = getElemSum<tbCacheTexture>(sum_offset + rect.y * sumStep + rect.x + rect.width, d_sum);
+    Ncv32u sum_br = getElemSum<tbCacheTexture>(sum_offset + (rect.y + rect.height) * sumStep + rect.x + rect.width, d_sum);
+    Ncv32u sum_val = sum_br + sum_tl - sum_tr - sum_bl;
+
+    Ncv64u sqsum_tl, sqsum_bl, sqsum_tr, sqsum_br;
+    sqsum_tl = getElemSqSum<tbCacheTexture>(sqsum_offset + rect.y * sqsumStep + rect.x, d_sqsum);
+    sqsum_bl = getElemSqSum<tbCacheTexture>(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x, d_sqsum);
+    sqsum_tr = getElemSqSum<tbCacheTexture>(sqsum_offset + rect.y * sqsumStep + rect.x + rect.width, d_sqsum);
+    sqsum_br = getElemSqSum<tbCacheTexture>(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x + rect.width, d_sqsum);
+    Ncv64u sqsum_val = sqsum_br + sqsum_tl - sqsum_tr - sqsum_bl;
+
+    Ncv32f mean = sum_val * invRectArea;
+
+    //////////////////////////////////////////////////////////////////////////
+    // sqsum_val_res = sqsum_val / rectArea
+    //////////////////////////////////////////////////////////////////////////
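+    // (The 64-bit sum of squares is split into a truncated float part and its
+    // residual, each scaled by invRectArea separately and then recombined;
+    // this recovers precision that a single 64-bit-to-float conversion would
+    // lose for large sums.)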
+
+    Ncv32f sqsum_val_1 = __ull2float_rz(sqsum_val);
+    Ncv64u sqsum_val_2 = __float2ull_rz(sqsum_val_1);
+    Ncv64u sqsum_val_3 = sqsum_val - sqsum_val_2;
+    Ncv32f sqsum_val_4 = __ull2float_rn(sqsum_val_3);
+    sqsum_val_1 *= invRectArea;
+    sqsum_val_4 *= invRectArea;
+    Ncv32f sqsum_val_res = sqsum_val_1 + sqsum_val_4;
+
+    //////////////////////////////////////////////////////////////////////////
+    // variance = sqsum_val_res - mean * mean
+    //////////////////////////////////////////////////////////////////////////
+
+#if defined DISABLE_MAD_SELECTIVELY
+    Ncv32f variance = sqsum_val_res - __fmul_rn(mean, mean);
+#else
+    Ncv32f variance = sqsum_val_res - mean * mean;
+#endif
+
+    //////////////////////////////////////////////////////////////////////////
+    // stddev = sqrtf(variance)
+    //////////////////////////////////////////////////////////////////////////
+
+    //Ncv32f stddev = sqrtf(variance);
+    Ncv32f stddev = __fsqrt_rn(variance);
+
+    d_norm[blockIdx.y * normStep + x_offs] = stddev;
+}
+
+
+NCVStatus nppiStRectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
+                                   Ncv64u *d_sqsum, Ncv32u sqsumStep,
+                                   Ncv32f *d_norm, Ncv32u normStep,
+                                   NcvSize32u roi, NcvRect32u rect,
+                                   Ncv32f scaleArea, NcvBool readThruTexture)
+{
+    ncvAssertReturn(d_sum != NULL && d_sqsum != NULL && d_norm != NULL, NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(roi.width > 0 && roi.height > 0, NPPST_INVALID_ROI);
+    ncvAssertReturn(sumStep >= (Ncv32u)(roi.width + rect.x + rect.width - 1) * sizeof(Ncv32u) &&
+                      sqsumStep >= (Ncv32u)(roi.width + rect.x + rect.width - 1) * sizeof(Ncv64u) &&
+                      normStep >= (Ncv32u)roi.width * sizeof(Ncv32f) &&
+                      sumStep % sizeof(Ncv32u) == 0 &&
+                      sqsumStep % sizeof(Ncv64u) == 0 &&
+                      normStep % sizeof(Ncv32f) == 0, NPPST_INVALID_STEP);
+    ncvAssertReturn(scaleArea >= 1.0f, NPPST_INVALID_SCALE);
+    sumStep /= sizeof(Ncv32u);
+    sqsumStep /= sizeof(Ncv64u);
+    normStep /= sizeof(Ncv32f);
+
+    Ncv32f rectArea = rect.width * rect.height * scaleArea;
+    Ncv32f invRectArea = 1.0f / rectArea;
+
+    dim3 grid(((roi.width + NUM_RECTSTDDEV_THREADS - 1) / NUM_RECTSTDDEV_THREADS), roi.height);
+    dim3 block(NUM_RECTSTDDEV_THREADS);
+
+    if (!readThruTexture)
+    {
+        rectStdDev_32f_C1R
+            <false>
+            <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
+            (d_sum, sumStep, d_sqsum, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
+    }
+    else
+    {
+        cudaChannelFormatDesc cfdTexSrc;
+        cudaChannelFormatDesc cfdTexSqr;
+        cfdTexSrc = cudaCreateChannelDesc<Ncv32u>();
+        cfdTexSqr = cudaCreateChannelDesc<uint2>();
+
+        size_t alignmentOffset;
+        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex32u, d_sum, cfdTexSrc, (roi.height + rect.y + rect.height) * sumStep * sizeof(Ncv32u)), NPPST_TEXTURE_BIND_ERROR);
+        ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
+        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex64u, d_sqsum, cfdTexSqr, (roi.height + rect.y + rect.height) * sqsumStep * sizeof(Ncv64u)), NPPST_TEXTURE_BIND_ERROR);
+        ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
+
+        rectStdDev_32f_C1R
+            <true>
+            <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
+            (NULL, sumStep, NULL, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
+    }
+
+    ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
+
+#if defined _SELF_TEST_
+    Ncv32u *h_sum;
+    Ncv64u *h_sqsum;
+    Ncv32f *h_norm_d;
+    Ncv32u ExtHeight = roi.height + rect.y + rect.height;
+    ncvAssertCUDAReturn(cudaMallocHost(&h_sum, sumStep * ExtHeight * sizeof(Ncv32u)), NPPST_MEM_ALLOC_ERR);
+    ncvAssertCUDAReturn(cudaMallocHost(&h_sqsum, sqsumStep * ExtHeight * sizeof(Ncv64u)), NPPST_MEM_ALLOC_ERR);
+    ncvAssertCUDAReturn(cudaMallocHost(&h_norm_d, normStep * roi.height * sizeof(Ncv32f)), NPPST_MEM_ALLOC_ERR);
+    ncvAssertCUDAReturn(cudaMemcpy(h_sum, d_sum, sumStep * ExtHeight * sizeof(Ncv32u), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
+    ncvAssertCUDAReturn(cudaMemcpy(h_sqsum, d_sqsum, sqsumStep * ExtHeight * sizeof(Ncv64u), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
+    ncvAssertCUDAReturn(cudaMemcpy(h_norm_d, d_norm, normStep * roi.height * sizeof(Ncv32f), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
+
+    Ncv32f *h_norm_h;
+    ncvAssertCUDAReturn(cudaMallocHost(&h_norm_h, normStep * roi.height * sizeof(Ncv32f)), NPPST_MEM_ALLOC_ERR);
+
+    ncvAssertReturnNcvStat(nppiStRectStdDev_32f_C1R_host(h_sum, sumStep * sizeof(Ncv32u), h_sqsum, sqsumStep * sizeof(Ncv64u), h_norm_h, normStep * sizeof(Ncv32f), roi, rect, scaleArea));
+
+    const Ncv64f relEPS = 0.005;
+    bool bPass = true;
+    for (Ncv32u i=0; i<roi.height && bPass; i++)
+    {
+        for (Ncv32u j=0; j<roi.width && bPass; j++)
+        {
+            Ncv64f absErr = fabs(h_norm_h[i * normStep + j] - h_norm_d[i * normStep + j]);
+            Ncv64f relErr = absErr / h_norm_h[i * normStep + j];
+
+            if (relErr > relEPS)
+            {
+                printf("::ncvRectStdDev_32f_C1R self test failed: i=%d, j=%d, cpu=%f, gpu=%f\n", i, j, h_norm_h[i * normStep + j], h_norm_d[i * normStep + j]);
+                bPass = false;
+            }
+        }
+    }
+    ncvAssertCUDAReturn(cudaFreeHost(h_sum), NPPST_MEMFREE_ERR);
+    ncvAssertCUDAReturn(cudaFreeHost(h_sqsum), NPPST_MEMFREE_ERR);
+    ncvAssertCUDAReturn(cudaFreeHost(h_norm_d), NPPST_MEMFREE_ERR);
+    ncvAssertCUDAReturn(cudaFreeHost(h_norm_h), NPPST_MEMFREE_ERR);
+    printf("::ncvRectStdDev_32f_C1R %s\n", bPass?"PASSED":"FAILED");
+#endif
+
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppiStRectStdDev_32f_C1R_host(Ncv32u *h_sum, Ncv32u sumStep,
+                                        Ncv64u *h_sqsum, Ncv32u sqsumStep,
+                                        Ncv32f *h_norm, Ncv32u normStep,
+                                        NcvSize32u roi, NcvRect32u rect,
+                                        Ncv32f scaleArea)
+{
+    ncvAssertReturn(h_sum != NULL && h_sqsum != NULL && h_norm != NULL, NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(roi.width > 0 && roi.height > 0, NPPST_INVALID_ROI);
+    ncvAssertReturn(sumStep >= (Ncv32u)(roi.width + rect.x + rect.width - 1) * sizeof(Ncv32u) &&
+                      sqsumStep >= (Ncv32u)(roi.width + rect.x + rect.width - 1) * sizeof(Ncv64u) &&
+                      normStep >= (Ncv32u)roi.width * sizeof(Ncv32f) &&
+                      sumStep % sizeof(Ncv32u) == 0 &&
+                      sqsumStep % sizeof(Ncv64u) == 0 &&
+                      normStep % sizeof(Ncv32f) == 0, NPPST_INVALID_STEP);
+    ncvAssertReturn(scaleArea >= 1.0f, NPPST_INVALID_SCALE);
+    sumStep /= sizeof(Ncv32u);
+    sqsumStep /= sizeof(Ncv64u);
+    normStep /= sizeof(Ncv32f);
+
+    Ncv32f rectArea = rect.width * rect.height * scaleArea;
+    Ncv32f invRectArea = 1.0f / rectArea;
+
+    for (Ncv32u i=0; i<roi.height; i++)
+    {
+        for (Ncv32u j=0; j<roi.width; j++)
+        {
+            Ncv32u sum_offset = i * sumStep + j;
+            Ncv32u sqsum_offset = i * sqsumStep + j;
+
+            Ncv32u sum_tl = h_sum[sum_offset + rect.y * sumStep + rect.x];
+            Ncv32u sum_bl = h_sum[sum_offset + (rect.y + rect.height) * sumStep + rect.x];
+            Ncv32u sum_tr = h_sum[sum_offset + rect.y * sumStep + rect.x + rect.width];
+            Ncv32u sum_br = h_sum[sum_offset + (rect.y + rect.height) * sumStep + rect.x + rect.width];
+            Ncv64f sum_val = sum_br + sum_tl - sum_tr - sum_bl;
+
+            Ncv64u sqsum_tl = h_sqsum[sqsum_offset + rect.y * sqsumStep + rect.x];
+            Ncv64u sqsum_bl = h_sqsum[sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x];
+            Ncv64u sqsum_tr = h_sqsum[sqsum_offset + rect.y * sqsumStep + rect.x + rect.width];
+            Ncv64u sqsum_br = h_sqsum[sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x + rect.width];
+            Ncv64f sqsum_val = (Ncv64f)(sqsum_br + sqsum_tl - sqsum_tr - sqsum_bl);
+
+            Ncv64f mean = sum_val * invRectArea;
+            Ncv64f sqsum_val_2 = sqsum_val / rectArea;
+            Ncv64f variance = sqsum_val_2 - mean * mean;
+
+            h_norm[i * normStep + j] = (Ncv32f)sqrt(variance);
+        }
+    }
+
+    return NPPST_SUCCESS;
+}
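+
+// Worked example (illustrative): if the rectangle covers source pixels
+// {1, 2, 3, 4} and scaleArea = 1, then mean = 10/4 = 2.5, the mean of squares
+// is 30/4 = 7.5, variance = 7.5 - 2.5*2.5 = 1.25, and the stored standard
+// deviation is sqrt(1.25) ~= 1.118.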
+
+
+//==============================================================================
+//
+// Transpose.cu
+//
+//==============================================================================
+
+
+const Ncv32u TRANSPOSE_TILE_DIM   = 16;
+const Ncv32u TRANSPOSE_BLOCK_ROWS = 16;
+
+
+/**
+* \brief Matrix transpose kernel
+*
+* Calculates transpose of the input image
+* \see TRANSPOSE_TILE_DIM
+*
+* \tparam T         Type of image elements
+*
+* \param d_src      [IN] Source image pointer
+* \param srcStride  [IN] Source image stride
+* \param d_dst      [OUT] Output image pointer
+* \param dstStride  [IN] Output image stride
+* \param srcRoi     [IN] Source image ROI
+*
+* \return None
+*/
+template <class T>
+__global__ void transpose(T *d_src, Ncv32u srcStride,
+                          T *d_dst, Ncv32u dstStride, NcvSize32u srcRoi)
+{
+    __shared__ T tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM+1];
+
+    Ncv32u blockIdx_x, blockIdx_y;
+
+    // do diagonal reordering
+    if (gridDim.x == gridDim.y)
+    {
+        blockIdx_y = blockIdx.x;
+        blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
+    }
+    else
+    {
+        Ncv32u bid = blockIdx.x + gridDim.x * blockIdx.y;
+        blockIdx_y = bid % gridDim.y;
+        blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x;
+    }
+
+    Ncv32u xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x;
+    Ncv32u yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y;
+    Ncv32u index_in = xIndex + yIndex * srcStride;
+
+    xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x;
+    yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y;
+    Ncv32u index_out = xIndex + yIndex * dstStride;
+
+    for (Ncv32u i=0; i<TRANSPOSE_TILE_DIM; i+=TRANSPOSE_BLOCK_ROWS)
+    {
+        tile[threadIdx.y+i][threadIdx.x] = d_src[index_in+i*srcStride];
+    }
+
+    __syncthreads();
+
+    if (xIndex < srcRoi.height)
+    {
+        for (Ncv32u i=0; i<TRANSPOSE_TILE_DIM; i+=TRANSPOSE_BLOCK_ROWS)
+        {
+            if (yIndex + i < srcRoi.width)
+            {
+                d_dst[index_out+i*dstStride] = tile[threadIdx.x][threadIdx.y+i];
+            }
+        }
+    }
+}
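+
+// Note on the diagonal reordering above (illustrative): remapping block (x, y)
+// to tile ((x + y) mod gridDim.x, x) spreads concurrently running blocks
+// across memory partitions instead of letting whole columns of blocks hit the
+// same partition at once ("partition camping"), improving effective bandwidth.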
+
+
+template <class T>
+NCVStatus transposeWrapperDevice(T *d_src, Ncv32u srcStride,
+                                   T *d_dst, Ncv32u dstStride, NcvSize32u srcRoi)
+{
+    ncvAssertReturn(d_src != NULL && d_dst != NULL, NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0, NPPST_INVALID_ROI);
+    ncvAssertReturn(srcStride >= srcRoi.width * sizeof(T) &&
+                      dstStride >= srcRoi.height * sizeof(T) &&
+                      srcStride % sizeof(T) == 0 && dstStride % sizeof(T) == 0, NPPST_INVALID_STEP);
+    srcStride /= sizeof(T);
+    dstStride /= sizeof(T);
+
+    dim3 grid((srcRoi.width + TRANSPOSE_TILE_DIM - 1) / TRANSPOSE_TILE_DIM,
+              (srcRoi.height + TRANSPOSE_TILE_DIM - 1) / TRANSPOSE_TILE_DIM);
+    dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM);
+    transpose
+        <T>
+        <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
+        (d_src, srcStride, d_dst, dstStride, srcRoi);
+    ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
+
+#if defined _SELF_TEST_
+    Ncv32u widthExt = grid.x * TRANSPOSE_TILE_DIM;
+    Ncv32u heightExt = grid.y * TRANSPOSE_TILE_DIM;
+    T *h_src;
+    T *h_dst;
+    ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStride * heightExt * sizeof(T)), NPPST_MEM_ALLOC_ERR);
+    ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * widthExt * sizeof(T)), NPPST_MEM_ALLOC_ERR);
+    memset(h_src, 0, srcStride * heightExt * sizeof(T));
+    memset(h_dst, 0, dstStride * widthExt * sizeof(T));
+    ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStride * heightExt * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
+    ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * widthExt * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
+    NcvBool bPass = true;
+    for (Ncv32u i=0; i<srcRoi.height && bPass; i++)
+    {
+        for (Ncv32u j=0; j<srcRoi.width && bPass; j++)
+        {
+            if (h_src[i * srcStride + j] != h_dst[j * dstStride + i])
+            {
+                printf("CIntegralImage::transposeWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, h_src[j * srcStride + i], h_dst[i * dstStride + j]);
+                bPass = false;
+            }
+        }
+    }
+    ncvAssertCUDAReturn(cudaFreeHost(h_src), NPPST_MEMFREE_ERR);
+    ncvAssertCUDAReturn(cudaFreeHost(h_dst), NPPST_MEMFREE_ERR);
+    printf("CIntegralImage::transposeWrapperDevice %s\n", bPass?"PASSED":"FAILED");
+#endif
+
+    return NPPST_SUCCESS;
+}
+
+
+template <class T>
+static NCVStatus transposeWrapperHost(T *h_src, Ncv32u srcStride,
+                                        T *h_dst, Ncv32u dstStride, NcvSize32u srcRoi)
+{
+    ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0, NPPST_INVALID_ROI);
+    ncvAssertReturn(srcStride >= srcRoi.width * sizeof(T) &&
+                      dstStride >= srcRoi.height * sizeof(T) &&
+                      srcStride % sizeof(T) == 0 && dstStride % sizeof(T) == 0, NPPST_INVALID_STEP);
+    srcStride /= sizeof(T);
+    dstStride /= sizeof(T);
+
+    for (Ncv32u i=0; i<srcRoi.height; i++)
+    {
+        for (Ncv32u j=0; j<srcRoi.width; j++)
+        {
+            h_dst[j*dstStride+i] = h_src[i*srcStride + j];
+        }
+    }
+
+    return NPPST_SUCCESS;
+}
+
+
+#define implementNppTranspose(bit, typ) \
+    NCVStatus nppiStTranspose_##bit##typ##_C1R(Ncv##bit##typ *d_src, Ncv32u srcStep, \
+                                             Ncv##bit##typ *d_dst, Ncv32u dstStep, NcvSize32u srcRoi) \
+    { \
+        return transposeWrapperDevice<Ncv##bit##u>((Ncv##bit##u *)d_src, srcStep, \
+                                                   (Ncv##bit##u *)d_dst, dstStep, srcRoi); \
+    }
+
+
+#define implementNppTransposeHost(bit, typ) \
+    NCVStatus nppiStTranspose_##bit##typ##_C1R_host(Ncv##bit##typ *h_src, Ncv32u srcStep, \
+                                                  Ncv##bit##typ *h_dst, Ncv32u dstStep, \
+                                                  NcvSize32u srcRoi) \
+    { \
+        return transposeWrapperHost<Ncv##bit##u>((Ncv##bit##u *)h_src, srcStep, \
+                                                 (Ncv##bit##u *)h_dst, dstStep, srcRoi); \
+    }
+
+
+implementNppTranspose(32,u)
+implementNppTranspose(32,s)
+implementNppTranspose(32,f)
+implementNppTranspose(64,u)
+implementNppTranspose(64,s)
+implementNppTranspose(64,f)
+
+implementNppTransposeHost(32,u)
+implementNppTransposeHost(32,s)
+implementNppTransposeHost(32,f)
+implementNppTransposeHost(64,u)
+implementNppTransposeHost(64,s)
+implementNppTransposeHost(64,f)
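+
+
+// A minimal host-side usage sketch for the wrappers above (illustration only,
+// not part of the original source; h_src/h_dst are caller-provided buffers and
+// both strides are given in bytes):
+//
+//     Ncv32u h_src[3 * 4];                        // 4x3 input, tightly packed
+//     Ncv32u h_dst[4 * 3];                        // 3x4 transposed output
+//     NcvSize32u roi(4, 3);                       // width=4, height=3
+//     nppiStTranspose_32u_C1R_host(h_src, 4 * sizeof(Ncv32u),
+//                                  h_dst, 3 * sizeof(Ncv32u), roi);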
+
+
+//==============================================================================
+//
+// Compact.cu
+//
+//==============================================================================
+
+
+const Ncv32u NUM_REMOVE_THREADS = 256;
+
+
+template <bool bRemove, bool bWritePartial>
+__global__ void removePass1Scan(Ncv32u *d_src, Ncv32u srcLen,
+                                Ncv32u *d_offsets, Ncv32u *d_blockSums,
+                                Ncv32u elemRemove)
+{
+    Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
+    Ncv32u elemAddrIn = blockId * NUM_REMOVE_THREADS + threadIdx.x;
+
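+    // Threads past the end of the data may not simply exit here: the block
+    // that straddles srcLen must reach the block-wide scan below in full, so
+    // the guard only retires threads at least one block past the tail.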
+    if (elemAddrIn > srcLen + blockDim.x)
+    {
+        return;
+    }
+
+    __shared__ Ncv32u shmem[NUM_REMOVE_THREADS * 2];
+
+    Ncv32u scanElem = 0;
+    if (elemAddrIn < srcLen)
+    {
+        if (bRemove)
+        {
+            scanElem = (d_src[elemAddrIn] != elemRemove) ? 1 : 0;
+        }
+        else
+        {
+            scanElem = d_src[elemAddrIn];
+        }
+    }
+
+    Ncv32u localScanInc = blockScanInclusive<Ncv32u, NUM_REMOVE_THREADS>(scanElem, shmem);
+    __syncthreads();
+
+    if (elemAddrIn < srcLen)
+    {
+        if (threadIdx.x == NUM_REMOVE_THREADS-1 && bWritePartial)
+        {
+            d_blockSums[blockId] = localScanInc;
+        }
+
+        if (bRemove)
+        {
+            d_offsets[elemAddrIn] = localScanInc - scanElem;
+        }
+        else
+        {
+            d_src[elemAddrIn] = localScanInc - scanElem;
+        }
+    }
+}
+
+
+__global__ void removePass2Adjust(Ncv32u *d_offsets, Ncv32u srcLen, Ncv32u *d_blockSums)
+{
+    Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
+    Ncv32u elemAddrIn = blockId * NUM_REMOVE_THREADS + threadIdx.x;
+    if (elemAddrIn >= srcLen)
+    {
+        return;
+    }
+
+    __shared__ Ncv32u valOffs;
+    valOffs = d_blockSums[blockId];
+    __syncthreads();
+
+    d_offsets[elemAddrIn] += valOffs;
+}
+
+
+__global__ void removePass3Compact(Ncv32u *d_src, Ncv32u srcLen,
+                                   Ncv32u *d_offsets, Ncv32u *d_dst,
+                                   Ncv32u elemRemove, Ncv32u *dstLenValue)
+{
+    Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
+    Ncv32u elemAddrIn = blockId * NUM_REMOVE_THREADS + threadIdx.x;
+    if (elemAddrIn >= srcLen)
+    {
+        return;
+    }
+
+    Ncv32u elem = d_src[elemAddrIn];
+    Ncv32u elemAddrOut = d_offsets[elemAddrIn];
+    if (elem != elemRemove)
+    {
+        d_dst[elemAddrOut] = elem;
+    }
+
+    if (elemAddrIn == srcLen-1)
+    {
+        if (elem != elemRemove)
+        {
+            *dstLenValue = elemAddrOut + 1;
+        }
+        else
+        {
+            *dstLenValue = elemAddrOut;
+        }
+    }
+}
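+
+
+// Taken together, the three kernels above implement a classic scan-based
+// stream compaction (descriptive note, not part of the original source):
+// removePass1Scan computes per-block inclusive scans of the keep/remove flags
+// (and, one level up, of the per-block totals themselves); removePass2Adjust
+// walks the hierarchy back down, adding each block's global base offset to
+// its local offsets; removePass3Compact finally scatters every surviving
+// element to d_dst[offset] and records the output length from the offset of
+// the last input element.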
+
+
+NCVStatus compactVector_32u_device(Ncv32u *d_src, Ncv32u srcLen,
+                                   Ncv32u *d_dst, Ncv32u *dstLenPinned,
+                                   Ncv32u elemRemove,
+                                   INCVMemAllocator &gpuAllocator)
+{
+    ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
+    ncvAssertReturn((d_src != NULL && d_dst != NULL) || gpuAllocator.isCounting(), NPPST_NULL_POINTER_ERROR);
+
+    if (srcLen == 0)
+    {
+        if (dstLenPinned != NULL)
+        {
+            *dstLenPinned = 0;
+        }
+        return NPPST_SUCCESS;
+    }
+
+    std::vector<Ncv32u> partSumNums;
+    std::vector<Ncv32u> partSumOffsets;
+    Ncv32u partSumLastNum = srcLen;
+    Ncv32u partSumLastOffs = 0;
+    do
+    {
+        partSumNums.push_back(partSumLastNum);
+        partSumOffsets.push_back(partSumLastOffs);
+
+        Ncv32u curPartSumAlignedLength = alignUp(partSumLastNum * sizeof(Ncv32u),
+                                                 gpuAllocator.alignment()) / sizeof(Ncv32u);
+        partSumLastOffs += curPartSumAlignedLength;
+
+        partSumLastNum = (partSumLastNum + NUM_REMOVE_THREADS - 1) / NUM_REMOVE_THREADS;
+    }
+    while (partSumLastNum>1);
+    partSumNums.push_back(partSumLastNum);
+    partSumOffsets.push_back(partSumLastOffs);
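+
+    // For illustration (not in the original source): with srcLen = 100000 and
+    // NUM_REMOVE_THREADS = 256 the level sizes come out as
+    // partSumNums = {100000, 391, 2, 1} - one partial sum per block of the
+    // level below, until a single block covers the whole level.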
+
+    NCVVectorAlloc<Ncv32u> d_hierSums(gpuAllocator, partSumLastOffs+1);
+    ncvAssertReturn(gpuAllocator.isCounting() || d_hierSums.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
+    NCVVectorAlloc<Ncv32u> d_numDstElements(gpuAllocator, 1);
+    ncvAssertReturn(gpuAllocator.isCounting() || d_numDstElements.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCV_SET_SKIP_COND(gpuAllocator.isCounting());
+    NCV_SKIP_COND_BEGIN
+
+    dim3 block(NUM_REMOVE_THREADS);
+
+    //calculate zero-level partial sums for indices calculation
+    if (partSumNums.size() > 2)
+    {
+        dim3 grid(partSumNums[1]);
+
+        if (grid.x > 65535)
+        {
+            grid.y = (grid.x + 65534) / 65535;
+            grid.x = 65535;
+        }
+        removePass1Scan
+            <true, true>
+            <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
+            (d_src, srcLen,
+             d_hierSums.ptr(),
+             d_hierSums.ptr() + partSumOffsets[1],
+             elemRemove);
+        ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
+
+        //calculate hierarchical partial sums
+        for (Ncv32u i=1; i<partSumNums.size()-1; i++)
+        {
+            dim3 grid(partSumNums[i+1]);
+            if (grid.x > 65535)
+            {
+                grid.y = (grid.x + 65534) / 65535;
+                grid.x = 65535;
+            }
+            if (grid.x != 1)
+            {
+                removePass1Scan
+                    <false, true>
+                    <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
+                    (d_hierSums.ptr() + partSumOffsets[i],
+                     partSumNums[i], NULL,
+                     d_hierSums.ptr() + partSumOffsets[i+1],
+                     NULL);
+            }
+            else
+            {
+                removePass1Scan
+                    <false, false>
+                    <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
+                    (d_hierSums.ptr() + partSumOffsets[i],
+                     partSumNums[i], NULL,
+                     NULL,
+                     NULL);
+            }
+            ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
+        }
+
+        //adjust hierarchical partial sums
+        for (Ncv32s i=(Ncv32s)partSumNums.size()-3; i>=0; i--)
+        {
+            dim3 grid(partSumNums[i+1]);
+            if (grid.x > 65535)
+            {
+                grid.y = (grid.x + 65534) / 65535;
+                grid.x = 65535;
+            }
+            removePass2Adjust
+                <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
+                (d_hierSums.ptr() + partSumOffsets[i], partSumNums[i],
+                 d_hierSums.ptr() + partSumOffsets[i+1]);
+            ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
+        }
+    }
+    else
+    {
+        dim3 grid(partSumNums[1]);
+        removePass1Scan
+            <true, false>
+            <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
+            (d_src, srcLen,
+             d_hierSums.ptr(),
+             NULL, elemRemove);
+        ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
+    }
+
+    //compact source vector using indices
+    dim3 grid(partSumNums[1]);
+    if (grid.x > 65535)
+    {
+        grid.y = (grid.x + 65534) / 65535;
+        grid.x = 65535;
+    }
+    removePass3Compact
+        <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
+        (d_src, srcLen, d_hierSums.ptr(), d_dst,
+         elemRemove, d_numDstElements.ptr());
+    ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
+
+    //get number of dst elements
+    if (dstLenPinned != NULL)
+    {
+        ncvAssertCUDAReturn(cudaMemcpyAsync(dstLenPinned, d_numDstElements.ptr(), sizeof(Ncv32u),
+                                              cudaMemcpyDeviceToHost, nppStGetActiveCUDAstream()), NPPST_MEM_RESIDENCE_ERROR);
+        ncvAssertCUDAReturn(cudaStreamSynchronize(nppStGetActiveCUDAstream()), NPPST_MEM_RESIDENCE_ERROR);
+    }
+
+    NCV_SKIP_COND_END
+
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppsStCompactGetSize_32u(Ncv32u srcLen, Ncv32u *pBufsize, cudaDeviceProp &devProp)
+{
+    ncvAssertReturn(pBufsize != NULL, NPPST_NULL_POINTER_ERROR);
+
+    if (srcLen == 0)
+    {
+        *pBufsize = 0;
+        return NPPST_SUCCESS;
+    }
+
+    NCVMemStackAllocator gpuCounter(devProp.textureAlignment);
+    ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVStatus ncvStat = compactVector_32u_device(NULL, srcLen, NULL, NULL, 0xC001C0DE,
+                                                 gpuCounter);
+    ncvAssertReturnNcvStat(ncvStat);
+
+    *pBufsize = (Ncv32u)gpuCounter.maxSize();
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppsStCompactGetSize_32s(Ncv32u srcLen, Ncv32u *pBufsize, cudaDeviceProp &devProp)
+{
+    return nppsStCompactGetSize_32u(srcLen, pBufsize, devProp);
+}
+
+
+NCVStatus nppsStCompactGetSize_32f(Ncv32u srcLen, Ncv32u *pBufsize, cudaDeviceProp &devProp)
+{
+    return nppsStCompactGetSize_32u(srcLen, pBufsize, devProp);
+}
+
+
+NCVStatus nppsStCompact_32u(Ncv32u *d_src, Ncv32u srcLen,
+                            Ncv32u *d_dst, Ncv32u *p_dstLen,
+                            Ncv32u elemRemove, Ncv8u *pBuffer,
+                            Ncv32u bufSize, cudaDeviceProp &devProp)
+{
+    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, devProp.textureAlignment, pBuffer);
+    ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVStatus ncvStat = compactVector_32u_device(d_src, srcLen, d_dst, p_dstLen, elemRemove,
+                                                 gpuAllocator);
+    ncvAssertReturnNcvStat(ncvStat);
+
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppsStCompact_32s(Ncv32s *d_src, Ncv32u srcLen,
+                            Ncv32s *d_dst, Ncv32u *p_dstLen,
+                            Ncv32s elemRemove, Ncv8u *pBuffer,
+                            Ncv32u bufSize, cudaDeviceProp &devProp)
+{
+    return nppsStCompact_32u((Ncv32u *)d_src, srcLen, (Ncv32u *)d_dst, p_dstLen,
+                             *(Ncv32u *)&elemRemove, pBuffer, bufSize, devProp);
+}
+
+
+NCVStatus nppsStCompact_32f(Ncv32f *d_src, Ncv32u srcLen,
+                            Ncv32f *d_dst, Ncv32u *p_dstLen,
+                            Ncv32f elemRemove, Ncv8u *pBuffer,
+                            Ncv32u bufSize, cudaDeviceProp &devProp)
+{
+    return nppsStCompact_32u((Ncv32u *)d_src, srcLen, (Ncv32u *)d_dst, p_dstLen,
+                             *(Ncv32u *)&elemRemove, pBuffer, bufSize, devProp);
+}
+
+
+NCVStatus nppsStCompact_32u_host(Ncv32u *h_src, Ncv32u srcLen,
+                                 Ncv32u *h_dst, Ncv32u *dstLen, Ncv32u elemRemove)
+{
+    ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
+
+    if (srcLen == 0)
+    {
+        if (dstLen != NULL)
+        {
+            *dstLen = 0;
+        }
+        return NPPST_SUCCESS;
+    }
+
+    Ncv32u dstIndex = 0;
+    for (Ncv32u srcIndex=0; srcIndex<srcLen; srcIndex++)
+    {
+        if (h_src[srcIndex] != elemRemove)
+        {
+            h_dst[dstIndex++] = h_src[srcIndex];
+        }
+    }
+
+    if (dstLen != NULL)
+    {
+        *dstLen = dstIndex;
+    }
+
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppsStCompact_32s_host(Ncv32s *h_src, Ncv32u srcLen,
+                                 Ncv32s *h_dst, Ncv32u *dstLen, Ncv32s elemRemove)
+{
+    return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u *)&elemRemove);
+}
+
+
+NCVStatus nppsStCompact_32f_host(Ncv32f *h_src, Ncv32u srcLen,
+                                 Ncv32f *h_dst, Ncv32u *dstLen, Ncv32f elemRemove)
+{
+    return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u *)&elemRemove);
+}
diff --git a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.hpp b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..16695e3f780189384f770d33622889530ccf84cb
--- /dev/null
+++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.hpp
@@ -0,0 +1,637 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// 
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef _npp_staging_hpp_
+#define _npp_staging_hpp_
+
+#include "NCV.hpp"
+
+
+/**
+* \file NPP_staging.hpp
+* NPP Staging Library
+*/
+
+
+/** \defgroup core_npp NPPST Core
+ * Basic functions for CUDA stream management.
+ * @{
+ */
+
+
+/**
+ * Gets the active CUDA stream used by NPPST
+ * NOT THREAD SAFE
+ * \return Current CUDA stream
+ */
+cudaStream_t nppStGetActiveCUDAstream();
+
+
+/**
+ * Sets the active CUDA stream used by NPPST
+ * NOT THREAD SAFE
+ * \param cudaStream        [IN] CUDA stream to become current
+ * \return CUDA stream used before
+ */
+cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream);
+
+
+/*@}*/
+
+
+/** \defgroup nppi NPPST Image Processing
+* @{
+*/
+
+
+/**
+ * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit unsigned pixels, single channel.
+ *
+ * \param d_src             [IN] Source image pointer (CUDA device memory)
+ * \param srcStep           [IN] Source image line step
+ * \param d_dst             [OUT] Destination image pointer (CUDA device memory)
+ * \param dstStep           [IN] Destination image line step
+ * \param srcRoi            [IN] Region of interest in the source image
+ * \param scale             [IN] Downsampling scale factor (positive integer)
+ * \param readThruTexture   [IN] Performance hint to cache source in texture (true) or read directly (false)
+ *
+ * \return NCV status code
+ */
+NCV_EXPORTS
+NCVStatus nppiStDownsampleNearest_32u_C1R(Ncv32u *d_src, Ncv32u srcStep,
+                                          Ncv32u *d_dst, Ncv32u dstStep,
+                                          NcvSize32u srcRoi, Ncv32u scale,
+                                          NcvBool readThruTexture);
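+
+
+/*
+ * A minimal usage sketch (illustration, not part of the original header):
+ * downsampling a 640x480 32u image by scale 2 into a 320x240 destination,
+ * assuming d_src and d_dst already reside in CUDA device memory and both
+ * steps are given in bytes:
+ *
+ *     NcvSize32u srcRoi(640, 480);
+ *     NCVStatus stat = nppiStDownsampleNearest_32u_C1R(
+ *         d_src, 640 * sizeof(Ncv32u),
+ *         d_dst, 320 * sizeof(Ncv32u),
+ *         srcRoi, 2, false);               // scale = 2, no texture path
+ */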
+
+
+/**
+ * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel.
+ * \see nppiStDownsampleNearest_32u_C1R
+ */
+NCV_EXPORTS
+NCVStatus nppiStDownsampleNearest_32s_C1R(Ncv32s *d_src, Ncv32u srcStep,
+                                          Ncv32s *d_dst, Ncv32u dstStep,
+                                          NcvSize32u srcRoi, Ncv32u scale,
+                                          NcvBool readThruTexture);
+
+
+/**
+ * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel.
+ * \see nppiStDownsampleNearest_32u_C1R
+ */
+NCV_EXPORTS
+NCVStatus nppiStDownsampleNearest_32f_C1R(Ncv32f *d_src, Ncv32u srcStep,
+                                          Ncv32f *d_dst, Ncv32u dstStep,
+                                          NcvSize32u srcRoi, Ncv32u scale,
+                                          NcvBool readThruTexture);
+
+
+/**
+* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel.
+* \see nppiStDownsampleNearest_32u_C1R
+*/
+NCV_EXPORTS
+NCVStatus nppiStDownsampleNearest_64u_C1R(Ncv64u *d_src, Ncv32u srcStep,
+                                          Ncv64u *d_dst, Ncv32u dstStep,
+                                          NcvSize32u srcRoi, Ncv32u scale,
+                                          NcvBool readThruTexture);
+
+
+/**
+ * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel.
+ * \see nppiStDownsampleNearest_32u_C1R
+ */
+NCV_EXPORTS
+NCVStatus nppiStDownsampleNearest_64s_C1R(Ncv64s *d_src, Ncv32u srcStep,
+                                          Ncv64s *d_dst, Ncv32u dstStep,
+                                          NcvSize32u srcRoi, Ncv32u scale,
+                                          NcvBool readThruTexture);
+
+
+/**
+ * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel.
+ * \see nppiStDownsampleNearest_32u_C1R
+ */
+NCV_EXPORTS
+NCVStatus nppiStDownsampleNearest_64f_C1R(Ncv64f *d_src, Ncv32u srcStep,
+                                          Ncv64f *d_dst, Ncv32u dstStep,
+                                          NcvSize32u srcRoi, Ncv32u scale,
+                                          NcvBool readThruTexture);
+
+
+/**
+ * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit unsigned pixels, single channel. Host implementation.
+ *
+ * \param h_src             [IN] Source image pointer (Host or pinned memory)
+ * \param srcStep           [IN] Source image line step
+ * \param h_dst             [OUT] Destination image pointer (Host or pinned memory)
+ * \param dstStep           [IN] Destination image line step
+ * \param srcRoi            [IN] Region of interest in the source image
+ * \param scale             [IN] Downsampling scale factor (positive integer)
+ *
+ * \return NCV status code
+ */
+NCV_EXPORTS
+NCVStatus nppiStDownsampleNearest_32u_C1R_host(Ncv32u *h_src, Ncv32u srcStep,
+                                               Ncv32u *h_dst, Ncv32u dstStep,
+                                               NcvSize32u srcRoi, Ncv32u scale);
+
+
+/**
+ * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel. Host implementation.
+ * \see nppiStDownsampleNearest_32u_C1R_host
+ */
+NCV_EXPORTS
+NCVStatus nppiStDownsampleNearest_32s_C1R_host(Ncv32s *h_src, Ncv32u srcStep,
+                                               Ncv32s *h_dst, Ncv32u dstStep,
+                                               NcvSize32u srcRoi, Ncv32u scale);
+
+
+/**
+ * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel. Host implementation.
+ * \see nppiStDownsampleNearest_32u_C1R_host
+ */
+NCV_EXPORTS
+NCVStatus nppiStDownsampleNearest_32f_C1R_host(Ncv32f *h_src, Ncv32u srcStep,
+                                               Ncv32f *h_dst, Ncv32u dstStep,
+                                               NcvSize32u srcRoi, Ncv32u scale);
+
+
+/**
+ * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel. Host implementation.
+ * \see nppiStDownsampleNearest_32u_C1R_host
+ */
+NCV_EXPORTS
+NCVStatus nppiStDownsampleNearest_64u_C1R_host(Ncv64u *h_src, Ncv32u srcStep,
+                                               Ncv64u *h_dst, Ncv32u dstStep,
+                                               NcvSize32u srcRoi, Ncv32u scale);
+
+
+/**
+ * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel. Host implementation.
+ * \see nppiStDownsampleNearest_32u_C1R_host
+ */
+NCV_EXPORTS
+NCVStatus nppiStDownsampleNearest_64s_C1R_host(Ncv64s *h_src, Ncv32u srcStep,
+                                               Ncv64s *h_dst, Ncv32u dstStep,
+                                               NcvSize32u srcRoi, Ncv32u scale);
+
+
+/**
+ * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel. Host implementation.
+ * \see nppiStDownsampleNearest_32u_C1R_host
+ */
+NCV_EXPORTS
+NCVStatus nppiStDownsampleNearest_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStep,
+                                               Ncv64f *h_dst, Ncv32u dstStep,
+                                               NcvSize32u srcRoi, Ncv32u scale);
+
+
+/**
+ * Computes standard deviation for each rectangular region of the input image using integral images.
+ *
+ * \param d_sum             [IN] Integral image pointer (CUDA device memory)
+ * \param sumStep           [IN] Integral image line step
+ * \param d_sqsum           [IN] Squared integral image pointer (CUDA device memory)
+ * \param sqsumStep         [IN] Squared integral image line step
+ * \param d_norm            [OUT] Stddev image pointer (CUDA device memory). Each pixel contains stddev of a rect with top-left corner at the original location in the image
+ * \param normStep          [IN] Stddev image line step
+ * \param roi               [IN] Region of interest in the source image
+ * \param rect              [IN] Rectangular region to calculate stddev over
+ * \param scaleArea         [IN] Multiplication factor to account for the decimated scale
+ * \param readThruTexture   [IN] Performance hint to cache source in texture (true) or read directly (false)
+ *
+ * \return NCV status code
+ */
+NCV_EXPORTS
+NCVStatus nppiStRectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
+                                   Ncv64u *d_sqsum, Ncv32u sqsumStep,
+                                   Ncv32f *d_norm, Ncv32u normStep,
+                                   NcvSize32u roi, NcvRect32u rect,
+                                   Ncv32f scaleArea, NcvBool readThruTexture);
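+
+
+/*
+ * A hedged usage sketch (not part of the original header): d_sum and d_sqsum
+ * are assumed to be the (W+1) x (H+1) outputs of nppiStIntegral_8u32u_C1R and
+ * nppiStSqrIntegral_8u64u_C1R for a W x H source; scaleArea stays 1.0f when
+ * the source was not decimated:
+ *
+ *     NcvSize32u roi(W, H);
+ *     NcvRect32u rect(0, 0, 8, 8);         // 8x8 window per output pixel
+ *     NCVStatus stat = nppiStRectStdDev_32f_C1R(
+ *         d_sum, (W + 1) * sizeof(Ncv32u),
+ *         d_sqsum, (W + 1) * sizeof(Ncv64u),
+ *         d_norm, W * sizeof(Ncv32f),
+ *         roi, rect, 1.0f, false);
+ */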
+
+
+/**
+ * Computes standard deviation for each rectangular region of the input image using integral images. Host implementation
+ *
+ * \param h_sum             [IN] Integral image pointer (Host or pinned memory)
+ * \param sumStep           [IN] Integral image line step
+ * \param h_sqsum           [IN] Squared integral image pointer (Host or pinned memory)
+ * \param sqsumStep         [IN] Squared integral image line step
+ * \param h_norm            [OUT] Stddev image pointer (Host or pinned memory). Each pixel contains stddev of a rect with top-left corner at the original location in the image
+ * \param normStep          [IN] Stddev image line step
+ * \param roi               [IN] Region of interest in the source image
+ * \param rect              [IN] Rectangular region to calculate stddev over
+ * \param scaleArea         [IN] Multiplication factor to account for the decimated scale
+ *
+ * \return NCV status code
+ */
+NCV_EXPORTS
+NCVStatus nppiStRectStdDev_32f_C1R_host(Ncv32u *h_sum, Ncv32u sumStep,
+                                        Ncv64u *h_sqsum, Ncv32u sqsumStep,
+                                        Ncv32f *h_norm, Ncv32u normStep,
+                                        NcvSize32u roi, NcvRect32u rect,
+                                        Ncv32f scaleArea);
+
+
+/**
+ * Transposes an image. 32-bit unsigned pixels, single channel
+ *
+ * \param d_src             [IN] Source image pointer (CUDA device memory)
+ * \param srcStride         [IN] Source image line step
+ * \param d_dst             [OUT] Destination image pointer (CUDA device memory)
+ * \param dstStride         [IN] Destination image line step
+ * \param srcRoi            [IN] Region of interest of the source image
+ *
+ * \return NCV status code
+ */
+NCV_EXPORTS
+NCVStatus nppiStTranspose_32u_C1R(Ncv32u *d_src, Ncv32u srcStride,
+                                  Ncv32u *d_dst, Ncv32u dstStride, NcvSize32u srcRoi);
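+
+
+/*
+ * A minimal usage sketch (illustration, not part of the original header):
+ * transposing a 1024x768 32u image; strides are in bytes, and the destination
+ * must be at least srcRoi.height elements wide:
+ *
+ *     NcvSize32u srcRoi(1024, 768);
+ *     NCVStatus stat = nppiStTranspose_32u_C1R(
+ *         d_src, 1024 * sizeof(Ncv32u),
+ *         d_dst,  768 * sizeof(Ncv32u), srcRoi);
+ */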
+
+
+/**
+ * Transposes an image. 32-bit signed pixels, single channel
+ * \see nppiStTranspose_32u_C1R
+ */
+NCV_EXPORTS
+NCVStatus nppiStTranspose_32s_C1R(Ncv32s *d_src, Ncv32u srcStride,
+                                  Ncv32s *d_dst, Ncv32u dstStride, NcvSize32u srcRoi);
+
+
+/**
+ * Transposes an image. 32-bit float pixels, single channel
+ * \see nppiStTranspose_32u_C1R
+ */
+NCV_EXPORTS
+NCVStatus nppiStTranspose_32f_C1R(Ncv32f *d_src, Ncv32u srcStride,
+                                  Ncv32f *d_dst, Ncv32u dstStride, NcvSize32u srcRoi);
+
+
+/**
+ * Transposes an image. 64-bit unsigned pixels, single channel
+ * \see nppiStTranspose_32u_C1R
+ */
+NCV_EXPORTS
+NCVStatus nppiStTranspose_64u_C1R(Ncv64u *d_src, Ncv32u srcStride,
+                                  Ncv64u *d_dst, Ncv32u dstStride, NcvSize32u srcRoi);
+
+
+/**
+ * Transposes an image. 64-bit signed pixels, single channel
+ * \see nppiStTranspose_32u_C1R
+ */
+NCV_EXPORTS
+NCVStatus nppiStTranspose_64s_C1R(Ncv64s *d_src, Ncv32u srcStride,
+                                  Ncv64s *d_dst, Ncv32u dstStride, NcvSize32u srcRoi);
+
+
+/**
+ * Transposes an image. 64-bit float pixels, single channel
+ * \see nppiStTranspose_32u_C1R
+ */
+NCV_EXPORTS
+NCVStatus nppiStTranspose_64f_C1R(Ncv64f *d_src, Ncv32u srcStride,
+                                  Ncv64f *d_dst, Ncv32u dstStride, NcvSize32u srcRoi);
+
+
+/**
+ * Transposes an image. 32-bit unsigned pixels, single channel. Host implementation
+ *
+ * \param h_src             [IN] Source image pointer (Host or pinned memory)
+ * \param srcStride         [IN] Source image line step
+ * \param h_dst             [OUT] Destination image pointer (Host or pinned memory)
+ * \param dstStride         [IN] Destination image line step
+ * \param srcRoi            [IN] Region of interest of the source image
+ *
+ * \return NCV status code
+ */
+NCV_EXPORTS
+NCVStatus nppiStTranspose_32u_C1R_host(Ncv32u *h_src, Ncv32u srcStride,
+                                       Ncv32u *h_dst, Ncv32u dstStride, NcvSize32u srcRoi);
+
+
+/**
+ * Transposes an image. 32-bit signed pixels, single channel. Host implementation
+ * \see nppiStTranspose_32u_C1R_host
+ */
+NCV_EXPORTS
+NCVStatus nppiStTranspose_32s_C1R_host(Ncv32s *h_src, Ncv32u srcStride,
+                                       Ncv32s *h_dst, Ncv32u dstStride, NcvSize32u srcRoi);
+
+
+/**
+ * Transposes an image. 32-bit float pixels, single channel. Host implementation
+ * \see nppiStTranspose_32u_C1R_host
+ */
+NCV_EXPORTS
+NCVStatus nppiStTranspose_32f_C1R_host(Ncv32f *h_src, Ncv32u srcStride,
+                                       Ncv32f *h_dst, Ncv32u dstStride, NcvSize32u srcRoi);
+
+
+/**
+ * Transposes an image. 64-bit unsigned pixels, single channel. Host implementation
+ * \see nppiStTranspose_32u_C1R_host
+ */
+NCV_EXPORTS
+NCVStatus nppiStTranspose_64u_C1R_host(Ncv64u *h_src, Ncv32u srcStride,
+                                       Ncv64u *h_dst, Ncv32u dstStride, NcvSize32u srcRoi);
+
+
+/**
+ * Transposes an image. 64-bit signed pixels, single channel. Host implementation
+ * \see nppiStTranspose_32u_C1R_host
+ */
+NCV_EXPORTS
+NCVStatus nppiStTranspose_64s_C1R_host(Ncv64s *h_src, Ncv32u srcStride,
+                                       Ncv64s *h_dst, Ncv32u dstStride, NcvSize32u srcRoi);
+
+
+/**
+ * Transposes an image. 64-bit float pixels, single channel. Host implementation
+ * \see nppiStTranspose_32u_C1R_host
+ */
+NCV_EXPORTS
+NCVStatus nppiStTranspose_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStride,
+                                       Ncv64f *h_dst, Ncv32u dstStride, NcvSize32u srcRoi);
+
+
+/**
+ * Calculates the size of the temporary buffer for integral image creation
+ *
+ * \param roiSize           [IN] Size of the input image
+ * \param pBufsize          [OUT] Pointer to host variable that returns the size of the temporary buffer (in bytes)
+ * \param devProp           [IN] CUDA device properties structure, containing texture alignment information
+ *
+ * \return NCV status code
+ */
+NCV_EXPORTS
+NCVStatus nppiStIntegralGetSize_8u32u(NcvSize32u roiSize, Ncv32u *pBufsize, cudaDeviceProp &devProp);
+
+
+/**
+ * Calculates the size of the temporary buffer for integral image creation
+ * \see nppiStIntegralGetSize_8u32u
+ */
+NCV_EXPORTS
+NCVStatus nppiStIntegralGetSize_32f32f(NcvSize32u roiSize, Ncv32u *pBufsize, cudaDeviceProp &devProp);
+
+
+/**
+ * Creates an integral image representation for the input image
+ *
+ * \param d_src             [IN] Source image pointer (CUDA device memory)
+ * \param srcStep           [IN] Source image line step
+ * \param d_dst             [OUT] Destination integral image pointer (CUDA device memory)
+ * \param dstStep           [IN] Destination image line step
+ * \param roiSize           [IN] Region of interest of the source image
+ * \param pBuffer           [IN] Pointer to the pre-allocated temporary buffer (CUDA device memory)
+ * \param bufSize           [IN] Size of the pBuffer in bytes
+ * \param devProp           [IN] CUDA device properties structure, containing texture alignment information
+ *
+ * \return NCV status code
+ */
+NCV_EXPORTS
+NCVStatus nppiStIntegral_8u32u_C1R(Ncv8u *d_src, Ncv32u srcStep,
+                                   Ncv32u *d_dst, Ncv32u dstStep, NcvSize32u roiSize,
+                                   Ncv8u *pBuffer, Ncv32u bufSize, cudaDeviceProp &devProp);
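+
+
+/*
+ * A minimal usage sketch (illustration, not part of the original header):
+ * the GetSize query sizes the scratch buffer, and a (W+1) x (H+1) destination
+ * is assumed to hold the integral image of a W x H 8u source:
+ *
+ *     cudaDeviceProp devProp;
+ *     cudaGetDeviceProperties(&devProp, 0);
+ *     NcvSize32u roi(640, 480);
+ *     Ncv32u bufSize = 0;
+ *     nppiStIntegralGetSize_8u32u(roi, &bufSize, devProp);
+ *     Ncv8u *pBuffer = NULL;
+ *     cudaMalloc((void **)&pBuffer, bufSize);
+ *     NCVStatus stat = nppiStIntegral_8u32u_C1R(
+ *         d_src, 640,                       // 8u source, step in bytes
+ *         d_dst, 641 * sizeof(Ncv32u),      // 641x481 integral image
+ *         roi, pBuffer, bufSize, devProp);
+ *     cudaFree(pBuffer);
+ */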
+
+
+/**
+ * Creates an integral image representation for the input image
+ * \see nppiStIntegral_8u32u_C1R
+ */
+NCV_EXPORTS
+NCVStatus nppiStIntegral_32f32f_C1R(Ncv32f *d_src, Ncv32u srcStep,
+                                    Ncv32f *d_dst, Ncv32u dstStep, NcvSize32u roiSize,
+                                    Ncv8u *pBuffer, Ncv32u bufSize, cudaDeviceProp &devProp);
+
+
+/**
+ * Creates an integral image representation for the input image. Host implementation
+ *
+ * \param h_src             [IN] Source image pointer (Host or pinned memory)
+ * \param srcStep           [IN] Source image line step
+ * \param h_dst             [OUT] Destination integral image pointer (Host or pinned memory)
+ * \param dstStep           [IN] Destination image line step
+ * \param roiSize           [IN] Region of interest of the source image
+ *
+ * \return NCV status code
+ */
+NCV_EXPORTS
+NCVStatus nppiStIntegral_8u32u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
+                                        Ncv32u *h_dst, Ncv32u dstStep, NcvSize32u roiSize);
+
+
+/**
+ * Creates an integral image representation for the input image. Host implementation
+ * \see nppiStIntegral_8u32u_C1R_host
+ */
+NCV_EXPORTS
+NCVStatus nppiStIntegral_32f32f_C1R_host(Ncv32f *h_src, Ncv32u srcStep,
+                                         Ncv32f *h_dst, Ncv32u dstStep, NcvSize32u roiSize);
+
+
+/**
+ * Calculates the size of the temporary buffer for squared integral image creation
+ *
+ * \param roiSize           [IN] Size of the input image
+ * \param pBufsize          [OUT] Pointer to host variable that returns the size of the temporary buffer (in bytes)
+ * \param devProp           [IN] CUDA device properties structure, containing texture alignment information
+ *
+ * \return NCV status code
+ */
+NCV_EXPORTS
+NCVStatus nppiStSqrIntegralGetSize_8u64u(NcvSize32u roiSize, Ncv32u *pBufsize, cudaDeviceProp &devProp);
+
+
+/**
+ * Creates a squared integral image representation for the input image
+ *
+ * \param d_src             [IN] Source image pointer (CUDA device memory)
+ * \param srcStep           [IN] Source image line step
+ * \param d_dst             [OUT] Destination squared integral image pointer (CUDA device memory)
+ * \param dstStep           [IN] Destination image line step
+ * \param roiSize           [IN] Region of interest of the source image
+ * \param pBuffer           [IN] Pointer to the pre-allocated temporary buffer (CUDA device memory)
+ * \param bufSize           [IN] Size of the pBuffer in bytes
+ * \param devProp           [IN] CUDA device properties structure, containing texture alignment information
+ *
+ * \return NCV status code
+ */
+NCV_EXPORTS
+NCVStatus nppiStSqrIntegral_8u64u_C1R(Ncv8u *d_src, Ncv32u srcStep,
+                                      Ncv64u *d_dst, Ncv32u dstStep, NcvSize32u roiSize,
+                                      Ncv8u *pBuffer, Ncv32u bufSize, cudaDeviceProp &devProp);
+
+
+/**
+ * Creates a squared integral image representation for the input image. Host implementation
+ *
+ * \param h_src             [IN] Source image pointer (Host or pinned memory)
+ * \param srcStep           [IN] Source image line step
+ * \param h_dst             [OUT] Destination squared integral image pointer (Host or pinned memory)
+ * \param dstStep           [IN] Destination image line step
+ * \param roiSize           [IN] Region of interest of the source image
+ *
+ * \return NCV status code
+ */
+NCV_EXPORTS
+NCVStatus nppiStSqrIntegral_8u64u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
+                                           Ncv64u *h_dst, Ncv32u dstStep, NcvSize32u roiSize);
+
+
+/*@}*/
+
+
+/** \defgroup npps NPPST Signal Processing
+* @{
+*/
+
+
+/**
+ * Calculates the size of the temporary buffer for vector compaction. 32-bit unsigned values
+ *
+ * \param srcLen            [IN] Length of the input vector in elements
+ * \param pBufsize          [OUT] Pointer to host variable that returns the size of the temporary buffer (in bytes)
+ * \param devProp           [IN] CUDA device properties structure, containing texture alignment information
+ *
+ * \return NCV status code
+ */
+NCV_EXPORTS
+NCVStatus nppsStCompactGetSize_32u(Ncv32u srcLen, Ncv32u *pBufsize, cudaDeviceProp &devProp);
+
+
+/**
+ * Calculates the size of the temporary buffer for vector compaction. 32-bit signed values
+ * \see nppsStCompactGetSize_32u
+ */
+NCV_EXPORTS
+NCVStatus nppsStCompactGetSize_32s(Ncv32u srcLen, Ncv32u *pBufsize, cudaDeviceProp &devProp);
+
+
+/**
+ * Calculates the size of the temporary buffer for vector compaction. 32-bit float values
+ * \see nppsStCompactGetSize_32u
+ */
+NCV_EXPORTS
+NCVStatus nppsStCompactGetSize_32f(Ncv32u srcLen, Ncv32u *pBufsize, cudaDeviceProp &devProp);
+
+
+/**
+ * Compacts the input vector by removing all elements equal to the specified value. 32-bit unsigned values
+ *
+ * \param d_src             [IN] Source vector pointer (CUDA device memory)
+ * \param srcLen            [IN] Source vector length
+ * \param d_dst             [OUT] Destination vector pointer (CUDA device memory)
+ * \param p_dstLen          [OUT] Pointer to the destination vector length (Pinned memory or NULL)
+ * \param elemRemove        [IN] The value to be removed
+ * \param pBuffer           [IN] Pointer to the pre-allocated temporary buffer (CUDA device memory)
+ * \param bufSize           [IN] Size of the pBuffer in bytes
+ * \param devProp           [IN] CUDA device properties structure, containing texture alignment information
+ *
+ * \return NCV status code
+ */
+NCV_EXPORTS
+NCVStatus nppsStCompact_32u(Ncv32u *d_src, Ncv32u srcLen,
+                            Ncv32u *d_dst, Ncv32u *p_dstLen,
+                            Ncv32u elemRemove, Ncv8u *pBuffer,
+                            Ncv32u bufSize, cudaDeviceProp &devProp);
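+
+
+/*
+ * A minimal usage sketch (illustration, not part of the original header):
+ * removing all zeros from a device vector; the output length is written
+ * through a pinned pointer so the internal async copy can complete:
+ *
+ *     cudaDeviceProp devProp;
+ *     cudaGetDeviceProperties(&devProp, 0);
+ *     Ncv32u bufSize = 0;
+ *     nppsStCompactGetSize_32u(srcLen, &bufSize, devProp);
+ *     Ncv8u *pBuffer = NULL;
+ *     cudaMalloc((void **)&pBuffer, bufSize);
+ *     Ncv32u *p_dstLen = NULL;
+ *     cudaMallocHost((void **)&p_dstLen, sizeof(Ncv32u));
+ *     NCVStatus stat = nppsStCompact_32u(d_src, srcLen, d_dst, p_dstLen,
+ *                                        0, pBuffer, bufSize, devProp);
+ *     cudaFreeHost(p_dstLen);
+ *     cudaFree(pBuffer);
+ */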
+
+
+/**
+ * Compacts the input vector by removing all elements equal to the specified value. 32-bit signed values
+ * \see nppsStCompact_32u
+ */
+NCV_EXPORTS
+NCVStatus nppsStCompact_32s(Ncv32s *d_src, Ncv32u srcLen,
+                            Ncv32s *d_dst, Ncv32u *p_dstLen,
+                            Ncv32s elemRemove, Ncv8u *pBuffer,
+                            Ncv32u bufSize, cudaDeviceProp &devProp);
+
+
+/**
+ * Compacts the input vector by removing all elements equal to the specified value. 32-bit float values
+ * \see nppsStCompact_32u
+ */
+NCV_EXPORTS
+NCVStatus nppsStCompact_32f(Ncv32f *d_src, Ncv32u srcLen,
+                            Ncv32f *d_dst, Ncv32u *p_dstLen,
+                            Ncv32f elemRemove, Ncv8u *pBuffer,
+                            Ncv32u bufSize, cudaDeviceProp &devProp);
+
+
+/**
+ * Compacts the input vector by removing all elements equal to the specified value. 32-bit unsigned values. Host implementation
+ *
+ * \param h_src             [IN] Source vector pointer (Host or pinned memory)
+ * \param srcLen            [IN] Source vector length
+ * \param h_dst             [OUT] Destination vector pointer (Host or pinned memory)
+ * \param dstLen            [OUT] Pointer to the destination vector length (can be NULL)
+ * \param elemRemove        [IN] The value to be removed
+ *
+ * \return NCV status code
+ */
+NCV_EXPORTS
+NCVStatus nppsStCompact_32u_host(Ncv32u *h_src, Ncv32u srcLen,
+                                 Ncv32u *h_dst, Ncv32u *dstLen, Ncv32u elemRemove);
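+
+
+/*
+ * A minimal usage sketch (illustration, not part of the original header):
+ *
+ *     Ncv32u h_src[5] = {1, 0, 2, 0, 3};
+ *     Ncv32u h_dst[5];
+ *     Ncv32u dstLen = 0;
+ *     nppsStCompact_32u_host(h_src, 5, h_dst, &dstLen, 0);
+ *     // h_dst now starts with {1, 2, 3} and dstLen == 3
+ */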
+
+
+/**
+ * Compacts the input vector by removing all elements equal to the specified value. 32-bit signed values. Host implementation
+ * \see nppsStCompact_32u_host
+ */
+NCV_EXPORTS
+NCVStatus nppsStCompact_32s_host(Ncv32s *h_src, Ncv32u srcLen,
+                                 Ncv32s *h_dst, Ncv32u *dstLen, Ncv32s elemRemove);
+
+
+/**
+ * Compacts the input vector by removing all elements equal to the specified value. 32-bit float values. Host implementation
+ * \see nppsStCompact_32u_host
+ */
+NCV_EXPORTS
+NCVStatus nppsStCompact_32f_host(Ncv32f *h_src, Ncv32u srcLen,
+                                 Ncv32f *h_dst, Ncv32u *dstLen, Ncv32f elemRemove);
+
+
+/*@}*/
+
+
+#endif // _npp_staging_hpp_
diff --git a/modules/gpu/src/nvidia/NCV.cpp b/modules/gpu/src/nvidia/core/NCV.cpp
similarity index 88%
rename from modules/gpu/src/nvidia/NCV.cpp
rename to modules/gpu/src/nvidia/core/NCV.cpp
index fef9c08f5b7c3f59dfba2bfa768d67314e5dd05b..c276cff61110d47b8cd692147d5e0b3e8692fb46 100644
--- a/modules/gpu/src/nvidia/NCV.cpp
+++ b/modules/gpu/src/nvidia/core/NCV.cpp
@@ -40,15 +40,13 @@
 //M*/
 
 
-#include <precomp.hpp>
-
-
 #if !defined (HAVE_CUDA)
 
 
 #else /* !defined (HAVE_CUDA) */
 
 
+#include <ios>
 #include <stdarg.h>
 #include "NCV.hpp"
 
@@ -94,17 +92,6 @@ void ncvSetDebugOutputHandler(NCVDebugOutputHandler *func)
 //==============================================================================
 
 
-NCVStatus GPUAlignmentValue(Ncv32u &alignment)
-{
-    int curDev;
-    cudaDeviceProp curProp;
-    ncvAssertCUDAReturn(cudaGetDevice(&curDev), NCV_CUDA_ERROR);
-    ncvAssertCUDAReturn(cudaGetDeviceProperties(&curProp, curDev), NCV_CUDA_ERROR);
-    alignment = curProp.textureAlignment; //GPUAlignmentValue(curProp.major);
-    return NCV_SUCCESS;
-}
-
-
 Ncv32u alignUp(Ncv32u what, Ncv32u alignment)
 {
     Ncv32u alignMask = alignment-1;
@@ -216,7 +203,7 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
 }
 
 
-NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment)
+NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment, void *reusePtr)
     :
     currentSize(0),
     _maxSize(0),
@@ -229,17 +216,26 @@ NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity,
 
     allocBegin = NULL;
 
-    switch (memT)
+    if (reusePtr == NULL)
     {
-    case NCVMemoryTypeDevice:
-        ncvAssertCUDAReturn(cudaMalloc(&allocBegin, capacity), );
-        break;
-    case NCVMemoryTypeHostPinned:
-        ncvAssertCUDAReturn(cudaMallocHost(&allocBegin, capacity), );
-        break;
-    case NCVMemoryTypeHostPageable:
-        allocBegin = (Ncv8u *)malloc(capacity);
-        break;
+        bReusesMemory = false;
+        switch (memT)
+        {
+        case NCVMemoryTypeDevice:
+            ncvAssertCUDAReturn(cudaMalloc(&allocBegin, capacity), );
+            break;
+        case NCVMemoryTypeHostPinned:
+            ncvAssertCUDAReturn(cudaMallocHost(&allocBegin, capacity), );
+            break;
+        case NCVMemoryTypeHostPageable:
+            allocBegin = (Ncv8u *)malloc(capacity);
+            break;
+        }
+    }
+    else
+    {
+        bReusesMemory = true;
+        allocBegin = (Ncv8u *)reusePtr;
     }
 
     if (capacity == 0)
@@ -260,18 +256,23 @@ NCVMemStackAllocator::~NCVMemStackAllocator()
     if (allocBegin != NULL)
     {
         ncvAssertPrintCheck(currentSize == 0, "NCVMemStackAllocator dtor:: not all objects were deallocated properly, forcing destruction");
-        switch (_memType)
+
+        if (!bReusesMemory)
         {
-        case NCVMemoryTypeDevice:
-            ncvAssertCUDAReturn(cudaFree(allocBegin), );
-            break;
-        case NCVMemoryTypeHostPinned:
-            ncvAssertCUDAReturn(cudaFreeHost(allocBegin), );
-            break;
-        case NCVMemoryTypeHostPageable:
-            free(allocBegin);
-            break;
+            switch (_memType)
+            {
+            case NCVMemoryTypeDevice:
+                ncvAssertCUDAReturn(cudaFree(allocBegin), );
+                break;
+            case NCVMemoryTypeHostPinned:
+                ncvAssertCUDAReturn(cudaFreeHost(allocBegin), );
+                break;
+            case NCVMemoryTypeHostPageable:
+                free(allocBegin);
+                break;
+            }
         }
+
         allocBegin = NULL;
     }
 }
@@ -356,14 +357,14 @@ size_t NCVMemStackAllocator::maxSize(void) const
 //===================================================================
 
 
-NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT)
+NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment)
     :
     currentSize(0),
     _maxSize(0),
-    _memType(memT)
+    _memType(memT),
+    _alignment(alignment)
 {
     ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", );
-    ncvAssertPrintReturn(NCV_SUCCESS == GPUAlignmentValue(this->_alignment), "NCVMemNativeAllocator ctor:: couldn't get device _alignment", );
 }
 
 
diff --git a/modules/gpu/src/nvidia/NCV.hpp b/modules/gpu/src/nvidia/core/NCV.hpp
similarity index 76%
rename from modules/gpu/src/nvidia/NCV.hpp
rename to modules/gpu/src/nvidia/core/NCV.hpp
index a71f650252f63d548ed2eaae5c9e8a48d9c40288..81eb417fbe12e123c1ceffcebbbfee2a900d73c5 100644
--- a/modules/gpu/src/nvidia/NCV.hpp
+++ b/modules/gpu/src/nvidia/core/NCV.hpp
@@ -42,8 +42,49 @@
 #ifndef _ncv_hpp_
 #define _ncv_hpp_
 
+#if (defined WIN32 || defined _WIN32 || defined WINCE) && defined CVAPI_EXPORTS //&& !defined(__CUDACC__) 
+    #define NCV_EXPORTS __declspec(dllexport)
+#else
+    #define NCV_EXPORTS
+#endif
+
 #include <cuda_runtime.h>
-#include "npp_staging.h"
+
+
+//==============================================================================
+//
+// Compile-time assert functionality
+//
+//==============================================================================
+
+
+/**
+* Compile-time assert namespace
+*/
+namespace NcvCTprep
+{
+    template <bool x>
+    struct CT_ASSERT_FAILURE;
+
+    template <>
+    struct CT_ASSERT_FAILURE<true> {};
+
+    template <int x>
+    struct assertTest{};
+}
+
+
+#define NCV_CT_PREP_PASTE_AUX(a,b)      a##b                           ///< Concatenation indirection macro
+#define NCV_CT_PREP_PASTE(a,b)          NCV_CT_PREP_PASTE_AUX(a, b)  ///< Concatenation macro
+
+
+/**
+* Performs compile-time assertion of a condition on the file scope
+*/
+#define NCV_CT_ASSERT(X) \
+    typedef NcvCTprep::assertTest<sizeof(NcvCTprep::CT_ASSERT_FAILURE< (bool)(X) >)> \
+    NCV_CT_PREP_PASTE(__ct_assert_typedef_, __LINE__)
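+
+// The mechanism (descriptive note): CT_ASSERT_FAILURE<true> is the only
+// specialization with a definition, so sizeof() on the false case names an
+// incomplete type and fails to compile. E.g. NCV_CT_ASSERT(sizeof(Ncv8u) == 1)
+// compiles, whereas NCV_CT_ASSERT(sizeof(Ncv8u) == 2) is rejected.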
+
 
 
 //==============================================================================
@@ -82,62 +123,72 @@ typedef              float Ncv32f;
 typedef             double Ncv64f;
 
 
-typedef struct
+struct NcvRect8u
 {
     Ncv8u x;
     Ncv8u y;
     Ncv8u width;
     Ncv8u height;
-} NcvRect8u;
+    NcvRect8u() : x(0), y(0), width(0), height(0) {}
+    NcvRect8u(Ncv8u x, Ncv8u y, Ncv8u width, Ncv8u height) : x(x), y(y), width(width), height(height) {}
+};
 
 
-typedef struct
+struct NcvRect32s
 {
     Ncv32s x;          ///< x-coordinate of upper left corner.
     Ncv32s y;          ///< y-coordinate of upper left corner.
     Ncv32s width;      ///< Rectangle width.
     Ncv32s height;     ///< Rectangle height.
-} NcvRect32s;
+    NcvRect32s() : x(0), y(0), width(0), height(0) {}
+    NcvRect32s(Ncv32s x, Ncv32s y, Ncv32s width, Ncv32s height) : x(x), y(y), width(width), height(height) {}
+};
 
 
-typedef struct
+struct NcvRect32u
 {
     Ncv32u x;          ///< x-coordinate of upper left corner.
     Ncv32u y;          ///< y-coordinate of upper left corner.
     Ncv32u width;      ///< Rectangle width.
     Ncv32u height;     ///< Rectangle height.
-} NcvRect32u;
+    NcvRect32u() : x(0), y(0), width(0), height(0) {}
+    NcvRect32u(Ncv32u x, Ncv32u y, Ncv32u width, Ncv32u height) : x(x), y(y), width(width), height(height) {}
+};
 
 
-typedef struct 
+struct NcvSize32s
 {
     Ncv32s width;  ///< Rectangle width.
     Ncv32s height; ///< Rectangle height.
-} NcvSize32s;
+    NcvSize32s() : width(0), height(0) {}
+    NcvSize32s(Ncv32s width, Ncv32s height) : width(width), height(height) {}
+};
 
 
-typedef struct 
+struct NcvSize32u
 {
     Ncv32u width;  ///< Rectangle width.
     Ncv32u height; ///< Rectangle height.
-} NcvSize32u;
-
-
-NPPST_CT_ASSERT(sizeof(NcvBool) <= 4);
-NPPST_CT_ASSERT(sizeof(Ncv64s) == 8);
-NPPST_CT_ASSERT(sizeof(Ncv64u) == 8);
-NPPST_CT_ASSERT(sizeof(Ncv32s) == 4);
-NPPST_CT_ASSERT(sizeof(Ncv32u) == 4);
-NPPST_CT_ASSERT(sizeof(Ncv16s) == 2);
-NPPST_CT_ASSERT(sizeof(Ncv16u) == 2);
-NPPST_CT_ASSERT(sizeof(Ncv8s) == 1);
-NPPST_CT_ASSERT(sizeof(Ncv8u) == 1);
-NPPST_CT_ASSERT(sizeof(Ncv32f) == 4);
-NPPST_CT_ASSERT(sizeof(Ncv64f) == 8);
-NPPST_CT_ASSERT(sizeof(NcvRect8u) == sizeof(Ncv32u));
-NPPST_CT_ASSERT(sizeof(NcvRect32s) == 4 * sizeof(Ncv32s));
-NPPST_CT_ASSERT(sizeof(NcvRect32u) == 4 * sizeof(Ncv32u));
-NPPST_CT_ASSERT(sizeof(NcvSize32u) == 2 * sizeof(Ncv32u));
+    NcvSize32u() : width(0), height(0) {}
+    NcvSize32u(Ncv32u width, Ncv32u height) : width(width), height(height) {}
+};
+
+
+NCV_CT_ASSERT(sizeof(NcvBool) <= 4);
+NCV_CT_ASSERT(sizeof(Ncv64s) == 8);
+NCV_CT_ASSERT(sizeof(Ncv64u) == 8);
+NCV_CT_ASSERT(sizeof(Ncv32s) == 4);
+NCV_CT_ASSERT(sizeof(Ncv32u) == 4);
+NCV_CT_ASSERT(sizeof(Ncv16s) == 2);
+NCV_CT_ASSERT(sizeof(Ncv16u) == 2);
+NCV_CT_ASSERT(sizeof(Ncv8s) == 1);
+NCV_CT_ASSERT(sizeof(Ncv8u) == 1);
+NCV_CT_ASSERT(sizeof(Ncv32f) == 4);
+NCV_CT_ASSERT(sizeof(Ncv64f) == 8);
+NCV_CT_ASSERT(sizeof(NcvRect8u) == sizeof(Ncv32u));
+NCV_CT_ASSERT(sizeof(NcvRect32s) == 4 * sizeof(Ncv32s));
+NCV_CT_ASSERT(sizeof(NcvRect32u) == 4 * sizeof(Ncv32u));
+NCV_CT_ASSERT(sizeof(NcvSize32u) == 2 * sizeof(Ncv32u));
 
 
 //==============================================================================
@@ -162,13 +213,13 @@ const Ncv32u K_LOG2_WARP_SIZE = 5;
 #define NCV_CT_PREP_STRINGIZE(x)        NCV_CT_PREP_STRINGIZE_AUX(x)
 
 
-void ncvDebugOutput(const char *msg, ...);
+NCV_EXPORTS void ncvDebugOutput(const char *msg, ...);
 
 
 typedef void NCVDebugOutputHandler(const char* msg);
 
 
-void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);
+NCV_EXPORTS void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);
 
 
 #define ncvAssertPrintCheck(pred, msg) \
@@ -222,6 +273,7 @@ void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);
 */
 enum NCVStatus
 {
+    //NCV statuses
     NCV_SUCCESS,
 
     NCV_CUDA_ERROR,
@@ -257,6 +309,24 @@ enum NCVStatus
     NCV_NOIMPL_HAAR_TILTED_FEATURES,
 
     NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW,
+
+    //NPP statuses
+    NPPST_SUCCESS = NCV_SUCCESS,              ///< Successful operation (same as NPP_NO_ERROR)
+    NPPST_ERROR,                              ///< Unknown error
+    NPPST_CUDA_KERNEL_EXECUTION_ERROR,        ///< CUDA kernel execution error
+    NPPST_NULL_POINTER_ERROR,                 ///< NULL pointer argument error
+    NPPST_TEXTURE_BIND_ERROR,                 ///< CUDA texture binding error or non-zero offset returned
+    NPPST_MEMCPY_ERROR,                       ///< CUDA memory copy error
+    NPPST_MEM_ALLOC_ERR,                      ///< CUDA memory allocation error
+    NPPST_MEMFREE_ERR,                        ///< CUDA memory deallocation error
+
+    //NPPST statuses
+    NPPST_INVALID_ROI,                        ///< Invalid region of interest argument
+    NPPST_INVALID_STEP,                       ///< Invalid image line step argument (check sign, alignment, relation to image width)
+    NPPST_INVALID_SCALE,                      ///< Invalid scale parameter passed
+    NPPST_MEM_INSUFFICIENT_BUFFER,            ///< Insufficient user-allocated buffer
+    NPPST_MEM_RESIDENCE_ERROR,                ///< Memory residence error detected (check if pointers should be device or pinned)
+    NPPST_MEM_INTERNAL_ERROR,                 ///< Internal memory management error
 };
 
 
@@ -285,11 +355,11 @@ enum NCVStatus
 
 typedef struct _NcvTimer *NcvTimer;
 
-NcvTimer ncvStartTimer(void);
+NCV_EXPORTS NcvTimer ncvStartTimer(void);
 
-double ncvEndQueryTimerUs(NcvTimer t);
+NCV_EXPORTS double ncvEndQueryTimerUs(NcvTimer t);
 
-double ncvEndQueryTimerMs(NcvTimer t);
+NCV_EXPORTS double ncvEndQueryTimerMs(NcvTimer t);
 
 
 //==============================================================================
@@ -299,16 +369,10 @@ double ncvEndQueryTimerMs(NcvTimer t);
 //==============================================================================
 
 
-/**
-* Alignment of GPU memory chunks in bytes
-*/
-NCVStatus GPUAlignmentValue(Ncv32u &alignment);
-
-
 /**
 * Calculates the aligned top bound value
 */
-Ncv32u alignUp(Ncv32u what, Ncv32u alignment);
+NCV_EXPORTS Ncv32u alignUp(Ncv32u what, Ncv32u alignment);
 
 
 /**
@@ -326,7 +390,7 @@ enum NCVMemoryType
 /**
 * NCVMemPtr
 */
-struct NCVMemPtr
+struct NCV_EXPORTS NCVMemPtr
 {
     void *ptr;
     NCVMemoryType memtype;
@@ -337,7 +401,7 @@ struct NCVMemPtr
 /**
 * NCVMemSegment
 */
-struct NCVMemSegment
+struct NCV_EXPORTS NCVMemSegment
 {
     NCVMemPtr begin;
     size_t size;
@@ -348,7 +412,7 @@ struct NCVMemSegment
 /**
 * INCVMemAllocator (Interface)
 */
-class INCVMemAllocator
+class NCV_EXPORTS INCVMemAllocator
 {
 public:
     virtual ~INCVMemAllocator() = 0;
@@ -370,7 +434,7 @@ inline INCVMemAllocator::~INCVMemAllocator() {}
 /**
 * NCVMemStackAllocator
 */
-class NCVMemStackAllocator : public INCVMemAllocator
+class NCV_EXPORTS NCVMemStackAllocator : public INCVMemAllocator
 {
     NCVMemStackAllocator();
     NCVMemStackAllocator(const NCVMemStackAllocator &);
@@ -378,7 +442,7 @@ class NCVMemStackAllocator : public INCVMemAllocator
 public:
 
     explicit NCVMemStackAllocator(Ncv32u alignment);
-    NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment);
+    NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment, void *reusePtr=NULL);
     virtual ~NCVMemStackAllocator();
 
     virtual NCVStatus alloc(NCVMemSegment &seg, size_t size);
@@ -400,17 +464,18 @@ private:
     Ncv8u *end;
     size_t currentSize;
     size_t _maxSize;
+    NcvBool bReusesMemory;
 };
 
 
 /**
 * NCVMemNativeAllocator
 */
-class NCVMemNativeAllocator : public INCVMemAllocator
+class NCV_EXPORTS NCVMemNativeAllocator : public INCVMemAllocator
 {
 public:
 
-    NCVMemNativeAllocator(NCVMemoryType memT);
+    NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment);
     virtual ~NCVMemNativeAllocator();
 
     virtual NCVStatus alloc(NCVMemSegment &seg, size_t size);
@@ -438,9 +503,9 @@ private:
 /**
 * Copy dispatcher
 */
-NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType,
-                           const void *src, NCVMemoryType srcType,
-                           size_t sz, cudaStream_t cuStream);
+NCV_EXPORTS NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType,
+                                       const void *src, NCVMemoryType srcType,
+                                       size_t sz, cudaStream_t cuStream);
 
 
 /**
@@ -514,6 +579,7 @@ class NCVVectorAlloc : public NCVVector<T>
 {
     NCVVectorAlloc();
     NCVVectorAlloc(const NCVVectorAlloc &);
+    NCVVectorAlloc& operator=(const NCVVectorAlloc<T>&);
 
 public:
 
@@ -563,8 +629,7 @@ public:
         return allocatedMem;
     }
 
-private:
-
+private:
     INCVMemAllocator &allocator;
     NCVMemSegment allocatedMem;
 };
@@ -707,7 +772,7 @@ class NCVMatrixAlloc : public NCVMatrix<T>
 {
     NCVMatrixAlloc();
     NCVMatrixAlloc(const NCVMatrixAlloc &);
-
+    NCVMatrixAlloc& operator=(const NCVMatrixAlloc &);
 public:
 
     NCVMatrixAlloc(INCVMemAllocator &allocator, Ncv32u width, Ncv32u height, Ncv32u pitch=0)
diff --git a/modules/gpu/src/nvidia/NCVRuntimeTemplates.hpp b/modules/gpu/src/nvidia/core/NCVRuntimeTemplates.hpp
similarity index 73%
rename from modules/gpu/src/nvidia/NCVRuntimeTemplates.hpp
rename to modules/gpu/src/nvidia/core/NCVRuntimeTemplates.hpp
index 14d16bb3b973cb67ede3fc0c3aff355cbf1fe1ce..3d3f94f8b1aa3632401a8f80e7a9585f59607e42 100644
--- a/modules/gpu/src/nvidia/NCVRuntimeTemplates.hpp
+++ b/modules/gpu/src/nvidia/core/NCVRuntimeTemplates.hpp
@@ -1,3 +1,51 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// 
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef _ncvruntimetemplates_hpp_
+#define _ncvruntimetemplates_hpp_
+
+#include <stdarg.h>
+#include <vector>
+
+
 ////////////////////////////////////////////////////////////////////////////////
 // The Loki Library
 // Copyright (c) 2001 by Andrei Alexandrescu
@@ -14,13 +62,6 @@
 // http://loki-lib.sourceforge.net/index.php?n=Main.License
 ////////////////////////////////////////////////////////////////////////////////
 
-#ifndef _ncvruntimetemplates_hpp_
-#define _ncvruntimetemplates_hpp_
-
-#include <stdarg.h>
-#include <vector>
-
-
 namespace Loki
 {
     //==============================================================================
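
The hunk moves the include guard and includes above the Loki notice so the OpenCV license header sits first; the machinery itself is unchanged. What this header provides, in spirit, is runtime-to-compile-time dispatch: kernels are compiled once per combination of boolean template parameters, and a runtime flag picks the matching instantiation. A hedged sketch with hypothetical names (the real header automates the branching over N flags via Loki typelists rather than hand-written if/else):

    #include <cstdio>

    // One instantiation per flag value; inside a real kernel the flag is a
    // compile-time constant, so dead branches cost nothing.
    template <bool tbCacheCascade>
    void processWrapper()
    {
        std::printf("compiled with tbCacheCascade=%d\n", (int)tbCacheCascade);
    }

    // The runtime value selects the instantiation.
    void dispatch(bool bCacheCascade)
    {
        if (bCacheCascade) processWrapper<true>();
        else               processWrapper<false>();
    }
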
diff --git a/modules/gpu/src/opencv2/gpu/device/transform.hpp b/modules/gpu/src/opencv2/gpu/device/transform.hpp
index 959cca2350f071908f6e40deb54ae63861ec3e8d..79ed09439d0cbf36a01792d335faa4ed1c27ffa3 100644
--- a/modules/gpu/src/opencv2/gpu/device/transform.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp
@@ -68,51 +68,51 @@ namespace cv { namespace gpu { namespace device
 
     //! Read Write Traits
 
-    template <size_t src_elem_size, size_t dst_elem_size>
-    struct UnReadWriteTraits_
-    {
-        enum {shift=1};
-    };
-    template <size_t src_elem_size>
-    struct UnReadWriteTraits_<src_elem_size, 1>
-    {
-        enum {shift=4};
-    };
-    template <size_t src_elem_size>
-    struct UnReadWriteTraits_<src_elem_size, 2>
-    {
-        enum {shift=2};
+    template <size_t src_elem_size, size_t dst_elem_size>
+    struct UnReadWriteTraits_
+    {
+        enum {shift=1};
+    };
+    template <size_t src_elem_size>
+    struct UnReadWriteTraits_<src_elem_size, 1>
+    {
+        enum {shift=4};
+    };
+    template <size_t src_elem_size>
+    struct UnReadWriteTraits_<src_elem_size, 2>
+    {
+        enum {shift=2};
     };
-    template <typename T, typename D> struct UnReadWriteTraits
-    {
-        enum {shift=UnReadWriteTraits_<sizeof(T), sizeof(D)>::shift};
-        
-        typedef typename TypeVec<T, shift>::vec_t read_type;
-        typedef typename TypeVec<D, shift>::vec_t write_type;
+    template <typename T, typename D> struct UnReadWriteTraits
+    {
+        enum {shift=UnReadWriteTraits_<sizeof(T), sizeof(D)>::shift};
+        
+        typedef typename TypeVec<T, shift>::vec_t read_type;
+        typedef typename TypeVec<D, shift>::vec_t write_type;
     };
 
-    template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size>
-    struct BinReadWriteTraits_
-    {
-        enum {shift=1};
+    template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size>
+    struct BinReadWriteTraits_
+    {
+        enum {shift=1};
     };
-    template <size_t src_elem_size1, size_t src_elem_size2>
-    struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 1>
-    {
-        enum {shift=4};
+    template <size_t src_elem_size1, size_t src_elem_size2>
+    struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 1>
+    {
+        enum {shift=4};
     };
-    template <size_t src_elem_size1, size_t src_elem_size2>
-    struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 2>
-    {
-        enum {shift=2};
+    template <size_t src_elem_size1, size_t src_elem_size2>
+    struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 2>
+    {
+        enum {shift=2};
     };
-    template <typename T1, typename T2, typename D> struct BinReadWriteTraits
-    {
-        enum {shift=BinReadWriteTraits_<sizeof(T1), sizeof(T2), sizeof(D)>::shift};
-
-        typedef typename TypeVec<T1, shift>::vec_t read_type1;
-        typedef typename TypeVec<T2, shift>::vec_t read_type2;
-        typedef typename TypeVec<D , shift>::vec_t write_type;
+    template <typename T1, typename T2, typename D> struct BinReadWriteTraits
+    {
+        enum {shift=BinReadWriteTraits_<sizeof(T1), sizeof(T2), sizeof(D)>::shift};
+
+        typedef typename TypeVec<T1, shift>::vec_t read_type1;
+        typedef typename TypeVec<T2, shift>::vec_t read_type2;
+        typedef typename TypeVec<D , shift>::vec_t write_type;
     };
 
     //! Transform kernels
@@ -122,14 +122,14 @@ namespace cv { namespace gpu { namespace device
     {
         template <typename T, typename D, typename UnOp, typename Mask>
         static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
-        {
+        {
             if (mask(y, x_shifted))
                 dst.x = op(src.x);
         }
 
         template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
         static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
-        {
+        {
             if (mask(y, x_shifted))
                 dst.x = op(src1.x, src2.x);
         }
@@ -138,18 +138,18 @@ namespace cv { namespace gpu { namespace device
     {
         template <typename T, typename D, typename UnOp, typename Mask>
         static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
-        {
+        {
             if (mask(y, x_shifted))
-                dst.x = op(src.x);
+                dst.x = op(src.x);
             if (mask(y, x_shifted + 1))
                 dst.y = op(src.y);
         }
 
         template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
         static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
-        {
+        {
             if (mask(y, x_shifted))
-                dst.x = op(src1.x, src2.x);
+                dst.x = op(src1.x, src2.x);
             if (mask(y, x_shifted + 1))
                 dst.y = op(src1.y, src2.y);
         }
@@ -158,22 +158,22 @@ namespace cv { namespace gpu { namespace device
     {
         template <typename T, typename D, typename UnOp, typename Mask>
         static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
-        {
+        {
             if (mask(y, x_shifted))
-                dst.x = op(src.x);
+                dst.x = op(src.x);
             if (mask(y, x_shifted + 1))
-                dst.y = op(src.y);
+                dst.y = op(src.y);
             if (mask(y, x_shifted + 2))
                 dst.z = op(src.z);
         }
 
         template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
         static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
-        {
+        {
             if (mask(y, x_shifted))
-                dst.x = op(src1.x, src2.x);
+                dst.x = op(src1.x, src2.x);
             if (mask(y, x_shifted + 1))
-                dst.y = op(src1.y, src2.y);
+                dst.y = op(src1.y, src2.y);
             if (mask(y, x_shifted + 2))
                 dst.z = op(src1.z, src2.z);
         }
@@ -182,65 +182,65 @@ namespace cv { namespace gpu { namespace device
     {
         template <typename T, typename D, typename UnOp, typename Mask>
         static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
-        {
+        {
             if (mask(y, x_shifted))
-                dst.x = op(src.x);
+                dst.x = op(src.x);
             if (mask(y, x_shifted + 1))
-                dst.y = op(src.y);
+                dst.y = op(src.y);
             if (mask(y, x_shifted + 2))
-                dst.z = op(src.z);
+                dst.z = op(src.z);
             if (mask(y, x_shifted + 3))
                 dst.w = op(src.w);
         }
 
         template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
         static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
-        {
+        {
             if (mask(y, x_shifted))
-                dst.x = op(src1.x, src2.x);
+                dst.x = op(src1.x, src2.x);
             if (mask(y, x_shifted + 1))
-                dst.y = op(src1.y, src2.y);
+                dst.y = op(src1.y, src2.y);
             if (mask(y, x_shifted + 2))
-                dst.z = op(src1.z, src2.z);
+                dst.z = op(src1.z, src2.z);
             if (mask(y, x_shifted + 3))
                 dst.w = op(src1.w, src2.w);
         }
     };
 
-    template <typename T, typename D, typename UnOp, typename Mask>
-    __global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep_<D> dst_, const Mask mask, UnOp op)
-    {
-        typedef typename UnReadWriteTraits<T, D>::read_type read_type;
-        typedef typename UnReadWriteTraits<T, D>::write_type write_type;
-        const int shift = UnReadWriteTraits<T, D>::shift;
-
-        const int x = threadIdx.x + blockIdx.x * blockDim.x;
-        const int y = threadIdx.y + blockIdx.y * blockDim.y;
-        const int x_shifted = x * shift;
-
-        if (y < src_.rows)
-        {
-            const T* src = src_.ptr(y);
-            D* dst = dst_.ptr(y);
-
-            if (x_shifted + shift - 1 < src_.cols)
-            {
-                read_type src_n_el = ((const read_type*)src)[x];
-                write_type dst_n_el;
-
-                OpUnroller<shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
-
-                ((write_type*)dst)[x] = dst_n_el;
-            }
-            else
-            {
-                for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
-                {
-                    if (mask(y, real_x))
-                        dst[real_x] = op(src[real_x]);
-                }
-            }
-        }
+    template <typename T, typename D, typename UnOp, typename Mask>
+    __global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep_<D> dst_, const Mask mask, UnOp op)
+    {
+        typedef typename UnReadWriteTraits<T, D>::read_type read_type;
+        typedef typename UnReadWriteTraits<T, D>::write_type write_type;
+        const int shift = UnReadWriteTraits<T, D>::shift;
+
+        const int x = threadIdx.x + blockIdx.x * blockDim.x;
+        const int y = threadIdx.y + blockIdx.y * blockDim.y;
+        const int x_shifted = x * shift;
+
+        if (y < src_.rows)
+        {
+            const T* src = src_.ptr(y);
+            D* dst = dst_.ptr(y);
+
+            if (x_shifted + shift - 1 < src_.cols)
+            {
+                read_type src_n_el = ((const read_type*)src)[x];
+                write_type dst_n_el;
+
+                OpUnroller<shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
+
+                ((write_type*)dst)[x] = dst_n_el;
+            }
+            else
+            {
+                for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
+                {
+                    if (mask(y, real_x))
+                        dst[real_x] = op(src[real_x]);
+                }
+            }
+        }
     }
 
     template <typename T, typename D, typename UnOp, typename Mask>
@@ -255,44 +255,44 @@ namespace cv { namespace gpu { namespace device
         }
     }
 
-    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-    __global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep_<T2> src2_, PtrStep_<D> dst_, 
-        const Mask mask, BinOp op)
-    {
-        typedef typename BinReadWriteTraits<T1, T2, D>::read_type1 read_type1;
-        typedef typename BinReadWriteTraits<T1, T2, D>::read_type2 read_type2;
-        typedef typename BinReadWriteTraits<T1, T2, D>::write_type write_type;
-        const int shift = BinReadWriteTraits<T1, T2, D>::shift;
-
-        const int x = threadIdx.x + blockIdx.x * blockDim.x;
-        const int y = threadIdx.y + blockIdx.y * blockDim.y;
-        const int x_shifted = x * shift;
-
-        if (y < src1_.rows)
-        {
-            const T1* src1 = src1_.ptr(y);
-            const T2* src2 = src2_.ptr(y);
-            D* dst = dst_.ptr(y);
-
-            if (x_shifted + shift - 1 < src1_.cols)
-            {
-                read_type1 src1_n_el = ((const read_type1*)src1)[x];
-                read_type2 src2_n_el = ((const read_type2*)src2)[x];
-                write_type dst_n_el;
-                
-                OpUnroller<shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
-
-                ((write_type*)dst)[x] = dst_n_el;
-            }
-            else
-            {
-                for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
-                {
-                    if (mask(y, real_x))
-                        dst[real_x] = op(src1[real_x], src2[real_x]);
-                }
-            }
-        }
+    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+    __global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep_<T2> src2_, PtrStep_<D> dst_, 
+        const Mask mask, BinOp op)
+    {
+        typedef typename BinReadWriteTraits<T1, T2, D>::read_type1 read_type1;
+        typedef typename BinReadWriteTraits<T1, T2, D>::read_type2 read_type2;
+        typedef typename BinReadWriteTraits<T1, T2, D>::write_type write_type;
+        const int shift = BinReadWriteTraits<T1, T2, D>::shift;
+
+        const int x = threadIdx.x + blockIdx.x * blockDim.x;
+        const int y = threadIdx.y + blockIdx.y * blockDim.y;
+        const int x_shifted = x * shift;
+
+        if (y < src1_.rows)
+        {
+            const T1* src1 = src1_.ptr(y);
+            const T2* src2 = src2_.ptr(y);
+            D* dst = dst_.ptr(y);
+
+            if (x_shifted + shift - 1 < src1_.cols)
+            {
+                read_type1 src1_n_el = ((const read_type1*)src1)[x];
+                read_type2 src2_n_el = ((const read_type2*)src2)[x];
+                write_type dst_n_el;
+                
+                OpUnroller<shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
+
+                ((write_type*)dst)[x] = dst_n_el;
+            }
+            else
+            {
+                for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
+                {
+                    if (mask(y, real_x))
+                        dst[real_x] = op(src1[real_x], src2[real_x]);
+                }
+            }
+        }
     }
 
     template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
@@ -355,11 +355,11 @@ namespace cv
             template <typename T, typename D, typename UnOp, typename Mask>
             static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask, 
                              cudaStream_t stream = 0)
-            {
+            {
                 const int shift = device::UnReadWriteTraits<T, D>::shift;
 
                 dim3 threads(16, 16, 1);
-                dim3 grid(1, 1, 1);            
+                dim3 grid(1, 1, 1);            
 
                 grid.x = divUp(src.cols, threads.x * shift);
                 grid.y = divUp(src.rows, threads.y);        
@@ -373,7 +373,7 @@ namespace cv
             template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
             static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, 
                              BinOp op, const Mask& mask, cudaStream_t stream = 0)
-            {
+            {
                 const int shift = device::BinReadWriteTraits<T1, T2, D>::shift;
 
                 dim3 threads(16, 16, 1);
@@ -392,7 +392,7 @@ namespace cv
         template <typename T, typename D, typename UnOp, typename Mask>
         static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask, 
             cudaStream_t stream = 0)
-        {
+        {
             TransformChooser<device::VecTraits<T>::cn == 1 && device::VecTraits<D>::cn == 1 && device::UnReadWriteTraits<T, D>::shift != 1>::call(src, dst, op, mask, stream);
         }
 
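
The transform.hpp hunk above is a whitespace-only rewrite (trailing blanks and line endings), but the scheme it carries is worth restating: UnReadWriteTraits/BinReadWriteTraits pick a shift of 4 for 1-byte elements and 2 for 2-byte elements, TypeVec maps that to a vector type (uchar4, short2, ...), and each thread then loads and stores one vector so global-memory transactions stay coalesced; OpUnroller applies the operation per lane under the mask, and threads covering the ragged end of a row fall back to a scalar loop. A plain C++ sketch of the per-thread indexing, with a hypothetical increment as the op:

    // Host-side sketch of transformSmart's indexing. shift = elements per
    // thread; x plays the role of the global thread index along the row.
    void transform_row_sketch(const unsigned char* src, unsigned char* dst,
                              int cols, int shift, int x)
    {
        int x_shifted = x * shift;
        if (x_shifted + shift - 1 < cols)   // a whole vector fits in the row:
        {                                   // the kernel does ONE vector load,
            for (int k = 0; k < shift; ++k) // then OpUnroller touches each lane
                dst[x_shifted + k] = (unsigned char)(src[x_shifted + k] + 1);
        }
        else                                // ragged row tail: scalar fallback
        {
            for (int real_x = x_shifted; real_x < cols; ++real_x)
                dst[real_x] = (unsigned char)(src[real_x] + 1);
        }
    }
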
diff --git a/modules/gpu/src/precomp.hpp b/modules/gpu/src/precomp.hpp
index d5ad3cc7271f716c5154c3108eefe4059b9eb66a..ca073119eeb494660a1e81c00bf4b17c88f818b6 100644
--- a/modules/gpu/src/precomp.hpp
+++ b/modules/gpu/src/precomp.hpp
@@ -69,9 +69,9 @@
     #include "cufft.h"
     #include "opencv2/gpu/stream_accessor.hpp"
     #include "npp.h"    
-    #include "npp_staging.h"
-
-	#include "nvidia/NCV.hpp"
+
+	#include "nvidia/core/NCV.hpp"
+	#include "nvidia/NPP_staging/npp_staging.hpp"
 	#include "nvidia/NCVHaarObjectDetection.hpp"
 
 #define CUDART_MINIMUM_REQUIRED_VERSION 3020
diff --git a/modules/gtest/src/gtestcv.cpp b/modules/gtest/src/gtestcv.cpp
index 25082ca1af770d5f5b044a4f2551966cb0253a9e..3b3b695235b2d8063c6608005cf4700141c84280 100644
--- a/modules/gtest/src/gtestcv.cpp
+++ b/modules/gtest/src/gtestcv.cpp
@@ -1378,7 +1378,7 @@ cmpEpsFlt_(const _Tp* src1, const _Tp* src2, size_t total, int imaxdiff, size_t
     {
         _Tp a = src1[i], b = src2[i];
         if( a < 0 ) a ^= C; if( b < 0 ) b ^= C;
-        _Tp d = std::abs(a - b);
+        _Tp d = std::abs(double(a - b));
         if( d > imaxdiff )
         {
             idx = i + startidx;
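
The cast fixes an overload problem rather than a numeric one: this function compares floating-point bit patterns as integers, and when _Tp is a 64-bit integer, pre-C++11 standard libraries provide no std::abs overload for long long, so the bare call is ambiguous or silently truncates through int. Routing through double compiles everywhere and stays exact for differences below 2^53, which is ample for an epsilon-style comparison. A sketch:

    #include <cmath>

    // Portable absolute difference for 64-bit values on C++03 toolchains
    // (std::abs(long long) only arrived in C++11).
    long long abs_diff(long long a, long long b)
    {
        return (long long)std::abs(double(a - b)); // exact while |a-b| < 2^53
    }
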
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index 1a601589a9d688851d32380dee98be05ab078ac9..079d3af271d27e19f507798c565f8e8a0b4a4556 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -3,8 +3,8 @@
 #
 # ----------------------------------------------------------------------------
 
-add_subdirectory(c)
-add_subdirectory(cpp)
+#add_subdirectory(c)
+#add_subdirectory(cpp)
 add_subdirectory(gpu)
 
 if(0)
diff --git a/samples/gpu/CMakeLists.txt b/samples/gpu/CMakeLists.txt
index de9fe6ef8090ff83efdb21028354ffda1982afb3..7b39e6ee6b5dd259c39495ec840ca11df61970a0 100644
--- a/samples/gpu/CMakeLists.txt
+++ b/samples/gpu/CMakeLists.txt
@@ -14,10 +14,12 @@ if (BUILD_EXAMPLES)
         "${CMAKE_SOURCE_DIR}/modules/legacy/include"
         "${CMAKE_SOURCE_DIR}/modules/contrib/include"
         "${CMAKE_SOURCE_DIR}/modules/gpu/include"
+        "${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia"
+        "${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core"
     )   
     
     if(HAVE_CUDA)
-        include_directories(${CUDA_INCLUDE_DIRS})
+        include_directories(${CUDA_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core)
     endif()
 
     if(CMAKE_COMPILER_IS_GNUCXX)
diff --git a/modules/gpu/src/nvidia/FaceDetectionFeed.cpp_NvidiaAPI_sample b/samples/gpu/cascadeclassifier_nvidia_api.cpp
similarity index 74%
rename from modules/gpu/src/nvidia/FaceDetectionFeed.cpp_NvidiaAPI_sample
rename to samples/gpu/cascadeclassifier_nvidia_api.cpp
index c1926a38c28ec8f46b5a5552193e05db431332b4..ff99f2c361e39e4ce49c6d219c6b0cdf45b63c98 100644
--- a/modules/gpu/src/nvidia/FaceDetectionFeed.cpp_NvidiaAPI_sample
+++ b/samples/gpu/cascadeclassifier_nvidia_api.cpp
@@ -1,60 +1,25 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
-// 
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
+#pragma warning( disable : 4201 4408 4127 4100)
 #include <cstdio>
-#include <cuda_runtime.h>
 
-#define CV_NO_BACKWARD_COMPATIBILITY
+#include "cvconfig.h"
+#if !defined(HAVE_CUDA)
+    int main( int argc, const char** argv ) { return printf("Please compile the library with CUDA support.\n"), -1; }
+#else
 
-#include "opencv2/opencv.hpp"
 
+#include <cuda_runtime.h>
+#include "opencv2/opencv.hpp"
 #include "NCVHaarObjectDetection.hpp"
 
+
+
 using namespace cv;
-using namespace std;
 
-const Size preferredVideoFrameSize(640, 480);
 
-string preferredClassifier = "haarcascade_frontalface_alt.xml";
-string wndTitle = "NVIDIA Computer Vision SDK :: Face Detection in Video Feed";
+const Size2i preferredVideoFrameSize(640, 480);
+
+std::string preferredClassifier = "haarcascade_frontalface_alt.xml";
+std::string wndTitle = "NVIDIA Computer Vision SDK :: Face Detection in Video Feed";
 
 
 void printSyntax(void)
@@ -62,7 +27,6 @@ void printSyntax(void)
     printf("Syntax: FaceDetectionFeed.exe [-c cameranum | -v filename] classifier.xml\n");
 }
 
-
 void imagePrintf(Mat& img, int lineOffsY, Scalar color, const char *format, ...)
 {    
     int fontFace = CV_FONT_HERSHEY_PLAIN;
@@ -83,7 +47,6 @@ void imagePrintf(Mat& img, int lineOffsY, Scalar color, const char *format, ...)
     va_end(arg_ptr);    
 }
 
-
 NCVStatus process(Mat *srcdst,
                   Ncv32u width, Ncv32u height,
                   NcvBool bShowAllHypotheses, NcvBool bLargestFace,
@@ -104,15 +67,16 @@ NCVStatus process(Mat *srcdst,
     ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
     NCVMatrixAlloc<Ncv8u> h_src(cpuAllocator, width, height);
     ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
-    NCVVectorAlloc<NcvRect32u> d_rects(gpuAllocator, 100);        
+    NCVVectorAlloc<NcvRect32u> d_rects(gpuAllocator, 100);
     ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
 
-    Mat h_src_hdr(Size(width, height), CV_8U, h_src.ptr(), h_src.stride());
+    NCV_SKIP_COND_BEGIN
+
+    for (Ncv32u i=0; i<(Ncv32u)srcdst->rows; i++)
+    {
+        memcpy(h_src.ptr() + i * h_src.stride(), srcdst->ptr(i), srcdst->cols);
+    }
 
-    NCV_SKIP_COND_BEGIN        
-    
-    (*srcdst).copyTo(h_src_hdr);
-    
     ncvStat = h_src.copySolid(d_src, 0);
     ncvAssertReturnNcvStat(ncvStat);
     ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
@@ -130,8 +94,9 @@ NCVStatus process(Mat *srcdst,
         haar.ClassifierSize,
         bShowAllHypotheses ? 0 : 4,
         1.2f, 1,
-        (bLargestFace ? NCVPipeObjDet_FindLargestObject : 0) | NCVPipeObjDet_VisualizeInPlace,
-        gpuAllocator, cpuAllocator, devProp.major, devProp.minor, 0);
+        (bLargestFace ? NCVPipeObjDet_FindLargestObject : 0)
+        | NCVPipeObjDet_VisualizeInPlace,
+        gpuAllocator, cpuAllocator, devProp, 0);
     ncvAssertReturnNcvStat(ncvStat);
     ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
 
@@ -141,14 +106,16 @@ NCVStatus process(Mat *srcdst,
     ncvAssertReturnNcvStat(ncvStat);
     ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
 
-    h_src_hdr.copyTo(*srcdst);
-    
+    for (Ncv32u i=0; i<(Ncv32u)srcdst->rows; i++)
+    {
+        memcpy(srcdst->ptr(i), h_src.ptr() + i * h_src.stride(), srcdst->cols);
+    }
+
     NCV_SKIP_COND_END
 
     return NCV_SUCCESS;
 }
 
-
 int main( int argc, const char** argv )
 {
     NCVStatus ncvStat;
@@ -160,14 +127,19 @@ int main( int argc, const char** argv )
     printf("  Space - Switch between NCV and OpenCV\n");
     printf("  L     - Switch between FullSearch and LargestFace modes\n");
     printf("  U     - Toggle unfiltered hypotheses visualization in FullSearch\n");
-    
-    if (argc != 4 && argc != 1)
-        return printSyntax(), -1;
-
+
     VideoCapture capture;    
-    Size frameSize;
+    bool bQuit = false;
+
+    Size2i frameSize;
 
-    if (argc == 1 || strcmp(argv[1], "-c") == 0)
+    if (argc != 4 && argc != 1)
+    {
+        printSyntax();
+        return -1;
+    }
+
+    if (argc == 1 || strcmp(argv[1], "-c") == 0)
     {
         // Camera input is specified
         int camIdx = (argc == 3) ? atoi(argv[2]) : 0;
@@ -192,14 +164,26 @@ int main( int argc, const char** argv )
         return printSyntax(), -1;
 
     NcvBool bUseOpenCV = true;
-    NcvBool bLargestFace = true;
-    NcvBool bShowAllHypotheses = false;    
+    NcvBool bLargestFace = false; //LargestFace=true is used usually during training
+    NcvBool bShowAllHypotheses = false;
 
-    string classifierFile = (argc == 1) ? preferredClassifier : argv[3];
-    
     CascadeClassifier classifierOpenCV;
+    std::string classifierFile;
+    if (argc == 1)
+    {
+        classifierFile = preferredClassifier;
+    }
+    else
+    {
+        classifierFile.assign(argv[3]);
+    }
+
     if (!classifierOpenCV.load(classifierFile))
-        return printf("Error (in OpenCV) opening classifier\n"), printSyntax(), -1;
+    {
+        printf("Error (in OpenCV) opening classifier\n");
+        printSyntax();
+        return -1;
+    }
 
     int devId;
     ncvAssertCUDAReturn(cudaGetDevice(&devId), -1);
@@ -214,9 +198,9 @@ int main( int argc, const char** argv )
     //
     //==============================================================================
 
-    NCVMemNativeAllocator gpuCascadeAllocator(NCVMemoryTypeDevice);
+    NCVMemNativeAllocator gpuCascadeAllocator(NCVMemoryTypeDevice, devProp.textureAlignment);
     ncvAssertPrintReturn(gpuCascadeAllocator.isInitialized(), "Error creating cascade GPU allocator", -1);
-    NCVMemNativeAllocator cpuCascadeAllocator(NCVMemoryTypeHostPinned);
+    NCVMemNativeAllocator cpuCascadeAllocator(NCVMemoryTypeHostPinned, devProp.textureAlignment);
     ncvAssertPrintReturn(cpuCascadeAllocator.isInitialized(), "Error creating cascade CPU allocator", -1);
 
     Ncv32u haarNumStages, haarNumNodes, haarNumFeatures;
@@ -278,32 +262,36 @@ int main( int argc, const char** argv )
     // Main processing loop
     //
     //==============================================================================
-    
-    namedWindow(wndTitle, 1);
 
+    namedWindow(wndTitle, 1);
     Mat frame, gray, frameDisp;
 
-    for(;;)
+    do
     {
-        // For camera and video file, capture the next image                
+        // For camera and video file, capture the next image
         capture >> frame;
         if (frame.empty())
             break;
-                
+
         cvtColor(frame, gray, CV_BGR2GRAY);
 
+        //
         // process
+        //
+
         NcvSize32u minSize = haar.ClassifierSize;
         if (bLargestFace)
         {
             Ncv32u ratioX = preferredVideoFrameSize.width / minSize.width;
             Ncv32u ratioY = preferredVideoFrameSize.height / minSize.height;
             Ncv32u ratioSmallest = std::min(ratioX, ratioY);
-            ratioSmallest = (Ncv32u)std::max(ratioSmallest / 2.5f, 1.f);
+            ratioSmallest = std::max((Ncv32u)(ratioSmallest / 2.5f), (Ncv32u)1);
             minSize.width *= ratioSmallest;
             minSize.height *= ratioSmallest;
         }
-        
+
+        Ncv32f avgTime;
         NcvTimer timer = ncvStartTimer();
 
         if (!bUseOpenCV)
@@ -324,15 +312,16 @@ int main( int argc, const char** argv )
                 rectsOpenCV,
                 1.2f,
                 bShowAllHypotheses && !bLargestFace ? 0 : 4,
-                (bLargestFace ? CV_HAAR_FIND_BIGGEST_OBJECT : 0) | CV_HAAR_SCALE_IMAGE,
+                (bLargestFace ? CV_HAAR_FIND_BIGGEST_OBJECT : 0)
+                | CV_HAAR_SCALE_IMAGE,
                 Size(minSize.width, minSize.height));
 
             for (size_t rt = 0; rt < rectsOpenCV.size(); ++rt)
                 rectangle(gray, rectsOpenCV[rt], Scalar(255));
         }
 
-        Ncv32f avgTime = (Ncv32f)ncvEndQueryTimerMs(timer);
-        
+        avgTime = (Ncv32f)ncvEndQueryTimerMs(timer);
+
         cvtColor(gray, frameDisp, CV_GRAY2BGR);
 
         imagePrintf(frameDisp, 0, CV_RGB(255,  0,0), "Space - Switch NCV%s / OpenCV%s", bUseOpenCV?"":" (ON)", bUseOpenCV?" (ON)":"");
@@ -347,16 +336,25 @@ int main( int argc, const char** argv )
         case ' ':
             bUseOpenCV = !bUseOpenCV;
             break;
-        case 'L':case 'l':
+        case 'L':
+        case 'l':
             bLargestFace = !bLargestFace;
             break;
-        case 'U':case 'u':
+        case 'U':
+        case 'u':
             bShowAllHypotheses = !bShowAllHypotheses;
             break;
         case 27:
-            return 0;            
+            bQuit = true;
+            break;
         }
-    }
-        
+
+    } while (!bQuit);
+
+    cvDestroyWindow(wndTitle.c_str());
+
     return 0;
 }
+
+
+#endif
\ No newline at end of file
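
One detail of the rewritten sample worth noting: frames are moved between the cv::Mat and the pinned NCV matrix row by row because the two buffers generally have different strides (Mat::step versus h_src.stride()), so a single flat memcpy would smear row padding into the payload. The pattern, extracted as a sketch:

    #include <cstddef>
    #include <cstring>

    // Copy 'cols' payload bytes per row between images with different strides.
    void copy_rows(unsigned char* dst, std::size_t dstStride,
                   const unsigned char* src, std::size_t srcStride,
                   int rows, int cols)
    {
        for (int i = 0; i < rows; ++i)
            std::memcpy(dst + i * dstStride, src + i * srcStride, cols);
    }
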
diff --git a/tests/gpu/CMakeLists.txt b/tests/gpu/CMakeLists.txt
index 4a25f2f76ddaa819577f7309b07465bfd003b2f4..ca0e601ef7471c07e6cc41f822731926905c466b 100644
--- a/tests/gpu/CMakeLists.txt
+++ b/tests/gpu/CMakeLists.txt
@@ -3,12 +3,15 @@
 # ----------------------------------------------------------------------------
 project(opencv_test_gpu)
 
+set(the_target "opencv_test_gpu")
+
 file(GLOB test_srcs "src/*.cpp")
-source_group("Src" FILES ${test_srcs})
 file(GLOB test_hdrs "src/*.h*")
+source_group("Src" FILES ${test_srcs})
 source_group("Include" FILES ${test_hdrs})
 
-set(the_target "opencv_test_gpu")
+
+
 
 include_directories	(
 			"${CMAKE_SOURCE_DIR}/include/opencv"
@@ -26,11 +29,21 @@ include_directories	(
 			"${CMAKE_SOURCE_DIR}/modules/ml/include"
 			"${CMAKE_CURRENT_SOURCE_DIR}/src"
 			"${CMAKE_CURRENT_BINARY_DIR}"
+			"${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia"
+			"${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core"
 			)
 
 include_directories(../cxts)
 
-add_executable(${the_target} ${test_srcs} ${test_hdrs})
+if(HAVE_CUDA)
+	include_directories(${CUDA_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/NPP_staging)
+	
+	file(GLOB nvidia "src/nvidia/*.*")
+	set(ncv_cpp ../../modules/gpu/src/nvidia/core/NCV.cpp)
+	source_group("Src\\nvidia" FILES ${nvidia})
+endif()
+
+add_executable(${the_target} ${test_srcs} ${test_hdrs} ${nvidia} ${ncv_cpp})
 
 # Additional target properties
 set_target_properties(${the_target} PROPERTIES
diff --git a/tests/gpu/src/nvidia/NCVAutoTestLister.hpp b/tests/gpu/src/nvidia/NCVAutoTestLister.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d9f4438b8034e1414b4a2b5f848086c5a92b2a99
--- /dev/null
+++ b/tests/gpu/src/nvidia/NCVAutoTestLister.hpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _ncvautotestlister_hpp_
+#define _ncvautotestlister_hpp_
+
+#include <vector>
+
+#include "NCVTest.hpp"
+
+class NCVAutoTestLister
+{
+public:
+
+    NCVAutoTestLister(std::string testSuiteName, NcvBool bStopOnFirstFail=false, NcvBool bCompactOutput=true)
+        :
+    testSuiteName(testSuiteName),
+    bStopOnFirstFail(bStopOnFirstFail),
+    bCompactOutput(bCompactOutput)
+    {
+    }
+
+    void add(INCVTest *test)
+    {
+        this->tests.push_back(test);
+    }
+
+    void invoke()
+    {
+        Ncv32u nPassed = 0;
+        Ncv32u nFailed = 0;
+        Ncv32u nFailedMem = 0;
+
+        if (bCompactOutput)
+        {
+            printf("Test suite '%s' with %d tests\n", 
+                testSuiteName.c_str(),
+                (int)(this->tests.size()));
+        }
+
+        for (Ncv32u i=0; i<this->tests.size(); i++)
+        {
+            INCVTest &curTest = *tests[i];
+
+            NCVTestReport curReport;
+            bool res = curTest.executeTest(curReport);
+
+            if (!bCompactOutput)
+            {
+                printf("Test %3i %16s; Consumed mem GPU = %8d, CPU = %8d; %s\n",
+                    i,
+                    curTest.getName().c_str(),
+                    curReport.statsNums["MemGPU"],
+                    curReport.statsNums["MemCPU"],
+                    curReport.statsText["rcode"].c_str());
+            }
+
+            if (res)
+            {
+                nPassed++;
+                if (bCompactOutput)
+                {
+                    printf(".");
+                }
+            }
+            else
+            {
+                if (!curReport.statsText["rcode"].compare("FAILED"))
+                {
+                    nFailed++;
+                    if (bCompactOutput)
+                    {
+                        printf("x");
+                    }
+                    if (bStopOnFirstFail)
+                    {
+                        break;
+                    }
+                }
+                else
+                {
+                    nFailedMem++;
+                    if (bCompactOutput)
+                    {
+                        printf("m");
+                    }
+                }
+            }
+            fflush(stdout);
+        }
+        if (bCompactOutput)
+        {
+            printf("\n");
+        }
+
+        printf("Test suite '%s' complete: %d total, %d passed, %d memory errors, %d failed\n\n", 
+            testSuiteName.c_str(),
+            (int)(this->tests.size()),
+            nPassed,
+            nFailedMem,
+            nFailed);
+    }
+
+    ~NCVAutoTestLister()
+    {
+        for (Ncv32u i=0; i<this->tests.size(); i++)
+        {
+            delete tests[i];
+        }
+    }
+
+private:
+
+    NcvBool bStopOnFirstFail;
+    NcvBool bCompactOutput;
+    std::string testSuiteName;
+    std::vector<INCVTest *> tests;
+};
+
+#endif // _ncvautotestlister_hpp_
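
A minimal usage sketch for the lister, assuming the headers above are on the include path; AlwaysPass is a hypothetical stand-in for a real NCVTestProvider subclass such as TestCompact below:

    #include <string>
    #include "NCVAutoTestLister.hpp"

    // Trivial always-passing test, just to exercise the lister.
    class AlwaysPass : public INCVTest
    {
    public:
        virtual bool executeTest(NCVTestReport &report)
        {
            report.statsText["rcode"] = "Passed";
            return true;
        }
        virtual std::string getName() const { return "always_pass"; }
    };

    int main()
    {
        NCVAutoTestLister lister("demo suite");
        lister.add(new AlwaysPass);  // the lister deletes it in its destructor
        lister.invoke();             // compact mode: one '.', 'x' or 'm' per
        return 0;                    // test, then a summary line
    }

(Strictly, INCVTest would want a virtual destructor for the lister's delete through the base pointer to be well-defined; the real tests derive from NCVTestProvider, which declares one.)
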
diff --git a/tests/gpu/src/nvidia/NCVTest.hpp b/tests/gpu/src/nvidia/NCVTest.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b8c2d9729e6448563c2df3bf91e053aad13df939
--- /dev/null
+++ b/tests/gpu/src/nvidia/NCVTest.hpp
@@ -0,0 +1,211 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _ncvtest_hpp_
+#define _ncvtest_hpp_
+
+#pragma warning( disable : 4201 4408 4127 4100)
+
+#include <string>
+#include <vector>
+#include <map>
+#include <memory>
+#include <algorithm>
+#include <fstream>
+
+#include <cuda_runtime.h>
+#include "NPP_staging.hpp"
+
+
+struct NCVTestReport
+{
+    std::map<std::string, Ncv32u> statsNums;
+    std::map<std::string, std::string> statsText;
+};
+
+
+class INCVTest
+{
+public:
+    virtual bool executeTest(NCVTestReport &report) = 0;
+    virtual std::string getName() const = 0;
+};
+
+
+class NCVTestProvider : public INCVTest
+{
+public:
+
+    NCVTestProvider(std::string testName)
+        :
+        testName(testName)
+    {
+        int devId;
+        ncvAssertPrintReturn(cudaSuccess == cudaGetDevice(&devId), "Error returned from cudaGetDevice", );
+        ncvAssertPrintReturn(cudaSuccess == cudaGetDeviceProperties(&this->devProp, devId), "Error returned from cudaGetDeviceProperties", );
+    }
+
+    virtual bool init() = 0;
+    virtual bool process() = 0;
+    virtual bool deinit() = 0;
+    virtual bool toString(std::ofstream &strOut) = 0;
+
+    virtual std::string getName() const
+    {
+        return this->testName;
+    }
+
+    virtual ~NCVTestProvider()
+    {
+        deinitMemory();
+    }
+
+    virtual bool executeTest(NCVTestReport &report)
+    {
+        bool res;
+        report.statsText["rcode"] = "FAILED";
+
+        res = initMemory(report);
+        if (!res)
+        {
+            dumpToFile(report);
+            deinitMemory();
+            return false;
+        }
+
+        res = init();
+        if (!res)
+        {
+            dumpToFile(report);
+            deinit();
+            deinitMemory();
+            return false;
+        }
+
+        res = process();
+        if (!res)
+        {
+            dumpToFile(report);
+            deinit();
+            deinitMemory();
+            return false;
+        }
+
+        res = deinit();
+        if (!res)
+        {
+            dumpToFile(report);
+            deinitMemory();
+            return false;
+        }
+
+        deinitMemory();
+
+        report.statsText["rcode"] = "Passed";
+        return true;
+    }
+
+protected:
+
+    cudaDeviceProp devProp;
+    std::auto_ptr<INCVMemAllocator> allocatorGPU;
+    std::auto_ptr<INCVMemAllocator> allocatorCPU;
+
+private:
+
+    std::string testName;
+
+    bool initMemory(NCVTestReport &report)
+    {
+        this->allocatorGPU.reset(new NCVMemStackAllocator(devProp.textureAlignment));
+        this->allocatorCPU.reset(new NCVMemStackAllocator(devProp.textureAlignment));
+
+        if (!this->allocatorGPU.get()->isInitialized() ||
+            !this->allocatorCPU.get()->isInitialized())
+        {
+            report.statsText["rcode"] = "Memory FAILED";
+            return false;
+        }
+
+        if (!this->process())
+        {
+            report.statsText["rcode"] = "Memory FAILED";
+            return false;
+        }
+
+        Ncv32u maxGPUsize = (Ncv32u)this->allocatorGPU.get()->maxSize();
+        Ncv32u maxCPUsize = (Ncv32u)this->allocatorCPU.get()->maxSize();
+
+        report.statsNums["MemGPU"] = maxGPUsize;
+        report.statsNums["MemCPU"] = maxCPUsize;
+
+        this->allocatorGPU.reset(new NCVMemStackAllocator(NCVMemoryTypeDevice, maxGPUsize, devProp.textureAlignment));
+
+        this->allocatorCPU.reset(new NCVMemStackAllocator(NCVMemoryTypeHostPinned, maxCPUsize, devProp.textureAlignment));
+
+        if (!this->allocatorGPU.get()->isInitialized() ||
+            !this->allocatorCPU.get()->isInitialized())
+        {
+            report.statsText["rcode"] = "Memory FAILED";
+            return false;
+        }
+
+        return true;
+    }
+
+    void deinitMemory()
+    {
+        this->allocatorGPU.reset();
+        this->allocatorCPU.reset();
+    }
+
+    void dumpToFile(NCVTestReport &report)
+    {
+        bool bReasonMem = (0 == report.statsText["rcode"].compare("Memory FAILED"));
+        std::string fname = "TestDump_";
+        fname += (bReasonMem ? "m_" : "") + this->testName + ".log";
+        std::ofstream stream(fname.c_str(), std::ios::trunc | std::ios::out);
+        if (!stream.is_open()) return;
+
+        stream << "NCV Test Failure Log: " << this->testName << std::endl;
+        stream << "====================================================" << std::endl << std::endl;
+        stream << "Test initialization report: " << std::endl;
+        for (std::map<std::string,std::string>::iterator it=report.statsText.begin();
+             it != report.statsText.end(); it++)
+        {
+            stream << it->first << "=" << it->second << std::endl;
+        }
+        for (std::map<std::string,Ncv32u>::iterator it=report.statsNums.begin();
+            it != report.statsNums.end(); it++)
+        {
+            stream << it->first << "=" << it->second << std::endl;
+        }
+        stream << std::endl;
+
+        stream << "Test initialization parameters: " << std::endl;
+        bool bSerializeRes = false;
+        try
+        {
+            bSerializeRes = this->toString(stream);
+        }
+        catch (...)
+        {
+        }
+
+        if (!bSerializeRes)
+        {
+            stream << "Couldn't retrieve object dump" << std::endl;
+        }
+
+        stream.flush();
+    }
+};
+
+#endif // _ncvtest_hpp_
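
executeTest above leans on a two-pass allocation scheme: initMemory first runs process() against NCVMemStackAllocators constructed with only an alignment, which count peak usage instead of allocating, then rebuilds real device and pinned stack allocators sized to the measured maxima. The NCV_SKIP_COND machinery in the tests below skips actual data work during the counting pass, which is why allocators expose isCounting(). The pattern in miniature, with a hypothetical arena:

    #include <cstddef>

    // Measure-then-allocate sketch (hypothetical CountingArena, not the NCV API).
    struct CountingArena
    {
        std::size_t offset, peak;
        CountingArena() : offset(0), peak(0) {}
        void* alloc(std::size_t n)          // counting pass: track the
        {                                   // high-water mark, hand out
            offset += n;                    // no real memory
            if (offset > peak) peak = offset;
            return 0;
        }
        void freeAll() { offset = 0; }
    };
    // Pass 1: run the workload against CountingArena and record arena.peak.
    // Pass 2: allocate exactly peak bytes once (cudaMalloc/cudaMallocHost) and
    //         rerun the workload for real, sub-allocating from that block.
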
diff --git a/tests/gpu/src/nvidia/NCVTestSourceProvider.hpp b/tests/gpu/src/nvidia/NCVTestSourceProvider.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f4f9a392ecf43960cafafa3c3d8b9c43ddea2730
--- /dev/null
+++ b/tests/gpu/src/nvidia/NCVTestSourceProvider.hpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _ncvtestsourceprovider_hpp_
+#define _ncvtestsourceprovider_hpp_
+
+#include <memory>
+
+#include "NCV.hpp"
+#include <opencv2/highgui/highgui.hpp>
+
+
+template <class T>
+class NCVTestSourceProvider
+{
+public:
+
+    NCVTestSourceProvider(Ncv32u seed, T rangeLow, T rangeHigh, Ncv32u maxWidth, Ncv32u maxHeight)
+        :
+        bInit(false)
+    {
+        ncvAssertPrintReturn(rangeLow < rangeHigh, "NCVTestSourceProvider ctor:: Invalid range", );
+
+        int devId;
+        cudaDeviceProp devProp;
+        ncvAssertPrintReturn(cudaSuccess == cudaGetDevice(&devId), "Error returned from cudaGetDevice", );
+        ncvAssertPrintReturn(cudaSuccess == cudaGetDeviceProperties(&devProp, devId), "Error returned from cudaGetDeviceProperties", );
+
+        //Ncv32u maxWpitch = alignUp(maxWidth * sizeof(T), devProp.textureAlignment);
+
+        allocatorCPU.reset(new NCVMemNativeAllocator(NCVMemoryTypeHostPinned, devProp.textureAlignment));
+        data.reset(new NCVMatrixAlloc<T>(*this->allocatorCPU.get(), maxWidth, maxHeight));
+        ncvAssertPrintReturn(data.get()->isMemAllocated(), "NCVTestSourceProvider ctor:: Matrix not allocated", );
+
+        this->dataWidth = maxWidth;
+        this->dataHeight = maxHeight;
+
+        srand(seed);
+
+        for (Ncv32u i=0; i<maxHeight; i++)
+        {
+            for (Ncv32u j=0; j<data.get()->stride(); j++)
+            {
+                data.get()->ptr()[i * data.get()->stride() + j] =
+                    (T)(((1.0 * rand()) / RAND_MAX) * (rangeHigh - rangeLow) + rangeLow);
+            }
+        }
+
+        this->bInit = true;
+    }
+
+    NCVTestSourceProvider(std::string pgmFilename)
+        :
+        bInit(false)
+    {
+        ncvAssertPrintReturn(sizeof(T) == 1, "NCVTestSourceProvider ctor:: PGM constructor complies only with 8bit types", );
+
+        cv::Mat image = cv::imread(pgmFilename, 0); // force single-channel load to match the CV_8UC1 header below
+        ncvAssertPrintReturn(!image.empty(), "NCVTestSourceProvider ctor:: PGM file error", );
+
+        int devId;
+        cudaDeviceProp devProp;
+        ncvAssertPrintReturn(cudaSuccess == cudaGetDevice(&devId), "Error returned from cudaGetDevice", );
+        ncvAssertPrintReturn(cudaSuccess == cudaGetDeviceProperties(&devProp, devId), "Error returned from cudaGetDeviceProperties", );
+
+        allocatorCPU.reset(new NCVMemNativeAllocator(NCVMemoryTypeHostPinned, devProp.textureAlignment));
+        data.reset(new NCVMatrixAlloc<T>(*this->allocatorCPU.get(), image.cols, image.rows));
+        ncvAssertPrintReturn(data.get()->isMemAllocated(), "NCVTestSourceProvider ctor:: Matrix not allocated", );
+
+        this->dataWidth = image.cols;
+        this->dataHeight = image.rows;
+
+        cv::Mat hdr(image.size(), CV_8UC1, data.get()->ptr(), data.get()->pitch());
+        image.copyTo(hdr);
+        
+        this->bInit = true;
+    }
+
+    NcvBool fill(NCVMatrix<T> &dst)
+    {
+        ncvAssertReturn(this->isInit() &&
+                        dst.memType() == allocatorCPU.get()->memType(), false);
+
+        if (dst.width() == 0 || dst.height() == 0)
+        {
+            return true;
+        }
+
+        for (Ncv32u i=0; i<dst.height(); i++)
+        {
+            Ncv32u srcLine = i % this->dataHeight;
+
+            Ncv32u srcFullChunks = dst.width() / this->dataWidth;
+            for (Ncv32u j=0; j<srcFullChunks; j++)
+            {
+                memcpy(dst.ptr() + i * dst.stride() + j * this->dataWidth,
+                    this->data.get()->ptr() + this->data.get()->stride() * srcLine,
+                    this->dataWidth * sizeof(T));
+            }
+
+            Ncv32u srcLastChunk = dst.width() % this->dataWidth;
+            memcpy(dst.ptr() + i * dst.stride() + srcFullChunks * this->dataWidth,
+                this->data.get()->ptr() + this->data.get()->stride() * srcLine,
+                srcLastChunk * sizeof(T));
+        }
+
+        return true;
+    }
+
+    NcvBool fill(NCVVector<T> &dst)
+    {
+        ncvAssertReturn(this->isInit() &&
+                        dst.memType() == allocatorCPU.get()->memType(), false);
+
+        if (dst.length() == 0)
+        {
+            return true;
+        }
+
+        Ncv32u srcLen = this->dataWidth * this->dataHeight;
+
+        Ncv32u srcFullChunks = (Ncv32u)dst.length() / srcLen;
+        for (Ncv32u j=0; j<srcFullChunks; j++)
+        {
+            memcpy(dst.ptr() + j * srcLen, this->data.get()->ptr(), srcLen * sizeof(T));
+        }
+
+        Ncv32u srcLastChunk = dst.length() % srcLen;
+        memcpy(dst.ptr() + srcFullChunks * srcLen, this->data.get()->ptr(), srcLastChunk * sizeof(T));
+
+        return true;
+    }
+
+    ~NCVTestSourceProvider()
+    {
+        data.reset();
+        allocatorCPU.reset();
+    }
+
+private:
+
+    NcvBool isInit(void)
+    {
+        return this->bInit;
+    }
+
+    NcvBool bInit;
+    std::auto_ptr< INCVMemAllocator > allocatorCPU;
+    std::auto_ptr< NCVMatrixAlloc<T> > data;
+    Ncv32u dataWidth;
+    Ncv32u dataHeight;
+};
+
+#endif // _ncvtestsourceprovider_hpp_
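
fill() tiles the cached matrix across a destination of arbitrary size: row i of the destination reuses source row i % dataHeight, copied in dataWidth-wide chunks plus a remainder, and the vector overload does the same in one dimension. The 1-D version, extracted as a sketch:

    #include <cstddef>
    #include <cstring>

    // Tile src (srcLen bytes, srcLen > 0) across dst (dstLen bytes), as
    // fill(NCVVector<T>&) above does.
    void tile_fill(unsigned char* dst, std::size_t dstLen,
                   const unsigned char* src, std::size_t srcLen)
    {
        std::size_t fullChunks = dstLen / srcLen;
        for (std::size_t j = 0; j < fullChunks; ++j)
            std::memcpy(dst + j * srcLen, src, srcLen);
        std::memcpy(dst + fullChunks * srcLen, src, dstLen % srcLen); // tail
    }
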
diff --git a/tests/gpu/src/nvidia/TestCompact.cpp b/tests/gpu/src/nvidia/TestCompact.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2882f7cad214941c23298a8d28626d8bd704142e
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestCompact.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+
+#include "TestCompact.h"
+
+
+TestCompact::TestCompact(std::string testName, NCVTestSourceProvider<Ncv32u> &src,
+                                             Ncv32u length, Ncv32u badElem, Ncv32u badElemPercentage)
+    :
+    NCVTestProvider(testName),
+    src(src),
+    length(length),
+    badElem(badElem),
+    badElemPercentage(badElemPercentage > 100 ? 100 : badElemPercentage)
+{
+}
+
+
+bool TestCompact::toString(std::ofstream &strOut)
+{
+    strOut << "length=" << length << std::endl;
+    strOut << "badElem=" << badElem << std::endl;
+    strOut << "badElemPercentage=" << badElemPercentage << std::endl;
+    return true;
+}
+
+
+bool TestCompact::init()
+{
+    return true;
+}
+
+
+bool TestCompact::process()
+{
+    NCVStatus ncvStat;
+    bool rcode = false;
+
+    NCVVectorAlloc<Ncv32u> h_vecSrc(*this->allocatorCPU.get(), this->length);
+    ncvAssertReturn(h_vecSrc.isMemAllocated(), false);
+    NCVVectorAlloc<Ncv32u> d_vecSrc(*this->allocatorGPU.get(), this->length);
+    ncvAssertReturn(d_vecSrc.isMemAllocated(), false);
+
+    NCVVectorAlloc<Ncv32u> h_vecDst(*this->allocatorCPU.get(), this->length);
+    ncvAssertReturn(h_vecDst.isMemAllocated(), false);
+    NCVVectorAlloc<Ncv32u> d_vecDst(*this->allocatorGPU.get(), this->length);
+    ncvAssertReturn(d_vecDst.isMemAllocated(), false);
+    NCVVectorAlloc<Ncv32u> h_vecDst_d(*this->allocatorCPU.get(), this->length);
+    ncvAssertReturn(h_vecDst_d.isMemAllocated(), false);
+
+    NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting());
+    NCV_SKIP_COND_BEGIN
+    ncvAssertReturn(this->src.fill(h_vecSrc), false);
+    for (Ncv32u i=0; i<this->length; i++)
+    {
+        Ncv32u tmp = (h_vecSrc.ptr()[i]) & 0xFF;
+        tmp = tmp * 99 / 255;
+        if (tmp < this->badElemPercentage)
+        {
+            h_vecSrc.ptr()[i] = this->badElem;
+        }
+    }
+    NCV_SKIP_COND_END
+
+    NCVVectorAlloc<Ncv32u> h_dstLen(*this->allocatorCPU.get(), 1);
+    ncvAssertReturn(h_dstLen.isMemAllocated(), false);
+    Ncv32u bufSize;
+    ncvStat = nppsStCompactGetSize_32u(this->length, &bufSize, this->devProp);
+    ncvAssertReturn(NPPST_SUCCESS == ncvStat, false);
+    NCVVectorAlloc<Ncv8u> d_tmpBuf(*this->allocatorGPU.get(), bufSize);
+    ncvAssertReturn(d_tmpBuf.isMemAllocated(), false);
+
+    Ncv32u h_outElemNum_h = 0;
+
+    NCV_SKIP_COND_BEGIN
+    ncvStat = h_vecSrc.copySolid(d_vecSrc, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    ncvStat = nppsStCompact_32u(d_vecSrc.ptr(), this->length,
+                                d_vecDst.ptr(), h_dstLen.ptr(), this->badElem,
+                                d_tmpBuf.ptr(), bufSize, this->devProp);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    ncvStat = d_vecDst.copySolid(h_vecDst_d, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    ncvStat = nppsStCompact_32u_host(h_vecSrc.ptr(), this->length, h_vecDst.ptr(), &h_outElemNum_h, this->badElem);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    NCV_SKIP_COND_END
+
+    //bit-to-bit check
+    bool bLoopVirgin = true;
+
+    NCV_SKIP_COND_BEGIN
+    if (h_dstLen.ptr()[0] != h_outElemNum_h)
+    {
+        bLoopVirgin = false;
+    }
+    else
+    {
+        for (Ncv32u i=0; bLoopVirgin && i < h_outElemNum_h; i++)
+        {
+            if (h_vecDst.ptr()[i] != h_vecDst_d.ptr()[i])
+            {
+                bLoopVirgin = false;
+            }
+        }
+    }
+    NCV_SKIP_COND_END
+
+    if (bLoopVirgin)
+    {
+        rcode = true;
+    }
+
+    return rcode;
+}
+
+
+bool TestCompact::deinit()
+{
+    return true;
+}
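
The test drives nppsStCompact_32u and its _host twin over the same input (with roughly badElemPercentage percent of elements forced to badElem) and requires bit-for-bit agreement on both the output and the returned length. The reference semantics being checked are, in effect, a stable filter:

    #include <cstddef>
    #include <vector>

    // Sketch of the compaction semantics under test, not the NPP_staging code:
    // drop every element equal to badElem, preserve the survivors' order.
    std::vector<unsigned int> compact(const std::vector<unsigned int>& src,
                                      unsigned int badElem)
    {
        std::vector<unsigned int> dst;
        for (std::size_t i = 0; i < src.size(); ++i)
            if (src[i] != badElem)
                dst.push_back(src[i]);
        return dst;
    }
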
diff --git a/tests/gpu/src/nvidia/TestCompact.h b/tests/gpu/src/nvidia/TestCompact.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba4f93aa1090443a89652c86ffbd7bbf8146d842
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestCompact.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _testhypothesescompact_h_
+#define _testhypothesescompact_h_
+
+#include "NCVTest.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+
+class TestCompact : public NCVTestProvider
+{
+public:
+
+    TestCompact(std::string testName, NCVTestSourceProvider<Ncv32u> &src,
+                          Ncv32u length, Ncv32u badElem, Ncv32u badElemPercentage);
+
+    virtual bool init();
+    virtual bool process();
+    virtual bool deinit();
+    virtual bool toString(std::ofstream &strOut);
+
+private:
+    TestCompact(const TestCompact&);
+    TestCompact& operator=(const TestCompact&);
+
+    NCVTestSourceProvider<Ncv32u> &src;
+    Ncv32u length;
+    Ncv32u badElem;
+    Ncv32u badElemPercentage;
+};
+
+#endif // _testhypothesescompact_h_
diff --git a/tests/gpu/src/nvidia/TestDrawRects.cpp b/tests/gpu/src/nvidia/TestDrawRects.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b86ac58b159098aeb2aa8375f64a426249580fcf
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestDrawRects.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+
+#include "TestDrawRects.h"
+#include "NCVHaarObjectDetection.hpp"
+
+
+template <class T>
+TestDrawRects<T>::TestDrawRects(std::string testName, NCVTestSourceProvider<T> &src, NCVTestSourceProvider<Ncv32u> &src32u,
+                                Ncv32u width, Ncv32u height, Ncv32u numRects, T color)
+    :
+    NCVTestProvider(testName),
+    src(src),
+    src32u(src32u),
+    width(width),
+    height(height),
+    numRects(numRects),
+    color(color)
+{
+}
+
+
+template <class T>
+bool TestDrawRects<T>::toString(std::ofstream &strOut)
+{
+    strOut << "sizeof(T)=" << sizeof(T) << std::endl;
+    strOut << "width=" << width << std::endl;
+    strOut << "height=" << height << std::endl;
+    strOut << "numRects=" << numRects << std::endl;
+    strOut << "color=" << color << std::endl;
+    return true;
+}
+
+
+template <class T>
+bool TestDrawRects<T>::init()
+{
+    return true;
+}
+
+
+template <class T>
+bool TestDrawRects<T>::process()
+{
+    NCVStatus ncvStat;
+    bool rcode = false;
+
+    NCVMatrixAlloc<T> d_img(*this->allocatorGPU.get(), this->width, this->height);
+    ncvAssertReturn(d_img.isMemAllocated(), false);
+    NCVMatrixAlloc<T> h_img(*this->allocatorCPU.get(), this->width, this->height);
+    ncvAssertReturn(h_img.isMemAllocated(), false);
+    NCVMatrixAlloc<T> h_img_d(*this->allocatorCPU.get(), this->width, this->height);
+    ncvAssertReturn(h_img_d.isMemAllocated(), false);
+
+    NCVVectorAlloc<NcvRect32u> d_rects(*this->allocatorGPU.get(), this->numRects);
+    ncvAssertReturn(d_rects.isMemAllocated(), false);
+    NCVVectorAlloc<NcvRect32u> h_rects(*this->allocatorCPU.get(), this->numRects);
+    ncvAssertReturn(h_rects.isMemAllocated(), false);
+
+    NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting());
+    NCV_SKIP_COND_BEGIN
+    ncvAssertReturn(this->src.fill(h_img), false);
+    ncvStat = h_img.copySolid(d_img, 0);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+    ncvAssertCUDAReturn(cudaStreamSynchronize(0), false);
+
+    //fill vector of rectangles with random rects covering the input
+    NCVVectorReuse<Ncv32u> h_rects_as32u(h_rects.getSegment());
+    ncvAssertReturn(h_rects_as32u.isMemReused(), false);
+    ncvAssertReturn(this->src32u.fill(h_rects_as32u), false);
+    for (Ncv32u i=0; i<this->numRects; i++)
+    {
+        h_rects.ptr()[i].x = (Ncv32u)(((1.0 * h_rects.ptr()[i].x) / RAND_MAX) * (this->width-2));
+        h_rects.ptr()[i].y = (Ncv32u)(((1.0 * h_rects.ptr()[i].y) / RAND_MAX) * (this->height-2));
+        h_rects.ptr()[i].width = (Ncv32u)(((1.0 * h_rects.ptr()[i].width) / RAND_MAX) * (this->width+10 - h_rects.ptr()[i].x));
+        h_rects.ptr()[i].height = (Ncv32u)(((1.0 * h_rects.ptr()[i].height) / RAND_MAX) * (this->height+10 - h_rects.ptr()[i].y));
+    }
+    ncvStat = h_rects.copySolid(d_rects, 0);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+    ncvAssertCUDAReturn(cudaStreamSynchronize(0), false);
+
+    if (sizeof(T) == sizeof(Ncv32u))
+    {
+        ncvStat = ncvDrawRects_32u_device((Ncv32u *)d_img.ptr(), d_img.stride(), this->width, this->height,
+                                          (NcvRect32u *)d_rects.ptr(), this->numRects, this->color, 0);
+    }
+    else if (sizeof(T) == sizeof(Ncv8u))
+    {
+        ncvStat = ncvDrawRects_8u_device((Ncv8u *)d_img.ptr(), d_img.stride(), this->width, this->height,
+                                         (NcvRect32u *)d_rects.ptr(), this->numRects, (Ncv8u)this->color, 0);
+    }
+    else
+    {
+        ncvAssertPrintReturn(false, "Incorrect drawrects test instance", false);
+    }
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+    NCV_SKIP_COND_END
+
+    ncvStat = d_img.copySolid(h_img_d, 0);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+    ncvAssertCUDAReturn(cudaStreamSynchronize(0), false);
+
+    NCV_SKIP_COND_BEGIN
+    if (sizeof(T) == sizeof(Ncv32u))
+    {
+        ncvStat = ncvDrawRects_32u_host((Ncv32u *)h_img.ptr(), h_img.stride(), this->width, this->height,
+                                        (NcvRect32u *)h_rects.ptr(), this->numRects, this->color);
+    }
+    else if (sizeof(T) == sizeof(Ncv8u))
+    {
+        ncvStat = ncvDrawRects_8u_host((Ncv8u *)h_img.ptr(), h_img.stride(), this->width, this->height,
+                                       (NcvRect32u *)h_rects.ptr(), this->numRects, (Ncv8u)this->color);
+    }
+    else
+    {
+        ncvAssertPrintReturn(false, "Incorrect drawrects test instance", false);
+    }
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+    NCV_SKIP_COND_END
+
+    //bit-to-bit check
+    bool bLoopVirgin = true;
+
+    NCV_SKIP_COND_BEGIN
+    for (Ncv32u i=0; bLoopVirgin && i < h_img.height(); i++)
+    {
+        for (Ncv32u j=0; bLoopVirgin && j < h_img.width(); j++)
+        {
+            if (h_img.ptr()[h_img.stride()*i+j] != h_img_d.ptr()[h_img_d.stride()*i+j])
+            {
+                bLoopVirgin = false;
+            }
+        }
+    }
+    NCV_SKIP_COND_END
+
+    if (bLoopVirgin)
+    {
+        rcode = true;
+    }
+
+    return rcode;
+}
+
+
+template <class T>
+bool TestDrawRects<T>::deinit()
+{
+    return true;
+}
+
+
+template class TestDrawRects<Ncv8u>;
+template class TestDrawRects<Ncv32u>;
diff --git a/tests/gpu/src/nvidia/TestDrawRects.h b/tests/gpu/src/nvidia/TestDrawRects.h
new file mode 100644
index 0000000000000000000000000000000000000000..bde80fe6c61e9aa9bc84d95ebe57276a158b24b7
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestDrawRects.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _testdrawrects_h_
+#define _testdrawrects_h_
+
+#include "NCVTest.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+
+template <class T>
+class TestDrawRects : public NCVTestProvider
+{
+public:
+
+    TestDrawRects(std::string testName, NCVTestSourceProvider<T> &src, NCVTestSourceProvider<Ncv32u> &src32u,
+                  Ncv32u width, Ncv32u height, Ncv32u numRects, T color);
+
+    virtual bool init();
+    virtual bool process();
+    virtual bool deinit();
+    virtual bool toString(std::ofstream &strOut);
+
+private:
+
+    TestDrawRects(const TestDrawRects&);
+    TestDrawRects& operator=(const TestDrawRects&);
+
+    NCVTestSourceProvider<T> &src;
+    NCVTestSourceProvider<Ncv32u> &src32u;
+    Ncv32u width;
+    Ncv32u height;
+    Ncv32u numRects;
+    T color;
+};
+
+#endif // _testdrawrects_h_
diff --git a/tests/gpu/src/nvidia/TestHaarCascadeApplication.cpp b/tests/gpu/src/nvidia/TestHaarCascadeApplication.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e3a13fc3200dc144893c815fff2596cdcac85c63
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestHaarCascadeApplication.cpp
@@ -0,0 +1,267 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+
+#include <float.h>
+#include <algorithm>    // std::sort, std::max
+#include "TestHaarCascadeApplication.h"
+#include "NCVHaarObjectDetection.hpp"
+
+
+TestHaarCascadeApplication::TestHaarCascadeApplication(std::string testName, NCVTestSourceProvider<Ncv8u> &src,
+                                                       std::string cascadeName, Ncv32u width, Ncv32u height)
+    :
+    NCVTestProvider(testName),
+    src(src),
+    cascadeName(cascadeName),
+    width(width),
+    height(height)
+{
+}
+
+
+bool TestHaarCascadeApplication::toString(std::ofstream &strOut)
+{
+    strOut << "cascadeName=" << cascadeName << std::endl;
+    strOut << "width=" << width << std::endl;
+    strOut << "height=" << height << std::endl;
+    return true;
+}
+
+
+bool TestHaarCascadeApplication::init()
+{
+    return true;
+}
+
+
+bool TestHaarCascadeApplication::process()
+{
+    NCVStatus ncvStat;
+    bool rcode = false;
+
+    Ncv32u numStages, numNodes, numFeatures;
+
+    ncvStat = ncvHaarGetClassifierSize(this->cascadeName, numStages, numNodes, numFeatures);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+
+    NCVVectorAlloc<HaarStage64> h_HaarStages(*this->allocatorCPU.get(), numStages);
+    ncvAssertReturn(h_HaarStages.isMemAllocated(), false);
+    NCVVectorAlloc<HaarClassifierNode128> h_HaarNodes(*this->allocatorCPU.get(), numNodes);
+    ncvAssertReturn(h_HaarNodes.isMemAllocated(), false);
+    NCVVectorAlloc<HaarFeature64> h_HaarFeatures(*this->allocatorCPU.get(), numFeatures);
+    ncvAssertReturn(h_HaarFeatures.isMemAllocated(), false);
+
+    NCVVectorAlloc<HaarStage64> d_HaarStages(*this->allocatorGPU.get(), numStages);
+    ncvAssertReturn(d_HaarStages.isMemAllocated(), false);
+    NCVVectorAlloc<HaarClassifierNode128> d_HaarNodes(*this->allocatorGPU.get(), numNodes);
+    ncvAssertReturn(d_HaarNodes.isMemAllocated(), false);
+    NCVVectorAlloc<HaarFeature64> d_HaarFeatures(*this->allocatorGPU.get(), numFeatures);
+    ncvAssertReturn(d_HaarFeatures.isMemAllocated(), false);
+
+    HaarClassifierCascadeDescriptor haar;
+    haar.ClassifierSize.width = haar.ClassifierSize.height = 1;
+    haar.bNeedsTiltedII = false;
+    haar.NumClassifierRootNodes = numNodes;
+    haar.NumClassifierTotalNodes = numNodes;
+    haar.NumFeatures = numFeatures;
+    haar.NumStages = numStages;
+
+    NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting());
+    NCV_SKIP_COND_BEGIN
+
+    ncvStat = ncvHaarLoadFromFile_host(this->cascadeName, haar, h_HaarStages, h_HaarNodes, h_HaarFeatures);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+
+    ncvAssertReturn(NCV_SUCCESS == h_HaarStages.copySolid(d_HaarStages, 0), false);
+    ncvAssertReturn(NCV_SUCCESS == h_HaarNodes.copySolid(d_HaarNodes, 0), false);
+    ncvAssertReturn(NCV_SUCCESS == h_HaarFeatures.copySolid(d_HaarFeatures, 0), false);
+    ncvAssertCUDAReturn(cudaStreamSynchronize(0), false);
+
+    NCV_SKIP_COND_END
+
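+    //the integral image is one pixel larger than the source in each dimension,
+    //and the search ROI shrinks by the classifier window size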
+    NcvSize32s srcRoi, srcIIRoi, searchRoi;
+    srcRoi.width = this->width;
+    srcRoi.height = this->height;
+    srcIIRoi.width = srcRoi.width + 1;
+    srcIIRoi.height = srcRoi.height + 1;
+    searchRoi.width = srcIIRoi.width - haar.ClassifierSize.width;
+    searchRoi.height = srcIIRoi.height - haar.ClassifierSize.height;
+    if (searchRoi.width <= 0 || searchRoi.height <= 0)
+    {
+        return false;
+    }
+    NcvSize32u searchRoiU(searchRoi.width, searchRoi.height);
+
+    NCVMatrixAlloc<Ncv8u> d_img(*this->allocatorGPU.get(), this->width, this->height);
+    ncvAssertReturn(d_img.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv8u> h_img(*this->allocatorCPU.get(), this->width, this->height);
+    ncvAssertReturn(h_img.isMemAllocated(), false);
+
+    Ncv32u integralWidth = this->width + 1;
+    Ncv32u integralHeight = this->height + 1;
+
+    NCVMatrixAlloc<Ncv32u> d_integralImage(*this->allocatorGPU.get(), integralWidth, integralHeight);
+    ncvAssertReturn(d_integralImage.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv64u> d_sqIntegralImage(*this->allocatorGPU.get(), integralWidth, integralHeight);
+    ncvAssertReturn(d_sqIntegralImage.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv32u> h_integralImage(*this->allocatorCPU.get(), integralWidth, integralHeight);
+    ncvAssertReturn(h_integralImage.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv64u> h_sqIntegralImage(*this->allocatorCPU.get(), integralWidth, integralHeight);
+    ncvAssertReturn(h_sqIntegralImage.isMemAllocated(), false);
+
+    NCVMatrixAlloc<Ncv32f> d_rectStdDev(*this->allocatorGPU.get(), this->width, this->height);
+    ncvAssertReturn(d_rectStdDev.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv32u> d_pixelMask(*this->allocatorGPU.get(), this->width, this->height);
+    ncvAssertReturn(d_pixelMask.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv32f> h_rectStdDev(*this->allocatorCPU.get(), this->width, this->height);
+    ncvAssertReturn(h_rectStdDev.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv32u> h_pixelMask(*this->allocatorCPU.get(), this->width, this->height);
+    ncvAssertReturn(h_pixelMask.isMemAllocated(), false);
+
+    NCVVectorAlloc<NcvRect32u> d_hypotheses(*this->allocatorGPU.get(), this->width * this->height);
+    ncvAssertReturn(d_hypotheses.isMemAllocated(), false);
+    NCVVectorAlloc<NcvRect32u> h_hypotheses(*this->allocatorCPU.get(), this->width * this->height);
+    ncvAssertReturn(h_hypotheses.isMemAllocated(), false);
+
+    NCVStatus nppStat;
+    Ncv32u szTmpBufIntegral, szTmpBufSqIntegral;
+    nppStat = nppiStIntegralGetSize_8u32u(NcvSize32u(this->width, this->height), &szTmpBufIntegral, this->devProp);
+    ncvAssertReturn(nppStat == NPPST_SUCCESS, false);
+    nppStat = nppiStSqrIntegralGetSize_8u64u(NcvSize32u(this->width, this->height), &szTmpBufSqIntegral, this->devProp);
+    ncvAssertReturn(nppStat == NPPST_SUCCESS, false);
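+    //a single scratch buffer sized for the larger of the two integral
+    //computations is shared by both nppiSt* calls below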
+    NCVVectorAlloc<Ncv8u> d_tmpIIbuf(*this->allocatorGPU.get(), std::max(szTmpBufIntegral, szTmpBufSqIntegral));
+    ncvAssertReturn(d_tmpIIbuf.isMemAllocated(), false);
+
+    Ncv32u detectionsOnThisScale_d = 0;
+    Ncv32u detectionsOnThisScale_h = 0;
+
+    NCV_SKIP_COND_BEGIN
+
+    ncvAssertReturn(this->src.fill(h_img), false);
+    ncvStat = h_img.copySolid(d_img, 0);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+    ncvAssertCUDAReturn(cudaStreamSynchronize(0), false);
+
+    nppStat = nppiStIntegral_8u32u_C1R(d_img.ptr(), d_img.pitch(),
+                                       d_integralImage.ptr(), d_integralImage.pitch(),
+                                       NcvSize32u(d_img.width(), d_img.height()),
+                                       d_tmpIIbuf.ptr(), szTmpBufIntegral, this->devProp);
+    ncvAssertReturn(nppStat == NPPST_SUCCESS, false);
+
+    nppStat = nppiStSqrIntegral_8u64u_C1R(d_img.ptr(), d_img.pitch(),
+                                          d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
+                                          NcvSize32u(d_img.width(), d_img.height()),
+                                          d_tmpIIbuf.ptr(), szTmpBufSqIntegral, this->devProp);
+    ncvAssertReturn(nppStat == NPPST_SUCCESS, false);
+
+    const NcvRect32u rect(
+        HAAR_STDDEV_BORDER,
+        HAAR_STDDEV_BORDER,
+        haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER,
+        haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER);
+    nppStat = nppiStRectStdDev_32f_C1R(
+        d_integralImage.ptr(), d_integralImage.pitch(),
+        d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
+        d_rectStdDev.ptr(), d_rectStdDev.pitch(),
+        NcvSize32u(searchRoi.width, searchRoi.height), rect,
+        1.0f, true);
+    ncvAssertReturn(nppStat == NPPST_SUCCESS, false);
+
+    ncvStat = d_integralImage.copySolid(h_integralImage, 0);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+    ncvStat = d_rectStdDev.copySolid(h_rectStdDev, 0);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+
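+    //seed the pixel mask: positions inside the search ROI carry their packed
+    //(y << 16 | x) coordinates, the rest of the stride is marked invalid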
+    for (Ncv32u i=0; i<searchRoiU.height; i++)
+    {
+        for (Ncv32u j=0; j<h_pixelMask.stride(); j++)
+        {
+            if (j<searchRoiU.width)
+            {
+                h_pixelMask.ptr()[i*h_pixelMask.stride()+j] = (i << 16) | j;
+            }
+            else
+            {
+                h_pixelMask.ptr()[i*h_pixelMask.stride()+j] = OBJDET_MASK_ELEMENT_INVALID_32U;
+            }
+        }
+    }
+    ncvAssertReturn(cudaSuccess == cudaStreamSynchronize(0), false);
+
+    //clamp the x87 FPU to a 24-bit mantissa so the host reference matches the
+    //single-precision device path; _controlfp_s is MSVC-only, hence the guard
+#ifdef _MSC_VER
+    Ncv32u fpu_oldcw, fpu_cw;
+    _controlfp_s(&fpu_cw, 0, 0);
+    fpu_oldcw = fpu_cw;
+    _controlfp_s(&fpu_cw, _PC_24, _MCW_PC);
+#endif
+    ncvStat = ncvApplyHaarClassifierCascade_host(
+        h_integralImage, h_rectStdDev, h_pixelMask,
+        detectionsOnThisScale_h,
+        haar, h_HaarStages, h_HaarNodes, h_HaarFeatures, false,
+        searchRoiU, 1, 1.0f);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+#ifdef _MSC_VER
+    _controlfp_s(&fpu_cw, fpu_oldcw, _MCW_PC);
+#endif
+
+    NCV_SKIP_COND_END
+
+    int devId;
+    ncvAssertCUDAReturn(cudaGetDevice(&devId), false);
+    cudaDeviceProp devProp;
+    ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), false);
+
+    ncvStat = ncvApplyHaarClassifierCascade_device(
+        d_integralImage, d_rectStdDev, d_pixelMask,
+        detectionsOnThisScale_d,
+        haar, h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures, false,
+        searchRoiU, 1, 1.0f,
+        *this->allocatorGPU.get(), *this->allocatorCPU.get(),
+        devProp, 0);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+
+    NCVMatrixAlloc<Ncv32u> h_pixelMask_d(*this->allocatorCPU.get(), this->width, this->height);
+    ncvAssertReturn(h_pixelMask_d.isMemAllocated(), false);
+
+    //bit-to-bit check
+    bool bLoopVirgin = true;
+
+    NCV_SKIP_COND_BEGIN
+
+    ncvStat = d_pixelMask.copySolid(h_pixelMask_d, 0);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+
+    if (detectionsOnThisScale_d != detectionsOnThisScale_h)
+    {
+        bLoopVirgin = false;
+    }
+    else
+    {
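+        //the device compacts surviving positions to the front of the mask in
+        //arbitrary order; sort them before the element-wise comparison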
+        std::sort(h_pixelMask_d.ptr(), h_pixelMask_d.ptr() + detectionsOnThisScale_d);
+        for (Ncv32u i=0; i<detectionsOnThisScale_d && bLoopVirgin; i++)
+        {
+            if (h_pixelMask.ptr()[i] != h_pixelMask_d.ptr()[i])
+            {
+                bLoopVirgin = false;
+            }
+        }
+    }
+
+    NCV_SKIP_COND_END
+
+    if (bLoopVirgin)
+    {
+        rcode = true;
+    }
+
+    return rcode;
+}
+
+
+bool TestHaarCascadeApplication::deinit()
+{
+    return true;
+}
diff --git a/tests/gpu/src/nvidia/TestHaarCascadeApplication.h b/tests/gpu/src/nvidia/TestHaarCascadeApplication.h
new file mode 100644
index 0000000000000000000000000000000000000000..d472544892581e23250607b74da55f08f2f8d2cc
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestHaarCascadeApplication.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _testhaarcascadeapplication_h_
+#define _testhaarcascadeapplication_h_
+
+#include "NCVTest.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+
+class TestHaarCascadeApplication : public NCVTestProvider
+{
+public:
+
+    TestHaarCascadeApplication(std::string testName, NCVTestSourceProvider<Ncv8u> &src,
+                               std::string cascadeName, Ncv32u width, Ncv32u height);
+
+    virtual bool init();
+    virtual bool process();
+    virtual bool deinit();
+    virtual bool toString(std::ofstream &strOut);
+
+private:
+    TestHaarCascadeApplication(const TestHaarCascadeApplication&);
+    TestHaarCascadeApplication& operator=(const TestHaarCascadeApplication&);
+
+    NCVTestSourceProvider<Ncv8u> &src;
+    std::string cascadeName;
+    Ncv32u width;
+    Ncv32u height;
+};
+
+#endif // _testhaarcascadeapplication_h_
diff --git a/tests/gpu/src/nvidia/TestHaarCascadeLoader.cpp b/tests/gpu/src/nvidia/TestHaarCascadeLoader.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8991e69cd8564371d7e0128c43ecb7171b6129b0
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestHaarCascadeLoader.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+
+#include "TestHaarCascadeLoader.h"
+#include "NCVHaarObjectDetection.hpp"
+
+
+TestHaarCascadeLoader::TestHaarCascadeLoader(std::string testName, std::string cascadeName)
+    :
+    NCVTestProvider(testName),
+    cascadeName(cascadeName)
+{
+}
+
+
+bool TestHaarCascadeLoader::toString(std::ofstream &strOut)
+{
+    strOut << "cascadeName=" << cascadeName << std::endl;
+    return true;
+}
+
+
+bool TestHaarCascadeLoader::init()
+{
+    return true;
+}
+
+
+bool TestHaarCascadeLoader::process()
+{
+    NCVStatus ncvStat;
+    bool rcode = false;
+
+    Ncv32u numStages, numNodes, numFeatures;
+    Ncv32u numStages_2 = 0, numNodes_2 = 0, numFeatures_2 = 0;
+
+    ncvStat = ncvHaarGetClassifierSize(this->cascadeName, numStages, numNodes, numFeatures);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+
+    NCVVectorAlloc<HaarStage64> h_HaarStages(*this->allocatorCPU.get(), numStages);
+    ncvAssertReturn(h_HaarStages.isMemAllocated(), false);
+    NCVVectorAlloc<HaarClassifierNode128> h_HaarNodes(*this->allocatorCPU.get(), numNodes);
+    ncvAssertReturn(h_HaarNodes.isMemAllocated(), false);
+    NCVVectorAlloc<HaarFeature64> h_HaarFeatures(*this->allocatorCPU.get(), numFeatures);
+    ncvAssertReturn(h_HaarFeatures.isMemAllocated(), false);
+
+    NCVVectorAlloc<HaarStage64> h_HaarStages_2(*this->allocatorCPU.get(), numStages);
+    ncvAssertReturn(h_HaarStages_2.isMemAllocated(), false);
+    NCVVectorAlloc<HaarClassifierNode128> h_HaarNodes_2(*this->allocatorCPU.get(), numNodes);
+    ncvAssertReturn(h_HaarNodes_2.isMemAllocated(), false);
+    NCVVectorAlloc<HaarFeature64> h_HaarFeatures_2(*this->allocatorCPU.get(), numFeatures);
+    ncvAssertReturn(h_HaarFeatures_2.isMemAllocated(), false);
+
+    HaarClassifierCascadeDescriptor haar;
+    HaarClassifierCascadeDescriptor haar_2;
+
+    NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting());
+    NCV_SKIP_COND_BEGIN
+
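+    //round trip: load the cascade, store it as NVBIN, reload it, and require
+    //both in-memory representations to match bit-for-bit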
+    const std::string testNvbinName = "test.nvbin";
+    ncvStat = ncvHaarLoadFromFile_host(this->cascadeName, haar, h_HaarStages, h_HaarNodes, h_HaarFeatures);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+
+    ncvStat = ncvHaarStoreNVBIN_host(testNvbinName, haar, h_HaarStages, h_HaarNodes, h_HaarFeatures);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+
+    ncvStat = ncvHaarGetClassifierSize(testNvbinName, numStages_2, numNodes_2, numFeatures_2);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+
+    ncvStat = ncvHaarLoadFromFile_host(testNvbinName, haar_2, h_HaarStages_2, h_HaarNodes_2, h_HaarFeatures_2);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+
+    NCV_SKIP_COND_END
+
+    //bit-to-bit check
+    bool bLoopVirgin = true;
+
+    NCV_SKIP_COND_BEGIN
+
+    if (
+    numStages_2 != numStages                                       ||
+    numNodes_2 != numNodes                                         ||
+    numFeatures_2 != numFeatures                                   ||
+    haar.NumStages               != haar_2.NumStages               ||
+    haar.NumClassifierRootNodes  != haar_2.NumClassifierRootNodes  ||
+    haar.NumClassifierTotalNodes != haar_2.NumClassifierTotalNodes ||
+    haar.NumFeatures             != haar_2.NumFeatures             ||
+    haar.ClassifierSize.width    != haar_2.ClassifierSize.width    ||
+    haar.ClassifierSize.height   != haar_2.ClassifierSize.height   ||
+    haar.bNeedsTiltedII          != haar_2.bNeedsTiltedII          ||
+    haar.bHasStumpsOnly          != haar_2.bHasStumpsOnly          )
+    {
+        bLoopVirgin = false;
+    }
+    if (memcmp(h_HaarStages.ptr(), h_HaarStages_2.ptr(), haar.NumStages * sizeof(HaarStage64)) ||
+        memcmp(h_HaarNodes.ptr(), h_HaarNodes_2.ptr(), haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128)) ||
+        memcmp(h_HaarFeatures.ptr(), h_HaarFeatures_2.ptr(), haar.NumFeatures * sizeof(HaarFeature64)) )
+    {
+        bLoopVirgin = false;
+    }
+    NCV_SKIP_COND_END
+
+    if (bLoopVirgin)
+    {
+        rcode = true;
+    }
+
+    return rcode;
+}
+
+
+bool TestHaarCascadeLoader::deinit()
+{
+    return true;
+}
diff --git a/tests/gpu/src/nvidia/TestHaarCascadeLoader.h b/tests/gpu/src/nvidia/TestHaarCascadeLoader.h
new file mode 100644
index 0000000000000000000000000000000000000000..717a38e2e09376f547b37c63d94fbaa90d35ddbc
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestHaarCascadeLoader.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _testhaarcascadeloader_h_
+#define _testhaarcascadeloader_h_
+
+#include "NCVTest.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+
+class TestHaarCascadeLoader : public NCVTestProvider
+{
+public:
+
+    TestHaarCascadeLoader(std::string testName, std::string cascadeName);
+
+    virtual bool init();
+    virtual bool process();
+    virtual bool deinit();
+    virtual bool toString(std::ofstream &strOut);
+
+private:
+
+    std::string cascadeName;
+};
+
+#endif // _testhaarcascadeloader_h_
diff --git a/tests/gpu/src/nvidia/TestHypothesesFilter.cpp b/tests/gpu/src/nvidia/TestHypothesesFilter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c41ca0025fe7ce29f0ec24c97cb38f6c545f7afd
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestHypothesesFilter.cpp
@@ -0,0 +1,176 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+
+#include "TestHypothesesFilter.h"
+#include "NCVHaarObjectDetection.hpp"
+
+
+TestHypothesesFilter::TestHypothesesFilter(std::string testName, NCVTestSourceProvider<Ncv32u> &src,
+                                           Ncv32u numDstRects, Ncv32u minNeighbors, Ncv32f eps)
+    :
+    NCVTestProvider(testName),
+    src(src),
+    numDstRects(numDstRects),
+    minNeighbors(minNeighbors),
+    eps(eps)
+{
+}
+
+
+bool TestHypothesesFilter::toString(std::ofstream &strOut)
+{
+    strOut << "numDstRects=" << numDstRects << std::endl;
+    strOut << "minNeighbors=" << minNeighbors << std::endl;
+    strOut << "eps=" << eps << std::endl;
+    return true;
+}
+
+
+bool TestHypothesesFilter::init()
+{
+    this->canvasWidth = 4096;
+    this->canvasHeight = 4096;
+    return true;
+}
+
+
+bool compareRects(const NcvRect32u &r1, const NcvRect32u &r2, Ncv32f eps)
+{
+    double delta = eps*(std::min(r1.width, r2.width) + std::min(r1.height, r2.height))*0.5;
+    return std::abs((Ncv32s)r1.x - (Ncv32s)r2.x) <= delta &&
+        std::abs((Ncv32s)r1.y - (Ncv32s)r2.y) <= delta &&
+        std::abs((Ncv32s)r1.x + (Ncv32s)r1.width - (Ncv32s)r2.x - (Ncv32s)r2.width) <= delta &&
+        std::abs((Ncv32s)r1.y + (Ncv32s)r1.height - (Ncv32s)r2.y - (Ncv32s)r2.height) <= delta;
+}
+
+
+inline bool operator < (const NcvRect32u &a, const NcvRect32u &b)
+{
+    return a.x < b.x;
+}
+
+
+bool TestHypothesesFilter::process()
+{
+    NCVStatus ncvStat;
+    bool rcode = false;
+
+    NCVVectorAlloc<Ncv32u> h_random32u(*this->allocatorCPU.get(), this->numDstRects * sizeof(NcvRect32u) / sizeof(Ncv32u));
+    ncvAssertReturn(h_random32u.isMemAllocated(), false);
+
+    Ncv32u srcSlotSize = 2 * this->minNeighbors + 1;
+
+    NCVVectorAlloc<NcvRect32u> h_vecSrc(*this->allocatorCPU.get(), this->numDstRects*srcSlotSize);
+    ncvAssertReturn(h_vecSrc.isMemAllocated(), false);
+    NCVVectorAlloc<NcvRect32u> h_vecDst_groundTruth(*this->allocatorCPU.get(), this->numDstRects);
+    ncvAssertReturn(h_vecDst_groundTruth.isMemAllocated(), false);
+
+    NCV_SET_SKIP_COND(this->allocatorCPU.get()->isCounting());
+
+    NCV_SKIP_COND_BEGIN
+    ncvAssertReturn(this->src.fill(h_random32u), false);
+    Ncv32u randCnt = 0;
+    Ncv64f randVal;
+
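+    //ground-truth rects are laid out evenly along the canvas diagonal so they
+    //never overlap; each gets a cluster of jittered strong hypotheses plus
+    //off-canvas weak ones that the filter must discard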
+    for (Ncv32u i=0; i<this->numDstRects; i++)
+    {
+        h_vecDst_groundTruth.ptr()[i].x = i * this->canvasWidth / this->numDstRects + this->canvasWidth / (this->numDstRects * 4);
+        h_vecDst_groundTruth.ptr()[i].y = i * this->canvasHeight / this->numDstRects + this->canvasHeight / (this->numDstRects * 4);
+        h_vecDst_groundTruth.ptr()[i].width = this->canvasWidth / (this->numDstRects * 2);
+        h_vecDst_groundTruth.ptr()[i].height = this->canvasHeight / (this->numDstRects * 2);
+
+        Ncv32u numNeighbors = this->minNeighbors + 1 + (Ncv32u)(((1.0 * h_random32u.ptr()[i]) * (this->minNeighbors + 1)) / 0xFFFFFFFF);
+        numNeighbors = (numNeighbors > srcSlotSize) ? srcSlotSize : numNeighbors;
+
+        //fill in strong hypotheses: the ground-truth rect jittered by at most eps/2 of its size
+        for (Ncv32u j=0; j<numNeighbors; j++)
+        {
+            randVal = (1.0 * h_random32u.ptr()[randCnt++]) / 0xFFFFFFFF; randCnt = randCnt % h_random32u.length();
+            h_vecSrc.ptr()[srcSlotSize * i + j].x = 
+                h_vecDst_groundTruth.ptr()[i].x +
+                (Ncv32s)(h_vecDst_groundTruth.ptr()[i].width * this->eps * (randVal - 0.5));
+            randVal = (1.0 * h_random32u.ptr()[randCnt++]) / 0xFFFFFFFF; randCnt = randCnt % h_random32u.length();
+            h_vecSrc.ptr()[srcSlotSize * i + j].y = 
+                h_vecDst_groundTruth.ptr()[i].y +
+                (Ncv32s)(h_vecDst_groundTruth.ptr()[i].height * this->eps * (randVal - 0.5));
+            h_vecSrc.ptr()[srcSlotSize * i + j].width = h_vecDst_groundTruth.ptr()[i].width;
+            h_vecSrc.ptr()[srcSlotSize * i + j].height = h_vecDst_groundTruth.ptr()[i].height;
+        }
+
+        //generate weak hypotheses (to be removed in processing)
+        for (Ncv32u j=numNeighbors; j<srcSlotSize; j++)
+        {
+            randVal = (1.0 * h_random32u.ptr()[randCnt++]) / 0xFFFFFFFF; randCnt = randCnt % h_random32u.length();
+            h_vecSrc.ptr()[srcSlotSize * i + j].x = 
+                this->canvasWidth + h_vecDst_groundTruth.ptr()[i].x +
+                (Ncv32s)(h_vecDst_groundTruth.ptr()[i].width * this->eps * (randVal - 0.5));
+            randVal = (1.0 * h_random32u.ptr()[randCnt++]) / 0xFFFFFFFF; randCnt = randCnt % h_random32u.length();
+            h_vecSrc.ptr()[srcSlotSize * i + j].y = 
+                this->canvasHeight + h_vecDst_groundTruth.ptr()[i].y +
+                (Ncv32s)(h_vecDst_groundTruth.ptr()[i].height * this->eps * (randVal - 0.5));
+            h_vecSrc.ptr()[srcSlotSize * i + j].width = h_vecDst_groundTruth.ptr()[i].width;
+            h_vecSrc.ptr()[srcSlotSize * i + j].height = h_vecDst_groundTruth.ptr()[i].height;
+        }
+    }
+
+    //shuffle the combined strong and weak hypotheses so filtering cannot depend on slot order
+    for (Ncv32u i=0; i<this->numDstRects*srcSlotSize-1; i++)
+    {
+        Ncv32u randVal = h_random32u.ptr()[randCnt++]; randCnt = randCnt % h_random32u.length();
+        Ncv32u secondSwap = randVal % (this->numDstRects*srcSlotSize-1 - i);
+        NcvRect32u tmp = h_vecSrc.ptr()[i + secondSwap];
+        h_vecSrc.ptr()[i + secondSwap] = h_vecSrc.ptr()[i];
+        h_vecSrc.ptr()[i] = tmp;
+    }
+    NCV_SKIP_COND_END
+
+    Ncv32u numHypothesesSrc = h_vecSrc.length();
+    NCV_SKIP_COND_BEGIN
+    ncvStat = ncvFilterHypotheses_host(h_vecSrc, numHypothesesSrc, this->minNeighbors, this->eps, NULL);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+    NCV_SKIP_COND_END
+
+    //verification
+    bool bLoopVirgin = true;
+
+    NCV_SKIP_COND_BEGIN
+    if (numHypothesesSrc != this->numDstRects)
+    {
+        bLoopVirgin = false;
+    }
+    else
+    {
+        std::vector<NcvRect32u> tmpRects(numHypothesesSrc);
+        memcpy(&tmpRects[0], h_vecSrc.ptr(), numHypothesesSrc * sizeof(NcvRect32u));
+        std::sort(tmpRects.begin(), tmpRects.end());
+        for (Ncv32u i=0; i<numHypothesesSrc && bLoopVirgin; i++)
+        {
+            if (!compareRects(tmpRects[i], h_vecDst_groundTruth.ptr()[i], this->eps))
+            {
+                bLoopVirgin = false;
+            }
+        }
+    }
+    NCV_SKIP_COND_END
+
+    if (bLoopVirgin)
+    {
+        rcode = true;
+    }
+
+    return rcode;
+}
+
+
+bool TestHypothesesFilter::deinit()
+{
+    return true;
+}
diff --git a/tests/gpu/src/nvidia/TestHypothesesFilter.h b/tests/gpu/src/nvidia/TestHypothesesFilter.h
new file mode 100644
index 0000000000000000000000000000000000000000..63894f82d5a8e29acd884b3a7899897a098f8a16
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestHypothesesFilter.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _testhypothesesfilter_h_
+#define _testhypothesesfilter_h_
+
+#include "NCVTest.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+
+class TestHypothesesFilter : public NCVTestProvider
+{
+public:
+
+    TestHypothesesFilter(std::string testName, NCVTestSourceProvider<Ncv32u> &src,
+                         Ncv32u numDstRects, Ncv32u minNeighbors, Ncv32f eps);
+
+    virtual bool init();
+    virtual bool process();
+    virtual bool deinit();
+    virtual bool toString(std::ofstream &strOut);
+
+private:
+
+    TestHypothesesFilter(const TestHypothesesFilter&);
+    TestHypothesesFilter& operator=(const TestHypothesesFilter&);
+
+    NCVTestSourceProvider<Ncv32u> &src;
+    Ncv32u numDstRects;
+    Ncv32u minNeighbors;
+    Ncv32f eps;
+
+    Ncv32u canvasWidth;
+    Ncv32u canvasHeight;
+};
+
+#endif // _testhypothesesfilter_h_
diff --git a/tests/gpu/src/nvidia/TestHypothesesGrow.cpp b/tests/gpu/src/nvidia/TestHypothesesGrow.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3ca076c973bae2ae6e763713758878be0327e54d
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestHypothesesGrow.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+
+#include "TestHypothesesGrow.h"
+#include "NCVHaarObjectDetection.hpp"
+
+
+TestHypothesesGrow::TestHypothesesGrow(std::string testName, NCVTestSourceProvider<Ncv32u> &src,
+                                       Ncv32u rectWidth, Ncv32u rectHeight, Ncv32f rectScale, 
+                                       Ncv32u maxLenSrc, Ncv32u lenSrc, Ncv32u maxLenDst, Ncv32u lenDst)
+    :
+    NCVTestProvider(testName),
+    src(src),
+    rectWidth(rectWidth),
+    rectHeight(rectHeight),
+    rectScale(rectScale),
+    maxLenSrc(maxLenSrc),
+    lenSrc(lenSrc),
+    maxLenDst(maxLenDst),
+    lenDst(lenDst)
+{
+}
+
+
+bool TestHypothesesGrow::toString(std::ofstream &strOut)
+{
+    strOut << "rectWidth=" << rectWidth << std::endl;
+    strOut << "rectHeight=" << rectHeight << std::endl;
+    strOut << "rectScale=" << rectScale << std::endl;
+    strOut << "maxLenSrc=" << maxLenSrc << std::endl;
+    strOut << "lenSrc=" << lenSrc << std::endl;
+    strOut << "maxLenDst=" << maxLenDst << std::endl;
+    strOut << "lenDst=" << lenDst << std::endl;
+    return true;
+}
+
+
+bool TestHypothesesGrow::init()
+{
+    return true;
+}
+
+
+bool TestHypothesesGrow::process()
+{
+    NCVStatus ncvStat;
+    bool rcode = false;
+
+    NCVVectorAlloc<Ncv32u> h_vecSrc(*this->allocatorCPU.get(), this->maxLenSrc);
+    ncvAssertReturn(h_vecSrc.isMemAllocated(), false);
+    NCVVectorAlloc<Ncv32u> d_vecSrc(*this->allocatorGPU.get(), this->maxLenSrc);
+    ncvAssertReturn(d_vecSrc.isMemAllocated(), false);
+
+    NCVVectorAlloc<NcvRect32u> h_vecDst(*this->allocatorCPU.get(), this->maxLenDst);
+    ncvAssertReturn(h_vecDst.isMemAllocated(), false);
+    NCVVectorAlloc<NcvRect32u> d_vecDst(*this->allocatorGPU.get(), this->maxLenDst);
+    ncvAssertReturn(d_vecDst.isMemAllocated(), false);
+    NCVVectorAlloc<NcvRect32u> h_vecDst_d(*this->allocatorCPU.get(), this->maxLenDst);
+    ncvAssertReturn(h_vecDst_d.isMemAllocated(), false);
+
+    NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting());
+
+    NCV_SKIP_COND_BEGIN
+    ncvAssertReturn(this->src.fill(h_vecSrc), false);
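+    //pre-populate the first lenDst output slots with random rects (existing
+    //detections); host and device grow must then append identically after them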
+    memset(h_vecDst.ptr(), 0, h_vecDst.length() * sizeof(NcvRect32u));
+    NCVVectorReuse<Ncv32u> h_vecDst_as32u(h_vecDst.getSegment(), lenDst * sizeof(NcvRect32u) / sizeof(Ncv32u));
+    ncvAssertReturn(h_vecDst_as32u.isMemReused(), false);
+    ncvAssertReturn(this->src.fill(h_vecDst_as32u), false);
+    memcpy(h_vecDst_d.ptr(), h_vecDst.ptr(), h_vecDst.length() * sizeof(NcvRect32u));
+    NCV_SKIP_COND_END
+
+    ncvStat = h_vecSrc.copySolid(d_vecSrc, 0);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+    ncvStat = h_vecDst.copySolid(d_vecDst, 0);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+    ncvAssertCUDAReturn(cudaStreamSynchronize(0), false);
+
+    Ncv32u h_outElemNum_d = 0;
+    Ncv32u h_outElemNum_h = 0;
+    NCV_SKIP_COND_BEGIN
+    h_outElemNum_d = this->lenDst;
+    ncvStat = ncvGrowDetectionsVector_device(d_vecSrc, this->lenSrc,
+                                             d_vecDst, h_outElemNum_d, this->maxLenDst,
+                                             this->rectWidth, this->rectHeight, this->rectScale, 0);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+    ncvStat = d_vecDst.copySolid(h_vecDst_d, 0);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+    ncvAssertCUDAReturn(cudaStreamSynchronize(0), false);
+
+    h_outElemNum_h = this->lenDst;
+    ncvStat = ncvGrowDetectionsVector_host(h_vecSrc, this->lenSrc,
+                                           h_vecDst, h_outElemNum_h, this->maxLenDst,
+                                           this->rectWidth, this->rectHeight, this->rectScale);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+    NCV_SKIP_COND_END
+
+    //bit-to-bit check
+    bool bLoopVirgin = true;
+
+    NCV_SKIP_COND_BEGIN
+    if (h_outElemNum_d != h_outElemNum_h)
+    {
+        bLoopVirgin = false;
+    }
+    else
+    {
+        if (memcmp(h_vecDst.ptr(), h_vecDst_d.ptr(), this->maxLenDst * sizeof(NcvRect32u)))
+        {
+            bLoopVirgin = false;
+        }
+    }
+    NCV_SKIP_COND_END
+
+    if (bLoopVirgin)
+    {
+        rcode = true;
+    }
+
+    return rcode;
+}
+
+
+bool TestHypothesesGrow::deinit()
+{
+    return true;
+}
diff --git a/tests/gpu/src/nvidia/TestHypothesesGrow.h b/tests/gpu/src/nvidia/TestHypothesesGrow.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8358ec7828670029eb4bcb033fb5693b903b5f3
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestHypothesesGrow.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _testhypothesesgrow_h_
+#define _testhypothesesgrow_h_
+
+#include "NCVTest.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+
+class TestHypothesesGrow : public NCVTestProvider
+{
+public:
+
+    TestHypothesesGrow(std::string testName, NCVTestSourceProvider<Ncv32u> &src,
+                       Ncv32u rectWidth, Ncv32u rectHeight, Ncv32f rectScale, 
+                       Ncv32u maxLenSrc, Ncv32u lenSrc, Ncv32u maxLenDst, Ncv32u lenDst);
+
+    virtual bool init();
+    virtual bool process();
+    virtual bool deinit();
+    virtual bool toString(std::ofstream &strOut);
+
+private:
+    TestHypothesesGrow(const TestHypothesesGrow&);
+    TestHypothesesGrow& operator=(const TestHypothesesGrow&);
+
+    NCVTestSourceProvider<Ncv32u> &src;
+    Ncv32u rectWidth;
+    Ncv32u rectHeight;
+    Ncv32f rectScale;
+    Ncv32u maxLenSrc;
+    Ncv32u lenSrc;
+    Ncv32u maxLenDst;
+    Ncv32u lenDst;
+};
+
+#endif // _testhypothesesgrow_h_
diff --git a/tests/gpu/src/nvidia/TestIntegralImage.cpp b/tests/gpu/src/nvidia/TestIntegralImage.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..47de70cb138916c893e050749a640a7aa6546d2e
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestIntegralImage.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+
+#include <math.h>
+#include "TestIntegralImage.h"
+
+
+template <class T_in, class T_out>
+TestIntegralImage<T_in, T_out>::TestIntegralImage(std::string testName, NCVTestSourceProvider<T_in> &src,
+                                                  Ncv32u width, Ncv32u height)
+    :
+    NCVTestProvider(testName),
+    src(src),
+    width(width),
+    height(height)
+{
+}
+
+
+template <class T_in, class T_out>
+bool TestIntegralImage<T_in, T_out>::toString(std::ofstream &strOut)
+{
+    strOut << "sizeof(T_in)=" << sizeof(T_in) << std::endl;
+    strOut << "sizeof(T_out)=" << sizeof(T_out) << std::endl;
+    strOut << "width=" << width << std::endl;
+    strOut << "height=" << height << std::endl;
+    return true;
+}
+
+
+template <class T_in, class T_out>
+bool TestIntegralImage<T_in, T_out>::init()
+{
+    return true;
+}
+
+
+template <class T_in, class T_out>
+bool TestIntegralImage<T_in, T_out>::process()
+{
+    NCVStatus ncvStat;
+    bool rcode = false;
+
+    Ncv32u widthII = this->width + 1;
+    Ncv32u heightII = this->height + 1;
+
+    NCVMatrixAlloc<T_in> d_img(*this->allocatorGPU.get(), this->width, this->height);
+    ncvAssertReturn(d_img.isMemAllocated(), false);
+    NCVMatrixAlloc<T_in> h_img(*this->allocatorCPU.get(), this->width, this->height);
+    ncvAssertReturn(h_img.isMemAllocated(), false);
+    NCVMatrixAlloc<T_out> d_imgII(*this->allocatorGPU.get(), widthII, heightII);
+    ncvAssertReturn(d_imgII.isMemAllocated(), false);
+    NCVMatrixAlloc<T_out> h_imgII(*this->allocatorCPU.get(), widthII, heightII);
+    ncvAssertReturn(h_imgII.isMemAllocated(), false);
+    NCVMatrixAlloc<T_out> h_imgII_d(*this->allocatorCPU.get(), widthII, heightII);
+    ncvAssertReturn(h_imgII_d.isMemAllocated(), false);
+
+    Ncv32u bufSize;
+    if (sizeof(T_in) == sizeof(Ncv8u))
+    {
+        ncvStat = nppiStIntegralGetSize_8u32u(NcvSize32u(this->width, this->height), &bufSize, this->devProp);
+        ncvAssertReturn(NPPST_SUCCESS == ncvStat, false);
+    }
+    else if (sizeof(T_in) == sizeof(Ncv32f))
+    {
+        ncvStat = nppiStIntegralGetSize_32f32f(NcvSize32u(this->width, this->height), &bufSize, this->devProp);
+        ncvAssertReturn(NPPST_SUCCESS == ncvStat, false);
+    }
+    else
+    {
+        ncvAssertPrintReturn(false, "Incorrect integral image test instance", false);
+    }
+
+    NCVVectorAlloc<Ncv8u> d_tmpBuf(*this->allocatorGPU.get(), bufSize);
+    ncvAssertReturn(d_tmpBuf.isMemAllocated(), false);
+
+    NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting());
+    NCV_SKIP_COND_BEGIN
+
+    ncvAssertReturn(this->src.fill(h_img), false);
+
+    ncvStat = h_img.copySolid(d_img, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    if (sizeof(T_in) == sizeof(Ncv8u))
+    {
+        ncvStat = nppiStIntegral_8u32u_C1R((Ncv8u *)d_img.ptr(), d_img.pitch(),
+                                           (Ncv32u *)d_imgII.ptr(), d_imgII.pitch(),
+                                           NcvSize32u(this->width, this->height),
+                                           d_tmpBuf.ptr(), bufSize, this->devProp);
+        ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    }
+    else if (sizeof(T_in) == sizeof(Ncv32f))
+    {
+        ncvStat = nppiStIntegral_32f32f_C1R((Ncv32f *)d_img.ptr(), d_img.pitch(),
+                                            (Ncv32f *)d_imgII.ptr(), d_imgII.pitch(),
+                                            NcvSize32u(this->width, this->height),
+                                            d_tmpBuf.ptr(), bufSize, this->devProp);
+        ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    }
+    else
+    {
+        ncvAssertPrintReturn(false, "Incorrect integral image test instance", false);
+    }
+
+    ncvStat = d_imgII.copySolid(h_imgII_d, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    if (sizeof(T_in) == sizeof(Ncv8u))
+    {
+        ncvStat = nppiStIntegral_8u32u_C1R_host((Ncv8u *)h_img.ptr(), h_img.pitch(),
+                                                (Ncv32u *)h_imgII.ptr(), h_imgII.pitch(),
+                                                NcvSize32u(this->width, this->height));
+        ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    }
+    else if (sizeof(T_in) == sizeof(Ncv32f))
+    {
+        ncvStat = nppiStIntegral_32f32f_C1R_host((Ncv32f *)h_img.ptr(), h_img.pitch(),
+                                                 (Ncv32f *)h_imgII.ptr(), h_imgII.pitch(),
+                                                 NcvSize32u(this->width, this->height));
+        ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    }
+    else
+    {
+        ncvAssertPrintReturn(false, "Incorrect integral image test instance", false);
+    }
+
+    NCV_SKIP_COND_END
+
+    //verification: exact for the integer instantiation, epsilon-based for float
+    bool bLoopVirgin = true;
+
+    NCV_SKIP_COND_BEGIN
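+    //the 8u32u path must match exactly; the 32f path may differ in the last
+    //bits because the device accumulates in a different order, hence the
+    //0.01f absolute tolerance below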
+    for (Ncv32u i=0; bLoopVirgin && i < h_img.height() + 1; i++)
+    {
+        for (Ncv32u j=0; bLoopVirgin && j < h_img.width() + 1; j++)
+        {
+            if (sizeof(T_in) == sizeof(Ncv8u))
+            {
+                if (h_imgII.ptr()[h_imgII.stride()*i+j] != h_imgII_d.ptr()[h_imgII_d.stride()*i+j])
+                {
+                    bLoopVirgin = false;
+                }
+            }
+            else if (sizeof(T_in) == sizeof(Ncv32f))
+            {
+                if (fabsf((float)h_imgII.ptr()[h_imgII.stride()*i+j] - (float)h_imgII_d.ptr()[h_imgII_d.stride()*i+j]) > 0.01f)
+                {
+                    bLoopVirgin = false;
+                }
+            }
+            else
+            {
+                ncvAssertPrintReturn(false, "Incorrect integral image test instance", false);
+            }
+        }
+    }
+    NCV_SKIP_COND_END
+
+    if (bLoopVirgin)
+    {
+        rcode = true;
+    }
+
+    return rcode;
+}
+
+
+template <class T_in, class T_out>
+bool TestIntegralImage<T_in, T_out>::deinit()
+{
+    return true;
+}
+
+
+template class TestIntegralImage<Ncv8u, Ncv32u>;
+template class TestIntegralImage<Ncv32f, Ncv32f>;
diff --git a/tests/gpu/src/nvidia/TestIntegralImage.h b/tests/gpu/src/nvidia/TestIntegralImage.h
new file mode 100644
index 0000000000000000000000000000000000000000..22677766b629996f3053e8002d3f562fd61bbd9a
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestIntegralImage.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _testintegralimage_h_
+#define _testintegralimage_h_
+
+#include "NCVTest.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+
+template <class T_in, class T_out>
+class TestIntegralImage : public NCVTestProvider
+{
+public:
+
+    TestIntegralImage(std::string testName, NCVTestSourceProvider<T_in> &src,
+                      Ncv32u width, Ncv32u height);
+
+    virtual bool init();
+    virtual bool process();
+    virtual bool deinit();
+    virtual bool toString(std::ofstream &strOut);
+
+private:
+    TestIntegralImage(const TestIntegralImage&);
+    TestIntegralImage& operator=(const TestIntegralImage&);
+
+    NCVTestSourceProvider<T_in> &src;
+    Ncv32u width;
+    Ncv32u height;
+};
+
+#endif // _testintegralimage_h_
diff --git a/tests/gpu/src/nvidia/TestIntegralImageSquared.cpp b/tests/gpu/src/nvidia/TestIntegralImageSquared.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ec245934fd161ec40a5fbe377023b60ab17e51c5
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestIntegralImageSquared.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+
+#include "TestIntegralImageSquared.h"
+
+
+TestIntegralImageSquared::TestIntegralImageSquared(std::string testName, NCVTestSourceProvider<Ncv8u> &src,
+                                                   Ncv32u width, Ncv32u height)
+    :
+    NCVTestProvider(testName),
+    src(src),
+    width(width),
+    height(height)
+{
+}
+
+
+bool TestIntegralImageSquared::toString(std::ofstream &strOut)
+{
+    strOut << "width=" << width << std::endl;
+    strOut << "height=" << height << std::endl;
+    return true;
+}
+
+
+bool TestIntegralImageSquared::init()
+{
+    return true;
+}
+
+
+bool TestIntegralImageSquared::process()
+{
+    NCVStatus ncvStat;
+    bool rcode = false;
+
+    Ncv32u widthSII = this->width + 1;
+    Ncv32u heightSII = this->height + 1;
+
+    NCVMatrixAlloc<Ncv8u> d_img(*this->allocatorGPU.get(), this->width, this->height);
+    ncvAssertReturn(d_img.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv8u> h_img(*this->allocatorCPU.get(), this->width, this->height);
+    ncvAssertReturn(h_img.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv64u> d_imgSII(*this->allocatorGPU.get(), widthSII, heightSII);
+    ncvAssertReturn(d_imgSII.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv64u> h_imgSII(*this->allocatorCPU.get(), widthSII, heightSII);
+    ncvAssertReturn(h_imgSII.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv64u> h_imgSII_d(*this->allocatorCPU.get(), widthSII, heightSII);
+    ncvAssertReturn(h_imgSII_d.isMemAllocated(), false);
+
+    Ncv32u bufSize;
+    ncvStat = nppiStSqrIntegralGetSize_8u64u(NcvSize32u(this->width, this->height), &bufSize, this->devProp);
+    ncvAssertReturn(NPPST_SUCCESS == ncvStat, false);
+    NCVVectorAlloc<Ncv8u> d_tmpBuf(*this->allocatorGPU.get(), bufSize);
+    ncvAssertReturn(d_tmpBuf.isMemAllocated(), false);
+
+    NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting());
+    NCV_SKIP_COND_BEGIN
+
+    ncvAssertReturn(this->src.fill(h_img), false);
+
+    ncvStat = h_img.copySolid(d_img, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    ncvStat = nppiStSqrIntegral_8u64u_C1R(d_img.ptr(), d_img.pitch(),
+                                          d_imgSII.ptr(), d_imgSII.pitch(),
+                                          NcvSize32u(this->width, this->height),
+                                          d_tmpBuf.ptr(), bufSize, this->devProp);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    ncvStat = d_imgSII.copySolid(h_imgSII_d, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    ncvStat = nppiStSqrIntegral_8u64u_C1R_host(h_img.ptr(), h_img.pitch(),
+                                               h_imgSII.ptr(), h_imgSII.pitch(),
+                                               NcvSize32u(this->width, this->height));
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    NCV_SKIP_COND_END
+
+    //bit-to-bit check
+    bool bLoopVirgin = true;
+
+    NCV_SKIP_COND_BEGIN
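+    //the squared integral image accumulates into 64-bit integers, so host and
+    //device results must match exactly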
+    for (Ncv32u i=0; bLoopVirgin && i < h_img.height() + 1; i++)
+    {
+        for (Ncv32u j=0; bLoopVirgin && j < h_img.width() + 1; j++)
+        {
+            if (h_imgSII.ptr()[h_imgSII.stride()*i+j] != h_imgSII_d.ptr()[h_imgSII_d.stride()*i+j])
+            {
+                bLoopVirgin = false;
+            }
+        }
+    }
+    NCV_SKIP_COND_END
+
+    if (bLoopVirgin)
+    {
+        rcode = true;
+    }
+
+    return rcode;
+}
+
+
+bool TestIntegralImageSquared::deinit()
+{
+    return true;
+}
diff --git a/tests/gpu/src/nvidia/TestIntegralImageSquared.h b/tests/gpu/src/nvidia/TestIntegralImageSquared.h
new file mode 100644
index 0000000000000000000000000000000000000000..b1aaf28af50f9a855bbe5dc710cb3349f72be29f
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestIntegralImageSquared.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _testintegralimagesquared_h_
+#define _testintegralimagesquared_h_
+
+#include "NCVTest.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+
+class TestIntegralImageSquared : public NCVTestProvider
+{
+public:
+
+    TestIntegralImageSquared(std::string testName, NCVTestSourceProvider<Ncv8u> &src,
+                             Ncv32u width, Ncv32u height);
+
+    virtual bool init();
+    virtual bool process();
+    virtual bool deinit();
+    virtual bool toString(std::ofstream &strOut);
+
+private:
+    TestIntegralImageSquared(const TestIntegralImageSquared&);
+    TestIntegralImageSquared& operator=(const TestIntegralImageSquared&);
+
+    NCVTestSourceProvider<Ncv8u> &src;
+    Ncv32u width;
+    Ncv32u height;
+};
+
+#endif // _testintegralimagesquared_h_
diff --git a/tests/gpu/src/nvidia/TestRectStdDev.cpp b/tests/gpu/src/nvidia/TestRectStdDev.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..40a1ad7dac3cde5901a21a2856f0e04bbe976b68
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestRectStdDev.cpp
@@ -0,0 +1,180 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+
+#include <math.h>
+
+#include "TestRectStdDev.h"
+
+
+TestRectStdDev::TestRectStdDev(std::string testName, NCVTestSourceProvider<Ncv8u> &src,
+                               Ncv32u width, Ncv32u height, NcvRect32u rect, Ncv32f scaleFactor,
+                               NcvBool bTextureCache)
+    :
+    NCVTestProvider(testName),
+    src(src),
+    width(width),
+    height(height),
+    rect(rect),
+    scaleFactor(scaleFactor),
+    bTextureCache(bTextureCache)
+{
+}
+
+
+bool TestRectStdDev::toString(std::ofstream &strOut)
+{
+    strOut << "width=" << width << std::endl;
+    strOut << "height=" << height << std::endl;
+    strOut << "rect=[" << rect.x << ", " << rect.y << ", " << rect.width << ", " << rect.height << "]\n";
+    strOut << "scaleFactor=" << scaleFactor << std::endl;
+    strOut << "bTextureCache=" << bTextureCache << std::endl;
+    return true;
+}
+
+
+bool TestRectStdDev::init()
+{
+    return true;
+}
+
+
+bool TestRectStdDev::process()
+{
+    NCVStatus ncvStat;
+    bool rcode = false;
+
+    Ncv32s _normWidth = (Ncv32s)this->width - this->rect.x - this->rect.width + 1;
+    Ncv32s _normHeight = (Ncv32s)this->height - this->rect.y - this->rect.height + 1;
+    if (_normWidth <= 0 || _normHeight <= 0)
+    {
+        return true;
+    }
+    Ncv32u normWidth = (Ncv32u)_normWidth;
+    Ncv32u normHeight = (Ncv32u)_normHeight;
+    NcvSize32u szNormRoi(normWidth, normHeight);
+
+    Ncv32u widthII = this->width + 1;
+    Ncv32u heightII = this->height + 1;
+    Ncv32u widthSII = this->width + 1;
+    Ncv32u heightSII = this->height + 1;
+
+    NCVMatrixAlloc<Ncv8u> d_img(*this->allocatorGPU.get(), this->width, this->height);
+    ncvAssertReturn(d_img.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv8u> h_img(*this->allocatorCPU.get(), this->width, this->height);
+    ncvAssertReturn(h_img.isMemAllocated(), false);
+
+    NCVMatrixAlloc<Ncv32u> d_imgII(*this->allocatorGPU.get(), widthII, heightII);
+    ncvAssertReturn(d_imgII.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv32u> h_imgII(*this->allocatorCPU.get(), widthII, heightII);
+    ncvAssertReturn(h_imgII.isMemAllocated(), false);
+
+    NCVMatrixAlloc<Ncv64u> d_imgSII(*this->allocatorGPU.get(), widthSII, heightSII);
+    ncvAssertReturn(d_imgSII.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv64u> h_imgSII(*this->allocatorCPU.get(), widthSII, heightSII);
+    ncvAssertReturn(h_imgSII.isMemAllocated(), false);
+
+    NCVMatrixAlloc<Ncv32f> d_norm(*this->allocatorGPU.get(), normWidth, normHeight);
+    ncvAssertReturn(d_norm.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv32f> h_norm(*this->allocatorCPU.get(), normWidth, normHeight);
+    ncvAssertReturn(h_norm.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv32f> h_norm_d(*this->allocatorCPU.get(), normWidth, normHeight);
+    ncvAssertReturn(h_norm_d.isMemAllocated(), false);
+
+    Ncv32u bufSizeII, bufSizeSII;
+    ncvStat = nppiStIntegralGetSize_8u32u(NcvSize32u(this->width, this->height), &bufSizeII, this->devProp);
+    ncvAssertReturn(NPPST_SUCCESS == ncvStat, false);
+    ncvStat = nppiStSqrIntegralGetSize_8u64u(NcvSize32u(this->width, this->height), &bufSizeSII, this->devProp);
+    ncvAssertReturn(NPPST_SUCCESS == ncvStat, false);
+    Ncv32u bufSize = bufSizeII > bufSizeSII ? bufSizeII : bufSizeSII;
+    NCVVectorAlloc<Ncv8u> d_tmpBuf(*this->allocatorGPU.get(), bufSize);
+    ncvAssertReturn(d_tmpBuf.isMemAllocated(), false);
+
+    NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting());
+    NCV_SKIP_COND_BEGIN
+    ncvAssertReturn(this->src.fill(h_img), false);
+
+    ncvStat = h_img.copySolid(d_img, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    ncvStat = nppiStIntegral_8u32u_C1R(d_img.ptr(), d_img.pitch(),
+                                       d_imgII.ptr(), d_imgII.pitch(),
+                                       NcvSize32u(this->width, this->height),
+                                       d_tmpBuf.ptr(), bufSize, this->devProp);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    ncvStat = nppiStSqrIntegral_8u64u_C1R(d_img.ptr(), d_img.pitch(),
+                                          d_imgSII.ptr(), d_imgSII.pitch(),
+                                          NcvSize32u(this->width, this->height),
+                                          d_tmpBuf.ptr(), bufSize, this->devProp);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
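+    // Device-side RectStdDev over both integral images; bTextureCache toggles
+    // use of the texture cache (per its name).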
+    ncvStat = nppiStRectStdDev_32f_C1R(d_imgII.ptr(), d_imgII.pitch(),
+                                       d_imgSII.ptr(), d_imgSII.pitch(),
+                                       d_norm.ptr(), d_norm.pitch(),
+                                       szNormRoi, this->rect,
+                                       this->scaleFactor,
+                                       this->bTextureCache);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    ncvStat = d_norm.copySolid(h_norm_d, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
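+    // Host reference pass using the _host counterparts of the same primitives.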
+    ncvStat = nppiStIntegral_8u32u_C1R_host(h_img.ptr(), h_img.pitch(),
+                                          h_imgII.ptr(), h_imgII.pitch(),
+                                          NcvSize32u(this->width, this->height));
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    ncvStat = nppiStSqrIntegral_8u64u_C1R_host(h_img.ptr(), h_img.pitch(),
+                                             h_imgSII.ptr(), h_imgSII.pitch(),
+                                             NcvSize32u(this->width, this->height));
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    ncvStat = nppiStRectStdDev_32f_C1R_host(h_imgII.ptr(), h_imgII.pitch(),
+                                          h_imgSII.ptr(), h_imgSII.pitch(),
+                                          h_norm.ptr(), h_norm.pitch(),
+                                          szNormRoi, this->rect,
+                                          this->scaleFactor);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    NCV_SKIP_COND_END
+
+    //relative error check (0.5% tolerance)
+    bool bLoopVirgin = true;
+
+    NCV_SKIP_COND_BEGIN
+    const Ncv64f relEPS = 0.005;
+    for (Ncv32u i=0; bLoopVirgin && i < h_norm.height(); i++)
+    {
+        for (Ncv32u j=0; bLoopVirgin && j < h_norm.width(); j++)
+        {
+            Ncv64f absErr = fabs(h_norm.ptr()[h_norm.stride()*i+j] - h_norm_d.ptr()[h_norm_d.stride()*i+j]);
+            Ncv64f relErr = absErr / h_norm.ptr()[h_norm.stride()*i+j];
+
+            if (relErr > relEPS)
+            {
+                bLoopVirgin = false;
+            }
+        }
+    }
+    NCV_SKIP_COND_END
+
+    if (bLoopVirgin)
+    {
+        rcode = true;
+    }
+
+    return rcode;
+}
+
+
+bool TestRectStdDev::deinit()
+{
+    return true;
+}
diff --git a/tests/gpu/src/nvidia/TestRectStdDev.h b/tests/gpu/src/nvidia/TestRectStdDev.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c0473e29640e2d34912ac3969647d0deb4262f1
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestRectStdDev.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _testrectstddev_h_
+#define _testrectstddev_h_
+
+#include "NCVTest.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+
+class TestRectStdDev : public NCVTestProvider
+{
+public:
+
+    TestRectStdDev(std::string testName, NCVTestSourceProvider<Ncv8u> &src,
+                   Ncv32u width, Ncv32u height, NcvRect32u rect, Ncv32f scaleFactor,
+                   NcvBool bTextureCache);
+
+    virtual bool init();
+    virtual bool process();
+    virtual bool deinit();
+    virtual bool toString(std::ofstream &strOut);
+
+private:
+    TestRectStdDev(const TestRectStdDev&);
+    TestRectStdDev& operator=(const TestRectStdDev&);
+
+    NCVTestSourceProvider<Ncv8u> &src;
+    NcvRect32u rect;
+    Ncv32u width;
+    Ncv32u height;
+    Ncv32f scaleFactor;
+
+    NcvBool bTextureCache;
+};
+
+#endif // _testrectstddev_h_
diff --git a/tests/gpu/src/nvidia/TestResize.cpp b/tests/gpu/src/nvidia/TestResize.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..02108997b3b3e17cac9b8aa71bf46222427d0c1a
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestResize.cpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+
+#include <math.h>
+
+#include "TestResize.h"
+
+
+template <class T>
+TestResize<T>::TestResize(std::string testName, NCVTestSourceProvider<T> &src,
+                          Ncv32u width, Ncv32u height, Ncv32u scaleFactor, NcvBool bTextureCache)
+    :
+    NCVTestProvider(testName),
+    src(src),
+    width(width),
+    height(height),
+    scaleFactor(scaleFactor),
+    bTextureCache(bTextureCache)
+{
+}
+
+
+template <class T>
+bool TestResize<T>::toString(std::ofstream &strOut)
+{
+    strOut << "sizeof(T)=" << sizeof(T) << std::endl;
+    strOut << "width=" << width << std::endl;
+    strOut << "scaleFactor=" << scaleFactor << std::endl;
+    strOut << "bTextureCache=" << bTextureCache << std::endl;
+    return true;
+}
+
+
+template <class T>
+bool TestResize<T>::init()
+{
+    return true;
+}
+
+
+template <class T>
+bool TestResize<T>::process()
+{
+    NCVStatus ncvStat;
+    bool rcode = false;
+
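+    // Integer downsample; a scale factor exceeding a dimension yields zero
+    // size, which is skipped as trivially passing.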
+    Ncv32s smallWidth = this->width / this->scaleFactor;
+    Ncv32s smallHeight = this->height / this->scaleFactor;
+    if (smallWidth == 0 || smallHeight == 0)
+    {
+        return true;
+    }
+
+    NcvSize32u srcSize(this->width, this->height);
+
+    NCVMatrixAlloc<T> d_img(*this->allocatorGPU.get(), this->width, this->height);
+    ncvAssertReturn(d_img.isMemAllocated(), false);
+    NCVMatrixAlloc<T> h_img(*this->allocatorCPU.get(), this->width, this->height);
+    ncvAssertReturn(h_img.isMemAllocated(), false);
+
+    NCVMatrixAlloc<T> d_small(*this->allocatorGPU.get(), smallWidth, smallHeight);
+    ncvAssertReturn(d_small.isMemAllocated(), false);
+    NCVMatrixAlloc<T> h_small(*this->allocatorCPU.get(), smallWidth, smallHeight);
+    ncvAssertReturn(h_small.isMemAllocated(), false);
+    NCVMatrixAlloc<T> h_small_d(*this->allocatorCPU.get(), smallWidth, smallHeight);
+    ncvAssertReturn(h_small_d.isMemAllocated(), false);
+
+    NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting());
+    NCV_SKIP_COND_BEGIN
+    ncvAssertReturn(this->src.fill(h_img), false);
+    NCV_SKIP_COND_END
+
+    ncvStat = h_img.copySolid(d_img, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    NCV_SKIP_COND_BEGIN
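+    // Element-size dispatch: T is only instantiated with Ncv32u and Ncv64u
+    // (see the explicit instantiations at the end of this file).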
+    if (sizeof(T) == sizeof(Ncv32u))
+    {
+        ncvStat = nppiStDownsampleNearest_32u_C1R((Ncv32u *)d_img.ptr(), d_img.pitch(),
+                                                  (Ncv32u *)d_small.ptr(), d_small.pitch(),
+                                                  srcSize, this->scaleFactor,
+                                                  this->bTextureCache);
+    }
+    else if (sizeof(T) == sizeof(Ncv64u))
+    {
+        ncvStat = nppiStDownsampleNearest_64u_C1R((Ncv64u *)d_img.ptr(), d_img.pitch(),
+                                                  (Ncv64u *)d_small.ptr(), d_small.pitch(),
+                                                  srcSize, this->scaleFactor,
+                                                  this->bTextureCache);
+    }
+    else
+    {
+        ncvAssertPrintReturn(false, "Incorrect downsample test instance", false);
+    }
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    NCV_SKIP_COND_END
+    ncvStat = d_small.copySolid(h_small_d, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    NCV_SKIP_COND_BEGIN
+    if (sizeof(T) == sizeof(Ncv32u))
+    {
+        ncvStat = nppiStDownsampleNearest_32u_C1R_host((Ncv32u *)h_img.ptr(), h_img.pitch(),
+                                                       (Ncv32u *)h_small.ptr(), h_small.pitch(),
+                                                       srcSize, this->scaleFactor);
+    }
+    else if (sizeof(T) == sizeof(Ncv64u))
+    {
+        ncvStat = nppiStDownsampleNearest_64u_C1R_host((Ncv64u *)h_img.ptr(), h_img.pitch(),
+                                                       (Ncv64u *)h_small.ptr(), h_small.pitch(),
+                                                       srcSize, this->scaleFactor);
+    }
+    else
+    {
+        ncvAssertPrintReturn(false, "Incorrect downsample test instance", false);
+    }
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    NCV_SKIP_COND_END
+
+    //bit-to-bit check
+    bool bLoopVirgin = true;
+
+    NCV_SKIP_COND_BEGIN
+    for (Ncv32u i=0; bLoopVirgin && i < h_small.height(); i++)
+    {
+        for (Ncv32u j=0; bLoopVirgin && j < h_small.width(); j++)
+        {
+            if (h_small.ptr()[h_small.stride()*i+j] != h_small_d.ptr()[h_small_d.stride()*i+j])
+            {
+                bLoopVirgin = false;
+            }
+        }
+    }
+    NCV_SKIP_COND_END
+
+    if (bLoopVirgin)
+    {
+        rcode = true;
+    }
+
+    return rcode;
+}
+
+
+template <class T>
+bool TestResize<T>::deinit()
+{
+    return true;
+}
+
+
+template class TestResize<Ncv32u>;
+template class TestResize<Ncv64u>;
diff --git a/tests/gpu/src/nvidia/TestResize.h b/tests/gpu/src/nvidia/TestResize.h
new file mode 100644
index 0000000000000000000000000000000000000000..1bd57a8007cb36fb22d4ba434f2dc341170fa0d9
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestResize.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _testresize_h_
+#define _testresize_h_
+
+#include "NCVTest.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+template <class T>
+class TestResize : public NCVTestProvider
+{
+public:
+
+    TestResize(std::string testName, NCVTestSourceProvider<T> &src,
+               Ncv32u width, Ncv32u height, Ncv32u scaleFactor, NcvBool bTextureCache);
+
+    virtual bool init();
+    virtual bool process();
+    virtual bool deinit();
+    virtual bool toString(std::ofstream &strOut);
+
+private:
+    TestResize(const TestResize&);
+    TestResize& operator=(const TestResize&);
+
+    NCVTestSourceProvider<T> &src;
+    Ncv32u width;
+    Ncv32u height;
+    Ncv32u scaleFactor;
+
+    NcvBool bTextureCache;
+};
+
+#endif // _testresize_h_
diff --git a/tests/gpu/src/nvidia/TestTranspose.cpp b/tests/gpu/src/nvidia/TestTranspose.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..aa131f867bb2932853621e2eac089faa4269bde3
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestTranspose.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+
+#include <math.h>
+
+#include "TestTranspose.h"
+
+
+template <class T>
+TestTranspose<T>::TestTranspose(std::string testName, NCVTestSourceProvider<T> &src,
+                                Ncv32u width, Ncv32u height)
+    :
+    NCVTestProvider(testName),
+    src(src),
+    width(width),
+    height(height)
+{
+}
+
+
+template <class T>
+bool TestTranspose<T>::toString(std::ofstream &strOut)
+{
+    strOut << "sizeof(T)=" << sizeof(T) << std::endl;
+    strOut << "width=" << width << std::endl;
+    return true;
+}
+
+
+template <class T>
+bool TestTranspose<T>::init()
+{
+    return true;
+}
+
+
+template <class T>
+bool TestTranspose<T>::process()
+{
+    NCVStatus ncvStat;
+    bool rcode = false;
+
+    NcvSize32u srcSize(this->width, this->height);
+
+    NCVMatrixAlloc<T> d_img(*this->allocatorGPU.get(), this->width, this->height);
+    ncvAssertReturn(d_img.isMemAllocated(), false);
+    NCVMatrixAlloc<T> h_img(*this->allocatorCPU.get(), this->width, this->height);
+    ncvAssertReturn(h_img.isMemAllocated(), false);
+
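+    // Destination matrices swap the source dimensions (height x width).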
+    NCVMatrixAlloc<T> d_dst(*this->allocatorGPU.get(), this->height, this->width);
+    ncvAssertReturn(d_dst.isMemAllocated(), false);
+    NCVMatrixAlloc<T> h_dst(*this->allocatorCPU.get(), this->height, this->width);
+    ncvAssertReturn(h_dst.isMemAllocated(), false);
+    NCVMatrixAlloc<T> h_dst_d(*this->allocatorCPU.get(), this->height, this->width);
+    ncvAssertReturn(h_dst_d.isMemAllocated(), false);
+
+    NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting());
+    NCV_SKIP_COND_BEGIN
+    ncvAssertReturn(this->src.fill(h_img), false);
+    NCV_SKIP_COND_END
+
+    ncvStat = h_img.copySolid(d_img, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    NCV_SKIP_COND_BEGIN
+    if (sizeof(T) == sizeof(Ncv32u))
+    {
+        ncvStat = nppiStTranspose_32u_C1R((Ncv32u *)d_img.ptr(), d_img.pitch(),
+                                          (Ncv32u *)d_dst.ptr(), d_dst.pitch(),
+                                          NcvSize32u(this->width, this->height));
+    }
+    else if (sizeof(T) == sizeof(Ncv64u))
+    {
+        ncvStat = nppiStTranspose_64u_C1R((Ncv64u *)d_img.ptr(), d_img.pitch(),
+                                        (Ncv64u *)d_dst.ptr(), d_dst.pitch(),
+                                        NcvSize32u(this->width, this->height));
+    }
+    else
+    {
+        ncvAssertPrintReturn(false, "Incorrect transpose test instance", false);
+    }
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    NCV_SKIP_COND_END
+    ncvStat = d_dst.copySolid(h_dst_d, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    NCV_SKIP_COND_BEGIN
+    if (sizeof(T) == sizeof(Ncv32u))
+    {
+        ncvStat = nppiStTranspose_32u_C1R_host((Ncv32u *)h_img.ptr(), h_img.pitch(),
+                                               (Ncv32u *)h_dst.ptr(), h_dst.pitch(),
+                                               NcvSize32u(this->width, this->height));
+    }
+    else if (sizeof(T) == sizeof(Ncv64u))
+    {
+        ncvStat = nppiStTranspose_64u_C1R_host((Ncv64u *)h_img.ptr(), h_img.pitch(),
+                                               (Ncv64u *)h_dst.ptr(), h_dst.pitch(),
+                                               NcvSize32u(this->width, this->height));
+    }
+    else
+    {
+        ncvAssertPrintReturn(false, "Incorrect downsample test instance", false);
+    }
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    NCV_SKIP_COND_END
+
+    //bit-to-bit check
+    bool bLoopVirgin = true;
+
+    NCV_SKIP_COND_BEGIN
+    // h_dst is the transposed result: this->width rows by this->height columns.
+    for (Ncv32u i=0; bLoopVirgin && i < this->width; i++)
+    {
+        for (Ncv32u j=0; bLoopVirgin && j < this->height; j++)
+        {
+            if (h_dst.ptr()[h_dst.stride()*i+j] != h_dst_d.ptr()[h_dst_d.stride()*i+j])
+            {
+                bLoopVirgin = false;
+            }
+        }
+    }
+    NCV_SKIP_COND_END
+
+    if (bLoopVirgin)
+    {
+        rcode = true;
+    }
+
+    return rcode;
+}
+
+
+template <class T>
+bool TestTranspose<T>::deinit()
+{
+    return true;
+}
+
+
+template class TestTranspose<Ncv32u>;
+template class TestTranspose<Ncv64u>;
diff --git a/tests/gpu/src/nvidia/TestTranspose.h b/tests/gpu/src/nvidia/TestTranspose.h
new file mode 100644
index 0000000000000000000000000000000000000000..d865c3c8f26fb591db014b342a17765efc3c16ce
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestTranspose.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _testtranspose_h_
+#define _testtranspose_h_
+
+#include "NCVTest.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+
+template <class T>
+class TestTranspose : public NCVTestProvider
+{
+public:
+
+    TestTranspose(std::string testName, NCVTestSourceProvider<T> &src,
+                  Ncv32u width, Ncv32u height);
+
+    virtual bool init();
+    virtual bool process();
+    virtual bool deinit();
+    virtual bool toString(std::ofstream &strOut);
+
+private:
+
+    TestTranspose(const TestTranspose&);
+    TestTranspose& operator=(const TestTranspose&);
+
+    NCVTestSourceProvider<T> &src;
+    Ncv32u width;
+    Ncv32u height;
+};
+
+#endif // _testtranspose_h_
diff --git a/tests/gpu/src/nvidia/main_nvidia.cpp b/tests/gpu/src/nvidia/main_nvidia.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e4f2119c30b2489ae04260fce73a38bc8e09d3f4
--- /dev/null
+++ b/tests/gpu/src/nvidia/main_nvidia.cpp
@@ -0,0 +1,346 @@
+#if defined(_MSC_VER)
+#pragma warning (disable : 4408 4201 4100)
+#endif
+
+#include <cstdio>
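+
+// sprintf_s is MSVC-specific; on other compilers map it to snprintf
+// (assumes variadic-macro support in the toolchain).
+#if !defined(_MSC_VER)
+#define sprintf_s(dst, size, ...) snprintf(dst, size, __VA_ARGS__)
+#endif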
+
+#include "NCV.hpp"
+#include "NCVHaarObjectDetection.hpp"
+
+#include "TestIntegralImage.h"
+#include "TestIntegralImageSquared.h"
+#include "TestRectStdDev.h"
+#include "TestResize.h"
+#include "TestCompact.h"
+#include "TestTranspose.h"
+
+#include "TestDrawRects.h"
+#include "TestHypothesesGrow.h"
+#include "TestHypothesesFilter.h"
+#include "TestHaarCascadeLoader.h"
+#include "TestHaarCascadeApplication.h"
+
+#include "NCVAutoTestLister.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+
+template <class T_in, class T_out>
+void generateIntegralTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<T_in> &src,
+                           Ncv32u maxWidth, Ncv32u maxHeight)
+{
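+    // Geometric sweep (factor 1.2) over widths at fixed height 2, then over
+    // heights at fixed width 2.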
+    for (Ncv32f _i=1.0; _i<maxWidth; _i*=1.2f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "LinIntImgW%dH%d", i, 2);
+        testLister.add(new TestIntegralImage<T_in, T_out>(testName, src, i, 2));
+    }
+    for (Ncv32f _i=1.0; _i<maxHeight; _i*=1.2f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "LinIntImgW%dH%d", 2, i);
+        testLister.add(new TestIntegralImage<T_in, T_out>(testName, src, 2, i));
+    }
+
+    //test VGA
+    testLister.add(new TestIntegralImage<T_in, T_out>("LinIntImg_VGA", src, 640, 480));
+
+    //TODO: add tests of various resolutions up to 4096x4096
+}
+
+
+void generateSquaredIntegralTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<Ncv8u> &src,
+                                  Ncv32u maxWidth, Ncv32u maxHeight)
+{
+    for (Ncv32f _i=1.0; _i<maxWidth; _i*=1.2f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "SqIntImgW%dH%d", i, 32);
+        testLister.add(new TestIntegralImageSquared(testName, src, i, 32));
+    }
+    for (Ncv32f _i=1.0; _i<maxHeight; _i*=1.2f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "SqIntImgW%dH%d", 32, i);
+        testLister.add(new TestIntegralImageSquared(testName, src, 32, i));
+    }
+
+    //test VGA
+    testLister.add(new TestIntegralImageSquared("SqLinIntImg_VGA", src, 640, 480));
+
+    //TODO: add tests of various resolutions up to 4096x4096
+}
+
+
+void generateRectStdDevTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<Ncv8u> &src,
+                             Ncv32u maxWidth, Ncv32u maxHeight)
+{
+    NcvRect32u rect(1,1,18,18);
+
+    for (Ncv32f _i=32; _i<maxHeight/2 && _i < maxWidth/2; _i*=1.2f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "RectStdDevW%dH%d", i*2, i);
+        testLister.add(new TestRectStdDev(testName, src, i*2, i, rect, 1, true));
+        testLister.add(new TestRectStdDev(testName, src, i*2, i, rect, 1.5f, false));
+        sprintf_s(testName, sizeof(testName), "RectStdDevW%dH%d", i-1, i*2-1);
+        testLister.add(new TestRectStdDev(testName, src, i-1, i*2-1, rect, 1, false));
+        testLister.add(new TestRectStdDev(testName, src, i-1, i*2-1, rect, 2.5f, true));
+    }
+
+    //test VGA
+    testLister.add(new TestRectStdDev("RectStdDev_VGA", src, 640, 480, rect, 1, true));
+
+    //TODO: add tests of various resolutions up to 4096x4096
+}
+
+
+template <class T>
+void generateResizeTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<T> &src)
+{
+    //test VGA
+    for (Ncv32u i=1; i<480; i+=3)
+    {
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "TestResize_VGA_s%d", i);
+        testLister.add(new TestResize<T>(testName, src, 640, 480, i, true));
+        testLister.add(new TestResize<T>(testName, src, 640, 480, i, false));
+    }
+
+    //test HD
+    for (Ncv32u i=1; i<1080; i+=5)
+    {
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "TestResize_1080_s%d", i);
+        testLister.add(new TestResize<T>(testName, src, 1920, 1080, i, true));
+        testLister.add(new TestResize<T>(testName, src, 1920, 1080, i, false));
+    }
+
+    //TODO: add tests of various resolutions up to 4096x4096
+}
+
+
+void generateNPPSTVectorTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<Ncv32u> &src, Ncv32u maxLength)
+{
+    //compaction
+    for (Ncv32f _i=256.0; _i<maxLength; _i*=1.1f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "Compaction%d", i);
+        testLister.add(new TestCompact(testName, src, i, 0xFFFFFFFF, 30));
+    }
+    for (Ncv32u i=1; i<260; i++)
+    {
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "Compaction%d", i);
+        testLister.add(new TestCompact(testName, src, i, 0xC001C0DE, 70));
+        testLister.add(new TestCompact(testName, src, i, 0xC001C0DE, 0));
+        testLister.add(new TestCompact(testName, src, i, 0xC001C0DE, 100));
+    }
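+    //edge-case lengths just around 256^2 and 256^3 elements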
+    for (Ncv32u i=256*256-256; i<256*256+257; i++)
+    {
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "Compaction%d", i);
+        testLister.add(new TestCompact(testName, src, i, 0xFFFFFFFF, 40));
+    }
+    for (Ncv32u i=256*256*256-10; i<256*256*256+10; i++)
+    {
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "Compaction%d", i);
+        testLister.add(new TestCompact(testName, src, i, 0x00000000, 2));
+    }
+}
+
+
+template <class T>
+void generateTransposeTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<T> &src)
+{
+    for (int i=2; i<64; i+=4)
+    {
+        for (int j=2; j<64; j+=4)
+        {
+            char testName[80];
+            sprintf_s(testName, sizeof(testName), "TestTranspose_%dx%d", i, j);
+            testLister.add(new TestTranspose<T>(testName, src, i, j));
+        }
+    }
+
+    for (int i=1; i<128; i+=1)
+    {
+        for (int j=1; j<2; j+=1)
+        {
+            char testName[80];
+            sprintf_s(testName, sizeof(testName), "TestTranspose_%dx%d", i, j);
+            testLister.add(new TestTranspose<T>(testName, src, i, j));
+        }
+    }
+
+    testLister.add(new TestTranspose<T>("TestTranspose_VGA", src, 640, 480));
+    testLister.add(new TestTranspose<T>("TestTranspose_HD1080", src, 1920, 1080));
+}
+
+
+template <class T>
+void generateDrawRectsTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<T> &src, NCVTestSourceProvider<Ncv32u> &src32u,
+                            Ncv32u maxWidth, Ncv32u maxHeight)
+{
+    for (Ncv32f _i=16.0; _i<maxWidth; _i*=1.1f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        Ncv32u j = maxHeight * i / maxWidth;
+        if (!j) continue;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "DrawRectsW%dH%d", i, j);
+
+        if (sizeof(T) == sizeof(Ncv32u))
+        {
+            testLister.add(new TestDrawRects<T>(testName, src, src32u, i, j, i*j/1000+1, (T)0xFFFFFFFF));
+        }
+        else if (sizeof(T) == sizeof(Ncv8u))
+        {
+            testLister.add(new TestDrawRects<T>(testName, src, src32u, i, j, i*j/1000+1, (T)0xFF));
+        }
+        else
+        {
+            ncvAssertPrintCheck(false, "Attempted to instantiate non-existing DrawRects test suite");
+        }
+    }
+
+    //test VGA
+    testLister.add(new TestDrawRects<T>("DrawRects_VGA", src, src32u, 640, 480, 640*480/1000, (T)0xFF));
+
+    //TODO: add tests of various resolutions up to 4096x4096
+}
+
+
+void generateVectorTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<Ncv32u> &src, Ncv32u maxLength)
+{
+    //growth
+    for (Ncv32f _i=10.0; _i<maxLength; _i*=1.1f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "VectorGrow%d", i);
+        testLister.add(new TestHypothesesGrow(testName, src, 20, 20, 2.2f, i, i/2, i, i/4));
+        testLister.add(new TestHypothesesGrow(testName, src, 10, 42, 1.2f, i, i, i, 0));
+    }
+    testLister.add(new TestHypothesesGrow("VectorGrow01b", src, 10, 42, 1.2f, 10, 0, 10, 1));
+    testLister.add(new TestHypothesesGrow("VectorGrow11b", src, 10, 42, 1.2f, 10, 1, 10, 1));
+    testLister.add(new TestHypothesesGrow("VectorGrow10b", src, 10, 42, 1.2f, 10, 1, 10, 0));
+    testLister.add(new TestHypothesesGrow("VectorGrow00b", src, 10, 42, 1.2f, 10, 0, 10, 0));
+}
+
+
+void generateHypothesesFiltrationTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<Ncv32u> &src, Ncv32u maxLength)
+{
+    for (Ncv32f _i=1.0; _i<maxLength; _i*=1.1f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "HypFilter%d", i);
+        testLister.add(new TestHypothesesFilter(testName, src, i, 3, 0.2f));
+        testLister.add(new TestHypothesesFilter(testName, src, i, 0, 0.2f));
+        testLister.add(new TestHypothesesFilter(testName, src, i, 1, 0.1f));
+    }
+}
+
+
+void generateHaarLoaderTests(NCVAutoTestLister &testLister)
+{
+    testLister.add(new TestHaarCascadeLoader("haarcascade_eye.xml", "haarcascade_eye.xml"));
+    testLister.add(new TestHaarCascadeLoader("haarcascade_frontalface_alt.xml", "haarcascade_frontalface_alt.xml"));
+    testLister.add(new TestHaarCascadeLoader("haarcascade_frontalface_alt2.xml", "haarcascade_frontalface_alt2.xml"));
+    testLister.add(new TestHaarCascadeLoader("haarcascade_frontalface_alt_tree.xml", "haarcascade_frontalface_alt_tree.xml"));
+    testLister.add(new TestHaarCascadeLoader("haarcascade_eye_tree_eyeglasses.xml", "haarcascade_eye_tree_eyeglasses.xml"));
+}
+
+
+void generateHaarApplicationTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<Ncv8u> &src,
+                                  Ncv32u maxWidth, Ncv32u maxHeight)
+{
+    for (Ncv32u i=20; i<512; i+=11)
+    {
+        for (Ncv32u j=20; j<128; j+=5)
+        {
+            char testName[80];
+            sprintf_s(testName, sizeof(testName), "HaarAppl%d_%d", i, j);
+            testLister.add(new TestHaarCascadeApplication(testName, src, "haarcascade_frontalface_alt.xml", j, i));
+        }
+    }
+    for (Ncv32f _i=20.0; _i<maxWidth; _i*=1.1f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "HaarAppl%d", i);
+        testLister.add(new TestHaarCascadeApplication(testName, src, "haarcascade_frontalface_alt.xml", i, i));
+    }
+}
+
+
+static void devNullOutput(const char *msg)
+{
+}
+
+
+int main_nvidia()
+{
+    printf("Testing NVIDIA Computer Vision SDK\n");
+    printf("==================================\n");
+
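+    // Route the library's debug messages to a no-op handler.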
+    ncvSetDebugOutputHandler(devNullOutput);
+
+    NCVAutoTestLister testListerII("NPPST Integral Image");
+    NCVAutoTestLister testListerSII("NPPST Squared Integral Image");
+    NCVAutoTestLister testListerRStdDev("NPPST RectStdDev");
+    NCVAutoTestLister testListerResize("NPPST Resize");
+    NCVAutoTestLister testListerNPPSTVectorOperations("NPPST Vector Operations");
+    NCVAutoTestLister testListerTranspose("NPPST Transpose");
+
+    NCVAutoTestLister testListerVectorOperations("Vector Operations");
+    NCVAutoTestLister testListerHaarLoader("Haar Cascade Loader");
+    NCVAutoTestLister testListerHaarAppl("Haar Cascade Application");
+    NCVAutoTestLister testListerHypFiltration("Hypotheses Filtration");
+    NCVAutoTestLister testListerVisualize("Visualization");
+
+    printf("Initializing data source providers\n");
+    NCVTestSourceProvider<Ncv32u> testSrcRandom_32u(2010, 0, 0xFFFFFFFF, 4096, 4096);
+    NCVTestSourceProvider<Ncv8u> testSrcRandom_8u(2010, 0, 255, 4096, 4096);
+    NCVTestSourceProvider<Ncv64u> testSrcRandom_64u(2010, 0, 0xFFFFFFFFFFFFFFFFULL, 4096, 4096);
+    NCVTestSourceProvider<Ncv8u> testSrcFacesVGA_8u("../../data/group_1_640x480_VGA.pgm");
+    NCVTestSourceProvider<Ncv32f> testSrcRandom_32f(2010, -1.0f, 1.0f, 4096, 4096);
+
+    printf("Generating NPPST test suites\n");
+    generateIntegralTests<Ncv8u, Ncv32u>(testListerII, testSrcRandom_8u, 4096, 4096);
+    generateIntegralTests<Ncv32f, Ncv32f>(testListerII, testSrcRandom_32f, 4096, 4096);
+    generateSquaredIntegralTests(testListerSII, testSrcRandom_8u, 4096, 4096);
+    generateRectStdDevTests(testListerRStdDev, testSrcRandom_8u, 4096, 4096);
+    generateResizeTests(testListerResize, testSrcRandom_32u);
+    generateResizeTests(testListerResize, testSrcRandom_64u);
+    generateNPPSTVectorTests(testListerNPPSTVectorOperations, testSrcRandom_32u, 4096*4096);
+    generateTransposeTests(testListerTranspose, testSrcRandom_32u);
+    generateTransposeTests(testListerTranspose, testSrcRandom_64u);
+
+    printf("Generating NCV test suites\n");
+    generateDrawRectsTests(testListerVisualize, testSrcRandom_8u, testSrcRandom_32u, 4096, 4096);
+    generateDrawRectsTests(testListerVisualize, testSrcRandom_32u, testSrcRandom_32u, 4096, 4096);
+    generateVectorTests(testListerVectorOperations, testSrcRandom_32u, 4096*4096);
+    generateHypothesesFiltrationTests(testListerHypFiltration, testSrcRandom_32u, 1024);
+    generateHaarLoaderTests(testListerHaarLoader);
+    generateHaarApplicationTests(testListerHaarAppl, testSrcFacesVGA_8u, 1280, 720);
+
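+    // Execute all generated suites.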
+    testListerII.invoke();
+    testListerSII.invoke();
+    testListerRStdDev.invoke();
+    testListerResize.invoke();
+    testListerNPPSTVectorOperations.invoke();
+    testListerTranspose.invoke();
+    testListerVisualize.invoke();
+    testListerVectorOperations.invoke();
+    testListerHypFiltration.invoke();
+    testListerHaarLoader.invoke();
+    testListerHaarAppl.invoke();
+
+    return 0;
+}
diff --git a/tests/gpu/src/nvidia_tests.cpp b/tests/gpu/src/nvidia_tests.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..04d64368298cb6883025ff9e0793f5b9371360e5
--- /dev/null
+++ b/tests/gpu/src/nvidia_tests.cpp
@@ -0,0 +1,63 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "gputest.hpp"
+#include "cvconfig.h"
+
+class CV_NVidiaTestsCaller : public CvTest
+{
+public:
+    CV_NVidiaTestsCaller() : CvTest("GPU-NVidia", "NVidia") {}
+    virtual ~CV_NVidiaTestsCaller() {}
+
+protected:
+
+    void run( int )
+    {
+#if defined(HAVE_CUDA)
+        // forward declaration; main_nvidia() is defined in nvidia/main_nvidia.cpp
+        int main_nvidia();
+        main_nvidia();
+        ts->set_failed_test_info(CvTS::OK);
+#else
+        ts->set_failed_test_info(CvTS::SKIPPED);
+#endif
+    }
+} CV_NVidiaTestsCaller_test;