1) NPP_staging as sources. Binaries removed.

2) NVidia tests for GPU 3) FD sample that uses NVidia's interface.

1) NPP_staging as sources. Binaries removed.
2) NVidia tests for GPU 3) FD sample that uses NVidia's interface.
0747f2d8 · Anatoly Baksheev · 811f6fbe · 811f6fbe · 811f6fbe · 811f6fbe
Commit 0747f2d8 authored Feb 04, 2011 by Anatoly Baksheev
51 changed files
--- a/3rdparty/NPP_staging/NPP_staging_static_Windows_32_v1.lib
+++ b/3rdparty/NPP_staging/NPP_staging_static_Windows_32_v1.lib
--- a/3rdparty/NPP_staging/NPP_staging_static_Windows_64_v1.lib
+++ b/3rdparty/NPP_staging/NPP_staging_static_Windows_64_v1.lib
--- a/3rdparty/NPP_staging/libNPP_staging_static_Darwin_64_v1.a
+++ b/3rdparty/NPP_staging/libNPP_staging_static_Darwin_64_v1.a
--- a/3rdparty/NPP_staging/libNPP_staging_static_Linux_32_v1.a
+++ b/3rdparty/NPP_staging/libNPP_staging_static_Linux_32_v1.a
--- a/3rdparty/NPP_staging/libNPP_staging_static_Linux_64_v1.a
+++ b/3rdparty/NPP_staging/libNPP_staging_static_Linux_64_v1.a
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -36,10 +36,12 @@ file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*")
 source_group("Device" FILES ${lib_device_hdrs})
 if (HAVE_CUDA AND MSVC)
-    file(GLOB ncv_srcs "src/nvidia/*.cpp")
+    file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp")	
-	file(GLOB ncv_hdrs "src/nvidia/*.h*")
+	file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu")	
-	file(GLOB ncv_cuda "src/nvidia/*.cu")
+	file(GLOB_RECURSE ncv_hdr1 "src/nvidia/*.hpp")	
-	source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda})
+	file(GLOB_RECURSE ncv_hdr2 "src/nvidia/*.h")	
+	source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdr1} ${ncv_hdr2} ${ncv_cuda})	
+	include_directories("src/nvidia/core" "src/nvidia/NPP_staging")
 endif()
 if (HAVE_CUDA)		
@@ -74,17 +76,13 @@ if (HAVE_CUDA)
 		string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 		string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
 		string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
-	endif()
+	endif()     
-    include(FindNPP_staging.cmake)
-    include_directories(${NPPST_INC})   
 	CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda})
 	#CUDA_BUILD_CLEAN_TARGET()
 endif()
-add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
+add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdr1} ${ncv_hdr2} ${ncv_cuda} ${cuda_objs})
 if(PCHSupport_FOUND)
 	set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/src/precomp.hpp)
@@ -117,8 +115,7 @@ set_target_properties(${the_target} PROPERTIES
 target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} )
 if (HAVE_CUDA)
-	target_link_libraries(${the_target} ${CUDA_LIBRARIES} ${CUDA_NPP_LIBRARIES})
+	target_link_libraries(${the_target} ${CUDA_LIBRARIES} ${CUDA_NPP_LIBRARIES})    
-    target_link_libraries(${the_target} ${NPPST_LIB})
    CUDA_ADD_CUFFT_TO_TARGET(${the_target})
 endif()

--- a/modules/gpu/FindNPP_staging.cmake
+++ b/modules/gpu/FindNPP_staging.cmake
-if(CMAKE_SIZEOF_VOID_P EQUAL 4)			
-    set(BIT_SUFF 32)
-else()
-    set(BIT_SUFF 64)
-endif()
-if (APPLE)
-    set(PLATFORM_SUFF Darwin)
-elseif (UNIX)
-    set(PLATFORM_SUFF Linux)
-else()
-    set(PLATFORM_SUFF Windows)
-endif()
-set(LIB_FILE NPP_staging_static_${PLATFORM_SUFF}_${BIT_SUFF}_v1)
-find_library(NPPST_LIB 
-    NAMES "${LIB_FILE}" "lib${LIB_FILE}" 
-    PATHS "${CMAKE_SOURCE_DIR}/3rdparty/NPP_staging" 
-    DOC "NPP staging library"
-    )	
-SET(NPPST_INC "${CMAKE_SOURCE_DIR}//3rdparty/NPP_staging")
\ No newline at end of file
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -83,25 +83,25 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst)
        sz.width  = src.cols;
        sz.height = src.rows;
-        nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz) );
+        nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz) );		
    }
    else if (src.elemSize() == 4)
    {
-        NppStSize32u sz;
+        NcvSize32u sz;
        sz.width  = src.cols;
        sz.height = src.rows;
-        nppSafeCall( nppiStTranspose_32u_C1R(const_cast<NppSt32u*>(src.ptr<NppSt32u>()), src.step, 
+        nppSafeCall( nppiStTranspose_32u_C1R(const_cast<Ncv32u*>(src.ptr<Ncv32u>()), src.step, 
-            dst.ptr<NppSt32u>(), dst.step, sz) );
+            dst.ptr<Ncv32u>(), dst.step, sz) );
    }
    else // if (src.elemSize() == 8)
    {
-        NppStSize32u sz;
+        NcvSize32u sz;
        sz.width  = src.cols;
        sz.height = src.rows;
-        nppSafeCall( nppiStTranspose_64u_C1R(const_cast<NppSt64u*>(src.ptr<NppSt64u>()), src.step, 
+        nppSafeCall( nppiStTranspose_64u_C1R(const_cast<Ncv64u*>(src.ptr<Ncv64u>()), src.step, 
-            dst.ptr<NppSt64u>(), dst.step, sz) );
+            dst.ptr<Ncv64u>(), dst.step, sz) );		
    }
    cudaSafeCall( cudaThreadSynchronize() );

--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -126,7 +126,7 @@ struct cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
            minNeighbors,
            scaleStep, 1,
            flags,
-            *gpuAllocator, *cpuAllocator, devProp.major, devProp.minor,  0);
+            *gpuAllocator, *cpuAllocator, devProp, 0);
        ncvAssertReturnNcvStat(ncvStat);
        ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
@@ -146,8 +146,8 @@ private:
        ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), NCV_CUDA_ERROR);
        // Load the classifier from file (assuming its size is about 1 mb) using a simple allocator
-        gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice);        
+        gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice, devProp.textureAlignment);        
-        cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned);
+        cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned, devProp.textureAlignment);
        ncvAssertPrintReturn(gpuCascadeAllocator->isInitialized(), "Error creating cascade GPU allocator", NCV_CUDA_ERROR);
        ncvAssertPrintReturn(cpuCascadeAllocator->isInitialized(), "Error creating cascade CPU allocator", NCV_CUDA_ERROR);
@@ -212,7 +212,7 @@ private:
        roi.height = d_src.height();
        Ncv32u numDetections;
        ncvStat = ncvDetectObjectsMultiScale_device(d_src, roi, d_rects, numDetections, haar, *h_haarStages,
-            *d_haarStages, *d_haarNodes, *d_haarFeatures, haar.ClassifierSize, 4, 1.2f, 1, 0, gpuCounter, cpuCounter, devProp.major, devProp.minor, 0);
+            *d_haarStages, *d_haarNodes, *d_haarFeatures, haar.ClassifierSize, 4, 1.2f, 1, 0, gpuCounter, cpuCounter, devProp, 0);
        ncvAssertReturnNcvStat(ncvStat);
        ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);

--- a/modules/gpu/src/imgproc_gpu.cpp
+++ b/modules/gpu/src/imgproc_gpu.cpp
@@ -560,16 +560,19 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer)
    sum.create(src.rows + 1, src.cols + 1, CV_32S);
-    NppStSize32u roiSize;
+    NcvSize32u roiSize;
    roiSize.width = src.cols;
    roiSize.height = src.rows;
-    NppSt32u bufSize;
+	cudaDeviceProp prop;
-    nppSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize) );
+	cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+    Ncv32u bufSize;
+    nppSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
    ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
-    nppSafeCall( nppiStIntegral_8u32u_C1R(const_cast<NppSt8u*>(src.ptr<NppSt8u>()), src.step, 
+    nppSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), src.step, 
-        sum.ptr<NppSt32u>(), sum.step, roiSize, buffer.ptr<NppSt8u>(), bufSize) );
+        sum.ptr<Ncv32u>(), sum.step, roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
    cudaSafeCall( cudaThreadSynchronize() );
 }
@@ -600,19 +603,20 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum)
 {
    CV_Assert(src.type() == CV_8U);
-    NppStSize32u roiSize;
+    NcvSize32u roiSize;
    roiSize.width = src.cols;
    roiSize.height = src.rows;
-    NppSt32u bufSize;
+	cudaDeviceProp prop;
-    nppSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize));
+	cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+    Ncv32u bufSize;
+    nppSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));	
    GpuMat buf(1, bufSize, CV_8U);
    sqsum.create(src.rows + 1, src.cols + 1, CV_64F);
-    nppSafeCall(nppiStSqrIntegral_8u64u_C1R(
+    nppSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), src.step, 
-            const_cast<NppSt8u*>(src.ptr<NppSt8u>(0)), src.step, 
+            sqsum.ptr<Ncv64u>(0), sqsum.step, roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));
-            sqsum.ptr<NppSt64u>(0), sqsum.step, roiSize, 
-            buf.ptr<NppSt8u>(0), bufSize));
    cudaSafeCall( cudaThreadSynchronize() );
 }

--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp
@@ -75,13 +75,13 @@ struct HaarFeature64
 #define HaarFeature64_CreateCheck_MaxRectField                  0xFF
-    __host__ NCVStatus setRect(Ncv32u rectX, Ncv32u rectY, Ncv32u rectWidth, Ncv32u rectHeight, Ncv32u clsWidth, Ncv32u clsHeight)
+    __host__ NCVStatus setRect(Ncv32u rectX, Ncv32u rectY, Ncv32u rectWidth, Ncv32u rectHeight, Ncv32u /*clsWidth*/, Ncv32u /*clsHeight*/)
    {
        ncvAssertReturn(rectWidth <= HaarFeature64_CreateCheck_MaxRectField && rectHeight <= HaarFeature64_CreateCheck_MaxRectField, NCV_HAAR_TOO_LARGE_FEATURES);
-        ((NcvRect8u*)&(this->_ui2.x))->x = rectX;
+        ((NcvRect8u*)&(this->_ui2.x))->x = (Ncv8u)rectX;
-        ((NcvRect8u*)&(this->_ui2.x))->y = rectY;
+        ((NcvRect8u*)&(this->_ui2.x))->y = (Ncv8u)rectY;
-        ((NcvRect8u*)&(this->_ui2.x))->width = rectWidth;
+        ((NcvRect8u*)&(this->_ui2.x))->width = (Ncv8u)rectWidth;
-        ((NcvRect8u*)&(this->_ui2.x))->height = rectHeight;
+        ((NcvRect8u*)&(this->_ui2.x))->height = (Ncv8u)rectHeight;
        return NCV_SUCCESS;
    }
@@ -306,11 +306,11 @@ struct HaarStage64
 };
-NPPST_CT_ASSERT(sizeof(HaarFeature64) == 8);
+NCV_CT_ASSERT(sizeof(HaarFeature64) == 8);
-NPPST_CT_ASSERT(sizeof(HaarFeatureDescriptor32) == 4);
+NCV_CT_ASSERT(sizeof(HaarFeatureDescriptor32) == 4);
-NPPST_CT_ASSERT(sizeof(HaarClassifierNodeDescriptor32) == 4);
+NCV_CT_ASSERT(sizeof(HaarClassifierNodeDescriptor32) == 4);
-NPPST_CT_ASSERT(sizeof(HaarClassifierNode128) == 16);
+NCV_CT_ASSERT(sizeof(HaarClassifierNode128) == 16);
-NPPST_CT_ASSERT(sizeof(HaarStage64) == 8);
+NCV_CT_ASSERT(sizeof(HaarStage64) == 8);
 //==============================================================================
@@ -347,7 +347,7 @@ enum
    NCVPipeObjDet_VisualizeInPlace      = 0x004,
 };
+NCV_EXPORTS
 NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
                                            NcvSize32u srcRoi,
                                            NCVVector<NcvRect32u> &d_dstRects,
@@ -367,15 +367,14 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
                                            INCVMemAllocator &gpuAllocator,
                                            INCVMemAllocator &cpuAllocator,
-                                            Ncv32u devPropMajor,
+                                            cudaDeviceProp &devProp,
-                                            Ncv32u devPropMinor,
                                            cudaStream_t cuStream);
 #define OBJDET_MASK_ELEMENT_INVALID_32U     0xFFFFFFFF
 #define HAAR_STDDEV_BORDER                  1
+NCV_EXPORTS
 NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImage,
                                               NCVMatrix<Ncv32f> &d_weights,
                                               NCVMatrixAlloc<Ncv32u> &d_pixelMask,
@@ -391,11 +390,10 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
                                               Ncv32f scaleArea,
                                               INCVMemAllocator &gpuAllocator,
                                               INCVMemAllocator &cpuAllocator,
-                                               Ncv32u devPropMajor,
+                                               cudaDeviceProp &devProp,
-                                               Ncv32u devPropMinor,
                                               cudaStream_t cuStream);
+NCV_EXPORTS
 NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
                                             NCVMatrix<Ncv32f> &h_weights,
                                             NCVMatrixAlloc<Ncv32u> &h_pixelMask,
@@ -409,7 +407,7 @@ NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
                                             Ncv32u pixelStep,
                                             Ncv32f scaleArea);
+NCV_EXPORTS
 NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
                                 Ncv32u dstStride,
                                 Ncv32u dstWidth,
@@ -419,7 +417,7 @@ NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
                                 Ncv8u color,
                                 cudaStream_t cuStream);
+NCV_EXPORTS
 NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
                                  Ncv32u dstStride,
                                  Ncv32u dstWidth,
@@ -429,7 +427,7 @@ NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
                                  Ncv32u color,
                                  cudaStream_t cuStream);
+NCV_EXPORTS
 NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
                               Ncv32u dstStride,
                               Ncv32u dstWidth,
@@ -438,7 +436,7 @@ NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
                               Ncv32u numRects,
                               Ncv8u color);
+NCV_EXPORTS
 NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
                                Ncv32u dstStride,
                                Ncv32u dstWidth,
@@ -450,7 +448,7 @@ NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
 #define RECT_SIMILARITY_PROPORTION      0.2f
+NCV_EXPORTS
 NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
                                         Ncv32u numPixelMaskDetections,
                                         NCVVector<NcvRect32u> &hypotheses,
@@ -461,7 +459,7 @@ NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
                                         Ncv32f curScale,
                                         cudaStream_t cuStream);
+NCV_EXPORTS
 NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
                                       Ncv32u numPixelMaskDetections,
                                       NCVVector<NcvRect32u> &hypotheses,
@@ -471,18 +469,18 @@ NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
                                       Ncv32u rectHeight,
                                       Ncv32f curScale);
+NCV_EXPORTS
 NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,
                                   Ncv32u &numHypotheses,
                                   Ncv32u minNeighbors,
                                   Ncv32f intersectEps,
                                   NCVVector<Ncv32u> *hypothesesWeights);
+NCV_EXPORTS
 NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,
                                   Ncv32u &numNodes, Ncv32u &numFeatures);
+NCV_EXPORTS
 NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,
                                   HaarClassifierCascadeDescriptor &haar,
                                   NCVVector<HaarStage64> &h_HaarStages,
@@ -490,6 +488,7 @@ NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,
                                   NCVVector<HaarFeature64> &h_HaarFeatures);
+NCV_EXPORTS
 NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,
                                 HaarClassifierCascadeDescriptor haar,
                                 NCVVector<HaarStage64> &h_HaarStages,

--- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
+++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
--- a/3rdparty/NPP_staging/npp_staging.h
+++ b/3rdparty/NPP_staging/npp_staging.h
--- a/modules/gpu/src/nvidia/NCV.cpp
+++ b/modules/gpu/src/nvidia/NCV.cpp
@@ -40,15 +40,13 @@
 //M*/
-#include <precomp.hpp>
 #if !defined (HAVE_CUDA)
 #else /* !defined (HAVE_CUDA) */
+#include <ios>
 #include <stdarg.h>
 #include "NCV.hpp"
@@ -94,17 +92,6 @@ void ncvSetDebugOutputHandler(NCVDebugOutputHandler *func)
 //==============================================================================
-NCVStatus GPUAlignmentValue(Ncv32u &alignment)
-{
-    int curDev;
-    cudaDeviceProp curProp;
-    ncvAssertCUDAReturn(cudaGetDevice(&curDev), NCV_CUDA_ERROR);
-    ncvAssertCUDAReturn(cudaGetDeviceProperties(&curProp, curDev), NCV_CUDA_ERROR);
-    alignment = curProp.textureAlignment; //GPUAlignmentValue(curProp.major);
-    return NCV_SUCCESS;
-}
 Ncv32u alignUp(Ncv32u what, Ncv32u alignment)
 {
    Ncv32u alignMask = alignment-1;
@@ -216,7 +203,7 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
 }
-NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment)
+NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment, void *reusePtr)
    :
    currentSize(0),
    _maxSize(0),
@@ -229,17 +216,26 @@ NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity,
    allocBegin = NULL;
-    switch (memT)
+    if (reusePtr == NULL)
    {
-    case NCVMemoryTypeDevice:
+        bReusesMemory = false;
-        ncvAssertCUDAReturn(cudaMalloc(&allocBegin, capacity), );
+        switch (memT)
-        break;
+        {
-    case NCVMemoryTypeHostPinned:
+        case NCVMemoryTypeDevice:
-        ncvAssertCUDAReturn(cudaMallocHost(&allocBegin, capacity), );
+            ncvAssertCUDAReturn(cudaMalloc(&allocBegin, capacity), );
-        break;
+            break;
-    case NCVMemoryTypeHostPageable:
+        case NCVMemoryTypeHostPinned:
-        allocBegin = (Ncv8u *)malloc(capacity);
+            ncvAssertCUDAReturn(cudaMallocHost(&allocBegin, capacity), );
-        break;
+            break;
+        case NCVMemoryTypeHostPageable:
+            allocBegin = (Ncv8u *)malloc(capacity);
+            break;
+        }
+    }
+    else
+    {
+        bReusesMemory = true;
+        allocBegin = (Ncv8u *)reusePtr;
    }
    if (capacity == 0)
@@ -260,18 +256,23 @@ NCVMemStackAllocator::~NCVMemStackAllocator()
    if (allocBegin != NULL)
    {
        ncvAssertPrintCheck(currentSize == 0, "NCVMemStackAllocator dtor:: not all objects were deallocated properly, forcing destruction");
-        switch (_memType)
+        if (!bReusesMemory)
        {
-        case NCVMemoryTypeDevice:
+            switch (_memType)
-            ncvAssertCUDAReturn(cudaFree(allocBegin), );
+            {
-            break;
+            case NCVMemoryTypeDevice:
-        case NCVMemoryTypeHostPinned:
+                ncvAssertCUDAReturn(cudaFree(allocBegin), );
-            ncvAssertCUDAReturn(cudaFreeHost(allocBegin), );
+                break;
-            break;
+            case NCVMemoryTypeHostPinned:
-        case NCVMemoryTypeHostPageable:
+                ncvAssertCUDAReturn(cudaFreeHost(allocBegin), );
-            free(allocBegin);
+                break;
-            break;
+            case NCVMemoryTypeHostPageable:
+                free(allocBegin);
+                break;
+            }
        }
        allocBegin = NULL;
    }
 }
@@ -356,14 +357,14 @@ size_t NCVMemStackAllocator::maxSize(void) const
 //===================================================================
-NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT)
+NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment)
    :
    currentSize(0),
    _maxSize(0),
-    _memType(memT)
+    _memType(memT),
+    _alignment(alignment)
 {
    ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", );
-    ncvAssertPrintReturn(NCV_SUCCESS == GPUAlignmentValue(this->_alignment), "NCVMemNativeAllocator ctor:: couldn't get device _alignment", );
 }

--- a/modules/gpu/src/nvidia/NCV.hpp
+++ b/modules/gpu/src/nvidia/NCV.hpp
--- a/modules/gpu/src/nvidia/NCVRuntimeTemplates.hpp
+++ b/modules/gpu/src/nvidia/NCVRuntimeTemplates.hpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// 
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#ifndef _ncvruntimetemplates_hpp_
+#define _ncvruntimetemplates_hpp_
+#include <stdarg.h>
+#include <vector>
 ////////////////////////////////////////////////////////////////////////////////
 // The Loki Library
 // Copyright (c) 2001 by Andrei Alexandrescu
@@ -14,13 +62,6 @@
 // http://loki-lib.sourceforge.net/index.php?n=Main.License
 ////////////////////////////////////////////////////////////////////////////////
-#ifndef _ncvruntimetemplates_hpp_
-#define _ncvruntimetemplates_hpp_
-#include <stdarg.h>
-#include <vector>
 namespace Loki
 {
    //==============================================================================

--- a/modules/gpu/src/opencv2/gpu/device/transform.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp
--- a/modules/gpu/src/precomp.hpp
+++ b/modules/gpu/src/precomp.hpp
@@ -69,9 +69,9 @@
    #include "cufft.h"
    #include "opencv2/gpu/stream_accessor.hpp"
    #include "npp.h"    
-    #include "npp_staging.h"
+	#include "nvidia/core/NCV.hpp"
-	#include "nvidia/NCV.hpp"
+	#include "nvidia/NPP_staging/npp_staging.hpp"
 	#include "nvidia/NCVHaarObjectDetection.hpp"
 #define CUDART_MINIMUM_REQUIRED_VERSION 3020

--- a/modules/gtest/src/gtestcv.cpp
+++ b/modules/gtest/src/gtestcv.cpp
@@ -1378,7 +1378,7 @@ cmpEpsFlt_(const _Tp* src1, const _Tp* src2, size_t total, int imaxdiff, size_t
    {
        _Tp a = src1[i], b = src2[i];
        if( a < 0 ) a ^= C; if( b < 0 ) b ^= C;
-        _Tp d = std::abs(a - b);
+        _Tp d = std::abs(double(a - b));
        if( d > imaxdiff )
        {
            idx = i + startidx;

--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -3,8 +3,8 @@
 #
 # ----------------------------------------------------------------------------
-add_subdirectory(c)
+#add_subdirectory(c)
-add_subdirectory(cpp)
+#add_subdirectory(cpp)
 add_subdirectory(gpu)
 if(0)

--- a/samples/gpu/CMakeLists.txt
+++ b/samples/gpu/CMakeLists.txt
@@ -14,10 +14,12 @@ if (BUILD_EXAMPLES)
        "${CMAKE_SOURCE_DIR}/modules/legacy/include"
        "${CMAKE_SOURCE_DIR}/modules/contrib/include"
        "${CMAKE_SOURCE_DIR}/modules/gpu/include"
+		"${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia"
+		"${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core"
    )   
    if(HAVE_CUDA)
-        include_directories(${CUDA_INCLUDE_DIRS})
+        include_directories(${CUDA_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core)
    endif()
    if(CMAKE_COMPILER_IS_GNUCXX)

--- a/modules/gpu/src/nvidia/FaceDetectionFeed.cpp_NvidiaAPI_sample
+++ b/modules/gpu/src/nvidia/FaceDetectionFeed.cpp_NvidiaAPI_sample
--- a/tests/gpu/CMakeLists.txt
+++ b/tests/gpu/CMakeLists.txt
@@ -3,12 +3,15 @@
 # ----------------------------------------------------------------------------
 project(opencv_test_gpu)
+set(the_target "opencv_test_gpu")
 file(GLOB test_srcs "src/*.cpp")
-source_group("Src" FILES ${test_srcs})
 file(GLOB test_hdrs "src/*.h*")
+source_group("Src" FILES ${test_srcs})
 source_group("Include" FILES ${test_hdrs})
-set(the_target "opencv_test_gpu")
 include_directories	(
 			"${CMAKE_SOURCE_DIR}/include/opencv"
@@ -26,11 +29,21 @@ include_directories	(
 			"${CMAKE_SOURCE_DIR}/modules/ml/include"
 			"${CMAKE_CURRENT_SOURCE_DIR}/src"
 			"${CMAKE_CURRENT_BINARY_DIR}"
+			"${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia"
+			"${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core"
 			)
 include_directories(../cxts)
-add_executable(${the_target} ${test_srcs} ${test_hdrs})
+if(HAVE_CUDA)
+	include_directories(${CUDA_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/NPP_staging)
+	file(GLOB nvidia "src/nvidia/*.*")
+	SET(ncv_cpp ../../modules/gpu/src/nvidia/core/NCV.cpp)
+	source_group("Src\\nvidia" FILES ${nvidia})
+endif()
+add_executable(${the_target} ${test_srcs} ${test_hdrs} ${nvidia} ${ncv_cpp})
 # Additional target properties
 set_target_properties(${the_target} PROPERTIES

--- a/tests/gpu/src/nvidia/NCVAutoTestLister.hpp
+++ b/tests/gpu/src/nvidia/NCVAutoTestLister.hpp
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _ncvautotestlister_hpp_
+#define _ncvautotestlister_hpp_
+#include <vector>
+#include "NCVTest.hpp"
+class NCVAutoTestLister
+{
+public:
+    NCVAutoTestLister(std::string testSuiteName, NcvBool bStopOnFirstFail=false, NcvBool bCompactOutput=true)
+        :
+    testSuiteName(testSuiteName),
+    bStopOnFirstFail(bStopOnFirstFail),
+    bCompactOutput(bCompactOutput)
+    {
+    }
+    void add(INCVTest *test)
+    {
+        this->tests.push_back(test);
+    }
+    void invoke()
+    {
+        Ncv32u nPassed = 0;
+        Ncv32u nFailed = 0;
+        Ncv32u nFailedMem = 0;
+        if (bCompactOutput)
+        {
+            printf("Test suite '%s' with %d tests\n", 
+                testSuiteName.c_str(),
+                (int)(this->tests.size()));
+        }
+        for (Ncv32u i=0; i<this->tests.size(); i++)
+        {
+            INCVTest &curTest = *tests[i];
+            NCVTestReport curReport;
+            bool res = curTest.executeTest(curReport);
+            if (!bCompactOutput)
+            {
+                printf("Test %3i %16s; Consumed mem GPU = %8d, CPU = %8d; %s\n",
+                    i,
+                    curTest.getName().c_str(),
+                    curReport.statsNums["MemGPU"],
+                    curReport.statsNums["MemCPU"],
+                    curReport.statsText["rcode"].c_str());
+            }
+            if (res)
+            {
+                nPassed++;
+                if (bCompactOutput)
+                {
+                    printf(".");
+                }
+            }
+            else
+            {
+                if (!curReport.statsText["rcode"].compare("FAILED"))
+                {
+                    nFailed++;
+                    if (bCompactOutput)
+                    {
+                        printf("x");
+                    }
+                    if (bStopOnFirstFail)
+                    {
+                        break;
+                    }
+                }
+                else
+                {
+                    nFailedMem++;
+                    if (bCompactOutput)
+                    {
+                        printf("m");
+                    }
+                }
+            }
+            fflush(stdout);
+        }
+        if (bCompactOutput)
+        {
+            printf("\n");
+        }
+        printf("Test suite '%s' complete: %d total, %d passed, %d memory errors, %d failed\n\n", 
+            testSuiteName.c_str(),
+            (int)(this->tests.size()),
+            nPassed,
+            nFailedMem,
+            nFailed);
+    }
+    ~NCVAutoTestLister()
+    {
+        for (Ncv32u i=0; i<this->tests.size(); i++)
+        {
+            delete tests[i];
+        }
+    }
+private:
+    NcvBool bStopOnFirstFail;
+    NcvBool bCompactOutput;
+    std::string testSuiteName;
+    std::vector<INCVTest *> tests;
+};
+#endif // _ncvautotestlister_hpp_
--- a/tests/gpu/src/nvidia/NCVTest.hpp
+++ b/tests/gpu/src/nvidia/NCVTest.hpp
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _ncvtest_hpp_
+#define _ncvtest_hpp_
+#pragma warning( disable : 4201 4408 4127 4100)
+#include <string>
+#include <vector>
+#include <map>
+#include <memory>
+#include <algorithm>
+#include <fstream>
+#include <cuda_runtime.h>
+#include "NPP_staging.hpp"
+struct NCVTestReport
+{
+    std::map<std::string, Ncv32u> statsNums;
+    std::map<std::string, std::string> statsText;
+};
+class INCVTest
+{
+public:
+    virtual bool executeTest(NCVTestReport &report) = 0;
+    virtual std::string getName() const = 0;
+};
+class NCVTestProvider : public INCVTest
+{
+public:
+    NCVTestProvider(std::string testName)
+        :
+        testName(testName)
+    {
+        int devId;
+        ncvAssertPrintReturn(cudaSuccess == cudaGetDevice(&devId), "Error returned from cudaGetDevice", );
+        ncvAssertPrintReturn(cudaSuccess == cudaGetDeviceProperties(&this->devProp, devId), "Error returned from cudaGetDeviceProperties", );
+    }
+    virtual bool init() = 0;
+    virtual bool process() = 0;
+    virtual bool deinit() = 0;
+    virtual bool toString(std::ofstream &strOut) = 0;
+    virtual std::string getName() const
+    {
+        return this->testName;
+    }
+    virtual ~NCVTestProvider()
+    {
+        deinitMemory();
+    }
+    virtual bool executeTest(NCVTestReport &report)
+    {
+        bool res;
+        report.statsText["rcode"] = "FAILED";
+        res = initMemory(report);
+        if (!res)
+        {
+            dumpToFile(report);
+            deinitMemory();
+            return false;
+        }
+        res = init();
+        if (!res)
+        {
+            dumpToFile(report);
+            deinit();
+            deinitMemory();
+            return false;
+        }
+        res = process();
+        if (!res)
+        {
+            dumpToFile(report);
+            deinit();
+            deinitMemory();
+            return false;
+        }
+        res = deinit();
+        if (!res)
+        {
+            dumpToFile(report);
+            deinitMemory();
+            return false;
+        }
+        deinitMemory();
+        report.statsText["rcode"] = "Passed";
+        return true;
+    }
+protected:
+    cudaDeviceProp devProp;
+    std::auto_ptr<INCVMemAllocator> allocatorGPU;
+    std::auto_ptr<INCVMemAllocator> allocatorCPU;
+private:
+    std::string testName;
+    bool initMemory(NCVTestReport &report)
+    {
+        this->allocatorGPU.reset(new NCVMemStackAllocator(devProp.textureAlignment));
+        this->allocatorCPU.reset(new NCVMemStackAllocator(devProp.textureAlignment));
+        if (!this->allocatorGPU.get()->isInitialized() ||
+            !this->allocatorCPU.get()->isInitialized())
+        {
+            report.statsText["rcode"] = "Memory FAILED";
+            return false;
+        }
+        if (!this->process())
+        {
+            report.statsText["rcode"] = "Memory FAILED";
+            return false;
+        }
+        Ncv32u maxGPUsize = (Ncv32u)this->allocatorGPU.get()->maxSize();
+        Ncv32u maxCPUsize = (Ncv32u)this->allocatorCPU.get()->maxSize();
+        report.statsNums["MemGPU"] = maxGPUsize;
+        report.statsNums["MemCPU"] = maxCPUsize;
+        this->allocatorGPU.reset(new NCVMemStackAllocator(NCVMemoryTypeDevice, maxGPUsize, devProp.textureAlignment));
+        this->allocatorCPU.reset(new NCVMemStackAllocator(NCVMemoryTypeHostPinned, maxCPUsize, devProp.textureAlignment));
+        if (!this->allocatorGPU.get()->isInitialized() ||
+            !this->allocatorCPU.get()->isInitialized())
+        {
+            report.statsText["rcode"] = "Memory FAILED";
+            return false;
+        }
+        return true;
+    }
+    void deinitMemory()
+    {
+        this->allocatorGPU.reset();
+        this->allocatorCPU.reset();
+    }
+    void dumpToFile(NCVTestReport &report)
+    {
+        bool bReasonMem = (0 == report.statsText["rcode"].compare("Memory FAILED"));
+        std::string fname = "TestDump_";
+        fname += (bReasonMem ? "m_" : "") + this->testName + ".log";
+        std::ofstream stream(fname.c_str(), std::ios::trunc | std::ios::out);
+        if (!stream.is_open()) return;
+        stream << "NCV Test Failure Log: " << this->testName << std::endl;
+        stream << "====================================================" << std::endl << std::endl;
+        stream << "Test initialization report: " << std::endl;
+        for (std::map<std::string,std::string>::iterator it=report.statsText.begin();
+             it != report.statsText.end(); it++)
+        {
+            stream << it->first << "=" << it->second << std::endl;
+        }
+        for (std::map<std::string,Ncv32u>::iterator it=report.statsNums.begin();
+            it != report.statsNums.end(); it++)
+        {
+            stream << it->first << "=" << it->second << std::endl;
+        }
+        stream << std::endl;
+        stream << "Test initialization parameters: " << std::endl;
+        bool bSerializeRes = false;
+        try
+        {
+            bSerializeRes = this->toString(stream);
+        }
+        catch (...)
+        {
+        }
+        if (!bSerializeRes)
+        {
+            stream << "Couldn't retrieve object dump" << std::endl;
+        }
+        stream.flush();
+    }
+};
+#endif // _ncvtest_hpp_
--- a/tests/gpu/src/nvidia/NCVTestSourceProvider.hpp
+++ b/tests/gpu/src/nvidia/NCVTestSourceProvider.hpp
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _ncvtestsourceprovider_hpp_
+#define _ncvtestsourceprovider_hpp_
+#include <memory>
+#include "NCV.hpp"
+#include <opencv2/highgui/highgui.hpp>
+template <class T>
+class NCVTestSourceProvider
+{
+public:
+    NCVTestSourceProvider(Ncv32u seed, T rangeLow, T rangeHigh, Ncv32u maxWidth, Ncv32u maxHeight)
+        :
+        bInit(false)
+    {
+        ncvAssertPrintReturn(rangeLow < rangeHigh, "NCVTestSourceProvider ctor:: Invalid range", );
+        int devId;
+        cudaDeviceProp devProp;
+        ncvAssertPrintReturn(cudaSuccess == cudaGetDevice(&devId), "Error returned from cudaGetDevice", );
+        ncvAssertPrintReturn(cudaSuccess == cudaGetDeviceProperties(&devProp, devId), "Error returned from cudaGetDeviceProperties", );
+        //Ncv32u maxWpitch = alignUp(maxWidth * sizeof(T), devProp.textureAlignment);
+        allocatorCPU.reset(new NCVMemNativeAllocator(NCVMemoryTypeHostPinned, devProp.textureAlignment));
+        data.reset(new NCVMatrixAlloc<T>(*this->allocatorCPU.get(), maxWidth, maxHeight));
+        ncvAssertPrintReturn(data.get()->isMemAllocated(), "NCVTestSourceProvider ctor:: Matrix not allocated", );
+        this->dataWidth = maxWidth;
+        this->dataHeight = maxHeight;
+        srand(seed);
+        for (Ncv32u i=0; i<maxHeight; i++)
+        {
+            for (Ncv32u j=0; j<data.get()->stride(); j++)
+            {
+                data.get()->ptr()[i * data.get()->stride() + j] =
+                    (T)(((1.0 * rand()) / RAND_MAX) * (rangeHigh - rangeLow) + rangeLow);
+            }
+        }
+        this->bInit = true;
+    }
+    NCVTestSourceProvider(std::string pgmFilename)
+        :
+        bInit(false)
+    {
+        ncvAssertPrintReturn(sizeof(T) == 1, "NCVTestSourceProvider ctor:: PGM constructor complies only with 8bit types", );
+		cv::Mat image = cv::imread(pgmFilename);		
+		ncvAssertPrintReturn(!image.empty(), "NCVTestSourceProvider ctor:: PGM file error", );
+        int devId;
+        cudaDeviceProp devProp;
+        ncvAssertPrintReturn(cudaSuccess == cudaGetDevice(&devId), "Error returned from cudaGetDevice", );
+        ncvAssertPrintReturn(cudaSuccess == cudaGetDeviceProperties(&devProp, devId), "Error returned from cudaGetDeviceProperties", );
+        allocatorCPU.reset(new NCVMemNativeAllocator(NCVMemoryTypeHostPinned, devProp.textureAlignment));
+        data.reset(new NCVMatrixAlloc<T>(*this->allocatorCPU.get(), image.cols, image.rows));
+        ncvAssertPrintReturn(data.get()->isMemAllocated(), "NCVTestSourceProvider ctor:: Matrix not allocated", );
+        this->dataWidth = image.cols;
+        this->dataHeight = image.rows;
+		cv::Mat hdr(image.size(), CV_8UC1, data.get()->ptr(), data.get()->pitch());
+		image.copyTo(hdr);
+        this->bInit = true;
+    }
+    NcvBool fill(NCVMatrix<T> &dst)
+    {
+        ncvAssertReturn(this->isInit() &&
+                        dst.memType() == allocatorCPU.get()->memType(), false);
+        if (dst.width() == 0 || dst.height() == 0)
+        {
+            return true;
+        }
+        for (Ncv32u i=0; i<dst.height(); i++)
+        {
+            Ncv32u srcLine = i % this->dataHeight;
+            Ncv32u srcFullChunks = dst.width() / this->dataWidth;
+            for (Ncv32u j=0; j<srcFullChunks; j++)
+            {
+                memcpy(dst.ptr() + i * dst.stride() + j * this->dataWidth,
+                    this->data.get()->ptr() + this->data.get()->stride() * srcLine,
+                    this->dataWidth * sizeof(T));
+            }
+            Ncv32u srcLastChunk = dst.width() % this->dataWidth;
+            memcpy(dst.ptr() + i * dst.stride() + srcFullChunks * this->dataWidth,
+                this->data.get()->ptr() + this->data.get()->stride() * srcLine,
+                srcLastChunk * sizeof(T));
+        }
+        return true;
+    }
+    NcvBool fill(NCVVector<T> &dst)
+    {
+        ncvAssertReturn(this->isInit() &&
+                        dst.memType() == allocatorCPU.get()->memType(), false);
+        if (dst.length() == 0)
+        {
+            return true;
+        }
+        Ncv32u srcLen = this->dataWidth * this->dataHeight;
+        Ncv32u srcFullChunks = (Ncv32u)dst.length() / srcLen;
+        for (Ncv32u j=0; j<srcFullChunks; j++)
+        {
+            memcpy(dst.ptr() + j * srcLen, this->data.get()->ptr(), srcLen * sizeof(T));
+        }
+        Ncv32u srcLastChunk = dst.length() % srcLen;
+        memcpy(dst.ptr() + srcFullChunks * srcLen, this->data.get()->ptr(), srcLastChunk * sizeof(T));
+        return true;
+    }
+    ~NCVTestSourceProvider()
+    {
+        data.reset();
+        allocatorCPU.reset();
+    }
+private:
+    NcvBool isInit(void)
+    {
+        return this->bInit;
+    }
+    NcvBool bInit;
+    std::auto_ptr< INCVMemAllocator > allocatorCPU;
+    std::auto_ptr< NCVMatrixAlloc<T> > data;
+    Ncv32u dataWidth;
+    Ncv32u dataHeight;
+};
+#endif // _ncvtestsourceprovider_hpp_
--- a/tests/gpu/src/nvidia/TestCompact.cpp
+++ b/tests/gpu/src/nvidia/TestCompact.cpp
--- a/tests/gpu/src/nvidia/TestCompact.h
+++ b/tests/gpu/src/nvidia/TestCompact.h
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual 
+ * property and proprietary rights in and to this software and 
+ * related documentation and any modifications thereto.  
+ * Any use, reproduction, disclosure, or distribution of this 
+ * software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _testhypothesescompact_h_
+#define _testhypothesescompact_h_
+#include "NCVTest.hpp"
+#include "NCVTestSourceProvider.hpp"
+class TestCompact : public NCVTestProvider
+{
+public:
+    TestCompact(std::string testName, NCVTestSourceProvider<Ncv32u> &src,
+                          Ncv32u length, Ncv32u badElem, Ncv32u badElemPercentage);
+    virtual bool init();
+    virtual bool process();
+    virtual bool deinit();
+    virtual bool toString(std::ofstream &strOut);
+private:
+	TestCompact(const TestCompact&);
+	TestCompact& operator=(const TestCompact&);	
+    NCVTestSourceProvider<Ncv32u> &src;
+    Ncv32u length;
+    Ncv32u badElem;
+    Ncv32u badElemPercentage;
+};
+#endif // _testhypothesescompact_h_
--- a/tests/gpu/src/nvidia/TestDrawRects.cpp
+++ b/tests/gpu/src/nvidia/TestDrawRects.cpp
--- a/tests/gpu/src/nvidia/TestDrawRects.h
+++ b/tests/gpu/src/nvidia/TestDrawRects.h
--- a/tests/gpu/src/nvidia/TestHaarCascadeApplication.cpp
+++ b/tests/gpu/src/nvidia/TestHaarCascadeApplication.cpp
--- a/tests/gpu/src/nvidia/TestHaarCascadeApplication.h
+++ b/tests/gpu/src/nvidia/TestHaarCascadeApplication.h
--- a/tests/gpu/src/nvidia/TestHaarCascadeLoader.cpp
+++ b/tests/gpu/src/nvidia/TestHaarCascadeLoader.cpp
--- a/tests/gpu/src/nvidia/TestHaarCascadeLoader.h
+++ b/tests/gpu/src/nvidia/TestHaarCascadeLoader.h
--- a/tests/gpu/src/nvidia/TestHypothesesFilter.cpp
+++ b/tests/gpu/src/nvidia/TestHypothesesFilter.cpp
--- a/tests/gpu/src/nvidia/TestHypothesesFilter.h
+++ b/tests/gpu/src/nvidia/TestHypothesesFilter.h
--- a/tests/gpu/src/nvidia/TestHypothesesGrow.cpp
+++ b/tests/gpu/src/nvidia/TestHypothesesGrow.cpp
--- a/tests/gpu/src/nvidia/TestHypothesesGrow.h
+++ b/tests/gpu/src/nvidia/TestHypothesesGrow.h
--- a/tests/gpu/src/nvidia/TestIntegralImage.cpp
+++ b/tests/gpu/src/nvidia/TestIntegralImage.cpp
--- a/tests/gpu/src/nvidia/TestIntegralImage.h
+++ b/tests/gpu/src/nvidia/TestIntegralImage.h
--- a/tests/gpu/src/nvidia/TestIntegralImageSquared.cpp
+++ b/tests/gpu/src/nvidia/TestIntegralImageSquared.cpp
--- a/tests/gpu/src/nvidia/TestIntegralImageSquared.h
+++ b/tests/gpu/src/nvidia/TestIntegralImageSquared.h
--- a/tests/gpu/src/nvidia/TestRectStdDev.cpp
+++ b/tests/gpu/src/nvidia/TestRectStdDev.cpp
--- a/tests/gpu/src/nvidia/TestRectStdDev.h
+++ b/tests/gpu/src/nvidia/TestRectStdDev.h
--- a/tests/gpu/src/nvidia/TestResize.cpp
+++ b/tests/gpu/src/nvidia/TestResize.cpp
--- a/tests/gpu/src/nvidia/TestResize.h
+++ b/tests/gpu/src/nvidia/TestResize.h
--- a/tests/gpu/src/nvidia/TestTranspose.cpp
+++ b/tests/gpu/src/nvidia/TestTranspose.cpp
--- a/tests/gpu/src/nvidia/TestTranspose.h
+++ b/tests/gpu/src/nvidia/TestTranspose.h
--- a/tests/gpu/src/nvidia/main_nvidia.cpp
+++ b/tests/gpu/src/nvidia/main_nvidia.cpp
--- a/tests/gpu/src/nvidia_tests.cpp
+++ b/tests/gpu/src/nvidia_tests.cpp