Commit 1a941861 authored by Anatoly Baksheev's avatar Anatoly Baksheev

First version of CascadeClassifier_GPU.

Only for VS2008 for now.
Added a sample for it.
New NPP_staging, for VS2008 only.
parent 31e582e3
......@@ -188,14 +188,14 @@ struct NppStSize32u
enum NppStStatus
{
//already present in NPP
/* NPP_SUCCESS = 0, ///< Successful operation (same as NPP_NO_ERROR)
NPP_ERROR = -1, ///< Unknown error
NPP_CUDA_KERNEL_EXECUTION_ERROR = -3, ///< CUDA kernel execution error
NPP_NULL_POINTER_ERROR = -4, ///< NULL pointer argument error
NPP_TEXTURE_BIND_ERROR = -24, ///< CUDA texture binding error or non-zero offset returned
NPP_MEMCPY_ERROR = -13, ///< CUDA memory copy error
NPP_MEM_ALLOC_ERR = -12, ///< CUDA memory allocation error
NPP_MEMFREE_ERR = -15, ///< CUDA memory deallocation error*/
//NPP_SUCCESS = 0, ///< Successful operation (same as NPP_NO_ERROR)
//NPP_ERROR = -1, ///< Unknown error
//NPP_CUDA_KERNEL_EXECUTION_ERROR = -3, ///< CUDA kernel execution error
//NPP_NULL_POINTER_ERROR = -4, ///< NULL pointer argument error
//NPP_TEXTURE_BIND_ERROR = -24, ///< CUDA texture binding error or non-zero offset returned
//NPP_MEMCPY_ERROR = -13, ///< CUDA memory copy error
//NPP_MEM_ALLOC_ERR = -12, ///< CUDA memory allocation error
//NPP_MEMFREE_ERR = -15, ///< CUDA memory deallocation error
//to be added
NPP_INVALID_ROI, ///< Invalid region of interest argument
......@@ -244,7 +244,7 @@ extern "C" {
/** \defgroup core_npp NPP Core
* Basic functions for CUDA streams management.
* WARNING: These functions couldn't be exported from NPP_staging library, so they can't be used
* WARNING: These functions couldn't be exported into DLL, so they can be used only with static version of NPP_staging
* @{
*/
......@@ -569,6 +569,13 @@ NppStStatus nppiStTranspose_64f_C1R_host(NppSt64f *h_src, NppSt32u srcStride,
NppStStatus nppiStIntegralGetSize_8u32u(NppStSize32u roiSize, NppSt32u *pBufsize);
/**
* Calculates the size of the temporary buffer for integral image creation
* \see nppiStIntegralGetSize_8u32u
*/
NppStStatus nppiStIntegralGetSize_32f32f(NppStSize32u roiSize, NppSt32u *pBufsize);
/**
* Creates an integral image representation for the input image
*
......@@ -587,6 +594,15 @@ NppStStatus nppiStIntegral_8u32u_C1R(NppSt8u *d_src, NppSt32u srcStep,
NppSt8u *pBuffer, NppSt32u bufSize);
/**
* Creates an integral image representation for the input image
* \see nppiStIntegral_8u32u_C1R
*/
NppStStatus nppiStIntegral_32f32f_C1R(NppSt32f *d_src, NppSt32u srcStep,
NppSt32f *d_dst, NppSt32u dstStep, NppStSize32u roiSize,
NppSt8u *pBuffer, NppSt32u bufSize);
/**
* Creates an integral image representation for the input image. Host implementation
*
......@@ -602,6 +618,14 @@ NppStStatus nppiStIntegral_8u32u_C1R_host(NppSt8u *h_src, NppSt32u srcStep,
NppSt32u *h_dst, NppSt32u dstStep, NppStSize32u roiSize);
/**
* Creates an integral image representation for the input image. Host implementation
* \see nppiStIntegral_8u32u_C1R_host
*/
NppStStatus nppiStIntegral_32f32f_C1R_host(NppSt32f *h_src, NppSt32u srcStep,
NppSt32f *h_dst, NppSt32u dstStep, NppStSize32u roiSize);
/**
* Calculates the size of the temporary buffer for squared integral image creation
*
......
......@@ -35,6 +35,13 @@ source_group("Include" FILES ${lib_hdrs})
file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*")
source_group("Device" FILES ${lib_device_hdrs})
if (HAVE_CUDA AND MSVC)
file(GLOB ncv_srcs "src/nvidia/*.cpp")
file(GLOB ncv_hdrs "src/nvidia/*.h*")
file(GLOB ncv_cuda "src/nvidia/*.cu")
source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda})
endif()
if (HAVE_CUDA)
get_filename_component(_path_to_findnpp "${CMAKE_CURRENT_LIST_FILE}" PATH)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${_path_to_findnpp})
......@@ -68,19 +75,16 @@ if (HAVE_CUDA)
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
endif()
CUDA_COMPILE(cuda_objs ${lib_cuda})
include(FindNPP_staging.cmake)
include_directories(${NPPST_INC})
CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda})
#CUDA_BUILD_CLEAN_TARGET()
endif()
add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${cuda_objs})
IF (HAVE_CUDA)
include(FindNPP_staging.cmake)
include_directories(${NPPST_INC})
target_link_libraries(${the_target} ${NPPST_LIB})
endif()
add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
if(PCHSupport_FOUND)
set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/src/precomp.hpp)
......@@ -114,6 +118,7 @@ target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} )
if (HAVE_CUDA)
target_link_libraries(${the_target} ${CUDA_LIBRARIES} ${CUDA_NPP_LIBRARIES})
target_link_libraries(${the_target} ${NPPST_LIB})
CUDA_ADD_CUFFT_TO_TARGET(${the_target})
endif()
......
......@@ -1380,87 +1380,39 @@ namespace cv
explicit BruteForceMatcher_GPU(L2<T> /*d*/) : BruteForceMatcher_GPU_base(L2Dist) {}
};
////////////////////////////////// CascadeClassifier //////////////////////////////////////////
////////////////////////////////// CascadeClassifier_GPU //////////////////////////////////////////
// The cascade classifier class for object detection.
class CV_EXPORTS CascadeClassifier
class CV_EXPORTS CascadeClassifier_GPU
{
public:
struct CV_EXPORTS DTreeNode
{
int featureIdx;
float threshold; // for ordered features only
int left;
int right;
};
struct CV_EXPORTS DTree
{
int nodeCount;
};
struct CV_EXPORTS Stage
{
int first;
int ntrees;
float threshold;
};
enum { BOOST = 0 };
enum { DO_CANNY_PRUNING = 1, SCALE_IMAGE = 2,FIND_BIGGEST_OBJECT = 4, DO_ROUGH_SEARCH = 8 };
CascadeClassifier();
CascadeClassifier(const string& filename);
~CascadeClassifier();
public:
CascadeClassifier_GPU();
CascadeClassifier_GPU(const string& filename);
~CascadeClassifier_GPU();
bool empty() const;
bool load(const string& filename);
bool read(const FileNode& node);
void detectMultiScale( const Mat& image, vector<Rect>& objects, double scaleFactor=1.1,
int minNeighbors=3, int flags=0, Size minSize=Size(), Size maxSize=Size());
bool setImage( Ptr<FeatureEvaluator>&, const Mat& );
int runAt( Ptr<FeatureEvaluator>&, Point );
bool isStumpBased;
int stageType;
int featureType;
int ncategories;
Size origWinSize;
vector<Stage> stages;
vector<DTree> classifiers;
vector<DTreeNode> nodes;
vector<float> leaves;
vector<int> subsets;
void release();
/* returns number of detected objects */
int detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size());
bool findLargestObject;
bool visualizeInPlace;
Ptr<FeatureEvaluator> feval;
Ptr<CvHaarClassifierCascade> oldCascade;
Size getClassifierSize() const;
private:
struct CascadeClassifierImpl;
CascadeClassifierImpl* impl;
};
////////////////////////////////// SURF //////////////////////////////////////////
struct CV_EXPORTS SURFParams_GPU
{
SURFParams_GPU() :
threshold(0.1f),
nOctaves(4),
nIntervals(4),
initialScale(2.f),
l1(3.f/1.5f),
l2(5.f/1.5f),
l3(3.f/1.5f),
l4(1.f/1.5f),
edgeScale(0.81f),
initialStep(1),
extended(true),
featuresRatio(0.01f)
{
}
SURFParams_GPU() : threshold(0.1f), nOctaves(4), nIntervals(4), initialScale(2.f),
l1(3.f/1.5f), l2(5.f/1.5f), l3(3.f/1.5f), l4(1.f/1.5f),
edgeScale(0.81f), initialStep(1), extended(true), featuresRatio(0.01f) {}
//! The interest operator threshold
float threshold;
......
......@@ -170,8 +170,7 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst)
if (src.type() == CV_8UC1)
{
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz,
nppLut.ptr<Npp32s>(), lvls.pLevels, 256) );
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, nppLut.ptr<Npp32s>(), lvls.pLevels, 256) );
}
else
{
......@@ -186,8 +185,7 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst)
pValues3[1] = nppLut3[1].ptr<Npp32s>();
pValues3[2] = nppLut3[2].ptr<Npp32s>();
}
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz,
pValues3, lvls.pLevels3, lvls.nValues3) );
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, pValues3, lvls.pLevels3, lvls.nValues3) );
}
}
......
This diff is collapsed.
......@@ -62,7 +62,7 @@ namespace cv
BORDER_REPLICATE_GPU,
BORDER_CONSTANT_GPU
};
// Converts CPU border extrapolation mode into GPU internal analogue.
// Returns true if the GPU analogue exists, false otherwise.
bool tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType);
......@@ -105,8 +105,28 @@ namespace cv
const textureReference* tex;
cudaSafeCall( cudaGetTextureReference(&tex, name) );
cudaSafeCall( cudaUnbindTexture(tex) );
}
}
struct KeyPoint_GPU
{
float x;
float y;
float size;
float response;
float angle;
float octave;
};
enum KeypointLayout
{
SF_X,
SF_Y,
SF_SIZE,
SF_RESPONSE,
SF_ANGLE,
SF_OCTAVE,
SF_FEATURE_STRIDE
};
}
}
......
......@@ -47,29 +47,7 @@ namespace cv
{
namespace gpu
{
namespace surf
{
struct KeyPoint_GPU
{
float x;
float y;
float size;
float response;
float angle;
float octave;
};
enum KeypointLayout
{
SF_X,
SF_Y,
SF_SIZE,
SF_RESPONSE,
SF_ANGLE,
SF_OCTAVE,
SF_FEATURE_STRIDE
};
}
}
}
......
......@@ -82,21 +82,16 @@ void cv::gpu::max(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu()
namespace
{
typedef NppStatus (*npp_arithm_8u_t)(const Npp8u* pSrc1, int nSrc1Step, const Npp8u* pSrc2, int nSrc2Step, Npp8u* pDst, int nDstStep,
NppiSize oSizeROI, int nScaleFactor);
typedef NppStatus (*npp_arithm_32s_t)(const Npp32s* pSrc1, int nSrc1Step, const Npp32s* pSrc2, int nSrc2Step, Npp32s* pDst,
int nDstStep, NppiSize oSizeROI);
typedef NppStatus (*npp_arithm_32f_t)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pSrc2, int nSrc2Step, Npp32f* pDst,
int nDstStep, NppiSize oSizeROI);
typedef NppStatus (*npp_arithm_8u_t)(const Npp8u* pSrc1, int nSrc1Step, const Npp8u* pSrc2, int nSrc2Step, Npp8u* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);
typedef NppStatus (*npp_arithm_32s_t)(const Npp32s* pSrc1, int nSrc1Step, const Npp32s* pSrc2, int nSrc2Step, Npp32s* pDst, int nDstStep, NppiSize oSizeROI);
typedef NppStatus (*npp_arithm_32f_t)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pSrc2, int nSrc2Step, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);
void nppArithmCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst,
npp_arithm_8u_t npp_func_8uc1, npp_arithm_8u_t npp_func_8uc4,
npp_arithm_32s_t npp_func_32sc1, npp_arithm_32f_t npp_func_32fc1)
{
CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type());
CV_Assert(src1.type() == CV_8UC1 || src1.type() == CV_8UC4 || src1.type() == CV_32SC1 || src1.type() == CV_32FC1);
dst.create( src1.size(), src1.type() );
NppiSize sz;
......@@ -106,24 +101,16 @@ namespace
switch (src1.type())
{
case CV_8UC1:
nppSafeCall( npp_func_8uc1(src1.ptr<Npp8u>(), src1.step,
src2.ptr<Npp8u>(), src2.step,
dst.ptr<Npp8u>(), dst.step, sz, 0) );
nppSafeCall( npp_func_8uc1(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz, 0) );
break;
case CV_8UC4:
nppSafeCall( npp_func_8uc4(src1.ptr<Npp8u>(), src1.step,
src2.ptr<Npp8u>(), src2.step,
dst.ptr<Npp8u>(), dst.step, sz, 0) );
nppSafeCall( npp_func_8uc4(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz, 0) );
break;
case CV_32SC1:
nppSafeCall( npp_func_32sc1(src1.ptr<Npp32s>(), src1.step,
src2.ptr<Npp32s>(), src2.step,
dst.ptr<Npp32s>(), dst.step, sz) );
nppSafeCall( npp_func_32sc1(src1.ptr<Npp32s>(), src1.step, src2.ptr<Npp32s>(), src2.step, dst.ptr<Npp32s>(), dst.step, sz) );
break;
case CV_32FC1:
nppSafeCall( npp_func_32fc1(src1.ptr<Npp32f>(), src1.step,
src2.ptr<Npp32f>(), src2.step,
dst.ptr<Npp32f>(), dst.step, sz) );
nppSafeCall( npp_func_32fc1(src1.ptr<Npp32f>(), src1.step, src2.ptr<Npp32f>(), src2.step, dst.ptr<Npp32f>(), dst.step, sz) );
break;
default:
CV_Assert(!"Unsupported source type");
......@@ -133,16 +120,15 @@ namespace
template<int SCN> struct NppArithmScalarFunc;
template<> struct NppArithmScalarFunc<1>
{
typedef NppStatus (*func_ptr)(const Npp32f *pSrc, int nSrcStep, Npp32f nValue, Npp32f *pDst,
int nDstStep, NppiSize oSizeROI);
typedef NppStatus (*func_ptr)(const Npp32f *pSrc, int nSrcStep, Npp32f nValue, Npp32f *pDst, int nDstStep, NppiSize oSizeROI);
};
template<> struct NppArithmScalarFunc<2>
{
typedef NppStatus (*func_ptr)(const Npp32fc *pSrc, int nSrcStep, Npp32fc nValue, Npp32fc *pDst,
int nDstStep, NppiSize oSizeROI);
typedef NppStatus (*func_ptr)(const Npp32fc *pSrc, int nSrcStep, Npp32fc nValue, Npp32fc *pDst, int nDstStep, NppiSize oSizeROI);
};
template<int SCN, typename NppArithmScalarFunc<SCN>::func_ptr func> struct NppArithmScalar;
template<typename NppArithmScalarFunc<1>::func_ptr func> struct NppArithmScalar<1, func>
{
static void calc(const GpuMat& src, const Scalar& sc, GpuMat& dst)
......@@ -254,24 +240,16 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
switch (src1.type())
{
case CV_8UC1:
nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr<Npp8u>(), src1.step,
src2.ptr<Npp8u>(), src2.step,
dst.ptr<Npp8u>(), dst.step, sz) );
nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz) );
break;
case CV_8UC4:
nppSafeCall( nppiAbsDiff_8u_C4R(src1.ptr<Npp8u>(), src1.step,
src2.ptr<Npp8u>(), src2.step,
dst.ptr<Npp8u>(), dst.step, sz) );
nppSafeCall( nppiAbsDiff_8u_C4R(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz) );
break;
case CV_32SC1:
nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr<Npp32s>(), src1.step,
src2.ptr<Npp32s>(), src2.step,
dst.ptr<Npp32s>(), dst.step, sz) );
nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr<Npp32s>(), src1.step, src2.ptr<Npp32s>(), src2.step, dst.ptr<Npp32s>(), dst.step, sz) );
break;
case CV_32FC1:
nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr<Npp32f>(), src1.step,
src2.ptr<Npp32f>(), src2.step,
dst.ptr<Npp32f>(), dst.step, sz) );
nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr<Npp32f>(), src1.step, src2.ptr<Npp32f>(), src2.step, dst.ptr<Npp32f>(), dst.step, sz) );
break;
default:
CV_Assert(!"Unsupported source type");
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
////////////////////////////////////////////////////////////////////////////////
// The Loki Library
// Copyright (c) 2001 by Andrei Alexandrescu
// This code accompanies the book:
// Alexandrescu, Andrei. "Modern C++ Design: Generic Programming and Design
// Patterns Applied". Copyright (c) 2001. Addison-Wesley.
// Permission to use, copy, modify, distribute and sell this software for any
// purpose is hereby granted without fee, provided that the above copyright
// notice appear in all copies and that both that copyright notice and this
// permission notice appear in supporting documentation.
// The author or Addison-Welsey Longman make no representations about the
// suitability of this software for any purpose. It is provided "as is"
// without express or implied warranty.
// http://loki-lib.sourceforge.net/index.php?n=Main.License
////////////////////////////////////////////////////////////////////////////////
#ifndef _ncvruntimetemplates_hpp_
#define _ncvruntimetemplates_hpp_

#include <stdarg.h>
#include <vector>

namespace Loki
{
    //==============================================================================
    // class NullType
    // Used as a placeholder for "no type here"
    // Useful as an end marker in typelists
    //==============================================================================
    class NullType {};

    //==============================================================================
    // class template Typelist
    // The building block of typelists of any length
    // Defines nested types:
    //     Head (first element, a non-typelist type by convention)
    //     Tail (second element, can be another typelist)
    //==============================================================================
    template <class T, class U>
    struct Typelist
    {
        typedef T Head;
        typedef U Tail;
    };

    //==============================================================================
    // class template Int2Type
    // Converts each integral constant into a unique type
    // Invocation: Int2Type<v> where v is a compile-time constant integral
    // Defines 'value', an enum that evaluates to v
    //==============================================================================
    template <int v>
    struct Int2Type
    {
        enum { value = v };
    };

    namespace TL
    {
        //==============================================================================
        // class template TypeAt
        // Finds the type at a given index in a typelist
        // Invocation (TList is a typelist and index is a compile-time integral
        // constant):
        //     TypeAt<TList, index>::Result
        // returns the type in position 'index' in TList
        // If you pass an out-of-bounds index, the result is a compile-time error
        //==============================================================================
        template <class TList, unsigned int index> struct TypeAt;

        // Index 0: the head of the list is the answer.
        template <class Head, class Tail>
        struct TypeAt<Typelist<Head, Tail>, 0>
        {
            typedef Head Result;
        };

        // Index i > 0: recurse into the tail with index i - 1.
        template <class Head, class Tail, unsigned int i>
        struct TypeAt<Typelist<Head, Tail>, i>
        {
            typedef typename TypeAt<Tail, i - 1>::Result Result;
        };
    }
}


////////////////////////////////////////////////////////////////////////////////
// Runtime boolean template instance dispatcher
// Cyril Crassin <cyril.crassin@icare3d.org>
// NVIDIA, 2010
////////////////////////////////////////////////////////////////////////////////

namespace NCVRuntimeTemplateBool
{
    // KernelCaller turns a list of runtime boolean values into compile-time
    // template arguments: it builds a typelist of Loki::Int2Type<...> types
    // (one per runtime value) and hands the finished typelist to the user's
    // functor via functor.call(TList()).
    //
    // Note that both the 'true' and 'false' branches are instantiated for
    // every argument, so all 2^NumArguments functor specializations are
    // compiled.
    //
    // The first argument passed becomes the head of the resulting typelist,
    // i.e. arguments appear in the typelist in the order they were passed.
    template<typename TList, int NumArguments, class Func>
    struct KernelCaller
    {
        // Convenience entry point used by callers: takes NumArguments ints
        // (treated as booleans) as variadic arguments after 'dummy', which
        // exists only to anchor va_start.
        static void call(Func &functor, int dummy, ...)
        {
            // Collect the variadic arguments into a vector.
            std::vector<int> templateParamList;

            va_list listPointer;
            va_start(listPointer, dummy);

            for(int i=0; i<NumArguments; i++)
            {
                int val = va_arg(listPointer, int);
                templateParamList.push_back(val);
            }
            va_end(listPointer);

            // Delegate to the recursive typelist builder.
            call(functor, templateParamList);
        }

        // Recursively consumes values from the back of the vector, selecting
        // the Int2Type<true> or Int2Type<false> branch for each one and
        // prepending it to the typelist.
        static void call( Func &functor, std::vector<int> &templateParamList)
        {
            // Take the last remaining value.
            int val = templateParamList.back();
            templateParamList.pop_back();

            // No 'typename' here: Loki::Int2Type<...> is a non-dependent
            // type, and the extra keyword was non-portable (ill-formed in
            // pre-C++11 dialects on strict compilers).
            if (val)
            {
                KernelCaller<
                    Loki::Typelist<Loki::Int2Type<true>, TList >,
                    NumArguments - 1, Func >
                    ::call(functor, templateParamList);
            }
            else
            {
                KernelCaller<
                    Loki::Typelist<Loki::Int2Type<false>, TList >,
                    NumArguments - 1, Func >
                    ::call(functor, templateParamList);
            }
        }
    };

    // Specialization for zero values left in the list: the typelist is
    // complete, so invoke the functor's kernel-call method.
    template<class TList, class Func>
    struct KernelCaller<TList, 0, Func>
    {
        static void call(Func &functor)
        {
            // TList is instantiated only to resolve the method's template
            // parameter; it carries no runtime state.
            functor.call(TList());
        }

        static void call(Func &functor, std::vector<int> & /*templateParams*/)
        {
            functor.call(TList());
        }
    };
}

#endif //_ncvruntimetemplates_hpp_
......@@ -71,6 +71,9 @@
#include "npp_staging.h"
#include "surf_key_point.h"
#include "nvidia/NCV.hpp"
#include "nvidia/NCVHaarObjectDetection.hpp"
#define CUDART_MINIMUM_REQUIRED_VERSION 3020
#define NPP_MINIMUM_REQUIRED_VERSION 3216
......
// WARNING: this sample is under construction! Use it on your own risk.
#include <opencv2/contrib/contrib.hpp>
#include <opencv2/objdetect/objdetect.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/gpu/gpu.hpp>
#include <iostream>
#include <iomanip>
#include <stdio.h>
using namespace std;
using namespace cv;
using namespace cv::gpu;
// Prints command-line usage for this sample, including the OpenCV
// version it was built against.
void help()
{
    cout << "Usage: ./cascadeclassifier <cascade_file> <image_or_video_or_cameraid>\n"
         << "Using OpenCV version " << CV_VERSION;
    cout << endl << endl;
}
void DetectAndDraw(Mat& img, CascadeClassifier_GPU& cascade);
String cascadeName = "../../data/haarcascades/haarcascade_frontalface_alt.xml";
String nestedCascadeName = "../../data/haarcascades/haarcascade_eye_tree_eyeglasses.xml";
template<class T> void convertAndReseize(const T& src, T& gray, T& resized, double scale = 2.0)
{
if (src.channels() == 3)
cvtColor( src, gray, CV_BGR2GRAY );
else
gray = src;
Size sz(cvRound(gray.cols * scale), cvRound(gray.rows * scale));
if (scale != 1)
resize(gray, resized, sz);
else
resized = gray;
}
// Demo entry point: runs Haar-cascade face detection on a still image, a
// video file, or a camera stream, toggling at runtime between the GPU
// (CascadeClassifier_GPU) and CPU (cv::CascadeClassifier) implementations.
//   argv[1] - path to the cascade XML file
//   argv[2] - image/video path, or a numeric camera id
// Returns 0 on normal exit, -1 on bad arguments or load failure.
int main( int argc, const char** argv )
{
    if (argc != 3)
        return help(), -1;

    if (cv::gpu::getCudaEnabledDeviceCount() == 0)
        return cerr << "No GPU found or the library is compiled without GPU support" << endl, -1;

    VideoCapture capture;

    string cascadeName = argv[1];
    string inputName = argv[2];

    // Load the same cascade into both classifiers so the two code paths
    // can be compared side by side.
    cv::gpu::CascadeClassifier_GPU cascade_gpu;
    if( !cascade_gpu.load( cascadeName ) )
        return cerr << "ERROR: Could not load cascade classifier \"" << cascadeName << "\"" << endl, help(), -1;

    cv::CascadeClassifier cascade_cpu;
    if( !cascade_cpu.load( cascadeName ) )
        return cerr << "ERROR: Could not load cascade classifier \"" << cascadeName << "\"" << endl, help(), -1;

    // Try the input as a still image first, then as a video file, and
    // finally as a numeric camera id.
    Mat image = imread( inputName);
    if( image.empty() )
        if (!capture.open(inputName))
        {
            int camid = 0;
            sscanf(inputName.c_str(), "%d", &camid);
            if(!capture.open(camid))
                cout << "Can't open source" << endl;
        }

    namedWindow( "result", 1 );
    Size fontSz = cv::getTextSize("T[]", FONT_HERSHEY_SIMPLEX, 1.0, 2, 0);

    Mat frame, frame_cpu, gray_cpu, resized_cpu, faces_downloaded, frameDisp;
    vector<Rect> facesBuf_cpu;

    GpuMat frame_gpu, gray_gpu, resized_gpu, facesBuf_gpu;

    /* parameters */
    bool useGPU = true;
    double scale_factor = 2;

    bool visualizeInPlace = false;
    bool findLargestObject = false;

    printf("\t<space> - toggle GPU/CPU\n");
    printf("\tL - toggle largest faces\n"); // fixed typo: was "lagest"
    printf("\tV - toggle visualisation in-place (for GPU only)\n");
    printf("\t1/q - inc/dec scale\n");

    int detections_num;
    for(;;)
    {
        if( capture.isOpened() )
        {
            capture >> frame;
            if( frame.empty())
                break;
        }

        // Work on the still image if one was loaded, otherwise on the
        // current video frame; keep a CPU and a GPU copy.
        (image.empty() ? frame : image).copyTo(frame_cpu);
        frame_gpu.upload( image.empty() ? frame : image);

        convertAndReseize(frame_gpu, gray_gpu, resized_gpu, scale_factor);
        convertAndReseize(frame_cpu, gray_cpu, resized_cpu, scale_factor);

        cv::TickMeter tm;
        tm.start();

        if (useGPU)
        {
            cascade_gpu.visualizeInPlace = visualizeInPlace;
            cascade_gpu.findLargestObject = findLargestObject;

            detections_num = cascade_gpu.detectMultiScale( resized_gpu, facesBuf_gpu );
            // Only the first detections_num entries of the buffer are valid.
            facesBuf_gpu.colRange(0, detections_num).download(faces_downloaded);
        }
        else /* so use CPU */
        {
            Size minSize = cascade_gpu.getClassifierSize();
            if (findLargestObject)
            {
                // Skip the smallest scales when only the biggest face is
                // wanted, to roughly match the GPU path's behavior.
                float ratio = (float)std::min(frame.cols / minSize.width, frame.rows / minSize.height);
                ratio = std::max(ratio / 2.5f, 1.f);
                minSize = Size(cvRound(minSize.width * ratio), cvRound(minSize.height * ratio));
            }

            cascade_cpu.detectMultiScale(resized_cpu, facesBuf_cpu, 1.2, 4, (findLargestObject ? CV_HAAR_FIND_BIGGEST_OBJECT : 0) | CV_HAAR_SCALE_IMAGE, minSize);
            detections_num = (int)facesBuf_cpu.size();
        }

        tm.stop();
        printf( "detection time = %g ms\n", tm.getTimeMilli() );

        if (useGPU)
            resized_gpu.download(resized_cpu);

        // Draw the detection rectangles manually unless the GPU has
        // already drawn them in-place on the frame.
        if (!visualizeInPlace || !useGPU)
        {
            if (detections_num)
            {
                Rect* faces = useGPU ? faces_downloaded.ptr<Rect>() : &facesBuf_cpu[0];
                for(int i = 0; i < detections_num; ++i)
                    cv::rectangle(resized_cpu, faces[i], Scalar(255));
            }
        }

        // Overlay status text: FPS, scale, and any active modes.
        Point text_pos(5, 25);
        int offs = fontSz.height + 5;
        Scalar color = CV_RGB(255, 0, 0);

        cv::cvtColor(resized_cpu, frameDisp, CV_GRAY2BGR);

        char buf[4096];
        sprintf(buf, "%s, FPS = %0.3g", useGPU ? "GPU" : "CPU", 1.0/tm.getTimeSec());
        putText(frameDisp, buf, text_pos, FONT_HERSHEY_SIMPLEX, 1.0, color, 2);

        sprintf(buf, "scale = %0.3g, [%d*scale x %d*scale]", scale_factor, frame.cols, frame.rows);
        putText(frameDisp, buf, text_pos+=Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2);

        putText(frameDisp, "Hotkeys: space, 1, Q, L, V, Esc", text_pos+=Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2);

        if (findLargestObject)
            putText(frameDisp, "FindLargestObject", text_pos+=Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2);

        if (visualizeInPlace && useGPU)
            putText(frameDisp, "VisualizeInPlace", text_pos+Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2);

        cv::imshow( "result", frameDisp);

        // Keyboard handling: Esc quits, other keys toggle modes/scale.
        int key = waitKey( 5 );
        if( key == 27)
            break;

        switch (key)
        {
        case (int)' ': useGPU = !useGPU; printf("Using %s\n", useGPU ? "GPU" : "CPU");break;
        case (int)'v': case (int)'V': visualizeInPlace = !visualizeInPlace; printf("VisualizeInPlace = %d\n", visualizeInPlace); break;
        case (int)'l': case (int)'L': findLargestObject = !findLargestObject; printf("FindLargestObject = %d\n", findLargestObject); break;
        case (int)'1': scale_factor*=1.05; printf("Scale factor = %g\n", scale_factor); break;
        case (int)'q': case (int)'Q':scale_factor/=1.05; printf("Scale factor = %g\n", scale_factor); break;
        }
    }

    return 0;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment