Merge cuda-geek/soft-cascade-gpu into cuda-dev

209f1645 · marina.kolpakova · 1712d093 · 05cd88ae · 209f1645 · 209f1645
Commit 209f1645 authored Nov 26, 2012 by marina.kolpakova
16 changed files
--- a/modules/gpu/doc/object_detection.rst
+++ b/modules/gpu/doc/object_detection.rst
@@ -199,6 +199,121 @@ Returns block descriptors computed for the whole image.
 The function is mainly used to learn the classifier.


+Soft Cascade Classifier
+==========================
+
+Soft Cascade Classifier for Object Detection
+----------------------------------------------------------
+
+Cascade detectors have been shown to operate extremely rapidly, with high accuracy, and have important applications in different spheres. The initial goal for this cascade implementation was a fast and accurate pedestrian detector, but it is also useful in general. The soft cascade is trained with AdaBoost, but instead of training a sequence of stages, the soft cascade is trained as one long stage of T weak classifiers. The soft cascade is formulated as follows:
+
+.. math::
+    \texttt{H}(x) = \sum _{\texttt{t}=1..\texttt{T}} {\texttt{s}_t(x)}
+
+where :math:`\texttt{s}_t(x) = \alpha_t\texttt{h}_t(x)` are the set of thresholded weak classifiers selected during AdaBoost training scaled by the associated weights. Let
+
+.. math::
+    \texttt{H}_t(x) = \sum _{\texttt{i}=1..\texttt{t}} {\texttt{s}_i(x)}
+
+be the partial sum of the sample responses accumulated over the first :math:`t` weak classifiers. The function :math:`\texttt{H}_t(x)` of :math:`t` for sample :math:`x` is named the *sample trace*.
+After each weak classifier evaluation, the sample trace at the point :math:`t` is compared with the rejection threshold :math:`r_t`. The sequence of :math:`r_t` is named the *rejection trace*.
+
+The sample is rejected if its trace falls below the rejection threshold. The stageless cascade thus allows a non-object sample to be rejected as soon as possible. Another meaning of the sample trace is the confidence with which the sample is recognized as the desired object. At each :math:`t` that confidence depends on all previous weak classifiers. This feature of the soft cascade results in more accurate detection. The original formulation of the soft cascade can be found in [BJ05]_.
+
+.. [BJ05] Lubomir Bourdev and Jonathan Brandt. Robust Object Detection Via Soft Cascade. IEEE CVPR, 2005.
+.. [BMTG12] Rodrigo Benenson, Markus Mathias, Radu Timofte and Luc Van Gool. Pedestrian detection at 100 frames per second. IEEE CVPR, 2012.
+
+
+SCascade
+----------------
+.. ocv:class:: SCascade : public Algorithm
+
+Implementation of soft (stageless) cascaded detector. ::
+
+    class CV_EXPORTS SCascade : public Algorithm
+    {
+        struct CV_EXPORTS Detection
+        {
+              ushort x;
+              ushort y;
+              ushort w;
+              ushort h;
+              float confidence;
+              int kind;
+
+              enum {PEDESTRIAN = 0};
+        };
+
+        SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejfactor = 1);
+        virtual ~SCascade();
+        virtual bool load(const FileNode& fn);
+        virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
+        virtual void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;
+    };
+
+
+SCascade::SCascade
+--------------------------
+An empty cascade will be created.
+
+.. ocv:function:: SCascade::SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejfactor = 1)
+
+    :param minScale: a minimum scale relative to the original size of the image on which the cascade will be applied.
+
+    :param maxScale: a maximum scale relative to the original size of the image on which the cascade will be applied.
+
+    :param scales: a number of scales from minScale to maxScale.
+
+    :param rejfactor: used for non maximum suppression.
+
+
+
+SCascade::~SCascade
+---------------------------
+Destructor for SCascade.
+
+.. ocv:function:: SCascade::~SCascade()
+
+
+
+SCascade::load
+--------------------------
+Load cascade from FileNode.
+
+.. ocv:function:: bool SCascade::load(const FileNode& fn)
+
+    :param fn: File node from which the soft cascade is read.
+
+
+
+SCascade::detect
+--------------------------
+Apply the cascade to an input frame and return the vector of Detection objects.
+
+.. ocv:function:: void SCascade::detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const
+
+    :param image: a frame on which detector will be applied.
+
+    :param rois: a regions-of-interest mask generated by genRoi. Only the objects that fall into one of the regions will be returned.
+
+    :param objects: an output array of Detections represented as GpuMat of detections (SCascade::Detection). The first element of the matrix is actually a count of detections.
+
+    :param stream: a high-level CUDA stream abstraction used for asynchronous execution.
+
+
+SCascade::genRoi
+--------------------------
+Convert an ROI matrix into the form suitable for the detect method.
+
+.. ocv:function:: void SCascade::genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const
+
+    :param roi: an input matrix of the same size as the image. A non-zero value means that the detector should be executed at that point.
+
+    :param mask: an output mask
+
+    :param stream: a high-level CUDA stream abstraction used for asynchronous execution.
+
+

 gpu::CascadeClassifier_GPU
 --------------------------

--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1532,6 +1532,76 @@ public:
    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
 };

+// ======================== GPU version for soft cascade ===================== //
+
+// Implementation of soft (stageless) cascaded detector.
+class CV_EXPORTS SCascade : public Algorithm
+{
+public:
+
+    // Representation of a single detector result.
+    struct CV_EXPORTS Detection
+    {
+        ushort x;
+        ushort y;
+        ushort w;
+        ushort h;
+        float confidence;
+        int kind;
+
+        enum {PEDESTRIAN = 0};
+    };
+
+    enum { NO_REJECT = 1, DOLLAR = 2, /*PASCAL = 4,*/ DEFAULT = NO_REJECT};
+
+    // An empty cascade will be created.
+    // Param minScale is a minimum scale relative to the original size of the image on which the cascade will be applied.
+    // Param maxScale is a maximum scale relative to the original size of the image on which the cascade will be applied.
+    // Param scales is a number of scales from minScale to maxScale.
+    // Param rejCriteria is used for NMS.
+    SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejCriteria = 1);
+
+    virtual ~SCascade();
+
+    cv::AlgorithmInfo* info() const;
+
+    // Load cascade from FileNode.
+    // Param fn is a root node for cascade. Should be <cascade>.
+    virtual bool load(const FileNode& fn);
+
+    // Load cascade config.
+    virtual void read(const FileNode& fn);
+
+    // Return the matrix of detected objects.
+    // Param image is a frame on which the detector will be applied.
+    // Param rois is a regions-of-interest mask generated by genRoi.
+    //    Only the objects that fall into one of the regions will be returned.
+    // Param objects is an output array of Detections represented as GpuMat of detections (SCascade::Detection)
+    //    The first element of the matrix is actually a count of detections.
+    // Param stream is a high-level CUDA stream abstraction used for asynchronous execution
+    virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
+
+    // Convert an ROI matrix into the form suitable for the detect method.
+    // Param roi is an input matrix of the same size as the image.
+    //    A non-zero value means that the detector should be executed at that point.
+    // Param mask is an output mask
+    // Param stream is a high-level CUDA stream abstraction used for asynchronous execution
+    virtual void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;
+
+private:
+
+    struct Fields;
+    Fields* fields;
+
+    double minScale;
+    double maxScale;
+
+    int scales;
+    int rejCriteria;
+};
+
+CV_EXPORTS bool initModule_gpu(void);
+
 ////////////////////////////////// SURF //////////////////////////////////////////

 class CV_EXPORTS SURF_GPU

--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@@ -89,7 +89,6 @@ PERF_TEST_P(HOG, CalTech, Values<string>("gpu/caltech/image_00000009_0.png", "gp
    SANITY_CHECK(found_locations);
 }

-
 ///////////////////////////////////////////////////////////////
 // HaarClassifier

@@ -181,4 +180,4 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
    }
 }

-} // namespace
+} // namespace
\ No newline at end of file
--- a/modules/gpu/perf/perf_softcascade.cpp
+++ b/modules/gpu/perf/perf_softcascade.cpp
+#include "perf_precomp.hpp"
+
+#define GPU_PERF_TEST_P(fixture, name, params)  \
+    class fixture##_##name : public fixture {\
+     public:\
+      fixture##_##name() {}\
+     protected:\
+        virtual void __cpu();\
+        virtual void __gpu();\
+      virtual void PerfTestBody();\
+    };\
+    TEST_P(fixture##_##name, name /*perf*/){ RunPerfTestBody(); if (PERF_RUN_GPU()) __gpu(); else __cpu();}\
+    INSTANTIATE_TEST_CASE_P(/*none*/, fixture##_##name, params);\
+    void fixture##_##name::PerfTestBody()
+
+#define RUN_CPU(fixture, name)\
+    void fixture##_##name::__cpu()
+
+#define RUN_GPU(fixture, name)\
+    void fixture##_##name::__gpu()
+
+#define NO_CPU(fixture, name)\
+void fixture##_##name::__cpu() { FAIL() << "No such CPU implementation analogy";}
+
+namespace {
+    struct DetectionLess
+    {
+        bool operator()(const cv::gpu::SCascade::Detection& a,
+            const cv::gpu::SCascade::Detection& b) const
+        {
+            if (a.x != b.x) return a.x < b.x; // lexicographic order on (x, y, w, h)
+            else if (a.y != b.y) return a.y < b.y;
+            else if (a.w != b.w) return a.w < b.w;
+            else return a.h < b.h; // confidence and kind do not take part in the ordering
+        }
+    };
+
+    cv::Mat sortDetections(cv::gpu::GpuMat& objects)
+    {
+        cv::Mat detections(objects); // host-side cv::Mat built from the GpuMat
+
+        typedef cv::gpu::SCascade::Detection Detection;
+        Detection* begin = (Detection*)(detections.ptr<char>(0)); // row 0 reinterpreted as Detection records
+        Detection* end = (Detection*)(detections.ptr<char>(0) + detections.cols); // cols is added to a char pointer, i.e. it is used as a byte count
+        std::sort(begin, end, DetectionLess()); // NOTE(review): sorts the whole row, including the leading record the header describes as a detection count - confirm this is intended
+
+        return detections;
+    }
+}
+
+
+typedef std::tr1::tuple<std::string, std::string> fixture_t;
+typedef perf::TestBaseWithParam<fixture_t> SCascadeTest;
+
+GPU_PERF_TEST_P(SCascadeTest, detect,
+    testing::Combine(
+        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png"))))
+{ }
+
+RUN_GPU(SCascadeTest, detect)
+{
+    cv::Mat cpu = readImage (GET_PARAM(1));
+    ASSERT_FALSE(cpu.empty());
+    cv::gpu::GpuMat colored(cpu);
+
+    cv::gpu::SCascade cascade;
+
+    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
+
+    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1), trois;
+    rois.setTo(1);
+    cascade.genRoi(rois, trois);
+
+    cascade.detect(colored, trois, objectBoxes);
+
+    TEST_CYCLE()
+    {
+        cascade.detect(colored, trois, objectBoxes);
+    }
+
+    SANITY_CHECK(sortDetections(objectBoxes));
+}
+
+NO_CPU(SCascadeTest, detect)
+
+static cv::Rect getFromTable(int idx)
+{
+    static const cv::Rect rois[] = // ten fixed rectangles; every coordinate is written as <value> * 4
+    {
+        cv::Rect( 65 * 4,  20 * 4,  35 * 4, 80 * 4),
+        cv::Rect( 95 * 4,  35 * 4,  45 * 4, 40 * 4),
+        cv::Rect( 45 * 4,  35 * 4,  45 * 4, 40 * 4),
+        cv::Rect( 25 * 4,  27 * 4,  50 * 4, 45 * 4),
+        cv::Rect(100 * 4,  50 * 4,  45 * 4, 40 * 4),
+
+        cv::Rect( 60 * 4,  30 * 4,  45 * 4, 40 * 4),
+        cv::Rect( 40 * 4,  55 * 4,  50 * 4, 40 * 4),
+        cv::Rect( 48 * 4,  37 * 4,  72 * 4, 80 * 4),
+        cv::Rect( 48 * 4,  32 * 4,  85 * 4, 58 * 4),
+        cv::Rect( 48 * 4,   0 * 4,  32 * 4, 27 * 4)
+    };
+
+    return rois[idx]; // no bounds check: the caller must pass 0 <= idx < 10
+}
+
+typedef std::tr1::tuple<std::string, std::string, int> roi_fixture_t;
+typedef perf::TestBaseWithParam<roi_fixture_t> SCascadeTestRoi;
+
+GPU_PERF_TEST_P(SCascadeTestRoi, detectInRoi,
+    testing::Combine(
+        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")),
+        testing::Range(0, 5)))
+{}
+
+RUN_GPU(SCascadeTestRoi, detectInRoi)
+{
+    cv::Mat cpu = readImage (GET_PARAM(1));
+    ASSERT_FALSE(cpu.empty());
+    cv::gpu::GpuMat colored(cpu);
+
+    cv::gpu::SCascade cascade;
+
+    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
+
+    cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1);
+    rois.setTo(0);
+
+    int nroi = GET_PARAM(2);
+    cv::RNG rng;
+    for (int i = 0; i < nroi; ++i)
+    {
+        cv::Rect r = getFromTable(rng(10));
+        cv::gpu::GpuMat sub(rois, r);
+        sub.setTo(1);
+    }
+
+    cv::gpu::GpuMat trois;
+    cascade.genRoi(rois, trois);
+
+    cascade.detect(colored, trois, objectBoxes);
+
+    TEST_CYCLE()
+    {
+        cascade.detect(colored, trois, objectBoxes);
+    }
+
+    SANITY_CHECK(sortDetections(objectBoxes));
+}
+
+NO_CPU(SCascadeTestRoi, detectInRoi)
+
+
+GPU_PERF_TEST_P(SCascadeTestRoi, detectEachRoi,
+    testing::Combine(
+        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")),
+        testing::Range(0, 10)))
+{}
+
+RUN_GPU(SCascadeTestRoi, detectEachRoi)
+{
+    cv::Mat cpu = readImage (GET_PARAM(1));
+    ASSERT_FALSE(cpu.empty());
+    cv::gpu::GpuMat colored(cpu);
+
+    cv::gpu::SCascade cascade;
+
+    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
+
+    cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1);
+    rois.setTo(0);
+
+    int idx = GET_PARAM(2);
+    cv::Rect r = getFromTable(idx);
+    cv::gpu::GpuMat sub(rois, r);
+    sub.setTo(1);
+
+    cv::gpu::GpuMat trois;
+    cascade.genRoi(rois, trois);
+
+    cascade.detect(colored, trois, objectBoxes);
+
+    TEST_CYCLE()
+    {
+        cascade.detect(colored, trois, objectBoxes);
+    }
+
+    SANITY_CHECK(sortDetections(objectBoxes));
+}
+
+NO_CPU(SCascadeTestRoi, detectEachRoi)
+
+GPU_PERF_TEST_P(SCascadeTest, detectOnIntegral,
+    testing::Combine(
+        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+        testing::Values(std::string("cv/cascadeandhog/integrals.xml"))))
+{ }
+
+    static std::string itoa(long i) // decimal string form of i
+    {
+        static char s[65]; // function-local static buffer shared by every call, so the function is not reentrant
+        sprintf(s, "%ld", i);
+        return std::string(s); // the text is copied out, so the result does not alias the buffer
+    }
+
+RUN_GPU(SCascadeTest, detectOnIntegral)
+{
+    cv::FileStorage fsi(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ);
+    ASSERT_TRUE(fsi.isOpened());
+
+    cv::gpu::GpuMat hogluv(121 * 10, 161, CV_32SC1);
+    for (int i = 0; i < 10; ++i)
+    {
+        cv::Mat channel;
+        fsi[std::string("channel") + itoa(i)] >> channel;
+        cv::gpu::GpuMat gchannel(hogluv, cv::Rect(0, 121 * i, 161, 121));
+        gchannel.upload(channel);
+    }
+
+    cv::gpu::SCascade cascade;
+
+    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
+
+    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(cv::Size(640, 480), CV_8UC1), trois;
+    rois.setTo(1);
+    cascade.genRoi(rois, trois);
+
+    cascade.detect(hogluv, trois, objectBoxes);
+
+    TEST_CYCLE()
+    {
+        cascade.detect(hogluv, trois, objectBoxes);
+    }
+
+    SANITY_CHECK(sortDetections(objectBoxes));
+}
+
+NO_CPU(SCascadeTest, detectOnIntegral)
+
+GPU_PERF_TEST_P(SCascadeTest, detectStream,
+    testing::Combine(
+        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png"))))
+{ }
+
+RUN_GPU(SCascadeTest, detectStream)
+{
+    cv::Mat cpu = readImage (GET_PARAM(1));
+    ASSERT_FALSE(cpu.empty());
+    cv::gpu::GpuMat colored(cpu);
+
+    cv::gpu::SCascade cascade;
+
+    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
+
+    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1), trois;
+    rois.setTo(1);
+
+    cv::gpu::Stream s;
+
+    cascade.genRoi(rois, trois, s);
+
+    cascade.detect(colored, trois, objectBoxes, s);
+
+    TEST_CYCLE()
+    {
+        cascade.detect(colored, trois, objectBoxes, s);
+    }
+
+    cudaDeviceSynchronize();
+
+    SANITY_CHECK(sortDetections(objectBoxes));
+}
+
+NO_CPU(SCascadeTest, detectStream)
\ No newline at end of file
--- a/modules/gpu/src/cuda/bf_knnmatch.cu
+++ b/modules/gpu/src/cuda/bf_knnmatch.cu
@@ -1034,7 +1034,7 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream)
+        void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int /*cc*/, cudaStream_t stream)
        {
            findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
        }

--- a/modules/gpu/src/cuda/icf-sc.cu
+++ b/modules/gpu/src/cuda/icf-sc.cu
--- a/modules/gpu/src/cuda/integral_image.cu
+++ b/modules/gpu/src/cuda/integral_image.cu
@@ -383,6 +383,88 @@ namespace cv { namespace gpu { namespace device
            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
+
+        __global__ void shfl_integral_vertical(PtrStepSz<unsigned int> buffer, PtrStepSz<unsigned int> integral)
+        {
+        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300) // body is compiled only for device code with __CUDA_ARCH__ >= 300; otherwise the kernel is empty
+            __shared__ unsigned int sums[32][9]; // indexed [threadIdx.x][threadIdx.y] below; presumably padded to 9 for the 32x8 launch - confirm
+
+            const int tidx = blockIdx.x * blockDim.x + threadIdx.x; // column handled by this thread
+            const int lane_id = tidx % 8;
+
+            if (tidx >= integral.cols)
+                return; // NOTE(review): threads leaving here skip every __syncthreads() below - confirm this is safe when cols is not a multiple of blockDim.x
+
+            sums[threadIdx.x][threadIdx.y] = 0;
+            __syncthreads();
+
+            unsigned int stepSum = 0; // running column total carried from one batch of blockDim.y rows to the next
+
+            for (int y = threadIdx.y; y < integral.rows; y += blockDim.y) // NOTE(review): the body contains barriers, yet the trip count depends on threadIdx.y - confirm rows is a multiple of blockDim.y
+            {
+                unsigned int* p = buffer.ptr(y) + tidx;
+                unsigned int* dst = integral.ptr(y + 1) + tidx + 1; // output is shifted by one row and one column; assumes integral has storage for row `rows` and column `cols` - confirm against the caller
+
+                unsigned int sum = *p;
+
+                sums[threadIdx.x][threadIdx.y] = sum;
+                __syncthreads();
+
+                // place into SMEM
+                // shfl scan reduce the SMEM, reformatting so the column
+                // sums are computed in a warp
+                // then read out properly
+                const int j = threadIdx.x % 8;
+                const int k = threadIdx.x / 8 + threadIdx.y * 4; // (k, j) re-maps this thread onto a different element of sums than the one it wrote
+
+                int partial_sum = sums[k][j];
+
+                for (int i = 1; i <= 8; i *= 2) // shuffle distances 1, 2, 4, 8
+                {
+                    int n = __shfl_up(partial_sum, i, 32);
+
+                    if (lane_id >= i) // lane_id is at most 7, so the i == 8 step never adds
+                        partial_sum += n;
+                }
+
+                sums[k][j] = partial_sum;
+                __syncthreads();
+
+                if (threadIdx.y > 0)
+                    sum += sums[threadIdx.x][threadIdx.y - 1]; // add the scanned value of the row above within this batch
+
+                sum += stepSum;
+                stepSum += sums[threadIdx.x][blockDim.y - 1]; // last scanned entry of this batch becomes part of the carry
+
+                __syncthreads();
+
+                *dst = sum;
+            }
+        #endif
+        }
+
+        // Used for frame preprocessing before Soft Cascade evaluation: both kernels are enqueued on `stream` and this function does not synchronize.
+        void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz<uint4> buffer, PtrStepSz<unsigned int> integral,
+            int blockStep, cudaStream_t stream)
+        {
+            { // horizontal pass: img -> buffer
+                const int block = blockStep; // threads per block are chosen by the caller
+                const int grid = img.rows; // one block per image row
+
+                cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
+
+                shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, buffer);
+                cudaSafeCall( cudaGetLastError() ); // checks the error state right after the launch
+            }
+
+            { // vertical pass: buffer -> integral
+                const dim3 block(32, 8);
+                const dim3 grid(divUp(integral.cols, block.x), 1); // one block per 32 integral columns, a single block in y
+
+                shfl_integral_vertical<<<grid, block, 0, stream>>>((PtrStepSz<uint>)buffer, integral);
+                cudaSafeCall( cudaGetLastError() );
+            }
+        }
    }
 }}}


--- a/modules/gpu/src/cuda/texture_binder.hpp
+++ b/modules/gpu/src/cuda/texture_binder.hpp
@@ -85,7 +85,7 @@ namespace cv

  namespace device
  {
-      using pcl::gpu::TextureBinder;
+      using cv::gpu::TextureBinder;
  }
 }


--- a/modules/gpu/src/gpu_init.cpp
+++ b/modules/gpu/src/gpu_init.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include <precomp.hpp>
+
+namespace cv { namespace gpu
+{
+
+CV_INIT_ALGORITHM(SCascade, "CascadeDetector.SCascade", // presumably registers SCascade under this name - the macro is defined outside the lines shown here
+                  obj.info()->addParam(obj, "minScale",    obj.minScale);
+                  obj.info()->addParam(obj, "maxScale",    obj.maxScale);
+                  obj.info()->addParam(obj, "scales",      obj.scales);
+                  obj.info()->addParam(obj, "rejCriteria", obj.rejCriteria)); // the four constructor settings are added as named parameters
+
+bool initModule_gpu(void)
+{
+    Ptr<Algorithm> sc = createSCascade(); // createSCascade is not declared in the lines shown here - presumably produced by CV_INIT_ALGORITHM; confirm
+    return sc->info() != 0; // true when the created algorithm reports a non-null info object
+}
+
+} }
\ No newline at end of file
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
+//M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M
+
+
+#ifndef __OPENCV_ICF_HPP__
+#define __OPENCV_ICF_HPP__
+
+#include <opencv2/gpu/device/common.hpp>
+
+#if defined __CUDACC__
+# define __device __device__ __forceinline__
+#else
+# define __device
+#endif
+
+
+namespace cv { namespace gpu { namespace device {
+namespace icf {
+
+struct __align__(16) Octave // plain record, 16-byte aligned
+{
+    ushort index;
+    ushort stages;
+    ushort shrinkage;
+    ushort2 size;
+    float scale;
+
+    Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc)
+    : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {} // member-wise initialisation only
+};
+
+struct __align__(8) Level //is actually 24 bytes
+{
+    int octave;
+    int step;
+
+    float relScale;
+    float scaling[2]; // calculated according to the Dollar paper
+
+    // for 640x480 we cannot get overflow
+    uchar2 workRect;
+    uchar2 objSize;
+
+    Level(int idx, const Octave& oct, const float scale, const int w, const int h); // defined outside this header
+    __device Level(){} // leaves every field uninitialised
+};
+
+struct __align__(8) Node
+{
+    uchar4 rect;
+    // ushort channel;
+    uint threshold; // holds the threshold plus the channel index shifted left by 28 (see the constructor)
+
+    enum { THRESHOLD_MASK = 0x0FFFFFFF }; // selects the low 28 bits of threshold
+
+    Node(const uchar4 r, const uint ch, const uint t) : rect(r), threshold(t + (ch << 28)) {} // packs ch above bit 27; assumes t fits in 28 bits - not checked here
+};
+
+struct __align__(16) Detection // same field list as cv::gpu::SCascade::Detection in gpu.hpp
+{
+    ushort x;
+    ushort y;
+    ushort w;
+    ushort h;
+    float confidence;
+    int kind;
+
+    Detection(){} // leaves every field uninitialised
+    __device Detection(int _x, int _y, uchar _w, uchar _h, float c) // width and height arrive as uchar although the fields are ushort
+    : x(_x), y(_y), w(_w), h(_h), confidence(c), kind(0) {}; // kind is always set to 0 by this constructor
+};
+
+struct GK107PolicyX4 // compile-time launch constants used as the Policy argument of CascadeInvoker
+{
+    enum {WARP = 32, STA_X = WARP, STA_Y = 8, SHRINKAGE = 4};
+    typedef float2 roi_type;
+    static const dim3 block()
+    {
+        return dim3(STA_X, STA_Y); // 32 x 8 threads
+    }
+};
+
+template<typename Policy>
+struct CascadeInvoker
+{
+    CascadeInvoker(): levels(0), stages(0), nodes(0), leaves(0), scales(0) {} // empty invoker: null pointers and zero scales
+
+    CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzf& _stages,
+                   const PtrStepSzb& _nodes,  const PtrStepSzf& _leaves)
+    : levels((const Level*)_levels.ptr()),
+      stages((const float*)_stages.ptr()),
+      nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr()),
+      scales(_levels.cols / sizeof(Level)) // number of Level records in one row of _levels; presumably cols counts bytes - confirm
+    {}
+
+    const Level*  levels; // raw pointers into the buffers passed to the constructor; nothing in this struct frees them
+    const float*  stages;
+
+    const Node*   nodes;
+    const float*  leaves;
+
+    int scales;
+
+    void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
+        const int downscales, const cudaStream_t& stream = 0) const; // declared only; the definition is outside this header
+
+    template<bool isUp>
+    __device void detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const;
+};
+
+}
+}}}
+
+#endif
\ No newline at end of file
--- a/modules/gpu/src/nvidia/core/NCV.hpp
+++ b/modules/gpu/src/nvidia/core/NCV.hpp
@@ -288,7 +288,7 @@ NCV_EXPORTS void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);
    do \
    { \
        cudaError_t res = cudacall; \
-        ncvAssertPrintReturn(cudaSuccess==res, "cudaError_t=" << res, errCode); \
+        ncvAssertPrintReturn(cudaSuccess==res, "cudaError_t=" << (int)res, errCode); \
    } while (0)



--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
--- a/samples/gpu/cascadeclassifier_nvidia_api.cpp
+++ b/samples/gpu/cascadeclassifier_nvidia_api.cpp
@@ -30,7 +30,7 @@ const Size2i preferredVideoFrameSize(640, 480);
 const string wndTitle = "NVIDIA Computer Vision :: Haar Classifiers Cascade";


-void matPrint(Mat &img, int lineOffsY, Scalar fontColor, const string &ss)
+static void matPrint(Mat &img, int lineOffsY, Scalar fontColor, const string &ss)
 {
    int fontFace = FONT_HERSHEY_DUPLEX;
    double fontScale = 0.8;
@@ -45,7 +45,7 @@ void matPrint(Mat &img, int lineOffsY, Scalar fontColor, const string &ss)
 }


-void displayState(Mat &canvas, bool bHelp, bool bGpu, bool bLargestFace, bool bFilter, double fps)
+static void displayState(Mat &canvas, bool bHelp, bool bGpu, bool bLargestFace, bool bFilter, double fps)
 {
    Scalar fontColorRed = CV_RGB(255,0,0);
    Scalar fontColorNV  = CV_RGB(118,185,0);
@@ -74,7 +74,7 @@ void displayState(Mat &canvas, bool bHelp, bool bGpu, bool bLargestFace, bool bF
 }


-NCVStatus process(Mat *srcdst,
+static NCVStatus process(Mat *srcdst,
                  Ncv32u width, Ncv32u height,
                  NcvBool bFilterRects, NcvBool bLargestFace,
                  HaarClassifierCascadeDescriptor &haar,
@@ -281,7 +281,7 @@ int main(int argc, const char** argv)
    //==============================================================================

    namedWindow(wndTitle, 1);
-    Mat gray, frameDisp;
+    Mat frameDisp;

    do
    {

--- a/samples/gpu/opticalflow_nvidia_api.cpp
+++ b/samples/gpu/opticalflow_nvidia_api.cpp
@@ -59,7 +59,7 @@ public:
 class RgbToR
 {
 public:
-    float operator ()(unsigned char b, unsigned char g, unsigned char r)
+    float operator ()(unsigned char /*b*/, unsigned char /*g*/, unsigned char r)
    {
        return static_cast<float>(r)/255.0f;
    }
@@ -69,7 +69,7 @@ public:
 class RgbToG
 {
 public:
-    float operator ()(unsigned char b, unsigned char g, unsigned char r)
+    float operator ()(unsigned char /*b*/, unsigned char g, unsigned char /*r*/)
    {
        return static_cast<float>(g)/255.0f;
    }
@@ -78,7 +78,7 @@ public:
 class RgbToB
 {
 public:
-    float operator ()(unsigned char b, unsigned char g, unsigned char r)
+    float operator ()(unsigned char b, unsigned char /*g*/, unsigned char /*r*/)
    {
        return static_cast<float>(b)/255.0f;
    }
@@ -135,7 +135,7 @@ NCVStatus CopyData(const IplImage *image, const NCVMatrixAlloc<Ncv32f> &dst)
    return NCV_SUCCESS;
 }

-NCVStatus LoadImages (const char *frame0Name,
+static NCVStatus LoadImages (const char *frame0Name,
                      const char *frame1Name,
                      int &width,
                      int &height,
@@ -186,7 +186,7 @@ inline T MapValue (T x, T a, T b, T c, T d)
    return c + (d - c) * (x - a) / (b - a);
 }

-NCVStatus ShowFlow (NCVMatrixAlloc<Ncv32f> &u, NCVMatrixAlloc<Ncv32f> &v, const char *name)
+static NCVStatus ShowFlow (NCVMatrixAlloc<Ncv32f> &u, NCVMatrixAlloc<Ncv32f> &v, const char *name)
 {
    IplImage *flowField;

@@ -246,7 +246,7 @@ NCVStatus ShowFlow (NCVMatrixAlloc<Ncv32f> &u, NCVMatrixAlloc<Ncv32f> &v, const
    return NCV_SUCCESS;
 }

-IplImage *CreateImage (NCVMatrixAlloc<Ncv32f> &h_r, NCVMatrixAlloc<Ncv32f> &h_g, NCVMatrixAlloc<Ncv32f> &h_b)
+static IplImage *CreateImage (NCVMatrixAlloc<Ncv32f> &h_r, NCVMatrixAlloc<Ncv32f> &h_g, NCVMatrixAlloc<Ncv32f> &h_b)
 {
    CvSize imageSize = cvSize (h_r.width (), h_r.height ());
    IplImage *image  = cvCreateImage (imageSize, IPL_DEPTH_8U, 4);
@@ -270,7 +270,7 @@ IplImage *CreateImage (NCVMatrixAlloc<Ncv32f> &h_r, NCVMatrixAlloc<Ncv32f> &h_g,
    return image;
 }

-void PrintHelp ()
+static void PrintHelp ()
 {
    std::cout << "Usage help:\n";
    std::cout << std::setiosflags(std::ios::left);
@@ -286,7 +286,7 @@ void PrintHelp ()
    std::cout << "\t" << std::setw(15) << PARAM_HELP << " - display this help message\n";
 }

-int ProcessCommandLine(int argc, char **argv,
+static int ProcessCommandLine(int argc, char **argv,
                       Ncv32f &timeStep,
                       char *&frame0Name,
                       char *&frame1Name,

--- a/samples/gpu/softcascade.cpp
+++ b/samples/gpu/softcascade.cpp
+#include <opencv2/gpu/gpu.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <iostream>
+
+// GPU soft cascade detection demo.
+//
+// Loads a cascade from the file passed as --cascade, reads frames from the
+// source passed as --frames, runs cv::gpu::SCascade::detect on every frame and
+// shows the frame with the reported rectangles drawn on top of it.
+//
+// Returns 0 after printing help, when the frame source is exhausted or when
+// ESC is pressed; returns 1 on argument errors or when the cascade / frame
+// source can't be opened or parsed.
+int main(int argc, char** argv)
+{
+    // Float defaults carry no 'f' suffix: they are parsed from text by the
+    // command line parser, not compiled as C++ literals.
+    const std::string keys =
+        "{help h usage ?    |     | print this message }"
+        "{cascade c         |     | path to the soft cascade xml file }"
+        "{frames f          |     | path to the input video file or image sequence }"
+        "{min_scale         |0.4  | minimal scale of the detection window }"
+        "{max_scale         |5.0  | maximal scale of the detection window }"
+        "{total_scales      |55   | total number of scales to process }"
+        "{device d          |0    | index of the CUDA device to use }"
+    ;
+
+    cv::CommandLineParser parser(argc, argv, keys);
+    parser.about("Soft cascade detection demo application.");
+
+    if (parser.has("help"))
+    {
+        parser.printMessage();
+        return 0;
+    }
+
+    // Read every argument first: check() can only report conversion errors
+    // for values that have already been requested.
+    const int device              = parser.get<int>("device");
+    const std::string cascadePath = parser.get<std::string>("cascade");
+    const std::string frames      = parser.get<std::string>("frames");
+    const float minScale          = parser.get<float>("min_scale");
+    const float maxScale          = parser.get<float>("max_scale");
+    const int scales              = parser.get<int>("total_scales");
+
+    if (!parser.check())
+    {
+        parser.printErrors();
+        return 1;
+    }
+
+    cv::gpu::setDevice(device);
+
+    cv::FileStorage fs(cascadePath, cv::FileStorage::READ);
+    if (!fs.isOpened())
+    {
+        std::cout << "Soft Cascade file " << cascadePath << " can't be opened." << std::endl;
+        return 1;
+    }
+
+    std::cout << "Read cascade from file " << cascadePath << std::endl;
+
+    using cv::gpu::SCascade;
+    typedef SCascade::Detection Detection;
+
+    SCascade cascade(minScale, maxScale, scales);
+
+    if (!cascade.load(fs.getFirstTopLevelNode()))
+    {
+        std::cout << "Soft Cascade can't be parsed." << std::endl;
+        return 1;
+    }
+
+    cv::VideoCapture capture(frames);
+    if (!capture.isOpened())
+    {
+        std::cout << "Frame source " << frames << " can't be opened." << std::endl;
+        return 1;
+    }
+
+    // The result buffer is a single row of bytes sized for bufferSlots
+    // Detection records. The first record-sized slot is read back as the
+    // detection counter, so only bufferSlots - 1 records can hold results.
+    const int bufferSlots   = 10000;
+    const int maxDetections = bufferSlots - 1;
+
+    cv::gpu::GpuMat objects(1, static_cast<int>(sizeof(Detection) * bufferSlots), CV_8UC1);
+    cv::gpu::printShortCudaDeviceInfo(device);
+
+    for (;;)
+    {
+        cv::Mat frame;
+        if (!capture.read(frame))
+        {
+            std::cout << "Nothing to read. " << std::endl;
+            return 0;
+        }
+
+        // Mark the whole frame as region of interest.
+        cv::gpu::GpuMat dframe(frame), roi(frame.rows, frame.cols, CV_8UC1), trois;
+        roi.setTo(cv::Scalar::all(1));
+        cascade.genRoi(roi, trois);
+        cascade.detect(dframe, trois, objects);
+
+        // Bring the results to the host: counter first, detections after it.
+        cv::Mat dt(objects);
+        const int reported = *dt.ptr<int>(0);
+        const Detection* dts = reinterpret_cast<const Detection*>(dt.data) + 1;
+
+        std::cout << reported << std::endl;
+
+        // Never walk past the end of the downloaded buffer, whatever the
+        // counter says.
+        int count = reported;
+        if (count < 0)
+            count = 0;
+        else if (count > maxDetections)
+            count = maxDetections;
+
+        cv::Mat result;
+        frame.copyTo(result);
+
+        for (int i = 0; i < count; ++i)
+        {
+            const Detection& d = dts[i];
+            cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1);
+        }
+
+        std::cout << "working..." << std::endl;
+        cv::imshow("Soft Cascade demo", result);
+
+        // Let the user leave the demo with ESC instead of killing the process.
+        const int key = cv::waitKey(10);
+        if ((key & 0xFF) == 27)
+            break;
+    }
+
+    return 0;
+}
\ No newline at end of file