Merge pull request #648 from cuda-geek:move-gpu-soft-cascade-to-softcascade-module

1eb34e06 · Andrey Kamaev · OpenCV Buildbot · d9cd7538 · 157a98ed · 1eb34e06
Commit 1eb34e06 authored Mar 15, 2013 by Andrey Kamaev Committed by OpenCV Buildbot Mar 15, 2013
30 changed files
--- a/doc/check_docs2.py
+++ b/doc/check_docs2.py
@@ -199,6 +199,7 @@ def process_module(module, path):
    if module == "gpu":
        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "cuda_devptrs.hpp"))
        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "gpumat.hpp"))
+        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "stream_accessor.hpp"))

    decls = []
    for hname in hdrlist:

--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@@ -50,6 +50,141 @@

 namespace cv { namespace gpu
 {
+    //////////////////////////////// CudaMem ////////////////////////////////
+    // CudaMem is a limited cv::Mat with page-locked memory allocation.
+    // Page-locked memory is only needed for async and faster copying to the GPU.
+    // It is convertible to a cv::Mat header without reference counting,
+    // so you can use it with other OpenCV functions.
+
+    // Page-locks the matrix m memory and maps it for the device(s)
+    CV_EXPORTS void registerPageLocked(Mat& m);
+    // Unmaps the memory of matrix m, and makes it pageable again.
+    CV_EXPORTS void unregisterPageLocked(Mat& m);
+
+    class CV_EXPORTS CudaMem
+    {
+    public:
+        enum  { ALLOC_PAGE_LOCKED = 1, ALLOC_ZEROCOPY = 2, ALLOC_WRITE_COMBINED = 4 };
+
+        CudaMem();
+        CudaMem(const CudaMem& m);
+
+        CudaMem(int rows, int cols, int type, int _alloc_type = ALLOC_PAGE_LOCKED);
+        CudaMem(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
+
+
+        //! creates from cv::Mat, copying the data
+        explicit CudaMem(const Mat& m, int alloc_type = ALLOC_PAGE_LOCKED);
+
+        ~CudaMem();
+
+        CudaMem& operator = (const CudaMem& m);
+
+        //! returns deep copy of the matrix, i.e. the data is copied
+        CudaMem clone() const;
+
+        //! allocates new matrix data unless the matrix already has specified size and type.
+        void create(int rows, int cols, int type, int alloc_type = ALLOC_PAGE_LOCKED);
+        void create(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
+
+        //! decrements the reference counter and releases memory if needed.
+        void release();
+
+        //! returns matrix header with disabled reference counting for CudaMem data.
+        Mat createMatHeader() const;
+        operator Mat() const;
+
+        //! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware.
+        GpuMat createGpuMatHeader() const;
+        operator GpuMat() const;
+
+        // returns whether host memory can be mapped to the GPU address space
+        static bool canMapHostMemory();
+
+        // Please see cv::Mat for descriptions
+        bool isContinuous() const;
+        size_t elemSize() const;
+        size_t elemSize1() const;
+        int type() const;
+        int depth() const;
+        int channels() const;
+        size_t step1() const;
+        Size size() const;
+        bool empty() const;
+
+
+        // Please see cv::Mat for descriptions
+        int flags;
+        int rows, cols;
+        size_t step;
+
+        uchar* data;
+        int* refcount;
+
+        uchar* datastart;
+        uchar* dataend;
+
+        int alloc_type;
+    };
+
+
+    //////////////////////////////// CudaStream ////////////////////////////////
+    // Encapsulates a CUDA stream. Provides an interface for async copying.
+    // Passed to each function that supports async kernel execution.
+    // Reference counting is enabled
+
+    class CV_EXPORTS Stream
+    {
+    public:
+        Stream();
+        ~Stream();
+
+        Stream(const Stream&);
+        Stream& operator =(const Stream&);
+
+        bool queryIfComplete();
+        void waitForCompletion();
+
+        //! downloads asynchronously
+        // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its subMat)
+        void enqueueDownload(const GpuMat& src, CudaMem& dst);
+        void enqueueDownload(const GpuMat& src, Mat& dst);
+
+        //! uploads asynchronously
+        // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its ROI)
+        void enqueueUpload(const CudaMem& src, GpuMat& dst);
+        void enqueueUpload(const Mat& src, GpuMat& dst);
+
+        //! copy asynchronously
+        void enqueueCopy(const GpuMat& src, GpuMat& dst);
+
+        //! memory set asynchronously
+        void enqueueMemSet(GpuMat& src, Scalar val);
+        void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask);
+
+        //! converts matrix type, ex from float to uchar depending on type
+        void enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double a = 1, double b = 0);
+
+        //! adds a callback to be called on the host after all currently enqueued items in the stream have completed
+        typedef void (*StreamCallback)(Stream& stream, int status, void* userData);
+        void enqueueHostCallback(StreamCallback callback, void* userData);
+
+        static Stream& Null();
+
+        operator bool() const;
+
+    private:
+        struct Impl;
+
+        explicit Stream(Impl* impl);
+        void create();
+        void release();
+
+        Impl *impl;
+
+        friend struct StreamAccessor;
+    };
+
    //////////////////////////////// Initialization & Info ////////////////////////

    //! This is the only function that do not throw exceptions if the library is compiled without Cuda.

--- a/modules/gpu/include/opencv2/gpu/stream_accessor.hpp
+++ b/modules/gpu/include/opencv2/gpu/stream_accessor.hpp
@@ -40,10 +40,10 @@
 //
 //M*/

-#ifndef __OPENCV_GPU_STREAM_ACCESSOR_HPP__
-#define __OPENCV_GPU_STREAM_ACCESSOR_HPP__
+#ifndef __OPENCV_CUDA_STREAM_ACCESSOR_HPP__
+#define __OPENCV_CUDA_STREAM_ACCESSOR_HPP__

-#include "opencv2/gpu.hpp"
+#include "opencv2/core/gpumat.hpp"
 #include "cuda_runtime_api.h"

 namespace cv
@@ -61,4 +61,4 @@ namespace cv
    }
 }

-#endif /* __OPENCV_GPU_STREAM_ACCESSOR_HPP__ */
\ No newline at end of file
+#endif /* __OPENCV_CUDA_STREAM_ACCESSOR_HPP__ */
\ No newline at end of file
--- a/modules/gpu/src/cudastream.cpp
+++ b/modules/gpu/src/cudastream.cpp
@@ -41,11 +41,13 @@
 //M*/

 #include "precomp.hpp"
+#include "opencv2/core/gpumat.hpp"

 using namespace cv;
 using namespace cv::gpu;

 #if !defined (HAVE_CUDA)
+#define throw_nogpu() CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")

 cv::gpu::Stream::Stream() { throw_nogpu(); }
 cv::gpu::Stream::~Stream() {}
@@ -70,7 +72,7 @@ void cv::gpu::Stream::release() { throw_nogpu(); }

 #else /* !defined (HAVE_CUDA) */

-#include "opencv2/gpu/stream_accessor.hpp"
+#include "opencv2/core/stream_accessor.hpp"

 namespace cv { namespace gpu
 {

--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -72,19 +72,11 @@ using namespace cv::gpu;
 namespace
 {
 #if defined(__GNUC__)
-    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
    #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__, __func__)
 #else /* defined(__CUDACC__) || defined(__MSVC__) */
-    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__)
    #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__)
 #endif

-    inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
-    {
-        if (cudaSuccess != err)
-            cv::gpu::error(cudaGetErrorString(err), file, line, func);
-    }
-
    inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
    {
        if (err < 0)

--- a/modules/gpu/src/matrix_operations.cpp
+++ b/modules/gpu/src/matrix_operations.cpp
@@ -41,6 +41,7 @@
 //M*/

 #include "precomp.hpp"
+#include "opencv2/core/gpumat.hpp"

 using namespace cv;
 using namespace cv::gpu;
@@ -178,14 +179,15 @@ bool cv::gpu::CudaMem::empty() const
    return data == 0;
 }

-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
+#if !defined (HAVE_CUDA)

-void cv::gpu::registerPageLocked(Mat&) { throw_nogpu(); }
-void cv::gpu::unregisterPageLocked(Mat&) { throw_nogpu(); }
-void cv::gpu::CudaMem::create(int /*_rows*/, int /*_cols*/, int /*_type*/, int /*type_alloc*/) { throw_nogpu(); }
-bool cv::gpu::CudaMem::canMapHostMemory() { throw_nogpu(); return false; }
-void cv::gpu::CudaMem::release() { throw_nogpu(); }
-GpuMat cv::gpu::CudaMem::createGpuMatHeader () const { throw_nogpu(); return GpuMat(); }
+void cv::gpu::registerPageLocked(Mat&) { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
+void cv::gpu::unregisterPageLocked(Mat&) { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
+void cv::gpu::CudaMem::create(int /*_rows*/, int /*_cols*/, int /*_type*/, int /*type_alloc*/)
+{ CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
+bool cv::gpu::CudaMem::canMapHostMemory() { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); return false; }
+void cv::gpu::CudaMem::release() { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
+GpuMat cv::gpu::CudaMem::createGpuMatHeader () const { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); return GpuMat(); }

 #else /* !defined (HAVE_CUDA) */


--- a/modules/core/src/opengl_interop.cpp
+++ b/modules/core/src/opengl_interop.cpp
@@ -67,18 +67,6 @@ namespace
            void throw_nocuda() { CV_Error(CV_GpuNotSupported, "The library is compiled without GPU support"); }
        #else
            void throw_nocuda() { CV_Error(CV_StsNotImplemented, "The called functionality is disabled for current build or platform"); }
-
-            #if defined(__GNUC__)
-                #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
-            #else /* defined(__CUDACC__) || defined(__MSVC__) */
-                #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__)
-            #endif
-
-            void ___cudaSafeCall(cudaError_t err, const char* file, const int line, const char* func = "")
-            {
-                if (cudaSuccess != err)
-                    cv::gpu::error(cudaGetErrorString(err), file, line, func);
-            }
        #endif
    #endif
 }

--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -66,6 +66,25 @@
 #define GET_OPTIMIZED(func) (func)
 #endif

+#ifdef HAVE_CUDA
+#  include <cuda_runtime_api.h>
+#  include "opencv2/core/gpumat.hpp"
+
+#  if defined(__GNUC__)
+#    define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
+#  else
+#    define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__)
+#  endif
+
+static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
+{
+    if (cudaSuccess != err) cv::gpu::error(cudaGetErrorString(err), file, line, func);
+}
+
+#else
+#  define cudaSafeCall(expr)
+#endif
+
 namespace cv
 {


--- a/modules/gpu/doc/object_detection.rst
+++ b/modules/gpu/doc/object_detection.rst
@@ -199,88 +199,6 @@ Returns block descriptors computed for the whole image.
 The function is mainly used to learn the classifier.


-Soft Cascade Classifier
-==========================
-
-Soft Cascade Classifier for Object Detection
----------------------------------------------------------
-
-Cascade detectors have been shown to operate extremely rapidly, with high accuracy, and have important applications in different spheres. The initial goal for this cascade implementation was the fast and accurate pedestrian detector but it also useful in general. Soft cascade is trained with AdaBoost. But instead of training sequence of stages, the soft cascade is trained as a one long stage of T weak classifiers. Soft cascade is formulated as follows:
-
-.. math::
-    \texttt{H}(x) = \sum _{\texttt{t}=1..\texttt{T}} {\texttt{s}_t(x)}
-
-where :math:`\texttt{s}_t(x) = \alpha_t\texttt{h}_t(x)` are the set of thresholded weak classifiers selected during AdaBoost training scaled by the associated weights. Let
-
-.. math::
-    \texttt{H}_t(x) = \sum _{\texttt{i}=1..\texttt{t}} {\texttt{s}_i(x)}
-
-be the partial sum of sample responses before :math:`t`-the weak classifier will be applied. The funtcion :math:`\texttt{H}_t(x)` of :math:`t` for sample :math:`x` named *sample trace*.
-After each weak classifier evaluation, the sample trace at the point :math:`t` is compared with the rejection threshold :math:`r_t`. The sequence of :math:`r_t` named *rejection trace*.
-
-The sample has been rejected if it fall rejection threshold. So stageless cascade allows to reject not-object sample as soon as possible. Another meaning of the sample trace is a confidence with that sample recognized as desired object. At each :math:`t` that confidence depend on all previous weak classifier. This feature of soft cascade is resulted in more accurate detection. The original formulation of soft cascade can be found in [BJ05]_.
-
-gpu::SCascade
-----------------------------------------------
-.. ocv:class:: gpu::SCascade : public Algorithm
-
-Implementation of soft (stageless) cascaded detector. ::
-
-    class CV_EXPORTS SCascade : public Algorithm
-    {
-        struct CV_EXPORTS Detection
-        {
-              ushort x;
-              ushort y;
-              ushort w;
-              ushort h;
-              float confidence;
-              int kind;
-
-              enum {PEDESTRIAN = 0};
-        };
-
-        SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejfactor = 1);
-        virtual ~SCascade();
-        virtual bool load(const FileNode& fn);
-        virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
-        virtual void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;
-    };
-
-
-gpu::SCascade::~SCascade
---------------------------
-Destructor for SCascade.
-
-.. ocv:function:: gpu::SCascade::~SCascade()
-
-
-
-gpu::SCascade::load
--------------------------
-Load cascade from FileNode.
-
-.. ocv:function:: bool gpu::SCascade::load(const FileNode& fn)
-
-    :param fn: File node from which the soft cascade are read.
-
-
-
-gpu::SCascade::detect
--------------------------
-Apply cascade to an input frame and return the vector of Decection objcts.
-
-.. ocv:function:: void gpu::SCascade::detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const
-
-    :param image: a frame on which detector will be applied.
-
-    :param rois: a regions of interests mask generated by genRoi. Only the objects that fall into one of the regions will be returned.
-
-    :param objects: an output array of Detections represented as GpuMat of detections (SCascade::Detection). The first element of the matrix is  actually a count of detections.
-
-    :param stream: a high-level CUDA stream abstraction used for asynchronous execution.
-
-
 gpu::CascadeClassifier_GPU
 --------------------------
 .. ocv:class:: gpu::CascadeClassifier_GPU

--- a/modules/gpu/include/opencv2/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu.hpp
@@ -55,142 +55,6 @@
 #include "opencv2/features2d.hpp"

 namespace cv { namespace gpu {
-
-//////////////////////////////// CudaMem ////////////////////////////////
-// CudaMem is limited cv::Mat with page locked memory allocation.
-// Page locked memory is only needed for async and faster coping to GPU.
-// It is convertable to cv::Mat header without reference counting
-// so you can use it with other opencv functions.
-
-// Page-locks the matrix m memory and maps it for the device(s)
-CV_EXPORTS void registerPageLocked(Mat& m);
-// Unmaps the memory of matrix m, and makes it pageable again.
-CV_EXPORTS void unregisterPageLocked(Mat& m);
-
-class CV_EXPORTS CudaMem
-{
-public:
-    enum  { ALLOC_PAGE_LOCKED = 1, ALLOC_ZEROCOPY = 2, ALLOC_WRITE_COMBINED = 4 };
-
-    CudaMem();
-    CudaMem(const CudaMem& m);
-
-    CudaMem(int rows, int cols, int type, int _alloc_type = ALLOC_PAGE_LOCKED);
-    CudaMem(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
-
-
-    //! creates from cv::Mat with coping data
-    explicit CudaMem(const Mat& m, int alloc_type = ALLOC_PAGE_LOCKED);
-
-    ~CudaMem();
-
-    CudaMem& operator = (const CudaMem& m);
-
-    //! returns deep copy of the matrix, i.e. the data is copied
-    CudaMem clone() const;
-
-    //! allocates new matrix data unless the matrix already has specified size and type.
-    void create(int rows, int cols, int type, int alloc_type = ALLOC_PAGE_LOCKED);
-    void create(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
-
-    //! decrements reference counter and released memory if needed.
-    void release();
-
-    //! returns matrix header with disabled reference counting for CudaMem data.
-    Mat createMatHeader() const;
-    operator Mat() const;
-
-    //! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware.
-    GpuMat createGpuMatHeader() const;
-    operator GpuMat() const;
-
-    //returns if host memory can be mapperd to gpu address space;
-    static bool canMapHostMemory();
-
-    // Please see cv::Mat for descriptions
-    bool isContinuous() const;
-    size_t elemSize() const;
-    size_t elemSize1() const;
-    int type() const;
-    int depth() const;
-    int channels() const;
-    size_t step1() const;
-    Size size() const;
-    bool empty() const;
-
-
-    // Please see cv::Mat for descriptions
-    int flags;
-    int rows, cols;
-    size_t step;
-
-    uchar* data;
-    int* refcount;
-
-    uchar* datastart;
-    uchar* dataend;
-
-    int alloc_type;
-};
-
-//////////////////////////////// CudaStream ////////////////////////////////
-// Encapculates Cuda Stream. Provides interface for async coping.
-// Passed to each function that supports async kernel execution.
-// Reference counting is enabled
-
-class CV_EXPORTS Stream
-{
-public:
-    Stream();
-    ~Stream();
-
-    Stream(const Stream&);
-    Stream& operator =(const Stream&);
-
-    bool queryIfComplete();
-    void waitForCompletion();
-
-    //! downloads asynchronously
-    // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its subMat)
-    void enqueueDownload(const GpuMat& src, CudaMem& dst);
-    void enqueueDownload(const GpuMat& src, Mat& dst);
-
-    //! uploads asynchronously
-    // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its ROI)
-    void enqueueUpload(const CudaMem& src, GpuMat& dst);
-    void enqueueUpload(const Mat& src, GpuMat& dst);
-
-    //! copy asynchronously
-    void enqueueCopy(const GpuMat& src, GpuMat& dst);
-
-    //! memory set asynchronously
-    void enqueueMemSet(GpuMat& src, Scalar val);
-    void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask);
-
-    //! converts matrix type, ex from float to uchar depending on type
-    void enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double a = 1, double b = 0);
-
-    //! adds a callback to be called on the host after all currently enqueued items in the stream have completed
-    typedef void (*StreamCallback)(Stream& stream, int status, void* userData);
-    void enqueueHostCallback(StreamCallback callback, void* userData);
-
-    static Stream& Null();
-
-    operator bool() const;
-
-private:
-    struct Impl;
-
-    explicit Stream(Impl* impl);
-    void create();
-    void release();
-
-    Impl *impl;
-
-    friend struct StreamAccessor;
-};
-
-
 //////////////////////////////// Filter Engine ////////////////////////////////

 /*!
@@ -1522,97 +1386,6 @@ private:
    friend class CascadeClassifier_GPU_LBP;
 };

-// ======================== GPU version for soft cascade ===================== //
-
-class CV_EXPORTS ChannelsProcessor
-{
-public:
-    enum
-    {
-        GENERIC   = 1 << 4,
-        SEPARABLE = 2 << 4
-    };
-
-    // Appends specified number of HOG first-order features integrals into given vector.
-    // Param frame is an input 3-channel bgr image.
-    // Param channels is a GPU matrix of optionally shrinked channels
-    // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution.
-    virtual void apply(InputArray frame, OutputArray channels, Stream& stream = Stream::Null()) = 0;
-
-    // Creates a specific preprocessor implementation.
-    // Param shrinkage is a resizing factor. Resize is applied before the computing integral sum
-    // Param bins is a number of HOG-like channels.
-    // Param flags is a channel computing extra flags.
-    static cv::Ptr<ChannelsProcessor> create(const int shrinkage, const int bins, const int flags = GENERIC);
-
-    virtual ~ChannelsProcessor();
-
-protected:
-    ChannelsProcessor();
-};
-
-// Implementation of soft (stage-less) cascaded detector.
-class CV_EXPORTS SCascade : public cv::Algorithm
-{
-public:
-
-    // Representation of detectors result.
-    struct CV_EXPORTS Detection
-    {
-        ushort x;
-        ushort y;
-        ushort w;
-        ushort h;
-        float confidence;
-        int kind;
-
-        enum {PEDESTRIAN = 0};
-    };
-
-    enum { NO_REJECT = 1, DOLLAR = 2, /*PASCAL = 4,*/ DEFAULT = NO_REJECT, NMS_MASK = 0xF};
-
-    // An empty cascade will be created.
-    // Param minScale is a minimum scale relative to the original size of the image on which cascade will be applied.
-    // Param minScale is a maximum scale relative to the original size of the image on which cascade will be applied.
-    // Param scales is a number of scales from minScale to maxScale.
-    // Param flags is an extra tuning flags.
-    SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55,
-        const int flags = NO_REJECT || ChannelsProcessor::GENERIC);
-
-    virtual ~SCascade();
-
-    cv::AlgorithmInfo* info() const;
-
-    // Load cascade from FileNode.
-    // Param fn is a root node for cascade. Should be <cascade>.
-    virtual bool load(const FileNode& fn);
-
-    // Load cascade config.
-    virtual void read(const FileNode& fn);
-
-    // Return the matrix of of detected objects.
-    // Param image is a frame on which detector will be applied.
-    // Param rois is a regions of interests mask generated by genRoi.
-    //    Only the objects that fall into one of the regions will be returned.
-    // Param objects is an output array of Detections represented as GpuMat of detections (SCascade::Detection)
-    //    The first element of the matrix is  actually a count of detections.
-    // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution
-    virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
-
-private:
-
-    struct Fields;
-    Fields* fields;
-
-    double minScale;
-    double maxScale;
-    int scales;
-
-    int flags;
-};
-
-CV_EXPORTS bool initModule_gpu(void);
-
 ////////////////////////////////// SURF //////////////////////////////////////////

 class CV_EXPORTS SURF_GPU

--- a/modules/gpu/src/precomp.hpp
+++ b/modules/gpu/src/precomp.hpp
@@ -106,7 +106,7 @@
    #endif

    #include "internal_shared.hpp"
-    #include "opencv2/gpu/stream_accessor.hpp"
+    #include "opencv2/core/stream_accessor.hpp"

    #include "nvidia/core/NCV.hpp"
    #include "nvidia/NPP_staging/NPP_staging.hpp"

--- a/modules/softcascade/CMakeLists.txt
+++ b/modules/softcascade/CMakeLists.txt
 set(the_description "Soft Cascade detection and training")
+ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4310 -Wundef -Wsign-promo -Wmissing-declarations -Wmissing-prototypes)
 ocv_define_module(softcascade opencv_core opencv_imgproc opencv_ml)
-ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4310)
\ No newline at end of file
--- a/modules/softcascade/doc/softcascade.rst
+++ b/modules/softcascade/doc/softcascade.rst
@@ -8,4 +8,5 @@ softcascade. Soft Cascade object detection and training.
    :maxdepth: 2

    softcascade_detector
-    softcascade_training
\ No newline at end of file
+    softcascade_training
+    softcascade_cuda
\ No newline at end of file
--- a/modules/softcascade/doc/softcascade_cuda.rst
+++ b/modules/softcascade/doc/softcascade_cuda.rst
+CUDA version of Soft Cascade Classifier
+========================================
+
+softcascade::SCascade
+-----------------------------------------------
+.. ocv:class:: softcascade::SCascade : public Algorithm
+
+Implementation of soft (stageless) cascaded detector. ::
+
+    class CV_EXPORTS SCascade : public Algorithm
+    {
+        struct CV_EXPORTS Detection
+        {
+              ushort x;
+              ushort y;
+              ushort w;
+              ushort h;
+              float confidence;
+              int kind;
+
+              enum {PEDESTRIAN = 0};
+        };
+
+        SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int flags = NO_REJECT | ChannelsProcessor::SEPARABLE);
+        virtual ~SCascade();
+        virtual bool load(const FileNode& fn);
+        virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
+        virtual void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;
+    };
+
+
+softcascade::SCascade::~SCascade
+---------------------------------
+Destructor for SCascade.
+
+.. ocv:function:: softcascade::SCascade::~SCascade()
+
+
+
+softcascade::SCascade::load
+----------------------------
+Load cascade from FileNode.
+
+.. ocv:function:: bool softcascade::SCascade::load(const FileNode& fn)
+
+    :param fn: File node from which the soft cascade is read.
+
+
+
+softcascade::SCascade::detect
+------------------------------
+Apply the cascade to an input frame and return the vector of Detection objects.
+
+.. ocv:function:: void softcascade::SCascade::detect(InputArray image, InputArray rois, OutputArray objects, cv::gpu::Stream& stream = cv::gpu::Stream::Null()) const
+
+    :param image: a frame on which detector will be applied.
+
+    :param rois: a mask of regions of interest generated by genRoi. Only the objects that fall into one of the regions will be returned.
+
+    :param objects: an output array of Detections represented as GpuMat of detections (SCascade::Detection). The first element of the matrix is actually a count of detections.
+
+    :param stream: a high-level CUDA stream abstraction used for asynchronous execution.
--- a/modules/softcascade/include/opencv2/softcascade.hpp
+++ b/modules/softcascade/include/opencv2/softcascade.hpp
@@ -44,6 +44,7 @@
 #define __OPENCV_SOFTCASCADE_HPP__

 #include "opencv2/core.hpp"
+#include "opencv2/core/gpumat.hpp"

 namespace cv { namespace softcascade {

@@ -212,6 +213,96 @@ public:

 CV_EXPORTS bool initModule_softcascade(void);

+// ======================== GPU version for soft cascade ===================== //
+
+class CV_EXPORTS ChannelsProcessor
+{
+public:
+    enum
+    {
+        // GENERIC   = 1 << 4, not supported
+        SEPARABLE = 2 << 4
+    };
+
+    // Appends specified number of HOG first-order features integrals into given vector.
+    // Param frame is an input 3-channel bgr image.
+    // Param channels is a GPU matrix of optionally shrunk channels
+    // Param stream is a high-level CUDA stream abstraction used for asynchronous execution.
+    virtual void apply(InputArray frame, OutputArray channels, cv::gpu::Stream& stream = cv::gpu::Stream::Null()) = 0;
+
+    // Creates a specific preprocessor implementation.
+    // Param shrinkage is a resizing factor. Resize is applied before computing the integral sum
+    // Param bins is a number of HOG-like channels.
+    // Param flags is a channel computing extra flags.
+    static cv::Ptr<ChannelsProcessor> create(const int shrinkage, const int bins, const int flags = SEPARABLE);
+
+    virtual ~ChannelsProcessor();
+
+protected:
+    ChannelsProcessor();
+};
+
+// Implementation of soft (stage-less) cascaded detector.
+class CV_EXPORTS SCascade : public cv::Algorithm
+{
+public:
+
+    // Representation of detectors result.
+    struct CV_EXPORTS Detection
+    {
+        ushort x;
+        ushort y;
+        ushort w;
+        ushort h;
+        float confidence;
+        int kind;
+
+        enum {PEDESTRIAN = 0};
+    };
+
+    enum { NO_REJECT = 1, DOLLAR = 2, /*PASCAL = 4,*/ DEFAULT = NO_REJECT, NMS_MASK = 0xF};
+
+    // An empty cascade will be created.
+    // Param minScale is a minimum scale relative to the original size of the image on which the cascade will be applied.
+    // Param maxScale is a maximum scale relative to the original size of the image on which the cascade will be applied.
+    // Param scales is a number of scales from minScale to maxScale.
+    // Param flags is a set of extra tuning flags.
+    SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55,
+        const int flags = NO_REJECT | ChannelsProcessor::SEPARABLE);
+
+    virtual ~SCascade();
+
+    cv::AlgorithmInfo* info() const;
+
+    // Load cascade from FileNode.
+    // Param fn is a root node for cascade. Should be <cascade>.
+    virtual bool load(const FileNode& fn);
+
+    // Load cascade config.
+    virtual void read(const FileNode& fn);
+
+    // Return the matrix of detected objects.
+    // Param image is a frame on which detector will be applied.
+    // Param rois is a mask of regions of interest generated by genRoi.
+    //    Only the objects that fall into one of the regions will be returned.
+    // Param objects is an output array of Detections represented as GpuMat of detections (SCascade::Detection)
+    //    The first element of the matrix is actually a count of detections.
+    // Param stream is a high-level CUDA stream abstraction used for asynchronous execution
+    virtual void detect(InputArray image, InputArray rois, OutputArray objects, cv::gpu::Stream& stream = cv::gpu::Stream::Null()) const;
+
+private:
+
+    struct Fields;
+    Fields* fields;
+
+    double minScale;
+    double maxScale;
+    int scales;
+
+    int flags;
+};
+
+
 }} // namespace cv { namespace softcascade {

 #endif
\ No newline at end of file
--- a/modules/gpu/perf/perf_softcascade.cpp
+++ b/modules/gpu/perf/perf_softcascade.cpp
 #include "perf_precomp.hpp"

+using std::tr1::get;
+
 #define SC_PERF_TEST_P(fixture, name, params)  \
    class fixture##_##name : public fixture {\
     public:\
@@ -25,8 +27,8 @@ void fixture##_##name::__cpu() { FAIL() << "No such CPU implementation analogy";
 namespace {
    struct DetectionLess
    {
-        bool operator()(const cv::gpu::SCascade::Detection& a,
-            const cv::gpu::SCascade::Detection& b) const
+        bool operator()(const cv::softcascade::SCascade::Detection& a,
+            const cv::softcascade::SCascade::Detection& b) const
        {
            if (a.x != b.x)      return a.x < b.x;
            else if (a.y != b.y) return a.y < b.y;
@@ -39,7 +41,7 @@ namespace {
    {
        cv::Mat detections(objects);

-        typedef cv::gpu::SCascade::Detection Detection;
+        typedef cv::softcascade::SCascade::Detection Detection;
        Detection* begin = (Detection*)(detections.ptr<char>(0));
        Detection* end = (Detection*)(detections.ptr<char>(0) + detections.cols);
        std::sort(begin, end, DetectionLess());
@@ -60,18 +62,18 @@ SC_PERF_TEST_P(SCascadeTest, detect,

 RUN_GPU(SCascadeTest, detect)
 {
-    cv::Mat cpu = readImage (GET_PARAM(1));
+    cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam())));;
    ASSERT_FALSE(cpu.empty());
    cv::gpu::GpuMat colored(cpu);

-    cv::gpu::SCascade cascade;
+    cv::softcascade::SCascade cascade;

-    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
+    cv::FileStorage fs(getDataPath(get<0>(GetParam())), cv::FileStorage::READ);
    ASSERT_TRUE(fs.isOpened());

    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));

-    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
+    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::softcascade::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
    rois.setTo(1);

    cascade.detect(colored, rois, objectBoxes);
@@ -118,13 +120,13 @@ SC_PERF_TEST_P(SCascadeTestRoi, detectInRoi,

 RUN_GPU(SCascadeTestRoi, detectInRoi)
 {
-    cv::Mat cpu = readImage (GET_PARAM(1));
+    cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam())));
    ASSERT_FALSE(cpu.empty());
    cv::gpu::GpuMat colored(cpu);

-    cv::gpu::SCascade cascade;
+    cv::softcascade::SCascade cascade;

-    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
+    cv::FileStorage fs(getDataPath(get<0>(GetParam())), cv::FileStorage::READ);
    ASSERT_TRUE(fs.isOpened());

    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
@@ -132,7 +134,7 @@ RUN_GPU(SCascadeTestRoi, detectInRoi)
    cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1);
    rois.setTo(0);

-    int nroi = GET_PARAM(2);
+    int nroi = get<2>(GetParam());
    cv::RNG rng;
    for (int i = 0; i < nroi; ++i)
    {
@@ -163,13 +165,13 @@ SC_PERF_TEST_P(SCascadeTestRoi, detectEachRoi,

 RUN_GPU(SCascadeTestRoi, detectEachRoi)
 {
-    cv::Mat cpu = readImage (GET_PARAM(1));
+    cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam())));
    ASSERT_FALSE(cpu.empty());
    cv::gpu::GpuMat colored(cpu);

-    cv::gpu::SCascade cascade;
+    cv::softcascade::SCascade cascade;

-    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
+    cv::FileStorage fs(getDataPath(get<0>(GetParam())), cv::FileStorage::READ);
    ASSERT_TRUE(fs.isOpened());

    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
@@ -177,7 +179,7 @@ RUN_GPU(SCascadeTestRoi, detectEachRoi)
    cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1);
    rois.setTo(0);

-    int idx = GET_PARAM(2);
+    int idx = get<2>(GetParam());
    cv::Rect r = getFromTable(idx);
    cv::gpu::GpuMat sub(rois, r);
    sub.setTo(1);
@@ -202,18 +204,18 @@ SC_PERF_TEST_P(SCascadeTest, detectStream,

 RUN_GPU(SCascadeTest, detectStream)
 {
-    cv::Mat cpu = readImage (GET_PARAM(1));
+    cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam())));
    ASSERT_FALSE(cpu.empty());
    cv::gpu::GpuMat colored(cpu);

-    cv::gpu::SCascade cascade;
+    cv::softcascade::SCascade cascade;

-    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
+    cv::FileStorage fs(getDataPath(get<0>(GetParam())), cv::FileStorage::READ);
    ASSERT_TRUE(fs.isOpened());

    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));

-    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
+    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::softcascade::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
    rois.setTo(1);

    cv::gpu::Stream s;

--- a/modules/softcascade/src/cuda/channels.cu
+++ b/modules/softcascade/src/cuda/channels.cu
--- a/modules/gpu/src/cuda/icf-sc.cu
+++ b/modules/gpu/src/cuda/icf-sc.cu
@@ -40,15 +40,31 @@
 //
 //M*/

-#include <opencv2/gpu/device/common.hpp>
-#include <opencv2/gpu/device/saturate_cast.hpp>
-
-#include <icf.hpp>
+#include <cuda_invoker.hpp>
 #include <float.h>
 #include <stdio.h>

-namespace cv { namespace gpu { namespace device {
-namespace icf {
+namespace cv { namespace softcascade { namespace internal {
+void error(const char *error_string, const char *file, const int line, const char *func);
+}}}
+#if defined(__GNUC__)
+    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
+#else /* defined(__CUDACC__) || defined(__MSVC__) */
+    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__)
+#endif
+
+// Forwards a failed CUDA runtime status to cv::softcascade::internal::error.
+// `file`, `line` and `func` describe the call site; `func` defaults to an
+// empty string for callers that do not supply a function name.
+static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
+{
+    // Nothing to report when the call succeeded.
+    if (err == cudaSuccess)
+        return;
+
+    cv::softcascade::internal::error(cudaGetErrorString(err), file, line, func);
+}
+
+#ifndef CV_PI
+    #define CV_PI   3.1415926535897932384626433832795
+#endif
+
+namespace cv { namespace softcascade { namespace device {
+
+typedef unsigned char uchar;

    template <int FACTOR>
    __device__ __forceinline__ uchar shrink(const uchar* ptr, const int pitch, const int y, const int x)
@@ -125,7 +141,7 @@ namespace icf {
        luvg[luvgPitch * (y + 2 * 480) + x] = v;
    }

-    void bgr2Luv(const PtrStepSzb& bgr, PtrStepSzb luv)
+    void bgr2Luv(const cv::gpu::PtrStepSzb& bgr, cv::gpu::PtrStepSzb luv)
    {
        dim3 block(32, 8);
        dim3 grid(bgr.cols / 32, bgr.rows / 8);
@@ -207,7 +223,7 @@ namespace icf {
    texture<uchar,  cudaTextureType2D, cudaReadModeElementType> tgray;

    template<bool isDefaultNum>
-    __global__ void gray2hog(PtrStepSzb mag)
+    __global__ void gray2hog(cv::gpu::PtrStepSzb mag)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -222,7 +238,7 @@ namespace icf {
        mag( 480 * fast_angle_bin<isDefaultNum>(dy, dx) + y, x) = cmag;
    }

-    void gray2hog(const PtrStepSzb& gray, PtrStepSzb mag, const int bins)
+    void gray2hog(const cv::gpu::PtrStepSzb& gray, cv::gpu::PtrStepSzb mag, const int bins)
    {
        dim3 block(32, 8);
        dim3 grid(gray.cols / 32, gray.rows / 8);
@@ -303,7 +319,7 @@ namespace icf {
                    excluded = excluded || (suppessed == i);
                }

-            #if __CUDA_ARCH__ >= 120
+            #if defined __CUDA_ARCH__ && (__CUDA_ARCH__ >= 120)
                if (__all(excluded)) break;
            #endif
            }
@@ -325,8 +341,8 @@ namespace icf {
        }
    }

-    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections,
-        PtrStepSzb suppressed, cudaStream_t stream)
+    void suppress(const cv::gpu::PtrStepSzb& objects, cv::gpu::PtrStepSzb overlaps, cv::gpu::PtrStepSzi ndetections,
+        cv::gpu::PtrStepSzb suppressed, cudaStream_t stream)
    {
        int block = 192;
        int grid = 1;
@@ -348,7 +364,7 @@ namespace icf {
    template<typename Policy>
    struct PrefixSum
    {
-    __device static void apply(float& impact)
+    __device_inline__ static void apply(float& impact)
        {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
    #pragma unroll
@@ -442,6 +458,7 @@ namespace icf {
    {
        x += area.x;
        y += area.y;
+
        int a = tex2D(thogluv, x, y);
        int b = tex2D(thogluv, x + area.z, y);
        int c = tex2D(thogluv, x + area.z, y + area.w);
@@ -454,7 +471,7 @@ namespace icf {

 template<typename Policy>
 template<bool isUp>
-__device void CascadeInvoker<Policy>::detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const
+__device_inline__ void CascadeInvoker<Policy>::detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const
 {
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int x = blockIdx.x;
@@ -527,8 +544,8 @@ __global__ void soft_cascade(const CascadeInvoker<Policy> invoker, Detection* ob
 }

 template<typename Policy>
-void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
-    PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const
+void CascadeInvoker<Policy>::operator()(const cv::gpu::PtrStepSzb& roi, const cv::gpu::PtrStepSzi& hogluv,
+    cv::gpu::PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const
 {
    int fw = roi.rows;
    int fh = roi.cols;
@@ -560,8 +577,7 @@ void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi&
    }
 }

-template void CascadeInvoker<GK107PolicyX4>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
-    PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const;
+template void CascadeInvoker<GK107PolicyX4>::operator()(const cv::gpu::PtrStepSzb& roi, const cv::gpu::PtrStepSzi& hogluv,
+    cv::gpu::PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const;

-}
 }}}
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -22,7 +22,7 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
+//     and / or other materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
@@ -44,17 +44,20 @@
 #ifndef __OPENCV_ICF_HPP__
 #define __OPENCV_ICF_HPP__

-#include <opencv2/gpu/device/common.hpp>
+// #include <opencv2/gpu/device/common.hpp>
+#include "opencv2/core/cuda_devptrs.hpp"
+#include "cuda_runtime_api.h"

 #if defined __CUDACC__
-# define __device __device__ __forceinline__
+# define __device_inline__ __device__ __forceinline__
 #else
-# define __device
+# define __device_inline__
 #endif


-namespace cv { namespace gpu { namespace device {
-namespace icf {
+namespace cv { namespace softcascade { namespace device {
+
+typedef unsigned char uchar;

 struct Octave
 {
@@ -68,20 +71,19 @@ struct Octave
    : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {}
 };

-struct Level //is actually 24 bytes
+struct Level
 {
    int octave;
    int step;

    float relScale;
-    float scaling[2]; // calculated according to Dollal paper
+    float scaling[2];// calculated according to Dollar paper

-    // for 640x480 we can not get overflow
    uchar2 workRect;
    uchar2 objSize;

    Level(int idx, const Octave& oct, const float scale, const int w, const int h);
-    __device Level(){}
+    __device_inline__ Level(){}
 };

 struct Node
@@ -106,7 +108,7 @@ struct Detection
    int kind;

    Detection(){}
-    __device Detection(int _x, int _y, uchar _w, uchar _h, float c)
+    __device_inline__ Detection(int _x, int _y, uchar _w, uchar _h, float c)
    : x(_x), y(_y), w(_w), h(_h), confidence(c), kind(0) {};
 };

@@ -125,8 +127,8 @@ struct CascadeInvoker
 {
    CascadeInvoker(): levels(0), stages(0), nodes(0), leaves(0), scales(0) {}

-    CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzf& _stages,
-                   const PtrStepSzb& _nodes,  const PtrStepSzf& _leaves)
+    CascadeInvoker(const cv::gpu::PtrStepSzb& _levels, const cv::gpu::PtrStepSzf& _stages,
+                   const cv::gpu::PtrStepSzb& _nodes,  const cv::gpu::PtrStepSzf& _leaves)
    : levels((const Level*)_levels.ptr()),
      stages((const float*)_stages.ptr()),
      nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr()),
@@ -141,14 +143,13 @@ struct CascadeInvoker

    int scales;

-    void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
+    void operator()(const cv::gpu::PtrStepSzb& roi, const cv::gpu::PtrStepSzi& hogluv, cv::gpu::PtrStepSz<uchar4> objects,
        const int downscales, const cudaStream_t& stream = 0) const;

    template<bool isUp>
-    __device void detect(Detection* objects, const unsigned int ndetections, unsigned int* ctr, const int downscales) const;
+    __device_inline__ void detect(Detection* objects, const unsigned int ndetections, unsigned int* ctr, const int downscales) const;
 };

-}
 }}}

 #endif
\ No newline at end of file
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
--- a/modules/softcascade/src/precomp.hpp
+++ b/modules/softcascade/src/precomp.hpp
@@ -56,6 +56,7 @@

 namespace cv { namespace softcascade { namespace internal
 {
+
 namespace rnd {

 typedef cv::RNG_MT19937 engine;

--- a/modules/softcascade/src/softcascade_init.cpp
+++ b/modules/softcascade/src/softcascade_init.cpp
@@ -51,11 +51,34 @@ CV_INIT_ALGORITHM(Detector, "SoftCascade.Detector",
                  obj.info()->addParam(obj, "scales",      obj.scales);
                  obj.info()->addParam(obj, "rejCriteria", obj.rejCriteria));

+CV_INIT_ALGORITHM(SCascade, "CascadeDetector.SCascade",
+                  obj.info()->addParam(obj, "minScale", obj.minScale);
+                  obj.info()->addParam(obj, "maxScale", obj.maxScale);
+                  obj.info()->addParam(obj, "scales",   obj.scales));

 bool initModule_softcascade(void)
 {
+    Ptr<Algorithm> sc = createSCascade();
    Ptr<Algorithm> sc1 = createDetector();
-    return (sc1->info() != 0);
+    return (sc1->info() != 0) && (sc->info() != 0);
+}
+
+namespace internal {
+// Reports an error with code CV_GpuApiCallError.
+// error_string is the error text to report; file, line and func identify the
+// call site (func may be null, in which case "unknown function" is used).
+// If an exception is already in flight (std::uncaught_exception()), the
+// message is written to std::cerr instead of throwing; otherwise a
+// cv::Exception is handed to cv::error.
+void error(const char *error_string, const char *file, const int line, const char *func)
+{
+    int code = CV_GpuApiCallError;
+
+    // Resolve the function name once so that neither branch receives a null pointer.
+    const char* function = func ? func : "unknown function";
+
+    if (std::uncaught_exception())
+    {
+        const char* errorStr = cvErrorStr(code);
+
+        std::cerr << "OpenCV Error: " << errorStr << "(" << error_string << ") in " << function << ", file " << file << ", line " << line;
+        std::cerr.flush();
+    }
+    else
+        cv::error( cv::Exception(code, error_string, function, file, line) );
+}
 }

 } }
\ No newline at end of file
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -41,10 +41,11 @@
 //M*/

 #include "test_precomp.hpp"
+#include "opencv2/core/gpumat.hpp"

-#ifdef HAVE_CUDA

-using cv::gpu::GpuMat;
+#ifdef HAVE_CUDA
+using std::tr1::get;

 // show detection results on input image with cv::imshow
 //#define SHOW_DETECTIONS
@@ -59,7 +60,7 @@ using cv::gpu::GpuMat;

 static std::string path(std::string relative)
 {
-    return cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/" + relative;
+    return cvtest::TS::ptr()->get_data_path() + "cascadeandhog/" + relative;
 }

 TEST(SCascadeTest, readCascade)
@@ -67,7 +68,7 @@ TEST(SCascadeTest, readCascade)
    std::string xml = path("cascades/inria_caltech-17.01.2013.xml");
    cv::FileStorage fs(xml, cv::FileStorage::READ);

-    cv::gpu::SCascade cascade;
+    cv::softcascade::SCascade cascade;

    ASSERT_TRUE(fs.isOpened());
    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
@@ -75,7 +76,7 @@ TEST(SCascadeTest, readCascade)

 namespace
 {
-    typedef cv::gpu::SCascade::Detection Detection;
+    typedef cv::softcascade::SCascade::Detection Detection;

    cv::Rect getFromTable(int idx)
    {
@@ -97,7 +98,6 @@ namespace
        return rois[idx];
    }

-
    void print(std::ostream &out, const Detection& d)
    {
    #if defined SHOW_DETECTIONS
@@ -156,36 +156,36 @@ namespace
 #endif
 }

-PARAM_TEST_CASE(SCascadeTestRoi, cv::gpu::DeviceInfo, std::string, std::string, int)
+class SCascadeTestRoi : public ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string, std::string, int> >
 {
    virtual void SetUp()
    {
-        cv::gpu::setDevice(GET_PARAM(0).deviceID());
+        cv::gpu::setDevice(get<0>(GetParam()).deviceID());
    }
 };

-GPU_TEST_P(SCascadeTestRoi, Detect)
+TEST_P(SCascadeTestRoi, Detect)
 {
-    cv::Mat coloredCpu = cv::imread(path(GET_PARAM(2)));
+    cv::Mat coloredCpu = cv::imread(path(get<2>(GetParam())));
    ASSERT_FALSE(coloredCpu.empty());

-    cv::gpu::SCascade cascade;
+    cv::softcascade::SCascade cascade;

-    cv::FileStorage fs(path(GET_PARAM(1)), cv::FileStorage::READ);
+    cv::FileStorage fs(path(get<1>(GetParam())), cv::FileStorage::READ);
    ASSERT_TRUE(fs.isOpened());

    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));

-    GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(colored.size(), CV_8UC1);
+    cv::gpu::GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(colored.size(), CV_8UC1);
    rois.setTo(0);

-    int nroi = GET_PARAM(3);
+    int nroi = get<3>(GetParam());
    cv::Mat result(coloredCpu);
    cv::RNG rng;
    for (int i = 0; i < nroi; ++i)
    {
        cv::Rect r = getFromTable(rng(10));
-        GpuMat sub(rois, r);
+        cv::gpu::GpuMat sub(rois, r);
        sub.setTo(1);
        cv::rectangle(result, r, cv::Scalar(0, 0, 255, 255), 1);
    }
@@ -194,7 +194,7 @@ GPU_TEST_P(SCascadeTestRoi, Detect)
    cascade.detect(colored, rois, objectBoxes);

    cv::Mat dt(objectBoxes);
-    typedef cv::gpu::SCascade::Detection Detection;
+    typedef cv::softcascade::SCascade::Detection Detection;

    Detection* dts = ((Detection*)dt.data) + 1;
    int* count = dt.ptr<int>(0);
@@ -211,15 +211,13 @@ GPU_TEST_P(SCascadeTestRoi, Detect)
    SHOW(result);
 }

-INSTANTIATE_TEST_CASE_P(GPU_SoftCascade, SCascadeTestRoi, testing::Combine(
+INSTANTIATE_TEST_CASE_P(cuda_accelerated, SCascadeTestRoi, testing::Combine(
    ALL_DEVICES,
    testing::Values(std::string("cascades/inria_caltech-17.01.2013.xml"),
                    std::string("cascades/sc_cvpr_2012_to_opencv_new_format.xml")),
    testing::Values(std::string("images/image_00000000_0.png")),
    testing::Range(0, 5)));

-////////////////////////////////////////
-
 namespace {

 struct Fixture
@@ -232,23 +230,24 @@ struct Fixture
 };
 }

-PARAM_TEST_CASE(SCascadeTestAll, cv::gpu::DeviceInfo, Fixture)
+typedef std::tr1::tuple<cv::gpu::DeviceInfo, Fixture> SCascadeTestAllFixture;
+class SCascadeTestAll : public ::testing::TestWithParam<SCascadeTestAllFixture>
 {
-
+protected:
    std::string xml;
    int expected;

    virtual void SetUp()
    {
-        cv::gpu::setDevice(GET_PARAM(0).deviceID());
-        xml = path(GET_PARAM(1).path);
-        expected = GET_PARAM(1).expected;
+        cv::gpu::setDevice(get<0>(GetParam()).deviceID());
+        xml = path(get<1>(GetParam()).path);
+        expected = get<1>(GetParam()).expected;
    }
 };

-GPU_TEST_P(SCascadeTestAll, detect)
+TEST_P(SCascadeTestAll, detect)
 {
-    cv::gpu::SCascade cascade;
+    cv::softcascade::SCascade cascade;

    cv::FileStorage fs(xml, cv::FileStorage::READ);
    ASSERT_TRUE(fs.isOpened());
@@ -258,12 +257,12 @@ GPU_TEST_P(SCascadeTestAll, detect)
    cv::Mat coloredCpu = cv::imread(path("images/image_00000000_0.png"));
    ASSERT_FALSE(coloredCpu.empty());

-    GpuMat colored(coloredCpu), objectBoxes, rois(colored.size(), CV_8UC1);
+    cv::gpu::GpuMat colored(coloredCpu), objectBoxes, rois(colored.size(), CV_8UC1);
    rois.setTo(1);

    cascade.detect(colored, rois, objectBoxes);

-    typedef cv::gpu::SCascade::Detection Detection;
+    typedef cv::softcascade::SCascade::Detection Detection;
    cv::Mat dt(objectBoxes);


@@ -283,9 +282,9 @@ GPU_TEST_P(SCascadeTestAll, detect)
    ASSERT_EQ(*count, expected);
 }

-GPU_TEST_P(SCascadeTestAll, detectStream)
+TEST_P(SCascadeTestAll, detectStream)
 {
-    cv::gpu::SCascade cascade;
+    cv::softcascade::SCascade cascade;

    cv::FileStorage fs(xml, cv::FileStorage::READ);
    ASSERT_TRUE(fs.isOpened());
@@ -295,7 +294,7 @@ GPU_TEST_P(SCascadeTestAll, detectStream)
    cv::Mat coloredCpu = cv::imread(path("images/image_00000000_0.png"));
    ASSERT_FALSE(coloredCpu.empty());

-    GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(colored.size(), CV_8UC1);
+    cv::gpu::GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(colored.size(), CV_8UC1);
    rois.setTo(cv::Scalar::all(1));

    cv::gpu::Stream s;
@@ -304,14 +303,14 @@ GPU_TEST_P(SCascadeTestAll, detectStream)
    cascade.detect(colored, rois, objectBoxes, s);
    s.waitForCompletion();

-    typedef cv::gpu::SCascade::Detection Detection;
+    typedef cv::softcascade::SCascade::Detection Detection;
    cv::Mat detections(objectBoxes);
    int a = *(detections.ptr<int>(0));
    ASSERT_EQ(a, expected);
 }

-INSTANTIATE_TEST_CASE_P(GPU_SoftCascade, SCascadeTestAll, testing::Combine( ALL_DEVICES,
+INSTANTIATE_TEST_CASE_P(cuda_accelerated, SCascadeTestAll, testing::Combine( ALL_DEVICES,
                    testing::Values(Fixture("cascades/inria_caltech-17.01.2013.xml", 7),
                                    Fixture("cascades/sc_cvpr_2012_to_opencv_new_format.xml", 1291))));

-#endif
+#endif
\ No newline at end of file
--- a/modules/softcascade/test/test_main.cpp
+++ b/modules/softcascade/test/test_main.cpp
@@ -42,4 +42,4 @@

 #include "test_precomp.hpp"

-CV_TEST_MAIN("cv")
+CV_TEST_MAIN("cv")
\ No newline at end of file
--- a/modules/softcascade/test/test_precomp.hpp
+++ b/modules/softcascade/test/test_precomp.hpp
@@ -55,5 +55,6 @@
 # include "opencv2/softcascade.hpp"
 # include "opencv2/imgproc.hpp"
 # include "opencv2/highgui.hpp"
+# include "utility.hpp"

 #endif
--- a/modules/softcascade/test/utility.cpp
+++ b/modules/softcascade/test/utility.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+using namespace cvtest;
+using namespace testing;
+using namespace testing::internal;
+
+//////////////////////////////////////////////////////////////////////
+// Gpu devices
+
+// Returns true only when TargetArchs::builtWith(feature) holds and the given
+// device reports support for the same feature.
+bool supportFeature(const DeviceInfo& info, FeatureSet feature)
+{
+    // The device is queried only if the build check passes.
+    if (!TargetArchs::builtWith(feature))
+        return false;
+
+    return info.supports(feature);
+}
+
+// Returns the single shared DeviceManager, which is a function-local static
+// and therefore constructed the first time this function is called.
+DeviceManager& DeviceManager::instance()
+{
+    static DeviceManager manager;
+    return manager;
+}
+
+// Restricts the managed device list to the single device with index i.
+// Raises CV_StsBadArg through CV_Error when i is outside
+// [0, getCudaEnabledDeviceCount()) or when the device is not compatible with
+// the current build. All checks run before the list is modified, so a
+// rejected call leaves the previously loaded devices in place.
+void DeviceManager::load(int i)
+{
+    std::ostringstream msg;
+
+    if (i < 0 || i >= getCudaEnabledDeviceCount())
+    {
+        msg << "Incorrect device number - " << i;
+        CV_Error(CV_StsBadArg, msg.str());
+    }
+
+    DeviceInfo info(i);
+
+    if (!info.isCompatible())
+    {
+        msg << "Device " << i << " [" << info.name() << "] is NOT compatible with current GPU module build";
+        CV_Error(CV_StsBadArg, msg.str());
+    }
+
+    // Replace the list only once the requested device is known to be usable.
+    devices_.clear();
+    devices_.reserve(1);
+    devices_.push_back(info);
+}
+
+// Rebuilds the device list with every device reported by
+// getCudaEnabledDeviceCount() that is compatible with the current build.
+// The list ends up empty when the reported count is zero or negative.
+void DeviceManager::loadAll()
+{
+    const int deviceCount = getCudaEnabledDeviceCount();
+
+    devices_.clear();
+
+    // A negative count passed to reserve() would be converted to a huge
+    // unsigned size, so bail out before reserving when nothing can be listed.
+    if (deviceCount <= 0)
+        return;
+
+    devices_.reserve(deviceCount);
+
+    for (int i = 0; i < deviceCount; ++i)
+    {
+        DeviceInfo info(i);
+
+        // Devices the current build cannot run on are left out.
+        if (!info.isCompatible())
+            continue;
+
+        devices_.push_back(info);
+    }
+}
+
+#endif // HAVE_CUDA
--- a/modules/gpu/src/gpu_init.cpp
+++ b/modules/gpu/src/gpu_init.cpp
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
 //
 //
-// License Agreement
-// For Open Source Computer Vision Library
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
 //
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
 //
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
@@ -40,20 +39,37 @@
 //
 //M*/

-#include "precomp.hpp"
+#ifndef __OPENCV_SOFTCASCADE_TEST_UTILITY_HPP__
+#define __OPENCV_SOFTCASCADE_TEST_UTILITY_HPP__

-namespace cv { namespace gpu
-{
+#include "opencv2/core.hpp"
+#include "opencv2/core/gpumat.hpp"
+#include "opencv2/ts.hpp"
+
+//////////////////////////////////////////////////////////////////////
+// Gpu devices
+//! Returns true if the device supports the specified feature and the gpu module was built with support for that feature.
+bool supportFeature(const cv::gpu::DeviceInfo& info, cv::gpu::FeatureSet feature);

-CV_INIT_ALGORITHM(SCascade, "CascadeDetector.SCascade",
-                  obj.info()->addParam(obj, "minScale", obj.minScale);
-                  obj.info()->addParam(obj, "maxScale", obj.maxScale);
-                  obj.info()->addParam(obj, "scales",   obj.scales));

-bool initModule_gpu(void)
+#if defined(HAVE_CUDA)
+class DeviceManager
 {
-    Ptr<Algorithm> sc = createSCascade();
-    return sc->info() != 0;
-}
+public:
+    static DeviceManager& instance();
+
+    void load(int i);
+    void loadAll();
+
+    const std::vector<cv::gpu::DeviceInfo>& values() const { return devices_; }
+
+private:
+    std::vector<cv::gpu::DeviceInfo> devices_;
+    DeviceManager() {loadAll();}
+};
+# define ALL_DEVICES testing::ValuesIn(DeviceManager::instance().values())
+#else
+# define ALL_DEVICES testing::ValuesIn(std::vector<cv::gpu::DeviceInfo>())
+#endif

-} }
\ No newline at end of file
+#endif // __OPENCV_SOFTCASCADE_TEST_UTILITY_HPP__
--- a/samples/cpp/peopledetect.cpp
+++ b/samples/cpp/peopledetect.cpp
-#include "opencv2/imgproc/imgproc.hpp"
-#include "opencv2/objdetect/objdetect.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include <opencv2/softcascade/softcascade.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/objdetect.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/softcascade.hpp>

 #include <iostream>
 #include <vector>

--- a/samples/gpu/CMakeLists.txt
+++ b/samples/gpu/CMakeLists.txt
 SET(OPENCV_GPU_SAMPLES_REQUIRED_DEPS opencv_core opencv_flann opencv_imgproc opencv_highgui
                                     opencv_ml opencv_video opencv_objdetect opencv_features2d
                                     opencv_calib3d opencv_legacy opencv_contrib opencv_gpu
-                                     opencv_nonfree)
+                                     opencv_nonfree opencv_softcascade)

 ocv_check_dependencies(${OPENCV_GPU_SAMPLES_REQUIRED_DEPS})


--- a/samples/gpu/softcascade.cpp
+++ b/samples/gpu/softcascade.cpp
-#include <opencv2/gpu/gpu.hpp>
-#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/gpu.hpp>
+#include <opencv2/softcascade.hpp>
+#include <opencv2/highgui.hpp>
 #include <iostream>

 int main(int argc, char** argv)
@@ -46,7 +47,7 @@ int main(int argc, char** argv)
    float maxScale =  parser.get<float>("max_scale");
    int scales     =  parser.get<int>("total_scales");

-    using cv::gpu::SCascade;
+    using cv::softcascade::SCascade;
    SCascade cascade(minScale, maxScale, scales);

    if (!cascade.load(fs.getFirstTopLevelNode()))
@@ -79,7 +80,7 @@ int main(int argc, char** argv)
        cascade.detect(dframe, roi, objects);

        cv::Mat dt(objects);
-        typedef cv::gpu::SCascade::Detection Detection;
+        typedef cv::softcascade::SCascade::Detection Detection;

        Detection* dts = ((Detection*)dt.data) + 1;
        int* count = dt.ptr<int>(0);