Commit 4b234fa0 authored by Andrey Pavlenko, committed by OpenCV Buildbot

Merge pull request #1042 from jet47:gpuimgproc-refactoring

parents 73b5cc35 39a25115
@@ -86,13 +86,14 @@ PERF_TEST_P(Image, HoughLinesP, testing::Values(std::string("im1_1280x800.jpg"))
{
cv::gpu::GpuMat d_image(image);
cv::gpu::GpuMat d_lines;
cv::gpu::HoughLinesBuf d_buf;
cv::gpu::HoughLinesP(d_image, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap);
cv::Ptr<cv::gpu::HoughSegmentDetector> hough = cv::gpu::createHoughSegmentDetector(rho, theta, minLineLenght, maxLineGap);
hough->detect(d_image, d_lines);
TEST_CYCLE()
{
cv::gpu::HoughLinesP(d_image, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap);
hough->detect(d_image, d_lines);
}
}
else
@@ -147,17 +148,17 @@ PERF_TEST_P(Image_Depth, GoodFeaturesToTrack,
if (PERF_RUN_GPU())
{
cv::gpu::GoodFeaturesToTrackDetector_GPU d_detector(maxCorners, qualityLevel, minDistance, blockSize, useHarrisDetector, k);
cv::Ptr<cv::gpu::CornersDetector> detector = cv::gpu::createGoodFeaturesToTrackDetector(src.type(), maxCorners, qualityLevel, minDistance, blockSize, useHarrisDetector, k);
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_mask(mask);
cv::gpu::GpuMat d_pts;
d_detector(d_src, d_pts, d_mask);
detector->detect(d_src, d_pts, d_mask);
TEST_CYCLE()
{
d_detector(d_src, d_pts, d_mask);
detector->detect(d_src, d_pts, d_mask);
}
}
else
@@ -6,4 +6,4 @@ set(the_description "GPU-accelerated Image Processing")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4100 /wd4324 /wd4512 /wd4515 -Wundef -Wmissing-declarations -Wshadow -Wunused-parameter)
ocv_define_module(gpuimgproc opencv_imgproc opencv_gpufilters OPTIONAL opencv_gpuarithm)
ocv_define_module(gpuimgproc opencv_imgproc OPTIONAL opencv_gpuarithm opencv_gpufilters)
@@ -6,16 +6,16 @@ Color space processing
gpu::cvtColor
-----------------
-------------
Converts an image from one color space to another.
.. ocv:function:: void gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::cvtColor(InputArray src, OutputArray dst, int code, int dcn = 0, Stream& stream = Stream::Null())
:param src: Source image with ``CV_8U`` , ``CV_16U`` , or ``CV_32F`` depth and 1, 3, or 4 channels.
:param dst: Destination image with the same size and depth as ``src`` .
:param dst: Destination image.
:param code: Color space conversion code. For details, see :ocv:func:`cvtColor` . Conversion to/from Luv and Bayer color spaces is not supported.
:param code: Color space conversion code. For details, see :ocv:func:`cvtColor` .
:param dcn: Number of channels in the destination image. If the parameter is 0, the number of the channels is derived automatically from ``src`` and the ``code`` .
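A minimal host-side sketch of the refactored ``gpu::cvtColor`` overload (not part of this patch; header locations and the download step assume this development snapshot of the ``gpu`` modules)::

    // Hedged example: BGR -> grayscale on the GPU via the InputArray overload.
    #include <opencv2/core.hpp>
    #include <opencv2/imgproc.hpp>     // cv::COLOR_* conversion codes
    #include <opencv2/gpuimgproc.hpp>

    void toGrayGpu(const cv::Mat& bgr, cv::Mat& gray)
    {
        cv::gpu::GpuMat d_bgr(bgr), d_gray;
        cv::gpu::cvtColor(d_bgr, d_gray, cv::COLOR_BGR2GRAY);
        d_gray.download(gray);
    }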
@@ -27,11 +27,45 @@ Converts an image from one color space to another.
gpu::demosaicing
----------------
Converts an image from Bayer pattern to RGB or grayscale.
.. ocv:function:: void gpu::demosaicing(InputArray src, OutputArray dst, int code, int dcn = -1, Stream& stream = Stream::Null())
:param src: Source image (8-bit or 16-bit single channel).
:param dst: Destination image.
:param code: Color space conversion code (see the description below).
:param dcn: Number of channels in the destination image. If the parameter is 0, the number of the channels is derived automatically from ``src`` and the ``code`` .
:param stream: Stream for the asynchronous version.
The function can do the following transformations:
* Demosaicing using bilinear interpolation
* ``COLOR_BayerBG2GRAY`` , ``COLOR_BayerGB2GRAY`` , ``COLOR_BayerRG2GRAY`` , ``COLOR_BayerGR2GRAY``
* ``COLOR_BayerBG2BGR`` , ``COLOR_BayerGB2BGR`` , ``COLOR_BayerRG2BGR`` , ``COLOR_BayerGR2BGR``
* Demosaicing using Malvar-He-Cutler algorithm ([MHT2011]_)
* ``COLOR_BayerBG2GRAY_MHT`` , ``COLOR_BayerGB2GRAY_MHT`` , ``COLOR_BayerRG2GRAY_MHT`` , ``COLOR_BayerGR2GRAY_MHT``
* ``COLOR_BayerBG2BGR_MHT`` , ``COLOR_BayerGB2BGR_MHT`` , ``COLOR_BayerRG2BGR_MHT`` , ``COLOR_BayerGR2BGR_MHT``
.. seealso:: :ocv:func:`cvtColor`
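A hedged sketch of the Malvar-He-Cutler path (the ``_MHT`` codes are assumed to live in the ``cv::gpu`` namespace in this snapshot)::

    #include <opencv2/core.hpp>
    #include <opencv2/gpuimgproc.hpp>

    // Demosaic a CV_8UC1 BG-pattern mosaic into a 3-channel BGR image.
    void demosaicBG(const cv::Mat& raw, cv::Mat& bgr)
    {
        cv::gpu::GpuMat d_raw(raw), d_bgr;
        cv::gpu::demosaicing(d_raw, d_bgr, cv::gpu::COLOR_BayerBG2BGR_MHT, 3);
        d_bgr.download(bgr);
    }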
gpu::swapChannels
-----------------
Exchanges the color channels of an image in-place.
.. ocv:function:: void gpu::swapChannels(GpuMat& image, const int dstOrder[4], Stream& stream = Stream::Null())
.. ocv:function:: void gpu::swapChannels(InputOutputArray image, const int dstOrder[4], Stream& stream = Stream::Null())
:param image: Source image. Supports only ``CV_8UC4`` type.
@@ -43,11 +77,27 @@ The methods support arbitrary permutations of the original channels, including r
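A hedged in-place sketch of ``gpu::swapChannels`` (the permutation semantics follow the documentation above)::

    #include <opencv2/core.hpp>
    #include <opencv2/gpuimgproc.hpp>

    // Reorder BGRA -> RGBA in-place on a CV_8UC4 GpuMat.
    void bgraToRgba(cv::gpu::GpuMat& image)
    {
        // Channel n of the output takes channel dstOrder[n] of the input.
        const int dstOrder[4] = {2, 1, 0, 3};
        cv::gpu::swapChannels(image, dstOrder);
    }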
gpu::gammaCorrection
--------------------
Routines for correcting image color gamma.
.. ocv:function:: void gpu::gammaCorrection(InputArray src, OutputArray dst, bool forward = true, Stream& stream = Stream::Null())
:param src: Source image (3- or 4-channel 8 bit).
:param dst: Destination image.
:param forward: ``true`` for forward gamma correction or ``false`` for inverse gamma correction.
:param stream: Stream for the asynchronous version.
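A hedged sketch of forward correction on an 8-bit color image::

    #include <opencv2/core.hpp>
    #include <opencv2/gpuimgproc.hpp>

    void gammaCorrect(const cv::Mat& src, cv::Mat& dst)   // src: CV_8UC3 or CV_8UC4
    {
        cv::gpu::GpuMat d_src(src), d_dst;
        cv::gpu::gammaCorrection(d_src, d_dst, true);     // true = forward correction
        d_dst.download(dst);
    }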
gpu::alphaComp
-------------------
--------------
Composites two images using alpha opacity values contained in each image.
.. ocv:function:: void gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int alpha_op, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::alphaComp(InputArray img1, InputArray img2, OutputArray dst, int alpha_op, Stream& stream = Stream::Null())
:param img1: First image. Supports ``CV_8UC4`` , ``CV_16UC4`` , ``CV_32SC4`` and ``CV_32FC4`` types.
@@ -72,3 +122,7 @@ Composites two images using alpha opacity values contained in each image.
* **ALPHA_PREMUL**
:param stream: Stream for the asynchronous version.
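A hedged sketch assuming the ``ALPHA_OVER`` operation from the list elided above::

    #include <opencv2/core.hpp>
    #include <opencv2/gpuimgproc.hpp>

    // Composite two CV_8UC4 images; alpha is taken from the images themselves.
    void over(const cv::Mat& img1, const cv::Mat& img2, cv::Mat& out)
    {
        cv::gpu::GpuMat d1(img1), d2(img2), d_out;
        cv::gpu::alphaComp(d1, d2, d_out, cv::gpu::ALPHA_OVER);
        d_out.download(out);
    }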
.. [MHT2011] Pascal Getreuer, Malvar-He-Cutler Linear Image Demosaicking, Image Processing On Line, 2011
@@ -5,15 +5,41 @@ Feature Detection
gpu::cornerHarris
---------------------
Computes the Harris cornerness criteria at each image pixel.
gpu::CornernessCriteria
-----------------------
.. ocv:class:: gpu::CornernessCriteria : public Algorithm
.. ocv:function:: void gpu::cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101)
Base class for Cornerness Criteria computation. ::
:param src: Source image. Only ``CV_8UC1`` and ``CV_32FC1`` images are supported for now.
class CV_EXPORTS CornernessCriteria : public Algorithm
{
public:
virtual void compute(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
};
gpu::CornernessCriteria::compute
--------------------------------
Computes the cornerness criteria at each image pixel.
.. ocv:function:: void gpu::CornernessCriteria::compute(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
:param src: Source image.
:param dst: Destination image containing cornerness values. It will have the same size as ``src`` and ``CV_32FC1`` type.
:param dst: Destination image containing cornerness values. It has the same size as ``src`` and ``CV_32FC1`` type.
:param stream: Stream for the asynchronous version.
gpu::createHarrisCorner
-----------------------
Creates implementation for Harris cornerness criteria.
.. ocv:function:: Ptr<CornernessCriteria> gpu::createHarrisCorner(int srcType, int blockSize, int ksize, double k, int borderType = BORDER_REFLECT101)
:param srcType: Input source type. Only ``CV_8UC1`` and ``CV_32FC1`` are supported for now.
:param blockSize: Neighborhood size.
@@ -27,55 +53,70 @@ Computes the Harris cornerness criteria at each image pixel.
gpu::cornerMinEigenVal
--------------------------
Computes the minimum eigenvalue of a 2x2 derivative covariance matrix at each pixel (the cornerness criteria).
.. ocv:function:: void gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101)
.. ocv:function:: void gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType=BORDER_REFLECT101)
.. ocv:function:: void gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType=BORDER_REFLECT101, Stream& stream = Stream::Null())
gpu::createMinEigenValCorner
----------------------------
Creates implementation for the minimum eigenvalue of a 2x2 derivative covariance matrix (the cornerness criteria).
:param src: Source image. Only ``CV_8UC1`` and ``CV_32FC1`` images are supported for now.
.. ocv:function:: Ptr<CornernessCriteria> gpu::createMinEigenValCorner(int srcType, int blockSize, int ksize, int borderType = BORDER_REFLECT101)
:param dst: Destination image containing cornerness values. The size is the same. The type is ``CV_32FC1`` .
:param srcType: Input source type. Only ``CV_8UC1`` and ``CV_32FC1`` are supported for now.
:param blockSize: Neighborhood size.
:param ksize: Aperture parameter for the Sobel operator.
:param borderType: Pixel extrapolation method. Only ``BORDER_REFLECT101`` and ``BORDER_REPLICATE`` are supported for now.
.. seealso:: :ocv:func:`cornerMinEigenVal`
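A hedged end-to-end sketch of the new interface, combining ``createHarrisCorner`` with ``CornernessCriteria::compute`` (parameter values are illustrative)::

    #include <opencv2/core.hpp>
    #include <opencv2/gpuimgproc.hpp>

    // Compute a CV_32FC1 Harris response map for a CV_8UC1 image.
    void harrisResponse(const cv::Mat& gray, cv::Mat& response)
    {
        cv::Ptr<cv::gpu::CornernessCriteria> harris =
            cv::gpu::createHarrisCorner(gray.type(), 5 /*blockSize*/, 3 /*ksize*/, 0.04 /*k*/);

        cv::gpu::GpuMat d_gray(gray), d_dst;
        harris->compute(d_gray, d_dst);
        d_dst.download(response);
    }

The same pattern applies to ``createMinEigenValCorner``; only the factory call changes.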
gpu::GoodFeaturesToTrackDetector_GPU
------------------------------------
.. ocv:class:: gpu::GoodFeaturesToTrackDetector_GPU
gpu::CornersDetector
--------------------
.. ocv:class:: gpu::CornersDetector : public Algorithm
Class used for strong corners detection on an image. ::
Base class for Corners Detector. ::
class GoodFeaturesToTrackDetector_GPU
class CV_EXPORTS CornersDetector : public Algorithm
{
public:
explicit GoodFeaturesToTrackDetector_GPU(int maxCorners_ = 1000, double qualityLevel_ = 0.01, double minDistance_ = 0.0,
int blockSize_ = 3, bool useHarrisDetector_ = false, double harrisK_ = 0.04);
virtual void detect(InputArray image, OutputArray corners, InputArray mask = noArray()) = 0;
};
void operator ()(const GpuMat& image, GpuMat& corners, const GpuMat& mask = GpuMat());
int maxCorners;
double qualityLevel;
double minDistance;
int blockSize;
bool useHarrisDetector;
double harrisK;
gpu::CornersDetector::detect
----------------------------
Determines strong corners on an image.
void releaseMemory();
};
.. ocv:function:: void gpu::CornersDetector::detect(InputArray image, OutputArray corners, InputArray mask = noArray())
:param image: Input 8-bit or floating-point 32-bit, single-channel image.
:param corners: Output vector of detected corners (a 1-row matrix of type ``CV_32FC2`` containing the corner positions).
:param mask: Optional region of interest. If the image is not empty (it needs to have the type ``CV_8UC1`` and the same size as ``image`` ), it specifies the region in which the corners are detected.
gpu::createGoodFeaturesToTrackDetector
--------------------------------------
Creates implementation for :ocv:class:`gpu::CornersDetector` .
.. ocv:function:: Ptr<CornersDetector> gpu::createGoodFeaturesToTrackDetector(int srcType, int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0, int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04)
:param srcType: Input source type. Only ``CV_8UC1`` and ``CV_32FC1`` are supported for now.
:param maxCorners: Maximum number of corners to return. If more corners are found than this limit, the strongest of them are returned.
:param qualityLevel: Parameter characterizing the minimal accepted quality of image corners. The parameter value is multiplied by the best corner quality measure, which is the minimal eigenvalue (see :ocv:func:`cornerMinEigenVal` ) or the Harris function response (see :ocv:func:`cornerHarris` ). The corners with the quality measure less than the product are rejected. For example, if the best corner has the quality measure = 1500, and the ``qualityLevel=0.01`` , then all the corners with the quality measure less than 15 are rejected.
:param minDistance: Minimum possible Euclidean distance between the returned corners.
:param blockSize: Size of an average block for computing a derivative covariance matrix over each pixel neighborhood. See :ocv:func:`cornerEigenValsAndVecs` .
:param useHarrisDetector: Parameter indicating whether to use a Harris detector (see :ocv:func:`cornerHarris`) or :ocv:func:`cornerMinEigenVal`.
The class finds the most prominent corners in the image.
:param harrisK: Free parameter of the Harris detector.
.. seealso:: :ocv:func:`goodFeaturesToTrack`
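A hedged sketch of corner detection through the new factory (the empty-result guard is defensive, not mandated by the API)::

    #include <vector>
    #include <opencv2/core.hpp>
    #include <opencv2/gpuimgproc.hpp>

    void findCorners(const cv::Mat& gray, std::vector<cv::Point2f>& pts)   // gray: CV_8UC1
    {
        cv::Ptr<cv::gpu::CornersDetector> detector =
            cv::gpu::createGoodFeaturesToTrackDetector(gray.type(), 1000, 0.01, 10.0);

        cv::gpu::GpuMat d_gray(gray), d_corners;
        detector->detect(d_gray, d_corners);           // 1-row CV_32FC2 matrix

        cv::Mat h_corners(d_corners);                  // download to host
        pts.clear();
        if (!h_corners.empty())
        {
            const cv::Point2f* begin = h_corners.ptr<cv::Point2f>();
            pts.assign(begin, begin + h_corners.cols);
        }
    }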
@@ -5,11 +5,89 @@ Histogram Calculation
gpu::calcHist
-------------
Calculates a histogram for a one-channel 8-bit image.
.. ocv:function:: void gpu::calcHist(InputArray src, OutputArray hist, Stream& stream = Stream::Null())
:param src: Source image with ``CV_8UC1`` type.
:param hist: Destination histogram with one row, 256 columns, and the ``CV_32SC1`` type.
:param stream: Stream for the asynchronous version.
gpu::equalizeHist
-----------------
Equalizes the histogram of a grayscale image.
.. ocv:function:: void gpu::equalizeHist(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::equalizeHist(InputArray src, OutputArray dst, InputOutputArray buf, Stream& stream = Stream::Null())
:param src: Source image with ``CV_8UC1`` type.
:param dst: Destination image.
:param buf: Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`equalizeHist`
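A hedged sketch of both functions on a ``CV_8UC1`` image::

    #include <opencv2/core.hpp>
    #include <opencv2/gpuimgproc.hpp>

    void histAndEqualize(const cv::Mat& gray, cv::Mat& hist, cv::Mat& equalized)
    {
        cv::gpu::GpuMat d_gray(gray), d_hist, d_eq;

        cv::gpu::calcHist(d_gray, d_hist);      // 1 x 256, CV_32SC1
        cv::gpu::equalizeHist(d_gray, d_eq);    // buffer-less overload

        d_hist.download(hist);
        d_eq.download(equalized);
    }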
gpu::CLAHE
----------
.. ocv:class:: gpu::CLAHE : public cv::CLAHE
Base class for Contrast Limited Adaptive Histogram Equalization. ::
class CV_EXPORTS CLAHE : public cv::CLAHE
{
public:
using cv::CLAHE::apply;
virtual void apply(InputArray src, OutputArray dst, Stream& stream) = 0;
};
gpu::CLAHE::apply
-----------------
Equalizes the histogram of a grayscale image using Contrast Limited Adaptive Histogram Equalization.
.. ocv:function:: void gpu::CLAHE::apply(InputArray src, OutputArray dst)
.. ocv:function:: void gpu::CLAHE::apply(InputArray src, OutputArray dst, Stream& stream)
:param src: Source image with ``CV_8UC1`` type.
:param dst: Destination image.
:param stream: Stream for the asynchronous version.
gpu::createCLAHE
----------------
Creates implementation for :ocv:class:`gpu::CLAHE` .
.. ocv:function:: Ptr<gpu::CLAHE> gpu::createCLAHE(double clipLimit = 40.0, Size tileGridSize = Size(8, 8))
:param clipLimit: Threshold for contrast limiting.
:param tileGridSize: Size of grid for histogram equalization. Input image will be divided into equally sized rectangular tiles. ``tileGridSize`` defines the number of tiles in row and column.
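A hedged sketch mirroring the CPU ``cv::createCLAHE`` usage (default parameter values shown explicitly)::

    #include <opencv2/core.hpp>
    #include <opencv2/gpuimgproc.hpp>

    void claheEqualize(const cv::Mat& gray, cv::Mat& dst)   // gray: CV_8UC1
    {
        cv::Ptr<cv::gpu::CLAHE> clahe = cv::gpu::createCLAHE(40.0, cv::Size(8, 8));

        cv::gpu::GpuMat d_gray(gray), d_dst;
        clahe->apply(d_gray, d_dst);    // synchronous overload; a Stream overload also exists
        d_dst.download(dst);
    }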
gpu::evenLevels
-------------------
---------------
Computes levels with even distribution.
.. ocv:function:: void gpu::evenLevels(GpuMat& levels, int nLevels, int lowerLevel, int upperLevel)
.. ocv:function:: void gpu::evenLevels(OutputArray levels, int nLevels, int lowerLevel, int upperLevel)
:param levels: Destination array. ``levels`` has 1 row, ``nLevels`` columns, and the ``CV_32SC1`` type.
@@ -22,16 +100,16 @@ Computes levels with even distribution.
gpu::histEven
-----------------
-------------
Calculates a histogram with evenly distributed bins.
.. ocv:function:: void gpu::histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::histEven(InputArray src, OutputArray hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::histEven(const GpuMat& src, GpuMat& hist, GpuMat& buf, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::histEven(InputArray src, OutputArray hist, InputOutputArray buf, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::histEven( const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream=Stream::Null() )
.. ocv:function:: void gpu::histEven(InputArray src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null())
.. ocv:function:: void gpu::histEven( const GpuMat& src, GpuMat hist[4], GpuMat& buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream=Stream::Null() )
.. ocv:function:: void gpu::histEven(InputArray src, GpuMat hist[4], InputOutputArray buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null())
:param src: Source image. ``CV_8U``, ``CV_16U``, or ``CV_16S`` depth and 1 or 4 channels are supported. For a four-channel image, all channels are processed separately.
@@ -50,12 +128,16 @@ Calculates a histogram with evenly distributed bins.
gpu::histRange
------------------
--------------
Calculates a histogram with bins determined by the ``levels`` array.
.. ocv:function:: void gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::histRange(InputArray src, OutputArray hist, InputArray levels, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::histRange(InputArray src, OutputArray hist, InputArray levels, InputOutputArray buf, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buf, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null())
.. ocv:function:: void gpu::histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, Stream& stream = Stream::Null())
:param src: Source image. ``CV_8U`` , ``CV_16U`` , or ``CV_16S`` depth and 1 or 4 channels are supported. For a four-channel image, all channels are processed separately.
@@ -66,39 +148,3 @@ Calculates a histogram with bins determined by the ``levels`` array.
:param buf: Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
:param stream: Stream for the asynchronous version.
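A hedged single-channel sketch of ``histEven`` with 32 even bins over [0, 256)::

    #include <opencv2/core.hpp>
    #include <opencv2/gpuimgproc.hpp>

    void evenHistogram(const cv::Mat& gray, cv::Mat& hist)   // gray: CV_8UC1
    {
        cv::gpu::GpuMat d_gray(gray), d_hist;
        cv::gpu::histEven(d_gray, d_hist, 32, 0, 256);       // 1 x 32, CV_32SC1
        d_hist.download(hist);
    }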
gpu::calcHist
------------------
Calculates histogram for one channel 8-bit image.
.. ocv:function:: void gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null())
:param src: Source image.
:param hist: Destination histogram with one row, 256 columns, and the ``CV_32SC1`` type.
:param stream: Stream for the asynchronous version.
gpu::equalizeHist
------------------
Equalizes the histogram of a grayscale image.
.. ocv:function:: void gpu::equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null())
:param src: Source image.
:param dst: Destination image.
:param hist: Destination histogram with one row, 256 columns, and the ``CV_32SC1`` type.
:param buf: Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`equalizeHist`
@@ -70,9 +70,10 @@ PERF_TEST_P(Image_AppertureSz_L2gradient, Canny,
{
const cv::gpu::GpuMat d_image(image);
cv::gpu::GpuMat dst;
cv::gpu::CannyBuf d_buf;
TEST_CYCLE() cv::gpu::Canny(d_image, d_buf, dst, low_thresh, high_thresh, apperture_size, useL2gradient);
cv::Ptr<cv::gpu::CannyEdgeDetector> canny = cv::gpu::createCannyEdgeDetector(low_thresh, high_thresh, apperture_size, useL2gradient);
TEST_CYCLE() canny->detect(d_image, dst);
GPU_SANITY_CHECK(dst);
}
@@ -75,11 +75,10 @@ PERF_TEST_P(Image_Type_Border_BlockSz_ApertureSz, CornerHarris,
{
const cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat d_Dx;
cv::gpu::GpuMat d_Dy;
cv::gpu::GpuMat d_buf;
TEST_CYCLE() cv::gpu::cornerHarris(d_img, dst, d_Dx, d_Dy, d_buf, blockSize, apertureSize, k, borderMode);
cv::Ptr<cv::gpu::CornernessCriteria> harris = cv::gpu::createHarrisCorner(img.type(), blockSize, apertureSize, k, borderMode);
TEST_CYCLE() harris->compute(d_img, dst);
GPU_SANITY_CHECK(dst, 1e-4);
}
@@ -118,11 +117,10 @@ PERF_TEST_P(Image_Type_Border_BlockSz_ApertureSz, CornerMinEigenVal,
{
const cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat d_Dx;
cv::gpu::GpuMat d_Dy;
cv::gpu::GpuMat d_buf;
TEST_CYCLE() cv::gpu::cornerMinEigenVal(d_img, dst, d_Dx, d_Dy, d_buf, blockSize, apertureSize, borderMode);
cv::Ptr<cv::gpu::CornernessCriteria> minEigenVal = cv::gpu::createMinEigenValCorner(img.type(), blockSize, apertureSize, borderMode);
TEST_CYCLE() minEigenVal->compute(d_img, dst);
GPU_SANITY_CHECK(dst, 1e-4);
}
@@ -66,12 +66,12 @@ PERF_TEST_P(Image_MinDistance, GoodFeaturesToTrack,
if (PERF_RUN_GPU())
{
cv::gpu::GoodFeaturesToTrackDetector_GPU d_detector(maxCorners, qualityLevel, minDistance);
cv::Ptr<cv::gpu::CornersDetector> d_detector = cv::gpu::createGoodFeaturesToTrackDetector(image.type(), maxCorners, qualityLevel, minDistance);
const cv::gpu::GpuMat d_image(image);
cv::gpu::GpuMat pts;
TEST_CYCLE() d_detector(d_image, pts);
TEST_CYCLE() d_detector->detect(d_image, pts);
GPU_SANITY_CHECK(pts);
}
@@ -167,10 +167,9 @@ PERF_TEST_P(Sz, EqualizeHist,
{
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat d_hist;
cv::gpu::GpuMat d_buf;
TEST_CYCLE() cv::gpu::equalizeHist(d_src, dst, d_hist, d_buf);
TEST_CYCLE() cv::gpu::equalizeHist(d_src, dst, d_buf);
GPU_SANITY_CHECK(dst);
}
@@ -103,9 +103,10 @@ PERF_TEST_P(Sz, HoughLines,
{
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_lines;
cv::gpu::HoughLinesBuf d_buf;
TEST_CYCLE() cv::gpu::HoughLines(d_src, d_lines, d_buf, rho, theta, threshold);
cv::Ptr<cv::gpu::HoughLinesDetector> hough = cv::gpu::createHoughLinesDetector(rho, theta, threshold);
TEST_CYCLE() hough->detect(d_src, d_lines);
cv::Mat gpu_lines(d_lines.row(0));
cv::Vec2f* begin = gpu_lines.ptr<cv::Vec2f>(0);
@@ -151,9 +152,10 @@ PERF_TEST_P(Image, HoughLinesP,
{
const cv::gpu::GpuMat d_mask(mask);
cv::gpu::GpuMat d_lines;
cv::gpu::HoughLinesBuf d_buf;
TEST_CYCLE() cv::gpu::HoughLinesP(d_mask, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap);
cv::Ptr<cv::gpu::HoughSegmentDetector> hough = cv::gpu::createHoughSegmentDetector(rho, theta, minLineLenght, maxLineGap);
TEST_CYCLE() hough->detect(d_mask, d_lines);
cv::Mat gpu_lines(d_lines);
cv::Vec4i* begin = gpu_lines.ptr<cv::Vec4i>();
@@ -201,9 +203,10 @@ PERF_TEST_P(Sz_Dp_MinDist, HoughCircles,
{
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_circles;
cv::gpu::HoughCirclesBuf d_buf;
TEST_CYCLE() cv::gpu::HoughCircles(d_src, d_circles, d_buf, cv::HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius);
cv::Ptr<cv::gpu::HoughCirclesDetector> houghCircles = cv::gpu::createHoughCirclesDetector(dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius);
TEST_CYCLE() houghCircles->detect(d_src, d_circles);
cv::Mat gpu_circles(d_circles);
cv::Vec3f* begin = gpu_circles.ptr<cv::Vec3f>(0);
@@ -283,7 +286,7 @@ PERF_TEST_P(Method_Sz, GeneralizedHough,
const cv::gpu::GpuMat d_dy(dy);
cv::gpu::GpuMat posAndVotes;
cv::Ptr<cv::gpu::GeneralizedHough_GPU> d_hough = cv::gpu::GeneralizedHough_GPU::create(method);
cv::Ptr<cv::gpu::GeneralizedHough> d_hough = cv::gpu::GeneralizedHough::create(method);
if (method & GHT_ROTATION)
{
d_hough->set("maxAngle", 90.0);
@@ -76,7 +76,9 @@ PERF_TEST_P(Sz_TemplateSz_Cn_Method, MatchTemplate8U,
const cv::gpu::GpuMat d_templ(templ);
cv::gpu::GpuMat dst;
TEST_CYCLE() cv::gpu::matchTemplate(d_image, d_templ, dst, method);
cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), method);
TEST_CYCLE() alg->match(d_image, d_templ, dst);
GPU_SANITY_CHECK(dst, 1e-5, ERROR_RELATIVE);
}
@@ -116,7 +118,9 @@ PERF_TEST_P(Sz_TemplateSz_Cn_Method, MatchTemplate32F,
const cv::gpu::GpuMat d_templ(templ);
cv::gpu::GpuMat dst;
TEST_CYCLE() cv::gpu::matchTemplate(d_image, d_templ, dst, method);
cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), method);
TEST_CYCLE() alg->match(d_image, d_templ, dst);
GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
}
@@ -47,7 +47,7 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::bilateralFilter(const GpuMat&, GpuMat&, int, float, float, int, Stream&) { throw_no_cuda(); }
void cv::gpu::bilateralFilter(InputArray, OutputArray, int, float, float, int, Stream&) { throw_no_cuda(); }
#else
@@ -60,7 +60,7 @@ namespace cv { namespace gpu { namespace cudev
}
}}}
void cv::gpu::bilateralFilter(const GpuMat& src, GpuMat& dst, int kernel_size, float sigma_color, float sigma_spatial, int borderMode, Stream& s)
void cv::gpu::bilateralFilter(InputArray _src, OutputArray _dst, int kernel_size, float sigma_color, float sigma_spatial, int borderMode, Stream& stream)
{
using cv::gpu::cudev::imgproc::bilateral_filter_gpu;
@@ -79,18 +79,21 @@ void cv::gpu::bilateralFilter(const GpuMat& src, GpuMat& dst, int kernel_size, f
sigma_color = (sigma_color <= 0 ) ? 1 : sigma_color;
sigma_spatial = (sigma_spatial <= 0 ) ? 1 : sigma_spatial;
int radius = (kernel_size <= 0) ? cvRound(sigma_spatial*1.5) : kernel_size/2;
kernel_size = std::max(radius, 1)*2 + 1;
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
GpuMat src = _src.getGpuMat();
CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP );
const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
CV_Assert( func != 0 );
CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
dst.create(src.size(), src.type());
func(src, dst, kernel_size, sigma_spatial, sigma_color, borderMode, StreamAccessor::getStream(s));
func(src, dst, kernel_size, sigma_spatial, sigma_color, borderMode, StreamAccessor::getStream(stream));
}
#endif
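For reference, a hedged caller-side sketch of the refactored ``bilateralFilter`` signature above (the border mode is passed explicitly; parameter values are illustrative):

    #include <opencv2/core.hpp>
    #include <opencv2/gpuimgproc.hpp>

    void smoothBilateral(const cv::Mat& src, cv::Mat& dst)   // depth <= CV_32F, <= 4 channels
    {
        cv::gpu::GpuMat d_src(src), d_dst;
        cv::gpu::bilateralFilter(d_src, d_dst, 9 /*kernel_size*/,
                                 30.0f /*sigma_color*/, 7.0f /*sigma_spatial*/,
                                 cv::BORDER_REFLECT101);
        d_dst.download(dst);
    }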
@@ -47,7 +47,7 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::blendLinear(InputArray, InputArray, InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); }
#else
@@ -67,21 +67,28 @@ namespace cv { namespace gpu { namespace cudev
using namespace ::cv::gpu::cudev::blend;
void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
GpuMat& result, Stream& stream)
void cv::gpu::blendLinear(InputArray _img1, InputArray _img2, InputArray _weights1, InputArray _weights2,
OutputArray _result, Stream& stream)
{
CV_Assert(img1.size() == img2.size());
CV_Assert(img1.type() == img2.type());
CV_Assert(weights1.size() == img1.size());
CV_Assert(weights2.size() == img2.size());
CV_Assert(weights1.type() == CV_32F);
CV_Assert(weights2.type() == CV_32F);
GpuMat img1 = _img1.getGpuMat();
GpuMat img2 = _img2.getGpuMat();
GpuMat weights1 = _weights1.getGpuMat();
GpuMat weights2 = _weights2.getGpuMat();
CV_Assert( img1.size() == img2.size() );
CV_Assert( img1.type() == img2.type() );
CV_Assert( weights1.size() == img1.size() );
CV_Assert( weights2.size() == img2.size() );
CV_Assert( weights1.type() == CV_32FC1 );
CV_Assert( weights2.type() == CV_32FC1 );
const Size size = img1.size();
const int depth = img1.depth();
const int cn = img1.channels();
result.create(size, CV_MAKE_TYPE(depth, cn));
_result.create(size, CV_MAKE_TYPE(depth, cn));
GpuMat result = _result.getGpuMat();
switch (depth)
{
@@ -47,46 +47,10 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_no_cuda(); }
void cv::gpu::Canny(const GpuMat&, CannyBuf&, GpuMat&, double, double, int, bool) { throw_no_cuda(); }
void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, double, double, bool) { throw_no_cuda(); }
void cv::gpu::Canny(const GpuMat&, const GpuMat&, CannyBuf&, GpuMat&, double, double, bool) { throw_no_cuda(); }
void cv::gpu::CannyBuf::create(const Size&, int) { throw_no_cuda(); }
void cv::gpu::CannyBuf::release() { throw_no_cuda(); }
Ptr<CannyEdgeDetector> cv::gpu::createCannyEdgeDetector(double, double, int, bool) { throw_no_cuda(); return Ptr<CannyEdgeDetector>(); }
#else /* !defined (HAVE_CUDA) */
void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
{
if (apperture_size > 0)
{
ensureSizeIsEnough(image_size, CV_32SC1, dx);
ensureSizeIsEnough(image_size, CV_32SC1, dy);
if (apperture_size != 3)
{
filterDX = createDerivFilter(CV_8UC1, CV_32S, 1, 0, apperture_size, false, 1, BORDER_REPLICATE);
filterDY = createDerivFilter(CV_8UC1, CV_32S, 0, 1, apperture_size, false, 1, BORDER_REPLICATE);
}
}
ensureSizeIsEnough(image_size, CV_32FC1, mag);
ensureSizeIsEnough(image_size, CV_32SC1, map);
ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st1);
ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st2);
}
void cv::gpu::CannyBuf::release()
{
dx.release();
dy.release();
mag.release();
map.release();
st1.release();
st2.release();
}
namespace canny
{
void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
@@ -103,84 +67,168 @@ namespace canny
namespace
{
void CannyCaller(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
class CannyImpl : public CannyEdgeDetector
{
using namespace canny;
public:
CannyImpl(double low_thresh, double high_thresh, int apperture_size, bool L2gradient) :
low_thresh_(low_thresh), high_thresh_(high_thresh), apperture_size_(apperture_size), L2gradient_(L2gradient)
{
old_apperture_size_ = -1;
}
buf.map.setTo(Scalar::all(0));
calcMap(dx, dy, buf.mag, buf.map, low_thresh, high_thresh);
void detect(InputArray image, OutputArray edges);
void detect(InputArray dx, InputArray dy, OutputArray edges);
edgesHysteresisLocal(buf.map, buf.st1.ptr<ushort2>());
void setLowThreshold(double low_thresh) { low_thresh_ = low_thresh; }
double getLowThreshold() const { return low_thresh_; }
edgesHysteresisGlobal(buf.map, buf.st1.ptr<ushort2>(), buf.st2.ptr<ushort2>());
void setHighThreshold(double high_thresh) { high_thresh_ = high_thresh; }
double getHighThreshold() const { return high_thresh_; }
getEdges(buf.map, dst);
}
}
void setAppertureSize(int apperture_size) { apperture_size_ = apperture_size; }
int getAppertureSize() const { return apperture_size_; }
void cv::gpu::Canny(const GpuMat& src, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
{
CannyBuf buf;
Canny(src, buf, dst, low_thresh, high_thresh, apperture_size, L2gradient);
}
void setL2Gradient(bool L2gradient) { L2gradient_ = L2gradient; }
bool getL2Gradient() const { return L2gradient_; }
void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
{
using namespace canny;
void write(FileStorage& fs) const
{
fs << "name" << "Canny_GPU"
<< "low_thresh" << low_thresh_
<< "high_thresh" << high_thresh_
<< "apperture_size" << apperture_size_
<< "L2gradient" << L2gradient_;
}
void read(const FileNode& fn)
{
CV_Assert( String(fn["name"]) == "Canny_GPU" );
low_thresh_ = (double)fn["low_thresh"];
high_thresh_ = (double)fn["high_thresh"];
apperture_size_ = (int)fn["apperture_size"];
L2gradient_ = (int)fn["L2gradient"] != 0;
}
CV_Assert(src.type() == CV_8UC1);
private:
void createBuf(Size image_size);
void CannyCaller(GpuMat& edges);
double low_thresh_;
double high_thresh_;
int apperture_size_;
bool L2gradient_;
GpuMat dx_, dy_;
GpuMat mag_;
GpuMat map_;
GpuMat st1_, st2_;
#ifdef HAVE_OPENCV_GPUFILTERS
Ptr<Filter> filterDX_, filterDY_;
#endif
int old_apperture_size_;
};
void CannyImpl::detect(InputArray _image, OutputArray _edges)
{
GpuMat image = _image.getGpuMat();
if (!deviceSupports(SHARED_ATOMICS))
CV_Error(cv::Error::StsNotImplemented, "The device doesn't support shared atomics");
CV_Assert( image.type() == CV_8UC1 );
CV_Assert( deviceSupports(SHARED_ATOMICS) );
if( low_thresh > high_thresh )
std::swap( low_thresh, high_thresh);
if (low_thresh_ > high_thresh_)
std::swap(low_thresh_, high_thresh_);
dst.create(src.size(), CV_8U);
buf.create(src.size(), apperture_size);
createBuf(image.size());
if (apperture_size == 3)
{
Size wholeSize;
Point ofs;
src.locateROI(wholeSize, ofs);
GpuMat srcWhole(wholeSize, src.type(), src.datastart, src.step);
_edges.create(image.size(), CV_8UC1);
GpuMat edges = _edges.getGpuMat();
calcMagnitude(srcWhole, ofs.x, ofs.y, buf.dx, buf.dy, buf.mag, L2gradient);
if (apperture_size_ == 3)
{
Size wholeSize;
Point ofs;
image.locateROI(wholeSize, ofs);
GpuMat srcWhole(wholeSize, image.type(), image.datastart, image.step);
canny::calcMagnitude(srcWhole, ofs.x, ofs.y, dx_, dy_, mag_, L2gradient_);
}
else
{
#ifndef HAVE_OPENCV_GPUFILTERS
throw_no_cuda();
#else
filterDX_->apply(image, dx_);
filterDY_->apply(image, dy_);
canny::calcMagnitude(dx_, dy_, mag_, L2gradient_);
#endif
}
CannyCaller(edges);
}
else
void CannyImpl::detect(InputArray _dx, InputArray _dy, OutputArray _edges)
{
buf.filterDX->apply(src, buf.dx);
buf.filterDY->apply(src, buf.dy);
GpuMat dx = _dx.getGpuMat();
GpuMat dy = _dy.getGpuMat();
CV_Assert( dx.type() == CV_32SC1 );
CV_Assert( dy.type() == dx.type() && dy.size() == dx.size() );
CV_Assert( deviceSupports(SHARED_ATOMICS) );
dx.copyTo(dx_);
dy.copyTo(dy_);
if (low_thresh_ > high_thresh_)
std::swap(low_thresh_, high_thresh_);
createBuf(dx.size());
_edges.create(dx.size(), CV_8UC1);
GpuMat edges = _edges.getGpuMat();
canny::calcMagnitude(dx_, dy_, mag_, L2gradient_);
calcMagnitude(buf.dx, buf.dy, buf.mag, L2gradient);
CannyCaller(edges);
}
CannyCaller(buf.dx, buf.dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
}
void CannyImpl::createBuf(Size image_size)
{
ensureSizeIsEnough(image_size, CV_32SC1, dx_);
ensureSizeIsEnough(image_size, CV_32SC1, dy_);
void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
{
CannyBuf buf;
Canny(dx, dy, buf, dst, low_thresh, high_thresh, L2gradient);
}
#ifdef HAVE_OPENCV_GPUFILTERS
if (apperture_size_ != 3 && apperture_size_ != old_apperture_size_)
{
filterDX_ = gpu::createDerivFilter(CV_8UC1, CV_32S, 1, 0, apperture_size_, false, 1, BORDER_REPLICATE);
filterDY_ = gpu::createDerivFilter(CV_8UC1, CV_32S, 0, 1, apperture_size_, false, 1, BORDER_REPLICATE);
old_apperture_size_ = apperture_size_;
}
#endif
void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
{
using namespace canny;
ensureSizeIsEnough(image_size, CV_32FC1, mag_);
ensureSizeIsEnough(image_size, CV_32SC1, map_);
CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size());
ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st1_);
ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st2_);
}
void CannyImpl::CannyCaller(GpuMat& edges)
{
map_.setTo(Scalar::all(0));
canny::calcMap(dx_, dy_, mag_, map_, static_cast<float>(low_thresh_), static_cast<float>(high_thresh_));
if( low_thresh > high_thresh )
std::swap( low_thresh, high_thresh);
canny::edgesHysteresisLocal(map_, st1_.ptr<ushort2>());
dst.create(dx.size(), CV_8U);
buf.create(dx.size(), -1);
canny::edgesHysteresisGlobal(map_, st1_.ptr<ushort2>(), st2_.ptr<ushort2>());
calcMagnitude(dx, dy, buf.mag, L2gradient);
canny::getEdges(map_, edges);
}
}
CannyCaller(dx, dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
Ptr<CannyEdgeDetector> cv::gpu::createCannyEdgeDetector(double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
{
return new CannyImpl(low_thresh, high_thresh, apperture_size, L2gradient);
}
#endif /* !defined (HAVE_CUDA) */
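For reference, a hedged caller-side sketch of the detector produced by ``createCannyEdgeDetector`` above:

    #include <opencv2/core.hpp>
    #include <opencv2/gpuimgproc.hpp>

    void cannyEdges(const cv::Mat& gray, cv::Mat& edges)   // gray: CV_8UC1
    {
        cv::Ptr<cv::gpu::CannyEdgeDetector> canny =
            cv::gpu::createCannyEdgeDetector(50.0 /*low_thresh*/, 100.0 /*high_thresh*/);

        cv::gpu::GpuMat d_gray(gray), d_edges;
        canny->detect(d_gray, d_edges);
        d_edges.download(edges);
    }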
@@ -45,15 +45,10 @@
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) || !defined(HAVE_OPENCV_GPUFILTERS)
void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, int, int, double, int) { throw_no_cuda(); }
void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, double, int) { throw_no_cuda(); }
void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, double, int, Stream&) { throw_no_cuda(); }
void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, int, int, int) { throw_no_cuda(); }
void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int) { throw_no_cuda(); }
void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int, Stream&) { throw_no_cuda(); }
Ptr<gpu::CornernessCriteria> cv::gpu::createHarrisCorner(int, int, int, double, int) { throw_no_cuda(); return Ptr<gpu::CornernessCriteria>(); }
Ptr<gpu::CornernessCriteria> cv::gpu::createMinEigenValCorner(int, int, int, int) { throw_no_cuda(); return Ptr<gpu::CornernessCriteria>(); }
#else /* !defined (HAVE_CUDA) */
@@ -68,89 +63,127 @@ namespace cv { namespace gpu { namespace cudev
namespace
{
void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
class CornerBase : public CornernessCriteria
{
protected:
CornerBase(int srcType, int blockSize, int ksize, int borderType);
void extractCovData(const GpuMat& src, Stream& stream);
int srcType_;
int blockSize_;
int ksize_;
int borderType_;
GpuMat Dx_, Dy_;
private:
Ptr<gpu::Filter> filterDx_, filterDy_;
};
CornerBase::CornerBase(int srcType, int blockSize, int ksize, int borderType) :
srcType_(srcType), blockSize_(blockSize), ksize_(ksize), borderType_(borderType)
{
(void) buf;
CV_Assert( borderType_ == BORDER_REFLECT101 || borderType_ == BORDER_REPLICATE || borderType_ == BORDER_REFLECT );
double scale = static_cast<double>(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize;
const int sdepth = CV_MAT_DEPTH(srcType_);
const int cn = CV_MAT_CN(srcType_);
if (ksize < 0)
CV_Assert( cn == 1 );
double scale = static_cast<double>(1 << ((ksize_ > 0 ? ksize_ : 3) - 1)) * blockSize_;
if (ksize_ < 0)
scale *= 2.;
if (src.depth() == CV_8U)
if (sdepth == CV_8U)
scale *= 255.;
scale = 1./scale;
Dx.create(src.size(), CV_32F);
Dy.create(src.size(), CV_32F);
Ptr<gpu::Filter> filterDx, filterDy;
if (ksize > 0)
if (ksize_ > 0)
{
filterDx = gpu::createSobelFilter(src.type(), CV_32F, 1, 0, ksize, scale, borderType);
filterDy = gpu::createSobelFilter(src.type(), CV_32F, 0, 1, ksize, scale, borderType);
filterDx_ = gpu::createSobelFilter(srcType, CV_32F, 1, 0, ksize_, scale, borderType_);
filterDy_ = gpu::createSobelFilter(srcType, CV_32F, 0, 1, ksize_, scale, borderType_);
}
else
{
filterDx = gpu::createScharrFilter(src.type(), CV_32F, 1, 0, scale, borderType);
filterDy = gpu::createScharrFilter(src.type(), CV_32F, 0, 1, scale, borderType);
filterDx_ = gpu::createScharrFilter(srcType, CV_32F, 1, 0, scale, borderType_);
filterDy_ = gpu::createScharrFilter(srcType, CV_32F, 0, 1, scale, borderType_);
}
}
filterDx->apply(src, Dx);
filterDy->apply(src, Dy);
void CornerBase::extractCovData(const GpuMat& src, Stream& stream)
{
CV_Assert( src.type() == srcType_ );
filterDx_->apply(src, Dx_, stream);
filterDy_->apply(src, Dy_, stream);
}
}
void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType)
{
GpuMat Dx, Dy;
cornerHarris(src, dst, Dx, Dy, blockSize, ksize, k, borderType);
}
class Harris : public CornerBase
{
public:
Harris(int srcType, int blockSize, int ksize, double k, int borderType) :
CornerBase(srcType, blockSize, ksize, borderType), k_(static_cast<float>(k))
{
}
void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, int borderType)
{
GpuMat buf;
cornerHarris(src, dst, Dx, Dy, buf, blockSize, ksize, k, borderType);
}
void compute(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, int borderType, Stream& stream)
{
using namespace cv::gpu::cudev::imgproc;
private:
float k_;
};
CV_Assert(borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
void Harris::compute(InputArray _src, OutputArray _dst, Stream& stream)
{
using namespace cv::gpu::cudev::imgproc;
extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);
GpuMat src = _src.getGpuMat();
dst.create(src.size(), CV_32F);
extractCovData(src, stream);
cornerHarris_gpu(blockSize, static_cast<float>(k), Dx, Dy, dst, borderType, StreamAccessor::getStream(stream));
}
_dst.create(src.size(), CV_32FC1);
GpuMat dst = _dst.getGpuMat();
void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType)
{
GpuMat Dx, Dy;
cornerMinEigenVal(src, dst, Dx, Dy, blockSize, ksize, borderType);
}
cornerHarris_gpu(blockSize_, k_, Dx_, Dy_, dst, borderType_, StreamAccessor::getStream(stream));
}
void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType)
{
GpuMat buf;
cornerMinEigenVal(src, dst, Dx, Dy, buf, blockSize, ksize, borderType);
}
class MinEigenVal : public CornerBase
{
public:
MinEigenVal(int srcType, int blockSize, int ksize, int borderType) :
CornerBase(srcType, blockSize, ksize, borderType)
{
}
void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
{
using namespace ::cv::gpu::cudev::imgproc;
void compute(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
CV_Assert(borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
private:
float k_;
};
extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);
void MinEigenVal::compute(InputArray _src, OutputArray _dst, Stream& stream)
{
using namespace cv::gpu::cudev::imgproc;
GpuMat src = _src.getGpuMat();
extractCovData(src, stream);
_dst.create(src.size(), CV_32FC1);
GpuMat dst = _dst.getGpuMat();
cornerMinEigenVal_gpu(blockSize_, Dx_, Dy_, dst, borderType_, StreamAccessor::getStream(stream));
}
}
dst.create(src.size(), CV_32F);
Ptr<gpu::CornernessCriteria> cv::gpu::createHarrisCorner(int srcType, int blockSize, int ksize, double k, int borderType)
{
return new Harris(srcType, blockSize, ksize, k, borderType);
}
cornerMinEigenVal_gpu(blockSize, Dx, Dy, dst, borderType, StreamAccessor::getStream(stream));
Ptr<gpu::CornernessCriteria> cv::gpu::createMinEigenValCorner(int srcType, int blockSize, int ksize, int borderType)
{
return new MinEigenVal(srcType, blockSize, ksize, borderType);
}
#endif /* !defined (HAVE_CUDA) */
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace hough
{
__device__ int g_counter;
template <int PIXELS_PER_THREAD>
__global__ void buildPointList(const PtrStepSzb src, unsigned int* list)
{
__shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
__shared__ int s_qsize[4];
__shared__ int s_globStart[4];
const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (threadIdx.x == 0)
s_qsize[threadIdx.y] = 0;
__syncthreads();
if (y < src.rows)
{
// fill the queue
const uchar* srcRow = src.ptr(y);
for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x)
{
if (srcRow[xx])
{
const unsigned int val = (y << 16) | xx;
const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
s_queues[threadIdx.y][qidx] = val;
}
}
}
__syncthreads();
// let one thread reserve the space required in the global list
if (threadIdx.x == 0 && threadIdx.y == 0)
{
// find how many items are stored in each list
int totalSize = 0;
for (int i = 0; i < blockDim.y; ++i)
{
s_globStart[i] = totalSize;
totalSize += s_qsize[i];
}
// calculate the offset in the global list
const int globalOffset = atomicAdd(&g_counter, totalSize);
for (int i = 0; i < blockDim.y; ++i)
s_globStart[i] += globalOffset;
}
__syncthreads();
// copy local queues to global queue
const int qsize = s_qsize[threadIdx.y];
int gidx = s_globStart[threadIdx.y] + threadIdx.x;
for(int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
list[gidx] = s_queues[threadIdx.y][i];
}
int buildPointList_gpu(PtrStepSzb src, unsigned int* list)
{
const int PIXELS_PER_THREAD = 16;
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
const dim3 block(32, 4);
const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(buildPointList<PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );
buildPointList<PIXELS_PER_THREAD><<<grid, block>>>(src, list);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
return totalCount;
}
}
}}}
#endif /* CUDA_DISABLER */
@@ -48,6 +48,10 @@
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_GPUFILTERS
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
@@ -271,4 +275,6 @@ namespace cv { namespace gpu { namespace cudev
}
}}}
#endif
#endif // HAVE_OPENCV_GPUFILTERS
#endif // CUDA_DISABLER
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/dynamic_smem.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_GPUFILTERS
namespace cv { namespace gpu { namespace cudev
{
namespace hough_circles
{
__device__ int g_counter;
////////////////////////////////////////////////////////////////////////
// circlesAccumCenters
__global__ void circlesAccumCenters(const unsigned int* list, const int count, const PtrStepi dx, const PtrStepi dy,
PtrStepi accum, const int width, const int height, const int minRadius, const int maxRadius, const float idp)
{
const int SHIFT = 10;
const int ONE = 1 << SHIFT;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= count)
return;
const unsigned int val = list[tid];
const int x = (val & 0xFFFF);
const int y = (val >> 16) & 0xFFFF;
const int vx = dx(y, x);
const int vy = dy(y, x);
if (vx == 0 && vy == 0)
return;
const float mag = ::sqrtf(vx * vx + vy * vy);
const int x0 = __float2int_rn((x * idp) * ONE);
const int y0 = __float2int_rn((y * idp) * ONE);
int sx = __float2int_rn((vx * idp) * ONE / mag);
int sy = __float2int_rn((vy * idp) * ONE / mag);
// Step from minRadius to maxRadius in both directions of the gradient
for (int k1 = 0; k1 < 2; ++k1)
{
int x1 = x0 + minRadius * sx;
int y1 = y0 + minRadius * sy;
for (int r = minRadius; r <= maxRadius; x1 += sx, y1 += sy, ++r)
{
const int x2 = x1 >> SHIFT;
const int y2 = y1 >> SHIFT;
if (x2 < 0 || x2 >= width || y2 < 0 || y2 >= height)
break;
::atomicAdd(accum.ptr(y2 + 1) + x2 + 1, 1);
}
sx = -sx;
sy = -sy;
}
}
void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp)
{
const dim3 block(256);
const dim3 grid(divUp(count, block.x));
cudaSafeCall( cudaFuncSetCacheConfig(circlesAccumCenters, cudaFuncCachePreferL1) );
circlesAccumCenters<<<grid, block>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// buildCentersList
__global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < accum.cols - 2 && y < accum.rows - 2)
{
const int top = accum(y, x + 1);
const int left = accum(y + 1, x);
const int cur = accum(y + 1, x + 1);
const int right = accum(y + 1, x + 2);
const int bottom = accum(y + 2, x + 1);
if (cur > threshold && cur > top && cur >= bottom && cur > left && cur >= right)
{
const unsigned int val = (y << 16) | x;
const int idx = ::atomicAdd(&g_counter, 1);
centers[idx] = val;
}
}
}
int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
const dim3 block(32, 8);
const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(buildCentersList, cudaFuncCachePreferL1) );
buildCentersList<<<grid, block>>>(accum, centers, threshold);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
return totalCount;
}
////////////////////////////////////////////////////////////////////////
// circlesAccumRadius
__global__ void circlesAccumRadius(const unsigned int* centers, const unsigned int* list, const int count,
float3* circles, const int maxCircles, const float dp,
const int minRadius, const int maxRadius, const int histSize, const int threshold)
{
int* smem = DynamicSharedMem<int>();
for (int i = threadIdx.x; i < histSize + 2; i += blockDim.x)
smem[i] = 0;
__syncthreads();
unsigned int val = centers[blockIdx.x];
float cx = (val & 0xFFFF);
float cy = (val >> 16) & 0xFFFF;
cx = (cx + 0.5f) * dp;
cy = (cy + 0.5f) * dp;
for (int i = threadIdx.x; i < count; i += blockDim.x)
{
val = list[i];
const int x = (val & 0xFFFF);
const int y = (val >> 16) & 0xFFFF;
const float rad = ::sqrtf((cx - x) * (cx - x) + (cy - y) * (cy - y));
if (rad >= minRadius && rad <= maxRadius)
{
const int r = __float2int_rn(rad - minRadius);
Emulation::smem::atomicAdd(&smem[r + 1], 1);
}
}
__syncthreads();
for (int i = threadIdx.x; i < histSize; i += blockDim.x)
{
const int curVotes = smem[i + 1];
if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2])
{
const int ind = ::atomicAdd(&g_counter, 1);
if (ind < maxCircles)
circles[ind] = make_float3(cx, cy, i + minRadius);
}
}
}
int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
const dim3 block(has20 ? 1024 : 512);
const dim3 grid(centersCount);
const int histSize = maxRadius - minRadius + 1;
size_t smemSize = (histSize + 2) * sizeof(int);
circlesAccumRadius<<<grid, block, smemSize>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
totalCount = ::min(totalCount, maxCircles);
return totalCount;
}
}
}}}
#endif // HAVE_OPENCV_GPUFILTERS
#endif /* CUDA_DISABLER */
#if !defined CUDA_DISABLER
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/dynamic_smem.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace hough_lines
{
__device__ int g_counter;
////////////////////////////////////////////////////////////////////////
// linesAccum
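// Two accumulation strategies, both launched with one block per theta value:
// the global variant votes straight into the device-memory accumulator, while
// the shared variant builds one angle's rho-row in shared memory first and
// flushes it to global memory in a single coalesced pass.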
__global__ void linesAccumGlobal(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
{
const int n = blockIdx.x;
const float ang = n * theta;
float sinVal;
float cosVal;
sincosf(ang, &sinVal, &cosVal);
sinVal *= irho;
cosVal *= irho;
const int shift = (numrho - 1) / 2;
int* accumRow = accum.ptr(n + 1);
for (int i = threadIdx.x; i < count; i += blockDim.x)
{
const unsigned int val = list[i];
const int x = (val & 0xFFFF);
const int y = (val >> 16) & 0xFFFF;
int r = __float2int_rn(x * cosVal + y * sinVal);
r += shift;
::atomicAdd(accumRow + r + 1, 1);
}
}
__global__ void linesAccumShared(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
{
int* smem = DynamicSharedMem<int>();
for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
smem[i] = 0;
__syncthreads();
const int n = blockIdx.x;
const float ang = n * theta;
float sinVal;
float cosVal;
sincosf(ang, &sinVal, &cosVal);
sinVal *= irho;
cosVal *= irho;
const int shift = (numrho - 1) / 2;
for (int i = threadIdx.x; i < count; i += blockDim.x)
{
const unsigned int val = list[i];
const int x = (val & 0xFFFF);
const int y = (val >> 16) & 0xFFFF;
int r = __float2int_rn(x * cosVal + y * sinVal);
r += shift;
Emulation::smem::atomicAdd(&smem[r + 1], 1);
}
__syncthreads();
int* accumRow = accum.ptr(n + 1);
for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
accumRow[i] = smem[i];
}
void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
{
const dim3 block(has20 ? 1024 : 512);
const dim3 grid(accum.rows - 2);
size_t smemSize = (accum.cols - 1) * sizeof(int);
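// Use the shared-memory kernel only if one accumulator row fits into shared
// memory with roughly 1000 bytes to spare; otherwise vote in global memory.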
if (smemSize < sharedMemPerBlock - 1000)
linesAccumShared<<<grid, block, smemSize>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
else
linesAccumGlobal<<<grid, block>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// linesGetResult
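// Non-maximum suppression over the 4-connected neighbourhood of each
// accumulator cell. The asymmetric > / >= comparisons break ties, so a
// plateau of equal votes produces one line rather than duplicates.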
__global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho)
{
const int r = blockIdx.x * blockDim.x + threadIdx.x;
const int n = blockIdx.y * blockDim.y + threadIdx.y;
if (r >= accum.cols - 2 || n >= accum.rows - 2)
return;
const int curVotes = accum(n + 1, r + 1);
if (curVotes > threshold &&
curVotes > accum(n + 1, r) &&
curVotes >= accum(n + 1, r + 2) &&
curVotes > accum(n, r + 1) &&
curVotes >= accum(n + 2, r + 1))
{
const float radius = (r - (numrho - 1) * 0.5f) * rho;
const float angle = n * theta;
const int ind = ::atomicAdd(&g_counter, 1);
if (ind < maxSize)
{
out[ind] = make_float2(radius, angle);
votes[ind] = curVotes;
}
}
}
int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
const dim3 block(32, 8);
const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(linesGetResult, cudaFuncCachePreferL1) );
linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
totalCount = ::min(totalCount, maxSize);
if (doSort && totalCount > 0)
{
thrust::device_ptr<float2> outPtr(out);
thrust::device_ptr<int> votesPtr(votes);
thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
}
return totalCount;
}
}
}}}
#endif /* CUDA_DISABLER */
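The kernels above back the new HoughLinesDetector object. A minimal host-side sketch of the intended call pattern follows; downloadResults is assumed to match the interface declared elsewhere in this commit, and the parameter values are illustrative.

#include <vector>
#include "opencv2/gpuimgproc.hpp"

void findLines(const cv::gpu::GpuMat& d_edges, std::vector<cv::Vec2f>& lines)
{
    // rho step 1 px, theta step 1 degree, 100-vote threshold,
    // results sorted by votes, at most 4096 lines
    cv::Ptr<cv::gpu::HoughLinesDetector> hough =
        cv::gpu::createHoughLinesDetector(1.0f, float(CV_PI) / 180.0f, 100, true, 4096);

    cv::gpu::GpuMat d_lines;
    hough->detect(d_edges, d_lines);        // each entry is a (rho, theta) pair

    hough->downloadResults(d_lines, lines); // copy the results back to the host
}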
......@@ -45,9 +45,9 @@
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) || !defined(HAVE_OPENCV_GPUARITHM)
void cv::gpu::GoodFeaturesToTrackDetector_GPU::operator ()(const GpuMat&, GpuMat&, const GpuMat&) { throw_no_cuda(); }
Ptr<gpu::CornersDetector> cv::gpu::createGoodFeaturesToTrackDetector(int, int, double, double, int, bool, double) { throw_no_cuda(); return Ptr<gpu::CornersDetector>(); }
#else /* !defined (HAVE_CUDA) */
......@@ -60,117 +60,156 @@ namespace cv { namespace gpu { namespace cudev
}
}}}
namespace
{
    class GoodFeaturesToTrackDetector : public CornersDetector
    {
    public:
        GoodFeaturesToTrackDetector(int srcType, int maxCorners, double qualityLevel, double minDistance,
                                    int blockSize, bool useHarrisDetector, double harrisK);

        void detect(InputArray image, OutputArray corners, InputArray mask = noArray());

    private:
        int maxCorners_;
        double qualityLevel_;
        double minDistance_;

        Ptr<gpu::CornernessCriteria> cornerCriteria_;

        GpuMat Dx_;
        GpuMat Dy_;
        GpuMat buf_;
        GpuMat eig_;
        GpuMat minMaxbuf_;
        GpuMat tmpCorners_;
    };

    GoodFeaturesToTrackDetector::GoodFeaturesToTrackDetector(int srcType, int maxCorners, double qualityLevel, double minDistance,
                                                             int blockSize, bool useHarrisDetector, double harrisK) :
        maxCorners_(maxCorners), qualityLevel_(qualityLevel), minDistance_(minDistance)
    {
        CV_Assert( qualityLevel_ > 0 && minDistance_ >= 0 && maxCorners_ >= 0 );

        cornerCriteria_ = useHarrisDetector ?
                    gpu::createHarrisCorner(srcType, blockSize, 3, harrisK) :
                    gpu::createMinEigenValCorner(srcType, blockSize, 3);
    }

    void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask)
    {
        using namespace cv::gpu::cudev::gfft;

        GpuMat image = _image.getGpuMat();
        GpuMat mask = _mask.getGpuMat();

        CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()) );

        ensureSizeIsEnough(image.size(), CV_32FC1, eig_);
        cornerCriteria_->compute(image, eig_);

        double maxVal = 0;
        gpu::minMax(eig_, 0, &maxVal, noArray(), minMaxbuf_);

        ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);

        int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols);

        if (total == 0)
        {
            _corners.release();
            return;
        }

        sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total);

        if (minDistance_ < 1)
        {
            tmpCorners_.colRange(0, maxCorners_ > 0 ? std::min(maxCorners_, total) : total).copyTo(_corners);
        }
        else
        {
            std::vector<Point2f> tmp(total);
            Mat tmpMat(1, total, CV_32FC2, (void*)&tmp[0]);
            tmpCorners_.colRange(0, total).download(tmpMat);

            std::vector<Point2f> tmp2;
            tmp2.reserve(total);

            const int cell_size = cvRound(minDistance_);
            const int grid_width = (image.cols + cell_size - 1) / cell_size;
            const int grid_height = (image.rows + cell_size - 1) / cell_size;

            std::vector< std::vector<Point2f> > grid(grid_width * grid_height);

            for (int i = 0; i < total; ++i)
            {
                Point2f p = tmp[i];

                bool good = true;

                int x_cell = static_cast<int>(p.x / cell_size);
                int y_cell = static_cast<int>(p.y / cell_size);

                int x1 = x_cell - 1;
                int y1 = y_cell - 1;
                int x2 = x_cell + 1;
                int y2 = y_cell + 1;

                // boundary check
                x1 = std::max(0, x1);
                y1 = std::max(0, y1);
                x2 = std::min(grid_width - 1, x2);
                y2 = std::min(grid_height - 1, y2);

                for (int yy = y1; yy <= y2; yy++)
                {
                    for (int xx = x1; xx <= x2; xx++)
                    {
                        std::vector<Point2f>& m = grid[yy * grid_width + xx];

                        if (!m.empty())
                        {
                            for(size_t j = 0; j < m.size(); j++)
                            {
                                float dx = p.x - m[j].x;
                                float dy = p.y - m[j].y;

                                if (dx * dx + dy * dy < minDistance_ * minDistance_)
                                {
                                    good = false;
                                    goto break_out;
                                }
                            }
                        }
                    }
                }

                break_out:

                if(good)
                {
                    grid[y_cell * grid_width + x_cell].push_back(p);

                    tmp2.push_back(p);

                    if (maxCorners_ > 0 && tmp2.size() == static_cast<size_t>(maxCorners_))
                        break;
                }
            }

            _corners.create(1, static_cast<int>(tmp2.size()), CV_32FC2);
            GpuMat corners = _corners.getGpuMat();
            corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
        }
    }
}

Ptr<gpu::CornersDetector> cv::gpu::createGoodFeaturesToTrackDetector(int srcType, int maxCorners, double qualityLevel, double minDistance,
                                                                     int blockSize, bool useHarrisDetector, double harrisK)
{
    return new GoodFeaturesToTrackDetector(srcType, maxCorners, qualityLevel, minDistance, blockSize, useHarrisDetector, harrisK);
}
#endif /* !defined (HAVE_CUDA) */
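A minimal sketch of the resulting host-side usage, passing every factory parameter explicitly to match the signature above; the values themselves are illustrative.

#include "opencv2/gpuimgproc.hpp"

void findCorners(const cv::gpu::GpuMat& d_gray, cv::gpu::GpuMat& d_pts)
{
    // up to 8000 corners, quality at 1% of the strongest response,
    // no minimum distance, 3x3 block, min-eigenvalue criterion
    cv::Ptr<cv::gpu::CornersDetector> detector =
        cv::gpu::createGoodFeaturesToTrackDetector(d_gray.type(), 8000, 0.01, 0.0, 3, false, 0.04);

    detector->detect(d_gray, d_pts); // d_pts: 1 x N, CV_32FC2
}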