Merge pull request #1010 from jet47:gpufilters-refactoring

84639e44 · Roman Donchenko · OpenCV Buildbot · 13323882 · f08d75a8 · 84639e44
Commit 84639e44 authored Jun 21, 2013 by Roman Donchenko Committed by OpenCV Buildbot Jun 21, 2013
21 changed files
--- a/modules/gpuarithm/doc/arithm.rst
+++ b/modules/gpuarithm/doc/arithm.rst
@@ -157,8 +157,6 @@ Computes a convolution (or cross-correlation) of two images.
    :param stream: Stream for the asynchronous version.
-.. seealso:: :ocv:func:`gpu::filter2D`
 gpu::createConvolution

--- a/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp
+++ b/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp
@@ -321,7 +321,7 @@ private:
    GpuMat colors_;
    GpuMat weights_;
-    Ptr<FilterEngine_GPU> boxFilter_;
+    Ptr<gpu::Filter> boxFilter_;
    GpuMat buf_;
 };

--- a/modules/gpubgsegm/src/fgd.cpp
+++ b/modules/gpubgsegm/src/fgd.cpp
@@ -228,11 +228,10 @@ private:
    cv::gpu::GpuMat countBuf_;
    cv::gpu::GpuMat buf_;
-    cv::gpu::GpuMat filterBuf_;
    cv::gpu::GpuMat filterBrd_;
-    cv::Ptr<cv::gpu::FilterEngine_GPU> dilateFilter_;
+    cv::Ptr<cv::gpu::Filter> dilateFilter_;
-    cv::Ptr<cv::gpu::FilterEngine_GPU> erodeFilter_;
+    cv::Ptr<cv::gpu::Filter> erodeFilter_;
    CvMemStorage* storage_;
 };
@@ -305,8 +304,8 @@ void cv::gpu::FGDStatModel::Impl::create(const cv::gpu::GpuMat& firstFrame, cons
        cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(1 + params_.perform_morphing * 2, 1 + params_.perform_morphing * 2));
        cv::Point anchor(params_.perform_morphing, params_.perform_morphing);
-        dilateFilter_ = cv::gpu::createMorphologyFilter_GPU(cv::MORPH_DILATE, CV_8UC1, kernel, filterBuf_, anchor);
+        dilateFilter_ = cv::gpu::createMorphologyFilter(cv::MORPH_DILATE, CV_8UC1, kernel, anchor);
-        erodeFilter_ = cv::gpu::createMorphologyFilter_GPU(cv::MORPH_ERODE, CV_8UC1, kernel, filterBuf_, anchor);
+        erodeFilter_ = cv::gpu::createMorphologyFilter(cv::MORPH_ERODE, CV_8UC1, kernel, anchor);
    }
 }
@@ -326,7 +325,6 @@ void cv::gpu::FGDStatModel::Impl::release()
    countBuf_.release();
    buf_.release();
-    filterBuf_.release();
    filterBrd_.release();
 }
@@ -488,14 +486,14 @@ namespace
 namespace
 {
-    void morphology(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, cv::gpu::GpuMat& filterBrd, int brd, cv::Ptr<cv::gpu::FilterEngine_GPU>& filter, cv::Scalar brdVal)
+    void morphology(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, cv::gpu::GpuMat& filterBrd, int brd, cv::Ptr<cv::gpu::Filter>& filter, cv::Scalar brdVal)
    {
        cv::gpu::copyMakeBorder(src, filterBrd, brd, brd, brd, brd, cv::BORDER_CONSTANT, brdVal);
-        filter->apply(filterBrd(cv::Rect(brd, brd, src.cols, src.rows)), dst, cv::Rect(0, 0, src.cols, src.rows));
+        filter->apply(filterBrd(cv::Rect(brd, brd, src.cols, src.rows)), dst);
    }
    void smoothForeground(cv::gpu::GpuMat& foreground, cv::gpu::GpuMat& filterBrd, cv::gpu::GpuMat& buf,
-                          cv::Ptr<cv::gpu::FilterEngine_GPU>& erodeFilter, cv::Ptr<cv::gpu::FilterEngine_GPU>& dilateFilter,
+                          cv::Ptr<cv::gpu::Filter>& erodeFilter, cv::Ptr<cv::gpu::Filter>& dilateFilter,
                          const cv::gpu::FGDStatModel::Params& params)
    {
        const int brd = params.perform_morphing;

--- a/modules/gpubgsegm/src/gmg.cpp
+++ b/modules/gpubgsegm/src/gmg.cpp
@@ -100,7 +100,7 @@ void cv::gpu::GMG_GPU::initialize(cv::Size frameSize, float min, float max)
    nfeatures_.setTo(cv::Scalar::all(0));
    if (smoothingRadius > 0)
-        boxFilter_ = cv::gpu::createBoxFilter_GPU(CV_8UC1, CV_8UC1, cv::Size(smoothingRadius, smoothingRadius));
+        boxFilter_ = cv::gpu::createBoxFilter(CV_8UC1, -1, cv::Size(smoothingRadius, smoothingRadius));
    loadConstants(frameSize_.width, frameSize_.height, minVal_, maxVal_, quantizationLevels, backgroundPrior, decisionThreshold, maxFeatures, numInitializationFrames);
 }
@@ -141,7 +141,7 @@ void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat
    // medianBlur
    if (smoothingRadius > 0)
    {
-        boxFilter_->apply(fgmask, buf_, cv::Rect(0,0,-1,-1), stream);
+        boxFilter_->apply(fgmask, buf_, stream);
        int minCount = (smoothingRadius * smoothingRadius + 1) / 2;
        double thresh = 255.0 * minCount / (smoothingRadius * smoothingRadius);
        cv::gpu::threshold(buf_, fgmask, thresh, 255.0, cv::THRESH_BINARY, stream);

--- a/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
+++ b/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
@@ -351,7 +351,7 @@ private:
    FAST_GPU fastDetector_;
-    Ptr<FilterEngine_GPU> blurFilter;
+    Ptr<gpu::Filter> blurFilter;
    GpuMat d_keypoints_;
 };

--- a/modules/gpufeatures2d/src/orb.cpp
+++ b/modules/gpufeatures2d/src/orb.cpp
@@ -468,7 +468,7 @@ cv::gpu::ORB_GPU::ORB_GPU(int nFeatures, float scaleFactor, int nLevels, int edg
    pattern_.upload(h_pattern);
-    blurFilter = createGaussianFilter_GPU(CV_8UC1, Size(7, 7), 2, 2, BORDER_REFLECT_101);
+    blurFilter = gpu::createGaussianFilter(CV_8UC1, -1, Size(7, 7), 2, 2, BORDER_REFLECT_101);
    blurForDescriptor = false;
 }
@@ -632,7 +632,7 @@ void cv::gpu::ORB_GPU::computeDescriptors(GpuMat& descriptors)
        {
            // preprocess the resized image
            ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_);
-            blurFilter->apply(imagePyr_[level], buf_, Rect(0, 0, imagePyr_[level].cols, imagePyr_[level].rows));
+            blurFilter->apply(imagePyr_[level], buf_);
        }
        computeOrbDescriptor_gpu(blurForDescriptor ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2),

--- a/modules/gpufilters/CMakeLists.txt
+++ b/modules/gpufilters/CMakeLists.txt
@@ -6,4 +6,4 @@ set(the_description "GPU-accelerated Image Filtering")
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)
-ocv_define_module(gpufilters opencv_imgproc OPTIONAL opencv_gpuarithm)
+ocv_define_module(gpufilters opencv_imgproc opencv_gpuarithm)
--- a/modules/gpufilters/doc/filtering.rst
+++ b/modules/gpufilters/doc/filtering.rst
@@ -7,713 +7,328 @@ Functions and classes described in this section are used to perform various line
-gpu::BaseRowFilter_GPU
+gpu::Filter
----------------------
+-----------
-.. ocv:class:: gpu::BaseRowFilter_GPU
+.. ocv:class:: gpu::Filter
-Base class for linear or non-linear filters that processes rows of 2D arrays. Such filters are used for the "horizontal" filtering passes in separable filters. ::
-    class BaseRowFilter_GPU
-    {
-    public:
-        BaseRowFilter_GPU(int ksize_, int anchor_);
-        virtual ~BaseRowFilter_GPU() {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-        int ksize, anchor;
-    };
-.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`gpu::FilterEngine_GPU`.
-gpu::BaseColumnFilter_GPU
-------------------------
-.. ocv:class:: gpu::BaseColumnFilter_GPU
-Base class for linear or non-linear filters that processes columns of 2D arrays. Such filters are used for the "vertical" filtering passes in separable filters. ::
-    class BaseColumnFilter_GPU
-    {
-    public:
-        BaseColumnFilter_GPU(int ksize_, int anchor_);
-        virtual ~BaseColumnFilter_GPU() {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-        int ksize, anchor;
-    };
-.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`gpu::FilterEngine_GPU`.
+Common interface for all GPU filters ::
+    class CV_EXPORTS Filter : public Algorithm
-gpu::BaseFilter_GPU
-------------------
-.. ocv:class:: gpu::BaseFilter_GPU
-Base class for non-separable 2D filters. ::
-    class CV_EXPORTS BaseFilter_GPU
    {
    public:
-        BaseFilter_GPU(const Size& ksize_, const Point& anchor_);
+        virtual void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
-        virtual ~BaseFilter_GPU() {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-        Size ksize;
-        Point anchor;
    };
-.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`gpu::FilterEngine_GPU`.
-gpu::FilterEngine_GPU
---------------------
-.. ocv:class:: gpu::FilterEngine_GPU
-Base class for the Filter Engine. ::
-    class CV_EXPORTS FilterEngine_GPU
-    {
-    public:
-        virtual ~FilterEngine_GPU() {}
-        virtual void apply(const GpuMat& src, GpuMat& dst,
-                           Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0;
-    };
-The class can be used to apply an arbitrary filtering operation to an image. It contains all the necessary intermediate buffers. Pointers to the initialized ``FilterEngine_GPU`` instances are returned by various ``create*Filter_GPU`` functions (see below), and they are used inside high-level functions such as :ocv:func:`gpu::filter2D`, :ocv:func:`gpu::erode`, :ocv:func:`gpu::Sobel` , and others.
-By using ``FilterEngine_GPU`` instead of functions you can avoid unnecessary memory allocation for intermediate buffers and get better performance: ::
-    while (...)
-    {
-        gpu::GpuMat src = getImg();
-        gpu::GpuMat dst;
-        // Allocate and release buffers at each iterations
-        gpu::GaussianBlur(src, dst, ksize, sigma1);
-    }
-    // Allocate buffers only once
-    cv::Ptr<gpu::FilterEngine_GPU> filter =
-        gpu::createGaussianFilter_GPU(CV_8UC4, ksize, sigma1);
-    while (...)
-    {
-        gpu::GpuMat src = getImg();
-        gpu::GpuMat dst;
-        filter->apply(src, dst, cv::Rect(0, 0, src.cols, src.rows));
-    }
-    // Release buffers only once
-    filter.release();
-``FilterEngine_GPU`` can process a rectangular sub-region of an image. By default, if ``roi == Rect(0,0,-1,-1)`` , ``FilterEngine_GPU`` processes the inner region of an image ( ``Rect(anchor.x, anchor.y, src_size.width - ksize.width, src_size.height - ksize.height)`` ) because some filters do not check whether indices are outside the image for better performance. See below to understand which filters support processing the whole image and which do not and identify image type limitations.
-.. note:: The GPU filters do not support the in-place mode.
-.. seealso:: :ocv:class:`gpu::BaseRowFilter_GPU`, :ocv:class:`gpu::BaseColumnFilter_GPU`, :ocv:class:`gpu::BaseFilter_GPU`, :ocv:func:`gpu::createFilter2D_GPU`, :ocv:func:`gpu::createSeparableFilter_GPU`, :ocv:func:`gpu::createBoxFilter_GPU`, :ocv:func:`gpu::createMorphologyFilter_GPU`, :ocv:func:`gpu::createLinearFilter_GPU`, :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`gpu::createDerivFilter_GPU`, :ocv:func:`gpu::createGaussianFilter_GPU`
-gpu::createFilter2D_GPU
---------------------------
-Creates a non-separable filter engine with the specified filter.
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createFilter2D_GPU( const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType)
-    :param filter2D: Non-separable 2D filter.
-    :param srcType: Input image type. It must be supported by  ``filter2D`` .
-    :param dstType: Output image type. It must be supported by  ``filter2D`` .
-Usually this function is used inside such high-level functions as :ocv:func:`gpu::createLinearFilter_GPU`, :ocv:func:`gpu::createBoxFilter_GPU`.
-gpu::createSeparableFilter_GPU
----------------------------------
-Creates a separable filter engine with the specified filters.
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createSeparableFilter_GPU( const Ptr<BaseRowFilter_GPU>& rowFilter, const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType)
-    :param rowFilter: "Horizontal" 1D filter.
-    :param columnFilter: "Vertical" 1D filter.
-    :param srcType: Input image type. It must be supported by  ``rowFilter`` .
-    :param bufType: Buffer image type. It must be supported by  ``rowFilter``  and  ``columnFilter`` .
-    :param dstType: Output image type. It must be supported by  ``columnFilter`` .
-Usually this function is used inside such high-level functions as :ocv:func:`gpu::createSeparableLinearFilter_GPU`.
-gpu::getRowSumFilter_GPU
----------------------------
-Creates a horizontal 1D box filter.
-.. ocv:function:: Ptr<BaseRowFilter_GPU> gpu::getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor = -1)
-    :param srcType: Input image type. Only ``CV_8UC1`` type is supported for now.
-    :param sumType: Output image type. Only ``CV_32FC1`` type is supported for now.
-    :param ksize: Kernel size.
-    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+gpu::Filter::apply
+------------------
+Applies the specified filter to the image.
-gpu::getColumnSumFilter_GPU
+.. ocv:function:: void gpu::Filter::apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0
-------------------------------
-Creates a vertical 1D box filter.
-.. ocv:function:: Ptr<BaseColumnFilter_GPU> gpu::getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1)
-    :param sumType: Input image type. Only ``CV_8UC1`` type is supported for now.
-    :param dstType: Output image type. Only ``CV_32FC1`` type is supported for now.
-    :param ksize: Kernel size.
+    :param src: Input image.
-    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
+    :param dst: Output image.
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+    :param stream: Stream for the asynchronous version.
-gpu::createBoxFilter_GPU
+gpu::createBoxFilter
----------------------------
+--------------------
 Creates a normalized 2D box filter.
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createBoxFilter_GPU(int srcType, int dstType, const Size& ksize, const Point& anchor = Point(-1,-1))
+.. ocv:function:: Ptr<Filter> gpu::createBoxFilter(int srcType, int dstType, Size ksize, Point anchor = Point(-1,-1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1, -1))
-    :param srcType: Input image type supporting ``CV_8UC1`` and ``CV_8UC4`` .
+    :param srcType: Input image type. Only ``CV_8UC1`` and ``CV_8UC4`` are supported for now.
-    :param dstType: Output image type.  It supports only the same values as the source type.
+    :param dstType: Output image type. Only the same type as ``src`` is supported for now.
    :param ksize: Kernel size.
    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
-.. seealso:: :ocv:func:`boxFilter`
-gpu::boxFilter
------------------
-Smooths the image using the normalized box filter.
-.. ocv:function:: void gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null())
-    :param src: Input image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
-    :param dst: Output image type. The size and type is the same as ``src`` .
-    :param ddepth: Output image depth. If -1, the output image has the same depth as the input one. The only values allowed here are ``CV_8U`` and -1.
+    :param borderVal: Default border value.
-    :param ksize: Kernel size.
-    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
-    :param stream: Stream for the asynchronous version.
-.. note::    This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
 .. seealso:: :ocv:func:`boxFilter`
-gpu::blur
+gpu::createLinearFilter
-------------
+-----------------------
-Acts as a synonym for the normalized box filter.
+Creates a non-separable linear 2D filter.
-.. ocv:function:: void gpu::blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null())
-    :param src: Input image.  ``CV_8UC1``  and  ``CV_8UC4``  source types are supported.
-    :param dst: Output image type with the same size and type as  ``src`` .
-    :param ksize: Kernel size.
-    :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
-    :param stream: Stream for the asynchronous version.
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-.. seealso:: :ocv:func:`blur`, :ocv:func:`gpu::boxFilter`
-gpu::createMorphologyFilter_GPU
-----------------------------------
-Creates a 2D morphological filter.
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Point& anchor = Point(-1,-1), int iterations = 1)
-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize, Point anchor=Point(-1,-1))
-    :param op: Morphology operation id. Only ``MORPH_ERODE`` and ``MORPH_DILATE`` are supported.
-    :param type: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4``  are supported.
-    :param kernel: 2D 8-bit structuring element for the morphological operation.
-    :param ksize: Size of a horizontal or vertical structuring element used for separable morphological operations.
-    :param anchor: Anchor position within the structuring element. Negative values mean that the anchor is at the center.
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-.. seealso:: :ocv:func:`createMorphologyFilter`
-gpu::erode
--------------
-Erodes an image by using a specific structuring element.
-.. ocv:function:: void gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor=Point(-1, -1), int iterations=1 )
-.. ocv:function:: void gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor=Point(-1, -1), int iterations=1, Stream& stream=Stream::Null() )
-    :param src: Source image. Only  ``CV_8UC1``  and  ``CV_8UC4``  types are supported.
-    :param dst: Destination image with the same size and type as  ``src`` .
-    :param kernel: Structuring element used for erosion. If  ``kernel=Mat()``, a  3x3 rectangular structuring element is used.
-    :param anchor: Position of an anchor within the element. The default value  ``(-1, -1)``  means that the anchor is at the element center.
-    :param iterations: Number of times erosion to be applied.
-    :param stream: Stream for the asynchronous version.
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-.. seealso:: :ocv:func:`erode`
-gpu::dilate
---------------
-Dilates an image by using a specific structuring element.
-.. ocv:function:: void gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor=Point(-1, -1), int iterations=1 )
-.. ocv:function:: void gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor=Point(-1, -1), int iterations=1, Stream& stream=Stream::Null() )
-    :param src: Source image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
-    :param dst: Destination image with the same size and type as ``src``.
-    :param kernel: Structuring element used for dilation. If  ``kernel=Mat()``, a  3x3 rectangular structuring element is used.
-    :param anchor: Position of an anchor within the element. The default value  ``(-1, -1)``  means that the anchor is at the element center.
+.. ocv:function:: Ptr<Filter> gpu::createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor = Point(-1,-1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
-    :param iterations: Number of times dilation to be applied.
-    :param stream: Stream for the asynchronous version.
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-.. seealso:: :ocv:func:`dilate`
-gpu::morphologyEx
---------------------
-Applies an advanced morphological operation to an image.
-.. ocv:function::  void gpu::morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor=Point(-1, -1), int iterations=1 )
-.. ocv:function:: void gpu::morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, Point anchor=Point(-1, -1), int iterations=1, Stream& stream=Stream::Null() )
-    :param src: Source image.  ``CV_8UC1``  and  ``CV_8UC4``  source types are supported.
-    :param dst: Destination image with the same size and type as  ``src`` .
-    :param op: Type of morphological operation. The following types are possible:
-        * **MORPH_OPEN** opening
-        * **MORPH_CLOSE** closing
-        * **MORPH_GRADIENT** morphological gradient
-        * **MORPH_TOPHAT** "top hat"
-        * **MORPH_BLACKHAT** "black hat"
-    :param kernel: Structuring element.
-    :param anchor: Position of an anchor within the element. The default value ``Point(-1, -1)`` means that the anchor is at the element center.
-    :param iterations: Number of times erosion and dilation to be applied.
-    :param stream: Stream for the asynchronous version.
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-.. seealso:: :ocv:func:`morphologyEx`
-gpu::createLinearFilter_GPU
-------------------------------
-Creates a non-separable linear filter.
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT)
    :param srcType: Input image type. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one and four channel image.
-    :param dstType: Output image type. The same type as ``src`` is supported.
+    :param dstType: Output image type. Only the same type as ``src`` is supported for now.
-    :param kernel: 2D array of filter coefficients. Floating-point coefficients will be converted to fixed-point representation before the actual processing. Supports size up to 16. For larger kernels use :ocv:class:`gpu::Convolution`.
-    :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
-.. seealso:: :ocv:func:`createLinearFilter`
-gpu::filter2D
-----------------
-Applies the non-separable 2D linear filter to an image.
-.. ocv:function:: void gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null())
-    :param src: Source image. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one and four channel image.
-    :param dst: Destination image. The size and the number of channels is the same as  ``src`` .
-    :param ddepth: Desired depth of the destination image. If it is negative, it is the same as  ``src.depth()`` . It supports only the same depth as the source image depth.
    :param kernel: 2D array of filter coefficients.
-    :param anchor: Anchor of the kernel that indicates the relative position of a filtered point within the kernel. The anchor resides within the kernel. The special default value (-1,-1) means that the anchor is at the kernel center.
+    :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
-    :param stream: Stream for the asynchronous version.
+    :param borderVal: Default border value.
-.. seealso:: :ocv:func:`filter2D`, :ocv:class:`gpu::Convolution`
+.. seealso:: :ocv:func:`filter2D`
-gpu::Laplacian
+gpu::createLaplacianFilter
------------------
+--------------------------
-Applies the Laplacian operator to an image.
+Creates a Laplacian operator.
-.. ocv:function:: void gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null())
+.. ocv:function:: Ptr<Filter> gpu::createLaplacianFilter(int srcType, int dstType, int ksize = 1, double scale = 1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
-    :param src: Source image. ``CV_8UC1``  and  ``CV_8UC4``  source types are supported.
+    :param srcType: Input image type. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one and four channel image.
-    :param dst: Destination image. The size and number of channels is the same as  ``src`` .
-    :param ddepth: Desired depth of the destination image. It supports only the same depth as the source image depth.
+    :param dstType: Output image type. Only the same type as ``src`` is supported for now.
    :param ksize: Aperture size used to compute the second-derivative filters (see :ocv:func:`getDerivKernels`). It must be positive and odd. Only  ``ksize``  = 1 and  ``ksize``  = 3 are supported.
    :param scale: Optional scale factor for the computed Laplacian values. By default, no scaling is applied (see  :ocv:func:`getDerivKernels` ).
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
-    :param stream: Stream for the asynchronous version.
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-.. seealso:: :ocv:func:`Laplacian`, :ocv:func:`gpu::filter2D`
-gpu::getLinearRowFilter_GPU
-------------------------------
-Creates a primitive row filter with the specified kernel.
-.. ocv:function:: Ptr<BaseRowFilter_GPU> gpu::getLinearRowFilter_GPU( int srcType, int bufType, const Mat& rowKernel, int anchor=-1, int borderType=BORDER_DEFAULT )
-    :param srcType: Source array type. Only  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-    :param bufType: Intermediate buffer type with as many channels as  ``srcType`` .
-    :param rowKernel: Filter coefficients. Support kernels with ``size <= 16`` .
-    :param anchor: Anchor position within the kernel. Negative values mean that the anchor is positioned at the aperture center.
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate`. For details on limitations, see below.
-There are two versions of the algorithm: NPP and OpenCV.
-    * NPP version is called when ``srcType == CV_8UC1`` or ``srcType == CV_8UC4`` and ``bufType == srcType`` . Otherwise, the OpenCV version is called. NPP supports only ``BORDER_CONSTANT`` border type and does not check indices outside the image.
-    * OpenCV version supports only ``CV_32F`` buffer depth and ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , and ``BORDER_CONSTANT`` border types. It checks indices outside the image.
-.. seealso:: :ocv:func:`createSeparableLinearFilter` .
-gpu::getLinearColumnFilter_GPU
----------------------------------
-Creates a primitive column filter with the specified kernel.
-.. ocv:function:: Ptr<BaseColumnFilter_GPU> gpu::getLinearColumnFilter_GPU( int bufType, int dstType, const Mat& columnKernel, int anchor=-1, int borderType=BORDER_DEFAULT )
+    :param borderVal: Default border value.
-    :param bufType: Intermediate buffer type with as many channels as  ``dstType`` .
+.. seealso:: :ocv:func:`Laplacian`
-    :param dstType: Destination array type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` destination types are supported.
-    :param columnKernel: Filter coefficients. Support kernels with ``size <= 16`` .
-    :param anchor: Anchor position within the kernel. Negative values mean that the anchor is positioned at the aperture center.
+gpu::createSeparableLinearFilter
+--------------------------------
+Creates a separable linear filter.
-    :param borderType: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate` . For details on limitations, see below.
+.. ocv:function:: Ptr<Filter> gpu::createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel, Point anchor = Point(-1,-1), int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
-There are two versions of the algorithm: NPP and OpenCV.
+    :param srcType: Source array type.
-    * NPP version is called when ``dstType == CV_8UC1`` or ``dstType == CV_8UC4`` and ``bufType == dstType`` . Otherwise, the OpenCV version is called. NPP supports only ``BORDER_CONSTANT`` border type and does not check indices outside the image.
+    :param dstType: Destination array type.
-    * OpenCV version supports only ``CV_32F`` buffer depth and ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , and ``BORDER_CONSTANT`` border types. It checks indices outside image.
+    :param rowKernel: Horizontal filter coefficients. Support kernels with ``size <= 32`` .
-.. seealso:: :ocv:func:`gpu::getLinearRowFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
+    :param columnKernel: Vertical filter coefficients. Support kernels with ``size <= 32`` .
-gpu::createSeparableLinearFilter_GPU
----------------------------------------
-Creates a separable linear filter engine.
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1)
-    :param srcType: Source array type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-    :param dstType: Destination array type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  destination types are supported.
-    :param rowKernel: Horizontal filter coefficients. Support kernels with ``size <= 16`` .
-    :param columnKernel: Vertical filter coefficients. Support kernels with ``size <= 16`` .
    :param anchor: Anchor position within the kernel. Negative values mean that anchor is positioned at the aperture center.
-    :param rowBorderType: Pixel extrapolation method in the vertical direction For details, see  :ocv:func:`borderInterpolate`. For details on limitations, see :ocv:func:`gpu::getLinearRowFilter_GPU`, cpp:ocv:func:`gpu::getLinearColumnFilter_GPU`.
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction For details, see  :ocv:func:`borderInterpolate`.
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
-.. seealso:: :ocv:func:`gpu::getLinearRowFilter_GPU`, :ocv:func:`gpu::getLinearColumnFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
+.. seealso:: :ocv:func:`sepFilter2D`
-gpu::sepFilter2D
+gpu::createDerivFilter
--------------------
+----------------------
-Applies a separable 2D linear filter to an image.
+Creates a generalized Deriv operator.
-.. ocv:function:: void gpu::sepFilter2D( const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor=Point(-1,-1), int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
+.. ocv:function:: Ptr<Filter> gpu::createDerivFilter(int srcType, int dstType, int dx, int dy, int ksize, bool normalize = false, double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
-.. ocv:function:: void gpu::sepFilter2D( const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf, Point anchor=Point(-1,-1), int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
+    :param srcType: Source image type.
+    :param dstType: Destination array type.
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
+    :param dx: Derivative order in respect of x.
-    :param dst: Destination image with the same size and number of channels as  ``src`` .
+    :param dy: Derivative order in respect of y.
-    :param ddepth: Destination image depth.  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F`` are supported.
+    :param ksize: Aperture size. See  :ocv:func:`getDerivKernels` for details.
-    :param kernelX: Horizontal filter coefficients.
+    :param normalize: Flag indicating whether to normalize (scale down) the filter coefficients or not. See  :ocv:func:`getDerivKernels` for details.
-    :param kernelY: Vertical filter coefficients.
+    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. For details, see  :ocv:func:`getDerivKernels` .
-    :param anchor: Anchor position within the kernel. The default value ``(-1, 1)`` means that the anchor is at the kernel center.
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-    :param stream: Stream for the asynchronous version.
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`sepFilter2D`
+gpu::createSobelFilter
+----------------------
+Creates a Sobel operator.
+.. ocv:function:: Ptr<Filter> gpu::createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize = 3, double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+    :param srcType: Source image type.
-gpu::createDerivFilter_GPU
+    :param dstType: Destination array type.
------------------------------
-Creates a filter engine for the generalized Sobel operator.
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1)
+    :param dx: Derivative order in respect of x.
-    :param srcType: Source image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
+    :param dy: Derivative order in respect of y.
-    :param dstType: Destination image type with as many channels as  ``srcType`` ,  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F``  depths are supported.
+    :param ksize: Size of the extended Sobel kernel. Possible values are 1, 3, 5 or 7.
-    :param dx: Derivative order in respect of x.
+    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. For details, see  :ocv:func:`getDerivKernels` .
-    :param dy: Derivative order in respect of y.
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-    :param ksize: Aperture size. See  :ocv:func:`getDerivKernels` for details.
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+.. seealso:: :ocv:func:`Sobel`
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`createDerivFilter`
+gpu::createScharrFilter
+-----------------------
+Creates a vertical or horizontal Scharr operator.
+.. ocv:function:: Ptr<Filter> gpu::createScharrFilter(int srcType, int dstType, int dx, int dy, double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
-gpu::Sobel
+    :param srcType: Source image type.
--------------
-Applies the generalized Sobel operator to an image.
-.. ocv:function:: void gpu::Sobel( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize=3, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
+    :param dstType: Destination array type.
-.. ocv:function:: void gpu::Sobel( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize=3, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
+    :param dx: Order of the derivative x.
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
+    :param dy: Order of the derivative y.
-    :param dst: Destination image with the same size and number of channels as source image.
+    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. See  :ocv:func:`getDerivKernels`  for details.
-    :param ddepth: Destination image depth.  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F`` are supported.
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-    :param dx: Derivative order in respect of x.
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
-    :param dy: Derivative order in respect of y.
+.. seealso:: :ocv:func:`Scharr`
-    :param ksize: Size of the extended Sobel kernel. Possible values are 1, 3, 5 or 7.
-    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. For details, see  :ocv:func:`getDerivKernels` .
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+gpu::createGaussianFilter
+-------------------------
+Creates a Gaussian filter.
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
+.. ocv:function:: Ptr<Filter> gpu::createGaussianFilter(int srcType, int dstType, Size ksize, double sigma1, double sigma2 = 0, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
-    :param stream: Stream for the asynchronous version.
+    :param srcType: Source image type.
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`Sobel`
+    :param dstType: Destination array type.
+    :param ksize: Aperture size. See  :ocv:func:`getGaussianKernel` for details.
+    :param sigma1: Gaussian sigma in the horizontal direction. See  :ocv:func:`getGaussianKernel` for details.
-gpu::Scharr
+    :param sigma2: Gaussian sigma in the vertical direction. If 0, then  :math:`\texttt{sigma2}\leftarrow\texttt{sigma1}` .
---------------
-Calculates the first x- or y- image derivative using the Scharr operator.
-.. ocv:function:: void gpu::Scharr( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-.. ocv:function:: void gpu::Scharr( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
+.. seealso:: :ocv:func:`GaussianBlur`
-    :param dst: Destination image with the same size and number of channels as  ``src`` has.
-    :param ddepth: Destination image depth.  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F`` are supported.
-    :param dx: Order of the derivative x.
+gpu::createMorphologyFilter
+---------------------------
+Creates a 2D morphological filter.
-    :param dy: Order of the derivative y.
+.. ocv:function:: Ptr<Filter> gpu::createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor = Point(-1, -1), int iterations = 1)
-    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. See  :ocv:func:`getDerivKernels`  for details.
+    :param op: Type of morphological operation. The following types are possible:
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+        * **MORPH_ERODE** erode
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
+        * **MORPH_DILATE** dilate
-    :param stream: Stream for the asynchronous version.
+        * **MORPH_OPEN** opening
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`Scharr`
+        * **MORPH_CLOSE** closing
+        * **MORPH_GRADIENT** morphological gradient
+        * **MORPH_TOPHAT** "top hat"
-gpu::createGaussianFilter_GPU
+        * **MORPH_BLACKHAT** "black hat"
---------------------------------
-Creates a Gaussian filter engine.
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createGaussianFilter_GPU( int type, Size ksize, double sigma1, double sigma2=0, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
+    :param srcType: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4``  are supported.
-    :param type: Source and destination image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` are supported.
+    :param kernel: 2D 8-bit structuring element for the morphological operation.
-    :param ksize: Aperture size. See  :ocv:func:`getGaussianKernel` for details.
+    :param anchor: Anchor position within the structuring element. Negative values mean that the anchor is at the center.
-    :param sigma1: Gaussian sigma in the horizontal direction. See  :ocv:func:`getGaussianKernel` for details.
+    :param iterations: Number of times erosion and dilation to be applied.
-    :param sigma2: Gaussian sigma in the vertical direction. If 0, then  :math:`\texttt{sigma2}\leftarrow\texttt{sigma1}` .
+.. seealso:: :ocv:func:`morphologyEx`
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`createGaussianFilter`
+gpu::createBoxMaxFilter
+-----------------------
+Creates the maximum filter.
+.. ocv:function:: Ptr<Filter> gpu::createBoxMaxFilter(int srcType, Size ksize, Point anchor = Point(-1, -1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
+    :param srcType: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.
-gpu::GaussianBlur
+    :param ksize: Kernel size.
---------------------
-Smooths an image using the Gaussian filter.
-.. ocv:function:: void gpu::GaussianBlur( const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2=0, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
+    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
-.. ocv:function:: void gpu::GaussianBlur( const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2=0, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
+    :param borderVal: Default border value.
-    :param dst: Destination image with the same size and type as  ``src`` .
-    :param ksize: Gaussian kernel size.  ``ksize.width``  and  ``ksize.height``  can differ but they both must be positive and odd. If they are zeros, they are computed from  ``sigma1``  and  ``sigma2`` .
-    :param sigma1: Gaussian kernel standard deviation in X direction.
+gpu::createBoxMinFilter
+-----------------------
+Creates the minimum filter.
-    :param sigma2: Gaussian kernel standard deviation in Y direction. If  ``sigma2``  is zero, it is set to be equal to  ``sigma1`` . If they are both zeros, they are computed from  ``ksize.width``  and  ``ksize.height``, respectively. See  :ocv:func:`getGaussianKernel` for details. To fully control the result regardless of possible future modification of all this semantics, you are recommended to specify all of  ``ksize`` , ``sigma1`` , and  ``sigma2`` .
+.. ocv:function:: Ptr<Filter> gpu::createBoxMinFilter(int srcType, Size ksize, Point anchor = Point(-1, -1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+    :param srcType: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
+    :param ksize: Kernel size.
-    :param stream: Stream for the asynchronous version.
+    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
-.. seealso:: :ocv:func:`gpu::createGaussianFilter_GPU`, :ocv:func:`GaussianBlur`
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+    :param borderVal: Default border value.
-gpu::getMaxFilter_GPU
-------------------------
-Creates the maximum filter.
-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1))
+gpu::createRowSumFilter
+-----------------------
+Creates a horizontal 1D box filter.
+.. ocv:function:: Ptr<Filter> gpu::createRowSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
-    :param srcType: Input image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.
+    :param srcType: Input image type. Only ``CV_8UC1`` type is supported for now.
-    :param dstType: Output image type. It supports only the same type as the source type.
+    :param sumType: Output image type. Only ``CV_32FC1`` type is supported for now.
    :param ksize: Kernel size.
    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+    :param borderVal: Default border value.
-gpu::getMinFilter_GPU
-------------------------
-Creates the minimum filter.
-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1))
+gpu::createColumnSumFilter
+--------------------------
+Creates a vertical 1D box filter.
-    :param srcType: Input image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.
+.. ocv:function:: Ptr<Filter> gpu::createColumnSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
-    :param dstType: Output image type. It supports only the same type as the source type.
+    :param srcType: Input image type. Only ``CV_8UC1`` type is supported for now.
+    :param sumType: Output image type. Only ``CV_32FC1`` type is supported for now.
    :param ksize: Kernel size.
    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+    :param borderVal: Default border value.
--- a/modules/gpufilters/include/opencv2/gpufilters.hpp
+++ b/modules/gpufilters/include/opencv2/gpufilters.hpp
@@ -48,221 +48,101 @@
 #endif
 #include "opencv2/core/gpu.hpp"
-#include "opencv2/core/base.hpp"
+#include "opencv2/imgproc.hpp"
 namespace cv { namespace gpu {
-/*!
+class CV_EXPORTS Filter : public Algorithm
-The Base Class for 1D or Row-wise Filters
-This is the base class for linear or non-linear filters that process 1D data.
-In particular, such filters are used for the "horizontal" filtering parts in separable filters.
-*/
-class CV_EXPORTS BaseRowFilter_GPU
 {
 public:
-    BaseRowFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
+    virtual void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
-    virtual ~BaseRowFilter_GPU() {}
-    virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-    int ksize, anchor;
-};
-/*!
-The Base Class for Column-wise Filters
-This is the base class for linear or non-linear filters that process columns of 2D arrays.
-Such filters are used for the "vertical" filtering parts in separable filters.
-*/
-class CV_EXPORTS BaseColumnFilter_GPU
-{
-public:
-    BaseColumnFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
-    virtual ~BaseColumnFilter_GPU() {}
-    virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-    int ksize, anchor;
-};
-/*!
-The Base Class for Non-Separable 2D Filters.
-This is the base class for linear or non-linear 2D filters.
-*/
-class CV_EXPORTS BaseFilter_GPU
-{
-public:
-    BaseFilter_GPU(const Size& ksize_, const Point& anchor_) : ksize(ksize_), anchor(anchor_) {}
-    virtual ~BaseFilter_GPU() {}
-    virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-    Size ksize;
-    Point anchor;
-};
-/*!
-The Base Class for Filter Engine.
-The class can be used to apply an arbitrary filtering operation to an image.
-It contains all the necessary intermediate buffers.
-*/
-class CV_EXPORTS FilterEngine_GPU
-{
-public:
-    virtual ~FilterEngine_GPU() {}
-    virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0;
 };
-//! returns the non-separable filter engine with the specified filter
+////////////////////////////////////////////////////////////////////////////////////////////////////
-CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType);
+// Box Filter
-//! returns the separable filter engine with the specified filters
+//! creates a normalized 2D box filter
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
+//! supports CV_8UC1, CV_8UC4 types
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType);
+CV_EXPORTS Ptr<Filter> createBoxFilter(int srcType, int dstType, Size ksize, Point anchor = Point(-1,-1),
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
+                                       int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf);
-//! returns horizontal 1D box filter
-//! supports only CV_8UC1 source type and CV_32FC1 sum type
-CV_EXPORTS Ptr<BaseRowFilter_GPU> getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor = -1);
-//! returns vertical 1D box filter
-//! supports only CV_8UC1 sum type and CV_32FC1 dst type
-CV_EXPORTS Ptr<BaseColumnFilter_GPU> getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1);
-//! returns 2D box filter
-//! supports CV_8UC1 and CV_8UC4 source type, dst type must be the same as source type
-CV_EXPORTS Ptr<BaseFilter_GPU> getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1, -1));
-//! returns box filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createBoxFilter_GPU(int srcType, int dstType, const Size& ksize,
-    const Point& anchor = Point(-1,-1));
-//! returns 2D morphological filter
-//! only MORPH_ERODE and MORPH_DILATE are supported
-//! supports CV_8UC1 and CV_8UC4 types
-//! kernel must have CV_8UC1 type, one rows and cols == ksize.width * ksize.height
-CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize,
-    Point anchor=Point(-1,-1));
-//! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
-CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel,
-    const Point& anchor = Point(-1,-1), int iterations = 1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf,
-    const Point& anchor = Point(-1,-1), int iterations = 1);
-//! returns 2D filter with the specified kernel
+////////////////////////////////////////////////////////////////////////////////////////////////////
-//! supports CV_8U, CV_16U and CV_32F one and four channel image
+// Linear Filter
-CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
-//! returns the non-separable linear filter engine
+//! Creates a non-separable linear 2D filter
-CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel,
+//! supports 1 and 4 channel CV_8U, CV_16U and CV_32F input
-    Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT);
+CV_EXPORTS Ptr<Filter> createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor = Point(-1,-1),
+                                          int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
-//! returns the primitive row filter with the specified kernel.
+////////////////////////////////////////////////////////////////////////////////////////////////////
-//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type.
+// Laplacian Filter
-//! there are two version of algorithm: NPP and OpenCV.
-//! NPP calls when srcType == CV_8UC1 or srcType == CV_8UC4 and bufType == srcType,
-//! otherwise calls OpenCV version.
-//! NPP supports only BORDER_CONSTANT border type.
-//! OpenCV version supports only CV_32F as buffer depth and
-//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
-CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel,
-    int anchor = -1, int borderType = BORDER_DEFAULT);
-//! returns the primitive column filter with the specified kernel.
+//! creates a Laplacian operator
-//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type.
+//! supports only ksize = 1 and ksize = 3
-//! there are two version of algorithm: NPP and OpenCV.
+CV_EXPORTS Ptr<Filter> createLaplacianFilter(int srcType, int dstType, int ksize = 1, double scale = 1,
-//! NPP calls when dstType == CV_8UC1 or dstType == CV_8UC4 and bufType == dstType,
+                                             int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
-//! otherwise calls OpenCV version.
-//! NPP supports only BORDER_CONSTANT border type.
-//! OpenCV version supports only CV_32F as buffer depth and
-//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
-CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel,
-    int anchor = -1, int borderType = BORDER_DEFAULT);
-//! returns the separable linear filter engine
+////////////////////////////////////////////////////////////////////////////////////////////////////
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
+// Separable Linear Filter
-    const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
-    int columnBorderType = -1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
-    const Mat& columnKernel, GpuMat& buf, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
-    int columnBorderType = -1);
-//! returns filter engine for the generalized Sobel operator
+//! creates a separable linear filter
-CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize,
+CV_EXPORTS Ptr<Filter> createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel,
-                                                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
+                                                   Point anchor = Point(-1,-1), int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf,
-                                                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-//! returns the Gaussian filter engine
+////////////////////////////////////////////////////////////////////////////////////////////////////
-CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0,
+// Deriv Filter
-                                                          int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,
-                                                          int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-//! returns maximum filter
+//! creates a generalized Deriv operator
-CV_EXPORTS Ptr<BaseFilter_GPU> getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
+CV_EXPORTS Ptr<Filter> createDerivFilter(int srcType, int dstType, int dx, int dy,
+                                         int ksize, bool normalize = false, double scale = 1,
+                                         int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
-//! returns minimum filter
+//! creates a Sobel operator
-CV_EXPORTS Ptr<BaseFilter_GPU> getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
+CV_EXPORTS Ptr<Filter> createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize = 3,
+                                         double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
-//! smooths the image using the normalized box filter
+//! creates a vertical or horizontal Scharr operator
-//! supports CV_8UC1, CV_8UC4 types
+CV_EXPORTS Ptr<Filter> createScharrFilter(int srcType, int dstType, int dx, int dy,
-CV_EXPORTS void boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null());
+                                          double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
-//! a synonym for normalized box filter
+////////////////////////////////////////////////////////////////////////////////////////////////////
-static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null())
+// Gaussian Filter
-{
-    boxFilter(src, dst, -1, ksize, anchor, stream);
-}
-//! erodes the image (applies the local minimum operator)
+//! creates a Gaussian filter
-CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
+CV_EXPORTS Ptr<Filter> createGaussianFilter(int srcType, int dstType, Size ksize,
-CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf,
+                                            double sigma1, double sigma2 = 0,
-                      Point anchor = Point(-1, -1), int iterations = 1,
+                                            int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
-                      Stream& stream = Stream::Null());
-//! dilates the image (applies the local maximum operator)
+////////////////////////////////////////////////////////////////////////////////////////////////////
-CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
+// Morphology Filter
-CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf,
-                       Point anchor = Point(-1, -1), int iterations = 1,
-                       Stream& stream = Stream::Null());
-//! applies an advanced morphological operation to the image
+//! creates a 2D morphological filter
-CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
+//! supports CV_8UC1 and CV_8UC4 types
-CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2,
+CV_EXPORTS Ptr<Filter> createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor = Point(-1, -1), int iterations = 1);
-                             Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
-//! applies non-separable 2D linear filter to the image
+////////////////////////////////////////////////////////////////////////////////////////////////////
-CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
+// Image Rank Filter
-//! applies separable 2D linear filter to the image
+//! result pixel value is the maximum of pixel values under the rectangular mask region
-CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
+CV_EXPORTS Ptr<Filter> createBoxMaxFilter(int srcType, Size ksize,
-                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
+                                          Point anchor = Point(-1, -1),
-CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf,
+                                          int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
-                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
-                            Stream& stream = Stream::Null());
-//! applies generalized Sobel operator to the image
+//! result pixel value is the maximum of pixel values under the rectangular mask region
-CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,
+CV_EXPORTS Ptr<Filter> createBoxMinFilter(int srcType, Size ksize,
-                      int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
+                                          Point anchor = Point(-1, -1),
-CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize = 3, double scale = 1,
+                                          int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
-                      int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
-//! applies the vertical or horizontal Scharr operator to the image
+////////////////////////////////////////////////////////////////////////////////////////////////////
-CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1,
+// 1D Sum Filter
-                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale = 1,
-                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
-//! smooths the image using Gaussian filter.
+//! creates a horizontal 1D box filter
-CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0,
+//! supports only CV_8UC1 source type and CV_32FC1 sum type
-                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
+CV_EXPORTS Ptr<Filter> createRowSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
-CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,
-                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
-//! applies Laplacian operator to the image
+//! creates a vertical 1D box filter
-//! supports only ksize = 1 and ksize = 3
+//! supports only CV_8UC1 sum type and CV_32FC1 dst type
-CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
+CV_EXPORTS Ptr<Filter> createColumnSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 }} // namespace cv { namespace gpu {

--- a/modules/gpufilters/perf/perf_filters.cpp
+++ b/modules/gpufilters/perf/perf_filters.cpp
@@ -70,7 +70,9 @@ PERF_TEST_P(Sz_Type_KernelSz, Blur,
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;
-        TEST_CYCLE() cv::gpu::blur(d_src, dst, cv::Size(ksize, ksize));
+        cv::Ptr<cv::gpu::Filter> blurFilter = cv::gpu::createBoxFilter(d_src.type(), -1, cv::Size(ksize, ksize));
+        TEST_CYCLE() blurFilter->apply(d_src, dst);
        GPU_SANITY_CHECK(dst, 1);
    }
@@ -85,9 +87,9 @@ PERF_TEST_P(Sz_Type_KernelSz, Blur,
 }
 //////////////////////////////////////////////////////////////////////
-// Sobel
+// Filter2D
-PERF_TEST_P(Sz_Type_KernelSz, Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15)))
+PERF_TEST_P(Sz_Type_KernelSz, Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5, 7, 9, 11, 13, 15)))
 {
    declare.time(20.0);
@@ -98,13 +100,17 @@ PERF_TEST_P(Sz_Type_KernelSz, Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U
    cv::Mat src(size, type);
    declare.in(src, WARMUP_RNG);
+    cv::Mat kernel(ksize, ksize, CV_32FC1);
+    declare.in(kernel, WARMUP_RNG);
    if (PERF_RUN_GPU())
    {
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
-        TEST_CYCLE() cv::gpu::Sobel(d_src, dst, -1, 1, 1, d_buf, ksize);
+        cv::Ptr<cv::gpu::Filter> filter2D = cv::gpu::createLinearFilter(d_src.type(), -1, kernel);
+        TEST_CYCLE() filter2D->apply(d_src, dst);
        GPU_SANITY_CHECK(dst);
    }
@@ -112,21 +118,22 @@ PERF_TEST_P(Sz_Type_KernelSz, Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U
    {
        cv::Mat dst;
-        TEST_CYCLE() cv::Sobel(src, dst, -1, 1, 1, ksize);
+        TEST_CYCLE() cv::filter2D(src, dst, -1, kernel);
        CPU_SANITY_CHECK(dst);
    }
 }
 //////////////////////////////////////////////////////////////////////
-// Scharr
+// Laplacian
-PERF_TEST_P(Sz_Type, Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1)))
+PERF_TEST_P(Sz_Type_KernelSz, Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 3)))
 {
    declare.time(20.0);
    const cv::Size size = GET_PARAM(0);
    const int type = GET_PARAM(1);
+    const int ksize = GET_PARAM(2);
    cv::Mat src(size, type);
    declare.in(src, WARMUP_RNG);
@@ -135,9 +142,10 @@ PERF_TEST_P(Sz_Type, Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8
    {
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
-        TEST_CYCLE() cv::gpu::Scharr(d_src, dst, -1, 1, 0, d_buf);
+        cv::Ptr<cv::gpu::Filter> laplacian = cv::gpu::createLaplacianFilter(d_src.type(), -1, ksize);
+        TEST_CYCLE() laplacian->apply(d_src, dst);
        GPU_SANITY_CHECK(dst);
    }
@@ -145,16 +153,16 @@ PERF_TEST_P(Sz_Type, Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8
    {
        cv::Mat dst;
-        TEST_CYCLE() cv::Scharr(src, dst, -1, 1, 0);
+        TEST_CYCLE() cv::Laplacian(src, dst, -1, ksize);
        CPU_SANITY_CHECK(dst);
    }
 }
 //////////////////////////////////////////////////////////////////////
-// GaussianBlur
+// Sobel
-PERF_TEST_P(Sz_Type_KernelSz, GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15)))
+PERF_TEST_P(Sz_Type_KernelSz, Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15)))
 {
    declare.time(20.0);
@@ -169,9 +177,10 @@ PERF_TEST_P(Sz_Type_KernelSz, GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Value
    {
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
-        TEST_CYCLE() cv::gpu::GaussianBlur(d_src, dst, cv::Size(ksize, ksize), d_buf, 0.5);
+        cv::Ptr<cv::gpu::Filter> sobel = cv::gpu::createSobelFilter(d_src.type(), -1, 1, 1, ksize);
+        TEST_CYCLE() sobel->apply(d_src, dst);
        GPU_SANITY_CHECK(dst);
    }
@@ -179,22 +188,21 @@ PERF_TEST_P(Sz_Type_KernelSz, GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Value
    {
        cv::Mat dst;
-        TEST_CYCLE() cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
+        TEST_CYCLE() cv::Sobel(src, dst, -1, 1, 1, ksize);
        CPU_SANITY_CHECK(dst);
    }
 }
 //////////////////////////////////////////////////////////////////////
-// Laplacian
+// Scharr
-PERF_TEST_P(Sz_Type_KernelSz, Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 3)))
+PERF_TEST_P(Sz_Type, Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1)))
 {
    declare.time(20.0);
    const cv::Size size = GET_PARAM(0);
    const int type = GET_PARAM(1);
-    const int ksize = GET_PARAM(2);
    cv::Mat src(size, type);
    declare.in(src, WARMUP_RNG);
@@ -204,7 +212,9 @@ PERF_TEST_P(Sz_Type_KernelSz, Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(C
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;
-        TEST_CYCLE() cv::gpu::Laplacian(d_src, dst, -1, ksize);
+        cv::Ptr<cv::gpu::Filter> scharr = cv::gpu::createScharrFilter(d_src.type(), -1, 1, 0);
+        TEST_CYCLE() scharr->apply(d_src, dst);
        GPU_SANITY_CHECK(dst);
    }
@@ -212,34 +222,34 @@ PERF_TEST_P(Sz_Type_KernelSz, Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(C
    {
        cv::Mat dst;
-        TEST_CYCLE() cv::Laplacian(src, dst, -1, ksize);
+        TEST_CYCLE() cv::Scharr(src, dst, -1, 1, 0);
        CPU_SANITY_CHECK(dst);
    }
 }
 //////////////////////////////////////////////////////////////////////
-// Erode
+// GaussianBlur
-PERF_TEST_P(Sz_Type, Erode, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4)))
+PERF_TEST_P(Sz_Type_KernelSz, GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15)))
 {
    declare.time(20.0);
    const cv::Size size = GET_PARAM(0);
    const int type = GET_PARAM(1);
+    const int ksize = GET_PARAM(2);
    cv::Mat src(size, type);
    declare.in(src, WARMUP_RNG);
-    const cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
    if (PERF_RUN_GPU())
    {
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
-        TEST_CYCLE() cv::gpu::erode(d_src, dst, ker, d_buf);
+        cv::Ptr<cv::gpu::Filter> gauss = cv::gpu::createGaussianFilter(d_src.type(), -1, cv::Size(ksize, ksize), 0.5);
+        TEST_CYCLE() gauss->apply(d_src, dst);
        GPU_SANITY_CHECK(dst);
    }
@@ -247,16 +257,16 @@ PERF_TEST_P(Sz_Type, Erode, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8U
    {
        cv::Mat dst;
-        TEST_CYCLE() cv::erode(src, dst, ker);
+        TEST_CYCLE() cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
        CPU_SANITY_CHECK(dst);
    }
 }
 //////////////////////////////////////////////////////////////////////
-// Dilate
+// Erode
-PERF_TEST_P(Sz_Type, Dilate, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4)))
+PERF_TEST_P(Sz_Type, Erode, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4)))
 {
    declare.time(20.0);
@@ -272,9 +282,10 @@ PERF_TEST_P(Sz_Type, Dilate, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8
    {
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
-        TEST_CYCLE() cv::gpu::dilate(d_src, dst, ker, d_buf);
+        cv::Ptr<cv::gpu::Filter> erode = cv::gpu::createMorphologyFilter(cv::MORPH_ERODE, src.type(), ker);
+        TEST_CYCLE() erode->apply(d_src, dst);
        GPU_SANITY_CHECK(dst);
    }
@@ -282,26 +293,21 @@ PERF_TEST_P(Sz_Type, Dilate, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8
    {
        cv::Mat dst;
-        TEST_CYCLE() cv::dilate(src, dst, ker);
+        TEST_CYCLE() cv::erode(src, dst, ker);
        CPU_SANITY_CHECK(dst);
    }
 }
 //////////////////////////////////////////////////////////////////////
-// MorphologyEx
+// Dilate
-CV_ENUM(MorphOp, MORPH_OPEN, MORPH_CLOSE, MORPH_GRADIENT, MORPH_TOPHAT, MORPH_BLACKHAT)
-DEF_PARAM_TEST(Sz_Type_Op, cv::Size, MatType, MorphOp);
-PERF_TEST_P(Sz_Type_Op, MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4), MorphOp::all()))
+PERF_TEST_P(Sz_Type, Dilate, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4)))
 {
    declare.time(20.0);
    const cv::Size size = GET_PARAM(0);
    const int type = GET_PARAM(1);
-    const int morphOp = GET_PARAM(2);
    cv::Mat src(size, type);
    declare.in(src, WARMUP_RNG);
@@ -312,10 +318,10 @@ PERF_TEST_P(Sz_Type_Op, MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8
    {
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf1;
-        cv::gpu::GpuMat d_buf2;
-        TEST_CYCLE() cv::gpu::morphologyEx(d_src, dst, morphOp, ker, d_buf1, d_buf2);
+        cv::Ptr<cv::gpu::Filter> dilate = cv::gpu::createMorphologyFilter(cv::MORPH_DILATE, src.type(), ker);
+        TEST_CYCLE() dilate->apply(d_src, dst);
        GPU_SANITY_CHECK(dst);
    }
@@ -323,35 +329,40 @@ PERF_TEST_P(Sz_Type_Op, MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8
    {
        cv::Mat dst;
-        TEST_CYCLE() cv::morphologyEx(src, dst, morphOp, ker);
+        TEST_CYCLE() cv::dilate(src, dst, ker);
        CPU_SANITY_CHECK(dst);
    }
 }
 //////////////////////////////////////////////////////////////////////
-// Filter2D
+// MorphologyEx
-PERF_TEST_P(Sz_Type_KernelSz, Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5, 7, 9, 11, 13, 15)))
+CV_ENUM(MorphOp, MORPH_OPEN, MORPH_CLOSE, MORPH_GRADIENT, MORPH_TOPHAT, MORPH_BLACKHAT)
+DEF_PARAM_TEST(Sz_Type_Op, cv::Size, MatType, MorphOp);
+PERF_TEST_P(Sz_Type_Op, MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4), MorphOp::all()))
 {
    declare.time(20.0);
    const cv::Size size = GET_PARAM(0);
    const int type = GET_PARAM(1);
-    const int ksize = GET_PARAM(2);
+    const int morphOp = GET_PARAM(2);
    cv::Mat src(size, type);
    declare.in(src, WARMUP_RNG);
-    cv::Mat kernel(ksize, ksize, CV_32FC1);
+    const cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
-    declare.in(kernel, WARMUP_RNG);
    if (PERF_RUN_GPU())
    {
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;
-        TEST_CYCLE() cv::gpu::filter2D(d_src, dst, -1, kernel);
+        cv::Ptr<cv::gpu::Filter> morph = cv::gpu::createMorphologyFilter(morphOp, src.type(), ker);
+        TEST_CYCLE() morph->apply(d_src, dst);
        GPU_SANITY_CHECK(dst);
    }
@@ -359,7 +370,7 @@ PERF_TEST_P(Sz_Type_KernelSz, Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV
    {
        cv::Mat dst;
-        TEST_CYCLE() cv::filter2D(src, dst, -1, kernel);
+        TEST_CYCLE() cv::morphologyEx(src, dst, morphOp, ker);
        CPU_SANITY_CHECK(dst);
    }

--- a/modules/gpufilters/src/cuda/filter2d.cu
+++ b/modules/gpufilters/src/cuda/filter2d.cu
@@ -48,111 +48,104 @@
 namespace cv { namespace gpu { namespace cudev
 {
-    namespace imgproc
+    template <class SrcPtr, typename D>
+    __global__ void filter2D(const SrcPtr src, PtrStepSz<D> dst,
+                             const float* __restrict__ kernel,
+                             const int kWidth, const int kHeight,
+                             const int anchorX, const int anchorY)
    {
-        #define FILTER2D_MAX_KERNEL_SIZE 16
+        typedef typename TypeVec<float, VecTraits<D>::cn>::vec_type sum_t;
-        __constant__ float c_filter2DKernel[FILTER2D_MAX_KERNEL_SIZE * FILTER2D_MAX_KERNEL_SIZE];
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-        template <class SrcT, typename D>
+        if (x >= dst.cols || y >= dst.rows)
-        __global__ void filter2D(const SrcT src, PtrStepSz<D> dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY)
+            return;
-        {
-            typedef typename TypeVec<float, VecTraits<D>::cn>::vec_type sum_t;
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-            if (x >= dst.cols || y >= dst.rows)
-                return;
-            sum_t res = VecTraits<sum_t>::all(0);
+        sum_t res = VecTraits<sum_t>::all(0);
-            int kInd = 0;
+        int kInd = 0;
-            for (int i = 0; i < kHeight; ++i)
+        for (int i = 0; i < kHeight; ++i)
-            {
+        {
-                for (int j = 0; j < kWidth; ++j)
+            for (int j = 0; j < kWidth; ++j)
-                    res = res + src(y - anchorY + i, x - anchorX + j) * c_filter2DKernel[kInd++];
+                res = res + src(y - anchorY + i, x - anchorX + j) * kernel[kInd++];
-            }
-            dst(y, x) = saturate_cast<D>(res);
        }
-        template <typename T, typename D, template <typename> class Brd> struct Filter2DCaller;
+        dst(y, x) = saturate_cast<D>(res);
+    }
-        #define IMPLEMENT_FILTER2D_TEX_READER(type) \
+    template <typename T, typename D, template <typename> class Brd> struct Filter2DCaller;
-            texture< type , cudaTextureType2D, cudaReadModeElementType> tex_filter2D_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
-            struct tex_filter2D_ ## type ## _reader \
+    #define IMPLEMENT_FILTER2D_TEX_READER(type) \
+        texture< type , cudaTextureType2D, cudaReadModeElementType> tex_filter2D_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
+        struct tex_filter2D_ ## type ## _reader \
+        { \
+            typedef type elem_type; \
+            typedef int index_type; \
+            const int xoff; \
+            const int yoff; \
+            tex_filter2D_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
+            __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
            { \
-                typedef type elem_type; \
+                return tex2D(tex_filter2D_ ## type , x + xoff, y + yoff); \
-                typedef int index_type; \
+            } \
-                const int xoff; \
+        }; \
-                const int yoff; \
+        template <typename D, template <typename> class Brd> struct Filter2DCaller< type , D, Brd> \
-                tex_filter2D_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
+        { \
-                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
+            static void call(const PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz<D> dst, const float* kernel, \
-                { \
+                int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream) \
-                    return tex2D(tex_filter2D_ ## type , x + xoff, y + yoff); \
-                } \
-            }; \
-            template <typename D, template <typename> class Brd> struct Filter2DCaller< type , D, Brd> \
            { \
-                static void call(const PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz<D> dst, \
+                typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
-                    int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream) \
+                dim3 block(16, 16); \
-                { \
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
+                bindTexture(&tex_filter2D_ ## type , srcWhole); \
-                    dim3 block(16, 16); \
+                tex_filter2D_ ## type ##_reader texSrc(xoff, yoff); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                Brd<work_type> brd(dst.rows, dst.cols, VecTraits<work_type>::make(borderValue)); \
-                    bindTexture(&tex_filter2D_ ## type , srcWhole); \
+                BorderReader< tex_filter2D_ ## type ##_reader, Brd<work_type> > brdSrc(texSrc, brd); \
-                    tex_filter2D_ ## type ##_reader texSrc(xoff, yoff); \
+                filter2D<<<grid, block, 0, stream>>>(brdSrc, dst, kernel, kWidth, kHeight, anchorX, anchorY); \
-                    Brd<work_type> brd(dst.rows, dst.cols, VecTraits<work_type>::make(borderValue)); \
+                cudaSafeCall( cudaGetLastError() ); \
-                    BorderReader< tex_filter2D_ ## type ##_reader, Brd<work_type> > brdSrc(texSrc, brd); \
+                if (stream == 0) \
-                    filter2D<<<grid, block, 0, stream>>>(brdSrc, dst, kWidth, kHeight, anchorX, anchorY); \
+                    cudaSafeCall( cudaDeviceSynchronize() ); \
-                    cudaSafeCall( cudaGetLastError() ); \
+            } \
-                    if (stream == 0) \
+        };
-                        cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
+    IMPLEMENT_FILTER2D_TEX_READER(uchar);
-            };
+    IMPLEMENT_FILTER2D_TEX_READER(uchar4);
-        IMPLEMENT_FILTER2D_TEX_READER(uchar);
+    IMPLEMENT_FILTER2D_TEX_READER(ushort);
-        IMPLEMENT_FILTER2D_TEX_READER(uchar4);
+    IMPLEMENT_FILTER2D_TEX_READER(ushort4);
-        IMPLEMENT_FILTER2D_TEX_READER(ushort);
+    IMPLEMENT_FILTER2D_TEX_READER(float);
-        IMPLEMENT_FILTER2D_TEX_READER(ushort4);
+    IMPLEMENT_FILTER2D_TEX_READER(float4);
-        IMPLEMENT_FILTER2D_TEX_READER(float);
+    #undef IMPLEMENT_FILTER2D_TEX_READER
-        IMPLEMENT_FILTER2D_TEX_READER(float4);
+    template <typename T, typename D>
-        #undef IMPLEMENT_FILTER2D_TEX_READER
+    void filter2D(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel,
+                  int kWidth, int kHeight, int anchorX, int anchorY,
-        template <typename T, typename D>
+                  int borderMode, const float* borderValue, cudaStream_t stream)
-        void filter2D_gpu(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
+    {
-                          int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
+        typedef void (*func_t)(const PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<D> dst, const float* kernel,
-                          int borderMode, const float* borderValue, cudaStream_t stream)
+                               int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
+        static const func_t funcs[] =
        {
-            typedef void (*func_t)(const PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<D> dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
+            Filter2DCaller<T, D, BrdConstant>::call,
-            static const func_t funcs[] =
+            Filter2DCaller<T, D, BrdReplicate>::call,
-            {
+            Filter2DCaller<T, D, BrdReflect>::call,
-                Filter2DCaller<T, D, BrdConstant>::call,
+            Filter2DCaller<T, D, BrdWrap>::call,
-                Filter2DCaller<T, D, BrdReplicate>::call,
+            Filter2DCaller<T, D, BrdReflect101>::call
-                Filter2DCaller<T, D, BrdReflect>::call,
+        };
-                Filter2DCaller<T, D, BrdWrap>::call,
-                Filter2DCaller<T, D, BrdReflect101>::call
+        funcs[borderMode]((PtrStepSz<T>) srcWhole, ofsX, ofsY, (PtrStepSz<D>) dst, kernel,
-            };
+                          kWidth, kHeight, anchorX, anchorY, borderValue, stream);
-            if (stream == 0)
-                cudaSafeCall( cudaMemcpyToSymbol(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
-            else
-                cudaSafeCall( cudaMemcpyToSymbolAsync(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
-            funcs[borderMode](static_cast< PtrStepSz<T> >(srcWhole), ofsX, ofsY, static_cast< PtrStepSz<D> >(dst), kWidth, kHeight, anchorX, anchorY, borderValue, stream);
-        }
-        template void filter2D_gpu<uchar, uchar>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<uchar4, uchar4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<ushort, ushort>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<ushort4, ushort4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<float, float>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<float4, float4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
    }
+    template void filter2D<uchar  , uchar  >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<uchar4 , uchar4 >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<ushort , ushort >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<ushort4, ushort4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<float  , float  >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<float4 , float4 >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
 }}}
 #endif // CUDA_DISABLER
--- a/modules/gpufilters/src/filtering.cpp
+++ b/modules/gpufilters/src/filtering.cpp
@@ -47,1284 +47,952 @@ using namespace cv::gpu;
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
+Ptr<Filter> cv::gpu::createBoxFilter(int, int, Size, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int, GpuMat&) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
+Ptr<Filter> cv::gpu::createLinearFilter(int, int, InputArray, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
-Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_no_cuda(); return Ptr<BaseRowFilter_GPU>(0); }
-Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_no_cuda(); return Ptr<BaseColumnFilter_GPU>(0); }
+Ptr<Filter> cv::gpu::createLaplacianFilter(int, int, int, double, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
-Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createBoxFilter_GPU(int, int, const Size&, const Point&) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
+Ptr<Filter> cv::gpu::createSeparableLinearFilter(int, int, InputArray, InputArray, Point, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
-Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
+Ptr<Filter> cv::gpu::createDerivFilter(int, int, int, int, int, bool, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
-Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, GpuMat&, const Point&, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
+Ptr<Filter> cv::gpu::createSobelFilter(int, int, int, int, int, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
-Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int, int, const Mat&, Point, int) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
+Ptr<Filter> cv::gpu::createScharrFilter(int, int, int, int, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
-Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int, int, const Mat&, Point, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int, int) { throw_no_cuda(); return Ptr<BaseRowFilter_GPU>(0); }
+Ptr<Filter> cv::gpu::createGaussianFilter(int, int, Size, double, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
-Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int, int) { throw_no_cuda(); return Ptr<BaseColumnFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
+Ptr<Filter> cv::gpu::createMorphologyFilter(int, int, InputArray, Point, int) { throw_no_cuda(); return Ptr<Filter>(); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, GpuMat&, const Point&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
+Ptr<Filter> cv::gpu::createBoxMaxFilter(int, Size, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int, GpuMat&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
+Ptr<Filter> cv::gpu::createBoxMinFilter(int, Size, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, double, double, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, GpuMat&, double, double, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
+Ptr<Filter> cv::gpu::createRowSumFilter(int, int, int, int, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
-Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
+Ptr<Filter> cv::gpu::createColumnSumFilter(int, int, int, int, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
-Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
-void cv::gpu::boxFilter(const GpuMat&, GpuMat&, int, Size, Point, Stream&) { throw_no_cuda(); }
-void cv::gpu::erode(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_no_cuda(); }
-void cv::gpu::erode(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_no_cuda(); }
-void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_no_cuda(); }
-void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, GpuMat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int) { throw_no_cuda(); }
-void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, GpuMat&, Point, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int) { throw_no_cuda(); }
-void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, GpuMat&, int, double, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double, int, int) { throw_no_cuda(); }
-void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, GpuMat&, double, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double, int, int) { throw_no_cuda(); }
-void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, GpuMat&, double, double, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::Laplacian(const GpuMat&, GpuMat&, int, int, double, int, Stream&) { throw_no_cuda(); }
 #else
 namespace
 {
-    inline void normalizeAnchor(int& anchor, int ksize)
+    void normalizeAnchor(int& anchor, int ksize)
    {
        if (anchor < 0)
            anchor = ksize >> 1;
-        CV_Assert(0 <= anchor && anchor < ksize);
+        CV_Assert( 0 <= anchor && anchor < ksize );
    }
-    inline void normalizeAnchor(Point& anchor, const Size& ksize)
+    void normalizeAnchor(Point& anchor, Size ksize)
    {
        normalizeAnchor(anchor.x, ksize.width);
        normalizeAnchor(anchor.y, ksize.height);
    }
-    inline void normalizeROI(Rect& roi, const Size& ksize, const Point& anchor, const Size& src_size)
-    {
-        if (roi == Rect(0,0,-1,-1))
-            roi = Rect(anchor.x, anchor.y, src_size.width - ksize.width, src_size.height - ksize.height);
-        CV_Assert(roi.x >= 0 && roi.y >= 0 && roi.width <= src_size.width && roi.height <= src_size.height);
-    }
-    inline void normalizeKernel(const Mat& kernel, GpuMat& gpu_krnl, int type = CV_8U, int* nDivisor = 0, bool reverse = false)
-    {
-        int scale = nDivisor && (kernel.depth() == CV_32F || kernel.depth() == CV_64F) ? 256 : 1;
-        if (nDivisor) *nDivisor = scale;
-        Mat temp(kernel.size(), type);
-        kernel.convertTo(temp, type, scale);
-        Mat cont_krnl = temp.reshape(1, 1);
-        if (reverse)
-        {
-            int count = cont_krnl.cols >> 1;
-            for (int i = 0; i < count; ++i)
-            {
-                std::swap(cont_krnl.at<int>(0, i), cont_krnl.at<int>(0, cont_krnl.cols - 1 - i));
-            }
-        }
-        gpu_krnl.upload(cont_krnl);
-    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
-// Filter2D
+// Box Filter
 namespace
 {
-    struct Filter2DEngine_GPU : public FilterEngine_GPU
+    class NPPBoxFilter : public Filter
    {
-        Filter2DEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int srcType_, int dstType_) :
+    public:
-            filter2D(filter2D_), srcType(srcType_), dstType(dstType_)
+        NPPBoxFilter(int srcType, int dstType, Size ksize, Point anchor, int borderMode, Scalar borderVal);
-        {}
-        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
-        {
-            CV_Assert(src.type() == srcType);
-            Size src_size = src.size();
-            dst.create(src_size, dstType);
-            if (roi.size() != src_size)
-            {
-                dst.setTo(Scalar::all(0), stream);
-            }
-            normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
-            GpuMat srcROI = src(roi);
+    private:
-            GpuMat dstROI = dst(roi);
+        typedef NppStatus (*nppFilterBox_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep,
+                                            NppiSize oSizeROI, NppiSize oMaskSize, NppiPoint oAnchor);
-            (*filter2D)(srcROI, dstROI, stream);
+        Size ksize_;
-        }
+        Point anchor_;
+        int type_;
-        Ptr<BaseFilter_GPU> filter2D;
+        nppFilterBox_t func_;
-        int srcType, dstType;
+        int borderMode_;
+        Scalar borderVal_;
+        GpuMat srcBorder_;
    };
-}
-Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType)
+    NPPBoxFilter::NPPBoxFilter(int srcType, int dstType, Size ksize, Point anchor, int borderMode, Scalar borderVal) :
-{
+        ksize_(ksize), anchor_(anchor), type_(srcType), borderMode_(borderMode), borderVal_(borderVal)
-    return Ptr<FilterEngine_GPU>(new Filter2DEngine_GPU(filter2D, srcType, dstType));
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// SeparableFilter
-namespace
-{
-    struct SeparableFilterEngine_GPU : public FilterEngine_GPU
    {
-        SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, const Ptr<BaseColumnFilter_GPU>& columnFilter_,
+        static const nppFilterBox_t funcs[] = {0, nppiFilterBox_8u_C1R, 0, 0, nppiFilterBox_8u_C4R};
-                                  int srcType_, int bufType_, int dstType_) :
-            rowFilter(rowFilter_), columnFilter(columnFilter_),
-            srcType(srcType_), bufType(bufType_), dstType(dstType_)
-        {
-            ksize = Size(rowFilter->ksize, columnFilter->ksize);
-            anchor = Point(rowFilter->anchor, columnFilter->anchor);
-            pbuf = &buf;
+        CV_Assert( srcType == CV_8UC1 || srcType == CV_8UC4 );
-        }
+        CV_Assert( dstType == srcType );
-        SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, const Ptr<BaseColumnFilter_GPU>& columnFilter_,
-                                  int srcType_, int bufType_, int dstType_,
-                                  GpuMat& buf_) :
-            rowFilter(rowFilter_), columnFilter(columnFilter_),
-            srcType(srcType_), bufType(bufType_), dstType(dstType_)
-        {
-            ksize = Size(rowFilter->ksize, columnFilter->ksize);
-            anchor = Point(rowFilter->anchor, columnFilter->anchor);
-            pbuf = &buf_;
+        normalizeAnchor(anchor_, ksize);
-        }
-        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
+        func_ = funcs[CV_MAT_CN(srcType)];
-        {
+    }
-            CV_Assert(src.type() == srcType);
-            Size src_size = src.size();
-            dst.create(src_size, dstType);
+    void NPPBoxFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == type_ );
-            if (roi.size() != src_size)
+        gpu::copyMakeBorder(src, srcBorder_, ksize_.height, ksize_.height, ksize_.width, ksize_.width, borderMode_, borderVal_, _stream);
-            {
-                dst.setTo(Scalar::all(0), stream);
-            }
-            ensureSizeIsEnough(src_size, bufType, *pbuf);
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
-            normalizeROI(roi, ksize, anchor, src_size);
+        GpuMat srcRoi = srcBorder_(Rect(ksize_.width, ksize_.height, src.cols, src.rows));
-            GpuMat srcROI = src(roi);
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
-            GpuMat dstROI = dst(roi);
+        NppStreamHandler h(stream);
-            GpuMat bufROI = (*pbuf)(roi);
-            (*rowFilter)(srcROI, bufROI, stream);
+        NppiSize oSizeROI;
-            (*columnFilter)(bufROI, dstROI, stream);
+        oSizeROI.width = src.cols;
-        }
+        oSizeROI.height = src.rows;
-        Ptr<BaseRowFilter_GPU> rowFilter;
+        NppiSize oMaskSize;
-        Ptr<BaseColumnFilter_GPU> columnFilter;
+        oMaskSize.height = ksize_.height;
+        oMaskSize.width = ksize_.width;
-        int srcType, bufType, dstType;
+        NppiPoint oAnchor;
+        oAnchor.x = anchor_.x;
+        oAnchor.y = anchor_.y;
-        Size ksize;
+        nppSafeCall( func_(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step),
-        Point anchor;
+                           dst.ptr<Npp8u>(), static_cast<int>(dst.step),
+                           oSizeROI, oMaskSize, oAnchor) );
-        GpuMat buf;
+        if (stream == 0)
-        GpuMat* pbuf;
+            cudaSafeCall( cudaDeviceSynchronize() );
-    };
+    }
 }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
+Ptr<Filter> cv::gpu::createBoxFilter(int srcType, int dstType, Size ksize, Point anchor, int borderMode, Scalar borderVal)
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType)
 {
-    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType));
+    if (dstType < 0)
-}
+        dstType = srcType;
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
+    dstType = CV_MAKE_TYPE(CV_MAT_DEPTH(dstType), CV_MAT_CN(srcType));
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf)
-{
+    return new NPPBoxFilter(srcType, dstType, ksize, anchor, borderMode, borderVal);
-    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType, buf));
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
-// 1D Sum Filter
+// Linear Filter
+namespace cv { namespace gpu { namespace cudev
+{
+    template <typename T, typename D>
+    void filter2D(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel,
+                  int kWidth, int kHeight, int anchorX, int anchorY,
+                  int borderMode, const float* borderValue, cudaStream_t stream);
+}}}
 namespace
 {
-    struct NppRowSumFilter : public BaseRowFilter_GPU
+    class LinearFilter : public Filter
    {
-        NppRowSumFilter(int ksize_, int anchor_) : BaseRowFilter_GPU(ksize_, anchor_) {}
+    public:
+        LinearFilter(int srcType, int dstType, InputArray kernel, Point anchor, int borderMode, Scalar borderVal);
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            cudaStream_t stream = StreamAccessor::getStream(s);
+    private:
+        typedef void (*filter2D_t)(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel,
+                                   int kWidth, int kHeight, int anchorX, int anchorY,
+                                   int borderMode, const float* borderValue, cudaStream_t stream);
-            NppStreamHandler h(stream);
+        GpuMat kernel_;
+        Point anchor_;
+        int type_;
+        filter2D_t func_;
+        int borderMode_;
+        Scalar_<float> borderVal_;
+    };
-            nppSafeCall( nppiSumWindowRow_8u32f_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+    LinearFilter::LinearFilter(int srcType, int dstType, InputArray _kernel, Point anchor, int borderMode, Scalar borderVal) :
-                dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, ksize, anchor) );
+        anchor_(anchor), type_(srcType), borderMode_(borderMode), borderVal_(borderVal)
+    {
+        const int sdepth = CV_MAT_DEPTH(srcType);
+        const int scn = CV_MAT_CN(srcType);
-            if (stream == 0)
+        Mat kernel = _kernel.getMat();
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-}
-Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor)
+        CV_Assert( sdepth == CV_8U || sdepth == CV_16U || sdepth == CV_32F );
-{
+        CV_Assert( scn == 1 || scn == 4 );
-    CV_Assert(srcType == CV_8UC1 && sumType == CV_32FC1);
+        CV_Assert( dstType == srcType );
+        CV_Assert( kernel.channels() == 1 );
+        CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP );
-    normalizeAnchor(anchor, ksize);
+        Mat kernel32F;
+        kernel.convertTo(kernel32F, CV_32F);
-    return Ptr<BaseRowFilter_GPU>(new NppRowSumFilter(ksize, anchor));
+        kernel_ = gpu::createContinuous(kernel.size(), CV_32FC1);
-}
+        kernel_.upload(kernel32F);
-namespace
+        normalizeAnchor(anchor_, kernel.size());
-{
-    struct NppColumnSumFilter : public BaseColumnFilter_GPU
-    {
-        NppColumnSumFilter(int ksize_, int anchor_) : BaseColumnFilter_GPU(ksize_, anchor_) {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
+        switch (srcType)
        {
-            NppiSize sz;
+        case CV_8UC1:
-            sz.width = src.cols;
+            func_ = cudev::filter2D<uchar, uchar>;
-            sz.height = src.rows;
+            break;
+        case CV_8UC4:
+            func_ = cudev::filter2D<uchar4, uchar4>;
+            break;
+        case CV_16UC1:
+            func_ = cudev::filter2D<ushort, ushort>;
+            break;
+        case CV_16UC4:
+            func_ = cudev::filter2D<ushort4, ushort4>;
+            break;
+        case CV_32FC1:
+            func_ = cudev::filter2D<float, float>;
+            break;
+        case CV_32FC4:
+            func_ = cudev::filter2D<float4, float4>;
+            break;
+        }
+    }
-            cudaStream_t stream = StreamAccessor::getStream(s);
+    void LinearFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == type_ );
-            NppStreamHandler h(stream);
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
-            nppSafeCall( nppiSumWindowColumn_8u32f_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+        Point ofs;
-                dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, ksize, anchor) );
+        Size wholeSize;
+        src.locateROI(wholeSize, ofs);
-            if (stream == 0)
+        GpuMat srcWhole(wholeSize, src.type(), src.datastart);
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
+        func_(srcWhole, ofs.x, ofs.y, dst, kernel_.ptr<float>(),
-    };
+              kernel_.cols, kernel_.rows, anchor_.x, anchor_.y,
+              borderMode_, borderVal_.val, StreamAccessor::getStream(_stream));
+    }
 }
-Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor)
+Ptr<Filter> cv::gpu::createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor, int borderMode, Scalar borderVal)
 {
-    CV_Assert(sumType == CV_8UC1 && dstType == CV_32FC1);
+    if (dstType < 0)
+        dstType = srcType;
-    normalizeAnchor(anchor, ksize);
+    dstType = CV_MAKE_TYPE(CV_MAT_DEPTH(dstType), CV_MAT_CN(srcType));
-    return Ptr<BaseColumnFilter_GPU>(new NppColumnSumFilter(ksize, anchor));
+    return new LinearFilter(srcType, dstType, kernel, anchor, borderMode, borderVal);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
-// Box Filter
+// Laplacian Filter
-namespace
+Ptr<Filter> cv::gpu::createLaplacianFilter(int srcType, int dstType, int ksize, double scale, int borderMode, Scalar borderVal)
 {
-    typedef NppStatus (*nppFilterBox_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
+    CV_Assert( ksize == 1 || ksize == 3 );
-        NppiSize oMaskSize, NppiPoint oAnchor);
-    struct NPPBoxFilter : public BaseFilter_GPU
+    static const float K[2][9] =
    {
-        NPPBoxFilter(const Size& ksize_, const Point& anchor_, nppFilterBox_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {}
+        {0.0f, 1.0f, 0.0f, 1.0f, -4.0f, 1.0f, 0.0f, 1.0f, 0.0f},
+        {2.0f, 0.0f, 2.0f, 0.0f, -8.0f, 0.0f, 2.0f, 0.0f, 2.0f}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            NppiSize oKernelSize;
-            oKernelSize.height = ksize.height;
-            oKernelSize.width = ksize.width;
-            NppiPoint oAnchor;
-            oAnchor.x = anchor.x;
-            oAnchor.y = anchor.y;
-            cudaStream_t stream = StreamAccessor::getStream(s);
-            NppStreamHandler h(stream);
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step),
-                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, oKernelSize, oAnchor) );
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-        nppFilterBox_t func;
    };
-}
-Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor)
-{
-    static const nppFilterBox_t nppFilterBox_callers[] = {0, nppiFilterBox_8u_C1R, 0, 0, nppiFilterBox_8u_C4R};
-    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);
-    normalizeAnchor(anchor, ksize);
+    Mat kernel(3, 3, CV_32FC1, (void*)K[ksize == 3]);
+    if (scale != 1)
+        kernel *= scale;
-    return Ptr<BaseFilter_GPU>(new NPPBoxFilter(ksize, anchor, nppFilterBox_callers[CV_MAT_CN(srcType)]));
+    return gpu::createLinearFilter(srcType, dstType, kernel, Point(-1,-1), borderMode, borderVal);
 }
-Ptr<FilterEngine_GPU> cv::gpu::createBoxFilter_GPU(int srcType, int dstType, const Size& ksize, const Point& anchor)
+////////////////////////////////////////////////////////////////////////////////////////////////////
-{
+// Separable Linear Filter
-    Ptr<BaseFilter_GPU> boxFilter = getBoxFilter_GPU(srcType, dstType, ksize, anchor);
-    return createFilter2D_GPU(boxFilter, srcType, dstType);
-}
-void cv::gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor, Stream& stream)
+namespace filter
 {
-    int sdepth = src.depth(), cn = src.channels();
+    template <typename T, typename D>
-    if( ddepth < 0 )
+    void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-        ddepth = sdepth;
-    dst.create(src.size(), CV_MAKETYPE(ddepth, cn));
-    Ptr<FilterEngine_GPU> f = createBoxFilter_GPU(src.type(), dst.type(), ksize, anchor);
+    template <typename T, typename D>
-    f->apply(src, dst, Rect(0,0,-1,-1), stream);
+    void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Morphology Filter
 namespace
 {
-    typedef NppStatus (*nppMorfFilter_t)(const Npp8u*, Npp32s, Npp8u*, Npp32s, NppiSize, const Npp8u*, NppiSize, NppiPoint);
+    class SeparableLinearFilter : public Filter
-    struct NPPMorphFilter : public BaseFilter_GPU
    {
-        NPPMorphFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, nppMorfFilter_t func_) :
+    public:
-            BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {}
+        SeparableLinearFilter(int srcType, int dstType,
+                              InputArray rowKernel, InputArray columnKernel,
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
+                              Point anchor, int rowBorderMode, int columnBorderMode);
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            NppiSize oKernelSize;
-            oKernelSize.height = ksize.height;
-            oKernelSize.width = ksize.width;
-            NppiPoint oAnchor;
-            oAnchor.x = anchor.x;
-            oAnchor.y = anchor.y;
-            cudaStream_t stream = StreamAccessor::getStream(s);
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
-            NppStreamHandler h(stream);
+    private:
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step),
+        int srcType_, bufType_, dstType_;
-                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, kernel.ptr<Npp8u>(), oKernelSize, oAnchor) );
+        GpuMat rowKernel_, columnKernel_;
+        func_t rowFilter_, columnFilter_;
+        Point anchor_;
+        int rowBorderMode_, columnBorderMode_;
-            if (stream == 0)
+        GpuMat buf_;
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-        GpuMat kernel;
-        nppMorfFilter_t func;
-    };
-}
-Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize, Point anchor)
-{
-    static const nppMorfFilter_t nppMorfFilter_callers[2][5] =
-    {
-        {0, nppiErode_8u_C1R, 0, 0, nppiErode_8u_C4R },
-        {0, nppiDilate_8u_C1R, 0, 0, nppiDilate_8u_C4R }
    };
-    CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE);
+    SeparableLinearFilter::SeparableLinearFilter(int srcType, int dstType,
-    CV_Assert(type == CV_8UC1 || type == CV_8UC4);
+                                                 InputArray _rowKernel, InputArray _columnKernel,
+                                                 Point anchor, int rowBorderMode, int columnBorderMode) :
-    GpuMat gpu_krnl;
+        srcType_(srcType), dstType_(dstType), anchor_(anchor), rowBorderMode_(rowBorderMode), columnBorderMode_(columnBorderMode)
-    normalizeKernel(kernel, gpu_krnl);
-    normalizeAnchor(anchor, ksize);
-    return Ptr<BaseFilter_GPU>(new NPPMorphFilter(ksize, anchor, gpu_krnl, nppMorfFilter_callers[op][CV_MAT_CN(type)]));
-}
-namespace
-{
-    struct MorphologyFilterEngine_GPU : public FilterEngine_GPU
    {
-        MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type_, int iters_) :
+        static const func_t rowFilterFuncs[7][4] =
-            filter2D(filter2D_), type(type_), iters(iters_)
        {
-            pbuf = &buf;
+            {filter::linearRow<uchar, float>, 0, filter::linearRow<uchar3, float3>, filter::linearRow<uchar4, float4>},
-        }
+            {0, 0, 0, 0},
+            {filter::linearRow<ushort, float>, 0, filter::linearRow<ushort3, float3>, filter::linearRow<ushort4, float4>},
-        MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type_, int iters_, GpuMat& buf_) :
+            {filter::linearRow<short, float>, 0, filter::linearRow<short3, float3>, filter::linearRow<short4, float4>},
-            filter2D(filter2D_), type(type_), iters(iters_)
+            {filter::linearRow<int, float>, 0, filter::linearRow<int3, float3>, filter::linearRow<int4, float4>},
+            {filter::linearRow<float, float>, 0, filter::linearRow<float3, float3>, filter::linearRow<float4, float4>},
+            {0, 0, 0, 0}
+        };
+        static const func_t columnFilterFuncs[7][4] =
        {
-            pbuf = &buf_;
+            {filter::linearColumn<float, uchar>, 0, filter::linearColumn<float3, uchar3>, filter::linearColumn<float4, uchar4>},
-        }
+            {0, 0, 0, 0},
+            {filter::linearColumn<float, ushort>, 0, filter::linearColumn<float3, ushort3>, filter::linearColumn<float4, ushort4>},
+            {filter::linearColumn<float, short>, 0, filter::linearColumn<float3, short3>, filter::linearColumn<float4, short4>},
+            {filter::linearColumn<float, int>, 0, filter::linearColumn<float3, int3>, filter::linearColumn<float4, int4>},
+            {filter::linearColumn<float, float>, 0, filter::linearColumn<float3, float3>, filter::linearColumn<float4, float4>},
+            {0, 0, 0, 0}
+        };
-        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
+        const int sdepth = CV_MAT_DEPTH(srcType);
-        {
+        const int cn = CV_MAT_CN(srcType);
-            CV_Assert(src.type() == type);
+        const int ddepth = CV_MAT_DEPTH(dstType);
-            Size src_size = src.size();
+        Mat rowKernel = _rowKernel.getMat();
+        Mat columnKernel = _columnKernel.getMat();
-            dst.create(src_size, type);
+        CV_Assert( sdepth <= CV_64F && cn <= 4 );
+        CV_Assert( rowKernel.channels() == 1 );
+        CV_Assert( columnKernel.channels() == 1 );
+        CV_Assert( rowBorderMode == BORDER_REFLECT101 || rowBorderMode == BORDER_REPLICATE || rowBorderMode == BORDER_CONSTANT || rowBorderMode == BORDER_REFLECT || rowBorderMode == BORDER_WRAP );
+        CV_Assert( columnBorderMode == BORDER_REFLECT101 || columnBorderMode == BORDER_REPLICATE || columnBorderMode == BORDER_CONSTANT || columnBorderMode == BORDER_REFLECT || columnBorderMode == BORDER_WRAP );
-            if (roi.size() != src_size)
+        Mat kernel32F;
-            {
-                dst.setTo(Scalar::all(0), stream);
-            }
-            normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
+        rowKernel.convertTo(kernel32F, CV_32F);
+        rowKernel_.upload(kernel32F.reshape(1, 1));
-            if (iters > 1)
+        columnKernel.convertTo(kernel32F, CV_32F);
-                pbuf->create(src_size, type);
+        columnKernel_.upload(kernel32F.reshape(1, 1));
-            GpuMat srcROI = src(roi);
+        CV_Assert( rowKernel_.cols > 0 && rowKernel_.cols <= 32 );
-            GpuMat dstROI = dst(roi);
+        CV_Assert( columnKernel_.cols > 0 && columnKernel_.cols <= 32 );
-            (*filter2D)(srcROI, dstROI, stream);
+        normalizeAnchor(anchor_.x, rowKernel_.cols);
+        normalizeAnchor(anchor_.y, columnKernel_.cols);
-            for(int i = 1; i < iters; ++i)
+        bufType_ = CV_MAKE_TYPE(CV_32F, cn);
-            {
-                dst.swap((*pbuf));
-                dstROI = dst(roi);
+        rowFilter_ = rowFilterFuncs[sdepth][cn - 1];
-                GpuMat bufROI = (*pbuf)(roi);
+        CV_Assert( rowFilter_ != 0 );
-                (*filter2D)(bufROI, dstROI, stream);
+        columnFilter_ = columnFilterFuncs[ddepth][cn - 1];
-            }
+        CV_Assert( columnFilter_ != 0 );
-        }
+    }
-        Ptr<BaseFilter_GPU> filter2D;
+    void SeparableLinearFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == srcType_ );
-        int type;
+        _dst.create(src.size(), dstType_);
-        int iters;
+        GpuMat dst = _dst.getGpuMat();
-        GpuMat buf;
+        ensureSizeIsEnough(src.size(), bufType_, buf_);
-        GpuMat* pbuf;
-    };
-}
-Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Point& anchor, int iterations)
+        DeviceInfo devInfo;
-{
+        const int cc = devInfo.major() * 10 + devInfo.minor();
-    CV_Assert(iterations > 0);
-    Size ksize = kernel.size();
-    Ptr<BaseFilter_GPU> filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor);
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
-    return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, type, iterations));
+        rowFilter_(src, buf_, rowKernel_.ptr<float>(), rowKernel_.cols, anchor_.x, rowBorderMode_, cc, stream);
+        columnFilter_(buf_, dst, columnKernel_.ptr<float>(), columnKernel_.cols, anchor_.y, columnBorderMode_, cc, stream);
+    }
 }
-Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf, const Point& anchor, int iterations)
+Ptr<Filter> cv::gpu::createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel, Point anchor, int rowBorderMode, int columnBorderMode)
 {
-    CV_Assert(iterations > 0);
+    if (dstType < 0)
+        dstType = srcType;
-    Size ksize = kernel.size();
+    dstType = CV_MAKE_TYPE(CV_MAT_DEPTH(dstType), CV_MAT_CN(srcType));
-    Ptr<BaseFilter_GPU> filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor);
+    if (columnBorderMode < 0)
+        columnBorderMode = rowBorderMode;
-    return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, type, iterations, buf));
+    return new SeparableLinearFilter(srcType, dstType, rowKernel, columnKernel, anchor, rowBorderMode, columnBorderMode);
 }
-namespace
+////////////////////////////////////////////////////////////////////////////////////////////////////
-{
+// Deriv Filter
-    void morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream = Stream::Null())
-    {
-        Mat kernel;
-        Size ksize = _kernel.data ? _kernel.size() : Size(3, 3);
-        normalizeAnchor(anchor, ksize);
-        if (iterations == 0 || _kernel.rows * _kernel.cols == 1)
-        {
-            src.copyTo(dst, stream);
-            return;
-        }
-        dst.create(src.size(), src.type());
-        if (!_kernel.data)
-        {
-            kernel = getStructuringElement(MORPH_RECT, Size(1 + iterations * 2, 1 + iterations * 2));
-            anchor = Point(iterations, iterations);
-            iterations = 1;
-        }
-        else if (iterations > 1 && countNonZero(_kernel) == _kernel.rows * _kernel.cols)
-        {
-            anchor = Point(anchor.x * iterations, anchor.y * iterations);
-            kernel = getStructuringElement(MORPH_RECT,
-                                           Size(ksize.width + (iterations - 1) * (ksize.width - 1),
-                                                ksize.height + (iterations - 1) * (ksize.height - 1)),
-                                           anchor);
-            iterations = 1;
-        }
-        else
-            kernel = _kernel;
-        Ptr<FilterEngine_GPU> f = createMorphologyFilter_GPU(op, src.type(), kernel, buf, anchor, iterations);
-        f->apply(src, dst, Rect(0,0,-1,-1), stream);
+Ptr<Filter> cv::gpu::createDerivFilter(int srcType, int dstType, int dx, int dy, int ksize, bool normalize, double scale, int rowBorderMode, int columnBorderMode)
-    }
+{
+    Mat kx, ky;
+    getDerivKernels(kx, ky, dx, dy, ksize, normalize, CV_32F);
-    void morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, Point anchor, int iterations)
+    if (scale != 1)
    {
-        GpuMat buf;
+        // usually the smoothing part is the slowest to compute,
-        morphOp(op, src, dst, _kernel, buf, anchor, iterations);
+        // so try to scale it instead of the faster differenciating part
+        if (dx == 0)
+            kx *= scale;
+        else
+            ky *= scale;
    }
-}
-void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations)
+    return gpu::createSeparableLinearFilter(srcType, dstType, kx, ky, Point(-1, -1), rowBorderMode, columnBorderMode);
-{
-    morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations);
-}
-void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream)
-{
-    morphOp(MORPH_ERODE, src, dst, kernel, buf, anchor, iterations, stream);
 }
-void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations)
+Ptr<Filter> cv::gpu::createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize, double scale, int rowBorderMode, int columnBorderMode)
 {
-    morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations);
+    return gpu::createDerivFilter(srcType, dstType, dx, dy, ksize, false, scale, rowBorderMode, columnBorderMode);
 }
-void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream)
+Ptr<Filter> cv::gpu::createScharrFilter(int srcType, int dstType, int dx, int dy, double scale, int rowBorderMode, int columnBorderMode)
 {
-    morphOp(MORPH_DILATE, src, dst, kernel, buf, anchor, iterations, stream);
+    return gpu::createDerivFilter(srcType, dstType, dx, dy, -1, false, scale, rowBorderMode, columnBorderMode);
 }
-void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor, int iterations)
+////////////////////////////////////////////////////////////////////////////////////////////////////
-{
+// Gaussian Filter
-    GpuMat buf1;
-    GpuMat buf2;
-    morphologyEx(src, dst, op, kernel, buf1, buf2, anchor, iterations);
-}
-void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, Point anchor, int iterations, Stream& stream)
+Ptr<Filter> cv::gpu::createGaussianFilter(int srcType, int dstType, Size ksize, double sigma1, double sigma2, int rowBorderMode, int columnBorderMode)
 {
-    switch( op )
+    const int depth = CV_MAT_DEPTH(srcType);
-    {
-    case MORPH_ERODE:
-        erode(src, dst, kernel, buf1, anchor, iterations, stream);
-        break;
-    case MORPH_DILATE:
-        dilate(src, dst, kernel, buf1, anchor, iterations, stream);
-        break;
-    case MORPH_OPEN:
+    if (sigma2 <= 0)
-        erode(src, buf2, kernel, buf1, anchor, iterations, stream);
+        sigma2 = sigma1;
-        dilate(buf2, dst, kernel, buf1, anchor, iterations, stream);
-        break;
-    case MORPH_CLOSE:
+    // automatic detection of kernel size from sigma
-        dilate(src, buf2, kernel, buf1, anchor, iterations, stream);
+    if (ksize.width <= 0 && sigma1 > 0)
-        erode(buf2, dst, kernel, buf1, anchor, iterations, stream);
+        ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
-        break;
+    if (ksize.height <= 0 && sigma2 > 0)
+        ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
-#ifdef HAVE_OPENCV_GPUARITHM
+    CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1 );
-    case MORPH_GRADIENT:
-        erode(src, buf2, kernel, buf1, anchor, iterations, stream);
-        dilate(src, dst, kernel, buf1, anchor, iterations, stream);
-        gpu::subtract(dst, buf2, dst, GpuMat(), -1, stream);
-        break;
-    case MORPH_TOPHAT:
+    sigma1 = std::max(sigma1, 0.0);
-        erode(src, dst, kernel, buf1, anchor, iterations, stream);
+    sigma2 = std::max(sigma2, 0.0);
-        dilate(dst, buf2, kernel, buf1, anchor, iterations, stream);
-        gpu::subtract(src, buf2, dst, GpuMat(), -1, stream);
-        break;
-    case MORPH_BLACKHAT:
+    Mat kx = getGaussianKernel(ksize.width, sigma1, CV_32F);
-        dilate(src, dst, kernel, buf1, anchor, iterations, stream);
+    Mat ky;
-        erode(dst, buf2, kernel, buf1, anchor, iterations, stream);
+    if (ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON)
-        gpu::subtract(buf2, src, dst, GpuMat(), -1, stream);
+        ky = kx;
-        break;
+    else
-#endif
+        ky = getGaussianKernel(ksize.height, sigma2, CV_32F);
-    default:
+    return createSeparableLinearFilter(srcType, dstType, kx, ky, Point(-1,-1), rowBorderMode, columnBorderMode);
-        CV_Error(cv::Error::StsBadArg, "unknown morphological operation");
-    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
-// Linear Filter
+// Morphology Filter
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        template <typename T, typename D>
-        void filter2D_gpu(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
-                          int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
-                          int borderMode, const float* borderValue, cudaStream_t stream);
-    }
-}}}
 namespace
 {
-    typedef NppStatus (*nppFilter2D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
+    class MorphologyFilter : public Filter
-        const Npp32s * pKernel, NppiSize oKernelSize, NppiPoint oAnchor, Npp32s nDivisor);
-    struct NPPLinearFilter : public BaseFilter_GPU
    {
-        NPPLinearFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter2D_t func_) :
+    public:
-            BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
+        MorphologyFilter(int op, int srcType, InputArray kernel, Point anchor, int iterations);
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            NppiSize oKernelSize;
-            oKernelSize.height = ksize.height;
-            oKernelSize.width = ksize.width;
-            NppiPoint oAnchor;
-            oAnchor.x = anchor.x;
-            oAnchor.y = anchor.y;
-            cudaStream_t stream = StreamAccessor::getStream(s);
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
-            NppStreamHandler h(stream);
+    private:
+        typedef NppStatus (*nppMorfFilter_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, NppiSize oSizeROI,
+                                             const Npp8u* pMask, NppiSize oMaskSize, NppiPoint oAnchor);
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
+        int type_;
-                kernel.ptr<Npp32s>(), oKernelSize, oAnchor, nDivisor) );
+        GpuMat kernel_;
+        Point anchor_;
+        int iters_;
+        nppMorfFilter_t func_;
-            if (stream == 0)
+        GpuMat srcBorder_;
-                cudaSafeCall( cudaDeviceSynchronize() );
+        GpuMat buf_;
-        }
-        GpuMat kernel;
-        Npp32s nDivisor;
-        nppFilter2D_t func;
    };
-    typedef void (*gpuFilter2D_t)(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
+    MorphologyFilter::MorphologyFilter(int op, int srcType, InputArray _kernel, Point anchor, int iterations) :
-                                   int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
+        type_(srcType), anchor_(anchor), iters_(iterations)
-                                   int borderMode, const float* borderValue, cudaStream_t stream);
-    struct GpuFilter2D : public BaseFilter_GPU
    {
-        GpuFilter2D(Size ksize_, Point anchor_, gpuFilter2D_t func_, const GpuMat& kernel_, int brd_type_) :
+        static const nppMorfFilter_t funcs[2][5] =
-            BaseFilter_GPU(ksize_, anchor_), func(func_), kernel(kernel_), brd_type(brd_type_)
-        {
-        }
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
        {
-            using namespace cv::gpu::cudev::imgproc;
+            {0, nppiErode_8u_C1R, 0, 0, nppiErode_8u_C4R },
+            {0, nppiDilate_8u_C1R, 0, 0, nppiDilate_8u_C4R }
-            Point ofs;
+        };
-            Size wholeSize;
-            src.locateROI(wholeSize, ofs);
-            GpuMat srcWhole(wholeSize, src.type(), src.datastart);
-            static const Scalar_<float> zero = Scalar_<float>::all(0.0f);
-            func(srcWhole, ofs.x, ofs.y, dst, ksize.width, ksize.height, anchor.x, anchor.y, kernel.ptr<float>(), brd_type, zero.val, StreamAccessor::getStream(stream));
-        }
-        gpuFilter2D_t func;
+        CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE );
-        GpuMat kernel;
+        CV_Assert( srcType == CV_8UC1 || srcType == CV_8UC4 );
-        int brd_type;
-    };
-}
-Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor, int brd_type)
-{
-    using namespace cv::gpu::cudev::imgproc;
-    int sdepth = CV_MAT_DEPTH(srcType);
+        Mat kernel = _kernel.getMat();
-    int scn = CV_MAT_CN(srcType);
+        Size ksize = !kernel.empty() ? _kernel.size() : Size(3, 3);
-    CV_Assert(sdepth == CV_8U || sdepth == CV_16U || sdepth == CV_32F);
+        normalizeAnchor(anchor_, ksize);
-    CV_Assert(scn == 1 || scn == 4);
-    CV_Assert(dstType == srcType);
-    CV_Assert(brd_type == BORDER_REFLECT101 || brd_type == BORDER_REPLICATE || brd_type == BORDER_CONSTANT || brd_type == BORDER_REFLECT || brd_type == BORDER_WRAP);
-    Size ksize = kernel.size();
+        if (kernel.empty())
+        {
+            kernel = getStructuringElement(MORPH_RECT, Size(1 + iters_ * 2, 1 + iters_ * 2));
+            anchor_ = Point(iters_, iters_);
+            iters_ = 1;
+        }
+        else if (iters_ > 1 && countNonZero(kernel) == (int) kernel.total())
+        {
+            anchor_ = Point(anchor_.x * iters_, anchor_.y * iters_);
+            kernel = getStructuringElement(MORPH_RECT,
+                                           Size(ksize.width + (iters_ - 1) * (ksize.width - 1),
+                                                ksize.height + (iters_ - 1) * (ksize.height - 1)),
+                                           anchor_);
+            iters_ = 1;
+        }
-#if 0
+        CV_Assert( kernel.channels() == 1 );
-    if ((srcType == CV_8UC1 || srcType == CV_8UC4) && brd_type == BORDER_CONSTANT)
-    {
-        static const nppFilter2D_t cppFilter2D_callers[] = {0, nppiFilter_8u_C1R, 0, 0, nppiFilter_8u_C4R};
-        GpuMat gpu_krnl;
+        Mat kernel8U;
-        int nDivisor;
+        kernel.convertTo(kernel8U, CV_8U);
-        normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, true);
-        normalizeAnchor(anchor, ksize);
+        kernel_ = gpu::createContinuous(kernel.size(), CV_8UC1);
+        kernel_.upload(kernel8U);
-        return Ptr<BaseFilter_GPU>(new NPPLinearFilter(ksize, anchor, gpu_krnl, nDivisor, cppFilter2D_callers[CV_MAT_CN(srcType)]));
+        func_ = funcs[op][CV_MAT_CN(srcType)];
    }
-#endif
-    CV_Assert(ksize.width * ksize.height <= 16 * 16);
-    GpuMat gpu_krnl;
-    normalizeKernel(kernel, gpu_krnl, CV_32F);
-    normalizeAnchor(anchor, ksize);
-    gpuFilter2D_t func = 0;
-    switch (srcType)
+    void MorphologyFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
    {
-    case CV_8UC1:
+        GpuMat src = _src.getGpuMat();
-        func = filter2D_gpu<uchar, uchar>;
+        CV_Assert( src.type() == type_ );
-        break;
-    case CV_8UC4:
-        func = filter2D_gpu<uchar4, uchar4>;
-        break;
-    case CV_16UC1:
-        func = filter2D_gpu<ushort, ushort>;
-        break;
-    case CV_16UC4:
-        func = filter2D_gpu<ushort4, ushort4>;
-        break;
-    case CV_32FC1:
-        func = filter2D_gpu<float, float>;
-        break;
-    case CV_32FC4:
-        func = filter2D_gpu<float4, float4>;
-        break;
-    }
-    return Ptr<BaseFilter_GPU>(new GpuFilter2D(ksize, anchor, func, gpu_krnl, brd_type));
+        Size ksize = kernel_.size();
-}
+        gpu::copyMakeBorder(src, srcBorder_, ksize.height, ksize.height, ksize.width, ksize.width, BORDER_DEFAULT, Scalar(), _stream);
-Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor, int borderType)
+        GpuMat srcRoi = srcBorder_(Rect(ksize.width, ksize.height, src.cols, src.rows));
-{
-    Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, anchor, borderType);
-    return createFilter2D_GPU(linearFilter, srcType, dstType);
+        GpuMat bufRoi;
-}
+        if (iters_ > 1)
+        {
+            ensureSizeIsEnough(srcBorder_.size(), type_, buf_);
+            buf_.setTo(Scalar::all(0), _stream);
+            bufRoi = buf_(Rect(ksize.width, ksize.height, src.cols, src.rows));
+        }
-void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor, int borderType, Stream& stream)
+        _dst.create(src.size(), src.type());
-{
+        GpuMat dst = _dst.getGpuMat();
-    if (ddepth < 0)
-        ddepth = src.depth();
-    int dst_type = CV_MAKE_TYPE(ddepth, src.channels());
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+        NppStreamHandler h(stream);
-    Ptr<FilterEngine_GPU> f = createLinearFilter_GPU(src.type(), dst_type, kernel, anchor, borderType);
+        NppiSize oSizeROI;
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
-    dst.create(src.size(), dst_type);
+        NppiSize oMaskSize;
+        oMaskSize.height = ksize.height;
+        oMaskSize.width = ksize.width;
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
+        NppiPoint oAnchor;
-}
+        oAnchor.x = anchor_.x;
+        oAnchor.y = anchor_.y;
-////////////////////////////////////////////////////////////////////////////////////////////////////
+        nppSafeCall( func_(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step),
-// Separable Linear Filter
+                           oSizeROI, kernel_.ptr<Npp8u>(), oMaskSize, oAnchor) );
-namespace filter
+        for(int i = 1; i < iters_; ++i)
-{
+        {
-    template <typename T, typename D>
+            dst.copyTo(bufRoi, _stream);
-    void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-    template <typename T, typename D>
+            nppSafeCall( func_(bufRoi.ptr<Npp8u>(), static_cast<int>(bufRoi.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step),
-    void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+                               oSizeROI, kernel_.ptr<Npp8u>(), oMaskSize, oAnchor) );
+        }
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
 }
 namespace
 {
-    typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI,
+    class MorphologyExFilter : public Filter
-        const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor);
-    typedef void (*gpuFilter1D_t)(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-    struct NppLinearRowFilter : public BaseRowFilter_GPU
    {
-        NppLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) :
+    public:
-            BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
+        MorphologyExFilter(int srcType, InputArray kernel, Point anchor, int iterations);
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
+    protected:
-        {
+        Ptr<gpu::Filter> erodeFilter_, dilateFilter_;
-            NppiSize sz;
+        GpuMat buf_;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            cudaStream_t stream = StreamAccessor::getStream(s);
-            NppStreamHandler h(stream);
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
-                kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-        GpuMat kernel;
-        Npp32s nDivisor;
-        nppFilter1D_t func;
    };
-    struct GpuLinearRowFilter : public BaseRowFilter_GPU
+    MorphologyExFilter::MorphologyExFilter(int srcType, InputArray kernel, Point anchor, int iterations)
    {
-        GpuLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, gpuFilter1D_t func_, int brd_type_) :
+        erodeFilter_ = gpu::createMorphologyFilter(MORPH_ERODE, srcType, kernel, anchor, iterations);
-            BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}
+        dilateFilter_ = gpu::createMorphologyFilter(MORPH_DILATE, srcType, kernel, anchor, iterations);
+    }
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            DeviceInfo devInfo;
-            int cc = devInfo.major() * 10 + devInfo.minor();
-            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));
-        }
-        GpuMat kernel;
+    // MORPH_OPEN
-        gpuFilter1D_t func;
-        int brd_type;
-    };
-}
-Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType)
+    class MorphologyOpenFilter : public MorphologyExFilter
-{
-    static const gpuFilter1D_t funcs[7][4] =
-    {
-        {filter::linearRow<uchar, float>, 0, filter::linearRow<uchar3, float3>, filter::linearRow<uchar4, float4>},
-        {0, 0, 0, 0},
-        {filter::linearRow<ushort, float>, 0, filter::linearRow<ushort3, float3>, filter::linearRow<ushort4, float4>},
-        {filter::linearRow<short, float>, 0, filter::linearRow<short3, float3>, filter::linearRow<short4, float4>},
-        {filter::linearRow<int, float>, 0, filter::linearRow<int3, float3>, filter::linearRow<int4, float4>},
-        {filter::linearRow<float, float>, 0, filter::linearRow<float3, float3>, filter::linearRow<float4, float4>},
-        {0, 0, 0, 0}
-    };
-    static const nppFilter1D_t npp_funcs[] =
    {
-        0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R
+    public:
+        MorphologyOpenFilter(int srcType, InputArray kernel, Point anchor, int iterations);
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
    };
-    if ((bufType == srcType) && (srcType == CV_8UC1 || srcType == CV_8UC4))
+    MorphologyOpenFilter::MorphologyOpenFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
+        MorphologyExFilter(srcType, kernel, anchor, iterations)
    {
-        CV_Assert( borderType == BORDER_CONSTANT );
-        GpuMat gpu_row_krnl;
-        int nDivisor;
-        normalizeKernel(rowKernel, gpu_row_krnl, CV_32S, &nDivisor, true);
-        const int ksize = gpu_row_krnl.cols;
-        normalizeAnchor(anchor, ksize);
-        return Ptr<BaseRowFilter_GPU>(new NppLinearRowFilter(ksize, anchor, gpu_row_krnl, nDivisor, npp_funcs[CV_MAT_CN(srcType)]));
    }
-    CV_Assert( borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );
+    void MorphologyOpenFilter::apply(InputArray src, OutputArray dst, Stream& stream)
+    {
+        erodeFilter_->apply(src, buf_, stream);
+        dilateFilter_->apply(buf_, dst, stream);
+    }
-    const int sdepth = CV_MAT_DEPTH(srcType);
+    // MORPH_CLOSE
-    const int cn = CV_MAT_CN(srcType);
-    CV_Assert( sdepth <= CV_64F && cn <= 4 );
-    CV_Assert( CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(bufType) == cn );
-    const gpuFilter1D_t func = funcs[sdepth][cn - 1];
+    class MorphologyCloseFilter : public MorphologyExFilter
-    CV_Assert( func != 0 );
+    {
+    public:
+        MorphologyCloseFilter(int srcType, InputArray kernel, Point anchor, int iterations);
-    GpuMat gpu_row_krnl;
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
-    normalizeKernel(rowKernel, gpu_row_krnl, CV_32F);
+    };
-    const int ksize = gpu_row_krnl.cols;
+    MorphologyCloseFilter::MorphologyCloseFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
-    CV_Assert( ksize > 0 && ksize <= 32 );
+        MorphologyExFilter(srcType, kernel, anchor, iterations)
+    {
+    }
-    normalizeAnchor(anchor, ksize);
+    void MorphologyCloseFilter::apply(InputArray src, OutputArray dst, Stream& stream)
+    {
+        dilateFilter_->apply(src, buf_, stream);
+        erodeFilter_->apply(buf_, dst, stream);
+    }
-    return Ptr<BaseRowFilter_GPU>(new GpuLinearRowFilter(ksize, anchor, gpu_row_krnl, func, borderType));
+    // MORPH_GRADIENT
-}
-namespace
+    class MorphologyGradientFilter : public MorphologyExFilter
-{
-    struct NppLinearColumnFilter : public BaseColumnFilter_GPU
    {
-        NppLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) :
+    public:
-            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
+        MorphologyGradientFilter(int srcType, InputArray kernel, Point anchor, int iterations);
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
-        {
+    };
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            cudaStream_t stream = StreamAccessor::getStream(s);
+    MorphologyGradientFilter::MorphologyGradientFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
+        MorphologyExFilter(srcType, kernel, anchor, iterations)
+    {
+    }
-            NppStreamHandler h(stream);
+    void MorphologyGradientFilter::apply(InputArray src, OutputArray dst, Stream& stream)
+    {
+        erodeFilter_->apply(src, buf_, stream);
+        dilateFilter_->apply(src, dst, stream);
+        gpu::subtract(dst, buf_, dst, noArray(), -1, stream);
+    }
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
+    // MORPH_TOPHAT
-                kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );
-            if (stream == 0)
+    class MorphologyTophatFilter : public MorphologyExFilter
-                cudaSafeCall( cudaDeviceSynchronize() );
+    {
-        }
+    public:
+        MorphologyTophatFilter(int srcType, InputArray kernel, Point anchor, int iterations);
-        GpuMat kernel;
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
-        Npp32s nDivisor;
-        nppFilter1D_t func;
    };
-    struct GpuLinearColumnFilter : public BaseColumnFilter_GPU
+    MorphologyTophatFilter::MorphologyTophatFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
+        MorphologyExFilter(srcType, kernel, anchor, iterations)
    {
-        GpuLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, gpuFilter1D_t func_, int brd_type_) :
+    }
-            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            DeviceInfo devInfo;
-            int cc = devInfo.major() * 10 + devInfo.minor();
-            if (ksize > 16 && cc < 20)
-                CV_Error(cv::Error::StsNotImplemented, "column linear filter doesn't implemented for kernel size > 16 for device with compute capabilities less than 2.0");
-            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));
+    void MorphologyTophatFilter::apply(InputArray src, OutputArray dst, Stream& stream)
-        }
+    {
+        erodeFilter_->apply(src, dst, stream);
+        dilateFilter_->apply(dst, buf_, stream);
+        gpu::subtract(src, buf_, dst, noArray(), -1, stream);
+    }
-        GpuMat kernel;
+    // MORPH_BLACKHAT
-        gpuFilter1D_t func;
-        int brd_type;
-    };
-}
-Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType)
+    class MorphologyBlackhatFilter : public MorphologyExFilter
-{
-    static const gpuFilter1D_t funcs[7][4] =
    {
-        {filter::linearColumn<float, uchar>, 0, filter::linearColumn<float3, uchar3>, filter::linearColumn<float4, uchar4>},
+    public:
-        {0, 0, 0, 0},
+        MorphologyBlackhatFilter(int srcType, InputArray kernel, Point anchor, int iterations);
-        {filter::linearColumn<float, ushort>, 0, filter::linearColumn<float3, ushort3>, filter::linearColumn<float4, ushort4>},
-        {filter::linearColumn<float, short>, 0, filter::linearColumn<float3, short3>, filter::linearColumn<float4, short4>},
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
-        {filter::linearColumn<float, int>, 0, filter::linearColumn<float3, int3>, filter::linearColumn<float4, int4>},
-        {filter::linearColumn<float, float>, 0, filter::linearColumn<float3, float3>, filter::linearColumn<float4, float4>},
-        {0, 0, 0, 0}
-    };
-    static const nppFilter1D_t npp_funcs[] =
-    {
-        0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R
    };
-    if ((bufType == dstType) && (bufType == CV_8UC1 || bufType == CV_8UC4))
+    MorphologyBlackhatFilter::MorphologyBlackhatFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
+        MorphologyExFilter(srcType, kernel, anchor, iterations)
    {
-        CV_Assert( borderType == BORDER_CONSTANT );
+    }
-        GpuMat gpu_col_krnl;
-        int nDivisor;
-        normalizeKernel(columnKernel, gpu_col_krnl, CV_32S, &nDivisor, true);
-        const int ksize = gpu_col_krnl.cols;
-        normalizeAnchor(anchor, ksize);
-        return Ptr<BaseColumnFilter_GPU>(new NppLinearColumnFilter(ksize, anchor, gpu_col_krnl, nDivisor, npp_funcs[CV_MAT_CN(bufType)]));
+    void MorphologyBlackhatFilter::apply(InputArray src, OutputArray dst, Stream& stream)
+    {
+        dilateFilter_->apply(src, dst, stream);
+        erodeFilter_->apply(dst, buf_, stream);
+        gpu::subtract(buf_, src, dst, noArray(), -1, stream);
    }
+}
-    CV_Assert( borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );
+Ptr<Filter> cv::gpu::createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor, int iterations)
+{
+    switch( op )
+    {
+    case MORPH_ERODE:
+    case MORPH_DILATE:
+        return new MorphologyFilter(op, srcType, kernel, anchor, iterations);
+        break;
-    const int ddepth = CV_MAT_DEPTH(dstType);
+    case MORPH_OPEN:
-    const int cn = CV_MAT_CN(dstType);
+        return new MorphologyOpenFilter(srcType, kernel, anchor, iterations);
-    CV_Assert( ddepth <= CV_64F && cn <= 4 );
+        break;
-    CV_Assert( CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(bufType) == cn );
-    gpuFilter1D_t func = funcs[ddepth][cn - 1];
+    case MORPH_CLOSE:
-    CV_Assert( func != 0 );
+        return new MorphologyCloseFilter(srcType, kernel, anchor, iterations);
+        break;
-    GpuMat gpu_col_krnl;
+    case MORPH_GRADIENT:
-    normalizeKernel(columnKernel, gpu_col_krnl, CV_32F);
+        return new MorphologyGradientFilter(srcType, kernel, anchor, iterations);
+        break;
-    const int ksize = gpu_col_krnl.cols;
+    case MORPH_TOPHAT:
-    CV_Assert(ksize > 0 && ksize <= 32);
+        return new MorphologyTophatFilter(srcType, kernel, anchor, iterations);
+        break;
-    normalizeAnchor(anchor, ksize);
+    case MORPH_BLACKHAT:
+        return new MorphologyBlackhatFilter(srcType, kernel, anchor, iterations);
+        break;
-    return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, gpu_col_krnl, func, borderType));
+    default:
+        CV_Error(Error::StsBadArg, "Unknown morphological operation");
+        return Ptr<Filter>();
+    }
 }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel,
+////////////////////////////////////////////////////////////////////////////////////////////////////
-    const Point& anchor, int rowBorderType, int columnBorderType)
+// Image Rank Filter
+namespace
 {
-    if (columnBorderType < 0)
+    enum
-        columnBorderType = rowBorderType;
+    {
+        RANK_MAX,
+        RANK_MIN
+    };
-    int cn = CV_MAT_CN(srcType);
+    class NPPRankFilter : public Filter
-    int bdepth = CV_32F;
+    {
-    int bufType = CV_MAKETYPE(bdepth, cn);
+    public:
+        NPPRankFilter(int op, int srcType, Size ksize, Point anchor, int borderMode, Scalar borderVal);
-    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, rowBorderType);
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
-    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, columnBorderType);
-    return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType);
+    private:
-}
+        typedef NppStatus (*nppFilterRank_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, NppiSize oSizeROI,
+                                             NppiSize oMaskSize, NppiPoint oAnchor);
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, GpuMat& buf,
+        int type_;
-    const Point& anchor, int rowBorderType, int columnBorderType)
+        Size ksize_;
-{
+        Point anchor_;
-    if (columnBorderType < 0)
+        int borderMode_;
-        columnBorderType = rowBorderType;
+        Scalar borderVal_;
+        nppFilterRank_t func_;
-    int cn = CV_MAT_CN(srcType);
+        GpuMat srcBorder_;
-    int bdepth = CV_32F;
+    };
-    int bufType = CV_MAKETYPE(bdepth, cn);
-    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, rowBorderType);
+    NPPRankFilter::NPPRankFilter(int op, int srcType, Size ksize, Point anchor, int borderMode, Scalar borderVal) :
-    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, columnBorderType);
+        type_(srcType), ksize_(ksize), anchor_(anchor), borderMode_(borderMode), borderVal_(borderVal)
+    {
+        static const nppFilterRank_t maxFuncs[] = {0, nppiFilterMax_8u_C1R, 0, 0, nppiFilterMax_8u_C4R};
+        static const nppFilterRank_t minFuncs[] = {0, nppiFilterMin_8u_C1R, 0, 0, nppiFilterMin_8u_C4R};
-    return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType, buf);
+        CV_Assert( srcType == CV_8UC1 || srcType == CV_8UC4 );
-}
-void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
+        normalizeAnchor(anchor_, ksize_);
-                          Point anchor, int rowBorderType, int columnBorderType)
-{
-    if( ddepth < 0 )
-        ddepth = src.depth();
-    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
+        if (op == RANK_MAX)
+            func_ = maxFuncs[CV_MAT_CN(srcType)];
+        else
+            func_ = minFuncs[CV_MAT_CN(srcType)];
+    }
-    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, rowBorderType, columnBorderType);
+    void NPPRankFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
+    {
-}
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == type_ );
-void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf,
+        gpu::copyMakeBorder(src, srcBorder_, ksize_.height, ksize_.height, ksize_.width, ksize_.width, borderMode_, borderVal_, _stream);
-                          Point anchor, int rowBorderType, int columnBorderType,
-                          Stream& stream)
-{
-    if( ddepth < 0 )
-        ddepth = src.depth();
-    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
-    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, buf, anchor, rowBorderType, columnBorderType);
+        GpuMat srcRoi = srcBorder_(Rect(ksize_.width, ksize_.height, src.cols, src.rows));
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
-// Deriv Filter
+        NppStreamHandler h(stream);
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int rowBorderType, int columnBorderType)
+        NppiSize oSizeROI;
-{
+        oSizeROI.width = src.cols;
-    Mat kx, ky;
+        oSizeROI.height = src.rows;
-    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
-    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
-}
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf, int rowBorderType, int columnBorderType)
+        NppiSize oMaskSize;
-{
+        oMaskSize.height = ksize_.height;
-    Mat kx, ky;
+        oMaskSize.width = ksize_.width;
-    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
-    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType);
-}
-void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType)
+        NppiPoint oAnchor;
-{
+        oAnchor.x = anchor_.x;
-    GpuMat buf;
+        oAnchor.y = anchor_.y;
-    Sobel(src, dst, ddepth, dx, dy, buf, ksize, scale, rowBorderType, columnBorderType);
-}
-void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize, double scale, int rowBorderType, int columnBorderType, Stream& stream)
+        nppSafeCall( func_(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step),
-{
+                           oSizeROI, oMaskSize, oAnchor) );
-    Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
-    if (scale != 1)
+        if (stream == 0)
-    {
+            cudaSafeCall( cudaDeviceSynchronize() );
-        // usually the smoothing part is the slowest to compute,
-        // so try to scale it instead of the faster differenciating part
-        if (dx == 0)
-            kx *= scale;
-        else
-            ky *= scale;
    }
-    sepFilter2D(src, dst, ddepth, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType, stream);
 }
-void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType)
+Ptr<Filter> cv::gpu::createBoxMaxFilter(int srcType, Size ksize, Point anchor, int borderMode, Scalar borderVal)
 {
-    GpuMat buf;
+    return new NPPRankFilter(RANK_MAX, srcType, ksize, anchor, borderMode, borderVal);
-    Scharr(src, dst, ddepth, dx, dy, buf, scale, rowBorderType, columnBorderType);
 }
-void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale, int rowBorderType, int columnBorderType, Stream& stream)
+Ptr<Filter> cv::gpu::createBoxMinFilter(int srcType, Size ksize, Point anchor, int borderMode, Scalar borderVal)
 {
-    Mat kx, ky;
+    return new NPPRankFilter(RANK_MIN, srcType, ksize, anchor, borderMode, borderVal);
-    getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F);
-    if( scale != 1 )
-    {
-        // usually the smoothing part is the slowest to compute,
-        // so try to scale it instead of the faster differenciating part
-        if( dx == 0 )
-            kx *= scale;
-        else
-            ky *= scale;
-    }
-    sepFilter2D(src, dst, ddepth, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType, stream);
-}
-void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale, int borderType, Stream& stream)
-{
-    CV_Assert(ksize == 1 || ksize == 3);
-    static const int K[2][9] =
-    {
-        {0, 1, 0, 1, -4, 1, 0, 1, 0},
-        {2, 0, 2, 0, -8, 0, 2, 0, 2}
-    };
-    Mat kernel(3, 3, CV_32S, (void*)K[ksize == 3]);
-    if (scale != 1)
-        kernel *= scale;
-    filter2D(src, dst, ddepth, kernel, Point(-1,-1), borderType, stream);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
-// Gaussian Filter
+// 1D Sum Filter
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
+namespace
 {
-    int depth = CV_MAT_DEPTH(type);
+    class NppRowSumFilter : public Filter
+    {
+    public:
+        NppRowSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal);
-    if (sigma2 <= 0)
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
-        sigma2 = sigma1;
-    // automatic detection of kernel size from sigma
+    private:
-    if (ksize.width <= 0 && sigma1 > 0)
+        int srcType_, dstType_;
-        ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
+        int ksize_;
-    if (ksize.height <= 0 && sigma2 > 0)
+        int anchor_;
-        ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
+        int borderMode_;
+        Scalar borderVal_;
-    CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1 );
+        GpuMat srcBorder_;
+    };
-    sigma1 = std::max(sigma1, 0.0);
+    NppRowSumFilter::NppRowSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal) :
-    sigma2 = std::max(sigma2, 0.0);
+        srcType_(srcType), dstType_(dstType), ksize_(ksize), anchor_(anchor), borderMode_(borderMode), borderVal_(borderVal)
+    {
+        CV_Assert( srcType_ == CV_8UC1 );
+        CV_Assert( dstType_ == CV_32FC1 );
-    Mat kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) );
+        normalizeAnchor(anchor_, ksize_);
-    Mat ky;
+    }
-    if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON )
-        ky = kx;
-    else
-        ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
-    return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
+    void NppRowSumFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
-}
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == srcType_ );
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
+        gpu::copyMakeBorder(src, srcBorder_, 0, 0, ksize_, ksize_, borderMode_, borderVal_, _stream);
-{
-    int depth = CV_MAT_DEPTH(type);
-    if (sigma2 <= 0)
+        _dst.create(src.size(), dstType_);
-        sigma2 = sigma1;
+        GpuMat dst = _dst.getGpuMat();
-    // automatic detection of kernel size from sigma
+        GpuMat srcRoi = srcBorder_(Rect(ksize_, 0, src.cols, src.rows));
-    if (ksize.width <= 0 && sigma1 > 0)
-        ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
-    if (ksize.height <= 0 && sigma2 > 0)
-        ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
-    CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1 );
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+        NppStreamHandler h(stream);
-    sigma1 = std::max(sigma1, 0.0);
+        NppiSize oSizeROI;
-    sigma2 = std::max(sigma2, 0.0);
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
-    Mat kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) );
+        nppSafeCall( nppiSumWindowRow_8u32f_C1R(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step),
-    Mat ky;
+                                                dst.ptr<Npp32f>(), static_cast<int>(dst.step),
-    if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON )
+                                                oSizeROI, ksize_, anchor_) );
-        ky = kx;
-    else
-        ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
-    return createSeparableLinearFilter_GPU(type, type, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType);
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
 }
-void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
+Ptr<Filter> cv::gpu::createRowSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal)
 {
-    if (ksize.width == 1 && ksize.height == 1)
+    return new NppRowSumFilter(srcType, dstType, ksize, anchor, borderMode, borderVal);
-    {
-        src.copyTo(dst);
-        return;
-    }
-    dst.create(src.size(), src.type());
-    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, rowBorderType, columnBorderType);
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
 }
-void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2, int rowBorderType, int columnBorderType, Stream& stream)
+namespace
 {
-    if (ksize.width == 1 && ksize.height == 1)
+    class NppColumnSumFilter : public Filter
    {
-        src.copyTo(dst);
+    public:
-        return;
+        NppColumnSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal);
-    }
-    dst.create(src.size(), src.type());
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
-    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, buf, sigma1, sigma2, rowBorderType, columnBorderType);
+    private:
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
+        int srcType_, dstType_;
-}
+        int ksize_;
+        int anchor_;
+        int borderMode_;
+        Scalar borderVal_;
-////////////////////////////////////////////////////////////////////////////////////////////////////
+        GpuMat srcBorder_;
-// Image Rank Filter
+    };
-namespace
-{
-    typedef NppStatus (*nppFilterRank_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
-        NppiSize oMaskSize, NppiPoint oAnchor);
-    struct NPPRankFilter : public BaseFilter_GPU
+    NppColumnSumFilter::NppColumnSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal) :
+        srcType_(srcType), dstType_(dstType), ksize_(ksize), anchor_(anchor), borderMode_(borderMode), borderVal_(borderVal)
    {
-        NPPRankFilter(const Size& ksize_, const Point& anchor_, nppFilterRank_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {}
+        CV_Assert( srcType_ == CV_8UC1 );
+        CV_Assert( dstType_ == CV_32FC1 );
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            NppiSize oKernelSize;
-            oKernelSize.height = ksize.height;
-            oKernelSize.width = ksize.width;
-            NppiPoint oAnchor;
-            oAnchor.x = anchor.x;
-            oAnchor.y = anchor.y;
-            cudaStream_t stream = StreamAccessor::getStream(s);
+        normalizeAnchor(anchor_, ksize_);
+    }
-            NppStreamHandler h(stream);
+    void NppColumnSumFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == srcType_ );
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, oKernelSize, oAnchor) );
+        gpu::copyMakeBorder(src, srcBorder_, ksize_, ksize_, 0, 0, borderMode_, borderVal_, _stream);
-            if (stream == 0)
+        _dst.create(src.size(), dstType_);
-                cudaSafeCall( cudaDeviceSynchronize() );
+        GpuMat dst = _dst.getGpuMat();
-        }
-        nppFilterRank_t func;
+        GpuMat srcRoi = srcBorder_(Rect(0, ksize_, src.cols, src.rows));
-    };
-}
-Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor)
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
-{
+        NppStreamHandler h(stream);
-    static const nppFilterRank_t nppFilterRank_callers[] = {0, nppiFilterMax_8u_C1R, 0, 0, nppiFilterMax_8u_C4R};
-    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);
+        NppiSize oSizeROI;
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
-    normalizeAnchor(anchor, ksize);
+        nppSafeCall( nppiSumWindowColumn_8u32f_C1R(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step),
+                                                   dst.ptr<Npp32f>(), static_cast<int>(dst.step),
+                                                   oSizeROI, ksize_, anchor_) );
-    return Ptr<BaseFilter_GPU>(new NPPRankFilter(ksize, anchor, nppFilterRank_callers[CV_MAT_CN(srcType)]));
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
 }
-Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor)
+Ptr<Filter> cv::gpu::createColumnSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal)
 {
-    static const nppFilterRank_t nppFilterRank_callers[] = {0, nppiFilterMin_8u_C1R, 0, 0, nppiFilterMin_8u_C4R};
+    return new NppColumnSumFilter(srcType, dstType, ksize, anchor, borderMode, borderVal);
-    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);
-    normalizeAnchor(anchor, ksize);
-    return Ptr<BaseFilter_GPU>(new NPPRankFilter(ksize, anchor, nppFilterRank_callers[CV_MAT_CN(srcType)]));
 }
 #endif
--- a/modules/gpufilters/src/precomp.hpp
+++ b/modules/gpufilters/src/precomp.hpp
@@ -46,14 +46,9 @@
 #include <limits>
 #include "opencv2/gpufilters.hpp"
+#include "opencv2/gpuarithm.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/core/private.gpu.hpp"
-#include "opencv2/opencv_modules.hpp"
-#ifdef HAVE_OPENCV_GPUARITHM
-#  include "opencv2/gpuarithm.hpp"
-#endif
 #endif /* __OPENCV_PRECOMP_H__ */
--- a/modules/gpufilters/test/test_filters.cpp
+++ b/modules/gpufilters/test/test_filters.cpp
@@ -70,13 +70,14 @@ namespace
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Blur
-PARAM_TEST_CASE(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, UseRoi)
+PARAM_TEST_CASE(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, BorderType, UseRoi)
 {
    cv::gpu::DeviceInfo devInfo;
    cv::Size size;
    int type;
    cv::Size ksize;
    cv::Point anchor;
+    int borderType;
    bool useRoi;
    virtual void SetUp()
@@ -86,7 +87,8 @@ PARAM_TEST_CASE(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, Use
        type = GET_PARAM(2);
        ksize = GET_PARAM(3);
        anchor = GET_PARAM(4);
-        useRoi = GET_PARAM(5);
+        borderType = GET_PARAM(5);
+        useRoi = GET_PARAM(6);
        cv::gpu::setDevice(devInfo.deviceID());
    }
@@ -96,13 +98,15 @@ GPU_TEST_P(Blur, Accuracy)
 {
    cv::Mat src = randomMat(size, type);
+    cv::Ptr<cv::gpu::Filter> blurFilter = cv::gpu::createBoxFilter(src.type(), -1, ksize, anchor, borderType);
    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::blur(loadMat(src, useRoi), dst, ksize, anchor);
+    blurFilter->apply(loadMat(src, useRoi), dst);
    cv::Mat dst_gold;
-    cv::blur(src, dst_gold, ksize, anchor);
+    cv::blur(src, dst_gold, ksize, anchor, borderType);
-    EXPECT_MAT_NEAR(getInnerROI(dst_gold, ksize), getInnerROI(dst, ksize), 1.0);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1.0);
 }
 INSTANTIATE_TEST_CASE_P(GPU_Filters, Blur, testing::Combine(
@@ -111,6 +115,173 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, Blur, testing::Combine(
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
    testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7))),
    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
+    WHOLE_SUBMAT));
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Filter2D
+PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, BorderType, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    cv::Size ksize;
+    cv::Point anchor;
+    int borderType;
+    bool useRoi;
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        ksize = GET_PARAM(3);
+        anchor = GET_PARAM(4);
+        borderType = GET_PARAM(5);
+        useRoi = GET_PARAM(6);
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+GPU_TEST_P(Filter2D, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Mat kernel = randomMat(cv::Size(ksize.width, ksize.height), CV_32FC1, 0.0, 1.0);
+    cv::Ptr<cv::gpu::Filter> filter2D = cv::gpu::createLinearFilter(src.type(), -1, kernel, anchor, borderType);
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    filter2D->apply(loadMat(src, useRoi), dst);
+    cv::Mat dst_gold;
+    cv::filter2D(src, dst_gold, -1, kernel, anchor, 0, borderType);
+    EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1.0);
+}
+INSTANTIATE_TEST_CASE_P(GPU_Filters, Filter2D, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
+    testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7)), KSize(cv::Size(11, 11)), KSize(cv::Size(13, 13)), KSize(cv::Size(15, 15))),
+    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
+    WHOLE_SUBMAT));
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Laplacian
+PARAM_TEST_CASE(Laplacian, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    cv::Size ksize;
+    bool useRoi;
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        ksize = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+GPU_TEST_P(Laplacian, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Ptr<cv::gpu::Filter> laplacian = cv::gpu::createLaplacianFilter(src.type(), -1, ksize.width);
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    laplacian->apply(loadMat(src, useRoi), dst);
+    cv::Mat dst_gold;
+    cv::Laplacian(src, dst_gold, -1, ksize.width);
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 1e-3);
+}
+INSTANTIATE_TEST_CASE_P(GPU_Filters, Laplacian, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
+    testing::Values(KSize(cv::Size(1, 1)), KSize(cv::Size(3, 3))),
+    WHOLE_SUBMAT));
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// SeparableLinearFilter
+PARAM_TEST_CASE(SeparableLinearFilter, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channels, KSize, Anchor, BorderType, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int depth;
+    int cn;
+    cv::Size ksize;
+    cv::Point anchor;
+    int borderType;
+    bool useRoi;
+    int type;
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        depth = GET_PARAM(2);
+        cn = GET_PARAM(3);
+        ksize = GET_PARAM(4);
+        anchor = GET_PARAM(5);
+        borderType = GET_PARAM(6);
+        useRoi = GET_PARAM(7);
+        cv::gpu::setDevice(devInfo.deviceID());
+        type = CV_MAKE_TYPE(depth, cn);
+    }
+};
+GPU_TEST_P(SeparableLinearFilter, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Mat rowKernel = randomMat(Size(ksize.width, 1), CV_32FC1, 0.0, 1.0);
+    cv::Mat columnKernel = randomMat(Size(ksize.height, 1), CV_32FC1, 0.0, 1.0);
+    cv::Ptr<cv::gpu::Filter> filter = cv::gpu::createSeparableLinearFilter(src.type(), -1, rowKernel, columnKernel, anchor, borderType);
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    filter->apply(loadMat(src, useRoi), dst);
+    cv::Mat dst_gold;
+    cv::sepFilter2D(src, dst_gold, -1, rowKernel, columnKernel, anchor, 0, borderType);
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 1.0 : 1e-2);
+}
+INSTANTIATE_TEST_CASE_P(GPU_Filters, SeparableLinearFilter, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_16S), MatDepth(CV_32F)),
+    IMAGE_CHANNELS,
+    testing::Values(KSize(cv::Size(3, 3)),
+                    KSize(cv::Size(7, 7)),
+                    KSize(cv::Size(13, 13)),
+                    KSize(cv::Size(15, 15)),
+                    KSize(cv::Size(17, 17)),
+                    KSize(cv::Size(23, 15)),
+                    KSize(cv::Size(31, 3))),
+    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101),
+                    BorderType(cv::BORDER_REPLICATE),
+                    BorderType(cv::BORDER_CONSTANT),
+                    BorderType(cv::BORDER_REFLECT)),
    WHOLE_SUBMAT));
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -155,13 +326,15 @@ GPU_TEST_P(Sobel, Accuracy)
    cv::Mat src = randomMat(size, type);
+    cv::Ptr<cv::gpu::Filter> sobel = cv::gpu::createSobelFilter(src.type(), -1, dx, dy, ksize.width, 1.0, borderType);
    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::Sobel(loadMat(src, useRoi), dst, -1, dx, dy, ksize.width, 1.0, borderType);
+    sobel->apply(loadMat(src, useRoi), dst);
    cv::Mat dst_gold;
    cv::Sobel(src, dst_gold, -1, dx, dy, ksize.width, 1.0, 0.0, borderType);
-    EXPECT_MAT_NEAR(getInnerROI(dst_gold, ksize), getInnerROI(dst, ksize), CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 0.1);
 }
 INSTANTIATE_TEST_CASE_P(GPU_Filters, Sobel, testing::Combine(
@@ -218,13 +391,15 @@ GPU_TEST_P(Scharr, Accuracy)
    cv::Mat src = randomMat(size, type);
+    cv::Ptr<cv::gpu::Filter> scharr = cv::gpu::createScharrFilter(src.type(), -1, dx, dy, 1.0, borderType);
    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::Scharr(loadMat(src, useRoi), dst, -1, dx, dy, 1.0, borderType);
+    scharr->apply(loadMat(src, useRoi), dst);
    cv::Mat dst_gold;
    cv::Scharr(src, dst_gold, -1, dx, dy, 1.0, 0.0, borderType);
-    EXPECT_MAT_NEAR(getInnerROI(dst_gold, cv::Size(3, 3)), getInnerROI(dst, cv::Size(3, 3)), CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 0.1);
 }
 INSTANTIATE_TEST_CASE_P(GPU_Filters, Scharr, testing::Combine(
@@ -277,28 +452,15 @@ GPU_TEST_P(GaussianBlur, Accuracy)
    double sigma1 = randomDouble(0.1, 1.0);
    double sigma2 = randomDouble(0.1, 1.0);
-    if (ksize.height > 16 && !supportFeature(devInfo, cv::gpu::FEATURE_SET_COMPUTE_20))
+    cv::Ptr<cv::gpu::Filter> gauss = cv::gpu::createGaussianFilter(src.type(), -1, ksize, sigma1, sigma2, borderType);
-    {
-        try
-        {
-            cv::gpu::GpuMat dst;
-            cv::gpu::GaussianBlur(loadMat(src), dst, ksize, sigma1, sigma2, borderType);
-        }
-        catch (const cv::Exception& e)
-        {
-            ASSERT_EQ(cv::Error::StsNotImplemented, e.code);
-        }
-    }
-    else
-    {
-        cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-        cv::gpu::GaussianBlur(loadMat(src, useRoi), dst, ksize, sigma1, sigma2, borderType);
-        cv::Mat dst_gold;
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-        cv::GaussianBlur(src, dst_gold, ksize, sigma1, sigma2, borderType);
+    gauss->apply(loadMat(src, useRoi), dst);
-        EXPECT_MAT_NEAR(dst_gold, dst, 4.0);
+    cv::Mat dst_gold;
-    }
+    cv::GaussianBlur(src, dst_gold, ksize, sigma1, sigma2, borderType);
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 4.0 : 1e-4);
 }
 INSTANTIATE_TEST_CASE_P(GPU_Filters, GaussianBlur, testing::Combine(
@@ -327,49 +489,6 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, GaussianBlur, testing::Combine(
                    BorderType(cv::BORDER_REFLECT)),
    WHOLE_SUBMAT));
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Laplacian
-PARAM_TEST_CASE(Laplacian, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    cv::Size ksize;
-    bool useRoi;
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        ksize = GET_PARAM(3);
-        useRoi = GET_PARAM(4);
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-GPU_TEST_P(Laplacian, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::Laplacian(loadMat(src, useRoi), dst, -1, ksize.width);
-    cv::Mat dst_gold;
-    cv::Laplacian(src, dst_gold, -1, ksize.width);
-    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 1e-3);
-}
-INSTANTIATE_TEST_CASE_P(GPU_Filters, Laplacian, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
-    testing::Values(KSize(cv::Size(1, 1)), KSize(cv::Size(3, 3))),
-    WHOLE_SUBMAT));
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Erode
@@ -400,8 +519,10 @@ GPU_TEST_P(Erode, Accuracy)
    cv::Mat src = randomMat(size, type);
    cv::Mat kernel = cv::Mat::ones(3, 3, CV_8U);
+    cv::Ptr<cv::gpu::Filter> erode = cv::gpu::createMorphologyFilter(cv::MORPH_ERODE, src.type(), kernel, anchor, iterations);
    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::erode(loadMat(src, useRoi), dst, kernel, anchor, iterations);
+    erode->apply(loadMat(src, useRoi), dst);
    cv::Mat dst_gold;
    cv::erode(src, dst_gold, kernel, anchor, iterations);
@@ -449,8 +570,10 @@ GPU_TEST_P(Dilate, Accuracy)
    cv::Mat src = randomMat(size, type);
    cv::Mat kernel = cv::Mat::ones(3, 3, CV_8U);
+    cv::Ptr<cv::gpu::Filter> dilate = cv::gpu::createMorphologyFilter(cv::MORPH_DILATE, src.type(), kernel, anchor, iterations);
    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::dilate(loadMat(src, useRoi), dst, kernel, anchor, iterations);
+    dilate->apply(loadMat(src, useRoi), dst);
    cv::Mat dst_gold;
    cv::dilate(src, dst_gold, kernel, anchor, iterations);
@@ -502,8 +625,10 @@ GPU_TEST_P(MorphEx, Accuracy)
    cv::Mat src = randomMat(size, type);
    cv::Mat kernel = cv::Mat::ones(3, 3, CV_8U);
+    cv::Ptr<cv::gpu::Filter> morph = cv::gpu::createMorphologyFilter(morphOp, src.type(), kernel, anchor, iterations);
    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::morphologyEx(loadMat(src, useRoi), dst, morphOp, kernel, anchor, iterations);
+    morph->apply(loadMat(src, useRoi), dst);
    cv::Mat dst_gold;
    cv::morphologyEx(src, dst_gold, morphOp, kernel, anchor, iterations);
@@ -522,56 +647,4 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, MorphEx, testing::Combine(
    testing::Values(Iterations(1), Iterations(2), Iterations(3)),
    WHOLE_SUBMAT));
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Filter2D
-PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, BorderType, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    cv::Size ksize;
-    cv::Point anchor;
-    int borderType;
-    bool useRoi;
-    cv::Mat img;
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        ksize = GET_PARAM(3);
-        anchor = GET_PARAM(4);
-        borderType = GET_PARAM(5);
-        useRoi = GET_PARAM(6);
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-GPU_TEST_P(Filter2D, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-    cv::Mat kernel = randomMat(cv::Size(ksize.width, ksize.height), CV_32FC1, 0.0, 1.0);
-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::filter2D(loadMat(src, useRoi), dst, -1, kernel, anchor, borderType);
-    cv::Mat dst_gold;
-    cv::filter2D(src, dst_gold, -1, kernel, anchor, 0, borderType);
-    EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1.0);
-}
-INSTANTIATE_TEST_CASE_P(GPU_Filters, Filter2D, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
-    testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7)), KSize(cv::Size(11, 11)), KSize(cv::Size(13, 13)), KSize(cv::Size(15, 15))),
-    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
-    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
-    WHOLE_SUBMAT));
 #endif // HAVE_CUDA
--- a/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
+++ b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
@@ -158,7 +158,7 @@ struct CV_EXPORTS CannyBuf
    GpuMat mag;
    GpuMat map;
    GpuMat st1, st2;
-    Ptr<FilterEngine_GPU> filterDX, filterDY;
+    Ptr<Filter> filterDX, filterDY;
 };
 CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);

--- a/modules/gpuimgproc/src/canny.cpp
+++ b/modules/gpuimgproc/src/canny.cpp
@@ -65,8 +65,8 @@ void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
        if (apperture_size != 3)
        {
-            filterDX = createDerivFilter_GPU(CV_8UC1, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE);
+            filterDX = createDerivFilter(CV_8UC1, CV_32S, 1, 0, apperture_size, false, 1, BORDER_REPLICATE);
-            filterDY = createDerivFilter_GPU(CV_8UC1, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
+            filterDY = createDerivFilter(CV_8UC1, CV_32S, 0, 1, apperture_size, false, 1, BORDER_REPLICATE);
        }
    }
@@ -150,8 +150,8 @@ void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_th
    }
    else
    {
-        buf.filterDX->apply(src, buf.dx, Rect(0, 0, src.cols, src.rows));
+        buf.filterDX->apply(src, buf.dx);
-        buf.filterDY->apply(src, buf.dy, Rect(0, 0, src.cols, src.rows));
+        buf.filterDY->apply(src, buf.dy);
        calcMagnitude(buf.dx, buf.dy, buf.mag, L2gradient);
    }

--- a/modules/gpuimgproc/src/corners.cpp
+++ b/modules/gpuimgproc/src/corners.cpp
@@ -70,6 +70,8 @@ namespace
 {
    void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
    {
+        (void) buf;
        double scale = static_cast<double>(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize;
        if (ksize < 0)
@@ -83,16 +85,21 @@ namespace
        Dx.create(src.size(), CV_32F);
        Dy.create(src.size(), CV_32F);
+        Ptr<gpu::Filter> filterDx, filterDy;
        if (ksize > 0)
        {
-            Sobel(src, Dx, CV_32F, 1, 0, buf, ksize, scale, borderType, -1, stream);
+            filterDx = gpu::createSobelFilter(src.type(), CV_32F, 1, 0, ksize, scale, borderType);
-            Sobel(src, Dy, CV_32F, 0, 1, buf, ksize, scale, borderType, -1, stream);
+            filterDy = gpu::createSobelFilter(src.type(), CV_32F, 0, 1, ksize, scale, borderType);
        }
        else
        {
-            Scharr(src, Dx, CV_32F, 1, 0, buf, scale, borderType, -1, stream);
+            filterDx = gpu::createScharrFilter(src.type(), CV_32F, 1, 0, scale, borderType);
-            Scharr(src, Dy, CV_32F, 0, 1, buf, scale, borderType, -1, stream);
+            filterDy = gpu::createScharrFilter(src.type(), CV_32F, 0, 1, scale, borderType);
        }
+        filterDx->apply(src, Dx);
+        filterDy->apply(src, Dy);
    }
 }

--- a/modules/superres/src/btv_l1_gpu.cpp
+++ b/modules/superres/src/btv_l1_gpu.cpp
@@ -230,7 +230,7 @@ namespace
        Ptr<DenseOpticalFlowExt> opticalFlow_;
    private:
-        std::vector<Ptr<FilterEngine_GPU> > filters_;
+        std::vector<Ptr<gpu::Filter> > filters_;
        int curBlurKernelSize_;
        double curBlurSigma_;
        int curSrcType_;
@@ -299,7 +299,7 @@ namespace
        {
            filters_.resize(src.size());
            for (size_t i = 0; i < src.size(); ++i)
-                filters_[i] = createGaussianFilter_GPU(src[0].type(), Size(blurKernelSize_, blurKernelSize_), blurSigma_);
+                filters_[i] = gpu::createGaussianFilter(src[0].type(), -1, Size(blurKernelSize_, blurKernelSize_), blurSigma_);
            curBlurKernelSize_ = blurKernelSize_;
            curBlurSigma_ = blurSigma_;
            curSrcType_ = src[0].type();
@@ -346,7 +346,7 @@ namespace
                // a = M * Ih
                gpu::remap(highRes_, a_[k], backwardMaps_[k].first, backwardMaps_[k].second, INTER_NEAREST, BORDER_REPLICATE, Scalar(), streams_[k]);
                // b = HM * Ih
-                filters_[k]->apply(a_[k], b_[k], Rect(0,0,-1,-1), streams_[k]);
+                filters_[k]->apply(a_[k], b_[k], streams_[k]);
                // c = DHF * Ih
                gpu::resize(b_[k], c_[k], lowResSize, 0, 0, INTER_NEAREST, streams_[k]);
@@ -355,7 +355,7 @@ namespace
                // a = Dt * diff
                upscale(c_[k], a_[k], scale_, streams_[k]);
                // b = HtDt * diff
-                filters_[k]->apply(a_[k], b_[k], Rect(0,0,-1,-1), streams_[k]);
+                filters_[k]->apply(a_[k], b_[k], streams_[k]);
                // diffTerm = MtHtDt * diff
                gpu::remap(b_[k], diffTerms_[k], forwardMaps_[k].first, forwardMaps_[k].second, INTER_NEAREST, BORDER_REPLICATE, Scalar(), streams_[k]);
            }

--- a/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
+++ b/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
@@ -308,6 +308,8 @@ Scalar getMSSIM_GPU( const Mat& i1, const Mat& i2)
    gpu::split(tmp2, vI2);
    Scalar mssim;
+    Ptr<gpu::Filter> gauss = gpu::createGaussianFilter(vI2[0].type(), -1, Size(11, 11), 1.5);
    for( int i = 0; i < gI1.channels(); ++i )
    {
        gpu::GpuMat I2_2, I1_2, I1_I2;
@@ -318,8 +320,8 @@ Scalar getMSSIM_GPU( const Mat& i1, const Mat& i2)
        /*************************** END INITS **********************************/
        gpu::GpuMat mu1, mu2;   // PRELIMINARY COMPUTING
-        gpu::GaussianBlur(vI1[i], mu1, Size(11, 11), 1.5);
+        gauss->apply(vI1[i], mu1);
-        gpu::GaussianBlur(vI2[i], mu2, Size(11, 11), 1.5);
+        gauss->apply(vI2[i], mu2);
        gpu::GpuMat mu1_2, mu2_2, mu1_mu2;
        gpu::multiply(mu1, mu1, mu1_2);
@@ -328,13 +330,13 @@ Scalar getMSSIM_GPU( const Mat& i1, const Mat& i2)
        gpu::GpuMat sigma1_2, sigma2_2, sigma12;
-        gpu::GaussianBlur(I1_2, sigma1_2, Size(11, 11), 1.5);
+        gauss->apply(I1_2, sigma1_2);
        gpu::subtract(sigma1_2, mu1_2, sigma1_2); // sigma1_2 -= mu1_2;
-        gpu::GaussianBlur(I2_2, sigma2_2, Size(11, 11), 1.5);
+        gauss->apply(I2_2, sigma2_2);
        gpu::subtract(sigma2_2, mu2_2, sigma2_2); // sigma2_2 -= mu2_2;
-        gpu::GaussianBlur(I1_I2, sigma12, Size(11, 11), 1.5);
+        gauss->apply(I1_I2, sigma12);
        gpu::subtract(sigma12, mu1_mu2, sigma12); // sigma12 -= mu1_mu2;
        ///////////////////////////////// FORMULA ////////////////////////////////
@@ -375,7 +377,7 @@ Scalar getMSSIM_GPU_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b)
    gpu::split(b.t2, b.vI2, stream);
    Scalar mssim;
-    gpu::GpuMat buf;
+    Ptr<gpu::Filter> gauss = gpu::createGaussianFilter(b.vI1[0].type(), -1, Size(11, 11), 1.5);
    for( int i = 0; i < b.gI1.channels(); ++i )
    {
@@ -383,22 +385,22 @@ Scalar getMSSIM_GPU_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b)
        gpu::multiply(b.vI1[i], b.vI1[i], b.I1_2, 1, -1, stream);        // I1^2
        gpu::multiply(b.vI1[i], b.vI2[i], b.I1_I2, 1, -1, stream);       // I1 * I2
-        gpu::GaussianBlur(b.vI1[i], b.mu1, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.vI1[i], b.mu1, stream);
-        gpu::GaussianBlur(b.vI2[i], b.mu2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.vI2[i], b.mu2, stream);
        gpu::multiply(b.mu1, b.mu1, b.mu1_2, 1, -1, stream);
        gpu::multiply(b.mu2, b.mu2, b.mu2_2, 1, -1, stream);
        gpu::multiply(b.mu1, b.mu2, b.mu1_mu2, 1, -1, stream);
-        gpu::GaussianBlur(b.I1_2, b.sigma1_2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.I1_2, b.sigma1_2, stream);
        gpu::subtract(b.sigma1_2, b.mu1_2, b.sigma1_2, gpu::GpuMat(), -1, stream);
        //b.sigma1_2 -= b.mu1_2;  - This would result in an extra data transfer operation
-        gpu::GaussianBlur(b.I2_2, b.sigma2_2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.I2_2, b.sigma2_2, stream);
        gpu::subtract(b.sigma2_2, b.mu2_2, b.sigma2_2, gpu::GpuMat(), -1, stream);
        //b.sigma2_2 -= b.mu2_2;
-        gpu::GaussianBlur(b.I1_I2, b.sigma12, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.I1_I2, b.sigma12, stream);
        gpu::subtract(b.sigma12, b.mu1_mu2, b.sigma12, gpu::GpuMat(), -1, stream);
        //b.sigma12 -= b.mu1_mu2;

--- a/samples/gpu/morphology.cpp
+++ b/samples/gpu/morphology.cpp
+#include <iostream>
-#include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/imgproc.hpp"
-#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/highgui.hpp"
-#include "opencv2/gpu/gpu.hpp"
+#include "opencv2/gpufilters.hpp"
-#include <stdlib.h>
+#include "opencv2/gpuimgproc.hpp"
-#include <stdio.h>
 using namespace std;
 using namespace cv;
-using namespace cv::gpu;
-static void help()
+class App
 {
+public:
+    App(int argc, const char* argv[]);
-printf("\nShow off image morphology: erosion, dialation, open and close\n"
+    int run();
-    "Call:\n   morphology2 [image]\n"
-    "This program also shows use of rect, elipse and cross kernels\n\n");
-printf( "Hot keys: \n"
-    "\tESC - quit the program\n"
-    "\tr - use rectangle structuring element\n"
-    "\te - use elliptic structuring element\n"
-    "\tc - use cross-shaped structuring element\n"
-    "\tSPACE - loop through all the options\n" );
-}
-GpuMat src, dst;
+private:
+    void help();
-int element_shape = MORPH_RECT;
+    void OpenClose();
+    void ErodeDilate();
-//the address of variable which receives trackbar position update
+    static void OpenCloseCallback(int, void*);
-int max_iters = 10;
+    static void ErodeDilateCallback(int, void*);
-int open_close_pos = 0;
-int erode_dilate_pos = 0;
-// callback function for open/close trackbar
+    gpu::GpuMat src, dst;
-static void OpenClose(int, void*)
-{
-    int n = open_close_pos - max_iters;
-    int an = n > 0 ? n : -n;
-    Mat element = getStructuringElement(element_shape, Size(an*2+1, an*2+1), Point(an, an) );
-    if( n < 0 )
-        cv::gpu::morphologyEx(src, dst, MORPH_OPEN, element);
-    else
-        cv::gpu::morphologyEx(src, dst, MORPH_CLOSE, element);
-    imshow("Open/Close",(Mat)dst);
-}
-// callback function for erode/dilate trackbar
+    int element_shape;
-static void ErodeDilate(int, void*)
-{
-    int n = erode_dilate_pos - max_iters;
-    int an = n > 0 ? n : -n;
-    Mat element = getStructuringElement(element_shape, Size(an*2+1, an*2+1), Point(an, an) );
-    if( n < 0 )
-        cv::gpu::erode(src, dst, element);
-    else
-        cv::gpu::dilate(src, dst, element);
-    imshow("Erode/Dilate",(Mat)dst);
-}
+    int max_iters;
+    int open_close_pos;
+    int erode_dilate_pos;
+};
-int main( int argc, char** argv )
+App::App(int argc, const char* argv[])
 {
-    char* filename = argc == 2 ? argv[1] : (char*)"baboon.jpg";
+    element_shape = MORPH_RECT;
-    if (string(argv[1]) == "--help")
+    open_close_pos = erode_dilate_pos = max_iters = 10;
-    {
-        help();
-        return -1;
-    }
-    src.upload(imread(filename, 1));
+    if (argc == 2 && String(argv[1]) == "--help")
-    if (src.empty())
    {
        help();
-        return -1;
+        exit(0);
    }
-    cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
+    String filename = argc == 2 ? argv[1] : "baboon.jpg";
-    help();
+    Mat img = imread(filename);
+    if (img.empty())
+    {
+        cerr << "Can't open image " << filename.c_str() << endl;
+        exit(-1);
+    }
+    src.upload(img);
    if (src.channels() == 3)
    {
        // gpu support only 4th channel images
-        GpuMat src4ch;
+        gpu::GpuMat src4ch;
-        cv::gpu::cvtColor(src, src4ch, COLOR_BGR2BGRA);
+        gpu::cvtColor(src, src4ch, COLOR_BGR2BGRA);
        src = src4ch;
    }
-    //create windows for output images
+    help();
-    namedWindow("Open/Close",1);
-    namedWindow("Erode/Dilate",1);
-    open_close_pos = erode_dilate_pos = max_iters;
+    gpu::printShortCudaDeviceInfo(gpu::getDevice());
-    createTrackbar("iterations", "Open/Close",&open_close_pos,max_iters*2+1,OpenClose);
+}
-    createTrackbar("iterations", "Erode/Dilate",&erode_dilate_pos,max_iters*2+1,ErodeDilate);
+int App::run()
+{
+    // create windows for output images
+    namedWindow("Open/Close");
+    namedWindow("Erode/Dilate");
+    createTrackbar("iterations", "Open/Close", &open_close_pos, max_iters * 2 + 1, OpenCloseCallback, this);
+    createTrackbar("iterations", "Erode/Dilate", &erode_dilate_pos, max_iters * 2 + 1, ErodeDilateCallback, this);
    for(;;)
    {
-        int c;
+        OpenClose();
+        ErodeDilate();
-        OpenClose(open_close_pos, 0);
+        char c = (char) waitKey();
-        ErodeDilate(erode_dilate_pos, 0);
-        c = waitKey();
-        if( (char)c == 27 )
+        switch (c)
+        {
+        case 27:
+            return 0;
            break;
-        if( (char)c == 'e' )
+        case 'e':
            element_shape = MORPH_ELLIPSE;
-        else if( (char)c == 'r' )
+            break;
+        case 'r':
            element_shape = MORPH_RECT;
-        else if( (char)c == 'c' )
+            break;
+        case 'c':
            element_shape = MORPH_CROSS;
-        else if( (char)c == ' ' )
+            break;
+        case ' ':
            element_shape = (element_shape + 1) % 3;
+            break;
+        }
+    }
+}
+void App::help()
+{
+    cout << "Show off image morphology: erosion, dialation, open and close \n";
+    cout << "Call: \n";
+    cout << "   gpu-example-morphology [image] \n";
+    cout << "This program also shows use of rect, elipse and cross kernels \n" << endl;
+    cout << "Hot keys: \n";
+    cout << "\tESC - quit the program \n";
+    cout << "\tr - use rectangle structuring element \n";
+    cout << "\te - use elliptic structuring element \n";
+    cout << "\tc - use cross-shaped structuring element \n";
+    cout << "\tSPACE - loop through all the options \n" << endl;
+}
+void App::OpenClose()
+{
+    int n = open_close_pos - max_iters;
+    int an = n > 0 ? n : -n;
+    Mat element = getStructuringElement(element_shape, Size(an*2+1, an*2+1), Point(an, an));
+    if (n < 0)
+    {
+        Ptr<gpu::Filter> openFilter = gpu::createMorphologyFilter(MORPH_OPEN, src.type(), element);
+        openFilter->apply(src, dst);
+    }
+    else
+    {
+        Ptr<gpu::Filter> closeFilter = gpu::createMorphologyFilter(MORPH_CLOSE, src.type(), element);
+        closeFilter->apply(src, dst);
+    }
+    Mat h_dst(dst);
+    imshow("Open/Close", h_dst);
+}
+void App::ErodeDilate()
+{
+    int n = erode_dilate_pos - max_iters;
+    int an = n > 0 ? n : -n;
+    Mat element = getStructuringElement(element_shape, Size(an*2+1, an*2+1), Point(an, an));
+    if (n < 0)
+    {
+        Ptr<gpu::Filter> erodeFilter = gpu::createMorphologyFilter(MORPH_ERODE, src.type(), element);
+        erodeFilter->apply(src, dst);
+    }
+    else
+    {
+        Ptr<gpu::Filter> dilateFilter = gpu::createMorphologyFilter(MORPH_DILATE, src.type(), element);
+        dilateFilter->apply(src, dst);
    }
-    return 0;
+    Mat h_dst(dst);
+    imshow("Erode/Dilate", h_dst);
+}
+void App::OpenCloseCallback(int, void* data)
+{
+    App* thiz = (App*) data;
+    thiz->OpenClose();
+}
+void App::ErodeDilateCallback(int, void* data)
+{
+    App* thiz = (App*) data;
+    thiz->ErodeDilate();
+}
+int main(int argc, const char* argv[])
+{
+    App app(argc, argv);
+    return app.run();
 }
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -746,10 +746,12 @@ TEST(erode)
        d_src.upload(src);
-        gpu::erode(d_src, d_dst, ker, d_buf);
+        Ptr<gpu::Filter> erode = gpu::createMorphologyFilter(MORPH_ERODE, d_src.type(), ker);
+        erode->apply(d_src, d_dst);
        GPU_ON;
-        gpu::erode(d_src, d_dst, ker, d_buf);
+        erode->apply(d_src, d_dst);
        GPU_OFF;
    }
 }
@@ -929,10 +931,12 @@ TEST(GaussianBlur)
        gpu::GpuMat d_dst(src.size(), src.type());
        gpu::GpuMat d_buf;
-        gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
+        cv::Ptr<cv::gpu::Filter> gauss = cv::gpu::createGaussianFilter(d_src.type(), -1, cv::Size(3, 3), 1);
+        gauss->apply(d_src, d_dst);
        GPU_ON;
-        gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
+        gauss->apply(d_src, d_dst);
        GPU_OFF;
    }
 }
@@ -961,10 +965,11 @@ TEST(filter2D)
            gpu::GpuMat d_src(src);
            gpu::GpuMat d_dst;
-            gpu::filter2D(d_src, d_dst, -1, kernel);
+            Ptr<gpu::Filter> filter2D = gpu::createLinearFilter(d_src.type(), -1, kernel);
+            filter2D->apply(d_src, d_dst);
            GPU_ON;
-            gpu::filter2D(d_src, d_dst, -1, kernel);
+            filter2D->apply(d_src, d_dst);
            GPU_OFF;
        }
    }