Merge pull request #978 from jet47:gpuarithm-refactoring

2fc49ac5 · Roman Donchenko · OpenCV Buildbot · 4bc4f4aa · a3341006 · 2fc49ac5
Commit 2fc49ac5 authored Jun 14, 2013 by Roman Donchenko Committed by OpenCV Buildbot Jun 14, 2013
27 changed files
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -458,7 +458,7 @@ public:
                // generate integral for scale
                gpu::resize(image, src, level.sFrame, 0, 0, cv::INTER_LINEAR);
-                gpu::integralBuffered(src, sint, buff);
+                gpu::integral(src, sint, buff);
                // calculate job
                int totalWidth = level.workArea.width / step;

--- a/modules/gpuarithm/doc/arithm.rst
+++ b/modules/gpuarithm/doc/arithm.rst
@@ -6,10 +6,10 @@ Arithm Operations on Matrices
 gpu::gemm
------------------
+---------
 Performs generalized matrix multiplication.
-.. ocv:function:: void gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::gemm(InputArray src1, InputArray src2, double alpha, InputArray src3, double beta, OutputArray dst, int flags = 0, Stream& stream = Stream::Null())
    :param src1: First multiplied input matrix that should have  ``CV_32FC1`` , ``CV_64FC1`` , ``CV_32FC2`` , or  ``CV_64FC2``  type.
@@ -44,38 +44,40 @@ The function performs generalized matrix multiplication similar to the ``gemm``
 gpu::mulSpectrums
---------------------
+-----------------
 Performs a per-element multiplication of two Fourier spectrums.
-.. ocv:function:: void gpu::mulSpectrums( const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::mulSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, bool conjB=false, Stream& stream = Stream::Null())
-    :param a: First spectrum.
+    :param src1: First spectrum.
-    :param b: Second spectrum with the same size and type as  ``a`` .
+    :param src2: Second spectrum with the same size and type as  ``src1`` .
-    :param c: Destination spectrum.
+    :param dst: Destination spectrum.
    :param flags: Mock parameter used for CPU/GPU interfaces similarity.
    :param conjB: Optional flag to specify if the second spectrum needs to be conjugated before the multiplication.
-    Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format are supported for now.
+    :param stream: Stream for the asynchronous version.
+Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format are supported for now.
 .. seealso:: :ocv:func:`mulSpectrums`
 gpu::mulAndScaleSpectrums
-----------------------------
+-------------------------
 Performs a per-element multiplication of two Fourier spectrums and scales the result.
-.. ocv:function:: void gpu::mulAndScaleSpectrums( const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::mulAndScaleSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null())
-    :param a: First spectrum.
+    :param src1: First spectrum.
-    :param b: Second spectrum with the same size and type as  ``a`` .
+    :param src2: Second spectrum with the same size and type as  ``src1`` .
-    :param c: Destination spectrum.
+    :param dst: Destination spectrum.
    :param flags: Mock parameter used for CPU/GPU interfaces similarity.
@@ -83,17 +85,17 @@ Performs a per-element multiplication of two Fourier spectrums and scales the re
    :param conjB: Optional flag to specify if the second spectrum needs to be conjugated before the multiplication.
-    Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format are supported for now.
+Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format are supported for now.
 .. seealso:: :ocv:func:`mulSpectrums`
 gpu::dft
------------
+--------
 Performs a forward or inverse discrete Fourier transform (1D or 2D) of the floating point matrix.
-.. ocv:function:: void gpu::dft( const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::dft(InputArray src, OutputArray dst, Size dft_size, int flags=0, Stream& stream = Stream::Null())
    :param src: Source matrix (real or complex).
@@ -125,46 +127,25 @@ The source matrix should be continuous, otherwise reallocation and data copying
-gpu::ConvolveBuf
+gpu::Convolution
 ----------------
-.. ocv:struct:: gpu::ConvolveBuf
+.. ocv:class:: gpu::Convolution : public Algorithm
-Class providing a memory buffer for :ocv:func:`gpu::convolve` function, plus it allows to adjust some specific parameters. ::
+Base class for convolution (or cross-correlation) operator. ::
-    struct CV_EXPORTS ConvolveBuf
+    class CV_EXPORTS Convolution : public Algorithm
    {
-        Size result_size;
+    public:
-        Size block_size;
+        virtual void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null()) = 0;
-        Size user_block_size;
-        Size dft_size;
-        int spect_len;
-        GpuMat image_spect, templ_spect, result_spect;
-        GpuMat image_block, templ_block, result_data;
-        void create(Size image_size, Size templ_size);
-        static Size estimateBlockSize(Size result_size, Size templ_size);
    };
-You can use field `user_block_size` to set specific block size for :ocv:func:`gpu::convolve` function. If you leave its default value `Size(0,0)` then automatic estimation of block size will be used (which is optimized for speed). By varying `user_block_size` you can reduce memory requirements at the cost of speed.
-gpu::ConvolveBuf::create
------------------------
-.. ocv:function:: gpu::ConvolveBuf::create(Size image_size, Size templ_size)
-Constructs a buffer for :ocv:func:`gpu::convolve` function with respective arguments.
+gpu::Convolution::convolve
+---------------------------
-gpu::convolve
-----------------
 Computes a convolution (or cross-correlation) of two images.
-.. ocv:function:: void gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr=false)
+.. ocv:function:: void gpu::Convolution::convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null())
-.. ocv:function:: void gpu::convolve( const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream=Stream::Null() )
    :param image: Source image. Only  ``CV_32FC1`` images are supported for now.
@@ -174,38 +155,16 @@ Computes a convolution (or cross-correlation) of two images.
    :param ccorr: Flags to evaluate cross-correlation instead of convolution.
-    :param buf: Optional buffer to avoid extra memory allocations and to adjust some specific parameters. See :ocv:struct:`gpu::ConvolveBuf`.
    :param stream: Stream for the asynchronous version.
 .. seealso:: :ocv:func:`gpu::filter2D`
-gpu::integral
+gpu::createConvolution
-----------------
+----------------------
-Computes an integral image.
+Creates implementation for :ocv:class:`gpu::Convolution` .
-.. ocv:function:: void gpu::integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null())
-    :param src: Source image. Only  ``CV_8UC1`` images are supported for now.
-    :param sum: Integral image containing 32-bit unsigned integer values packed into  ``CV_32SC1`` .
-    :param stream: Stream for the asynchronous version.
-.. seealso:: :ocv:func:`integral`
+.. ocv:function:: Ptr<Convolution> createConvolution(Size user_block_size = Size())
+    :param user_block_size: Block size. If you leave the default value `Size(0,0)` then automatic estimation of block size will be used (which is optimized for speed). By varying `user_block_size` you can reduce memory requirements at the cost of speed.
-gpu::sqrIntegral
--------------------
-Computes a squared integral image.
-.. ocv:function:: void gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& stream = Stream::Null())
-    :param src: Source image. Only  ``CV_8UC1`` images are supported for now.
-    :param sqsum: Squared integral image containing 64-bit unsigned integer values packed into  ``CV_64FC1`` .
-    :param stream: Stream for the asynchronous version.
--- a/modules/gpuarithm/doc/core.rst
+++ b/modules/gpuarithm/doc/core.rst
@@ -6,12 +6,12 @@ Core Operations on Matrices
 gpu::merge
--------------
+----------
 Makes a multi-channel matrix out of several single-channel matrices.
-.. ocv:function:: void gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::merge(const GpuMat* src, size_t n, OutputArray dst, Stream& stream = Stream::Null())
-.. ocv:function:: void gpu::merge(const vector<GpuMat>& src, GpuMat& dst, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::merge(const std::vector<GpuMat>& src, OutputArray dst, Stream& stream = Stream::Null())
    :param src: Array/vector of source matrices.
@@ -26,12 +26,12 @@ Makes a multi-channel matrix out of several single-channel matrices.
 gpu::split
--------------
+----------
 Copies each plane of a multi-channel matrix into an array.
-.. ocv:function:: void gpu::split(const GpuMat& src, GpuMat* dst, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::split(InputArray src, GpuMat* dst, Stream& stream = Stream::Null())
-.. ocv:function:: void gpu::split(const GpuMat& src, vector<GpuMat>& dst, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::split(InputArray src, std::vector<GpuMat>& dst, Stream& stream = Stream::Null())
    :param src: Source matrix.
@@ -43,86 +43,108 @@ Copies each plane of a multi-channel matrix into an array.
-gpu::copyMakeBorder
+gpu::transpose
-----------------------
+--------------
-Forms a border around an image.
+Transposes a matrix.
-.. ocv:function:: void gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value = Scalar(), Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::transpose(InputArray src1, OutputArray dst, Stream& stream = Stream::Null())
-    :param src: Source image. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_32SC1`` , and  ``CV_32FC1`` types are supported.
+    :param src1: Source matrix. 1-, 4-, 8-byte element sizes are supported for now.
-    :param dst: Destination image with the same type as  ``src``. The size is  ``Size(src.cols+left+right, src.rows+top+bottom)`` .
+    :param dst: Destination matrix.
-    :param top:
+    :param stream: Stream for the asynchronous version.
-    :param bottom:
+.. seealso:: :ocv:func:`transpose`
-    :param left:
-    :param right: Number of pixels in each direction from the source image rectangle to extrapolate. For example:  ``top=1, bottom=1, left=1, right=1`` mean that 1 pixel-wide border needs to be built.
-    :param borderType: Border type. See  :ocv:func:`borderInterpolate` for details. ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , ``BORDER_CONSTANT`` , ``BORDER_REFLECT`` and ``BORDER_WRAP`` are supported for now.
+gpu::flip
+---------
+Flips a 2D matrix around vertical, horizontal, or both axes.
-    :param value: Border value.
+.. ocv:function:: void gpu::flip(InputArray src, OutputArray dst, int flipCode, Stream& stream = Stream::Null())
-    :param stream: Stream for the asynchronous version.
+    :param src: Source matrix. Supports 1, 3 and 4 channels images with ``CV_8U``, ``CV_16U``, ``CV_32S`` or ``CV_32F`` depth.
-.. seealso:: :ocv:func:`copyMakeBorder`
+    :param dst: Destination matrix.
+    :param flipCode: Flip mode for the source:
+        * ``0`` Flips around x-axis.
-gpu::transpose
+        * ``> 0`` Flips around y-axis.
------------------
-Transposes a matrix.
-.. ocv:function:: void gpu::transpose( const GpuMat& src1, GpuMat& dst, Stream& stream=Stream::Null() )
+        * ``< 0`` Flips around both axes.
-    :param src1: Source matrix. 1-, 4-, 8-byte element sizes are supported for now (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc).
+    :param stream: Stream for the asynchronous version.
-    :param dst: Destination matrix.
+.. seealso:: :ocv:func:`flip`
-    :param stream: Stream for the asynchronous version.
-.. seealso:: :ocv:func:`transpose`
+gpu::LookUpTable
+----------------
+.. ocv:class:: gpu::LookUpTable : public Algorithm
+Base class for transform using lookup table. ::
-gpu::flip
+    class CV_EXPORTS LookUpTable : public Algorithm
-------------
+    {
-Flips a 2D matrix around vertical, horizontal, or both axes.
+    public:
+        virtual void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
+    };
-.. ocv:function:: void gpu::flip( const GpuMat& a, GpuMat& b, int flipCode, Stream& stream=Stream::Null() )
+.. seealso:: :ocv:func:`LUT`
-    :param a: Source matrix. Supports 1, 3 and 4 channels images with ``CV_8U``, ``CV_16U``, ``CV_32S`` or ``CV_32F`` depth.
-    :param b: Destination matrix.
-    :param flipCode: Flip mode for the source:
+gpu::LookUpTable::transform
+---------------------------
+Transforms the source matrix into the destination matrix using the given look-up table: ``dst(I) = lut(src(I))`` .
-        * ``0`` Flips around x-axis.
+.. ocv:function:: void gpu::LookUpTable::transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
-        * ``>0`` Flips around y-axis.
+    :param src: Source matrix.  ``CV_8UC1``  and  ``CV_8UC3``  matrices are supported for now.
-        * ``<0`` Flips around both axes.
+    :param dst: Destination matrix.
    :param stream: Stream for the asynchronous version.
-.. seealso:: :ocv:func:`flip`
+gpu::createLookUpTable
+----------------------
+Creates implementation for :ocv:class:`gpu::LookUpTable` .
-gpu::LUT
+.. ocv:function:: Ptr<LookUpTable> createLookUpTable(InputArray lut)
------------
-Transforms the source matrix into the destination matrix using the given look-up table: ``dst(I) = lut(src(I))``
-.. ocv:function:: void gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null())
+    :param lut: Look-up table of 256 elements. It is a continuous ``CV_8U`` matrix.
-    :param src: Source matrix.  ``CV_8UC1``  and  ``CV_8UC3``  matrices are supported for now.
-    :param lut: Look-up table of 256 elements. It is a continuous ``CV_8U`` matrix.
-    :param dst: Destination matrix with the same depth as  ``lut``  and the same number of channels as  ``src`` .
+gpu::copyMakeBorder
+-------------------
+Forms a border around an image.
+.. ocv:function:: void gpu::copyMakeBorder(InputArray src, OutputArray dst, int top, int bottom, int left, int right, int borderType, Scalar value = Scalar(), Stream& stream = Stream::Null())
+    :param src: Source image. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_32SC1`` , and ``CV_32FC1`` types are supported.
+    :param dst: Destination image with the same type as  ``src``. The size is ``Size(src.cols+left+right, src.rows+top+bottom)`` .
+    :param top:
+    :param bottom:
+    :param left:
+    :param right: Number of pixels in each direction from the source image rectangle to extrapolate. For example:  ``top=1, bottom=1, left=1, right=1`` mean that 1 pixel-wide border needs to be built.
+    :param borderType: Border type. See  :ocv:func:`borderInterpolate` for details. ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , ``BORDER_CONSTANT`` , ``BORDER_REFLECT`` and ``BORDER_WRAP`` are supported for now.
+    :param value: Border value.
    :param stream: Stream for the asynchronous version.
-.. seealso:: :ocv:func:`LUT`
+.. seealso:: :ocv:func:`copyMakeBorder`
--- a/modules/gpuarithm/doc/element_operations.rst
+++ b/modules/gpuarithm/doc/element_operations.rst
--- a/modules/gpuarithm/doc/reductions.rst
+++ b/modules/gpuarithm/doc/reductions.rst
--- a/modules/gpuarithm/include/opencv2/gpuarithm.hpp
+++ b/modules/gpuarithm/include/opencv2/gpuarithm.hpp
--- a/modules/gpuarithm/perf/perf_arithm.cpp
+++ b/modules/gpuarithm/perf/perf_arithm.cpp
@@ -228,10 +228,11 @@ PERF_TEST_P(Sz_KernelSz_Ccorr, Convolve,
        cv::gpu::GpuMat d_templ = cv::gpu::createContinuous(templ_size, templ_size, CV_32FC1);
        d_templ.upload(templ);
+        cv::Ptr<cv::gpu::Convolution> convolution = cv::gpu::createConvolution();
        cv::gpu::GpuMat dst;
-        cv::gpu::ConvolveBuf d_buf;
-        TEST_CYCLE() cv::gpu::convolve(d_image, d_templ, dst, ccorr, d_buf);
+        TEST_CYCLE() convolution->convolve(d_image, d_templ, dst, ccorr);
        GPU_SANITY_CHECK(dst);
    }
@@ -265,7 +266,7 @@ PERF_TEST_P(Sz, Integral,
        cv::gpu::GpuMat dst;
        cv::gpu::GpuMat d_buf;
-        TEST_CYCLE() cv::gpu::integralBuffered(d_src, dst, d_buf);
+        TEST_CYCLE() cv::gpu::integral(d_src, dst, d_buf);
        GPU_SANITY_CHECK(dst);
    }
@@ -293,9 +294,9 @@ PERF_TEST_P(Sz, IntegralSqr,
    if (PERF_RUN_GPU())
    {
        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
+        cv::gpu::GpuMat dst, buf;
-        TEST_CYCLE() cv::gpu::sqrIntegral(d_src, dst);
+        TEST_CYCLE() cv::gpu::sqrIntegral(d_src, dst, buf);
        GPU_SANITY_CHECK(dst);
    }

--- a/modules/gpuarithm/perf/perf_core.cpp
+++ b/modules/gpuarithm/perf/perf_core.cpp
@@ -224,10 +224,12 @@ PERF_TEST_P(Sz_Type, LutOneChannel,
    if (PERF_RUN_GPU())
    {
+        cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;
-        TEST_CYCLE() cv::gpu::LUT(d_src, lut, dst);
+        TEST_CYCLE() lutAlg->transform(d_src, dst);
        GPU_SANITY_CHECK(dst);
    }
@@ -259,10 +261,12 @@ PERF_TEST_P(Sz_Type, LutMultiChannel,
    if (PERF_RUN_GPU())
    {
+        cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;
-        TEST_CYCLE() cv::gpu::LUT(d_src, lut, dst);
+        TEST_CYCLE() lutAlg->transform(d_src, dst);
        GPU_SANITY_CHECK(dst);
    }

--- a/modules/gpuarithm/perf/perf_reductions.cpp
+++ b/modules/gpuarithm/perf/perf_reductions.cpp
@@ -108,9 +108,10 @@ PERF_TEST_P(Sz_Norm, NormDiff,
    {
        const cv::gpu::GpuMat d_src1(src1);
        const cv::gpu::GpuMat d_src2(src2);
+        cv::gpu::GpuMat d_buf;
        double gpu_dst;
-        TEST_CYCLE() gpu_dst = cv::gpu::norm(d_src1, d_src2, normType);
+        TEST_CYCLE() gpu_dst = cv::gpu::norm(d_src1, d_src2, d_buf, normType);
        SANITY_CHECK(gpu_dst);

--- a/modules/gpuarithm/src/arithm.cpp
+++ b/modules/gpuarithm/src/arithm.cpp
--- a/modules/gpuarithm/src/core.cpp
+++ b/modules/gpuarithm/src/core.cpp
--- a/modules/gpuarithm/src/cuda/div_inv.cu
+++ b/modules/gpuarithm/src/cuda/div_inv.cu
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#if !defined CUDA_DISABLER
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/simd_functions.hpp"
-#include "arithm_func_traits.hpp"
-using namespace cv::gpu;
-using namespace cv::gpu::cudev;
-namespace arithm
-{
-    template <typename T, typename S, typename D> struct DivInv : unary_function<T, D>
-    {
-        S val;
-        __host__ explicit DivInv(S val_) : val(val_) {}
-        __device__ __forceinline__ D operator ()(T a) const
-        {
-            return a != 0 ? saturate_cast<D>(val / a) : 0;
-        }
-    };
-}
-namespace cv { namespace gpu { namespace cudev
-{
-    template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivInv<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
-    {
-    };
-}}}
-namespace arithm
-{
-    template <typename T, typename S, typename D>
-    void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
-    {
-        DivInv<T, S, D> op(static_cast<S>(val));
-        cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
-    }
-    template void divInv<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-}
-#endif // CUDA_DISABLER
--- a/modules/gpuarithm/src/cuda/div_scalar.cu
+++ b/modules/gpuarithm/src/cuda/div_scalar.cu
--- a/modules/gpuarithm/src/cuda/split_merge.cu
+++ b/modules/gpuarithm/src/cuda/split_merge.cu
@@ -278,7 +278,7 @@ namespace cv { namespace gpu { namespace cudev
        }
-        void merge_caller(const PtrStepSzb* src, PtrStepSzb& dst,
+        void merge(const PtrStepSzb* src, PtrStepSzb& dst,
                                     int total_channels, size_t elem_size,
                                     const cudaStream_t& stream)
        {
@@ -487,7 +487,7 @@ namespace cv { namespace gpu { namespace cudev
        }
-        void split_caller(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
+        void split(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
        {
            static SplitFunction split_func_tbl[] =
            {

--- a/modules/gpuarithm/src/cuda/sub_scalar.cu
+++ b/modules/gpuarithm/src/cuda/sub_scalar.cu
--- a/modules/gpuarithm/src/element_operations.cpp
+++ b/modules/gpuarithm/src/element_operations.cpp
--- a/modules/gpuarithm/src/reductions.cpp
+++ b/modules/gpuarithm/src/reductions.cpp
--- a/modules/gpuarithm/test/test_arithm.cpp
+++ b/modules/gpuarithm/test/test_arithm.cpp
@@ -419,8 +419,10 @@ GPU_TEST_P(Convolve, Accuracy)
    cv::Mat src = randomMat(size, CV_32FC1, 0.0, 100.0);
    cv::Mat kernel = randomMat(cv::Size(ksize, ksize), CV_32FC1, 0.0, 1.0);
+    cv::Ptr<cv::gpu::Convolution> conv = cv::gpu::createConvolution();
    cv::gpu::GpuMat dst;
-    cv::gpu::convolve(loadMat(src), loadMat(kernel), dst, ccorr);
+    conv->convolve(loadMat(src), loadMat(kernel), dst, ccorr);
    cv::Mat dst_gold;
    convolveDFT(src, kernel, dst_gold, ccorr);

--- a/modules/gpuarithm/test/test_core.cpp
+++ b/modules/gpuarithm/test/test_core.cpp
@@ -323,8 +323,10 @@ GPU_TEST_P(LUT, OneChannel)
    cv::Mat src = randomMat(size, type);
    cv::Mat lut = randomMat(cv::Size(256, 1), CV_8UC1);
+    cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
    cv::gpu::GpuMat dst = createMat(size, CV_MAKE_TYPE(lut.depth(), src.channels()));
-    cv::gpu::LUT(loadMat(src, useRoi), lut, dst);
+    lutAlg->transform(loadMat(src, useRoi), dst);
    cv::Mat dst_gold;
    cv::LUT(src, lut, dst_gold);
@@ -337,8 +339,10 @@ GPU_TEST_P(LUT, MultiChannel)
    cv::Mat src = randomMat(size, type);
    cv::Mat lut = randomMat(cv::Size(256, 1), CV_MAKE_TYPE(CV_8U, src.channels()));
+    cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
    cv::gpu::GpuMat dst = createMat(size, CV_MAKE_TYPE(lut.depth(), src.channels()), useRoi);
-    cv::gpu::LUT(loadMat(src, useRoi), lut, dst);
+    lutAlg->transform(loadMat(src, useRoi), dst);
    cv::Mat dst_gold;
    cv::LUT(src, lut, dst_gold);

--- a/modules/gpuarithm/test/test_element_operations.cpp
+++ b/modules/gpuarithm/test/test_element_operations.cpp
--- a/modules/gpufilters/doc/filtering.rst
+++ b/modules/gpufilters/doc/filtering.rst
@@ -381,7 +381,7 @@ Creates a non-separable linear filter.
    :param dstType: Output image type. The same type as ``src`` is supported.
-    :param kernel: 2D array of filter coefficients. Floating-point coefficients will be converted to fixed-point representation before the actual processing. Supports size up to 16. For larger kernels use :ocv:func:`gpu::convolve`.
+    :param kernel: 2D array of filter coefficients. Floating-point coefficients will be converted to fixed-point representation before the actual processing. Supports size up to 16. For larger kernels use :ocv:class:`gpu::Convolution`.
    :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
@@ -411,7 +411,7 @@ Applies the non-separable 2D linear filter to an image.
    :param stream: Stream for the asynchronous version.
-.. seealso:: :ocv:func:`filter2D`, :ocv:func:`gpu::convolve`
+.. seealso:: :ocv:func:`filter2D`, :ocv:class:`gpu::Convolution`

--- a/modules/gpuimgproc/src/hough.cpp
+++ b/modules/gpuimgproc/src/hough.cpp
@@ -761,7 +761,7 @@ namespace
        {
            buildRTable_gpu(edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1), edgePointList.cols,
                            r_table, r_sizes.ptr<int>(), make_short2(templCenter.x, templCenter.y), levels);
-            min(r_sizes, maxSize, r_sizes);
+            gpu::min(r_sizes, maxSize, r_sizes);
        }
    }

--- a/modules/gpuimgproc/src/match_template.cpp
+++ b/modules/gpuimgproc/src/match_template.cpp
@@ -172,15 +172,16 @@ namespace
            return;
        }
-        gpu::ConvolveBuf convolve_buf;
+        Ptr<gpu::Convolution> conv = gpu::createConvolution(buf.user_block_size);
-        convolve_buf.user_block_size = buf.user_block_size;
        if (image.channels() == 1)
-            gpu::convolve(image.reshape(1), templ.reshape(1), result, true, convolve_buf, stream);
+        {
+            conv->convolve(image.reshape(1), templ.reshape(1), result, true, stream);
+        }
        else
        {
            GpuMat result_;
-            gpu::convolve(image.reshape(1), templ.reshape(1), result_, true, convolve_buf, stream);
+            conv->convolve(image.reshape(1), templ.reshape(1), result_, true, stream);
            extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream));
        }
    }
@@ -268,7 +269,7 @@ namespace
            buf.image_sums.resize(1);
            gpu::integral(image, buf.image_sums[0], stream);
-            unsigned int templ_sum = (unsigned int)sum(templ)[0];
+            unsigned int templ_sum = (unsigned int)gpu::sum(templ)[0];
            matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, buf.image_sums[0], templ_sum, result, StreamAccessor::getStream(stream));
        }
        else

--- a/modules/nonfree/src/surf_gpu.cpp
+++ b/modules/nonfree/src/surf_gpu.cpp
@@ -142,13 +142,13 @@ namespace
            bindImgTex(img);
-            gpu::integralBuffered(img, surf_.sum, surf_.intBuffer);
+            gpu::integral(img, surf_.sum, surf_.intBuffer);
            sumOffset = bindSumTex(surf_.sum);
            if (use_mask)
            {
-                min(mask, 1.0, surf_.mask1);
+                gpu::min(mask, 1.0, surf_.mask1);
-                gpu::integralBuffered(surf_.mask1, surf_.maskSum, surf_.intBuffer);
+                gpu::integral(surf_.mask1, surf_.maskSum, surf_.intBuffer);
                maskOffset = bindMaskSumTex(surf_.maskSum);
            }
        }

--- a/samples/gpu/driver_api_multi.cpp
+++ b/samples/gpu/driver_api_multi.cpp
@@ -130,15 +130,15 @@ void Worker::operator()(int device_id) const
    rng.fill(src, RNG::UNIFORM, 0, 1);
    // CPU works
-    transpose(src, dst);
+    cv::transpose(src, dst);
    // GPU works
    GpuMat d_src(src);
    GpuMat d_dst;
-    transpose(d_src, d_dst);
+    gpu::transpose(d_src, d_dst);
    // Check results
-    bool passed = norm(dst - Mat(d_dst), NORM_INF) < 1e-3;
+    bool passed = cv::norm(dst - Mat(d_dst), NORM_INF) < 1e-3;
    std::cout << "GPU #" << device_id << " (" << DeviceInfo().name() << "): "
        << (passed ? "passed" : "FAILED") << endl;

--- a/samples/gpu/farneback_optical_flow.cpp
+++ b/samples/gpu/farneback_optical_flow.cpp
@@ -22,9 +22,9 @@ inline T mapVal(T x, T a, T b, T c, T d)
 static void colorizeFlow(const Mat &u, const Mat &v, Mat &dst)
 {
    double uMin, uMax;
-    minMaxLoc(u, &uMin, &uMax, 0, 0);
+    cv::minMaxLoc(u, &uMin, &uMax, 0, 0);
    double vMin, vMax;
-    minMaxLoc(v, &vMin, &vMax, 0, 0);
+    cv::minMaxLoc(v, &vMin, &vMax, 0, 0);
    uMin = ::abs(uMin); uMax = ::abs(uMax);
    vMin = ::abs(vMin); vMax = ::abs(vMax);
    float dMax = static_cast<float>(::max(::max(uMin, uMax), ::max(vMin, vMax)));

--- a/samples/gpu/multi.cpp
+++ b/samples/gpu/multi.cpp
@@ -87,15 +87,15 @@ void Worker::operator()(int device_id) const
    rng.fill(src, RNG::UNIFORM, 0, 1);
    // CPU works
-    transpose(src, dst);
+    cv::transpose(src, dst);
    // GPU works
    GpuMat d_src(src);
    GpuMat d_dst;
-    transpose(d_src, d_dst);
+    gpu::transpose(d_src, d_dst);
    // Check results
-    bool passed = norm(dst - Mat(d_dst), NORM_INF) < 1e-3;
+    bool passed = cv::norm(dst - Mat(d_dst), NORM_INF) < 1e-3;
    std::cout << "GPU #" << device_id << " (" << DeviceInfo().name() << "): "
        << (passed ? "passed" : "FAILED") << endl;