moved mulSpectrums, dft and convolve to gpuarithm

d569e72a · Vladislav Vinogradov · c56bdbc1 · d569e72a · d569e72a · d569e72a
Commit d569e72a authored Apr 09, 2013 by Vladislav Vinogradov
15 changed files
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -48,12 +48,6 @@ ocv_set_module_sources(
 ocv_create_module(${cuda_link_libs})
-if(HAVE_CUDA)
-  if(HAVE_CUFFT)
-    CUDA_ADD_CUFFT_TO_TARGET(${the_module})
-  endif()
-endif()
 ocv_add_precompiled_headers(${the_module})
 ################################################################################################################

--- a/modules/gpu/include/opencv2/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu.hpp
@@ -180,49 +180,6 @@ CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, Gp
 CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize,
    int borderType=BORDER_REFLECT101, Stream& stream = Stream::Null());
-//! performs per-element multiplication of two full (not packed) Fourier spectrums
-//! supports 32FC2 matrixes only (interleaved format)
-CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream = Stream::Null());
-//! performs per-element multiplication of two full (not packed) Fourier spectrums
-//! supports 32FC2 matrixes only (interleaved format)
-CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null());
-//! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.
-//! Param dft_size is the size of DFT transform.
-//!
-//! If the source matrix is not continous, then additional copy will be done,
-//! so to avoid copying ensure the source matrix is continous one. If you want to use
-//! preallocated output ensure it is continuous too, otherwise it will be reallocated.
-//!
-//! Being implemented via CUFFT real-to-complex transform result contains only non-redundant values
-//! in CUFFT's format. Result as full complex matrix for such kind of transform cannot be retrieved.
-//!
-//! For complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format.
-CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream = Stream::Null());
-struct CV_EXPORTS ConvolveBuf
-{
-    Size result_size;
-    Size block_size;
-    Size user_block_size;
-    Size dft_size;
-    int spect_len;
-    GpuMat image_spect, templ_spect, result_spect;
-    GpuMat image_block, templ_block, result_data;
-    void create(Size image_size, Size templ_size);
-    static Size estimateBlockSize(Size result_size, Size templ_size);
-};
-//! computes convolution (or cross-correlation) of two images using discrete Fourier transform
-//! supports source images of 32FC1 type only
-//! result matrix will have 32FC1 type
-CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr = false);
-CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null());
 struct CV_EXPORTS MatchTemplateBuf
 {
    Size user_block_size;

--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
@@ -718,54 +718,6 @@ PERF_TEST_P(Sz_Depth_Cn, ImgProc_BlendLinear,
    }
 }
-//////////////////////////////////////////////////////////////////////
-// Convolve
-DEF_PARAM_TEST(Sz_KernelSz_Ccorr, cv::Size, int, bool);
-PERF_TEST_P(Sz_KernelSz_Ccorr, ImgProc_Convolve,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(17, 27, 32, 64),
-                    Bool()))
-{
-    declare.time(10.0);
-    const cv::Size size = GET_PARAM(0);
-    const int templ_size = GET_PARAM(1);
-    const bool ccorr = GET_PARAM(2);
-    const cv::Mat image(size, CV_32FC1);
-    const cv::Mat templ(templ_size, templ_size, CV_32FC1);
-    declare.in(image, templ, WARMUP_RNG);
-    if (PERF_RUN_GPU())
-    {
-        cv::gpu::GpuMat d_image = cv::gpu::createContinuous(size, CV_32FC1);
-        d_image.upload(image);
-        cv::gpu::GpuMat d_templ = cv::gpu::createContinuous(templ_size, templ_size, CV_32FC1);
-        d_templ.upload(templ);
-        cv::gpu::GpuMat dst;
-        cv::gpu::ConvolveBuf d_buf;
-        TEST_CYCLE() cv::gpu::convolve(d_image, d_templ, dst, ccorr, d_buf);
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        if (ccorr)
-            FAIL_NO_CPU();
-        cv::Mat dst;
-        TEST_CYCLE() cv::filter2D(image, dst, image.depth(), templ);
-        CPU_SANITY_CHECK(dst);
-    }
-}
 ////////////////////////////////////////////////////////////////////////////////
 // MatchTemplate8U
@@ -846,108 +798,6 @@ PERF_TEST_P(Sz_TemplateSz_Cn_Method, ImgProc_MatchTemplate32F,
        TEST_CYCLE() cv::matchTemplate(image, templ, dst, method);
-        CPU_SANITY_CHECK(dst);
-    }
-};
-//////////////////////////////////////////////////////////////////////
-// MulSpectrums
-CV_FLAGS(DftFlags, 0, DFT_INVERSE, DFT_SCALE, DFT_ROWS, DFT_COMPLEX_OUTPUT, DFT_REAL_OUTPUT)
-DEF_PARAM_TEST(Sz_Flags, cv::Size, DftFlags);
-PERF_TEST_P(Sz_Flags, ImgProc_MulSpectrums,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(0, DftFlags(cv::DFT_ROWS))))
-{
-    const cv::Size size = GET_PARAM(0);
-    const int flag = GET_PARAM(1);
-    cv::Mat a(size, CV_32FC2);
-    cv::Mat b(size, CV_32FC2);
-    declare.in(a, b, WARMUP_RNG);
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_a(a);
-        const cv::gpu::GpuMat d_b(b);
-        cv::gpu::GpuMat dst;
-        TEST_CYCLE() cv::gpu::mulSpectrums(d_a, d_b, dst, flag);
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-        TEST_CYCLE() cv::mulSpectrums(a, b, dst, flag);
-        CPU_SANITY_CHECK(dst);
-    }
-}
-//////////////////////////////////////////////////////////////////////
-// MulAndScaleSpectrums
-PERF_TEST_P(Sz, ImgProc_MulAndScaleSpectrums,
-            GPU_TYPICAL_MAT_SIZES)
-{
-    const cv::Size size = GetParam();
-    const float scale = 1.f / size.area();
-    cv::Mat src1(size, CV_32FC2);
-    cv::Mat src2(size, CV_32FC2);
-    declare.in(src1,src2, WARMUP_RNG);
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src1(src1);
-        const cv::gpu::GpuMat d_src2(src2);
-        cv::gpu::GpuMat dst;
-        TEST_CYCLE() cv::gpu::mulAndScaleSpectrums(d_src1, d_src2, dst, cv::DFT_ROWS, scale, false);
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-//////////////////////////////////////////////////////////////////////
-// Dft
-PERF_TEST_P(Sz_Flags, ImgProc_Dft,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(0, DftFlags(cv::DFT_ROWS), DftFlags(cv::DFT_INVERSE))))
-{
-    declare.time(10.0);
-    const cv::Size size = GET_PARAM(0);
-    const int flag = GET_PARAM(1);
-    cv::Mat src(size, CV_32FC2);
-    declare.in(src, WARMUP_RNG);
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-        TEST_CYCLE() cv::gpu::dft(d_src, dst, size, flag);
-        GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
-    }
-    else
-    {
-        cv::Mat dst;
-        TEST_CYCLE() cv::dft(src, dst, flag);
        CPU_SANITY_CHECK(dst);
    }
 }

--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -582,119 +582,6 @@ namespace cv { namespace gpu { namespace cudev
                cudaSafeCall(cudaDeviceSynchronize());
        }
-        //////////////////////////////////////////////////////////////////////////
-        // mulSpectrums
-        __global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-            if (x < c.cols && y < c.rows)
-            {
-                c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
-            }
-        }
-        void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
-        {
-            dim3 threads(256);
-            dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
-            mulSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, c);
-            cudaSafeCall( cudaGetLastError() );
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-        //////////////////////////////////////////////////////////////////////////
-        // mulSpectrums_CONJ
-        __global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-            if (x < c.cols && y < c.rows)
-            {
-                c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
-            }
-        }
-        void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
-        {
-            dim3 threads(256);
-            dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
-            mulSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, c);
-            cudaSafeCall( cudaGetLastError() );
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-        //////////////////////////////////////////////////////////////////////////
-        // mulAndScaleSpectrums
-        __global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-            if (x < c.cols && y < c.rows)
-            {
-                cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
-                c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
-            }
-        }
-        void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
-        {
-            dim3 threads(256);
-            dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
-            mulAndScaleSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, scale, c);
-            cudaSafeCall( cudaGetLastError() );
-            if (stream)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-        //////////////////////////////////////////////////////////////////////////
-        // mulAndScaleSpectrums_CONJ
-        __global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-            if (x < c.cols && y < c.rows)
-            {
-                cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
-                c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
-            }
-        }
-        void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
-        {
-            dim3 threads(256);
-            dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
-            mulAndScaleSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, scale, c);
-            cudaSafeCall( cudaGetLastError() );
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
        //////////////////////////////////////////////////////////////////////////
        // buildWarpMaps

--- a/modules/gpu/src/cuda/safe_call.hpp
+++ b/modules/gpu/src/cuda/safe_call.hpp
@@ -45,21 +45,6 @@
 #include <cufft.h>
-#if defined(__GNUC__)
-    #define cufftSafeCall(expr)  ___cufftSafeCall(expr, __FILE__, __LINE__, __func__)
-#else /* defined(__CUDACC__) || defined(__MSVC__) */
-    #define cufftSafeCall(expr)  ___cufftSafeCall(expr, __FILE__, __LINE__)
-#endif
-namespace cv { namespace gpu
-{
-    void cufftError(int err, const char *file, const int line, const char *func = "");
-}}
-static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
-{
-    if (CUFFT_SUCCESS != err)
-        cv::gpu::cufftError(err, file, line, func);
-}
 #endif /* __OPENCV_CUDA_SAFE_CALL_HPP__ */
--- a/modules/gpu/src/error.cpp
+++ b/modules/gpu/src/error.cpp
@@ -43,65 +43,3 @@
 using namespace cv;
 using namespace cv::gpu;
-#ifdef HAVE_CUDA
-namespace
-{
-    #define error_entry(entry)  { entry, #entry }
-    struct ErrorEntry
-    {
-        int code;
-        const char* str;
-    };
-    struct ErrorEntryComparer
-    {
-        int code;
-        ErrorEntryComparer(int code_) : code(code_) {}
-        bool operator()(const ErrorEntry& e) const { return e.code == code; }
-    };
-    String getErrorString(int code, const ErrorEntry* errors, size_t n)
-    {
-        size_t idx = std::find_if(errors, errors + n, ErrorEntryComparer(code)) - errors;
-        const char* msg = (idx != n) ? errors[idx].str : "Unknown error code";
-        String str = cv::format("%s [Code = %d]", msg, code);
-        return str;
-    }
-    //////////////////////////////////////////////////////////////////////////
-    // CUFFT errors
-    const ErrorEntry cufft_errors[] =
-    {
-        error_entry( CUFFT_INVALID_PLAN ),
-        error_entry( CUFFT_ALLOC_FAILED ),
-        error_entry( CUFFT_INVALID_TYPE ),
-        error_entry( CUFFT_INVALID_VALUE ),
-        error_entry( CUFFT_INTERNAL_ERROR ),
-        error_entry( CUFFT_EXEC_FAILED ),
-        error_entry( CUFFT_SETUP_FAILED ),
-        error_entry( CUFFT_INVALID_SIZE ),
-        error_entry( CUFFT_UNALIGNED_DATA )
-    };
-    const int cufft_error_num = sizeof(cufft_errors) / sizeof(cufft_errors[0]);
-}
-namespace cv
-{
-    namespace gpu
-    {
-        void cufftError(int code, const char* file, const int line, const char* func)
-        {
-            String msg = getErrorString(code, cufft_errors, cufft_error_num);
-            cv::error(cv::Error::GpuApiCallError, msg, func, file, line);
-        }
-    }
-}
-#endif
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@@ -489,92 +489,6 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Blend, testing::Combine(
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
    WHOLE_SUBMAT));
-////////////////////////////////////////////////////////
-// Convolve
-namespace
-{
-    void convolveDFT(const cv::Mat& A, const cv::Mat& B, cv::Mat& C, bool ccorr = false)
-    {
-        // reallocate the output array if needed
-        C.create(std::abs(A.rows - B.rows) + 1, std::abs(A.cols - B.cols) + 1, A.type());
-        cv::Size dftSize;
-        // compute the size of DFT transform
-        dftSize.width = cv::getOptimalDFTSize(A.cols + B.cols - 1);
-        dftSize.height = cv::getOptimalDFTSize(A.rows + B.rows - 1);
-        // allocate temporary buffers and initialize them with 0s
-        cv::Mat tempA(dftSize, A.type(), cv::Scalar::all(0));
-        cv::Mat tempB(dftSize, B.type(), cv::Scalar::all(0));
-        // copy A and B to the top-left corners of tempA and tempB, respectively
-        cv::Mat roiA(tempA, cv::Rect(0, 0, A.cols, A.rows));
-        A.copyTo(roiA);
-        cv::Mat roiB(tempB, cv::Rect(0, 0, B.cols, B.rows));
-        B.copyTo(roiB);
-        // now transform the padded A & B in-place;
-        // use "nonzeroRows" hint for faster processing
-        cv::dft(tempA, tempA, 0, A.rows);
-        cv::dft(tempB, tempB, 0, B.rows);
-        // multiply the spectrums;
-        // the function handles packed spectrum representations well
-        cv::mulSpectrums(tempA, tempB, tempA, 0, ccorr);
-        // transform the product back from the frequency domain.
-        // Even though all the result rows will be non-zero,
-        // you need only the first C.rows of them, and thus you
-        // pass nonzeroRows == C.rows
-        cv::dft(tempA, tempA, cv::DFT_INVERSE + cv::DFT_SCALE, C.rows);
-        // now copy the result back to C.
-        tempA(cv::Rect(0, 0, C.cols, C.rows)).copyTo(C);
-    }
-    IMPLEMENT_PARAM_CLASS(KSize, int);
-    IMPLEMENT_PARAM_CLASS(Ccorr, bool);
-}
-PARAM_TEST_CASE(Convolve, cv::gpu::DeviceInfo, cv::Size, KSize, Ccorr)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int ksize;
-    bool ccorr;
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        ksize = GET_PARAM(2);
-        ccorr = GET_PARAM(3);
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-GPU_TEST_P(Convolve, Accuracy)
-{
-    cv::Mat src = randomMat(size, CV_32FC1, 0.0, 100.0);
-    cv::Mat kernel = randomMat(cv::Size(ksize, ksize), CV_32FC1, 0.0, 1.0);
-    cv::gpu::GpuMat dst;
-    cv::gpu::convolve(loadMat(src), loadMat(kernel), dst, ccorr);
-    cv::Mat dst_gold;
-    convolveDFT(src, kernel, dst_gold, ccorr);
-    EXPECT_MAT_NEAR(dst, dst_gold, 1e-1);
-}
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Convolve, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(KSize(3), KSize(7), KSize(11), KSize(17), KSize(19), KSize(23), KSize(45)),
-    testing::Values(Ccorr(false), Ccorr(true))));
 ////////////////////////////////////////////////////////////////////////////////
 // MatchTemplate8U
@@ -830,192 +744,6 @@ GPU_TEST_P(MatchTemplate_CanFindBigTemplate, SQDIFF)
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate_CanFindBigTemplate, ALL_DEVICES);
-////////////////////////////////////////////////////////////////////////////
-// MulSpectrums
-CV_FLAGS(DftFlags, 0, DFT_INVERSE, DFT_SCALE, DFT_ROWS, DFT_COMPLEX_OUTPUT, DFT_REAL_OUTPUT)
-PARAM_TEST_CASE(MulSpectrums, cv::gpu::DeviceInfo, cv::Size, DftFlags)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int flag;
-    cv::Mat a, b;
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        flag = GET_PARAM(2);
-        cv::gpu::setDevice(devInfo.deviceID());
-        a = randomMat(size, CV_32FC2);
-        b = randomMat(size, CV_32FC2);
-    }
-};
-GPU_TEST_P(MulSpectrums, Simple)
-{
-    cv::gpu::GpuMat c;
-    cv::gpu::mulSpectrums(loadMat(a), loadMat(b), c, flag, false);
-    cv::Mat c_gold;
-    cv::mulSpectrums(a, b, c_gold, flag, false);
-    EXPECT_MAT_NEAR(c_gold, c, 1e-2);
-}
-GPU_TEST_P(MulSpectrums, Scaled)
-{
-    float scale = 1.f / size.area();
-    cv::gpu::GpuMat c;
-    cv::gpu::mulAndScaleSpectrums(loadMat(a), loadMat(b), c, flag, scale, false);
-    cv::Mat c_gold;
-    cv::mulSpectrums(a, b, c_gold, flag, false);
-    c_gold.convertTo(c_gold, c_gold.type(), scale);
-    EXPECT_MAT_NEAR(c_gold, c, 1e-2);
-}
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MulSpectrums, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(DftFlags(0), DftFlags(cv::DFT_ROWS))));
-////////////////////////////////////////////////////////////////////////////
-// Dft
-struct Dft : testing::TestWithParam<cv::gpu::DeviceInfo>
-{
-    cv::gpu::DeviceInfo devInfo;
-    virtual void SetUp()
-    {
-        devInfo = GetParam();
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-namespace
-{
-    void testC2C(const std::string& hint, int cols, int rows, int flags, bool inplace)
-    {
-        SCOPED_TRACE(hint);
-        cv::Mat a = randomMat(cv::Size(cols, rows), CV_32FC2, 0.0, 10.0);
-        cv::Mat b_gold;
-        cv::dft(a, b_gold, flags);
-        cv::gpu::GpuMat d_b;
-        cv::gpu::GpuMat d_b_data;
-        if (inplace)
-        {
-            d_b_data.create(1, a.size().area(), CV_32FC2);
-            d_b = cv::gpu::GpuMat(a.rows, a.cols, CV_32FC2, d_b_data.ptr(), a.cols * d_b_data.elemSize());
-        }
-        cv::gpu::dft(loadMat(a), d_b, cv::Size(cols, rows), flags);
-        EXPECT_TRUE(!inplace || d_b.ptr() == d_b_data.ptr());
-        ASSERT_EQ(CV_32F, d_b.depth());
-        ASSERT_EQ(2, d_b.channels());
-        EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), rows * cols * 1e-4);
-    }
-}
-GPU_TEST_P(Dft, C2C)
-{
-    int cols = randomInt(2, 100);
-    int rows = randomInt(2, 100);
-    for (int i = 0; i < 2; ++i)
-    {
-        bool inplace = i != 0;
-        testC2C("no flags", cols, rows, 0, inplace);
-        testC2C("no flags 0 1", cols, rows + 1, 0, inplace);
-        testC2C("no flags 1 0", cols, rows + 1, 0, inplace);
-        testC2C("no flags 1 1", cols + 1, rows, 0, inplace);
-        testC2C("DFT_INVERSE", cols, rows, cv::DFT_INVERSE, inplace);
-        testC2C("DFT_ROWS", cols, rows, cv::DFT_ROWS, inplace);
-        testC2C("single col", 1, rows, 0, inplace);
-        testC2C("single row", cols, 1, 0, inplace);
-        testC2C("single col inversed", 1, rows, cv::DFT_INVERSE, inplace);
-        testC2C("single row inversed", cols, 1, cv::DFT_INVERSE, inplace);
-        testC2C("single row DFT_ROWS", cols, 1, cv::DFT_ROWS, inplace);
-        testC2C("size 1 2", 1, 2, 0, inplace);
-        testC2C("size 2 1", 2, 1, 0, inplace);
-    }
-}
-namespace
-{
-    void testR2CThenC2R(const std::string& hint, int cols, int rows, bool inplace)
-    {
-        SCOPED_TRACE(hint);
-        cv::Mat a = randomMat(cv::Size(cols, rows), CV_32FC1, 0.0, 10.0);
-        cv::gpu::GpuMat d_b, d_c;
-        cv::gpu::GpuMat d_b_data, d_c_data;
-        if (inplace)
-        {
-            if (a.cols == 1)
-            {
-                d_b_data.create(1, (a.rows / 2 + 1) * a.cols, CV_32FC2);
-                d_b = cv::gpu::GpuMat(a.rows / 2 + 1, a.cols, CV_32FC2, d_b_data.ptr(), a.cols * d_b_data.elemSize());
-            }
-            else
-            {
-                d_b_data.create(1, a.rows * (a.cols / 2 + 1), CV_32FC2);
-                d_b = cv::gpu::GpuMat(a.rows, a.cols / 2 + 1, CV_32FC2, d_b_data.ptr(), (a.cols / 2 + 1) * d_b_data.elemSize());
-            }
-            d_c_data.create(1, a.size().area(), CV_32F);
-            d_c = cv::gpu::GpuMat(a.rows, a.cols, CV_32F, d_c_data.ptr(), a.cols * d_c_data.elemSize());
-        }
-        cv::gpu::dft(loadMat(a), d_b, cv::Size(cols, rows), 0);
-        cv::gpu::dft(d_b, d_c, cv::Size(cols, rows), cv::DFT_REAL_OUTPUT | cv::DFT_SCALE);
-        EXPECT_TRUE(!inplace || d_b.ptr() == d_b_data.ptr());
-        EXPECT_TRUE(!inplace || d_c.ptr() == d_c_data.ptr());
-        ASSERT_EQ(CV_32F, d_c.depth());
-        ASSERT_EQ(1, d_c.channels());
-        cv::Mat c(d_c);
-        EXPECT_MAT_NEAR(a, c, rows * cols * 1e-5);
-    }
-}
-GPU_TEST_P(Dft, R2CThenC2R)
-{
-    int cols = randomInt(2, 100);
-    int rows = randomInt(2, 100);
-    testR2CThenC2R("sanity", cols, rows, false);
-    testR2CThenC2R("sanity 0 1", cols, rows + 1, false);
-    testR2CThenC2R("sanity 1 0", cols + 1, rows, false);
-    testR2CThenC2R("sanity 1 1", cols + 1, rows + 1, false);
-    testR2CThenC2R("single col", 1, rows, false);
-    testR2CThenC2R("single col 1", 1, rows + 1, false);
-    testR2CThenC2R("single row", cols, 1, false);
-    testR2CThenC2R("single row 1", cols + 1, 1, false);
-    testR2CThenC2R("sanity", cols, rows, true);
-    testR2CThenC2R("sanity 0 1", cols, rows + 1, true);
-    testR2CThenC2R("sanity 1 0", cols + 1, rows, true);
-    testR2CThenC2R("sanity 1 1", cols + 1, rows + 1, true);
-    testR2CThenC2R("single row", cols, 1, true);
-    testR2CThenC2R("single row 1", cols + 1, 1, true);
-}
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Dft, ALL_DEVICES);
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // CornerHarris

--- a/modules/gpuarithm/CMakeLists.txt
+++ b/modules/gpuarithm/CMakeLists.txt
@@ -11,3 +11,7 @@ ocv_define_module(gpuarithm opencv_core OPTIONAL opencv_gpunvidia opencv_imgproc
 if(HAVE_CUBLAS)
  CUDA_ADD_CUBLAS_TO_TARGET(${the_module})
 endif()
+if(HAVE_CUFFT)
+  CUDA_ADD_CUFFT_TO_TARGET(${the_module})
+endif()
--- a/modules/gpuarithm/include/opencv2/gpuarithm.hpp
+++ b/modules/gpuarithm/include/opencv2/gpuarithm.hpp
@@ -295,6 +295,49 @@ CV_EXPORTS void integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer,
 //! supports source images of 8UC1 type only
 CV_EXPORTS void sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& stream = Stream::Null());
+//! performs per-element multiplication of two full (not packed) Fourier spectrums
+//! supports 32FC2 matrixes only (interleaved format)
+CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream = Stream::Null());
+//! performs per-element multiplication of two full (not packed) Fourier spectrums
+//! supports 32FC2 matrixes only (interleaved format)
+CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null());
+//! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.
+//! Param dft_size is the size of DFT transform.
+//!
+//! If the source matrix is not continous, then additional copy will be done,
+//! so to avoid copying ensure the source matrix is continous one. If you want to use
+//! preallocated output ensure it is continuous too, otherwise it will be reallocated.
+//!
+//! Being implemented via CUFFT real-to-complex transform result contains only non-redundant values
+//! in CUFFT's format. Result as full complex matrix for such kind of transform cannot be retrieved.
+//!
+//! For complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format.
+CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream = Stream::Null());
+struct CV_EXPORTS ConvolveBuf
+{
+    Size result_size;
+    Size block_size;
+    Size user_block_size;
+    Size dft_size;
+    int spect_len;
+    GpuMat image_spect, templ_spect, result_spect;
+    GpuMat image_block, templ_block, result_data;
+    void create(Size image_size, Size templ_size);
+    static Size estimateBlockSize(Size result_size, Size templ_size);
+};
+//! computes convolution (or cross-correlation) of two images using discrete Fourier transform
+//! supports source images of 32FC1 type only
+//! result matrix will have 32FC1 type
+CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr = false);
+CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null());
 }} // namespace cv { namespace gpu {
 #endif /* __OPENCV_GPUARITHM_HPP__ */
--- a/modules/gpuarithm/perf/perf_core.cpp
+++ b/modules/gpuarithm/perf/perf_core.cpp
@@ -2156,6 +2156,108 @@ PERF_TEST_P(Sz_Depth_NormType, Core_Normalize,
    }
 }
+//////////////////////////////////////////////////////////////////////
+// MulSpectrums
+CV_FLAGS(DftFlags, 0, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX_OUTPUT, cv::DFT_REAL_OUTPUT)
+DEF_PARAM_TEST(Sz_Flags, cv::Size, DftFlags);
+PERF_TEST_P(Sz_Flags, ImgProc_MulSpectrums,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(0, DftFlags(cv::DFT_ROWS))))
+{
+    const cv::Size size = GET_PARAM(0);
+    const int flag = GET_PARAM(1);
+    cv::Mat a(size, CV_32FC2);
+    cv::Mat b(size, CV_32FC2);
+    declare.in(a, b, WARMUP_RNG);
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_a(a);
+        const cv::gpu::GpuMat d_b(b);
+        cv::gpu::GpuMat dst;
+        TEST_CYCLE() cv::gpu::mulSpectrums(d_a, d_b, dst, flag);
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+        TEST_CYCLE() cv::mulSpectrums(a, b, dst, flag);
+        CPU_SANITY_CHECK(dst);
+    }
+}
+//////////////////////////////////////////////////////////////////////
+// MulAndScaleSpectrums
+PERF_TEST_P(Sz, ImgProc_MulAndScaleSpectrums,
+            GPU_TYPICAL_MAT_SIZES)
+{
+    const cv::Size size = GetParam();
+    const float scale = 1.f / size.area();
+    cv::Mat src1(size, CV_32FC2);
+    cv::Mat src2(size, CV_32FC2);
+    declare.in(src1,src2, WARMUP_RNG);
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src1(src1);
+        const cv::gpu::GpuMat d_src2(src2);
+        cv::gpu::GpuMat dst;
+        TEST_CYCLE() cv::gpu::mulAndScaleSpectrums(d_src1, d_src2, dst, cv::DFT_ROWS, scale, false);
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        FAIL_NO_CPU();
+    }
+}
+//////////////////////////////////////////////////////////////////////
+// Dft
+PERF_TEST_P(Sz_Flags, ImgProc_Dft,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(0, DftFlags(cv::DFT_ROWS), DftFlags(cv::DFT_INVERSE))))
+{
+    declare.time(10.0);
+    const cv::Size size = GET_PARAM(0);
+    const int flag = GET_PARAM(1);
+    cv::Mat src(size, CV_32FC2);
+    declare.in(src, WARMUP_RNG);
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+        TEST_CYCLE() cv::gpu::dft(d_src, dst, size, flag);
+        GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+    }
+    else
+    {
+        cv::Mat dst;
+        TEST_CYCLE() cv::dft(src, dst, flag);
+        CPU_SANITY_CHECK(dst);
+    }
+}
 #ifdef HAVE_OPENCV_IMGPROC
 //////////////////////////////////////////////////////////////////////
@@ -2255,4 +2357,52 @@ PERF_TEST_P(Sz, ImgProc_IntegralSqr,
    }
 }
+//////////////////////////////////////////////////////////////////////
+// Convolve
+DEF_PARAM_TEST(Sz_KernelSz_Ccorr, cv::Size, int, bool);
+PERF_TEST_P(Sz_KernelSz_Ccorr, ImgProc_Convolve,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(17, 27, 32, 64),
+                    Bool()))
+{
+    declare.time(10.0);
+    const cv::Size size = GET_PARAM(0);
+    const int templ_size = GET_PARAM(1);
+    const bool ccorr = GET_PARAM(2);
+    const cv::Mat image(size, CV_32FC1);
+    const cv::Mat templ(templ_size, templ_size, CV_32FC1);
+    declare.in(image, templ, WARMUP_RNG);
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat d_image = cv::gpu::createContinuous(size, CV_32FC1);
+        d_image.upload(image);
+        cv::gpu::GpuMat d_templ = cv::gpu::createContinuous(templ_size, templ_size, CV_32FC1);
+        d_templ.upload(templ);
+        cv::gpu::GpuMat dst;
+        cv::gpu::ConvolveBuf d_buf;
+        TEST_CYCLE() cv::gpu::convolve(d_image, d_templ, dst, ccorr, d_buf);
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        if (ccorr)
+            FAIL_NO_CPU();
+        cv::Mat dst;
+        TEST_CYCLE() cv::filter2D(image, dst, image.depth(), templ);
+        CPU_SANITY_CHECK(dst);
+    }
+}
 #endif
--- a/modules/gpuarithm/src/arithm.cpp
+++ b/modules/gpuarithm/src/arithm.cpp
--- a/modules/gpuarithm/src/cuda/mul_spectrums.cu
+++ b/modules/gpuarithm/src/cuda/mul_spectrums.cu
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#if !defined CUDA_DISABLER
+#include "cvconfig.h"
+#ifdef HAVE_CUFFT
+#include <cufft.h>
+#include "opencv2/core/cuda/common.hpp"
+namespace cv { namespace gpu { namespace cudev
+{
+    //////////////////////////////////////////////////////////////////////////
+    // mulSpectrums
+    __global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        if (x < c.cols && y < c.rows)
+        {
+            c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
+        }
+    }
+    void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
+    {
+        dim3 threads(256);
+        dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
+        mulSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, c);
+        cudaSafeCall( cudaGetLastError() );
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+    //////////////////////////////////////////////////////////////////////////
+    // mulSpectrums_CONJ
+    __global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        if (x < c.cols && y < c.rows)
+        {
+            c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
+        }
+    }
+    void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
+    {
+        dim3 threads(256);
+        dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
+        mulSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, c);
+        cudaSafeCall( cudaGetLastError() );
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+    //////////////////////////////////////////////////////////////////////////
+    // mulAndScaleSpectrums
+    __global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        if (x < c.cols && y < c.rows)
+        {
+            cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
+            c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
+        }
+    }
+    void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
+    {
+        dim3 threads(256);
+        dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
+        mulAndScaleSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, scale, c);
+        cudaSafeCall( cudaGetLastError() );
+        if (stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+    //////////////////////////////////////////////////////////////////////////
+    // mulAndScaleSpectrums_CONJ
+    __global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        if (x < c.cols && y < c.rows)
+        {
+            cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
+            c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
+        }
+    }
+    void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
+    {
+        dim3 threads(256);
+        dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
+        mulAndScaleSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, scale, c);
+        cudaSafeCall( cudaGetLastError() );
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}}} // namespace cv { namespace gpu { namespace cudev
+#endif // HAVE_CUFFT
+#endif /* CUDA_DISABLER */
--- a/modules/gpuarithm/src/precomp.hpp
+++ b/modules/gpuarithm/src/precomp.hpp
@@ -59,7 +59,11 @@
 #endif
 #ifdef HAVE_CUBLAS
-    #include <cublas.h>
+#  include <cublas.h>
+#endif
+#ifdef HAVE_CUFFT
+#  include <cufft.h>
 #endif
 #endif /* __OPENCV_PRECOMP_H__ */
--- a/modules/gpuarithm/test/test_core.cpp
+++ b/modules/gpuarithm/test/test_core.cpp
@@ -3607,6 +3607,278 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Normalize, testing::Combine(
    testing::Values(NormCode(cv::NORM_L1), NormCode(cv::NORM_L2), NormCode(cv::NORM_INF), NormCode(cv::NORM_MINMAX)),
    WHOLE_SUBMAT));
+////////////////////////////////////////////////////////////////////////////
+// MulSpectrums
+CV_FLAGS(DftFlags, 0, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX_OUTPUT, cv::DFT_REAL_OUTPUT)
+PARAM_TEST_CASE(MulSpectrums, cv::gpu::DeviceInfo, cv::Size, DftFlags)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int flag;
+    cv::Mat a, b;
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        flag = GET_PARAM(2);
+        cv::gpu::setDevice(devInfo.deviceID());
+        a = randomMat(size, CV_32FC2);
+        b = randomMat(size, CV_32FC2);
+    }
+};
+GPU_TEST_P(MulSpectrums, Simple)
+{
+    cv::gpu::GpuMat c;
+    cv::gpu::mulSpectrums(loadMat(a), loadMat(b), c, flag, false);
+    cv::Mat c_gold;
+    cv::mulSpectrums(a, b, c_gold, flag, false);
+    EXPECT_MAT_NEAR(c_gold, c, 1e-2);
+}
+GPU_TEST_P(MulSpectrums, Scaled)
+{
+    float scale = 1.f / size.area();
+    cv::gpu::GpuMat c;
+    cv::gpu::mulAndScaleSpectrums(loadMat(a), loadMat(b), c, flag, scale, false);
+    cv::Mat c_gold;
+    cv::mulSpectrums(a, b, c_gold, flag, false);
+    c_gold.convertTo(c_gold, c_gold.type(), scale);
+    EXPECT_MAT_NEAR(c_gold, c, 1e-2);
+}
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MulSpectrums, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(DftFlags(0), DftFlags(cv::DFT_ROWS))));
+////////////////////////////////////////////////////////////////////////////
+// Dft
+struct Dft : testing::TestWithParam<cv::gpu::DeviceInfo>
+{
+    cv::gpu::DeviceInfo devInfo;
+    virtual void SetUp()
+    {
+        devInfo = GetParam();
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+namespace
+{
+    void testC2C(const std::string& hint, int cols, int rows, int flags, bool inplace)
+    {
+        SCOPED_TRACE(hint);
+        cv::Mat a = randomMat(cv::Size(cols, rows), CV_32FC2, 0.0, 10.0);
+        cv::Mat b_gold;
+        cv::dft(a, b_gold, flags);
+        cv::gpu::GpuMat d_b;
+        cv::gpu::GpuMat d_b_data;
+        if (inplace)
+        {
+            d_b_data.create(1, a.size().area(), CV_32FC2);
+            d_b = cv::gpu::GpuMat(a.rows, a.cols, CV_32FC2, d_b_data.ptr(), a.cols * d_b_data.elemSize());
+        }
+        cv::gpu::dft(loadMat(a), d_b, cv::Size(cols, rows), flags);
+        EXPECT_TRUE(!inplace || d_b.ptr() == d_b_data.ptr());
+        ASSERT_EQ(CV_32F, d_b.depth());
+        ASSERT_EQ(2, d_b.channels());
+        EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), rows * cols * 1e-4);
+    }
+}
+GPU_TEST_P(Dft, C2C)
+{
+    int cols = randomInt(2, 100);
+    int rows = randomInt(2, 100);
+    for (int i = 0; i < 2; ++i)
+    {
+        bool inplace = i != 0;
+        testC2C("no flags", cols, rows, 0, inplace);
+        testC2C("no flags 0 1", cols, rows + 1, 0, inplace);
+        testC2C("no flags 1 0", cols, rows + 1, 0, inplace);
+        testC2C("no flags 1 1", cols + 1, rows, 0, inplace);
+        testC2C("DFT_INVERSE", cols, rows, cv::DFT_INVERSE, inplace);
+        testC2C("DFT_ROWS", cols, rows, cv::DFT_ROWS, inplace);
+        testC2C("single col", 1, rows, 0, inplace);
+        testC2C("single row", cols, 1, 0, inplace);
+        testC2C("single col inversed", 1, rows, cv::DFT_INVERSE, inplace);
+        testC2C("single row inversed", cols, 1, cv::DFT_INVERSE, inplace);
+        testC2C("single row DFT_ROWS", cols, 1, cv::DFT_ROWS, inplace);
+        testC2C("size 1 2", 1, 2, 0, inplace);
+        testC2C("size 2 1", 2, 1, 0, inplace);
+    }
+}
+namespace
+{
+    void testR2CThenC2R(const std::string& hint, int cols, int rows, bool inplace)
+    {
+        SCOPED_TRACE(hint);
+        cv::Mat a = randomMat(cv::Size(cols, rows), CV_32FC1, 0.0, 10.0);
+        cv::gpu::GpuMat d_b, d_c;
+        cv::gpu::GpuMat d_b_data, d_c_data;
+        if (inplace)
+        {
+            if (a.cols == 1)
+            {
+                d_b_data.create(1, (a.rows / 2 + 1) * a.cols, CV_32FC2);
+                d_b = cv::gpu::GpuMat(a.rows / 2 + 1, a.cols, CV_32FC2, d_b_data.ptr(), a.cols * d_b_data.elemSize());
+            }
+            else
+            {
+                d_b_data.create(1, a.rows * (a.cols / 2 + 1), CV_32FC2);
+                d_b = cv::gpu::GpuMat(a.rows, a.cols / 2 + 1, CV_32FC2, d_b_data.ptr(), (a.cols / 2 + 1) * d_b_data.elemSize());
+            }
+            d_c_data.create(1, a.size().area(), CV_32F);
+            d_c = cv::gpu::GpuMat(a.rows, a.cols, CV_32F, d_c_data.ptr(), a.cols * d_c_data.elemSize());
+        }
+        cv::gpu::dft(loadMat(a), d_b, cv::Size(cols, rows), 0);
+        cv::gpu::dft(d_b, d_c, cv::Size(cols, rows), cv::DFT_REAL_OUTPUT | cv::DFT_SCALE);
+        EXPECT_TRUE(!inplace || d_b.ptr() == d_b_data.ptr());
+        EXPECT_TRUE(!inplace || d_c.ptr() == d_c_data.ptr());
+        ASSERT_EQ(CV_32F, d_c.depth());
+        ASSERT_EQ(1, d_c.channels());
+        cv::Mat c(d_c);
+        EXPECT_MAT_NEAR(a, c, rows * cols * 1e-5);
+    }
+}
+GPU_TEST_P(Dft, R2CThenC2R)
+{
+    int cols = randomInt(2, 100);
+    int rows = randomInt(2, 100);
+    testR2CThenC2R("sanity", cols, rows, false);
+    testR2CThenC2R("sanity 0 1", cols, rows + 1, false);
+    testR2CThenC2R("sanity 1 0", cols + 1, rows, false);
+    testR2CThenC2R("sanity 1 1", cols + 1, rows + 1, false);
+    testR2CThenC2R("single col", 1, rows, false);
+    testR2CThenC2R("single col 1", 1, rows + 1, false);
+    testR2CThenC2R("single row", cols, 1, false);
+    testR2CThenC2R("single row 1", cols + 1, 1, false);
+    testR2CThenC2R("sanity", cols, rows, true);
+    testR2CThenC2R("sanity 0 1", cols, rows + 1, true);
+    testR2CThenC2R("sanity 1 0", cols + 1, rows, true);
+    testR2CThenC2R("sanity 1 1", cols + 1, rows + 1, true);
+    testR2CThenC2R("single row", cols, 1, true);
+    testR2CThenC2R("single row 1", cols + 1, 1, true);
+}
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Dft, ALL_DEVICES);
+////////////////////////////////////////////////////////
+// Convolve
+namespace
+{
+    void convolveDFT(const cv::Mat& A, const cv::Mat& B, cv::Mat& C, bool ccorr = false)
+    {
+        // reallocate the output array if needed
+        C.create(std::abs(A.rows - B.rows) + 1, std::abs(A.cols - B.cols) + 1, A.type());
+        cv::Size dftSize;
+        // compute the size of DFT transform
+        dftSize.width = cv::getOptimalDFTSize(A.cols + B.cols - 1);
+        dftSize.height = cv::getOptimalDFTSize(A.rows + B.rows - 1);
+        // allocate temporary buffers and initialize them with 0s
+        cv::Mat tempA(dftSize, A.type(), cv::Scalar::all(0));
+        cv::Mat tempB(dftSize, B.type(), cv::Scalar::all(0));
+        // copy A and B to the top-left corners of tempA and tempB, respectively
+        cv::Mat roiA(tempA, cv::Rect(0, 0, A.cols, A.rows));
+        A.copyTo(roiA);
+        cv::Mat roiB(tempB, cv::Rect(0, 0, B.cols, B.rows));
+        B.copyTo(roiB);
+        // now transform the padded A & B in-place;
+        // use "nonzeroRows" hint for faster processing
+        cv::dft(tempA, tempA, 0, A.rows);
+        cv::dft(tempB, tempB, 0, B.rows);
+        // multiply the spectrums;
+        // the function handles packed spectrum representations well
+        cv::mulSpectrums(tempA, tempB, tempA, 0, ccorr);
+        // transform the product back from the frequency domain.
+        // Even though all the result rows will be non-zero,
+        // you need only the first C.rows of them, and thus you
+        // pass nonzeroRows == C.rows
+        cv::dft(tempA, tempA, cv::DFT_INVERSE + cv::DFT_SCALE, C.rows);
+        // now copy the result back to C.
+        tempA(cv::Rect(0, 0, C.cols, C.rows)).copyTo(C);
+    }
+    IMPLEMENT_PARAM_CLASS(KSize, int)
+    IMPLEMENT_PARAM_CLASS(Ccorr, bool)
+}
+PARAM_TEST_CASE(Convolve, cv::gpu::DeviceInfo, cv::Size, KSize, Ccorr)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int ksize;
+    bool ccorr;
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        ksize = GET_PARAM(2);
+        ccorr = GET_PARAM(3);
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+GPU_TEST_P(Convolve, Accuracy)
+{
+    cv::Mat src = randomMat(size, CV_32FC1, 0.0, 100.0);
+    cv::Mat kernel = randomMat(cv::Size(ksize, ksize), CV_32FC1, 0.0, 1.0);
+    cv::gpu::GpuMat dst;
+    cv::gpu::convolve(loadMat(src), loadMat(kernel), dst, ccorr);
+    cv::Mat dst_gold;
+    convolveDFT(src, kernel, dst_gold, ccorr);
+    EXPECT_MAT_NEAR(dst, dst_gold, 1e-1);
+}
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Convolve, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(KSize(3), KSize(7), KSize(11), KSize(17), KSize(19), KSize(23), KSize(45)),
+    testing::Values(Ccorr(false), Ccorr(true))));
 #ifdef HAVE_OPENCV_IMGPROC
 //////////////////////////////////////////////////////////////////////////////