/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"

using namespace cv;
using namespace cv::gpu;


#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)

Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>&, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int, GpuMat&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }
Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }
Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createBoxFilter_GPU(int, int, const Size&, const Point&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, GpuMat&, const Point&, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int, int, const Mat&, Point, int) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int, int, const Mat&, Point, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }
Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, GpuMat&, const Point&, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int, GpuMat&, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, double, double, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, GpuMat&, double, double, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }

void cv::gpu::boxFilter(const GpuMat&, GpuMat&, int, Size, Point, Stream&) { throw_nogpu(); }
void cv::gpu::erode(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); }
void cv::gpu::erode(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_nogpu(); }
void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); }
void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_nogpu(); }
void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_nogpu(); }
void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, GpuMat&, GpuMat&, Point, int, Stream&) { throw_nogpu(); }
void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point, int, Stream&) { throw_nogpu(); }
void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int) { throw_nogpu(); }
void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, GpuMat&, Point, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int) { throw_nogpu(); }
void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, GpuMat&, int, double, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double, int, int) { throw_nogpu(); }
void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, GpuMat&, double, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double, int, int) { throw_nogpu(); }
void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, GpuMat&, double, double, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::Laplacian(const GpuMat&, GpuMat&, int, int, double, int, Stream&) { throw_nogpu(); }

#else

namespace
{
    inline void normalizeAnchor(int& anchor, int ksize)
    {
        if (anchor < 0)
            anchor = ksize >> 1;

        CV_Assert(0 <= anchor && anchor < ksize);
    }

    inline void normalizeAnchor(Point& anchor, const Size& ksize)
    {
        normalizeAnchor(anchor.x, ksize.width);
        normalizeAnchor(anchor.y, ksize.height);
    }

    inline void normalizeROI(Rect& roi, const Size& ksize, const Point& anchor, const Size& src_size)
    {
        if (roi == Rect(0,0,-1,-1))
            roi = Rect(anchor.x, anchor.y, src_size.width - ksize.width, src_size.height - ksize.height);

        CV_Assert(roi.x >= 0 && roi.y >= 0 && roi.width <= src_size.width && roi.height <= src_size.height);
    }

    inline void normalizeKernel(const Mat& kernel, GpuMat& gpu_krnl, int type = CV_8U, int* nDivisor = 0, bool reverse = false)
    {
        int scale = nDivisor && (kernel.depth() == CV_32F || kernel.depth() == CV_64F) ? 256 : 1;
        if (nDivisor) *nDivisor = scale;

        Mat temp(kernel.size(), type);
        kernel.convertTo(temp, type, scale);
        Mat cont_krnl = temp.reshape(1, 1);

        if (reverse)
        {
            int count = cont_krnl.cols >> 1;
            for (int i = 0; i < count; ++i)
            {
                std::swap(cont_krnl.at<int>(0, i), cont_krnl.at<int>(0, cont_krnl.cols - 1 - i));
            }
        }

        gpu_krnl.upload(cont_krnl);
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// Filter2D

namespace
{
    struct Filter2DEngine_GPU : public FilterEngine_GPU
    {
        Filter2DEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int srcType_, int dstType_) :
            filter2D(filter2D_), srcType(srcType_), dstType(dstType_)
        {}

        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
        {
            CV_Assert(src.type() == srcType);

            Size src_size = src.size();

            dst.create(src_size, dstType);

            if (roi.size() != src_size)
            {
                if (stream)
                    stream.enqueueMemSet(dst, Scalar::all(0));
                else
                    dst.setTo(Scalar::all(0));
            }

            normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);

            GpuMat srcROI = src(roi);
            GpuMat dstROI = dst(roi);

            (*filter2D)(srcROI, dstROI, stream);
        }

        Ptr<BaseFilter_GPU> filter2D;
        int srcType, dstType;
    };
}

Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType)
{
    return Ptr<FilterEngine_GPU>(new Filter2DEngine_GPU(filter2D, srcType, dstType));
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// SeparableFilter

namespace
{
    struct SeparableFilterEngine_GPU : public FilterEngine_GPU
    {
        SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, const Ptr<BaseColumnFilter_GPU>& columnFilter_,
                                  int srcType_, int bufType_, int dstType_) :
            rowFilter(rowFilter_), columnFilter(columnFilter_),
            srcType(srcType_), bufType(bufType_), dstType(dstType_)
        {
            ksize = Size(rowFilter->ksize, columnFilter->ksize);
            anchor = Point(rowFilter->anchor, columnFilter->anchor);

            pbuf = &buf;
        }

        SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, const Ptr<BaseColumnFilter_GPU>& columnFilter_,
                                  int srcType_, int bufType_, int dstType_,
                                  GpuMat& buf_) :
            rowFilter(rowFilter_), columnFilter(columnFilter_),
            srcType(srcType_), bufType(bufType_), dstType(dstType_)
        {
            ksize = Size(rowFilter->ksize, columnFilter->ksize);
            anchor = Point(rowFilter->anchor, columnFilter->anchor);

            pbuf = &buf_;
        }

        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
        {
            CV_Assert(src.type() == srcType);

            Size src_size = src.size();

            dst.create(src_size, dstType);

            if (roi.size() != src_size)
            {
                if (stream)
                    stream.enqueueMemSet(dst, Scalar::all(0));
                else
                    dst.setTo(Scalar::all(0));
            }

            ensureSizeIsEnough(src_size, bufType, *pbuf);

            normalizeROI(roi, ksize, anchor, src_size);

            GpuMat srcROI = src(roi);
            GpuMat dstROI = dst(roi);
            GpuMat bufROI = (*pbuf)(roi);

            (*rowFilter)(srcROI, bufROI, stream);
            (*columnFilter)(bufROI, dstROI, stream);
        }

        Ptr<BaseRowFilter_GPU> rowFilter;
        Ptr<BaseColumnFilter_GPU> columnFilter;

        int srcType, bufType, dstType;

        Size ksize;
        Point anchor;

        GpuMat buf;
        GpuMat* pbuf;
    };
}

Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType)
{
    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType));
}

Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf)
{
    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType, buf));
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// 1D Sum Filter

namespace
{
    struct NppRowSumFilter : public BaseRowFilter_GPU
    {
        NppRowSumFilter(int ksize_, int anchor_) : BaseRowFilter_GPU(ksize_, anchor_) {}

        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            cudaStream_t stream = StreamAccessor::getStream(s);

            NppStreamHandler h(stream);

            nppSafeCall( nppiSumWindowRow_8u32f_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
                dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, ksize, anchor) );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
}

Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor)
{
    CV_Assert(srcType == CV_8UC1 && sumType == CV_32FC1);

    normalizeAnchor(anchor, ksize);

    return Ptr<BaseRowFilter_GPU>(new NppRowSumFilter(ksize, anchor));
}

namespace
{
    struct NppColumnSumFilter : public BaseColumnFilter_GPU
    {
        NppColumnSumFilter(int ksize_, int anchor_) : BaseColumnFilter_GPU(ksize_, anchor_) {}

        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            cudaStream_t stream = StreamAccessor::getStream(s);

            NppStreamHandler h(stream);

            nppSafeCall( nppiSumWindowColumn_8u32f_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
                dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, ksize, anchor) );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
}

Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor)
{
    CV_Assert(sumType == CV_8UC1 && dstType == CV_32FC1);

    normalizeAnchor(anchor, ksize);

    return Ptr<BaseColumnFilter_GPU>(new NppColumnSumFilter(ksize, anchor));
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// Box Filter

namespace
{
    typedef NppStatus (*nppFilterBox_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
        NppiSize oMaskSize, NppiPoint oAnchor);

    struct NPPBoxFilter : public BaseFilter_GPU
    {
        NPPBoxFilter(const Size& ksize_, const Point& anchor_, nppFilterBox_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {}

        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;
            NppiSize oKernelSize;
            oKernelSize.height = ksize.height;
            oKernelSize.width = ksize.width;
            NppiPoint oAnchor;
            oAnchor.x = anchor.x;
            oAnchor.y = anchor.y;

            cudaStream_t stream = StreamAccessor::getStream(s);

            NppStreamHandler h(stream);

            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step),
                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, oKernelSize, oAnchor) );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        nppFilterBox_t func;
    };
}

Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor)
{
    static const nppFilterBox_t nppFilterBox_callers[] = {0, nppiFilterBox_8u_C1R, 0, 0, nppiFilterBox_8u_C4R};

    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);

    normalizeAnchor(anchor, ksize);

    return Ptr<BaseFilter_GPU>(new NPPBoxFilter(ksize, anchor, nppFilterBox_callers[CV_MAT_CN(srcType)]));
}

Ptr<FilterEngine_GPU> cv::gpu::createBoxFilter_GPU(int srcType, int dstType, const Size& ksize, const Point& anchor)
{
    Ptr<BaseFilter_GPU> boxFilter = getBoxFilter_GPU(srcType, dstType, ksize, anchor);
    return createFilter2D_GPU(boxFilter, srcType, dstType);
}

void cv::gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor, Stream& stream)
{
    int sdepth = src.depth(), cn = src.channels();
    if( ddepth < 0 )
        ddepth = sdepth;

    dst.create(src.size(), CV_MAKETYPE(ddepth, cn));

    Ptr<FilterEngine_GPU> f = createBoxFilter_GPU(src.type(), dst.type(), ksize, anchor);
    f->apply(src, dst, Rect(0,0,-1,-1), stream);
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// Morphology Filter

namespace
{
    typedef NppStatus (*nppMorfFilter_t)(const Npp8u*, Npp32s, Npp8u*, Npp32s, NppiSize, const Npp8u*, NppiSize, NppiPoint);

    struct NPPMorphFilter : public BaseFilter_GPU
    {
        NPPMorphFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, nppMorfFilter_t func_) :
            BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {}

        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;
            NppiSize oKernelSize;
            oKernelSize.height = ksize.height;
            oKernelSize.width = ksize.width;
            NppiPoint oAnchor;
            oAnchor.x = anchor.x;
            oAnchor.y = anchor.y;

            cudaStream_t stream = StreamAccessor::getStream(s);

            NppStreamHandler h(stream);

            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step),
                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, kernel.ptr<Npp8u>(), oKernelSize, oAnchor) );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        GpuMat kernel;
        nppMorfFilter_t func;
    };
}

Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize, Point anchor)
{
    static const nppMorfFilter_t nppMorfFilter_callers[2][5] =
    {
        {0, nppiErode_8u_C1R, 0, 0, nppiErode_8u_C4R },
        {0, nppiDilate_8u_C1R, 0, 0, nppiDilate_8u_C4R }
    };

    CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE);
    CV_Assert(type == CV_8UC1 || type == CV_8UC4);

    GpuMat gpu_krnl;
    normalizeKernel(kernel, gpu_krnl);
    normalizeAnchor(anchor, ksize);

    return Ptr<BaseFilter_GPU>(new NPPMorphFilter(ksize, anchor, gpu_krnl, nppMorfFilter_callers[op][CV_MAT_CN(type)]));
}

namespace
{
    struct MorphologyFilterEngine_GPU : public FilterEngine_GPU
    {
        MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type_, int iters_) :
            filter2D(filter2D_), type(type_), iters(iters_)
        {
            pbuf = &buf;
        }

        MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type_, int iters_, GpuMat& buf_) :
            filter2D(filter2D_), type(type_), iters(iters_)
        {
            pbuf = &buf_;
        }

        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
        {
            CV_Assert(src.type() == type);

            Size src_size = src.size();

            dst.create(src_size, type);

            if (roi.size() != src_size)
            {
                if (stream)
                    stream.enqueueMemSet(dst, Scalar::all(0));
                else
                    dst.setTo(Scalar::all(0));
            }

            normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);

            if (iters > 1)
                pbuf->create(src_size, type);

            GpuMat srcROI = src(roi);
            GpuMat dstROI = dst(roi);

            (*filter2D)(srcROI, dstROI, stream);

            for(int i = 1; i < iters; ++i)
            {
                dst.swap((*pbuf));

                dstROI = dst(roi);
                GpuMat bufROI = (*pbuf)(roi);

                (*filter2D)(bufROI, dstROI, stream);
            }
        }

        Ptr<BaseFilter_GPU> filter2D;

        int type;
        int iters;

        GpuMat buf;
        GpuMat* pbuf;
    };
}

Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Point& anchor, int iterations)
{
    CV_Assert(iterations > 0);

    Size ksize = kernel.size();

    Ptr<BaseFilter_GPU> filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor);

    return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, type, iterations));
}

Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf, const Point& anchor, int iterations)
{
    CV_Assert(iterations > 0);

    Size ksize = kernel.size();

    Ptr<BaseFilter_GPU> filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor);

    return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, type, iterations, buf));
}

namespace
{
    void morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream = Stream::Null())
    {
        Mat kernel;
        Size ksize = _kernel.data ? _kernel.size() : Size(3, 3);

        normalizeAnchor(anchor, ksize);

        if (iterations == 0 || _kernel.rows * _kernel.cols == 1)
        {
            if (stream)
                stream.enqueueCopy(src, dst);
            else
                src.copyTo(dst);
            return;
        }

        dst.create(src.size(), src.type());

        if (!_kernel.data)
        {
            kernel = getStructuringElement(MORPH_RECT, Size(1 + iterations * 2, 1 + iterations * 2));
            anchor = Point(iterations, iterations);
            iterations = 1;
        }
        else if (iterations > 1 && countNonZero(_kernel) == _kernel.rows * _kernel.cols)
        {
            anchor = Point(anchor.x * iterations, anchor.y * iterations);
            kernel = getStructuringElement(MORPH_RECT,
                                           Size(ksize.width + (iterations - 1) * (ksize.width - 1),
                                                ksize.height + (iterations - 1) * (ksize.height - 1)),
                                           anchor);
            iterations = 1;
        }
        else
            kernel = _kernel;

        Ptr<FilterEngine_GPU> f = createMorphologyFilter_GPU(op, src.type(), kernel, buf, anchor, iterations);

        f->apply(src, dst, Rect(0,0,-1,-1), stream);
    }

    void morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, Point anchor, int iterations)
    {
        GpuMat buf;
        morphOp(op, src, dst, _kernel, buf, anchor, iterations);
    }
}

void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations)
{
    morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations);
}

void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream)
{
    morphOp(MORPH_ERODE, src, dst, kernel, buf, anchor, iterations, stream);
}

void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations)
{
    morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations);
}

void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream)
{
    morphOp(MORPH_DILATE, src, dst, kernel, buf, anchor, iterations, stream);
}

void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor, int iterations)
{
    GpuMat buf1;
    GpuMat buf2;
    morphologyEx(src, dst, op, kernel, buf1, buf2, anchor, iterations);
}

void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, Point anchor, int iterations, Stream& stream)
{
    switch( op )
    {
    case MORPH_ERODE:   erode(src, dst, kernel, buf1, anchor, iterations, stream); break;
    case MORPH_DILATE: dilate(src, dst, kernel, buf1, anchor, iterations, stream); break;
    case MORPH_OPEN:
        erode(src, buf2, kernel, buf1, anchor, iterations, stream);
        dilate(buf2, dst, kernel, buf1, anchor, iterations, stream);
        break;
    case CV_MOP_CLOSE:
        dilate(src, buf2, kernel, buf1, anchor, iterations, stream);
        erode(buf2, dst, kernel, buf1, anchor, iterations, stream);
        break;
    case CV_MOP_GRADIENT:
        erode(src, buf2, kernel, buf1, anchor, iterations, stream);
        dilate(src, dst, kernel, buf1, anchor, iterations, stream);
        subtract(dst, buf2, dst, GpuMat(), -1, stream);
        break;
    case CV_MOP_TOPHAT:
        erode(src, dst, kernel, buf1, anchor, iterations, stream);
        dilate(dst, buf2, kernel, buf1, anchor, iterations, stream);
        subtract(src, buf2, dst, GpuMat(), -1, stream);
        break;
    case CV_MOP_BLACKHAT:
        dilate(src, dst, kernel, buf1, anchor, iterations, stream);
        erode(dst, buf2, kernel, buf1, anchor, iterations, stream);
        subtract(buf2, src, dst, GpuMat(), -1, stream);
        break;
    default:
        CV_Error(CV_StsBadArg, "unknown morphological operation");
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// Linear Filter

namespace cv { namespace gpu { namespace device
{
    namespace imgproc
    {
        template <typename T, typename D>
        void filter2D_gpu(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
                          int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
                          int borderMode, const float* borderValue, cudaStream_t stream);
    }
}}}

namespace
{
    typedef NppStatus (*nppFilter2D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
        const Npp32s * pKernel, NppiSize oKernelSize, NppiPoint oAnchor, Npp32s nDivisor);

    struct NPPLinearFilter : public BaseFilter_GPU
    {
        NPPLinearFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter2D_t func_) :
            BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}

        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;
            NppiSize oKernelSize;
            oKernelSize.height = ksize.height;
            oKernelSize.width = ksize.width;
            NppiPoint oAnchor;
            oAnchor.x = anchor.x;
            oAnchor.y = anchor.y;

            cudaStream_t stream = StreamAccessor::getStream(s);

            NppStreamHandler h(stream);

            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
                kernel.ptr<Npp32s>(), oKernelSize, oAnchor, nDivisor) );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        GpuMat kernel;
        Npp32s nDivisor;
        nppFilter2D_t func;
    };

    typedef void (*gpuFilter2D_t)(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
                                   int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
                                   int borderMode, const float* borderValue, cudaStream_t stream);

    struct GpuFilter2D : public BaseFilter_GPU
    {
        GpuFilter2D(Size ksize_, Point anchor_, gpuFilter2D_t func_, const GpuMat& kernel_, int brd_type_) :
            BaseFilter_GPU(ksize_, anchor_), func(func_), kernel(kernel_), brd_type(brd_type_)
        {
        }

        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
        {
            using namespace cv::gpu::device::imgproc;

            Point ofs;
            Size wholeSize;
            src.locateROI(wholeSize, ofs);
            GpuMat srcWhole(wholeSize, src.type(), src.datastart);

            static const Scalar_<float> zero = Scalar_<float>::all(0.0f);
            func(srcWhole, ofs.x, ofs.y, dst, ksize.width, ksize.height, anchor.x, anchor.y, kernel.ptr<float>(), brd_type, zero.val, StreamAccessor::getStream(stream));
        }

        gpuFilter2D_t func;
        GpuMat kernel;
        int brd_type;
    };
}

Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor, int brd_type)
{
    using namespace cv::gpu::device::imgproc;

    int sdepth = CV_MAT_DEPTH(srcType);
    int scn = CV_MAT_CN(srcType);

    CV_Assert(sdepth == CV_8U || sdepth == CV_16U || sdepth == CV_32F);
    CV_Assert(scn == 1 || scn == 4);
    CV_Assert(dstType == srcType);
    CV_Assert(brd_type == BORDER_REFLECT101 || brd_type == BORDER_REPLICATE || brd_type == BORDER_CONSTANT || brd_type == BORDER_REFLECT || brd_type == BORDER_WRAP);

    Size ksize = kernel.size();

#if 0
    if ((srcType == CV_8UC1 || srcType == CV_8UC4) && brd_type == BORDER_CONSTANT)
    {
        static const nppFilter2D_t cppFilter2D_callers[] = {0, nppiFilter_8u_C1R, 0, 0, nppiFilter_8u_C4R};

        GpuMat gpu_krnl;
        int nDivisor;
        normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, true);

        normalizeAnchor(anchor, ksize);

        return Ptr<BaseFilter_GPU>(new NPPLinearFilter(ksize, anchor, gpu_krnl, nDivisor, cppFilter2D_callers[CV_MAT_CN(srcType)]));
    }
#endif

    CV_Assert(ksize.width * ksize.height <= 16 * 16);

    int gpuBorderType;
    CV_Assert( tryConvertToGpuBorderType(brd_type, gpuBorderType) );

    GpuMat gpu_krnl;
    normalizeKernel(kernel, gpu_krnl, CV_32F);

    normalizeAnchor(anchor, ksize);

    gpuFilter2D_t func = 0;

    switch (srcType)
    {
    case CV_8UC1:
        func = filter2D_gpu<uchar, uchar>;
        break;
    case CV_8UC4:
        func = filter2D_gpu<uchar4, uchar4>;
        break;
    case CV_16UC1:
        func = filter2D_gpu<ushort, ushort>;
        break;
    case CV_16UC4:
        func = filter2D_gpu<ushort4, ushort4>;
        break;
    case CV_32FC1:
        func = filter2D_gpu<float, float>;
        break;
    case CV_32FC4:
        func = filter2D_gpu<float4, float4>;
        break;
    }

    return Ptr<BaseFilter_GPU>(new GpuFilter2D(ksize, anchor, func, gpu_krnl, gpuBorderType));
}

Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor, int borderType)
{
    Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, anchor, borderType);

    return createFilter2D_GPU(linearFilter, srcType, dstType);
}

void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor, int borderType, Stream& stream)
{
    if (ddepth < 0)
        ddepth = src.depth();

    int dst_type = CV_MAKE_TYPE(ddepth, src.channels());

    Ptr<FilterEngine_GPU> f = createLinearFilter_GPU(src.type(), dst_type, kernel, anchor, borderType);

    dst.create(src.size(), dst_type);

    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// Separable Linear Filter

namespace cv { namespace gpu { namespace device
{
    namespace row_filter
    {
        template <typename T, typename D>
        void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
    }

    namespace column_filter
    {
        template <typename T, typename D>
        void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
    }
}}}

namespace
{
    typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI,
        const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor);

    typedef void (*gpuFilter1D_t)(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);

    struct NppLinearRowFilter : public BaseRowFilter_GPU
    {
        NppLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) :
            BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}

        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            cudaStream_t stream = StreamAccessor::getStream(s);

            NppStreamHandler h(stream);

            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
                kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        GpuMat kernel;
        Npp32s nDivisor;
        nppFilter1D_t func;
    };

    struct GpuLinearRowFilter : public BaseRowFilter_GPU
    {
        GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) :
            BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}

        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
        {
            DeviceInfo devInfo;
            int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion();
            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));
        }

        Mat kernel;
        gpuFilter1D_t func;
        int brd_type;
    };
}

Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType)
{
    using namespace ::cv::gpu::device::row_filter;

    static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R};

    if ((bufType == srcType) && (srcType == CV_8UC1 || srcType == CV_8UC4))
    {
        CV_Assert(borderType == BORDER_CONSTANT);

        GpuMat gpu_row_krnl;
        int nDivisor;
        normalizeKernel(rowKernel, gpu_row_krnl, CV_32S, &nDivisor, true);

        int ksize = gpu_row_krnl.cols;
        normalizeAnchor(anchor, ksize);

        return Ptr<BaseRowFilter_GPU>(new NppLinearRowFilter(ksize, anchor, gpu_row_krnl, nDivisor,
            nppFilter1D_callers[CV_MAT_CN(srcType)]));
    }

    CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP);
    int gpuBorderType;
    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));

    CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC4 || srcType == CV_16SC3 || srcType == CV_32SC1 || srcType == CV_32FC1);

    CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(srcType) == CV_MAT_CN(bufType));

    Mat temp(rowKernel.size(), CV_32FC1);
    rowKernel.convertTo(temp, CV_32FC1);
    Mat cont_krnl = temp.reshape(1, 1);

    int ksize = cont_krnl.cols;

    CV_Assert(ksize > 0 && ksize <= 32);

    normalizeAnchor(anchor, ksize);

    gpuFilter1D_t func = 0;

    switch (srcType)
    {
    case CV_8UC1:
        func = linearRowFilter_gpu<uchar, float>;
        break;
    case CV_8UC4:
        func = linearRowFilter_gpu<uchar4, float4>;
        break;
    case CV_16SC3:
        func = linearRowFilter_gpu<short3, float3>;
        break;
    case CV_32SC1:
        func = linearRowFilter_gpu<int, float>;
        break;
    case CV_32FC1:
        func = linearRowFilter_gpu<float, float>;
        break;
    }

    return Ptr<BaseRowFilter_GPU>(new GpuLinearRowFilter(ksize, anchor, cont_krnl, func, gpuBorderType));
}

namespace
{
    struct NppLinearColumnFilter : public BaseColumnFilter_GPU
    {
        NppLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) :
            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}

        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            cudaStream_t stream = StreamAccessor::getStream(s);

            NppStreamHandler h(stream);

            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
                kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        GpuMat kernel;
        Npp32s nDivisor;
        nppFilter1D_t func;
    };

    struct GpuLinearColumnFilter : public BaseColumnFilter_GPU
    {
        GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) :
            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}

        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
        {
            DeviceInfo devInfo;
            int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion();
            if (ksize > 16 && cc < 20)
                CV_Error(CV_StsNotImplemented, "column linear filter doesn't implemented for kernel size > 16 for device with compute capabilities less than 2.0");

            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));
        }

        Mat kernel;
        gpuFilter1D_t func;
        int brd_type;
    };
}

Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType)
{
    using namespace ::cv::gpu::device::column_filter;

    static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R};

    if ((bufType == dstType) && (bufType == CV_8UC1 || bufType == CV_8UC4))
    {
        CV_Assert(borderType == BORDER_CONSTANT);

        GpuMat gpu_col_krnl;
        int nDivisor;
        normalizeKernel(columnKernel, gpu_col_krnl, CV_32S, &nDivisor, true);

        int ksize = gpu_col_krnl.cols;
        normalizeAnchor(anchor, ksize);

        return Ptr<BaseColumnFilter_GPU>(new NppLinearColumnFilter(ksize, anchor, gpu_col_krnl, nDivisor,
            nppFilter1D_callers[CV_MAT_CN(bufType)]));
    }

    CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP);
    int gpuBorderType;
    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));

    CV_Assert(dstType == CV_8UC1 || dstType == CV_8UC4 || dstType == CV_16SC3 || dstType == CV_32SC1 || dstType == CV_32FC1);

    CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(dstType) == CV_MAT_CN(bufType));

    Mat temp(columnKernel.size(), CV_32FC1);
    columnKernel.convertTo(temp, CV_32FC1);
    Mat cont_krnl = temp.reshape(1, 1);

    int ksize = cont_krnl.cols;

    CV_Assert(ksize > 0 && ksize <= 32);

    normalizeAnchor(anchor, ksize);

    gpuFilter1D_t func = 0;

    switch (dstType)
    {
    case CV_8UC1:
        func = linearColumnFilter_gpu<float, uchar>;
        break;
    case CV_8UC4:
        func = linearColumnFilter_gpu<float4, uchar4>;
        break;
    case CV_16SC3:
        func = linearColumnFilter_gpu<float3, short3>;
        break;
    case CV_32SC1:
        func = linearColumnFilter_gpu<float, int>;
        break;
    case CV_32FC1:
        func = linearColumnFilter_gpu<float, float>;
        break;
    }

    return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, cont_krnl, func, gpuBorderType));
}

Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel,
    const Point& anchor, int rowBorderType, int columnBorderType)
{
    if (columnBorderType < 0)
        columnBorderType = rowBorderType;

    int cn = CV_MAT_CN(srcType);
    int bdepth = CV_32F;
    int bufType = CV_MAKETYPE(bdepth, cn);

    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, rowBorderType);
    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, columnBorderType);

    return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType);
}

Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, GpuMat& buf,
    const Point& anchor, int rowBorderType, int columnBorderType)
{
    if (columnBorderType < 0)
        columnBorderType = rowBorderType;

    int cn = CV_MAT_CN(srcType);
    int bdepth = CV_32F;
    int bufType = CV_MAKETYPE(bdepth, cn);

    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, rowBorderType);
    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, columnBorderType);

    return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType, buf);
}

void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
                          Point anchor, int rowBorderType, int columnBorderType)
{
    if( ddepth < 0 )
        ddepth = src.depth();

    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));

    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, rowBorderType, columnBorderType);
    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
}

void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf,
                          Point anchor, int rowBorderType, int columnBorderType,
                          Stream& stream)
{
    if( ddepth < 0 )
        ddepth = src.depth();

    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));

    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, buf, anchor, rowBorderType, columnBorderType);
    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// Deriv Filter

Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int rowBorderType, int columnBorderType)
{
    Mat kx, ky;
    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
}

Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf, int rowBorderType, int columnBorderType)
{
    Mat kx, ky;
    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType);
}

void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType)
{
    GpuMat buf;
    Sobel(src, dst, ddepth, dx, dy, buf, ksize, scale, rowBorderType, columnBorderType);
}

void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize, double scale, int rowBorderType, int columnBorderType, Stream& stream)
{
    Mat kx, ky;
    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);

    if (scale != 1)
    {
        // usually the smoothing part is the slowest to compute,
        // so try to scale it instead of the faster differenciating part
        if (dx == 0)
            kx *= scale;
        else
            ky *= scale;
    }

    sepFilter2D(src, dst, ddepth, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType, stream);
}

void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType)
{
    GpuMat buf;
    Scharr(src, dst, ddepth, dx, dy, buf, scale, rowBorderType, columnBorderType);
}

void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale, int rowBorderType, int columnBorderType, Stream& stream)
{
    Mat kx, ky;
    getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F);

    if( scale != 1 )
    {
        // usually the smoothing part is the slowest to compute,
        // so try to scale it instead of the faster differenciating part
        if( dx == 0 )
            kx *= scale;
        else
            ky *= scale;
    }

    sepFilter2D(src, dst, ddepth, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType, stream);
}

void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale, int borderType, Stream& stream)
{
    CV_Assert(ksize == 1 || ksize == 3);

    static const int K[2][9] =
    {
        {0, 1, 0, 1, -4, 1, 0, 1, 0},
        {2, 0, 2, 0, -8, 0, 2, 0, 2}
    };
    Mat kernel(3, 3, CV_32S, (void*)K[ksize == 3]);
    if (scale != 1)
        kernel *= scale;

    filter2D(src, dst, ddepth, kernel, Point(-1,-1), borderType, stream);
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// Gaussian Filter

Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
{
    int depth = CV_MAT_DEPTH(type);

    if (sigma2 <= 0)
        sigma2 = sigma1;

    // automatic detection of kernel size from sigma
    if (ksize.width <= 0 && sigma1 > 0)
        ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
    if (ksize.height <= 0 && sigma2 > 0)
        ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;

    CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1 );

    sigma1 = std::max(sigma1, 0.0);
    sigma2 = std::max(sigma2, 0.0);

    Mat kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) );
    Mat ky;
    if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON )
        ky = kx;
    else
        ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );

    return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
}

Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
{
    int depth = CV_MAT_DEPTH(type);

    if (sigma2 <= 0)
        sigma2 = sigma1;

    // automatic detection of kernel size from sigma
    if (ksize.width <= 0 && sigma1 > 0)
        ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
    if (ksize.height <= 0 && sigma2 > 0)
        ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;

    CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1 );

    sigma1 = std::max(sigma1, 0.0);
    sigma2 = std::max(sigma2, 0.0);

    Mat kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) );
    Mat ky;
    if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON )
        ky = kx;
    else
        ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );

    return createSeparableLinearFilter_GPU(type, type, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType);
}

void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
{
    if (ksize.width == 1 && ksize.height == 1)
    {
        src.copyTo(dst);
        return;
    }

    dst.create(src.size(), src.type());

    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, rowBorderType, columnBorderType);
    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
}

void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2, int rowBorderType, int columnBorderType, Stream& stream)
{
    if (ksize.width == 1 && ksize.height == 1)
    {
        src.copyTo(dst);
        return;
    }

    dst.create(src.size(), src.type());

    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, buf, sigma1, sigma2, rowBorderType, columnBorderType);
    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// Image Rank Filter

namespace
{
    typedef NppStatus (*nppFilterRank_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
        NppiSize oMaskSize, NppiPoint oAnchor);

    struct NPPRankFilter : public BaseFilter_GPU
    {
        NPPRankFilter(const Size& ksize_, const Point& anchor_, nppFilterRank_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {}

        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;
            NppiSize oKernelSize;
            oKernelSize.height = ksize.height;
            oKernelSize.width = ksize.width;
            NppiPoint oAnchor;
            oAnchor.x = anchor.x;
            oAnchor.y = anchor.y;

            cudaStream_t stream = StreamAccessor::getStream(s);

            NppStreamHandler h(stream);

            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, oKernelSize, oAnchor) );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        nppFilterRank_t func;
    };
}

Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor)
{
    static const nppFilterRank_t nppFilterRank_callers[] = {0, nppiFilterMax_8u_C1R, 0, 0, nppiFilterMax_8u_C4R};

    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);

    normalizeAnchor(anchor, ksize);

    return Ptr<BaseFilter_GPU>(new NPPRankFilter(ksize, anchor, nppFilterRank_callers[CV_MAT_CN(srcType)]));
}

Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor)
{
    static const nppFilterRank_t nppFilterRank_callers[] = {0, nppiFilterMin_8u_C1R, 0, 0, nppiFilterMin_8u_C4R};

    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);

    normalizeAnchor(anchor, ksize);

    return Ptr<BaseFilter_GPU>(new NPPRankFilter(ksize, anchor, nppFilterRank_callers[CV_MAT_CN(srcType)]));
}

#endif