added buffered version of pyrDown and pyrUp

added stream support to downsample, upsample, pyrUp and pyrDown

added buffered version of pyrDown and pyrUp
added stream support to downsample, upsample, pyrUp and pyrDown
e746b3e8 · Vladislav Vinogradov · cf42f308 · e746b3e8 · e746b3e8 · e746b3e8
Commit e746b3e8 authored Aug 01, 2011 by Vladislav Vinogradov
5 changed files
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -441,6 +441,191 @@ namespace cv
            explicit Stream(Impl* impl);
        };
+        //////////////////////////////// Filter Engine ////////////////////////////////
+        /*!
+        The Base Class for 1D or Row-wise Filters
+        This is the base class for linear or non-linear filters that process 1D data.
+        In particular, such filters are used for the "horizontal" filtering parts in separable filters.
+        */
+        class CV_EXPORTS BaseRowFilter_GPU
+        {
+        public:
+            BaseRowFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
+            virtual ~BaseRowFilter_GPU() {}
+            virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
+            int ksize, anchor;
+        };
+        /*!
+        The Base Class for Column-wise Filters
+        This is the base class for linear or non-linear filters that process columns of 2D arrays.
+        Such filters are used for the "vertical" filtering parts in separable filters.
+        */
+        class CV_EXPORTS BaseColumnFilter_GPU
+        {
+        public:
+            BaseColumnFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
+            virtual ~BaseColumnFilter_GPU() {}
+            virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
+            int ksize, anchor;
+        };
+        /*!
+        The Base Class for Non-Separable 2D Filters.
+        This is the base class for linear or non-linear 2D filters.
+        */
+        class CV_EXPORTS BaseFilter_GPU
+        {
+        public:
+            BaseFilter_GPU(const Size& ksize_, const Point& anchor_) : ksize(ksize_), anchor(anchor_) {}
+            virtual ~BaseFilter_GPU() {}
+            virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
+            Size ksize;
+            Point anchor;
+        };
+        /*!
+        The Base Class for Filter Engine.
+        The class can be used to apply an arbitrary filtering operation to an image.
+        It contains all the necessary intermediate buffers.
+        */
+        class CV_EXPORTS FilterEngine_GPU
+        {
+        public:
+            virtual ~FilterEngine_GPU() {}
+            virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0;
+        };
+        //! returns the non-separable filter engine with the specified filter
+        CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType);
+        //! returns the separable filter engine with the specified filters
+        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
+            const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType);
+        //! returns horizontal 1D box filter
+        //! supports only CV_8UC1 source type and CV_32FC1 sum type
+        CV_EXPORTS Ptr<BaseRowFilter_GPU> getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor = -1);
+        //! returns vertical 1D box filter
+        //! supports only CV_8UC1 sum type and CV_32FC1 dst type
+        CV_EXPORTS Ptr<BaseColumnFilter_GPU> getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1);
+        //! returns 2D box filter
+        //! supports CV_8UC1 and CV_8UC4 source type, dst type must be the same as source type
+        CV_EXPORTS Ptr<BaseFilter_GPU> getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1, -1));
+        //! returns box filter engine
+        CV_EXPORTS Ptr<FilterEngine_GPU> createBoxFilter_GPU(int srcType, int dstType, const Size& ksize,
+            const Point& anchor = Point(-1,-1));
+        //! returns 2D morphological filter
+        //! only MORPH_ERODE and MORPH_DILATE are supported
+        //! supports CV_8UC1 and CV_8UC4 types
+        //! kernel must have CV_8UC1 type, one rows and cols == ksize.width * ksize.height
+        CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize,
+            Point anchor=Point(-1,-1));
+        //! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
+        CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel,
+            const Point& anchor = Point(-1,-1), int iterations = 1);
+        //! returns 2D filter with the specified kernel
+        //! supports CV_8UC1 and CV_8UC4 types
+        CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize,
+            Point anchor = Point(-1, -1));
+        //! returns the non-separable linear filter engine
+        CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel,
+            const Point& anchor = Point(-1,-1));
+        //! returns the primitive row filter with the specified kernel.
+        //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type.
+        //! there are two version of algorithm: NPP and OpenCV.
+        //! NPP calls when srcType == CV_8UC1 or srcType == CV_8UC4 and bufType == srcType,
+        //! otherwise calls OpenCV version.
+        //! NPP supports only BORDER_CONSTANT border type.
+        //! OpenCV version supports only CV_32F as buffer depth and
+        //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
+        CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel,
+            int anchor = -1, int borderType = BORDER_CONSTANT);
+        //! returns the primitive column filter with the specified kernel.
+        //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type.
+        //! there are two version of algorithm: NPP and OpenCV.
+        //! NPP calls when dstType == CV_8UC1 or dstType == CV_8UC4 and bufType == dstType,
+        //! otherwise calls OpenCV version.
+        //! NPP supports only BORDER_CONSTANT border type.
+        //! OpenCV version supports only CV_32F as buffer depth and
+        //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
+        CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel,
+            int anchor = -1, int borderType = BORDER_CONSTANT);
+        //! returns the separable linear filter engine
+        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
+            const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
+            int columnBorderType = -1);
+        //! returns filter engine for the generalized Sobel operator
+        CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize,
+            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
+        //! returns the Gaussian filter engine
+        CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0,
+            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
+        //! returns maximum filter
+        CV_EXPORTS Ptr<BaseFilter_GPU> getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
+        //! returns minimum filter
+        CV_EXPORTS Ptr<BaseFilter_GPU> getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
+        //! smooths the image using the normalized box filter
+        //! supports CV_8UC1, CV_8UC4 types
+        CV_EXPORTS void boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null());
+        //! a synonym for normalized box filter
+        static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null()) { boxFilter(src, dst, -1, ksize, anchor, stream); }
+        //! erodes the image (applies the local minimum operator)
+        CV_EXPORTS void erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
+        //! dilates the image (applies the local maximum operator)
+        CV_EXPORTS void dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
+        //! applies an advanced morphological operation to the image
+        CV_EXPORTS void morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
+        //! applies non-separable 2D linear filter to the image
+        CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), Stream& stream = Stream::Null());
+        //! applies separable 2D linear filter to the image
+        CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
+            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
+        //! applies generalized Sobel operator to the image
+        CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,
+            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
+        //! applies the vertical or horizontal Scharr operator to the image
+        CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1,
+            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
+        //! smooths the image using Gaussian filter.
+        CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0,
+            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
+        //! applies Laplacian operator to the image
+        //! supports only ksize = 1 and ksize = 3
+        CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, Stream& stream = Stream::Null());
        ////////////////////////////// Arithmetics ///////////////////////////////////
@@ -739,16 +924,54 @@ namespace cv
        CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method);
        //! downsamples image
-        CV_EXPORTS void downsample(const GpuMat& src, GpuMat& dst);
+        CV_EXPORTS void downsample(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
        //! upsamples image
-        CV_EXPORTS void upsample(const GpuMat& src, GpuMat &dst);
+        CV_EXPORTS void upsample(const GpuMat& src, GpuMat &dst, Stream& stream = Stream::Null());
        //! smoothes the source image and downsamples it
-        CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst);
+        CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
+        struct CV_EXPORTS PyrDownBuf;
+        CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, PyrDownBuf& buf, Stream& stream = Stream::Null());
+        struct CV_EXPORTS PyrDownBuf
+        {
+            PyrDownBuf() : image_type(-1) {}
+            PyrDownBuf(Size image_size, int image_type_) : image_type(-1) { create(image_size, image_type_); }
+            void create(Size image_size, int image_type_);
+        private:
+            friend void pyrDown(const GpuMat&, GpuMat&, PyrDownBuf&, Stream& stream);
+            static Mat ker;
+            GpuMat buf;
+            Ptr<FilterEngine_GPU> filter;
+            int image_type;
+        };
        //! upsamples the source image and then smoothes it
-        CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst);
+        CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
+        struct CV_EXPORTS PyrUpBuf;
+        CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, PyrUpBuf& buf, Stream& stream = Stream::Null());
+        struct CV_EXPORTS PyrUpBuf
+        {
+            PyrUpBuf() : image_type(-1) {}
+            PyrUpBuf(Size image_size, int image_type_) : image_type(-1) { create(image_size, image_type_); }
+            void create(Size image_size, int image_type_);
+        private:
+            friend void pyrUp(const GpuMat&, GpuMat&, PyrUpBuf&, Stream& stream);
+            static Mat ker;
+            GpuMat buf;
+            Ptr<FilterEngine_GPU> filter;
+            int image_type;
+        };
        //! performs linear blending of two images
        //! to avoid accuracy errors sum of weigths shouldn't be very close to zero
@@ -835,190 +1058,6 @@ namespace cv
                                       int num_iters=100, float max_dist=8.0, int min_inlier_count=100, 
                                       vector<int>* inliers=NULL);
-        //////////////////////////////// Filter Engine ////////////////////////////////
-        /*!
-        The Base Class for 1D or Row-wise Filters
-        This is the base class for linear or non-linear filters that process 1D data.
-        In particular, such filters are used for the "horizontal" filtering parts in separable filters.
-        */
-        class CV_EXPORTS BaseRowFilter_GPU
-        {
-        public:
-            BaseRowFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
-            virtual ~BaseRowFilter_GPU() {}
-            virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-            int ksize, anchor;
-        };
-        /*!
-        The Base Class for Column-wise Filters
-        This is the base class for linear or non-linear filters that process columns of 2D arrays.
-        Such filters are used for the "vertical" filtering parts in separable filters.
-        */
-        class CV_EXPORTS BaseColumnFilter_GPU
-        {
-        public:
-            BaseColumnFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
-            virtual ~BaseColumnFilter_GPU() {}
-            virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-            int ksize, anchor;
-        };
-        /*!
-        The Base Class for Non-Separable 2D Filters.
-        This is the base class for linear or non-linear 2D filters.
-        */
-        class CV_EXPORTS BaseFilter_GPU
-        {
-        public:
-            BaseFilter_GPU(const Size& ksize_, const Point& anchor_) : ksize(ksize_), anchor(anchor_) {}
-            virtual ~BaseFilter_GPU() {}
-            virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-            Size ksize;
-            Point anchor;
-        };
-        /*!
-        The Base Class for Filter Engine.
-        The class can be used to apply an arbitrary filtering operation to an image.
-        It contains all the necessary intermediate buffers.
-        */
-        class CV_EXPORTS FilterEngine_GPU
-        {
-        public:
-            virtual ~FilterEngine_GPU() {}
-            virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0;
-        };
-        //! returns the non-separable filter engine with the specified filter
-        CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType);
-        //! returns the separable filter engine with the specified filters
-        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-            const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType);
-        //! returns horizontal 1D box filter
-        //! supports only CV_8UC1 source type and CV_32FC1 sum type
-        CV_EXPORTS Ptr<BaseRowFilter_GPU> getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor = -1);
-        //! returns vertical 1D box filter
-        //! supports only CV_8UC1 sum type and CV_32FC1 dst type
-        CV_EXPORTS Ptr<BaseColumnFilter_GPU> getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1);
-        //! returns 2D box filter
-        //! supports CV_8UC1 and CV_8UC4 source type, dst type must be the same as source type
-        CV_EXPORTS Ptr<BaseFilter_GPU> getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1, -1));
-        //! returns box filter engine
-        CV_EXPORTS Ptr<FilterEngine_GPU> createBoxFilter_GPU(int srcType, int dstType, const Size& ksize,
-            const Point& anchor = Point(-1,-1));
-        //! returns 2D morphological filter
-        //! only MORPH_ERODE and MORPH_DILATE are supported
-        //! supports CV_8UC1 and CV_8UC4 types
-        //! kernel must have CV_8UC1 type, one rows and cols == ksize.width * ksize.height
-        CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize,
-            Point anchor=Point(-1,-1));
-        //! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
-        CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel,
-            const Point& anchor = Point(-1,-1), int iterations = 1);
-        //! returns 2D filter with the specified kernel
-        //! supports CV_8UC1 and CV_8UC4 types
-        CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize,
-            Point anchor = Point(-1, -1));
-        //! returns the non-separable linear filter engine
-        CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel,
-            const Point& anchor = Point(-1,-1));
-        //! returns the primitive row filter with the specified kernel.
-        //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type.
-        //! there are two version of algorithm: NPP and OpenCV.
-        //! NPP calls when srcType == CV_8UC1 or srcType == CV_8UC4 and bufType == srcType,
-        //! otherwise calls OpenCV version.
-        //! NPP supports only BORDER_CONSTANT border type.
-        //! OpenCV version supports only CV_32F as buffer depth and
-        //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
-        CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel,
-            int anchor = -1, int borderType = BORDER_CONSTANT);
-        //! returns the primitive column filter with the specified kernel.
-        //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type.
-        //! there are two version of algorithm: NPP and OpenCV.
-        //! NPP calls when dstType == CV_8UC1 or dstType == CV_8UC4 and bufType == dstType,
-        //! otherwise calls OpenCV version.
-        //! NPP supports only BORDER_CONSTANT border type.
-        //! OpenCV version supports only CV_32F as buffer depth and
-        //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
-        CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel,
-            int anchor = -1, int borderType = BORDER_CONSTANT);
-        //! returns the separable linear filter engine
-        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
-            const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
-            int columnBorderType = -1);
-        //! returns filter engine for the generalized Sobel operator
-        CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize,
-            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-        //! returns the Gaussian filter engine
-        CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0,
-            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-        //! returns maximum filter
-        CV_EXPORTS Ptr<BaseFilter_GPU> getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
-        //! returns minimum filter
-        CV_EXPORTS Ptr<BaseFilter_GPU> getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
-        //! smooths the image using the normalized box filter
-        //! supports CV_8UC1, CV_8UC4 types
-        CV_EXPORTS void boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null());
-        //! a synonym for normalized box filter
-        static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null()) { boxFilter(src, dst, -1, ksize, anchor, stream); }
-        //! erodes the image (applies the local minimum operator)
-        CV_EXPORTS void erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
-        //! dilates the image (applies the local maximum operator)
-        CV_EXPORTS void dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
-        //! applies an advanced morphological operation to the image
-        CV_EXPORTS void morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
-        //! applies non-separable 2D linear filter to the image
-        CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), Stream& stream = Stream::Null());
-        //! applies separable 2D linear filter to the image
-        CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
-            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
-        //! applies generalized Sobel operator to the image
-        CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,
-            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
-        //! applies the vertical or horizontal Scharr operator to the image
-        CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1,
-            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
-        //! smooths the image using Gaussian filter.
-        CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0,
-            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
-        //! applies Laplacian operator to the image
-        //! supports only ksize = 1 and ksize = 3
-        CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, Stream& stream = Stream::Null());
        //////////////////////////////// Image Labeling ////////////////////////////////
        //!performs labeling via graph cuts

--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -908,29 +908,31 @@ namespace cv { namespace gpu { namespace imgproc
    template <typename T, int cn>
-    void downsampleCaller(const DevMem2D src, DevMem2D dst)
+    void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream)
    {
        dim3 threads(32, 8);
        dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
-        downsampleKernel<T,cn><<<grid,threads>>>(DevMem2D_<T>(src), DevMem2D_<T>(dst));
+        downsampleKernel<T,cn><<<grid, threads, 0, stream>>>(DevMem2D_<T>(src), DevMem2D_<T>(dst));
        cudaSafeCall(cudaGetLastError());
-        cudaSafeCall(cudaDeviceSynchronize());
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
    }
-    template void downsampleCaller<uchar,1>(const DevMem2D src, DevMem2D dst);
+    template void downsampleCaller<uchar,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void downsampleCaller<uchar,2>(const DevMem2D src, DevMem2D dst);
+    template void downsampleCaller<uchar,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void downsampleCaller<uchar,3>(const DevMem2D src, DevMem2D dst);
+    template void downsampleCaller<uchar,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void downsampleCaller<uchar,4>(const DevMem2D src, DevMem2D dst);
+    template void downsampleCaller<uchar,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void downsampleCaller<short,1>(const DevMem2D src, DevMem2D dst);
+    template void downsampleCaller<short,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void downsampleCaller<short,2>(const DevMem2D src, DevMem2D dst);
+    template void downsampleCaller<short,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void downsampleCaller<short,3>(const DevMem2D src, DevMem2D dst);
+    template void downsampleCaller<short,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void downsampleCaller<short,4>(const DevMem2D src, DevMem2D dst);
+    template void downsampleCaller<short,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void downsampleCaller<float,1>(const DevMem2D src, DevMem2D dst);
+    template void downsampleCaller<float,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void downsampleCaller<float,2>(const DevMem2D src, DevMem2D dst);
+    template void downsampleCaller<float,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void downsampleCaller<float,3>(const DevMem2D src, DevMem2D dst);
+    template void downsampleCaller<float,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void downsampleCaller<float,4>(const DevMem2D src, DevMem2D dst);
+    template void downsampleCaller<float,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    //////////////////////////////////////////////////////////////////////////
@@ -952,29 +954,31 @@ namespace cv { namespace gpu { namespace imgproc
    template <typename T, int cn>
-    void upsampleCaller(const DevMem2D src, DevMem2D dst)
+    void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream)
    {
        dim3 threads(32, 8);
        dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
-        upsampleKernel<T,cn><<<grid,threads>>>(DevMem2D_<T>(src), DevMem2D_<T>(dst));
+        upsampleKernel<T,cn><<<grid, threads, 0, stream>>>(DevMem2D_<T>(src), DevMem2D_<T>(dst));
        cudaSafeCall(cudaGetLastError());
-        cudaSafeCall(cudaDeviceSynchronize());
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
    }
-    template void upsampleCaller<uchar,1>(const DevMem2D src, DevMem2D dst);
+    template void upsampleCaller<uchar,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void upsampleCaller<uchar,2>(const DevMem2D src, DevMem2D dst);
+    template void upsampleCaller<uchar,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void upsampleCaller<uchar,3>(const DevMem2D src, DevMem2D dst);
+    template void upsampleCaller<uchar,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void upsampleCaller<uchar,4>(const DevMem2D src, DevMem2D dst);
+    template void upsampleCaller<uchar,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void upsampleCaller<short,1>(const DevMem2D src, DevMem2D dst);
+    template void upsampleCaller<short,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void upsampleCaller<short,2>(const DevMem2D src, DevMem2D dst);
+    template void upsampleCaller<short,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void upsampleCaller<short,3>(const DevMem2D src, DevMem2D dst);
+    template void upsampleCaller<short,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void upsampleCaller<short,4>(const DevMem2D src, DevMem2D dst);
+    template void upsampleCaller<short,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void upsampleCaller<float,1>(const DevMem2D src, DevMem2D dst);
+    template void upsampleCaller<float,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void upsampleCaller<float,2>(const DevMem2D src, DevMem2D dst);
+    template void upsampleCaller<float,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void upsampleCaller<float,3>(const DevMem2D src, DevMem2D dst);
+    template void upsampleCaller<float,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
-    template void upsampleCaller<float,4>(const DevMem2D src, DevMem2D dst);
+    template void upsampleCaller<float,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
    //////////////////////////////////////////////////////////////////////////

--- a/modules/gpu/src/imgproc_gpu.cpp
+++ b/modules/gpu/src/imgproc_gpu.cpp
@@ -79,10 +79,14 @@ void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int) { throw_nogpu(); }
 void cv::gpu::ConvolveBuf::create(Size, Size) { throw_nogpu(); }
 void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogpu(); }
 void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&) { throw_nogpu(); }
-void cv::gpu::downsample(const GpuMat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::downsample(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::upsample(const GpuMat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::upsample(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::pyrDown(const GpuMat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::pyrDown(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::pyrUp(const GpuMat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::PyrDownBuf::create(Size, int) { throw_nogpu(); }
+void cv::gpu::pyrDown(const GpuMat&, GpuMat&, PyrDownBuf&, Stream&) { throw_nogpu(); }
+void cv::gpu::pyrUp(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::PyrUpBuf::create(Size, int) { throw_nogpu(); }
+void cv::gpu::pyrUp(const GpuMat&, GpuMat&, PyrUpBuf&, Stream&) { throw_nogpu(); }
@@ -1413,15 +1417,15 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
 namespace cv { namespace gpu { namespace imgproc
 {
    template <typename T, int cn>
-    void downsampleCaller(const DevMem2D src, DevMem2D dst);
+    void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
 }}}
-void cv::gpu::downsample(const GpuMat& src, GpuMat& dst)
+void cv::gpu::downsample(const GpuMat& src, GpuMat& dst, Stream& stream)
 {
    CV_Assert(src.depth() < CV_64F && src.channels() <= 4);
-    typedef void (*Caller)(const DevMem2D, DevMem2D);
+    typedef void (*Caller)(const DevMem2D, DevMem2D, cudaStream_t stream);
    static const Caller callers[6][4] =
        {{imgproc::downsampleCaller<uchar,1>, imgproc::downsampleCaller<uchar,2>,
          imgproc::downsampleCaller<uchar,3>, imgproc::downsampleCaller<uchar,4>},
@@ -1437,7 +1441,7 @@ void cv::gpu::downsample(const GpuMat& src, GpuMat& dst)
        CV_Error(CV_StsUnsupportedFormat, "bad number of channels");
    dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
-    caller(src, dst.reshape(1));
+    caller(src, dst.reshape(1), StreamAccessor::getStream(stream));
 }
@@ -1447,15 +1451,15 @@ void cv::gpu::downsample(const GpuMat& src, GpuMat& dst)
 namespace cv { namespace gpu { namespace imgproc
 {
    template <typename T, int cn>
-    void upsampleCaller(const DevMem2D src, DevMem2D dst);
+    void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
 }}}
-void cv::gpu::upsample(const GpuMat& src, GpuMat& dst)
+void cv::gpu::upsample(const GpuMat& src, GpuMat& dst, Stream& stream)
 {
    CV_Assert(src.depth() < CV_64F && src.channels() <= 4);
-    typedef void (*Caller)(const DevMem2D, DevMem2D);
+    typedef void (*Caller)(const DevMem2D, DevMem2D, cudaStream_t stream);
    static const Caller callers[6][5] =
        {{imgproc::upsampleCaller<uchar,1>, imgproc::upsampleCaller<uchar,2>,
          imgproc::upsampleCaller<uchar,3>, imgproc::upsampleCaller<uchar,4>},
@@ -1471,31 +1475,73 @@ void cv::gpu::upsample(const GpuMat& src, GpuMat& dst)
        CV_Error(CV_StsUnsupportedFormat, "bad number of channels");
    dst.create(src.rows*2, src.cols*2, src.type());
-    caller(src, dst.reshape(1));
+    caller(src, dst.reshape(1), StreamAccessor::getStream(stream));
 }
 //////////////////////////////////////////////////////////////////////////////
 // pyrDown
-void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst)
+void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
 {
-    Mat ker = getGaussianKernel(5, 0, std::max(CV_32F, src.depth()));
+    PyrDownBuf buf;
-    GpuMat buf;
+    pyrDown(src, dst, buf, stream);
-    sepFilter2D(src, buf, src.depth(), ker, ker);
+}
-    downsample(buf, dst);
+cv::Mat cv::gpu::PyrDownBuf::ker;
+void cv::gpu::PyrDownBuf::create(Size image_size, int image_type_)
+{
+    if (ker.empty() || image_type_ != image_type)
+        ker = getGaussianKernel(5, 0, std::max(CV_32F, CV_MAT_DEPTH(image_type_)));
+    ensureSizeIsEnough(image_size.height, image_size.width, image_type_, buf);
+    if (filter.empty() || image_type_ != image_type)
+    {
+        image_type = image_type_;
+        filter = createSeparableLinearFilter_GPU(image_type, image_type, ker, ker);
+    }
+}
+void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, PyrDownBuf& buf, Stream& stream)
+{
+    buf.create(src.size(), src.type());
+    buf.filter->apply(src, buf.buf, Rect(0, 0, src.cols, src.rows), stream);
+    downsample(buf.buf, dst, stream);
 }
 //////////////////////////////////////////////////////////////////////////////
 // pyrUp
-void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst)
+void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream)
+{
+    PyrUpBuf buf;
+    pyrUp(src, dst, buf, stream);
+}
+cv::Mat cv::gpu::PyrUpBuf::ker;
+void cv::gpu::PyrUpBuf::create(Size image_size, int image_type_)
+{
+    if (ker.empty() || image_type_ != image_type)
+        ker = getGaussianKernel(5, 0, std::max(CV_32F, CV_MAT_DEPTH(image_type_))) * 2;
+    ensureSizeIsEnough(image_size.height * 2, image_size.width * 2, image_type_, buf);
+    if (filter.empty() || image_type_ != image_type)
+    {
+        image_type = image_type_;
+        filter = createSeparableLinearFilter_GPU(image_type, image_type, ker, ker);
+    }
+}
+void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, PyrUpBuf& buf, Stream& stream)
 {
-    GpuMat buf;
+    buf.create(src.size(), src.type());
-    upsample(src, buf);
+    upsample(src, buf.buf, stream);
-    Mat ker = getGaussianKernel(5, 0, std::max(CV_32F, src.depth())) * 2;
+    buf.filter->apply(buf.buf, dst, Rect(0, 0, buf.buf.cols, buf.buf.rows), stream);
-    sepFilter2D(buf, dst, buf.depth(), ker, ker);
 }
 #endif /* !defined (HAVE_CUDA) */

--- a/modules/gpu/test/test_features2d.cpp
+++ b/modules/gpu/test/test_features2d.cpp
@@ -560,778 +560,3 @@ INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher, testing::Combine(
                        testing::Values(57, 64, 83, 128, 179, 256, 304)));
 #endif // HAVE_CUDA
-//struct CV_GpuBFMTest : CV_GpuTestBase
-//{
-//    void run_gpu_test();
-//       
-//    void generateData(GpuMat& query, GpuMat& train, int dim, int depth);
-//
-//    virtual void test(const GpuMat& query, const GpuMat& train, BruteForceMatcher_GPU_base& matcher) = 0;
-//
-//    static const int queryDescCount = 300; // must be even number because we split train data in some cases in two
-//    static const int countFactor = 4; // do not change it
-//};
-//
-//void CV_GpuBFMTest::run_gpu_test()
-//{
-//    BruteForceMatcher_GPU_base::DistType dists[] = {BruteForceMatcher_GPU_base::L1Dist, BruteForceMatcher_GPU_base::L2Dist, BruteForceMatcher_GPU_base::HammingDist};
-//    const char* dists_str[] = {"L1Dist", "L2Dist", "HammingDist"};
-//    int dists_count = sizeof(dists) / sizeof(dists[0]);
-//
-//    RNG rng = ts->get_rng();
-//
-//    int dims[] = {rng.uniform(30, 60), 64, rng.uniform(70, 110), 128, rng.uniform(130, 250), 256, rng.uniform(260, 350)};
-//    int dims_count = sizeof(dims) / sizeof(dims[0]);
-//
-//    for (int dist = 0; dist < dists_count; ++dist)
-//    {
-//        int depth_end = dists[dist] == BruteForceMatcher_GPU_base::HammingDist ? CV_32S : CV_32F;
-//
-//        for (int depth = CV_8U; depth <= depth_end; ++depth)
-//        {
-//            for (int dim = 0; dim < dims_count; ++dim)
-//            {
-//                PRINT_ARGS("dist=%s depth=%s dim=%d", dists_str[dist], getTypeName(depth), dims[dim]);
-//                
-//                BruteForceMatcher_GPU_base matcher(dists[dist]);
-//
-//                GpuMat query, train;
-//                generateData(query, train, dim, depth);
-//
-//                test(query, train, matcher);
-//            }
-//        }
-//    }
-//}
-//
-//void CV_GpuBFMTest::generateData(GpuMat& queryGPU, GpuMat& trainGPU, int dim, int depth)
-//{
-//    RNG& rng = ts->get_rng();
-//
-//    Mat queryBuf, trainBuf;
-//
-//    // Generate query descriptors randomly.
-//    // Descriptor vector elements are integer values.
-//    queryBuf.create(queryDescCount, dim, CV_32SC1);
-//    rng.fill(queryBuf, RNG::UNIFORM, Scalar::all(0), Scalar(3));
-//    queryBuf.convertTo(queryBuf, CV_32FC1);
-//
-//    // Generate train decriptors as follows:
-//    // copy each query descriptor to train set countFactor times
-//    // and perturb some one element of the copied descriptors in
-//    // in ascending order. General boundaries of the perturbation
-//    // are (0.f, 1.f).
-//    trainBuf.create(queryDescCount * countFactor, dim, CV_32FC1);
-//    float step = 1.f / countFactor;
-//    for (int qIdx = 0; qIdx < queryDescCount; qIdx++)
-//    {
-//        Mat queryDescriptor = queryBuf.row(qIdx);
-//        for (int c = 0; c < countFactor; c++)
-//        {
-//            int tIdx = qIdx * countFactor + c;
-//            Mat trainDescriptor = trainBuf.row(tIdx);
-//            queryDescriptor.copyTo(trainDescriptor);
-//            int elem = rng(dim);
-//            float diff = rng.uniform(step * c, step * (c + 1));
-//            trainDescriptor.at<float>(0, elem) += diff;
-//        }
-//    }
-//
-//    Mat query, train;
-//    queryBuf.convertTo(query, depth);
-//    trainBuf.convertTo(train, depth);
-//
-//    queryGPU.upload(query);
-//    trainGPU.upload(train);
-//}
-//
-//#define GPU_BFM_TEST(test_name) 
-//    struct CV_GpuBFM_ ##test_name ## _Test : CV_GpuBFMTest 
-//    { 
-//        void test(const GpuMat& query, const GpuMat& train, BruteForceMatcher_GPU_base& matcher); 
-//    }; 
-//    TEST(BruteForceMatcher, test_name) { CV_GpuBFM_ ##test_name ## _Test test; test.safe_run(); } 
-//    void CV_GpuBFM_ ##test_name ## _Test::test(const GpuMat& query, const GpuMat& train, BruteForceMatcher_GPU_base& matcher)
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////////////
-//// match
-//
-//GPU_BFM_TEST(match)
-//{
-//    vector<DMatch> matches;
-//
-//    matcher.match(query, train, matches);
-//
-//    CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT);
-//
-//    int badCount = 0;
-//    for (size_t i = 0; i < matches.size(); i++)
-//    {
-//        DMatch match = matches[i];
-//        if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor) || (match.imgIdx != 0))
-//            badCount++;
-//    }
-//
-//    CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT);
-//}
-//
-//GPU_BFM_TEST(match_add)
-//{
-//    vector<DMatch> matches;
-//
-//    // make add() twice to test such case
-//    matcher.add(vector<GpuMat>(1, train.rowRange(0, train.rows/2)));
-//    matcher.add(vector<GpuMat>(1, train.rowRange(train.rows/2, train.rows)));
-//
-//    // prepare masks (make first nearest match illegal)
-//    vector<GpuMat> masks(2);
-//    for (int mi = 0; mi < 2; mi++)
-//    {
-//        masks[mi] = GpuMat(query.rows, train.rows/2, CV_8UC1, Scalar::all(1));
-//        for (int di = 0; di < queryDescCount/2; di++)
-//            masks[mi].col(di * countFactor).setTo(Scalar::all(0));
-//    }
-//
-//    matcher.match(query, matches, masks);
-//
-//    CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT);
-//
-//    int badCount = 0;
-//    for (size_t i = 0; i < matches.size(); i++)
-//    {
-//        DMatch match = matches[i];
-//        int shift = matcher.isMaskSupported() ? 1 : 0;
-//        {
-//            if (i < queryDescCount / 2)
-//            {
-//                if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + shift) || (match.imgIdx != 0))
-//                    badCount++;
-//            }
-//            else
-//            {
-//                if ((match.queryIdx != (int)i) || (match.trainIdx != ((int)i - queryDescCount / 2) * countFactor + shift) || (match.imgIdx != 1))
-//                    badCount++;
-//            }
-//        }
-//    }
-//
-//    CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT);
-//}
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////////////
-//// knnMatch
-//
-//GPU_BFM_TEST(knnMatch)
-//{
-//    const int knn = 3;
-//
-//    vector< vector<DMatch> > matches;
-//
-//    matcher.knnMatch(query, train, matches, knn);
-//
-//    CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT);
-//
-//    int badCount = 0;
-//    for (size_t i = 0; i < matches.size(); i++)
-//    {
-//        if ((int)matches[i].size() != knn)
-//            badCount++;
-//        else
-//        {
-//            int localBadCount = 0;
-//            for (int k = 0; k < knn; k++)
-//            {
-//                DMatch match = matches[i][k];
-//                if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k) || (match.imgIdx != 0))
-//                    localBadCount++;
-//            }
-//            badCount += localBadCount > 0 ? 1 : 0;
-//        }
-//    }
-//
-//    CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT);
-//}
-//
-//GPU_BFM_TEST(knnMatch_add)
-//{
-//    const int knn = 2;
-//    vector<vector<DMatch> > matches;
-//
-//    // make add() twice to test such case
-//    matcher.add(vector<GpuMat>(1,train.rowRange(0, train.rows / 2)));
-//    matcher.add(vector<GpuMat>(1,train.rowRange(train.rows / 2, train.rows)));
-//
-//    // prepare masks (make first nearest match illegal)
-//    vector<GpuMat> masks(2);
-//    for (int mi = 0; mi < 2; mi++ )
-//    {
-//        masks[mi] = GpuMat(query.rows, train.rows / 2, CV_8UC1, Scalar::all(1));
-//        for (int di = 0; di < queryDescCount / 2; di++)
-//            masks[mi].col(di * countFactor).setTo(Scalar::all(0));
-//    }
-//
-//    matcher.knnMatch(query, matches, knn, masks);
-//
-//    CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT);
-//
-//    int badCount = 0;
-//    int shift = matcher.isMaskSupported() ? 1 : 0;
-//    for (size_t i = 0; i < matches.size(); i++)
-//    {
-//        if ((int)matches[i].size() != knn)
-//            badCount++;
-//        else
-//        {
-//            int localBadCount = 0;
-//            for (int k = 0; k < knn; k++)
-//            {
-//                DMatch match = matches[i][k];
-//                {
-//                    if (i < queryDescCount / 2)
-//                    {
-//                        if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k + shift) || (match.imgIdx != 0) )
-//                            localBadCount++;
-//                    }
-//                    else
-//                    {
-//                        if ((match.queryIdx != (int)i) || (match.trainIdx != ((int)i - queryDescCount / 2) * countFactor + k + shift) || (match.imgIdx != 1) )
-//                            localBadCount++;
-//                    }
-//                }
-//            }
-//            badCount += localBadCount > 0 ? 1 : 0;
-//        }
-//    }
-//
-//    CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT);
-//}
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////////////
-//// radiusMatch
-//
-//GPU_BFM_TEST(radiusMatch)
-//{
-//    CHECK_RETURN(support(GLOBAL_ATOMICS), TS::SKIPPED);
-//
-//    const float radius = 1.f / countFactor;
-//
-//    vector< vector<DMatch> > matches;
-//
-//    matcher.radiusMatch(query, train, matches, radius);
-//
-//    CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT);
-//
-//    int badCount = 0;
-//    for (size_t i = 0; i < matches.size(); i++)
-//    {
-//        if ((int)matches[i].size() != 1)
-//            badCount++;
-//        else
-//        {
-//            DMatch match = matches[i][0];
-//            if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i*countFactor) || (match.imgIdx != 0))
-//                badCount++;
-//        }
-//    }
-//
-//    CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT);
-//}
-//
-//GPU_BFM_TEST(radiusMatch_add)
-//{
-//    CHECK_RETURN(support(GLOBAL_ATOMICS), TS::SKIPPED);
-//
-//    int n = 3;
-//    const float radius = 1.f / countFactor * n;
-//    vector< vector<DMatch> > matches;
-//
-//    // make add() twice to test such case
-//    matcher.add(vector<GpuMat>(1,train.rowRange(0, train.rows / 2)));
-//    matcher.add(vector<GpuMat>(1,train.rowRange(train.rows / 2, train.rows)));
-//
-//    // prepare masks (make first nearest match illegal)
-//    vector<GpuMat> masks(2);
-//    for (int mi = 0; mi < 2; mi++)
-//    {
-//        masks[mi] = GpuMat(query.rows, train.rows / 2, CV_8UC1, Scalar::all(1));
-//        for (int di = 0; di < queryDescCount / 2; di++)
-//            masks[mi].col(di * countFactor).setTo(Scalar::all(0));
-//    }
-//
-//    matcher.radiusMatch(query, matches, radius, masks);
-//
-//    CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT);
-//
-//    int badCount = 0;
-//    int shift = matcher.isMaskSupported() ? 1 : 0;
-//    int needMatchCount = matcher.isMaskSupported() ? n-1 : n;
-//    for (size_t i = 0; i < matches.size(); i++)
-//    {
-//        if ((int)matches[i].size() != needMatchCount)
-//            badCount++;
-//        else
-//        {
-//            int localBadCount = 0;
-//            for (int k = 0; k < needMatchCount; k++)
-//            {
-//                DMatch match = matches[i][k];
-//                {
-//                    if (i < queryDescCount / 2)
-//                    {
-//                        if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k + shift) || (match.imgIdx != 0) )
-//                            localBadCount++;
-//                    }
-//                    else
-//                    {
-//                        if ((match.queryIdx != (int)i) || (match.trainIdx != ((int)i - queryDescCount / 2) * countFactor + k + shift) || (match.imgIdx != 1) )
-//                            localBadCount++;
-//                    }
-//                }
-//            }
-//            badCount += localBadCount > 0 ? 1 : 0;
-//        }
-//    }
-//
-//    CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT);
-//}
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-////struct CV_GpuBruteForceMatcherTest : CV_GpuTestBase
-////{
-////    void run_gpu_test();
-////    
-////    void emptyDataTest();
-////    void dataTest(int dim);
-////    
-////    void generateData(GpuMat& query, GpuMat& train, int dim);
-////
-////    void matchTest(const GpuMat& query, const GpuMat& train);
-////    void knnMatchTest(const GpuMat& query, const GpuMat& train);
-////    void radiusMatchTest(const GpuMat& query, const GpuMat& train);
-////
-////    BruteForceMatcher_GPU< L2<float> > dmatcher;
-////
-////    static const int queryDescCount = 300; // must be even number because we split train data in some cases in two
-////    static const int countFactor = 4; // do not change it
-////};
-////
-////void CV_GpuBruteForceMatcherTest::emptyDataTest()
-////{
-////    GpuMat queryDescriptors, trainDescriptors, mask;
-////    vector<GpuMat> trainDescriptorCollection, masks;
-////    vector<DMatch> matches;
-////    vector< vector<DMatch> > vmatches;
-////
-////    try
-////    {
-////        dmatcher.match(queryDescriptors, trainDescriptors, matches, mask);
-////    }
-////    catch(...)
-////    {
-////        PRINTLN("match() on empty descriptors must not generate exception (1)");
-////        ts->set_failed_test_info(TS::FAIL_EXCEPTION);
-////    }
-////
-////    try
-////    {
-////        dmatcher.knnMatch(queryDescriptors, trainDescriptors, vmatches, 2, mask);
-////    }
-////    catch(...)
-////    {
-////        PRINTLN("knnMatch() on empty descriptors must not generate exception (1)");
-////        ts->set_failed_test_info(TS::FAIL_EXCEPTION);
-////    }
-////
-////    try
-////    {
-////        dmatcher.radiusMatch(queryDescriptors, trainDescriptors, vmatches, 10.f, mask);
-////    }
-////    catch(...)
-////    {
-////        PRINTLN("radiusMatch() on empty descriptors must not generate exception (1)");
-////        ts->set_failed_test_info(TS::FAIL_EXCEPTION);
-////    }
-////
-////    try
-////    {
-////        dmatcher.add(trainDescriptorCollection);
-////    }
-////    catch(...)
-////    {
-////        PRINTLN("add() on empty descriptors must not generate exception");
-////        ts->set_failed_test_info(TS::FAIL_EXCEPTION);
-////    }
-////
-////    try
-////    {
-////        dmatcher.match(queryDescriptors, matches, masks);
-////    }
-////    catch(...)
-////    {
-////        PRINTLN("match() on empty descriptors must not generate exception (2)");
-////        ts->set_failed_test_info(TS::FAIL_EXCEPTION);
-////    }
-////
-////    try
-////    {
-////        dmatcher.knnMatch(queryDescriptors, vmatches, 2, masks);
-////    }
-////    catch(...)
-////    {
-////        PRINTLN("knnMatch() on empty descriptors must not generate exception (2)");
-////        ts->set_failed_test_info(TS::FAIL_EXCEPTION);
-////    }
-////
-////    try
-////    {
-////        dmatcher.radiusMatch( queryDescriptors, vmatches, 10.f, masks );
-////    }
-////    catch(...)
-////    {
-////        PRINTLN("radiusMatch() on empty descriptors must not generate exception (2)");
-////        ts->set_failed_test_info(TS::FAIL_EXCEPTION);
-////    }
-////
-////}
-////
-////void CV_GpuBruteForceMatcherTest::generateData(GpuMat& queryGPU, GpuMat& trainGPU, int dim)
-////{
-////    Mat query, train;
-////    RNG& rng = ts->get_rng();
-////
-////    // Generate query descriptors randomly.
-////    // Descriptor vector elements are integer values.
-////    Mat buf(queryDescCount, dim, CV_32SC1);
-////    rng.fill(buf, RNG::UNIFORM, Scalar::all(0), Scalar(3));
-////    buf.convertTo(query, CV_32FC1);
-////
-////    // Generate train decriptors as follows:
-////    // copy each query descriptor to train set countFactor times
-////    // and perturb some one element of the copied descriptors in
-////    // in ascending order. General boundaries of the perturbation
-////    // are (0.f, 1.f).
-////    train.create( query.rows*countFactor, query.cols, CV_32FC1 );
-////    float step = 1.f / countFactor;
-////    for (int qIdx = 0; qIdx < query.rows; qIdx++)
-////    {
-////        Mat queryDescriptor = query.row(qIdx);
-////        for (int c = 0; c < countFactor; c++)
-////        {
-////            int tIdx = qIdx * countFactor + c;
-////            Mat trainDescriptor = train.row(tIdx);
-////            queryDescriptor.copyTo(trainDescriptor);
-////            int elem = rng(dim);
-////            float diff = rng.uniform(step * c, step * (c + 1));
-////            trainDescriptor.at<float>(0, elem) += diff;
-////        }
-////    }
-////
-////    queryGPU.upload(query);
-////    trainGPU.upload(train);
-////}
-////
-////void CV_GpuBruteForceMatcherTest::matchTest(const GpuMat& query, const GpuMat& train)
-////{
-////    dmatcher.clear();
-////
-////    // test const version of match()
-////    {
-////        vector<DMatch> matches;
-////        dmatcher.match(query, train, matches);
-////
-////        CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT);
-////
-////        int badCount = 0;
-////        for (size_t i = 0; i < matches.size(); i++)
-////        {
-////            DMatch match = matches[i];
-////            if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor) || (match.imgIdx != 0))
-////                badCount++;
-////        }
-////
-////        CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT);
-////    }
-////
-////    // test version of match() with add()
-////    {
-////        vector<DMatch> matches;
-////
-////        // make add() twice to test such case
-////        dmatcher.add(vector<GpuMat>(1, train.rowRange(0, train.rows/2)));
-////        dmatcher.add(vector<GpuMat>(1, train.rowRange(train.rows/2, train.rows)));
-////
-////        // prepare masks (make first nearest match illegal)
-////        vector<GpuMat> masks(2);
-////        for (int mi = 0; mi < 2; mi++)
-////        {
-////            masks[mi] = GpuMat(query.rows, train.rows/2, CV_8UC1, Scalar::all(1));
-////            for (int di = 0; di < queryDescCount/2; di++)
-////                masks[mi].col(di * countFactor).setTo(Scalar::all(0));
-////        }
-////
-////        dmatcher.match(query, matches, masks);
-////
-////        CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT);
-////
-////        int badCount = 0;
-////        for (size_t i = 0; i < matches.size(); i++)
-////        {
-////            DMatch match = matches[i];
-////            int shift = dmatcher.isMaskSupported() ? 1 : 0;
-////            {
-////                if (i < queryDescCount / 2)
-////                {
-////                    if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + shift) || (match.imgIdx != 0))
-////                        badCount++;
-////                }
-////                else
-////                {
-////                    if ((match.queryIdx != (int)i) || (match.trainIdx != ((int)i - queryDescCount / 2) * countFactor + shift) || (match.imgIdx != 1))
-////                        badCount++;
-////                }
-////            }
-////        }
-////
-////        CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT);
-////    }
-////}
-////
-////void CV_GpuBruteForceMatcherTest::knnMatchTest(const GpuMat& query, const GpuMat& train)
-////{
-////    dmatcher.clear();
-////
-////    // test const version of knnMatch()
-////    {
-////        const int knn = 3;
-////
-////        vector< vector<DMatch> > matches;
-////        dmatcher.knnMatch(query, train, matches, knn);
-////
-////        CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT);
-////
-////        int badCount = 0;
-////        for (size_t i = 0; i < matches.size(); i++)
-////        {
-////            if ((int)matches[i].size() != knn)
-////                badCount++;
-////            else
-////            {
-////                int localBadCount = 0;
-////                for (int k = 0; k < knn; k++)
-////                {
-////                    DMatch match = matches[i][k];
-////                    if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k) || (match.imgIdx != 0))
-////                        localBadCount++;
-////                }
-////                badCount += localBadCount > 0 ? 1 : 0;
-////            }
-////        }
-////
-////        CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT);
-////    }
-////
-////    // test version of knnMatch() with add()
-////    {
-////        const int knn = 2;
-////        vector<vector<DMatch> > matches;
-////
-////        // make add() twice to test such case
-////        dmatcher.add(vector<GpuMat>(1,train.rowRange(0, train.rows / 2)));
-////        dmatcher.add(vector<GpuMat>(1,train.rowRange(train.rows / 2, train.rows)));
-////
-////        // prepare masks (make first nearest match illegal)
-////        vector<GpuMat> masks(2);
-////        for (int mi = 0; mi < 2; mi++ )
-////        {
-////            masks[mi] = GpuMat(query.rows, train.rows / 2, CV_8UC1, Scalar::all(1));
-////            for (int di = 0; di < queryDescCount / 2; di++)
-////                masks[mi].col(di * countFactor).setTo(Scalar::all(0));
-////        }
-////
-////        dmatcher.knnMatch(query, matches, knn, masks);
-////
-////        CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT);
-////
-////        int badCount = 0;
-////        int shift = dmatcher.isMaskSupported() ? 1 : 0;
-////        for (size_t i = 0; i < matches.size(); i++)
-////        {
-////            if ((int)matches[i].size() != knn)
-////                badCount++;
-////            else
-////            {
-////                int localBadCount = 0;
-////                for (int k = 0; k < knn; k++)
-////                {
-////                    DMatch match = matches[i][k];
-////                    {
-////                        if (i < queryDescCount / 2)
-////                        {
-////                            if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k + shift) || (match.imgIdx != 0) )
-////                                localBadCount++;
-////                        }
-////                        else
-////                        {
-////                            if ((match.queryIdx != (int)i) || (match.trainIdx != ((int)i - queryDescCount / 2) * countFactor + k + shift) || (match.imgIdx != 1) )
-////                                localBadCount++;
-////                        }
-////                    }
-////                }
-////                badCount += localBadCount > 0 ? 1 : 0;
-////            }
-////        }
-////
-////        CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT);
-////    }
-////}
-////
-////void CV_GpuBruteForceMatcherTest::radiusMatchTest(const GpuMat& query, const GpuMat& train)
-////{
-////    CHECK_RETURN(support(GLOBAL_ATOMICS), TS::SKIPPED);
-////
-////    dmatcher.clear();
-////
-////    // test const version of match()
-////    {
-////        const float radius = 1.f / countFactor;
-////
-////        vector< vector<DMatch> > matches;
-////        dmatcher.radiusMatch(query, train, matches, radius);
-////
-////        CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT);
-////
-////        int badCount = 0;
-////        for (size_t i = 0; i < matches.size(); i++)
-////        {
-////            if ((int)matches[i].size() != 1)
-////                badCount++;
-////            else
-////            {
-////                DMatch match = matches[i][0];
-////                if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i*countFactor) || (match.imgIdx != 0))
-////                    badCount++;
-////            }
-////        }
-////
-////        CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT);
-////    }
-////
-////    // test version of match() with add()
-////    {
-////        int n = 3;
-////        const float radius = 1.f / countFactor * n;
-////        vector< vector<DMatch> > matches;
-////
-////        // make add() twice to test such case
-////        dmatcher.add(vector<GpuMat>(1,train.rowRange(0, train.rows / 2)));
-////        dmatcher.add(vector<GpuMat>(1,train.rowRange(train.rows / 2, train.rows)));
-////
-////        // prepare masks (make first nearest match illegal)
-////        vector<GpuMat> masks(2);
-////        for (int mi = 0; mi < 2; mi++)
-////        {
-////            masks[mi] = GpuMat(query.rows, train.rows / 2, CV_8UC1, Scalar::all(1));
-////            for (int di = 0; di < queryDescCount / 2; di++)
-////                masks[mi].col(di * countFactor).setTo(Scalar::all(0));
-////        }
-////
-////        dmatcher.radiusMatch(query, matches, radius, masks);
-////
-////        CHECK((int)matches.size() == queryDescCount, TS::FAIL_INVALID_OUTPUT);
-////
-////        int badCount = 0;
-////        int shift = dmatcher.isMaskSupported() ? 1 : 0;
-////        int needMatchCount = dmatcher.isMaskSupported() ? n-1 : n;
-////        for (size_t i = 0; i < matches.size(); i++)
-////        {
-////            if ((int)matches[i].size() != needMatchCount)
-////                badCount++;
-////            else
-////            {
-////                int localBadCount = 0;
-////                for (int k = 0; k < needMatchCount; k++)
-////                {
-////                    DMatch match = matches[i][k];
-////                    {
-////                        if (i < queryDescCount / 2)
-////                        {
-////                            if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k + shift) || (match.imgIdx != 0) )
-////                                localBadCount++;
-////                        }
-////                        else
-////                        {
-////                            if ((match.queryIdx != (int)i) || (match.trainIdx != ((int)i - queryDescCount / 2) * countFactor + k + shift) || (match.imgIdx != 1) )
-////                                localBadCount++;
-////                        }
-////                    }
-////                }
-////                badCount += localBadCount > 0 ? 1 : 0;
-////            }
-////        }
-////
-////        CHECK(badCount == 0, TS::FAIL_INVALID_OUTPUT);
-////    }
-////}
-////
-////void CV_GpuBruteForceMatcherTest::dataTest(int dim)
-////{
-////    GpuMat query, train;
-////    generateData(query, train, dim);
-////
-////    matchTest(query, train);
-////    knnMatchTest(query, train);
-////    radiusMatchTest(query, train);
-////
-////    dmatcher.clear();
-////}
-////
-////void CV_GpuBruteForceMatcherTest::run_gpu_test()
-////{
-////    emptyDataTest();
-////
-////    dataTest(50);
-////    dataTest(64);
-////    dataTest(100);
-////    dataTest(128);
-////    dataTest(200);
-////    dataTest(256);
-////    dataTest(300);
-////}
-////
-////TEST(BruteForceMatcher, accuracy) { CV_GpuBruteForceMatcherTest test; test.safe_run(); }
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -866,3 +866,51 @@ TEST(GaussianBlur)
        GPU_OFF;
    }
 }
+TEST(pyrDown)
+{
+    gpu::PyrDownBuf buf;
+    for (int size = 4000; size >= 1000; size -= 1000)
+    {
+        SUBTEST << "size " << size;
+        Mat src; gen(src, 1000, 1000, CV_16SC3, 0, 256);
+        Mat dst(Size(src.cols / 2, src.rows / 2), src.type());
+        CPU_ON;
+        pyrDown(src, dst);
+        CPU_OFF;
+        gpu::GpuMat d_src(src);
+        gpu::GpuMat d_dst(Size(src.cols / 2, src.rows / 2), src.type());
+        GPU_ON;
+        gpu::pyrDown(d_src, d_dst, buf);
+        GPU_OFF;
+    }
+}
+TEST(pyrUp)
+{
+    gpu::PyrUpBuf buf;
+    for (int size = 4000; size >= 1000; size -= 1000)
+    {
+        SUBTEST << "size " << size;
+        Mat src; gen(src, 1000, 1000, CV_16SC3, 0, 256);
+        Mat dst(Size(src.cols * 2, src.rows * 2), src.type());
+        CPU_ON;
+        pyrUp(src, dst);
+        CPU_OFF;
+        gpu::GpuMat d_src(src);
+        gpu::GpuMat d_dst(Size(src.cols * 2, src.rows * 2), src.type());
+        GPU_ON;
+        gpu::pyrUp(d_src, d_dst, buf);
+        GPU_OFF;
+    }
+}