Commit df852937 authored by Alexey Spizhevoy

refactoring: moved gpu reduction-based functions into a separate file

parent 1922e50f
@@ -360,66 +360,17 @@ namespace cv
friend struct StreamAccessor;
};
////////////////////////////// Arithmetics ///////////////////////////////////
//! transposes the matrix
//! supports CV_8UC1, CV_8SC1, CV_8UC4, CV_8SC4, CV_16UC2, CV_16SC2, CV_32SC1, CV_32FC1 types
CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst);
//! computes mean value and standard deviation of all or selected array elements
//! supports only CV_8UC1 type
CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev);
//! computes norm of array
//! supports NORM_INF, NORM_L1, NORM_L2
//! supports only CV_8UC1 type
CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2);
//! computes norm of the difference between two arrays
//! supports NORM_INF, NORM_L1, NORM_L2
//! supports only CV_8UC1 type
CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2);
//! reverses the order of the rows, columns or both in a matrix
//! supports CV_8UC1, CV_8UC4 types
CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode);
//! computes sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sum(const GpuMat& src);
//! computes sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);
//! computes squared sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sqrSum(const GpuMat& src);
//! computes squared sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf);
//! finds global minimum and maximum array elements and returns their values
CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());
//! finds global minimum and maximum array elements and returns their values
CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf);
//! finds global minimum and maximum array elements and returns their values with locations
CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,
const GpuMat& mask=GpuMat());
//! finds global minimum and maximum array elements and returns their values with locations
CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf);
//! counts non-zero array elements
CV_EXPORTS int countNonZero(const GpuMat& src);
//! counts non-zero array elements
CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);
//! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
//! destination array will have the same depth as lut and the same number of channels as the source
//! supports CV_8UC1, CV_8UC3 types
@@ -487,25 +438,6 @@ namespace cv
//! async version
CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, const Stream& stream);
//! computes per-element minimum of two arrays (dst = min(src1, src2))
CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst);
//! Async version
CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream);
//! computes per-element minimum of array and scalar (dst = min(src1, src2))
CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst);
//! Async version
CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream);
//! computes per-element maximum of two arrays (dst = max(src1, src2))
CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst);
//! Async version
CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream);
//! computes per-element maximum of array and scalar (dst = max(src1, src2))
CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst);
//! Async version
CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream);
//////////////////////////// Per-element operations ////////////////////////////////////
@@ -576,6 +508,26 @@ namespace cv
//! async version
CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream);
//! computes per-element minimum of two arrays (dst = min(src1, src2))
CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst);
//! Async version
CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream);
//! computes per-element minimum of array and scalar (dst = min(src1, src2))
CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst);
//! Async version
CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream);
//! computes per-element maximum of two arrays (dst = max(src1, src2))
CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst);
//! Async version
CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream);
//! computes per-element maximum of array and scalar (dst = max(src1, src2))
CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst);
//! Async version
CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream);
////////////////////////////// Image processing //////////////////////////////
@@ -663,15 +615,66 @@ namespace cv
//! computes Harris cornerness criteria at each image pixel
CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101);
//! computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria
CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101);
//! computes the proximity map for the raster template and the image where the template is searched for
CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method);
////////////////////////////// Matrix reductions //////////////////////////////
//! computes mean value and standard deviation of all or selected array elements
//! supports only CV_8UC1 type
CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev);
//! computes norm of array
//! supports NORM_INF, NORM_L1, NORM_L2
//! supports only CV_8UC1 type
CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2);
//! computes norm of the difference between two arrays
//! supports NORM_INF, NORM_L1, NORM_L2
//! supports only CV_8UC1 type
CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2);
//! computes sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sum(const GpuMat& src);
//! computes sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);
//! computes squared sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sqrSum(const GpuMat& src);
//! computes squared sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf);
//! finds global minimum and maximum array elements and returns their values
CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());
//! finds global minimum and maximum array elements and returns their values
CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf);
//! finds global minimum and maximum array elements and returns their values with locations
CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,
const GpuMat& mask=GpuMat());
//! finds global minimum and maximum array elements and returns their values with locations
CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf);
//! counts non-zero array elements
CV_EXPORTS int countNonZero(const GpuMat& src);
//! counts non-zero array elements
CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);
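//! A minimal usage sketch (illustrative only, not part of this header): the
//! overloads taking an explicit GpuMat buffer let callers reuse the
//! intermediate reduction buffer across repeated calls, e.g. per video frame,
//! instead of reallocating it inside every call:
//!
//!     GpuMat frame, sumBuf, minMaxBuf;   // buffers are sized on first use
//!     double minVal, maxVal;
//!     Scalar s = sum(frame, sumBuf);
//!     minMax(frame, &minVal, &maxVal, GpuMat(), minMaxBuf);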
//////////////////////////////// Filter Engine ////////////////////////////////
/*!
@@ -49,20 +49,7 @@ using namespace std
#if !defined (HAVE_CUDA)
void cv::gpu::transpose(const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_nogpu(); }
double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; }
double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; }
void cv::gpu::flip(const GpuMat&, GpuMat&, int) { throw_nogpu(); }
Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_nogpu(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
int cv::gpu::countNonZero(const GpuMat&) { throw_nogpu(); return 0; }
int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_nogpu(); return 0; }
void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::exp(const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::log(const GpuMat&, GpuMat&) { throw_nogpu(); }
@@ -78,14 +65,6 @@ void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool)
void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, const Stream&) { throw_nogpu(); }
void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, const Stream&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, double, GpuMat&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, double, GpuMat&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
@@ -118,54 +97,6 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst)
}
}
////////////////////////////////////////////////////////////////////////
// meanStdDev
void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
{
CV_Assert(src.type() == CV_8UC1);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), src.step, sz, mean.val, stddev.val) );
}
////////////////////////////////////////////////////////////////////////
// norm
double cv::gpu::norm(const GpuMat& src1, int normType)
{
return norm(src1, GpuMat(src1.size(), src1.type(), Scalar::all(0.0)), normType);
}
double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
{
CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type());
CV_Assert(src1.type() == CV_8UC1);
CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
typedef NppStatus (*npp_norm_diff_func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
NppiSize oSizeROI, Npp64f* pRetVal);
static const npp_norm_diff_func_t npp_norm_diff_func[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
NppiSize sz;
sz.width = src1.cols;
sz.height = src1.rows;
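// NORM_INF == 1, NORM_L1 == 2 and NORM_L2 == 4 in OpenCV, so shifting the
// norm type right by one maps them to table indices 0, 1 and 2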
int funcIdx = normType >> 1;
double retVal;
nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), src1.step,
src2.ptr<Npp8u>(), src2.step,
sz, &retVal) );
return retVal;
}
////////////////////////////////////////////////////////////////////////
// flip
@@ -193,305 +124,6 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode)
}
}
////////////////////////////////////////////////////////////////////////
// sum
namespace cv { namespace gpu { namespace mathfunc
{
template <typename T>
void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T>
void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T>
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T>
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
namespace sum
{
void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows);
}
}}}
Scalar cv::gpu::sum(const GpuMat& src)
{
GpuMat buf;
return sum(src, buf);
}
Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc;
typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
static const Caller callers[2][7] =
{ { sum_multipass_caller<unsigned char>, sum_multipass_caller<char>,
sum_multipass_caller<unsigned short>, sum_multipass_caller<short>,
sum_multipass_caller<int>, sum_multipass_caller<float>, 0 },
{ sum_caller<unsigned char>, sum_caller<char>,
sum_caller<unsigned short>, sum_caller<short>,
sum_caller<int>, sum_caller<float>, 0 } };
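// Row 0 holds the multi-pass kernels (devices without global atomics, i.e.
// compute capability < 1.1), row 1 the single-pass ones; columns are indexed
// by depth (CV_8U..CV_64F) and a zero entry marks an unsupported depth.
// The analogous tables below follow the same layout.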
Size bufSize;
sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height);
buf.create(bufSize, CV_8U);
Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type");
double result[4];
caller(src, buf, result, src.channels());
return Scalar(result[0], result[1], result[2], result[3]);
}
Scalar cv::gpu::sqrSum(const GpuMat& src)
{
GpuMat buf;
return sqrSum(src, buf);
}
Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc;
typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
static const Caller callers[2][7] =
{ { sqsum_multipass_caller<unsigned char>, sqsum_multipass_caller<char>,
sqsum_multipass_caller<unsigned short>, sqsum_multipass_caller<short>,
sqsum_multipass_caller<int>, sqsum_multipass_caller<float>, 0 },
{ sqsum_caller<unsigned char>, sqsum_caller<char>,
sqsum_caller<unsigned short>, sqsum_caller<short>,
sqsum_caller<int>, sqsum_caller<float>, 0 } };
Size bufSize;
sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height);
buf.create(bufSize, CV_8U);
Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type");
double result[4];
caller(src, buf, result, src.channels());
return Scalar(result[0], result[1], result[2], result[3]);
}
////////////////////////////////////////////////////////////////////////
// minMax
namespace cv { namespace gpu { namespace mathfunc { namespace minmax {
void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
template <typename T>
void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
template <typename T>
void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
template <typename T>
void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
template <typename T>
void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
}}}}
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)
{
GpuMat buf;
minMax(src, minVal, maxVal, mask, buf);
}
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
{
using namespace mathfunc::minmax;
typedef void (*Caller)(const DevMem2D, double*, double*, PtrStep);
typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, PtrStep);
static const Caller callers[2][7] =
{ { min_max_multipass_caller<unsigned char>, min_max_multipass_caller<char>,
min_max_multipass_caller<unsigned short>, min_max_multipass_caller<short>,
min_max_multipass_caller<int>, min_max_multipass_caller<float>, 0 },
{ min_max_caller<unsigned char>, min_max_caller<char>,
min_max_caller<unsigned short>, min_max_caller<short>,
min_max_caller<int>, min_max_caller<float>, min_max_caller<double> } };
static const MaskedCaller masked_callers[2][7] =
{ { min_max_mask_multipass_caller<unsigned char>, min_max_mask_multipass_caller<char>,
min_max_mask_multipass_caller<unsigned short>, min_max_mask_multipass_caller<short>,
min_max_mask_multipass_caller<int>, min_max_mask_multipass_caller<float>, 0 },
{ min_max_mask_caller<unsigned char>, min_max_mask_caller<char>,
min_max_mask_caller<unsigned short>, min_max_mask_caller<short>,
min_max_mask_caller<int>, min_max_mask_caller<float>,
min_max_mask_caller<double> } };
CV_Assert(src.channels() == 1);
CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
double minVal_; if (!minVal) minVal = &minVal_;
double maxVal_; if (!maxVal) maxVal = &maxVal_;
Size bufSize;
get_buf_size_required(src.cols, src.rows, src.elemSize(), bufSize.width, bufSize.height);
buf.create(bufSize, CV_8U);
if (mask.empty())
{
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
caller(src, minVal, maxVal, buf);
}
else
{
MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
caller(src, mask, minVal, maxVal, buf);
}
}
////////////////////////////////////////////////////////////////////////
// minMaxLoc
namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc {
void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows);
template <typename T>
void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
template <typename T>
void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
template <typename T>
void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
template <typename T>
void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
}}}}
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
{
GpuMat valbuf, locbuf;
minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valbuf, locbuf);
}
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf)
{
using namespace mathfunc::minmaxloc;
typedef void (*Caller)(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
static const Caller callers[2][7] =
{ { min_max_loc_multipass_caller<unsigned char>, min_max_loc_multipass_caller<char>,
min_max_loc_multipass_caller<unsigned short>, min_max_loc_multipass_caller<short>,
min_max_loc_multipass_caller<int>, min_max_loc_multipass_caller<float>, 0 },
{ min_max_loc_caller<unsigned char>, min_max_loc_caller<char>,
min_max_loc_caller<unsigned short>, min_max_loc_caller<short>,
min_max_loc_caller<int>, min_max_loc_caller<float>, min_max_loc_caller<double> } };
static const MaskedCaller masked_callers[2][7] =
{ { min_max_loc_mask_multipass_caller<unsigned char>, min_max_loc_mask_multipass_caller<char>,
min_max_loc_mask_multipass_caller<unsigned short>, min_max_loc_mask_multipass_caller<short>,
min_max_loc_mask_multipass_caller<int>, min_max_loc_mask_multipass_caller<float>, 0 },
{ min_max_loc_mask_caller<unsigned char>, min_max_loc_mask_caller<char>,
min_max_loc_mask_caller<unsigned short>, min_max_loc_mask_caller<short>,
min_max_loc_mask_caller<int>, min_max_loc_mask_caller<float>, min_max_loc_mask_caller<double> } };
CV_Assert(src.channels() == 1);
CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
double minVal_; if (!minVal) minVal = &minVal_;
double maxVal_; if (!maxVal) maxVal = &maxVal_;
int minLoc_[2];
int maxLoc_[2];
Size valbuf_size, locbuf_size;
get_buf_size_required(src.cols, src.rows, src.elemSize(), valbuf_size.width,
valbuf_size.height, locbuf_size.width, locbuf_size.height);
valbuf.create(valbuf_size, CV_8U);
locbuf.create(locbuf_size, CV_8U);
if (mask.empty())
{
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
caller(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf);
}
else
{
MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf);
}
if (minLoc) { minLoc->x = minLoc_[0]; minLoc->y = minLoc_[1]; }
if (maxLoc) { maxLoc->x = maxLoc_[0]; maxLoc->y = maxLoc_[1]; }
}
////////////////////////////////////////////////////////////////////////
// Count non zero
namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {
void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);
template <typename T>
int count_non_zero_caller(const DevMem2D src, PtrStep buf);
template <typename T>
int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf);
}}}}
int cv::gpu::countNonZero(const GpuMat& src)
{
GpuMat buf;
return countNonZero(src, buf);
}
int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc::countnonzero;
typedef int (*Caller)(const DevMem2D src, PtrStep buf);
static const Caller callers[2][7] =
{ { count_non_zero_multipass_caller<unsigned char>, count_non_zero_multipass_caller<char>,
count_non_zero_multipass_caller<unsigned short>, count_non_zero_multipass_caller<short>,
count_non_zero_multipass_caller<int>, count_non_zero_multipass_caller<float>, 0},
{ count_non_zero_caller<unsigned char>, count_non_zero_caller<char>,
count_non_zero_caller<unsigned short>, count_non_zero_caller<short>,
count_non_zero_caller<int>, count_non_zero_caller<float>, count_non_zero_caller<double> } };
CV_Assert(src.channels() == 1);
CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
Size buf_size;
get_buf_size_required(src.cols, src.rows, buf_size.width, buf_size.height);
buf.create(buf_size, CV_8U);
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "countNonZero: unsupported type");
return caller(src, buf);
}
////////////////////////////////////////////////////////////////////////
// LUT
@@ -711,144 +343,4 @@ void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat&
}
//////////////////////////////////////////////////////////////////////////////
// min/max
namespace cv { namespace gpu { namespace mathfunc
{
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);
}}}
namespace
{
template <typename T>
void min_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());
mathfunc::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
}
template <typename T>
void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
{
dst.create(src1.size(), src1.type());
mathfunc::min_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);
}
template <typename T>
void max_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());
mathfunc::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
}
template <typename T>
void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
{
dst.create(src1.size(), src1.type());
mathfunc::max_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);
}
}
void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
}
void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}
void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst)
{
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
}
void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
{
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}
void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
}
void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}
void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst)
{
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
}
void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
{
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}
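// A minimal usage sketch (illustrative only): the Stream overloads only
// enqueue the per-element kernels, so several operations can be chained on
// one stream and synchronized once (waitForCompletion() on Stream is assumed
// available here):
//
//     Stream stream;
//     GpuMat a, b, lo, clipped;
//     min(a, b, lo, stream);          // lo = min(a, b), asynchronous
//     max(lo, 0.0, clipped, stream);  // clamp from below on the same stream
//     stream.waitForCompletion();     // block until both kernels finish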
#endif /* !defined (HAVE_CUDA) */
@@ -345,4 +345,127 @@ namespace cv { namespace gpu { namespace mathfunc
template void bitwiseMaskXorCaller<ushort>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
template void bitwiseMaskXorCaller<uint>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
//////////////////////////////////////////////////////////////////////////
// min/max
struct MinOp
{
template <typename T>
__device__ T operator()(T a, T b)
{
return min(a, b);
}
__device__ float operator()(float a, float b)
{
return fmin(a, b);
}
__device__ double operator()(double a, double b)
{
return fmin(a, b);
}
};
struct MaxOp
{
template <typename T>
__device__ T operator()(T a, T b)
{
return max(a, b);
}
__device__ float operator()(float a, float b)
{
return fmax(a, b);
}
__device__ double operator()(double a, double b)
{
return fmax(a, b);
}
};
struct ScalarMinOp
{
double s;
explicit ScalarMinOp(double s_) : s(s_) {}
template <typename T>
__device__ T operator()(T a)
{
return saturate_cast<T>(fmin((double)a, s));
}
};
struct ScalarMaxOp
{
double s;
explicit ScalarMaxOp(double s_) : s(s_) {}
template <typename T>
__device__ T operator()(T a)
{
return saturate_cast<T>(fmax((double)a, s));
}
};
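// The same functor-plus-transform pattern extends to other per-element
// operations: a binary functor pairs with the two-source transform overload,
// a unary functor with the single-source one. A hypothetical absolute
// difference would follow the same shape (sketch only, not part of this
// commit):
//
//     struct AbsDiffOp
//     {
//         template <typename T>
//         __device__ T operator()(T a, T b) { return a < b ? b - a : a - b; }
//     };
//
//     template <typename T>
//     void absdiff_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2,
//                      const DevMem2D_<T>& dst, cudaStream_t stream)
//     {
//         AbsDiffOp op;
//         transform(src1, src2, dst, op, stream);
//     }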
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
MinOp op;
transform(src1, src2, dst, op, stream);
}
template void min_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
template void min_gpu<char >(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void min_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void min_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void min_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void min_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
MaxOp op;
transform(src1, src2, dst, op, stream);
}
template void max_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
template void max_gpu<char >(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void max_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void max_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void max_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void max_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
ScalarMinOp op(src2);
transform(src1, dst, op, stream);
}
template void min_gpu<uchar >(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);
template void min_gpu<char >(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void min_gpu<short >(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void min_gpu<int >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void min_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
ScalarMaxOp op(src2);
transform(src1, dst, op, stream);
}
template void max_gpu<uchar >(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);
template void max_gpu<char >(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void max_gpu<short >(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void max_gpu<int >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void max_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
}}}
@@ -58,49 +58,6 @@ using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace mathfunc
{
template <int size, typename T>
__device__ void sum_in_smem(volatile T* data, const uint tid)
{
T sum = data[tid];
if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }
if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) data[tid] = sum = sum + data[tid + 32];
if (size >= 32) data[tid] = sum = sum + data[tid + 16];
if (size >= 16) data[tid] = sum = sum + data[tid + 8];
if (size >= 8) data[tid] = sum = sum + data[tid + 4];
if (size >= 4) data[tid] = sum = sum + data[tid + 2];
if (size >= 2) data[tid] = sum = sum + data[tid + 1];
}
}
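// sum_in_smem above is a standard shared-memory tree reduction: each step
// halves the number of active threads, so with a 256-thread block the partial
// sums collapse 256 -> 128 -> 64 -> ... -> 1 and the block total ends up in
// data[0]. Once only the first warp remains active (tid < 32) the threads
// execute in lockstep on this generation of hardware, so no further
// __syncthreads() is needed; the volatile qualifier keeps each intermediate
// store visible to the other threads of the warp.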
struct Mask8U
{
explicit Mask8U(PtrStep mask): mask(mask) {}
__device__ bool operator()(int y, int x) const
{
return mask.ptr(y)[x];
}
PtrStep mask;
};
struct MaskTrue
{
__device__ bool operator()(int y, int x) const
{
return true;
}
};
struct Nothing
{
static __device__ void calc(int, int, float, float, float*, size_t, float)
@@ -259,1676 +216,42 @@ namespace cv { namespace gpu { namespace mathfunc
}
//////////////////////////////////////////////////////////////////////////////
// Min max
// To avoid shared memory bank conflicts we convert each value to a value of
// an appropriate type (at least 32 bits)
template <typename T> struct MinMaxTypeTraits {};
template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
template <> struct MinMaxTypeTraits<char> { typedef int best_type; };
template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
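// On these devices shared memory is organized as 16 banks of 32-bit words,
// so staging uchar/char/ushort/short data as int gives each value its own
// bank slot; storing the narrow types directly would pack several elements
// into one word and serialize accesses within a half-warp.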
namespace minmax
{
__constant__ int ctwidth;
__constant__ int ctheight;
// Global counter of blocks that have finished their work
__device__ uint blocks_finished = 0;
// Estimates good thread configuration
// - the threads variable satisfies threads.x * threads.y == 256
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
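// Capping the grid at 32x8 = 256 blocks bounds the partial-result buffer
// and matches the fixed 256-thread second reduction stage; each thread
// instead loops over several input tiles (ctwidth x ctheight elements)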
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
// Returns required buffer sizes
void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
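// One partial result per block: row 0 of the buffer receives the per-block
// minima, row 1 the per-block maxima (see buf.ptr(0) / buf.ptr(1) in the
// callers below)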
bufcols = grid.x * grid.y * elem_size;
bufrows = 2;
}
// Computes and uploads the per-thread tile sizes (ctwidth, ctheight) used by the kernels for the specified thread configuration
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
}
// Does min and max in shared memory
template <typename T>
__device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval)
{
minval[tid] = min(minval[tid], minval[tid + offset]);
maxval[tid] = max(maxval[tid], maxval[tid + offset]);
}
template <int size, typename T>
__device__ void find_min_max_in_smem(volatile T* minval, volatile T* maxval, const uint tid)
{
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); }
if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); }
if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval); } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) merge(tid, 32, minval, maxval);
if (size >= 32) merge(tid, 16, minval, maxval);
if (size >= 16) merge(tid, 8, minval, maxval);
if (size >= 8) merge(tid, 4, minval, maxval);
if (size >= 4) merge(tid, 2, minval, maxval);
if (size >= 2) merge(tid, 1, minval, maxval);
}
}
template <int nthreads, typename T, typename Mask>
__global__ void min_max_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
T mymin = numeric_limits_gpu<T>::max();
T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : numeric_limits_gpu<T>::min();
uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
for (uint y = y0; y < y_end; y += blockDim.y)
{
const T* src_row = (const T*)src.ptr(y);
for (uint x = x0; x < x_end; x += blockDim.x)
{
T val = src_row[x];
if (mask(y, x))
{
mymin = min(mymin, val);
mymax = max(mymax, val);
}
}
}
sminval[tid] = mymin;
smaxval[tid] = mymax;
__syncthreads();
find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
__syncthreads();
if (is_last)
{
uint idx = min(tid, gridDim.x * gridDim.y - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
__syncthreads();
find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
}
#endif
}
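// The __CUDA_ARCH__ >= 110 path above performs the whole reduction in one
// launch: every block publishes its partial result and atomically takes a
// ticket from blocks_finished; the block drawing the last ticket re-reads
// all partial results, reduces them to the final min/max and resets the
// counter for the next call. Devices without global atomics stop after the
// per-block stage and finish in the separate min_max_pass2_kernel below.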
template <typename T>
void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_mask_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<double>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template <typename T>
void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_caller<uchar>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<char>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<ushort>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<short>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<int>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<float>(const DevMem2D, double*,double*, PtrStep);
template void min_max_caller<double>(const DevMem2D, double*, double*, PtrStep);
template <int nthreads, typename T>
__global__ void min_max_pass2_kernel(T* minval, T* maxval, int size)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint idx = min(tid, uint(size) - 1); // clamp to the number of partial results actually written
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
__syncthreads();
find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
}
}
template <typename T>
void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_mask_multipass_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template <typename T>
void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_multipass_caller<uchar>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<char>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<ushort>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<short>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<int>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<float>(const DevMem2D, double*, double*, PtrStep);
} // namespace minmax
///////////////////////////////////////////////////////////////////////////////
// minMaxLoc
namespace minmaxloc {
__constant__ int ctwidth;
__constant__ int ctheight;
// Global counter of blocks that have finished their work
__device__ uint blocks_finished = 0;
// Estimates good thread configuration
// - the threads variable satisfies threads.x * threads.y == 256
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
// Returns required buffer sizes
void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
b1cols = grid.x * grid.y * elem_size; // For values
b1rows = 2;
b2cols = grid.x * grid.y * sizeof(int); // For locations
b2rows = 2;
}
// Computes and uploads the per-thread tile sizes (ctwidth, ctheight) used by the kernels for the specified thread configuration
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
}
template <typename T>
__device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval,
volatile uint* minloc, volatile uint* maxloc)
{
T val = minval[tid + offset];
if (val < minval[tid])
{
minval[tid] = val;
minloc[tid] = minloc[tid + offset];
}
val = maxval[tid + offset];
if (val > maxval[tid])
{
maxval[tid] = val;
maxloc[tid] = maxloc[tid + offset];
}
}
template <int size, typename T>
__device__ void find_min_max_loc_in_smem(volatile T* minval, volatile T* maxval, volatile uint* minloc,
volatile uint* maxloc, const uint tid)
{
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) merge(tid, 32, minval, maxval, minloc, maxloc);
if (size >= 32) merge(tid, 16, minval, maxval, minloc, maxloc);
if (size >= 16) merge(tid, 8, minval, maxval, minloc, maxloc);
if (size >= 8) merge(tid, 4, minval, maxval, minloc, maxloc);
if (size >= 4) merge(tid, 2, minval, maxval, minloc, maxloc);
if (size >= 2) merge(tid, 1, minval, maxval, minloc, maxloc);
}
}
template <int nthreads, typename T, typename Mask>
__global__ void min_max_loc_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval,
uint* minloc, uint* maxloc)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
__shared__ uint sminloc[nthreads];
__shared__ uint smaxloc[nthreads];
uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
T mymin = numeric_limits_gpu<T>::max();
T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : numeric_limits_gpu<T>::min();
uint myminloc = 0;
uint mymaxloc = 0;
uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
for (uint y = y0; y < y_end; y += blockDim.y)
{
const T* ptr = (const T*)src.ptr(y);
for (uint x = x0; x < x_end; x += blockDim.x)
{
if (mask(y, x))
{
T val = ptr[x];
if (val <= mymin) { mymin = val; myminloc = y * src.cols + x; }
if (val >= mymax) { mymax = val; mymaxloc = y * src.cols + x; }
}
}
}
sminval[tid] = mymin;
smaxval[tid] = mymax;
sminloc[tid] = myminloc;
smaxloc[tid] = mymaxloc;
__syncthreads();
find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];
maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
__syncthreads();
if (is_last)
{
uint idx = min(tid, gridDim.x * gridDim.y - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
sminloc[tid] = minloc[idx];
smaxloc[tid] = maxloc[idx];
__syncthreads();
find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
minloc[0] = sminloc[0];
maxloc[0] = smaxloc[0];
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];
maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];
}
#endif
}
template <typename T>
void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_mask_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<double>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template <typename T>
void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_caller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<double>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
// This kernel will be used only when compute capability is 1.0
template <int nthreads, typename T>
__global__ void min_max_loc_pass2_kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
__shared__ uint sminloc[nthreads];
__shared__ uint smaxloc[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint idx = min(tid, uint(size) - 1); // clamp to the number of partial results actually written
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
sminloc[tid] = minloc[idx];
smaxloc[tid] = maxloc[idx];
__syncthreads();
find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
minloc[0] = sminloc[0];
maxloc[0] = smaxloc[0];
}
}
template <typename T>
void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_mask_multipass_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template <typename T>
void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_multipass_caller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
} // namespace minmaxloc
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// countNonZero
namespace countnonzero
{
__constant__ int ctwidth;
__constant__ int ctheight;
__device__ uint blocks_finished = 0;
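// Estimates a good thread configuration
// - the threads variable satisfies threads.x * threads.y == 256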
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
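// Returns required buffer sizes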
void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * sizeof(int);
bufrows = 1;
}
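// Computes the device constants used by the kernels for the specified thread configuration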
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
}
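// Counts non-zero elements: each block reduces its tile to a partial count;
// on CC >= 1.1 the last block to finish also reduces the per-block counts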
template <int nthreads, typename T>
__global__ void count_non_zero_kernel(const DevMem2D src, volatile uint* count)
{
__shared__ uint scount[nthreads];
uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint cnt = 0;
for (uint y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
for (uint x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
cnt += ptr[x0 + x * blockDim.x] != 0;
}
scount[tid] = cnt;
__syncthreads();
sum_in_smem<nthreads, uint>(scount, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
__syncthreads();
if (is_last)
{
scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, uint>(scount, tid);
if (tid == 0)
{
count[0] = scount[0];
blocks_finished = 0;
}
}
#else
if (tid == 0) count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
#endif
}
template <typename T>
int count_non_zero_caller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
uint* count_buf = (uint*)buf.ptr(0);
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
cudaSafeCall(cudaThreadSynchronize());
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
return count;
}
template int count_non_zero_caller<uchar>(const DevMem2D, PtrStep);
template int count_non_zero_caller<char>(const DevMem2D, PtrStep);
template int count_non_zero_caller<ushort>(const DevMem2D, PtrStep);
template int count_non_zero_caller<short>(const DevMem2D, PtrStep);
template int count_non_zero_caller<int>(const DevMem2D, PtrStep);
template int count_non_zero_caller<float>(const DevMem2D, PtrStep);
template int count_non_zero_caller<double>(const DevMem2D, PtrStep);
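// Final reduction of the per-block counts; used on devices without global atomics (CC 1.0)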
template <int nthreads, typename T>
__global__ void count_non_zero_pass2_kernel(uint* count, int size)
{
__shared__ uint scount[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
scount[tid] = tid < size ? count[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, uint>(scount, tid);
if (tid == 0)
count[0] = scount[0];
}
template <typename T>
int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
uint* count_buf = (uint*)buf.ptr(0);
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
count_non_zero_pass2_kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
return count;
}
template int count_non_zero_multipass_caller<uchar>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<char>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<ushort>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<short>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<int>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<float>(const DevMem2D, PtrStep);
} // namespace countnonzero
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// transpose
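// Transposes a 32-bit matrix through a padded shared-memory tile (16 x 17 ints, to avoid bank conflicts)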
__global__ void transpose(const DevMem2Di src, PtrStepi dst)
{
__shared__ int s_mem[16 * 17];
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int smem_idx = threadIdx.y * blockDim.x + threadIdx.x + threadIdx.y;
if (y < src.rows && x < src.cols)
{
s_mem[smem_idx] = src.ptr(y)[x];
}
__syncthreads();
smem_idx = threadIdx.x * blockDim.x + threadIdx.y + threadIdx.x;
x = blockIdx.y * blockDim.x + threadIdx.x;
y = blockIdx.x * blockDim.y + threadIdx.y;
if (y < src.cols && x < src.rows)
{
dst.ptr(y)[x] = s_mem[smem_idx];
}
}
void transpose_gpu(const DevMem2Di& src, const DevMem2Di& dst)
{
dim3 threads(16, 16, 1);
dim3 grid(divUp(src.cols, 16), divUp(src.rows, 16), 1);
transpose<<<grid, threads>>>(src, dst);
cudaSafeCall( cudaThreadSynchronize() );
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// min/max
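// Binary functors for per-element min/max; the float and double overloads use fmin/fmax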
struct MinOp
{
template <typename T>
__device__ T operator()(T a, T b)
{
return min(a, b);
}
__device__ float operator()(float a, float b)
{
return fmin(a, b);
}
__device__ double operator()(double a, double b)
{
return fmin(a, b);
}
};
struct MaxOp
{
template <typename T>
__device__ T operator()(T a, T b)
{
return max(a, b);
}
__device__ float operator()(float a, float b)
{
return fmax(a, b);
}
__device__ double operator()(double a, double b)
{
return fmax(a, b);
}
};
struct ScalarMinOp
{
double s;
explicit ScalarMinOp(double s_) : s(s_) {}
template <typename T>
__device__ T operator()(T a)
{
return saturate_cast<T>(fmin((double)a, s));
}
};
struct ScalarMaxOp
{
double s;
explicit ScalarMaxOp(double s_) : s(s_) {}
template <typename T>
__device__ T operator()(T a)
{
return saturate_cast<T>(fmax((double)a, s));
}
};
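// Per-element minimum of two arrays, expressed through the generic transform() wrapper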
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
MinOp op;
transform(src1, src2, dst, op, stream);
}
template void min_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
template void min_gpu<char >(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void min_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void min_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void min_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void min_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
MaxOp op;
transform(src1, src2, dst, op, stream);
}
template void max_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
template void max_gpu<char >(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void max_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void max_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void max_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void max_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
ScalarMinOp op(src2);
transform(src1, dst, op, stream);
}
template void min_gpu<uchar >(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);
template void min_gpu<char >(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void min_gpu<short >(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void min_gpu<int >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void min_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
ScalarMaxOp op(src2);
transform(src1, dst, op, stream);
}
template void max_gpu<uchar >(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);
template void max_gpu<char >(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void max_gpu<short >(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void max_gpu<int >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void max_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
//////////////////////////////////////////////////////////////////////////////
// Sum
namespace sum
{
template <typename T> struct SumType {};
template <> struct SumType<uchar> { typedef uint R; };
template <> struct SumType<char> { typedef int R; };
template <> struct SumType<ushort> { typedef uint R; };
template <> struct SumType<short> { typedef int R; };
template <> struct SumType<int> { typedef int R; };
template <> struct SumType<float> { typedef float R; };
template <> struct SumType<double> { typedef double R; };
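// Per-element operation applied before accumulation: identity for sum(), squaring for sqrSum()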
template <typename R>
struct IdentityOp { static __device__ R call(R x) { return x; } };
template <typename R>
struct SqrOp { static __device__ R call(R x) { return x * x; } };
__constant__ int ctwidth;
__constant__ int ctheight;
__device__ uint blocks_finished = 0;
const int threads_x = 32;
const int threads_y = 8;
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, threads.x * threads.y),
divUp(rows, threads.y * threads.x));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * sizeof(double) * cn;
bufrows = 1;
}
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
}
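// Sums single-channel data: each block reduces its tile in shared memory;
// on CC >= 1.1 the last block to finish also reduces the per-block partial sums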
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel(const DevMem2D src, R* result)
{
__shared__ R smem[nthreads];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
R sum = 0;
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
sum += Op::call(ptr[x0 + x * blockDim.x]);
}
smem[tid] = sum;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
result[bid] = smem[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
if (tid == 0)
{
result[0] = smem[0];
blocks_finished = 0;
}
}
#else
if (tid == 0) result[bid] = smem[0];
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel(R* result, int size)
{
__shared__ R smem[nthreads];
int tid = threadIdx.y * blockDim.x + threadIdx.x;
smem[tid] = tid < size ? result[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
if (tid == 0)
result[0] = smem[0];
}
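// Two-channel variant: the x and y components are reduced in separate halves of shared memory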
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel_C2(const DevMem2D src, typename TypeVec<R, 2>::vec_t* result)
{
typedef typename TypeVec<T, 2>::vec_t SrcType;
typedef typename TypeVec<R, 2>::vec_t DstType;
__shared__ R smem[nthreads * 2];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
SrcType val;
DstType sum = VecTraits<DstType>::all(0);
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
{
val = ptr[x0 + x * blockDim.x];
sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y));
}
}
smem[tid] = sum.x;
smem[tid + nthreads] = sum.y;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
result[bid] = res;
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
result[0] = res;
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
result[bid] = res;
}
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel_C2(typename TypeVec<R, 2>::vec_t* result, int size)
{
typedef typename TypeVec<R, 2>::vec_t DstType;
__shared__ R smem[nthreads * 2];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
result[0] = res;
}
}
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel_C3(const DevMem2D src, typename TypeVec<R, 3>::vec_t* result)
{
typedef typename TypeVec<T, 3>::vec_t SrcType;
typedef typename TypeVec<R, 3>::vec_t DstType;
__shared__ R smem[nthreads * 3];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
SrcType val;
DstType sum = VecTraits<DstType>::all(0);
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
{
val = ptr[x0 + x * blockDim.x];
sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y), Op::call(val.z));
}
}
smem[tid] = sum.x;
smem[tid + nthreads] = sum.y;
smem[tid + 2 * nthreads] = sum.z;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[bid] = res;
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[0] = res;
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[bid] = res;
}
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel_C3(typename TypeVec<R, 3>::vec_t* result, int size)
{
typedef typename TypeVec<R, 3>::vec_t DstType;
__shared__ R smem[nthreads * 3];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[0] = res;
}
}
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel_C4(const DevMem2D src, typename TypeVec<R, 4>::vec_t* result)
{
typedef typename TypeVec<T, 4>::vec_t SrcType;
typedef typename TypeVec<R, 4>::vec_t DstType;
__shared__ R smem[nthreads * 4];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
SrcType val;
DstType sum = VecTraits<DstType>::all(0);
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
{
val = ptr[x0 + x * blockDim.x];
sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y),
Op::call(val.z), Op::call(val.w));
}
}
smem[tid] = sum.x;
smem[tid + nthreads] = sum.y;
smem[tid + 2 * nthreads] = sum.z;
smem[tid + 3 * nthreads] = sum.w;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[bid] = res;
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
smem[tid + 3 * nthreads] = res.w;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[0] = res;
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[bid] = res;
}
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel_C4(typename TypeVec<R, 4>::vec_t* result, int size)
{
typedef typename TypeVec<R, 4>::vec_t DstType;
__shared__ R smem[nthreads * 4];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
smem[tid + 3 * nthreads] = res.w;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[0] = res;
}
}
} // namespace sum
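// Two-pass sum: the per-block partial sums are reduced by a second kernel launch (works on CC 1.0)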
template <typename T>
void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 2:
sum_kernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 3:
sum_kernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 4:
sum_kernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sum_multipass_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int);
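// Single-pass sum: the final reduction happens inside sum_kernel (requires CC >= 1.1)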
template <typename T>
void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
break;
case 2:
sum_kernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
break;
case 3:
sum_kernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
break;
case 4:
sum_kernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sum_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 2:
sum_kernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 3:
sum_kernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 4:
sum_kernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sqsum_multipass_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
break;
case 2:
sum_kernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
break;
case 3:
sum_kernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
break;
case 4:
sum_kernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sqsum_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<float>(const DevMem2D, PtrStep, double*, int);
}}}
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
#include "transform.hpp"
#include "internal_shared.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace mathfunc
{
// Performs reduction in shared memory
template <int size, typename T>
__device__ void sum_in_smem(volatile T* data, const uint tid)
{
T sum = data[tid];
if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }
if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) data[tid] = sum = sum + data[tid + 32];
if (size >= 32) data[tid] = sum = sum + data[tid + 16];
if (size >= 16) data[tid] = sum = sum + data[tid + 8];
if (size >= 8) data[tid] = sum = sum + data[tid + 4];
if (size >= 4) data[tid] = sum = sum + data[tid + 2];
if (size >= 2) data[tid] = sum = sum + data[tid + 1];
}
}
struct Mask8U
{
explicit Mask8U(PtrStep mask): mask(mask) {}
__device__ bool operator()(int y, int x) const
{
return mask.ptr(y)[x];
}
PtrStep mask;
};
struct MaskTrue
{
__device__ bool operator()(int y, int x) const
{
return true;
}
};
//////////////////////////////////////////////////////////////////////////////
// Min max
// To avoid shared-memory bank conflicts we convert each value to a type
// of appropriate size (32 bits minimum)
template <typename T> struct MinMaxTypeTraits {};
template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
template <> struct MinMaxTypeTraits<char> { typedef int best_type; };
template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
namespace minmax
{
__constant__ int ctwidth;
__constant__ int ctheight;
// Global counter of blocks that have finished their work
__device__ uint blocks_finished = 0;
// Estimates a good thread configuration
// - the threads variable satisfies threads.x * threads.y == 256
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
// Returns required buffer sizes
void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * elem_size;
bufrows = 2;
}
// Computes the device constants used by the kernels for the specified thread configuration
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
}
// Merges the min/max candidates at tid and tid + offset in shared memory
template <typename T>
__device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval)
{
minval[tid] = min(minval[tid], minval[tid + offset]);
maxval[tid] = max(maxval[tid], maxval[tid + offset]);
}
template <int size, typename T>
__device__ void find_min_max_in_smem(volatile T* minval, volatile T* maxval, const uint tid)
{
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); }
if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); }
if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval); } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) merge(tid, 32, minval, maxval);
if (size >= 32) merge(tid, 16, minval, maxval);
if (size >= 16) merge(tid, 8, minval, maxval);
if (size >= 8) merge(tid, 4, minval, maxval);
if (size >= 4) merge(tid, 2, minval, maxval);
if (size >= 2) merge(tid, 1, minval, maxval);
}
}
template <int nthreads, typename T, typename Mask>
__global__ void min_max_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
T mymin = numeric_limits_gpu<T>::max();
T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : numeric_limits_gpu<T>::min();
uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
for (uint y = y0; y < y_end; y += blockDim.y)
{
const T* src_row = (const T*)src.ptr(y);
for (uint x = x0; x < x_end; x += blockDim.x)
{
T val = src_row[x];
if (mask(y, x))
{
mymin = min(mymin, val);
mymax = max(mymax, val);
}
}
}
sminval[tid] = mymin;
smaxval[tid] = mymax;
__syncthreads();
find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
__syncthreads();
if (is_last)
{
uint idx = min(tid, gridDim.x * gridDim.y - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
__syncthreads();
find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
}
#endif
}
template <typename T>
void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_mask_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<double>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template <typename T>
void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_caller<uchar>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<char>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<ushort>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<short>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<int>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<float>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<double>(const DevMem2D, double*, double*, PtrStep);
template <int nthreads, typename T>
__global__ void min_max_pass2_kernel(T* minval, T* maxval, int size)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint idx = min(tid, (uint)size - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
__syncthreads();
find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
}
}
template <typename T>
void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_mask_multipass_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template <typename T>
void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_multipass_caller<uchar>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<char>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<ushort>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<short>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<int>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<float>(const DevMem2D, double*, double*, PtrStep);
} // namespace minmax
///////////////////////////////////////////////////////////////////////////////
// minMaxLoc
namespace minmaxloc
{
__constant__ int ctwidth;
__constant__ int ctheight;
// Global counter of blocks that have finished their work
__device__ uint blocks_finished = 0;
// Estimates a good thread configuration
// - the threads variable satisfies threads.x * threads.y == 256
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
// Returns required buffer sizes
void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
b1cols = grid.x * grid.y * elem_size; // For values
b1rows = 2;
b2cols = grid.x * grid.y * sizeof(int); // For locations
b2rows = 2;
}
// Computes the device constants used by the kernels for the specified thread configuration
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
}
template <typename T>
__device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval,
volatile uint* minloc, volatile uint* maxloc)
{
T val = minval[tid + offset];
if (val < minval[tid])
{
minval[tid] = val;
minloc[tid] = minloc[tid + offset];
}
val = maxval[tid + offset];
if (val > maxval[tid])
{
maxval[tid] = val;
maxloc[tid] = maxloc[tid + offset];
}
}
template <int size, typename T>
__device__ void find_min_max_loc_in_smem(volatile T* minval, volatile T* maxval, volatile uint* minloc,
volatile uint* maxloc, const uint tid)
{
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) merge(tid, 32, minval, maxval, minloc, maxloc);
if (size >= 32) merge(tid, 16, minval, maxval, minloc, maxloc);
if (size >= 16) merge(tid, 8, minval, maxval, minloc, maxloc);
if (size >= 8) merge(tid, 4, minval, maxval, minloc, maxloc);
if (size >= 4) merge(tid, 2, minval, maxval, minloc, maxloc);
if (size >= 2) merge(tid, 1, minval, maxval, minloc, maxloc);
}
}
template <int nthreads, typename T, typename Mask>
__global__ void min_max_loc_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval,
uint* minloc, uint* maxloc)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
__shared__ uint sminloc[nthreads];
__shared__ uint smaxloc[nthreads];
uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
T mymin = numeric_limits_gpu<T>::max();
T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : numeric_limits_gpu<T>::min();
uint myminloc = 0;
uint mymaxloc = 0;
uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
for (uint y = y0; y < y_end; y += blockDim.y)
{
const T* ptr = (const T*)src.ptr(y);
for (uint x = x0; x < x_end; x += blockDim.x)
{
if (mask(y, x))
{
T val = ptr[x];
if (val <= mymin) { mymin = val; myminloc = y * src.cols + x; }
if (val >= mymax) { mymax = val; mymaxloc = y * src.cols + x; }
}
}
}
sminval[tid] = mymin;
smaxval[tid] = mymax;
sminloc[tid] = myminloc;
smaxloc[tid] = mymaxloc;
__syncthreads();
find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];
maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
__syncthreads();
if (is_last)
{
uint idx = min(tid, gridDim.x * gridDim.y - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
sminloc[tid] = minloc[idx];
smaxloc[tid] = maxloc[idx];
__syncthreads();
find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
minloc[0] = sminloc[0];
maxloc[0] = smaxloc[0];
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];
maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];
}
#endif
}
template <typename T>
void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_mask_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<double>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template <typename T>
void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(uint), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(uint), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_caller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<double>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
// This kernel is used only when compute capability is 1.0 (no global atomics
// for the single-pass scheme); it folds the per-block partial results.
template <int nthreads, typename T>
__global__ void min_max_loc_pass2_kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
__shared__ uint sminloc[nthreads];
__shared__ uint smaxloc[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint idx = min(tid, (uint)size - 1); // clamp to the number of partial results; pass 2 runs in a single block
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
sminloc[tid] = minloc[idx];
smaxloc[tid] = maxloc[idx];
__syncthreads();
find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
minloc[0] = sminloc[0];
maxloc[0] = smaxloc[0];
}
}
template <typename T>
void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(uint), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(uint), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_mask_multipass_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template <typename T>
void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(uint), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(uint), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_multipass_caller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
} // namespace minmaxloc
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// countNonZero
namespace countnonzero
{
__constant__ int ctwidth;
__constant__ int ctheight;
__device__ uint blocks_finished = 0;
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * sizeof(int);
bufrows = 1;
}
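// Usage sketch (mirrors the host-side caller later in this commit): the
// intermediate buffer holds one uint per block of the first pass:
//   int bufcols, bufrows;
//   countnonzero::get_buf_size_required(src.cols, src.rows, bufcols, bufrows);
//   buf.create(Size(bufcols, bufrows), CV_8U);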
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
}
template <int nthreads, typename T>
__global__ void count_non_zero_kernel(const DevMem2D src, volatile uint* count)
{
__shared__ uint scount[nthreads];
uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint cnt = 0;
for (uint y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
for (uint x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
cnt += ptr[x0 + x * blockDim.x] != 0;
}
scount[tid] = cnt;
__syncthreads();
sum_in_smem<nthreads, uint>(scount, tid);
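// On devices with global atomics (CC >= 1.1) the last block to finish,
// tracked via the blocks_finished ticket, folds all per-block counts, so a
// single kernel launch produces the final result.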
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
__syncthreads();
if (is_last)
{
scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, uint>(scount, tid);
if (tid == 0)
{
count[0] = scount[0];
blocks_finished = 0;
}
}
#else
if (tid == 0) count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
#endif
}
template <typename T>
int count_non_zero_caller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
uint* count_buf = (uint*)buf.ptr(0);
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
cudaSafeCall(cudaThreadSynchronize());
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(uint), cudaMemcpyDeviceToHost));
return count;
}
template int count_non_zero_caller<uchar>(const DevMem2D, PtrStep);
template int count_non_zero_caller<char>(const DevMem2D, PtrStep);
template int count_non_zero_caller<ushort>(const DevMem2D, PtrStep);
template int count_non_zero_caller<short>(const DevMem2D, PtrStep);
template int count_non_zero_caller<int>(const DevMem2D, PtrStep);
template int count_non_zero_caller<float>(const DevMem2D, PtrStep);
template int count_non_zero_caller<double>(const DevMem2D, PtrStep);
template <int nthreads, typename T>
__global__ void count_non_zero_pass2_kernel(uint* count, int size)
{
__shared__ uint scount[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
scount[tid] = tid < size ? count[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, uint>(scount, tid);
if (tid == 0)
count[0] = scount[0];
}
template <typename T>
int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
uint* count_buf = (uint*)buf.ptr(0);
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
count_non_zero_pass2_kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(uint), cudaMemcpyDeviceToHost));
return count;
}
template int count_non_zero_multipass_caller<uchar>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<char>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<ushort>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<short>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<int>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<float>(const DevMem2D, PtrStep);
} // namespace countnonzero
//////////////////////////////////////////////////////////////////////////
// Sum
namespace sum
{
template <typename T> struct SumType {};
template <> struct SumType<uchar> { typedef uint R; };
template <> struct SumType<char> { typedef int R; };
template <> struct SumType<ushort> { typedef uint R; };
template <> struct SumType<short> { typedef int R; };
template <> struct SumType<int> { typedef int R; };
template <> struct SumType<float> { typedef float R; };
template <> struct SumType<double> { typedef double R; };
template <typename R>
struct IdentityOp { static __device__ R call(R x) { return x; } };
template <typename R>
struct SqrOp { static __device__ R call(R x) { return x * x; } };
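// Illustrative only (not used by the callers below): the reduction kernels
// accept any functor with a static __device__ call(R), so a hypothetical
// absolute-value sum could be plugged in the same way:
template <typename R>
struct AbsOp { static __device__ R call(R x) { return x < 0 ? -x : x; } };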
__constant__ int ctwidth;
__constant__ int ctheight;
__device__ uint blocks_finished = 0;
const int threads_x = 32;
const int threads_y = 8;
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, threads.x * threads.y),
divUp(rows, threads.y * threads.x));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * sizeof(double) * cn;
bufrows = 1;
}
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
}
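// ctwidth and ctheight are the per-thread tile extents: each thread strides
// through ctwidth columns and ctheight rows of the source, so grid * block *
// tile covers the whole image.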
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel(const DevMem2D src, R* result)
{
__shared__ R smem[nthreads];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
R sum = 0;
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
sum += Op::call(ptr[x0 + x * blockDim.x]);
}
smem[tid] = sum;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
result[bid] = smem[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
if (tid == 0)
{
result[0] = smem[0];
blocks_finished = 0;
}
}
#else
if (tid == 0) result[bid] = smem[0];
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel(R* result, int size)
{
__shared__ R smem[nthreads];
int tid = threadIdx.y * blockDim.x + threadIdx.x;
smem[tid] = tid < size ? result[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
if (tid == 0)
result[0] = smem[0];
}
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel_C2(const DevMem2D src, typename TypeVec<R, 2>::vec_t* result)
{
typedef typename TypeVec<T, 2>::vec_t SrcType;
typedef typename TypeVec<R, 2>::vec_t DstType;
__shared__ R smem[nthreads * 2];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
SrcType val;
DstType sum = VecTraits<DstType>::all(0);
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
{
val = ptr[x0 + x * blockDim.x];
sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y));
}
}
smem[tid] = sum.x;
smem[tid + nthreads] = sum.y;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
result[bid] = res;
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
result[0] = res;
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
result[bid] = res;
}
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel_C2(typename TypeVec<R, 2>::vec_t* result, int size)
{
typedef typename TypeVec<R, 2>::vec_t DstType;
__shared__ R smem[nthreads * 2];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
result[0] = res;
}
}
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel_C3(const DevMem2D src, typename TypeVec<R, 3>::vec_t* result)
{
typedef typename TypeVec<T, 3>::vec_t SrcType;
typedef typename TypeVec<R, 3>::vec_t DstType;
__shared__ R smem[nthreads * 3];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
SrcType val;
DstType sum = VecTraits<DstType>::all(0);
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
{
val = ptr[x0 + x * blockDim.x];
sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y), Op::call(val.z));
}
}
smem[tid] = sum.x;
smem[tid + nthreads] = sum.y;
smem[tid + 2 * nthreads] = sum.z;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[bid] = res;
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[0] = res;
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[bid] = res;
}
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel_C3(typename TypeVec<R, 3>::vec_t* result, int size)
{
typedef typename TypeVec<R, 3>::vec_t DstType;
__shared__ R smem[nthreads * 3];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[0] = res;
}
}
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel_C4(const DevMem2D src, typename TypeVec<R, 4>::vec_t* result)
{
typedef typename TypeVec<T, 4>::vec_t SrcType;
typedef typename TypeVec<R, 4>::vec_t DstType;
__shared__ R smem[nthreads * 4];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
SrcType val;
DstType sum = VecTraits<DstType>::all(0);
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
{
val = ptr[x0 + x * blockDim.x];
sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y),
Op::call(val.z), Op::call(val.w));
}
}
smem[tid] = sum.x;
smem[tid + nthreads] = sum.y;
smem[tid + 2 * nthreads] = sum.z;
smem[tid + 3 * nthreads] = sum.w;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[bid] = res;
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
smem[tid + 3 * nthreads] = res.w;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[0] = res;
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[bid] = res;
}
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel_C4(typename TypeVec<R, 4>::vec_t* result, int size)
{
typedef typename TypeVec<R, 4>::vec_t DstType;
__shared__ R smem[nthreads * 4];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
smem[tid + 3 * nthreads] = res.w;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[0] = res;
}
}
} // namespace sum
template <typename T>
void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 2:
sum_kernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 3:
sum_kernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 4:
sum_kernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sum_multipass_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
break;
case 2:
sum_kernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
break;
case 3:
sum_kernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
break;
case 4:
sum_kernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sum_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 2:
sum_kernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 3:
sum_kernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 4:
sum_kernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sqsum_multipass_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
break;
case 2:
sum_kernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
break;
case 3:
sum_kernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
break;
case 4:
sum_kernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sqsum_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<float>(const DevMem2D, PtrStep, double*, int);
}}}
\ No newline at end of file
......@@ -66,10 +66,14 @@ void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&)
void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); }
void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); }
cv::gpu::GpuMat cv::gpu::operator ~ (const GpuMat&) { throw_nogpu(); return GpuMat(); }
cv::gpu::GpuMat cv::gpu::operator | (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); }
cv::gpu::GpuMat cv::gpu::operator & (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); }
cv::gpu::GpuMat cv::gpu::operator ^ (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); }
void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, double, GpuMat&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, double, GpuMat&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); }
#else
......@@ -574,4 +578,144 @@ void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c
::bitwiseXorCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
}
//////////////////////////////////////////////////////////////////////////////
// Minimum and maximum operations
namespace cv { namespace gpu { namespace mathfunc
{
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);
}}}
namespace
{
template <typename T>
void min_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());
mathfunc::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
}
template <typename T>
void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
{
dst.create(src1.size(), src1.type());
mathfunc::min_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);
}
template <typename T>
void max_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());
mathfunc::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
}
template <typename T>
void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
{
dst.create(src1.size(), src1.type());
mathfunc::max_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);
}
}
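// Hypothetical usage sketch (assumes two device images of equal size/type
// already uploaded):
//   GpuMat a, b, c;
//   cv::gpu::min(a, b, c);   // c(y, x) = min(a(y, x), b(y, x))
//   cv::gpu::max(a, 0.5, c); // per-element maximum against a scalar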
void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
}
void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}
void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst)
{
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
}
void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
{
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
min_caller<float>, min_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}
void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
}
void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}
void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst)
{
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, 0);
}
void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
{
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
max_caller<float>, max_caller<double>
};
funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}
#endif
\ No newline at end of file
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA)
void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_nogpu(); }
double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; }
double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; }
Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_nogpu(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
int cv::gpu::countNonZero(const GpuMat&) { throw_nogpu(); return 0; }
int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_nogpu(); return 0; }
#else
////////////////////////////////////////////////////////////////////////
// meanStdDev
void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
{
CV_Assert(src.type() == CV_8UC1);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), src.step, sz, mean.val, stddev.val) );
}
////////////////////////////////////////////////////////////////////////
// norm
double cv::gpu::norm(const GpuMat& src1, int normType)
{
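// The one-array norm is computed as the norm of the difference against an
// implicit zero matrix of the same size and type.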
return norm(src1, GpuMat(src1.size(), src1.type(), Scalar::all(0.0)), normType);
}
double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
{
CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type());
CV_Assert(src1.type() == CV_8UC1);
CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
typedef NppStatus (*npp_norm_diff_func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
NppiSize oSizeROI, Npp64f* pRetVal);
static const npp_norm_diff_func_t npp_norm_diff_func[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
NppiSize sz;
sz.width = src1.cols;
sz.height = src1.rows;
int funcIdx = normType >> 1;
double retVal;
nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), src1.step,
src2.ptr<Npp8u>(), src2.step,
sz, &retVal) );
return retVal;
}
////////////////////////////////////////////////////////////////////////
// Sum
namespace cv { namespace gpu { namespace mathfunc
{
template <typename T>
void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T>
void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T>
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T>
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
namespace sum
{
void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows);
}
}}}
Scalar cv::gpu::sum(const GpuMat& src)
{
GpuMat buf;
return sum(src, buf);
}
Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc;
typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
static const Caller callers[2][7] =
{ { sum_multipass_caller<unsigned char>, sum_multipass_caller<char>,
sum_multipass_caller<unsigned short>, sum_multipass_caller<short>,
sum_multipass_caller<int>, sum_multipass_caller<float>, 0 },
{ sum_caller<unsigned char>, sum_caller<char>,
sum_caller<unsigned short>, sum_caller<short>,
sum_caller<int>, sum_caller<float>, 0 } };
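// Row 0 (devices without global atomics, CC 1.0) selects the two-pass
// reduction; row 1 the single-pass scheme. Neither row has a double entry,
// so CV_64F sources are rejected below.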
Size bufSize;
sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height);
buf.create(bufSize, CV_8U);
Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type");
double result[4];
caller(src, buf, result, src.channels());
return Scalar(result[0], result[1], result[2], result[3]);
}
Scalar cv::gpu::sqrSum(const GpuMat& src)
{
GpuMat buf;
return sqrSum(src, buf);
}
Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc;
typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
static const Caller callers[2][7] =
{ { sqsum_multipass_caller<unsigned char>, sqsum_multipass_caller<char>,
sqsum_multipass_caller<unsigned short>, sqsum_multipass_caller<short>,
sqsum_multipass_caller<int>, sqsum_multipass_caller<float>, 0 },
{ sqsum_caller<unsigned char>, sqsum_caller<char>,
sqsum_caller<unsigned short>, sqsum_caller<short>,
sqsum_caller<int>, sqsum_caller<float>, 0 } };
Size bufSize;
sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height);
buf.create(bufSize, CV_8U);
Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type");
double result[4];
caller(src, buf, result, src.channels());
return Scalar(result[0], result[1], result[2], result[3]);
}
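// Hypothetical follow-up computation (host side, single-channel image with
// N = src.rows * src.cols pixels): the variance could be derived as
//   double n = src.rows * (double)src.cols;
//   double mean = cv::gpu::sum(src)[0] / n;
//   double var  = cv::gpu::sqrSum(src)[0] / n - mean * mean;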
////////////////////////////////////////////////////////////////////////
// Find min or max
namespace cv { namespace gpu { namespace mathfunc { namespace minmax {
void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
template <typename T>
void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
template <typename T>
void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
template <typename T>
void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
template <typename T>
void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
}}}}
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)
{
GpuMat buf;
minMax(src, minVal, maxVal, mask, buf);
}
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
{
using namespace mathfunc::minmax;
typedef void (*Caller)(const DevMem2D, double*, double*, PtrStep);
typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, PtrStep);
static const Caller callers[2][7] =
{ { min_max_multipass_caller<unsigned char>, min_max_multipass_caller<char>,
min_max_multipass_caller<unsigned short>, min_max_multipass_caller<short>,
min_max_multipass_caller<int>, min_max_multipass_caller<float>, 0 },
{ min_max_caller<unsigned char>, min_max_caller<char>,
min_max_caller<unsigned short>, min_max_caller<short>,
min_max_caller<int>, min_max_caller<float>, min_max_caller<double> } };
static const MaskedCaller masked_callers[2][7] =
{ { min_max_mask_multipass_caller<unsigned char>, min_max_mask_multipass_caller<char>,
min_max_mask_multipass_caller<unsigned short>, min_max_mask_multipass_caller<short>,
min_max_mask_multipass_caller<int>, min_max_mask_multipass_caller<float>, 0 },
{ min_max_mask_caller<unsigned char>, min_max_mask_caller<char>,
min_max_mask_caller<unsigned short>, min_max_mask_caller<short>,
min_max_mask_caller<int>, min_max_mask_caller<float>,
min_max_mask_caller<double> } };
CV_Assert(src.channels() == 1);
CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
double minVal_; if (!minVal) minVal = &minVal_;
double maxVal_; if (!maxVal) maxVal = &maxVal_;
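// Null output pointers are redirected to local dummies so callers may ask
// for only one of the two bounds.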
Size bufSize;
get_buf_size_required(src.cols, src.rows, src.elemSize(), bufSize.width, bufSize.height);
buf.create(bufSize, CV_8U);
if (mask.empty())
{
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
caller(src, minVal, maxVal, buf);
}
else
{
MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
caller(src, mask, minVal, maxVal, buf);
}
}
////////////////////////////////////////////////////////////////////////
// Locate min and max
namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc {
void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows);
template <typename T>
void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
template <typename T>
void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
template <typename T>
void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
template <typename T>
void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
}}}}
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
{
GpuMat valbuf, locbuf;
minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valbuf, locbuf);
}
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf)
{
using namespace mathfunc::minmaxloc;
typedef void (*Caller)(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
static const Caller callers[2][7] =
{ { min_max_loc_multipass_caller<unsigned char>, min_max_loc_multipass_caller<char>,
min_max_loc_multipass_caller<unsigned short>, min_max_loc_multipass_caller<short>,
min_max_loc_multipass_caller<int>, min_max_loc_multipass_caller<float>, 0 },
{ min_max_loc_caller<unsigned char>, min_max_loc_caller<char>,
min_max_loc_caller<unsigned short>, min_max_loc_caller<short>,
min_max_loc_caller<int>, min_max_loc_caller<float>, min_max_loc_caller<double> } };
static const MaskedCaller masked_callers[2][7] =
{ { min_max_loc_mask_multipass_caller<unsigned char>, min_max_loc_mask_multipass_caller<char>,
min_max_loc_mask_multipass_caller<unsigned short>, min_max_loc_mask_multipass_caller<short>,
min_max_loc_mask_multipass_caller<int>, min_max_loc_mask_multipass_caller<float>, 0 },
{ min_max_loc_mask_caller<unsigned char>, min_max_loc_mask_caller<char>,
min_max_loc_mask_caller<unsigned short>, min_max_loc_mask_caller<short>,
min_max_loc_mask_caller<int>, min_max_loc_mask_caller<float>, min_max_loc_mask_caller<double> } };
CV_Assert(src.channels() == 1);
CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
double minVal_; if (!minVal) minVal = &minVal_;
double maxVal_; if (!maxVal) maxVal = &maxVal_;
int minLoc_[2];
int maxLoc_[2];
Size valbuf_size, locbuf_size;
get_buf_size_required(src.cols, src.rows, src.elemSize(), valbuf_size.width,
valbuf_size.height, locbuf_size.width, locbuf_size.height);
valbuf.create(valbuf_size, CV_8U);
locbuf.create(locbuf_size, CV_8U);
if (mask.empty())
{
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
caller(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf);
}
else
{
MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf);
}
if (minLoc) { minLoc->x = minLoc_[0]; minLoc->y = minLoc_[1]; }
if (maxLoc) { maxLoc->x = maxLoc_[0]; maxLoc->y = maxLoc_[1]; }
}
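// Hypothetical usage sketch (single-channel image assumed):
//   double mn, mx; Point pmin, pmax;
//   cv::gpu::minMaxLoc(img, &mn, &mx, &pmin, &pmax);
//   // pmin and pmax now hold the (x, y) positions of the extrema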
//////////////////////////////////////////////////////////////////////////////
// Count non-zero elements
namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {
void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);
template <typename T>
int count_non_zero_caller(const DevMem2D src, PtrStep buf);
template <typename T>
int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf);
}}}}
int cv::gpu::countNonZero(const GpuMat& src)
{
GpuMat buf;
return countNonZero(src, buf);
}
int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc::countnonzero;
typedef int (*Caller)(const DevMem2D src, PtrStep buf);
static const Caller callers[2][7] =
{ { count_non_zero_multipass_caller<unsigned char>, count_non_zero_multipass_caller<char>,
count_non_zero_multipass_caller<unsigned short>, count_non_zero_multipass_caller<short>,
count_non_zero_multipass_caller<int>, count_non_zero_multipass_caller<float>, 0},
{ count_non_zero_caller<unsigned char>, count_non_zero_caller<char>,
count_non_zero_caller<unsigned short>, count_non_zero_caller<short>,
count_non_zero_caller<int>, count_non_zero_caller<float>, count_non_zero_caller<double> } };
CV_Assert(src.channels() == 1);
CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
Size buf_size;
get_buf_size_required(src.cols, src.rows, buf_size.width, buf_size.height);
buf.create(buf_size, CV_8U);
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
if (!caller) CV_Error(CV_StsBadArg, "countNonZero: unsupported type");
return caller(src, buf);
}
#endif