Commit 1ef211b8 authored by Vladislav Vinogradov

used new device layer for cv::gpu::reduce

parent 31a78143
@@ -373,7 +373,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce,
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
TEST_CYCLE() cv::cuda::reduce(d_src, dst, dim, reduceOp);
TEST_CYCLE() cv::cuda::reduce(d_src, dst, dim, reduceOp, CV_32F);
CUDA_SANITY_CHECK(dst);
}
@@ -381,7 +381,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce,
{
cv::Mat dst;
TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp);
TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp, CV_32F);
CPU_SANITY_CHECK(dst);
}
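The perf test now passes an explicit CV_32F output depth to both the CUDA and the CPU path so the two accumulate in the same type. A minimal sketch of the same call pattern outside the perf harness, assuming opencv_cudaarithm is linked and a CUDA device is present (matrix size and tolerance are arbitrary):

// Sketch only: reduce an 8-bit matrix along dim 0 on the GPU and on the CPU
// with an explicit CV_32F output depth, then compare the results.
#include <opencv2/core.hpp>
#include <opencv2/cudaarithm.hpp>

int main()
{
    cv::Mat src(128, 256, CV_8UC1);
    cv::randu(src, 0, 255);

    cv::cuda::GpuMat d_src(src), d_dst;
    cv::cuda::reduce(d_src, d_dst, 0 /*dim: one value per column*/, cv::REDUCE_SUM, CV_32F);

    cv::Mat dst_gold;
    cv::reduce(src, dst_gold, 0, cv::REDUCE_SUM, CV_32F);

    cv::Mat dst;
    d_dst.download(dst);
    return cv::norm(dst, dst_gold, cv::NORM_INF) < 1e-3 ? 0 : 1;
}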
......
This diff is collapsed.
@@ -186,188 +186,6 @@ double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normT
return retVal;
}
//////////////////////////////////////////////////////////////////////////////
// reduce
namespace reduce
{
template <typename T, typename S, typename D>
void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template <typename T, typename S, typename D>
void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
}
void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, int dtype, Stream& stream)
{
GpuMat src = _src.getGpuMat();
CV_Assert( src.channels() <= 4 );
CV_Assert( dim == 0 || dim == 1 );
CV_Assert( reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_MAX || reduceOp == REDUCE_MIN );
if (dtype < 0)
dtype = src.depth();
_dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
GpuMat dst = _dst.getGpuMat();
if (dim == 0)
{
typedef void (*func_t)(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
static const func_t funcs[7][7] =
{
{
::reduce::rows<unsigned char, int, unsigned char>,
0/*::reduce::rows<unsigned char, int, signed char>*/,
0/*::reduce::rows<unsigned char, int, unsigned short>*/,
0/*::reduce::rows<unsigned char, int, short>*/,
::reduce::rows<unsigned char, int, int>,
::reduce::rows<unsigned char, float, float>,
::reduce::rows<unsigned char, double, double>
},
{
0/*::reduce::rows<signed char, int, unsigned char>*/,
0/*::reduce::rows<signed char, int, signed char>*/,
0/*::reduce::rows<signed char, int, unsigned short>*/,
0/*::reduce::rows<signed char, int, short>*/,
0/*::reduce::rows<signed char, int, int>*/,
0/*::reduce::rows<signed char, float, float>*/,
0/*::reduce::rows<signed char, double, double>*/
},
{
0/*::reduce::rows<unsigned short, int, unsigned char>*/,
0/*::reduce::rows<unsigned short, int, signed char>*/,
::reduce::rows<unsigned short, int, unsigned short>,
0/*::reduce::rows<unsigned short, int, short>*/,
::reduce::rows<unsigned short, int, int>,
::reduce::rows<unsigned short, float, float>,
::reduce::rows<unsigned short, double, double>
},
{
0/*::reduce::rows<short, int, unsigned char>*/,
0/*::reduce::rows<short, int, signed char>*/,
0/*::reduce::rows<short, int, unsigned short>*/,
::reduce::rows<short, int, short>,
::reduce::rows<short, int, int>,
::reduce::rows<short, float, float>,
::reduce::rows<short, double, double>
},
{
0/*::reduce::rows<int, int, unsigned char>*/,
0/*::reduce::rows<int, int, signed char>*/,
0/*::reduce::rows<int, int, unsigned short>*/,
0/*::reduce::rows<int, int, short>*/,
::reduce::rows<int, int, int>,
::reduce::rows<int, float, float>,
::reduce::rows<int, double, double>
},
{
0/*::reduce::rows<float, float, unsigned char>*/,
0/*::reduce::rows<float, float, signed char>*/,
0/*::reduce::rows<float, float, unsigned short>*/,
0/*::reduce::rows<float, float, short>*/,
0/*::reduce::rows<float, float, int>*/,
::reduce::rows<float, float, float>,
::reduce::rows<float, double, double>
},
{
0/*::reduce::rows<double, double, unsigned char>*/,
0/*::reduce::rows<double, double, signed char>*/,
0/*::reduce::rows<double, double, unsigned short>*/,
0/*::reduce::rows<double, double, short>*/,
0/*::reduce::rows<double, double, int>*/,
0/*::reduce::rows<double, double, float>*/,
::reduce::rows<double, double, double>
}
};
const func_t func = funcs[src.depth()][dst.depth()];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
func(src.reshape(1), dst.data, reduceOp, StreamAccessor::getStream(stream));
}
else
{
typedef void (*func_t)(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
static const func_t funcs[7][7] =
{
{
::reduce::cols<unsigned char, int, unsigned char>,
0/*::reduce::cols<unsigned char, int, signed char>*/,
0/*::reduce::cols<unsigned char, int, unsigned short>*/,
0/*::reduce::cols<unsigned char, int, short>*/,
::reduce::cols<unsigned char, int, int>,
::reduce::cols<unsigned char, float, float>,
::reduce::cols<unsigned char, double, double>
},
{
0/*::reduce::cols<signed char, int, unsigned char>*/,
0/*::reduce::cols<signed char, int, signed char>*/,
0/*::reduce::cols<signed char, int, unsigned short>*/,
0/*::reduce::cols<signed char, int, short>*/,
0/*::reduce::cols<signed char, int, int>*/,
0/*::reduce::cols<signed char, float, float>*/,
0/*::reduce::cols<signed char, double, double>*/
},
{
0/*::reduce::cols<unsigned short, int, unsigned char>*/,
0/*::reduce::cols<unsigned short, int, signed char>*/,
::reduce::cols<unsigned short, int, unsigned short>,
0/*::reduce::cols<unsigned short, int, short>*/,
::reduce::cols<unsigned short, int, int>,
::reduce::cols<unsigned short, float, float>,
::reduce::cols<unsigned short, double, double>
},
{
0/*::reduce::cols<short, int, unsigned char>*/,
0/*::reduce::cols<short, int, signed char>*/,
0/*::reduce::cols<short, int, unsigned short>*/,
::reduce::cols<short, int, short>,
::reduce::cols<short, int, int>,
::reduce::cols<short, float, float>,
::reduce::cols<short, double, double>
},
{
0/*::reduce::cols<int, int, unsigned char>*/,
0/*::reduce::cols<int, int, signed char>*/,
0/*::reduce::cols<int, int, unsigned short>*/,
0/*::reduce::cols<int, int, short>*/,
::reduce::cols<int, int, int>,
::reduce::cols<int, float, float>,
::reduce::cols<int, double, double>
},
{
0/*::reduce::cols<float, float, unsigned char>*/,
0/*::reduce::cols<float, float, signed char>*/,
0/*::reduce::cols<float, float, unsigned short>*/,
0/*::reduce::cols<float, float, short>*/,
0/*::reduce::cols<float, float, int>*/,
::reduce::cols<float, float, float>,
::reduce::cols<float, double, double>
},
{
0/*::reduce::cols<double, double, unsigned char>*/,
0/*::reduce::cols<double, double, signed char>*/,
0/*::reduce::cols<double, double, unsigned short>*/,
0/*::reduce::cols<double, double, short>*/,
0/*::reduce::cols<double, double, int>*/,
0/*::reduce::cols<double, double, float>*/,
::reduce::cols<double, double, double>
}
};
const func_t func = funcs[src.depth()][dst.depth()];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
func(src, dst.data, src.channels(), reduceOp, StreamAccessor::getStream(stream));
}
}
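The implementation removed above dispatched on the (source depth, destination depth) pair through 7x7 tables of template instantiations, with null entries for unsupported combinations; per the commit message, the rewritten version (in the collapsed diff above) uses the new cudev device layer instead. A self-contained sketch of the dispatch-table pattern itself, with hypothetical names (reduce_func_t, rowSum) and the table trimmed to three depths:

// Sketch of the removed dispatch pattern: a table indexed by
// [src depth][dst depth] whose empty slots mark unsupported combinations.
// Names are stand-ins, not OpenCV API.
#include <cstdio>

typedef void (*reduce_func_t)(const void* src, void* dst, int len);

template <typename T, typename S, typename D>
void rowSum(const void* src, void* dst, int len)
{
    const T* s = static_cast<const T*>(src);
    S acc = S();
    for (int i = 0; i < len; ++i)
        acc += static_cast<S>(s[i]);          // accumulate in the wider work type S
    *static_cast<D*>(dst) = static_cast<D>(acc);
}

int main()
{
    // depths kept for brevity: 0 = CV_8U, 1 = CV_32S, 2 = CV_32F
    static const reduce_func_t table[3][3] =
    {
        { rowSum<unsigned char, int, unsigned char>, rowSum<unsigned char, int, int>, rowSum<unsigned char, float, float> },
        { 0,                                         rowSum<int, int, int>,           rowSum<int, float, float>           },
        { 0,                                         0,                               rowSum<float, float, float>         }
    };

    unsigned char src[4] = { 1, 2, 3, 250 };
    float dst = 0.f;
    const reduce_func_t func = table[0][2];   // src depth CV_8U, dst depth CV_32F
    if (!func) { std::puts("unsupported combination"); return 1; }
    func(src, &dst, 4);
    std::printf("sum = %g\n", dst);           // prints 256
    return 0;
}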
////////////////////////////////////////////////////////////////////////
// meanStdDev
......
......@@ -54,12 +54,52 @@ namespace cv { namespace cudev {
namespace grid_reduce_to_vec_detail
{
template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor, int cn> struct Reduce;
template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 1>
{
__device__ __forceinline__ static void call(work_elem_type smem[1][BLOCK_SIZE], work_type& myVal)
{
typename Reductor::template rebind<work_elem_type>::other op;
blockReduce<BLOCK_SIZE>(smem[0], myVal, threadIdx.x, op);
}
};
template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 2>
{
__device__ __forceinline__ static void call(work_elem_type smem[2][BLOCK_SIZE], work_type& myVal)
{
typename Reductor::template rebind<work_elem_type>::other op;
blockReduce<BLOCK_SIZE>(smem_tuple(smem[0], smem[1]), tie(myVal.x, myVal.y), threadIdx.x, make_tuple(op, op));
}
};
template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 3>
{
__device__ __forceinline__ static void call(work_elem_type smem[3][BLOCK_SIZE], work_type& myVal)
{
typename Reductor::template rebind<work_elem_type>::other op;
blockReduce<BLOCK_SIZE>(smem_tuple(smem[0], smem[1], smem[2]), tie(myVal.x, myVal.y, myVal.z), threadIdx.x, make_tuple(op, op, op));
}
};
template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 4>
{
__device__ __forceinline__ static void call(work_elem_type smem[4][BLOCK_SIZE], work_type& myVal)
{
typename Reductor::template rebind<work_elem_type>::other op;
blockReduce<BLOCK_SIZE>(smem_tuple(smem[0], smem[1], smem[2], smem[3]), tie(myVal.x, myVal.y, myVal.z, myVal.w), threadIdx.x, make_tuple(op, op, op, op));
}
};
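These specializations rely on the rebind member added to the reductors further down: a reductor defined for the vector work type (e.g. Sum<float3>) names the same reductor over the element type (Sum<float>), so each channel's shared-memory row can be reduced with a scalar operator. A plain C++ sketch of the idiom, with SumOp and float3_ as hypothetical stand-ins for the cudev types:

// Sketch of the rebind idiom: SumOp<float3_> exposes SumOp<float> via a
// nested template, and the scalar instance reduces each channel separately.
#include <iostream>

struct float3_ { float x, y, z; };

template <typename T> struct SumOp
{
    typedef T work_type;

    template <typename U> struct rebind
    {
        typedef SumOp<U> other;
    };

    T operator ()(T a, T b) const { return a + b; }
};

int main()
{
    typedef SumOp<float3_>::rebind<float>::other ScalarOp;
    ScalarOp op;

    float ch[3][4] = { { 1, 2, 3, 4 }, { 5, 6, 7, 8 }, { 9, 10, 11, 12 } };
    float3_ result = { 0, 0, 0 };
    for (int i = 0; i < 4; ++i)
    {
        result.x = op(result.x, ch[0][i]);
        result.y = op(result.y, ch[1][i]);
        result.z = op(result.z, ch[2][i]);
    }
    std::cout << result.x << " " << result.y << " " << result.z << std::endl;  // 10 26 42
    return 0;
}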
template <class Reductor, int BLOCK_SIZE, class SrcPtr, typename ResType, class MaskPtr>
__global__ void reduceToColumn(const SrcPtr src, ResType* dst, const MaskPtr mask, const int cols)
{
typedef typename Reductor::work_type work_type;
typedef typename VecTraits<work_type>::elem_type work_elem_type;
const int cn = VecTraits<work_type>::cn;
__shared__ work_type smem[BLOCK_SIZE];
__shared__ work_elem_type smem[cn][BLOCK_SIZE];
const int y = blockIdx.x;
@@ -75,7 +115,7 @@ namespace grid_reduce_to_vec_detail
}
}
blockReduce<BLOCK_SIZE>(smem, myVal, threadIdx.x, op);
Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, cn>::call(smem, myVal);
if (threadIdx.x == 0)
dst[y] = saturate_cast<ResType>(Reductor::result(myVal, cols));
......
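In the kernel above, shared memory moves from one array of the vector work type to one array per channel of the element type, which is what lets Reduce<...>::call hand each channel's row to a scalar block reduction. A host-side sketch of the layout change, with hypothetical names and no CUDA dependencies:

// Sketch: array-of-structs (old smem layout) versus one row per channel
// (new smem layout); each row can then be reduced on its own.
#include <cstdio>

const int BLOCK_SIZE = 4;
struct float3_ { float x, y, z; };

int main()
{
    // Old layout: smem[BLOCK_SIZE] of the vector work type, channels interleaved.
    float3_ smem_aos[BLOCK_SIZE] = { {1,5,9}, {2,6,10}, {3,7,11}, {4,8,12} };

    // New layout: smem[cn][BLOCK_SIZE] of the element type, one contiguous row per channel.
    float smem_soa[3][BLOCK_SIZE];
    for (int t = 0; t < BLOCK_SIZE; ++t)
    {
        smem_soa[0][t] = smem_aos[t].x;
        smem_soa[1][t] = smem_aos[t].y;
        smem_soa[2][t] = smem_aos[t].z;
    }

    // Each channel row is reduced independently with a scalar operator.
    for (int c = 0; c < 3; ++c)
    {
        float sum = 0.f;
        for (int t = 0; t < BLOCK_SIZE; ++t)
            sum += smem_soa[c][t];
        std::printf("channel %d sum = %g\n", c, sum);  // 10, 26, 42
    }
    return 0;
}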
@@ -49,6 +49,7 @@
#include "../common.hpp"
#include "../util/vec_traits.hpp"
#include "../util/limits.hpp"
#include "../util/saturate_cast.hpp"
#include "../ptr2d/traits.hpp"
#include "../ptr2d/gpumat.hpp"
#include "../ptr2d/mask.hpp"
@@ -62,6 +63,11 @@ template <typename T> struct Sum : plus<T>
{
typedef T work_type;
template <typename U> struct rebind
{
typedef Sum<U> other;
};
__device__ __forceinline__ static T initialValue()
{
return VecTraits<T>::all(0);
@@ -77,14 +83,19 @@ template <typename T> struct Avg : plus<T>
{
typedef T work_type;
template <typename U> struct rebind
{
typedef Avg<U> other;
};
__device__ __forceinline__ static T initialValue()
{
return VecTraits<T>::all(0);
}
__device__ __forceinline__ static T result(T r, int sz)
__device__ __forceinline__ static T result(T r, float sz)
{
return r / sz;
return saturate_cast<T>(r / sz);
}
};
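Avg::result now divides by a float count and saturate-casts the quotient back, so integer work types no longer truncate the mean. A small sketch of the difference, using saturate_cast_u8 as a hypothetical stand-in for cv::cudev::saturate_cast (which rounds to the nearest integer):

// Sketch: integer division truncates (7 / 2 == 3); dividing in float and
// rounding with a saturating cast gives 4, matching the CPU cv::reduce result.
#include <cstdio>
#include <cmath>

static unsigned char saturate_cast_u8(float v)
{
    float r = std::floor(v + 0.5f);           // round to nearest (stand-in)
    if (r < 0.f)   return 0;
    if (r > 255.f) return 255;
    return static_cast<unsigned char>(r);
}

int main()
{
    int sum = 7;                                           // accumulated over sz = 2 samples
    int oldAvg = sum / 2;                                  // old path: integer division -> 3
    unsigned char newAvg = saturate_cast_u8(sum / 2.0f);   // new path: 3.5 -> 4
    std::printf("old = %d, new = %d\n", oldAvg, newAvg);
    return 0;
}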
@@ -92,6 +103,11 @@ template <typename T> struct Min : minimum<T>
{
typedef T work_type;
template <typename U> struct rebind
{
typedef Min<U> other;
};
__device__ __forceinline__ static T initialValue()
{
return VecTraits<T>::all(numeric_limits<typename VecTraits<T>::elem_type>::max());
@@ -107,6 +123,11 @@ template <typename T> struct Max : maximum<T>
{
typedef T work_type;
template <typename U> struct rebind
{
typedef Max<U> other;
};
__device__ __forceinline__ static T initialValue()
{
return VecTraits<T>::all(-numeric_limits<typename VecTraits<T>::elem_type>::max());
@@ -158,7 +179,7 @@ __host__ void gridReduceToColumn_(const SrcPtr& src, GpuMat_<ResType>& dst, cons
CV_Assert( getRows(mask) == rows && getCols(mask) == cols );
createContinuous(rows, 1, DataType<ResType>::type, dst);
dst.create(1, rows);
grid_reduce_to_vec_detail::reduceToColumn<Reductor, Policy>(shrinkPtr(src),
dst[0],
@@ -173,7 +194,7 @@ __host__ void gridReduceToColumn_(const SrcPtr& src, GpuMat_<ResType>& dst, Stre
const int rows = getRows(src);
const int cols = getCols(src);
createContinuous(rows, 1, DataType<ResType>::type, dst);
dst.create(1, rows);
grid_reduce_to_vec_detail::reduceToColumn<Reductor, Policy>(shrinkPtr(src),
dst[0],
......
@@ -228,6 +228,9 @@ TEST(ReduceToColumn, Sum)
Mat dst_gold;
cv::reduce(src, dst_gold, 1, REDUCE_SUM, CV_32S);
dst_gold.cols = dst_gold.rows;
dst_gold.rows = 1;
dst_gold.step = dst_gold.cols * dst_gold.elemSize();
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
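Because gridReduceToColumn_ now allocates its result as a 1 x rows row (dst.create(1, rows) above), the test flips the CPU gold column into a row by patching the Mat header in place; the same three lines repeat in the Avg, Min and Max tests below. For a continuous single-column Mat this is equivalent to a reshape or a transpose, as in this sketch (col is a hypothetical stand-in for dst_gold):

// Sketch: two ways to view a continuous rows x 1 gold result as the
// 1 x rows row the device layer now produces.
#include <opencv2/core.hpp>

int main()
{
    cv::Mat col(128, 1, CV_32SC1, cv::Scalar(7)); // stand-in for dst_gold
    cv::Mat asRow1 = col.reshape(0, 1);           // reinterpret the header: no data copy
    cv::Mat asRow2 = col.t();                     // transpose: copies the data
    CV_Assert(asRow1.rows == 1 && asRow1.cols == 128);
    CV_Assert(cv::norm(asRow1, asRow2, cv::NORM_INF) == 0.0);
    return 0;
}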
@@ -244,6 +247,9 @@ TEST(ReduceToColumn, Avg)
Mat dst_gold;
cv::reduce(src, dst_gold, 1, REDUCE_AVG, CV_32F);
dst_gold.cols = dst_gold.rows;
dst_gold.rows = 1;
dst_gold.step = dst_gold.cols * dst_gold.elemSize();
EXPECT_MAT_NEAR(dst_gold, dst, 1e-4);
}
@@ -260,6 +266,9 @@ TEST(ReduceToColumn, Min)
Mat dst_gold;
cv::reduce(src, dst_gold, 1, REDUCE_MIN);
dst_gold.cols = dst_gold.rows;
dst_gold.rows = 1;
dst_gold.step = dst_gold.cols * dst_gold.elemSize();
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
@@ -276,6 +285,9 @@ TEST(ReduceToColumn, Max)
Mat dst_gold;
cv::reduce(src, dst_gold, 1, REDUCE_MAX);
dst_gold.cols = dst_gold.rows;
dst_gold.rows = 1;
dst_gold.step = dst_gold.cols * dst_gold.elemSize();
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
......