added performance tests for gpu::reduce

67a9b794 · Vladislav Vinogradov · 66b41b67 · 67a9b794 · 67a9b794 · 67a9b794
Commit 67a9b794 authored Sep 26, 2011 by Vladislav Vinogradov
Showing with 109 additions and 19 deletions

perf_arithm.cpp modules/gpu/perf/perf_arithm.cpp +31 -0

matrix_reductions.cu modules/gpu/src/cuda/matrix_reductions.cu +42 -19

tests.cpp samples/gpu/performance/tests.cpp +36 -0

No files found.
--- a/modules/gpu/perf/perf_arithm.cpp
+++ b/modules/gpu/perf/perf_arithm.cpp
@@ -716,3 +716,34 @@ PERF_TEST_P(DevInfo_Size_MatType, addWeighted, testing::Combine(testing::ValuesI

    SANITY_CHECK(dst_host);
 }
+
+PERF_TEST_P(DevInfo_Size_MatType_FlipCode, reduce, testing::Combine(testing::ValuesIn(devices()), // param 0: device the test runs on
+                                                                    testing::Values(GPU_TYPICAL_MAT_SIZES), // param 1: source matrix size
+                                                                    testing::Values(CV_8UC1, CV_8UC4, CV_32FC1), // param 2: source matrix type
+                                                                    testing::Values((int)HORIZONTAL_AXIS, (int)VERTICAL_AXIS))) // param 3: forwarded to reduce() as dim; NOTE(review): reuses the *_FlipCode fixture to carry it
+{ // Performance test: measures reduce(src, dst, dim, CV_REDUCE_MIN) for every combination of the parameters above.
+    DeviceInfo devInfo = std::tr1::get<0>(GetParam());
+    Size size = std::tr1::get<1>(GetParam());
+    int type = std::tr1::get<2>(GetParam());
+    int dim = std::tr1::get<3>(GetParam());
+
+    setDevice(devInfo.deviceID());
+    // Host-side source matrix with the requested size and type.
+    Mat src_host(size, type);
+    // Register src_host as a test input; WARMUP_RNG presumably fills it with random data - confirm against the perf framework.
+    declare.in(src_host, WARMUP_RNG);
+    // Build the device-side source from src_host.
+    GpuMat src(src_host);
+    GpuMat dst(size, type); // allocated with the source's size/type; NOTE(review): reduce() presumably re-creates it with the reduced shape - confirm
+    // Measurement budget for the cycle below (units are defined by the perf framework).
+    declare.time(0.5).iterations(100);
+    // Only the reduce() call sits inside the measured cycle.
+    SIMPLE_TEST_CYCLE()
+    {
+        reduce(src, dst, dim, CV_REDUCE_MIN);
+    }
+    // Convert the device result to a host Mat so it can be passed to SANITY_CHECK.
+    Mat dst_host = dst;
+
+    SANITY_CHECK(dst_host);
+}
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
@@ -1894,27 +1894,29 @@ namespace cv { namespace gpu { namespace mathfunc

        const int x = blockIdx.x * 16 + threadIdx.x;

-        if (x < src.cols)
-        {
        S myVal = op.startValue();

+        if (x < src.cols)
+        {
            for (int y = threadIdx.y; y < src.rows; y += 16)
                myVal = op(myVal, src.ptr(y)[x]);
+        }        

-            smem[threadIdx.y * 16 + threadIdx.x] = myVal;
+        smem[threadIdx.x * 16 + threadIdx.y] = myVal;
        __syncthreads();

-            if (threadIdx.y == 0)
+        if (threadIdx.x < 8)
        {
-                myVal = smem[threadIdx.x];
-
-                #pragma unroll
-                for (int i = 1; i < 16; ++i)
-                    myVal = op(myVal, smem[i * 16 + threadIdx.x]);
-
-                dst[x] = saturate_cast<D>(op.result(myVal, src.rows));
-            }
+            volatile S* srow = smem + threadIdx.y * 16;
+            srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 8]);
+            srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 4]);
+            srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 2]);
+            srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 1]);
        }
+        __syncthreads();
+
+        if (threadIdx.y == 0 && x < src.cols)
+            dst[x] = saturate_cast<D>(op.result(smem[threadIdx.x * 16], src.rows));
    }

    template <template <typename> class Op, typename T, typename S, typename D> void reduceRows_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)
@@ -1965,7 +1967,6 @@ namespace cv { namespace gpu { namespace mathfunc



-
    template <int cn, class Op, typename T, typename S, typename D> __global__ void reduceCols(const DevMem2D_<T> src, D* dst, const Op op)
    {
        __shared__ S smem[256 * cn];
@@ -1980,6 +1981,9 @@ namespace cv { namespace gpu { namespace mathfunc
        for (int c = 0; c < cn; ++c)
            myVal[c] = op.startValue();

+#if __CUDA_ARCH__ >= 200
+
+        // For cc >= 2.0 prefer L1 cache
        for (int x = threadIdx.x; x < src.cols; x += 256)
        {
            #pragma unroll
@@ -1987,6 +1991,29 @@ namespace cv { namespace gpu { namespace mathfunc
                myVal[c] = op(myVal[c], src_row[x * cn + c]);
        }

+#else // __CUDA_ARCH__ >= 200
+
+        // For older arch use shared memory for cache
+        for (int x = 0; x < src.cols; x += 256)
+        {
+            #pragma unroll
+            for (int c = 0; c < cn; ++c)
+            {
+                smem[c * 256 + threadIdx.x] = op.startValue();
+                const int load_x = x * cn + c * 256 + threadIdx.x;
+                if (load_x < src.cols * cn)
+                    smem[c * 256 + threadIdx.x] = src_row[load_x];
+            }
+            __syncthreads();
+
+            #pragma unroll
+            for (int c = 0; c < cn; ++c)
+                myVal[c] = op(myVal[c], smem[threadIdx.x * cn + c]);
+            __syncthreads();
+        }
+
+#endif // __CUDA_ARCH__ >= 200
+
        #pragma unroll
        for (int c = 0; c < cn; ++c)
            smem[c * 256 + threadIdx.x] = myVal[c];
@@ -2025,12 +2052,8 @@ namespace cv { namespace gpu { namespace mathfunc
        }
        __syncthreads();

-        if (threadIdx.x == 0)
-        {
-            #pragma unroll
-            for (int c = 0; c < cn; ++c)
-                dst[y * cn + c] = saturate_cast<D>(op.result(smem[c * 256], src.cols));
-        }
+        if (threadIdx.x < cn)
+            dst[y * cn + threadIdx.x] = saturate_cast<D>(op.result(smem[threadIdx.x * 256], src.cols));
    }

    template <int cn, template <typename> class Op, typename T, typename S, typename D> void reduceCols_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)

--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -1419,3 +1419,39 @@ TEST(Canny)
    gpu::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
    GPU_OFF;
 }
+
+
+TEST(reduce) // Runs CPU reduce() and gpu::reduce() with CV_REDUCE_MIN along dim 0 and dim 1; CPU_ON/OFF and GPU_ON/OFF presumably bracket the timed regions - confirm.
+{
+    for (int size = 1000; size < 4000; size += 1000) // square inputs: 1000, 2000, 3000
+    {
+        Mat src;
+        gen(src, size, size, CV_32F, 0, 255); // presumably fills src (size x size, CV_32F) with values in [0, 255] - confirm against gen()
+        Mat dst0(1, src.cols, CV_32F);
+        Mat dst1(src.rows, 1, CV_32F);
+        // Device-side copies of the source and the two outputs.
+        gpu::GpuMat d_src(src);
+        gpu::GpuMat d_dst0(1, src.cols, CV_32F);
+        gpu::GpuMat d_dst1(1, src.rows, CV_32F); // single row (1 x rows), whereas the CPU dst1 above is rows x 1
+        // dim = 0: outputs are allocated as 1 x cols on both CPU and GPU.
+        SUBTEST << "size " << size << ", dim = 0";
+
+        CPU_ON;
+        reduce(src, dst0, 0, CV_REDUCE_MIN);
+        CPU_OFF;
+
+        GPU_ON;
+        gpu::reduce(d_src, d_dst0, 0, CV_REDUCE_MIN);
+        GPU_OFF;
+        // dim = 1: same source, reduced along the other dimension.
+        SUBTEST << "size " << size << ", dim = 1";
+
+        CPU_ON;
+        reduce(src, dst1, 1, CV_REDUCE_MIN);
+        CPU_OFF;
+
+        GPU_ON;
+        gpu::reduce(d_src, d_dst1, 1, CV_REDUCE_MIN);
+        GPU_OFF;
+    }
+}