new reduce and reduceKeyVal implementation

7a1874b2 · Vladislav Vinogradov · d47c1124 · 7a1874b2 · 7a1874b2 · 7a1874b2
Commit 7a1874b2 authored Nov 12, 2012 by Vladislav Vinogradov
8 changed files
--- a/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp
--- a/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp
--- a/modules/gpu/include/opencv2/gpu/device/reduce.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/reduce.hpp
--- a/modules/gpu/include/opencv2/gpu/device/utility.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/utility.hpp
@@ -159,7 +159,7 @@ namespace cv { namespace gpu { namespace device
    ///////////////////////////////////////////////////////////////////////////////
    // Reduction
-    template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
+    template <int n, typename T, typename Op> __device__ __forceinline__ void reduce_old(volatile T* data, T& partial_reduction, int tid, const Op& op)
    {
        StaticAssert<n >= 8 && n <= 512>::check();
        utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);

--- a/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp
@@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device
        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
        {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
+            reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
        }
        __device__ __forceinline__ operator int() const
@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device
        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
        {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
+            reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
        }
        __device__ __forceinline__ operator float() const
@@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device
        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
        {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
+            reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
        }
        __device__ __forceinline__ operator float() const
@@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device
        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
        {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
+            reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
        }
        __device__ __forceinline__ operator int() const

--- a/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__
+#define __OPENCV_GPU_WARP_SHUFFLE_HPP__
+namespace cv { namespace gpu { namespace device
+{
+    // Generic warp shuffle wrapper.
+    // When compiling for __CUDA_ARCH__ >= 300 the call is forwarded to the
+    // __shfl intrinsic with the given source lane and width. On every other
+    // target no exchange is performed and a value-initialized T is returned.
+    template <typename T>
+    __device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        const T fetched = __shfl(val, srcLane, width);
+    #else
+        // No shuffle instruction on this target: arguments are ignored.
+        const T fetched = T();
+    #endif
+        return fetched;
+    }
+    // Warp shuffle overload for double.
+    // For __CUDA_ARCH__ >= 300 the value is split into its low and high 32-bit
+    // integer halves, each half is passed through __shfl separately, and the
+    // two results are recombined into a double. On every other target the
+    // arguments are ignored and 0.0 is returned.
+    __device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        // Low word first, then high word - same order as the exchange itself.
+        const int lowWord  = __shfl(__double2loint(val), srcLane, width);
+        const int highWord = __shfl(__double2hiint(val), srcLane, width);
+
+        return __hiloint2double(highWord, lowWord);
+    #else
+        return 0.0;
+    #endif
+    }
+    // Generic "shuffle down" wrapper.
+    // When compiling for __CUDA_ARCH__ >= 300 the call is forwarded to the
+    // __shfl_down intrinsic with the given delta and width. On every other
+    // target no exchange is performed and a value-initialized T is returned.
+    template <typename T>
+    __device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        const T fetched = __shfl_down(val, delta, width);
+    #else
+        // No shuffle instruction on this target: arguments are ignored.
+        const T fetched = T();
+    #endif
+        return fetched;
+    }
+    // "Shuffle down" overload for double.
+    // For __CUDA_ARCH__ >= 300 the value is split into its low and high 32-bit
+    // integer halves, each half is passed through __shfl_down separately, and
+    // the two results are recombined into a double. On every other target the
+    // arguments are ignored and 0.0 is returned.
+    __device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize)
+    {
+    #if __CUDA_ARCH__ >= 300
+        // Low word first, then high word - same order as the exchange itself.
+        const int lowWord  = __shfl_down(__double2loint(val), delta, width);
+        const int highWord = __shfl_down(__double2hiint(val), delta, width);
+
+        return __hiloint2double(highWord, lowWord);
+    #else
+        return 0.0;
+    #endif
+    }
+}}}
+#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__
--- a/modules/gpu/src/cuda/orb.cu
+++ b/modules/gpu/src/cuda/orb.cu
@@ -109,9 +109,9 @@ namespace cv { namespace gpu { namespace device
                    c += Ix * Iy;
                }
-                reduce<32>(srow, a, threadIdx.x, plus<volatile int>());
+                reduce_old<32>(srow, a, threadIdx.x, plus<volatile int>());
-                reduce<32>(srow, b, threadIdx.x, plus<volatile int>());
+                reduce_old<32>(srow, b, threadIdx.x, plus<volatile int>());
-                reduce<32>(srow, c, threadIdx.x, plus<volatile int>());
+                reduce_old<32>(srow, c, threadIdx.x, plus<volatile int>());
                if (threadIdx.x == 0)
                {
@@ -167,7 +167,7 @@ namespace cv { namespace gpu { namespace device
                for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
                    m_10 += u * image(loc.y, loc.x + u);
-                reduce<32>(srow, m_10, threadIdx.x, plus<volatile int>());
+                reduce_old<32>(srow, m_10, threadIdx.x, plus<volatile int>());
                for (int v = 1; v <= half_k; ++v)
                {
@@ -185,8 +185,8 @@ namespace cv { namespace gpu { namespace device
                        m_sum += u * (val_plus + val_minus);
                    }
-                    reduce<32>(srow, v_sum, threadIdx.x, plus<volatile int>());
+                    reduce_old<32>(srow, v_sum, threadIdx.x, plus<volatile int>());
-                    reduce<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
+                    reduce_old<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
                    m_10 += m_sum;
                    m_01 += v * v_sum;
@@ -419,4 +419,4 @@ namespace cv { namespace gpu { namespace device
    }
 }}}
 #endif /* CUDA_DISABLER */
\ No newline at end of file
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@@ -599,8 +599,8 @@ namespace cv { namespace gpu { namespace device
                    sumy += s_Y[threadIdx.x + 96];
                }
-                device::reduce<32>(s_sumx + threadIdx.y * 32, sumx, threadIdx.x, plus<volatile float>());
+                device::reduce_old<32>(s_sumx + threadIdx.y * 32, sumx, threadIdx.x, plus<volatile float>());
-                device::reduce<32>(s_sumy + threadIdx.y * 32, sumy, threadIdx.x, plus<volatile float>());
+                device::reduce_old<32>(s_sumy + threadIdx.y * 32, sumy, threadIdx.x, plus<volatile float>());
                const float temp_mod = sumx * sumx + sumy * sumy;
                if (temp_mod > best_mod)