Eliminate unnecessary double-precision arithmetic in CUDA code

cd2b7448 · Vladislav Vinogradov · 0b270e2b · cd2b7448 · cd2b7448 · cd2b7448
Commit cd2b7448 authored May 30, 2013 by Vladislav Vinogradov
9 changed files
--- a/modules/gpuarithm/src/cuda/reduce.cu
+++ b/modules/gpuarithm/src/cuda/reduce.cu
@@ -72,7 +72,7 @@ namespace reduce
        }
        template <typename T>
-        __device__ __forceinline__ T result(T r, double) const
+        __device__ __forceinline__ T result(T r, int) const
        {
            return r;
        }
@@ -81,6 +81,15 @@ namespace reduce
        __host__ __device__ __forceinline__ Sum(const Sum&) {}
    };
+    template <typename T> struct OutputType
+    {
+        typedef float type;
+    };
+    template <> struct OutputType<double>
+    {
+        typedef double type;
+    };
    struct Avg
    {
        template <typename T>
@@ -96,7 +105,7 @@ namespace reduce
        }
        template <typename T>
-        __device__ __forceinline__ typename TypeVec<double, VecTraits<T>::cn>::vec_type result(T r, double sz) const
+        __device__ __forceinline__ typename TypeVec<typename OutputType<typename VecTraits<T>::elem_type>::type, VecTraits<T>::cn>::vec_type result(T r, float sz) const
        {
            return r / sz;
        }
@@ -121,7 +130,7 @@ namespace reduce
        }
        template <typename T>
-        __device__ __forceinline__ T result(T r, double) const
+        __device__ __forceinline__ T result(T r, int) const
        {
            return r;
        }
@@ -146,7 +155,7 @@ namespace reduce
        }
        template <typename T>
-        __device__ __forceinline__ T result(T r, double) const
+        __device__ __forceinline__ T result(T r, int) const
        {
            return r;
        }

--- a/modules/gpubgsegm/src/cuda/mog2.cu
+++ b/modules/gpubgsegm/src/cuda/mog2.cu
@@ -227,7 +227,7 @@ namespace cv { namespace gpu { namespace cudev
                //check prune
                if (weight < -prune)
                {
-                    weight = 0.0;
+                    weight = 0.0f;
                    nmodes--;
                }

--- a/modules/gpufeatures2d/perf/perf_features2d.cpp
+++ b/modules/gpufeatures2d/perf/perf_features2d.cpp
@@ -123,7 +123,7 @@ PERF_TEST_P(Image_NFeatures, ORB,
        sortKeyPoints(gpu_keypoints, gpu_descriptors);
-        SANITY_CHECK_KEYPOINTS(gpu_keypoints);
+        SANITY_CHECK_KEYPOINTS(gpu_keypoints, 1e-4);
        SANITY_CHECK(gpu_descriptors);
    }
    else

--- a/modules/gpufeatures2d/src/cuda/orb.cu
+++ b/modules/gpufeatures2d/src/cuda/orb.cu
@@ -197,8 +197,8 @@ namespace cv { namespace gpu { namespace cudev
                if (threadIdx.x == 0)
                {
                    float kp_dir = ::atan2f((float)m_01, (float)m_10);
-                    kp_dir += (kp_dir < 0) * (2.0f * CV_PI);
+                    kp_dir += (kp_dir < 0) * (2.0f * CV_PI_F);
-                    kp_dir *= 180.0f / CV_PI;
+                    kp_dir *= 180.0f / CV_PI_F;
                    angle[ptidx] = kp_dir;
                }
@@ -349,7 +349,7 @@ namespace cv { namespace gpu { namespace cudev
            if (ptidx < npoints && descidx < dsize)
            {
                float angle = angle_[ptidx];
-                angle *= (float)(CV_PI / 180.f);
+                angle *= (float)(CV_PI_F / 180.f);
                float sina, cosa;
                ::sincosf(angle, &sina, &cosa);

--- a/modules/gpuimgproc/src/cuda/bilateral_filter.cu
+++ b/modules/gpuimgproc/src/cuda/bilateral_filter.cu
@@ -133,7 +133,7 @@ namespace cv { namespace gpu { namespace cudev
            B<T> b(src.rows, src.cols);
            float sigma_spatial2_inv_half = -0.5f/(sigma_spatial * sigma_spatial);
-             float sigma_color2_inv_half = -0.5f/(sigma_color * sigma_color);
+            float sigma_color2_inv_half = -0.5f/(sigma_color * sigma_color);
            cudaSafeCall( cudaFuncSetCacheConfig (bilateral_kernel<T, B<T> >, cudaFuncCachePreferL1) );
            bilateral_kernel<<<grid, block>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, kernel_size, sigma_spatial2_inv_half, sigma_color2_inv_half);

--- a/modules/gpuimgproc/src/cuda/canny.cu
+++ b/modules/gpuimgproc/src/cuda/canny.cu
@@ -43,7 +43,7 @@
 #if !defined CUDA_DISABLER
 #include <utility>
-#include <algorithm>//std::swap
+#include <algorithm>
 #include "opencv2/core/cuda/common.hpp"
 #include "opencv2/core/cuda/emulation.hpp"
 #include "opencv2/core/cuda/transform.hpp"

--- a/modules/gpuoptflow/src/cuda/needle_map.cu
+++ b/modules/gpuoptflow/src/cuda/needle_map.cu
@@ -140,7 +140,7 @@ namespace cv { namespace gpu { namespace cudev
                const float u_avg_val = u_avg(y, x);
                const float v_avg_val = v_avg(y, x);
-                const float theta = ::atan2f(v_avg_val, u_avg_val);// + CV_PI;
+                const float theta = ::atan2f(v_avg_val, u_avg_val);
                float r = ::sqrtf(v_avg_val * v_avg_val + u_avg_val * u_avg_val);
                r = fmin(14.0f * (r / max_flow), 14.0f);

--- a/modules/softcascade/src/cuda/icf-sc.cu
+++ b/modules/softcascade/src/cuda/icf-sc.cu
@@ -137,10 +137,10 @@ typedef unsigned char uchar;
    template<bool isDefaultNum>
    __device__ __forceinline__ int fast_angle_bin(const float& dx, const float& dy)
    {
-        const float angle_quantum = CV_PI / 6.f;
+        const float angle_quantum = CV_PI_F / 6.f;
        float angle = atan2(dx, dy) + (angle_quantum / 2.f);
-        if (angle < 0) angle += CV_PI;
+        if (angle < 0) angle += CV_PI_F;
        const float angle_scaling = 1.f / angle_quantum;
        return static_cast<int>(angle * angle_scaling) % 6;
@@ -174,8 +174,8 @@ typedef unsigned char uchar;
        {
            int i = 3;
            float2 bin_vector_i;
-            bin_vector_i.x = ::cos(i * (CV_PI / 6.f));
+            bin_vector_i.x = ::cos(i * (CV_PI_F / 6.f));
-            bin_vector_i.y = ::sin(i * (CV_PI / 6.f));
+            bin_vector_i.y = ::sin(i * (CV_PI_F / 6.f));
            const float dot_product = fabs(dx * bin_vector_i.x + dy * bin_vector_i.y);
            if(dot_product > max_dot)

--- a/modules/superres/perf/perf_superres.cpp
+++ b/modules/superres/perf/perf_superres.cpp
@@ -160,7 +160,7 @@ PERF_TEST_P(Size_MatType, SuperResolution_BTVL1,
        TEST_CYCLE_N(10) superRes->nextFrame(dst);
-        GPU_SANITY_CHECK(dst);
+        GPU_SANITY_CHECK(dst, 2);
    }
    else
    {