added linesAccumGlobal kernel

7928cec6 · Vladislav Vinogradov · 7ae94c57
Commit 7928cec6 authored Aug 15, 2012 by Vladislav Vinogradov
5 changed files
--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@@ -112,6 +112,8 @@ namespace cv { namespace gpu

        int multiProcessorCount() const { return multi_processor_count_; }

+        size_t sharedMemPerBlock() const { return sharedMemPerBlock_; }
+
        size_t freeMemory() const;
        size_t totalMemory() const;

@@ -133,6 +135,7 @@ namespace cv { namespace gpu
        int multi_processor_count_;
        int majorVersion_;
        int minorVersion_;
+        size_t sharedMemPerBlock_;
    };

    CV_EXPORTS void printCudaDeviceInfo(int device);

--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -42,7 +42,6 @@

 #include "precomp.hpp"
 #include "opencv2/core/gpumat.hpp"
-
 #include <iostream>

 #ifdef HAVE_CUDA
@@ -301,6 +300,7 @@ void cv::gpu::DeviceInfo::query()
    multi_processor_count_ = prop.multiProcessorCount;
    majorVersion_ = prop.major;
    minorVersion_ = prop.minor;
+    sharedMemPerBlock_ = prop.sharedMemPerBlock;
 }

 void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory) const

--- a/modules/gpu/src/cuda/hough.cu
+++ b/modules/gpu/src/cuda/hough.cu
--- a/modules/gpu/src/hough.cpp
+++ b/modules/gpu/src/hough.cpp
@@ -56,9 +56,9 @@ namespace cv { namespace gpu { namespace device
 {
    namespace hough
    {
-        unsigned int buildPointList_gpu(DevMem2Db src, unsigned int* list);
-        void linesAccum_gpu(const unsigned int* list, unsigned int count, DevMem2D_<unsigned int> accum, float rho, float theta);
-        unsigned int linesGetResult_gpu(DevMem2D_<uint> accum, float2* out, int* voices, unsigned int maxSize, float rho, float theta, float threshold, bool doSort);
+        int buildPointList_gpu(DevMem2Db src, unsigned int* list);
+        void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock);
+        int linesGetResult_gpu(DevMem2Di accum, float2* out, int* voices, int maxSize, float rho, float theta, float threshold, bool doSort);
    }
 }}}

@@ -71,16 +71,21 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf,
    CV_Assert(src.rows < std::numeric_limits<unsigned short>::max());

    ensureSizeIsEnough(1, src.size().area(), CV_32SC1, buf);
-    unsigned int count = buildPointList_gpu(src, buf.ptr<unsigned int>());
+
+    const int count = buildPointList_gpu(src, buf.ptr<unsigned int>());

    const int numangle = cvRound(CV_PI / theta);
    const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho);

+    CV_Assert(numangle > 0 && numrho > 0);
+
    ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum);
    accum.setTo(cv::Scalar::all(0));

+    cv::gpu::DeviceInfo devInfo;
+
    if (count > 0)
-        linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta);
+        linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta, devInfo.sharedMemPerBlock());
 }

 void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
@@ -90,7 +95,8 @@ void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float
    CV_Assert(accum.type() == CV_32SC1);

    ensureSizeIsEnough(2, maxLines, CV_32FC2, lines);
-    unsigned int count = hough::linesGetResult_gpu(accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);
+
+    int count = hough::linesGetResult_gpu(accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);

    if (count > 0)
        lines.cols = count;

--- a/modules/gpu/src/opencv2/gpu/device/emulation.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/emulation.hpp
@@ -99,7 +99,7 @@ namespace cv { namespace gpu { namespace device
            }

            template<typename T>
-            static __device__ __forceinline__ void atomicAdd(T* address, T val)
+            static __device__ __forceinline__ T atomicAdd(T* address, T val)
            {
 #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
                T count;
@@ -110,8 +110,10 @@ namespace cv { namespace gpu { namespace device
                    count = tag | (count + val);
                    *address = count;
                } while (*address != count);
+
+                return (count & TAG_MASK) - val;
 #else
-                ::atomicAdd(address, val);
+                return ::atomicAdd(address, val);
 #endif
            }