gpu::HoughLines : minor code improvements

c26d543e · Vladislav Vinogradov · 1e401207 · c26d543e · c26d543e · c26d543e
Commit c26d543e authored Aug 20, 2012 by Vladislav Vinogradov
5 changed files
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -820,6 +820,7 @@ private:
    int nLayers_;
 };
+//! HoughLines
 CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
 CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
 CV_EXPORTS void HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, float rho, float theta);

--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
@@ -1626,7 +1626,7 @@ PERF_TEST_P(Sz_DoSort, ImgProc_HoughLines, Combine(GPU_TYPICAL_MAT_SIZES, Bool()
    cv::Mat src(size, CV_8UC1, cv::Scalar::all(0));
-    const int numLines = rng.uniform(500, 2000);
+    const int numLines = rng.uniform(100, 300);
    for (int i = 0; i < numLines; ++i)
    {
        cv::Point p1(rng.uniform(0, src.cols), rng.uniform(0, src.rows));

--- a/modules/gpu/src/cuda/hough.cu
+++ b/modules/gpu/src/cuda/hough.cu
@@ -59,7 +59,7 @@ namespace cv { namespace gpu { namespace device
        {
            __shared__ int s_queues[4][32 * PIXELS_PER_THREAD];
            __shared__ int s_qsize[4];
-            __shared__ int s_start[4];
+            __shared__ int s_globStart[4];
            const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -73,9 +73,10 @@ namespace cv { namespace gpu { namespace device
            __syncthreads();
            // fill the queue
+            const uchar* srcRow = src.ptr(y);
            for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x)
            {
-                if (src(y, xx))
+                if (srcRow[xx])
                {
                    const unsigned int val = (y << 16) | xx;
                    const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
@@ -89,36 +90,34 @@ namespace cv { namespace gpu { namespace device
            if (threadIdx.x == 0 && threadIdx.y == 0)
            {
                // find how many items are stored in each list
-                int total_size = 0;
+                int totalSize = 0;
                for (int i = 0; i < blockDim.y; ++i)
                {
-                    s_start[i] = total_size;
+                    s_globStart[i] = totalSize;
-                    total_size += s_qsize[i];
+                    totalSize += s_qsize[i];
                }
                // calculate the offset in the global list
-                const int global_offset = atomicAdd(&g_counter, total_size);
+                const int globalOffset = atomicAdd(&g_counter, totalSize);
                for (int i = 0; i < blockDim.y; ++i)
-                    s_start[i] += global_offset;
+                    s_globStart[i] += globalOffset;
            }
            __syncthreads();
            // copy local queues to global queue
            const int qsize = s_qsize[threadIdx.y];
-            for(int i = threadIdx.x; i < qsize; i += blockDim.x)
+            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
-            {
+            for(int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
-                const unsigned int val = s_queues[threadIdx.y][i];
+                list[gidx] = s_queues[threadIdx.y][i];
-                list[s_start[threadIdx.y] + i] = val;
-            }
        }
        int buildPointList_gpu(DevMem2Db src, unsigned int* list)
        {
-            void* counter_ptr;
+            void* counterPtr;
-            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
+            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
            const dim3 block(32, 4);
            const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));
@@ -130,10 +129,10 @@ namespace cv { namespace gpu { namespace device
            cudaSafeCall( cudaDeviceSynchronize() );
-            int total_count;
+            int totalCount;
-            cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
-            return total_count;
+            return totalCount;
        }
        ////////////////////////////////////////////////////////////////////////
@@ -144,24 +143,26 @@ namespace cv { namespace gpu { namespace device
            const int n = blockIdx.x;
            const float ang = n * theta;
-            float sin_ang;
+            float sinVal;
-            float cos_ang;
+            float cosVal;
-            sincosf(ang, &sin_ang, &cos_ang);
+            sincosf(ang, &sinVal, &cosVal);
+            sinVal *= irho;
+            cosVal *= irho;
-            const float tabSin = sin_ang * irho;
+            const int shift = (numrho - 1) / 2;
-            const float tabCos = cos_ang * irho;
+            int* accumRow = accum.ptr(n + 1);
            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
-                const unsigned int qvalue = list[i];
+                const unsigned int val = list[i];
-                const int x = (qvalue & 0x0000FFFF);
+                const int x = (val & 0xFFFF);
-                const int y = (qvalue >> 16) & 0x0000FFFF;
+                const int y = (val >> 16) & 0xFFFF;
-                int r = __float2int_rn(x * tabCos + y * tabSin);
+                int r = __float2int_rn(x * cosVal + y * sinVal);
-                r += (numrho - 1) / 2;
+                r += shift;
-                ::atomicAdd(accum.ptr(n + 1) + r + 1, 1);
+                ::atomicAdd(accumRow + r + 1, 1);
            }
        }
@@ -177,30 +178,32 @@ namespace cv { namespace gpu { namespace device
            const int n = blockIdx.x;
            const float ang = n * theta;
-            float sin_ang;
+            float sinVal;
-            float cos_ang;
+            float cosVal;
-            sincosf(ang, &sin_ang, &cos_ang);
+            sincosf(ang, &sinVal, &cosVal);
+            sinVal *= irho;
+            cosVal *= irho;
-            const float tabSin = sin_ang * irho;
+            const int shift = (numrho - 1) / 2;
-            const float tabCos = cos_ang * irho;
            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
-                const unsigned int qvalue = list[i];
+                const unsigned int val = list[i];
-                const int x = (qvalue & 0x0000FFFF);
+                const int x = (val & 0xFFFF);
-                const int y = (qvalue >> 16) & 0x0000FFFF;
+                const int y = (val >> 16) & 0xFFFF;
-                int r = __float2int_rn(x * tabCos + y * tabSin);
+                int r = __float2int_rn(x * cosVal + y * sinVal);
-                r += (numrho - 1) / 2;
+                r += shift;
                Emulation::smem::atomicAdd(&smem[r + 1], 1);
            }
            __syncthreads();
-            for (int i = threadIdx.x; i < numrho; i += blockDim.x)
+            int* accumRow = accum.ptr(n + 1);
-                accum(n + 1, i) = smem[i];
+            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
+                accumRow[i] = smem[i];
        }
        void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
@@ -225,21 +228,21 @@ namespace cv { namespace gpu { namespace device
        ////////////////////////////////////////////////////////////////////////
        // linesGetResult
-        __global__ void linesGetResult(const DevMem2Di accum, float2* out, int* votes, const int maxSize, const float threshold, const float theta, const float rho, const int numrho)
+        __global__ void linesGetResult(const DevMem2Di accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const float threshold, const int numrho)
        {
            __shared__ int smem[8][32];
-            int r = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
+            const int x = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
-            int n = blockIdx.y * (blockDim.y - 2) + threadIdx.y;
+            const int y = blockIdx.y * (blockDim.y - 2) + threadIdx.y;
-            if (r >= accum.cols || n >= accum.rows)
+            if (x >= accum.cols || y >= accum.rows)
                return;
-            smem[threadIdx.y][threadIdx.x] = accum(n, r);
+            smem[threadIdx.y][threadIdx.x] = accum(y, x);
            __syncthreads();
-            r -= 1;
+            const int r = x - 1;
-            n -= 1;
+            const int n = y - 1;
            if (threadIdx.x == 0 || threadIdx.x == blockDim.x - 1 || threadIdx.y == 0 || threadIdx.y == blockDim.y - 1 || r >= accum.cols - 2 || n >= accum.rows - 2)
                return;
@@ -264,32 +267,32 @@ namespace cv { namespace gpu { namespace device
        int linesGetResult_gpu(DevMem2Di accum, float2* out, int* votes, int maxSize, float rho, float theta, float threshold, bool doSort)
        {
-            void* counter_ptr;
+            void* counterPtr;
-            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
+            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
            const dim3 block(32, 8);
            const dim3 grid(divUp(accum.cols, block.x - 2), divUp(accum.rows, block.y - 2));
-            linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, threshold, theta, rho, accum.cols - 2);
+            linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
            cudaSafeCall( cudaGetLastError() );
            cudaSafeCall( cudaDeviceSynchronize() );
-            int total_count;
+            int totalCount;
-            cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
-            total_count = ::min(total_count, maxSize);
+            totalCount = ::min(totalCount, maxSize);
-            if (doSort && total_count > 0)
+            if (doSort && totalCount > 0)
            {
-                thrust::device_ptr<float2> out_ptr(out);
+                thrust::device_ptr<float2> outPtr(out);
-                thrust::device_ptr<int> votes_ptr(votes);
+                thrust::device_ptr<int> votesPtr(votes);
-                thrust::sort_by_key(votes_ptr, votes_ptr + total_count, out_ptr, thrust::greater<int>());
+                thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
            }
-            return total_count;
+            return totalCount;
        }
    }
 }}}
--- a/modules/gpu/src/hough.cpp
+++ b/modules/gpu/src/hough.cpp
@@ -57,11 +57,27 @@ namespace cv { namespace gpu { namespace device
    namespace hough
    {
        int buildPointList_gpu(DevMem2Db src, unsigned int* list);
        void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock, bool has20);
        int linesGetResult_gpu(DevMem2Di accum, float2* out, int* votes, int maxSize, float rho, float theta, float threshold, bool doSort);
    }
 }}}
+//////////////////////////////////////////////////////////
+// HoughLines
+void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
+{
+    GpuMat accum, buf;
+    HoughLines(src, lines, accum, buf, rho, theta, threshold, doSort, maxLines);
+}
+void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort, int maxLines)
+{
+    HoughLinesTransform(src, accum, buf, rho, theta);
+    HoughLinesGet(accum, lines, rho, theta, threshold, doSort, maxLines);
+}
 void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, float rho, float theta)
 {
    using namespace cv::gpu::device::hough;
@@ -80,23 +96,23 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf,
    CV_Assert(numangle > 0 && numrho > 0);
    ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum);
-    accum.setTo(cv::Scalar::all(0));
+    accum.setTo(Scalar::all(0));
-    cv::gpu::DeviceInfo devInfo;
+    DeviceInfo devInfo;
    if (count > 0)
-        linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(cv::gpu::FEATURE_SET_COMPUTE_20));
+        linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
 }
 void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
 {
-    using namespace cv::gpu::device;
+    using namespace cv::gpu::device::hough;
    CV_Assert(accum.type() == CV_32SC1);
    ensureSizeIsEnough(2, maxLines, CV_32FC2, lines);
-    int count = hough::linesGetResult_gpu(accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);
+    int count = linesGetResult_gpu(accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);
    if (count > 0)
        lines.cols = count;
@@ -104,18 +120,6 @@ void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float
        lines.release();
 }
-void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
-{
-    cv::gpu::GpuMat accum, buf;
-    HoughLines(src, lines, accum, buf, rho, theta, threshold, doSort, maxLines);
-}
-void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort, int maxLines)
-{
-    HoughLinesTransform(src, accum, buf, rho, theta);
-    HoughLinesGet(accum, lines, rho, theta, threshold, doSort, maxLines);
-}
 void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, OutputArray h_votes_)
 {
    if (d_lines.empty())
@@ -129,14 +133,14 @@ void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, Ou
    CV_Assert(d_lines.rows == 2 && d_lines.type() == CV_32FC2);
    h_lines_.create(1, d_lines.cols, CV_32FC2);
-    cv::Mat h_lines = h_lines_.getMat();
+    Mat h_lines = h_lines_.getMat();
    d_lines.row(0).download(h_lines);
    if (h_votes_.needed())
    {
        h_votes_.create(1, d_lines.cols, CV_32SC1);
-        cv::Mat h_votes = h_votes_.getMat();
+        Mat h_votes = h_votes_.getMat();
-        cv::gpu::GpuMat d_votes(1, d_lines.cols, CV_32SC1, const_cast<int*>(d_lines.ptr<int>(1)));
+        GpuMat d_votes(1, d_lines.cols, CV_32SC1, const_cast<int*>(d_lines.ptr<int>(1)));
        d_votes.download(h_votes);
    }
 }

--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@@ -1129,63 +1129,67 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CornerMinEigen, testing::Combine(
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // HoughLines
-PARAM_TEST_CASE(HoughLines, cv::gpu::DeviceInfo, std::string)
+PARAM_TEST_CASE(HoughLines, cv::gpu::DeviceInfo, cv::Size, UseRoi)
 {
-};
+    void generateLines(cv::Mat& img)
+    {
+        img.setTo(cv::Scalar::all(0));
-void drawLines(cv::Mat& dst, const std::vector<cv::Vec2f>& lines)
+        cv::line(img, cv::Point(20, 0), cv::Point(20, img.rows), cv::Scalar::all(255));
-{
+        cv::line(img, cv::Point(0, 50), cv::Point(img.cols, 50), cv::Scalar::all(255));
-    for (size_t i = 0; i < lines.size(); ++i)
+        cv::line(img, cv::Point(0, 0), cv::Point(img.cols, img.rows), cv::Scalar::all(255));
+        cv::line(img, cv::Point(img.cols, 0), cv::Point(0, img.rows), cv::Scalar::all(255));
+    }
+    void drawLines(cv::Mat& dst, const std::vector<cv::Vec2f>& lines)
    {
-        float rho = lines[i][0], theta = lines[i][1];
+        dst.setTo(cv::Scalar::all(0));
-        cv::Point pt1, pt2;
-        double a = std::cos(theta), b = std::sin(theta);
+        for (size_t i = 0; i < lines.size(); ++i)
-        double x0 = a*rho, y0 = b*rho;
+        {
-        pt1.x = cvRound(x0 + 1000*(-b));
+            float rho = lines[i][0], theta = lines[i][1];
-        pt1.y = cvRound(y0 + 1000*(a));
+            cv::Point pt1, pt2;
-        pt2.x = cvRound(x0 - 1000*(-b));
+            double a = std::cos(theta), b = std::sin(theta);
-        pt2.y = cvRound(y0 - 1000*(a));
+            double x0 = a*rho, y0 = b*rho;
-        cv::line(dst, pt1, pt2, cv::Scalar::all(255));
+            pt1.x = cvRound(x0 + 1000*(-b));
+            pt1.y = cvRound(y0 + 1000*(a));
+            pt2.x = cvRound(x0 - 1000*(-b));
+            pt2.y = cvRound(y0 - 1000*(a));
+            cv::line(dst, pt1, pt2, cv::Scalar::all(255));
+        }
    }
-}
+};
 TEST_P(HoughLines, Accuracy)
 {
    const cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
    cv::gpu::setDevice(devInfo.deviceID());
-    const std::string fileName = GET_PARAM(1);
+    const cv::Size size = GET_PARAM(1);
+    const bool useRoi = GET_PARAM(2);
    const float rho = 1.0f;
-    const float theta = static_cast<float>(CV_PI / 180);
+    const float theta = 1.5f * CV_PI / 180.0f;
-    const int threshold = 50;
+    const int threshold = 100;
-    cv::Mat img = readImage(fileName, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-    cv::Mat edges;
+    cv::Mat src(size, CV_8UC1);
-    cv::Canny(img, edges, 50, 200);
+    generateLines(src);
    cv::gpu::GpuMat d_lines;
-    cv::gpu::HoughLines(loadMat(edges), d_lines, rho, theta, threshold);
+    cv::gpu::HoughLines(loadMat(src, useRoi), d_lines, rho, theta, threshold);
    std::vector<cv::Vec2f> lines;
    cv::gpu::HoughLinesDownload(d_lines, lines);
-    cv::Mat dst(img.size(), CV_8UC1, cv::Scalar::all(0));
-    drawLines(dst, lines);
-    std::vector<cv::Vec2f> lines_gold;
+    cv::Mat dst(size, CV_8UC1);
-    cv::HoughLines(edges, lines_gold, rho, theta, threshold);
+    drawLines(dst, lines);
-    cv::Mat dst_gold(img.size(), CV_8UC1, cv::Scalar::all(0));
-    drawLines(dst_gold, lines_gold);
-    ASSERT_MAT_NEAR(dst_gold, dst, 0.0);
+    ASSERT_MAT_NEAR(src, dst, 0.0);
 }
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, HoughLines, testing::Combine(
    ALL_DEVICES,
-    testing::Values(std::string("../cv/shared/pic1.png"),
+    DIFFERENT_SIZES,
-                    std::string("../cv/shared/pic3.png"),
+    WHOLE_SUBMAT));
-                    std::string("../cv/shared/pic5.png"),
-                    std::string("../cv/shared/pic6.png"))));
 } // namespace