Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
c3fa7974
Commit
c3fa7974
authored
Aug 14, 2012
by
Vladislav Vinogradov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
new optimized version
parent
9201db32
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
159 additions
and
39 deletions
+159
-39
gpu.hpp
modules/gpu/include/opencv2/gpu/gpu.hpp
+2
-2
perf_imgproc.cpp
modules/gpu/perf/perf_imgproc.cpp
+3
-2
hough.cu
modules/gpu/src/cuda/hough.cu
+138
-25
hough.cpp
modules/gpu/src/hough.cpp
+15
-8
test_imgproc.cpp
modules/gpu/test/test_imgproc.cpp
+1
-2
No files found.
modules/gpu/include/opencv2/gpu/gpu.hpp
View file @
c3fa7974
...
...
@@ -821,8 +821,8 @@ private:
};
CV_EXPORTS
void
HoughLines
(
const
GpuMat
&
src
,
GpuMat
&
lines
,
float
rho
,
float
theta
,
int
threshold
,
bool
doSort
=
false
,
int
maxLines
=
4096
);
CV_EXPORTS
void
HoughLines
(
const
GpuMat
&
src
,
GpuMat
&
lines
,
GpuMat
&
accum
,
float
rho
,
float
theta
,
int
threshold
,
bool
doSort
=
false
,
int
maxLines
=
4096
);
CV_EXPORTS
void
HoughLinesTransform
(
const
GpuMat
&
src
,
GpuMat
&
accum
,
float
rho
,
float
theta
);
CV_EXPORTS
void
HoughLines
(
const
GpuMat
&
src
,
GpuMat
&
lines
,
GpuMat
&
accum
,
GpuMat
&
buf
,
float
rho
,
float
theta
,
int
threshold
,
bool
doSort
=
false
,
int
maxLines
=
4096
);
CV_EXPORTS
void
HoughLinesTransform
(
const
GpuMat
&
src
,
GpuMat
&
accum
,
GpuMat
&
buf
,
float
rho
,
float
theta
);
CV_EXPORTS
void
HoughLinesGet
(
const
GpuMat
&
accum
,
GpuMat
&
lines
,
float
rho
,
float
theta
,
int
threshold
,
bool
doSort
=
false
,
int
maxLines
=
4096
);
CV_EXPORTS
void
HoughLinesDownload
(
const
GpuMat
&
d_lines
,
OutputArray
h_lines
,
OutputArray
h_voices
=
noArray
());
...
...
modules/gpu/perf/perf_imgproc.cpp
View file @
c3fa7974
...
...
@@ -1364,11 +1364,12 @@ GPU_PERF_TEST(HoughLines, cv::gpu::DeviceInfo, cv::Size, DoSort)
cv
::
gpu
::
GpuMat
d_src
(
src
);
cv
::
gpu
::
GpuMat
d_lines
;
cv
::
gpu
::
GpuMat
d_accum
;
cv
::
gpu
::
HoughLines
(
d_src
,
d_lines
,
d_accum
,
rho
,
theta
,
threshold
,
doSort
);
cv
::
gpu
::
GpuMat
d_buf
;
cv
::
gpu
::
HoughLines
(
d_src
,
d_lines
,
d_accum
,
d_buf
,
rho
,
theta
,
threshold
,
doSort
);
TEST_CYCLE
()
{
cv
::
gpu
::
HoughLines
(
d_src
,
d_lines
,
d_accum
,
rho
,
theta
,
threshold
,
doSort
);
cv
::
gpu
::
HoughLines
(
d_src
,
d_lines
,
d_accum
,
d_buf
,
rho
,
theta
,
threshold
,
doSort
);
}
}
...
...
modules/gpu/src/cuda/hough.cu
View file @
c3fa7974
...
...
@@ -42,55 +42,167 @@
#include <thrust/sort.h>
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/emulation.hpp"
namespace cv { namespace gpu { namespace device
{
namespace hough
{
__global__ void linesAccum(const DevMem2Db src, PtrStep_<uint> accum, const float theta, const int numangle, const int numrho, const float irho)
__device__ unsigned int g_counter;
const int PIXELS_PER_THREAD = 16;
__global__ void buildPointList(const DevMem2Db src, unsigned int* list)
{
const int x = blockIdx.x *
blockDim.x
+ threadIdx.x;
const int y = blockIdx.y *
blockDim.y
+ threadIdx.y;
const int x = blockIdx.x *
32 * PIXELS_PER_THREAD
+ threadIdx.x;
const int y = blockIdx.y *
4
+ threadIdx.y;
if (
x >= src.cols ||
y >= src.rows)
if (y >= src.rows)
return;
if (src(y, x))
volatile int qindex = -1;
__shared__ volatile int s_qindex[4];
__shared__ volatile int s_qstart[4];
s_qindex[threadIdx.y] = -1;
__shared__ volatile unsigned int s_queue[4][32 * PIXELS_PER_THREAD];
// fill the queue
for (int i = 0; i < PIXELS_PER_THREAD; ++i)
{
float ang = 0.0f;
for(int n = 0; n < numangle; ++n, ang += theta)
const int xx = i * blockDim.x + x;
if (xx >= src.cols)
break;
if (src(y, xx))
{
float sin_ang;
float cos_ang;
sincosf(ang, &sin_ang, &cos_ang);
const unsigned int queue_val = (y << 16) | xx;
const float tabSin = sin_ang * irho;
const float tabCos = cos_ang * irho;
do {
qindex++;
s_qindex[threadIdx.y] = qindex;
s_queue[threadIdx.y][qindex] = queue_val;
} while (s_queue[threadIdx.y][qindex] != queue_val);
}
// reload index from smem (last thread to write to smem will have updated it)
qindex = s_qindex[threadIdx.y];
}
int r = __float2int_rn(x * tabCos + y * tabSin);
r += (numrho - 1) / 2;
__syncthreads();
atomicInc(accum.ptr(n + 1) + r + 1, (unsigned int)-1);
// let one thread reserve the space required in the global list
if (threadIdx.x == 0 && threadIdx.y == 0)
{
// find how many items are stored in each list
int total_index = 0;
#pragma unroll
for (int i = 0; i < 4; ++i)
{
s_qstart[i] = total_index;
total_index += (s_qindex[i] + 1u);
}
//calculate the offset in the global list
const unsigned int global_offset = atomicAdd(&g_counter, total_index);
#pragma unroll
for (int i = 0; i < 4; ++i)
s_qstart[i] += global_offset;
}
__syncthreads();
// copy local queues to global queue
for(int i = 0; i <= qindex; i += 32)
{
if(i + threadIdx.x > qindex)
break;
unsigned int qvalue = s_queue[threadIdx.y][i + threadIdx.x];
list[s_qstart[threadIdx.y] + i + threadIdx.x] = qvalue;
}
}
void linesAccum_gpu(DevMem2Db src, DevMem2D_<uint> accum, float rho, float theta
)
unsigned int buildPointList_gpu(DevMem2Db src, unsigned int* list
)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
const dim3 block(32, 4);
const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(buildPointList, cudaFuncCachePreferShared) );
linesAccum<<<grid, block>>>(src, accum, theta, accum.rows - 2, accum.cols - 2, 1.0f / rho
);
buildPointList<<<grid, block>>>(src, list
);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
unsigned int total_count;
cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
return total_count;
}
__device__ unsigned int g_counter;
__global__ void linesAccum(const unsigned int* list, const unsigned int count, PtrStep_<unsigned int> accum,
const float irho, const float theta, const int numrho)
{
extern __shared__ unsigned int smem[];
for (int i = threadIdx.x; i < numrho; i += blockDim.x)
smem[i] = 0;
__syncthreads();
const int n = blockIdx.x;
const float ang = n * theta;
float sin_ang;
float cos_ang;
sincosf(ang, &sin_ang, &cos_ang);
const float tabSin = sin_ang * irho;
const float tabCos = cos_ang * irho;
for (int i = threadIdx.x; i < count; i += blockDim.x)
{
// read one element from global memory
const unsigned int qvalue = list[i];
const unsigned int x = (qvalue & 0x0000FFFF);
const unsigned int y = (qvalue >> 16) & 0x0000FFFF;
int r = __float2int_rn(x * tabCos + y * tabSin);
r += (numrho - 1) / 2;
Emulation::smem::atomicInc(&smem[r], (unsigned int)(-1));
}
__syncthreads();
for (int i = threadIdx.x; i < numrho; i += blockDim.x)
accum(n + 1, i + 1) = smem[i];
}
void linesAccum_gpu(const unsigned int* list, unsigned int count, DevMem2D_<unsigned int> accum, float rho, float theta)
{
const dim3 block(1024);
const dim3 grid(accum.rows - 2);
cudaSafeCall( cudaFuncSetCacheConfig(linesAccum, cudaFuncCachePreferShared) );
size_t smem_size = (accum.cols - 2) * sizeof(unsigned int);
linesAccum<<<grid, block, smem_size>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void linesGetResult(const DevMem2D_<uint> accum, float2* out, int* voices, const int maxSize, const float threshold, const float theta, const float rho, const int numrho)
__global__ void linesGetResult(const DevMem2D_<unsigned int> accum, float2* out, int* voices, const int maxSize,
const float threshold, const float theta, const float rho, const int numrho)
{
__shared__ uint smem[8][32];
__shared__ u
nsigned
int smem[8][32];
int r = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
int n = blockIdx.y * (blockDim.y - 2) + threadIdx.y;
...
...
@@ -125,7 +237,8 @@ namespace cv { namespace gpu { namespace device
}
}
unsigned int linesGetResult_gpu(DevMem2D_<uint> accum, float2* out, int* voices, unsigned int maxSize, float rho, float theta, float threshold, bool doSort)
unsigned int linesGetResult_gpu(DevMem2D_<unsigned int> accum, float2* out, int* voices, unsigned int maxSize,
float rho, float theta, float threshold, bool doSort)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
...
...
@@ -140,8 +253,8 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() );
uint total_count;
cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(uint), cudaMemcpyDeviceToHost) );
u
nsigned
int total_count;
cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(u
nsigned
int), cudaMemcpyDeviceToHost) );
total_count = ::min(total_count, maxSize);
...
...
modules/gpu/src/hough.cpp
View file @
c3fa7974
...
...
@@ -46,16 +46,23 @@ namespace cv { namespace gpu { namespace device
{
namespace
hough
{
void
linesAccum_gpu
(
DevMem2Db
src
,
DevMem2D_
<
uint
>
accum
,
float
rho
,
float
theta
);
unsigned
int
buildPointList_gpu
(
DevMem2Db
src
,
unsigned
int
*
list
);
void
linesAccum_gpu
(
const
unsigned
int
*
list
,
unsigned
int
count
,
DevMem2D_
<
unsigned
int
>
accum
,
float
rho
,
float
theta
);
unsigned
int
linesGetResult_gpu
(
DevMem2D_
<
uint
>
accum
,
float2
*
out
,
int
*
voices
,
unsigned
int
maxSize
,
float
rho
,
float
theta
,
float
threshold
,
bool
doSort
);
}
}}}
void
cv
::
gpu
::
HoughLinesTransform
(
const
GpuMat
&
src
,
GpuMat
&
accum
,
float
rho
,
float
theta
)
void
cv
::
gpu
::
HoughLinesTransform
(
const
GpuMat
&
src
,
GpuMat
&
accum
,
GpuMat
&
buf
,
float
rho
,
float
theta
)
{
using
namespace
cv
::
gpu
::
device
;
using
namespace
cv
::
gpu
::
device
::
hough
;
CV_Assert
(
src
.
type
()
==
CV_8UC1
);
CV_Assert
(
src
.
cols
<
std
::
numeric_limits
<
unsigned
short
>::
max
());
CV_Assert
(
src
.
rows
<
std
::
numeric_limits
<
unsigned
short
>::
max
());
ensureSizeIsEnough
(
1
,
src
.
size
().
area
(),
CV_32SC1
,
buf
);
unsigned
int
count
=
buildPointList_gpu
(
src
,
buf
.
ptr
<
unsigned
int
>
());
const
int
numangle
=
cvRound
(
CV_PI
/
theta
);
const
int
numrho
=
cvRound
(((
src
.
cols
+
src
.
rows
)
*
2
+
1
)
/
rho
);
...
...
@@ -63,7 +70,7 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, float rho, f
ensureSizeIsEnough
(
numangle
+
2
,
numrho
+
2
,
CV_32SC1
,
accum
);
accum
.
setTo
(
cv
::
Scalar
::
all
(
0
));
hough
::
linesAccum_gpu
(
src
,
accum
,
rho
,
theta
);
linesAccum_gpu
(
buf
.
ptr
<
unsigned
int
>
(),
count
,
accum
,
rho
,
theta
);
}
void
cv
::
gpu
::
HoughLinesGet
(
const
GpuMat
&
accum
,
GpuMat
&
lines
,
float
rho
,
float
theta
,
int
threshold
,
bool
doSort
,
int
maxLines
)
...
...
@@ -83,13 +90,13 @@ void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float
void
cv
::
gpu
::
HoughLines
(
const
GpuMat
&
src
,
GpuMat
&
lines
,
float
rho
,
float
theta
,
int
threshold
,
bool
doSort
,
int
maxLines
)
{
cv
::
gpu
::
GpuMat
accum
;
HoughLines
(
src
,
lines
,
accum
,
rho
,
theta
,
threshold
,
doSort
,
maxLines
);
cv
::
gpu
::
GpuMat
accum
,
buf
;
HoughLines
(
src
,
lines
,
accum
,
buf
,
rho
,
theta
,
threshold
,
doSort
,
maxLines
);
}
void
cv
::
gpu
::
HoughLines
(
const
GpuMat
&
src
,
GpuMat
&
lines
,
GpuMat
&
accum
,
float
rho
,
float
theta
,
int
threshold
,
bool
doSort
,
int
maxLines
)
void
cv
::
gpu
::
HoughLines
(
const
GpuMat
&
src
,
GpuMat
&
lines
,
GpuMat
&
accum
,
GpuMat
&
buf
,
float
rho
,
float
theta
,
int
threshold
,
bool
doSort
,
int
maxLines
)
{
HoughLinesTransform
(
src
,
accum
,
rho
,
theta
);
HoughLinesTransform
(
src
,
accum
,
buf
,
rho
,
theta
);
HoughLinesGet
(
accum
,
lines
,
rho
,
theta
,
threshold
,
doSort
,
maxLines
);
}
...
...
modules/gpu/test/test_imgproc.cpp
View file @
c3fa7974
...
...
@@ -1155,7 +1155,7 @@ TEST_P(HoughLines, Accuracy)
const
float
rho
=
1.0
f
;
const
float
theta
=
CV_PI
/
180.0
f
;
const
int
threshold
=
30
0
;
const
int
threshold
=
5
0
;
cv
::
Mat
img
=
readImage
(
fileName
,
cv
::
IMREAD_GRAYSCALE
);
ASSERT_FALSE
(
img
.
empty
());
...
...
@@ -1182,7 +1182,6 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, HoughLines, testing::Combine(
ALL_DEVICES
,
testing
::
Values
(
std
::
string
(
"../cv/shared/pic1.png"
),
std
::
string
(
"../cv/shared/pic3.png"
),
std
::
string
(
"../cv/shared/pic4.png"
),
std
::
string
(
"../cv/shared/pic5.png"
),
std
::
string
(
"../cv/shared/pic6.png"
))));
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment