Commit 7928cec6, authored Aug 15, 2012 by Vladislav Vinogradov
added linesAccumGlobal kernel

parent 7ae94c57
Showing 5 changed files with 97 additions and 49 deletions (+97, -49)
gpumat.hpp      modules/core/include/opencv2/core/gpumat.hpp        +3   -0
gpumat.cpp      modules/core/src/gpumat.cpp                         +1   -1
hough.cu        modules/gpu/src/cuda/hough.cu                       +76  -38
hough.cpp       modules/gpu/src/hough.cpp                           +12  -6
emulation.hpp   modules/gpu/src/opencv2/gpu/device/emulation.hpp    +5   -4
modules/core/include/opencv2/core/gpumat.hpp

@@ -112,6 +112,8 @@ namespace cv { namespace gpu
         int multiProcessorCount() const { return multi_processor_count_; }

+        size_t sharedMemPerBlock() const { return sharedMemPerBlock_; }
+
         size_t freeMemory() const;
         size_t totalMemory() const;
@@ -133,6 +135,7 @@ namespace cv { namespace gpu
         int multi_processor_count_;
         int majorVersion_;
         int minorVersion_;
+        size_t sharedMemPerBlock_;
     };

     CV_EXPORTS void printCudaDeviceInfo(int device);
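Note: the new accessor mirrors the existing DeviceInfo queries. A minimal host-side usage sketch, assuming only the accessors visible in this header (the output formatting is illustrative):

#include <iostream>
#include "opencv2/core/gpumat.hpp"

int main()
{
    cv::gpu::DeviceInfo devInfo;  // describes the current CUDA device

    std::cout << "multiprocessors:         " << devInfo.multiProcessorCount() << std::endl;
    std::cout << "shared memory per block: " << devInfo.sharedMemPerBlock() << " bytes" << std::endl;
    return 0;
}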
modules/core/src/gpumat.cpp

@@ -42,7 +42,6 @@
 #include "precomp.hpp"
 #include "opencv2/core/gpumat.hpp"
-
 #include <iostream>

 #ifdef HAVE_CUDA
@@ -301,6 +300,7 @@ void cv::gpu::DeviceInfo::query()
     multi_processor_count_ = prop.multiProcessorCount;
     majorVersion_ = prop.major;
     minorVersion_ = prop.minor;
+    sharedMemPerBlock_ = prop.sharedMemPerBlock;
 }

 void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory) const
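For reference, the value cached here comes straight from the CUDA runtime's device properties. A rough standalone sketch of the same query using only the CUDA runtime API (not OpenCV code):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    int device = 0;
    cudaGetDevice(&device);

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);

    // The field DeviceInfo::query() now copies into sharedMemPerBlock_.
    printf("sharedMemPerBlock   = %zu bytes\n", prop.sharedMemPerBlock);
    printf("multiProcessorCount = %d\n", prop.multiProcessorCount);
    return 0;
}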
modules/gpu/src/cuda/hough.cu

@@ -48,15 +48,18 @@ namespace cv { namespace gpu { namespace device
 {
     namespace hough
     {
-        __device__ unsigned int g_counter;
+        __device__ int g_counter;
+
+        ////////////////////////////////////////////////////////////////////////
+        // buildPointList

         const int PIXELS_PER_THREAD = 16;

         __global__ void buildPointList(const DevMem2Db src, unsigned int* list)
         {
-            __shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
-            __shared__ unsigned int s_qsize[4];
-            __shared__ unsigned int s_start[4];
+            __shared__ int s_queues[4][32 * PIXELS_PER_THREAD];
+            __shared__ int s_qsize[4];
+            __shared__ int s_start[4];

             const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
             const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -75,7 +78,7 @@ namespace cv { namespace gpu { namespace device
                 if (src(y, xx))
                 {
                     const unsigned int val = (y << 16) | xx;
-                    int qidx = Emulation::smem::atomicInc(&s_qsize[threadIdx.y], (unsigned int)(-1));
+                    const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
                     s_queues[threadIdx.y][qidx] = val;
                 }
             }
@@ -86,15 +89,15 @@ namespace cv { namespace gpu { namespace device
             if (threadIdx.x == 0 && threadIdx.y == 0)
             {
                 // find how many items are stored in each list
-                unsigned int total_size = 0;
+                int total_size = 0;
                 for (int i = 0; i < blockDim.y; ++i)
                 {
                     s_start[i] = total_size;
                     total_size += s_qsize[i];
                 }

-                //calculate the offset in the global list
-                const unsigned int global_offset = atomicAdd(&g_counter, total_size);
+                // calculate the offset in the global list
+                const int global_offset = atomicAdd(&g_counter, total_size);
                 for (int i = 0; i < blockDim.y; ++i)
                     s_start[i] += global_offset;
             }
@@ -102,20 +105,20 @@ namespace cv { namespace gpu { namespace device
             __syncthreads();

             // copy local queues to global queue
-            const unsigned int qsize = s_qsize[threadIdx.y];
+            const int qsize = s_qsize[threadIdx.y];
             for(int i = threadIdx.x; i < qsize; i += blockDim.x)
             {
-                unsigned int val = s_queues[threadIdx.y][i];
+                const unsigned int val = s_queues[threadIdx.y][i];
                 list[s_start[threadIdx.y] + i] = val;
             }
         }

-        unsigned int buildPointList_gpu(DevMem2Db src, unsigned int* list)
+        int buildPointList_gpu(DevMem2Db src, unsigned int* list)
         {
             void* counter_ptr;
             cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );

-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
+            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );

             const dim3 block(32, 4);
             const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));
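buildPointList compacts the non-zero pixels of the edge image into a flat list of packed coordinates: each 32x4 block first appends into small per-row shared-memory queues, then reserves a contiguous range in the global list with a single atomicAdd on g_counter. The packing itself is plain bit twiddling; a small host-side sketch (helper names are invented, and both coordinates are assumed to fit in 16 bits, which hough.cpp asserts for src.rows):

#include <cstdio>

// x lives in the low 16 bits, y in the high 16 bits, as in buildPointList.
static unsigned int packPoint(int x, int y)
{
    return (static_cast<unsigned int>(y) << 16) | (static_cast<unsigned int>(x) & 0xFFFF);
}

static void unpackPoint(unsigned int qvalue, int& x, int& y)
{
    x = qvalue & 0x0000FFFF;          // same masks the accumulator kernels use
    y = (qvalue >> 16) & 0x0000FFFF;
}

int main()
{
    int x = 0, y = 0;
    unpackPoint(packPoint(123, 456), x, y);
    printf("x = %d, y = %d\n", x, y);  // prints x = 123, y = 456
    return 0;
}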
@@ -127,19 +130,48 @@ namespace cv { namespace gpu { namespace device
             cudaSafeCall( cudaDeviceSynchronize() );

-            unsigned int total_count;
-            cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+            int total_count;
+            cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );

             return total_count;
         }

-        __global__ void linesAccum(const unsigned int* list, const unsigned int count, PtrStep_<unsigned int> accum,
-                                   const float irho, const float theta, const int numrho)
+        ////////////////////////////////////////////////////////////////////////
+        // linesAccum
+
+        __global__ void linesAccumGlobal(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
         {
-            extern __shared__ unsigned int smem[];
+            const int n = blockIdx.x;
+            const float ang = n * theta;
+
+            float sin_ang;
+            float cos_ang;
+            sincosf(ang, &sin_ang, &cos_ang);
+
+            const float tabSin = sin_ang * irho;
+            const float tabCos = cos_ang * irho;
+
+            for (int i = threadIdx.x; i < count; i += blockDim.x)
+            {
+                const unsigned int qvalue = list[i];
+
+                const int x = (qvalue & 0x0000FFFF);
+                const int y = (qvalue >> 16) & 0x0000FFFF;
+
+                int r = __float2int_rn(x * tabCos + y * tabSin);
+                r += (numrho - 1) / 2;
+
+                ::atomicAdd(accum.ptr(n + 1) + r + 1, 1);
+            }
+        }
+
+        __global__ void linesAccumShared(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
+        {
+            extern __shared__ int smem[];

-            for (int i = threadIdx.x; i < numrho; i += blockDim.x)
+            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
                 smem[i] = 0;

             __syncthreads();

             const int n = blockIdx.x;
@@ -154,41 +186,48 @@ namespace cv { namespace gpu { namespace device
             for (int i = threadIdx.x; i < count; i += blockDim.x)
             {
-                // read one element from global memory
                 const unsigned int qvalue = list[i];

-                const unsigned int x = (qvalue & 0x0000FFFF);
-                const unsigned int y = (qvalue >> 16) & 0x0000FFFF;
+                const int x = (qvalue & 0x0000FFFF);
+                const int y = (qvalue >> 16) & 0x0000FFFF;

                 int r = __float2int_rn(x * tabCos + y * tabSin);
                 r += (numrho - 1) / 2;

-                Emulation::smem::atomicInc(&smem[r], (unsigned int)(-1));
+                Emulation::smem::atomicAdd(&smem[r + 1], 1);
             }

             __syncthreads();

             for (int i = threadIdx.x; i < numrho; i += blockDim.x)
-                accum(n + 1, i + 1) = smem[i];
+                accum(n + 1, i) = smem[i];
         }

-        void linesAccum_gpu(const unsigned int* list, unsigned int count, DevMem2D_<unsigned int> accum, float rho, float theta)
+        void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock)
         {
             const dim3 block(1024);
             const dim3 grid(accum.rows - 2);

-            cudaSafeCall( cudaFuncSetCacheConfig(linesAccum, cudaFuncCachePreferShared) );
+            cudaSafeCall( cudaFuncSetCacheConfig(linesAccumShared, cudaFuncCachePreferShared) );

-            size_t smem_size = (accum.cols - 2) * sizeof(unsigned int);
+            size_t smemSize = (accum.cols - 2) * sizeof(int);

-            linesAccum<<<grid, block, smem_size>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
+            if (smemSize < sharedMemPerBlock - 1000)
+                linesAccumShared<<<grid, block, smemSize>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
+            else
+                linesAccumGlobal<<<grid, block>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);

             cudaSafeCall( cudaGetLastError() );
             cudaSafeCall( cudaDeviceSynchronize() );
         }

-        __global__ void linesGetResult(const DevMem2D_<unsigned int> accum, float2* out, int* voices, const int maxSize,
-                                       const float threshold, const float theta, const float rho, const int numrho)
+        ////////////////////////////////////////////////////////////////////////
+        // linesGetResult
+
+        __global__ void linesGetResult(const DevMem2Di accum, float2* out, int* voices, const int maxSize, const float threshold, const float theta, const float rho, const int numrho)
         {
-            __shared__ unsigned int smem[8][32];
+            __shared__ int smem[8][32];

             int r = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
             int n = blockIdx.y * (blockDim.y - 2) + threadIdx.y;
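The commit's main change is here: linesAccumShared keeps one accumulator row (one theta) in dynamic shared memory and writes it back once, while the new linesAccumGlobal votes directly into global memory with atomicAdd, so images with a very wide rho range no longer depend on the shared-memory limit. In both kernels a point votes into bin r = round(x*cos(theta)/rho + y*sin(theta)/rho) + (numrho - 1)/2. A host-side sketch of the same kernel-selection rule, where the 1000-byte safety margin and the (accum.cols - 2) term come from the diff and the concrete numbers are invented:

#include <cstddef>
#include <cstdio>

int main()
{
    const int accumCols = 4000 + 2;                    // numrho + 2 bins, example value
    const std::size_t sharedMemPerBlock = 48 * 1024;   // e.g. 48 KB on Fermi-class devices

    const std::size_t smemSize = (accumCols - 2) * sizeof(int);

    if (smemSize < sharedMemPerBlock - 1000)
        printf("launch linesAccumShared with %zu bytes of shared memory\n", smemSize);
    else
        printf("row too wide (%zu bytes), fall back to linesAccumGlobal\n", smemSize);
    return 0;
}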
@@ -211,10 +250,10 @@ namespace cv { namespace gpu { namespace device
                 smem[threadIdx.y][threadIdx.x] > smem[threadIdx.y][threadIdx.x - 1] &&
                 smem[threadIdx.y][threadIdx.x] >= smem[threadIdx.y][threadIdx.x + 1])
             {
-                float radius = (r - (numrho - 1) * 0.5f) * rho;
-                float angle = n * theta;
+                const float radius = (r - (numrho - 1) * 0.5f) * rho;
+                const float angle = n * theta;

-                const unsigned int ind = atomicInc(&g_counter, (unsigned int)(-1));
+                const int ind = ::atomicAdd(&g_counter, 1);
                 if (ind < maxSize)
                 {
                     out[ind] = make_float2(radius, angle);
@@ -223,13 +262,12 @@ namespace cv { namespace gpu { namespace device
             }
         }

-        unsigned int linesGetResult_gpu(DevMem2D_<unsigned int> accum, float2* out, int* voices, unsigned int maxSize,
-                                        float rho, float theta, float threshold, bool doSort)
+        int linesGetResult_gpu(DevMem2Di accum, float2* out, int* voices, int maxSize, float rho, float theta, float threshold, bool doSort)
         {
             void* counter_ptr;
             cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );

-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
+            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );

             const dim3 block(32, 8);
             const dim3 grid(divUp(accum.cols, block.x - 2), divUp(accum.rows, block.y - 2));
@@ -239,8 +277,8 @@ namespace cv { namespace gpu { namespace device
             cudaSafeCall( cudaDeviceSynchronize() );

-            unsigned int total_count;
-            cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+            int total_count;
+            cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );

             total_count = ::min(total_count, maxSize);
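linesGetResult keeps the same output pattern as before, only with int counters: every thread that finds a local maximum above the threshold claims an output slot with an atomic add on g_counter and writes only if the slot is below maxSize. A stripped-down kernel fragment illustrating that pattern (names invented, not OpenCV code):

__device__ int d_count;

__global__ void collectPeaks(const int* votes, int n, int threshold, int* out, int maxSize)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n || votes[i] <= threshold)
        return;

    const int ind = ::atomicAdd(&d_count, 1);  // returned old value = this thread's slot
    if (ind < maxSize)
        out[ind] = i;                          // peaks beyond maxSize are simply dropped
}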
modules/gpu/src/hough.cpp

@@ -56,9 +56,9 @@ namespace cv { namespace gpu { namespace device
 {
     namespace hough
     {
-        unsigned int buildPointList_gpu(DevMem2Db src, unsigned int* list);
-        void linesAccum_gpu(const unsigned int* list, unsigned int count, DevMem2D_<unsigned int> accum, float rho, float theta);
-        unsigned int linesGetResult_gpu(DevMem2D_<uint> accum, float2* out, int* voices, unsigned int maxSize, float rho, float theta, float threshold, bool doSort);
+        int buildPointList_gpu(DevMem2Db src, unsigned int* list);
+        void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock);
+        int linesGetResult_gpu(DevMem2Di accum, float2* out, int* voices, int maxSize, float rho, float theta, float threshold, bool doSort);
     }
 }}}
@@ -71,16 +71,21 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf,
     CV_Assert(src.rows < std::numeric_limits<unsigned short>::max());

     ensureSizeIsEnough(1, src.size().area(), CV_32SC1, buf);
-    unsigned int count = buildPointList_gpu(src, buf.ptr<unsigned int>());
+    const int count = buildPointList_gpu(src, buf.ptr<unsigned int>());

     const int numangle = cvRound(CV_PI / theta);
     const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho);
+    CV_Assert(numangle > 0 && numrho > 0);

     ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum);
     accum.setTo(cv::Scalar::all(0));

+    cv::gpu::DeviceInfo devInfo;
+
     if (count > 0)
-        linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta);
+        linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta, devInfo.sharedMemPerBlock());
 }

 void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
@@ -90,7 +95,8 @@ void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float
     CV_Assert(accum.type() == CV_32SC1);

     ensureSizeIsEnough(2, maxLines, CV_32FC2, lines);

-    unsigned int count = hough::linesGetResult_gpu(accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);
+    int count = hough::linesGetResult_gpu(accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);

     if (count > 0)
         lines.cols = count;
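Putting the two host entry points together: the signature of HoughLinesGet is taken from the hunk above; the tail of HoughLinesTransform's signature is cut off in the hunk header and is assumed here to end with (..., float rho, float theta), matching how rho and theta are used in its body. Everything else in this sketch (file name, thresholds) is illustrative:

#include "opencv2/gpu/gpu.hpp"
#include "opencv2/highgui/highgui.hpp"

int main()
{
    cv::Mat edges = cv::imread("edges.png", 0);        // binary edge map, CV_8UC1

    cv::gpu::GpuMat d_src(edges);
    cv::gpu::GpuMat d_accum, d_buf, d_lines;

    const float rho = 1.0f;
    const float theta = static_cast<float>(CV_PI / 180.0);

    // Build the (numangle + 2) x (numrho + 2) vote accumulator...
    cv::gpu::HoughLinesTransform(d_src, d_accum, d_buf, rho, theta);

    // ...then extract up to 4096 peaks with more than 100 votes.
    cv::gpu::HoughLinesGet(d_accum, d_lines, rho, theta, 100, false, 4096);

    // d_lines: row 0 holds float2 (rho, theta) pairs, row 1 the vote counts.
    return 0;
}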
modules/gpu/src/opencv2/gpu/device/emulation.hpp

@@ -99,7 +99,7 @@ namespace cv { namespace gpu { namespace device
             }

             template <typename T>
-            static __device__ __forceinline__ void atomicAdd(T* address, T val)
+            static __device__ __forceinline__ T atomicAdd(T* address, T val)
             {
             #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
                 T count;
@@ -110,8 +110,10 @@ namespace cv { namespace gpu { namespace device
                     count = tag | (count + val);
                     *address = count;
                 }
                 while (*address != count);
+
+                return (count & TAG_MASK) - val;
             #else
-                ::atomicAdd(address, val);
+                return ::atomicAdd(address, val);
             #endif
             }
@@ -134,4 +136,4 @@ namespace cv { namespace gpu { namespace device
         };
     }}} // namespace cv { namespace gpu { namespace device

-#endif /* OPENCV_GPU_EMULATION_HPP_ */
\ No newline at end of file
+#endif /* OPENCV_GPU_EMULATION_HPP_ */
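The emulated shared-memory atomicAdd now returns the pre-increment value, matching CUDA's built-in ::atomicAdd, which is what lets buildPointList and linesAccumShared use it to reserve queue slots. The related swaps of atomicInc(p, (unsigned int)(-1)) for atomicAdd(p, 1) elsewhere in the commit are behaviour-preserving: both return the old value and, absent wrap-around, bump the counter by one, and the atomicAdd form also works on signed int counters. A tiny illustrative kernel (not OpenCV code):

__global__ void claimSlots(unsigned int* counterInc, int* counterAdd,
                           unsigned int* slotsInc, int* slotsAdd)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    slotsInc[i] = ::atomicInc(counterInc, (unsigned int)(-1));  // old value, unsigned only
    slotsAdd[i] = ::atomicAdd(counterAdd, 1);                   // old value, int or unsigned
}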