submodule / opencv
Commit fdb07a0a authored Feb 11, 2013 by Vladislav Vinogradov
fix performance issue of gpu reduction
parent b6e7aeab
Showing 1 changed file with 183 additions and 132 deletions
modules/gpu/src/cuda/matrix_reductions.cu  +183 -132  View file @ fdb07a0a
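The first part of the diff adds a detail namespace of cvAtomicAdd / cvAtomicMin / cvAtomicMax overloads, falling back to an atomicCAS loop where the hardware atomic is missing, and the sum reduction then folds each block's partial result into global memory with one of these atomics instead of electing a "last block" through the blocks_finished ticket. Below is a minimal, self-contained sketch of that combine pattern; atomicAddDouble and partialSumKernel are illustrative names, not the commit's actual helpers or kernels, and *result must be zero-initialized on the device before the launch.

#include <cuda_runtime.h>

// CAS-based emulation of a double-precision atomicAdd (the same pattern the
// commit uses in detail::cvAtomicAdd for architectures without a native one).
__device__ double atomicAddDouble(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*) address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}

// Illustrative kernel: each block reduces its slice in shared memory and then
// issues one atomic add to combine the partial sum into *result, so no second
// kernel launch or "last block" election is required.
template <int BLOCK_SIZE>
__global__ void partialSumKernel(const double* src, int n, double* result)
{
    __shared__ double smem[BLOCK_SIZE];

    double mysum = 0.0;
    for (int i = blockIdx.x * BLOCK_SIZE + threadIdx.x; i < n; i += gridDim.x * BLOCK_SIZE)
        mysum += src[i];

    smem[threadIdx.x] = mysum;
    __syncthreads();

    for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1)   // BLOCK_SIZE must be a power of two
    {
        if (threadIdx.x < s)
            smem[threadIdx.x] += smem[threadIdx.x + s];
        __syncthreads();
    }

    if (threadIdx.x == 0)
        atomicAddDouble(result, smem[0]);          // one atomic per block
}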
...
...
@@ -55,6 +55,128 @@
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace detail
{
__device__ __forceinline__ int cvAtomicAdd(int* address, int val)
{
return ::atomicAdd(address, val);
}
__device__ __forceinline__ unsigned int cvAtomicAdd(unsigned int* address, unsigned int val)
{
return ::atomicAdd(address, val);
}
__device__ __forceinline__ float cvAtomicAdd(float* address, float val)
{
#if __CUDA_ARCH__ >= 200
return ::atomicAdd(address, val);
#else
int* address_as_i = (int*) address;
int old = *address_as_i, assumed;
do {
assumed = old;
old = ::atomicCAS(address_as_i, assumed,
__float_as_int(val + __int_as_float(assumed)));
} while (assumed != old);
return __int_as_float(old);
#endif
}
__device__ __forceinline__ double cvAtomicAdd(double* address, double val)
{
#if __CUDA_ARCH__ >= 130
unsigned long long int* address_as_ull = (unsigned long long int*) address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = ::atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
#else
(void) address;
(void) val;
return 0.0;
#endif
}
__device__ __forceinline__ int cvAtomicMin(int* address, int val)
{
return ::atomicMin(address, val);
}
__device__ __forceinline__ float cvAtomicMin(float* address, float val)
{
#if __CUDA_ARCH__ >= 120
int* address_as_i = (int*) address;
int old = *address_as_i, assumed;
do {
assumed = old;
old = ::atomicCAS(address_as_i, assumed,
__float_as_int(::fminf(val, __int_as_float(assumed))));
} while (assumed != old);
return __int_as_float(old);
#else
(void) address;
(void) val;
return 0.0f;
#endif
}
__device__ __forceinline__ double cvAtomicMin(double* address, double val)
{
#if __CUDA_ARCH__ >= 130
unsigned long long int* address_as_ull = (unsigned long long int*) address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = ::atomicCAS(address_as_ull, assumed,
__double_as_longlong(::fmin(val, __longlong_as_double(assumed))));
} while (assumed != old);
return __longlong_as_double(old);
#else
(void) address;
(void) val;
return 0.0;
#endif
}
__device__ __forceinline__ int cvAtomicMax(int* address, int val)
{
return ::atomicMax(address, val);
}
__device__ __forceinline__ float cvAtomicMax(float* address, float val)
{
#if __CUDA_ARCH__ >= 120
int* address_as_i = (int*) address;
int old = *address_as_i, assumed;
do {
assumed = old;
old = ::atomicCAS(address_as_i, assumed,
__float_as_int(::fmaxf(val, __int_as_float(assumed))));
} while (assumed != old);
return __int_as_float(old);
#else
(void) address;
(void) val;
return 0.0f;
#endif
}
__device__ __forceinline__ double cvAtomicMax(double* address, double val)
{
#if __CUDA_ARCH__ >= 130
unsigned long long int* address_as_ull = (unsigned long long int*) address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = ::atomicCAS(address_as_ull, assumed,
__double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
} while (assumed != old);
return __longlong_as_double(old);
#else
(void) address;
(void) val;
return 0.0;
#endif
}
}
namespace detail
{
template <int cn> struct Unroll;
...
...
@@ -152,7 +274,7 @@ namespace sum
{
static __device__ void run(R* ptr, R val)
{
::atomicAdd(ptr, val);
detail::cvAtomicAdd(ptr, val);
}
};
template <typename R> struct AtomicAdd<R, 2>
...
...
@@ -161,8 +283,8 @@ namespace sum
static __device__ void run(R* ptr, val_type val)
{
::atomicAdd(ptr, val.x);
::atomicAdd(ptr + 1, val.y);
detail::cvAtomicAdd(ptr, val.x);
detail::cvAtomicAdd(ptr + 1, val.y);
}
};
template <typename R> struct AtomicAdd<R, 3>
...
...
@@ -171,9 +293,9 @@ namespace sum
static __device__ void run(R* ptr, val_type val)
{
::atomicAdd(ptr, val.x);
::atomicAdd(ptr + 1, val.y);
::atomicAdd(ptr + 2, val.z);
detail::cvAtomicAdd(ptr, val.x);
detail::cvAtomicAdd(ptr + 1, val.y);
detail::cvAtomicAdd(ptr + 2, val.z);
}
};
template <typename R> struct AtomicAdd<R, 4>
...
...
@@ -182,10 +304,10 @@ namespace sum
static __device__ void run(R* ptr, val_type val)
{
::atomicAdd(ptr, val.x);
::atomicAdd(ptr + 1, val.y);
::atomicAdd(ptr + 2, val.z);
::atomicAdd(ptr + 3, val.w);
detail::cvAtomicAdd(ptr, val.x);
detail::cvAtomicAdd(ptr + 1, val.y);
detail::cvAtomicAdd(ptr + 2, val.z);
detail::cvAtomicAdd(ptr + 3, val.w);
}
};
...
...
@@ -229,41 +351,6 @@ namespace sum
#endif
}
};
template <int BLOCK_SIZE, int cn>
struct GlobalReduce<BLOCK_SIZE, double, cn>
{
typedef typename TypeVec<double, cn>::vec_type result_type;
static __device__ void run(result_type& sum, result_type* result, int tid, int bid, double* smem)
{
__shared__ bool is_last;
if (tid == 0)
{
result[bid] = sum;
__threadfence();
unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
sum = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<result_type>::all(0);
device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<double>()));
if (tid == 0)
{
result[0] = sum;
blocks_finished = 0;
}
}
}
};
template <int BLOCK_SIZE, typename src_type, typename result_type, class Op>
__global__ void kernel(const PtrStepSz<src_type> src, result_type* result, const Op op, const int twidth, const int theight)
...
...
@@ -518,53 +605,12 @@ namespace minMax
struct GlobalReduce
{
static __device__ void run(R& mymin, R& mymax, R* minval, R* maxval, int tid, int bid, R* sminval, R* smaxval)
{
__shared__ bool is_last;
if (tid == 0)
{
minval[bid] = mymin;
maxval[bid] = mymax;
__threadfence();
unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
int idx = ::min(tid, gridDim.x * gridDim.y - 1);
mymin = minval[idx];
mymax = maxval[idx];
const minimum<R> minOp;
const maximum<R> maxOp;
device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
if (tid == 0)
{
minval[0] = mymin;
maxval[0] = mymax;
blocks_finished = 0;
}
}
}
};
template <int BLOCK_SIZE>
struct GlobalReduce<BLOCK_SIZE, int>
{
static __device__ void run(int& mymin, int& mymax, int* minval, int* maxval, int tid, int bid, int* sminval, int* smaxval)
{
#if __CUDA_ARCH__ >= 200
if (tid == 0)
{
::atomicMin(minval, mymin);
::atomicMax(maxval, mymax);
detail::cvAtomicMin(minval, mymin);
detail::cvAtomicMax(maxval, mymax);
}
#else
__shared__ bool is_last;
...
...
@@ -589,8 +635,8 @@ namespace minMax
mymin = minval[idx];
mymax = maxval[idx];
const minimum<int> minOp;
const maximum<int> maxOp;
const minimum<R> minOp;
const maximum<R> maxOp;
device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
if (tid == 0)
...
...
@@ -672,12 +718,19 @@ namespace minMax
*minval_buf = numeric_limits<int>::max();
*maxval_buf = numeric_limits<int>::min();
}
template <typename R>
void setDefault(R*, R*)
__global__ void setDefaultKernel(float* minval_buf, float* maxval_buf)
{
*minval_buf = numeric_limits<float>::max();
*maxval_buf = -numeric_limits<float>::max();
}
__global__ void setDefaultKernel(double* minval_buf, double* maxval_buf)
{
*minval_buf = numeric_limits<double>::max();
*maxval_buf = -numeric_limits<double>::max();
}
void setDefault(int* minval_buf, int* maxval_buf)
template <typename R>
void setDefault(R* minval_buf, R* maxval_buf)
{
setDefaultKernel<<<1, 1>>>(minval_buf, maxval_buf);
}
...
...
@@ -728,21 +781,19 @@ namespace minMax
namespace minMaxLoc
{
__device__ unsigned int blocks_finished = 0;
// To avoid shared memory bank conflicts we convert each value into a value of
// an appropriate type (32 bits minimum)
template <typename T> struct MinMaxTypeTraits;
template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
template <> struct MinMaxTypeTraits<schar> { typedef int best_type; };
template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
template <> struct MinMaxTypeTraits<unsigned char> { typedef int best_type; };
template <> struct MinMaxTypeTraits<signed char> { typedef int best_type; };
template <> struct MinMaxTypeTraits<unsigned short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
template <int BLOCK_SIZE, typename T, class Mask>
__global__ void kernel(const PtrStepSz<T> src, const Mask mask, T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, const int twidth, const int theight)
__global__ void kernel_pass_1(const PtrStepSz<T> src, const Mask mask, T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, const int twidth, const int theight)
{
typedef typename MinMaxTypeTraits<T>::best_type work_type;
...
...
@@ -750,7 +801,6 @@ namespace minMaxLoc
__shared__ work_type smaxval[BLOCK_SIZE];
__shared__ unsigned int sminloc[BLOCK_SIZE];
__shared__ unsigned int smaxloc[BLOCK_SIZE];
__shared__ bool is_last;
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
...
...
@@ -799,38 +849,36 @@ namespace minMaxLoc
maxval[bid] = (T) mymax;
minloc[bid] = myminloc;
maxloc[bid] = mymaxloc;
__threadfence();
unsigned int ticket = ::atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
}
template <int BLOCK_SIZE, typename T>
__global__ void kernel_pass_2(T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, int count)
{
typedef typename MinMaxTypeTraits<T>::best_type work_type;
__syncthreads();
if (is_last)
{
unsigned int idx = ::min(tid, gridDim.x * gridDim.y - 1);
__shared__ work_type sminval[BLOCK_SIZE];
__shared__ work_type smaxval[BLOCK_SIZE];
__shared__ unsigned int sminloc[BLOCK_SIZE];
__shared__ unsigned int smaxloc[BLOCK_SIZE];
mymin = minval[idx];
mymax = maxval[idx];
myminloc = minloc[idx];
mymaxloc = maxloc[idx];
unsigned int idx = ::min(threadIdx.x, count - 1);
reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
tid,
thrust::make_tuple(less<work_type>(), greater<work_type>()));
work_type mymin = minval[idx];
work_type mymax = maxval[idx];
unsigned int myminloc = minloc[idx];
unsigned int mymaxloc = maxloc[idx];
if (tid == 0)
{
minval[0] = (T) mymin;
maxval[0] = (T) mymax;
minloc[0] = myminloc;
maxloc[0] = mymaxloc;
reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
threadIdx.x,
thrust::make_tuple(less<work_type>(), greater<work_type>()));
blocks_finished = 0;
}
if (threadIdx.x == 0)
{
minval[0] = (T) mymin;
maxval[0] = (T) mymax;
minloc[0] = myminloc;
maxloc[0] = mymaxloc;
}
}
...
...
@@ -877,10 +925,13 @@ namespace minMaxLoc
unsigned int* maxloc_buf = locbuf.ptr(1);
if (mask.data)
kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
else
kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
cudaSafeCall( cudaGetLastError() );
kernel_pass_2<threads_x * threads_y><<<1, threads_x * threads_y>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
...
...
@@ -898,9 +949,9 @@ namespace minMaxLoc
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void run<uchar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<schar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<ushort>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<unsigned char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<signed char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<unsigned short>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<int >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
...
...
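The minMaxLoc reduction is reorganized the other way: the single kernel that used blocks_finished to finish the reduction in its last active block becomes kernel_pass_1, which only writes one partial result per block, and a separate single-block kernel_pass_2 reduces those grid.x * grid.y partials. Below is a stripped-down sketch of that two-pass structure for a plain minimum, without the mask or location handling; minPass1 / minPass2 are illustrative names, not the commit's kernels.

#include <cfloat>
#include <cuda_runtime.h>

// Pass 1: every block writes its own partial minimum to minval[blockIdx.x].
template <int BLOCK_SIZE>
__global__ void minPass1(const float* src, int n, float* minval)
{
    __shared__ float smem[BLOCK_SIZE];

    float mymin = FLT_MAX;
    for (int i = blockIdx.x * BLOCK_SIZE + threadIdx.x; i < n; i += gridDim.x * BLOCK_SIZE)
        mymin = fminf(mymin, src[i]);

    smem[threadIdx.x] = mymin;
    __syncthreads();

    for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1)
    {
        if (threadIdx.x < s)
            smem[threadIdx.x] = fminf(smem[threadIdx.x], smem[threadIdx.x + s]);
        __syncthreads();
    }

    if (threadIdx.x == 0)
        minval[blockIdx.x] = smem[0];
}

// Pass 2: a single block reduces the per-block partials; extra threads simply
// re-read the last partial, mirroring the idx = ::min(threadIdx.x, count - 1)
// trick used by kernel_pass_2 above.
template <int BLOCK_SIZE>
__global__ void minPass2(float* minval, int count)
{
    __shared__ float smem[BLOCK_SIZE];

    const int idx = ::min((int) threadIdx.x, count - 1);
    smem[threadIdx.x] = minval[idx];
    __syncthreads();

    for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1)
    {
        if (threadIdx.x < s)
            smem[threadIdx.x] = fminf(smem[threadIdx.x], smem[threadIdx.x + s]);
        __syncthreads();
    }

    if (threadIdx.x == 0)
        minval[0] = smem[0];
}

// Host side, matching the launch pattern run() uses after this change:
//   minPass1<256><<<grid, 256>>>(src, n, minval_buf);
//   minPass2<256><<<1, 256>>>(minval_buf, grid.x * grid.y);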