opencv · Commits

Commit b705e0d8
authored Aug 27, 2013 by Vladislav Vinogradov
used new device layer for cv::gpu::sum

parent 9fe92e21
Showing 6 changed files with 143 additions and 434 deletions
modules/cudaarithm/src/cuda/sum.cu                            +116 -294
modules/cudaarithm/src/reductions.cpp                           +0 -131
modules/cudev/CMakeLists.txt                                    +1 -1
modules/cudev/include/opencv2/cudev/grid/detail/reduce.hpp      +3 -6
modules/cudev/include/opencv2/cudev/grid/reduce.hpp             +8 -0
modules/cudev/include/opencv2/cudev/util/vec_math.hpp          +15 -2
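For orientation before the per-file diffs: the public API touched by this commit keeps its signature; only the implementation underneath moves to the cudev device layer. A minimal host-side usage sketch, assuming an OpenCV 3.x development build with the cudaarithm module (sizes and fill values are arbitrary illustration data):

#include "opencv2/core.hpp"
#include "opencv2/cudaarithm.hpp"

int main()
{
    cv::Mat h_img(480, 640, CV_8UC1, cv::Scalar::all(1));

    cv::cuda::GpuMat d_img;
    d_img.upload(h_img);                 // move the image to the device

    cv::cuda::GpuMat mask;               // empty mask: reduce over all pixels
    cv::cuda::GpuMat buf;                // scratch buffer, reused across calls

    cv::Scalar s   = cv::cuda::sum(d_img, mask, buf);     // 480 * 640
    cv::Scalar sa  = cv::cuda::absSum(d_img, mask, buf);  // same for CV_8U input
    cv::Scalar ssq = cv::cuda::sqrSum(d_img, mask, buf);  // sum of squares

    return (s[0] == 480.0 * 640.0 && sa[0] == s[0] && ssq[0] == s[0]) ? 0 : 1;
}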
modules/cudaarithm/src/cuda/sum.cu
@@ -40,342 +40,164 @@
 //
 //M*/
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-#include "opencv2/core/cuda/emulation.hpp"
-#include "opencv2/core/cuda/utility.hpp"
-
-#include "unroll_detail.hpp"
-
-using namespace cv::cuda;
-using namespace cv::cuda::device;
-
-namespace sum
-{
-    __device__ unsigned int blocks_finished = 0;
-
-    template <typename R, int cn> struct AtomicAdd;
-    template <typename R> struct AtomicAdd<R, 1>
-    {
-        static __device__ void run(R* ptr, R val)
-        {
-            Emulation::glob::atomicAdd(ptr, val);
-        }
-    };
-    template <typename R> struct AtomicAdd<R, 2>
-    {
-        typedef typename TypeVec<R, 2>::vec_type val_type;
-
-        static __device__ void run(R* ptr, val_type val)
-        {
-            Emulation::glob::atomicAdd(ptr, val.x);
-            Emulation::glob::atomicAdd(ptr + 1, val.y);
-        }
-    };
-    template <typename R> struct AtomicAdd<R, 3>
-    {
-        typedef typename TypeVec<R, 3>::vec_type val_type;
-
-        static __device__ void run(R* ptr, val_type val)
-        {
-            Emulation::glob::atomicAdd(ptr, val.x);
-            Emulation::glob::atomicAdd(ptr + 1, val.y);
-            Emulation::glob::atomicAdd(ptr + 2, val.z);
-        }
-    };
-    template <typename R> struct AtomicAdd<R, 4>
-    {
-        typedef typename TypeVec<R, 4>::vec_type val_type;
-
-        static __device__ void run(R* ptr, val_type val)
-        {
-            Emulation::glob::atomicAdd(ptr, val.x);
-            Emulation::glob::atomicAdd(ptr + 1, val.y);
-            Emulation::glob::atomicAdd(ptr + 2, val.z);
-            Emulation::glob::atomicAdd(ptr + 3, val.w);
-        }
-    };
-
-    template <int BLOCK_SIZE, typename R, int cn>
-    struct GlobalReduce
-    {
-        typedef typename TypeVec<R, cn>::vec_type result_type;
-
-        static __device__ void run(result_type& sum, result_type* result, int tid, int bid, R* smem)
-        {
-        #if __CUDA_ARCH__ >= 200
-            if (tid == 0)
-                AtomicAdd<R, cn>::run((R*) result, sum);
-        #else
-            __shared__ bool is_last;
-
-            if (tid == 0)
-            {
-                result[bid] = sum;
-
-                __threadfence();
-
-                unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
-                is_last = (ticket == gridDim.x * gridDim.y - 1);
-            }
-
-            __syncthreads();
-
-            if (is_last)
-            {
-                sum = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<result_type>::all(0);
-
-                device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
-
-                if (tid == 0)
-                {
-                    result[0] = sum;
-                    blocks_finished = 0;
-                }
-            }
-        #endif
-        }
-    };
-
-    template <int BLOCK_SIZE, typename src_type, typename result_type, class Mask, class Op>
-    __global__ void kernel(const PtrStepSz<src_type> src, result_type* result, const Mask mask, const Op op, const int twidth, const int theight)
-    {
-        typedef typename VecTraits<src_type>::elem_type T;
-        typedef typename VecTraits<result_type>::elem_type R;
-        const int cn = VecTraits<src_type>::cn;
-
-        __shared__ R smem[BLOCK_SIZE * cn];
-
-        const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
-        const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
-
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-        const int bid = blockIdx.y * gridDim.x + blockIdx.x;
-
-        result_type sum = VecTraits<result_type>::all(0);
-
-        for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
-        {
-            const src_type* ptr = src.ptr(y);
-
-            for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
-            {
-                if (mask(y, x))
-                {
-                    const src_type srcVal = ptr[x];
-                    sum = sum + op(saturate_cast<result_type>(srcVal));
-                }
-            }
-        }
-
-        device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
-
-        GlobalReduce<BLOCK_SIZE, R, cn>::run(sum, result, tid, bid, smem);
-    }
-
-    const int threads_x = 32;
-    const int threads_y = 8;
-
-    void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
-    {
-        block = dim3(threads_x, threads_y);
-
-        grid = dim3(divUp(cols, block.x * block.y),
-                    divUp(rows, block.y * block.x));
-
-        grid.x = ::min(grid.x, block.x);
-        grid.y = ::min(grid.y, block.y);
-    }
-
-    void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows)
-    {
-        dim3 block, grid;
-        getLaunchCfg(cols, rows, block, grid);
-
-        bufcols = grid.x * grid.y * sizeof(double) * cn;
-        bufrows = 1;
-    }
-
-    template <typename T, typename R, int cn, template <typename> class Op>
-    void caller(PtrStepSzb src_, void* buf_, double* out, PtrStepSzb mask)
-    {
-        typedef typename TypeVec<T, cn>::vec_type src_type;
-        typedef typename TypeVec<R, cn>::vec_type result_type;
-
-        PtrStepSz<src_type> src(src_);
-        result_type* buf = (result_type*) buf_;
-
-        dim3 block, grid;
-        getLaunchCfg(src.cols, src.rows, block, grid);
-
-        const int twidth = divUp(divUp(src.cols, grid.x), block.x);
-        const int theight = divUp(divUp(src.rows, grid.y), block.y);
-
-        Op<result_type> op;
-
-        if (mask.data)
-            kernel<threads_x * threads_y><<<grid, block>>>(src, buf, SingleMask(mask), op, twidth, theight);
-        else
-            kernel<threads_x * threads_y><<<grid, block>>>(src, buf, WithOutMask(), op, twidth, theight);
-        cudaSafeCall( cudaGetLastError() );
-
-        cudaSafeCall( cudaDeviceSynchronize() );
-
-        R result[4] = {0, 0, 0, 0};
-        cudaSafeCall( cudaMemcpy(&result, buf, sizeof(result_type), cudaMemcpyDeviceToHost) );
-
-        out[0] = result[0];
-        out[1] = result[1];
-        out[2] = result[2];
-        out[3] = result[3];
-    }
-
-    template <typename T> struct SumType;
-    template <> struct SumType<uchar> { typedef unsigned int R; };
-    template <> struct SumType<schar> { typedef int R; };
-    template <> struct SumType<ushort> { typedef unsigned int R; };
-    template <> struct SumType<short> { typedef int R; };
-    template <> struct SumType<int> { typedef int R; };
-    template <> struct SumType<float> { typedef float R; };
-    template <> struct SumType<double> { typedef double R; };
-
-    template <typename T, int cn>
-    void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
-    {
-        typedef typename SumType<T>::R R;
-        caller<T, R, cn, identity>(src, buf, out, mask);
-    }
-
-    template void run<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void run<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template <typename T, int cn>
-    void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
-    {
-        typedef typename SumType<T>::R R;
-        caller<T, R, cn, abs_func>(src, buf, out, mask);
-    }
-
-    template void runAbs<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runAbs<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-
-    template <typename T> struct Sqr : unary_function<T, T>
-    {
-        __device__ __forceinline__ T operator ()(T x) const
-        {
-            return x * x;
-        }
-    };
-
-    template <typename T, int cn>
-    void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
-    {
-        caller<T, double, cn, Sqr>(src, buf, out, mask);
-    }
-
-    template void runSqr<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-    template void runSqr<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
-}
-
-#endif // CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"
+
+#ifndef HAVE_OPENCV_CUDEV
+
+#error "opencv_cudev is required"
+
+#else
+
+#include "opencv2/cudaarithm.hpp"
+#include "opencv2/cudev.hpp"
+
+using namespace cv::cudev;
+
+namespace
+{
+    template <typename T, typename R, int cn>
+    cv::Scalar sumImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
+    {
+        typedef typename MakeVec<T, cn>::type src_type;
+        typedef typename MakeVec<R, cn>::type res_type;
+
+        GpuMat_<src_type> src(_src);
+        GpuMat_<res_type> buf(_buf);
+
+        if (mask.empty())
+            gridCalcSum(src, buf);
+        else
+            gridCalcSum(src, buf, globPtr<uchar>(mask));
+
+        cv::Scalar_<R> res;
+        cv::Mat res_mat(buf.size(), buf.type(), res.val);
+        buf.download(res_mat);
+
+        return res;
+    }
+
+    template <typename T, typename R, int cn>
+    cv::Scalar sumAbsImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
+    {
+        typedef typename MakeVec<T, cn>::type src_type;
+        typedef typename MakeVec<R, cn>::type res_type;
+
+        GpuMat_<src_type> src(_src);
+        GpuMat_<res_type> buf(_buf);
+
+        if (mask.empty())
+            gridCalcSum(abs_(cvt_<res_type>(src)), buf);
+        else
+            gridCalcSum(abs_(cvt_<res_type>(src)), buf, globPtr<uchar>(mask));
+
+        cv::Scalar_<R> res;
+        cv::Mat res_mat(buf.size(), buf.type(), res.val);
+        buf.download(res_mat);
+
+        return res;
+    }
+
+    template <typename T, typename R, int cn>
+    cv::Scalar sumSqrImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
+    {
+        typedef typename MakeVec<T, cn>::type src_type;
+        typedef typename MakeVec<R, cn>::type res_type;
+
+        GpuMat_<src_type> src(_src);
+        GpuMat_<res_type> buf(_buf);
+
+        if (mask.empty())
+            gridCalcSum(sqr_(cvt_<res_type>(src)), buf);
+        else
+            gridCalcSum(sqr_(cvt_<res_type>(src)), buf, globPtr<uchar>(mask));
+
+        cv::Scalar_<R> res;
+        cv::Mat res_mat(buf.size(), buf.type(), res.val);
+        buf.download(res_mat);
+
+        return res;
+    }
+}
+
+cv::Scalar cv::cuda::sum(InputArray _src, InputArray _mask, GpuMat& buf)
+{
+    typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
+    static const func_t funcs[7][4] =
+    {
+        {sumImpl<uchar , uint  , 1>, sumImpl<uchar , uint  , 2>, sumImpl<uchar , uint  , 3>, sumImpl<uchar , uint  , 4>},
+        {sumImpl<schar , int   , 1>, sumImpl<schar , int   , 2>, sumImpl<schar , int   , 3>, sumImpl<schar , int   , 4>},
+        {sumImpl<ushort, uint  , 1>, sumImpl<ushort, uint  , 2>, sumImpl<ushort, uint  , 3>, sumImpl<ushort, uint  , 4>},
+        {sumImpl<short , int   , 1>, sumImpl<short , int   , 2>, sumImpl<short , int   , 3>, sumImpl<short , int   , 4>},
+        {sumImpl<int   , int   , 1>, sumImpl<int   , int   , 2>, sumImpl<int   , int   , 3>, sumImpl<int   , int   , 4>},
+        {sumImpl<float , float , 1>, sumImpl<float , float , 2>, sumImpl<float , float , 3>, sumImpl<float , float , 4>},
+        {sumImpl<double, double, 1>, sumImpl<double, double, 2>, sumImpl<double, double, 3>, sumImpl<double, double, 4>}
+    };
+
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
+
+    CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
+
+    const int res_depth = std::max(src.depth(), CV_32F);
+    cv::cuda::ensureSizeIsEnough(1, 1, CV_MAKE_TYPE(res_depth, src.channels()), buf);
+
+    const func_t func = funcs[src.depth()][src.channels() - 1];
+
+    return func(src, mask, buf);
+}
+
+cv::Scalar cv::cuda::absSum(InputArray _src, InputArray _mask, GpuMat& buf)
+{
+    typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
+    static const func_t funcs[7][4] =
+    {
+        {sumAbsImpl<uchar , uint  , 1>, sumAbsImpl<uchar , uint  , 2>, sumAbsImpl<uchar , uint  , 3>, sumAbsImpl<uchar , uint  , 4>},
+        {sumAbsImpl<schar , int   , 1>, sumAbsImpl<schar , int   , 2>, sumAbsImpl<schar , int   , 3>, sumAbsImpl<schar , int   , 4>},
+        {sumAbsImpl<ushort, uint  , 1>, sumAbsImpl<ushort, uint  , 2>, sumAbsImpl<ushort, uint  , 3>, sumAbsImpl<ushort, uint  , 4>},
+        {sumAbsImpl<short , int   , 1>, sumAbsImpl<short , int   , 2>, sumAbsImpl<short , int   , 3>, sumAbsImpl<short , int   , 4>},
+        {sumAbsImpl<int   , int   , 1>, sumAbsImpl<int   , int   , 2>, sumAbsImpl<int   , int   , 3>, sumAbsImpl<int   , int   , 4>},
+        {sumAbsImpl<float , float , 1>, sumAbsImpl<float , float , 2>, sumAbsImpl<float , float , 3>, sumAbsImpl<float , float , 4>},
+        {sumAbsImpl<double, double, 1>, sumAbsImpl<double, double, 2>, sumAbsImpl<double, double, 3>, sumAbsImpl<double, double, 4>}
+    };
+
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
+
+    CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
+
+    const int res_depth = std::max(src.depth(), CV_32F);
+    cv::cuda::ensureSizeIsEnough(1, 1, CV_MAKE_TYPE(res_depth, src.channels()), buf);
+
+    const func_t func = funcs[src.depth()][src.channels() - 1];
+
+    return func(src, mask, buf);
+}
+
+cv::Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf)
+{
+    typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
+    static const func_t funcs[7][4] =
+    {
+        {sumSqrImpl<uchar , double, 1>, sumSqrImpl<uchar , double, 2>, sumSqrImpl<uchar , double, 3>, sumSqrImpl<uchar , double, 4>},
+        {sumSqrImpl<schar , double, 1>, sumSqrImpl<schar , double, 2>, sumSqrImpl<schar , double, 3>, sumSqrImpl<schar , double, 4>},
+        {sumSqrImpl<ushort, double, 1>, sumSqrImpl<ushort, double, 2>, sumSqrImpl<ushort, double, 3>, sumSqrImpl<ushort, double, 4>},
+        {sumSqrImpl<short , double, 1>, sumSqrImpl<short , double, 2>, sumSqrImpl<short , double, 3>, sumSqrImpl<short , double, 4>},
+        {sumSqrImpl<int   , double, 1>, sumSqrImpl<int   , double, 2>, sumSqrImpl<int   , double, 3>, sumSqrImpl<int   , double, 4>},
+        {sumSqrImpl<float , double, 1>, sumSqrImpl<float , double, 2>, sumSqrImpl<float , double, 3>, sumSqrImpl<float , double, 4>},
+        {sumSqrImpl<double, double, 1>, sumSqrImpl<double, double, 2>, sumSqrImpl<double, double, 3>, sumSqrImpl<double, double, 4>}
+    };
+
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
+
+    CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
+
+    const int res_depth = CV_64F;
+    cv::cuda::ensureSizeIsEnough(1, 1, CV_MAKE_TYPE(res_depth, src.channels()), buf);
+
+    const func_t func = funcs[src.depth()][src.channels() - 1];
+
+    return func(src, mask, buf);
+}
+
+#endif
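The gain from the rewrite above is that the hand-written two-stage reduction (per-block kernel, then AtomicAdd / blocks_finished combination across blocks) collapses into a single gridCalcSum call. A condensed sketch of that device-layer pattern, specialized by hand to an 8-bit single-channel image with an int accumulator (illustrative only; the real dispatch above picks the accumulator type per depth, and error handling is omitted):

#include "opencv2/cudev.hpp"

cv::Scalar sumU8C1(const cv::cuda::GpuMat& img)
{
    using namespace cv::cudev;

    GpuMat_<uchar> src(img);   // typed view over the input, no copy

    GpuMat_<int> buf;          // gridCalcSum creates this as a 1x1 accumulator
    gridCalcSum(src, buf);     // grid-wide reduction in one call

    int result = 0;
    cv::Mat res_mat(1, 1, CV_32SC1, &result);
    buf.download(res_mat);     // fetch the single accumulated value

    return cv::Scalar(result);
}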
modules/cudaarithm/src/reductions.cpp
@@ -186,137 +186,6 @@ double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normT
     return retVal;
 }
-////////////////////////////////////////////////////////////////////////
-// Sum
-
-namespace sum
-{
-    void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows);
-
-    template <typename T, int cn>
-    void run(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
-
-    template <typename T, int cn>
-    void runAbs(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
-
-    template <typename T, int cn>
-    void runSqr(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
-}
-
-Scalar cv::cuda::sum(InputArray _src, InputArray _mask, GpuMat& buf)
-{
-    GpuMat src = _src.getGpuMat();
-    GpuMat mask = _mask.getGpuMat();
-
-    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
-    static const func_t funcs[7][5] =
-    {
-        {0, ::sum::run<uchar , 1>, ::sum::run<uchar , 2>, ::sum::run<uchar , 3>, ::sum::run<uchar , 4>},
-        {0, ::sum::run<schar , 1>, ::sum::run<schar , 2>, ::sum::run<schar , 3>, ::sum::run<schar , 4>},
-        {0, ::sum::run<ushort, 1>, ::sum::run<ushort, 2>, ::sum::run<ushort, 3>, ::sum::run<ushort, 4>},
-        {0, ::sum::run<short , 1>, ::sum::run<short , 2>, ::sum::run<short , 3>, ::sum::run<short , 4>},
-        {0, ::sum::run<int   , 1>, ::sum::run<int   , 2>, ::sum::run<int   , 3>, ::sum::run<int   , 4>},
-        {0, ::sum::run<float , 1>, ::sum::run<float , 2>, ::sum::run<float , 3>, ::sum::run<float , 4>},
-        {0, ::sum::run<double, 1>, ::sum::run<double, 2>, ::sum::run<double, 3>, ::sum::run<double, 4>}
-    };
-
-    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
-
-    if (src.depth() == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    Size buf_size;
-    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
-    ensureSizeIsEnough(buf_size, CV_8U, buf);
-    buf.setTo(Scalar::all(0));
-
-    const func_t func = funcs[src.depth()][src.channels()];
-
-    double result[4];
-    func(src, buf.data, result, mask);
-
-    return Scalar(result[0], result[1], result[2], result[3]);
-}
-
-Scalar cv::cuda::absSum(InputArray _src, InputArray _mask, GpuMat& buf)
-{
-    GpuMat src = _src.getGpuMat();
-    GpuMat mask = _mask.getGpuMat();
-
-    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
-    static const func_t funcs[7][5] =
-    {
-        {0, ::sum::runAbs<uchar , 1>, ::sum::runAbs<uchar , 2>, ::sum::runAbs<uchar , 3>, ::sum::runAbs<uchar , 4>},
-        {0, ::sum::runAbs<schar , 1>, ::sum::runAbs<schar , 2>, ::sum::runAbs<schar , 3>, ::sum::runAbs<schar , 4>},
-        {0, ::sum::runAbs<ushort, 1>, ::sum::runAbs<ushort, 2>, ::sum::runAbs<ushort, 3>, ::sum::runAbs<ushort, 4>},
-        {0, ::sum::runAbs<short , 1>, ::sum::runAbs<short , 2>, ::sum::runAbs<short , 3>, ::sum::runAbs<short , 4>},
-        {0, ::sum::runAbs<int   , 1>, ::sum::runAbs<int   , 2>, ::sum::runAbs<int   , 3>, ::sum::runAbs<int   , 4>},
-        {0, ::sum::runAbs<float , 1>, ::sum::runAbs<float , 2>, ::sum::runAbs<float , 3>, ::sum::runAbs<float , 4>},
-        {0, ::sum::runAbs<double, 1>, ::sum::runAbs<double, 2>, ::sum::runAbs<double, 3>, ::sum::runAbs<double, 4>}
-    };
-
-    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
-
-    if (src.depth() == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    Size buf_size;
-    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
-    ensureSizeIsEnough(buf_size, CV_8U, buf);
-    buf.setTo(Scalar::all(0));
-
-    const func_t func = funcs[src.depth()][src.channels()];
-
-    double result[4];
-    func(src, buf.data, result, mask);
-
-    return Scalar(result[0], result[1], result[2], result[3]);
-}
-
-Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf)
-{
-    GpuMat src = _src.getGpuMat();
-    GpuMat mask = _mask.getGpuMat();
-
-    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
-    static const func_t funcs[7][5] =
-    {
-        {0, ::sum::runSqr<uchar , 1>, ::sum::runSqr<uchar , 2>, ::sum::runSqr<uchar , 3>, ::sum::runSqr<uchar , 4>},
-        {0, ::sum::runSqr<schar , 1>, ::sum::runSqr<schar , 2>, ::sum::runSqr<schar , 3>, ::sum::runSqr<schar , 4>},
-        {0, ::sum::runSqr<ushort, 1>, ::sum::runSqr<ushort, 2>, ::sum::runSqr<ushort, 3>, ::sum::runSqr<ushort, 4>},
-        {0, ::sum::runSqr<short , 1>, ::sum::runSqr<short , 2>, ::sum::runSqr<short , 3>, ::sum::runSqr<short , 4>},
-        {0, ::sum::runSqr<int   , 1>, ::sum::runSqr<int   , 2>, ::sum::runSqr<int   , 3>, ::sum::runSqr<int   , 4>},
-        {0, ::sum::runSqr<float , 1>, ::sum::runSqr<float , 2>, ::sum::runSqr<float , 3>, ::sum::runSqr<float , 4>},
-        {0, ::sum::runSqr<double, 1>, ::sum::runSqr<double, 2>, ::sum::runSqr<double, 3>, ::sum::runSqr<double, 4>}
-    };
-
-    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
-
-    if (src.depth() == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    Size buf_size;
-    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
-    ensureSizeIsEnough(buf_size, CV_8U, buf);
-    buf.setTo(Scalar::all(0));
-
-    const func_t func = funcs[src.depth()][src.channels()];
-
-    double result[4];
-    func(src, buf.data, result, mask);
-
-    return Scalar(result[0], result[1], result[2], result[3]);
-}
////////////////////////////////////////////////////////////////////////
// minMax
modules/cudev/CMakeLists.txt
@@ -4,7 +4,7 @@ endif()
 set(the_description "CUDA device layer")

-ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable)
+ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable -Wenum-compare)

 ocv_add_module(cudev)
modules/cudev/include/opencv2/cudev/grid/detail/reduce.hpp
@@ -418,9 +418,7 @@ namespace grid_reduce_detail
         const dim3 block(Policy::block_size_x, Policy::block_size_y);
         const dim3 grid(divUp(cols, block.x * Policy::patch_size_x), divUp(rows, block.y * Policy::patch_size_y));

-        const int BLOCK_SIZE = Policy::block_size_x * Policy::block_size_y;
-
-        glob_reduce<Reductor, BLOCK_SIZE, Policy::patch_size_x, Policy::patch_size_y><<<grid, block, 0, stream>>>(src, result, mask, rows, cols);
+        glob_reduce<Reductor, Policy::block_size_x * Policy::block_size_y, Policy::patch_size_x, Policy::patch_size_y><<<grid, block, 0, stream>>>(src, result, mask, rows, cols);

         CV_CUDEV_SAFE_CALL( cudaGetLastError() );

         if (stream == 0)
@@ -433,10 +431,9 @@ namespace grid_reduce_detail
     __host__ void sum(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
     {
         typedef typename PtrTraits<SrcPtr>::value_type src_type;
-        const int cn = VecTraits<src_type>::cn;
-        typedef typename MakeVec<ResType, cn>::type work_type;
+        typedef typename VecTraits<ResType>::elem_type res_elem_type;

-        glob_reduce<SumReductor<src_type, work_type>, Policy>(src, result, mask, rows, cols, stream);
+        glob_reduce<SumReductor<src_type, ResType>, Policy>(src, (res_elem_type*) result, mask, rows, cols, stream);
     }

     template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
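The sum change above is subtle: instead of rebuilding a work_type from the element type and channel count, the reductor is now instantiated on ResType directly (which may itself be a vector type such as float3), and the output pointer is reinterpreted as a pointer to the element type. A hypothetical stand-alone illustration of the layout fact that cast relies on (the struct name is my own, not from the diff):

#include <cassert>

struct MyFloat3 { float x, y, z; };   // stand-in for CUDA's float3

int main()
{
    MyFloat3 acc = {1.0f, 2.0f, 3.0f};

    // The same storage viewed as a flat array of scalar elements.
    float* elems = reinterpret_cast<float*>(&acc);

    assert(elems[0] == acc.x && elems[1] == acc.y && elems[2] == acc.z);
    return 0;
}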
modules/cudev/include/opencv2/cudev/grid/reduce.hpp
@@ -59,6 +59,10 @@ namespace cv { namespace cudev {
 template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
 __host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
+    typedef typename PtrTraits<SrcPtr>::value_type src_type;
+
+    CV_StaticAssert( VecTraits<src_type>::cn == VecTraits<ResType>::cn, "" );
+
     dst.create(1, 1);
     dst.setTo(0, stream);
@@ -77,6 +81,10 @@ __host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskP
 template <class Policy, class SrcPtr, typename ResType>
 __host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, Stream& stream = Stream::Null())
 {
+    typedef typename PtrTraits<SrcPtr>::value_type src_type;
+
+    CV_StaticAssert( VecTraits<src_type>::cn == VecTraits<ResType>::cn, "" );
+
     dst.create(1, 1);
     dst.setTo(0, stream);
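The added CV_StaticAssert turns a channel-count mismatch between source and destination into a compile-time error instead of a silently wrong reduction. A hedged sketch of what it now permits and rejects (function names are mine; assumes the patched cudev headers):

#include "opencv2/cudev.hpp"

using namespace cv::cudev;

void sumRgb(const GpuMat_<uchar3>& src, GpuMat_<int3>& dst)
{
    gridCalcSum(src, dst);      // 3 channels in, 3 channels out: compiles
}

// void broken(const GpuMat_<uchar3>& src, GpuMat_<int>& dst)
// {
//     gridCalcSum(src, dst);   // 3 channels vs 1: now rejected at compile time
// }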
modules/cudev/include/opencv2/cudev/util/vec_math.hpp
@@ -194,10 +194,23 @@ CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, uint, uint)
return VecTraits<output_type ## 4>::make(func (a.x), func (a.y), func (a.z), func (a.w)); \
}
+namespace vec_math_detail
+{
+    __device__ __forceinline__ schar abs_(schar val)
+    {
+        return (schar) ::abs((int) val);
+    }
+
+    __device__ __forceinline__ short abs_(short val)
+    {
+        return (short) ::abs((int) val);
+    }
+}
+
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, uchar, uchar)
-CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, char, char)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, vec_math_detail::abs_, char, char)
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, ushort, ushort)
-CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, short, short)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, vec_math_detail::abs_, short, short)
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, int, int)
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, uint, uint)
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::fabsf, float, float)
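The vec_math_detail::abs_ wrappers above exist because there is no ::abs overload for schar or short: the call promotes to int, so the result has to be cast back to the narrow element type before the macro-generated vector function can return it. A small device-side sketch of the effect, as illustration only:

#include "opencv2/cudev.hpp"

using namespace cv::cudev;

__device__ short2 absShort2(short2 v)
{
    // CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, vec_math_detail::abs_, short, short)
    // expands roughly to: make(vec_math_detail::abs_(v.x), vec_math_detail::abs_(v.y))
    return abs(v);
}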