Commit b705e0d8 authored Aug 27, 2013 by Vladislav Vinogradov
used new device layer for cv::gpu::sum
parent 9fe92e21
Showing 6 changed files with 143 additions and 434 deletions
    modules/cudaarithm/src/cuda/sum.cu                            +116  -294
    modules/cudaarithm/src/reductions.cpp                           +0  -131
    modules/cudev/CMakeLists.txt                                    +1    -1
    modules/cudev/include/opencv2/cudev/grid/detail/reduce.hpp     +3    -6
    modules/cudev/include/opencv2/cudev/grid/reduce.hpp            +8    -0
    modules/cudev/include/opencv2/cudev/util/vec_math.hpp         +15    -2
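Before the per-file diffs, a reminder of the public API this commit reworks; the host-side call pattern of the cv::cuda::sum family is unchanged. A minimal sketch (image size and contents are illustrative only, not from the commit):

    #include <opencv2/core.hpp>
    #include <opencv2/cudaarithm.hpp>

    int main()
    {
        cv::Mat h_img(480, 640, CV_8UC3, cv::Scalar(1, 2, 3));

        cv::cuda::GpuMat d_img;
        d_img.upload(h_img);

        // The explicit-buffer overload shown in this commit: an empty mask
        // selects the unmasked reduction path, and buf is reused across calls.
        cv::cuda::GpuMat buf;
        cv::Scalar s = cv::cuda::sum(d_img, cv::cuda::GpuMat(), buf);

        // Per-channel totals: s[0] == 640*480*1, s[1] == 640*480*2, s[2] == 640*480*3.
        return 0;
    }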
modules/cudaarithm/src/cuda/sum.cu

@@ -40,342 +40,164 @@
 //
 //M*/
Removed (the old implementation, a hand-written CUDA reduction):

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/utility.hpp"

#include "unroll_detail.hpp"

using namespace cv::cuda;
using namespace cv::cuda::device;

namespace sum
{
    __device__ unsigned int blocks_finished = 0;

    template <typename R, int cn> struct AtomicAdd;
    template <typename R> struct AtomicAdd<R, 1>
    {
        static __device__ void run(R* ptr, R val)
        {
            Emulation::glob::atomicAdd(ptr, val);
        }
    };
    template <typename R> struct AtomicAdd<R, 2>
    {
        typedef typename TypeVec<R, 2>::vec_type val_type;

        static __device__ void run(R* ptr, val_type val)
        {
            Emulation::glob::atomicAdd(ptr, val.x);
            Emulation::glob::atomicAdd(ptr + 1, val.y);
        }
    };
    template <typename R> struct AtomicAdd<R, 3>
    {
        typedef typename TypeVec<R, 3>::vec_type val_type;

        static __device__ void run(R* ptr, val_type val)
        {
            Emulation::glob::atomicAdd(ptr, val.x);
            Emulation::glob::atomicAdd(ptr + 1, val.y);
            Emulation::glob::atomicAdd(ptr + 2, val.z);
        }
    };
    template <typename R> struct AtomicAdd<R, 4>
    {
        typedef typename TypeVec<R, 4>::vec_type val_type;

        static __device__ void run(R* ptr, val_type val)
        {
            Emulation::glob::atomicAdd(ptr, val.x);
            Emulation::glob::atomicAdd(ptr + 1, val.y);
            Emulation::glob::atomicAdd(ptr + 2, val.z);
            Emulation::glob::atomicAdd(ptr + 3, val.w);
        }
    };

    template <int BLOCK_SIZE, typename R, int cn>
    struct GlobalReduce
    {
        typedef typename TypeVec<R, cn>::vec_type result_type;

        static __device__ void run(result_type& sum, result_type* result, int tid, int bid, R* smem)
        {
        #if __CUDA_ARCH__ >= 200
            if (tid == 0)
                AtomicAdd<R, cn>::run((R*) result, sum);
        #else
            __shared__ bool is_last;

            if (tid == 0)
            {
                result[bid] = sum;

                __threadfence();

                unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
                is_last = (ticket == gridDim.x * gridDim.y - 1);
            }

            __syncthreads();

            if (is_last)
            {
                sum = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<result_type>::all(0);

                device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));

                if (tid == 0)
                {
                    result[0] = sum;
                    blocks_finished = 0;
                }
            }
        #endif
        }
    };

    template <int BLOCK_SIZE, typename src_type, typename result_type, class Mask, class Op>
    __global__ void kernel(const PtrStepSz<src_type> src, result_type* result, const Mask mask, const Op op, const int twidth, const int theight)
    {
        typedef typename VecTraits<src_type>::elem_type T;
        typedef typename VecTraits<result_type>::elem_type R;
        const int cn = VecTraits<src_type>::cn;

        __shared__ R smem[BLOCK_SIZE * cn];

        const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;

        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
        const int bid = blockIdx.y * gridDim.x + blockIdx.x;

        result_type sum = VecTraits<result_type>::all(0);

        for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
        {
            const src_type* ptr = src.ptr(y);

            for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
            {
                if (mask(y, x))
                {
                    const src_type srcVal = ptr[x];
                    sum = sum + op(saturate_cast<result_type>(srcVal));
                }
            }
        }

        device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));

        GlobalReduce<BLOCK_SIZE, R, cn>::run(sum, result, tid, bid, smem);
    }

    const int threads_x = 32;
    const int threads_y = 8;

    void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
    {
        block = dim3(threads_x, threads_y);

        grid = dim3(divUp(cols, block.x * block.y),
                    divUp(rows, block.y * block.x));

        grid.x = ::min(grid.x, block.x);
        grid.y = ::min(grid.y, block.y);
    }

    void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows)
    {
        dim3 block, grid;
        getLaunchCfg(cols, rows, block, grid);

        bufcols = grid.x * grid.y * sizeof(double) * cn;
        bufrows = 1;
    }

    template <typename T, typename R, int cn, template <typename> class Op>
    void caller(PtrStepSzb src_, void* buf_, double* out, PtrStepSzb mask)
    {
        typedef typename TypeVec<T, cn>::vec_type src_type;
        typedef typename TypeVec<R, cn>::vec_type result_type;

        PtrStepSz<src_type> src(src_);
        result_type* buf = (result_type*) buf_;

        dim3 block, grid;
        getLaunchCfg(src.cols, src.rows, block, grid);

        const int twidth = divUp(divUp(src.cols, grid.x), block.x);
        const int theight = divUp(divUp(src.rows, grid.y), block.y);

        Op<result_type> op;

        if (mask.data)
            kernel<threads_x * threads_y><<<grid, block>>>(src, buf, SingleMask(mask), op, twidth, theight);
        else
            kernel<threads_x * threads_y><<<grid, block>>>(src, buf, WithOutMask(), op, twidth, theight);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );

        R result[4] = {0, 0, 0, 0};
        cudaSafeCall( cudaMemcpy(&result, buf, sizeof(result_type), cudaMemcpyDeviceToHost) );

        out[0] = result[0];
        out[1] = result[1];
        out[2] = result[2];
        out[3] = result[3];
    }

    template <typename T> struct SumType;
    template <> struct SumType<uchar> { typedef unsigned int R; };
    template <> struct SumType<schar> { typedef int R; };
    template <> struct SumType<ushort> { typedef unsigned int R; };
    template <> struct SumType<short> { typedef int R; };
    template <> struct SumType<int> { typedef int R; };
    template <> struct SumType<float> { typedef float R; };
    template <> struct SumType<double> { typedef double R; };

    template <typename T, int cn>
    void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
    {
        typedef typename SumType<T>::R R;
        caller<T, R, cn, identity>(src, buf, out, mask);
    }

    template void run<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

    template <typename T, int cn>
    void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
    {
        typedef typename SumType<T>::R R;
        caller<T, R, cn, abs_func>(src, buf, out, mask);
    }

    template void runAbs<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

    template <typename T> struct Sqr : unary_function<T, T>
    {
        __device__ __forceinline__ T operator ()(T x) const
        {
            return x * x;
        }
    };

    template <typename T, int cn>
    void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
    {
        caller<T, double, cn, Sqr>(src, buf, out, mask);
    }

    template void runSqr<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
}

#endif // CUDA_DISABLER

Added (the new implementation on top of the cudev device layer):

#include "opencv2/opencv_modules.hpp"

#ifndef HAVE_OPENCV_CUDEV

#error "opencv_cudev is required"

#else

#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

namespace
{
    template <typename T, typename R, int cn>
    cv::Scalar sumImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
    {
        typedef typename MakeVec<T, cn>::type src_type;
        typedef typename MakeVec<R, cn>::type res_type;

        GpuMat_<src_type> src(_src);
        GpuMat_<res_type> buf(_buf);

        if (mask.empty())
            gridCalcSum(src, buf);
        else
            gridCalcSum(src, buf, globPtr<uchar>(mask));

        cv::Scalar_<R> res;
        cv::Mat res_mat(buf.size(), buf.type(), res.val);
        buf.download(res_mat);

        return res;
    }

    template <typename T, typename R, int cn>
    cv::Scalar sumAbsImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
    {
        typedef typename MakeVec<T, cn>::type src_type;
        typedef typename MakeVec<R, cn>::type res_type;

        GpuMat_<src_type> src(_src);
        GpuMat_<res_type> buf(_buf);

        if (mask.empty())
            gridCalcSum(abs_(cvt_<res_type>(src)), buf);
        else
            gridCalcSum(abs_(cvt_<res_type>(src)), buf, globPtr<uchar>(mask));

        cv::Scalar_<R> res;
        cv::Mat res_mat(buf.size(), buf.type(), res.val);
        buf.download(res_mat);

        return res;
    }

    template <typename T, typename R, int cn>
    cv::Scalar sumSqrImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
    {
        typedef typename MakeVec<T, cn>::type src_type;
        typedef typename MakeVec<R, cn>::type res_type;

        GpuMat_<src_type> src(_src);
        GpuMat_<res_type> buf(_buf);

        if (mask.empty())
            gridCalcSum(sqr_(cvt_<res_type>(src)), buf);
        else
            gridCalcSum(sqr_(cvt_<res_type>(src)), buf, globPtr<uchar>(mask));

        cv::Scalar_<R> res;
        cv::Mat res_mat(buf.size(), buf.type(), res.val);
        buf.download(res_mat);

        return res;
    }
}

cv::Scalar cv::cuda::sum(InputArray _src, InputArray _mask, GpuMat& buf)
{
    typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
    static const func_t funcs[7][4] =
    {
        {sumImpl<uchar , uint  , 1>, sumImpl<uchar , uint  , 2>, sumImpl<uchar , uint  , 3>, sumImpl<uchar , uint  , 4>},
        {sumImpl<schar , int   , 1>, sumImpl<schar , int   , 2>, sumImpl<schar , int   , 3>, sumImpl<schar , int   , 4>},
        {sumImpl<ushort, uint  , 1>, sumImpl<ushort, uint  , 2>, sumImpl<ushort, uint  , 3>, sumImpl<ushort, uint  , 4>},
        {sumImpl<short , int   , 1>, sumImpl<short , int   , 2>, sumImpl<short , int   , 3>, sumImpl<short , int   , 4>},
        {sumImpl<int   , int   , 1>, sumImpl<int   , int   , 2>, sumImpl<int   , int   , 3>, sumImpl<int   , int   , 4>},
        {sumImpl<float , float , 1>, sumImpl<float , float , 2>, sumImpl<float , float , 3>, sumImpl<float , float , 4>},
        {sumImpl<double, double, 1>, sumImpl<double, double, 2>, sumImpl<double, double, 3>, sumImpl<double, double, 4>}
    };

    GpuMat src = _src.getGpuMat();
    GpuMat mask = _mask.getGpuMat();

    CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );

    const int res_depth = std::max(src.depth(), CV_32F);
    cv::cuda::ensureSizeIsEnough(1, 1, CV_MAKE_TYPE(res_depth, src.channels()), buf);

    const func_t func = funcs[src.depth()][src.channels() - 1];

    return func(src, mask, buf);
}

cv::Scalar cv::cuda::absSum(InputArray _src, InputArray _mask, GpuMat& buf)
{
    typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
    static const func_t funcs[7][4] =
    {
        {sumAbsImpl<uchar , uint  , 1>, sumAbsImpl<uchar , uint  , 2>, sumAbsImpl<uchar , uint  , 3>, sumAbsImpl<uchar , uint  , 4>},
        {sumAbsImpl<schar , int   , 1>, sumAbsImpl<schar , int   , 2>, sumAbsImpl<schar , int   , 3>, sumAbsImpl<schar , int   , 4>},
        {sumAbsImpl<ushort, uint  , 1>, sumAbsImpl<ushort, uint  , 2>, sumAbsImpl<ushort, uint  , 3>, sumAbsImpl<ushort, uint  , 4>},
        {sumAbsImpl<short , int   , 1>, sumAbsImpl<short , int   , 2>, sumAbsImpl<short , int   , 3>, sumAbsImpl<short , int   , 4>},
        {sumAbsImpl<int   , int   , 1>, sumAbsImpl<int   , int   , 2>, sumAbsImpl<int   , int   , 3>, sumAbsImpl<int   , int   , 4>},
        {sumAbsImpl<float , float , 1>, sumAbsImpl<float , float , 2>, sumAbsImpl<float , float , 3>, sumAbsImpl<float , float , 4>},
        {sumAbsImpl<double, double, 1>, sumAbsImpl<double, double, 2>, sumAbsImpl<double, double, 3>, sumAbsImpl<double, double, 4>}
    };

    GpuMat src = _src.getGpuMat();
    GpuMat mask = _mask.getGpuMat();

    CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );

    const int res_depth = std::max(src.depth(), CV_32F);
    cv::cuda::ensureSizeIsEnough(1, 1, CV_MAKE_TYPE(res_depth, src.channels()), buf);

    const func_t func = funcs[src.depth()][src.channels() - 1];

    return func(src, mask, buf);
}

cv::Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf)
{
    typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
    static const func_t funcs[7][4] =
    {
        {sumSqrImpl<uchar , double, 1>, sumSqrImpl<uchar , double, 2>, sumSqrImpl<uchar , double, 3>, sumSqrImpl<uchar , double, 4>},
        {sumSqrImpl<schar , double, 1>, sumSqrImpl<schar , double, 2>, sumSqrImpl<schar , double, 3>, sumSqrImpl<schar , double, 4>},
        {sumSqrImpl<ushort, double, 1>, sumSqrImpl<ushort, double, 2>, sumSqrImpl<ushort, double, 3>, sumSqrImpl<ushort, double, 4>},
        {sumSqrImpl<short , double, 1>, sumSqrImpl<short , double, 2>, sumSqrImpl<short , double, 3>, sumSqrImpl<short , double, 4>},
        {sumSqrImpl<int   , double, 1>, sumSqrImpl<int   , double, 2>, sumSqrImpl<int   , double, 3>, sumSqrImpl<int   , double, 4>},
        {sumSqrImpl<float , double, 1>, sumSqrImpl<float , double, 2>, sumSqrImpl<float , double, 3>, sumSqrImpl<float , double, 4>},
        {sumSqrImpl<double, double, 1>, sumSqrImpl<double, double, 2>, sumSqrImpl<double, double, 3>, sumSqrImpl<double, double, 4>}
    };

    GpuMat src = _src.getGpuMat();
    GpuMat mask = _mask.getGpuMat();

    CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );

    const int res_depth = CV_64F;
    cv::cuda::ensureSizeIsEnough(1, 1, CV_MAKE_TYPE(res_depth, src.channels()), buf);

    const func_t func = funcs[src.depth()][src.channels() - 1];

    return func(src, mask, buf);
}

#endif
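For one concrete entry of the funcs table, sumImpl<uchar, uint, 1> unrolls to roughly the following. A sketch assuming opencv_cudev is built; it is just the template above with T = uchar, R = uint, cn = 1:

    #include <opencv2/cudev.hpp>

    cv::Scalar sumU8C1(const cv::cuda::GpuMat& _src)
    {
        using namespace cv::cudev;

        GpuMat_<uchar> src(_src);  // src_type = MakeVec<uchar, 1>::type = uchar
        GpuMat_<uint> buf;         // res_type = MakeVec<uint, 1>::type = uint

        gridCalcSum(src, buf);     // unmasked branch: one fused grid reduction

        cv::Scalar_<uint> res;
        cv::Mat res_mat(buf.size(), buf.type(), res.val);
        buf.download(res_mat);     // copy the 1x1 device result into res.val

        return res;
    }

The whole hand-written pipeline (per-block shared-memory reduction, atomic cross-block combine, explicit launch configuration, 84 explicit instantiations) collapses into the single gridCalcSum call; the dispatch table now only selects the accumulator type R per source depth.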
modules/cudaarithm/src/reductions.cpp

@@ -186,137 +186,6 @@ double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normT

     return retVal;
 }
Removed (the whole Sum section; these functions now live in modules/cudaarithm/src/cuda/sum.cu):

////////////////////////////////////////////////////////////////////////
// Sum

namespace sum
{
    void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows);

    template <typename T, int cn>
    void run(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);

    template <typename T, int cn>
    void runAbs(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);

    template <typename T, int cn>
    void runSqr(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
}

Scalar cv::cuda::sum(InputArray _src, InputArray _mask, GpuMat& buf)
{
    GpuMat src = _src.getGpuMat();
    GpuMat mask = _mask.getGpuMat();

    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
    static const func_t funcs[7][5] =
    {
        {0, ::sum::run<uchar , 1>, ::sum::run<uchar , 2>, ::sum::run<uchar , 3>, ::sum::run<uchar , 4>},
        {0, ::sum::run<schar , 1>, ::sum::run<schar , 2>, ::sum::run<schar , 3>, ::sum::run<schar , 4>},
        {0, ::sum::run<ushort, 1>, ::sum::run<ushort, 2>, ::sum::run<ushort, 3>, ::sum::run<ushort, 4>},
        {0, ::sum::run<short , 1>, ::sum::run<short , 2>, ::sum::run<short , 3>, ::sum::run<short , 4>},
        {0, ::sum::run<int   , 1>, ::sum::run<int   , 2>, ::sum::run<int   , 3>, ::sum::run<int   , 4>},
        {0, ::sum::run<float , 1>, ::sum::run<float , 2>, ::sum::run<float , 3>, ::sum::run<float , 4>},
        {0, ::sum::run<double, 1>, ::sum::run<double, 2>, ::sum::run<double, 3>, ::sum::run<double, 4>}
    };

    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );

    if (src.depth() == CV_64F)
    {
        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
    }

    Size buf_size;
    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);
    buf.setTo(Scalar::all(0));

    const func_t func = funcs[src.depth()][src.channels()];

    double result[4];
    func(src, buf.data, result, mask);

    return Scalar(result[0], result[1], result[2], result[3]);
}

Scalar cv::cuda::absSum(InputArray _src, InputArray _mask, GpuMat& buf)
{
    GpuMat src = _src.getGpuMat();
    GpuMat mask = _mask.getGpuMat();

    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
    static const func_t funcs[7][5] =
    {
        {0, ::sum::runAbs<uchar , 1>, ::sum::runAbs<uchar , 2>, ::sum::runAbs<uchar , 3>, ::sum::runAbs<uchar , 4>},
        {0, ::sum::runAbs<schar , 1>, ::sum::runAbs<schar , 2>, ::sum::runAbs<schar , 3>, ::sum::runAbs<schar , 4>},
        {0, ::sum::runAbs<ushort, 1>, ::sum::runAbs<ushort, 2>, ::sum::runAbs<ushort, 3>, ::sum::runAbs<ushort, 4>},
        {0, ::sum::runAbs<short , 1>, ::sum::runAbs<short , 2>, ::sum::runAbs<short , 3>, ::sum::runAbs<short , 4>},
        {0, ::sum::runAbs<int   , 1>, ::sum::runAbs<int   , 2>, ::sum::runAbs<int   , 3>, ::sum::runAbs<int   , 4>},
        {0, ::sum::runAbs<float , 1>, ::sum::runAbs<float , 2>, ::sum::runAbs<float , 3>, ::sum::runAbs<float , 4>},
        {0, ::sum::runAbs<double, 1>, ::sum::runAbs<double, 2>, ::sum::runAbs<double, 3>, ::sum::runAbs<double, 4>}
    };

    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );

    if (src.depth() == CV_64F)
    {
        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
    }

    Size buf_size;
    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);
    buf.setTo(Scalar::all(0));

    const func_t func = funcs[src.depth()][src.channels()];

    double result[4];
    func(src, buf.data, result, mask);

    return Scalar(result[0], result[1], result[2], result[3]);
}

Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf)
{
    GpuMat src = _src.getGpuMat();
    GpuMat mask = _mask.getGpuMat();

    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
    static const func_t funcs[7][5] =
    {
        {0, ::sum::runSqr<uchar , 1>, ::sum::runSqr<uchar , 2>, ::sum::runSqr<uchar , 3>, ::sum::runSqr<uchar , 4>},
        {0, ::sum::runSqr<schar , 1>, ::sum::runSqr<schar , 2>, ::sum::runSqr<schar , 3>, ::sum::runSqr<schar , 4>},
        {0, ::sum::runSqr<ushort, 1>, ::sum::runSqr<ushort, 2>, ::sum::runSqr<ushort, 3>, ::sum::runSqr<ushort, 4>},
        {0, ::sum::runSqr<short , 1>, ::sum::runSqr<short , 2>, ::sum::runSqr<short , 3>, ::sum::runSqr<short , 4>},
        {0, ::sum::runSqr<int   , 1>, ::sum::runSqr<int   , 2>, ::sum::runSqr<int   , 3>, ::sum::runSqr<int   , 4>},
        {0, ::sum::runSqr<float , 1>, ::sum::runSqr<float , 2>, ::sum::runSqr<float , 3>, ::sum::runSqr<float , 4>},
        {0, ::sum::runSqr<double, 1>, ::sum::runSqr<double, 2>, ::sum::runSqr<double, 3>, ::sum::runSqr<double, 4>}
    };

    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );

    if (src.depth() == CV_64F)
    {
        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
    }

    Size buf_size;
    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);
    buf.setTo(Scalar::all(0));

    const func_t func = funcs[src.depth()][src.channels()];

    double result[4];
    func(src, buf.data, result, mask);

    return Scalar(result[0], result[1], result[2], result[3]);
}
////////////////////////////////////////////////////////////////////////
// minMax
...
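Two host-side details changed in the move to sum.cu: the dispatch table lost its dummy first column (funcs[7][5] indexed by the raw channel count, with 0 placeholders, became funcs[7][4] indexed by channels - 1), and the reduction buffer shrank from a grid-sized CV_8U scratch area to a single 1x1 element of the result type. A standalone sketch of the indexing change (the stub names are illustrative, not from the commit):

    typedef int (*func_t)(int);

    static int stub(int x) { return x; }

    int main()
    {
        // Old reductions.cpp convention: column 0 is a null placeholder so
        // the table can be indexed by the raw channel count (1..4).
        static const func_t funcs_old[1][5] = { { 0, stub, stub, stub, stub } };

        // New sum.cu convention: compact table, indexed by channels - 1.
        static const func_t funcs_new[1][4] = { { stub, stub, stub, stub } };

        const int channels = 3;
        func_t f_old = funcs_old[0][channels];
        func_t f_new = funcs_new[0][channels - 1];  // same entry, no dead column

        return f_old(0) + f_new(0);
    }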
modules/cudev/CMakeLists.txt

@@ -4,7 +4,7 @@ endif()

 set(the_description "CUDA device layer")

-ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable)
+ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable -Wenum-compare)

 ocv_add_module(cudev)
...
modules/cudev/include/opencv2/cudev/grid/detail/reduce.hpp

@@ -418,9 +418,7 @@ namespace grid_reduce_detail

         const dim3 block(Policy::block_size_x, Policy::block_size_y);
         const dim3 grid(divUp(cols, block.x * Policy::patch_size_x), divUp(rows, block.y * Policy::patch_size_y));

-        const int BLOCK_SIZE = Policy::block_size_x * Policy::block_size_y;
-
-        glob_reduce<Reductor, BLOCK_SIZE, Policy::patch_size_x, Policy::patch_size_y><<<grid, block, 0, stream>>>(src, result, mask, rows, cols);
+        glob_reduce<Reductor, Policy::block_size_x * Policy::block_size_y, Policy::patch_size_x, Policy::patch_size_y><<<grid, block, 0, stream>>>(src, result, mask, rows, cols);

         CV_CUDEV_SAFE_CALL( cudaGetLastError() );

         if (stream == 0)
...
@@ -433,10 +431,9 @@ namespace grid_reduce_detail

     __host__ void sum(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
     {
         typedef typename PtrTraits<SrcPtr>::value_type src_type;

-        const int cn = VecTraits<src_type>::cn;
-        typedef typename MakeVec<ResType, cn>::type work_type;
-
-        glob_reduce<SumReductor<src_type, work_type>, Policy>(src, result, mask, rows, cols, stream);
+        typedef typename VecTraits<ResType>::elem_type res_elem_type;
+
+        glob_reduce<SumReductor<src_type, ResType>, Policy>(src, (res_elem_type*) result, mask, rows, cols, stream);
     }

     template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
...
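The second hunk is the substance: SumReductor is now instantiated with ResType itself rather than a MakeVec<ResType, cn> work type, and the destination pointer is handed over as the scalar element type. A standalone sketch of the typedef involved (VecTraits is the cudev trait used above; storeAsElems is a hypothetical name):

    #include <opencv2/cudev.hpp>

    template <typename ResType>
    void storeAsElems(ResType* result)
    {
        // elem_type of uint3 is uint, of double is double, etc.; the reduction
        // writes the vector result channel-by-channel through this alias.
        typedef typename cv::cudev::VecTraits<ResType>::elem_type res_elem_type;

        res_elem_type* elems = (res_elem_type*) result;
        (void) elems;  // in the real code this is what glob_reduce receives
    }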
modules/cudev/include/opencv2/cudev/grid/reduce.hpp

@@ -59,6 +59,10 @@ namespace cv { namespace cudev {

 template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
 __host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
+    typedef typename PtrTraits<SrcPtr>::value_type src_type;
+
+    CV_StaticAssert( VecTraits<src_type>::cn == VecTraits<ResType>::cn, "" );
+
     dst.create(1, 1);
     dst.setTo(0, stream);
...
@@ -77,6 +81,10 @@ __host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskP

 template <class Policy, class SrcPtr, typename ResType>
 __host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, Stream& stream = Stream::Null())
 {
+    typedef typename PtrTraits<SrcPtr>::value_type src_type;
+
+    CV_StaticAssert( VecTraits<src_type>::cn == VecTraits<ResType>::cn, "" );
+
     dst.create(1, 1);
     dst.setTo(0, stream);
...
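The added CV_StaticAssert turns a channel-count mismatch between source and result into a compile-time error. A sketch of what it guards, using types taken from the sumImpl instantiations above (channelGuardDemo is a hypothetical name):

    #include <opencv2/cudev.hpp>

    void channelGuardDemo(const cv::cudev::GpuMat_<uchar3>& src)
    {
        cv::cudev::GpuMat_<uint3> ok;
        cv::cudev::gridCalcSum(src, ok);      // cn 3 == 3: compiles

        // cv::cudev::GpuMat_<uint> bad;
        // cv::cudev::gridCalcSum(src, bad);  // cn 3 != 1: now rejected by CV_StaticAssert
    }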
modules/cudev/include/opencv2/cudev/util/vec_math.hpp

@@ -194,10 +194,23 @@ CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, uint, uint)

     return VecTraits<output_type ## 4>::make(func (a.x), func (a.y), func (a.z), func (a.w)); \
 }

+namespace vec_math_detail
+{
+    __device__ __forceinline__ schar abs_(schar val)
+    {
+        return (schar) ::abs((int) val);
+    }
+
+    __device__ __forceinline__ short abs_(short val)
+    {
+        return (short) ::abs((int) val);
+    }
+}
+
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, uchar, uchar)
-CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, char, char)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, vec_math_detail::abs_, char, char)
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, ushort, ushort)
-CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, short, short)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, vec_math_detail::abs_, short, short)
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, int, int)
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, uint, uint)
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::fabsf, float, float)
...
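The new helpers spell out the int promotion and the narrowing cast that the plain ::abs call left implicit for schar and short operands. A standalone sketch of the same pattern in a device kernel (my_abs_ mirrors vec_math_detail::abs_; the kernel itself is illustrative, not from the commit):

    #include <cuda_runtime.h>
    #include <cstdlib>

    __device__ __forceinline__ short my_abs_(short val)
    {
        return (short) ::abs((int) val);  // promote, take abs, narrow back explicitly
    }

    __global__ void absShort2(const short2* in, short2* out)
    {
        const int i = threadIdx.x;
        out[i] = make_short2(my_abs_(in[i].x), my_abs_(in[i].y));
    }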