Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
b705e0d8
Commit
b705e0d8
authored
Aug 27, 2013
by
Vladislav Vinogradov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
used new device layer for cv::gpu::sum
parent
9fe92e21
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
145 additions
and
436 deletions
+145
-436
sum.cu
modules/cudaarithm/src/cuda/sum.cu
+118
-296
reductions.cpp
modules/cudaarithm/src/reductions.cpp
+0
-131
CMakeLists.txt
modules/cudev/CMakeLists.txt
+1
-1
reduce.hpp
modules/cudev/include/opencv2/cudev/grid/detail/reduce.hpp
+3
-6
reduce.hpp
modules/cudev/include/opencv2/cudev/grid/reduce.hpp
+8
-0
vec_math.hpp
modules/cudev/include/opencv2/cudev/util/vec_math.hpp
+15
-2
No files found.
modules/cudaarithm/src/cuda/sum.cu
View file @
b705e0d8
...
@@ -40,342 +40,164 @@
...
@@ -40,342 +40,164 @@
//
//
//M*/
//M*/
#i
f !defined CUDA_DISABLER
#i
nclude "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/utility.hpp"
#
include "unroll_detail.hpp
"
#
error "opencv_cudev is required
"
using namespace cv::cuda;
#else
using namespace cv::cuda::device;
namespace sum
#include "opencv2/cudaarithm.hpp"
{
#include "opencv2/cudev.hpp"
__device__ unsigned int blocks_finished = 0;
template <typename R, int cn> struct AtomicAdd;
using namespace cv::cudev;
template <typename R> struct AtomicAdd<R, 1>
{
static __device__ void run(R* ptr, R val)
{
Emulation::glob::atomicAdd(ptr, val);
}
};
template <typename R> struct AtomicAdd<R, 2>
{
typedef typename TypeVec<R, 2>::vec_type val_type;
static __device__ void run(R* ptr, val_type val)
namespace
{
{
Emulation::glob::atomicAdd(ptr, val.x);
template <typename T, typename R, int cn>
Emulation::glob::atomicAdd(ptr + 1, val.y);
cv::Scalar sumImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
}
};
template <typename R> struct AtomicAdd<R, 3>
{
typedef typename TypeVec<R, 3>::vec_type val_type;
static __device__ void run(R* ptr, val_type val)
{
Emulation::glob::atomicAdd(ptr, val.x);
Emulation::glob::atomicAdd(ptr + 1, val.y);
Emulation::glob::atomicAdd(ptr + 2, val.z);
}
};
template <typename R> struct AtomicAdd<R, 4>
{
{
typedef typename TypeVec<R, 4>::vec_type val_type;
typedef typename MakeVec<T, cn>::type src_type;
typedef typename MakeVec<R, cn>::type res_type;
static __device__ void run(R* ptr, val_type val)
{
Emulation::glob::atomicAdd(ptr, val.x);
Emulation::glob::atomicAdd(ptr + 1, val.y);
Emulation::glob::atomicAdd(ptr + 2, val.z);
Emulation::glob::atomicAdd(ptr + 3, val.w);
}
};
template <int BLOCK_SIZE, typename R, int cn>
GpuMat_<src_type> src(_src);
struct GlobalReduce
GpuMat_<res_type> buf(_buf);
{
typedef typename TypeVec<R, cn>::vec_type result_type;
static __device__ void run(result_type& sum, result_type* result, int tid, int bid, R* smem)
{
#if __CUDA_ARCH__ >= 200
if (tid == 0)
AtomicAdd<R, cn>::run((R*) result, sum);
#else
__shared__ bool is_last;
if (tid == 0)
{
result[bid] = sum;
__threadfence();
unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
sum = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<result_type>::all(0);
device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
if (tid == 0)
{
result[0] = sum;
blocks_finished = 0;
}
}
#endif
}
};
template <int BLOCK_SIZE, typename src_type, typename result_type, class Mask, class Op>
if (mask.empty())
__global__ void kernel(const PtrStepSz<src_type> src, result_type* result, const Mask mask, const Op op, const int twidth, const int theight)
gridCalcSum(src, buf);
{
else
typedef typename VecTraits<src_type>::elem_type T;
gridCalcSum(src, buf, globPtr<uchar>(mask));
typedef typename VecTraits<result_type>::elem_type R;
const int cn = VecTraits<src_type>::cn;
__shared__ R smem[BLOCK_SIZE * cn];
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
cv::Scalar_<R> res;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
cv::Mat res_mat(buf.size(), buf.type(), res.val);
buf.download(res_mat);
const int tid = threadIdx.y * blockDim.x + threadIdx.x
;
return res
;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
}
result_type sum = VecTraits<result_type>::all(0);
template <typename T, typename R, int cn>
cv::Scalar sumAbsImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
{
typedef typename MakeVec<T, cn>::type src_type;
typedef typename MakeVec<R, cn>::type res_type;
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
GpuMat_<src_type> src(_src);
{
GpuMat_<res_type> buf(_buf);
const src_type* ptr = src.ptr(y);
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
if (mask.empty())
{
gridCalcSum(abs_(cvt_<res_type>(src)), buf);
if (mask(y, x))
else
{
gridCalcSum(abs_(cvt_<res_type>(src)), buf, globPtr<uchar>(mask));
const src_type srcVal = ptr[x];
sum = sum + op(saturate_cast<result_type>(srcVal));
}
}
}
device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
cv::Scalar_<R> res;
cv::Mat res_mat(buf.size(), buf.type(), res.val);
buf.download(res_mat);
GlobalReduce<BLOCK_SIZE, R, cn>::run(sum, result, tid, bid, smem)
;
return res
;
}
}
const int threads_x = 32;
template <typename T, typename R, int cn>
const int threads_y = 8;
cv::Scalar sumSqrImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
{
{
block = dim3(threads_x, threads_y);
typedef typename MakeVec<T, cn>::type src_type;
typedef typename MakeVec<R, cn>::type res_type;
grid = dim3(divUp(cols, block.x * block.y),
GpuMat_<src_type> src(_src);
divUp(rows, block.y * block.x)
);
GpuMat_<res_type> buf(_buf
);
grid.x = ::min(grid.x, block.x);
if (mask.empty())
grid.y = ::min(grid.y, block.y);
gridCalcSum(sqr_(cvt_<res_type>(src)), buf);
}
else
gridCalcSum(sqr_(cvt_<res_type>(src)), buf, globPtr<uchar>(mask));
void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows)
cv::Scalar_<R> res;
{
cv::Mat res_mat(buf.size(), buf.type(), res.val);
dim3 block, grid;
buf.download(res_mat);
getLaunchCfg(cols, rows, block, grid);
bufcols = grid.x * grid.y * sizeof(double) * cn;
return res;
bufrows = 1;
}
}
}
template <typename T, typename R, int cn, template <typename> class Op>
cv::Scalar cv::cuda::sum(InputArray _src, InputArray _mask, GpuMat& buf)
void caller(PtrStepSzb src_, void* buf_, double* out, PtrStepSzb mask)
{
typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
static const func_t funcs[7][4] =
{
{
typedef typename TypeVec<T, cn>::vec_type src_type;
{sumImpl<uchar , uint , 1>, sumImpl<uchar , uint , 2>, sumImpl<uchar , uint , 3>, sumImpl<uchar , uint , 4>},
typedef typename TypeVec<R, cn>::vec_type result_type;
{sumImpl<schar , int , 1>, sumImpl<schar , int , 2>, sumImpl<schar , int , 3>, sumImpl<schar , int , 4>},
{sumImpl<ushort, uint , 1>, sumImpl<ushort, uint , 2>, sumImpl<ushort, uint , 3>, sumImpl<ushort, uint , 4>},
{sumImpl<short , int , 1>, sumImpl<short , int , 2>, sumImpl<short , int , 3>, sumImpl<short , int , 4>},
{sumImpl<int , int , 1>, sumImpl<int , int , 2>, sumImpl<int , int , 3>, sumImpl<int , int , 4>},
{sumImpl<float , float , 1>, sumImpl<float , float , 2>, sumImpl<float , float , 3>, sumImpl<float , float , 4>},
{sumImpl<double, double, 1>, sumImpl<double, double, 2>, sumImpl<double, double, 3>, sumImpl<double, double, 4>}
};
PtrStepSz<src_type> src(src_
);
GpuMat src = _src.getGpuMat(
);
result_type* buf = (result_type*) buf_
;
GpuMat mask = _mask.getGpuMat()
;
dim3 block, grid;
CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
getLaunchCfg(src.cols, src.rows, block, grid);
const int twidth = divUp(divUp(src.cols, grid.x), block.x
);
const int res_depth = std::max(src.depth(), CV_32F
);
const int theight = divUp(divUp(src.rows, grid.y), block.y
);
cv::cuda::ensureSizeIsEnough(1, 1, CV_MAKE_TYPE(res_depth, src.channels()), buf
);
Op<result_type> op
;
const func_t func = funcs[src.depth()][src.channels() - 1]
;
if (mask.data)
return func(src, mask, buf);
kernel<threads_x * threads_y><<<grid, block>>>(src, buf, SingleMask(mask), op, twidth, theight);
}
else
kernel<threads_x * threads_y><<<grid, block>>>(src, buf, WithOutMask(), op, twidth, theight);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
cv::Scalar cv::cuda::absSum(InputArray _src, InputArray _mask, GpuMat& buf)
{
typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
static const func_t funcs[7][4] =
{
{sumAbsImpl<uchar , uint , 1>, sumAbsImpl<uchar , uint , 2>, sumAbsImpl<uchar , uint , 3>, sumAbsImpl<uchar , uint , 4>},
{sumAbsImpl<schar , int , 1>, sumAbsImpl<schar , int , 2>, sumAbsImpl<schar , int , 3>, sumAbsImpl<schar , int , 4>},
{sumAbsImpl<ushort, uint , 1>, sumAbsImpl<ushort, uint , 2>, sumAbsImpl<ushort, uint , 3>, sumAbsImpl<ushort, uint , 4>},
{sumAbsImpl<short , int , 1>, sumAbsImpl<short , int , 2>, sumAbsImpl<short , int , 3>, sumAbsImpl<short , int , 4>},
{sumAbsImpl<int , int , 1>, sumAbsImpl<int , int , 2>, sumAbsImpl<int , int , 3>, sumAbsImpl<int , int , 4>},
{sumAbsImpl<float , float , 1>, sumAbsImpl<float , float , 2>, sumAbsImpl<float , float , 3>, sumAbsImpl<float , float , 4>},
{sumAbsImpl<double, double, 1>, sumAbsImpl<double, double, 2>, sumAbsImpl<double, double, 3>, sumAbsImpl<double, double, 4>}
};
R result[4] = {0, 0, 0, 0}
;
GpuMat src = _src.getGpuMat()
;
cudaSafeCall( cudaMemcpy(&result, buf, sizeof(result_type), cudaMemcpyDeviceToHost)
);
GpuMat mask = _mask.getGpuMat(
);
out[0] = result[0];
CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
out[1] = result[1];
out[2] = result[2];
out[3] = result[3];
}
template <typename T> struct SumType;
const int res_depth = std::max(src.depth(), CV_32F);
template <> struct SumType<uchar> { typedef unsigned int R; };
cv::cuda::ensureSizeIsEnough(1, 1, CV_MAKE_TYPE(res_depth, src.channels()), buf);
template <> struct SumType<schar> { typedef int R; };
template <> struct SumType<ushort> { typedef unsigned int R; };
template <> struct SumType<short> { typedef int R; };
template <> struct SumType<int> { typedef int R; };
template <> struct SumType<float> { typedef float R; };
template <> struct SumType<double> { typedef double R; };
template <typename T, int cn>
void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
{
typedef typename SumType<T>::R R;
caller<T, R, cn, identity>(src, buf, out, mask);
}
template void run<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
const func_t func = funcs[src.depth()][src.channels() - 1];
template void run<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template <typename T, int cn>
void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
{
typedef typename SumType<T>::R R;
caller<T, R, cn, abs_func>(src, buf, out, mask);
}
template void runAbs<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
return func(src, mask, buf);
template void runAbs<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
}
template void runAbs<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
cv::Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf)
{
template void runAbs<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
template void runAbs<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
static const func_t funcs[7][4] =
template void runAbs<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template <typename T> struct Sqr : unary_function<T, T>
{
{
__device__ __forceinline__ T operator ()(T x) const
{sumSqrImpl<uchar , double, 1>, sumSqrImpl<uchar , double, 2>, sumSqrImpl<uchar , double, 3>, sumSqrImpl<uchar , double, 4>},
{
{sumSqrImpl<schar , double, 1>, sumSqrImpl<schar , double, 2>, sumSqrImpl<schar , double, 3>, sumSqrImpl<schar , double, 4>},
return x * x;
{sumSqrImpl<ushort, double, 1>, sumSqrImpl<ushort, double, 2>, sumSqrImpl<ushort, double, 3>, sumSqrImpl<ushort, double, 4>},
}
{sumSqrImpl<short , double, 1>, sumSqrImpl<short , double, 2>, sumSqrImpl<short , double, 3>, sumSqrImpl<short , double, 4>},
{sumSqrImpl<int , double, 1>, sumSqrImpl<int , double, 2>, sumSqrImpl<int , double, 3>, sumSqrImpl<int , double, 4>},
{sumSqrImpl<float , double, 1>, sumSqrImpl<float , double, 2>, sumSqrImpl<float , double, 3>, sumSqrImpl<float , double, 4>},
{sumSqrImpl<double, double, 1>, sumSqrImpl<double, double, 2>, sumSqrImpl<double, double, 3>, sumSqrImpl<double, double, 4>}
};
};
template <typename T, int cn>
GpuMat src = _src.getGpuMat();
void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
GpuMat mask = _mask.getGpuMat();
{
caller<T, double, cn, Sqr>(src, buf, out, mask);
CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
}
const int res_depth = CV_64F;
cv::cuda::ensureSizeIsEnough(1, 1, CV_MAKE_TYPE(res_depth, src.channels()), buf);
const func_t func = funcs[src.depth()][src.channels() - 1];
template void runSqr<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
return func(src, mask, buf);
template void runSqr<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
}
}
#endif
// CUDA_DISABLER
#endif
modules/cudaarithm/src/reductions.cpp
View file @
b705e0d8
...
@@ -186,137 +186,6 @@ double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normT
...
@@ -186,137 +186,6 @@ double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normT
return
retVal
;
return
retVal
;
}
}
////////////////////////////////////////////////////////////////////////
// Sum
namespace
sum
{
void
getBufSize
(
int
cols
,
int
rows
,
int
cn
,
int
&
bufcols
,
int
&
bufrows
);
template
<
typename
T
,
int
cn
>
void
run
(
PtrStepSzb
src
,
void
*
buf
,
double
*
sum
,
PtrStepSzb
mask
);
template
<
typename
T
,
int
cn
>
void
runAbs
(
PtrStepSzb
src
,
void
*
buf
,
double
*
sum
,
PtrStepSzb
mask
);
template
<
typename
T
,
int
cn
>
void
runSqr
(
PtrStepSzb
src
,
void
*
buf
,
double
*
sum
,
PtrStepSzb
mask
);
}
Scalar
cv
::
cuda
::
sum
(
InputArray
_src
,
InputArray
_mask
,
GpuMat
&
buf
)
{
GpuMat
src
=
_src
.
getGpuMat
();
GpuMat
mask
=
_mask
.
getGpuMat
();
typedef
void
(
*
func_t
)(
PtrStepSzb
src
,
void
*
buf
,
double
*
sum
,
PtrStepSzb
mask
);
static
const
func_t
funcs
[
7
][
5
]
=
{
{
0
,
::
sum
::
run
<
uchar
,
1
>
,
::
sum
::
run
<
uchar
,
2
>
,
::
sum
::
run
<
uchar
,
3
>
,
::
sum
::
run
<
uchar
,
4
>
},
{
0
,
::
sum
::
run
<
schar
,
1
>
,
::
sum
::
run
<
schar
,
2
>
,
::
sum
::
run
<
schar
,
3
>
,
::
sum
::
run
<
schar
,
4
>
},
{
0
,
::
sum
::
run
<
ushort
,
1
>
,
::
sum
::
run
<
ushort
,
2
>
,
::
sum
::
run
<
ushort
,
3
>
,
::
sum
::
run
<
ushort
,
4
>
},
{
0
,
::
sum
::
run
<
short
,
1
>
,
::
sum
::
run
<
short
,
2
>
,
::
sum
::
run
<
short
,
3
>
,
::
sum
::
run
<
short
,
4
>
},
{
0
,
::
sum
::
run
<
int
,
1
>
,
::
sum
::
run
<
int
,
2
>
,
::
sum
::
run
<
int
,
3
>
,
::
sum
::
run
<
int
,
4
>
},
{
0
,
::
sum
::
run
<
float
,
1
>
,
::
sum
::
run
<
float
,
2
>
,
::
sum
::
run
<
float
,
3
>
,
::
sum
::
run
<
float
,
4
>
},
{
0
,
::
sum
::
run
<
double
,
1
>
,
::
sum
::
run
<
double
,
2
>
,
::
sum
::
run
<
double
,
3
>
,
::
sum
::
run
<
double
,
4
>
}
};
CV_Assert
(
mask
.
empty
()
||
(
mask
.
type
()
==
CV_8UC1
&&
mask
.
size
()
==
src
.
size
())
);
if
(
src
.
depth
()
==
CV_64F
)
{
if
(
!
deviceSupports
(
NATIVE_DOUBLE
))
CV_Error
(
cv
::
Error
::
StsUnsupportedFormat
,
"The device doesn't support double"
);
}
Size
buf_size
;
::
sum
::
getBufSize
(
src
.
cols
,
src
.
rows
,
src
.
channels
(),
buf_size
.
width
,
buf_size
.
height
);
ensureSizeIsEnough
(
buf_size
,
CV_8U
,
buf
);
buf
.
setTo
(
Scalar
::
all
(
0
));
const
func_t
func
=
funcs
[
src
.
depth
()][
src
.
channels
()];
double
result
[
4
];
func
(
src
,
buf
.
data
,
result
,
mask
);
return
Scalar
(
result
[
0
],
result
[
1
],
result
[
2
],
result
[
3
]);
}
Scalar
cv
::
cuda
::
absSum
(
InputArray
_src
,
InputArray
_mask
,
GpuMat
&
buf
)
{
GpuMat
src
=
_src
.
getGpuMat
();
GpuMat
mask
=
_mask
.
getGpuMat
();
typedef
void
(
*
func_t
)(
PtrStepSzb
src
,
void
*
buf
,
double
*
sum
,
PtrStepSzb
mask
);
static
const
func_t
funcs
[
7
][
5
]
=
{
{
0
,
::
sum
::
runAbs
<
uchar
,
1
>
,
::
sum
::
runAbs
<
uchar
,
2
>
,
::
sum
::
runAbs
<
uchar
,
3
>
,
::
sum
::
runAbs
<
uchar
,
4
>
},
{
0
,
::
sum
::
runAbs
<
schar
,
1
>
,
::
sum
::
runAbs
<
schar
,
2
>
,
::
sum
::
runAbs
<
schar
,
3
>
,
::
sum
::
runAbs
<
schar
,
4
>
},
{
0
,
::
sum
::
runAbs
<
ushort
,
1
>
,
::
sum
::
runAbs
<
ushort
,
2
>
,
::
sum
::
runAbs
<
ushort
,
3
>
,
::
sum
::
runAbs
<
ushort
,
4
>
},
{
0
,
::
sum
::
runAbs
<
short
,
1
>
,
::
sum
::
runAbs
<
short
,
2
>
,
::
sum
::
runAbs
<
short
,
3
>
,
::
sum
::
runAbs
<
short
,
4
>
},
{
0
,
::
sum
::
runAbs
<
int
,
1
>
,
::
sum
::
runAbs
<
int
,
2
>
,
::
sum
::
runAbs
<
int
,
3
>
,
::
sum
::
runAbs
<
int
,
4
>
},
{
0
,
::
sum
::
runAbs
<
float
,
1
>
,
::
sum
::
runAbs
<
float
,
2
>
,
::
sum
::
runAbs
<
float
,
3
>
,
::
sum
::
runAbs
<
float
,
4
>
},
{
0
,
::
sum
::
runAbs
<
double
,
1
>
,
::
sum
::
runAbs
<
double
,
2
>
,
::
sum
::
runAbs
<
double
,
3
>
,
::
sum
::
runAbs
<
double
,
4
>
}
};
CV_Assert
(
mask
.
empty
()
||
(
mask
.
type
()
==
CV_8UC1
&&
mask
.
size
()
==
src
.
size
())
);
if
(
src
.
depth
()
==
CV_64F
)
{
if
(
!
deviceSupports
(
NATIVE_DOUBLE
))
CV_Error
(
cv
::
Error
::
StsUnsupportedFormat
,
"The device doesn't support double"
);
}
Size
buf_size
;
::
sum
::
getBufSize
(
src
.
cols
,
src
.
rows
,
src
.
channels
(),
buf_size
.
width
,
buf_size
.
height
);
ensureSizeIsEnough
(
buf_size
,
CV_8U
,
buf
);
buf
.
setTo
(
Scalar
::
all
(
0
));
const
func_t
func
=
funcs
[
src
.
depth
()][
src
.
channels
()];
double
result
[
4
];
func
(
src
,
buf
.
data
,
result
,
mask
);
return
Scalar
(
result
[
0
],
result
[
1
],
result
[
2
],
result
[
3
]);
}
Scalar
cv
::
cuda
::
sqrSum
(
InputArray
_src
,
InputArray
_mask
,
GpuMat
&
buf
)
{
GpuMat
src
=
_src
.
getGpuMat
();
GpuMat
mask
=
_mask
.
getGpuMat
();
typedef
void
(
*
func_t
)(
PtrStepSzb
src
,
void
*
buf
,
double
*
sum
,
PtrStepSzb
mask
);
static
const
func_t
funcs
[
7
][
5
]
=
{
{
0
,
::
sum
::
runSqr
<
uchar
,
1
>
,
::
sum
::
runSqr
<
uchar
,
2
>
,
::
sum
::
runSqr
<
uchar
,
3
>
,
::
sum
::
runSqr
<
uchar
,
4
>
},
{
0
,
::
sum
::
runSqr
<
schar
,
1
>
,
::
sum
::
runSqr
<
schar
,
2
>
,
::
sum
::
runSqr
<
schar
,
3
>
,
::
sum
::
runSqr
<
schar
,
4
>
},
{
0
,
::
sum
::
runSqr
<
ushort
,
1
>
,
::
sum
::
runSqr
<
ushort
,
2
>
,
::
sum
::
runSqr
<
ushort
,
3
>
,
::
sum
::
runSqr
<
ushort
,
4
>
},
{
0
,
::
sum
::
runSqr
<
short
,
1
>
,
::
sum
::
runSqr
<
short
,
2
>
,
::
sum
::
runSqr
<
short
,
3
>
,
::
sum
::
runSqr
<
short
,
4
>
},
{
0
,
::
sum
::
runSqr
<
int
,
1
>
,
::
sum
::
runSqr
<
int
,
2
>
,
::
sum
::
runSqr
<
int
,
3
>
,
::
sum
::
runSqr
<
int
,
4
>
},
{
0
,
::
sum
::
runSqr
<
float
,
1
>
,
::
sum
::
runSqr
<
float
,
2
>
,
::
sum
::
runSqr
<
float
,
3
>
,
::
sum
::
runSqr
<
float
,
4
>
},
{
0
,
::
sum
::
runSqr
<
double
,
1
>
,
::
sum
::
runSqr
<
double
,
2
>
,
::
sum
::
runSqr
<
double
,
3
>
,
::
sum
::
runSqr
<
double
,
4
>
}
};
CV_Assert
(
mask
.
empty
()
||
(
mask
.
type
()
==
CV_8UC1
&&
mask
.
size
()
==
src
.
size
())
);
if
(
src
.
depth
()
==
CV_64F
)
{
if
(
!
deviceSupports
(
NATIVE_DOUBLE
))
CV_Error
(
cv
::
Error
::
StsUnsupportedFormat
,
"The device doesn't support double"
);
}
Size
buf_size
;
::
sum
::
getBufSize
(
src
.
cols
,
src
.
rows
,
src
.
channels
(),
buf_size
.
width
,
buf_size
.
height
);
ensureSizeIsEnough
(
buf_size
,
CV_8U
,
buf
);
buf
.
setTo
(
Scalar
::
all
(
0
));
const
func_t
func
=
funcs
[
src
.
depth
()][
src
.
channels
()];
double
result
[
4
];
func
(
src
,
buf
.
data
,
result
,
mask
);
return
Scalar
(
result
[
0
],
result
[
1
],
result
[
2
],
result
[
3
]);
}
////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
// minMax
// minMax
...
...
modules/cudev/CMakeLists.txt
View file @
b705e0d8
...
@@ -4,7 +4,7 @@ endif()
...
@@ -4,7 +4,7 @@ endif()
set
(
the_description
"CUDA device layer"
)
set
(
the_description
"CUDA device layer"
)
ocv_warnings_disable
(
CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable
)
ocv_warnings_disable
(
CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable
-Wenum-compare
)
ocv_add_module
(
cudev
)
ocv_add_module
(
cudev
)
...
...
modules/cudev/include/opencv2/cudev/grid/detail/reduce.hpp
View file @
b705e0d8
...
@@ -418,9 +418,7 @@ namespace grid_reduce_detail
...
@@ -418,9 +418,7 @@ namespace grid_reduce_detail
const
dim3
block
(
Policy
::
block_size_x
,
Policy
::
block_size_y
);
const
dim3
block
(
Policy
::
block_size_x
,
Policy
::
block_size_y
);
const
dim3
grid
(
divUp
(
cols
,
block
.
x
*
Policy
::
patch_size_x
),
divUp
(
rows
,
block
.
y
*
Policy
::
patch_size_y
));
const
dim3
grid
(
divUp
(
cols
,
block
.
x
*
Policy
::
patch_size_x
),
divUp
(
rows
,
block
.
y
*
Policy
::
patch_size_y
));
const
int
BLOCK_SIZE
=
Policy
::
block_size_x
*
Policy
::
block_size_y
;
glob_reduce
<
Reductor
,
Policy
::
block_size_x
*
Policy
::
block_size_y
,
Policy
::
patch_size_x
,
Policy
::
patch_size_y
><<<
grid
,
block
,
0
,
stream
>>>
(
src
,
result
,
mask
,
rows
,
cols
);
glob_reduce
<
Reductor
,
BLOCK_SIZE
,
Policy
::
patch_size_x
,
Policy
::
patch_size_y
><<<
grid
,
block
,
0
,
stream
>>>
(
src
,
result
,
mask
,
rows
,
cols
);
CV_CUDEV_SAFE_CALL
(
cudaGetLastError
()
);
CV_CUDEV_SAFE_CALL
(
cudaGetLastError
()
);
if
(
stream
==
0
)
if
(
stream
==
0
)
...
@@ -433,10 +431,9 @@ namespace grid_reduce_detail
...
@@ -433,10 +431,9 @@ namespace grid_reduce_detail
__host__
void
sum
(
const
SrcPtr
&
src
,
ResType
*
result
,
const
MaskPtr
&
mask
,
int
rows
,
int
cols
,
cudaStream_t
stream
)
__host__
void
sum
(
const
SrcPtr
&
src
,
ResType
*
result
,
const
MaskPtr
&
mask
,
int
rows
,
int
cols
,
cudaStream_t
stream
)
{
{
typedef
typename
PtrTraits
<
SrcPtr
>::
value_type
src_type
;
typedef
typename
PtrTraits
<
SrcPtr
>::
value_type
src_type
;
const
int
cn
=
VecTraits
<
src_type
>::
cn
;
typedef
typename
VecTraits
<
ResType
>::
elem_type
res_elem_type
;
typedef
typename
MakeVec
<
ResType
,
cn
>::
type
work_type
;
glob_reduce
<
SumReductor
<
src_type
,
work_type
>
,
Policy
>
(
src
,
result
,
mask
,
rows
,
cols
,
stream
);
glob_reduce
<
SumReductor
<
src_type
,
ResType
>
,
Policy
>
(
src
,
(
res_elem_type
*
)
result
,
mask
,
rows
,
cols
,
stream
);
}
}
template
<
class
Policy
,
class
SrcPtr
,
typename
ResType
,
class
MaskPtr
>
template
<
class
Policy
,
class
SrcPtr
,
typename
ResType
,
class
MaskPtr
>
...
...
modules/cudev/include/opencv2/cudev/grid/reduce.hpp
View file @
b705e0d8
...
@@ -59,6 +59,10 @@ namespace cv { namespace cudev {
...
@@ -59,6 +59,10 @@ namespace cv { namespace cudev {
template
<
class
Policy
,
class
SrcPtr
,
typename
ResType
,
class
MaskPtr
>
template
<
class
Policy
,
class
SrcPtr
,
typename
ResType
,
class
MaskPtr
>
__host__
void
gridCalcSum_
(
const
SrcPtr
&
src
,
GpuMat_
<
ResType
>&
dst
,
const
MaskPtr
&
mask
,
Stream
&
stream
=
Stream
::
Null
())
__host__
void
gridCalcSum_
(
const
SrcPtr
&
src
,
GpuMat_
<
ResType
>&
dst
,
const
MaskPtr
&
mask
,
Stream
&
stream
=
Stream
::
Null
())
{
{
typedef
typename
PtrTraits
<
SrcPtr
>::
value_type
src_type
;
CV_StaticAssert
(
VecTraits
<
src_type
>::
cn
==
VecTraits
<
ResType
>::
cn
,
""
);
dst
.
create
(
1
,
1
);
dst
.
create
(
1
,
1
);
dst
.
setTo
(
0
,
stream
);
dst
.
setTo
(
0
,
stream
);
...
@@ -77,6 +81,10 @@ __host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskP
...
@@ -77,6 +81,10 @@ __host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskP
template
<
class
Policy
,
class
SrcPtr
,
typename
ResType
>
template
<
class
Policy
,
class
SrcPtr
,
typename
ResType
>
__host__
void
gridCalcSum_
(
const
SrcPtr
&
src
,
GpuMat_
<
ResType
>&
dst
,
Stream
&
stream
=
Stream
::
Null
())
__host__
void
gridCalcSum_
(
const
SrcPtr
&
src
,
GpuMat_
<
ResType
>&
dst
,
Stream
&
stream
=
Stream
::
Null
())
{
{
typedef
typename
PtrTraits
<
SrcPtr
>::
value_type
src_type
;
CV_StaticAssert
(
VecTraits
<
src_type
>::
cn
==
VecTraits
<
ResType
>::
cn
,
""
);
dst
.
create
(
1
,
1
);
dst
.
create
(
1
,
1
);
dst
.
setTo
(
0
,
stream
);
dst
.
setTo
(
0
,
stream
);
...
...
modules/cudev/include/opencv2/cudev/util/vec_math.hpp
View file @
b705e0d8
...
@@ -194,10 +194,23 @@ CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, uint, uint)
...
@@ -194,10 +194,23 @@ CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, uint, uint)
return VecTraits<output_type ## 4>::make(func (a.x), func (a.y), func (a.z), func (a.w)); \
return VecTraits<output_type ## 4>::make(func (a.x), func (a.y), func (a.z), func (a.w)); \
}
}
namespace
vec_math_detail
{
__device__
__forceinline__
schar
abs_
(
schar
val
)
{
return
(
schar
)
::
abs
((
int
)
val
);
}
__device__
__forceinline__
short
abs_
(
short
val
)
{
return
(
short
)
::
abs
((
int
)
val
);
}
}
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
(
abs
,
/*::abs*/
,
uchar
,
uchar
)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
(
abs
,
/*::abs*/
,
uchar
,
uchar
)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
(
abs
,
::
abs
,
char
,
char
)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
(
abs
,
vec_math_detail
::
abs_
,
char
,
char
)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
(
abs
,
/*::abs*/
,
ushort
,
ushort
)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
(
abs
,
/*::abs*/
,
ushort
,
ushort
)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
(
abs
,
::
abs
,
short
,
short
)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
(
abs
,
vec_math_detail
::
abs_
,
short
,
short
)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
(
abs
,
::
abs
,
int
,
int
)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
(
abs
,
::
abs
,
int
,
int
)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
(
abs
,
/*::abs*/
,
uint
,
uint
)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
(
abs
,
/*::abs*/
,
uint
,
uint
)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
(
abs
,
::
fabsf
,
float
,
float
)
CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
(
abs
,
::
fabsf
,
float
,
float
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment