Commit 21233656 authored by Roman Donchenko, committed by OpenCV Buildbot

Merge pull request #1540 from jet47:gpuarithm-cudev

parents e290436a 8ed47c01
@@ -216,7 +216,7 @@ namespace
template <typename T>
void copyWithMask(const GpuMat& src, const GpuMat& dst, const GpuMat& mask, Stream& stream)
{
gridTransform_< CopyToPolicy<sizeof(typename VecTraits<T>::elem_type)> >(globPtr<T>(src), globPtr<T>(dst), identity<T>(), globPtr<uchar>(mask), stream);
gridTransformUnary_< CopyToPolicy<sizeof(typename VecTraits<T>::elem_type)> >(globPtr<T>(src), globPtr<T>(dst), identity<T>(), globPtr<uchar>(mask), stream);
}
}
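The hunks in this file are a pure rename: the unary overloads of gridTransform/gridTransform_ become gridTransformUnary/gridTransformUnary_, freeing the shorter name for the binary variants introduced elsewhere in this PR. A minimal sketch of the renamed masked call, assuming opencv_cudev is available, the file is compiled by nvcc, and the default-policy overload mirrors the policy one shown above (helper name hypothetical):

```cpp
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

// Masked element-wise copy: dst(y, x) = src(y, x) wherever mask(y, x) != 0.
// identity<T> is the unary functor; the mask pointer gates the writes.
template <typename T>
void copyWithMaskSketch(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst,
                        const cv::cuda::GpuMat& mask, cv::cuda::Stream& stream)
{
    gridTransformUnary(globPtr<T>(src), globPtr<T>(dst), identity<T>(),
                       globPtr<uchar>(mask), stream);
}
```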
@@ -268,14 +268,14 @@ namespace
void setToWithOutMask(const GpuMat& mat, Scalar _scalar, Stream& stream)
{
Scalar_<typename VecTraits<T>::elem_type> scalar = _scalar;
gridTransform(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), stream);
gridTransformUnary(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), stream);
}
template <typename T>
void setToWithMask(const GpuMat& mat, const GpuMat& mask, Scalar _scalar, Stream& stream)
{
Scalar_<typename VecTraits<T>::elem_type> scalar = _scalar;
gridTransform(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), globPtr<uchar>(mask), stream);
gridTransformUnary(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), globPtr<uchar>(mask), stream);
}
}
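setTo reuses the same primitive by pairing it with a virtual source: constantPtr(value, rows, cols) allocates nothing and simply yields value at every coordinate, so "fill" becomes "copy from a constant matrix". A sketch for a single-channel float fill, under the same cudev assumptions (helper name hypothetical):

```cpp
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

// Fill a CV_32FC1 GpuMat with 3.5f without allocating a source matrix:
// constantPtr acts as a read-only "matrix" of identical elements.
void fillSketch(cv::cuda::GpuMat& mat, cv::cuda::Stream& stream)
{
    gridTransformUnary(constantPtr(3.5f, mat.rows, mat.cols),
                       globPtr<float>(mat), identity<float>(), stream);
}
```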
@@ -382,7 +382,7 @@ namespace
typedef typename LargerType<src_elem_type, float>::type larger_elem_type;
typedef typename LargerType<float, dst_elem_type>::type scalar_type;
gridTransform_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), saturate_cast_func<T, D>(), stream);
gridTransformUnary_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), saturate_cast_func<T, D>(), stream);
}
template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
@@ -408,7 +408,7 @@ namespace
op.alpha = cv::saturate_cast<scalar_type>(alpha);
op.beta = cv::saturate_cast<scalar_type>(beta);
gridTransform_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), op, stream);
gridTransformUnary_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), op, stream);
}
}
......
@@ -6,7 +6,7 @@ set(the_description "CUDA-accelerated Operations on Matrices")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)
ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudalegacy)
ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudev)
ocv_module_include_directories()
ocv_glob_module_sources()
......
@@ -248,60 +248,3 @@ PERF_TEST_P(Sz_KernelSz_Ccorr, Convolve,
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Integral
PERF_TEST_P(Sz, Integral,
CUDA_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
cv::cuda::GpuMat d_buf;
TEST_CYCLE() cv::cuda::integral(d_src, dst, d_buf);
CUDA_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::integral(src, dst);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// IntegralSqr
PERF_TEST_P(Sz, IntegralSqr,
CUDA_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst, buf;
TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst, buf);
CUDA_SANITY_CHECK(dst);
}
else
{
FAIL_NO_CPU();
}
}
@@ -373,7 +373,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce,
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
TEST_CYCLE() cv::cuda::reduce(d_src, dst, dim, reduceOp);
TEST_CYCLE() cv::cuda::reduce(d_src, dst, dim, reduceOp, CV_32F);
CUDA_SANITY_CHECK(dst);
}
@@ -381,7 +381,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce,
{
cv::Mat dst;
TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp);
TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp, CV_32F);
CPU_SANITY_CHECK(dst);
}
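The explicit CV_32F output depth keeps the two timed paths comparable: with the default dtype, cv::reduce keeps the source depth, so a REDUCE_SUM over CV_8U data saturates at 255 and the sanity check would compare saturated against unsaturated results. A small CPU-side illustration with hypothetical values (function name hypothetical):

```cpp
#include <opencv2/core.hpp>

void reduceDepthExample()
{
    const cv::Mat src = (cv::Mat_<uchar>(1, 3) << 200, 200, 200);

    cv::Mat saturated, exact;
    cv::reduce(src, saturated, 1, cv::REDUCE_SUM);          // dst stays CV_8U: clamps at 255
    cv::reduce(src, exact,     1, cv::REDUCE_SUM, CV_32F);  // accumulates in float: 600
}
```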
@@ -465,3 +465,60 @@ PERF_TEST_P(Sz, MeanStdDev,
SANITY_CHECK(cpu_stddev);
}
}
//////////////////////////////////////////////////////////////////////
// Integral
PERF_TEST_P(Sz, Integral,
CUDA_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
cv::cuda::GpuMat d_buf;
TEST_CYCLE() cv::cuda::integral(d_src, dst, d_buf);
CUDA_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::integral(src, dst);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// IntegralSqr
PERF_TEST_P(Sz, IntegralSqr,
CUDA_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst, buf;
TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst, buf);
CUDA_SANITY_CHECK(dst);
}
else
{
FAIL_NO_CPU();
}
}
@@ -292,95 +292,6 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray
#endif
}
//////////////////////////////////////////////////////////////////////////////
// mulSpectrums
#ifdef HAVE_CUFFT
namespace cv { namespace cuda { namespace device
{
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);
void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);
}}}
#endif
void cv::cuda::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, bool conjB, Stream& stream)
{
#ifndef HAVE_CUFFT
(void) _src1;
(void) _src2;
(void) _dst;
(void) flags;
(void) conjB;
(void) stream;
throw_no_cuda();
#else
(void) flags;
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, PtrStepSz<cufftComplex>, cudaStream_t stream);
static Caller callers[] = { device::mulSpectrums, device::mulSpectrums_CONJ };
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2 );
CV_Assert( src1.size() == src2.size() );
_dst.create(src1.size(), CV_32FC2);
GpuMat dst = _dst.getGpuMat();
Caller caller = callers[(int)conjB];
caller(src1, src2, dst, StreamAccessor::getStream(stream));
#endif
}
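The dispatch itself is unchanged: conjB still selects between the plain and conjugate-multiply kernels through a two-entry function-pointer table. Typical usage, for example in frequency-domain correlation, looks like the sketch below (assumes a cuFFT-enabled build and CV_32FC2 spectra produced by cv::cuda::dft; helper name hypothetical):

```cpp
#include <opencv2/cudaarithm.hpp>

// Multiply spectrum A by conj(spectrum B), which is what correlation needs.
// flags is currently unused by the CUDA implementation (see the (void) flags above).
void correlateSpectra(const cv::cuda::GpuMat& specA,   // CV_32FC2
                      const cv::cuda::GpuMat& specB,   // CV_32FC2, same size
                      cv::cuda::GpuMat& specC,
                      cv::cuda::Stream& stream)
{
    cv::cuda::mulSpectrums(specA, specB, specC, 0 /* flags */, true /* conjB */, stream);
}
```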
//////////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums
#ifdef HAVE_CUFFT
namespace cv { namespace cuda { namespace device
{
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);
void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);
}}}
#endif
void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream)
{
#ifndef HAVE_CUFFT
(void) _src1;
(void) _src2;
(void) _dst;
(void) flags;
(void) scale;
(void) conjB;
(void) stream;
throw_no_cuda();
#else
(void)flags;
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, PtrStepSz<cufftComplex>, cudaStream_t stream);
static Caller callers[] = { device::mulAndScaleSpectrums, device::mulAndScaleSpectrums_CONJ };
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2);
CV_Assert( src1.size() == src2.size() );
_dst.create(src1.size(), CV_32FC2);
GpuMat dst = _dst.getGpuMat();
Caller caller = callers[(int)conjB];
caller(src1, src2, scale, dst, StreamAccessor::getStream(stream));
#endif
}
//////////////////////////////////////////////////////////////////////////////
// dft
......
This diff is collapsed.
@@ -40,43 +40,22 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace arithm
{
struct VAbsDiff4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vabsdiff4(a, b);
}
__host__ __device__ __forceinline__ VAbsDiff4() {}
__host__ __device__ __forceinline__ VAbsDiff4(const VAbsDiff4&) {}
};
#include "opencv2/cudev.hpp"
struct VAbsDiff2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vabsdiff2(a, b);
}
using namespace cv::cudev;
__host__ __device__ __forceinline__ VAbsDiff2() {}
__host__ __device__ __forceinline__ VAbsDiff2(const VAbsDiff2&) {}
};
void absDiffMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int);
namespace
{
__device__ __forceinline__ int _abs(int a)
{
return ::abs(a);
@@ -90,58 +69,120 @@ namespace arithm
return ::fabs(a);
}
template <typename T> struct AbsDiffMat : binary_function<T, T, T>
template <typename T> struct AbsDiffOp1 : binary_function<T, T, T>
{
__device__ __forceinline__ T operator ()(T a, T b) const
{
return saturate_cast<T>(_abs(a - b));
}
__host__ __device__ __forceinline__ AbsDiffMat() {}
__host__ __device__ __forceinline__ AbsDiffMat(const AbsDiffMat&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
template <> struct TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};
template <typename T> struct TransformFunctorTraits< arithm::AbsDiffMat<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <typename T>
void absDiffMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
gridTransformBinary_< TransformPolicy<T> >(globPtr<T>(src1), globPtr<T>(src2), globPtr<T>(dst), AbsDiffOp1<T>(), stream);
}
struct AbsDiffOp2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vabsdiff2(a, b);
}
};
}}}
namespace arithm
{
void absDiffMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
void absDiffMat_v2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
device::transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
const int vcols = src1.cols >> 1;
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
gridTransformBinary(src1_, src2_, dst_, AbsDiffOp2(), stream);
}
void absDiffMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
struct AbsDiffOp4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vabsdiff4(a, b);
}
};
void absDiffMat_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
device::transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
const int vcols = src1.cols >> 2;
GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
gridTransformBinary(src1_, src2_, dst_, AbsDiffOp4(), stream);
}
}
template <typename T>
void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void absDiffMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int)
{
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
static const func_t funcs[] =
{
absDiffMat_v1<uchar>,
absDiffMat_v1<schar>,
absDiffMat_v1<ushort>,
absDiffMat_v1<short>,
absDiffMat_v1<int>,
absDiffMat_v1<float>,
absDiffMat_v1<double>
};
const int depth = src1.depth();
CV_DbgAssert( depth <= CV_64F );
GpuMat src1_ = src1.reshape(1);
GpuMat src2_ = src2.reshape(1);
GpuMat dst_ = dst.reshape(1);
if (depth == CV_8U || depth == CV_16U)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, AbsDiffMat<T>(), WithOutMask(), stream);
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (isAllAligned)
{
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
absDiffMat_v4(src1_, src2_, dst_, stream);
return;
}
else if (depth == CV_16U && (src1_.cols & 1) == 0)
{
absDiffMat_v2(src1_, src2_, dst_, stream);
return;
}
}
}
template void absDiffMat<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
const func_t func = funcs[depth];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
func(src1_, src2_, dst_, stream);
}
#endif // CUDA_DISABLER
#endif
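The dispatcher above is where the SIMD trick lives: when all three base pointers are 32-byte aligned and the single-channel width divides evenly, the matrices are re-viewed as packed uint rows. globPtr((uint*) data, step, rows, vcols) keeps the original byte step but a quarter (or half) of the logical width, so each thread applies the vabsdiff4/vabsdiff2 per-byte (per-halfword) intrinsics to one packed word. A sketch of the gating predicate for the 4-byte case, mirroring the checks above (helper name hypothetical; GpuMat rows come from cudaMallocPitch, so row starts inherit the base alignment):

```cpp
#include <opencv2/core/cuda.hpp>
#include <cstdint>

// True if a row of N uchars can be processed as N/4 uints: every base
// pointer 32-byte aligned and the single-channel width a multiple of 4.
static bool canVectorize4(const cv::cuda::GpuMat& a, const cv::cuda::GpuMat& b,
                          const cv::cuda::GpuMat& c)
{
    const intptr_t pa = reinterpret_cast<intptr_t>(a.data);
    const intptr_t pb = reinterpret_cast<intptr_t>(b.data);
    const intptr_t pc = reinterpret_cast<intptr_t>(c.data);
    return ((pa | pb | pc) & 31) == 0 && (a.cols & 3) == 0;
}
```

The scalar absDiffMat_v1 path stays fully general; the v2/v4 paths are opportunistic fast paths taken only when this predicate holds.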
@@ -40,59 +40,71 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace arithm
#include "opencv2/cudev.hpp"
using namespace cv::cudev;
void absDiffScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int);
namespace
{
template <typename T, typename S> struct AbsDiffScalar : unary_function<T, T>
template <typename T, typename S> struct AbsDiffScalarOp : unary_function<T, T>
{
S val;
__host__ explicit AbsDiffScalar(S val_) : val(val_) {}
__device__ __forceinline__ T operator ()(T a) const
{
abs_func<S> f;
return saturate_cast<T>(f(a - val));
}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T, typename S> struct TransformFunctorTraits< arithm::AbsDiffScalar<T, S> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
{
};
template <> struct TransformPolicy<double> : DefaultTransformPolicy
{
enum {
shift = 1
};
};
}}}
namespace arithm
template <typename SrcType, typename ScalarDepth>
void absDiffScalarImpl(const GpuMat& src, double value, GpuMat& dst, Stream& stream)
{
AbsDiffScalarOp<SrcType, ScalarDepth> op;
op.val = static_cast<ScalarDepth>(value);
gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<SrcType>(dst), op, stream);
}
}
void absDiffScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int)
{
template <typename T, typename S>
void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
typedef void (*func_t)(const GpuMat& src, double val, GpuMat& dst, Stream& stream);
static const func_t funcs[] =
{
AbsDiffScalar<T, S> op(static_cast<S>(val));
absDiffScalarImpl<uchar, float>,
absDiffScalarImpl<schar, float>,
absDiffScalarImpl<ushort, float>,
absDiffScalarImpl<short, float>,
absDiffScalarImpl<int, float>,
absDiffScalarImpl<float, float>,
absDiffScalarImpl<double, double>
};
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, op, WithOutMask(), stream);
}
const int depth = src.depth();
CV_DbgAssert( depth <= CV_64F );
template void absDiffScalar<uchar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<schar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<ushort, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<short, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<int, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<float, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<double, double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
funcs[depth](src, val[0], dst, stream);
}
#endif // CUDA_DISABLER
#endif
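At the API surface this lands behind cv::cuda::absdiff with a scalar second operand; the funcs[] table above routes integer depths through a float intermediate and keeps CV_64F in double before saturating back. A usage sketch, assuming the OpenCV 3.x cudaarithm entry point, which accepts a cv::Scalar for either operand (helper name hypothetical):

```cpp
#include <opencv2/cudaarithm.hpp>

// Per-pixel |src - 7| on the GPU; integer depths compute in float internally.
void absDiffScalarUsage(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst,
                        cv::cuda::Stream& stream)
{
    cv::cuda::absdiff(src, cv::Scalar::all(7), dst, stream);
}
```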
This diff is collapsed.
@@ -40,65 +40,132 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "arithm_func_traits.hpp"
#error "opencv_cudev is required"
using namespace cv::cuda;
using namespace cv::cuda::device;
#else
namespace cv { namespace cuda { namespace device
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv::cudev;
void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op);
namespace
{
template <typename T> struct TransformFunctorTraits< binder2nd< bit_and<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <template <typename> class Op, typename T>
void bitScalarOp(const GpuMat& src, uint value, GpuMat& dst, Stream& stream)
{
};
gridTransformUnary(globPtr<T>(src), globPtr<T>(dst), bind2nd(Op<T>(), value), stream);
}
typedef void (*bit_scalar_func_t)(const GpuMat& src, uint value, GpuMat& dst, Stream& stream);
template <typename T> struct TransformFunctorTraits< binder2nd< bit_or<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <typename T, bit_scalar_func_t func> struct BitScalar
{
static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream)
{
func(src, cv::saturate_cast<T>(value[0]), dst, stream);
}
};
template <typename T> struct TransformFunctorTraits< binder2nd< bit_xor<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <bit_scalar_func_t func> struct BitScalar4
{
static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream)
{
uint packedVal = 0;
packedVal |= cv::saturate_cast<uchar>(value[0]);
packedVal |= cv::saturate_cast<uchar>(value[1]) << 8;
packedVal |= cv::saturate_cast<uchar>(value[2]) << 16;
packedVal |= cv::saturate_cast<uchar>(value[3]) << 24;
func(src, packedVal, dst, stream);
}
};
}}}
namespace arithm
{
template <typename T> void bitScalarAnd(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
template <int DEPTH, int cn> struct NppBitwiseCFunc
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_and<T>(), src2), WithOutMask(), stream);
}
typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const npp_type* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI);
};
template <typename T> void bitScalarOr(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
template <int DEPTH, int cn, typename NppBitwiseCFunc<DEPTH, cn>::func_t func> struct NppBitwiseC
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_or<T>(), src2), WithOutMask(), stream);
}
typedef typename NppBitwiseCFunc<DEPTH, cn>::npp_type npp_type;
static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& _stream)
{
cudaStream_t stream = StreamAccessor::getStream(_stream);
NppStreamHandler h(stream);
NppiSize oSizeROI;
oSizeROI.width = src.cols;
oSizeROI.height = src.rows;
const npp_type pConstants[] =
{
cv::saturate_cast<npp_type>(value[0]),
cv::saturate_cast<npp_type>(value[1]),
cv::saturate_cast<npp_type>(value[2]),
cv::saturate_cast<npp_type>(value[3])
};
nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), pConstants, dst.ptr<npp_type>(), static_cast<int>(dst.step), oSizeROI) );
if (stream == 0)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}
};
}
void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op)
{
(void) mask;
template <typename T> void bitScalarXor(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
typedef void (*func_t)(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream);
static const func_t funcs[3][6][4] =
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_xor<T>(), src2), WithOutMask(), stream);
}
{
{BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
{BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
{BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
{BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
},
{
{BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
{BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
{BitScalar<ushort, bitScalarOp<bit_or, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
{BitScalar<ushort, bitScalarOp<bit_or, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_or, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_or, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
},
{
{BitScalar<uchar, bitScalarOp<bit_xor, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_xor, uint> >::call},
{BitScalar<uchar, bitScalarOp<bit_xor, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_xor, uint> >::call},
{BitScalar<ushort, bitScalarOp<bit_xor, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
{BitScalar<ushort, bitScalarOp<bit_xor, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
}
};
template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
const int depth = src.depth();
const int cn = src.channels();
template void bitScalarOr<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
CV_DbgAssert( depth <= CV_32F );
CV_DbgAssert( cn == 1 || cn == 3 || cn == 4 );
CV_DbgAssert( mask.empty() );
CV_DbgAssert( op >= 0 && op < 3 );
template void bitScalarXor<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
funcs[op][depth][cn - 1](src, value, dst, stream);
}
#endif // CUDA_DISABLER
#endif
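BitScalar4 is the interesting case: for CV_8UC4 the four channel values are packed little-endian into one uint, so the per-pixel mask is applied with a single 32-bit bitwise operation instead of four byte operations. The packing itself is plain shift-and-or arithmetic (helper name hypothetical):

```cpp
#include <cstdint>

// Packing used by BitScalar4 above: four 8-bit channel values become one
// 32-bit word (little-endian byte order).
static uint32_t packScalar4(uint8_t c0, uint8_t c1, uint8_t c2, uint8_t c3)
{
    return uint32_t(c0) | (uint32_t(c1) << 8) | (uint32_t(c2) << 16) | (uint32_t(c3) << 24);
}

// Example: an AND mask keeping only channels 0 and 2 of each pixel:
// packScalar4(0xFF, 0x00, 0xFF, 0x00) == 0x00FF00FF
```

The NPP entries in the table cover the multi-channel cases the packed path cannot (CV_16U and CV_32S with 3 or 4 channels); the null entries mark the unsupported 2-channel column.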
This diff is collapsed.
@@ -40,137 +40,57 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#ifndef HAVE_OPENCV_CUDEV
using namespace cv::cuda;
using namespace cv::cuda::device;
#error "opencv_cudev is required"
namespace countNonZero
{
__device__ unsigned int blocks_finished = 0;
template <int BLOCK_SIZE, typename T>
__global__ void kernel(const PtrStepSz<T> src, unsigned int* count, const int twidth, const int theight)
{
__shared__ unsigned int scount[BLOCK_SIZE];
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
unsigned int mycount = 0;
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const T* ptr = src.ptr(y);
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
const T srcVal = ptr[x];
mycount += (srcVal != 0);
}
}
device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
#if __CUDA_ARCH__ >= 200
if (tid == 0)
::atomicAdd(count, mycount);
#else
__shared__ bool is_last;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
if (tid == 0)
{
count[bid] = mycount;
__threadfence();
unsigned int ticket = ::atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
mycount = tid < gridDim.x * gridDim.y ? count[tid] : 0;
#else
device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
if (tid == 0)
{
count[0] = mycount;
using namespace cv::cudev;
blocks_finished = 0;
}
}
#endif
}
const int threads_x = 32;
const int threads_y = 8;
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
namespace
{
template <typename T>
int countNonZeroImpl(const GpuMat& _src, GpuMat& _buf)
{
block = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
GpuMat_<int>& buf = (GpuMat_<int>&) _buf;
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
gridCountNonZero(src, buf);
void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
int data;
buf.download(cv::Mat(1, 1, buf.type(), &data));
bufcols = grid.x * grid.y * sizeof(int);
bufrows = 1;
return data;
}
}
template <typename T>
int run(const PtrStepSzb src, PtrStep<unsigned int> buf)
int cv::cuda::countNonZero(InputArray _src, GpuMat& buf)
{
typedef int (*func_t)(const GpuMat& _src, GpuMat& _buf);
static const func_t funcs[] =
{
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
countNonZeroImpl<uchar>,
countNonZeroImpl<schar>,
countNonZeroImpl<ushort>,
countNonZeroImpl<short>,
countNonZeroImpl<int>,
countNonZeroImpl<float>,
countNonZeroImpl<double>
};
unsigned int* count_buf = buf.ptr(0);
GpuMat src = _src.getGpuMat();
cudaSafeCall( cudaMemset(count_buf, 0, sizeof(unsigned int)) );
CV_Assert( src.channels() == 1 );
kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, count_buf, twidth, theight);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
unsigned int count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost));
return count;
}
const func_t func = funcs[src.depth()];
template int run<uchar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<schar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<ushort>(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<short >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<int >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<float >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<double>(const PtrStepSzb src, PtrStep<unsigned int> buf);
return func(src, buf);
}
#endif // CUDA_DISABLER
#endif
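The hand-rolled two-pass block reduction, including the pre-Fermi blocks_finished ticket scheme, collapses into a single gridCountNonZero call; the result lands in a 1x1 CV_32S device buffer and is copied back through a wrapping cv::Mat header, exactly as countNonZeroImpl does above. A condensed sketch of the new path for one depth (helper name hypothetical):

```cpp
#include "opencv2/cudev.hpp"

using namespace cv::cudev;

// Count non-zero elements of a single-channel CV_8U matrix with the cudev
// grid primitive; buf is a 1x1 CV_32S device buffer holding the result.
int countNonZeroSketch(const GpuMat_<uchar>& src, GpuMat_<int>& buf)
{
    gridCountNonZero(src, buf);

    int result = 0;
    buf.download(cv::Mat(1, 1, buf.type(), &result));  // copy the scalar back to host
    return result;
}
```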
This diff is collapsed.
@@ -52,13 +52,6 @@
#include "opencv2/core/private.cuda.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_CUDALEGACY
# include "opencv2/cudalegacy.hpp"
# include "opencv2/cudalegacy/private.hpp"
#endif
#ifdef HAVE_CUBLAS
# include <cublas.h>
#endif
......
This diff is collapsed.
@@ -4,7 +4,7 @@ endif()
set(the_description "CUDA device layer")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable)
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable -Wenum-compare)
ocv_add_module(cudev)
......
@@ -73,7 +73,7 @@
#include "cudev/block/vec_distance.hpp"
#include "cudev/grid/copy.hpp"
#include "cudev/grid/glob_reduce.hpp"
#include "cudev/grid/reduce.hpp"
#include "cudev/grid/histogram.hpp"
#include "cudev/grid/integral.hpp"
#include "cudev/grid/pyramids.hpp"
......
This diff is collapsed.