Commit 21233656 authored by Roman Donchenko, committed by OpenCV Buildbot

Merge pull request #1540 from jet47:gpuarithm-cudev

parents e290436a 8ed47c01
@@ -216,7 +216,7 @@ namespace
    template <typename T>
    void copyWithMask(const GpuMat& src, const GpuMat& dst, const GpuMat& mask, Stream& stream)
    {
-       gridTransform_< CopyToPolicy<sizeof(typename VecTraits<T>::elem_type)> >(globPtr<T>(src), globPtr<T>(dst), identity<T>(), globPtr<uchar>(mask), stream);
+       gridTransformUnary_< CopyToPolicy<sizeof(typename VecTraits<T>::elem_type)> >(globPtr<T>(src), globPtr<T>(dst), identity<T>(), globPtr<uchar>(mask), stream);
    }
}
@@ -268,14 +268,14 @@ namespace
    void setToWithOutMask(const GpuMat& mat, Scalar _scalar, Stream& stream)
    {
        Scalar_<typename VecTraits<T>::elem_type> scalar = _scalar;
-       gridTransform(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), stream);
+       gridTransformUnary(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), stream);
    }

    template <typename T>
    void setToWithMask(const GpuMat& mat, const GpuMat& mask, Scalar _scalar, Stream& stream)
    {
        Scalar_<typename VecTraits<T>::elem_type> scalar = _scalar;
-       gridTransform(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), globPtr<uchar>(mask), stream);
+       gridTransformUnary(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), globPtr<uchar>(mask), stream);
    }
}
@@ -382,7 +382,7 @@ namespace
        typedef typename LargerType<src_elem_type, float>::type larger_elem_type;
        typedef typename LargerType<float, dst_elem_type>::type scalar_type;

-       gridTransform_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), saturate_cast_func<T, D>(), stream);
+       gridTransformUnary_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), saturate_cast_func<T, D>(), stream);
    }

    template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
@@ -408,7 +408,7 @@ namespace
        op.alpha = cv::saturate_cast<scalar_type>(alpha);
        op.beta = cv::saturate_cast<scalar_type>(beta);

-       gridTransform_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), op, stream);
+       gridTransformUnary_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), op, stream);
    }
}
...
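The renames above follow the cudev grid API used throughout this commit, which distinguishes unary and binary per-element transforms (gridTransformUnary_/gridTransformBinary_, both visible in the files below). A minimal sketch of the unary call pattern, assuming the opencv_cudev headers from this PR; the scaleByTwo helper is hypothetical and not part of the commit:

#include "opencv2/cudev.hpp"

using namespace cv::cudev;

// Hypothetical helper: multiply every float pixel by 2 on the GPU.
void scaleByTwo(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cv::cuda::Stream& stream)
{
    dst.create(src.size(), src.type());

    // bind2nd turns the binary multiplies<float> functor into a unary one,
    // mirroring the bind2nd usage in the bitwise-scalar file further down.
    gridTransformUnary(globPtr<float>(src), globPtr<float>(dst),
                       bind2nd(multiplies<float>(), 2.0f), stream);
}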
@@ -6,7 +6,7 @@ set(the_description "CUDA-accelerated Operations on Matrices")

ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)

-ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudalegacy)
+ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudev)

ocv_module_include_directories()
ocv_glob_module_sources()
...
@@ -248,60 +248,3 @@ PERF_TEST_P(Sz_KernelSz_Ccorr, Convolve,
        CPU_SANITY_CHECK(dst);
    }
}
//////////////////////////////////////////////////////////////////////
// Integral
PERF_TEST_P(Sz, Integral,
CUDA_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
cv::cuda::GpuMat d_buf;
TEST_CYCLE() cv::cuda::integral(d_src, dst, d_buf);
CUDA_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::integral(src, dst);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// IntegralSqr
PERF_TEST_P(Sz, IntegralSqr,
CUDA_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst, buf;
TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst, buf);
CUDA_SANITY_CHECK(dst);
}
else
{
FAIL_NO_CPU();
}
}
@@ -373,7 +373,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce,
        const cv::cuda::GpuMat d_src(src);
        cv::cuda::GpuMat dst;

-       TEST_CYCLE() cv::cuda::reduce(d_src, dst, dim, reduceOp);
+       TEST_CYCLE() cv::cuda::reduce(d_src, dst, dim, reduceOp, CV_32F);

        CUDA_SANITY_CHECK(dst);
    }
@@ -381,7 +381,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce,
    {
        cv::Mat dst;

-       TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp);
+       TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp, CV_32F);

        CPU_SANITY_CHECK(dst);
    }
@@ -465,3 +465,60 @@ PERF_TEST_P(Sz, MeanStdDev,
        SANITY_CHECK(cpu_stddev);
    }
}
//////////////////////////////////////////////////////////////////////
// Integral
PERF_TEST_P(Sz, Integral,
CUDA_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
cv::cuda::GpuMat d_buf;
TEST_CYCLE() cv::cuda::integral(d_src, dst, d_buf);
CUDA_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::integral(src, dst);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// IntegralSqr
PERF_TEST_P(Sz, IntegralSqr,
CUDA_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst, buf;
TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst, buf);
CUDA_SANITY_CHECK(dst);
}
else
{
FAIL_NO_CPU();
}
}
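For reference, the buffered GPU entry points exercised by the two tests above can be called as follows; this is only a usage sketch, and the frame.png input is a hypothetical example image (8-bit, single channel):

cv::Mat frame = cv::imread("frame.png", cv::IMREAD_GRAYSCALE);   // CV_8UC1

cv::cuda::GpuMat d_src(frame);                  // upload
cv::cuda::GpuMat d_sum, d_sqsum, d_buf;         // d_buf is reused scratch space

cv::cuda::integral(d_src, d_sum, d_buf);        // integral image, CV_32SC1
cv::cuda::sqrIntegral(d_src, d_sqsum, d_buf);   // squared integral image, CV_64FC1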
@@ -292,95 +292,6 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray
#endif
}
//////////////////////////////////////////////////////////////////////////////
// mulSpectrums
#ifdef HAVE_CUFFT
namespace cv { namespace cuda { namespace device
{
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);
void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);
}}}
#endif
void cv::cuda::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, bool conjB, Stream& stream)
{
#ifndef HAVE_CUFFT
(void) _src1;
(void) _src2;
(void) _dst;
(void) flags;
(void) conjB;
(void) stream;
throw_no_cuda();
#else
(void) flags;
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, PtrStepSz<cufftComplex>, cudaStream_t stream);
static Caller callers[] = { device::mulSpectrums, device::mulSpectrums_CONJ };
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2 );
CV_Assert( src1.size() == src2.size() );
_dst.create(src1.size(), CV_32FC2);
GpuMat dst = _dst.getGpuMat();
Caller caller = callers[(int)conjB];
caller(src1, src2, dst, StreamAccessor::getStream(stream));
#endif
}
//////////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums
#ifdef HAVE_CUFFT
namespace cv { namespace cuda { namespace device
{
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);
void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);
}}}
#endif
void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream)
{
#ifndef HAVE_CUFFT
(void) _src1;
(void) _src2;
(void) _dst;
(void) flags;
(void) scale;
(void) conjB;
(void) stream;
throw_no_cuda();
#else
(void)flags;
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, PtrStepSz<cufftComplex>, cudaStream_t stream);
static Caller callers[] = { device::mulAndScaleSpectrums, device::mulAndScaleSpectrums_CONJ };
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2);
CV_Assert( src1.size() == src2.size() );
_dst.create(src1.size(), CV_32FC2);
GpuMat dst = _dst.getGpuMat();
Caller caller = callers[(int)conjB];
caller(src1, src2, scale, dst, StreamAccessor::getStream(stream));
#endif
}
//////////////////////////////////////////////////////////////////////////////
// dft
...
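For context, the wrapper whose CUFFT-backed implementation is touched above multiplies two packed complex spectra element-wise; its public signature is visible in the removed block. A usage sketch, assuming two same-sized CV_32FC2 spectra already produced by cv::cuda::dft:

cv::cuda::GpuMat spectrumA, spectrumB;   // CV_32FC2 outputs of cv::cuda::dft
cv::cuda::GpuMat product;

// flags is currently unused by the implementation (note the (void) flags above);
// pass true for conjB to multiply by the conjugate of the second spectrum.
cv::cuda::mulSpectrums(spectrumA, spectrumB, product, 0, false);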
@@ -40,43 +40,22 @@
//
//M*/
-#if !defined CUDA_DISABLER
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/simd_functions.hpp"
-#include "arithm_func_traits.hpp"
-using namespace cv::cuda;
-using namespace cv::cuda::device;
-namespace arithm
-{
-    struct VAbsDiff4 : binary_function<uint, uint, uint>
-    {
-        __device__ __forceinline__ uint operator ()(uint a, uint b) const
-        {
-            return vabsdiff4(a, b);
-        }
-        __host__ __device__ __forceinline__ VAbsDiff4() {}
-        __host__ __device__ __forceinline__ VAbsDiff4(const VAbsDiff4&) {}
-    };
-    struct VAbsDiff2 : binary_function<uint, uint, uint>
-    {
-        __device__ __forceinline__ uint operator ()(uint a, uint b) const
-        {
-            return vabsdiff2(a, b);
-        }
-        __host__ __device__ __forceinline__ VAbsDiff2() {}
-        __host__ __device__ __forceinline__ VAbsDiff2(const VAbsDiff2&) {}
-    };
+#include "opencv2/opencv_modules.hpp"
+#ifndef HAVE_OPENCV_CUDEV
+#error "opencv_cudev is required"
+#else
+#include "opencv2/cudev.hpp"
+using namespace cv::cudev;
+void absDiffMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int);
+namespace
+{
    __device__ __forceinline__ int _abs(int a)
    {
        return ::abs(a);
@@ -90,58 +69,120 @@
        return ::fabs(a);
    }
-    template <typename T> struct AbsDiffMat : binary_function<T, T, T>
-    {
-        __device__ __forceinline__ T operator ()(T a, T b) const
-        {
-            return saturate_cast<T>(_abs(a - b));
-        }
-        __host__ __device__ __forceinline__ AbsDiffMat() {}
-        __host__ __device__ __forceinline__ AbsDiffMat(const AbsDiffMat&) {}
-    };
-}
-namespace cv { namespace cuda { namespace device
-{
-    template <> struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
-    {
-    };
-    template <> struct TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
-    {
-    };
-    template <typename T> struct TransformFunctorTraits< arithm::AbsDiffMat<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
-    {
-    };
-}}}
-namespace arithm
-{
-    void absDiffMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
-    {
-        device::transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
-    }
-    void absDiffMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
-    {
-        device::transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
-    }
-    template <typename T>
-    void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
-    {
-        device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, AbsDiffMat<T>(), WithOutMask(), stream);
-    }
-    template void absDiffMat<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffMat<schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffMat<short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffMat<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffMat<float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-}
-#endif // CUDA_DISABLER
+    template <typename T> struct AbsDiffOp1 : binary_function<T, T, T>
+    {
+        __device__ __forceinline__ T operator ()(T a, T b) const
+        {
+            return saturate_cast<T>(_abs(a - b));
+        }
+    };
+    template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
+    {
+    };
+    template <> struct TransformPolicy<double> : DefaultTransformPolicy
+    {
+        enum {
+            shift = 1
+        };
+    };
+    template <typename T>
+    void absDiffMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
+    {
+        gridTransformBinary_< TransformPolicy<T> >(globPtr<T>(src1), globPtr<T>(src2), globPtr<T>(dst), AbsDiffOp1<T>(), stream);
+    }
+    struct AbsDiffOp2 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vabsdiff2(a, b);
+        }
+    };
+    void absDiffMat_v2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
+    {
+        const int vcols = src1.cols >> 1;
+        GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
+        GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
+        GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
+        gridTransformBinary(src1_, src2_, dst_, AbsDiffOp2(), stream);
+    }
+    struct AbsDiffOp4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vabsdiff4(a, b);
+        }
+    };
+    void absDiffMat_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
+    {
+        const int vcols = src1.cols >> 2;
+        GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
+        GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
+        GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
+        gridTransformBinary(src1_, src2_, dst_, AbsDiffOp4(), stream);
+    }
+}
+void absDiffMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int)
+{
+    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
+    static const func_t funcs[] =
+    {
+        absDiffMat_v1<uchar>,
+        absDiffMat_v1<schar>,
+        absDiffMat_v1<ushort>,
+        absDiffMat_v1<short>,
+        absDiffMat_v1<int>,
+        absDiffMat_v1<float>,
+        absDiffMat_v1<double>
+    };
+    const int depth = src1.depth();
+    CV_DbgAssert( depth <= CV_64F );
+    GpuMat src1_ = src1.reshape(1);
+    GpuMat src2_ = src2.reshape(1);
+    GpuMat dst_ = dst.reshape(1);
+    if (depth == CV_8U || depth == CV_16U)
+    {
+        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
+        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
+        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
+        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
+        if (isAllAligned)
+        {
+            if (depth == CV_8U && (src1_.cols & 3) == 0)
+            {
+                absDiffMat_v4(src1_, src2_, dst_, stream);
+                return;
+            }
+            else if (depth == CV_16U && (src1_.cols & 1) == 0)
+            {
+                absDiffMat_v2(src1_, src2_, dst_, stream);
+                return;
+            }
+        }
+    }
+    const func_t func = funcs[depth];
+    if (!func)
+        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
+    func(src1_, src2_, dst_, stream);
+}
+#endif
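The _v2/_v4 paths in the new file rely on the per-halfword/per-byte SIMD wrappers (vabsdiff2/vabsdiff4): when all row pointers are 32-byte aligned and the column count divides evenly, two ushort or four uchar pixels are reinterpreted as one uint, so the grid transform runs over half or a quarter of the elements. A stripped-down, hypothetical kernel showing the same idea; it assumes the vabsdiff4() wrapper from the simd-functions header used in this file:

// Per-byte |a - b| on four packed uchars per 32-bit word.
__global__ void absdiff_u8x4(const uint* a, const uint* b, uint* c, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n)
        c[i] = vabsdiff4(a[i], b[i]);
}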
@@ -40,59 +40,71 @@
//
//M*/
-#if !defined CUDA_DISABLER
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/simd_functions.hpp"
-#include "arithm_func_traits.hpp"
-using namespace cv::cuda;
-using namespace cv::cuda::device;
-namespace arithm
-{
-    template <typename T, typename S> struct AbsDiffScalar : unary_function<T, T>
-    {
-        S val;
-        __host__ explicit AbsDiffScalar(S val_) : val(val_) {}
-        __device__ __forceinline__ T operator ()(T a) const
-        {
-            abs_func<S> f;
-            return saturate_cast<T>(f(a - val));
-        }
-    };
-}
-namespace cv { namespace cuda { namespace device
-{
-    template <typename T, typename S> struct TransformFunctorTraits< arithm::AbsDiffScalar<T, S> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
-    {
-    };
-}}}
-namespace arithm
-{
-    template <typename T, typename S>
-    void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
-    {
-        AbsDiffScalar<T, S> op(static_cast<S>(val));
-        device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, op, WithOutMask(), stream);
-    }
-    template void absDiffScalar<uchar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffScalar<schar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffScalar<ushort, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffScalar<short, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffScalar<int, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffScalar<float, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absDiffScalar<double, double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-}
-#endif // CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"
+#ifndef HAVE_OPENCV_CUDEV
+#error "opencv_cudev is required"
+#else
+#include "opencv2/cudev.hpp"
+using namespace cv::cudev;
+void absDiffScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int);
+namespace
+{
+    template <typename T, typename S> struct AbsDiffScalarOp : unary_function<T, T>
+    {
+        S val;
+        __device__ __forceinline__ T operator ()(T a) const
+        {
+            abs_func<S> f;
+            return saturate_cast<T>(f(a - val));
+        }
+    };
+    template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
+    {
+    };
+    template <> struct TransformPolicy<double> : DefaultTransformPolicy
+    {
+        enum {
+            shift = 1
+        };
+    };
+    template <typename SrcType, typename ScalarDepth>
+    void absDiffScalarImpl(const GpuMat& src, double value, GpuMat& dst, Stream& stream)
+    {
+        AbsDiffScalarOp<SrcType, ScalarDepth> op;
+        op.val = static_cast<ScalarDepth>(value);
+        gridTransformUnary_< TransformPolicy<ScalarDepth> >(globPtr<SrcType>(src), globPtr<SrcType>(dst), op, stream);
+    }
+}
+void absDiffScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int)
+{
+    typedef void (*func_t)(const GpuMat& src, double val, GpuMat& dst, Stream& stream);
+    static const func_t funcs[] =
+    {
+        absDiffScalarImpl<uchar, float>,
+        absDiffScalarImpl<schar, float>,
+        absDiffScalarImpl<ushort, float>,
+        absDiffScalarImpl<short, float>,
+        absDiffScalarImpl<int, float>,
+        absDiffScalarImpl<float, float>,
+        absDiffScalarImpl<double, double>
+    };
+    const int depth = src.depth();
+    CV_DbgAssert( depth <= CV_64F );
+    funcs[depth](src, val[0], dst, stream);
+}
+#endif
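At the API level this file backs the GpuMat-versus-Scalar form of absdiff, which is what the absDiffScalar wrapper above exists for. A usage sketch, assuming a single-channel float image already on the device:

cv::cuda::GpuMat d_img;   // CV_32FC1, uploaded elsewhere
cv::cuda::GpuMat d_dst;

// A Scalar second operand selects the scalar code path; for float sources
// the per-element functor above runs with a float scalar value.
cv::cuda::absdiff(d_img, cv::Scalar(0.5), d_dst);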
@@ -40,65 +40,132 @@
//
//M*/
-#if !defined CUDA_DISABLER
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/simd_functions.hpp"
-#include "arithm_func_traits.hpp"
-using namespace cv::cuda;
-using namespace cv::cuda::device;
-namespace cv { namespace cuda { namespace device
-{
-    template <typename T> struct TransformFunctorTraits< binder2nd< bit_and<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
-    {
-    };
-    template <typename T> struct TransformFunctorTraits< binder2nd< bit_or<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
-    {
-    };
-    template <typename T> struct TransformFunctorTraits< binder2nd< bit_xor<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
-    {
-    };
-}}}
-namespace arithm
-{
-    template <typename T> void bitScalarAnd(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
-    {
-        device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_and<T>(), src2), WithOutMask(), stream);
-    }
-    template <typename T> void bitScalarOr(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
-    {
-        device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_or<T>(), src2), WithOutMask(), stream);
-    }
-    template <typename T> void bitScalarXor(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
-    {
-        device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_xor<T>(), src2), WithOutMask(), stream);
-    }
-    template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarAnd<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarAnd<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarAnd<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarOr<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarOr<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarOr<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarOr<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarXor<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarXor<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarXor<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarXor<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-}
-#endif // CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"
+#ifndef HAVE_OPENCV_CUDEV
+#error "opencv_cudev is required"
+#else
+#include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
+using namespace cv::cudev;
+void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op);
+namespace
+{
+    template <template <typename> class Op, typename T>
+    void bitScalarOp(const GpuMat& src, uint value, GpuMat& dst, Stream& stream)
+    {
+        gridTransformUnary(globPtr<T>(src), globPtr<T>(dst), bind2nd(Op<T>(), value), stream);
+    }
+    typedef void (*bit_scalar_func_t)(const GpuMat& src, uint value, GpuMat& dst, Stream& stream);
+    template <typename T, bit_scalar_func_t func> struct BitScalar
+    {
+        static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream)
+        {
+            func(src, cv::saturate_cast<T>(value[0]), dst, stream);
+        }
+    };
+    template <bit_scalar_func_t func> struct BitScalar4
+    {
+        static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream)
+        {
+            uint packedVal = 0;
+            packedVal |= cv::saturate_cast<uchar>(value[0]);
+            packedVal |= cv::saturate_cast<uchar>(value[1]) << 8;
+            packedVal |= cv::saturate_cast<uchar>(value[2]) << 16;
+            packedVal |= cv::saturate_cast<uchar>(value[3]) << 24;
+            func(src, packedVal, dst, stream);
+        }
+    };
+    template <int DEPTH, int cn> struct NppBitwiseCFunc
+    {
+        typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
+        typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const npp_type* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI);
+    };
+    template <int DEPTH, int cn, typename NppBitwiseCFunc<DEPTH, cn>::func_t func> struct NppBitwiseC
+    {
+        typedef typename NppBitwiseCFunc<DEPTH, cn>::npp_type npp_type;
+        static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& _stream)
+        {
+            cudaStream_t stream = StreamAccessor::getStream(_stream);
+            NppStreamHandler h(stream);
+            NppiSize oSizeROI;
+            oSizeROI.width = src.cols;
+            oSizeROI.height = src.rows;
+            const npp_type pConstants[] =
+            {
+                cv::saturate_cast<npp_type>(value[0]),
+                cv::saturate_cast<npp_type>(value[1]),
+                cv::saturate_cast<npp_type>(value[2]),
+                cv::saturate_cast<npp_type>(value[3])
+            };
+            nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), pConstants, dst.ptr<npp_type>(), static_cast<int>(dst.step), oSizeROI) );
+            if (stream == 0)
+                CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
+        }
+    };
+}
+void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op)
+{
+    (void) mask;
+    typedef void (*func_t)(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream);
+    static const func_t funcs[3][6][4] =
+    {
+        {
+            {BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
+            {BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
+            {BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
+            {BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
+            {BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call},
+            {BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
+        },
+        {
+            {BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
+            {BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
+            {BitScalar<ushort, bitScalarOp<bit_or, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
+            {BitScalar<ushort, bitScalarOp<bit_or, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
+            {BitScalar<uint, bitScalarOp<bit_or, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call},
+            {BitScalar<uint, bitScalarOp<bit_or, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
+        },
+        {
+            {BitScalar<uchar, bitScalarOp<bit_xor, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_xor, uint> >::call},
+            {BitScalar<uchar, bitScalarOp<bit_xor, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_xor, uint> >::call},
+            {BitScalar<ushort, bitScalarOp<bit_xor, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
+            {BitScalar<ushort, bitScalarOp<bit_xor, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
+            {BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call},
+            {BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
+        }
+    };
+    const int depth = src.depth();
+    const int cn = src.channels();
+    CV_DbgAssert( depth <= CV_32F );
+    CV_DbgAssert( cn == 1 || cn == 3 || cn == 4 );
+    CV_DbgAssert( mask.empty() );
+    CV_DbgAssert( op >= 0 && op < 3 );
+    funcs[op][depth][cn - 1](src, value, dst, stream);
+}
+#endif
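The funcs[3][6][4] table above dispatches on operation (AND/OR/XOR), source depth, and channel count: single-channel images take the templated bitScalarOp path, three-channel images and 16-bit/32-bit four-channel images go through the NPP *C functions, and 8-bit four-channel images pack the four scalar components into one uint so a single-channel pass handles a whole pixel per word. A usage sketch of the public wrapper, assuming an 8-bit four-channel image on the device:

cv::cuda::GpuMat d_rgba;   // CV_8UC4, uploaded elsewhere
cv::cuda::GpuMat d_masked;

// With an 8UC4 source this corresponds to the BitScalar4 path above:
// the four scalar components are packed into one 32-bit word per pixel.
cv::cuda::bitwise_and(d_rgba, cv::Scalar(255, 255, 255, 0), d_masked);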
@@ -40,137 +40,57 @@
//
//M*/
-#if !defined CUDA_DISABLER
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-#include "opencv2/core/cuda/emulation.hpp"
-using namespace cv::cuda;
-using namespace cv::cuda::device;
-namespace countNonZero
-{
-    __device__ unsigned int blocks_finished = 0;
-    template <int BLOCK_SIZE, typename T>
-    __global__ void kernel(const PtrStepSz<T> src, unsigned int* count, const int twidth, const int theight)
-    {
-        __shared__ unsigned int scount[BLOCK_SIZE];
-        const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
-        const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-        unsigned int mycount = 0;
-        for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
-        {
-            const T* ptr = src.ptr(y);
-            for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
-            {
-                const T srcVal = ptr[x];
-                mycount += (srcVal != 0);
-            }
-        }
-        device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
-    #if __CUDA_ARCH__ >= 200
-        if (tid == 0)
-            ::atomicAdd(count, mycount);
-    #else
-        __shared__ bool is_last;
-        const int bid = blockIdx.y * gridDim.x + blockIdx.x;
-        if (tid == 0)
-        {
-            count[bid] = mycount;
-            __threadfence();
-            unsigned int ticket = ::atomicInc(&blocks_finished, gridDim.x * gridDim.y);
-            is_last = (ticket == gridDim.x * gridDim.y - 1);
-        }
-        __syncthreads();
-        if (is_last)
-        {
-            mycount = tid < gridDim.x * gridDim.y ? count[tid] : 0;
-            device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
-            if (tid == 0)
-            {
-                count[0] = mycount;
-                blocks_finished = 0;
-            }
-        }
-    #endif
-    }
-    const int threads_x = 32;
-    const int threads_y = 8;
-    void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
-    {
-        block = dim3(threads_x, threads_y);
-        grid = dim3(divUp(cols, block.x * block.y),
-                    divUp(rows, block.y * block.x));
-        grid.x = ::min(grid.x, block.x);
-        grid.y = ::min(grid.y, block.y);
-    }
-    void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
-    {
-        dim3 block, grid;
-        getLaunchCfg(cols, rows, block, grid);
-        bufcols = grid.x * grid.y * sizeof(int);
-        bufrows = 1;
-    }
-    template <typename T>
-    int run(const PtrStepSzb src, PtrStep<unsigned int> buf)
-    {
-        dim3 block, grid;
-        getLaunchCfg(src.cols, src.rows, block, grid);
-        const int twidth = divUp(divUp(src.cols, grid.x), block.x);
-        const int theight = divUp(divUp(src.rows, grid.y), block.y);
-        unsigned int* count_buf = buf.ptr(0);
-        cudaSafeCall( cudaMemset(count_buf, 0, sizeof(unsigned int)) );
-        kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, count_buf, twidth, theight);
-        cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaDeviceSynchronize() );
-        unsigned int count;
-        cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost));
-        return count;
-    }
-    template int run<uchar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
-    template int run<schar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
-    template int run<ushort>(const PtrStepSzb src, PtrStep<unsigned int> buf);
-    template int run<short >(const PtrStepSzb src, PtrStep<unsigned int> buf);
-    template int run<int   >(const PtrStepSzb src, PtrStep<unsigned int> buf);
-    template int run<float >(const PtrStepSzb src, PtrStep<unsigned int> buf);
-    template int run<double>(const PtrStepSzb src, PtrStep<unsigned int> buf);
-}
-#endif // CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"
+#ifndef HAVE_OPENCV_CUDEV
+#error "opencv_cudev is required"
+#else
+#include "opencv2/cudaarithm.hpp"
+#include "opencv2/cudev.hpp"
+using namespace cv::cudev;
+namespace
+{
+    template <typename T>
+    int countNonZeroImpl(const GpuMat& _src, GpuMat& _buf)
+    {
+        const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
+        GpuMat_<int>& buf = (GpuMat_<int>&) _buf;
+        gridCountNonZero(src, buf);
+        int data;
+        buf.download(cv::Mat(1, 1, buf.type(), &data));
+        return data;
+    }
+}
+int cv::cuda::countNonZero(InputArray _src, GpuMat& buf)
+{
+    typedef int (*func_t)(const GpuMat& _src, GpuMat& _buf);
+    static const func_t funcs[] =
+    {
+        countNonZeroImpl<uchar>,
+        countNonZeroImpl<schar>,
+        countNonZeroImpl<ushort>,
+        countNonZeroImpl<short>,
+        countNonZeroImpl<int>,
+        countNonZeroImpl<float>,
+        countNonZeroImpl<double>
+    };
+    GpuMat src = _src.getGpuMat();
+    CV_Assert( src.channels() == 1 );
+    const func_t func = funcs[src.depth()];
+    return func(src, buf);
+}
+#endif
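The new version replaces the hand-written reduction kernel and launch-configuration helpers with a single gridCountNonZero call and downloads the scalar result straight into a stack variable. A usage sketch of the buffered overload shown above, assuming a single-channel mask already on the device:

cv::cuda::GpuMat d_mask;   // CV_8UC1, uploaded elsewhere
cv::cuda::GpuMat d_buf;    // scratch buffer, reusable across calls

const int nonZero = cv::cuda::countNonZero(d_mask, d_buf);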
@@ -52,13 +52,6 @@
#include "opencv2/core/private.cuda.hpp"

-#include "opencv2/opencv_modules.hpp"
-
-#ifdef HAVE_OPENCV_CUDALEGACY
-# include "opencv2/cudalegacy.hpp"
-# include "opencv2/cudalegacy/private.hpp"
-#endif
-
#ifdef HAVE_CUBLAS
# include <cublas.h>
#endif
...
@@ -4,7 +4,7 @@ endif()

set(the_description "CUDA device layer")

-ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable)
+ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable -Wenum-compare)

ocv_add_module(cudev)
...
@@ -73,7 +73,7 @@
#include "cudev/block/vec_distance.hpp"

#include "cudev/grid/copy.hpp"
-#include "cudev/grid/glob_reduce.hpp"
+#include "cudev/grid/reduce.hpp"
#include "cudev/grid/histogram.hpp"
#include "cudev/grid/integral.hpp"
#include "cudev/grid/pyramids.hpp"
...