Commit d2591704 authored by Vladislav Vinogradov

Merge branch 'kepler-optimization' into cuda-dev

parents 22b0ea1c 2eca75cc
@@ -79,6 +79,8 @@ namespace cv { namespace gpu
        WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
    };

+   CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
+
    // Gives information about what GPU archs this OpenCV GPU module was
    // compiled for
    class CV_EXPORTS TargetArchs
...
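Note: deviceSupports() is the new single entry point that combines the compile-time check (TargetArchs) with the compute capability of the current device. A minimal usage sketch, assuming only the public header shown above (the function name runKeplerPathIfPossible is illustrative):

    // Hedged usage sketch for the new runtime/compile-time feature check.
    #include "opencv2/gpu/gpu.hpp"

    void runKeplerPathIfPossible()
    {
        if (cv::gpu::deviceSupports(cv::gpu::FEATURE_SET_COMPUTE_30))
        {
            // Warp-shuffle (Kepler) kernels are both compiled in and runnable here.
        }
        else
        {
            // Fall back to the shared-memory code path.
        }
    }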
@@ -44,6 +44,7 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/type_traits.hpp"

namespace cv { namespace gpu { namespace device
{
@@ -54,6 +55,7 @@ namespace cv { namespace gpu { namespace device
    void writeScalar(const int*);
    void writeScalar(const float*);
    void writeScalar(const double*);
+   void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
    void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t);
}}}
@@ -226,16 +228,16 @@ namespace cv { namespace gpu { namespace device
    //////////////////////////////// ConvertTo ////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////

-   template <typename T, typename D> struct Convertor : unary_function<T, D>
+   template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
    {
-       Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}
+       Convertor(S alpha_, S beta_) : alpha(alpha_), beta(beta_) {}

-       __device__ __forceinline__ D operator()(const T& src) const
+       __device__ __forceinline__ D operator()(typename TypeTraits<T>::ParameterType src) const
        {
            return saturate_cast<D>(alpha * src + beta);
        }

-       double alpha, beta;
+       S alpha, beta;
    };

    namespace detail
@@ -282,16 +284,16 @@ namespace cv { namespace gpu { namespace device
        };
    }

-   template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
+   template <typename T, typename D, typename S> struct TransformFunctorTraits< Convertor<T, D, S> > : detail::ConvertTraits< Convertor<T, D, S> >
    {
    };

-   template<typename T, typename D>
+   template<typename T, typename D, typename S>
    void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream)
    {
        cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
        cudaSafeCall( cudaSetDoubleForDevice(&beta) );
-       Convertor<T, D> op(alpha, beta);
+       Convertor<T, D, S> op(static_cast<S>(alpha), static_cast<S>(beta));
        cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
    }
@@ -304,36 +306,74 @@ namespace cv { namespace gpu { namespace device
    {
        typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream);

-       static const caller_t tab[8][8] =
+       static const caller_t tab[7][7] =
        {
-           {cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>, cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},
-           {cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>, cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},
-           {cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>, cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},
-           {cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>, cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},
-           {cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>, cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},
-           {cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>, cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},
-           {cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>, cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},
-           {0,0,0,0,0,0,0,0}
+           {cvt_<uchar, uchar, float>,  cvt_<uchar, schar, float>,  cvt_<uchar, ushort, float>,  cvt_<uchar, short, float>,  cvt_<uchar, int, float>,  cvt_<uchar, float, float>,  cvt_<uchar, double, double>},
+           {cvt_<schar, uchar, float>,  cvt_<schar, schar, float>,  cvt_<schar, ushort, float>,  cvt_<schar, short, float>,  cvt_<schar, int, float>,  cvt_<schar, float, float>,  cvt_<schar, double, double>},
+           {cvt_<ushort, uchar, float>, cvt_<ushort, schar, float>, cvt_<ushort, ushort, float>, cvt_<ushort, short, float>, cvt_<ushort, int, float>, cvt_<ushort, float, float>, cvt_<ushort, double, double>},
+           {cvt_<short, uchar, float>,  cvt_<short, schar, float>,  cvt_<short, ushort, float>,  cvt_<short, short, float>,  cvt_<short, int, float>,  cvt_<short, float, float>,  cvt_<short, double, double>},
+           {cvt_<int, uchar, float>,    cvt_<int, schar, float>,    cvt_<int, ushort, float>,    cvt_<int, short, float>,    cvt_<int, int, double>,   cvt_<int, float, double>,   cvt_<int, double, double>},
+           {cvt_<float, uchar, float>,  cvt_<float, schar, float>,  cvt_<float, ushort, float>,  cvt_<float, short, float>,  cvt_<float, int, float>,  cvt_<float, float, float>,  cvt_<float, double, double>},
+           {cvt_<double, uchar, double>, cvt_<double, schar, double>, cvt_<double, ushort, double>, cvt_<double, short, double>, cvt_<double, int, double>, cvt_<double, float, double>, cvt_<double, double, double>}
        };

        caller_t func = tab[sdepth][ddepth];
-       if (!func)
-           cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__, "convert_gpu");
-
        func(src, dst, alpha, beta, stream);
    }
...
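The extra template parameter S is the arithmetic type used for alpha and beta: float for the small depths, double only where int or double precision is actually required. A hedged sketch of how the 7x7 table is indexed (depth constants follow the usual CV_8U..CV_64F order; the sample values and exampleConvertDispatch are illustrative):

    // Hedged dispatch sketch; convert_gpu and cvt_ are declared in the diff above.
    void exampleConvertDispatch(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
    {
        const int sdepth = 0; // CV_8U  -> row 0 of tab
        const int ddepth = 5; // CV_32F -> column 5 of tab
        // tab[0][5] is cvt_<uchar, float, float>: alpha/beta are applied in float;
        // tab[0][6] would be cvt_<uchar, double, double>, keeping double math only for CV_64F.
        convert_gpu(src, sdepth, dst, ddepth, 1.0 / 255.0, 0.0, stream);
    }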
@@ -69,33 +69,89 @@ using namespace cv::gpu;

namespace
{
-   // Compares value to set using the given comparator. Returns true if
-   // there is at least one element x in the set satisfying to: x cmp value
-   // predicate.
-   template <typename Comparer>
-   bool compareToSet(const std::string& set_as_str, int value, Comparer cmp)
-   {
-       if (set_as_str.find_first_not_of(" ") == string::npos)
-           return false;
-
-       std::stringstream stream(set_as_str);
-       int cur_value;
-       while (!stream.eof())
-       {
-           stream >> cur_value;
-           if (cmp(cur_value, value))
-               return true;
-       }
-
-       return false;
-   }
+   class CudaArch
+   {
+   public:
+       CudaArch();
+
+       bool builtWith(FeatureSet feature_set) const;
+       bool hasPtx(int major, int minor) const;
+       bool hasBin(int major, int minor) const;
+       bool hasEqualOrLessPtx(int major, int minor) const;
+       bool hasEqualOrGreaterPtx(int major, int minor) const;
+       bool hasEqualOrGreaterBin(int major, int minor) const;
+
+   private:
+       static void fromStr(const string& set_as_str, vector<int>& arr);
+
+       vector<int> bin;
+       vector<int> ptx;
+       vector<int> features;
+   };
+
+   const CudaArch cudaArch;
+
+   CudaArch::CudaArch()
+   {
+   #ifdef HAVE_CUDA
+       fromStr(CUDA_ARCH_BIN, bin);
+       fromStr(CUDA_ARCH_PTX, ptx);
+       fromStr(CUDA_ARCH_FEATURES, features);
+   #endif
+   }
+
+   bool CudaArch::builtWith(FeatureSet feature_set) const          { return !features.empty() && (features.back() >= feature_set); }
+   bool CudaArch::hasPtx(int major, int minor) const               { return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); }
+   bool CudaArch::hasBin(int major, int minor) const               { return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); }
+   bool CudaArch::hasEqualOrLessPtx(int major, int minor) const    { return !ptx.empty() && (ptx.front() <= major * 10 + minor); }
+   bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const { return !ptx.empty() && (ptx.back() >= major * 10 + minor); }
+   bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const { return !bin.empty() && (bin.back() >= major * 10 + minor); }
+
+   void CudaArch::fromStr(const string& set_as_str, vector<int>& arr)
+   {
+       if (set_as_str.find_first_not_of(" ") == string::npos)
+           return;
+
+       istringstream stream(set_as_str);
+       int cur_value;
+       while (!stream.eof())
+       {
+           stream >> cur_value;
+           arr.push_back(cur_value);
+       }
+
+       sort(arr.begin(), arr.end());
+   }
}
bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set) bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set)
{ {
#if defined (HAVE_CUDA) #if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_FEATURES, feature_set, std::greater_equal<int>()); return cudaArch.builtWith(feature_set);
#else #else
(void)feature_set; (void)feature_set;
return false; return false;
...@@ -110,7 +166,7 @@ bool cv::gpu::TargetArchs::has(int major, int minor) ...@@ -110,7 +166,7 @@ bool cv::gpu::TargetArchs::has(int major, int minor)
bool cv::gpu::TargetArchs::hasPtx(int major, int minor) bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
{ {
#if defined (HAVE_CUDA) #if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::equal_to<int>()); return cudaArch.hasPtx(major, minor);
#else #else
(void)major; (void)major;
(void)minor; (void)minor;
...@@ -121,7 +177,7 @@ bool cv::gpu::TargetArchs::hasPtx(int major, int minor) ...@@ -121,7 +177,7 @@ bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
bool cv::gpu::TargetArchs::hasBin(int major, int minor) bool cv::gpu::TargetArchs::hasBin(int major, int minor)
{ {
#if defined (HAVE_CUDA) #if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor, std::equal_to<int>()); return cudaArch.hasBin(major, minor);
#else #else
(void)major; (void)major;
(void)minor; (void)minor;
...@@ -132,8 +188,7 @@ bool cv::gpu::TargetArchs::hasBin(int major, int minor) ...@@ -132,8 +188,7 @@ bool cv::gpu::TargetArchs::hasBin(int major, int minor)
bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
{ {
#if defined (HAVE_CUDA) #if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, return cudaArch.hasEqualOrLessPtx(major, minor);
std::less_equal<int>());
#else #else
(void)major; (void)major;
(void)minor; (void)minor;
...@@ -143,14 +198,13 @@ bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) ...@@ -143,14 +198,13 @@ bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor)
{ {
return hasEqualOrGreaterPtx(major, minor) || return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
hasEqualOrGreaterBin(major, minor);
} }
bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
{ {
#if defined (HAVE_CUDA) #if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::greater_equal<int>()); return cudaArch.hasEqualOrGreaterPtx(major, minor);
#else #else
(void)major; (void)major;
(void)minor; (void)minor;
...@@ -161,8 +215,7 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) ...@@ -161,8 +215,7 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
{ {
#if defined (HAVE_CUDA) #if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor, return cudaArch.hasEqualOrGreaterBin(major, minor);
std::greater_equal<int>());
#else #else
(void)major; (void)major;
(void)minor; (void)minor;
...@@ -170,6 +223,31 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) ...@@ -170,6 +223,31 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
#endif #endif
} }
bool cv::gpu::deviceSupports(FeatureSet feature_set)
{
static int versions[] =
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
static const int cache_size = static_cast<int>(sizeof(versions) / sizeof(versions[0]));
const int devId = getDevice();
int version;
if (devId < cache_size && versions[devId] >= 0)
version = versions[devId];
else
{
DeviceInfo dev(devId);
version = dev.majorVersion() * 10 + dev.minorVersion();
if (devId < cache_size)
versions[devId] = version;
}
return TargetArchs::builtWith(feature_set) && (version >= feature_set);
}
#if !defined (HAVE_CUDA) #if !defined (HAVE_CUDA)
#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
......
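CudaArch::fromStr parses the space-separated architecture lists baked in at build time (CUDA_ARCH_BIN, CUDA_ARCH_PTX, CUDA_ARCH_FEATURES) and sorts them, which is what makes the front()/back() comparisons above valid. A standalone sketch of that parsing contract, for illustration only (parseArchList is a hypothetical name, not the library function):

    // Hedged sketch: "30 20 11" -> {11, 20, 30}; an all-blank string leaves arr empty.
    #include <algorithm>
    #include <sstream>
    #include <string>
    #include <vector>

    static void parseArchList(const std::string& set_as_str, std::vector<int>& arr)
    {
        if (set_as_str.find_first_not_of(" ") == std::string::npos)
            return;                       // empty list: nothing was compiled for

        std::istringstream stream(set_as_str);
        int cur_value;
        while (stream >> cur_value)       // slightly stricter than the eof() loop above
            arr.push_back(cur_value);

        std::sort(arr.begin(), arr.end());
    }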
@@ -302,18 +302,18 @@ namespace cv { namespace gpu { namespace device
    template <> struct name<type> : binary_function<type, type, type> \
    { \
        __device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
-       __device__ __forceinline__ name(const name& other):binary_function<type, type, type>(){}\
-       __device__ __forceinline__ name():binary_function<type, type, type>(){}\
+       __device__ __forceinline__ name() {}\
+       __device__ __forceinline__ name(const name&) {}\
    };

    template <typename T> struct maximum : binary_function<T, T, T>
    {
        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
        {
-           return lhs < rhs ? rhs : lhs;
+           return max(lhs, rhs);
        }
-       __device__ __forceinline__ maximum(const maximum& other):binary_function<T, T, T>(){}
-       __device__ __forceinline__ maximum():binary_function<T, T, T>(){}
+       __device__ __forceinline__ maximum() {}
+       __device__ __forceinline__ maximum(const maximum&) {}
    };

    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)
@@ -330,10 +330,10 @@ namespace cv { namespace gpu { namespace device
    {
        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
        {
-           return lhs < rhs ? lhs : rhs;
+           return min(lhs, rhs);
        }
-       __device__ __forceinline__ minimum(const minimum& other):binary_function<T, T, T>(){}
-       __device__ __forceinline__ minimum():binary_function<T, T, T>(){}
+       __device__ __forceinline__ minimum() {}
+       __device__ __forceinline__ minimum(const minimum&) {}
    };

    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)
@@ -350,6 +350,108 @@ namespace cv { namespace gpu { namespace device
    // Math functions
    ///bound=========================================
template <typename T> struct abs_func : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType x) const
{
return abs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<unsigned char> : unary_function<unsigned char, unsigned char>
{
__device__ __forceinline__ unsigned char operator ()(unsigned char x) const
{
return x;
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<signed char> : unary_function<signed char, signed char>
{
__device__ __forceinline__ signed char operator ()(signed char x) const
{
return ::abs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<char> : unary_function<char, char>
{
__device__ __forceinline__ char operator ()(char x) const
{
return ::abs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<unsigned short> : unary_function<unsigned short, unsigned short>
{
__device__ __forceinline__ unsigned short operator ()(unsigned short x) const
{
return x;
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<short> : unary_function<short, short>
{
__device__ __forceinline__ short operator ()(short x) const
{
return ::abs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<unsigned int> : unary_function<unsigned int, unsigned int>
{
__device__ __forceinline__ unsigned int operator ()(unsigned int x) const
{
return x;
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<int> : unary_function<int, int>
{
__device__ __forceinline__ int operator ()(int x) const
{
return ::abs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<float> : unary_function<float, float>
{
__device__ __forceinline__ float operator ()(float x) const
{
return ::fabsf(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<double> : unary_function<double, double>
{
__device__ __forceinline__ double operator ()(double x) const
{
return ::fabs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
    #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \
    template <typename T> struct name ## _func : unary_function<T, float> \
    { \
@@ -357,6 +459,8 @@ namespace cv { namespace gpu { namespace device
        { \
            return func ## f(v); \
        } \
+       __device__ __forceinline__ name ## _func() {} \
+       __device__ __forceinline__ name ## _func(const name ## _func&) {} \
    }; \
    template <> struct name ## _func<double> : unary_function<double, double> \
    { \
@@ -364,6 +468,8 @@ namespace cv { namespace gpu { namespace device
        { \
            return func(v); \
        } \
+       __device__ __forceinline__ name ## _func() {} \
+       __device__ __forceinline__ name ## _func(const name ## _func&) {} \
    };

    #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \
@@ -382,7 +488,6 @@ namespace cv { namespace gpu { namespace device
        } \
    };

-   OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs, ::fabs)
    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)
    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp, ::exp)
    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2)
......
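The new abs_func family replaces the removed fabs functor and is specialized per type, so unsigned inputs pass through unchanged while signed and floating-point inputs go through ::abs/::fabsf/::fabs. A hedged sketch of using it with the existing transform() helper (absImage is an illustrative name):

    // Hedged usage sketch: element-wise absolute value with the functors above.
    #include "opencv2/gpu/device/functional.hpp"
    #include "opencv2/gpu/device/transform.hpp"

    void absImage(cv::gpu::PtrStepSz<short> src, cv::gpu::PtrStepSz<short> dst, cudaStream_t stream)
    {
        using namespace cv::gpu::device;
        // abs_func<short> forwards to ::abs; abs_func<ushort> would be the identity.
        transform(src, dst, abs_func<short>(), WithOutMask(), stream);
    }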
...@@ -58,35 +58,47 @@ namespace cv { namespace gpu { namespace device ...@@ -58,35 +58,47 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v) template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
{ {
return (uchar) ::max((int)v, 0); uint res = 0;
int vi = v;
asm("cvt.sat.u8.s8 %0, %1;" : "=r"(res) : "r"(vi));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
{
uint res = 0;
asm("cvt.sat.u8.s16 %0, %1;" : "=r"(res) : "h"(v));
return res;
} }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v) template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
{ {
return (uchar) ::min((uint)v, (uint)UCHAR_MAX); uint res = 0;
asm("cvt.sat.u8.u16 %0, %1;" : "=r"(res) : "h"(v));
return res;
} }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v) template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
{ {
return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); uint res = 0;
asm("cvt.sat.u8.s32 %0, %1;" : "=r"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v) template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
{ {
return (uchar) ::min(v, (uint)UCHAR_MAX); uint res = 0;
asm("cvt.sat.u8.u32 %0, %1;" : "=r"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
{
return saturate_cast<uchar>((uint)v);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v) template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
{ {
int iv = __float2int_rn(v); uint res = 0;
return saturate_cast<uchar>(iv); asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(res) : "f"(v));
return res;
} }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v) template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
{ {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); uint res = 0;
return saturate_cast<uchar>(iv); asm("cvt.rni.sat.u8.f64 %0, %1;" : "=r"(res) : "d"(v));
return res;
#else #else
return saturate_cast<uchar>((float)v); return saturate_cast<uchar>((float)v);
#endif #endif
...@@ -94,35 +106,47 @@ namespace cv { namespace gpu { namespace device ...@@ -94,35 +106,47 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v) template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
{ {
return (schar) ::min((int)v, SCHAR_MAX); uint res = 0;
uint vi = v;
asm("cvt.sat.s8.u8 %0, %1;" : "=r"(res) : "r"(vi));
return res;
} }
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v) template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
{ {
return (schar) ::min((uint)v, (uint)SCHAR_MAX); uint res = 0;
asm("cvt.sat.s8.s16 %0, %1;" : "=r"(res) : "h"(v));
return res;
} }
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v) template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
{ {
return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); uint res = 0;
asm("cvt.sat.s8.u16 %0, %1;" : "=r"(res) : "h"(v));
return res;
} }
template<> __device__ __forceinline__ schar saturate_cast<schar>(short v) template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
{ {
return saturate_cast<schar>((int)v); uint res = 0;
asm("cvt.sat.s8.s32 %0, %1;" : "=r"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v) template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
{ {
return (schar) ::min(v, (uint)SCHAR_MAX); uint res = 0;
asm("cvt.sat.s8.u32 %0, %1;" : "=r"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ schar saturate_cast<schar>(float v) template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
{ {
int iv = __float2int_rn(v); uint res = 0;
return saturate_cast<schar>(iv); asm("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(res) : "f"(v));
return res;
} }
template<> __device__ __forceinline__ schar saturate_cast<schar>(double v) template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
{ {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); uint res = 0;
return saturate_cast<schar>(iv); asm("cvt.rni.sat.s8.f64 %0, %1;" : "=r"(res) : "d"(v));
return res;
#else #else
return saturate_cast<schar>((float)v); return saturate_cast<schar>((float)v);
#endif #endif
...@@ -130,30 +154,41 @@ namespace cv { namespace gpu { namespace device ...@@ -130,30 +154,41 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v) template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
{ {
return (ushort) ::max((int)v, 0); ushort res = 0;
int vi = v;
asm("cvt.sat.u16.s8 %0, %1;" : "=h"(res) : "r"(vi));
return res;
} }
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v) template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
{ {
return (ushort) ::max((int)v, 0); ushort res = 0;
asm("cvt.sat.u16.s16 %0, %1;" : "=h"(res) : "h"(v));
return res;
} }
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v) template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
{ {
return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); ushort res = 0;
asm("cvt.sat.u16.s32 %0, %1;" : "=h"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v) template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
{ {
return (ushort) ::min(v, (uint)USHRT_MAX); ushort res = 0;
asm("cvt.sat.u16.u32 %0, %1;" : "=h"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v) template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
{ {
int iv = __float2int_rn(v); ushort res = 0;
return saturate_cast<ushort>(iv); asm("cvt.rni.sat.u16.f32 %0, %1;" : "=h"(res) : "f"(v));
return res;
} }
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v) template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
{ {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); ushort res = 0;
return saturate_cast<ushort>(iv); asm("cvt.rni.sat.u16.f64 %0, %1;" : "=h"(res) : "d"(v));
return res;
#else #else
return saturate_cast<ushort>((float)v); return saturate_cast<ushort>((float)v);
#endif #endif
...@@ -161,31 +196,45 @@ namespace cv { namespace gpu { namespace device ...@@ -161,31 +196,45 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ short saturate_cast<short>(ushort v) template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
{ {
return (short) ::min((int)v, SHRT_MAX); short res = 0;
asm("cvt.sat.s16.u16 %0, %1;" : "=h"(res) : "h"(v));
return res;
} }
template<> __device__ __forceinline__ short saturate_cast<short>(int v) template<> __device__ __forceinline__ short saturate_cast<short>(int v)
{ {
return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); short res = 0;
asm("cvt.sat.s16.s32 %0, %1;" : "=h"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ short saturate_cast<short>(uint v) template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
{ {
return (short) ::min(v, (uint)SHRT_MAX); short res = 0;
asm("cvt.sat.s16.u32 %0, %1;" : "=h"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ short saturate_cast<short>(float v) template<> __device__ __forceinline__ short saturate_cast<short>(float v)
{ {
int iv = __float2int_rn(v); short res = 0;
return saturate_cast<short>(iv); asm("cvt.rni.sat.s16.f32 %0, %1;" : "=h"(res) : "f"(v));
return res;
} }
template<> __device__ __forceinline__ short saturate_cast<short>(double v) template<> __device__ __forceinline__ short saturate_cast<short>(double v)
{ {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); short res = 0;
return saturate_cast<short>(iv); asm("cvt.rni.sat.s16.f64 %0, %1;" : "=h"(res) : "d"(v));
return res;
#else #else
return saturate_cast<short>((float)v); return saturate_cast<short>((float)v);
#endif #endif
} }
template<> __device__ __forceinline__ int saturate_cast<int>(uint v)
{
int res = 0;
asm("cvt.sat.s32.u32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ int saturate_cast<int>(float v) template<> __device__ __forceinline__ int saturate_cast<int>(float v)
{ {
return __float2int_rn(v); return __float2int_rn(v);
...@@ -199,6 +248,25 @@ namespace cv { namespace gpu { namespace device ...@@ -199,6 +248,25 @@ namespace cv { namespace gpu { namespace device
#endif #endif
} }
template<> __device__ __forceinline__ uint saturate_cast<uint>(schar v)
{
uint res = 0;
int vi = v;
asm("cvt.sat.u32.s8 %0, %1;" : "=r"(res) : "r"(vi));
return res;
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(short v)
{
uint res = 0;
asm("cvt.sat.u32.s16 %0, %1;" : "=r"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(int v)
{
uint res = 0;
asm("cvt.sat.u32.s32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(float v) template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
{ {
return __float2uint_rn(v); return __float2uint_rn(v);
......
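Each specialization now maps to a single PTX cvt instruction: .sat clamps to the destination range and .rni rounds floating-point inputs to the nearest even integer, replacing the earlier min/max/branch code. Roughly, the float-to-uchar path behaves like this hedged host-side model (saturate_cast_u8_model is illustrative, not part of the library):

    // Hedged host-side model of cvt.rni.sat.u8.f32: round to nearest even, then clamp to [0, 255].
    // For illustration only; the device code above emits one PTX instruction instead.
    #include <cmath>

    static unsigned char saturate_cast_u8_model(float v)
    {
        float r = std::nearbyint(v);          // .rni rounding (default mode is nearest-even)
        if (r < 0.f)   return 0;              // .sat lower clamp
        if (r > 255.f) return 255;            // .sat upper clamp
        return static_cast<unsigned char>(r);
    }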
...@@ -45,7 +45,6 @@ ...@@ -45,7 +45,6 @@
#include "saturate_cast.hpp" #include "saturate_cast.hpp"
#include "datamov_utils.hpp" #include "datamov_utils.hpp"
#include "detail/reduction_detail.hpp"
namespace cv { namespace gpu { namespace device namespace cv { namespace gpu { namespace device
{ {
...@@ -156,29 +155,6 @@ namespace cv { namespace gpu { namespace device ...@@ -156,29 +155,6 @@ namespace cv { namespace gpu { namespace device
} }
}; };
///////////////////////////////////////////////////////////////////////////////
// Reduction
template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
}
template <int n, typename T, typename V, typename Pred>
__device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
}
template <int n, typename T, typename V1, typename V2, typename Pred>
__device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
}
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Solve linear system // Solve linear system
......
...@@ -43,7 +43,7 @@ ...@@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__ #ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_HPP__ #define __OPENCV_GPU_VEC_DISTANCE_HPP__
#include "utility.hpp" #include "reduce.hpp"
#include "functional.hpp" #include "functional.hpp"
#include "detail/vec_distance_detail.hpp" #include "detail/vec_distance_detail.hpp"
...@@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device ...@@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid) template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{ {
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>()); reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
} }
__device__ __forceinline__ operator int() const __device__ __forceinline__ operator int() const
...@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device ...@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid) template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{ {
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>()); reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
} }
__device__ __forceinline__ operator float() const __device__ __forceinline__ operator float() const
...@@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device ...@@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid) template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{ {
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>()); reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
} }
__device__ __forceinline__ operator float() const __device__ __forceinline__ operator float() const
...@@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device ...@@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid) template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{ {
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>()); reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
} }
__device__ __forceinline__ operator int() const __device__ __forceinline__ operator int() const
......
...@@ -280,7 +280,7 @@ namespace cv { namespace gpu { namespace device ...@@ -280,7 +280,7 @@ namespace cv { namespace gpu { namespace device
OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ! , logical_not) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ! , logical_not) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, max, maximum) \ OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, max, maximum) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, min, minimum) \ OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, min, minimum) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, fabs, fabs_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, abs, abs_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sqrt, sqrt_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sqrt, sqrt_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp, exp_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp, exp_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp2, exp2_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp2, exp2_func) \
......
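With the functor renamed, the generated vector overload is abs() rather than fabs(); per the macro line above it applies abs_func to every channel. A rough device-side sketch, assuming the generated overload's usual shape (exampleAbs is illustrative):

    // Hedged sketch of the per-channel abs() overload generated by the macro above.
    #include "opencv2/gpu/device/vec_math.hpp"

    __device__ float2 exampleAbs(float2 v)
    {
        // Each channel goes through abs_func<float>, i.e. ::fabsf.
        return cv::gpu::device::abs(v);
    }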
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__
#define __OPENCV_GPU_WARP_SHUFFLE_HPP__
namespace cv { namespace gpu { namespace device
{
template <typename T>
__device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
return __shfl(val, srcLane, width);
#else
return T();
#endif
}
__device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
return (unsigned int) __shfl((int) val, srcLane, width);
#else
return 0;
#endif
}
__device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl(lo, srcLane, width);
hi = __shfl(hi, srcLane, width);
return __hiloint2double(hi, lo);
#else
return 0.0;
#endif
}
template <typename T>
__device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
return __shfl_down(val, delta, width);
#else
return T();
#endif
}
__device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
return (unsigned int) __shfl_down((int) val, delta, width);
#else
return 0;
#endif
}
__device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl_down(lo, delta, width);
hi = __shfl_down(hi, delta, width);
return __hiloint2double(hi, lo);
#else
return 0.0;
#endif
}
template <typename T>
__device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
return __shfl_up(val, delta, width);
#else
return T();
#endif
}
__device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
return (unsigned int) __shfl_up((int) val, delta, width);
#else
return 0;
#endif
}
__device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl_up(lo, delta, width);
hi = __shfl_up(hi, delta, width);
return __hiloint2double(hi, lo);
#else
return 0.0;
#endif
}
}}}
#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__
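These wrappers compile to the plain __shfl* intrinsics on compute capability 3.0+ and to harmless stubs elsewhere, which lets one kernel source keep a shared-memory fallback. A hedged sketch of the warp-level sum they enable (warpSum is an illustrative name):

    // Hedged sketch: warp-wide sum using shfl_down from this header.
    // On __CUDA_ARCH__ < 300 the wrapper returns 0, so real kernels guard this path
    // with deviceSupports(FEATURE_SET_COMPUTE_30) / __CUDA_ARCH__ checks, as elsewhere in this commit.
    #include "opencv2/gpu/device/warp_shuffle.hpp"

    __device__ float warpSum(float val)
    {
        for (int delta = 16; delta >= 1; delta /= 2)
            val += cv::gpu::device::shfl_down(val, delta);
        return val;   // lane 0 ends up holding the sum of all 32 lanes
    }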
@@ -792,31 +792,23 @@ private:
    GpuMat lab, l, ab;
};

-struct CV_EXPORTS CannyBuf;
-
-CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
-CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
-CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
-CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
-
struct CV_EXPORTS CannyBuf
{
-   CannyBuf() {}
-   explicit CannyBuf(const Size& image_size, int apperture_size = 3) {create(image_size, apperture_size);}
-   CannyBuf(const GpuMat& dx_, const GpuMat& dy_);
-
    void create(const Size& image_size, int apperture_size = 3);
    void release();

    GpuMat dx, dy;
-   GpuMat dx_buf, dy_buf;
-   GpuMat edgeBuf;
-   GpuMat trackBuf1, trackBuf2;
+   GpuMat mag;
+   GpuMat map;
+   GpuMat st1, st2;
    Ptr<FilterEngine_GPU> filterDX, filterDY;
};

+CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
+CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
+CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
+CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);

class CV_EXPORTS ImagePyramid
{
public:
@@ -1036,11 +1028,9 @@ CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels
//! Calculates histogram for 8u one channel image
//! Output hist will have one row, 256 cols and CV32SC1 type.
CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null());
-CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());

//! normalizes the grayscale image brightness and contrast by normalizing its histogram
CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
-CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream = Stream::Null());
CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());

//////////////////////////////// StereoBM_GPU ////////////////////////////////
...
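The reworked CannyBuf drops its explicit constructors and renames the intermediate buffers (mag, map, st1/st2). Callers that reuse a buffer across frames would look roughly like this hedged sketch (detectEdges and the threshold values are illustrative):

    // Hedged usage sketch: reusing CannyBuf across calls to avoid reallocations.
    #include "opencv2/gpu/gpu.hpp"

    void detectEdges(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat& edges, cv::gpu::CannyBuf& buf)
    {
        // Passing the same buf each call keeps dx/dy/mag/map/st1/st2 allocated between frames.
        cv::gpu::Canny(frame, buf, edges, /*low_thresh*/ 50.0, /*high_thresh*/ 100.0);
    }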
@@ -581,13 +581,12 @@ PERF_TEST_P(Sz, ImgProc_CalcHist, GPU_TYPICAL_MAT_SIZES)
    {
        cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat d_hist;
-       cv::gpu::GpuMat d_buf;

-       cv::gpu::calcHist(d_src, d_hist, d_buf);
+       cv::gpu::calcHist(d_src, d_hist);

        TEST_CYCLE()
        {
-           cv::gpu::calcHist(d_src, d_hist, d_buf);
+           cv::gpu::calcHist(d_src, d_hist);
        }

        GPU_SANITY_CHECK(d_hist);
...
...@@ -42,10 +42,13 @@ ...@@ -42,10 +42,13 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include "internal_shared.hpp" #include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp" #include "opencv2/gpu/device/datamov_utils.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"
namespace cv { namespace gpu { namespace device namespace cv { namespace gpu { namespace device
{ {
...@@ -59,6 +62,45 @@ namespace cv { namespace gpu { namespace device ...@@ -59,6 +62,45 @@ namespace cv { namespace gpu { namespace device
int& bestTrainIdx1, int& bestTrainIdx2, int& bestTrainIdx1, int& bestTrainIdx2,
float* s_distance, int* s_trainIdx) float* s_distance, int* s_trainIdx)
{ {
#if __CUDA_ARCH__ >= 300
(void) s_distance;
(void) s_trainIdx;
float d1, d2;
int i1, i2;
#pragma unroll
for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
{
d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
if (bestDistance1 < d1)
{
if (d1 < bestDistance2)
{
bestDistance2 = d1;
bestTrainIdx2 = i1;
}
}
else
{
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;
bestDistance1 = d1;
bestTrainIdx1 = i1;
if (d2 < bestDistance2)
{
bestDistance2 = d2;
bestTrainIdx2 = i2;
}
}
}
#else
float myBestDistance1 = numeric_limits<float>::max(); float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max(); float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1; int myBestTrainIdx1 = -1;
...@@ -122,6 +164,7 @@ namespace cv { namespace gpu { namespace device ...@@ -122,6 +164,7 @@ namespace cv { namespace gpu { namespace device
bestTrainIdx1 = myBestTrainIdx1; bestTrainIdx1 = myBestTrainIdx1;
bestTrainIdx2 = myBestTrainIdx2; bestTrainIdx2 = myBestTrainIdx2;
#endif
} }
template <int BLOCK_SIZE> template <int BLOCK_SIZE>
...@@ -130,6 +173,53 @@ namespace cv { namespace gpu { namespace device ...@@ -130,6 +173,53 @@ namespace cv { namespace gpu { namespace device
int& bestImgIdx1, int& bestImgIdx2, int& bestImgIdx1, int& bestImgIdx2,
float* s_distance, int* s_trainIdx, int* s_imgIdx) float* s_distance, int* s_trainIdx, int* s_imgIdx)
{ {
#if __CUDA_ARCH__ >= 300
(void) s_distance;
(void) s_trainIdx;
(void) s_imgIdx;
float d1, d2;
int i1, i2;
int j1, j2;
#pragma unroll
for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
{
d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE);
j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE);
if (bestDistance1 < d1)
{
if (d1 < bestDistance2)
{
bestDistance2 = d1;
bestTrainIdx2 = i1;
bestImgIdx2 = j1;
}
}
else
{
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;
bestImgIdx2 = bestImgIdx1;
bestDistance1 = d1;
bestTrainIdx1 = i1;
bestImgIdx1 = j1;
if (d2 < bestDistance2)
{
bestDistance2 = d2;
bestTrainIdx2 = i2;
bestImgIdx2 = j2;
}
}
}
#else
float myBestDistance1 = numeric_limits<float>::max(); float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max(); float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1; int myBestTrainIdx1 = -1;
...@@ -205,6 +295,7 @@ namespace cv { namespace gpu { namespace device ...@@ -205,6 +295,7 @@ namespace cv { namespace gpu { namespace device
bestImgIdx1 = myBestImgIdx1; bestImgIdx1 = myBestImgIdx1;
bestImgIdx2 = myBestImgIdx2; bestImgIdx2 = myBestImgIdx2;
#endif
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
...@@ -1005,7 +1096,7 @@ namespace cv { namespace gpu { namespace device ...@@ -1005,7 +1096,7 @@ namespace cv { namespace gpu { namespace device
s_trainIdx[threadIdx.x] = bestIdx; s_trainIdx[threadIdx.x] = bestIdx;
__syncthreads(); __syncthreads();
reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>()); reduceKeyVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<float>());
if (threadIdx.x == 0) if (threadIdx.x == 0)
{ {
......
...@@ -42,7 +42,9 @@ ...@@ -42,7 +42,9 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include "internal_shared.hpp" #include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp" #include "opencv2/gpu/device/datamov_utils.hpp"
...@@ -60,12 +62,7 @@ namespace cv { namespace gpu { namespace device ...@@ -60,12 +62,7 @@ namespace cv { namespace gpu { namespace device
s_distance += threadIdx.y * BLOCK_SIZE; s_distance += threadIdx.y * BLOCK_SIZE;
s_trainIdx += threadIdx.y * BLOCK_SIZE; s_trainIdx += threadIdx.y * BLOCK_SIZE;
s_distance[threadIdx.x] = bestDistance; reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<float>());
s_trainIdx[threadIdx.x] = bestTrainIdx;
__syncthreads();
reducePredVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<volatile float>());
} }
template <int BLOCK_SIZE> template <int BLOCK_SIZE>
...@@ -75,13 +72,7 @@ namespace cv { namespace gpu { namespace device ...@@ -75,13 +72,7 @@ namespace cv { namespace gpu { namespace device
s_trainIdx += threadIdx.y * BLOCK_SIZE; s_trainIdx += threadIdx.y * BLOCK_SIZE;
s_imgIdx += threadIdx.y * BLOCK_SIZE; s_imgIdx += threadIdx.y * BLOCK_SIZE;
s_distance[threadIdx.x] = bestDistance; reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less<float>());
s_trainIdx[threadIdx.x] = bestTrainIdx;
s_imgIdx [threadIdx.x] = bestImgIdx;
__syncthreads();
reducePredVal2<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less<volatile float>());
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
......
...@@ -42,7 +42,8 @@ ...@@ -42,7 +42,8 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include "internal_shared.hpp" #include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp" #include "opencv2/gpu/device/datamov_utils.hpp"
...@@ -58,8 +59,6 @@ namespace cv { namespace gpu { namespace device ...@@ -58,8 +59,6 @@ namespace cv { namespace gpu { namespace device
__global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask, __global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{ {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
extern __shared__ int smem[]; extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
...@@ -110,8 +109,6 @@ namespace cv { namespace gpu { namespace device ...@@ -110,8 +109,6 @@ namespace cv { namespace gpu { namespace device
bestDistance.ptr(queryIdx)[ind] = distVal; bestDistance.ptr(queryIdx)[ind] = distVal;
} }
} }
#endif
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
...@@ -170,8 +167,6 @@ namespace cv { namespace gpu { namespace device ...@@ -170,8 +167,6 @@ namespace cv { namespace gpu { namespace device
__global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask, __global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{ {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
extern __shared__ int smem[]; extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
...@@ -221,8 +216,6 @@ namespace cv { namespace gpu { namespace device ...@@ -221,8 +216,6 @@ namespace cv { namespace gpu { namespace device
bestDistance.ptr(queryIdx)[ind] = distVal; bestDistance.ptr(queryIdx)[ind] = distVal;
} }
} }
#endif
} }
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
......
...@@ -42,9 +42,10 @@ ...@@ -42,9 +42,10 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include "internal_shared.hpp" #include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp" #include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/reduce.hpp"
namespace cv { namespace gpu { namespace device namespace cv { namespace gpu { namespace device
{ {
...@@ -66,6 +67,8 @@ namespace cv { namespace gpu { namespace device ...@@ -66,6 +67,8 @@ namespace cv { namespace gpu { namespace device
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y, crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z); crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
} }
__device__ __forceinline__ TransformOp() {}
__device__ __forceinline__ TransformOp(const TransformOp&) {}
}; };
void call(const PtrStepSz<float3> src, const float* rot, void call(const PtrStepSz<float3> src, const float* rot,
...@@ -103,6 +106,8 @@ namespace cv { namespace gpu { namespace device ...@@ -103,6 +106,8 @@ namespace cv { namespace gpu { namespace device
(cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z, (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
(cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z); (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
} }
__device__ __forceinline__ ProjectOp() {}
__device__ __forceinline__ ProjectOp(const ProjectOp&) {}
}; };
void call(const PtrStepSz<float3> src, const float* rot, void call(const PtrStepSz<float3> src, const float* rot,
...@@ -134,6 +139,7 @@ namespace cv { namespace gpu { namespace device ...@@ -134,6 +139,7 @@ namespace cv { namespace gpu { namespace device
return x * x; return x * x;
} }
template <int BLOCK_SIZE>
__global__ void computeHypothesisScoresKernel( __global__ void computeHypothesisScoresKernel(
const int num_points, const float3* object, const float2* image, const int num_points, const float3* object, const float2* image,
const float dist_threshold, int* g_num_inliers) const float dist_threshold, int* g_num_inliers)
...@@ -156,19 +162,11 @@ namespace cv { namespace gpu { namespace device ...@@ -156,19 +162,11 @@ namespace cv { namespace gpu { namespace device
++num_inliers; ++num_inliers;
} }
extern __shared__ float s_num_inliers[]; __shared__ int s_num_inliers[BLOCK_SIZE];
s_num_inliers[threadIdx.x] = num_inliers; reduce<BLOCK_SIZE>(s_num_inliers, num_inliers, threadIdx.x, plus<int>());
__syncthreads();
for (int step = blockDim.x / 2; step > 0; step >>= 1)
{
if (threadIdx.x < step)
s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
__syncthreads();
}
if (threadIdx.x == 0) if (threadIdx.x == 0)
g_num_inliers[blockIdx.x] = s_num_inliers[0]; g_num_inliers[blockIdx.x] = num_inliers;
} }
void computeHypothesisScores( void computeHypothesisScores(
...@@ -181,9 +179,8 @@ namespace cv { namespace gpu { namespace device ...@@ -181,9 +179,8 @@ namespace cv { namespace gpu { namespace device
dim3 threads(256); dim3 threads(256);
dim3 grid(num_hypotheses); dim3 grid(num_hypotheses);
int smem_size = threads.x * sizeof(float);
computeHypothesisScoresKernel<<<grid, threads, smem_size>>>( computeHypothesisScoresKernel<256><<<grid, threads>>>(
num_points, object, image, dist_threshold, hypothesis_scores); num_points, object, image, dist_threshold, hypothesis_scores);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
......
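// Illustrative sketch (not from this commit): a minimal, self-contained version of the
// block-wide sum that the new reduce<BLOCK_SIZE>() call above performs. Every thread
// contributes its private inlier count and thread 0 ends up with the block total. The
// kernel name, the isInlier layout and the one-block-per-hypothesis mapping are
// assumptions made for this sketch; BLOCK_SIZE must be a power of two equal to blockDim.x.
#include <cuda_runtime.h>

template <int BLOCK_SIZE>
__device__ int blockSum(int val, int* smem)
{
    const int tid = threadIdx.x;
    smem[tid] = val;
    __syncthreads();

    // Classic shared-memory tree reduction.
    for (int offset = BLOCK_SIZE / 2; offset > 0; offset >>= 1)
    {
        if (tid < offset)
            smem[tid] += smem[tid + offset];
        __syncthreads();
    }
    return smem[0];
}

template <int BLOCK_SIZE>
__global__ void countInliers(const unsigned char* isInlier, int numPoints, int* blockTotals)
{
    __shared__ int smem[BLOCK_SIZE];

    // One block per hypothesis; each thread strides over the points.
    int count = 0;
    for (int i = threadIdx.x; i < numPoints; i += BLOCK_SIZE)
        count += isInlier[blockIdx.x * numPoints + i];

    const int total = blockSum<BLOCK_SIZE>(count, smem);

    if (threadIdx.x == 0)
        blockTotals[blockIdx.x] = total;
}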
...@@ -46,6 +46,8 @@ ...@@ -46,6 +46,8 @@
#include "opencv2/gpu/device/vec_math.hpp" #include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/utility.hpp" #include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "fgd_bgfg_common.hpp" #include "fgd_bgfg_common.hpp"
using namespace cv::gpu; using namespace cv::gpu;
...@@ -181,57 +183,8 @@ namespace bgfg ...@@ -181,57 +183,8 @@ namespace bgfg
__shared__ unsigned int data1[MERGE_THREADBLOCK_SIZE]; __shared__ unsigned int data1[MERGE_THREADBLOCK_SIZE];
__shared__ unsigned int data2[MERGE_THREADBLOCK_SIZE]; __shared__ unsigned int data2[MERGE_THREADBLOCK_SIZE];
data0[threadIdx.x] = sum0; plus<unsigned int> op;
data1[threadIdx.x] = sum1; reduce<MERGE_THREADBLOCK_SIZE>(smem_tuple(data0, data1, data2), thrust::tie(sum0, sum1, sum2), threadIdx.x, thrust::make_tuple(op, op, op));
data2[threadIdx.x] = sum2;
__syncthreads();
if (threadIdx.x < 128)
{
data0[threadIdx.x] = sum0 += data0[threadIdx.x + 128];
data1[threadIdx.x] = sum1 += data1[threadIdx.x + 128];
data2[threadIdx.x] = sum2 += data2[threadIdx.x + 128];
}
__syncthreads();
if (threadIdx.x < 64)
{
data0[threadIdx.x] = sum0 += data0[threadIdx.x + 64];
data1[threadIdx.x] = sum1 += data1[threadIdx.x + 64];
data2[threadIdx.x] = sum2 += data2[threadIdx.x + 64];
}
__syncthreads();
if (threadIdx.x < 32)
{
volatile unsigned int* vdata0 = data0;
volatile unsigned int* vdata1 = data1;
volatile unsigned int* vdata2 = data2;
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 32];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 32];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 32];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 16];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 16];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 16];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 8];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 8];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 8];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 4];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 4];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 4];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 2];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 2];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 2];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 1];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 1];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 1];
}
if(threadIdx.x == 0) if(threadIdx.x == 0)
{ {
......
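// Usage pattern behind the rewritten merge step above, shown as a free-standing sketch:
// bind one shared array per accumulator with smem_tuple(), tie the registers with
// thrust::tie(), and pass a matching tuple of binary operators so all three sums are
// reduced in a single pass. The reduce/smem_tuple/plus calls are taken verbatim from
// this commit; the helper name mergeSums and the standalone form are assumptions, and
// the branch's device headers are assumed to be on the include path.
#include <thrust/tuple.h>
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"

template <int BLOCK_SIZE>
__device__ void mergeSums(unsigned int& sum0, unsigned int& sum1, unsigned int& sum2)
{
    using namespace cv::gpu::device;

    __shared__ unsigned int data0[BLOCK_SIZE];
    __shared__ unsigned int data1[BLOCK_SIZE];
    __shared__ unsigned int data2[BLOCK_SIZE];

    plus<unsigned int> op;
    reduce<BLOCK_SIZE>(smem_tuple(data0, data1, data2),
                       thrust::tie(sum0, sum1, sum2),
                       threadIdx.x,
                       thrust::make_tuple(op, op, op));

    // Thread 0 now holds the block-wide totals in sum0 / sum1 / sum2.
}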
...@@ -43,182 +43,115 @@ ...@@ -43,182 +43,115 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include "internal_shared.hpp" #include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp" #include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/transform.hpp"
namespace cv { namespace gpu { namespace device using namespace cv::gpu;
{ using namespace cv::gpu::device;
#define UINT_BITS 32U
//Warps == subhistograms per threadblock
#define WARP_COUNT 6
//Threadblock size
#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
#define HISTOGRAM256_BIN_COUNT 256
//Shared memory per threadblock
#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
#define PARTIAL_HISTOGRAM256_COUNT 240
#define MERGE_THREADBLOCK_SIZE 256
#define USE_SMEM_ATOMICS (defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120)) namespace
{
namespace hist __global__ void histogram256(const uchar* src, int cols, int rows, size_t step, int* hist)
{ {
#if (!USE_SMEM_ATOMICS) __shared__ int shist[256];
#define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )
__forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag) const int y = blockIdx.x * blockDim.y + threadIdx.y;
{ const int tid = threadIdx.y * blockDim.x + threadIdx.x;
uint count;
do
{
count = s_WarpHist[data] & TAG_MASK;
count = threadTag | (count + 1);
s_WarpHist[data] = count;
} while (s_WarpHist[data] != count);
}
#else shist[tid] = 0;
__syncthreads();
#define TAG_MASK 0xFFFFFFFFU
__forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag) if (y < rows)
{ {
atomicAdd(s_WarpHist + data, 1); const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
}
#endif const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
{ {
uint x = pos_x << 2; unsigned int data = rowPtr[x];
if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag); Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag); Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag); Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag); Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
} }
__global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols) if (cols % 4 != 0 && threadIdx.x == 0)
{ {
//Per-warp subhistogram storage for (int x = cols_4 * 4; x < cols; ++x)
__shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
//Clear shared memory storage for current threadblock before processing
#pragma unroll
for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;
//Cycle through the entire data set, update subhistograms for each warp
const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);
__syncthreads();
const uint colsui = d_Data.step / sizeof(uint);
for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)
{ {
uint pos_y = pos / colsui; unsigned int data = ((const uchar*)rowPtr)[x];
uint pos_x = pos % colsui; Emulation::smem::atomicAdd(&shist[data], 1);
uint data = d_Data.ptr(pos_y)[pos_x];
addWord(s_WarpHist, data, tag, pos_x, cols);
} }
//Merge per-warp histograms into per-block and write to global memory
__syncthreads();
for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)
{
uint sum = 0;
for (uint i = 0; i < WARP_COUNT; i++)
sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;
d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
} }
} }
////////////////////////////////////////////////////////////////////////////////
// Merge histogram256() output
// Run one threadblock per bin; each threadblock adds up the same bin counter
// from every partial histogram. Reads are uncoalesced, but mergeHistogram256
// takes only a fraction of total processing time
////////////////////////////////////////////////////////////////////////////////
__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
{
uint sum = 0;
#pragma unroll
for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)
sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];
__shared__ uint data[MERGE_THREADBLOCK_SIZE];
data[threadIdx.x] = sum;
for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)
{
__syncthreads(); __syncthreads();
if(threadIdx.x < stride)
data[threadIdx.x] += data[threadIdx.x + stride];
}
if(threadIdx.x == 0) const int histVal = shist[tid];
d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]); if (histVal > 0)
::atomicAdd(hist + tid, histVal);
} }
}
void histogram256_gpu(PtrStepSzb src, int* hist, uint* buf, cudaStream_t stream) namespace hist
{
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
{ {
histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>( const dim3 block(32, 8);
PtrStepSz<uint>(src), const dim3 grid(divUp(src.rows, block.y));
buf,
static_cast<uint>(src.rows * src.step / sizeof(uint)),
src.cols);
cudaSafeCall( cudaGetLastError() );
mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);
::histogram256<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
}
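// Illustrative sketch (not from this commit): the same idea as the rewritten kernel
// above, written without the OpenCV wrappers. Each block builds a 256-bin sub-histogram
// in shared memory with atomics, reading the row four packed bytes at a time, then
// merges it into the global histogram. It assumes a 256-thread block (e.g. dim3(32, 8)
// with one block per 8 rows), a 4-byte-aligned pitch, and hardware shared-memory
// atomics (the commit uses an Emulation wrapper to cover older architectures).
#include <cuda_runtime.h>

__global__ void hist256(const unsigned char* src, int cols, int rows, size_t step, int* hist)
{
    __shared__ int shist[256];

    const int y   = blockIdx.x * blockDim.y + threadIdx.y;
    const int tid = threadIdx.y * blockDim.x + threadIdx.x;

    shist[tid] = 0;                       // one bin per thread
    __syncthreads();

    if (y < rows)
    {
        const unsigned int* row = (const unsigned int*)(src + y * step);

        for (int x = threadIdx.x; x < cols / 4; x += blockDim.x)
        {
            const unsigned int data = row[x];
            atomicAdd(&shist[(data >>  0) & 0xFFU], 1);
            atomicAdd(&shist[(data >>  8) & 0xFFU], 1);
            atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
            atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
        }

        // Tail pixels when the row width is not a multiple of 4.
        if (threadIdx.x == 0)
            for (int x = (cols / 4) * 4; x < cols; ++x)
                atomicAdd(&shist[src[y * step + x]], 1);
    }

    __syncthreads();

    // Merge the per-block sub-histogram into the global one.
    if (shist[tid] > 0)
        atomicAdd(hist + tid, shist[tid]);
}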
/////////////////////////////////////////////////////////////////////////
namespace
{
__constant__ int c_lut[256]; __constant__ int c_lut[256];
__global__ void equalizeHist(const PtrStepSzb src, PtrStepb dst) struct EqualizeHist : unary_function<uchar, uchar>
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; float scale;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
__host__ EqualizeHist(float _scale) : scale(_scale) {}
if (x < src.cols && y < src.rows) __device__ __forceinline__ uchar operator ()(uchar val) const
{ {
const uchar val = src.ptr(y)[x];
const int lut = c_lut[val]; const int lut = c_lut[val];
dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut); return __float2int_rn(scale * lut);
}
} }
};
}
void equalizeHist_gpu(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream) namespace cv { namespace gpu { namespace device
{
template <> struct TransformFunctorTraits<EqualizeHist> : DefaultTransformFunctorTraits<EqualizeHist>
{ {
dim3 block(16, 16); enum { smart_shift = 4 };
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); };
}}}
namespace hist
{
void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
{
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) ); cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) );
equalizeHist<<<grid, block, 0, stream>>>(src, dst); const float scale = 255.0f / (src.cols * src.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0) transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
cudaSafeCall( cudaDeviceSynchronize() );
} }
} // namespace hist }
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */ #endif /* CUDA_DISABLER */
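// Illustrative sketch (not from this commit): the per-pixel work that the EqualizeHist
// functor above performs, written as a plain kernel for clarity. A LUT in constant
// memory and a precomputed scale = 255 / (cols * rows) map each input value to its
// equalized output; the commit instead wraps the functor into the generic transform()
// machinery. The symbol and kernel names here are assumptions.
#include <cuda_runtime.h>

__constant__ int c_lutSketch[256];

__global__ void equalizeSketch(const unsigned char* src, unsigned char* dst,
                               int cols, int rows, size_t step, float scale)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < cols && y < rows)
    {
        const int lut = c_lutSketch[src[y * step + x]];
        dst[y * step + x] = (unsigned char)__float2int_rn(scale * lut);
    }
}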
...@@ -42,7 +42,10 @@ ...@@ -42,7 +42,10 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include "internal_shared.hpp" #include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"
namespace cv { namespace gpu { namespace device namespace cv { namespace gpu { namespace device
{ {
...@@ -226,29 +229,30 @@ namespace cv { namespace gpu { namespace device ...@@ -226,29 +229,30 @@ namespace cv { namespace gpu { namespace device
template<int size> template<int size>
__device__ float reduce_smem(volatile float* smem) __device__ float reduce_smem(float* smem, float val)
{ {
unsigned int tid = threadIdx.x; unsigned int tid = threadIdx.x;
float sum = smem[tid]; float sum = val;
if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; __syncthreads(); } reduce<size>(smem, sum, tid, plus<float>());
if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; __syncthreads(); }
if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; __syncthreads(); }
if (tid < 32) if (size == 32)
{ {
if (size >= 64) smem[tid] = sum = sum + smem[tid + 32]; #if __CUDA_ARCH__ >= 300
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16]; return shfl(sum, 0);
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8]; #else
if (size >= 8) smem[tid] = sum = sum + smem[tid + 4]; return smem[0];
if (size >= 4) smem[tid] = sum = sum + smem[tid + 2]; #endif
if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
} }
#if __CUDA_ARCH__ >= 300
if (threadIdx.x == 0)
smem[0] = sum;
#endif
__syncthreads(); __syncthreads();
sum = smem[0];
return sum; return smem[0];
} }
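// Illustrative sketch (not from this commit) of the size == 32 fast path above, written
// against the CUDA >= 9 *_sync intrinsics (the module itself targets the older __shfl
// forms through its shfl() wrapper). When the reduction fits in one warp, the sum can be
// combined and broadcast with shuffles, with no shared memory or __syncthreads() at all.
// Assumes all 32 lanes of the warp are active.
#include <cuda_runtime.h>

__device__ float warpSumBroadcast(float val)
{
    const unsigned int mask = 0xFFFFFFFFu;

    // Pairwise sums across the warp: after the loop, lane 0 holds the total.
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_down_sync(mask, val, offset);

    // Broadcast lane 0's total to every lane.
    return __shfl_sync(mask, val, 0);
}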
...@@ -272,19 +276,13 @@ namespace cv { namespace gpu { namespace device ...@@ -272,19 +276,13 @@ namespace cv { namespace gpu { namespace device
if (threadIdx.x < block_hist_size) if (threadIdx.x < block_hist_size)
elem = hist[0]; elem = hist[0];
squares[threadIdx.x] = elem * elem; float sum = reduce_smem<nthreads>(squares, elem * elem);
__syncthreads();
float sum = reduce_smem<nthreads>(squares);
float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size); float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size);
elem = ::min(elem * scale, threshold); elem = ::min(elem * scale, threshold);
__syncthreads(); sum = reduce_smem<nthreads>(squares, elem * elem);
squares[threadIdx.x] = elem * elem;
__syncthreads();
sum = reduce_smem<nthreads>(squares);
scale = 1.0f / (::sqrtf(sum) + 1e-3f); scale = 1.0f / (::sqrtf(sum) + 1e-3f);
if (threadIdx.x < block_hist_size) if (threadIdx.x < block_hist_size)
...@@ -355,40 +353,11 @@ namespace cv { namespace gpu { namespace device ...@@ -355,40 +353,11 @@ namespace cv { namespace gpu { namespace device
__shared__ float products[nthreads * nblocks]; __shared__ float products[nthreads * nblocks];
const int tid = threadIdx.z * nthreads + threadIdx.x; const int tid = threadIdx.z * nthreads + threadIdx.x;
products[tid] = product;
__syncthreads();
if (nthreads >= 512)
{
if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
__syncthreads();
}
if (nthreads >= 256)
{
if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
__syncthreads();
}
if (nthreads >= 128)
{
if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
__syncthreads();
}
if (threadIdx.x < 32) reduce<nthreads>(products, product, tid, plus<float>());
{
volatile float* smem = products;
if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
}
if (threadIdx.x == 0) if (threadIdx.x == 0)
confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = product + free_coef;
= (float)(product + free_coef);
} }
...@@ -446,36 +415,8 @@ namespace cv { namespace gpu { namespace device ...@@ -446,36 +415,8 @@ namespace cv { namespace gpu { namespace device
__shared__ float products[nthreads * nblocks]; __shared__ float products[nthreads * nblocks];
const int tid = threadIdx.z * nthreads + threadIdx.x; const int tid = threadIdx.z * nthreads + threadIdx.x;
products[tid] = product;
__syncthreads();
if (nthreads >= 512) reduce<nthreads>(products, product, tid, plus<float>());
{
if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
__syncthreads();
}
if (nthreads >= 256)
{
if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
__syncthreads();
}
if (nthreads >= 128)
{
if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
__syncthreads();
}
if (threadIdx.x < 32)
{
volatile float* smem = products;
if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
}
if (threadIdx.x == 0) if (threadIdx.x == 0)
labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold); labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold);
......
...@@ -43,11 +43,11 @@ ...@@ -43,11 +43,11 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include "internal_shared.hpp" #include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_traits.hpp" #include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp" #include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/block.hpp" #include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp" #include "opencv2/gpu/device/border_interpolate.hpp"
using namespace cv::gpu; using namespace cv::gpu;
...@@ -184,6 +184,85 @@ namespace cv { namespace gpu { namespace device ...@@ -184,6 +184,85 @@ namespace cv { namespace gpu { namespace device
{ {
namespace imgproc namespace imgproc
{ {
template <int cn> struct Unroll;
template <> struct Unroll<1>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&> tie(float& val1, float& val2)
{
return thrust::tie(val1, val2);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op);
}
};
template <> struct Unroll<2>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&> tie(float& val1, float2& val2)
{
return thrust::tie(val1, val2.x, val2.y);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op);
}
};
template <> struct Unroll<3>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&> tie(float& val1, float3& val2)
{
return thrust::tie(val1, val2.x, val2.y, val2.z);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op, op);
}
};
template <> struct Unroll<4>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE, smem + 4 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&, float&> tie(float& val1, float4& val2)
{
return thrust::tie(val1, val2.x, val2.y, val2.z, val2.w);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op, op, op);
}
};
__device__ __forceinline__ int calcDist(const uchar& a, const uchar& b) { return (a-b)*(a-b); } __device__ __forceinline__ int calcDist(const uchar& a, const uchar& b) { return (a-b)*(a-b); }
__device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); } __device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); }
__device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); } __device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); }
...@@ -340,30 +419,15 @@ namespace cv { namespace gpu { namespace device ...@@ -340,30 +419,15 @@ namespace cv { namespace gpu { namespace device
sum = sum + weight * saturate_cast<sum_type>(src(sy + y, sx + x)); sum = sum + weight * saturate_cast<sum_type>(src(sy + y, sx + x));
} }
volatile __shared__ float cta_buffer[CTA_SIZE]; __shared__ float cta_buffer[CTA_SIZE * (VecTraits<T>::cn + 1)];
int tid = threadIdx.x;
cta_buffer[tid] = weights_sum;
__syncthreads();
Block::reduce<CTA_SIZE>(cta_buffer, plus());
weights_sum = cta_buffer[0];
__syncthreads();
reduce<CTA_SIZE>(Unroll<VecTraits<T>::cn>::template smem_tuple<CTA_SIZE>(cta_buffer),
Unroll<VecTraits<T>::cn>::tie(weights_sum, sum),
threadIdx.x,
Unroll<VecTraits<T>::cn>::op());
for(int n = 0; n < VecTraits<T>::cn; ++n) if (threadIdx.x == 0)
{ dst = saturate_cast<T>(sum / weights_sum);
cta_buffer[tid] = reinterpret_cast<float*>(&sum)[n];
__syncthreads();
Block::reduce<CTA_SIZE>(cta_buffer, plus());
reinterpret_cast<float*>(&sum)[n] = cta_buffer[0];
__syncthreads();
}
if (tid == 0)
dst = saturate_cast<T>(sum/weights_sum);
} }
__device__ __forceinline__ void operator()(PtrStepSz<T>& dst) const __device__ __forceinline__ void operator()(PtrStepSz<T>& dst) const
......
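// Illustrative sketch (not from this commit) of what the Unroll<cn> specializations and
// the fused reduce() call above accomplish: for a cn-channel pixel, LANES == cn + 1
// running sums (the weight sum plus one per channel) are reduced in lock-step by a
// single shared-memory tree, matching the cta_buffer[CTA_SIZE * (cn + 1)] footprint in
// the kernel. The generic N-lane form and its names are assumptions; CTA_SIZE must be a
// power of two equal to blockDim.x and smem must hold CTA_SIZE * LANES floats.
#include <cuda_runtime.h>

template <int CTA_SIZE, int LANES>
__device__ void reduceLanes(float (&vals)[LANES], float* smem)
{
    const int tid = threadIdx.x;

    #pragma unroll
    for (int l = 0; l < LANES; ++l)
        smem[l * CTA_SIZE + tid] = vals[l];
    __syncthreads();

    // All lanes are reduced in lock-step by one shared-memory tree.
    for (int offset = CTA_SIZE / 2; offset > 0; offset >>= 1)
    {
        if (tid < offset)
        {
            #pragma unroll
            for (int l = 0; l < LANES; ++l)
                smem[l * CTA_SIZE + tid] += smem[l * CTA_SIZE + tid + offset];
        }
        __syncthreads();
    }

    #pragma unroll
    for (int l = 0; l < LANES; ++l)
        vals[l] = smem[l * CTA_SIZE];

    __syncthreads();   // make the buffer safe to reuse immediately
}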
...@@ -50,7 +50,7 @@ ...@@ -50,7 +50,7 @@
#include <thrust/sort.h> #include <thrust/sort.h>
#include "opencv2/gpu/device/common.hpp" #include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp" #include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp" #include "opencv2/gpu/device/functional.hpp"
namespace cv { namespace gpu { namespace device namespace cv { namespace gpu { namespace device
...@@ -75,9 +75,9 @@ namespace cv { namespace gpu { namespace device ...@@ -75,9 +75,9 @@ namespace cv { namespace gpu { namespace device
__global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k) __global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k)
{ {
__shared__ int smem[8 * 32]; __shared__ int smem0[8 * 32];
__shared__ int smem1[8 * 32];
volatile int* srow = smem + threadIdx.y * blockDim.x; __shared__ int smem2[8 * 32];
const int ptidx = blockIdx.x * blockDim.y + threadIdx.y; const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
...@@ -109,9 +109,12 @@ namespace cv { namespace gpu { namespace device ...@@ -109,9 +109,12 @@ namespace cv { namespace gpu { namespace device
c += Ix * Iy; c += Ix * Iy;
} }
reduce<32>(srow, a, threadIdx.x, plus<volatile int>()); int* srow0 = smem0 + threadIdx.y * blockDim.x;
reduce<32>(srow, b, threadIdx.x, plus<volatile int>()); int* srow1 = smem1 + threadIdx.y * blockDim.x;
reduce<32>(srow, c, threadIdx.x, plus<volatile int>()); int* srow2 = smem2 + threadIdx.y * blockDim.x;
plus<int> op;
reduce<32>(smem_tuple(srow0, srow1, srow2), thrust::tie(a, b, c), threadIdx.x, thrust::make_tuple(op, op, op));
if (threadIdx.x == 0) if (threadIdx.x == 0)
{ {
...@@ -151,9 +154,13 @@ namespace cv { namespace gpu { namespace device ...@@ -151,9 +154,13 @@ namespace cv { namespace gpu { namespace device
__global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k) __global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
{ {
__shared__ int smem[8 * 32]; __shared__ int smem0[8 * 32];
__shared__ int smem1[8 * 32];
int* srow0 = smem0 + threadIdx.y * blockDim.x;
int* srow1 = smem1 + threadIdx.y * blockDim.x;
volatile int* srow = smem + threadIdx.y * blockDim.x; plus<int> op;
const int ptidx = blockIdx.x * blockDim.y + threadIdx.y; const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
...@@ -167,7 +174,7 @@ namespace cv { namespace gpu { namespace device ...@@ -167,7 +174,7 @@ namespace cv { namespace gpu { namespace device
for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x) for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
m_10 += u * image(loc.y, loc.x + u); m_10 += u * image(loc.y, loc.x + u);
reduce<32>(srow, m_10, threadIdx.x, plus<volatile int>()); reduce<32>(srow0, m_10, threadIdx.x, op);
for (int v = 1; v <= half_k; ++v) for (int v = 1; v <= half_k; ++v)
{ {
...@@ -185,8 +192,7 @@ namespace cv { namespace gpu { namespace device ...@@ -185,8 +192,7 @@ namespace cv { namespace gpu { namespace device
m_sum += u * (val_plus + val_minus); m_sum += u * (val_plus + val_minus);
} }
reduce<32>(srow, v_sum, threadIdx.x, plus<volatile int>()); reduce<32>(smem_tuple(srow0, srow1), thrust::tie(v_sum, m_sum), threadIdx.x, thrust::make_tuple(op, op));
reduce<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
m_10 += m_sum; m_10 += m_sum;
m_01 += v * v_sum; m_01 += v * v_sum;
......
...@@ -65,6 +65,8 @@ ...@@ -65,6 +65,8 @@
#include "NPP_staging/NPP_staging.hpp" #include "NPP_staging/NPP_staging.hpp"
#include "NCVRuntimeTemplates.hpp" #include "NCVRuntimeTemplates.hpp"
#include "NCVHaarObjectDetection.hpp" #include "NCVHaarObjectDetection.hpp"
#include "opencv2/gpu/device/warp.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"
//============================================================================== //==============================================================================
...@@ -81,6 +83,20 @@ NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of th ...@@ -81,6 +83,20 @@ NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of th
//assuming size <= WARP_SIZE and size is power of 2 //assuming size <= WARP_SIZE and size is power of 2
__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data) __device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{ {
#if __CUDA_ARCH__ >= 300
const unsigned int laneId = cv::gpu::device::Warp::laneId();
// scan using shuffle intrinsics
#pragma unroll
for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
{
const Ncv32u n = cv::gpu::device::shfl_up(idata, i);
if (laneId >= i)
idata += n;
}
return idata;
#else
Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1)); Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
s_Data[pos] = 0; s_Data[pos] = 0;
pos += K_WARP_SIZE; pos += K_WARP_SIZE;
...@@ -93,6 +109,7 @@ __device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data) ...@@ -93,6 +109,7 @@ __device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)
s_Data[pos] += s_Data[pos - 16]; s_Data[pos] += s_Data[pos - 16];
return s_Data[pos]; return s_Data[pos];
#endif
} }
__device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data) __device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
......
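// Illustrative sketch (not from this commit) of the shuffle-based inclusive warp scan
// added above, written against the CUDA >= 9 *_sync intrinsics (the module uses its own
// shfl_up() wrapper over the older __shfl_up). Assumes a full 32-lane warp.
#include <cuda_runtime.h>

__device__ unsigned int warpScanInclusiveSketch(unsigned int idata)
{
    const unsigned int mask   = 0xFFFFFFFFu;
    const unsigned int laneId = threadIdx.x & 31u;

    // Hillis-Steele inclusive scan: lane k accumulates lanes k-1, k-2, k-4, ...
    #pragma unroll
    for (int i = 1; i <= 16; i *= 2)
    {
        const unsigned int n = __shfl_up_sync(mask, idata, i);
        if (laneId >= (unsigned int)i)
            idata += n;
    }
    return idata;   // lane k now holds the sum of lanes 0..k
}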