Commit d2591704 authored by Vladislav Vinogradov

Merge branch 'kepler-optimization' into cuda-dev

parents 22b0ea1c 2eca75cc
......@@ -79,6 +79,8 @@ namespace cv { namespace gpu
WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
};
CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
// Gives information about what GPU archs this OpenCV GPU module was
// compiled for
class CV_EXPORTS TargetArchs
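deviceSupports() is the intended entry point for runtime dispatch: it answers whether a feature can actually be used on the current device, while TargetArchs answers what the binary was built for. A minimal usage sketch; the two filter functions are hypothetical placeholders, not part of this commit:

    void runKeplerPath (const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst);   // hypothetical
    void runGenericPath(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst);   // hypothetical

    void runFilter(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst)
    {
        // Prefer the warp-shuffle implementation when both the binary and the
        // active GPU support compute capability 3.0.
        if (cv::gpu::deviceSupports(cv::gpu::WARP_SHUFFLE_FUNCTIONS))
            runKeplerPath(src, dst);
        else
            runGenericPath(src, dst);
    }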
......
......@@ -44,6 +44,7 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/type_traits.hpp"
namespace cv { namespace gpu { namespace device
{
......@@ -54,6 +55,7 @@ namespace cv { namespace gpu { namespace device
void writeScalar(const int*);
void writeScalar(const float*);
void writeScalar(const double*);
void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t);
}}}
......@@ -226,16 +228,16 @@ namespace cv { namespace gpu { namespace device
//////////////////////////////// ConvertTo ////////////////////////////////
///////////////////////////////////////////////////////////////////////////
template <typename T, typename D> struct Convertor : unary_function<T, D>
template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
{
Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}
Convertor(S alpha_, S beta_) : alpha(alpha_), beta(beta_) {}
__device__ __forceinline__ D operator()(const T& src) const
__device__ __forceinline__ D operator()(typename TypeTraits<T>::ParameterType src) const
{
return saturate_cast<D>(alpha * src + beta);
}
double alpha, beta;
S alpha, beta;
};
namespace detail
......@@ -282,16 +284,16 @@ namespace cv { namespace gpu { namespace device
};
}
template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
template <typename T, typename D, typename S> struct TransformFunctorTraits< Convertor<T, D, S> > : detail::ConvertTraits< Convertor<T, D, S> >
{
};
template<typename T, typename D>
template<typename T, typename D, typename S>
void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream)
{
cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
cudaSafeCall( cudaSetDoubleForDevice(&beta) );
Convertor<T, D> op(alpha, beta);
Convertor<T, D, S> op(static_cast<S>(alpha), static_cast<S>(beta));
cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
}
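The new template parameter S is the type the arithmetic is carried out in: float whenever it is precise enough for the source/destination pair, double otherwise. This avoids double-precision multiplies (slow on most consumer GPUs) for the common 8/16-bit conversions. A hand-instantiated sketch of what the dispatch table below selects for a uchar-to-short conversion; the types here are chosen for illustration:

    // cvt_<uchar, short, float>(src, dst, alpha, beta, stream) builds
    Convertor<uchar, short, float> op(static_cast<float>(alpha), static_cast<float>(beta));
    // and every thread evaluates saturate_cast<short>(op.alpha * src + op.beta) in float.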
......@@ -304,36 +306,74 @@ namespace cv { namespace gpu { namespace device
{
typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream);
static const caller_t tab[8][8] =
{
{cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},
{cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},
{cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},
{cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},
{cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},
{cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},
{cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},
{0,0,0,0,0,0,0,0}
static const caller_t tab[7][7] =
{
{
cvt_<uchar, uchar, float>,
cvt_<uchar, schar, float>,
cvt_<uchar, ushort, float>,
cvt_<uchar, short, float>,
cvt_<uchar, int, float>,
cvt_<uchar, float, float>,
cvt_<uchar, double, double>
},
{
cvt_<schar, uchar, float>,
cvt_<schar, schar, float>,
cvt_<schar, ushort, float>,
cvt_<schar, short, float>,
cvt_<schar, int, float>,
cvt_<schar, float, float>,
cvt_<schar, double, double>
},
{
cvt_<ushort, uchar, float>,
cvt_<ushort, schar, float>,
cvt_<ushort, ushort, float>,
cvt_<ushort, short, float>,
cvt_<ushort, int, float>,
cvt_<ushort, float, float>,
cvt_<ushort, double, double>
},
{
cvt_<short, uchar, float>,
cvt_<short, schar, float>,
cvt_<short, ushort, float>,
cvt_<short, short, float>,
cvt_<short, int, float>,
cvt_<short, float, float>,
cvt_<short, double, double>
},
{
cvt_<int, uchar, float>,
cvt_<int, schar, float>,
cvt_<int, ushort, float>,
cvt_<int, short, float>,
cvt_<int, int, double>,
cvt_<int, float, double>,
cvt_<int, double, double>
},
{
cvt_<float, uchar, float>,
cvt_<float, schar, float>,
cvt_<float, ushort, float>,
cvt_<float, short, float>,
cvt_<float, int, float>,
cvt_<float, float, float>,
cvt_<float, double, double>
},
{
cvt_<double, uchar, double>,
cvt_<double, schar, double>,
cvt_<double, ushort, double>,
cvt_<double, short, double>,
cvt_<double, int, double>,
cvt_<double, float, double>,
cvt_<double, double, double>
}
};
caller_t func = tab[sdepth][ddepth];
if (!func)
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__, "convert_gpu");
func(src, dst, alpha, beta, stream);
}
......
......@@ -69,33 +69,89 @@ using namespace cv::gpu;
namespace
{
// Compares value to set using the given comparator. Returns true if
// there is at least one element x in the set satisfying the
// predicate x cmp value.
template <typename Comparer>
bool compareToSet(const std::string& set_as_str, int value, Comparer cmp)
class CudaArch
{
public:
CudaArch();
bool builtWith(FeatureSet feature_set) const;
bool hasPtx(int major, int minor) const;
bool hasBin(int major, int minor) const;
bool hasEqualOrLessPtx(int major, int minor) const;
bool hasEqualOrGreaterPtx(int major, int minor) const;
bool hasEqualOrGreaterBin(int major, int minor) const;
private:
static void fromStr(const string& set_as_str, vector<int>& arr);
vector<int> bin;
vector<int> ptx;
vector<int> features;
};
const CudaArch cudaArch;
CudaArch::CudaArch()
{
#ifdef HAVE_CUDA
fromStr(CUDA_ARCH_BIN, bin);
fromStr(CUDA_ARCH_PTX, ptx);
fromStr(CUDA_ARCH_FEATURES, features);
#endif
}
bool CudaArch::builtWith(FeatureSet feature_set) const
{
return !features.empty() && (features.back() >= feature_set);
}
bool CudaArch::hasPtx(int major, int minor) const
{
return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
}
bool CudaArch::hasBin(int major, int minor) const
{
return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
}
bool CudaArch::hasEqualOrLessPtx(int major, int minor) const
{
return !ptx.empty() && (ptx.front() <= major * 10 + minor);
}
bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const
{
return !ptx.empty() && (ptx.back() >= major * 10 + minor);
}
bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const
{
return !bin.empty() && (bin.back() >= major * 10 + minor);
}
void CudaArch::fromStr(const string& set_as_str, vector<int>& arr)
{
if (set_as_str.find_first_not_of(" ") == string::npos)
return false;
return;
std::stringstream stream(set_as_str);
istringstream stream(set_as_str);
int cur_value;
while (!stream.eof())
{
stream >> cur_value;
if (cmp(cur_value, value))
return true;
arr.push_back(cur_value);
}
return false;
sort(arr.begin(), arr.end());
}
}
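fromStr() turns the space-separated architecture lists baked in at build time (CUDA_ARCH_BIN, CUDA_ARCH_PTX, CUDA_ARCH_FEATURES, each entry being major*10+minor) into sorted vectors, so the queries above reduce to find()/front()/back(). A standalone sketch of the same parsing; the helper name and the `while (stream >> v)` loop are mine, not this file's code:

    #include <algorithm>
    #include <sstream>
    #include <string>
    #include <vector>

    // Parse e.g. "11 12 20 30" into {11, 12, 20, 30}; sorting lets callers use
    // front()/back() as the minimum/maximum supported architecture.
    static std::vector<int> parseArchList(const std::string& s)
    {
        std::vector<int> arr;
        std::istringstream stream(s);
        int v;
        while (stream >> v)
            arr.push_back(v);
        std::sort(arr.begin(), arr.end());
        return arr;
    }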
bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set)
{
#if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_FEATURES, feature_set, std::greater_equal<int>());
return cudaArch.builtWith(feature_set);
#else
(void)feature_set;
return false;
......@@ -110,7 +166,7 @@ bool cv::gpu::TargetArchs::has(int major, int minor)
bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
{
#if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::equal_to<int>());
return cudaArch.hasPtx(major, minor);
#else
(void)major;
(void)minor;
......@@ -121,7 +177,7 @@ bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
bool cv::gpu::TargetArchs::hasBin(int major, int minor)
{
#if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor, std::equal_to<int>());
return cudaArch.hasBin(major, minor);
#else
(void)major;
(void)minor;
......@@ -132,8 +188,7 @@ bool cv::gpu::TargetArchs::hasBin(int major, int minor)
bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
{
#if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor,
std::less_equal<int>());
return cudaArch.hasEqualOrLessPtx(major, minor);
#else
(void)major;
(void)minor;
......@@ -143,14 +198,13 @@ bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor)
{
return hasEqualOrGreaterPtx(major, minor) ||
hasEqualOrGreaterBin(major, minor);
return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
}
bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
{
#if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::greater_equal<int>());
return cudaArch.hasEqualOrGreaterPtx(major, minor);
#else
(void)major;
(void)minor;
......@@ -161,8 +215,7 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
{
#if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor,
std::greater_equal<int>());
return cudaArch.hasEqualOrGreaterBin(major, minor);
#else
(void)major;
(void)minor;
......@@ -170,6 +223,31 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
#endif
}
bool cv::gpu::deviceSupports(FeatureSet feature_set)
{
static int versions[] =
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
static const int cache_size = static_cast<int>(sizeof(versions) / sizeof(versions[0]));
const int devId = getDevice();
int version;
if (devId < cache_size && versions[devId] >= 0)
version = versions[devId];
else
{
DeviceInfo dev(devId);
version = dev.majorVersion() * 10 + dev.minorVersion();
if (devId < cache_size)
versions[devId] = version;
}
return TargetArchs::builtWith(feature_set) && (version >= feature_set);
}
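deviceSupports() caches the compute version per device id and then requires two things at once: the module was built for an architecture that provides the feature, and the running device is at least that capable. A worked example; the build string and devices are illustrative:

    // Built with CUDA_ARCH_FEATURES "11 20 30", running on a compute 3.0 device:
    //   TargetArchs::builtWith(WARP_SHUFFLE_FUNCTIONS) -> true  (features.back() == 30 >= 30)
    //   version >= WARP_SHUFFLE_FUNCTIONS              -> true  (30 >= 30)
    //   deviceSupports(WARP_SHUFFLE_FUNCTIONS)         -> true
    // The same binary on a compute 2.1 device fails the second test (21 < 30),
    // so deviceSupports() returns false even though the feature was compiled in.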
#if !defined (HAVE_CUDA)
#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
......
......@@ -302,18 +302,18 @@ namespace cv { namespace gpu { namespace device
template <> struct name<type> : binary_function<type, type, type> \
{ \
__device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
__device__ __forceinline__ name(const name& other):binary_function<type, type, type>(){}\
__device__ __forceinline__ name():binary_function<type, type, type>(){}\
__device__ __forceinline__ name() {}\
__device__ __forceinline__ name(const name&) {}\
};
template <typename T> struct maximum : binary_function<T, T, T>
{
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
{
return lhs < rhs ? rhs : lhs;
return max(lhs, rhs);
}
__device__ __forceinline__ maximum(const maximum& other):binary_function<T, T, T>(){}
__device__ __forceinline__ maximum():binary_function<T, T, T>(){}
__device__ __forceinline__ maximum() {}
__device__ __forceinline__ maximum(const maximum&) {}
};
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)
......@@ -330,10 +330,10 @@ namespace cv { namespace gpu { namespace device
{
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
{
return lhs < rhs ? lhs : rhs;
return min(lhs, rhs);
}
__device__ __forceinline__ minimum(const minimum& other):binary_function<T, T, T>(){}
__device__ __forceinline__ minimum():binary_function<T, T, T>(){}
__device__ __forceinline__ minimum() {}
__device__ __forceinline__ minimum(const minimum&) {}
};
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)
......@@ -350,6 +350,108 @@ namespace cv { namespace gpu { namespace device
// Math functions
///bound=========================================
template <typename T> struct abs_func : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType x) const
{
return abs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<unsigned char> : unary_function<unsigned char, unsigned char>
{
__device__ __forceinline__ unsigned char operator ()(unsigned char x) const
{
return x;
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<signed char> : unary_function<signed char, signed char>
{
__device__ __forceinline__ signed char operator ()(signed char x) const
{
return ::abs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<char> : unary_function<char, char>
{
__device__ __forceinline__ char operator ()(char x) const
{
return ::abs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<unsigned short> : unary_function<unsigned short, unsigned short>
{
__device__ __forceinline__ unsigned short operator ()(unsigned short x) const
{
return x;
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<short> : unary_function<short, short>
{
__device__ __forceinline__ short operator ()(short x) const
{
return ::abs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<unsigned int> : unary_function<unsigned int, unsigned int>
{
__device__ __forceinline__ unsigned int operator ()(unsigned int x) const
{
return x;
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<int> : unary_function<int, int>
{
__device__ __forceinline__ int operator ()(int x) const
{
return ::abs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<float> : unary_function<float, float>
{
__device__ __forceinline__ float operator ()(float x) const
{
return ::fabsf(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<double> : unary_function<double, double>
{
__device__ __forceinline__ double operator ()(double x) const
{
return ::fabs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
#define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \
template <typename T> struct name ## _func : unary_function<T, float> \
{ \
......@@ -357,6 +459,8 @@ namespace cv { namespace gpu { namespace device
{ \
return func ## f(v); \
} \
__device__ __forceinline__ name ## _func() {} \
__device__ __forceinline__ name ## _func(const name ## _func&) {} \
}; \
template <> struct name ## _func<double> : unary_function<double, double> \
{ \
......@@ -364,6 +468,8 @@ namespace cv { namespace gpu { namespace device
{ \
return func(v); \
} \
__device__ __forceinline__ name ## _func() {} \
__device__ __forceinline__ name ## _func(const name ## _func&) {} \
};
#define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \
......@@ -382,7 +488,6 @@ namespace cv { namespace gpu { namespace device
} \
};
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs, ::fabs)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp, ::exp)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2)
......
......@@ -58,35 +58,47 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
{
return (uchar) ::max((int)v, 0);
uint res = 0;
int vi = v;
asm("cvt.sat.u8.s8 %0, %1;" : "=r"(res) : "r"(vi));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
{
uint res = 0;
asm("cvt.sat.u8.s16 %0, %1;" : "=r"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
{
return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
uint res = 0;
asm("cvt.sat.u8.u16 %0, %1;" : "=r"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
{
return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
uint res = 0;
asm("cvt.sat.u8.s32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
{
return (uchar) ::min(v, (uint)UCHAR_MAX);
uint res = 0;
asm("cvt.sat.u8.u32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
{
return saturate_cast<uchar>((uint)v);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<uchar>(iv);
uint res = 0;
asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(res) : "f"(v));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
return saturate_cast<uchar>(iv);
#if __CUDA_ARCH__ >= 130
uint res = 0;
asm("cvt.rni.sat.u8.f64 %0, %1;" : "=r"(res) : "d"(v));
return res;
#else
return saturate_cast<uchar>((float)v);
#endif
......@@ -94,35 +106,47 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
{
return (schar) ::min((int)v, SCHAR_MAX);
uint res = 0;
uint vi = v;
asm("cvt.sat.s8.u8 %0, %1;" : "=r"(res) : "r"(vi));
return res;
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
{
return (schar) ::min((uint)v, (uint)SCHAR_MAX);
uint res = 0;
asm("cvt.sat.s8.s16 %0, %1;" : "=r"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
{
return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
uint res = 0;
asm("cvt.sat.s8.u16 %0, %1;" : "=r"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
{
return saturate_cast<schar>((int)v);
uint res = 0;
asm("cvt.sat.s8.s32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
{
return (schar) ::min(v, (uint)SCHAR_MAX);
uint res = 0;
asm("cvt.sat.s8.u32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<schar>(iv);
uint res = 0;
asm("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(res) : "f"(v));
return res;
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
return saturate_cast<schar>(iv);
#if __CUDA_ARCH__ >= 130
uint res = 0;
asm("cvt.rni.sat.s8.f64 %0, %1;" : "=r"(res) : "d"(v));
return res;
#else
return saturate_cast<schar>((float)v);
#endif
......@@ -130,30 +154,41 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
{
return (ushort) ::max((int)v, 0);
ushort res = 0;
int vi = v;
asm("cvt.sat.u16.s8 %0, %1;" : "=h"(res) : "r"(vi));
return res;
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
{
return (ushort) ::max((int)v, 0);
ushort res = 0;
asm("cvt.sat.u16.s16 %0, %1;" : "=h"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
{
return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
ushort res = 0;
asm("cvt.sat.u16.s32 %0, %1;" : "=h"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
{
return (ushort) ::min(v, (uint)USHRT_MAX);
ushort res = 0;
asm("cvt.sat.u16.u32 %0, %1;" : "=h"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<ushort>(iv);
ushort res = 0;
asm("cvt.rni.sat.u16.f32 %0, %1;" : "=h"(res) : "f"(v));
return res;
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
return saturate_cast<ushort>(iv);
#if __CUDA_ARCH__ >= 130
ushort res = 0;
asm("cvt.rni.sat.u16.f64 %0, %1;" : "=h"(res) : "d"(v));
return res;
#else
return saturate_cast<ushort>((float)v);
#endif
......@@ -161,31 +196,45 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
{
return (short) ::min((int)v, SHRT_MAX);
short res = 0;
asm("cvt.sat.s16.u16 %0, %1;" : "=h"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ short saturate_cast<short>(int v)
{
return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);
short res = 0;
asm("cvt.sat.s16.s32 %0, %1;" : "=h"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
{
return (short) ::min(v, (uint)SHRT_MAX);
short res = 0;
asm("cvt.sat.s16.u32 %0, %1;" : "=h"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ short saturate_cast<short>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<short>(iv);
short res = 0;
asm("cvt.rni.sat.s16.f32 %0, %1;" : "=h"(res) : "f"(v));
return res;
}
template<> __device__ __forceinline__ short saturate_cast<short>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
return saturate_cast<short>(iv);
#if __CUDA_ARCH__ >= 130
short res = 0;
asm("cvt.rni.sat.s16.f64 %0, %1;" : "=h"(res) : "d"(v));
return res;
#else
return saturate_cast<short>((float)v);
#endif
}
template<> __device__ __forceinline__ int saturate_cast<int>(uint v)
{
int res = 0;
asm("cvt.sat.s32.u32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ int saturate_cast<int>(float v)
{
return __float2int_rn(v);
......@@ -199,6 +248,25 @@ namespace cv { namespace gpu { namespace device
#endif
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(schar v)
{
uint res = 0;
int vi = v;
asm("cvt.sat.u32.s8 %0, %1;" : "=r"(res) : "r"(vi));
return res;
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(short v)
{
uint res = 0;
asm("cvt.sat.u32.s16 %0, %1;" : "=r"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(int v)
{
uint res = 0;
asm("cvt.sat.u32.s32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
{
return __float2uint_rn(v);
......
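Each of these specializations now compiles to a single PTX cvt instruction: the .sat modifier clamps to the destination range, and .rni (used for float and double sources) rounds to the nearest integer with ties to even, matching the behaviour of the __float2int_rn-based code it replaces. Illustrative results, not taken from this file:

    // saturate_cast<uchar>(300)    -> 255  (clamped above UCHAR_MAX)
    // saturate_cast<uchar>(-7)     ->   0  (clamped below zero)
    // saturate_cast<schar>(200)    -> 127  (clamped to SCHAR_MAX)
    // saturate_cast<uchar>(127.5f) -> 128  (rounded to nearest even, then clamped)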
......@@ -45,7 +45,6 @@
#include "saturate_cast.hpp"
#include "datamov_utils.hpp"
#include "detail/reduction_detail.hpp"
namespace cv { namespace gpu { namespace device
{
......@@ -156,29 +155,6 @@ namespace cv { namespace gpu { namespace device
}
};
///////////////////////////////////////////////////////////////////////////////
// Reduction
template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
}
template <int n, typename T, typename V, typename Pred>
__device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
}
template <int n, typename T, typename V1, typename V2, typename Pred>
__device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
}
///////////////////////////////////////////////////////////////////////////////
// Solve linear system
......
......@@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_HPP__
#include "utility.hpp"
#include "reduce.hpp"
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"
......@@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
}
__device__ __forceinline__ operator int() const
......@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
}
__device__ __forceinline__ operator float() const
......@@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
}
__device__ __forceinline__ operator float() const
......@@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
}
__device__ __forceinline__ operator int() const
......
......@@ -280,7 +280,7 @@ namespace cv { namespace gpu { namespace device
OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ! , logical_not) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, max, maximum) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, min, minimum) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, fabs, fabs_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, abs, abs_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sqrt, sqrt_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp, exp_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp2, exp2_func) \
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__
#define __OPENCV_GPU_WARP_SHUFFLE_HPP__
namespace cv { namespace gpu { namespace device
{
template <typename T>
__device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
return __shfl(val, srcLane, width);
#else
return T();
#endif
}
__device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
return (unsigned int) __shfl((int) val, srcLane, width);
#else
return 0;
#endif
}
__device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl(lo, srcLane, width);
hi = __shfl(hi, srcLane, width);
return __hiloint2double(hi, lo);
#else
return 0.0;
#endif
}
template <typename T>
__device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
return __shfl_down(val, delta, width);
#else
return T();
#endif
}
__device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
return (unsigned int) __shfl_down((int) val, delta, width);
#else
return 0;
#endif
}
__device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl_down(lo, delta, width);
hi = __shfl_down(hi, delta, width);
return __hiloint2double(hi, lo);
#else
return 0.0;
#endif
}
template <typename T>
__device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
return __shfl_up(val, delta, width);
#else
return T();
#endif
}
__device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
return (unsigned int) __shfl_up((int) val, delta, width);
#else
return 0;
#endif
}
__device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl_up(lo, delta, width);
hi = __shfl_up(hi, delta, width);
return __hiloint2double(hi, lo);
#else
return 0.0;
#endif
}
}}}
#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__
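Because the wrappers fall back to harmless stubs below compute capability 3.0, reduction code can be written once against them. A minimal warp-wide sum built on shfl_down; this helper is a sketch and is not part of the header:

    // Sum `val` across one warp; after the loop lane 0 holds the total.
    // Below compute 3.0 the stub returns 0 each step, so every lane simply
    // keeps its own value.
    __device__ __forceinline__ float warpSum(float val)
    {
        for (int delta = 16; delta >= 1; delta /= 2)
            val += cv::gpu::device::shfl_down(val, delta);
        return val;
    }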
......@@ -792,31 +792,23 @@ private:
GpuMat lab, l, ab;
};
struct CV_EXPORTS CannyBuf;
CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
struct CV_EXPORTS CannyBuf
{
CannyBuf() {}
explicit CannyBuf(const Size& image_size, int apperture_size = 3) {create(image_size, apperture_size);}
CannyBuf(const GpuMat& dx_, const GpuMat& dy_);
void create(const Size& image_size, int apperture_size = 3);
void release();
GpuMat dx, dy;
GpuMat dx_buf, dy_buf;
GpuMat edgeBuf;
GpuMat trackBuf1, trackBuf2;
GpuMat mag;
GpuMat map;
GpuMat st1, st2;
Ptr<FilterEngine_GPU> filterDX, filterDY;
};
CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
class CV_EXPORTS ImagePyramid
{
public:
......@@ -1036,11 +1028,9 @@ CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels
//! Calculates histogram for 8u one channel image
//! Output hist will have one row, 256 cols and CV_32SC1 type.
CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null());
CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());
//! normalizes the grayscale image brightness and contrast by normalizing its histogram
CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream = Stream::Null());
CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());
//////////////////////////////// StereoBM_GPU ////////////////////////////////
......
......@@ -581,13 +581,12 @@ PERF_TEST_P(Sz, ImgProc_CalcHist, GPU_TYPICAL_MAT_SIZES)
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_hist;
cv::gpu::GpuMat d_buf;
cv::gpu::calcHist(d_src, d_hist, d_buf);
cv::gpu::calcHist(d_src, d_hist);
TEST_CYCLE()
{
cv::gpu::calcHist(d_src, d_hist, d_buf);
cv::gpu::calcHist(d_src, d_hist);
}
GPU_SANITY_CHECK(d_hist);
......
......@@ -42,10 +42,13 @@
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"
namespace cv { namespace gpu { namespace device
{
......@@ -59,6 +62,45 @@ namespace cv { namespace gpu { namespace device
int& bestTrainIdx1, int& bestTrainIdx2,
float* s_distance, int* s_trainIdx)
{
#if __CUDA_ARCH__ >= 300
(void) s_distance;
(void) s_trainIdx;
float d1, d2;
int i1, i2;
#pragma unroll
for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
{
d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
if (bestDistance1 < d1)
{
if (d1 < bestDistance2)
{
bestDistance2 = d1;
bestTrainIdx2 = i1;
}
}
else
{
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;
bestDistance1 = d1;
bestTrainIdx1 = i1;
if (d2 < bestDistance2)
{
bestDistance2 = d2;
bestTrainIdx2 = i2;
}
}
}
#else
float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1;
......@@ -122,6 +164,7 @@ namespace cv { namespace gpu { namespace device
bestTrainIdx1 = myBestTrainIdx1;
bestTrainIdx2 = myBestTrainIdx2;
#endif
}
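The shuffle path keeps the two best (distance, index) pairs in registers per lane; each iteration pulls the partner lane's pair in with shfl_down and merges, so after log2(BLOCK_SIZE) steps lane 0 holds the best and second-best of the whole group without touching shared memory. One merge step with illustrative values:

    // lane holds best1 = 3.0, best2 = 7.0; partner delivers d1 = 5.0, d2 = 6.0
    //   best1 (3.0) < d1 (5.0)  -> best1 stays 3.0
    //   d1 (5.0) < best2 (7.0)  -> best2 becomes 5.0
    // d2 can be ignored in this branch: d2 >= d1, so it can never beat the
    // updated best2. Result: (3.0, 5.0), the two smallest of the four.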
template <int BLOCK_SIZE>
......@@ -130,6 +173,53 @@ namespace cv { namespace gpu { namespace device
int& bestImgIdx1, int& bestImgIdx2,
float* s_distance, int* s_trainIdx, int* s_imgIdx)
{
#if __CUDA_ARCH__ >= 300
(void) s_distance;
(void) s_trainIdx;
(void) s_imgIdx;
float d1, d2;
int i1, i2;
int j1, j2;
#pragma unroll
for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
{
d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE);
j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE);
if (bestDistance1 < d1)
{
if (d1 < bestDistance2)
{
bestDistance2 = d1;
bestTrainIdx2 = i1;
bestImgIdx2 = j1;
}
}
else
{
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;
bestImgIdx2 = bestImgIdx1;
bestDistance1 = d1;
bestTrainIdx1 = i1;
bestImgIdx1 = j1;
if (d2 < bestDistance2)
{
bestDistance2 = d2;
bestTrainIdx2 = i2;
bestImgIdx2 = j2;
}
}
}
#else
float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1;
......@@ -205,6 +295,7 @@ namespace cv { namespace gpu { namespace device
bestImgIdx1 = myBestImgIdx1;
bestImgIdx2 = myBestImgIdx2;
#endif
}
///////////////////////////////////////////////////////////////////////////////
......@@ -1005,7 +1096,7 @@ namespace cv { namespace gpu { namespace device
s_trainIdx[threadIdx.x] = bestIdx;
__syncthreads();
reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());
reduceKeyVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<float>());
if (threadIdx.x == 0)
{
......
......@@ -42,7 +42,9 @@
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
......@@ -60,12 +62,7 @@ namespace cv { namespace gpu { namespace device
s_distance += threadIdx.y * BLOCK_SIZE;
s_trainIdx += threadIdx.y * BLOCK_SIZE;
s_distance[threadIdx.x] = bestDistance;
s_trainIdx[threadIdx.x] = bestTrainIdx;
__syncthreads();
reducePredVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<volatile float>());
reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<float>());
}
template <int BLOCK_SIZE>
......@@ -75,13 +72,7 @@ namespace cv { namespace gpu { namespace device
s_trainIdx += threadIdx.y * BLOCK_SIZE;
s_imgIdx += threadIdx.y * BLOCK_SIZE;
s_distance[threadIdx.x] = bestDistance;
s_trainIdx[threadIdx.x] = bestTrainIdx;
s_imgIdx [threadIdx.x] = bestImgIdx;
__syncthreads();
reducePredVal2<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less<volatile float>());
reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less<float>());
}
///////////////////////////////////////////////////////////////////////////////
......
......@@ -42,7 +42,8 @@
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
......@@ -58,8 +59,6 @@ namespace cv { namespace gpu { namespace device
__global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
......@@ -110,8 +109,6 @@ namespace cv { namespace gpu { namespace device
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
#endif
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
......@@ -170,8 +167,6 @@ namespace cv { namespace gpu { namespace device
__global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
......@@ -221,8 +216,6 @@ namespace cv { namespace gpu { namespace device
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
#endif
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
......
......@@ -42,9 +42,10 @@
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/reduce.hpp"
namespace cv { namespace gpu { namespace device
{
......@@ -66,6 +67,8 @@ namespace cv { namespace gpu { namespace device
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
}
__device__ __forceinline__ TransformOp() {}
__device__ __forceinline__ TransformOp(const TransformOp&) {}
};
void call(const PtrStepSz<float3> src, const float* rot,
......@@ -103,6 +106,8 @@ namespace cv { namespace gpu { namespace device
(cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
(cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
}
__device__ __forceinline__ ProjectOp() {}
__device__ __forceinline__ ProjectOp(const ProjectOp&) {}
};
void call(const PtrStepSz<float3> src, const float* rot,
......@@ -134,6 +139,7 @@ namespace cv { namespace gpu { namespace device
return x * x;
}
template <int BLOCK_SIZE>
__global__ void computeHypothesisScoresKernel(
const int num_points, const float3* object, const float2* image,
const float dist_threshold, int* g_num_inliers)
......@@ -156,19 +162,11 @@ namespace cv { namespace gpu { namespace device
++num_inliers;
}
extern __shared__ float s_num_inliers[];
s_num_inliers[threadIdx.x] = num_inliers;
__syncthreads();
for (int step = blockDim.x / 2; step > 0; step >>= 1)
{
if (threadIdx.x < step)
s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
__syncthreads();
}
__shared__ int s_num_inliers[BLOCK_SIZE];
reduce<BLOCK_SIZE>(s_num_inliers, num_inliers, threadIdx.x, plus<int>());
if (threadIdx.x == 0)
g_num_inliers[blockIdx.x] = s_num_inliers[0];
g_num_inliers[blockIdx.x] = num_inliers;
}
void computeHypothesisScores(
......@@ -181,9 +179,8 @@ namespace cv { namespace gpu { namespace device
dim3 threads(256);
dim3 grid(num_hypotheses);
int smem_size = threads.x * sizeof(float);
computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
computeHypothesisScoresKernel<256><<<grid, threads>>>(
num_points, object, image, dist_threshold, hypothesis_scores);
cudaSafeCall( cudaGetLastError() );
......
......@@ -46,6 +46,8 @@
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "fgd_bgfg_common.hpp"
using namespace cv::gpu;
......@@ -181,57 +183,8 @@ namespace bgfg
__shared__ unsigned int data1[MERGE_THREADBLOCK_SIZE];
__shared__ unsigned int data2[MERGE_THREADBLOCK_SIZE];
data0[threadIdx.x] = sum0;
data1[threadIdx.x] = sum1;
data2[threadIdx.x] = sum2;
__syncthreads();
if (threadIdx.x < 128)
{
data0[threadIdx.x] = sum0 += data0[threadIdx.x + 128];
data1[threadIdx.x] = sum1 += data1[threadIdx.x + 128];
data2[threadIdx.x] = sum2 += data2[threadIdx.x + 128];
}
__syncthreads();
if (threadIdx.x < 64)
{
data0[threadIdx.x] = sum0 += data0[threadIdx.x + 64];
data1[threadIdx.x] = sum1 += data1[threadIdx.x + 64];
data2[threadIdx.x] = sum2 += data2[threadIdx.x + 64];
}
__syncthreads();
if (threadIdx.x < 32)
{
volatile unsigned int* vdata0 = data0;
volatile unsigned int* vdata1 = data1;
volatile unsigned int* vdata2 = data2;
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 32];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 32];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 32];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 16];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 16];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 16];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 8];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 8];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 8];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 4];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 4];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 4];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 2];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 2];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 2];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 1];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 1];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 1];
}
plus<unsigned int> op;
reduce<MERGE_THREADBLOCK_SIZE>(smem_tuple(data0, data1, data2), thrust::tie(sum0, sum1, sum2), threadIdx.x, thrust::make_tuple(op, op, op));
if(threadIdx.x == 0)
{
......
......@@ -43,182 +43,115 @@
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/transform.hpp"
namespace cv { namespace gpu { namespace device
{
#define UINT_BITS 32U
//Warps == subhistograms per threadblock
#define WARP_COUNT 6
//Threadblock size
#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
#define HISTOGRAM256_BIN_COUNT 256
//Shared memory per threadblock
#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
#define PARTIAL_HISTOGRAM256_COUNT 240
#define MERGE_THREADBLOCK_SIZE 256
using namespace cv::gpu;
using namespace cv::gpu::device;
#define USE_SMEM_ATOMICS (defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120))
namespace hist
namespace
{
__global__ void histogram256(const uchar* src, int cols, int rows, size_t step, int* hist)
{
#if (!USE_SMEM_ATOMICS)
#define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )
__shared__ int shist[256];
__forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)
{
uint count;
do
{
count = s_WarpHist[data] & TAG_MASK;
count = threadTag | (count + 1);
s_WarpHist[data] = count;
} while (s_WarpHist[data] != count);
}
const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
#else
#define TAG_MASK 0xFFFFFFFFU
shist[tid] = 0;
__syncthreads();
__forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)
if (y < rows)
{
atomicAdd(s_WarpHist + data, 1);
}
const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
#endif
__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
uint x = pos_x << 2;
unsigned int data = rowPtr[x];
if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag);
if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag);
if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);
if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
}
__global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
if (cols % 4 != 0 && threadIdx.x == 0)
{
//Per-warp subhistogram storage
__shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
//Clear shared memory storage for current threadblock before processing
#pragma unroll
for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;
//Cycle through the entire data set, update subhistograms for each warp
const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);
__syncthreads();
const uint colsui = d_Data.step / sizeof(uint);
for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)
for (int x = cols_4 * 4; x < cols; ++x)
{
uint pos_y = pos / colsui;
uint pos_x = pos % colsui;
uint data = d_Data.ptr(pos_y)[pos_x];
addWord(s_WarpHist, data, tag, pos_x, cols);
unsigned int data = ((const uchar*)rowPtr)[x];
Emulation::smem::atomicAdd(&shist[data], 1);
}
//Merge per-warp histograms into per-block and write to global memory
__syncthreads();
for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)
{
uint sum = 0;
for (uint i = 0; i < WARP_COUNT; i++)
sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;
d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
}
}
////////////////////////////////////////////////////////////////////////////////
// Merge histogram256() output
// Run one threadblock per bin; each threadblock adds up the same bin counter
// from every partial histogram. Reads are uncoalesced, but mergeHistogram256
// takes only a fraction of total processing time
////////////////////////////////////////////////////////////////////////////////
__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
{
uint sum = 0;
#pragma unroll
for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)
sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];
__shared__ uint data[MERGE_THREADBLOCK_SIZE];
data[threadIdx.x] = sum;
for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)
{
__syncthreads();
if(threadIdx.x < stride)
data[threadIdx.x] += data[threadIdx.x + stride];
}
if(threadIdx.x == 0)
d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
const int histVal = shist[tid];
if (histVal > 0)
::atomicAdd(hist + tid, histVal);
}
}
void histogram256_gpu(PtrStepSzb src, int* hist, uint* buf, cudaStream_t stream)
namespace hist
{
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
{
histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
PtrStepSz<uint>(src),
buf,
static_cast<uint>(src.rows * src.step / sizeof(uint)),
src.cols);
cudaSafeCall( cudaGetLastError() );
mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
::histogram256<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
/////////////////////////////////////////////////////////////////////////
namespace
{
__constant__ int c_lut[256];
__global__ void equalizeHist(const PtrStepSzb src, PtrStepb dst)
struct EqualizeHist : unary_function<uchar, uchar>
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
float scale;
__host__ EqualizeHist(float _scale) : scale(_scale) {}
if (x < src.cols && y < src.rows)
__device__ __forceinline__ uchar operator ()(uchar val) const
{
const uchar val = src.ptr(y)[x];
const int lut = c_lut[val];
dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
}
return __float2int_rn(scale * lut);
}
};
}
void equalizeHist_gpu(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
namespace cv { namespace gpu { namespace device
{
template <> struct TransformFunctorTraits<EqualizeHist> : DefaultTransformFunctorTraits<EqualizeHist>
{
dim3 block(16, 16);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
enum { smart_shift = 4 };
};
}}}
namespace hist
{
void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
{
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) );
equalizeHist<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
const float scale = 255.0f / (src.cols * src.rows);
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
}
} // namespace hist
}}} // namespace cv { namespace gpu { namespace device
}
#endif /* CUDA_DISABLER */
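On the user-facing side the temporary-buffer arguments disappear (the perf test above now calls calcHist without a buf). A minimal usage sketch of the simplified calls, assuming `img` is an 8-bit single-channel cv::Mat:

    cv::gpu::GpuMat d_src(img);
    cv::gpu::GpuMat d_hist, d_dst;

    cv::gpu::calcHist(d_src, d_hist);      // d_hist: 1 x 256, CV_32SC1
    cv::gpu::equalizeHist(d_src, d_dst);   // no intermediate buffers to manage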
......@@ -42,7 +42,10 @@
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"
namespace cv { namespace gpu { namespace device
{
......@@ -226,29 +229,30 @@ namespace cv { namespace gpu { namespace device
template<int size>
__device__ float reduce_smem(volatile float* smem)
__device__ float reduce_smem(float* smem, float val)
{
unsigned int tid = threadIdx.x;
float sum = smem[tid];
float sum = val;
if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; __syncthreads(); }
if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; __syncthreads(); }
if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; __syncthreads(); }
reduce<size>(smem, sum, tid, plus<float>());
if (tid < 32)
if (size == 32)
{
if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
#if __CUDA_ARCH__ >= 300
return shfl(sum, 0);
#else
return smem[0];
#endif
}
#if __CUDA_ARCH__ >= 300
if (threadIdx.x == 0)
smem[0] = sum;
#endif
__syncthreads();
sum = smem[0];
return sum;
return smem[0];
}
......@@ -272,19 +276,13 @@ namespace cv { namespace gpu { namespace device
if (threadIdx.x < block_hist_size)
elem = hist[0];
squares[threadIdx.x] = elem * elem;
__syncthreads();
float sum = reduce_smem<nthreads>(squares);
float sum = reduce_smem<nthreads>(squares, elem * elem);
float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size);
elem = ::min(elem * scale, threshold);
__syncthreads();
squares[threadIdx.x] = elem * elem;
sum = reduce_smem<nthreads>(squares, elem * elem);
__syncthreads();
sum = reduce_smem<nthreads>(squares);
scale = 1.0f / (::sqrtf(sum) + 1e-3f);
if (threadIdx.x < block_hist_size)
......@@ -355,40 +353,11 @@ namespace cv { namespace gpu { namespace device
__shared__ float products[nthreads * nblocks];
const int tid = threadIdx.z * nthreads + threadIdx.x;
products[tid] = product;
__syncthreads();
if (nthreads >= 512)
{
if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
__syncthreads();
}
if (nthreads >= 256)
{
if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
__syncthreads();
}
if (nthreads >= 128)
{
if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
__syncthreads();
}
if (threadIdx.x < 32)
{
volatile float* smem = products;
if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
}
reduce<nthreads>(products, product, tid, plus<float>());
if (threadIdx.x == 0)
confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x]
= (float)(product + free_coef);
confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = product + free_coef;
}
......@@ -446,36 +415,8 @@ namespace cv { namespace gpu { namespace device
__shared__ float products[nthreads * nblocks];
const int tid = threadIdx.z * nthreads + threadIdx.x;
products[tid] = product;
__syncthreads();
if (nthreads >= 512)
{
if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
__syncthreads();
}
if (nthreads >= 256)
{
if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
__syncthreads();
}
if (nthreads >= 128)
{
if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
__syncthreads();
}
if (threadIdx.x < 32)
{
volatile float* smem = products;
if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
}
reduce<nthreads>(products, product, tid, plus<float>());
if (threadIdx.x == 0)
labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold);
......
......@@ -43,11 +43,11 @@
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/block.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
using namespace cv::gpu;
......@@ -184,6 +184,85 @@ namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
template <int cn> struct Unroll;
template <> struct Unroll<1>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&> tie(float& val1, float& val2)
{
return thrust::tie(val1, val2);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op);
}
};
template <> struct Unroll<2>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&> tie(float& val1, float2& val2)
{
return thrust::tie(val1, val2.x, val2.y);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op);
}
};
template <> struct Unroll<3>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&> tie(float& val1, float3& val2)
{
return thrust::tie(val1, val2.x, val2.y, val2.z);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op, op);
}
};
template <> struct Unroll<4>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE, smem + 4 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&, float&> tie(float& val1, float4& val2)
{
return thrust::tie(val1, val2.x, val2.y, val2.z, val2.w);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op, op, op);
}
};
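The Unroll<cn> specializations above bundle the shared-memory slices, the value references, and the operators into Thrust tuples, so a scalar weight and a cn-channel sum can be reduced in a single reduce<BLOCK_SIZE>() call. A minimal usage sketch for the 3-channel case (the helper name is hypothetical; the real call site is the non-local-means kernel further down):

    // Reduces one float weight and a float3 colour sum across the block at once.
    // smem must provide BLOCK_SIZE * (3 + 1) floats, laid out slice by slice.
    template <int BLOCK_SIZE>
    __device__ void reduce_weight_and_sum_sketch(float* smem, float& weights_sum, float3& sum)
    {
        reduce<BLOCK_SIZE>(Unroll<3>::template smem_tuple<BLOCK_SIZE>(smem),
                           Unroll<3>::tie(weights_sum, sum),
                           threadIdx.x,
                           Unroll<3>::op());
    }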
__device__ __forceinline__ int calcDist(const uchar& a, const uchar& b) { return (a-b)*(a-b); }
__device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); }
__device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); }
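As a quick sanity check of the squared per-channel distances above, a worked example with arbitrary values:

    // calcDist(make_uchar3(10, 20, 30), make_uchar3(13, 16, 30))
    //   = (10-13)^2 + (20-16)^2 + (30-30)^2 = 9 + 16 + 0 = 25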
......@@ -340,30 +419,15 @@ namespace cv { namespace gpu { namespace device
sum = sum + weight * saturate_cast<sum_type>(src(sy + y, sx + x));
}
volatile __shared__ float cta_buffer[CTA_SIZE];
int tid = threadIdx.x;
cta_buffer[tid] = weights_sum;
__syncthreads();
Block::reduce<CTA_SIZE>(cta_buffer, plus());
weights_sum = cta_buffer[0];
__syncthreads();
__shared__ float cta_buffer[CTA_SIZE * (VecTraits<T>::cn + 1)];
reduce<CTA_SIZE>(Unroll<VecTraits<T>::cn>::template smem_tuple<CTA_SIZE>(cta_buffer),
Unroll<VecTraits<T>::cn>::tie(weights_sum, sum),
threadIdx.x,
Unroll<VecTraits<T>::cn>::op());
for(int n = 0; n < VecTraits<T>::cn; ++n)
{
cta_buffer[tid] = reinterpret_cast<float*>(&sum)[n];
__syncthreads();
Block::reduce<CTA_SIZE>(cta_buffer, plus());
reinterpret_cast<float*>(&sum)[n] = cta_buffer[0];
__syncthreads();
}
if (tid == 0)
dst = saturate_cast<T>(sum/weights_sum);
if (threadIdx.x == 0)
dst = saturate_cast<T>(sum / weights_sum);
}
__device__ __forceinline__ void operator()(PtrStepSz<T>& dst) const
......
......@@ -50,7 +50,7 @@
#include <thrust/sort.h>
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
namespace cv { namespace gpu { namespace device
......@@ -75,9 +75,9 @@ namespace cv { namespace gpu { namespace device
__global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k)
{
__shared__ int smem[8 * 32];
volatile int* srow = smem + threadIdx.y * blockDim.x;
__shared__ int smem0[8 * 32];
__shared__ int smem1[8 * 32];
__shared__ int smem2[8 * 32];
const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
......@@ -109,9 +109,12 @@ namespace cv { namespace gpu { namespace device
c += Ix * Iy;
}
reduce<32>(srow, a, threadIdx.x, plus<volatile int>());
reduce<32>(srow, b, threadIdx.x, plus<volatile int>());
reduce<32>(srow, c, threadIdx.x, plus<volatile int>());
int* srow0 = smem0 + threadIdx.y * blockDim.x;
int* srow1 = smem1 + threadIdx.y * blockDim.x;
int* srow2 = smem2 + threadIdx.y * blockDim.x;
plus<int> op;
reduce<32>(smem_tuple(srow0, srow1, srow2), thrust::tie(a, b, c), threadIdx.x, thrust::make_tuple(op, op, op));
if (threadIdx.x == 0)
{
......@@ -151,9 +154,13 @@ namespace cv { namespace gpu { namespace device
__global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
{
__shared__ int smem[8 * 32];
__shared__ int smem0[8 * 32];
__shared__ int smem1[8 * 32];
int* srow0 = smem0 + threadIdx.y * blockDim.x;
int* srow1 = smem1 + threadIdx.y * blockDim.x;
volatile int* srow = smem + threadIdx.y * blockDim.x;
plus<int> op;
const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
......@@ -167,7 +174,7 @@ namespace cv { namespace gpu { namespace device
for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
m_10 += u * image(loc.y, loc.x + u);
reduce<32>(srow, m_10, threadIdx.x, plus<volatile int>());
reduce<32>(srow0, m_10, threadIdx.x, op);
for (int v = 1; v <= half_k; ++v)
{
......@@ -185,8 +192,7 @@ namespace cv { namespace gpu { namespace device
m_sum += u * (val_plus + val_minus);
}
reduce<32>(srow, v_sum, threadIdx.x, plus<volatile int>());
reduce<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
reduce<32>(smem_tuple(srow0, srow1), thrust::tie(v_sum, m_sum), threadIdx.x, thrust::make_tuple(op, op));
m_10 += m_sum;
m_01 += v * v_sum;
......
......@@ -65,6 +65,8 @@
#include "NPP_staging/NPP_staging.hpp"
#include "NCVRuntimeTemplates.hpp"
#include "NCVHaarObjectDetection.hpp"
#include "opencv2/gpu/device/warp.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"
//==============================================================================
......@@ -81,6 +83,20 @@ NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of th
// assuming size <= WARP_SIZE and size is a power of 2
__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{
#if __CUDA_ARCH__ >= 300
const unsigned int laneId = cv::gpu::device::Warp::laneId();
// inclusive scan using warp shuffle instructions
#pragma unroll
for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
{
const Ncv32u n = cv::gpu::device::shfl_up(idata, i);
if (laneId >= i)
idata += n;
}
return idata;
#else
Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
s_Data[pos] = 0;
pos += K_WARP_SIZE;
......@@ -93,6 +109,7 @@ __device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)
s_Data[pos] += s_Data[pos - 16];
return s_Data[pos];
#endif
}
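For reference, an exclusive warp scan is conventionally derived from the inclusive one by subtracting the lane's own input; a sketch of that relationship (an assumption, not necessarily how warpScanExclusive below is implemented):

    // Exclusive scan = inclusive scan minus the lane's own contribution.
    __device__ __forceinline__ Ncv32u warpScanExclusive_sketch(Ncv32u idata, volatile Ncv32u* s_Data)
    {
        return warpScanInclusive(idata, s_Data) - idata;
    }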
__device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
......