Commit 281d036f authored by Vladislav Vinogradov

optimizations:

- new reduce implementation (with kepler optimizations)
- saturate_cast via asm command
- video SIMD instructions in element operations
- float arithmetic instead of double
- new deviceSupports function
parent ae6266e1
...@@ -79,6 +79,8 @@ namespace cv { namespace gpu ...@@ -79,6 +79,8 @@ namespace cv { namespace gpu
WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30 WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
}; };
CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
// Gives information about what GPU archs this OpenCV GPU module was // Gives information about what GPU archs this OpenCV GPU module was
// compiled for // compiled for
class CV_EXPORTS TargetArchs class CV_EXPORTS TargetArchs
......
...@@ -44,6 +44,7 @@ ...@@ -44,6 +44,7 @@
#include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp" #include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/type_traits.hpp"
namespace cv { namespace gpu { namespace device namespace cv { namespace gpu { namespace device
{ {
...@@ -54,6 +55,7 @@ namespace cv { namespace gpu { namespace device ...@@ -54,6 +55,7 @@ namespace cv { namespace gpu { namespace device
void writeScalar(const int*); void writeScalar(const int*);
void writeScalar(const float*); void writeScalar(const float*);
void writeScalar(const double*); void writeScalar(const double*);
void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t); void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t);
}}} }}}
...@@ -226,16 +228,16 @@ namespace cv { namespace gpu { namespace device ...@@ -226,16 +228,16 @@ namespace cv { namespace gpu { namespace device
//////////////////////////////// ConvertTo //////////////////////////////// //////////////////////////////// ConvertTo ////////////////////////////////
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
template <typename T, typename D> struct Convertor : unary_function<T, D> template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
{ {
Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {} Convertor(S alpha_, S beta_) : alpha(alpha_), beta(beta_) {}
__device__ __forceinline__ D operator()(const T& src) const __device__ __forceinline__ D operator()(typename TypeTraits<T>::ParameterType src) const
{ {
return saturate_cast<D>(alpha * src + beta); return saturate_cast<D>(alpha * src + beta);
} }
double alpha, beta; S alpha, beta;
}; };
namespace detail namespace detail
...@@ -282,16 +284,16 @@ namespace cv { namespace gpu { namespace device ...@@ -282,16 +284,16 @@ namespace cv { namespace gpu { namespace device
}; };
} }
template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> > template <typename T, typename D, typename S> struct TransformFunctorTraits< Convertor<T, D, S> > : detail::ConvertTraits< Convertor<T, D, S> >
{ {
}; };
template<typename T, typename D> template<typename T, typename D, typename S>
void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream) void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream)
{ {
cudaSafeCall( cudaSetDoubleForDevice(&alpha) ); cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
cudaSafeCall( cudaSetDoubleForDevice(&beta) ); cudaSafeCall( cudaSetDoubleForDevice(&beta) );
Convertor<T, D> op(alpha, beta); Convertor<T, D, S> op(static_cast<S>(alpha), static_cast<S>(beta));
cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream); cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
} }
...@@ -304,36 +306,74 @@ namespace cv { namespace gpu { namespace device ...@@ -304,36 +306,74 @@ namespace cv { namespace gpu { namespace device
{ {
typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream); typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream);
static const caller_t tab[8][8] = static const caller_t tab[7][7] =
{ {
{cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>, {
cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0}, cvt_<uchar, uchar, float>,
cvt_<uchar, schar, float>,
{cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>, cvt_<uchar, ushort, float>,
cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0}, cvt_<uchar, short, float>,
cvt_<uchar, int, float>,
{cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>, cvt_<uchar, float, float>,
cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0}, cvt_<uchar, double, double>
},
{cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>, {
cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0}, cvt_<schar, uchar, float>,
cvt_<schar, schar, float>,
{cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>, cvt_<schar, ushort, float>,
cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0}, cvt_<schar, short, float>,
cvt_<schar, int, float>,
{cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>, cvt_<schar, float, float>,
cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0}, cvt_<schar, double, double>
},
{cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>, {
cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0}, cvt_<ushort, uchar, float>,
cvt_<ushort, schar, float>,
{0,0,0,0,0,0,0,0} cvt_<ushort, ushort, float>,
cvt_<ushort, short, float>,
cvt_<ushort, int, float>,
cvt_<ushort, float, float>,
cvt_<ushort, double, double>
},
{
cvt_<short, uchar, float>,
cvt_<short, schar, float>,
cvt_<short, ushort, float>,
cvt_<short, short, float>,
cvt_<short, int, float>,
cvt_<short, float, float>,
cvt_<short, double, double>
},
{
cvt_<int, uchar, float>,
cvt_<int, schar, float>,
cvt_<int, ushort, float>,
cvt_<int, short, float>,
cvt_<int, int, double>,
cvt_<int, float, double>,
cvt_<int, double, double>
},
{
cvt_<float, uchar, float>,
cvt_<float, schar, float>,
cvt_<float, ushort, float>,
cvt_<float, short, float>,
cvt_<float, int, float>,
cvt_<float, float, float>,
cvt_<float, double, double>
},
{
cvt_<double, uchar, double>,
cvt_<double, schar, double>,
cvt_<double, ushort, double>,
cvt_<double, short, double>,
cvt_<double, int, double>,
cvt_<double, float, double>,
cvt_<double, double, double>
}
}; };
caller_t func = tab[sdepth][ddepth]; caller_t func = tab[sdepth][ddepth];
if (!func)
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__, "convert_gpu");
func(src, dst, alpha, beta, stream); func(src, dst, alpha, beta, stream);
} }
......
...@@ -45,8 +45,7 @@ ...@@ -45,8 +45,7 @@
#include <iostream> #include <iostream>
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
#include <cuda.h> #include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <npp.h> #include <npp.h>
#define CUDART_MINIMUM_REQUIRED_VERSION 4010 #define CUDART_MINIMUM_REQUIRED_VERSION 4010
...@@ -69,33 +68,89 @@ using namespace cv::gpu; ...@@ -69,33 +68,89 @@ using namespace cv::gpu;
namespace namespace
{ {
// Compares value to set using the given comparator. Returns true if class CudaArch
// there is at least one element x in the set satisfying to: x cmp value {
// predicate. public:
template <typename Comparer> CudaArch();
bool compareToSet(const std::string& set_as_str, int value, Comparer cmp)
bool builtWith(FeatureSet feature_set) const;
bool hasPtx(int major, int minor) const;
bool hasBin(int major, int minor) const;
bool hasEqualOrLessPtx(int major, int minor) const;
bool hasEqualOrGreaterPtx(int major, int minor) const;
bool hasEqualOrGreaterBin(int major, int minor) const;
private:
static void fromStr(const string& set_as_str, vector<int>& arr);
vector<int> bin;
vector<int> ptx;
vector<int> features;
};
const CudaArch cudaArch;
// Fills the bin / ptx / features lists from the build-time configuration
// strings. When the library is built without CUDA nothing is parsed and all
// three lists remain empty.
CudaArch::CudaArch()
{
#if defined(HAVE_CUDA)
    fromStr(CUDA_ARCH_BIN, bin);
    fromStr(CUDA_ARCH_PTX, ptx);
    fromStr(CUDA_ARCH_FEATURES, features);
#endif
}
// Returns true when the largest feature value recorded at build time is at
// least `feature_set`. `features` is sorted ascending by fromStr(), so its
// last element is the maximum.
bool CudaArch::builtWith(FeatureSet feature_set) const
{
    if (features.empty())
        return false;

    return features.back() >= feature_set;
}
// Returns true when the PTX list contains exactly the architecture
// major.minor, encoded as major * 10 + minor.
bool CudaArch::hasPtx(int major, int minor) const
{
    const int arch = major * 10 + minor;
    return ptx.end() != find(ptx.begin(), ptx.end(), arch);
}
// Returns true when the binary list contains exactly the architecture
// major.minor, encoded as major * 10 + minor.
bool CudaArch::hasBin(int major, int minor) const
{
    const int arch = major * 10 + minor;
    return bin.end() != find(bin.begin(), bin.end(), arch);
}
// Returns true when at least one PTX entry is <= major.minor. Because `ptx`
// is sorted ascending by fromStr(), only its smallest (first) element has to
// be examined.
bool CudaArch::hasEqualOrLessPtx(int major, int minor) const
{
    if (ptx.empty())
        return false;

    const int arch = major * 10 + minor;
    return ptx.front() <= arch;
}
// Returns true when at least one PTX entry is >= major.minor. Because `ptx`
// is sorted ascending by fromStr(), only its largest (last) element has to
// be examined.
bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const
{
    if (ptx.empty())
        return false;

    const int arch = major * 10 + minor;
    return ptx.back() >= arch;
}
// Returns true when at least one binary entry is >= major.minor. Because
// `bin` is sorted ascending by fromStr(), only its largest (last) element has
// to be examined.
bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const
{
    if (bin.empty())
        return false;

    const int arch = major * 10 + minor;
    return bin.back() >= arch;
}
void CudaArch::fromStr(const string& set_as_str, vector<int>& arr)
{ {
if (set_as_str.find_first_not_of(" ") == string::npos) if (set_as_str.find_first_not_of(" ") == string::npos)
return false; return;
std::stringstream stream(set_as_str); istringstream stream(set_as_str);
int cur_value; int cur_value;
while (!stream.eof()) while (!stream.eof())
{ {
stream >> cur_value; stream >> cur_value;
if (cmp(cur_value, value)) arr.push_back(cur_value);
return true;
} }
return false; sort(arr.begin(), arr.end());
} }
} }
bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set) bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set)
{ {
#if defined (HAVE_CUDA) #if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_FEATURES, feature_set, std::greater_equal<int>()); return cudaArch.builtWith(feature_set);
#else #else
(void)feature_set; (void)feature_set;
return false; return false;
...@@ -110,7 +165,7 @@ bool cv::gpu::TargetArchs::has(int major, int minor) ...@@ -110,7 +165,7 @@ bool cv::gpu::TargetArchs::has(int major, int minor)
bool cv::gpu::TargetArchs::hasPtx(int major, int minor) bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
{ {
#if defined (HAVE_CUDA) #if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::equal_to<int>()); return cudaArch.hasPtx(major, minor);
#else #else
(void)major; (void)major;
(void)minor; (void)minor;
...@@ -121,7 +176,7 @@ bool cv::gpu::TargetArchs::hasPtx(int major, int minor) ...@@ -121,7 +176,7 @@ bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
bool cv::gpu::TargetArchs::hasBin(int major, int minor) bool cv::gpu::TargetArchs::hasBin(int major, int minor)
{ {
#if defined (HAVE_CUDA) #if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor, std::equal_to<int>()); return cudaArch.hasBin(major, minor);
#else #else
(void)major; (void)major;
(void)minor; (void)minor;
...@@ -132,8 +187,7 @@ bool cv::gpu::TargetArchs::hasBin(int major, int minor) ...@@ -132,8 +187,7 @@ bool cv::gpu::TargetArchs::hasBin(int major, int minor)
bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
{ {
#if defined (HAVE_CUDA) #if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, return cudaArch.hasEqualOrLessPtx(major, minor);
std::less_equal<int>());
#else #else
(void)major; (void)major;
(void)minor; (void)minor;
...@@ -143,14 +197,13 @@ bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) ...@@ -143,14 +197,13 @@ bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor)
{ {
return hasEqualOrGreaterPtx(major, minor) || return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
hasEqualOrGreaterBin(major, minor);
} }
bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
{ {
#if defined (HAVE_CUDA) #if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::greater_equal<int>()); return cudaArch.hasEqualOrGreaterPtx(major, minor);
#else #else
(void)major; (void)major;
(void)minor; (void)minor;
...@@ -161,8 +214,7 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) ...@@ -161,8 +214,7 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
{ {
#if defined (HAVE_CUDA) #if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor, return cudaArch.hasEqualOrGreaterBin(major, minor);
std::greater_equal<int>());
#else #else
(void)major; (void)major;
(void)minor; (void)minor;
...@@ -170,6 +222,31 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) ...@@ -170,6 +222,31 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
#endif #endif
} }
// Reports whether `feature_set` is usable on the current device: the module
// must have been built with it (TargetArchs::builtWith) and the device's
// compute capability, encoded as major * 10 + minor, must be at least
// `feature_set`.
//
// The capability of the first few device ids is memoized in a function-local
// static table (-1 marks "not queried yet"); ids beyond the table are queried
// on every call.
bool cv::gpu::deviceSupports(FeatureSet feature_set)
{
    static int versions[] =
    {
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
    };
    static const int cache_size = static_cast<int>(sizeof(versions) / sizeof(versions[0]));

    const int devId = getDevice();
    const bool cacheable = devId < cache_size;

    // Start from the memoized value when there is a slot for this device.
    int version = cacheable ? versions[devId] : -1;

    if (version < 0)
    {
        DeviceInfo dev(devId);
        version = dev.majorVersion() * 10 + dev.minorVersion();

        if (cacheable)
            versions[devId] = version;
    }

    return TargetArchs::builtWith(feature_set) && (version >= feature_set);
}
#if !defined (HAVE_CUDA) #if !defined (HAVE_CUDA)
#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
......
This diff is collapsed.
This diff is collapsed.
...@@ -58,35 +58,47 @@ namespace cv { namespace gpu { namespace device ...@@ -58,35 +58,47 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v) template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
{ {
return (uchar) ::max((int)v, 0); uint res = 0;
int vi = v;
asm("cvt.sat.u8.s8 %0, %1;" : "=r"(res) : "r"(vi));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
{
uint res = 0;
asm("cvt.sat.u8.s16 %0, %1;" : "=r"(res) : "h"(v));
return res;
} }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v) template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
{ {
return (uchar) ::min((uint)v, (uint)UCHAR_MAX); uint res = 0;
asm("cvt.sat.u8.u16 %0, %1;" : "=r"(res) : "h"(v));
return res;
} }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v) template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
{ {
return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); uint res = 0;
asm("cvt.sat.u8.s32 %0, %1;" : "=r"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v) template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
{ {
return (uchar) ::min(v, (uint)UCHAR_MAX); uint res = 0;
asm("cvt.sat.u8.u32 %0, %1;" : "=r"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
{
return saturate_cast<uchar>((uint)v);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v) template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
{ {
int iv = __float2int_rn(v); uint res = 0;
return saturate_cast<uchar>(iv); asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(res) : "f"(v));
return res;
} }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v) template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
{ {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); uint res = 0;
return saturate_cast<uchar>(iv); asm("cvt.rni.sat.u8.f64 %0, %1;" : "=r"(res) : "d"(v));
return res;
#else #else
return saturate_cast<uchar>((float)v); return saturate_cast<uchar>((float)v);
#endif #endif
...@@ -94,35 +106,47 @@ namespace cv { namespace gpu { namespace device ...@@ -94,35 +106,47 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v) template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
{ {
return (schar) ::min((int)v, SCHAR_MAX); uint res = 0;
uint vi = v;
asm("cvt.sat.s8.u8 %0, %1;" : "=r"(res) : "r"(vi));
return res;
} }
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v) template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
{ {
return (schar) ::min((uint)v, (uint)SCHAR_MAX); uint res = 0;
asm("cvt.sat.s8.s16 %0, %1;" : "=r"(res) : "h"(v));
return res;
} }
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v) template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
{ {
return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); uint res = 0;
asm("cvt.sat.s8.u16 %0, %1;" : "=r"(res) : "h"(v));
return res;
} }
template<> __device__ __forceinline__ schar saturate_cast<schar>(short v) template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
{ {
return saturate_cast<schar>((int)v); uint res = 0;
asm("cvt.sat.s8.s32 %0, %1;" : "=r"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v) template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
{ {
return (schar) ::min(v, (uint)SCHAR_MAX); uint res = 0;
asm("cvt.sat.s8.u32 %0, %1;" : "=r"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ schar saturate_cast<schar>(float v) template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
{ {
int iv = __float2int_rn(v); uint res = 0;
return saturate_cast<schar>(iv); asm("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(res) : "f"(v));
return res;
} }
template<> __device__ __forceinline__ schar saturate_cast<schar>(double v) template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
{ {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); uint res = 0;
return saturate_cast<schar>(iv); asm("cvt.rni.sat.s8.f64 %0, %1;" : "=r"(res) : "d"(v));
return res;
#else #else
return saturate_cast<schar>((float)v); return saturate_cast<schar>((float)v);
#endif #endif
...@@ -130,30 +154,41 @@ namespace cv { namespace gpu { namespace device ...@@ -130,30 +154,41 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v) template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
{ {
return (ushort) ::max((int)v, 0); ushort res = 0;
int vi = v;
asm("cvt.sat.u16.s8 %0, %1;" : "=h"(res) : "r"(vi));
return res;
} }
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v) template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
{ {
return (ushort) ::max((int)v, 0); ushort res = 0;
asm("cvt.sat.u16.s16 %0, %1;" : "=h"(res) : "h"(v));
return res;
} }
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v) template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
{ {
return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); ushort res = 0;
asm("cvt.sat.u16.s32 %0, %1;" : "=h"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v) template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
{ {
return (ushort) ::min(v, (uint)USHRT_MAX); ushort res = 0;
asm("cvt.sat.u16.u32 %0, %1;" : "=h"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v) template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
{ {
int iv = __float2int_rn(v); ushort res = 0;
return saturate_cast<ushort>(iv); asm("cvt.rni.sat.u16.f32 %0, %1;" : "=h"(res) : "f"(v));
return res;
} }
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v) template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
{ {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); ushort res = 0;
return saturate_cast<ushort>(iv); asm("cvt.rni.sat.u16.f64 %0, %1;" : "=h"(res) : "d"(v));
return res;
#else #else
return saturate_cast<ushort>((float)v); return saturate_cast<ushort>((float)v);
#endif #endif
...@@ -161,31 +196,45 @@ namespace cv { namespace gpu { namespace device ...@@ -161,31 +196,45 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ short saturate_cast<short>(ushort v) template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
{ {
return (short) ::min((int)v, SHRT_MAX); short res = 0;
asm("cvt.sat.s16.u16 %0, %1;" : "=h"(res) : "h"(v));
return res;
} }
template<> __device__ __forceinline__ short saturate_cast<short>(int v) template<> __device__ __forceinline__ short saturate_cast<short>(int v)
{ {
return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); short res = 0;
asm("cvt.sat.s16.s32 %0, %1;" : "=h"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ short saturate_cast<short>(uint v) template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
{ {
return (short) ::min(v, (uint)SHRT_MAX); short res = 0;
asm("cvt.sat.s16.u32 %0, %1;" : "=h"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ short saturate_cast<short>(float v) template<> __device__ __forceinline__ short saturate_cast<short>(float v)
{ {
int iv = __float2int_rn(v); short res = 0;
return saturate_cast<short>(iv); asm("cvt.rni.sat.s16.f32 %0, %1;" : "=h"(res) : "f"(v));
return res;
} }
template<> __device__ __forceinline__ short saturate_cast<short>(double v) template<> __device__ __forceinline__ short saturate_cast<short>(double v)
{ {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); short res = 0;
return saturate_cast<short>(iv); asm("cvt.rni.sat.s16.f64 %0, %1;" : "=h"(res) : "d"(v));
return res;
#else #else
return saturate_cast<short>((float)v); return saturate_cast<short>((float)v);
#endif #endif
} }
// Specialization converting an unsigned 32-bit value to int with a single
// PTX `cvt` instruction. The `.sat` modifier makes the conversion saturating,
// so inputs above the signed 32-bit range are limited to its maximum instead
// of wrapping. Both operands use the "r" (32-bit register) constraint.
template<> __device__ __forceinline__ int saturate_cast<int>(uint v)
{
int res = 0;
asm("cvt.sat.s32.u32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ int saturate_cast<int>(float v) template<> __device__ __forceinline__ int saturate_cast<int>(float v)
{ {
return __float2int_rn(v); return __float2int_rn(v);
...@@ -199,6 +248,25 @@ namespace cv { namespace gpu { namespace device ...@@ -199,6 +248,25 @@ namespace cv { namespace gpu { namespace device
#endif #endif
} }
// Specialization converting a signed 8-bit value to uint with a single PTX
// `cvt` instruction. The `.sat` modifier makes the conversion saturating, so
// negative inputs are limited to 0 instead of wrapping.
template<> __device__ __forceinline__ uint saturate_cast<uint>(schar v)
{
uint res = 0;
// Widen to int first: the value is passed through a 32-bit "r" register
// operand, while the instruction itself reads it as .s8.
int vi = v;
asm("cvt.sat.u32.s8 %0, %1;" : "=r"(res) : "r"(vi));
return res;
}
// Specialization converting a signed 16-bit value to uint with a single PTX
// `cvt` instruction. The `.sat` modifier makes the conversion saturating, so
// negative inputs are limited to 0 instead of wrapping. The source is passed
// with the "h" (16-bit register) constraint, the result with "r" (32-bit).
template<> __device__ __forceinline__ uint saturate_cast<uint>(short v)
{
uint res = 0;
asm("cvt.sat.u32.s16 %0, %1;" : "=r"(res) : "h"(v));
return res;
}
// Specialization converting a signed 32-bit value to uint with a single PTX
// `cvt` instruction. The `.sat` modifier makes the conversion saturating, so
// negative inputs are limited to 0 instead of wrapping. Both operands use the
// "r" (32-bit register) constraint.
template<> __device__ __forceinline__ uint saturate_cast<uint>(int v)
{
uint res = 0;
asm("cvt.sat.u32.s32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(float v) template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
{ {
return __float2uint_rn(v); return __float2uint_rn(v);
......
...@@ -45,7 +45,6 @@ ...@@ -45,7 +45,6 @@
#include "saturate_cast.hpp" #include "saturate_cast.hpp"
#include "datamov_utils.hpp" #include "datamov_utils.hpp"
#include "detail/reduction_detail.hpp"
namespace cv { namespace gpu { namespace device namespace cv { namespace gpu { namespace device
{ {
...@@ -156,29 +155,6 @@ namespace cv { namespace gpu { namespace device ...@@ -156,29 +155,6 @@ namespace cv { namespace gpu { namespace device
} }
}; };
///////////////////////////////////////////////////////////////////////////////
// Reduction
template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
}
template <int n, typename T, typename V, typename Pred>
__device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
}
template <int n, typename T, typename V1, typename V2, typename Pred>
__device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
}
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Solve linear system // Solve linear system
......
...@@ -43,7 +43,7 @@ ...@@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__ #ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_HPP__ #define __OPENCV_GPU_VEC_DISTANCE_HPP__
#include "utility.hpp" #include "reduce.hpp"
#include "functional.hpp" #include "functional.hpp"
#include "detail/vec_distance_detail.hpp" #include "detail/vec_distance_detail.hpp"
...@@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device ...@@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid) template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{ {
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>()); reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
} }
__device__ __forceinline__ operator int() const __device__ __forceinline__ operator int() const
...@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device ...@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid) template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{ {
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>()); reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
} }
__device__ __forceinline__ operator float() const __device__ __forceinline__ operator float() const
...@@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device ...@@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid) template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{ {
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>()); reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
} }
__device__ __forceinline__ operator float() const __device__ __forceinline__ operator float() const
...@@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device ...@@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid) template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{ {
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>()); reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
} }
__device__ __forceinline__ operator int() const __device__ __forceinline__ operator int() const
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__
#define __OPENCV_GPU_WARP_SHUFFLE_HPP__
namespace cv { namespace gpu { namespace device
{
// Warp shuffle: returns the value of `val` held by lane `srcLane` of the
// calling thread's warp (or of its `width`-sized sub-group).
//
// val     - value contributed by the calling lane.
// srcLane - lane to read from.
// width   - logical group size handed to the intrinsic; defaults to warpSize.
//
// Requires compute capability 3.0+. When compiled for an older architecture
// (or in the host pass) the call is a stub that returns T().
//
// With CUDA 9 or newer the mask-less __shfl intrinsic is deprecated and is not
// available on Volta and later, so __shfl_sync with a full-warp mask is used
// instead. NOTE: the full mask means every lane of the warp is expected to
// execute this call.
template <typename T>
__device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
    #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
    return __shfl_sync(0xFFFFFFFFU, val, srcLane, width);
    #else
    return __shfl(val, srcLane, width);
    #endif
#else
    return T();
#endif
}
// Warp shuffle for unsigned int: the value is routed through the int variant
// of the intrinsic and cast back, returning the `val` held by lane `srcLane`.
//
// Requires compute capability 3.0+; on older targets (or in the host pass)
// the call is a stub that returns 0.
//
// With CUDA 9 or newer __shfl_sync with a full-warp mask replaces the
// deprecated mask-less __shfl (not available on Volta and later). NOTE: the
// full mask means every lane of the warp is expected to execute this call.
__device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
    #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
    return (unsigned int) __shfl_sync(0xFFFFFFFFU, (int) val, srcLane, width);
    #else
    return (unsigned int) __shfl((int) val, srcLane, width);
    #endif
#else
    return 0;
#endif
}
// Warp shuffle for double: the value is split into its low and high 32-bit
// halves, each half is shuffled separately from lane `srcLane`, and the two
// halves are recombined.
//
// Requires compute capability 3.0+; on older targets (or in the host pass)
// the call is a stub that returns 0.0.
//
// With CUDA 9 or newer __shfl_sync with a full-warp mask replaces the
// deprecated mask-less __shfl (not available on Volta and later). NOTE: the
// full mask means every lane of the warp is expected to execute this call.
__device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
    int lo = __double2loint(val);
    int hi = __double2hiint(val);

    #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
    lo = __shfl_sync(0xFFFFFFFFU, lo, srcLane, width);
    hi = __shfl_sync(0xFFFFFFFFU, hi, srcLane, width);
    #else
    lo = __shfl(lo, srcLane, width);
    hi = __shfl(hi, srcLane, width);
    #endif

    return __hiloint2double(hi, lo);
#else
    return 0.0;
#endif
}
// Warp shuffle-down: returns the value of `val` held by the lane whose id is
// `delta` higher than the caller's, within a `width`-sized group.
//
// val   - value contributed by the calling lane.
// delta - lane offset passed to the intrinsic.
// width - logical group size handed to the intrinsic; defaults to warpSize.
//
// Requires compute capability 3.0+. When compiled for an older architecture
// (or in the host pass) the call is a stub that returns T().
//
// With CUDA 9 or newer __shfl_down_sync with a full-warp mask replaces the
// deprecated mask-less __shfl_down (not available on Volta and later). NOTE:
// the full mask means every lane of the warp is expected to execute this call.
template <typename T>
__device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
    #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
    return __shfl_down_sync(0xFFFFFFFFU, val, delta, width);
    #else
    return __shfl_down(val, delta, width);
    #endif
#else
    return T();
#endif
}
// Warp shuffle-down for unsigned int: the value is routed through the int
// variant of the intrinsic and cast back.
//
// Requires compute capability 3.0+; on older targets (or in the host pass)
// the call is a stub that returns 0.
//
// With CUDA 9 or newer __shfl_down_sync with a full-warp mask replaces the
// deprecated mask-less __shfl_down (not available on Volta and later). NOTE:
// the full mask means every lane of the warp is expected to execute this call.
__device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
    #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
    return (unsigned int) __shfl_down_sync(0xFFFFFFFFU, (int) val, delta, width);
    #else
    return (unsigned int) __shfl_down((int) val, delta, width);
    #endif
#else
    return 0;
#endif
}
// Warp shuffle-down for double: the value is split into its low and high
// 32-bit halves, each half is shuffled down by `delta` lanes separately, and
// the two halves are recombined.
//
// Requires compute capability 3.0+; on older targets (or in the host pass)
// the call is a stub that returns 0.0.
//
// With CUDA 9 or newer __shfl_down_sync with a full-warp mask replaces the
// deprecated mask-less __shfl_down (not available on Volta and later). NOTE:
// the full mask means every lane of the warp is expected to execute this call.
__device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
    int lo = __double2loint(val);
    int hi = __double2hiint(val);

    #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
    lo = __shfl_down_sync(0xFFFFFFFFU, lo, delta, width);
    hi = __shfl_down_sync(0xFFFFFFFFU, hi, delta, width);
    #else
    lo = __shfl_down(lo, delta, width);
    hi = __shfl_down(hi, delta, width);
    #endif

    return __hiloint2double(hi, lo);
#else
    return 0.0;
#endif
}
// Returns the copy of `val` held by the lane `delta` positions below the
// caller, using the shuffle-up instruction.
//
//   val   - value contributed by the calling lane
//   delta - lane offset forwarded to the intrinsic
//   width - segment width forwarded to the intrinsic (defaults to warpSize)
//
// The shuffle instruction needs compute capability 3.0+. When compiling for an
// older target (or in the host pass, where __CUDA_ARCH__ is undefined) a
// value-initialized T is returned instead.
//
// With CUDA 9.0 or newer the *_sync intrinsic is used, because the legacy
// mask-less __shfl_up is deprecated there and is not available when targeting
// compute capability 7.x and higher. A full-warp mask is passed, so every lane
// of the warp is expected to execute this call.
template <typename T>
__device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#  if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
    return __shfl_up_sync(0xFFFFFFFFU, val, delta, width);
#  else
    return __shfl_up(val, delta, width);
#  endif
#else
    return T();
#endif
}
// unsigned int overload of shfl_up: returns the copy of `val` held by the lane
// `delta` positions below the caller. The value is passed through the int form
// of the intrinsic and cast back, so the bit pattern is preserved.
//
// Returns 0 when compiling for a target below compute capability 3.0 (or in
// the host pass).
//
// With CUDA 9.0 or newer the *_sync intrinsic is used with a full-warp mask
// (the legacy mask-less form is deprecated and unavailable for compute
// capability 7.x+); every lane of the warp is expected to execute this call.
__device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#  if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
    return (unsigned int) __shfl_up_sync(0xFFFFFFFFU, (int) val, delta, width);
#  else
    return (unsigned int) __shfl_up((int) val, delta, width);
#  endif
#else
    return 0;
#endif
}
// double overload of shfl_up: returns the copy of `val` held by the lane
// `delta` positions below the caller.
//
// The 64-bit value is split into its low and high 32-bit words, each word is
// shuffled separately as an int, and the two results are reassembled into a
// double.
//
// Returns 0.0 when compiling for a target below compute capability 3.0 (or in
// the host pass).
//
// With CUDA 9.0 or newer the *_sync intrinsic is used with a full-warp mask
// (the legacy mask-less form is deprecated and unavailable for compute
// capability 7.x+); every lane of the warp is expected to execute this call.
__device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
    int lo = __double2loint(val);
    int hi = __double2hiint(val);

#  if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
    lo = __shfl_up_sync(0xFFFFFFFFU, lo, delta, width);
    hi = __shfl_up_sync(0xFFFFFFFFU, hi, delta, width);
#  else
    lo = __shfl_up(lo, delta, width);
    hi = __shfl_up(hi, delta, width);
#  endif

    return __hiloint2double(hi, lo);
#else
    return 0.0;
#endif
}
}}}
#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__
...@@ -68,11 +68,16 @@ void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, ...@@ -68,11 +68,16 @@ void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool,
void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream) void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream)
{ {
#ifndef HAVE_CUBLAS #ifndef HAVE_CUBLAS
(void)src1; (void)src2; (void)alpha; (void)src3; (void)beta; (void)dst; (void)flags; (void)stream; (void)src1;
(void)src2;
(void)alpha;
(void)src3;
(void)beta;
(void)dst;
(void)flags;
(void)stream;
CV_Error(CV_StsNotImplemented, "The library was build without CUBLAS"); CV_Error(CV_StsNotImplemented, "The library was build without CUBLAS");
#else #else
// CUBLAS works with column-major matrices // CUBLAS works with column-major matrices
CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2); CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
...@@ -80,7 +85,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G ...@@ -80,7 +85,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
if (src1.depth() == CV_64F) if (src1.depth() == CV_64F)
{ {
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
} }
...@@ -188,7 +193,6 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G ...@@ -188,7 +193,6 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
} }
cublasSafeCall( cublasDestroy_v2(handle) ); cublasSafeCall( cublasDestroy_v2(handle) );
#endif #endif
} }
...@@ -227,7 +231,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s) ...@@ -227,7 +231,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
} }
else // if (src.elemSize() == 8) else // if (src.elemSize() == 8)
{ {
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
NppStStreamHandler h(stream); NppStStreamHandler h(stream);
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -42,9 +42,10 @@ ...@@ -42,9 +42,10 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include "internal_shared.hpp" #include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp" #include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/reduce.hpp"
namespace cv { namespace gpu { namespace device namespace cv { namespace gpu { namespace device
{ {
...@@ -66,6 +67,8 @@ namespace cv { namespace gpu { namespace device ...@@ -66,6 +67,8 @@ namespace cv { namespace gpu { namespace device
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y, crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z); crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
} }
__device__ __forceinline__ TransformOp() {}
__device__ __forceinline__ TransformOp(const TransformOp&) {}
}; };
void call(const PtrStepSz<float3> src, const float* rot, void call(const PtrStepSz<float3> src, const float* rot,
...@@ -103,6 +106,8 @@ namespace cv { namespace gpu { namespace device ...@@ -103,6 +106,8 @@ namespace cv { namespace gpu { namespace device
(cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z, (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
(cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z); (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
} }
__device__ __forceinline__ ProjectOp() {}
__device__ __forceinline__ ProjectOp(const ProjectOp&) {}
}; };
void call(const PtrStepSz<float3> src, const float* rot, void call(const PtrStepSz<float3> src, const float* rot,
...@@ -134,6 +139,7 @@ namespace cv { namespace gpu { namespace device ...@@ -134,6 +139,7 @@ namespace cv { namespace gpu { namespace device
return x * x; return x * x;
} }
template <int BLOCK_SIZE>
__global__ void computeHypothesisScoresKernel( __global__ void computeHypothesisScoresKernel(
const int num_points, const float3* object, const float2* image, const int num_points, const float3* object, const float2* image,
const float dist_threshold, int* g_num_inliers) const float dist_threshold, int* g_num_inliers)
...@@ -156,19 +162,11 @@ namespace cv { namespace gpu { namespace device ...@@ -156,19 +162,11 @@ namespace cv { namespace gpu { namespace device
++num_inliers; ++num_inliers;
} }
extern __shared__ float s_num_inliers[]; __shared__ int s_num_inliers[BLOCK_SIZE];
s_num_inliers[threadIdx.x] = num_inliers; reduce<BLOCK_SIZE>(s_num_inliers, num_inliers, threadIdx.x, plus<int>());
__syncthreads();
for (int step = blockDim.x / 2; step > 0; step >>= 1)
{
if (threadIdx.x < step)
s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
__syncthreads();
}
if (threadIdx.x == 0) if (threadIdx.x == 0)
g_num_inliers[blockIdx.x] = s_num_inliers[0]; g_num_inliers[blockIdx.x] = num_inliers;
} }
void computeHypothesisScores( void computeHypothesisScores(
...@@ -181,9 +179,8 @@ namespace cv { namespace gpu { namespace device ...@@ -181,9 +179,8 @@ namespace cv { namespace gpu { namespace device
dim3 threads(256); dim3 threads(256);
dim3 grid(num_hypotheses); dim3 grid(num_hypotheses);
int smem_size = threads.x * sizeof(float);
computeHypothesisScoresKernel<<<grid, threads, smem_size>>>( computeHypothesisScoresKernel<256><<<grid, threads>>>(
num_points, object, image, dist_threshold, hypothesis_scores); num_points, object, image, dist_threshold, hypothesis_scores);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
......
This diff is collapsed.
This diff is collapsed.
...@@ -46,6 +46,8 @@ ...@@ -46,6 +46,8 @@
#include "opencv2/gpu/device/vec_math.hpp" #include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/utility.hpp" #include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "fgd_bgfg_common.hpp" #include "fgd_bgfg_common.hpp"
using namespace cv::gpu; using namespace cv::gpu;
...@@ -181,57 +183,8 @@ namespace bgfg ...@@ -181,57 +183,8 @@ namespace bgfg
__shared__ unsigned int data1[MERGE_THREADBLOCK_SIZE]; __shared__ unsigned int data1[MERGE_THREADBLOCK_SIZE];
__shared__ unsigned int data2[MERGE_THREADBLOCK_SIZE]; __shared__ unsigned int data2[MERGE_THREADBLOCK_SIZE];
data0[threadIdx.x] = sum0; plus<unsigned int> op;
data1[threadIdx.x] = sum1; reduce<MERGE_THREADBLOCK_SIZE>(smem_tuple(data0, data1, data2), thrust::tie(sum0, sum1, sum2), threadIdx.x, thrust::make_tuple(op, op, op));
data2[threadIdx.x] = sum2;
__syncthreads();
if (threadIdx.x < 128)
{
data0[threadIdx.x] = sum0 += data0[threadIdx.x + 128];
data1[threadIdx.x] = sum1 += data1[threadIdx.x + 128];
data2[threadIdx.x] = sum2 += data2[threadIdx.x + 128];
}
__syncthreads();
if (threadIdx.x < 64)
{
data0[threadIdx.x] = sum0 += data0[threadIdx.x + 64];
data1[threadIdx.x] = sum1 += data1[threadIdx.x + 64];
data2[threadIdx.x] = sum2 += data2[threadIdx.x + 64];
}
__syncthreads();
if (threadIdx.x < 32)
{
volatile unsigned int* vdata0 = data0;
volatile unsigned int* vdata1 = data1;
volatile unsigned int* vdata2 = data2;
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 32];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 32];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 32];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 16];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 16];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 16];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 8];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 8];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 8];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 4];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 4];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 4];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 2];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 2];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 2];
vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 1];
vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 1];
vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 1];
}
if(threadIdx.x == 0) if(threadIdx.x == 0)
{ {
...@@ -245,9 +198,9 @@ namespace bgfg ...@@ -245,9 +198,9 @@ namespace bgfg
void calcDiffHistogram_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, void calcDiffHistogram_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame,
unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
int cc, cudaStream_t stream) bool cc20, cudaStream_t stream)
{ {
const int HISTOGRAM_WARP_COUNT = cc < 20 ? 4 : 6; const int HISTOGRAM_WARP_COUNT = cc20 ? 6 : 4;
const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE; const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;
calcPartialHistogram<PT, CT><<<PARTIAL_HISTOGRAM_COUNT, HISTOGRAM_THREADBLOCK_SIZE, 0, stream>>>( calcPartialHistogram<PT, CT><<<PARTIAL_HISTOGRAM_COUNT, HISTOGRAM_THREADBLOCK_SIZE, 0, stream>>>(
...@@ -261,10 +214,10 @@ namespace bgfg ...@@ -261,10 +214,10 @@ namespace bgfg
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream); template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream); template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream); template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream); template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
// calcDiffThreshMask // calcDiffThreshMask
......
...@@ -125,7 +125,7 @@ namespace bgfg ...@@ -125,7 +125,7 @@ namespace bgfg
void calcDiffHistogram_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, void calcDiffHistogram_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
int cc, cudaStream_t stream); bool cc20, cudaStream_t stream);
template <typename PT, typename CT> template <typename PT, typename CT>
void calcDiffThreshMask_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, uchar3 bestThres, cv::gpu::PtrStepSzb changeMask, cudaStream_t stream); void calcDiffThreshMask_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, uchar3 bestThres, cv::gpu::PtrStepSzb changeMask, cudaStream_t stream);
......
...@@ -43,12 +43,10 @@ ...@@ -43,12 +43,10 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include "thrust/device_ptr.h" #include <thrust/device_ptr.h>
#include "thrust/remove.h" #include <thrust/remove.h>
#include "thrust/functional.h" #include <thrust/functional.h>
#include "internal_shared.hpp" #include "opencv2/gpu/device/common.hpp"
using namespace thrust;
namespace cv { namespace gpu { namespace device { namespace globmotion { namespace cv { namespace gpu { namespace device { namespace globmotion {
...@@ -61,10 +59,10 @@ int compactPoints(int N, float *points0, float *points1, const uchar *mask) ...@@ -61,10 +59,10 @@ int compactPoints(int N, float *points0, float *points1, const uchar *mask)
thrust::device_ptr<float2> dpoints1((float2*)points1); thrust::device_ptr<float2> dpoints1((float2*)points1);
thrust::device_ptr<const uchar> dmask(mask); thrust::device_ptr<const uchar> dmask(mask);
return thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple(dpoints0, dpoints1)), return (int)(thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple(dpoints0, dpoints1)),
thrust::make_zip_iterator(thrust::make_tuple(dpoints0 + N, dpoints1 + N)), thrust::make_zip_iterator(thrust::make_tuple(dpoints0 + N, dpoints1 + N)),
dmask, thrust::not1(thrust::identity<uchar>())) dmask, thrust::not1(thrust::identity<uchar>()))
- make_zip_iterator(make_tuple(dpoints0, dpoints1)); - thrust::make_zip_iterator(make_tuple(dpoints0, dpoints1)));
} }
......
...@@ -43,182 +43,112 @@ ...@@ -43,182 +43,112 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include "internal_shared.hpp" #include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp" #include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/transform.hpp"
namespace cv { namespace gpu { namespace device using namespace cv::gpu;
{ using namespace cv::gpu::device;
#define UINT_BITS 32U
//Warps == subhistograms per threadblock
#define WARP_COUNT 6
//Threadblock size
#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
#define HISTOGRAM256_BIN_COUNT 256
//Shared memory per threadblock
#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
#define PARTIAL_HISTOGRAM256_COUNT 240
#define MERGE_THREADBLOCK_SIZE 256
#define USE_SMEM_ATOMICS (defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120)) namespace hist
{
namespace hist __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
{ {
#if (!USE_SMEM_ATOMICS) __shared__ int shist[256];
#define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )
__forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag) const int y = blockIdx.x * blockDim.y + threadIdx.y;
{ const int tid = threadIdx.y * blockDim.x + threadIdx.x;
uint count;
do
{
count = s_WarpHist[data] & TAG_MASK;
count = threadTag | (count + 1);
s_WarpHist[data] = count;
} while (s_WarpHist[data] != count);
}
#else shist[tid] = 0;
__syncthreads();
#define TAG_MASK 0xFFFFFFFFU
__forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag) if (y < rows)
{ {
atomicAdd(s_WarpHist + data, 1); const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
}
#endif const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
{ {
uint x = pos_x << 2; unsigned int data = rowPtr[x];
if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag); Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag); Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag); Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag); Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
} }
__global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols) if (cols % 4 != 0 && threadIdx.x == 0)
{ {
//Per-warp subhistogram storage for (int x = cols_4 * 4; x < cols; ++x)
__shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
//Clear shared memory storage for current threadblock before processing
#pragma unroll
for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;
//Cycle through the entire data set, update subhistograms for each warp
const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);
__syncthreads();
const uint colsui = d_Data.step / sizeof(uint);
for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)
{ {
uint pos_y = pos / colsui; unsigned int data = ((const uchar*)rowPtr)[x];
uint pos_x = pos % colsui; Emulation::smem::atomicAdd(&shist[data], 1);
uint data = d_Data.ptr(pos_y)[pos_x];
addWord(s_WarpHist, data, tag, pos_x, cols);
} }
//Merge per-warp histograms into per-block and write to global memory
__syncthreads();
for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)
{
uint sum = 0;
for (uint i = 0; i < WARP_COUNT; i++)
sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;
d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
} }
} }
////////////////////////////////////////////////////////////////////////////////
// Merge histogram256() output
// Run one threadblock per bin; each threadblock adds up the same bin counter
// from every partial histogram. Reads are uncoalesced, but mergeHistogram256
// takes only a fraction of total processing time
////////////////////////////////////////////////////////////////////////////////
__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
{
uint sum = 0;
#pragma unroll
for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)
sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];
__shared__ uint data[MERGE_THREADBLOCK_SIZE];
data[threadIdx.x] = sum;
for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)
{
__syncthreads(); __syncthreads();
if(threadIdx.x < stride)
data[threadIdx.x] += data[threadIdx.x + stride];
}
if(threadIdx.x == 0) const int histVal = shist[tid];
d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]); if (histVal > 0)
::atomicAdd(hist + tid, histVal);
} }
void histogram256_gpu(PtrStepSzb src, int* hist, uint* buf, cudaStream_t stream) void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
{ {
histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>( const dim3 block(32, 8);
PtrStepSz<uint>(src), const dim3 grid(divUp(src.rows, block.y));
buf,
static_cast<uint>(src.rows * src.step / sizeof(uint)),
src.cols);
cudaSafeCall( cudaGetLastError() );
mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);
histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
}
/////////////////////////////////////////////////////////////////////////
namespace hist
{
__constant__ int c_lut[256]; __constant__ int c_lut[256];
__global__ void equalizeHist(const PtrStepSzb src, PtrStepb dst) struct EqualizeHist : unary_function<uchar, uchar>
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; float scale;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
__host__ EqualizeHist(float _scale) : scale(_scale) {}
if (x < src.cols && y < src.rows) __device__ __forceinline__ uchar operator ()(uchar val) const
{ {
const uchar val = src.ptr(y)[x];
const int lut = c_lut[val]; const int lut = c_lut[val];
dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut); return __float2int_rn(scale * lut);
}
} }
};
}
void equalizeHist_gpu(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream) namespace cv { namespace gpu { namespace device
{
template <> struct TransformFunctorTraits<hist::EqualizeHist> : DefaultTransformFunctorTraits<hist::EqualizeHist>
{ {
dim3 block(16, 16); enum { smart_shift = 4 };
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); };
}}}
namespace hist
{
void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
{
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) ); cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) );
equalizeHist<<<grid, block, 0, stream>>>(src, dst); const float scale = 255.0f / (src.cols * src.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0) transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
cudaSafeCall( cudaDeviceSynchronize() );
} }
} // namespace hist }
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */ #endif /* CUDA_DISABLER */
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -47,10 +47,11 @@ ...@@ -47,10 +47,11 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include <thrust/device_ptr.h>
#include <thrust/sort.h> #include <thrust/sort.h>
#include "opencv2/gpu/device/common.hpp" #include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp" #include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp" #include "opencv2/gpu/device/functional.hpp"
namespace cv { namespace gpu { namespace device namespace cv { namespace gpu { namespace device
...@@ -75,9 +76,9 @@ namespace cv { namespace gpu { namespace device ...@@ -75,9 +76,9 @@ namespace cv { namespace gpu { namespace device
__global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k) __global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k)
{ {
__shared__ int smem[8 * 32]; __shared__ int smem0[8 * 32];
__shared__ int smem1[8 * 32];
volatile int* srow = smem + threadIdx.y * blockDim.x; __shared__ int smem2[8 * 32];
const int ptidx = blockIdx.x * blockDim.y + threadIdx.y; const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
...@@ -109,9 +110,12 @@ namespace cv { namespace gpu { namespace device ...@@ -109,9 +110,12 @@ namespace cv { namespace gpu { namespace device
c += Ix * Iy; c += Ix * Iy;
} }
reduce<32>(srow, a, threadIdx.x, plus<volatile int>()); int* srow0 = smem0 + threadIdx.y * blockDim.x;
reduce<32>(srow, b, threadIdx.x, plus<volatile int>()); int* srow1 = smem1 + threadIdx.y * blockDim.x;
reduce<32>(srow, c, threadIdx.x, plus<volatile int>()); int* srow2 = smem2 + threadIdx.y * blockDim.x;
plus<int> op;
reduce<32>(smem_tuple(srow0, srow1, srow2), thrust::tie(a, b, c), threadIdx.x, thrust::make_tuple(op, op, op));
if (threadIdx.x == 0) if (threadIdx.x == 0)
{ {
...@@ -151,9 +155,13 @@ namespace cv { namespace gpu { namespace device ...@@ -151,9 +155,13 @@ namespace cv { namespace gpu { namespace device
__global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k) __global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
{ {
__shared__ int smem[8 * 32]; __shared__ int smem0[8 * 32];
__shared__ int smem1[8 * 32];
int* srow0 = smem0 + threadIdx.y * blockDim.x;
int* srow1 = smem1 + threadIdx.y * blockDim.x;
volatile int* srow = smem + threadIdx.y * blockDim.x; plus<int> op;
const int ptidx = blockIdx.x * blockDim.y + threadIdx.y; const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
...@@ -167,7 +175,7 @@ namespace cv { namespace gpu { namespace device ...@@ -167,7 +175,7 @@ namespace cv { namespace gpu { namespace device
for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x) for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
m_10 += u * image(loc.y, loc.x + u); m_10 += u * image(loc.y, loc.x + u);
reduce<32>(srow, m_10, threadIdx.x, plus<volatile int>()); reduce<32>(srow0, m_10, threadIdx.x, op);
for (int v = 1; v <= half_k; ++v) for (int v = 1; v <= half_k; ++v)
{ {
...@@ -185,8 +193,7 @@ namespace cv { namespace gpu { namespace device ...@@ -185,8 +193,7 @@ namespace cv { namespace gpu { namespace device
m_sum += u * (val_plus + val_minus); m_sum += u * (val_plus + val_minus);
} }
reduce<32>(srow, v_sum, threadIdx.x, plus<volatile int>()); reduce<32>(smem_tuple(srow0, srow1), thrust::tie(v_sum, m_sum), threadIdx.x, thrust::make_tuple(op, op));
reduce<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
m_10 += m_sum; m_10 += m_sum;
m_01 += v * v_sum; m_01 += v * v_sum;
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -294,9 +294,8 @@ void cv::gpu::HoughCircles(const GpuMat& src, GpuMat& circles, HoughCirclesBuf& ...@@ -294,9 +294,8 @@ void cv::gpu::HoughCircles(const GpuMat& src, GpuMat& circles, HoughCirclesBuf&
ensureSizeIsEnough(1, maxCircles, CV_32FC3, circles); ensureSizeIsEnough(1, maxCircles, CV_32FC3, circles);
DeviceInfo devInfo;
const int circlesCount = circlesAccumRadius_gpu(centers, centersCount, srcPoints, pointsCount, circles.ptr<float3>(), maxCircles, const int circlesCount = circlesAccumRadius_gpu(centers, centersCount, srcPoints, pointsCount, circles.ptr<float3>(), maxCircles,
dp, minRadius, maxRadius, votesThreshold, devInfo.supports(FEATURE_SET_COMPUTE_20)); dp, minRadius, maxRadius, votesThreshold, deviceSupports(FEATURE_SET_COMPUTE_20));
if (circlesCount > 0) if (circlesCount > 0)
circles.cols = circlesCount; circles.cols = circlesCount;
...@@ -531,7 +530,7 @@ namespace ...@@ -531,7 +530,7 @@ namespace
const func_t func = funcs[dx.depth()]; const func_t func = funcs[dx.depth()];
CV_Assert(func != 0); CV_Assert(func != 0);
edgePointList.cols = edgePointList.step / sizeof(int); edgePointList.cols = (int) (edgePointList.step / sizeof(int));
ensureSizeIsEnough(2, edges.size().area(), CV_32SC1, edgePointList); ensureSizeIsEnough(2, edges.size().area(), CV_32SC1, edgePointList);
edgePointList.cols = func(edges, dx, dy, edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1)); edgePointList.cols = func(edges, dx, dy, edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1));
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment