Commit 54337fd5 authored by Anton Obukhov's avatar Anton Obukhov

[+] CUDA path for NCVImagePyramid

parent 4136855b
......@@ -331,6 +331,7 @@ enum
NCV_HAAR_XML_LOADING_EXCEPTION,
NCV_NOIMPL_HAAR_TILTED_FEATURES,
NCV_NOT_IMPLEMENTED,
NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW,
......
......@@ -45,6 +45,15 @@
#include "NCV.hpp"
// Exchange the contents of two objects of the same type.
// T only needs to be copy-constructible and copy-assignable — this predates
// move semantics, matching the rest of the NCV code base.
template <class T>
static void swap(T &p1, T &p2)
{
    const T held = p1;
    p1 = p2;
    p2 = held;
}
template<typename T>
static T divUp(T a, T b)
{
......
......@@ -46,25 +46,25 @@
#include <float.h>
#include "NCV.hpp"
// Per-channel-type maximum representable pixel value (host-only versions).
// NOTE(review): these lines are the PRE-change side of this diff; they are
// superseded by the __host__ __device__ variants added below. Also note that
// `static` on an explicit specialization is a storage-class specifier on a
// specialization, which strict compilers (GCC/Clang) reject — confirm against
// the target toolchain.
template<typename TBase> inline TBase _pixMaxVal();
template<> static inline Ncv8u _pixMaxVal<Ncv8u>() {return UCHAR_MAX;}
template<> static inline Ncv16u _pixMaxVal<Ncv16u>() {return USHRT_MAX;}
template<> static inline Ncv32u _pixMaxVal<Ncv32u>() {return UINT_MAX;}
template<> static inline Ncv8s _pixMaxVal<Ncv8s>() {return CHAR_MAX;}
template<> static inline Ncv16s _pixMaxVal<Ncv16s>() {return SHRT_MAX;}
template<> static inline Ncv32s _pixMaxVal<Ncv32s>() {return INT_MAX;}
template<> static inline Ncv32f _pixMaxVal<Ncv32f>() {return FLT_MAX;}
template<> static inline Ncv64f _pixMaxVal<Ncv64f>() {return DBL_MAX;}
// Per-channel-type minimum representable pixel value (host-only versions).
// NOTE(review): FLT_MIN/DBL_MIN are the smallest POSITIVE normalized values,
// not the most negative floats (-FLT_MAX/-DBL_MAX) — verify the clamp and
// initialization call sites actually want this.
template<typename TBase> inline TBase _pixMinVal();
template<> static inline Ncv8u _pixMinVal<Ncv8u>() {return 0;}
template<> static inline Ncv16u _pixMinVal<Ncv16u>() {return 0;}
template<> static inline Ncv32u _pixMinVal<Ncv32u>() {return 0;}
template<> static inline Ncv8s _pixMinVal<Ncv8s>() {return CHAR_MIN;}
template<> static inline Ncv16s _pixMinVal<Ncv16s>() {return SHRT_MIN;}
template<> static inline Ncv32s _pixMinVal<Ncv32s>() {return INT_MIN;}
template<> static inline Ncv32f _pixMinVal<Ncv32f>() {return FLT_MIN;}
template<> static inline Ncv64f _pixMinVal<Ncv64f>() {return DBL_MIN;}
// Per-channel-type maximum representable pixel value, callable from both host
// and device code.
// Fix: dropped `static` from the explicit specializations — a storage-class
// specifier on an explicit template specialization is ill-formed; GCC/Clang
// reject it with "explicit template specialization cannot have a storage
// class". `inline` alone provides the intended header-safe linkage.
template<typename TBase> inline __host__ __device__ TBase _pixMaxVal();
template<> inline __host__ __device__ Ncv8u _pixMaxVal<Ncv8u>() {return UCHAR_MAX;}
template<> inline __host__ __device__ Ncv16u _pixMaxVal<Ncv16u>() {return USHRT_MAX;}
template<> inline __host__ __device__ Ncv32u _pixMaxVal<Ncv32u>() {return UINT_MAX;}
template<> inline __host__ __device__ Ncv8s _pixMaxVal<Ncv8s>() {return CHAR_MAX;}
template<> inline __host__ __device__ Ncv16s _pixMaxVal<Ncv16s>() {return SHRT_MAX;}
template<> inline __host__ __device__ Ncv32s _pixMaxVal<Ncv32s>() {return INT_MAX;}
template<> inline __host__ __device__ Ncv32f _pixMaxVal<Ncv32f>() {return FLT_MAX;}
template<> inline __host__ __device__ Ncv64f _pixMaxVal<Ncv64f>() {return DBL_MAX;}
// Per-channel-type minimum representable pixel value, callable from both host
// and device code.
template<typename TBase> inline __host__ __device__ TBase _pixMinVal();
template<> inline __host__ __device__ Ncv8u _pixMinVal<Ncv8u>() {return 0;}
template<> inline __host__ __device__ Ncv16u _pixMinVal<Ncv16u>() {return 0;}
template<> inline __host__ __device__ Ncv32u _pixMinVal<Ncv32u>() {return 0;}
template<> inline __host__ __device__ Ncv8s _pixMinVal<Ncv8s>() {return CHAR_MIN;}
template<> inline __host__ __device__ Ncv16s _pixMinVal<Ncv16s>() {return SHRT_MIN;}
template<> inline __host__ __device__ Ncv32s _pixMinVal<Ncv32s>() {return INT_MIN;}
// NOTE(review): FLT_MIN/DBL_MIN are the smallest POSITIVE normalized values,
// not the most negative floats (-FLT_MAX/-DBL_MAX). Preserved as-is to keep
// behavior identical — confirm against the clamp/initialization call sites.
template<> inline __host__ __device__ Ncv32f _pixMinVal<Ncv32f>() {return FLT_MIN;}
template<> inline __host__ __device__ Ncv64f _pixMinVal<Ncv64f>() {return DBL_MIN;}
template<typename Tvec> struct TConvVec2Base;
template<> struct TConvVec2Base<uchar1> {typedef Ncv8u TBase;};
......@@ -103,55 +103,55 @@ template<> struct TConvBase2Vec<Ncv64f, 3> {typedef double3 TVec;};
template<> struct TConvBase2Vec<Ncv64f, 4> {typedef double4 TVec;};
//TODO: consider using CUDA intrinsics to avoid branching
// _TDemoteClampZ: demote a wider arithmetic value `a` into the narrower
// channel type `out`, clamping to the target range and truncating via a plain
// cast (no rounding). Relies on the CLAMP_0_255 / CLAMP macros from NCV.hpp.
// The Ncv32f overload is a pure cast with no clamping.
// NOTE(review): the first four overloads are the PRE-change side of this diff,
// superseded by the __host__ __device__ variants that follow.
template<typename Tin> static inline void _TDemoteClampZ(Tin &a, Ncv8u &out) {out = (Ncv8u)CLAMP_0_255(a);};
template<typename Tin> static inline void _TDemoteClampZ(Tin &a, Ncv16u &out) {out = (Ncv16u)CLAMP(a, 0, USHRT_MAX);}
template<typename Tin> static inline void _TDemoteClampZ(Tin &a, Ncv32u &out) {out = (Ncv32u)CLAMP(a, 0, UINT_MAX);}
template<typename Tin> static inline void _TDemoteClampZ(Tin &a, Ncv32f &out) {out = (Ncv32f)a;}
// Current versions: identical logic, additionally callable from device code.
template<typename Tin> static inline __host__ __device__ void _TDemoteClampZ(Tin &a, Ncv8u &out) {out = (Ncv8u)CLAMP_0_255(a);};
template<typename Tin> static inline __host__ __device__ void _TDemoteClampZ(Tin &a, Ncv16u &out) {out = (Ncv16u)CLAMP(a, 0, USHRT_MAX);}
template<typename Tin> static inline __host__ __device__ void _TDemoteClampZ(Tin &a, Ncv32u &out) {out = (Ncv32u)CLAMP(a, 0, UINT_MAX);}
template<typename Tin> static inline __host__ __device__ void _TDemoteClampZ(Tin &a, Ncv32f &out) {out = (Ncv32f)a;}
//TODO: consider using CUDA intrinsics to avoid branching
// _TDemoteClampNN: like _TDemoteClampZ, but rounds to nearest (the "+0.5f"
// before the truncating cast) instead of truncating. The Ncv32f overload is a
// pure cast with neither rounding nor clamping.
// NOTE(review): "+0.5f" nearest rounding is only correct for non-negative
// values, which the unsigned targets imply after clamping.
// NOTE(review): the first four overloads are the PRE-change side of this diff,
// superseded by the __host__ __device__ variants that follow.
template<typename Tin> static inline void _TDemoteClampNN(Tin &a, Ncv8u &out) {out = (Ncv8u)CLAMP_0_255(a+0.5f);}
template<typename Tin> static inline void _TDemoteClampNN(Tin &a, Ncv16u &out) {out = (Ncv16u)CLAMP(a+0.5f, 0, USHRT_MAX);}
template<typename Tin> static inline void _TDemoteClampNN(Tin &a, Ncv32u &out) {out = (Ncv32u)CLAMP(a+0.5f, 0, UINT_MAX);}
template<typename Tin> static inline void _TDemoteClampNN(Tin &a, Ncv32f &out) {out = (Ncv32f)a;}
// Current versions: identical logic, additionally callable from device code.
template<typename Tin> static inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv8u &out) {out = (Ncv8u)CLAMP_0_255(a+0.5f);}
template<typename Tin> static inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv16u &out) {out = (Ncv16u)CLAMP(a+0.5f, 0, USHRT_MAX);}
template<typename Tin> static inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv32u &out) {out = (Ncv32u)CLAMP(a+0.5f, 0, UINT_MAX);}
template<typename Tin> static inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv32f &out) {out = (Ncv32f)a;}
// _pixMakeZero: construct a zero-valued pixel of the given CUDA vector type
// (uchar1..double4) via the corresponding make_* helper.
// NOTE(review): these specializations are the PRE-change (host-only) side of
// this diff, superseded by the __host__ __device__ variants added further
// below. `static` on an explicit specialization is rejected by strict
// compilers (GCC/Clang) — confirm against the target toolchain.
template<typename Tout> inline Tout _pixMakeZero();
template<> static inline uchar1 _pixMakeZero<uchar1>() {return make_uchar1(0);}
template<> static inline uchar3 _pixMakeZero<uchar3>() {return make_uchar3(0,0,0);}
template<> static inline uchar4 _pixMakeZero<uchar4>() {return make_uchar4(0,0,0,0);}
template<> static inline ushort1 _pixMakeZero<ushort1>() {return make_ushort1(0);}
template<> static inline ushort3 _pixMakeZero<ushort3>() {return make_ushort3(0,0,0);}
template<> static inline ushort4 _pixMakeZero<ushort4>() {return make_ushort4(0,0,0,0);}
template<> static inline uint1 _pixMakeZero<uint1>() {return make_uint1(0);}
template<> static inline uint3 _pixMakeZero<uint3>() {return make_uint3(0,0,0);}
template<> static inline uint4 _pixMakeZero<uint4>() {return make_uint4(0,0,0,0);}
template<> static inline float1 _pixMakeZero<float1>() {return make_float1(0.f);}
template<> static inline float3 _pixMakeZero<float3>() {return make_float3(0.f,0.f,0.f);}
template<> static inline float4 _pixMakeZero<float4>() {return make_float4(0.f,0.f,0.f,0.f);}
template<> static inline double1 _pixMakeZero<double1>() {return make_double1(0.);}
template<> static inline double3 _pixMakeZero<double3>() {return make_double3(0.,0.,0.);}
template<> static inline double4 _pixMakeZero<double4>() {return make_double4(0.,0.,0.,0.);}
// _pixMake: overload set assembling a CUDA vector-type pixel from 1, 3, or 4
// channel values, dispatching on the channel base type (Ncv8u/16u/32u/32f/64f).
// NOTE(review): these are the PRE-change (host-only) overloads of this diff,
// superseded by the __host__ __device__ variants added further below.
static inline uchar1 _pixMake(Ncv8u x) {return make_uchar1(x);}
static inline uchar3 _pixMake(Ncv8u x, Ncv8u y, Ncv8u z) {return make_uchar3(x,y,z);}
static inline uchar4 _pixMake(Ncv8u x, Ncv8u y, Ncv8u z, Ncv8u w) {return make_uchar4(x,y,z,w);}
static inline ushort1 _pixMake(Ncv16u x) {return make_ushort1(x);}
static inline ushort3 _pixMake(Ncv16u x, Ncv16u y, Ncv16u z) {return make_ushort3(x,y,z);}
static inline ushort4 _pixMake(Ncv16u x, Ncv16u y, Ncv16u z, Ncv16u w) {return make_ushort4(x,y,z,w);}
static inline uint1 _pixMake(Ncv32u x) {return make_uint1(x);}
static inline uint3 _pixMake(Ncv32u x, Ncv32u y, Ncv32u z) {return make_uint3(x,y,z);}
static inline uint4 _pixMake(Ncv32u x, Ncv32u y, Ncv32u z, Ncv32u w) {return make_uint4(x,y,z,w);}
static inline float1 _pixMake(Ncv32f x) {return make_float1(x);}
static inline float3 _pixMake(Ncv32f x, Ncv32f y, Ncv32f z) {return make_float3(x,y,z);}
static inline float4 _pixMake(Ncv32f x, Ncv32f y, Ncv32f z, Ncv32f w) {return make_float4(x,y,z,w);}
static inline double1 _pixMake(Ncv64f x) {return make_double1(x);}
static inline double3 _pixMake(Ncv64f x, Ncv64f y, Ncv64f z) {return make_double3(x,y,z);}
static inline double4 _pixMake(Ncv64f x, Ncv64f y, Ncv64f z, Ncv64f w) {return make_double4(x,y,z,w);}
// Primary template for channel-count dispatch of _pixDemoteClampZ (pre-change
// declaration; specializations for CN = 1/3/4 follow in the file).
template<typename Tin, typename Tout, Ncv32u CN> struct __pixDemoteClampZ_CN {static Tout _pixDemoteClampZ_CN(Tin &pix);};
// _pixMakeZero: construct a zero-valued pixel of the given CUDA vector type,
// callable from both host and device code.
// Fix: dropped `static` from the explicit specializations — a storage-class
// specifier on an explicit template specialization is ill-formed; GCC/Clang
// reject it with "explicit template specialization cannot have a storage
// class". `inline` alone provides the intended header-safe linkage.
template<> inline __host__ __device__ uchar1 _pixMakeZero<uchar1>() {return make_uchar1(0);}
template<> inline __host__ __device__ uchar3 _pixMakeZero<uchar3>() {return make_uchar3(0,0,0);}
template<> inline __host__ __device__ uchar4 _pixMakeZero<uchar4>() {return make_uchar4(0,0,0,0);}
template<> inline __host__ __device__ ushort1 _pixMakeZero<ushort1>() {return make_ushort1(0);}
template<> inline __host__ __device__ ushort3 _pixMakeZero<ushort3>() {return make_ushort3(0,0,0);}
template<> inline __host__ __device__ ushort4 _pixMakeZero<ushort4>() {return make_ushort4(0,0,0,0);}
template<> inline __host__ __device__ uint1 _pixMakeZero<uint1>() {return make_uint1(0);}
template<> inline __host__ __device__ uint3 _pixMakeZero<uint3>() {return make_uint3(0,0,0);}
template<> inline __host__ __device__ uint4 _pixMakeZero<uint4>() {return make_uint4(0,0,0,0);}
template<> inline __host__ __device__ float1 _pixMakeZero<float1>() {return make_float1(0.f);}
template<> inline __host__ __device__ float3 _pixMakeZero<float3>() {return make_float3(0.f,0.f,0.f);}
template<> inline __host__ __device__ float4 _pixMakeZero<float4>() {return make_float4(0.f,0.f,0.f,0.f);}
template<> inline __host__ __device__ double1 _pixMakeZero<double1>() {return make_double1(0.);}
template<> inline __host__ __device__ double3 _pixMakeZero<double3>() {return make_double3(0.,0.,0.);}
template<> inline __host__ __device__ double4 _pixMakeZero<double4>() {return make_double4(0.,0.,0.,0.);}
// _pixMake: overload set assembling a CUDA vector-type pixel from 1, 3, or 4
// channel values, dispatching on the channel base type. Callable from both
// host and device code; thin wrappers over the CUDA make_* constructors.
static inline __host__ __device__ uchar1 _pixMake(Ncv8u x) {return make_uchar1(x);}
static inline __host__ __device__ uchar3 _pixMake(Ncv8u x, Ncv8u y, Ncv8u z) {return make_uchar3(x,y,z);}
static inline __host__ __device__ uchar4 _pixMake(Ncv8u x, Ncv8u y, Ncv8u z, Ncv8u w) {return make_uchar4(x,y,z,w);}
static inline __host__ __device__ ushort1 _pixMake(Ncv16u x) {return make_ushort1(x);}
static inline __host__ __device__ ushort3 _pixMake(Ncv16u x, Ncv16u y, Ncv16u z) {return make_ushort3(x,y,z);}
static inline __host__ __device__ ushort4 _pixMake(Ncv16u x, Ncv16u y, Ncv16u z, Ncv16u w) {return make_ushort4(x,y,z,w);}
static inline __host__ __device__ uint1 _pixMake(Ncv32u x) {return make_uint1(x);}
static inline __host__ __device__ uint3 _pixMake(Ncv32u x, Ncv32u y, Ncv32u z) {return make_uint3(x,y,z);}
static inline __host__ __device__ uint4 _pixMake(Ncv32u x, Ncv32u y, Ncv32u z, Ncv32u w) {return make_uint4(x,y,z,w);}
static inline __host__ __device__ float1 _pixMake(Ncv32f x) {return make_float1(x);}
static inline __host__ __device__ float3 _pixMake(Ncv32f x, Ncv32f y, Ncv32f z) {return make_float3(x,y,z);}
static inline __host__ __device__ float4 _pixMake(Ncv32f x, Ncv32f y, Ncv32f z, Ncv32f w) {return make_float4(x,y,z,w);}
static inline __host__ __device__ double1 _pixMake(Ncv64f x) {return make_double1(x);}
static inline __host__ __device__ double3 _pixMake(Ncv64f x, Ncv64f y, Ncv64f z) {return make_double3(x,y,z);}
static inline __host__ __device__ double4 _pixMake(Ncv64f x, Ncv64f y, Ncv64f z, Ncv64f w) {return make_double4(x,y,z,w);}
// Primary template for channel-count dispatch of _pixDemoteClampZ;
// specializations for CN = 1/3/4 follow in the file.
template<typename Tin, typename Tout, Ncv32u CN> struct __pixDemoteClampZ_CN {static __host__ __device__ Tout _pixDemoteClampZ_CN(Tin &pix);};
template<typename Tin, typename Tout> struct __pixDemoteClampZ_CN<Tin, Tout, 1> {
static Tout _pixDemoteClampZ_CN(Tin &pix)
static __host__ __device__ Tout _pixDemoteClampZ_CN(Tin &pix)
{
Tout out;
_TDemoteClampZ(pix.x, out.x);
......@@ -159,7 +159,7 @@ static Tout _pixDemoteClampZ_CN(Tin &pix)
}};
template<typename Tin, typename Tout> struct __pixDemoteClampZ_CN<Tin, Tout, 3> {
static Tout _pixDemoteClampZ_CN(Tin &pix)
static __host__ __device__ Tout _pixDemoteClampZ_CN(Tin &pix)
{
Tout out;
_TDemoteClampZ(pix.x, out.x);
......@@ -169,7 +169,7 @@ static Tout _pixDemoteClampZ_CN(Tin &pix)
}};
template<typename Tin, typename Tout> struct __pixDemoteClampZ_CN<Tin, Tout, 4> {
static Tout _pixDemoteClampZ_CN(Tin &pix)
static __host__ __device__ Tout _pixDemoteClampZ_CN(Tin &pix)
{
Tout out;
_TDemoteClampZ(pix.x, out.x);
......@@ -179,16 +179,16 @@ static Tout _pixDemoteClampZ_CN(Tin &pix)
return out;
}};
template<typename Tin, typename Tout> static inline Tout _pixDemoteClampZ(Tin &pix)
template<typename Tin, typename Tout> static inline __host__ __device__ Tout _pixDemoteClampZ(Tin &pix)
{
return __pixDemoteClampZ_CN<Tin, Tout, NC(Tin)>::_pixDemoteClampZ_CN(pix);
}
template<typename Tin, typename Tout, Ncv32u CN> struct __pixDemoteClampNN_CN {static Tout _pixDemoteClampNN_CN(Tin &pix);};
template<typename Tin, typename Tout, Ncv32u CN> struct __pixDemoteClampNN_CN {static __host__ __device__ Tout _pixDemoteClampNN_CN(Tin &pix);};
template<typename Tin, typename Tout> struct __pixDemoteClampNN_CN<Tin, Tout, 1> {
static Tout _pixDemoteClampNN_CN(Tin &pix)
static __host__ __device__ Tout _pixDemoteClampNN_CN(Tin &pix)
{
Tout out;
_TDemoteClampNN(pix.x, out.x);
......@@ -196,7 +196,7 @@ static Tout _pixDemoteClampNN_CN(Tin &pix)
}};
template<typename Tin, typename Tout> struct __pixDemoteClampNN_CN<Tin, Tout, 3> {
static Tout _pixDemoteClampNN_CN(Tin &pix)
static __host__ __device__ Tout _pixDemoteClampNN_CN(Tin &pix)
{
Tout out;
_TDemoteClampNN(pix.x, out.x);
......@@ -206,7 +206,7 @@ static Tout _pixDemoteClampNN_CN(Tin &pix)
}};
template<typename Tin, typename Tout> struct __pixDemoteClampNN_CN<Tin, Tout, 4> {
static Tout _pixDemoteClampNN_CN(Tin &pix)
static __host__ __device__ Tout _pixDemoteClampNN_CN(Tin &pix)
{
Tout out;
_TDemoteClampNN(pix.x, out.x);
......@@ -216,16 +216,16 @@ static Tout _pixDemoteClampNN_CN(Tin &pix)
return out;
}};
template<typename Tin, typename Tout> static inline Tout _pixDemoteClampNN(Tin &pix)
template<typename Tin, typename Tout> static inline __host__ __device__ Tout _pixDemoteClampNN(Tin &pix)
{
return __pixDemoteClampNN_CN<Tin, Tout, NC(Tin)>::_pixDemoteClampNN_CN(pix);
}
template<typename Tin, typename Tout, typename Tw, Ncv32u CN> struct __pixScale_CN {static Tout _pixScale_CN(Tin &pix, Tw w);};
template<typename Tin, typename Tout, typename Tw, Ncv32u CN> struct __pixScale_CN {static __host__ __device__ Tout _pixScale_CN(Tin &pix, Tw w);};
template<typename Tin, typename Tout, typename Tw> struct __pixScale_CN<Tin, Tout, Tw, 1> {
static Tout _pixScale_CN(Tin &pix, Tw w)
static __host__ __device__ Tout _pixScale_CN(Tin &pix, Tw w)
{
Tout out;
typedef typename TConvVec2Base<Tout>::TBase TBout;
......@@ -234,7 +234,7 @@ static Tout _pixScale_CN(Tin &pix, Tw w)
}};
template<typename Tin, typename Tout, typename Tw> struct __pixScale_CN<Tin, Tout, Tw, 3> {
static Tout _pixScale_CN(Tin &pix, Tw w)
static __host__ __device__ Tout _pixScale_CN(Tin &pix, Tw w)
{
Tout out;
typedef typename TConvVec2Base<Tout>::TBase TBout;
......@@ -245,7 +245,7 @@ static Tout _pixScale_CN(Tin &pix, Tw w)
}};
template<typename Tin, typename Tout, typename Tw> struct __pixScale_CN<Tin, Tout, Tw, 4> {
static Tout _pixScale_CN(Tin &pix, Tw w)
static __host__ __device__ Tout _pixScale_CN(Tin &pix, Tw w)
{
Tout out;
typedef typename TConvVec2Base<Tout>::TBase TBout;
......@@ -256,16 +256,16 @@ static Tout _pixScale_CN(Tin &pix, Tw w)
return out;
}};
template<typename Tin, typename Tout, typename Tw> static Tout _pixScale(Tin &pix, Tw w)
template<typename Tin, typename Tout, typename Tw> static __host__ __device__ Tout _pixScale(Tin &pix, Tw w)
{
return __pixScale_CN<Tin, Tout, Tw, NC(Tin)>::_pixScale_CN(pix, w);
}
template<typename Tin, typename Tout, Ncv32u CN> struct __pixAdd_CN {static Tout _pixAdd_CN(Tout &pix1, Tin &pix2);};
template<typename Tin, typename Tout, Ncv32u CN> struct __pixAdd_CN {static __host__ __device__ Tout _pixAdd_CN(Tout &pix1, Tin &pix2);};
template<typename Tin, typename Tout> struct __pixAdd_CN<Tin, Tout, 1> {
static Tout _pixAdd_CN(Tout &pix1, Tin &pix2)
static __host__ __device__ Tout _pixAdd_CN(Tout &pix1, Tin &pix2)
{
Tout out;
out.x = pix1.x + pix2.x;
......@@ -273,7 +273,7 @@ static Tout _pixAdd_CN(Tout &pix1, Tin &pix2)
}};
template<typename Tin, typename Tout> struct __pixAdd_CN<Tin, Tout, 3> {
static Tout _pixAdd_CN(Tout &pix1, Tin &pix2)
static __host__ __device__ Tout _pixAdd_CN(Tout &pix1, Tin &pix2)
{
Tout out;
out.x = pix1.x + pix2.x;
......@@ -283,7 +283,7 @@ static Tout _pixAdd_CN(Tout &pix1, Tin &pix2)
}};
template<typename Tin, typename Tout> struct __pixAdd_CN<Tin, Tout, 4> {
static Tout _pixAdd_CN(Tout &pix1, Tin &pix2)
static __host__ __device__ Tout _pixAdd_CN(Tout &pix1, Tin &pix2)
{
Tout out;
out.x = pix1.x + pix2.x;
......@@ -293,33 +293,33 @@ static Tout _pixAdd_CN(Tout &pix1, Tin &pix2)
return out;
}};
template<typename Tin, typename Tout> static Tout _pixAdd(Tout &pix1, Tin &pix2)
template<typename Tin, typename Tout> static __host__ __device__ Tout _pixAdd(Tout &pix1, Tin &pix2)
{
return __pixAdd_CN<Tin, Tout, NC(Tin)>::_pixAdd_CN(pix1, pix2);
}
template<typename Tin, typename Tout, Ncv32u CN> struct __pixDist_CN {static Tout _pixDist_CN(Tin &pix1, Tin &pix2);};
template<typename Tin, typename Tout, Ncv32u CN> struct __pixDist_CN {static __host__ __device__ Tout _pixDist_CN(Tin &pix1, Tin &pix2);};
template<typename Tin, typename Tout> struct __pixDist_CN<Tin, Tout, 1> {
static Tout _pixDist_CN(Tin &pix1, Tin &pix2)
static __host__ __device__ Tout _pixDist_CN(Tin &pix1, Tin &pix2)
{
return Tout(SQR(pix1.x - pix2.x));
}};
template<typename Tin, typename Tout> struct __pixDist_CN<Tin, Tout, 3> {
static Tout _pixDist_CN(Tin &pix1, Tin &pix2)
static __host__ __device__ Tout _pixDist_CN(Tin &pix1, Tin &pix2)
{
return Tout(SQR(pix1.x - pix2.x) + SQR(pix1.y - pix2.y) + SQR(pix1.z - pix2.z));
}};
template<typename Tin, typename Tout> struct __pixDist_CN<Tin, Tout, 4> {
static Tout _pixDist_CN(Tin &pix1, Tin &pix2)
static __host__ __device__ Tout _pixDist_CN(Tin &pix1, Tin &pix2)
{
return Tout(SQR(pix1.x - pix2.x) + SQR(pix1.y - pix2.y) + SQR(pix1.z - pix2.z) + SQR(pix1.w - pix2.w));
}};
template<typename Tin, typename Tout> static Tout _pixDist(Tin &pix1, Tin &pix2)
template<typename Tin, typename Tout> static __host__ __device__ Tout _pixDist(Tin &pix1, Tin &pix2)
{
return __pixDist_CN<Tin, Tout, NC(Tin)>::_pixDist_CN(pix1, pix2);
}
......
......@@ -43,15 +43,16 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include "NCV.hpp"
#include "NCVAlg.hpp"
#include "NCVPyramid.hpp"
#include "NCVPixelOperations.hpp"
#ifdef _WIN32
#ifdef _WIN32
template<typename T, Ncv32u CN> struct __average4_CN {static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11);};
template<typename T, Ncv32u CN> struct __average4_CN {static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11);};
template<typename T> struct __average4_CN<T, 1> {
static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
{
T out;
out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4;
......@@ -59,7 +60,7 @@ static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
}};
template<> struct __average4_CN<float1, 1> {
static float1 _average4_CN(const float1 &p00, const float1 &p01, const float1 &p10, const float1 &p11)
static __host__ __device__ float1 _average4_CN(const float1 &p00, const float1 &p01, const float1 &p10, const float1 &p11)
{
float1 out;
out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
......@@ -67,7 +68,7 @@ static float1 _average4_CN(const float1 &p00, const float1 &p01, const float1 &p
}};
template<> struct __average4_CN<double1, 1> {
static double1 _average4_CN(const double1 &p00, const double1 &p01, const double1 &p10, const double1 &p11)
static __host__ __device__ double1 _average4_CN(const double1 &p00, const double1 &p01, const double1 &p10, const double1 &p11)
{
double1 out;
out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
......@@ -75,7 +76,7 @@ static double1 _average4_CN(const double1 &p00, const double1 &p01, const double
}};
template<typename T> struct __average4_CN<T, 3> {
static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
{
T out;
out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4;
......@@ -85,7 +86,7 @@ static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
}};
template<> struct __average4_CN<float3, 3> {
static float3 _average4_CN(const float3 &p00, const float3 &p01, const float3 &p10, const float3 &p11)
static __host__ __device__ float3 _average4_CN(const float3 &p00, const float3 &p01, const float3 &p10, const float3 &p11)
{
float3 out;
out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
......@@ -95,7 +96,7 @@ static float3 _average4_CN(const float3 &p00, const float3 &p01, const float3 &p
}};
template<> struct __average4_CN<double3, 3> {
static double3 _average4_CN(const double3 &p00, const double3 &p01, const double3 &p10, const double3 &p11)
static __host__ __device__ double3 _average4_CN(const double3 &p00, const double3 &p01, const double3 &p10, const double3 &p11)
{
double3 out;
out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
......@@ -105,7 +106,7 @@ static double3 _average4_CN(const double3 &p00, const double3 &p01, const double
}};
template<typename T> struct __average4_CN<T, 4> {
static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
{
T out;
out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4;
......@@ -116,7 +117,7 @@ static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
}};
template<> struct __average4_CN<float4, 4> {
static float4 _average4_CN(const float4 &p00, const float4 &p01, const float4 &p10, const float4 &p11)
static __host__ __device__ float4 _average4_CN(const float4 &p00, const float4 &p01, const float4 &p10, const float4 &p11)
{
float4 out;
out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
......@@ -127,7 +128,7 @@ static float4 _average4_CN(const float4 &p00, const float4 &p01, const float4 &p
}};
template<> struct __average4_CN<double4, 4> {
static double4 _average4_CN(const double4 &p00, const double4 &p01, const double4 &p10, const double4 &p11)
static __host__ __device__ double4 _average4_CN(const double4 &p00, const double4 &p01, const double4 &p10, const double4 &p11)
{
double4 out;
out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
......@@ -137,23 +138,23 @@ static double4 _average4_CN(const double4 &p00, const double4 &p01, const double
return out;
}};
template<typename T> static T _average4(const T &p00, const T &p01, const T &p10, const T &p11)
template<typename T> static __host__ __device__ T _average4(const T &p00, const T &p01, const T &p10, const T &p11)
{
return __average4_CN<T, NC(T)>::_average4_CN(p00, p01, p10, p11);
}
template<typename Tin, typename Tout, Ncv32u CN> struct __lerp_CN {static Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d);};
template<typename Tin, typename Tout, Ncv32u CN> struct __lerp_CN {static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d);};
template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 1> {
static Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
{
typedef typename TConvVec2Base<Tout>::TBase TB;
return _pixMake(TB(b.x * d + a.x * (1 - d)));
}};
template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 3> {
static Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
{
typedef typename TConvVec2Base<Tout>::TBase TB;
return _pixMake(TB(b.x * d + a.x * (1 - d)),
......@@ -162,7 +163,7 @@ static Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
}};
template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 4> {
static Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
{
typedef typename TConvVec2Base<Tout>::TBase TB;
return _pixMake(TB(b.x * d + a.x * (1 - d)),
......@@ -171,7 +172,7 @@ static Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
TB(b.w * d + a.w * (1 - d)));
}};
template<typename Tin, typename Tout> static Tout _lerp(const Tin &a, const Tin &b, Ncv32f d)
template<typename Tin, typename Tout> static __host__ __device__ Tout _lerp(const Tin &a, const Tin &b, Ncv32f d)
{
return __lerp_CN<Tin, Tout, NC(Tin)>::_lerp_CN(a, b, d);
}
......@@ -208,6 +209,74 @@ static T _interpBilinear(const NCVMatrix<T> &refLayer, Ncv32f x, Ncv32f y)
}
// kernelDownsampleX2: 2x image downsampling.
// Each thread produces one destination pixel dst(i, j) as the _average4() of
// the matching 2x2 source neighborhood (see _average4 for per-type rounding).
// Expected launch: 2D grid of 2D blocks covering dstRoi; excess edge threads
// exit via the bounds guard.
// srcPitch/dstPitch are row pitches in BYTES — hence the Ncv8u* re-basing.
// NOTE(review): reads rows 2*i+1 and columns 2*j+1, so the source must be at
// least 2*dstRoi.width x 2*dstRoi.height — confirm at the call site.
template<typename T>
__global__ void kernelDownsampleX2(T *d_src,
Ncv32u srcPitch,
T *d_dst,
Ncv32u dstPitch,
NcvSize32u dstRoi)
{
// Global destination coordinates of this thread.
Ncv32u i = blockIdx.y * blockDim.y + threadIdx.y;
Ncv32u j = blockIdx.x * blockDim.x + threadIdx.x;
// Bounds guard: the grid is rounded up, so edge blocks have excess threads.
if (i < dstRoi.height && j < dstRoi.width)
{
// Source rows 2i and 2i+1; pitch is in bytes, hence the byte-pointer cast.
T *d_src_line1 = (T *)((Ncv8u *)d_src + (2 * i + 0) * srcPitch);
T *d_src_line2 = (T *)((Ncv8u *)d_src + (2 * i + 1) * srcPitch);
T *d_dst_line = (T *)((Ncv8u *)d_dst + i * dstPitch);
// 2x2 neighborhood of the destination pixel.
T p00 = d_src_line1[2*j+0];
T p01 = d_src_line1[2*j+1];
T p10 = d_src_line2[2*j+0];
T p11 = d_src_line2[2*j+1];
d_dst_line[j] = _average4(p00, p01, p10, p11);
}
}
// kernelInterpolateFrom1: bilinear upsampling of one pyramid layer.
// Each thread computes one destination pixel dst(i, j) by mapping (j, i) into
// the top (source) layer, fetching the surrounding 2x2 neighborhood (clamped
// at the right/bottom borders), lerping horizontally then vertically in
// Ncv32f precision, and demoting back to T with truncating clamp.
// Expected launch: 2D grid of 2D blocks covering dstRoi.
// srcTopPitch/dstPitch are row pitches in BYTES — hence the Ncv8u* re-basing.
// NOTE(review): the coordinate mapping divides by (dstRoi.width - 1) and
// (dstRoi.height - 1); a 1-pixel-wide/tall ROI divides by zero — confirm the
// caller never requests such an ROI.
template<typename T>
__global__ void kernelInterpolateFrom1(T *d_srcTop,
Ncv32u srcTopPitch,
NcvSize32u szTopRoi,
T *d_dst,
Ncv32u dstPitch,
NcvSize32u dstRoi)
{
// Global destination coordinates of this thread.
Ncv32u i = blockIdx.y * blockDim.y + threadIdx.y;
Ncv32u j = blockIdx.x * blockDim.x + threadIdx.x;
if (i < dstRoi.height && j < dstRoi.width)
{
// Map destination corners onto source corners: (0,0)->(0,0) and
// (w-1,h-1)->(W-1,H-1), interpolating linearly in between.
Ncv32f ptTopX = 1.0f * (szTopRoi.width - 1) * j / (dstRoi.width - 1);
Ncv32f ptTopY = 1.0f * (szTopRoi.height - 1) * i / (dstRoi.height - 1);
// Integer cell corners and fractional offsets within the cell.
Ncv32u xl = (Ncv32u)ptTopX;
Ncv32u xh = xl+1;
Ncv32f dx = ptTopX - xl;
Ncv32u yl = (Ncv32u)ptTopY;
Ncv32u yh = yl+1;
Ncv32f dy = ptTopY - yl;
// d_src_line2 may point one row past the ROI when yh == szTopRoi.height;
// it is only dereferenced under the yh < height guards below.
T *d_src_line1 = (T *)((Ncv8u *)d_srcTop + yl * srcTopPitch);
T *d_src_line2 = (T *)((Ncv8u *)d_srcTop + yh * srcTopPitch);
T *d_dst_line = (T *)((Ncv8u *)d_dst + i * dstPitch);
// 2x2 neighborhood, replicating p00 at the right/bottom borders.
T p00, p01, p10, p11;
p00 = d_src_line1[xl];
p01 = xh < szTopRoi.width ? d_src_line1[xh] : p00;
p10 = yh < szTopRoi.height ? d_src_line2[xl] : p00;
p11 = (xh < szTopRoi.width && yh < szTopRoi.height) ? d_src_line2[xh] : p00;
// Lerp in float-vector space, then demote back to the pixel type.
typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
TVFlt m_00_01 = _lerp<T, TVFlt>(p00, p01, dx);
TVFlt m_10_11 = _lerp<T, TVFlt>(p10, p11, dx);
TVFlt mixture = _lerp<TVFlt, TVFlt>(m_00_01, m_10_11, dy);
T outPix = _pixDemoteClampZ<TVFlt, T>(mixture);
d_dst_line[j] = outPix;
}
}
template <class T>
NCVImagePyramid<T>::NCVImagePyramid(const NCVMatrix<T> &img,
Ncv8u numLayers,
......@@ -215,7 +284,7 @@ NCVImagePyramid<T>::NCVImagePyramid(const NCVMatrix<T> &img,
cudaStream_t cuStream)
{
this->_isInitialized = false;
ncvAssertPrintReturn(img.memType() == alloc.memType(), "NCVImagePyramid_host::ctor error", );
ncvAssertPrintReturn(img.memType() == alloc.memType(), "NCVImagePyramid::ctor error", );
this->layer0 = &img;
NcvSize32u szLastLayer(img.width(), img.height());
......@@ -229,6 +298,10 @@ NCVImagePyramid<T>::NCVImagePyramid(const NCVMatrix<T> &img,
numLayers = 255; //it will cut-off when any of the dimensions goes 1
}
#ifdef SELF_CHECK_GPU
NCVMemNativeAllocator allocCPU(NCVMemoryTypeHostPinned, 512);
#endif
for (Ncv32u i=0; i<(Ncv32u)numLayers-1; i++)
{
NcvSize32u szCurLayer(szLastLayer.width / 2, szLastLayer.height / 2);
......@@ -238,7 +311,7 @@ NCVImagePyramid<T>::NCVImagePyramid(const NCVMatrix<T> &img,
}
this->pyramid.push_back(new NCVMatrixAlloc<T>(alloc, szCurLayer.width, szCurLayer.height));
ncvAssertPrintReturn(((NCVMatrixAlloc<T> *)(this->pyramid[i]))->isMemAllocated(), "NCVImagePyramid_host::ctor error", );
ncvAssertPrintReturn(((NCVMatrixAlloc<T> *)(this->pyramid[i]))->isMemAllocated(), "NCVImagePyramid::ctor error", );
this->nLayers++;
//fill in the layer
......@@ -249,7 +322,37 @@ NCVImagePyramid<T>::NCVImagePyramid(const NCVMatrix<T> &img,
if (bDeviceCode)
{
//TODO: in cuStream
dim3 bDim(16, 8);
dim3 gDim(divUp(szCurLayer.width, bDim.x), divUp(szCurLayer.height, bDim.y));
kernelDownsampleX2<<<gDim, bDim, 0, cuStream>>>(prevLayer->ptr(),
prevLayer->pitch(),
curLayer->ptr(),
curLayer->pitch(),
szCurLayer);
ncvAssertPrintReturn(cudaSuccess == cudaGetLastError(), "NCVImagePyramid::ctor error", );
#ifdef SELF_CHECK_GPU
NCVMatrixAlloc<T> h_prevLayer(allocCPU, prevLayer->width(), prevLayer->height());
ncvAssertPrintReturn(h_prevLayer.isMemAllocated(), "Validation failure in NCVImagePyramid::ctor", );
NCVMatrixAlloc<T> h_curLayer(allocCPU, curLayer->width(), curLayer->height());
ncvAssertPrintReturn(h_curLayer.isMemAllocated(), "Validation failure in NCVImagePyramid::ctor", );
ncvAssertPrintReturn(NCV_SUCCESS == prevLayer->copy2D(h_prevLayer, prevLayer->size(), cuStream), "Validation failure in NCVImagePyramid::ctor", );
ncvAssertPrintReturn(NCV_SUCCESS == curLayer->copy2D(h_curLayer, curLayer->size(), cuStream), "Validation failure in NCVImagePyramid::ctor", );
ncvAssertPrintReturn(cudaSuccess == cudaStreamSynchronize(cuStream), "Validation failure in NCVImagePyramid::ctor", );
for (Ncv32u i=0; i<szCurLayer.height; i++)
{
for (Ncv32u j=0; j<szCurLayer.width; j++)
{
T p00 = h_prevLayer.at(2*j+0, 2*i+0);
T p01 = h_prevLayer.at(2*j+1, 2*i+0);
T p10 = h_prevLayer.at(2*j+0, 2*i+1);
T p11 = h_prevLayer.at(2*j+1, 2*i+1);
T outGold = _average4(p00, p01, p10, p11);
T outGPU = h_curLayer.at(j, i);
ncvAssertPrintReturn(0 == memcmp(&outGold, &outGPU, sizeof(T)), "Validation failure in NCVImagePyramid::ctor with kernelDownsampleX2", );
}
}
#endif
}
else
{
......@@ -340,11 +443,45 @@ NCVStatus NCVImagePyramid<T>::getLayer(NCVMatrix<T> &outImg,
NCV_SET_SKIP_COND(outImg.memType() == NCVMemoryTypeNone);
NcvBool bDeviceCode = this->layer0->memType() == NCVMemoryTypeDevice;
#ifdef SELF_CHECK_GPU
NCVMemNativeAllocator allocCPU(NCVMemoryTypeHostPinned, 512);
#endif
NCV_SKIP_COND_BEGIN
if (bDeviceCode)
{
//TODO: in cuStream
ncvAssertReturn(bUse2Refs == false, NCV_NOT_IMPLEMENTED);
dim3 bDim(16, 8);
dim3 gDim(divUp(outRoi.width, bDim.x), divUp(outRoi.height, bDim.y));
kernelInterpolateFrom1<<<gDim, bDim, 0, cuStream>>>(lastLayer->ptr(),
lastLayer->pitch(),
lastLayer->size(),
outImg.ptr(),
outImg.pitch(),
outRoi);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
#ifdef SELF_CHECK_GPU
ncvSafeMatAlloc(h_lastLayer, T, allocCPU, lastLayer->width(), lastLayer->height(), NCV_ALLOCATOR_BAD_ALLOC);
ncvSafeMatAlloc(h_outImg, T, allocCPU, outImg.width(), outImg.height(), NCV_ALLOCATOR_BAD_ALLOC);
ncvAssertReturnNcvStat(lastLayer->copy2D(h_lastLayer, lastLayer->size(), cuStream));
ncvAssertReturnNcvStat(outImg.copy2D(h_outImg, outRoi, cuStream));
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
for (Ncv32u i=0; i<outRoi.height; i++)
{
for (Ncv32u j=0; j<outRoi.width; j++)
{
NcvSize32u szTopLayer(lastLayer->width(), lastLayer->height());
Ncv32f ptTopX = 1.0f * (szTopLayer.width - 1) * j / (outRoi.width - 1);
Ncv32f ptTopY = 1.0f * (szTopLayer.height - 1) * i / (outRoi.height - 1);
T outGold = _interpBilinear(h_lastLayer, ptTopX, ptTopY);
ncvAssertPrintReturn(0 == memcmp(&outGold, &h_outImg.at(j,i), sizeof(T)), "Validation failure in NCVImagePyramid::ctor with kernelInterpolateFrom1", NCV_UNKNOWN_ERROR);
}
}
#endif
}
else
{
......@@ -395,6 +532,6 @@ template class NCVImagePyramid<uint3>;
template class NCVImagePyramid<uint4>;
template class NCVImagePyramid<float1>;
template class NCVImagePyramid<float3>;
template class NCVImagePyramid<float4>;
template class NCVImagePyramid<float4>;
#endif //_WIN32
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment