Commit ee11a2d2 authored by Vadim Pisarevsky's avatar Vadim Pisarevsky

fully implemented SSE and NEON cases of intrin.hpp; extended the HAL with some basic math functions

parent a2bba1b9
...@@ -381,7 +381,7 @@ TEST_F(fisheyeTest, EtimateUncertainties) ...@@ -381,7 +381,7 @@ TEST_F(fisheyeTest, EtimateUncertainties)
EXPECT_MAT_NEAR(errors.c, cv::Vec2d(0.890439368129246, 0.816096854937896), 1e-10); EXPECT_MAT_NEAR(errors.c, cv::Vec2d(0.890439368129246, 0.816096854937896), 1e-10);
EXPECT_MAT_NEAR(errors.k, cv::Vec4d(0.00516248605191506, 0.0168181467500934, 0.0213118690274604, 0.00916010877545648), 1e-10); EXPECT_MAT_NEAR(errors.k, cv::Vec4d(0.00516248605191506, 0.0168181467500934, 0.0213118690274604, 0.00916010877545648), 1e-10);
EXPECT_MAT_NEAR(err_std, cv::Vec2d(0.187475975266883, 0.185678953263995), 1e-10); EXPECT_MAT_NEAR(err_std, cv::Vec2d(0.187475975266883, 0.185678953263995), 1e-10);
CV_Assert(abs(rms - 0.263782587133546) < 1e-10); CV_Assert(fabs(rms - 0.263782587133546) < 1e-10);
CV_Assert(errors.alpha == 0); CV_Assert(errors.alpha == 0);
} }
......
...@@ -53,6 +53,7 @@ ...@@ -53,6 +53,7 @@
#include "opencv2/core/cvdef.h" #include "opencv2/core/cvdef.h"
#include "opencv2/core/cvstd.hpp" #include "opencv2/core/cvstd.hpp"
#include "opencv2/hal.hpp"
namespace cv namespace cv
{ {
...@@ -419,6 +420,12 @@ typedef Hamming HammingLUT; ...@@ -419,6 +420,12 @@ typedef Hamming HammingLUT;
/////////////////////////////////// inline norms //////////////////////////////////// /////////////////////////////////// inline norms ////////////////////////////////////
template<typename _Tp> inline _Tp cv_abs(_Tp x) { return std::abs(x); }
inline int cv_abs(uchar x) { return x; }
inline int cv_abs(schar x) { return std::abs(x); }
inline int cv_abs(ushort x) { return x; }
inline int cv_abs(short x) { return std::abs(x); }
template<typename _Tp, typename _AccTp> static inline template<typename _Tp, typename _AccTp> static inline
_AccTp normL2Sqr(const _Tp* a, int n) _AccTp normL2Sqr(const _Tp* a, int n)
{ {
...@@ -447,12 +454,12 @@ _AccTp normL1(const _Tp* a, int n) ...@@ -447,12 +454,12 @@ _AccTp normL1(const _Tp* a, int n)
#if CV_ENABLE_UNROLLED #if CV_ENABLE_UNROLLED
for(; i <= n - 4; i += 4 ) for(; i <= n - 4; i += 4 )
{ {
s += (_AccTp)std::abs(a[i]) + (_AccTp)std::abs(a[i+1]) + s += (_AccTp)cv_abs(a[i]) + (_AccTp)cv_abs(a[i+1]) +
(_AccTp)std::abs(a[i+2]) + (_AccTp)std::abs(a[i+3]); (_AccTp)cv_abs(a[i+2]) + (_AccTp)cv_abs(a[i+3]);
} }
#endif #endif
for( ; i < n; i++ ) for( ; i < n; i++ )
s += std::abs(a[i]); s += cv_abs(a[i]);
return s; return s;
} }
...@@ -461,7 +468,7 @@ _AccTp normInf(const _Tp* a, int n) ...@@ -461,7 +468,7 @@ _AccTp normInf(const _Tp* a, int n)
{ {
_AccTp s = 0; _AccTp s = 0;
for( int i = 0; i < n; i++ ) for( int i = 0; i < n; i++ )
s = std::max(s, (_AccTp)std::abs(a[i])); s = std::max(s, (_AccTp)cv_abs(a[i]));
return s; return s;
} }
...@@ -485,11 +492,10 @@ _AccTp normL2Sqr(const _Tp* a, const _Tp* b, int n) ...@@ -485,11 +492,10 @@ _AccTp normL2Sqr(const _Tp* a, const _Tp* b, int n)
return s; return s;
} }
template<> inline inline float normL2Sqr(const float* a, const float* b, int n)
float normL2Sqr(const float* a, const float* b, int n)
{ {
if( n >= 8 ) if( n >= 8 )
return normL2Sqr_(a, b, n); return hal::normL2Sqr_(a, b, n);
float s = 0; float s = 0;
for( int i = 0; i < n; i++ ) for( int i = 0; i < n; i++ )
{ {
...@@ -519,11 +525,10 @@ _AccTp normL1(const _Tp* a, const _Tp* b, int n) ...@@ -519,11 +525,10 @@ _AccTp normL1(const _Tp* a, const _Tp* b, int n)
return s; return s;
} }
template<> inline inline float normL1(const float* a, const float* b, int n)
float normL1(const float* a, const float* b, int n)
{ {
if( n >= 8 ) if( n >= 8 )
return normL1_(a, b, n); return hal::normL1_(a, b, n);
float s = 0; float s = 0;
for( int i = 0; i < n; i++ ) for( int i = 0; i < n; i++ )
{ {
...@@ -533,10 +538,9 @@ float normL1(const float* a, const float* b, int n) ...@@ -533,10 +538,9 @@ float normL1(const float* a, const float* b, int n)
return s; return s;
} }
template<> inline inline int normL1(const uchar* a, const uchar* b, int n)
int normL1(const uchar* a, const uchar* b, int n)
{ {
return normL1_(a, b, n); return hal::normL1_(a, b, n);
} }
template<typename _Tp, typename _AccTp> static inline template<typename _Tp, typename _AccTp> static inline
...@@ -551,6 +555,23 @@ _AccTp normInf(const _Tp* a, const _Tp* b, int n) ...@@ -551,6 +555,23 @@ _AccTp normInf(const _Tp* a, const _Tp* b, int n)
return s; return s;
} }
/** @brief Computes the cube root of an argument.
The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly.
NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for
single-precision data.
@param val A function argument.
*/
CV_EXPORTS_W float cubeRoot(float val);
/** @brief Calculates the angle of a 2D vector in degrees.
The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured
in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees.
@param x x-coordinate of the vector.
@param y y-coordinate of the vector.
*/
CV_EXPORTS_W float fastAtan2(float y, float x);
////////////////// forward declarations for important OpenCV types ////////////////// ////////////////// forward declarations for important OpenCV types //////////////////
......
...@@ -427,7 +427,7 @@ template<typename _Tp, int m> struct Matx_DetOp ...@@ -427,7 +427,7 @@ template<typename _Tp, int m> struct Matx_DetOp
double operator ()(const Matx<_Tp, m, m>& a) const double operator ()(const Matx<_Tp, m, m>& a) const
{ {
Matx<_Tp, m, m> temp = a; Matx<_Tp, m, m> temp = a;
double p = LU(temp.val, m*sizeof(_Tp), m, 0, 0, 0); double p = hal::LU(temp.val, m*sizeof(_Tp), m, 0, 0, 0);
if( p == 0 ) if( p == 0 )
return p; return p;
for( int i = 0; i < m; i++ ) for( int i = 0; i < m; i++ )
......
...@@ -72,9 +72,9 @@ template<typename _Tp, int m> struct Matx_FastInvOp ...@@ -72,9 +72,9 @@ template<typename _Tp, int m> struct Matx_FastInvOp
b(i, i) = (_Tp)1; b(i, i) = (_Tp)1;
if( method == DECOMP_CHOLESKY ) if( method == DECOMP_CHOLESKY )
return Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m); return hal::Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m);
return LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0; return hal::LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0;
} }
}; };
......
...@@ -79,7 +79,7 @@ public: ...@@ -79,7 +79,7 @@ public:
for ( int i = begin; i<end; i++ ) for ( int i = begin; i<end; i++ )
{ {
tdist2[i] = std::min(normL2Sqr_(data + step*i, data + stepci, dims), dist[i]); tdist2[i] = std::min(normL2Sqr(data + step*i, data + stepci, dims), dist[i]);
} }
} }
...@@ -114,7 +114,7 @@ static void generateCentersPP(const Mat& _data, Mat& _out_centers, ...@@ -114,7 +114,7 @@ static void generateCentersPP(const Mat& _data, Mat& _out_centers,
for( i = 0; i < N; i++ ) for( i = 0; i < N; i++ )
{ {
dist[i] = normL2Sqr_(data + step*i, data + step*centers[0], dims); dist[i] = normL2Sqr(data + step*i, data + step*centers[0], dims);
sum0 += dist[i]; sum0 += dist[i];
} }
...@@ -189,7 +189,7 @@ public: ...@@ -189,7 +189,7 @@ public:
for( int k = 0; k < K; k++ ) for( int k = 0; k < K; k++ )
{ {
const float* center = centers.ptr<float>(k); const float* center = centers.ptr<float>(k);
const double dist = normL2Sqr_(sample, center, dims); const double dist = normL2Sqr(sample, center, dims);
if( min_dist > dist ) if( min_dist > dist )
{ {
...@@ -384,7 +384,7 @@ double cv::kmeans( InputArray _data, int K, ...@@ -384,7 +384,7 @@ double cv::kmeans( InputArray _data, int K,
if( labels[i] != max_k ) if( labels[i] != max_k )
continue; continue;
sample = data.ptr<float>(i); sample = data.ptr<float>(i);
double dist = normL2Sqr_(sample, _old_center, dims); double dist = normL2Sqr(sample, _old_center, dims);
if( max_dist <= dist ) if( max_dist <= dist )
{ {
......
...@@ -50,168 +50,6 @@ ...@@ -50,168 +50,6 @@
namespace cv namespace cv
{ {
/****************************************************************************************\
* LU & Cholesky implementation for small matrices *
\****************************************************************************************/
template<typename _Tp> static inline int
LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
{
int i, j, k, p = 1;
astep /= sizeof(A[0]);
bstep /= sizeof(b[0]);
for( i = 0; i < m; i++ )
{
k = i;
for( j = i+1; j < m; j++ )
if( std::abs(A[j*astep + i]) > std::abs(A[k*astep + i]) )
k = j;
if( std::abs(A[k*astep + i]) < std::numeric_limits<_Tp>::epsilon() )
return 0;
if( k != i )
{
for( j = i; j < m; j++ )
std::swap(A[i*astep + j], A[k*astep + j]);
if( b )
for( j = 0; j < n; j++ )
std::swap(b[i*bstep + j], b[k*bstep + j]);
p = -p;
}
_Tp d = -1/A[i*astep + i];
for( j = i+1; j < m; j++ )
{
_Tp alpha = A[j*astep + i]*d;
for( k = i+1; k < m; k++ )
A[j*astep + k] += alpha*A[i*astep + k];
if( b )
for( k = 0; k < n; k++ )
b[j*bstep + k] += alpha*b[i*bstep + k];
}
A[i*astep + i] = -d;
}
if( b )
{
for( i = m-1; i >= 0; i-- )
for( j = 0; j < n; j++ )
{
_Tp s = b[i*bstep + j];
for( k = i+1; k < m; k++ )
s -= A[i*astep + k]*b[k*bstep + j];
b[i*bstep + j] = s*A[i*astep + i];
}
}
return p;
}
int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
return LUImpl(A, astep, m, b, bstep, n);
}
int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
return LUImpl(A, astep, m, b, bstep, n);
}
template<typename _Tp> static inline bool
CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
{
_Tp* L = A;
int i, j, k;
double s;
astep /= sizeof(A[0]);
bstep /= sizeof(b[0]);
for( i = 0; i < m; i++ )
{
for( j = 0; j < i; j++ )
{
s = A[i*astep + j];
for( k = 0; k < j; k++ )
s -= L[i*astep + k]*L[j*astep + k];
L[i*astep + j] = (_Tp)(s*L[j*astep + j]);
}
s = A[i*astep + i];
for( k = 0; k < j; k++ )
{
double t = L[i*astep + k];
s -= t*t;
}
if( s < std::numeric_limits<_Tp>::epsilon() )
return false;
L[i*astep + i] = (_Tp)(1./std::sqrt(s));
}
if( !b )
return true;
// LLt x = b
// 1: L y = b
// 2. Lt x = y
/*
[ L00 ] y0 b0
[ L10 L11 ] y1 = b1
[ L20 L21 L22 ] y2 b2
[ L30 L31 L32 L33 ] y3 b3
[ L00 L10 L20 L30 ] x0 y0
[ L11 L21 L31 ] x1 = y1
[ L22 L32 ] x2 y2
[ L33 ] x3 y3
*/
for( i = 0; i < m; i++ )
{
for( j = 0; j < n; j++ )
{
s = b[i*bstep + j];
for( k = 0; k < i; k++ )
s -= L[i*astep + k]*b[k*bstep + j];
b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
}
}
for( i = m-1; i >= 0; i-- )
{
for( j = 0; j < n; j++ )
{
s = b[i*bstep + j];
for( k = m-1; k > i; k-- )
s -= L[k*astep + i]*b[k*bstep + j];
b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
}
}
return true;
}
bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
return CholImpl(A, astep, m, b, bstep, n);
}
bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
return CholImpl(A, astep, m, b, bstep, n);
}
template<typename _Tp> static inline _Tp hypot(_Tp a, _Tp b) template<typename _Tp> static inline _Tp hypot(_Tp a, _Tp b)
{ {
a = std::abs(a); a = std::abs(a);
...@@ -882,7 +720,7 @@ double cv::determinant( InputArray _mat ) ...@@ -882,7 +720,7 @@ double cv::determinant( InputArray _mat )
Mat a(rows, rows, CV_32F, (uchar*)buffer); Mat a(rows, rows, CV_32F, (uchar*)buffer);
mat.copyTo(a); mat.copyTo(a);
result = LU(a.ptr<float>(), a.step, rows, 0, 0, 0); result = hal::LU(a.ptr<float>(), a.step, rows, 0, 0, 0);
if( result ) if( result )
{ {
for( int i = 0; i < rows; i++ ) for( int i = 0; i < rows; i++ )
...@@ -906,7 +744,7 @@ double cv::determinant( InputArray _mat ) ...@@ -906,7 +744,7 @@ double cv::determinant( InputArray _mat )
Mat a(rows, rows, CV_64F, (uchar*)buffer); Mat a(rows, rows, CV_64F, (uchar*)buffer);
mat.copyTo(a); mat.copyTo(a);
result = LU(a.ptr<double>(), a.step, rows, 0, 0, 0); result = hal::LU(a.ptr<double>(), a.step, rows, 0, 0, 0);
if( result ) if( result )
{ {
for( int i = 0; i < rows; i++ ) for( int i = 0; i < rows; i++ )
...@@ -1169,13 +1007,13 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) ...@@ -1169,13 +1007,13 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
setIdentity(dst); setIdentity(dst);
if( method == DECOMP_LU && type == CV_32F ) if( method == DECOMP_LU && type == CV_32F )
result = LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0; result = hal::LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
else if( method == DECOMP_LU && type == CV_64F ) else if( method == DECOMP_LU && type == CV_64F )
result = LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0; result = hal::LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
else if( method == DECOMP_CHOLESKY && type == CV_32F ) else if( method == DECOMP_CHOLESKY && type == CV_32F )
result = Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n); result = hal::Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
else else
result = Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n); result = hal::Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);
if( !result ) if( !result )
dst = Scalar(0); dst = Scalar(0);
...@@ -1407,16 +1245,16 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth ...@@ -1407,16 +1245,16 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
if( method == DECOMP_LU ) if( method == DECOMP_LU )
{ {
if( type == CV_32F ) if( type == CV_32F )
result = LU(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0; result = hal::LU(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
else else
result = LU(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0; result = hal::LU(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
} }
else if( method == DECOMP_CHOLESKY ) else if( method == DECOMP_CHOLESKY )
{ {
if( type == CV_32F ) if( type == CV_32F )
result = Cholesky(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb); result = hal::Cholesky(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
else else
result = Cholesky(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb); result = hal::Cholesky(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
} }
else else
{ {
......
This diff is collapsed.
...@@ -2416,140 +2416,6 @@ void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal, ...@@ -2416,140 +2416,6 @@ void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal,
namespace cv namespace cv
{ {
float normL2Sqr_(const float* a, const float* b, int n)
{
int j = 0; float d = 0.f;
#if CV_SSE
if( USE_SSE2 )
{
float CV_DECL_ALIGNED(16) buf[4];
__m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
for( ; j <= n - 8; j += 8 )
{
__m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
__m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
}
_mm_store_ps(buf, _mm_add_ps(d0, d1));
d = buf[0] + buf[1] + buf[2] + buf[3];
}
else
#endif
{
for( ; j <= n - 4; j += 4 )
{
float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
}
}
for( ; j < n; j++ )
{
float t = a[j] - b[j];
d += t*t;
}
return d;
}
float normL1_(const float* a, const float* b, int n)
{
int j = 0; float d = 0.f;
#if CV_SSE
if( USE_SSE2 )
{
float CV_DECL_ALIGNED(16) buf[4];
static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
__m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
__m128 absmask = _mm_load_ps((const float*)absbuf);
for( ; j <= n - 8; j += 8 )
{
__m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
__m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
}
_mm_store_ps(buf, _mm_add_ps(d0, d1));
d = buf[0] + buf[1] + buf[2] + buf[3];
}
else
#elif CV_NEON
float32x4_t v_sum = vdupq_n_f32(0.0f);
for ( ; j <= n - 4; j += 4)
v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
float CV_DECL_ALIGNED(16) buf[4];
vst1q_f32(buf, v_sum);
d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
{
for( ; j <= n - 4; j += 4 )
{
d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
}
}
for( ; j < n; j++ )
d += std::abs(a[j] - b[j]);
return d;
}
int normL1_(const uchar* a, const uchar* b, int n)
{
int j = 0, d = 0;
#if CV_SSE
if( USE_SSE2 )
{
__m128i d0 = _mm_setzero_si128();
for( ; j <= n - 16; j += 16 )
{
__m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
__m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
}
for( ; j <= n - 4; j += 4 )
{
__m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
__m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
}
d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
}
else
#elif CV_NEON
uint32x4_t v_sum = vdupq_n_u32(0.0f);
for ( ; j <= n - 16; j += 16)
{
uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
}
uint CV_DECL_ALIGNED(16) buf[4];
vst1q_u32(buf, v_sum);
d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
{
for( ; j <= n - 4; j += 4 )
{
d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
}
}
for( ; j < n; j++ )
d += std::abs(a[j] - b[j]);
return d;
}
template<typename T, typename ST> int template<typename T, typename ST> int
normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn) normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
{ {
...@@ -2564,7 +2430,7 @@ normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn) ...@@ -2564,7 +2430,7 @@ normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
if( mask[i] ) if( mask[i] )
{ {
for( int k = 0; k < cn; k++ ) for( int k = 0; k < cn; k++ )
result = std::max(result, ST(std::abs(src[k]))); result = std::max(result, ST(cv_abs(src[k])));
} }
} }
*_result = result; *_result = result;
...@@ -2585,7 +2451,7 @@ normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn) ...@@ -2585,7 +2451,7 @@ normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
if( mask[i] ) if( mask[i] )
{ {
for( int k = 0; k < cn; k++ ) for( int k = 0; k < cn; k++ )
result += std::abs(src[k]); result += cv_abs(src[k]);
} }
} }
*_result = result; *_result = result;
...@@ -2684,9 +2550,7 @@ normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le ...@@ -2684,9 +2550,7 @@ normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le
Hamming::ResultType Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const Hamming::ResultType Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const
{ {
int result = 0; return cv::hal::normHamming(a, b, size);
cv::hal::normHamming(a, b, size, result);
return result;
} }
#define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \ #define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \
...@@ -3037,16 +2901,12 @@ double cv::norm( InputArray _src, int normType, InputArray _mask ) ...@@ -3037,16 +2901,12 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
if( normType == NORM_HAMMING ) if( normType == NORM_HAMMING )
{ {
int result = 0; return hal::normHamming(data, (int)len);
cv::hal::normHamming(data, (int)len, result);
return result;
} }
if( normType == NORM_HAMMING2 ) if( normType == NORM_HAMMING2 )
{ {
int result = 0; return hal::normHamming(data, (int)len, 2);
hal::normHamming(data, (int)len, 2, result);
return result;
} }
} }
} }
...@@ -3072,9 +2932,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask ) ...@@ -3072,9 +2932,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
for( size_t i = 0; i < it.nplanes; i++, ++it ) for( size_t i = 0; i < it.nplanes; i++, ++it )
{ {
int one = 0; result += hal::normHamming(ptrs[0], total, cellSize);
cv::hal::normHamming(ptrs[0], total, cellSize, one);
result += one;
} }
return result; return result;
...@@ -3558,9 +3416,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m ...@@ -3558,9 +3416,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
for( size_t i = 0; i < it.nplanes; i++, ++it ) for( size_t i = 0; i < it.nplanes; i++, ++it )
{ {
int one = 0; result += hal::normHamming(ptrs[0], ptrs[1], total, cellSize);
hal::normHamming(ptrs[0], ptrs[1], total, cellSize, one);
result += one;
} }
return result; return result;
...@@ -3698,7 +3554,7 @@ static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2, ...@@ -3698,7 +3554,7 @@ static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2,
if( !mask ) if( !mask )
{ {
for( int i = 0; i < nvecs; i++ ) for( int i = 0; i < nvecs; i++ )
hal::normHamming(src1, src2 + step2*i, len, dist[i]); dist[i] = hal::normHamming(src1, src2 + step2*i, len);
} }
else else
{ {
...@@ -3706,7 +3562,7 @@ static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2, ...@@ -3706,7 +3562,7 @@ static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2,
for( int i = 0; i < nvecs; i++ ) for( int i = 0; i < nvecs; i++ )
{ {
if (mask[i]) if (mask[i])
hal::normHamming(src1, src2 + step2*i, len, dist[i]); dist[i] = hal::normHamming(src1, src2 + step2*i, len);
else else
dist[i] = val0; dist[i] = val0;
} }
...@@ -3720,7 +3576,7 @@ static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2 ...@@ -3720,7 +3576,7 @@ static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2
if( !mask ) if( !mask )
{ {
for( int i = 0; i < nvecs; i++ ) for( int i = 0; i < nvecs; i++ )
hal::normHamming(src1, src2 + step2*i, len, 2, dist[i]); dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
} }
else else
{ {
...@@ -3728,7 +3584,7 @@ static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2 ...@@ -3728,7 +3584,7 @@ static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2
for( int i = 0; i < nvecs; i++ ) for( int i = 0; i < nvecs; i++ )
{ {
if (mask[i]) if (mask[i])
hal::normHamming(src1, src2 + step2*i, len, 2, dist[i]); dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
else else
dist[i] = val0; dist[i] = val0;
} }
......
...@@ -812,7 +812,7 @@ void AKAZEFeatures::Compute_Main_Orientation(KeyPoint& kpt, const std::vector<TE ...@@ -812,7 +812,7 @@ void AKAZEFeatures::Compute_Main_Orientation(KeyPoint& kpt, const std::vector<TE
} }
} }
} }
fastAtan2(resY, resX, Ang, ang_size, false); hal::fastAtan2(resY, resX, Ang, ang_size, false);
// Loop slides pi/3 window around feature point // Loop slides pi/3 window around feature point
for (ang1 = 0; ang1 < (float)(2.0 * CV_PI); ang1 += 0.15f) { for (ang1 = 0; ang1 < (float)(2.0 * CV_PI); ang1 += 0.15f) {
ang2 = (ang1 + (float)(CV_PI / 3.0) >(float)(2.0*CV_PI) ? ang1 - (float)(5.0*CV_PI / 3.0) : ang1 + (float)(CV_PI / 3.0)); ang2 = (ang1 + (float)(CV_PI / 3.0) >(float)(2.0*CV_PI) ? ang1 - (float)(5.0*CV_PI / 3.0) : ang1 + (float)(CV_PI / 3.0));
......
...@@ -81,28 +81,17 @@ float normL1_(const float* a, const float* b, int n); ...@@ -81,28 +81,17 @@ float normL1_(const float* a, const float* b, int n);
float normL2Sqr_(const float* a, const float* b, int n); float normL2Sqr_(const float* a, const float* b, int n);
void exp(const float* src, float* dst, int n); void exp(const float* src, float* dst, int n);
void exp(const double* src, double* dst, int n);
void log(const float* src, float* dst, int n); void log(const float* src, float* dst, int n);
void log(const double* src, double* dst, int n);
void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees); void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
void magnitude(const float* x, const float* y, float* dst, int n); void magnitude(const float* x, const float* y, float* dst, int n);
void magnitude(const double* x, const double* y, double* dst, int n);
/** @brief Computes the cube root of an argument. void sqrt(const float* src, float* dst, int len);
void sqrt(const double* src, double* dst, int len);
The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly. void invSqrt(const float* src, float* dst, int len);
NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for void invSqrt(const double* src, double* dst, int len);
single-precision data.
@param val A function argument.
*/
float cubeRoot(float val);
/** @brief Calculates the angle of a 2D vector in degrees.
The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured
in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees.
@param x x-coordinate of the vector.
@param y y-coordinate of the vector.
*/
float fastAtan2(float y, float x);
}} //cv::hal }} //cv::hal
......
...@@ -380,7 +380,7 @@ cvRound( double value ) ...@@ -380,7 +380,7 @@ cvRound( double value )
TEGRA_ROUND_DBL(value); TEGRA_ROUND_DBL(value);
#elif defined CV_ICC || defined __GNUC__ #elif defined CV_ICC || defined __GNUC__
# if CV_VFP # if CV_VFP
ARM_ROUND_DBL(value) ARM_ROUND_DBL(value);
# else # else
return (int)lrint(value); return (int)lrint(value);
# endif # endif
...@@ -488,7 +488,7 @@ CV_INLINE int cvRound(float value) ...@@ -488,7 +488,7 @@ CV_INLINE int cvRound(float value)
TEGRA_ROUND_FLT(value); TEGRA_ROUND_FLT(value);
#elif defined CV_ICC || defined __GNUC__ #elif defined CV_ICC || defined __GNUC__
# if CV_VFP # if CV_VFP
ARM_ROUND_FLT(value) ARM_ROUND_FLT(value);
# else # else
return (int)lrintf(value); return (int)lrintf(value);
# endif # endif
......
This diff is collapsed.
This diff is collapsed.
...@@ -44,4 +44,165 @@ ...@@ -44,4 +44,165 @@
namespace cv { namespace hal { namespace cv { namespace hal {
/****************************************************************************************\
* LU & Cholesky implementation for small matrices *
\****************************************************************************************/
template<typename _Tp> static inline int
LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
{
int i, j, k, p = 1;
astep /= sizeof(A[0]);
bstep /= sizeof(b[0]);
for( i = 0; i < m; i++ )
{
k = i;
for( j = i+1; j < m; j++ )
if( std::abs(A[j*astep + i]) > std::abs(A[k*astep + i]) )
k = j;
if( std::abs(A[k*astep + i]) < std::numeric_limits<_Tp>::epsilon() )
return 0;
if( k != i )
{
for( j = i; j < m; j++ )
std::swap(A[i*astep + j], A[k*astep + j]);
if( b )
for( j = 0; j < n; j++ )
std::swap(b[i*bstep + j], b[k*bstep + j]);
p = -p;
}
_Tp d = -1/A[i*astep + i];
for( j = i+1; j < m; j++ )
{
_Tp alpha = A[j*astep + i]*d;
for( k = i+1; k < m; k++ )
A[j*astep + k] += alpha*A[i*astep + k];
if( b )
for( k = 0; k < n; k++ )
b[j*bstep + k] += alpha*b[i*bstep + k];
}
A[i*astep + i] = -d;
}
if( b )
{
for( i = m-1; i >= 0; i-- )
for( j = 0; j < n; j++ )
{
_Tp s = b[i*bstep + j];
for( k = i+1; k < m; k++ )
s -= A[i*astep + k]*b[k*bstep + j];
b[i*bstep + j] = s*A[i*astep + i];
}
}
return p;
}
int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
return LUImpl(A, astep, m, b, bstep, n);
}
int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
return LUImpl(A, astep, m, b, bstep, n);
}
template<typename _Tp> static inline bool
CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
{
_Tp* L = A;
int i, j, k;
double s;
astep /= sizeof(A[0]);
bstep /= sizeof(b[0]);
for( i = 0; i < m; i++ )
{
for( j = 0; j < i; j++ )
{
s = A[i*astep + j];
for( k = 0; k < j; k++ )
s -= L[i*astep + k]*L[j*astep + k];
L[i*astep + j] = (_Tp)(s*L[j*astep + j]);
}
s = A[i*astep + i];
for( k = 0; k < j; k++ )
{
double t = L[i*astep + k];
s -= t*t;
}
if( s < std::numeric_limits<_Tp>::epsilon() )
return false;
L[i*astep + i] = (_Tp)(1./std::sqrt(s));
}
if( !b )
return true;
// LLt x = b
// 1: L y = b
// 2. Lt x = y
/*
[ L00 ] y0 b0
[ L10 L11 ] y1 = b1
[ L20 L21 L22 ] y2 b2
[ L30 L31 L32 L33 ] y3 b3
[ L00 L10 L20 L30 ] x0 y0
[ L11 L21 L31 ] x1 = y1
[ L22 L32 ] x2 y2
[ L33 ] x3 y3
*/
for( i = 0; i < m; i++ )
{
for( j = 0; j < n; j++ )
{
s = b[i*bstep + j];
for( k = 0; k < i; k++ )
s -= L[i*astep + k]*b[k*bstep + j];
b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
}
}
for( i = m-1; i >= 0; i-- )
{
for( j = 0; j < n; j++ )
{
s = b[i*bstep + j];
for( k = m-1; k > i; k-- )
s -= L[k*astep + i]*b[k*bstep + j];
b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
}
}
return true;
}
bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
return CholImpl(A, astep, m, b, bstep, n);
}
bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
return CholImpl(A, astep, m, b, bstep, n);
}
}} }}
...@@ -42,3 +42,7 @@ ...@@ -42,3 +42,7 @@
#include "opencv2/hal.hpp" #include "opencv2/hal.hpp"
#include "opencv2/hal/intrin.hpp" #include "opencv2/hal/intrin.hpp"
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <float.h>
This diff is collapsed.
...@@ -44,6 +44,9 @@ ...@@ -44,6 +44,9 @@
#ifndef __OPENCV_DENOISING_ARRAYS_HPP__ #ifndef __OPENCV_DENOISING_ARRAYS_HPP__
#define __OPENCV_DENOISING_ARRAYS_HPP__ #define __OPENCV_DENOISING_ARRAYS_HPP__
namespace cv
{
template <class T> template <class T>
struct Array2d struct Array2d
{ {
...@@ -176,4 +179,6 @@ struct Array4d ...@@ -176,4 +179,6 @@ struct Array4d
} }
}; };
}
#endif #endif
...@@ -49,7 +49,7 @@ namespace { ...@@ -49,7 +49,7 @@ namespace {
template<typename _Tp> static inline bool template<typename _Tp> static inline bool
decomposeCholesky(_Tp* A, size_t astep, int m) decomposeCholesky(_Tp* A, size_t astep, int m)
{ {
if (!Cholesky(A, astep, m, 0, 0, 0)) if (!hal::Cholesky(A, astep, m, 0, 0, 0))
return false; return false;
astep /= sizeof(A[0]); astep /= sizeof(A[0]);
for (int i = 0; i < m; ++i) for (int i = 0; i < m; ++i)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment