Commit 4b968d1f authored Oct 02, 2017 by Sayed Adel
Added universal intrinsic for VSX
parent d0777780
Showing 2 changed files with 932 additions and 0 deletions
modules/core/include/opencv2/core/hal/intrin.hpp (+5, -0)
modules/core/include/opencv2/core/hal/intrin_vsx.hpp (+927, -0)
modules/core/include/opencv2/core/hal/intrin.hpp
@@ -308,6 +308,7 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 #ifdef CV_DOXYGEN
 # undef CV_SSE2
 # undef CV_NEON
+# undef CV_VSX
 #endif
 #if CV_SSE2
 ...
@@ -318,6 +319,10 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 #include "opencv2/core/hal/intrin_neon.hpp"
+#elif CV_VSX
+#include "opencv2/core/hal/intrin_vsx.hpp"
 #else
 #include "opencv2/core/hal/intrin_cpp.hpp"
 ...
modules/core/include/opencv2/core/hal/intrin_vsx.hpp
0 → 100644
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_HAL_VSX_HPP
#define OPENCV_HAL_VSX_HPP
#include <algorithm>
#include "opencv2/core/utility.hpp"
#define CV_SIMD128 1
#define CV_SIMD128_64F 1
/**
 * TODO: support half precision for POWER9,
 * using the convert instructions xvcvhpsp and xvcvsphp.
 **/
namespace cv
{
//! @cond IGNORED
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
///////// Types ////////////
struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };
    vec_uchar16 val;

    explicit v_uint8x16(const vec_uchar16& v) : val(v)
    {}
    v_uint8x16() : val(vec_uchar16_z)
    {}
    v_uint8x16(vec_bchar16 v) : val(vec_uchar16_c(v))
    {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
        : val(vec_uchar16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15))
    {}
    uchar get0() const
    { return vec_extract(val, 0); }
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };
    vec_char16 val;

    explicit v_int8x16(const vec_char16& v) : val(v)
    {}
    v_int8x16() : val(vec_char16_z)
    {}
    v_int8x16(vec_bchar16 v) : val(vec_char16_c(v))
    {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
        : val(vec_char16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15))
    {}
    schar get0() const
    { return vec_extract(val, 0); }
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };
    vec_ushort8 val;

    explicit v_uint16x8(const vec_ushort8& v) : val(v)
    {}
    v_uint16x8() : val(vec_ushort8_z)
    {}
    v_uint16x8(vec_bshort8 v) : val(vec_ushort8_c(v))
    {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
        : val(vec_ushort8_set(v0, v1, v2, v3, v4, v5, v6, v7))
    {}
    ushort get0() const
    { return vec_extract(val, 0); }
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };
    vec_short8 val;

    explicit v_int16x8(const vec_short8& v) : val(v)
    {}
    v_int16x8() : val(vec_short8_z)
    {}
    v_int16x8(vec_bshort8 v) : val(vec_short8_c(v))
    {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
        : val(vec_short8_set(v0, v1, v2, v3, v4, v5, v6, v7))
    {}
    short get0() const
    { return vec_extract(val, 0); }
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };
    vec_uint4 val;

    explicit v_uint32x4(const vec_uint4& v) : val(v)
    {}
    v_uint32x4() : val(vec_uint4_z)
    {}
    v_uint32x4(vec_bint4 v) : val(vec_uint4_c(v))
    {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
        : val(vec_uint4_set(v0, v1, v2, v3))
    {}
    uint get0() const
    { return vec_extract(val, 0); }
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };
    vec_int4 val;

    explicit v_int32x4(const vec_int4& v) : val(v)
    {}
    v_int32x4() : val(vec_int4_z)
    {}
    v_int32x4(vec_bint4 v) : val(vec_int4_c(v))
    {}
    v_int32x4(int v0, int v1, int v2, int v3)
        : val(vec_int4_set(v0, v1, v2, v3))
    {}
    int get0() const
    { return vec_extract(val, 0); }
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };
    vec_float4 val;

    explicit v_float32x4(const vec_float4& v) : val(v)
    {}
    v_float32x4() : val(vec_float4_z)
    {}
    v_float32x4(vec_bint4 v) : val(vec_float4_c(v))
    {}
    v_float32x4(float v0, float v1, float v2, float v3)
        : val(vec_float4_set(v0, v1, v2, v3))
    {}
    float get0() const
    { return vec_extract(val, 0); }
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };
    vec_udword2 val;

    explicit v_uint64x2(const vec_udword2& v) : val(v)
    {}
    v_uint64x2() : val(vec_udword2_z)
    {}
    v_uint64x2(vec_bdword2 v) : val(vec_udword2_c(v))
    {}
    v_uint64x2(uint64 v0, uint64 v1)
        : val(vec_udword2_set(v0, v1))
    {}
    uint64 get0() const
    { return vec_extract(val, 0); }
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };
    vec_dword2 val;

    explicit v_int64x2(const vec_dword2& v) : val(v)
    {}
    v_int64x2() : val(vec_dword2_z)
    {}
    v_int64x2(vec_bdword2 v) : val(vec_dword2_c(v))
    {}
    v_int64x2(int64 v0, int64 v1)
        : val(vec_dword2_set(v0, v1))
    {}
    int64 get0() const
    { return vec_extract(val, 0); }
};

struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };
    vec_double2 val;

    explicit v_float64x2(const vec_double2& v) : val(v)
    {}
    v_float64x2() : val(vec_double2_z)
    {}
    v_float64x2(vec_bdword2 v) : val(vec_double2_c(v))
    {}
    v_float64x2(double v0, double v1)
        : val(vec_double2_set(v0, v1))
    {}
    double get0() const
    { return vec_extract(val, 0); }
};
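
// Usage sketch (illustrative, not part of the original source): each wrapper
// above is a thin, typed view of one 128-bit VSX register. Lanes can be set
// with the element-wise constructor and lane 0 read back with get0():
//
//     v_int32x4 v(1, 2, 3, 4);  // wraps vec_int4_set(1, 2, 3, 4)
//     int first = v.get0();     // 1, via vec_extract(v.val, 0)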
//////////////// Load and store operations ///////////////
/*
 * clang-5 aborts while parsing "vec_xxx_c" only when it appears
 * inside a function template defined by a preprocessor macro.
 *
 * If vec_xxx_c is defined as a C++ cast instead, clang-5 accepts it.
 */
#define OPENCV_HAL_IMPL_VSX_INITVEC(_Tpvec, _Tp, suffix, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(vec_splats((_Tp)v));} \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0 &a) \
{ return _Tpvec((cast)a.val); }
OPENCV_HAL_IMPL_VSX_INITVEC(v_uint8x16, uchar, u8, vec_uchar16)
OPENCV_HAL_IMPL_VSX_INITVEC(v_int8x16, schar, s8, vec_char16)
OPENCV_HAL_IMPL_VSX_INITVEC(v_uint16x8, ushort, u16, vec_ushort8)
OPENCV_HAL_IMPL_VSX_INITVEC(v_int16x8, short, s16, vec_short8)
OPENCV_HAL_IMPL_VSX_INITVEC(v_uint32x4, uint, u32, vec_uint4)
OPENCV_HAL_IMPL_VSX_INITVEC(v_int32x4, int, s32, vec_int4)
OPENCV_HAL_IMPL_VSX_INITVEC(v_uint64x2, uint64, u64, vec_udword2)
OPENCV_HAL_IMPL_VSX_INITVEC(v_int64x2, int64, s64, vec_dword2)
OPENCV_HAL_IMPL_VSX_INITVEC(v_float32x4, float, f32, vec_float4)
OPENCV_HAL_IMPL_VSX_INITVEC(v_float64x2, double, f64, vec_double2)
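
// Usage sketch (illustrative): the macro above generates v_setzero_*,
// v_setall_* and v_reinterpret_as_* for every wrapper type:
//
//     v_float32x4 ones = v_setall_f32(1.0f);          // broadcast one scalar
//     v_uint32x4  bits = v_reinterpret_as_u32(ones);  // bitwise view, no conversion
//     v_int16x8   zero = v_setzero_s16();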
#define OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(_Tpvec, _Tp, ld_func, st_func) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(ld_func(0, ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(ld_func(0, ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vec_mergesqh(vec_ld_l8(ptr0), vec_ld_l8(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ st_func(a.val, 0, ptr); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ st_func(a.val, 0, ptr); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vec_st_l8(a.val, ptr); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ vec_st_h8(a.val, ptr); }
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint8x16, uchar, vsx_ld, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int8x16, schar, vsx_ld, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint16x8, ushort, vsx_ld, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int16x8, short, vsx_ld, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint32x4, uint, vsx_ld, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int32x4, int, vsx_ld, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_float32x4, float, vsx_ld, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_float64x2, double, vsx_ld, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint64x2, uint64, vsx_ld2, vsx_st2)
OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int64x2, int64, vsx_ld2, vsx_st2)
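
// Usage sketch (illustrative; assumes the buffer length n is a multiple of
// v_float32x4::nlanes): a typical load/compute/store loop built on the
// functions generated above.
//
//     for (int i = 0; i < n; i += v_float32x4::nlanes)
//     {
//         v_float32x4 a = v_load(src0 + i);
//         v_float32x4 b = v_load(src1 + i);
//         v_store(dst + i, a + b);  // operator+ is defined further below
//     }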
//////////////// Value reordering ///////////////
/* de&interleave */
#define OPENCV_HAL_IMPL_VSX_INTERLEAVE(_Tp, _Tpvec) \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b) \
{ vec_ld_deinterleave(ptr, a.val, b.val);} \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, \
_Tpvec& b, _Tpvec& c) \
{ vec_ld_deinterleave(ptr, a.val, b.val, c.val); } \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b, \
_Tpvec& c, _Tpvec& d) \
{ vec_ld_deinterleave(ptr, a.val, b.val, c.val, d.val); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) \
{ vec_st_interleave(a.val, b.val, ptr); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, \
const _Tpvec& b, const _Tpvec& c) \
{ vec_st_interleave(a.val, b.val, c.val, ptr); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
const _Tpvec& c, const _Tpvec& d) \
{ vec_st_interleave(a.val, b.val, c.val, d.val, ptr); }
OPENCV_HAL_IMPL_VSX_INTERLEAVE(uchar, v_uint8x16)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(schar, v_int8x16)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(ushort, v_uint16x8)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(short, v_int16x8)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint, v_uint32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(int, v_int32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(float, v_float32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(double, v_float64x2)
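
// Usage sketch (illustrative): splitting packed 3-channel uchar data (e.g.
// BGR pixels) into planar vectors and packing it back again.
//
//     v_uint8x16 b, g, r;
//     v_load_deinterleave(bgr_ptr, b, g, r);  // 48 interleaved bytes -> 3 planes
//     v_store_interleave(bgr_ptr, b, g, r);   // and back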
/* Expand */
#define OPENCV_HAL_IMPL_VSX_EXPAND(_Tpvec, _Tpwvec, _Tp, fl, fh) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
b0.val = fh(a.val); \
b1.val = fl(a.val); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ return _Tpwvec(fh(vsx_ld(0, ptr))); }
OPENCV_HAL_IMPL_VSX_EXPAND(v_uint8x16, v_uint16x8, uchar, vec_unpacklu, vec_unpackhu)
OPENCV_HAL_IMPL_VSX_EXPAND(v_int8x16, v_int16x8, schar, vec_unpackl, vec_unpackh)
OPENCV_HAL_IMPL_VSX_EXPAND(v_uint16x8, v_uint32x4, ushort, vec_unpacklu, vec_unpackhu)
OPENCV_HAL_IMPL_VSX_EXPAND(v_int16x8, v_int32x4, short, vec_unpackl, vec_unpackh)
OPENCV_HAL_IMPL_VSX_EXPAND(v_uint32x4, v_uint64x2, uint, vec_unpacklu, vec_unpackhu)
OPENCV_HAL_IMPL_VSX_EXPAND(v_int32x4, v_int64x2, int, vec_unpackl, vec_unpackh)
inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{ return v_uint32x4(vec_ld_buw(ptr)); }

inline v_int32x4 v_load_expand_q(const schar* ptr)
{ return v_int32x4(vec_ld_bsw(ptr)); }
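
// Usage sketch (illustrative): widening narrow lanes before arithmetic that
// could overflow, using the expand helpers above.
//
//     v_uint8x16 pix = v_load(ptr);          // 16 x u8
//     v_uint16x8 w0, w1;
//     v_expand(pix, w0, w1);                 // two registers of 8 x u16
//     v_uint16x8 head = v_load_expand(ptr);  // first 8 bytes widened directly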
/* pack */
#define OPENCV_HAL_IMPL_VSX_PACK(_Tpvec, _Tp, _Tpwvec, _Tpvn, _Tpdel, sfnc, pkfnc, addfnc, pack) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
return _Tpvec(pkfnc(a.val, b.val)); \
} \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
vec_st_l8(pkfnc(a.val, a.val), ptr); \
} \
template<int n> \
inline _Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
const __vector _Tpvn vn = vec_splats((_Tpvn)n); \
const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1))); \
return _Tpvec(pkfnc(sfnc(addfnc(a.val, delta), vn), sfnc(addfnc(b.val, delta), vn))); \
} \
template<int n> \
inline void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
const __vector _Tpvn vn = vec_splats((_Tpvn)n); \
const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1))); \
vec_st_l8(pkfnc(sfnc(addfnc(a.val, delta), vn), delta), ptr); \
}
OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_uint16x8, unsigned short, unsigned short, vec_sr, vec_packs, vec_adds, pack)
OPENCV_HAL_IMPL_VSX_PACK(v_int8x16, schar, v_int16x8, unsigned short, short, vec_sra, vec_packs, vec_adds, pack)
OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_uint32x4, unsigned int, unsigned int, vec_sr, vec_packs, vec_add, pack)
OPENCV_HAL_IMPL_VSX_PACK(v_int16x8, short, v_int32x4, unsigned int, int, vec_sra, vec_packs, vec_add, pack)
OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_uint64x2, unsigned long long, unsigned long long, vec_sr, vec_packs, vec_add, pack)
OPENCV_HAL_IMPL_VSX_PACK(v_int32x4, int, v_int64x2, unsigned long long, long long, vec_sra, vec_packs, vec_add, pack)
OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_int16x8, unsigned short, short, vec_sra, vec_packsu, vec_adds, pack_u)
OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_int32x4, unsigned int, int, vec_sra, vec_packsu, vec_add, pack_u)
OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_int64x2, unsigned long long, long long, vec_sra, vec_packsu, vec_add, pack_u)
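
// Usage sketch (illustrative): narrowing with saturation via the generated
// pack functions; v_rshr_pack<n> adds the rounding delta 1 << (n-1) before
// shifting right by n and packing.
//
//     v_int16x8 a = v_setall_s16(300), b = v_setall_s16(-300);
//     v_int8x16  s = v_pack(a, b);          // saturates to 127 / -128
//     v_uint8x16 u = v_pack_u(a, b);        // saturates to 255 / 0
//     v_int8x16  r = v_rshr_pack<2>(a, b);  // (300 + 2) >> 2 = 75, then pack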
/* Recombine */
template <typename _Tpvec>
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
{
    b0.val = vec_mergeh(a0.val, a1.val);
    b1.val = vec_mergel(a0.val, a1.val);
}

template <typename _Tpvec>
inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(vec_mergesql(a.val, b.val)); }

template <typename _Tpvec>
inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(vec_mergesqh(a.val, b.val)); }

template <typename _Tpvec>
inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d)
{
    c.val = vec_mergesqh(a.val, b.val);
    d.val = vec_mergesql(a.val, b.val);
}
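
// Usage sketch (illustrative): v_zip interleaves two registers lane by lane,
// while v_combine_low/high and v_recombine move whole 64-bit halves.
//
//     v_uint32x4 a(0, 1, 2, 3), b(4, 5, 6, 7);
//     v_uint32x4 lo, hi;
//     v_zip(a, b, lo, hi);   // lo = {0,4,1,5}, hi = {2,6,3,7}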
/* Extract */
template <int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
{
    const int w = sizeof(typename _Tpvec::lane_type);
    const int n = _Tpvec::nlanes;
    const unsigned int sf = ((w * n) - (s * w));
    if (s == 0)
        return _Tpvec(a.val);
    else if (sf > 15)
        return _Tpvec();
    // the bitwise "& 15" is only there to keep xlc happy
    return _Tpvec(vec_sld(b.val, a.val, sf & 15));
}
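
// Usage sketch (illustrative): per the universal intrinsics contract,
// v_extract<s>(a, b) returns the last (nlanes - s) lanes of a followed by
// the first s lanes of b.
//
//     v_uint32x4 a(0, 1, 2, 3), b(4, 5, 6, 7);
//     v_uint32x4 r = v_extract<1>(a, b);  // {1, 2, 3, 4}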
#define OPENCV_HAL_IMPL_VSX_EXTRACT_2(_Tpvec) \
template<int s> \
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
{ \
switch(s) { \
case 0: return _Tpvec(a.val); \
case 2: return _Tpvec(b.val); \
case 1: return _Tpvec(vec_sldw(b.val, a.val, 2)); \
default: return _Tpvec(); \
} \
}
OPENCV_HAL_IMPL_VSX_EXTRACT_2(v_uint64x2)
OPENCV_HAL_IMPL_VSX_EXTRACT_2(v_int64x2)
////////// Arithmetic, bitwise and comparison operations /////////
/* Element-wise binary and unary operations */
/** Arithmetics **/
#define OPENCV_HAL_IMPL_VSX_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); } \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ a.val = intrin(a.val, b.val); return a; }
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint16x8, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int16x8, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float32x4, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float64x2, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float64x2, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub)
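
// Note (illustrative): for 8- and 16-bit lanes the operators above map to the
// saturating vec_adds/vec_subs, so they clamp instead of wrapping; the
// wrap-around versions are the v_add_wrap/v_sub_wrap functions defined below.
//
//     v_uint8x16 a = v_setall_u8(250), b = v_setall_u8(10);
//     v_uint8x16 s = a + b;             // 255 in every lane (saturated)
//     v_uint8x16 w = v_add_wrap(a, b);  // 4 in every lane (modulo 256)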
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, v_int32x4& c, v_int32x4& d)
{
    c.val = vec_mul(vec_unpackh(a.val), vec_unpackh(b.val));
    d.val = vec_mul(vec_unpackl(a.val), vec_unpackl(b.val));
}
inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, v_uint32x4& c, v_uint32x4& d)
{
    c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val));
    d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val));
}
inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c, v_uint64x2& d)
{
    c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val));
    d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val));
}
/** Non-saturating arithmetics **/
#define OPENCV_HAL_IMPL_VSX_BIN_FUNC(func, intrin) \
template<typename _Tpvec> \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpuvec, splfunc) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ return _Tpuvec(vec_sl(a.val, splfunc(imm))); } \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ return _Tpuvec(vec_sr(a.val, splfunc(imm))); } \
template<int imm> inline _Tpuvec v_shl(const _Tpuvec& a) \
{ return _Tpuvec(vec_sl(a.val, splfunc(imm))); } \
template<int imm> inline _Tpuvec v_shr(const _Tpuvec& a) \
{ return _Tpuvec(vec_sr(a.val, splfunc(imm))); }
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_udword2_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_udword2_sp)
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \
OPENCV_HAL_IMPL_VSX_BIN_OP(&, _Tpvec, vec_and) \
OPENCV_HAL_IMPL_VSX_BIN_OP(|, _Tpvec, vec_or) \
OPENCV_HAL_IMPL_VSX_BIN_OP(^, _Tpvec, vec_xor) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ return _Tpvec(vec_not(a.val)); }
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint8x16)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int8x16)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint16x8)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int16x8)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint32x4)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int32x4)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint64x2)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int64x2)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_float32x4)
OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_float64x2)
/** Bitwise select **/
#define OPENCV_HAL_IMPL_VSX_SELECT(_Tpvec, cast) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_sel(b.val, a.val, cast(mask.val))); }
OPENCV_HAL_IMPL_VSX_SELECT(v_uint8x16, vec_bchar16_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_int8x16, vec_bchar16_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_uint16x8, vec_bshort8_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_int16x8, vec_bshort8_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_uint32x4, vec_bint4_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_int32x4, vec_bint4_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_float32x4, vec_bint4_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_float64x2, vec_bdword2_c)
/** Comparison **/
#define OPENCV_HAL_IMPL_VSX_INT_CMP_OP(_Tpvec) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpeq(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpne(a.val, b.val)); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmplt(a.val, b.val)); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpgt(a.val, b.val)); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmple(a.val, b.val)); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpge(a.val, b.val)); }
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint8x16)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int8x16)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint16x8)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int16x8)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint32x4)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int32x4)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float32x4)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float64x2)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint64x2)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
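
// Usage sketch (illustrative): comparisons yield all-ones lanes for true and
// all-zero lanes for false, which feed directly into v_select defined above.
//
//     v_float32x4 mask = a > b;                 // per-lane 0xFFFFFFFF / 0x00000000
//     v_float32x4 vmax = v_select(mask, a, b);  // same result as v_max(a, b)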
/** min/max **/
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)
////////// Reduce and mask /////////
/** Reduce **/
inline short v_reduce_sum(const v_int16x8& a)
{
    const vec_int4 zero = vec_int4_z;
    return saturate_cast<short>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
}
inline ushort v_reduce_sum(const v_uint16x8& a)
{
    const vec_int4 v4 = vec_int4_c(vec_unpackhu(vec_adds(a.val, vec_sld(a.val, a.val, 8))));
    return saturate_cast<ushort>(vec_extract(vec_sums(v4, vec_int4_z), 3));
}
#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(_Tpvec, _Tpvec2, scalartype, suffix, func) \
inline scalartype v_reduce_##suffix(const _Tpvec& a) \
{ \
const _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
return vec_extract(func(rs, vec_sld(rs, rs, 4)), 0); \
}
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, sum, vec_add)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, min, vec_min)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, sum, vec_add)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, min, vec_min)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min)
#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(_Tpvec, _Tpvec2, scalartype, suffix, func) \
inline scalartype v_reduce_##suffix(const _Tpvec& a) \
{ \
_Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
rs = func(rs, vec_sld(rs, rs, 4)); \
return vec_extract(func(rs, vec_sld(rs, rs, 2)), 0); \
}
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, min, vec_min)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, min, vec_min)
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    vec_float4 ac = vec_add(vec_mergel(a.val, c.val), vec_mergeh(a.val, c.val));
    ac = vec_add(ac, vec_sld(ac, ac, 8));

    vec_float4 bd = vec_add(vec_mergel(b.val, d.val), vec_mergeh(b.val, d.val));
    bd = vec_add(bd, vec_sld(bd, bd, 8));
    return v_float32x4(vec_mergeh(ac, bd));
}
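
// Usage sketch (illustrative): horizontal reductions collapse one register to
// a scalar; v_reduce_sum4 sums each input and gathers the four totals into
// one register.
//
//     v_float32x4 v(1.f, 2.f, 3.f, 4.f);
//     float s = v_reduce_sum(v);  // 10.f
//     float m = v_reduce_max(v);  // 4.f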
/** Popcount **/
#define OPENCV_HAL_IMPL_VSX_POPCOUNT_8(_Tpvec) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
vec_uchar16 v16 = vec_popcntu(a.val); \
vec_ushort8 v8 = vec_add(vec_unpacklu(v16), vec_unpackhu(v16)); \
return v_uint32x4(vec_add(vec_unpacklu(v8), vec_unpackhu(v8))); \
}
OPENCV_HAL_IMPL_VSX_POPCOUNT_8(v_int8x16)
OPENCV_HAL_IMPL_VSX_POPCOUNT_8(v_uint8x16)
#define OPENCV_HAL_IMPL_VSX_POPCOUNT_16(_Tpvec) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
vec_ushort8 v8 = vec_popcntu(a.val); \
return v_uint32x4(vec_add(vec_unpacklu(v8), vec_unpackhu(v8))); \
}
OPENCV_HAL_IMPL_VSX_POPCOUNT_16(v_int16x8)
OPENCV_HAL_IMPL_VSX_POPCOUNT_16(v_uint16x8)
#define OPENCV_HAL_IMPL_VSX_POPCOUNT_32(_Tpvec) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ return v_uint32x4(vec_popcntu(a.val)); }
OPENCV_HAL_IMPL_VSX_POPCOUNT_32(v_int32x4)
OPENCV_HAL_IMPL_VSX_POPCOUNT_32(v_uint32x4)
/** Mask **/
inline int v_signmask(const v_uint8x16& a)
{
    vec_uchar16 sv = vec_sr(a.val, vec_uchar16_sp(7));
    static const vec_uchar16 slm = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
    sv = vec_sl(sv, slm);
    vec_uint4 sv4 = vec_sum4s(sv, vec_uint4_z);
    static const vec_uint4 slm4 = {0, 0, 8, 8};
    sv4 = vec_sl(sv4, slm4);
    return vec_extract(vec_sums((vec_int4)sv4, vec_int4_z), 3);
}
inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }

inline int v_signmask(const v_int16x8& a)
{
    static const vec_ushort8 slm = {0, 1, 2, 3, 4, 5, 6, 7};
    vec_short8 sv = vec_sr(a.val, vec_ushort8_sp(15));
    sv = vec_sl(sv, slm);
    vec_int4 svi = vec_int4_z;
    svi = vec_sums(vec_sum4s(sv, svi), svi);
    return vec_extract(svi, 3);
}
inline int v_signmask(const v_uint16x8& a)
{ return v_signmask(v_reinterpret_as_s16(a)); }

inline int v_signmask(const v_int32x4& a)
{
    static const vec_uint4 slm = {0, 1, 2, 3};
    vec_int4 sv = vec_sr(a.val, vec_uint4_sp(31));
    sv = vec_sl(sv, slm);
    sv = vec_sums(sv, vec_int4_z);
    return vec_extract(sv, 3);
}
inline int v_signmask(const v_uint32x4& a)
{ return v_signmask(v_reinterpret_as_s32(a)); }
inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_s32(a)); }

inline int v_signmask(const v_int64x2& a)
{
    const vec_dword2 sv = vec_sr(a.val, vec_udword2_sp(63));
    return (int)vec_extract(sv, 0) | (int)vec_extract(sv, 1) << 1;
}
inline int v_signmask(const v_uint64x2& a)
{ return v_signmask(v_reinterpret_as_s64(a)); }
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_s64(a)); }
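
// Usage sketch (illustrative): v_signmask packs the sign bit of every lane
// into an integer, with lane 0 in the least significant bit.
//
//     v_int32x4 v(-1, 2, -3, 4);
//     int m = v_signmask(v);  // 0b0101 = 5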
template<typename _Tpvec>
inline bool v_check_all(const _Tpvec& a)
{ return vec_all_lt(a.val, _Tpvec().val); }
inline bool v_check_all(const v_uint8x16& a)
{ return v_check_all(v_reinterpret_as_s8(a)); }
inline bool v_check_all(const v_uint16x8& a)
{ return v_check_all(v_reinterpret_as_s16(a)); }
inline bool v_check_all(const v_uint32x4& a)
{ return v_check_all(v_reinterpret_as_s32(a)); }

template<typename _Tpvec>
inline bool v_check_any(const _Tpvec& a)
{ return vec_any_lt(a.val, _Tpvec().val); }
inline bool v_check_any(const v_uint8x16& a)
{ return v_check_any(v_reinterpret_as_s8(a)); }
inline bool v_check_any(const v_uint16x8& a)
{ return v_check_any(v_reinterpret_as_s16(a)); }
inline bool v_check_any(const v_uint32x4& a)
{ return v_check_any(v_reinterpret_as_s32(a)); }
////////// Other math /////////
/** Some frequent operations **/
inline v_float32x4 v_sqrt(const v_float32x4& x)
{ return v_float32x4(vec_sqrt(x.val)); }
inline v_float64x2 v_sqrt(const v_float64x2& x)
{ return v_float64x2(vec_sqrt(x.val)); }

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{ return v_float32x4(vec_rsqrt(x.val)); }
inline v_float64x2 v_invsqrt(const v_float64x2& x)
{ return v_float64x2(vec_rsqrt(x.val)); }
#define OPENCV_HAL_IMPL_VSX_MULADD(_Tpvec) \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_sqrt(vec_madd(a.val, a.val, vec_mul(b.val, b.val)))); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_madd(a.val, a.val, vec_mul(b.val, b.val))); } \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ return _Tpvec(vec_madd(a.val, b.val, c.val)); }
OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4)
OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2)
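
// Usage sketch (illustrative): the three helpers above all map onto the fused
// multiply-add vec_madd, e.g. for a polynomial step or a 2-D norm.
//
//     v_float32x4 r = v_muladd(x, k1, k0);  // x*k1 + k0 in one instruction
//     v_float32x4 n = v_magnitude(dx, dy);  // sqrt(dx*dx + dy*dy)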
// TODO: exp, log, sin, cos
/** Absolute values **/
inline v_uint8x16 v_abs(const v_int8x16& x)
{ return v_uint8x16(vec_uchar16_c(vec_abs(x.val))); }

inline v_uint16x8 v_abs(const v_int16x8& x)
{ return v_uint16x8(vec_ushort8_c(vec_abs(x.val))); }

inline v_uint32x4 v_abs(const v_int32x4& x)
{ return v_uint32x4(vec_uint4_c(vec_abs(x.val))); }

inline v_float32x4 v_abs(const v_float32x4& x)
{ return v_float32x4(vec_abs(x.val)); }

inline v_float64x2 v_abs(const v_float64x2& x)
{ return v_float64x2(vec_abs(x.val)); }
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_absdiff, vec_absd)
#define OPENCV_HAL_IMPL_VSX_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec2(cast(intrin(a.val, b.val))); }
OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int8x16, v_uint8x16, vec_uchar16_c, v_absdiff, vec_absd)
OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int16x8, v_uint16x8, vec_ushort8_c, v_absdiff, vec_absd)
OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int32x4, v_uint32x4, vec_uint4_c, v_absdiff, vec_absd)
OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int64x2, v_uint64x2, vec_udword2_c, v_absdiff, vec_absd)
////////// Conversions /////////
/** Rounding **/
inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_round(a.val), 0)); }

inline v_int32x4 v_round(const v_float64x2& a)
{
    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_round(a.val)), perm));
}

inline v_int32x4 v_floor(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_floor(a.val), 0)); }

inline v_int32x4 v_floor(const v_float64x2& a)
{
    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_floor(a.val)), perm));
}

inline v_int32x4 v_ceil(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_ceil(a.val), 0)); }

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_ceil(a.val)), perm));
}

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(vec_cts(a.val, 0)); }

inline v_int32x4 v_trunc(const v_float64x2& a)
{
    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(a.val), perm));
}
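
// Usage sketch (illustrative): v_round/v_floor/v_ceil/v_trunc convert to
// 32-bit integers; the float64 variants place the two results in the first
// lanes and zero the rest.
//
//     v_float32x4 f(1.4f, -1.6f, 2.7f, -2.2f);
//     v_int32x4 r = v_round(f);  // nearest:     {1, -2, 3, -2}
//     v_int32x4 t = v_trunc(f);  // toward zero: {1, -1, 2, -2}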
/** To float **/
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{ return v_float32x4(vec_ctf(a.val, 0)); }

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
    return v_float32x4(vec_perm(vec_float4_z, vec_cvf(a.val), perm));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{ return v_float64x2(vec_ctd(vec_mergeh(a.val, a.val), 0)); }

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{ return v_float64x2(vec_ctd(vec_mergel(a.val, a.val), 0)); }

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{ return v_float64x2(vec_cvf(vec_mergeh(a.val, a.val))); }

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{ return v_float64x2(vec_cvf(vec_mergel(a.val, a.val))); }
/** Reinterpret **/
/** it's defined above, together with the load and store operations **/
////////// Matrix operations /////////
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); }

inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    const vec_float4 v0 = vec_splat(v.val, 0);
    const vec_float4 v1 = vec_splat(v.val, 1);
    const vec_float4 v2 = vec_splat(v.val, 2);
    const vec_float4 v3 = vec_splat(v.val, 3);
    return v_float32x4(vec_madd(v0, m0.val,
                       vec_madd(v1, m1.val,
                       vec_madd(v2, m2.val,
                       vec_mul(v3, m3.val)))));
}
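
// Usage sketch (illustrative): v_matmul computes v[0]*m0 + v[1]*m1 + v[2]*m2
// + v[3]*m3 via chained vec_madd, i.e. a 4x4 matrix-vector product when
// m0..m3 hold the matrix vectors:
//
//     v_float32x4 y = v_matmul(v, m0, m1, m2, m3);
//
// and v_dotprod multiply-accumulates adjacent 16-bit pairs into 32-bit lanes.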
#define OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(_Tpvec, _Tpvec2) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
const _Tpvec& a2, const _Tpvec& a3, \
_Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
{ \
_Tpvec2 a02 = vec_mergeh(a0.val, a2.val); \
_Tpvec2 a13 = vec_mergeh(a1.val, a3.val); \
b0.val = vec_mergeh(a02, a13); \
b1.val = vec_mergel(a02, a13); \
a02 = vec_mergel(a0.val, a2.val); \
a13 = vec_mergel(a1.val, a3.val); \
b2.val = vec_mergeh(a02, a13); \
b3.val = vec_mergel(a02, a13); \
}
OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_uint32x4, vec_uint4)
OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_int32x4, vec_int4)
OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_float32x4, vec_float4)
//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation
static inline bool hasSIMD128()
{ return (CV_CPU_HAS_SUPPORT_VSX) ? true : false; }
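
// Usage sketch (illustrative): callers can guard the vector path at run time:
//
//     if (hasSIMD128()) { /* vector path using the intrinsics above */ }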
//! @}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond
}
#endif // OPENCV_HAL_VSX_HPP