Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
c8d77fd9
Commit
c8d77fd9
authored
Sep 07, 2016
by
Alexander Alekhin
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #7233 from tomoaki0705:featureUniversalIntrinsicFp16
parents
48af5e55
903789f7
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
149 additions
and
36 deletions
+149
-36
intrin_neon.hpp
modules/core/include/opencv2/core/hal/intrin_neon.hpp
+51
-0
intrin_sse.hpp
modules/core/include/opencv2/core/hal/intrin_sse.hpp
+39
-0
convert.cpp
modules/core/src/convert.cpp
+6
-36
test_intrin.cpp
modules/core/test/test_intrin.cpp
+53
-0
No files found.
modules/core/include/opencv2/core/hal/intrin_neon.hpp
View file @
c8d77fd9
...
@@ -275,6 +275,39 @@ struct v_float64x2
...
@@ -275,6 +275,39 @@ struct v_float64x2
};
};
#endif
#endif
#if defined (HAVE_FP16)
// Workaround for old compilers
// Reinterpret a half-precision vector as int16x4_t.
// Templated workaround for toolchains that do not provide this intrinsic;
// the cast only changes the type, not the bits.
template <typename T>
static inline int16x4_t vreinterpret_s16_f16(T a)
{
    return (int16x4_t)a;
}
// Reinterpret an int16x4_t as a half-precision vector.
// Templated workaround for toolchains that do not provide this intrinsic;
// the cast only changes the type, not the bits.
template <typename T>
static inline float16x4_t vreinterpret_f16_s16(T a)
{
    return (float16x4_t)a;
}
// Load 4 half-precision values from memory.
// Workaround: performs a 16-bit integer load and reinterprets the result,
// for toolchains that lack a native vld1_f16.
template <typename T>
static inline float16x4_t vld1_f16(const T* ptr)
{
    return vreinterpret_f16_s16(vld1_s16((const short*)ptr));
}
// Store 4 half-precision values to memory.
// Workaround: reinterprets the vector as 16-bit integers and stores those,
// for toolchains that lack a native vst1_f16.
template <typename T>
static inline void vst1_f16(T* ptr, float16x4_t a)
{
    vst1_s16((short*)ptr, vreinterpret_s16_f16(a));
}
// Extract lane b of a half-precision vector as its raw 16-bit pattern.
// Workaround built on the s16 lane extraction for toolchains without
// a native vget_lane_f16.
static inline short vget_lane_f16(float16x4_t a, int b)
{
    return vget_lane_s16(vreinterpret_s16_f16(a), b);
}
// Universal-intrinsic wrapper over 4 half-precision floats (NEON backend).
// Lanes are exposed as their raw 16-bit patterns (lane_type is short);
// use v_cvt_f32/v_cvt_f16 to convert to/from single precision.
struct v_float16x4
{
    typedef short lane_type;
    enum { nlanes = 4 };

    v_float16x4() {}
    explicit v_float16x4(float16x4_t v) : val(v) {}

    // Build from four raw FP16 bit patterns.
    v_float16x4(short v0, short v1, short v2, short v3)
    {
        short buf[] = { v0, v1, v2, v3 };
        val = vld1_f16(buf);
    }

    // Raw 16-bit pattern of lane 0.
    short get0() const
    {
        return vget_lane_f16(val, 0);
    }

    float16x4_t val;
};
#endif
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
...
@@ -734,6 +767,14 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
...
@@ -734,6 +767,14 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP
(
v_float64x2
,
double
,
f64
)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP
(
v_float64x2
,
double
,
f64
)
#endif
#endif
#if defined (HAVE_FP16)
// Workaround for old compilers
// Load 4 packed FP16 values from memory into a v_float16x4.
inline v_float16x4 v_load_f16(const short* ptr)
{
    return v_float16x4(vld1_f16(ptr));
}
inline
void
v_store_f16
(
short
*
ptr
,
v_float16x4
&
a
)
{
vst1_f16
(
ptr
,
a
.
val
);
}
#endif
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
{ \
...
@@ -1146,7 +1187,17 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
...
@@ -1146,7 +1187,17 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
}
}
#endif
#endif
#if defined (HAVE_FP16)
// Widen 4 half-precision lanes to 4 single-precision lanes.
inline v_float32x4 v_cvt_f32(const v_float16x4& a)
{
    return v_float32x4(vcvt_f32_f16(a.val));
}
// Narrow 4 single-precision lanes to 4 half-precision lanes.
inline v_float16x4 v_cvt_f16(const v_float32x4& a)
{
    return v_float16x4(vcvt_f16_f32(a.val));
}
#endif
//! @endcond
//! @endcond
...
...
modules/core/include/opencv2/core/hal/intrin_sse.hpp
View file @
c8d77fd9
...
@@ -252,6 +252,26 @@ struct v_float64x2
...
@@ -252,6 +252,26 @@ struct v_float64x2
__m128d
val
;
__m128d
val
;
};
};
#if defined(HAVE_FP16)
// Universal-intrinsic wrapper over 4 half-precision floats (SSE backend).
// The four FP16 bit patterns live in the low 64 bits of an __m128i; the
// upper lanes are zeroed. Lanes are exposed as raw 16-bit patterns
// (lane_type is short); use v_cvt_f32/v_cvt_f16 for conversions.
struct v_float16x4
{
    typedef short lane_type;
    enum { nlanes = 4 };

    v_float16x4() {}
    explicit v_float16x4(__m128i v) : val(v) {}

    // Build from four raw FP16 bit patterns (upper four 16-bit lanes zeroed).
    v_float16x4(short v0, short v1, short v2, short v3)
    {
        val = _mm_setr_epi16(v0, v1, v2, v3, 0, 0, 0, 0);
    }

    // Raw 16-bit pattern of lane 0.
    short get0() const
    {
        return (short)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};
#endif
#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
...
@@ -1021,6 +1041,13 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
...
@@ -1021,6 +1041,13 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP
(
v_float32x4
,
float
,
ps
)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP
(
v_float32x4
,
float
,
ps
)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP
(
v_float64x2
,
double
,
pd
)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP
(
v_float64x2
,
double
,
pd
)
#if defined(HAVE_FP16)
// Load 4 packed FP16 values (64 bits) from memory into the low half of
// the register; the upper 64 bits are zeroed by _mm_loadl_epi64.
inline v_float16x4 v_load_f16(const short* ptr)
{
    return v_float16x4(_mm_loadl_epi64((const __m128i*)ptr));
}
inline
void
v_store_f16
(
short
*
ptr
,
v_float16x4
&
a
)
{
_mm_storel_epi64
((
__m128i
*
)
ptr
,
a
.
val
);
}
#endif
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
{ \
...
@@ -1626,6 +1653,18 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
...
@@ -1626,6 +1653,18 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
return
v_float64x2
(
_mm_cvtps_pd
(
_mm_castsi128_ps
(
_mm_srli_si128
(
_mm_castps_si128
(
a
.
val
),
8
))));
return
v_float64x2
(
_mm_cvtps_pd
(
_mm_castsi128_ps
(
_mm_srli_si128
(
_mm_castps_si128
(
a
.
val
),
8
))));
}
}
#if defined(HAVE_FP16)
// Widen 4 half-precision lanes to 4 single-precision lanes (F16C VCVTPH2PS).
inline v_float32x4 v_cvt_f32(const v_float16x4& a)
{
    return v_float32x4(_mm_cvtph_ps(a.val));
}
// Narrow 4 single-precision lanes to 4 half-precision lanes (F16C VCVTPS2PH).
// Rounding-control immediate 0 selects round-to-nearest-even.
inline v_float16x4 v_cvt_f16(const v_float32x4& a)
{
    return v_float16x4(_mm_cvtps_ph(a.val, 0));
}
#endif
//! @endcond
//! @endcond
}
}
...
...
modules/core/src/convert.cpp
View file @
c8d77fd9
...
@@ -4537,16 +4537,6 @@ static short convertFp16SW(float fp32)
...
@@ -4537,16 +4537,6 @@ static short convertFp16SW(float fp32)
}
}
#endif
#endif
#if CV_FP16 && (defined __GNUC__) && (defined __arm__ || defined __aarch64__)
#if 5 <= __GNUC__
// Load 4 FP16 values stored as shorts (GCC >= 5: native vld1_f16 with a
// float16_t pointer cast).
static inline float16x4_t load_f16(const short* p)
{
    return vld1_f16((const float16_t*)p);
}
// Store 4 FP16 values into shorts (GCC >= 5: native vst1_f16 with a
// float16_t pointer cast).
static inline void store_f16(short* p, float16x4_t v)
{
    vst1_f16((float16_t*)p, v);
}
#else
// Load 4 FP16 values stored as shorts (older GCC: 16-bit integer load
// reinterpreted as float16x4_t, since vld1_f16 is unavailable).
static inline float16x4_t load_f16(const short* p)
{
    return (float16x4_t)vld1_s16(p);
}
// Store 4 FP16 values into shorts (older GCC: reinterpret as int16x4_t and
// use the 16-bit integer store, since vst1_f16 is unavailable).
static inline void store_f16(short* p, float16x4_t v)
{
    vst1_s16(p, (int16x4_t)v);
}
#endif
#endif
// template for FP16 HW conversion function
// template for FP16 HW conversion function
template
<
typename
T
,
typename
DT
>
static
void
template
<
typename
T
,
typename
DT
>
static
void
cvtScaleHalf_
(
const
T
*
src
,
size_t
sstep
,
DT
*
dst
,
size_t
dstep
,
Size
size
);
cvtScaleHalf_
(
const
T
*
src
,
size_t
sstep
,
DT
*
dst
,
size_t
dstep
,
Size
size
);
...
@@ -4570,21 +4560,11 @@ cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t
...
@@ -4570,21 +4560,11 @@ cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t
#if CV_FP16
#if CV_FP16
for
(
;
x
<=
size
.
width
-
4
;
x
+=
4
)
for
(
;
x
<=
size
.
width
-
4
;
x
+=
4
)
{
{
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
v_float32x4
v_src
=
v_load
(
src
+
x
);
__m128
v_src
=
_mm_loadu_ps
(
src
+
x
);
__m128i
v_dst
=
_mm_cvtps_ph
(
v_src
,
0
);
v_float16x4
v_dst
=
v_cvt_f16
(
v_src
);
_mm_storel_epi64
((
__m128i
*
)(
dst
+
x
),
v_dst
);
v_store_f16
(
dst
+
x
,
v_dst
);
#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
float32x4_t
v_src
=
vld1q_f32
(
src
+
x
);
float16x4_t
v_dst
=
vcvt_f16_f32
(
v_src
);
store_f16
(
dst
+
x
,
v_dst
);
#else
#error "Configuration error"
#endif
}
}
#endif
#endif
}
}
...
@@ -4626,21 +4606,11 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
...
@@ -4626,21 +4606,11 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
#if CV_FP16
#if CV_FP16
for
(
;
x
<=
size
.
width
-
4
;
x
+=
4
)
for
(
;
x
<=
size
.
width
-
4
;
x
+=
4
)
{
{
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
v_float16x4
v_src
=
v_load_f16
(
src
+
x
);
__m128i
v_src
=
_mm_loadl_epi64
((
__m128i
*
)(
src
+
x
));
__m128
v_dst
=
_mm_cvtph_ps
(
v_src
);
_mm_storeu_ps
(
dst
+
x
,
v_dst
);
v_float32x4
v_dst
=
v_cvt_f32
(
v_src
);
#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
float16x4_t
v_src
=
load_f16
(
src
+
x
);
float32x4_t
v_dst
=
vcvt_f32_f16
(
v_src
);
v_store
(
dst
+
x
,
v_dst
);
vst1q_f32
(
dst
+
x
,
v_dst
);
#else
#error "Configuration error"
#endif
}
}
#endif
#endif
}
}
...
...
modules/core/test/test_intrin.cpp
View file @
c8d77fd9
#include "test_precomp.hpp"
#include "test_intrin_utils.hpp"
#include "test_intrin_utils.hpp"
#include <climits>
#include <climits>
...
@@ -710,6 +711,49 @@ template<typename R> struct TheTest
...
@@ -710,6 +711,49 @@ template<typename R> struct TheTest
return
*
this
;
return
*
this
;
}
}
#if CV_FP16
// Verify FP16 load/store round-trips: alignment of the fixtures, the
// v_load_f16 / copy-construction initialization paths, and v_store_f16.
TheTest & test_loadstore_fp16()
{
    AlignedData<R> data;
    AlignedData<R> out;

    // check if addresses are aligned and unaligned respectively
    EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
    EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
    EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
    EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);

    // check some initialization methods
    R r1 = data.u;
    R r2 = v_load_f16(data.a.d);
    R r3(r2);
    EXPECT_EQ(data.u[0], r1.get0());
    EXPECT_EQ(data.a[0], r2.get0());
    EXPECT_EQ(data.a[0], r3.get0());

    // check some store methods
    out.a.clear();
    v_store_f16(out.a.d, r1);
    EXPECT_EQ(data.a, out.a);

    return *this;
}
// Verify float32 <-> float16 conversion: narrowing then widening lane 0
// must reproduce the original value, and its FP16 bit pattern must be
// 0x3c00 (FP16 1.0 — assumes the AlignedData fixture fills lane 0 with
// 1.0f; NOTE(review): confirm against the Data<> initializer).
TheTest & test_float_cvt_fp16()
{
    AlignedData<v_float32x4> data;

    // check conversion
    v_float32x4 r1 = v_load(data.a.d);
    v_float16x4 r2 = v_cvt_f16(r1);
    v_float32x4 r3 = v_cvt_f32(r2);
    EXPECT_EQ(0x3c00, r2.get0());
    EXPECT_EQ(r3.get0(), r1.get0());

    return *this;
}
#endif
};
};
...
@@ -915,6 +959,15 @@ TEST(hal_intrin, float64x2) {
...
@@ -915,6 +959,15 @@ TEST(hal_intrin, float64x2) {
}
}
#endif
#endif
#if CV_FP16
// Run the FP16 universal-intrinsic checks for the 4-lane half vector.
TEST(hal_intrin, float16x4)
{
    TheTest<v_float16x4>()
        .test_loadstore_fp16()
        .test_float_cvt_fp16()
        ;
}
#endif
};
};
};
};
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment