submodule / opencv · Commits

Commit 69103a9f, authored Sep 29, 2017 by Alexander Alekhin
Merge pull request #9740 from tomoaki0705:universalArithm

Parents: a729f985, e1872196
Showing 1 changed file with 122 additions and 396 deletions

modules/core/src/arithm.cpp (+122, -396), view file @ 69103a9f
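Note: this merge replaces hand-written SSE2 and NEON intrinsics in arithm.cpp with OpenCV's universal intrinsics (opencv2/core/hal/intrin.hpp), so a single code path serves every 128-bit SIMD back end. As a rough orientation before the hunks below, here is a minimal sketch of the pattern being applied, assuming the universal-intrinsics API (v_load, v_store, nlanes, hasSIMD128) that the diff itself uses; the function name and the scalar tail are illustrative and not part of the patch:

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // Hypothetical helper: one element-wise kernel written with universal intrinsics
    // compiles to SSE2 on x86 and NEON on ARM, replacing the per-ISA branches removed below.
    static int inRange_u8_sketch(const uchar* src, const uchar* low, const uchar* high,
                                 uchar* dst, int len)
    {
        int x = 0;
    #if CV_SIMD128
        if (hasSIMD128())
        {
            const int width = v_uint8x16::nlanes;          // 16 bytes per 128-bit register
            for (; x <= len - width; x += width)
            {
                v_uint8x16 v  = v_load(src + x);
                v_uint8x16 lo = v_load(low + x);
                v_uint8x16 hi = v_load(high + x);
                v_store(dst + x, (v >= lo) & (hi >= v));   // per-lane 255/0 mask
            }
        }
    #endif
        for (; x < len; x++)                               // scalar tail for the remainder
            dst[x] = (uchar)((low[x] <= src[x] && src[x] <= high[x]) ? 255 : 0);
        return x;
    }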
@@ -1368,29 +1368,25 @@ struct InRange_SIMD
     }
 };
 
-#if CV_SSE2
+#if CV_SIMD128
 
 template <>
 struct InRange_SIMD<uchar>
 {
     int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
                      uchar * dst, int len) const
     {
         int x = 0;
-
-        if (USE_SSE2)
-        {
-            __m128i v_full = _mm_set1_epi8(-1), v_128 = _mm_set1_epi8(-128);
-
-            for ( ; x <= len - 16; x += 16 )
-            {
-                __m128i v_src = _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), v_128);
-                __m128i v_mask1 = _mm_cmpgt_epi8(_mm_add_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_128), v_src);
-                __m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src3 + x)), v_128));
-                _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full));
-            }
-        }
+        const int width = v_uint8x16::nlanes;
+
+        for (; x <= len - width; x += width)
+        {
+            v_uint8x16 values = v_load(src1 + x);
+            v_uint8x16 low = v_load(src2 + x);
+            v_uint8x16 high = v_load(src3 + x);
+
+            v_store(dst + x, (values >= low) & (high >= values));
+        }
         return x;
     }
 };
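For reference, the universal-intrinsic comparisons above return per-lane masks (all bits set when the comparison holds), so the AND of the two range checks already is the 0/255 output inRange produces, with no bias-and-compare trick. A small, hedged illustration of that lane-mask behaviour (values and function name are illustrative, not from the patch):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // (a >= b) yields 0xFF in lanes where it holds, 0x00 elsewhere.
    static void inrange_mask_demo()
    {
        uchar src[16]  = { 5, 200, 7 };   // remaining lanes default to 0
        uchar low[16]  = { 0, 0,  10 };
        uchar high[16] = { 10, 100, 20 };
        uchar out[16];

        v_uint8x16 v  = v_load(src);
        v_uint8x16 lo = v_load(low);
        v_uint8x16 hi = v_load(high);
        v_store(out, (v >= lo) & (hi >= v));   // out[0]=255, out[1]=0, out[2]=0, lanes 3..15 = 255
    }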
@@ -1399,23 +1395,19 @@ template <>
 struct InRange_SIMD<schar>
 {
     int operator () (const schar * src1, const schar * src2, const schar * src3,
                      uchar * dst, int len) const
     {
         int x = 0;
-
-        if (USE_SSE2)
-        {
-            __m128i v_full = _mm_set1_epi8(-1);
-
-            for ( ; x <= len - 16; x += 16 )
-            {
-                __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
-                __m128i v_mask1 = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src);
-                __m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_loadu_si128((const __m128i *)(src3 + x)));
-                _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full));
-            }
-        }
+        const int width = v_int8x16::nlanes;
+
+        for (; x <= len - width; x += width)
+        {
+            v_int8x16 values = v_load(src1 + x);
+            v_int8x16 low = v_load(src2 + x);
+            v_int8x16 high = v_load(src3 + x);
+
+            v_store((schar*)(dst + x), (values >= low) & (high >= values));
+        }
         return x;
     }
 };
@@ -1424,181 +1416,22 @@ template <>
 struct InRange_SIMD<ushort>
 {
     int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
                      uchar * dst, int len) const
     {
         int x = 0;
-
-        if (USE_SSE2)
-        {
-            __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1), v_32768 = _mm_set1_epi16(-32768);
-
-            for ( ; x <= len - 8; x += 8 )
-            {
-                __m128i v_src = _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src1 + x)), v_32768);
-                __m128i v_mask1 = _mm_cmpgt_epi16(_mm_add_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_32768), v_src);
-                __m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src3 + x)), v_32768));
-
-                __m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full);
-                _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero));
-            }
-        }
-
-        return x;
-    }
-};
-
-template <>
-struct InRange_SIMD<short>
-{
-    int operator () (const short * src1, const short * src2, const short * src3,
-                     uchar * dst, int len) const
-    {
-        int x = 0;
-
-        if (USE_SSE2)
-        {
-            __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1);
-
-            for ( ; x <= len - 8; x += 8 )
-            {
-                __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
-                __m128i v_mask1 = _mm_cmpgt_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src);
-                __m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_loadu_si128((const __m128i *)(src3 + x)));
-
-                __m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full);
-                _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero));
-            }
-        }
-
-        return x;
-    }
-};
-
-template <>
-struct InRange_SIMD<int>
-{
-    int operator () (const int * src1, const int * src2, const int * src3,
-                     uchar * dst, int len) const
-    {
-        int x = 0;
-
-        if (USE_SSE2)
-        {
-            __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi32(-1);
-
-            for ( ; x <= len - 8; x += 8 )
-            {
-                __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
-
-                __m128i v_res1 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src),
-                                              _mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x))));
-
-                v_src = _mm_loadu_si128((const __m128i *)(src1 + x + 4));
-                __m128i v_res2 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x + 4)), v_src),
-                                              _mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x + 4))));
-
-                __m128i v_res = _mm_packs_epi32(_mm_srli_epi32(_mm_andnot_si128(v_res1, v_full), 16),
-                                                _mm_srli_epi32(_mm_andnot_si128(v_res2, v_full), 16));
-                _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero));
-            }
-        }
-
-        return x;
-    }
-};
-
-template <>
-struct InRange_SIMD<float>
-{
-    int operator () (const float * src1, const float * src2, const float * src3,
-                     uchar * dst, int len) const
-    {
-        int x = 0;
-
-        if (USE_SSE2)
-        {
-            __m128i v_zero = _mm_setzero_si128();
-
-            for ( ; x <= len - 8; x += 8 )
-            {
-                __m128 v_src = _mm_loadu_ps(src1 + x);
-                __m128 v_res1 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x), v_src),
-                                           _mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x)));
-
-                v_src = _mm_loadu_ps(src1 + x + 4);
-                __m128 v_res2 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x + 4), v_src),
-                                           _mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x + 4)));
-
-                __m128i v_res1i = _mm_cvtps_epi32(v_res1), v_res2i = _mm_cvtps_epi32(v_res2);
-                __m128i v_res = _mm_packs_epi32(_mm_srli_epi32(v_res1i, 16), _mm_srli_epi32(v_res2i, 16));
-                _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero));
-            }
-        }
-
-        return x;
-    }
-};
-
-#elif CV_NEON
-
-template <>
-struct InRange_SIMD<uchar>
-{
-    int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
-                     uchar * dst, int len) const
-    {
-        int x = 0;
-
-        for ( ; x <= len - 16; x += 16 )
-        {
-            uint8x16_t values = vld1q_u8(src1 + x);
-            uint8x16_t low = vld1q_u8(src2 + x);
-            uint8x16_t high = vld1q_u8(src3 + x);
-
-            vst1q_u8(dst + x, vandq_u8(vcgeq_u8(values, low), vcgeq_u8(high, values)));
-        }
-        return x;
-    }
-};
-
-template <>
-struct InRange_SIMD<schar>
-{
-    int operator () (const schar * src1, const schar * src2, const schar * src3,
-                     uchar * dst, int len) const
-    {
-        int x = 0;
-
-        for ( ; x <= len - 16; x += 16 )
-        {
-            int8x16_t values = vld1q_s8(src1 + x);
-            int8x16_t low = vld1q_s8(src2 + x);
-            int8x16_t high = vld1q_s8(src3 + x);
-
-            vst1q_u8(dst + x, vandq_u8(vcgeq_s8(values, low), vcgeq_s8(high, values)));
-        }
-        return x;
-    }
-};
-
-template <>
-struct InRange_SIMD<ushort>
-{
-    int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
-                     uchar * dst, int len) const
-    {
-        int x = 0;
-
-        for ( ; x <= len - 16; x += 16 )
-        {
-            uint16x8_t values = vld1q_u16((const uint16_t*)(src1 + x));
-            uint16x8_t low = vld1q_u16((const uint16_t*)(src2 + x));
-            uint16x8_t high = vld1q_u16((const uint16_t*)(src3 + x));
-            uint8x8_t r1 = vmovn_u16(vandq_u16(vcgeq_u16(values, low), vcgeq_u16(high, values)));
-
-            values = vld1q_u16((const uint16_t*)(src1 + x + 8));
-            low = vld1q_u16((const uint16_t*)(src2 + x + 8));
-            high = vld1q_u16((const uint16_t*)(src3 + x + 8));
-            uint8x8_t r2 = vmovn_u16(vandq_u16(vcgeq_u16(values, low), vcgeq_u16(high, values)));
-
-            vst1q_u8(dst + x, vcombine_u8(r1, r2));
+        const int width = v_uint16x8::nlanes * 2;
+
+        for (; x <= len - width; x += width)
+        {
+            v_uint16x8 values1 = v_load(src1 + x);
+            v_uint16x8 low1 = v_load(src2 + x);
+            v_uint16x8 high1 = v_load(src3 + x);
+
+            v_uint16x8 values2 = v_load(src1 + x + v_uint16x8::nlanes);
+            v_uint16x8 low2 = v_load(src2 + x + v_uint16x8::nlanes);
+            v_uint16x8 high2 = v_load(src3 + x + v_uint16x8::nlanes);
+
+            v_store(dst + x, v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2)));
         }
         return x;
    }
@@ -1608,23 +1441,22 @@ template <>
 struct InRange_SIMD<short>
 {
     int operator () (const short * src1, const short * src2, const short * src3,
                      uchar * dst, int len) const
     {
         int x = 0;
-
-        for ( ; x <= len - 16; x += 16 )
-        {
-            int16x8_t values = vld1q_s16((const int16_t*)(src1 + x));
-            int16x8_t low = vld1q_s16((const int16_t*)(src2 + x));
-            int16x8_t high = vld1q_s16((const int16_t*)(src3 + x));
-            uint8x8_t r1 = vmovn_u16(vandq_u16(vcgeq_s16(values, low), vcgeq_s16(high, values)));
-
-            values = vld1q_s16((const int16_t*)(src1 + x + 8));
-            low = vld1q_s16((const int16_t*)(src2 + x + 8));
-            high = vld1q_s16((const int16_t*)(src3 + x + 8));
-            uint8x8_t r2 = vmovn_u16(vandq_u16(vcgeq_s16(values, low), vcgeq_s16(high, values)));
-
-            vst1q_u8(dst + x, vcombine_u8(r1, r2));
+        const int width = (int)v_int16x8::nlanes * 2;
+
+        for (; x <= len - width; x += width)
+        {
+            v_int16x8 values1 = v_load(src1 + x);
+            v_int16x8 low1 = v_load(src2 + x);
+            v_int16x8 high1 = v_load(src3 + x);
+
+            v_int16x8 values2 = v_load(src1 + x + v_int16x8::nlanes);
+            v_int16x8 low2 = v_load(src2 + x + v_int16x8::nlanes);
+            v_int16x8 high2 = v_load(src3 + x + v_int16x8::nlanes);
+
+            v_store((schar*)(dst + x), v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2)));
         }
         return x;
     }
@@ -1634,27 +1466,22 @@ template <>
 struct InRange_SIMD<int>
 {
     int operator () (const int * src1, const int * src2, const int * src3,
                      uchar * dst, int len) const
     {
         int x = 0;
-
-        for ( ; x <= len - 8; x += 8 )
-        {
-            int32x4_t values = vld1q_s32((const int32_t*)(src1 + x));
-            int32x4_t low = vld1q_s32((const int32_t*)(src2 + x));
-            int32x4_t high = vld1q_s32((const int32_t*)(src3 + x));
-            uint16x4_t r1 = vmovn_u32(vandq_u32(vcgeq_s32(values, low), vcgeq_s32(high, values)));
-
-            values = vld1q_s32((const int32_t*)(src1 + x + 4));
-            low = vld1q_s32((const int32_t*)(src2 + x + 4));
-            high = vld1q_s32((const int32_t*)(src3 + x + 4));
-            uint16x4_t r2 = vmovn_u32(vandq_u32(vcgeq_s32(values, low), vcgeq_s32(high, values)));
-
-            uint16x8_t res_16 = vcombine_u16(r1, r2);
-            vst1_u8(dst + x, vmovn_u16(res_16));
+        const int width = (int)v_int32x4::nlanes * 2;
+
+        for (; x <= len - width; x += width)
+        {
+            v_int32x4 values1 = v_load(src1 + x);
+            v_int32x4 low1 = v_load(src2 + x);
+            v_int32x4 high1 = v_load(src3 + x);
+
+            v_int32x4 values2 = v_load(src1 + x + v_int32x4::nlanes);
+            v_int32x4 low2 = v_load(src2 + x + v_int32x4::nlanes);
+            v_int32x4 high2 = v_load(src3 + x + v_int32x4::nlanes);
+
+            v_pack_store(dst + x, v_reinterpret_as_u16(v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2))));
         }
         return x;
     }
@@ -1664,27 +1491,22 @@ template <>
 struct InRange_SIMD<float>
 {
     int operator () (const float * src1, const float * src2, const float * src3,
                      uchar * dst, int len) const
     {
         int x = 0;
-
-        for ( ; x <= len - 8; x += 8 )
-        {
-            float32x4_t values = vld1q_f32((const float32_t*)(src1 + x));
-            float32x4_t low = vld1q_f32((const float32_t*)(src2 + x));
-            float32x4_t high = vld1q_f32((const float32_t*)(src3 + x));
-            uint16x4_t r1 = vmovn_u32(vandq_u32(vcgeq_f32(values, low), vcgeq_f32(high, values)));
-
-            values = vld1q_f32((const float32_t*)(src1 + x + 4));
-            low = vld1q_f32((const float32_t*)(src2 + x + 4));
-            high = vld1q_f32((const float32_t*)(src3 + x + 4));
-            uint16x4_t r2 = vmovn_u32(vandq_u32(vcgeq_f32(values, low), vcgeq_f32(high, values)));
-
-            uint16x8_t res_16 = vcombine_u16(r1, r2);
-            vst1_u8(dst + x, vmovn_u16(res_16));
+        const int width = (int)v_float32x4::nlanes * 2;
+
+        for (; x <= len - width; x += width)
+        {
+            v_float32x4 values1 = v_load(src1 + x);
+            v_float32x4 low1 = v_load(src2 + x);
+            v_float32x4 high1 = v_load(src3 + x);
+
+            v_float32x4 values2 = v_load(src1 + x + v_float32x4::nlanes);
+            v_float32x4 low2 = v_load(src2 + x + v_float32x4::nlanes);
+            v_float32x4 high2 = v_load(src3 + x + v_float32x4::nlanes);
+
+            v_pack_store(dst + x, v_pack(v_reinterpret_as_u32((values1 >= low1) & (high1 >= values1)), v_reinterpret_as_u32((values2 >= low2) & (high2 >= values2))));
         }
         return x;
    }
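The 16- and 32-bit specializations above process two registers per iteration and then narrow the wider lane masks down to 8-bit output with v_pack / v_pack_store. A hedged, stand-alone sketch of just that narrowing step, assuming the universal-intrinsics calls shown in the diff (the helper name and its arguments are illustrative):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // Two v_uint16x8 masks (0xFFFF or 0x0000 per lane) packed, with saturation,
    // into one v_uint8x16 of 255/0 values, as the ushort specialization does.
    static void narrow_masks(const ushort* mask16, uchar* dst)
    {
        v_uint16x8 m1 = v_load(mask16);                        // lanes 0..7
        v_uint16x8 m2 = v_load(mask16 + v_uint16x8::nlanes);   // lanes 8..15
        v_store(dst, v_pack(m1, m2));                          // 16 x uchar
    }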
@@ -2829,33 +2651,17 @@ void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
     for( ; height--; src1 += step1, src2 += step2, dst += step )
     {
         int x = 0;
-#if CV_SSE2
-        if( USE_SSE2 )
-        {
-            __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8(-1);
-            __m128i c128 = _mm_set1_epi8(-128);
-            for( ; x <= width - 16; x += 16 )
-            {
-                __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
-                __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
-                // no simd for 8u comparison, that's why we need the trick
-                r00 = _mm_sub_epi8(r00, c128);
-                r10 = _mm_sub_epi8(r10, c128);
-                r00 = _mm_xor_si128(_mm_cmpgt_epi8(r00, r10), m128);
-                _mm_storeu_si128((__m128i*)(dst + x), r00);
-            }
-        }
-#elif CV_NEON
-        uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
-
-        for( ; x <= width - 16; x += 16 )
-        {
-            vst1q_u8(dst + x, veorq_u8(vcgtq_u8(vld1q_u8(src1 + x), vld1q_u8(src2 + x)), mask));
-        }
+#if CV_SIMD128
+        if( hasSIMD128() )
+        {
+            v_uint8x16 mask = v_setall_u8((uchar)m);
+            for( ; x <= width - v_uint8x16::nlanes; x += v_uint8x16::nlanes )
+            {
+                v_store(dst + x, (v_load(src1 + x) > v_load(src2 + x)) ^ mask);
+            }
+        }
 #endif
         for( ; x < width; x++ ){
             dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
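The removed SSE2 path carries the comment about needing "the trick": SSE2 has no unsigned 8-bit compare, so both operands are shifted by 128 to reuse the signed _mm_cmpgt_epi8, while the universal intrinsics compare unsigned lanes directly, which is why the bias vanishes in the new code. A scalar illustration of the identity the old code relied on (illustrative helper, not OpenCV code):

    // (a ^ 0x80) > (b ^ 0x80) as signed bytes  <=>  a > b as unsigned bytes.
    // Flipping the sign bit is the same, mod 256, as the subtraction of -128 done above.
    static inline bool gt_u8_via_signed(unsigned char a, unsigned char b)
    {
        signed char sa = (signed char)(a ^ 0x80);
        signed char sb = (signed char)(b ^ 0x80);
        return sa > sb;   // matches a > b on the original unsigned values
    }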
@@ -2868,26 +2674,17 @@ void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
     for( ; height--; src1 += step1, src2 += step2, dst += step )
     {
         int x = 0;
-#if CV_SSE2
-        if( USE_SSE2 )
-        {
-            __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8(-1);
-            for( ; x <= width - 16; x += 16 )
-            {
-                __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
-                __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
-                r00 = _mm_xor_si128(_mm_cmpeq_epi8(r00, r10), m128);
-                _mm_storeu_si128((__m128i*)(dst + x), r00);
-            }
-        }
-#elif CV_NEON
-        uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
-
-        for( ; x <= width - 16; x += 16 )
-        {
-            vst1q_u8(dst + x, veorq_u8(vceqq_u8(vld1q_u8(src1 + x), vld1q_u8(src2 + x)), mask));
-        }
+#if CV_SIMD128
+        if( hasSIMD128() )
+        {
+            v_uint8x16 mask = v_setall_u8((uchar)m);
+            for( ; x <= width - v_uint8x16::nlanes; x += v_uint8x16::nlanes )
+            {
+                v_store(dst + x, (v_load(src1 + x) == v_load(src2 + x)) ^ mask);
+            }
+        }
 #endif
         for( ; x < width; x++ )
             dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
     }
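In both cmp8u loops above (and the cmp16s loops below), only the ">" or "==" comparison is vectorized; the complementary predicates (CMP_LE, CMP_NE) come from XOR-ing the lane mask with an all-ones constant chosen from the comparison code. A scalar sketch of that mask trick, assuming m is 0 for the direct predicate and 255 for its complement (helper name is illustrative):

    // cmp is 255 when the direct comparison holds, 0 otherwise.
    static inline unsigned char apply_cmp_mask(unsigned char cmp, unsigned char m)
    {
        return (unsigned char)(cmp ^ m);   // m == 0 keeps the result, m == 255 inverts it
    }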
@@ -2932,49 +2729,26 @@ void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2,
     for( ; height--; src1 += step1, src2 += step2, dst += step )
     {
         int x = 0;
-#if CV_SSE2
-        if( USE_SSE2 )
-        {
-            __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16(-1);
-            for( ; x <= width - 16; x += 16 )
-            {
-                __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
-                __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
-                r00 = _mm_xor_si128(_mm_cmpgt_epi16(r00, r10), m128);
-                __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
-                __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
-                r01 = _mm_xor_si128(_mm_cmpgt_epi16(r01, r11), m128);
-                r11 = _mm_packs_epi16(r00, r01);
-                _mm_storeu_si128((__m128i*)(dst + x), r11);
-            }
-            if( x <= width - 8 )
-            {
-                __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
-                __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
-                r00 = _mm_xor_si128(_mm_cmpgt_epi16(r00, r10), m128);
-                r10 = _mm_packs_epi16(r00, r00);
-                _mm_storel_epi64((__m128i*)(dst + x), r10);
-
-                x += 8;
-            }
-        }
-#elif CV_NEON
-        uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
-
-        for( ; x <= width - 16; x += 16 )
-        {
-            int16x8_t in1 = vld1q_s16(src1 + x);
-            int16x8_t in2 = vld1q_s16(src2 + x);
-            uint8x8_t t1 = vmovn_u16(vcgtq_s16(in1, in2));
-
-            in1 = vld1q_s16(src1 + x + 8);
-            in2 = vld1q_s16(src2 + x + 8);
-            uint8x8_t t2 = vmovn_u16(vcgtq_s16(in1, in2));
-
-            vst1q_u8(dst + x, veorq_u8(vcombine_u8(t1, t2), mask));
-        }
+#if CV_SIMD128
+        if( hasSIMD128() )
+        {
+            v_uint8x16 mask = v_setall_u8((uchar)m);
+            const int dWidth = v_uint8x16::nlanes;
+
+            for( ; x <= width - dWidth; x += dWidth )
+            {
+                v_int16x8 in1 = v_load(src1 + x);
+                v_int16x8 in2 = v_load(src2 + x);
+                v_uint16x8 t1 = v_reinterpret_as_u16(in1 > in2);
+
+                in1 = v_load(src1 + x + v_uint16x8::nlanes);
+                in2 = v_load(src2 + x + v_uint16x8::nlanes);
+                v_uint16x8 t2 = v_reinterpret_as_u16(in1 > in2);
+
+                v_store(dst + x, (v_pack(t1, t2)) ^ mask);
+            }
+        }
 #endif
         for( ; x < width; x++ ){
             dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
         }
@@ -2986,48 +2760,26 @@ void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2,
     for( ; height--; src1 += step1, src2 += step2, dst += step )
     {
         int x = 0;
-#if CV_SSE2
-        if( USE_SSE2 )
-        {
-            __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi16(-1);
-            for( ; x <= width - 16; x += 16 )
-            {
-                __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
-                __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
-                r00 = _mm_xor_si128(_mm_cmpeq_epi16(r00, r10), m128);
-                __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
-                __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
-                r01 = _mm_xor_si128(_mm_cmpeq_epi16(r01, r11), m128);
-                r11 = _mm_packs_epi16(r00, r01);
-                _mm_storeu_si128((__m128i*)(dst + x), r11);
-            }
-            if( x <= width - 8 )
-            {
-                __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
-                __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
-                r00 = _mm_xor_si128(_mm_cmpeq_epi16(r00, r10), m128);
-                r10 = _mm_packs_epi16(r00, r00);
-                _mm_storel_epi64((__m128i*)(dst + x), r10);
-
-                x += 8;
-            }
-        }
-#elif CV_NEON
-        uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
-
-        for( ; x <= width - 16; x += 16 )
-        {
-            int16x8_t in1 = vld1q_s16(src1 + x);
-            int16x8_t in2 = vld1q_s16(src2 + x);
-            uint8x8_t t1 = vmovn_u16(vceqq_s16(in1, in2));
-
-            in1 = vld1q_s16(src1 + x + 8);
-            in2 = vld1q_s16(src2 + x + 8);
-            uint8x8_t t2 = vmovn_u16(vceqq_s16(in1, in2));
-
-            vst1q_u8(dst + x, veorq_u8(vcombine_u8(t1, t2), mask));
-        }
+#if CV_SIMD128
+        if( hasSIMD128() )
+        {
+            v_uint8x16 mask = v_setall_u8((uchar)m);
+            const int dWidth = v_uint8x16::nlanes;
+
+            for( ; x <= width - dWidth; x += dWidth )
+            {
+                v_int16x8 in1 = v_load(src1 + x);
+                v_int16x8 in2 = v_load(src2 + x);
+                v_uint16x8 t1 = v_reinterpret_as_u16(in1 == in2);
+
+                in1 = v_load(src1 + x + 8);
+                in2 = v_load(src2 + x + 8);
+                v_uint16x8 t2 = v_reinterpret_as_u16(in1 == in2);
+
+                v_store(dst + x, (v_pack(t1, t2) ^ mask));
+            }
+        }
 #endif
         for( ; x < width; x++ )
             dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
     }
@@ -3280,60 +3032,34 @@ addWeighted8u( const uchar* src1, size_t step1,
     {
         int x = 0;
-#if CV_SSE2
-        if( USE_SSE2 )
-        {
-            __m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma);
-            __m128i z = _mm_setzero_si128();
-
-            for( ; x <= width - 8; x += 8 )
-            {
-                __m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z);
-                __m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z);
-
-                __m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z));
-                __m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z));
-                __m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z));
-                __m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z));
-
-                u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4));
-                u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4));
-                u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4);
-
-                u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1));
-                u = _mm_packus_epi16(u, u);
-
-                _mm_storel_epi64((__m128i*)(dst + x), u);
-            }
-        }
-#elif CV_NEON
-        float32x4_t g = vdupq_n_f32(gamma);
-
-        for( ; x <= width - 8; x += 8 )
-        {
-            uint8x8_t in1 = vld1_u8(src1 + x);
-            uint16x8_t in1_16 = vmovl_u8(in1);
-            float32x4_t in1_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in1_16)));
-            float32x4_t in1_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in1_16)));
-
-            uint8x8_t in2 = vld1_u8(src2 + x);
-            uint16x8_t in2_16 = vmovl_u8(in2);
-            float32x4_t in2_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in2_16)));
-            float32x4_t in2_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in2_16)));
-
-            float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
-            float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
-            out_f_l = vaddq_f32(out_f_l, g);
-            out_f_h = vaddq_f32(out_f_h, g);
-
-            uint16x4_t out_16_l = vqmovun_s32(cv_vrndq_s32_f32(out_f_l));
-            uint16x4_t out_16_h = vqmovun_s32(cv_vrndq_s32_f32(out_f_h));
-
-            uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h);
-            uint8x8_t out = vqmovn_u16(out_16);
-
-            vst1_u8(dst + x, out);
-        }
+#if CV_SIMD128
+        if( hasSIMD128() )
+        {
+            v_float32x4 g = v_setall_f32(gamma);
+            v_float32x4 a = v_setall_f32(alpha);
+            v_float32x4 b = v_setall_f32(beta);
+
+            for( ; x <= width - v_uint16x8::nlanes; x += v_uint16x8::nlanes )
+            {
+                v_uint16x8 in1_16 = v_load_expand(src1 + x);
+                v_int32x4 in1_32_l, in1_32_h;
+                v_expand(v_reinterpret_as_s16(in1_16), in1_32_l, in1_32_h);
+                v_float32x4 in1_f_l = v_cvt_f32(in1_32_l);
+                v_float32x4 in1_f_h = v_cvt_f32(in1_32_h);
+
+                v_uint16x8 in2_16 = v_load_expand(src2 + x);
+                v_int32x4 in2_32_l, in2_32_h;
+                v_expand(v_reinterpret_as_s16(in2_16), in2_32_l, in2_32_h);
+                v_float32x4 in2_f_l = v_cvt_f32(in2_32_l);
+                v_float32x4 in2_f_h = v_cvt_f32(in2_32_h);
+
+                v_int32x4 out_l = v_round(in1_f_l * a + in2_f_l * b + g);
+                v_int32x4 out_h = v_round(in1_f_h * a + in2_f_h * b + g);
+
+                v_int16x8 out_16 = v_pack(out_l, out_h);
+                v_pack_u_store(dst + x, out_16);
+            }
+        }
 #endif
 #if CV_ENABLE_UNROLLED
         for( ; x <= width - 4; x += 4 )
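The addWeighted8u hunk follows the usual widen / multiply-add in float / round / saturating-narrow pipeline, now written with the overloaded operators of the universal intrinsic types. A hedged, self-contained sketch of the same pipeline; the function name, tail loop, and clamping below are illustrative and not taken from the patch:

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    static void addWeighted_u8_sketch(const uchar* a8, const uchar* b8, uchar* dst,
                                      int width, float alpha, float beta, float gamma)
    {
        int x = 0;
    #if CV_SIMD128
        if (hasSIMD128())
        {
            v_float32x4 a = v_setall_f32(alpha), b = v_setall_f32(beta), g = v_setall_f32(gamma);
            for (; x <= width - v_uint16x8::nlanes; x += v_uint16x8::nlanes)
            {
                v_uint16x8 u16 = v_load_expand(a8 + x);              // 8 x u8 -> 8 x u16
                v_uint16x8 w16 = v_load_expand(b8 + x);
                v_int32x4 u_lo, u_hi, w_lo, w_hi;
                v_expand(v_reinterpret_as_s16(u16), u_lo, u_hi);     // -> 2 x (4 x s32)
                v_expand(v_reinterpret_as_s16(w16), w_lo, w_hi);

                v_int32x4 r_lo = v_round(v_cvt_f32(u_lo) * a + v_cvt_f32(w_lo) * b + g);
                v_int32x4 r_hi = v_round(v_cvt_f32(u_hi) * a + v_cvt_f32(w_hi) * b + g);

                v_pack_u_store(dst + x, v_pack(r_lo, r_hi));         // saturate back to u8
            }
        }
    #endif
        for (; x < width; x++)                                       // scalar tail
        {
            float v = a8[x] * alpha + b8[x] * beta + gamma;
            dst[x] = (uchar)(v < 0.f ? 0 : v > 255.f ? 255 : (int)(v + 0.5f));
        }
    }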