Commit d9107601 authored Sep 25, 2017 by Alexander Alekhin

Merge pull request #9690 from tomoaki0705:universalSmooth

parents a08044d6 e932160a

Showing 1 changed file with 171 additions and 515 deletions:
modules/imgproc/src/smooth.cpp (+171, -515)
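The change applies one pattern throughout smooth.cpp: each pair of hand-written SSE2 and NEON branches collapses into a single CV_SIMD128 branch built on OpenCV's universal intrinsics (opencv2/core/hal/intrin.hpp), which map to SSE2, NEON, or plain scalar C++ depending on the build target. A minimal sketch of the pattern follows; the helper name and buffers are illustrative only, not part of smooth.cpp:

#include <opencv2/core/hal/intrin.hpp>  // universal intrinsics: v_int32x4, v_load, v_store, ...

// Illustrative helper (not from smooth.cpp): accumulate one source row into a
// running sum the way the rewritten ColumnSum filters do.
static void accumulateRow(int* SUM, const int* Sp, int width)
{
    int i = 0;
#if CV_SIMD128
    if (cv::hasSIMD128())   // runtime check replacing checkHardwareSupport(CV_CPU_SSE2/NEON)
    {
        for (; i <= width - 4; i += 4)   // four 32-bit lanes per iteration
            cv::v_store(SUM + i, cv::v_load(SUM + i) + cv::v_load(Sp + i));
    }
#endif
    for (; i < width; i++)   // scalar tail for the remaining pixels
        SUM[i] += Sp[i];
}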
@@ -299,11 +299,9 @@ struct ColumnSum<int, uchar> :
         bool haveScale = scale != 1;
         double _scale = scale;
-#if CV_SSE2
-        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
-#elif CV_NEON
-        bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
-#endif
+#if CV_SIMD128
+        bool haveSIMD128 = hasSIMD128();
+#endif

         if( width != (int)sum.size() )
         {
@@ -319,23 +317,15 @@ struct ColumnSum<int, uchar> :
         {
             const int* Sp = (const int*)src[0];
             int i = 0;
-#if CV_SSE2
-            if(haveSSE2)
+#if CV_SIMD128
+            if( haveSIMD128 )
             {
-                for( ; i <= width - 4; i += 4 )
+                for (; i <= width - 4; i += 4)
                 {
-                    __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
-                    __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
-                    _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
+                    v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
                 }
             }
-#elif CV_NEON
-            if(haveNEON)
-            {
-                for( ; i <= width - 4; i += 4 )
-                    vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
-            }
-#endif
+#endif
             for( ; i < width; i++ )
                 SUM[i] += Sp[i];
         }
@@ -354,51 +344,27 @@ struct ColumnSum<int, uchar> :
         if( haveScale )
         {
             int i = 0;
-#if CV_SSE2
-            if(haveSSE2)
+#if CV_SIMD128
+            if( haveSIMD128 )
             {
-                const __m128 scale4 = _mm_set1_ps((float)_scale);
-                for( ; i <= width-8; i+=8 )
-                {
-                    __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
-                    __m128i _sm1  = _mm_loadu_si128((const __m128i*)(Sm+i+4));
-                    __m128i _s0   = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
-                                                  _mm_loadu_si128((const __m128i*)(Sp+i)));
-                    __m128i _s01  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
-                                                  _mm_loadu_si128((const __m128i*)(Sp+i+4)));
-                    __m128i _s0T  = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
-                    __m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01)));
-                    _s0T = _mm_packs_epi32(_s0T, _s0T1);
-                    _mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T));
-                    _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
-                    _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
-                }
-            }
-#elif CV_NEON
-            if(haveNEON)
-            {
-                float32x4_t v_scale = vdupq_n_f32((float)_scale);
+                v_float32x4 v_scale = v_setall_f32((float)_scale);
                 for( ; i <= width-8; i+=8 )
                 {
-                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
-                    int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
+                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+                    v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4);
-                    uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
-                    uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
+                    v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale));
+                    v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale));
-                    uint16x8_t v_dst = vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d));
-                    vst1_u8(D + i, vqmovn_u16(v_dst));
+                    v_uint16x8 v_dst = v_pack(v_s0d, v_s01d);
+                    v_pack_store(D + i, v_dst);
-                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
-                    vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
+                    v_store(SUM + i, v_s0 - v_load(Sm + i));
+                    v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4));
                 }
             }
-#endif
+#endif
             for( ; i < width; i++ )
             {
                 int s0 = SUM[i] + Sp[i];
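This scale branch is where the rewrite pays off most: the SSE2 path needed _mm_cvtepi32_ps/_mm_cvtps_epi32 plus two pack steps, and the NEON path vqmovn_u32/vqmovn_u16, while the universal version expresses the whole round-scale-narrow chain in four portable calls. A hypothetical standalone version of that chain (function and buffer names are illustrative, not from smooth.cpp):

#include <opencv2/core/hal/intrin.hpp>

// Scale eight 32-bit sums and narrow them to eight 8-bit pixels, the way the
// rewritten uchar scale branch does (illustrative names).
static void scaleToU8(const int* sum, double scale, uchar* dst)
{
    using namespace cv;
    v_float32x4 v_scale = v_setall_f32((float)scale);
    v_int32x4 s0 = v_load(sum);          // sums 0..3
    v_int32x4 s1 = v_load(sum + 4);      // sums 4..7
    // int32 -> float32, multiply by scale, round back to int32
    v_uint32x4 d0 = v_reinterpret_as_u32(v_round(v_cvt_f32(s0) * v_scale));
    v_uint32x4 d1 = v_reinterpret_as_u32(v_round(v_cvt_f32(s1) * v_scale));
    v_uint16x8 d = v_pack(d0, d1);       // 2 x uint32x4 -> uint16x8 (saturating)
    v_pack_store(dst, d);                // uint16x8 -> 8 uchar (saturating)
}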
@@ -409,43 +375,22 @@ struct ColumnSum<int, uchar> :
         else
         {
             int i = 0;
-#if CV_SSE2
-            if(haveSSE2)
+#if CV_SIMD128
+            if( haveSIMD128 )
             {
                 for( ; i <= width-8; i+=8 )
                 {
-                    __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
-                    __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));
-                    __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
-                                                 _mm_loadu_si128((const __m128i*)(Sp+i)));
-                    __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
-                                                 _mm_loadu_si128((const __m128i*)(Sp+i+4)));
-                    __m128i _s0T = _mm_packs_epi32(_s0, _s01);
-                    _mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T));
-                    _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
-                    _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
+                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+                    v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4);
+                    v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01));
+                    v_pack_store(D + i, v_dst);
+                    v_store(SUM + i, v_s0 - v_load(Sm + i));
+                    v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4));
                 }
             }
-#elif CV_NEON
-            if(haveNEON)
-            {
-                for( ; i <= width - 8; i += 8 )
-                {
-                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
-                    int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
-                    uint16x8_t v_dst = vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01));
-                    vst1_u8(D + i, vqmovn_u16(v_dst));
-                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
-                    vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
-                }
-            }
-#endif
+#endif
             for( ; i < width; i++ )
             {
@@ -502,10 +447,8 @@ public BaseColumnFilter
         ushort* SUM;
         const bool haveScale = scale != 1;
-#if CV_SSE2
-        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
-#elif CV_NEON
-        bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
+#if CV_SIMD128
+        bool haveSIMD128 = hasSIMD128();
 #endif

         if( width != (int)sum.size() )
@@ -522,21 +465,13 @@ public BaseColumnFilter
         {
             const ushort* Sp = (const ushort*)src[0];
             int i = 0;
-#if CV_SSE2
-            if(haveSSE2)
+#if CV_SIMD128
+            if( haveSIMD128 )
             {
-                for( ; i <= width-8; i+=8 )
+                for( ; i <= width - 8; i += 8 )
                 {
-                    __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
-                    __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
-                    _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi16(_sum, _sp));
+                    v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
                 }
             }
-#elif CV_NEON
-            if(haveNEON)
-            {
-                for( ; i <= width - 8; i += 8 )
-                    vst1q_u16(SUM + i, vaddq_u16(vld1q_u16(SUM + i), vld1q_u16(Sp + i)));
-            }
 #endif
             for( ; i < width; i++ )
@@ -641,11 +576,9 @@ struct ColumnSum<int, short> :
         bool haveScale = scale != 1;
         double _scale = scale;
-#if CV_SSE2
-        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
-#elif CV_NEON
-        bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
-#endif
+#if CV_SIMD128
+        bool haveSIMD128 = hasSIMD128();
+#endif

         if( width != (int)sum.size() )
         {
@@ -661,22 +594,14 @@ struct ColumnSum<int, short> :
         {
             const int* Sp = (const int*)src[0];
             i = 0;
-#if CV_SSE2
-            if(haveSSE2)
+#if CV_SIMD128
+            if( haveSIMD128 )
             {
-                for( ; i <= width-4; i+=4 )
+                for( ; i <= width - 4; i += 4 )
                 {
-                    __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
-                    __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
-                    _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
+                    v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
                 }
             }
-#elif CV_NEON
-            if(haveNEON)
-            {
-                for( ; i <= width - 4; i += 4 )
-                    vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
-            }
 #endif
             for( ; i < width; i++ )
                 SUM[i] += Sp[i];
@@ -696,47 +621,24 @@ struct ColumnSum<int, short> :
         if( haveScale )
         {
             i = 0;
-#if CV_SSE2
-            if(haveSSE2)
-            {
-                const __m128 scale4 = _mm_set1_ps((float)_scale);
-                for( ; i <= width-8; i+=8 )
-                {
-                    __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
-                    __m128i _sm1  = _mm_loadu_si128((const __m128i*)(Sm+i+4));
-                    __m128i _s0   = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
-                                                  _mm_loadu_si128((const __m128i*)(Sp+i)));
-                    __m128i _s01  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
-                                                  _mm_loadu_si128((const __m128i*)(Sp+i+4)));
-                    __m128i _s0T  = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
-                    __m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01)));
-                    _mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0T, _s0T1));
-                    _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
-                    _mm_storeu_si128((__m128i*)(SUM+i+4), _mm_sub_epi32(_s01,_sm1));
-                }
-            }
-#elif CV_NEON
-            if(haveNEON)
+#if CV_SIMD128
+            if( haveSIMD128 )
             {
-                float32x4_t v_scale = vdupq_n_f32((float)_scale);
+                v_float32x4 v_scale = v_setall_f32((float)_scale);
                 for( ; i <= width-8; i+=8 )
                 {
-                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
-                    int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
+                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+                    v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4);
-                    int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
-                    int32x4_t v_s01d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
-                    vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0d), vqmovn_s32(v_s01d)));
+                    v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale);
+                    v_int32x4 v_s01d = v_round(v_cvt_f32(v_s01) * v_scale);
+                    v_store(D + i, v_pack(v_s0d, v_s01d));
-                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
-                    vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
+                    v_store(SUM + i, v_s0 - v_load(Sm + i));
+                    v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4));
                 }
             }
-#endif
+#endif
             for( ; i < width; i++ )
            {
                 int s0 = SUM[i] + Sp[i];
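For the short-output filter the rounded values stay signed, so no reinterpret is needed: v_round already yields a v_int32x4, and v_pack saturates two of them into a v_int16x8 that is stored whole, replacing the removed vqmovn_s32/vcombine_s16 pair. A hypothetical standalone version (names are illustrative, not from smooth.cpp):

#include <opencv2/core/hal/intrin.hpp>

// Signed counterpart of scaleToU8 above: the same round-and-narrow chain,
// minus the unsigned reinterpret/v_pack_store step.
static void scaleToS16(const int* sum, double scale, short* dst)
{
    using namespace cv;
    v_float32x4 v_scale = v_setall_f32((float)scale);
    v_int32x4 d0 = v_round(v_cvt_f32(v_load(sum)) * v_scale);
    v_int32x4 d1 = v_round(v_cvt_f32(v_load(sum + 4)) * v_scale);
    v_store(dst, v_pack(d0, d1));        // 2 x int32x4 -> int16x8 (saturating)
}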
@@ -747,41 +649,21 @@ struct ColumnSum<int, short> :
         else
         {
             i = 0;
-#if CV_SSE2
-            if(haveSSE2)
+#if CV_SIMD128
+            if( haveSIMD128 )
             {
                 for( ; i <= width-8; i+=8 )
                 {
-                    __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
-                    __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));
-                    __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
-                                                 _mm_loadu_si128((const __m128i*)(Sp+i)));
-                    __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
-                                                 _mm_loadu_si128((const __m128i*)(Sp+i+4)));
-                    _mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0, _s01));
-                    _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
-                    _mm_storeu_si128((__m128i*)(SUM+i+4), _mm_sub_epi32(_s01,_sm1));
+                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+                    v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4);
+                    v_store(D + i, v_pack(v_s0, v_s01));
+                    v_store(SUM + i, v_s0 - v_load(Sm + i));
+                    v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4));
                 }
             }
-#elif CV_NEON
-            if(haveNEON)
-            {
-                for( ; i <= width - 8; i += 8 )
-                {
-                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
-                    int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
-                    vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0), vqmovn_s32(v_s01)));
-                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
-                    vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
-                }
-            }
-#endif
+#endif
             for( ; i < width; i++ )
             {
@@ -821,11 +703,9 @@ struct ColumnSum<int, ushort> :
         bool haveScale = scale != 1;
         double _scale = scale;
-#if CV_SSE2
-        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
-#elif CV_NEON
-        bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
-#endif
+#if CV_SIMD128
+        bool haveSIMD128 = hasSIMD128();
+#endif

         if( width != (int)sum.size() )
         {
@@ -841,23 +721,15 @@ struct ColumnSum<int, ushort> :
         {
             const int* Sp = (const int*)src[0];
             int i = 0;
-#if CV_SSE2
-            if(haveSSE2)
+#if CV_SIMD128
+            if( haveSIMD128 )
             {
-                for( ; i <= width - 4; i += 4 )
+                for (; i <= width - 4; i += 4)
                 {
-                    __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
-                    __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
-                    _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
+                    v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
                 }
             }
-#elif CV_NEON
-            if(haveNEON)
-            {
-                for( ; i <= width - 4; i += 4 )
-                    vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
-            }
-#endif
+#endif
             for( ; i < width; i++ )
                 SUM[i] += Sp[i];
         }
@@ -876,46 +748,24 @@ struct ColumnSum<int, ushort> :
         if( haveScale )
         {
             int i = 0;
-#if CV_SSE2
-            if(haveSSE2)
-            {
-                const __m128 scale4 = _mm_set1_ps((float)_scale);
-                const __m128i delta0 = _mm_set1_epi32(0x8000);
-                const __m128i delta1 = _mm_set1_epi32(0x80008000);
-                for( ; i < width-4; i+=4 )
-                {
-                    __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
-                    __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
-                                                 _mm_loadu_si128((const __m128i*)(Sp+i)));
-                    __m128i _res = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
-                    _res = _mm_sub_epi32(_res, delta0);
-                    _res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1);
-                    _mm_storel_epi64((__m128i*)(D+i), _res);
-                    _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
-                }
-            }
-#elif CV_NEON
-            if(haveNEON)
+#if CV_SIMD128
+            if( haveSIMD128 )
             {
-                float32x4_t v_scale = vdupq_n_f32((float)_scale);
+                v_float32x4 v_scale = v_setall_f32((float)_scale);
                 for( ; i <= width-8; i+=8 )
                 {
-                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
-                    int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
+                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+                    v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4);
-                    uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
-                    uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
-                    vst1q_u16(D + i, vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d)));
+                    v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale));
+                    v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale));
+                    v_store(D + i, v_pack(v_s0d, v_s01d));
-                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
-                    vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
+                    v_store(SUM + i, v_s0 - v_load(Sm + i));
+                    v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4));
                 }
             }
-#endif
+#endif
             for( ; i < width; i++ )
             {
                 int s0 = SUM[i] + Sp[i];
@@ -926,41 +776,21 @@ struct ColumnSum<int, ushort> :
         else
         {
             int i = 0;
-#if CV_SSE2
-            if(haveSSE2)
-            {
-                const __m128i delta0 = _mm_set1_epi32(0x8000);
-                const __m128i delta1 = _mm_set1_epi32(0x80008000);
-                for( ; i < width-4; i+=4 )
-                {
-                    __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
-                    __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
-                                                 _mm_loadu_si128((const __m128i*)(Sp+i)));
-                    __m128i _res = _mm_sub_epi32(_s0, delta0);
-                    _res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1);
-                    _mm_storel_epi64((__m128i*)(D+i), _res);
-                    _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
-                }
-            }
-#elif CV_NEON
-            if(haveNEON)
+#if CV_SIMD128
+            if( haveSIMD128 )
             {
                 for( ; i <= width-8; i+=8 )
                 {
-                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
-                    int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
+                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+                    v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4);
-                    vst1q_u16(D + i, vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01)));
+                    v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)));
-                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
-                    vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
+                    v_store(SUM + i, v_s0 - v_load(Sm + i));
+                    v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4));
                 }
             }
-#endif
+#endif
             for( ; i < width; i++ )
             {
                 int s0 = SUM[i] + Sp[i];
@@ -998,11 +828,9 @@ struct ColumnSum<int, int> :
         bool haveScale = scale != 1;
         double _scale = scale;
-#if CV_SSE2
-        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
-#elif CV_NEON
-        bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
-#endif
+#if CV_SIMD128
+        bool haveSIMD128 = hasSIMD128();
+#endif

         if( width != (int)sum.size() )
         {
@@ -1018,23 +846,15 @@ struct ColumnSum<int, int> :
         {
             const int* Sp = (const int*)src[0];
             int i = 0;
-#if CV_SSE2
-            if(haveSSE2)
+#if CV_SIMD128
+            if( haveSIMD128 )
             {
-                for( ; i <= width-4; i+=4 )
+                for( ; i <= width - 4; i += 4 )
                 {
-                    __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
-                    __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
-                    _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
+                    v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
                 }
             }
-#elif CV_NEON
-            if(haveNEON)
-            {
-                for( ; i <= width - 4; i += 4 )
-                    vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
-            }
-#endif
+#endif
             for( ; i < width; i++ )
                 SUM[i] += Sp[i];
         }
@@ -1053,38 +873,20 @@ struct ColumnSum<int, int> :
         if( haveScale )
         {
             int i = 0;
-#if CV_SSE2
-            if(haveSSE2)
+#if CV_SIMD128
+            if( haveSIMD128 )
             {
-                const __m128 scale4 = _mm_set1_ps((float)_scale);
+                v_float32x4 v_scale = v_setall_f32((float)_scale);
                 for( ; i <= width-4; i+=4 )
                 {
-                    __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
-                    __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
-                                                 _mm_loadu_si128((const __m128i*)(Sp+i)));
-                    __m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
-                    _mm_storeu_si128((__m128i*)(D+i), _s0T);
-                    _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
+                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+                    v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale);
+                    v_store(D + i, v_s0d);
+                    v_store(SUM + i, v_s0 - v_load(Sm + i));
                 }
             }
-#elif CV_NEON
-            if(haveNEON)
-            {
-                float32x4_t v_scale = vdupq_n_f32((float)_scale);
-                for( ; i <= width - 4; i += 4 )
-                {
-                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
-                    int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
-                    vst1q_s32(D + i, v_s0d);
-                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
-                }
-            }
-#endif
+#endif
             for( ; i < width; i++ )
             {
                 int s0 = SUM[i] + Sp[i];
@@ -1095,32 +897,18 @@ struct ColumnSum<int, int> :
         else
         {
             int i = 0;
-#if CV_SSE2
-            if(haveSSE2)
-            {
-                for( ; i <= width-4; i+=4 )
-                {
-                    __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
-                    __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
-                                                 _mm_loadu_si128((const __m128i*)(Sp+i)));
-                    _mm_storeu_si128((__m128i*)(D+i), _s0);
-                    _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
-                }
-            }
-#elif CV_NEON
-            if(haveNEON)
+#if CV_SIMD128
+            if( haveSIMD128 )
             {
                 for( ; i <= width - 4; i += 4 )
                 {
-                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
+                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                    vst1q_s32(D + i, v_s0);
-                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
+                    v_store(D + i, v_s0);
+                    v_store(SUM + i, v_s0 - v_load(Sm + i));
                 }
             }
-#endif
+#endif
             for( ; i < width; i++ )
             {
                 int s0 = SUM[i] + Sp[i];
@@ -1159,11 +947,9 @@ struct ColumnSum<int, float> :
         bool haveScale = scale != 1;
         double _scale = scale;
-#if CV_SSE2
-        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
-#elif CV_NEON
-        bool haveNEON = checkHardwareSupport(CV_CPU_NEON);
-#endif
+#if CV_SIMD128
+        bool haveSIMD128 = hasSIMD128();
+#endif

         if( width != (int)sum.size() )
         {
@@ -1179,23 +965,15 @@ struct ColumnSum<int, float> :
         {
             const int* Sp = (const int*)src[0];
             int i = 0;
-#if CV_SSE2
-            if(haveSSE2)
+#if CV_SIMD128
+            if( haveSIMD128 )
            {
-                for( ; i <= width-4; i+=4 )
+                for( ; i <= width - 4; i += 4 )
                 {
-                    __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
-                    __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
-                    _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
+                    v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
                 }
             }
-#elif CV_NEON
-            if(haveNEON)
-            {
-                for( ; i <= width - 4; i += 4 )
-                    vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
-            }
-#endif
+#endif
             for( ; i < width; i++ )
                 SUM[i] += Sp[i];
@@ -1216,39 +994,23 @@ struct ColumnSum<int, float> :
         {
             int i = 0;
-#if CV_SSE2
-            if(haveSSE2)
-            {
-                const __m128 scale4 = _mm_set1_ps((float)_scale);
-                for( ; i < width-4; i+=4 )
-                {
-                    __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
-                    __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
-                                                 _mm_loadu_si128((const __m128i*)(Sp+i)));
-                    _mm_storeu_ps(D+i, _mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
-                    _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
-                }
-            }
-#elif CV_NEON
-            if(haveNEON)
+#if CV_SIMD128
+            if( haveSIMD128 )
             {
-                float32x4_t v_scale = vdupq_n_f32((float)_scale);
-                for( ; i <= width - 8; i += 8 )
+                v_float32x4 v_scale = v_setall_f32((float)_scale);
+                for (; i <= width - 8; i += 8)
                 {
-                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
-                    int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
+                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+                    v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4);
-                    vst1q_f32(D + i, vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
-                    vst1q_f32(D + i + 4, vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
+                    v_store(D + i, v_cvt_f32(v_s0) * v_scale);
+                    v_store(D + i + 4, v_cvt_f32(v_s01) * v_scale);
-                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
-                    vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
+                    v_store(SUM + i, v_s0 - v_load(Sm + i));
+                    v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4));
                 }
             }
-#endif
+#endif
             for( ; i < width; i++ )
             {
                 int s0 = SUM[i] + Sp[i];
@@ -1260,36 +1022,22 @@ struct ColumnSum<int, float> :
         {
             int i = 0;
-#if CV_SSE2
-            if(haveSSE2)
-            {
-                for( ; i < width-4; i+=4 )
-                {
-                    __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
-                    __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
-                                                 _mm_loadu_si128((const __m128i*)(Sp+i)));
-                    _mm_storeu_ps(D+i, _mm_cvtepi32_ps(_s0));
-                    _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
-                }
-            }
-#elif CV_NEON
-            if(haveNEON)
+#if CV_SIMD128
+            if( haveSIMD128 )
             {
                 for( ; i <= width - 8; i += 8 )
                 {
-                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
-                    int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
+                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+                    v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4);
-                    vst1q_f32(D + i, vcvtq_f32_s32(v_s0));
-                    vst1q_f32(D + i + 4, vcvtq_f32_s32(v_s01));
+                    v_store(D + i, v_cvt_f32(v_s0));
+                    v_store(D + i + 4, v_cvt_f32(v_s01));
-                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
-                    vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
+                    v_store(SUM + i, v_s0 - v_load(Sm + i));
+                    v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4));
                 }
             }
-#endif
+#endif
             for( ; i < width; i++ )
             {
                 int s0 = SUM[i] + Sp[i];
@@ -2395,46 +2143,20 @@ typedef struct
 } Histogram;

-#if CV_SSE2
-#define MEDIAN_HAVE_SIMD 1
+#if CV_SIMD128

 static inline void histogram_add_simd( const HT x[16], HT y[16] )
 {
-    const __m128i* rx = (const __m128i*)x;
-    __m128i* ry = (__m128i*)y;
-    __m128i r0 = _mm_add_epi16(_mm_load_si128(ry+0),_mm_load_si128(rx+0));
-    __m128i r1 = _mm_add_epi16(_mm_load_si128(ry+1),_mm_load_si128(rx+1));
-    _mm_store_si128(ry+0, r0);
-    _mm_store_si128(ry+1, r1);
+    v_store(y, v_load(x) + v_load(y));
+    v_store(y + 8, v_load(x + 8) + v_load(y + 8));
 }

 static inline void histogram_sub_simd( const HT x[16], HT y[16] )
 {
-    const __m128i* rx = (const __m128i*)x;
-    __m128i* ry = (__m128i*)y;
-    __m128i r0 = _mm_sub_epi16(_mm_load_si128(ry+0),_mm_load_si128(rx+0));
-    __m128i r1 = _mm_sub_epi16(_mm_load_si128(ry+1),_mm_load_si128(rx+1));
-    _mm_store_si128(ry+0, r0);
-    _mm_store_si128(ry+1, r1);
-}
-
-#elif CV_NEON
-#define MEDIAN_HAVE_SIMD 1
-
-static inline void histogram_add_simd( const HT x[16], HT y[16] )
-{
-    vst1q_u16(y, vaddq_u16(vld1q_u16(x), vld1q_u16(y)));
-    vst1q_u16(y + 8, vaddq_u16(vld1q_u16(x + 8), vld1q_u16(y + 8)));
-}
-
-static inline void histogram_sub_simd( const HT x[16], HT y[16] )
-{
-    vst1q_u16(y, vsubq_u16(vld1q_u16(y), vld1q_u16(x)));
-    vst1q_u16(y + 8, vsubq_u16(vld1q_u16(y + 8), vld1q_u16(x + 8)));
-}
-
-#else
-#define MEDIAN_HAVE_SIMD 0
+    v_store(y, v_load(y) - v_load(x));
+    v_store(y + 8, v_load(y + 8) - v_load(x + 8));
 }
 #endif
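The O(1) median filter keeps coarse and fine histograms of sixteen 16-bit counters each, so one 8-lane universal add or subtract per half updates a whole histogram in two operations on any target. A usage sketch for the rewritten helpers; HT and the buffer names are illustrative of how medianBlur_8u_O1 uses them, not code from the file:

typedef unsigned short HT;   // 16-bit bin counter, as in smooth.cpp

HT entering[16];   // histogram of the column entering the window
HT leaving[16];    // histogram of the column leaving the window
HT window[16];     // running histogram of the current kernel window
// ... fill the column histograms from the image ...

histogram_add_simd(entering, window);  // window slides right: add new column's bins
histogram_sub_simd(leaving, window);   // ...and drop the old column's bins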
@@ -2486,8 +2208,8 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
     std::vector<HT> _h_fine(16*16*(STRIPE_SIZE+2*r)*cn + 16);
     HT* h_coarse = alignPtr(&_h_coarse[0], 16);
     HT* h_fine = alignPtr(&_h_fine[0], 16);
-#if MEDIAN_HAVE_SIMD
-    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
+#if CV_SIMD128
+    volatile bool useSIMD = hasSIMD128();
 #endif

     for( int x = 0; x < _dst.cols; x += STRIPE_SIZE )
@@ -2533,7 +2255,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
             for( k = 0; k < 16; ++k )
                 histogram_muladd( 2*r+1, &h_fine[16*n*(16*c+k)], &H[c].fine[k][0] );

-#if MEDIAN_HAVE_SIMD
+#if CV_SIMD128
             if( useSIMD )
             {
                 for( j = 0; j < 2*r; ++j )
@@ -2597,7 +2319,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                 }
             }
             else
-#endif
+#endif
             {
                 for( j = 0; j < 2*r; ++j )
                     histogram_add( &h_coarse[16*(n*c+j)], H[c].coarse );
@@ -2871,20 +2593,20 @@ struct MinMax32f
     }
 };

-#if CV_SSE2
+#if CV_SIMD128

 struct MinMaxVec8u
 {
     typedef uchar value_type;
-    typedef __m128i arg_type;
+    typedef v_uint8x16 arg_type;
     enum { SIZE = 16 };
-    arg_type load(const uchar* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
-    void store(uchar* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
+    arg_type load(const uchar* ptr) { return v_load(ptr); }
+    void store(uchar* ptr, const arg_type &val) { v_store(ptr, val); }
     void operator()(arg_type& a, arg_type& b) const
     {
         arg_type t = a;
-        a = _mm_min_epu8(a, b);
-        b = _mm_max_epu8(b, t);
+        a = v_min(a, b);
+        b = v_max(b, t);
     }
 };
@@ -2892,80 +2614,15 @@ struct MinMaxVec8u
 struct MinMaxVec16u
 {
     typedef ushort value_type;
-    typedef __m128i arg_type;
+    typedef v_uint16x8 arg_type;
     enum { SIZE = 8 };
-    arg_type load(const ushort* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
-    void store(ushort* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
+    arg_type load(const ushort* ptr) { return v_load(ptr); }
+    void store(ushort* ptr, const arg_type &val) { v_store(ptr, val); }
     void operator()(arg_type& a, arg_type& b) const
     {
-        arg_type t = _mm_subs_epu16(a, b);
-        a = _mm_subs_epu16(a, t);
-        b = _mm_adds_epu16(b, t);
+        arg_type t = a;
+        a = v_min(a, b);
+        b = v_max(b, t);
     }
 };

-struct MinMaxVec16s
-{
-    typedef short value_type;
-    typedef __m128i arg_type;
-    enum { SIZE = 8 };
-    arg_type load(const short* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
-    void store(short* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
-    void operator()(arg_type& a, arg_type& b) const
-    {
-        arg_type t = a;
-        a = _mm_min_epi16(a, b);
-        b = _mm_max_epi16(b, t);
-    }
-};
-
-struct MinMaxVec32f
-{
-    typedef float value_type;
-    typedef __m128 arg_type;
-    enum { SIZE = 4 };
-    arg_type load(const float* ptr) { return _mm_loadu_ps(ptr); }
-    void store(float* ptr, arg_type val) { _mm_storeu_ps(ptr, val); }
-    void operator()(arg_type& a, arg_type& b) const
-    {
-        arg_type t = a;
-        a = _mm_min_ps(a, b);
-        b = _mm_max_ps(b, t);
-    }
-};
-
-#elif CV_NEON
-
-struct MinMaxVec8u
-{
-    typedef uchar value_type;
-    typedef uint8x16_t arg_type;
-    enum { SIZE = 16 };
-    arg_type load(const uchar* ptr) { return vld1q_u8(ptr); }
-    void store(uchar* ptr, arg_type val) { vst1q_u8(ptr, val); }
-    void operator()(arg_type& a, arg_type& b) const
-    {
-        arg_type t = a;
-        a = vminq_u8(a, b);
-        b = vmaxq_u8(b, t);
-    }
-};
-
-struct MinMaxVec16u
-{
-    typedef ushort value_type;
-    typedef uint16x8_t arg_type;
-    enum { SIZE = 8 };
-    arg_type load(const ushort* ptr) { return vld1q_u16(ptr); }
-    void store(ushort* ptr, arg_type val) { vst1q_u16(ptr, val); }
-    void operator()(arg_type& a, arg_type& b) const
-    {
-        arg_type t = a;
-        a = vminq_u16(a, b);
-        b = vmaxq_u16(b, t);
-    }
-};
@@ -2973,15 +2630,15 @@ struct MinMaxVec16u
 struct MinMaxVec16s
 {
     typedef short value_type;
-    typedef int16x8_t arg_type;
+    typedef v_int16x8 arg_type;
     enum { SIZE = 8 };
-    arg_type load(const short* ptr) { return vld1q_s16(ptr); }
-    void store(short* ptr, arg_type val) { vst1q_s16(ptr, val); }
+    arg_type load(const short* ptr) { return v_load(ptr); }
+    void store(short* ptr, const arg_type &val) { v_store(ptr, val); }
     void operator()(arg_type& a, arg_type& b) const
     {
         arg_type t = a;
-        a = vminq_s16(a, b);
-        b = vmaxq_s16(b, t);
+        a = v_min(a, b);
+        b = v_max(b, t);
     }
 };
@@ -2989,19 +2646,18 @@ struct MinMaxVec16s
 struct MinMaxVec32f
 {
     typedef float value_type;
-    typedef float32x4_t arg_type;
+    typedef v_float32x4 arg_type;
     enum { SIZE = 4 };
-    arg_type load(const float* ptr) { return vld1q_f32(ptr); }
-    void store(float* ptr, arg_type val) { vst1q_f32(ptr, val); }
+    arg_type load(const float* ptr) { return v_load(ptr); }
+    void store(float* ptr, const arg_type &val) { v_store(ptr, val); }
     void operator()(arg_type& a, arg_type& b) const
     {
         arg_type t = a;
-        a = vminq_f32(a, b);
-        b = vmaxq_f32(b, t);
+        a = v_min(a, b);
+        b = v_max(b, t);
     }
 };

 #else

 typedef MinMax8u MinMaxVec8u;
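medianBlur_SortNet uses these structs as the compare-exchange element of a sorting network: after vop(a, b), a holds the lane-wise minima and b the maxima, so three exchanges give a vectorized median of three rows, which is the inner step of the ksize == 3 case. A standalone sketch using the MinMaxVec32f struct defined above (buffer names are illustrative):

// Vectorized median of three rows: three compare-exchanges leave the
// lane-wise median in p1 (the classic 3-element sorting network).
static void median3(const float* r0, const float* r1, const float* r2, float* dst)
{
    MinMaxVec32f vop;                        // compare-exchange functor from above
    MinMaxVec32f::arg_type p0 = vop.load(r0);
    MinMaxVec32f::arg_type p1 = vop.load(r1);
    MinMaxVec32f::arg_type p2 = vop.load(r2);
    vop(p1, p2);    // order the last two
    vop(p0, p1);    // the overall minimum settles in p0
    vop(p1, p2);    // order the last two again: p1 now holds the median
    vop.store(dst, p1);
}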
@@ -3027,7 +2683,7 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
     int i, j, k, cn = _src.channels();
     Op op;
     VecOp vop;
-    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
+    volatile bool useSIMD = hasSIMD128();

     if( m == 3 )
     {
@@ -3478,7 +3134,7 @@ void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize )
 #endif

     bool useSortNet = ksize == 3 || (ksize == 5
-#if !(CV_SSE2 || CV_NEON)
+#if !(CV_SIMD128)
             && ( src0.depth() > CV_8U || src0.channels() == 2 || src0.channels() > 4 )
 #endif
             );
@@ -3513,7 +3169,7 @@ void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize )
         double img_size_mp = (double)(src0.total())/(1 << 20);
         if( ksize <= 3 + (img_size_mp < 1 ? 12 : img_size_mp < 4 ? 6 : 2)*
-            (MEDIAN_HAVE_SIMD && (checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON)) ? 1 : 3))
+            (CV_SIMD128 && hasSIMD128() ? 1 : 3))
             medianBlur_8u_Om( src, dst, ksize );
         else
             medianBlur_8u_O1( src, dst, ksize );
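A worked example of this selection rule, per the expression above: for a 2-megapixel 8-bit image, img_size_mp is about 2, so with SIMD128 available the cutoff is 3 + 6 * 1 = 9, and apertures up to ksize 9 take the O(N) medianBlur_8u_Om path while larger ones go to the constant-time histogram filter; without SIMD128 the factor of 3 pushes the cutoff to 3 + 6 * 3 = 21.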