Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
0b4e06ad
Commit
0b4e06ad
authored
Jul 29, 2016
by
k-shinotsuka
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
improve SumSqr_SIMD<uchar, int, int>()
parent
b34272f8
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
21 additions
and
23 deletions
+21
-23
stat.cpp
modules/core/src/stat.cpp
+21
-23
No files found.
modules/core/src/stat.cpp
View file @
0b4e06ad
...
@@ -757,38 +757,36 @@ struct SumSqr_SIMD<uchar, int, int>
...
@@ -757,38 +757,36 @@ struct SumSqr_SIMD<uchar, int, int>
int
x
=
0
;
int
x
=
0
;
__m128i
v_zero
=
_mm_setzero_si128
(),
v_sum
=
v_zero
,
v_sqsum
=
v_zero
;
__m128i
v_zero
=
_mm_setzero_si128
(),
v_sum
=
v_zero
,
v_sqsum
=
v_zero
;
const
int
len_16
=
len
&
~
15
;
for
(
;
x
<=
len
-
16
;
x
+=
16
)
for
(
;
x
<=
len_16
-
16
;
)
{
const
int
len_tmp
=
min
(
x
+
2048
,
len_16
);
__m128i
v_sum_tmp
=
v_zero
;
for
(
;
x
<=
len_tmp
-
16
;
x
+=
16
)
{
{
__m128i
v_src
=
_mm_loadu_si128
((
const
__m128i
*
)(
src0
+
x
));
__m128i
v_src
=
_mm_loadu_si128
((
const
__m128i
*
)(
src0
+
x
));
__m128i
v_half
=
_mm_unpacklo_epi8
(
v_src
,
v_zero
);
__m128i
v_half_0
=
_mm_unpacklo_epi8
(
v_src
,
v_zero
);
__m128i
v_half_1
=
_mm_unpackhi_epi8
(
v_src
,
v_zero
);
__m128i
v_mullo
=
_mm_mullo_epi16
(
v_half
,
v_half
);
v_sum_tmp
=
_mm_add_epi16
(
v_sum_tmp
,
_mm_add_epi16
(
v_half_0
,
v_half_1
));
__m128i
v_mulhi
=
_mm_mulhi_epi16
(
v_half
,
v_half
);
__m128i
v_half_2
=
_mm_unpacklo_epi16
(
v_half_0
,
v_half_1
);
v_sum
=
_mm_add_epi32
(
v_sum
,
_mm_unpacklo_epi16
(
v_half
,
v_zero
));
__m128i
v_half_3
=
_mm_unpackhi_epi16
(
v_half_0
,
v_half_1
);
v_sum
=
_mm_add_epi32
(
v_sum
,
_mm_unpackhi_epi16
(
v_half
,
v_zero
));
v_sqsum
=
_mm_add_epi32
(
v_sqsum
,
_mm_madd_epi16
(
v_half_2
,
v_half_2
));
v_sqsum
=
_mm_add_epi32
(
v_sqsum
,
_mm_unpacklo_epi16
(
v_mullo
,
v_mulhi
));
v_sqsum
=
_mm_add_epi32
(
v_sqsum
,
_mm_madd_epi16
(
v_half_3
,
v_half_3
));
v_sqsum
=
_mm_add_epi32
(
v_sqsum
,
_mm_unpackhi_epi16
(
v_mullo
,
v_mulhi
));
}
v_sum
=
_mm_add_epi32
(
v_sum
,
_mm_unpacklo_epi16
(
v_sum_tmp
,
v_zero
));
v_half
=
_mm_unpackhi_epi8
(
v_src
,
v_zero
);
v_sum
=
_mm_add_epi32
(
v_sum
,
_mm_unpackhi_epi16
(
v_sum_tmp
,
v_zero
));
v_mullo
=
_mm_mullo_epi16
(
v_half
,
v_half
);
v_mulhi
=
_mm_mulhi_epi16
(
v_half
,
v_half
);
v_sum
=
_mm_add_epi32
(
v_sum
,
_mm_unpacklo_epi16
(
v_half
,
v_zero
));
v_sum
=
_mm_add_epi32
(
v_sum
,
_mm_unpackhi_epi16
(
v_half
,
v_zero
));
v_sqsum
=
_mm_add_epi32
(
v_sqsum
,
_mm_unpacklo_epi16
(
v_mullo
,
v_mulhi
));
v_sqsum
=
_mm_add_epi32
(
v_sqsum
,
_mm_unpackhi_epi16
(
v_mullo
,
v_mulhi
));
}
}
for
(
;
x
<=
len
-
8
;
x
+=
8
)
for
(
;
x
<=
len
-
8
;
x
+=
8
)
{
{
__m128i
v_src
=
_mm_unpacklo_epi8
(
_mm_loadl_epi64
((
__m128i
const
*
)(
src0
+
x
)),
v_zero
);
__m128i
v_src
=
_mm_unpacklo_epi8
(
_mm_loadl_epi64
((
__m128i
const
*
)(
src0
+
x
)),
v_zero
);
__m128i
v_half_0
=
_mm_unpackhi_epi64
(
v_src
,
v_src
);
__m128i
v_sum_tmp
=
_mm_add_epi16
(
v_src
,
v_half_0
);
__m128i
v_half_1
=
_mm_unpacklo_epi16
(
v_src
,
v_half_0
);
__m128i
v_mullo
=
_mm_mullo_epi16
(
v_src
,
v_src
);
v_sum
=
_mm_add_epi32
(
v_sum
,
_mm_unpacklo_epi16
(
v_sum_tmp
,
v_zero
));
__m128i
v_mulhi
=
_mm_mulhi_epi16
(
v_src
,
v_src
);
v_sqsum
=
_mm_add_epi32
(
v_sqsum
,
_mm_madd_epi16
(
v_half_1
,
v_half_1
));
v_sum
=
_mm_add_epi32
(
v_sum
,
_mm_unpacklo_epi16
(
v_src
,
v_zero
));
v_sum
=
_mm_add_epi32
(
v_sum
,
_mm_unpackhi_epi16
(
v_src
,
v_zero
));
v_sqsum
=
_mm_add_epi32
(
v_sqsum
,
_mm_unpacklo_epi16
(
v_mullo
,
v_mulhi
));
v_sqsum
=
_mm_add_epi32
(
v_sqsum
,
_mm_unpackhi_epi16
(
v_mullo
,
v_mulhi
));
}
}
int
CV_DECL_ALIGNED
(
16
)
ar
[
8
];
int
CV_DECL_ALIGNED
(
16
)
ar
[
8
];
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment