Commit 0e9c90a0 (opencv), authored Jan 15, 2019 by Alexander Alekhin
Merge pull request #13610 from terfendail:morph_wintr
parents 36dfb71c 012e43de
Showing 2 changed files with 211 additions and 368 deletions

modules/imgproc/src/filter.cpp  +1 -1
modules/imgproc/src/morph.cpp   +210 -367
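For orientation before the diffs: the kernels touched here sit behind cv::erode and cv::dilate. A minimal usage sketch of that public API (the input file name is a placeholder):

#include <opencv2/imgproc.hpp>
#include <opencv2/imgcodecs.hpp>

int main()
{
    // Placeholder input; any 8-bit, 16-bit or 32f image exercises these kernels.
    cv::Mat img = cv::imread("input.png", cv::IMREAD_GRAYSCALE);
    cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(5, 5));
    cv::Mat eroded, dilated;
    cv::erode(img, eroded, kernel);    // row/column min filters below
    cv::dilate(img, dilated, kernel);  // row/column max filters below
    return 0;
}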
modules/imgproc/src/filter.cpp  @ 0e9c90a0

...
@@ -213,7 +213,7 @@ int FilterEngine::start(const Size &_wholeSize, const Size &sz, const Point &ofs
         }
     }
     // adjust bufstep so that the used part of the ring buffer stays compact in memory
-    bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),16);
+    bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN);
 
     dx1 = std::max(anchor.x - roi.x, 0);
     dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0);
...
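A note on this one-line change (context, not part of the diff): cv::alignSize rounds a size up to a multiple of n, so the ring-buffer row stride moves from a fixed 16-byte (SSE-sized) multiple to whatever alignment the VEC_ALIGN constant names in this file. A sketch of the rounding, assuming n is a power of two:

#include <cstddef>

// Mirrors the cv::alignSize helper used above; n must be a power of two.
static inline size_t alignSize(size_t sz, int n)
{
    return (sz + n - 1) & -n;   // alignSize(100, 16) == 112, alignSize(100, 64) == 128
}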
modules/imgproc/src/morph.cpp  @ 0e9c90a0

...
@@ -45,6 +45,7 @@
 #include "opencl_kernels_imgproc.hpp"
 #include <iostream>
 #include "hal_replacement.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 #include <opencv2/core/utils/configuration.private.hpp>
 
 /****************************************************************************************\
...
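The newly added include is OpenCV's "wide universal intrinsics" layer: vx_load/v_store/v_min and friends compile down to the widest SIMD instruction set enabled for the build (SSE2, AVX2, NEON, VSX, ...), which is what lets the rewritten kernels below replace the hand-written __m128i code. A minimal sketch of the pattern, assuming a CV_SIMD build (min_row is a hypothetical helper, not part of the commit):

#include <algorithm>
#include "opencv2/core/hal/intrin.hpp"

// Hypothetical helper showing the universal-intrinsic pattern used below:
// element-wise minimum of two uchar rows, wide vectors first, scalar tail last.
static void min_row(const uchar* a, const uchar* b, uchar* dst, int width)
{
    int i = 0;
#if CV_SIMD
    for( ; i <= width - cv::v_uint8::nlanes; i += cv::v_uint8::nlanes )
        cv::v_store(dst + i, cv::v_min(cv::vx_load(a + i), cv::vx_load(b + i)));
#endif
    for( ; i < width; i++ )
        dst[i] = std::min(a[i], b[i]);
}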
@@ -97,73 +98,65 @@ struct MorphNoVec
     int operator()(uchar**, int, uchar*, int) const { return 0; }
 };
 
-#if CV_SSE2
+#if CV_SIMD
 
-template<class VecUpdate> struct MorphRowIVec
+template<class VecUpdate> struct MorphRowVec
 {
-    enum { ESZ = VecUpdate::ESZ };
-
-    MorphRowIVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
+    typedef typename VecUpdate::vtype vtype;
+    typedef typename vtype::lane_type stype;
+    MorphRowVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
     int operator()(const uchar* src, uchar* dst, int width, int cn) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
-        cn *= ESZ;
         int i, k, _ksize = ksize*cn;
-        width = (width & -4)*cn;
+        width *= cn;
         VecUpdate updateOp;
 
-        for( i = 0; i <= width - 16; i += 16 )
+        for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
         {
-            __m128i s = _mm_loadu_si128((const __m128i*)(src + i));
-            for( k = cn; k < _ksize; k += cn )
+            vtype s0 = vx_load((const stype*)src + i);
+            vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
+            vtype s2 = vx_load((const stype*)src + i + 2*vtype::nlanes);
+            vtype s3 = vx_load((const stype*)src + i + 3*vtype::nlanes);
+            for( k = cn; k < _ksize; k += cn )
             {
-                __m128i x = _mm_loadu_si128((const __m128i*)(src + i + k));
-                s = updateOp(s, x);
+                s0 = updateOp(s0, vx_load((const stype*)src + i + k));
+                s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
+                s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*vtype::nlanes));
+                s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*vtype::nlanes));
             }
-            _mm_storeu_si128((__m128i*)(dst + i), s);
+            v_store((stype*)dst + i, s0);
+            v_store((stype*)dst + i + vtype::nlanes, s1);
+            v_store((stype*)dst + i + 2*vtype::nlanes, s2);
+            v_store((stype*)dst + i + 3*vtype::nlanes, s3);
         }
-
-        for( ; i < width; i += 4 )
+        if( i <= width - 2*vtype::nlanes )
         {
-            __m128i s = _mm_cvtsi32_si128(*(const int*)(src + i));
+            vtype s0 = vx_load((const stype*)src + i);
+            vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
             for( k = cn; k < _ksize; k += cn )
             {
-                __m128i x = _mm_cvtsi32_si128(*(const int*)(src + i + k));
-                s = updateOp(s, x);
+                s0 = updateOp(s0, vx_load((const stype*)src + i + k));
+                s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
             }
-            *(int*)(dst + i) = _mm_cvtsi128_si32(s);
+            v_store((stype*)dst + i, s0);
+            v_store((stype*)dst + i + vtype::nlanes, s1);
+            i += 2*vtype::nlanes;
         }
-
-        return i/ESZ;
-    }
-
-    int ksize, anchor;
-};
-
-
-template<class VecUpdate> struct MorphRowFVec
-{
-    MorphRowFVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
-    int operator()(const uchar* src, uchar* dst, int width, int cn) const
-    {
-        if( !checkHardwareSupport(CV_CPU_SSE) )
-            return 0;
-
-        int i, k, _ksize = ksize*cn;
-        width = (width & -4)*cn;
-        VecUpdate updateOp;
-
-        for( i = 0; i < width; i += 4 )
+        if( i <= width - vtype::nlanes )
         {
-            __m128 s = _mm_loadu_ps((const float*)src + i);
+            vtype s = vx_load((const stype*)src + i);
             for( k = cn; k < _ksize; k += cn )
-            {
-                __m128 x = _mm_loadu_ps((const float*)src + i + k);
-                s = updateOp(s, x);
-            }
-            _mm_storeu_ps((float*)dst + i, s);
+                s = updateOp(s, vx_load((const stype*)src + i + k));
+            v_store((stype*)dst + i, s);
+            i += vtype::nlanes;
+        }
+        if( i <= width - vtype::nlanes/2 )
+        {
+            vtype s = vx_load_low((const stype*)src + i);
+            for( k = cn; k < _ksize; k += cn )
+                s = updateOp(s, vx_load_low((const stype*)src + i + k));
+            v_store_low((stype*)dst + i, s);
+            i += vtype::nlanes/2;
         }
 
         return i;
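The loop structure above recurs in every rewritten kernel: a four-vector main loop followed by two-vector, one-vector and half-vector drains, so at most nlanes/2 - 1 elements are left for the scalar fallback. A standalone sketch of that shape, with a plain copy standing in for the min/max functor (copy_row is a hypothetical name, not part of the commit):

#include "opencv2/core/hal/intrin.hpp"

#if CV_SIMD
static void copy_row(const uchar* src, uchar* dst, int width)
{
    using cv::v_uint8;
    int i = 0;
    for( ; i <= width - 4*v_uint8::nlanes; i += 4*v_uint8::nlanes )   // 4-vector main loop
    {
        cv::v_store(dst + i,                     cv::vx_load(src + i));
        cv::v_store(dst + i +   v_uint8::nlanes, cv::vx_load(src + i +   v_uint8::nlanes));
        cv::v_store(dst + i + 2*v_uint8::nlanes, cv::vx_load(src + i + 2*v_uint8::nlanes));
        cv::v_store(dst + i + 3*v_uint8::nlanes, cv::vx_load(src + i + 3*v_uint8::nlanes));
    }
    if( i <= width - 2*v_uint8::nlanes )                              // 2-vector drain
    {
        cv::v_store(dst + i, cv::vx_load(src + i));
        cv::v_store(dst + i + v_uint8::nlanes, cv::vx_load(src + i + v_uint8::nlanes));
        i += 2*v_uint8::nlanes;
    }
    if( i <= width - v_uint8::nlanes )                                // 1-vector drain
    {
        cv::v_store(dst + i, cv::vx_load(src + i));
        i += v_uint8::nlanes;
    }
    if( i <= width - v_uint8::nlanes/2 )                              // half-vector drain
    {
        cv::v_store_low(dst + i, cv::vx_load_low(src + i));
        i += v_uint8::nlanes/2;
    }
    for( ; i < width; i++ )   // at most nlanes/2 - 1 leftovers
        dst[i] = src[i];
}
#endif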
...
@@ -173,230 +166,156 @@ template<class VecUpdate> struct MorphRowFVec
 };
 
-template<class VecUpdate> struct MorphColumnIVec
+template<class VecUpdate> struct MorphColumnVec
 {
-    enum { ESZ = VecUpdate::ESZ };
-
-    MorphColumnIVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
-    int operator()(const uchar** src, uchar* dst, int dststep, int count, int width) const
+    typedef typename VecUpdate::vtype vtype;
+    typedef typename vtype::lane_type stype;
+    MorphColumnVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
+    int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
         int i = 0, k, _ksize = ksize;
-        width *= ESZ;
         VecUpdate updateOp;
 
         for( i = 0; i < count + ksize - 1; i++ )
-            CV_Assert( ((size_t)src[i] & 15) == 0 );
+            CV_Assert( ((size_t)_src[i] & (CV_SIMD_WIDTH-1)) == 0 );
+
+        const stype** src = (const stype**)_src;
+        stype* dst = (stype*)_dst;
+        dststep /= sizeof(dst[0]);
 
         for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 )
         {
-            for( i = 0; i <= width - 32; i += 32 )
+            for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
             {
-                const uchar* sptr = src[1] + i;
-                __m128i s0 = _mm_load_si128((const __m128i*)sptr);
-                __m128i s1 = _mm_load_si128((const __m128i*)(sptr + 16));
-                __m128i x0, x1;
+                const stype* sptr = src[1] + i;
+                vtype s0 = vx_load_aligned(sptr);
+                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+                vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
+                vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);
 
                 for( k = 2; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
-                    x0 = _mm_load_si128((const __m128i*)sptr);
-                    x1 = _mm_load_si128((const __m128i*)(sptr + 16));
-                    s0 = updateOp(s0, x0);
-                    s1 = updateOp(s1, x1);
+                    s0 = updateOp(s0, vx_load_aligned(sptr));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
+                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
+                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
                 }
 
                 sptr = src[0] + i;
-                x0 = _mm_load_si128((const __m128i*)sptr);
-                x1 = _mm_load_si128((const __m128i*)(sptr + 16));
-                _mm_storeu_si128((__m128i*)(dst + i), updateOp(s0, x0));
-                _mm_storeu_si128((__m128i*)(dst + i + 16), updateOp(s1, x1));
+                v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
+                v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
+                v_store(dst + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
+                v_store(dst + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));
 
                 sptr = src[k] + i;
-                x0 = _mm_load_si128((const __m128i*)sptr);
-                x1 = _mm_load_si128((const __m128i*)(sptr + 16));
-                _mm_storeu_si128((__m128i*)(dst + dststep + i), updateOp(s0, x0));
-                _mm_storeu_si128((__m128i*)(dst + dststep + i + 16), updateOp(s1, x1));
+                v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
+                v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
+                v_store(dst + dststep + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
+                v_store(dst + dststep + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));
             }
-
-            for( ; i <= width - 8; i += 8 )
+            if( i <= width - 2*vtype::nlanes )
             {
-                __m128i s0 = _mm_loadl_epi64((const __m128i*)(src[1] + i)), x0;
+                const stype* sptr = src[1] + i;
+                vtype s0 = vx_load_aligned(sptr);
+                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
 
                 for( k = 2; k < _ksize; k++ )
                 {
-                    x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i));
-                    s0 = updateOp(s0, x0);
+                    sptr = src[k] + i;
+                    s0 = updateOp(s0, vx_load_aligned(sptr));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
                 }
 
-                x0 = _mm_loadl_epi64((const __m128i*)(src[0] + i));
-                _mm_storel_epi64((__m128i*)(dst + i), updateOp(s0, x0));
-                x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i));
-                _mm_storel_epi64((__m128i*)(dst + dststep + i), updateOp(s0, x0));
+                sptr = src[0] + i;
+                v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
+                v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
+
+                sptr = src[k] + i;
+                v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
+                v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
+                i += 2*vtype::nlanes;
+            }
+            if( i <= width - vtype::nlanes )
+            {
+                vtype s0 = vx_load_aligned(src[1] + i);
+
+                for( k = 2; k < _ksize; k++ )
+                    s0 = updateOp(s0, vx_load_aligned(src[k] + i));
+
+                v_store(dst + i, updateOp(s0, vx_load_aligned(src[0] + i)));
+                v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(src[k] + i)));
+                i += vtype::nlanes;
+            }
+            if( i <= width - vtype::nlanes/2 )
+            {
+                vtype s0 = vx_load_low(src[1] + i);
+
+                for( k = 2; k < _ksize; k++ )
+                    s0 = updateOp(s0, vx_load_low(src[k] + i));
+
+                v_store_low(dst + i, updateOp(s0, vx_load_low(src[0] + i)));
+                v_store_low(dst + dststep + i, updateOp(s0, vx_load_low(src[k] + i)));
+                i += vtype::nlanes/2;
             }
         }
 
         for( ; count > 0; count--, dst += dststep, src++ )
         {
-            for( i = 0; i <= width - 32; i += 32 )
+            for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
             {
-                const uchar* sptr = src[0] + i;
-                __m128i s0 = _mm_load_si128((const __m128i*)sptr);
-                __m128i s1 = _mm_load_si128((const __m128i*)(sptr + 16));
-                __m128i x0, x1;
+                const stype* sptr = src[0] + i;
+                vtype s0 = vx_load_aligned(sptr);
+                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+                vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
+                vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);
 
                 for( k = 1; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
-                    x0 = _mm_load_si128((const __m128i*)sptr);
-                    x1 = _mm_load_si128((const __m128i*)(sptr + 16));
-                    s0 = updateOp(s0, x0);
-                    s1 = updateOp(s1, x1);
+                    s0 = updateOp(s0, vx_load_aligned(sptr));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
+                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
+                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
                 }
-                _mm_storeu_si128((__m128i*)(dst + i), s0);
-                _mm_storeu_si128((__m128i*)(dst + i + 16), s1);
+                v_store(dst + i, s0);
+                v_store(dst + i + vtype::nlanes, s1);
+                v_store(dst + i + 2*vtype::nlanes, s2);
+                v_store(dst + i + 3*vtype::nlanes, s3);
             }
-
-            for( ; i <= width - 8; i += 8 )
+            if( i <= width - 2*vtype::nlanes )
             {
-                __m128i s0 = _mm_loadl_epi64((const __m128i*)(src[0] + i)), x0;
+                const stype* sptr = src[0] + i;
+                vtype s0 = vx_load_aligned(sptr);
+                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
 
                 for( k = 1; k < _ksize; k++ )
                 {
-                    x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i));
-                    s0 = updateOp(s0, x0);
+                    sptr = src[k] + i;
+                    s0 = updateOp(s0, vx_load_aligned(sptr));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
                 }
-                _mm_storel_epi64((__m128i*)(dst + i), s0);
+                v_store(dst + i, s0);
+                v_store(dst + i + vtype::nlanes, s1);
+                i += 2*vtype::nlanes;
             }
-        }
-
-        return i/ESZ;
-    }
-
-    int ksize, anchor;
-};
-
-
-template<class VecUpdate> struct MorphColumnFVec
-{
-    MorphColumnFVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
-    int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const
-    {
-        if( !checkHardwareSupport(CV_CPU_SSE) )
-            return 0;
-
-        int i = 0, k, _ksize = ksize;
-        VecUpdate updateOp;
-
-        for( i = 0; i < count + ksize - 1; i++ )
-            CV_Assert( ((size_t)_src[i] & 15) == 0 );
-
-        const float** src = (const float**)_src;
-        float* dst = (float*)_dst;
-        dststep /= sizeof(dst[0]);
-
-        for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 )
-        {
-            for( i = 0; i <= width - 16; i += 16 )
+            if( i <= width - vtype::nlanes )
             {
-                const float* sptr = src[1] + i;
-                __m128 s0 = _mm_load_ps(sptr);
-                __m128 s1 = _mm_load_ps(sptr + 4);
-                __m128 s2 = _mm_load_ps(sptr + 8);
-                __m128 s3 = _mm_load_ps(sptr + 12);
-                __m128 x0, x1, x2, x3;
+                vtype s0 = vx_load_aligned(src[0] + i);
 
-                for( k = 2; k < _ksize; k++ )
-                {
-                    sptr = src[k] + i;
-                    x0 = _mm_load_ps(sptr);
-                    x1 = _mm_load_ps(sptr + 4);
-                    s0 = updateOp(s0, x0);
-                    s1 = updateOp(s1, x1);
-                    x2 = _mm_load_ps(sptr + 8);
-                    x3 = _mm_load_ps(sptr + 12);
-                    s2 = updateOp(s2, x2);
-                    s3 = updateOp(s3, x3);
-                }
-
-                sptr = src[0] + i;
-                x0 = _mm_load_ps(sptr);
-                x1 = _mm_load_ps(sptr + 4);
-                x2 = _mm_load_ps(sptr + 8);
-                x3 = _mm_load_ps(sptr + 12);
-                _mm_storeu_ps(dst + i, updateOp(s0, x0));
-                _mm_storeu_ps(dst + i + 4, updateOp(s1, x1));
-                _mm_storeu_ps(dst + i + 8, updateOp(s2, x2));
-                _mm_storeu_ps(dst + i + 12, updateOp(s3, x3));
-
-                sptr = src[k] + i;
-                x0 = _mm_load_ps(sptr);
-                x1 = _mm_load_ps(sptr + 4);
-                x2 = _mm_load_ps(sptr + 8);
-                x3 = _mm_load_ps(sptr + 12);
-                _mm_storeu_ps(dst + dststep + i, updateOp(s0, x0));
-                _mm_storeu_ps(dst + dststep + i + 4, updateOp(s1, x1));
-                _mm_storeu_ps(dst + dststep + i + 8, updateOp(s2, x2));
-                _mm_storeu_ps(dst + dststep + i + 12, updateOp(s3, x3));
+                for( k = 1; k < _ksize; k++ )
+                    s0 = updateOp(s0, vx_load_aligned(src[k] + i));
+                v_store(dst + i, s0);
+                i += vtype::nlanes;
             }
-
-            for( ; i <= width - 4; i += 4 )
+            if( i <= width - vtype::nlanes/2 )
             {
-                __m128 s0 = _mm_load_ps(src[1] + i), x0;
+                vtype s0 = vx_load_low(src[0] + i);
 
-                for( k = 2; k < _ksize; k++ )
-                {
-                    x0 = _mm_load_ps(src[k] + i);
-                    s0 = updateOp(s0, x0);
-                }
-
-                x0 = _mm_load_ps(src[0] + i);
-                _mm_storeu_ps(dst + i, updateOp(s0, x0));
-                x0 = _mm_load_ps(src[k] + i);
-                _mm_storeu_ps(dst + dststep + i, updateOp(s0, x0));
-            }
-        }
-
-        for( ; count > 0; count--, dst += dststep, src++ )
-        {
-            for( i = 0; i <= width - 16; i += 16 )
-            {
-                const float* sptr = src[0] + i;
-                __m128 s0 = _mm_load_ps(sptr);
-                __m128 s1 = _mm_load_ps(sptr + 4);
-                __m128 s2 = _mm_load_ps(sptr + 8);
-                __m128 s3 = _mm_load_ps(sptr + 12);
-                __m128 x0, x1, x2, x3;
-
                 for( k = 1; k < _ksize; k++ )
-                {
-                    sptr = src[k] + i;
-                    x0 = _mm_load_ps(sptr);
-                    x1 = _mm_load_ps(sptr + 4);
-                    s0 = updateOp(s0, x0);
-                    s1 = updateOp(s1, x1);
-                    x2 = _mm_load_ps(sptr + 8);
-                    x3 = _mm_load_ps(sptr + 12);
-                    s2 = updateOp(s2, x2);
-                    s3 = updateOp(s3, x3);
-                }
-                _mm_storeu_ps(dst + i, s0);
-                _mm_storeu_ps(dst + i + 4, s1);
-                _mm_storeu_ps(dst + i + 8, s2);
-                _mm_storeu_ps(dst + i + 12, s3);
-            }
-
-            for( i = 0; i <= width - 4; i += 4 )
-            {
-                __m128 s0 = _mm_load_ps(src[0] + i), x0;
-
-                for( k = 1; k < _ksize; k++ )
-                {
-                    x0 = _mm_load_ps(src[k] + i);
-                    s0 = updateOp(s0, x0);
-                }
-                _mm_storeu_ps(dst + i, s0);
+                    s0 = updateOp(s0, vx_load_low(src[k] + i));
+                v_store_low(dst + i, s0);
+                i += vtype::nlanes/2;
             }
         }
...
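One behavioural detail of the column kernels above: they use vx_load_aligned, so the asserted contract on the source row pointers widens from 16 bytes (the old "& 15" test) to the full register width CV_SIMD_WIDTH (16/32/64 bytes depending on the enabled ISA). A small sketch of the check (simd_aligned is a hypothetical helper):

#include "opencv2/core/hal/intrin.hpp"
#include <cstddef>

static bool simd_aligned(const void* p)
{
#if CV_SIMD
    return ((size_t)p & (CV_SIMD_WIDTH - 1)) == 0;  // the old code tested & 15
#else
    return true;
#endif
}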
@@ -407,185 +326,109 @@ template<class VecUpdate> struct MorphColumnFVec
 };
 
-template<class VecUpdate> struct MorphIVec
+template<class VecUpdate> struct MorphVec
 {
-    enum { ESZ = VecUpdate::ESZ };
-
-    int operator()(uchar** src, int nz, uchar* dst, int width) const
+    typedef typename VecUpdate::vtype vtype;
+    typedef typename vtype::lane_type stype;
+    int operator()(uchar** _src, int nz, uchar* _dst, int width) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
+        const stype** src = (const stype**)_src;
+        stype* dst = (stype*)_dst;
         int i, k;
-        width *= ESZ;
         VecUpdate updateOp;
 
-        for( i = 0; i <= width - 32; i += 32 )
+        for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
         {
-            const uchar* sptr = src[0] + i;
-            __m128i s0 = _mm_loadu_si128((const __m128i*)sptr);
-            __m128i s1 = _mm_loadu_si128((const __m128i*)(sptr + 16));
-            __m128i x0, x1;
+            const stype* sptr = src[0] + i;
+            vtype s0 = vx_load(sptr);
+            vtype s1 = vx_load(sptr + vtype::nlanes);
+            vtype s2 = vx_load(sptr + 2*vtype::nlanes);
+            vtype s3 = vx_load(sptr + 3*vtype::nlanes);
 
             for( k = 1; k < nz; k++ )
             {
                 sptr = src[k] + i;
-                x0 = _mm_loadu_si128((const __m128i*)sptr);
-                x1 = _mm_loadu_si128((const __m128i*)(sptr + 16));
-                s0 = updateOp(s0, x0);
-                s1 = updateOp(s1, x1);
+                s0 = updateOp(s0, vx_load(sptr));
+                s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
+                s2 = updateOp(s2, vx_load(sptr + 2*vtype::nlanes));
+                s3 = updateOp(s3, vx_load(sptr + 3*vtype::nlanes));
             }
-            _mm_storeu_si128((__m128i*)(dst + i), s0);
-            _mm_storeu_si128((__m128i*)(dst + i + 16), s1);
+            v_store(dst + i, s0);
+            v_store(dst + i + vtype::nlanes, s1);
+            v_store(dst + i + 2*vtype::nlanes, s2);
+            v_store(dst + i + 3*vtype::nlanes, s3);
         }
-
-        for( ; i <= width - 8; i += 8 )
+        if( i <= width - 2*vtype::nlanes )
         {
-            __m128i s0 = _mm_loadl_epi64((const __m128i*)(src[0] + i)), x0;
-
+            const stype* sptr = src[0] + i;
+            vtype s0 = vx_load(sptr);
+            vtype s1 = vx_load(sptr + vtype::nlanes);
             for( k = 1; k < nz; k++ )
             {
-                x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i));
-                s0 = updateOp(s0, x0);
+                sptr = src[k] + i;
+                s0 = updateOp(s0, vx_load(sptr));
+                s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
             }
-            _mm_storel_epi64((__m128i*)(dst + i), s0);
+            v_store(dst + i, s0);
+            v_store(dst + i + vtype::nlanes, s1);
+            i += 2*vtype::nlanes;
         }
-
-        return i/ESZ;
-    }
-};
-
-
-template<class VecUpdate> struct MorphFVec
-{
-    int operator()(uchar** _src, int nz, uchar* _dst, int width) const
-    {
-        if( !checkHardwareSupport(CV_CPU_SSE) )
-            return 0;
-
-        const float** src = (const float**)_src;
-        float* dst = (float*)_dst;
-        int i, k;
-        VecUpdate updateOp;
-
-        for( i = 0; i <= width - 16; i += 16 )
+        if( i <= width - vtype::nlanes )
         {
-            const float* sptr = src[0] + i;
-            __m128 s0 = _mm_loadu_ps(sptr);
-            __m128 s1 = _mm_loadu_ps(sptr + 4);
-            __m128 s2 = _mm_loadu_ps(sptr + 8);
-            __m128 s3 = _mm_loadu_ps(sptr + 12);
-            __m128 x0, x1, x2, x3;
-
-            for( k = 1; k < nz; k++ )
-            {
-                sptr = src[k] + i;
-                x0 = _mm_loadu_ps(sptr);
-                x1 = _mm_loadu_ps(sptr + 4);
-                x2 = _mm_loadu_ps(sptr + 8);
-                x3 = _mm_loadu_ps(sptr + 12);
-                s0 = updateOp(s0, x0);
-                s1 = updateOp(s1, x1);
-                s2 = updateOp(s2, x2);
-                s3 = updateOp(s3, x3);
-            }
-            _mm_storeu_ps(dst + i, s0);
-            _mm_storeu_ps(dst + i + 4, s1);
-            _mm_storeu_ps(dst + i + 8, s2);
-            _mm_storeu_ps(dst + i + 12, s3);
+            vtype s0 = vx_load(src[0] + i);
+            for( k = 1; k < nz; k++ )
+                s0 = updateOp(s0, vx_load(src[k] + i));
+            v_store(dst + i, s0);
+            i += vtype::nlanes;
         }
-
-        for( ; i <= width - 4; i += 4 )
+        if( i <= width - vtype::nlanes/2 )
         {
-            __m128 s0 = _mm_loadu_ps(src[0] + i), x0;
-
-            for( k = 1; k < nz; k++ )
-            {
-                x0 = _mm_loadu_ps(src[k] + i);
-                s0 = updateOp(s0, x0);
-            }
-            _mm_storeu_ps(dst + i, s0);
-        }
-
-        for( ; i < width; i++ )
-        {
-            __m128 s0 = _mm_load_ss(src[0] + i), x0;
-
-            for( k = 1; k < nz; k++ )
-            {
-                x0 = _mm_load_ss(src[k] + i);
-                s0 = updateOp(s0, x0);
-            }
-            _mm_store_ss(dst + i, s0);
+            vtype s0 = vx_load_low(src[0] + i);
+            for( k = 1; k < nz; k++ )
+                s0 = updateOp(s0, vx_load_low(src[k] + i));
+            v_store_low(dst + i, s0);
+            i += vtype::nlanes/2;
         }
 
         return i;
     }
 };
 
-struct VMin8u
-{
-    enum { ESZ = 1 };
-    __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_min_epu8(a,b); }
-};
-struct VMax8u
-{
-    enum { ESZ = 1 };
-    __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_max_epu8(a,b); }
-};
-struct VMin16u
-{
-    enum { ESZ = 2 };
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    { return _mm_subs_epu16(a,_mm_subs_epu16(a,b)); }
-};
-struct VMax16u
-{
-    enum { ESZ = 2 };
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    { return _mm_adds_epu16(_mm_subs_epu16(a,b),b); }
-};
-struct VMin16s
+template<typename T> struct VMin
 {
-    enum { ESZ = 2 };
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    { return _mm_min_epi16(a, b); }
+    typedef T vtype;
+    vtype operator()(const vtype& a, const vtype& b) const { return v_min(a,b); }
 };
-struct VMax16s
+template<typename T> struct VMax
 {
-    enum { ESZ = 2 };
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    { return _mm_max_epi16(a, b); }
+    typedef T vtype;
+    vtype operator()(const vtype& a, const vtype& b) const { return v_max(a,b); }
 };
-struct VMin32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_min_ps(a,b); }};
-struct VMax32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_max_ps(a,b); }};
 
-typedef MorphRowIVec<VMin8u> ErodeRowVec8u;
-typedef MorphRowIVec<VMax8u> DilateRowVec8u;
-typedef MorphRowIVec<VMin16u> ErodeRowVec16u;
-typedef MorphRowIVec<VMax16u> DilateRowVec16u;
-typedef MorphRowIVec<VMin16s> ErodeRowVec16s;
-typedef MorphRowIVec<VMax16s> DilateRowVec16s;
-typedef MorphRowFVec<VMin32f> ErodeRowVec32f;
-typedef MorphRowFVec<VMax32f> DilateRowVec32f;
+typedef MorphRowVec<VMin<v_uint8> > ErodeRowVec8u;
+typedef MorphRowVec<VMax<v_uint8> > DilateRowVec8u;
+typedef MorphRowVec<VMin<v_uint16> > ErodeRowVec16u;
+typedef MorphRowVec<VMax<v_uint16> > DilateRowVec16u;
+typedef MorphRowVec<VMin<v_int16> > ErodeRowVec16s;
+typedef MorphRowVec<VMax<v_int16> > DilateRowVec16s;
+typedef MorphRowVec<VMin<v_float32> > ErodeRowVec32f;
+typedef MorphRowVec<VMax<v_float32> > DilateRowVec32f;
 
-typedef MorphColumnIVec<VMin8u> ErodeColumnVec8u;
-typedef MorphColumnIVec<VMax8u> DilateColumnVec8u;
-typedef MorphColumnIVec<VMin16u> ErodeColumnVec16u;
-typedef MorphColumnIVec<VMax16u> DilateColumnVec16u;
-typedef MorphColumnIVec<VMin16s> ErodeColumnVec16s;
-typedef MorphColumnIVec<VMax16s> DilateColumnVec16s;
-typedef MorphColumnFVec<VMin32f> ErodeColumnVec32f;
-typedef MorphColumnFVec<VMax32f> DilateColumnVec32f;
+typedef MorphColumnVec<VMin<v_uint8> > ErodeColumnVec8u;
+typedef MorphColumnVec<VMax<v_uint8> > DilateColumnVec8u;
+typedef MorphColumnVec<VMin<v_uint16> > ErodeColumnVec16u;
+typedef MorphColumnVec<VMax<v_uint16> > DilateColumnVec16u;
+typedef MorphColumnVec<VMin<v_int16> > ErodeColumnVec16s;
+typedef MorphColumnVec<VMax<v_int16> > DilateColumnVec16s;
+typedef MorphColumnVec<VMin<v_float32> > ErodeColumnVec32f;
+typedef MorphColumnVec<VMax<v_float32> > DilateColumnVec32f;
 
-typedef MorphIVec<VMin8u> ErodeVec8u;
-typedef MorphIVec<VMax8u> DilateVec8u;
-typedef MorphIVec<VMin16u> ErodeVec16u;
-typedef MorphIVec<VMax16u> DilateVec16u;
-typedef MorphIVec<VMin16s> ErodeVec16s;
-typedef MorphIVec<VMax16s> DilateVec16s;
-typedef MorphFVec<VMin32f> ErodeVec32f;
-typedef MorphFVec<VMax32f> DilateVec32f;
+typedef MorphVec<VMin<v_uint8> > ErodeVec8u;
+typedef MorphVec<VMax<v_uint8> > DilateVec8u;
+typedef MorphVec<VMin<v_uint16> > ErodeVec16u;
+typedef MorphVec<VMax<v_uint16> > DilateVec16u;
+typedef MorphVec<VMin<v_int16> > ErodeVec16s;
+typedef MorphVec<VMax<v_int16> > DilateVec16s;
+typedef MorphVec<VMin<v_float32> > ErodeVec32f;
+typedef MorphVec<VMax<v_float32> > DilateVec32f;
 
 #else
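A side note on the deleted VMin16u/VMax16u helpers: SSE2 has no unsigned 16-bit min/max instruction (those arrived with SSE4.1), so the old code emulated them with saturating subtract/add; v_min/v_max now hide such per-ISA details. A self-contained check of the identities the old code relied on:

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

/* SSE2 lacks _mm_min_epu16/_mm_max_epu16; the removed functors used:
 *   min(a,b) == a - sat(a - b)       (subtraction saturating at 0)
 *   max(a,b) == sat(a - b) + b
 */
int main(void)
{
    uint16_t a = 40000, b = 1234;
    __m128i va = _mm_set1_epi16((short)a), vb = _mm_set1_epi16((short)b);
    __m128i vmin = _mm_subs_epu16(va, _mm_subs_epu16(va, vb));
    __m128i vmax = _mm_adds_epu16(_mm_subs_epu16(va, vb), vb);
    printf("min=%u max=%u\n",
           (unsigned)(uint16_t)_mm_extract_epi16(vmin, 0),
           (unsigned)(uint16_t)_mm_extract_epi16(vmax, 0));
    return 0;  /* prints min=1234 max=40000 */
}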
...
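The elided #else branch maps the same Erode*/Dilate* typedef names onto MorphNoVec (shown at the top of this diff), whose operator() reports zero processed elements; the surrounding filter engine then takes the scalar path. A sketch of that dispatch contract (process_row and NoVec are hypothetical stand-ins, not the commit's code):

typedef unsigned char uchar;

// A vector op returns how many elements it handled; zero means "no SIMD".
struct NoVec { int operator()(const uchar*, uchar*, int, int) const { return 0; } };

template<class VecOp>
static void process_row(const uchar* src, uchar* dst, int width, int cn, const VecOp& vecOp)
{
    int i = vecOp(src, dst, width, cn);  // 0 with NoVec: scalar loop does everything
    for( ; i < width; i++ )
        dst[i] = src[i];                 // stand-in for the real scalar kernel
}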