Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
bebd49d9
Commit
bebd49d9
authored
Nov 02, 2016
by
Vadim Pisarevsky
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #7467 from tomoaki0705:featureCheckSimdUniversal
parents
050731c4
cba22349
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
190 additions
and
154 deletions
+190
-154
stereosgbm.cpp
modules/calib3d/src/stereosgbm.cpp
+5
-5
intrin_cpp.hpp
modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+11
-0
intrin_neon.hpp
modules/core/include/opencv2/core/hal/intrin_neon.hpp
+11
-0
intrin_sse.hpp
modules/core/include/opencv2/core/hal/intrin_sse.hpp
+11
-0
arithm_simd.hpp
modules/core/src/arithm_simd.hpp
+14
-14
canny.cpp
modules/imgproc/src/canny.cpp
+3
-3
spatialgradient.cpp
modules/imgproc/src/spatialgradient.cpp
+135
-132
No files found.
modules/calib3d/src/stereosgbm.cpp
View file @
bebd49d9
...
...
@@ -132,7 +132,7 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
const
PixType
*
row1
=
img1
.
ptr
<
PixType
>
(
y
),
*
row2
=
img2
.
ptr
<
PixType
>
(
y
);
PixType
*
prow1
=
buffer
+
width2
*
2
,
*
prow2
=
prow1
+
width
*
cn
*
2
;
#if CV_SIMD128
bool
useSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
bool
useSIMD
=
hasSIMD128
(
);
#endif
tab
+=
tabOfs
;
...
...
@@ -292,7 +292,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
};
static
const
v_uint16x8
v_LSB
=
v_uint16x8
(
0x1
,
0x2
,
0x4
,
0x8
,
0x10
,
0x20
,
0x40
,
0x80
);
bool
useSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
bool
useSIMD
=
hasSIMD128
(
);
#endif
const
int
ALIGN
=
16
;
...
...
@@ -891,7 +891,7 @@ buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_cli
ftzero
=
std
::
max
(
params
.
preFilterCap
,
15
)
|
1
;
#if CV_SIMD128
useSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
useSIMD
=
hasSIMD128
(
);
#endif
}
...
...
@@ -1054,7 +1054,7 @@ inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, Co
CostType
&
leftMinCost
,
CostType
&
topMinCost
,
int
D
,
int
P1
,
int
P2
)
{
#if CV_SIMD128
if
(
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
))
if
(
hasSIMD128
(
))
{
v_int16x8
P1_reg
=
v_setall_s16
(
cv
::
saturate_cast
<
CostType
>
(
P1
));
...
...
@@ -1166,7 +1166,7 @@ inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType*
CostType
&
rightMinCost
,
int
D
,
int
P1
,
int
P2
,
int
&
optimal_disp
,
CostType
&
min_cost
)
{
#if CV_SIMD128
if
(
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
))
if
(
hasSIMD128
(
))
{
v_int16x8
P1_reg
=
v_setall_s16
(
cv
::
saturate_cast
<
CostType
>
(
P1
));
...
...
modules/core/include/opencv2/core/hal/intrin_cpp.hpp
View file @
bebd49d9
...
...
@@ -1772,6 +1772,17 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
//! @}
//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation
//! @return always false in this implementation
//! NOTE(review): presumably because this header is the plain C++ fallback of the
//! intrinsics (no hardware SIMD path to enable) -- confirm against the other backends.
static inline bool hasSIMD128()
{
    const bool simdAvailable = false;
    return simdAvailable;
}
//! @}
}
#endif
modules/core/include/opencv2/core/hal/intrin_neon.hpp
View file @
bebd49d9
...
...
@@ -46,6 +46,7 @@
#define OPENCV_HAL_INTRIN_NEON_HPP
#include <algorithm>
#include "opencv2/core/utility.hpp"
namespace
cv
{
...
...
@@ -1216,6 +1217,16 @@ inline v_float16x4 v_cvt_f16(const v_float32x4& a)
}
#endif
//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation
//! @return the result of checkHardwareSupport(CV_CPU_NEON)
static inline bool hasSIMD128()
{
    // Forward the run-time NEON capability query unchanged to the caller.
    const bool neonAvailable = checkHardwareSupport(CV_CPU_NEON);
    return neonAvailable;
}
//! @}
//! @endcond
}
...
...
modules/core/include/opencv2/core/hal/intrin_sse.hpp
View file @
bebd49d9
...
...
@@ -46,6 +46,7 @@
#define OPENCV_HAL_SSE_HPP
#include <algorithm>
#include "opencv2/core/utility.hpp"
#define CV_SIMD128 1
#define CV_SIMD128_64F 1
...
...
@@ -1726,6 +1727,16 @@ inline v_float16x4 v_cvt_f16(const v_float32x4& a)
}
#endif
//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation
//! @return the result of checkHardwareSupport(CV_CPU_SSE2)
static inline bool hasSIMD128()
{
    // Forward the run-time SSE2 capability query unchanged to the caller.
    const bool sse2Available = checkHardwareSupport(CV_CPU_SSE2);
    return sse2Available;
}
//! @}
//! @endcond
}
...
...
modules/core/src/arithm_simd.hpp
View file @
bebd49d9
...
...
@@ -1197,7 +1197,7 @@ template <>
struct
Div_SIMD
<
uchar
>
{
bool
haveSIMD
;
Div_SIMD
()
{
haveSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
}
Div_SIMD
()
{
haveSIMD
=
hasSIMD128
(
);
}
int
operator
()
(
const
uchar
*
src1
,
const
uchar
*
src2
,
uchar
*
dst
,
int
width
,
double
scale
)
const
{
...
...
@@ -1243,7 +1243,7 @@ template <>
struct
Div_SIMD
<
schar
>
{
bool
haveSIMD
;
Div_SIMD
()
{
haveSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
}
Div_SIMD
()
{
haveSIMD
=
hasSIMD128
(
);
}
int
operator
()
(
const
schar
*
src1
,
const
schar
*
src2
,
schar
*
dst
,
int
width
,
double
scale
)
const
{
...
...
@@ -1289,7 +1289,7 @@ template <>
struct
Div_SIMD
<
ushort
>
{
bool
haveSIMD
;
Div_SIMD
()
{
haveSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
}
Div_SIMD
()
{
haveSIMD
=
hasSIMD128
(
);
}
int
operator
()
(
const
ushort
*
src1
,
const
ushort
*
src2
,
ushort
*
dst
,
int
width
,
double
scale
)
const
{
...
...
@@ -1334,7 +1334,7 @@ template <>
struct
Div_SIMD
<
short
>
{
bool
haveSIMD
;
Div_SIMD
()
{
haveSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
}
Div_SIMD
()
{
haveSIMD
=
hasSIMD128
(
);
}
int
operator
()
(
const
short
*
src1
,
const
short
*
src2
,
short
*
dst
,
int
width
,
double
scale
)
const
{
...
...
@@ -1379,7 +1379,7 @@ template <>
struct
Div_SIMD
<
int
>
{
bool
haveSIMD
;
Div_SIMD
()
{
haveSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
}
Div_SIMD
()
{
haveSIMD
=
hasSIMD128
(
);
}
int
operator
()
(
const
int
*
src1
,
const
int
*
src2
,
int
*
dst
,
int
width
,
double
scale
)
const
{
...
...
@@ -1423,7 +1423,7 @@ template <>
struct
Div_SIMD
<
float
>
{
bool
haveSIMD
;
Div_SIMD
()
{
haveSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
}
Div_SIMD
()
{
haveSIMD
=
hasSIMD128
(
);
}
int
operator
()
(
const
float
*
src1
,
const
float
*
src2
,
float
*
dst
,
int
width
,
double
scale
)
const
{
...
...
@@ -1463,7 +1463,7 @@ template <>
struct
Recip_SIMD
<
uchar
>
{
bool
haveSIMD
;
Recip_SIMD
()
{
haveSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
}
Recip_SIMD
()
{
haveSIMD
=
hasSIMD128
(
);
}
int
operator
()
(
const
uchar
*
src2
,
uchar
*
dst
,
int
width
,
double
scale
)
const
{
...
...
@@ -1504,7 +1504,7 @@ template <>
struct
Recip_SIMD
<
schar
>
{
bool
haveSIMD
;
Recip_SIMD
()
{
haveSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
}
Recip_SIMD
()
{
haveSIMD
=
hasSIMD128
(
);
}
int
operator
()
(
const
schar
*
src2
,
schar
*
dst
,
int
width
,
double
scale
)
const
{
...
...
@@ -1545,7 +1545,7 @@ template <>
struct
Recip_SIMD
<
ushort
>
{
bool
haveSIMD
;
Recip_SIMD
()
{
haveSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
}
Recip_SIMD
()
{
haveSIMD
=
hasSIMD128
(
);
}
int
operator
()
(
const
ushort
*
src2
,
ushort
*
dst
,
int
width
,
double
scale
)
const
{
...
...
@@ -1585,7 +1585,7 @@ template <>
struct
Recip_SIMD
<
short
>
{
bool
haveSIMD
;
Recip_SIMD
()
{
haveSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
}
Recip_SIMD
()
{
haveSIMD
=
hasSIMD128
(
);
}
int
operator
()
(
const
short
*
src2
,
short
*
dst
,
int
width
,
double
scale
)
const
{
...
...
@@ -1625,7 +1625,7 @@ template <>
struct
Recip_SIMD
<
int
>
{
bool
haveSIMD
;
Recip_SIMD
()
{
haveSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
}
Recip_SIMD
()
{
haveSIMD
=
hasSIMD128
(
);
}
int
operator
()
(
const
int
*
src2
,
int
*
dst
,
int
width
,
double
scale
)
const
{
...
...
@@ -1665,7 +1665,7 @@ template <>
struct
Recip_SIMD
<
float
>
{
bool
haveSIMD
;
Recip_SIMD
()
{
haveSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
}
Recip_SIMD
()
{
haveSIMD
=
hasSIMD128
(
);
}
int
operator
()
(
const
float
*
src2
,
float
*
dst
,
int
width
,
double
scale
)
const
{
...
...
@@ -1702,7 +1702,7 @@ template <>
struct
Div_SIMD
<
double
>
{
bool
haveSIMD
;
Div_SIMD
()
{
haveSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
}
Div_SIMD
()
{
haveSIMD
=
hasSIMD128
(
);
}
int
operator
()
(
const
double
*
src1
,
const
double
*
src2
,
double
*
dst
,
int
width
,
double
scale
)
const
{
...
...
@@ -1739,7 +1739,7 @@ template <>
struct
Recip_SIMD
<
double
>
{
bool
haveSIMD
;
Recip_SIMD
()
{
haveSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
}
Recip_SIMD
()
{
haveSIMD
=
hasSIMD128
(
);
}
int
operator
()
(
const
double
*
src2
,
double
*
dst
,
int
width
,
double
scale
)
const
{
...
...
modules/imgproc/src/canny.cpp
View file @
bebd49d9
...
...
@@ -301,7 +301,7 @@ public:
void
operator
()(
const
Range
&
boundaries
)
const
{
#if CV_SIMD128
bool
haveSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
bool
haveSIMD
=
hasSIMD128
(
);
#endif
const
int
type
=
src
.
type
(),
cn
=
CV_MAT_CN
(
type
);
...
...
@@ -709,7 +709,7 @@ public:
uchar
*
pdst
=
dst
.
ptr
()
+
(
ptrdiff_t
)(
dst
.
step
*
boundaries
.
start
);
#if CV_SIMD128
bool
haveSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
bool
haveSIMD
=
hasSIMD128
(
);
#endif
for
(
int
i
=
boundaries
.
start
;
i
<
boundaries
.
end
;
i
++
,
pmap
+=
mapstep
,
pdst
+=
dst
.
step
)
...
...
@@ -962,7 +962,7 @@ static void CannyImpl(Mat& dx, Mat& dy, Mat& dst,
#define CANNY_POP(d) (d) = *--stack_top
#if CV_SIMD128
bool
haveSIMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
bool
haveSIMD
=
hasSIMD128
(
);
#endif
// calculate magnitude and angle of gradient, perform non-maxima suppression.
...
...
modules/imgproc/src/spatialgradient.cpp
View file @
bebd49d9
...
...
@@ -130,140 +130,143 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy,
int
i_start
=
0
;
int
j_start
=
0
;
#if CV_SIMD128 && CV_SSE2
uchar
*
m_src
;
short
*
n_dx
,
*
n_dy
;
// Characters in variable names have the following meanings:
// u: unsigned char
// s: signed int
//
// [row][column]
// m: offset -1
// n: offset 0
// p: offset 1
// Example: umn is offset -1 in row and offset 0 in column
for
(
i
=
0
;
i
<
H
-
1
;
i
+=
2
)
if
(
hasSIMD128
())
{
if
(
i
==
0
)
p_src
=
src
.
ptr
<
uchar
>
(
i_top
);
else
p_src
=
src
.
ptr
<
uchar
>
(
i
-
1
);
c_src
=
src
.
ptr
<
uchar
>
(
i
);
n_src
=
src
.
ptr
<
uchar
>
(
i
+
1
);
if
(
i
==
H
-
2
)
m_src
=
src
.
ptr
<
uchar
>
(
i_bottom
);
else
m_src
=
src
.
ptr
<
uchar
>
(
i
+
2
);
c_dx
=
dx
.
ptr
<
short
>
(
i
);
c_dy
=
dy
.
ptr
<
short
>
(
i
);
n_dx
=
dx
.
ptr
<
short
>
(
i
+
1
);
n_dy
=
dy
.
ptr
<
short
>
(
i
+
1
);
v_uint8x16
v_select_m
=
v_uint8x16
(
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0xFF
);
// Process rest of columns 16-column chunks at a time
for
(
j
=
1
;
j
<
W
-
16
;
j
+=
16
)
uchar
*
m_src
;
short
*
n_dx
,
*
n_dy
;
// Characters in variable names have the following meanings:
// u: unsigned char
// s: signed int
//
// [row][column]
// m: offset -1
// n: offset 0
// p: offset 1
// Example: umn is offset -1 in row and offset 0 in column
for
(
i
=
0
;
i
<
H
-
1
;
i
+=
2
)
{
// Load top row for 3x3 Sobel filter
v_uint8x16
v_um
=
v_load
(
&
p_src
[
j
-
1
]);
v_uint8x16
v_up
=
v_load
(
&
p_src
[
j
+
1
]);
// TODO: Replace _mm_slli_si128 with hal method
v_uint8x16
v_un
=
v_select
(
v_select_m
,
v_uint8x16
(
_mm_slli_si128
(
v_up
.
val
,
1
)),
v_uint8x16
(
_mm_srli_si128
(
v_um
.
val
,
1
)));
v_uint16x8
v_um1
,
v_um2
,
v_un1
,
v_un2
,
v_up1
,
v_up2
;
v_expand
(
v_um
,
v_um1
,
v_um2
);
v_expand
(
v_un
,
v_un1
,
v_un2
);
v_expand
(
v_up
,
v_up1
,
v_up2
);
v_int16x8
v_s1m1
=
v_reinterpret_as_s16
(
v_um1
);
v_int16x8
v_s1m2
=
v_reinterpret_as_s16
(
v_um2
);
v_int16x8
v_s1n1
=
v_reinterpret_as_s16
(
v_un1
);
v_int16x8
v_s1n2
=
v_reinterpret_as_s16
(
v_un2
);
v_int16x8
v_s1p1
=
v_reinterpret_as_s16
(
v_up1
);
v_int16x8
v_s1p2
=
v_reinterpret_as_s16
(
v_up2
);
// Load second row for 3x3 Sobel filter
v_um
=
v_load
(
&
c_src
[
j
-
1
]);
v_up
=
v_load
(
&
c_src
[
j
+
1
]);
// TODO: Replace _mm_slli_si128 with hal method
v_un
=
v_select
(
v_select_m
,
v_uint8x16
(
_mm_slli_si128
(
v_up
.
val
,
1
)),
v_uint8x16
(
_mm_srli_si128
(
v_um
.
val
,
1
)));
v_expand
(
v_um
,
v_um1
,
v_um2
);
v_expand
(
v_un
,
v_un1
,
v_un2
);
v_expand
(
v_up
,
v_up1
,
v_up2
);
v_int16x8
v_s2m1
=
v_reinterpret_as_s16
(
v_um1
);
v_int16x8
v_s2m2
=
v_reinterpret_as_s16
(
v_um2
);
v_int16x8
v_s2n1
=
v_reinterpret_as_s16
(
v_un1
);
v_int16x8
v_s2n2
=
v_reinterpret_as_s16
(
v_un2
);
v_int16x8
v_s2p1
=
v_reinterpret_as_s16
(
v_up1
);
v_int16x8
v_s2p2
=
v_reinterpret_as_s16
(
v_up2
);
// Load third row for 3x3 Sobel filter
v_um
=
v_load
(
&
n_src
[
j
-
1
]);
v_up
=
v_load
(
&
n_src
[
j
+
1
]);
// TODO: Replace _mm_slli_si128 with hal method
v_un
=
v_select
(
v_select_m
,
v_uint8x16
(
_mm_slli_si128
(
v_up
.
val
,
1
)),
v_uint8x16
(
_mm_srli_si128
(
v_um
.
val
,
1
)));
v_expand
(
v_um
,
v_um1
,
v_um2
);
v_expand
(
v_un
,
v_un1
,
v_un2
);
v_expand
(
v_up
,
v_up1
,
v_up2
);
v_int16x8
v_s3m1
=
v_reinterpret_as_s16
(
v_um1
);
v_int16x8
v_s3m2
=
v_reinterpret_as_s16
(
v_um2
);
v_int16x8
v_s3n1
=
v_reinterpret_as_s16
(
v_un1
);
v_int16x8
v_s3n2
=
v_reinterpret_as_s16
(
v_un2
);
v_int16x8
v_s3p1
=
v_reinterpret_as_s16
(
v_up1
);
v_int16x8
v_s3p2
=
v_reinterpret_as_s16
(
v_up2
);
// dx & dy for rows 1, 2, 3
v_int16x8
v_sdx1
,
v_sdy1
;
spatialGradientKernel
<
v_int16x8
>
(
v_sdx1
,
v_sdy1
,
v_s1m1
,
v_s1n1
,
v_s1p1
,
v_s2m1
,
v_s2p1
,
v_s3m1
,
v_s3n1
,
v_s3p1
);
v_int16x8
v_sdx2
,
v_sdy2
;
spatialGradientKernel
<
v_int16x8
>
(
v_sdx2
,
v_sdy2
,
v_s1m2
,
v_s1n2
,
v_s1p2
,
v_s2m2
,
v_s2p2
,
v_s3m2
,
v_s3n2
,
v_s3p2
);
// Store
v_store
(
&
c_dx
[
j
],
v_sdx1
);
v_store
(
&
c_dx
[
j
+
8
],
v_sdx2
);
v_store
(
&
c_dy
[
j
],
v_sdy1
);
v_store
(
&
c_dy
[
j
+
8
],
v_sdy2
);
// Load fourth row for 3x3 Sobel filter
v_um
=
v_load
(
&
m_src
[
j
-
1
]);
v_up
=
v_load
(
&
m_src
[
j
+
1
]);
// TODO: Replace _mm_slli_si128 with hal method
v_un
=
v_select
(
v_select_m
,
v_uint8x16
(
_mm_slli_si128
(
v_up
.
val
,
1
)),
v_uint8x16
(
_mm_srli_si128
(
v_um
.
val
,
1
)));
v_expand
(
v_um
,
v_um1
,
v_um2
);
v_expand
(
v_un
,
v_un1
,
v_un2
);
v_expand
(
v_up
,
v_up1
,
v_up2
);
v_int16x8
v_s4m1
=
v_reinterpret_as_s16
(
v_um1
);
v_int16x8
v_s4m2
=
v_reinterpret_as_s16
(
v_um2
);
v_int16x8
v_s4n1
=
v_reinterpret_as_s16
(
v_un1
);
v_int16x8
v_s4n2
=
v_reinterpret_as_s16
(
v_un2
);
v_int16x8
v_s4p1
=
v_reinterpret_as_s16
(
v_up1
);
v_int16x8
v_s4p2
=
v_reinterpret_as_s16
(
v_up2
);
// dx & dy for rows 2, 3, 4
spatialGradientKernel
<
v_int16x8
>
(
v_sdx1
,
v_sdy1
,
v_s2m1
,
v_s2n1
,
v_s2p1
,
v_s3m1
,
v_s3p1
,
v_s4m1
,
v_s4n1
,
v_s4p1
);
spatialGradientKernel
<
v_int16x8
>
(
v_sdx2
,
v_sdy2
,
v_s2m2
,
v_s2n2
,
v_s2p2
,
v_s3m2
,
v_s3p2
,
v_s4m2
,
v_s4n2
,
v_s4p2
);
// Store
v_store
(
&
n_dx
[
j
],
v_sdx1
);
v_store
(
&
n_dx
[
j
+
8
],
v_sdx2
);
v_store
(
&
n_dy
[
j
],
v_sdy1
);
v_store
(
&
n_dy
[
j
+
8
],
v_sdy2
);
if
(
i
==
0
)
p_src
=
src
.
ptr
<
uchar
>
(
i_top
);
else
p_src
=
src
.
ptr
<
uchar
>
(
i
-
1
);
c_src
=
src
.
ptr
<
uchar
>
(
i
);
n_src
=
src
.
ptr
<
uchar
>
(
i
+
1
);
if
(
i
==
H
-
2
)
m_src
=
src
.
ptr
<
uchar
>
(
i_bottom
);
else
m_src
=
src
.
ptr
<
uchar
>
(
i
+
2
);
c_dx
=
dx
.
ptr
<
short
>
(
i
);
c_dy
=
dy
.
ptr
<
short
>
(
i
);
n_dx
=
dx
.
ptr
<
short
>
(
i
+
1
);
n_dy
=
dy
.
ptr
<
short
>
(
i
+
1
);
v_uint8x16
v_select_m
=
v_uint8x16
(
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0xFF
);
// Process rest of columns 16-column chunks at a time
for
(
j
=
1
;
j
<
W
-
16
;
j
+=
16
)
{
// Load top row for 3x3 Sobel filter
v_uint8x16
v_um
=
v_load
(
&
p_src
[
j
-
1
]);
v_uint8x16
v_up
=
v_load
(
&
p_src
[
j
+
1
]);
// TODO: Replace _mm_slli_si128 with hal method
v_uint8x16
v_un
=
v_select
(
v_select_m
,
v_uint8x16
(
_mm_slli_si128
(
v_up
.
val
,
1
)),
v_uint8x16
(
_mm_srli_si128
(
v_um
.
val
,
1
)));
v_uint16x8
v_um1
,
v_um2
,
v_un1
,
v_un2
,
v_up1
,
v_up2
;
v_expand
(
v_um
,
v_um1
,
v_um2
);
v_expand
(
v_un
,
v_un1
,
v_un2
);
v_expand
(
v_up
,
v_up1
,
v_up2
);
v_int16x8
v_s1m1
=
v_reinterpret_as_s16
(
v_um1
);
v_int16x8
v_s1m2
=
v_reinterpret_as_s16
(
v_um2
);
v_int16x8
v_s1n1
=
v_reinterpret_as_s16
(
v_un1
);
v_int16x8
v_s1n2
=
v_reinterpret_as_s16
(
v_un2
);
v_int16x8
v_s1p1
=
v_reinterpret_as_s16
(
v_up1
);
v_int16x8
v_s1p2
=
v_reinterpret_as_s16
(
v_up2
);
// Load second row for 3x3 Sobel filter
v_um
=
v_load
(
&
c_src
[
j
-
1
]);
v_up
=
v_load
(
&
c_src
[
j
+
1
]);
// TODO: Replace _mm_slli_si128 with hal method
v_un
=
v_select
(
v_select_m
,
v_uint8x16
(
_mm_slli_si128
(
v_up
.
val
,
1
)),
v_uint8x16
(
_mm_srli_si128
(
v_um
.
val
,
1
)));
v_expand
(
v_um
,
v_um1
,
v_um2
);
v_expand
(
v_un
,
v_un1
,
v_un2
);
v_expand
(
v_up
,
v_up1
,
v_up2
);
v_int16x8
v_s2m1
=
v_reinterpret_as_s16
(
v_um1
);
v_int16x8
v_s2m2
=
v_reinterpret_as_s16
(
v_um2
);
v_int16x8
v_s2n1
=
v_reinterpret_as_s16
(
v_un1
);
v_int16x8
v_s2n2
=
v_reinterpret_as_s16
(
v_un2
);
v_int16x8
v_s2p1
=
v_reinterpret_as_s16
(
v_up1
);
v_int16x8
v_s2p2
=
v_reinterpret_as_s16
(
v_up2
);
// Load third row for 3x3 Sobel filter
v_um
=
v_load
(
&
n_src
[
j
-
1
]);
v_up
=
v_load
(
&
n_src
[
j
+
1
]);
// TODO: Replace _mm_slli_si128 with hal method
v_un
=
v_select
(
v_select_m
,
v_uint8x16
(
_mm_slli_si128
(
v_up
.
val
,
1
)),
v_uint8x16
(
_mm_srli_si128
(
v_um
.
val
,
1
)));
v_expand
(
v_um
,
v_um1
,
v_um2
);
v_expand
(
v_un
,
v_un1
,
v_un2
);
v_expand
(
v_up
,
v_up1
,
v_up2
);
v_int16x8
v_s3m1
=
v_reinterpret_as_s16
(
v_um1
);
v_int16x8
v_s3m2
=
v_reinterpret_as_s16
(
v_um2
);
v_int16x8
v_s3n1
=
v_reinterpret_as_s16
(
v_un1
);
v_int16x8
v_s3n2
=
v_reinterpret_as_s16
(
v_un2
);
v_int16x8
v_s3p1
=
v_reinterpret_as_s16
(
v_up1
);
v_int16x8
v_s3p2
=
v_reinterpret_as_s16
(
v_up2
);
// dx & dy for rows 1, 2, 3
v_int16x8
v_sdx1
,
v_sdy1
;
spatialGradientKernel
<
v_int16x8
>
(
v_sdx1
,
v_sdy1
,
v_s1m1
,
v_s1n1
,
v_s1p1
,
v_s2m1
,
v_s2p1
,
v_s3m1
,
v_s3n1
,
v_s3p1
);
v_int16x8
v_sdx2
,
v_sdy2
;
spatialGradientKernel
<
v_int16x8
>
(
v_sdx2
,
v_sdy2
,
v_s1m2
,
v_s1n2
,
v_s1p2
,
v_s2m2
,
v_s2p2
,
v_s3m2
,
v_s3n2
,
v_s3p2
);
// Store
v_store
(
&
c_dx
[
j
],
v_sdx1
);
v_store
(
&
c_dx
[
j
+
8
],
v_sdx2
);
v_store
(
&
c_dy
[
j
],
v_sdy1
);
v_store
(
&
c_dy
[
j
+
8
],
v_sdy2
);
// Load fourth row for 3x3 Sobel filter
v_um
=
v_load
(
&
m_src
[
j
-
1
]);
v_up
=
v_load
(
&
m_src
[
j
+
1
]);
// TODO: Replace _mm_slli_si128 with hal method
v_un
=
v_select
(
v_select_m
,
v_uint8x16
(
_mm_slli_si128
(
v_up
.
val
,
1
)),
v_uint8x16
(
_mm_srli_si128
(
v_um
.
val
,
1
)));
v_expand
(
v_um
,
v_um1
,
v_um2
);
v_expand
(
v_un
,
v_un1
,
v_un2
);
v_expand
(
v_up
,
v_up1
,
v_up2
);
v_int16x8
v_s4m1
=
v_reinterpret_as_s16
(
v_um1
);
v_int16x8
v_s4m2
=
v_reinterpret_as_s16
(
v_um2
);
v_int16x8
v_s4n1
=
v_reinterpret_as_s16
(
v_un1
);
v_int16x8
v_s4n2
=
v_reinterpret_as_s16
(
v_un2
);
v_int16x8
v_s4p1
=
v_reinterpret_as_s16
(
v_up1
);
v_int16x8
v_s4p2
=
v_reinterpret_as_s16
(
v_up2
);
// dx & dy for rows 2, 3, 4
spatialGradientKernel
<
v_int16x8
>
(
v_sdx1
,
v_sdy1
,
v_s2m1
,
v_s2n1
,
v_s2p1
,
v_s3m1
,
v_s3p1
,
v_s4m1
,
v_s4n1
,
v_s4p1
);
spatialGradientKernel
<
v_int16x8
>
(
v_sdx2
,
v_sdy2
,
v_s2m2
,
v_s2n2
,
v_s2p2
,
v_s3m2
,
v_s3p2
,
v_s4m2
,
v_s4n2
,
v_s4p2
);
// Store
v_store
(
&
n_dx
[
j
],
v_sdx1
);
v_store
(
&
n_dx
[
j
+
8
],
v_sdx2
);
v_store
(
&
n_dy
[
j
],
v_sdy1
);
v_store
(
&
n_dy
[
j
+
8
],
v_sdy2
);
}
}
}
i_start
=
i
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment