Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
a3916113
Commit
a3916113
authored
Oct 10, 2014
by
Vadim Pisarevsky
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #3254 from ilya-lavrenov:neon_scale_add
parents
f6b1c2a0
00f16e91
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
81 additions
and
3 deletions
+81
-3
convert.cpp
modules/core/src/convert.cpp
+36
-0
copy.cpp
modules/core/src/copy.cpp
+19
-0
mathfuncs.cpp
modules/core/src/mathfuncs.cpp
+26
-3
No files found.
modules/core/src/convert.cpp
View file @
a3916113
...
...
@@ -1541,6 +1541,20 @@ cvtScale_<short, short, float>( const short* src, size_t sstep,
_mm_storeu_si128
((
__m128i
*
)(
dst
+
x
),
r0
);
}
}
#elif CV_NEON
float32x4_t
v_shift
=
vdupq_n_f32
(
shift
);
for
(;
x
<=
size
.
width
-
8
;
x
+=
8
)
{
int16x8_t
v_src
=
vld1q_s16
(
src
+
x
);
float32x4_t
v_tmp1
=
vcvtq_f32_s32
(
vmovl_s16
(
vget_low_s16
(
v_src
)));
float32x4_t
v_tmp2
=
vcvtq_f32_s32
(
vmovl_s16
(
vget_high_s16
(
v_src
)));
v_tmp1
=
vaddq_f32
(
vmulq_n_f32
(
v_tmp1
,
scale
),
v_shift
);
v_tmp2
=
vaddq_f32
(
vmulq_n_f32
(
v_tmp2
,
scale
),
v_shift
);
vst1q_s16
(
dst
+
x
,
vcombine_s16
(
vqmovn_s32
(
cv_vrndq_s32_f32
(
v_tmp1
)),
vqmovn_s32
(
cv_vrndq_s32_f32
(
v_tmp2
))));
}
#endif
for
(;
x
<
size
.
width
;
x
++
)
...
...
@@ -1580,6 +1594,20 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
_mm_storeu_si128
((
__m128i
*
)(
dst
+
x
+
4
),
r1
);
}
}
#elif CV_NEON
float32x4_t
v_shift
=
vdupq_n_f32
(
shift
);
for
(;
x
<=
size
.
width
-
8
;
x
+=
8
)
{
int16x8_t
v_src
=
vld1q_s16
(
src
+
x
);
float32x4_t
v_tmp1
=
vcvtq_f32_s32
(
vmovl_s16
(
vget_low_s16
(
v_src
)));
float32x4_t
v_tmp2
=
vcvtq_f32_s32
(
vmovl_s16
(
vget_high_s16
(
v_src
)));
v_tmp1
=
vaddq_f32
(
vmulq_n_f32
(
v_tmp1
,
scale
),
v_shift
);
v_tmp2
=
vaddq_f32
(
vmulq_n_f32
(
v_tmp2
,
scale
),
v_shift
);
vst1q_s32
(
dst
+
x
,
cv_vrndq_s32_f32
(
v_tmp1
));
vst1q_s32
(
dst
+
x
+
4
,
cv_vrndq_s32_f32
(
v_tmp2
));
}
#endif
//We will wait Haswell
...
...
@@ -2134,6 +2162,14 @@ cvt_<float, short>( const float* src, size_t sstep,
_mm_storeu_si128
((
__m128i
*
)(
dst
+
x
),
src1_int128
);
}
}
#elif CV_NEON
for
(
;
x
<=
size
.
width
-
8
;
x
+=
8
)
{
float32x4_t
v_src1
=
vld1q_f32
(
src
+
x
),
v_src2
=
vld1q_f32
(
src
+
x
+
4
);
int16x8_t
v_dst
=
vcombine_s16
(
vqmovn_s32
(
cv_vrndq_s32_f32
(
v_src1
)),
vqmovn_s32
(
cv_vrndq_s32_f32
(
v_src2
)));
vst1q_s16
(
dst
+
x
,
v_dst
);
}
#endif
for
(
;
x
<
size
.
width
;
x
++
)
dst
[
x
]
=
saturate_cast
<
short
>
(
src
[
x
]);
...
...
modules/core/src/copy.cpp
View file @
a3916113
...
...
@@ -107,6 +107,14 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
_mm_storeu_si128
((
__m128i
*
)(
dst
+
x
),
rDst
);
}
}
#elif CV_NEON
uint8x16_t
v_zero
=
vdupq_n_u8
(
0
);
for
(
;
x
<=
size
.
width
-
16
;
x
+=
16
)
{
uint8x16_t
v_mask
=
vcgtq_u8
(
vld1q_u8
(
mask
+
x
),
v_zero
);
uint8x16_t
v_dst
=
vld1q_u8
(
dst
+
x
),
v_src
=
vld1q_u8
(
src
+
x
);
vst1q_u8
(
dst
+
x
,
vbslq_u8
(
v_mask
,
v_src
,
v_dst
));
}
#endif
for
(
;
x
<
size
.
width
;
x
++
)
if
(
mask
[
x
]
)
...
...
@@ -143,6 +151,17 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
_mm_storeu_si128
((
__m128i
*
)(
dst
+
x
),
rDst
);
}
}
#elif CV_NEON
uint8x8_t
v_zero
=
vdup_n_u8
(
0
);
for
(
;
x
<=
size
.
width
-
8
;
x
+=
8
)
{
uint8x8_t
v_mask
=
vcgt_u8
(
vld1_u8
(
mask
+
x
),
v_zero
);
uint8x8x2_t
v_mask2
=
vzip_u8
(
v_mask
,
v_mask
);
uint16x8_t
v_mask_res
=
vreinterpretq_u16_u8
(
vcombine_u8
(
v_mask2
.
val
[
0
],
v_mask2
.
val
[
1
]));
uint16x8_t
v_src
=
vld1q_u16
(
src
+
x
),
v_dst
=
vld1q_u16
(
dst
+
x
);
vst1q_u16
(
dst
+
x
,
vbslq_u16
(
v_mask_res
,
v_src
,
v_dst
));
}
#endif
for
(
;
x
<
size
.
width
;
x
++
)
if
(
mask
[
x
]
)
...
...
modules/core/src/mathfuncs.cpp
View file @
a3916113
...
...
@@ -261,6 +261,19 @@ static void Magnitude_32f(const float* x, const float* y, float* mag, int len)
_mm_storeu_ps
(
mag
+
i
,
x0
);
_mm_storeu_ps
(
mag
+
i
+
4
,
x1
);
}
}
#elif CV_NEON
float
CV_DECL_ALIGNED
(
16
)
m
[
4
];
for
(
;
i
<=
len
-
4
;
i
+=
4
)
{
float32x4_t
v_x
=
vld1q_f32
(
x
+
i
),
v_y
=
vld1q_f32
(
y
+
i
);
vst1q_f32
(
m
,
vaddq_f32
(
vmulq_f32
(
v_x
,
v_x
),
vmulq_f32
(
v_y
,
v_y
)));
mag
[
i
]
=
std
::
sqrt
(
m
[
0
]);
mag
[
i
+
1
]
=
std
::
sqrt
(
m
[
1
]);
mag
[
i
+
2
]
=
std
::
sqrt
(
m
[
2
]);
mag
[
i
+
3
]
=
std
::
sqrt
(
m
[
3
]);
}
#endif
for
(
;
i
<
len
;
i
++
)
...
...
@@ -2554,12 +2567,14 @@ void patchNaNs( InputOutputArray _a, double _val )
NAryMatIterator
it
(
arrays
,
(
uchar
**
)
ptrs
);
size_t
len
=
it
.
size
*
a
.
channels
();
Cv32suf
val
;
float
fval
=
(
float
)
_val
;
val
.
f
=
fval
;
val
.
f
=
(
float
)
_val
;
#if CV_SSE2
__m128i
v_mask1
=
_mm_set1_epi32
(
0x7fffffff
),
v_mask2
=
_mm_set1_epi32
(
0x7f800000
);
__m128i
v_val
=
_mm_set1_epi32
(
val
.
i
);
#elif CV_NEON
int32x4_t
v_mask1
=
vdupq_n_s32
(
0x7fffffff
),
v_mask2
=
vdupq_n_s32
(
0x7f800000
),
v_val
=
vdupq_n_s32
(
val
.
i
);
#endif
for
(
size_t
i
=
0
;
i
<
it
.
nplanes
;
i
++
,
++
it
)
...
...
@@ -2570,7 +2585,7 @@ void patchNaNs( InputOutputArray _a, double _val )
#if CV_SSE2
if
(
USE_SSE2
)
{
for
(
;
j
<
len
;
j
+=
4
)
for
(
;
j
+
4
<=
len
;
j
+=
4
)
{
__m128i
v_src
=
_mm_loadu_si128
((
__m128i
const
*
)(
tptr
+
j
));
__m128i
v_cmp_mask
=
_mm_cmplt_epi32
(
v_mask2
,
_mm_and_si128
(
v_src
,
v_mask1
));
...
...
@@ -2578,6 +2593,14 @@ void patchNaNs( InputOutputArray _a, double _val )
_mm_storeu_si128
((
__m128i
*
)(
tptr
+
j
),
v_res
);
}
}
#elif CV_NEON
for
(
;
j
+
4
<=
len
;
j
+=
4
)
{
int32x4_t
v_src
=
vld1q_s32
(
tptr
+
j
);
uint32x4_t
v_cmp_mask
=
vcltq_s32
(
v_mask2
,
vandq_s32
(
v_src
,
v_mask1
));
int32x4_t
v_dst
=
vbslq_s32
(
v_cmp_mask
,
v_val
,
v_src
);
vst1q_s32
(
tptr
+
j
,
v_dst
);
}
#endif
for
(
;
j
<
len
;
j
++
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment