Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
9ef373f6
Commit
9ef373f6
authored
Aug 10, 2014
by
Vadim Pisarevsky
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #3038 from yury-gorbachev:core_arithm_neon
parents
4de4ff56
9a233999
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
236 additions
and
8 deletions
+236
-8
arithm.cpp
modules/core/src/arithm.cpp
+236
-8
No files found.
modules/core/src/arithm.cpp
View file @
9ef373f6
...
...
@@ -2440,6 +2440,34 @@ addWeighted8u( const uchar* src1, size_t step1,
_mm_storel_epi64
((
__m128i
*
)(
dst
+
x
),
u
);
}
}
#elif CV_NEON
float32x4_t
g
=
vdupq_n_f32
(
gamma
);
for
(
;
x
<=
size
.
width
-
8
;
x
+=
8
)
{
uint8x8_t
in1
=
vld1_u8
(
src1
+
x
);
uint16x8_t
in1_16
=
vmovl_u8
(
in1
);
float32x4_t
in1_f_l
=
vcvtq_f32_u32
(
vmovl_u16
(
vget_low_u16
(
in1_16
)));
float32x4_t
in1_f_h
=
vcvtq_f32_u32
(
vmovl_u16
(
vget_high_u16
(
in1_16
)));
uint8x8_t
in2
=
vld1_u8
(
src2
+
x
);
uint16x8_t
in2_16
=
vmovl_u8
(
in2
);
float32x4_t
in2_f_l
=
vcvtq_f32_u32
(
vmovl_u16
(
vget_low_u16
(
in2_16
)));
float32x4_t
in2_f_h
=
vcvtq_f32_u32
(
vmovl_u16
(
vget_high_u16
(
in2_16
)));
float32x4_t
out_f_l
=
vaddq_f32
(
vmulq_n_f32
(
in1_f_l
,
alpha
),
vmulq_n_f32
(
in2_f_l
,
beta
));
float32x4_t
out_f_h
=
vaddq_f32
(
vmulq_n_f32
(
in1_f_h
,
alpha
),
vmulq_n_f32
(
in2_f_h
,
beta
));
out_f_l
=
vaddq_f32
(
out_f_l
,
g
);
out_f_h
=
vaddq_f32
(
out_f_h
,
g
);
uint16x4_t
out_16_l
=
vqmovun_s32
(
vcvtq_s32_f32
(
out_f_l
));
uint16x4_t
out_16_h
=
vqmovun_s32
(
vcvtq_s32_f32
(
out_f_h
));
uint16x8_t
out_16
=
vcombine_u16
(
out_16_l
,
out_16_h
);
uint8x8_t
out
=
vqmovn_u16
(
out_16
);
vst1_u8
(
dst
+
x
,
out
);
}
#endif
#if CV_ENABLE_UNROLLED
for
(
;
x
<=
size
.
width
-
4
;
x
+=
4
)
...
...
@@ -2650,6 +2678,14 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste
}
}
#elif CV_NEON
uint8x16_t
mask
=
code
==
CMP_GT
?
vdupq_n_u8
(
0
)
:
vdupq_n_u8
(
255
);
for
(
;
x
<=
size
.
width
-
16
;
x
+=
16
)
{
vst1q_u8
(
dst
+
x
,
veorq_u8
(
vcgtq_u8
(
vld1q_u8
(
src1
+
x
),
vld1q_u8
(
src2
+
x
)),
mask
));
}
#endif
for
(
;
x
<
size
.
width
;
x
++
){
...
...
@@ -2674,6 +2710,13 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste
_mm_storeu_si128
((
__m128i
*
)(
dst
+
x
),
r00
);
}
}
#elif CV_NEON
uint8x16_t
mask
=
code
==
CMP_EQ
?
vdupq_n_u8
(
0
)
:
vdupq_n_u8
(
255
);
for
(
;
x
<=
size
.
width
-
16
;
x
+=
16
)
{
vst1q_u8
(
dst
+
x
,
veorq_u8
(
vceqq_u8
(
vld1q_u8
(
src1
+
x
),
vld1q_u8
(
src2
+
x
)),
mask
));
}
#endif
for
(
;
x
<
size
.
width
;
x
++
)
dst
[
x
]
=
(
uchar
)(
-
(
src1
[
x
]
==
src2
[
x
])
^
m
);
...
...
@@ -2759,6 +2802,22 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
x
+=
8
;
}
}
#elif CV_NEON
uint8x16_t
mask
=
code
==
CMP_GT
?
vdupq_n_u8
(
0
)
:
vdupq_n_u8
(
255
);
for
(
;
x
<=
size
.
width
-
16
;
x
+=
16
)
{
int16x8_t
in1
=
vld1q_s16
(
src1
+
x
);
int16x8_t
in2
=
vld1q_s16
(
src2
+
x
);
uint8x8_t
t1
=
vmovn_u16
(
vcgtq_s16
(
in1
,
in2
));
in1
=
vld1q_s16
(
src1
+
x
+
8
);
in2
=
vld1q_s16
(
src2
+
x
+
8
);
uint8x8_t
t2
=
vmovn_u16
(
vcgtq_s16
(
in1
,
in2
));
vst1q_u8
(
dst
+
x
,
veorq_u8
(
vcombine_u8
(
t1
,
t2
),
mask
));
}
#endif
for
(
;
x
<
size
.
width
;
x
++
){
...
...
@@ -2797,6 +2856,21 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
x
+=
8
;
}
}
#elif CV_NEON
uint8x16_t
mask
=
code
==
CMP_EQ
?
vdupq_n_u8
(
0
)
:
vdupq_n_u8
(
255
);
for
(
;
x
<=
size
.
width
-
16
;
x
+=
16
)
{
int16x8_t
in1
=
vld1q_s16
(
src1
+
x
);
int16x8_t
in2
=
vld1q_s16
(
src2
+
x
);
uint8x8_t
t1
=
vmovn_u16
(
vceqq_s16
(
in1
,
in2
));
in1
=
vld1q_s16
(
src1
+
x
+
8
);
in2
=
vld1q_s16
(
src2
+
x
+
8
);
uint8x8_t
t2
=
vmovn_u16
(
vceqq_s16
(
in1
,
in2
));
vst1q_u8
(
dst
+
x
,
veorq_u8
(
vcombine_u8
(
t1
,
t2
),
mask
));
}
#endif
for
(
;
x
<
size
.
width
;
x
++
)
dst
[
x
]
=
(
uchar
)(
-
(
src1
[
x
]
==
src2
[
x
])
^
m
);
...
...
@@ -3085,7 +3159,7 @@ namespace cv
{
template
<
typename
T
>
struct
InRange_S
SE
struct
InRange_S
IMD
{
int
operator
()
(
const
T
*
,
const
T
*
,
const
T
*
,
uchar
*
,
int
)
const
{
...
...
@@ -3096,7 +3170,7 @@ struct InRange_SSE
#if CV_SSE2
template
<>
struct
InRange_S
SE
<
uchar
>
struct
InRange_S
IMD
<
uchar
>
{
int
operator
()
(
const
uchar
*
src1
,
const
uchar
*
src2
,
const
uchar
*
src3
,
uchar
*
dst
,
int
len
)
const
...
...
@@ -3121,7 +3195,7 @@ struct InRange_SSE<uchar>
};
template
<>
struct
InRange_S
SE
<
schar
>
struct
InRange_S
IMD
<
schar
>
{
int
operator
()
(
const
schar
*
src1
,
const
schar
*
src2
,
const
schar
*
src3
,
uchar
*
dst
,
int
len
)
const
...
...
@@ -3146,7 +3220,7 @@ struct InRange_SSE<schar>
};
template
<>
struct
InRange_S
SE
<
ushort
>
struct
InRange_S
IMD
<
ushort
>
{
int
operator
()
(
const
ushort
*
src1
,
const
ushort
*
src2
,
const
ushort
*
src3
,
uchar
*
dst
,
int
len
)
const
...
...
@@ -3172,7 +3246,7 @@ struct InRange_SSE<ushort>
};
template
<>
struct
InRange_S
SE
<
short
>
struct
InRange_S
IMD
<
short
>
{
int
operator
()
(
const
short
*
src1
,
const
short
*
src2
,
const
short
*
src3
,
uchar
*
dst
,
int
len
)
const
...
...
@@ -3198,7 +3272,7 @@ struct InRange_SSE<short>
};
template
<>
struct
InRange_S
SE
<
int
>
struct
InRange_S
IMD
<
int
>
{
int
operator
()
(
const
int
*
src1
,
const
int
*
src2
,
const
int
*
src3
,
uchar
*
dst
,
int
len
)
const
...
...
@@ -3230,7 +3304,7 @@ struct InRange_SSE<int>
};
template
<>
struct
InRange_S
SE
<
float
>
struct
InRange_S
IMD
<
float
>
{
int
operator
()
(
const
float
*
src1
,
const
float
*
src2
,
const
float
*
src3
,
uchar
*
dst
,
int
len
)
const
...
...
@@ -3261,6 +3335,160 @@ struct InRange_SSE<float>
}
};
#elif CV_NEON
template
<>
struct
InRange_SIMD
<
uchar
>
{
int
operator
()
(
const
uchar
*
src1
,
const
uchar
*
src2
,
const
uchar
*
src3
,
uchar
*
dst
,
int
len
)
const
{
int
x
=
0
;
for
(
;
x
<=
len
-
16
;
x
+=
16
)
{
uint8x16_t
values
=
vld1q_u8
(
src1
+
x
);
uint8x16_t
low
=
vld1q_u8
(
src2
+
x
);
uint8x16_t
high
=
vld1q_u8
(
src3
+
x
);
vst1q_u8
(
dst
+
x
,
vandq_u8
(
vcgeq_u8
(
values
,
low
),
vcgeq_u8
(
high
,
values
)));
}
return
x
;
}
};
template
<>
struct
InRange_SIMD
<
schar
>
{
int
operator
()
(
const
schar
*
src1
,
const
schar
*
src2
,
const
schar
*
src3
,
uchar
*
dst
,
int
len
)
const
{
int
x
=
0
;
for
(
;
x
<=
len
-
16
;
x
+=
16
)
{
int8x16_t
values
=
vld1q_s8
(
src1
+
x
);
int8x16_t
low
=
vld1q_s8
(
src2
+
x
);
int8x16_t
high
=
vld1q_s8
(
src3
+
x
);
vst1q_u8
(
dst
+
x
,
vandq_u8
(
vcgeq_s8
(
values
,
low
),
vcgeq_s8
(
high
,
values
)));
}
return
x
;
}
};
template
<>
struct
InRange_SIMD
<
ushort
>
{
int
operator
()
(
const
ushort
*
src1
,
const
ushort
*
src2
,
const
ushort
*
src3
,
uchar
*
dst
,
int
len
)
const
{
int
x
=
0
;
for
(
;
x
<=
len
-
16
;
x
+=
16
)
{
uint16x8_t
values
=
vld1q_u16
((
const
uint16_t
*
)(
src1
+
x
));
uint16x8_t
low
=
vld1q_u16
((
const
uint16_t
*
)(
src2
+
x
));
uint16x8_t
high
=
vld1q_u16
((
const
uint16_t
*
)(
src3
+
x
));
uint8x8_t
r1
=
vmovn_u16
(
vandq_u16
(
vcgeq_u16
(
values
,
low
),
vcgeq_u16
(
high
,
values
)));
values
=
vld1q_u16
((
const
uint16_t
*
)(
src1
+
x
+
8
));
low
=
vld1q_u16
((
const
uint16_t
*
)(
src2
+
x
+
8
));
high
=
vld1q_u16
((
const
uint16_t
*
)(
src3
+
x
+
8
));
uint8x8_t
r2
=
vmovn_u16
(
vandq_u16
(
vcgeq_u16
(
values
,
low
),
vcgeq_u16
(
high
,
values
)));
vst1q_u8
(
dst
+
x
,
vcombine_u8
(
r1
,
r2
));
}
return
x
;
}
};
template
<>
struct
InRange_SIMD
<
short
>
{
int
operator
()
(
const
short
*
src1
,
const
short
*
src2
,
const
short
*
src3
,
uchar
*
dst
,
int
len
)
const
{
int
x
=
0
;
for
(
;
x
<=
len
-
16
;
x
+=
16
)
{
int16x8_t
values
=
vld1q_s16
((
const
int16_t
*
)(
src1
+
x
));
int16x8_t
low
=
vld1q_s16
((
const
int16_t
*
)(
src2
+
x
));
int16x8_t
high
=
vld1q_s16
((
const
int16_t
*
)(
src3
+
x
));
uint8x8_t
r1
=
vmovn_u16
(
vandq_u16
(
vcgeq_s16
(
values
,
low
),
vcgeq_s16
(
high
,
values
)));
values
=
vld1q_s16
((
const
int16_t
*
)(
src1
+
x
+
8
));
low
=
vld1q_s16
((
const
int16_t
*
)(
src2
+
x
+
8
));
high
=
vld1q_s16
((
const
int16_t
*
)(
src3
+
x
+
8
));
uint8x8_t
r2
=
vmovn_u16
(
vandq_u16
(
vcgeq_s16
(
values
,
low
),
vcgeq_s16
(
high
,
values
)));
vst1q_u8
(
dst
+
x
,
vcombine_u8
(
r1
,
r2
));
}
return
x
;
}
};
template
<>
struct
InRange_SIMD
<
int
>
{
int
operator
()
(
const
int
*
src1
,
const
int
*
src2
,
const
int
*
src3
,
uchar
*
dst
,
int
len
)
const
{
int
x
=
0
;
for
(
;
x
<=
len
-
8
;
x
+=
8
)
{
int32x4_t
values
=
vld1q_s32
((
const
int32_t
*
)(
src1
+
x
));
int32x4_t
low
=
vld1q_s32
((
const
int32_t
*
)(
src2
+
x
));
int32x4_t
high
=
vld1q_s32
((
const
int32_t
*
)(
src3
+
x
));
uint16x4_t
r1
=
vmovn_u32
(
vandq_u32
(
vcgeq_s32
(
values
,
low
),
vcgeq_s32
(
high
,
values
)));
values
=
vld1q_s32
((
const
int32_t
*
)(
src1
+
x
+
4
));
low
=
vld1q_s32
((
const
int32_t
*
)(
src2
+
x
+
4
));
high
=
vld1q_s32
((
const
int32_t
*
)(
src3
+
x
+
4
));
uint16x4_t
r2
=
vmovn_u32
(
vandq_u32
(
vcgeq_s32
(
values
,
low
),
vcgeq_s32
(
high
,
values
)));
uint16x8_t
res_16
=
vcombine_u16
(
r1
,
r2
);
vst1_u8
(
dst
+
x
,
vmovn_u16
(
res_16
));
}
return
x
;
}
};
template
<>
struct
InRange_SIMD
<
float
>
{
int
operator
()
(
const
float
*
src1
,
const
float
*
src2
,
const
float
*
src3
,
uchar
*
dst
,
int
len
)
const
{
int
x
=
0
;
for
(
;
x
<=
len
-
8
;
x
+=
8
)
{
float32x4_t
values
=
vld1q_f32
((
const
float32_t
*
)(
src1
+
x
));
float32x4_t
low
=
vld1q_f32
((
const
float32_t
*
)(
src2
+
x
));
float32x4_t
high
=
vld1q_f32
((
const
float32_t
*
)(
src3
+
x
));
uint16x4_t
r1
=
vmovn_u32
(
vandq_u32
(
vcgeq_f32
(
values
,
low
),
vcgeq_f32
(
high
,
values
)));
values
=
vld1q_f32
((
const
float32_t
*
)(
src1
+
x
+
4
));
low
=
vld1q_f32
((
const
float32_t
*
)(
src2
+
x
+
4
));
high
=
vld1q_f32
((
const
float32_t
*
)(
src3
+
x
+
4
));
uint16x4_t
r2
=
vmovn_u32
(
vandq_u32
(
vcgeq_f32
(
values
,
low
),
vcgeq_f32
(
high
,
values
)));
uint16x8_t
res_16
=
vcombine_u16
(
r1
,
r2
);
vst1_u8
(
dst
+
x
,
vmovn_u16
(
res_16
));
}
return
x
;
}
};
#endif
template
<
typename
T
>
...
...
@@ -3272,7 +3500,7 @@ static void inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
step2
/=
sizeof
(
src2
[
0
]);
step3
/=
sizeof
(
src3
[
0
]);
InRange_S
SE
<
T
>
vop
;
InRange_S
IMD
<
T
>
vop
;
for
(
;
size
.
height
--
;
src1
+=
step1
,
src2
+=
step2
,
src3
+=
step3
,
dst
+=
step
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment