submodule / opencv · Commits

Commit 9ad1a848, authored Nov 15, 2018 by Vitaly Tuzov
Unrolled bilateral filter neighbor processing loop
parent f5b6bea2
Showing 1 changed file with 584 additions and 4 deletions.

modules/imgproc/src/bilateral_filter.cpp  (+584, -4)
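The change replaces each linear walk over the precomputed neighbor offsets with a loop that consumes four kernel taps per iteration, plus a remainder loop for the leftover taps; the pattern recurs in all four hunks below. As orientation, a minimal scalar sketch of the shape (process() and maxk are hypothetical stand-ins, not the commit's code):

    // Schematic 4x unrolling with a scalar remainder loop.
    int k = 0;
    for (; k <= maxk - 4; k += 4)   // main loop: four kernel taps per iteration
    {
        process(k);
        process(k + 1);
        process(k + 2);
        process(k + 3);
    }
    for (; k < maxk; k++)           // remainder loop: the 0..3 leftover taps
        process(k);

Handling four taps together lets the SIMD inner loops reuse one load of the reference pixels across all four taps, and lets the per-pixel tail pack the four taps into a single 128-bit vector.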
@@ -82,7 +82,84 @@ public:
         memset(buf.data(), 0, buf.size() * sizeof(float));
         float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH);
         float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH);
-        for( k = 0; k < maxk; k++ )
+        k = 0;
+        for (; k <= maxk - 4; k += 4)
+        {
+            const uchar* ksptr0 = sptr + space_ofs[k];
+            const uchar* ksptr1 = sptr + space_ofs[k + 1];
+            const uchar* ksptr2 = sptr + space_ofs[k + 2];
+            const uchar* ksptr3 = sptr + space_ofs[k + 3];
+            j = 0;
+#if CV_SIMD
+            v_float32 kweight0 = vx_setall_f32(space_weight[k]);
+            v_float32 kweight1 = vx_setall_f32(space_weight[k + 1]);
+            v_float32 kweight2 = vx_setall_f32(space_weight[k + 2]);
+            v_float32 kweight3 = vx_setall_f32(space_weight[k + 3]);
+            for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+            {
+                v_uint32 rval = vx_load_expand_q(sptr + j);
+
+                v_uint32 val = vx_load_expand_q(ksptr0 + j);
+                v_float32 w = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
+                v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
+                v_float32 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j));
+
+                val = vx_load_expand_q(ksptr1 + j);
+                w = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
+                v_wsum += w;
+                v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
+
+                val = vx_load_expand_q(ksptr2 + j);
+                w = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
+                v_wsum += w;
+                v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
+
+                val = vx_load_expand_q(ksptr3 + j);
+                w = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
+                v_wsum += w;
+                v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
+
+                v_store_aligned(wsum + j, v_wsum);
+                v_store_aligned(sum + j, v_sum);
+            }
+#endif
+#if CV_SIMD128
+            v_float32x4 kweight4 = v_load(space_weight + k);
+#endif
+            for (; j < size.width; j++)
+            {
+#if CV_SIMD128
+                v_uint32x4 rval = v_setall_u32(sptr[j]);
+                v_uint32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]);
+                v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
+                wsum[j] += v_reduce_sum(w);
+                sum[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(val)) * w);
+#else
+                int rval = sptr[j];
+
+                int val = ksptr0[j];
+                float w = space_weight[k] * color_weight[std::abs(val - rval)];
+                wsum[j] += w;
+                sum[j] += val * w;
+
+                val = ksptr1[j];
+                w = space_weight[k + 1] * color_weight[std::abs(val - rval)];
+                wsum[j] += w;
+                sum[j] += val * w;
+
+                val = ksptr2[j];
+                w = space_weight[k + 2] * color_weight[std::abs(val - rval)];
+                wsum[j] += w;
+                sum[j] += val * w;
+
+                val = ksptr3[j];
+                w = space_weight[k + 3] * color_weight[std::abs(val - rval)];
+                wsum[j] += w;
+                sum[j] += val * w;
+#endif
+            }
+        }
+        for (; k < maxk; k++)
         {
             const uchar* ksptr = sptr + space_ofs[k];
             j = 0;
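A restatement of what the single-channel accumulation above computes. With $s_k = \texttt{space\_weight}[k]$, $c[\cdot]$ the precomputed color_weight LUT, $I_k(j) = \texttt{sptr}[\texttt{space\_ofs}[k] + j]$ the neighbor and $I_0(j) = \texttt{sptr}[j]$ the reference pixel, each tap contributes

    $$ w_k(j) = s_k \, c[\,|I_k(j) - I_0(j)|\,], \qquad \texttt{wsum}[j] \mathrel{+}= w_k(j), \qquad \texttt{sum}[j] \mathrel{+}= w_k(j)\, I_k(j), $$

and the filtered pixel is formed later as $\texttt{sum}[j]/\texttt{wsum}[j]$ in the row epilogue, which this diff leaves untouched. The unrolled SIMD body computes four such contributions per pass while loading the reference lanes (rval) only once.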
@@ -126,7 +203,232 @@ public:
         float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH);
         float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH);
         float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH);
-        for( k = 0; k < maxk; k++ )
+        k = 0;
+        for (; k <= maxk - 4; k += 4)
+        {
+            const uchar* ksptr0 = sptr + space_ofs[k];
+            const uchar* ksptr1 = sptr + space_ofs[k + 1];
+            const uchar* ksptr2 = sptr + space_ofs[k + 2];
+            const uchar* ksptr3 = sptr + space_ofs[k + 3];
+            const uchar* rsptr = sptr;
+            j = 0;
+#if CV_SIMD
+            v_float32 kweight0 = vx_setall_f32(space_weight[k]);
+            v_float32 kweight1 = vx_setall_f32(space_weight[k + 1]);
+            v_float32 kweight2 = vx_setall_f32(space_weight[k + 2]);
+            v_float32 kweight3 = vx_setall_f32(space_weight[k + 3]);
+            for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, rsptr += 3 * v_uint8::nlanes,
+                    ksptr0 += 3 * v_uint8::nlanes, ksptr1 += 3 * v_uint8::nlanes, ksptr2 += 3 * v_uint8::nlanes, ksptr3 += 3 * v_uint8::nlanes)
+            {
+                v_uint8 kb, kg, kr, rb, rg, rr;
+                v_load_deinterleave(rsptr, rb, rg, rr);
+
+                v_load_deinterleave(ksptr0, kb, kg, kr);
+                v_uint16 val0, val1, val2, val3, val4;
+                v_expand(v_absdiff(kb, rb), val0, val1);
+                v_expand(v_absdiff(kg, rg), val2, val3);
+                val0 += val2;
+                val1 += val3;
+                v_expand(v_absdiff(kr, rr), val2, val3);
+                val0 += val2;
+                val1 += val3;
+
+                v_uint32 vall, valh;
+                v_expand(val0, vall, valh);
+                v_float32 w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                v_float32 w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
+                v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+                v_expand(kb, val0, val2);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
+                v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+                v_expand(kg, val0, val3);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
+                v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+                v_expand(kr, val0, val4);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
+                v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+                v_expand(val1, vall, valh);
+                w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
+                v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+                v_expand(val2, vall, valh);
+                v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+                v_expand(val3, vall, valh);
+                v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+                v_expand(val4, vall, valh);
+                v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+
+                v_load_deinterleave(ksptr1, kb, kg, kr);
+                v_expand(v_absdiff(kb, rb), val0, val1);
+                v_expand(v_absdiff(kg, rg), val2, val3);
+                val0 += val2;
+                val1 += val3;
+                v_expand(v_absdiff(kr, rr), val2, val3);
+                val0 += val2;
+                val1 += val3;
+
+                v_expand(val0, vall, valh);
+                w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
+                v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+                v_expand(kb, val0, val2);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
+                v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+                v_expand(kg, val0, val3);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
+                v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+                v_expand(kr, val0, val4);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
+                v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+                v_expand(val1, vall, valh);
+                w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
+                v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+                v_expand(val2, vall, valh);
+                v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+                v_expand(val3, vall, valh);
+                v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+                v_expand(val4, vall, valh);
+                v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+
+                v_load_deinterleave(ksptr2, kb, kg, kr);
+                v_expand(v_absdiff(kb, rb), val0, val1);
+                v_expand(v_absdiff(kg, rg), val2, val3);
+                val0 += val2;
+                val1 += val3;
+                v_expand(v_absdiff(kr, rr), val2, val3);
+                val0 += val2;
+                val1 += val3;
+
+                v_expand(val0, vall, valh);
+                w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
+                v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+                v_expand(kb, val0, val2);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
+                v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+                v_expand(kg, val0, val3);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
+                v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+                v_expand(kr, val0, val4);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
+                v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+                v_expand(val1, vall, valh);
+                w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
+                v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+                v_expand(val2, vall, valh);
+                v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+                v_expand(val3, vall, valh);
+                v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+                v_expand(val4, vall, valh);
+                v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+
+                v_load_deinterleave(ksptr3, kb, kg, kr);
+                v_expand(v_absdiff(kb, rb), val0, val1);
+                v_expand(v_absdiff(kg, rg), val2, val3);
+                val0 += val2;
+                val1 += val3;
+                v_expand(v_absdiff(kr, rr), val2, val3);
+                val0 += val2;
+                val1 += val3;
+
+                v_expand(val0, vall, valh);
+                w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
+                v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+                v_expand(kb, val0, val2);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
+                v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+                v_expand(kg, val0, val3);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
+                v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+                v_expand(kr, val0, val4);
+                v_expand(val0, vall, valh);
+                v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
+                v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+                v_expand(val1, vall, valh);
+                w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
+                v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+                v_expand(val2, vall, valh);
+                v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+                v_expand(val3, vall, valh);
+                v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+                v_expand(val4, vall, valh);
+                v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
+                v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+            }
+#endif
+#if CV_SIMD128
+            v_float32x4 kweight4 = v_load(space_weight + k);
+#endif
+            for (; j < size.width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3)
+            {
+#if CV_SIMD128
+                v_uint32x4 rb = v_setall_u32(rsptr[0]);
+                v_uint32x4 rg = v_setall_u32(rsptr[1]);
+                v_uint32x4 rr = v_setall_u32(rsptr[2]);
+                v_uint32x4 b(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]);
+                v_uint32x4 g(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]);
+                v_uint32x4 r(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]);
+                v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(b, rb) + v_absdiff(g, rg) + v_absdiff(r, rr)));
+                wsum[j] += v_reduce_sum(w);
+                sum_b[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(b)) * w);
+                sum_g[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(g)) * w);
+                sum_r[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(r)) * w);
+#else
+                int rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
+
+                int b = ksptr0[0], g = ksptr0[1], r = ksptr0[2];
+                float w = space_weight[k] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
+                wsum[j] += w;
+                sum_b[j] += b * w;
+                sum_g[j] += g * w;
+                sum_r[j] += r * w;
+
+                b = ksptr1[0];
+                g = ksptr1[1];
+                r = ksptr1[2];
+                w = space_weight[k + 1] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
+                wsum[j] += w;
+                sum_b[j] += b * w;
+                sum_g[j] += g * w;
+                sum_r[j] += r * w;
+
+                b = ksptr2[0];
+                g = ksptr2[1];
+                r = ksptr2[2];
+                w = space_weight[k + 2] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
+                wsum[j] += w;
+                sum_b[j] += b * w;
+                sum_g[j] += g * w;
+                sum_r[j] += r * w;
+
+                b = ksptr3[0];
+                g = ksptr3[1];
+                r = ksptr3[2];
+                w = space_weight[k + 3] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
+                wsum[j] += w;
+                sum_b[j] += b * w;
+                sum_g[j] += g * w;
+                sum_r[j] += r * w;
+#endif
+            }
+        }
+        for (; k < maxk; k++)
         {
             const uchar* ksptr = sptr + space_ofs[k];
             const uchar* rsptr = sptr;
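A note on the three-channel hunk above, reconstructed from its scalar #else branch: the color distance is the L1 sum of per-channel absolute differences, so the same one-dimensional color_weight LUT serves all of B, G, R:

    $$ d_k(j) = |\Delta b| + |\Delta g| + |\Delta r| \in [0, 3 \cdot 255] = [0, 765], \qquad w_k(j) = s_k \, c[\, d_k(j) \,]. $$

Since $d_k(j) \le 765 < 2^{16}$, the uchar differences can be summed right after one v_expand to 16 bits with no risk of overflow, and are widened to 32 bits only at the end for the v_lut gather; the LUT itself is sized for the full $3 \cdot 255$ range by the filter's setup code (outside this diff).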
@@ -421,7 +723,130 @@ public:
         v_float32 v_one = vx_setall_f32(1.f);
         v_float32 sindex = vx_setall_f32(scale_index);
 #endif
-        for( k = 0; k < maxk; k++ )
+        k = 0;
+        for (; k <= maxk - 4; k += 4)
+        {
+            const float* ksptr0 = sptr + space_ofs[k];
+            const float* ksptr1 = sptr + space_ofs[k + 1];
+            const float* ksptr2 = sptr + space_ofs[k + 2];
+            const float* ksptr3 = sptr + space_ofs[k + 3];
+            j = 0;
+#if CV_SIMD
+            v_float32 kweight0 = vx_setall_f32(space_weight[k]);
+            v_float32 kweight1 = vx_setall_f32(space_weight[k + 1]);
+            v_float32 kweight2 = vx_setall_f32(space_weight[k + 2]);
+            v_float32 kweight3 = vx_setall_f32(space_weight[k + 3]);
+            for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+            {
+                v_float32 rval = vx_load(sptr + j);
+
+                v_float32 val = vx_load(ksptr0 + j);
+                v_float32 knan = v_not_nan(val);
+                v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+                v_int32 idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
+                v_float32 v_sum = v_muladd(val & knan, w, vx_load_aligned(sum + j));
+
+                val = vx_load(ksptr1 + j);
+                knan = v_not_nan(val);
+                alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+                idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                v_wsum += w;
+                v_sum = v_muladd(val & knan, w, v_sum);
+
+                val = vx_load(ksptr2 + j);
+                knan = v_not_nan(val);
+                alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+                idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                v_wsum += w;
+                v_sum = v_muladd(val & knan, w, v_sum);
+
+                val = vx_load(ksptr3 + j);
+                knan = v_not_nan(val);
+                alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+                idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                v_wsum += w;
+                v_sum = v_muladd(val & knan, w, v_sum);
+
+                v_store_aligned(wsum + j, v_wsum);
+                v_store_aligned(sum + j, v_sum);
+            }
+#endif
+#if CV_SIMD128
+            v_float32x4 v_one4 = v_setall_f32(1.f);
+            v_float32x4 sindex4 = v_setall_f32(scale_index);
+            v_float32x4 kweight4 = v_load(space_weight + k);
+#endif
+            for (; j < size.width; j++)
+            {
+#if CV_SIMD128
+                v_float32x4 rval = v_setall_f32(sptr[j]);
+                v_float32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]);
+                v_float32x4 knan = v_not_nan(val);
+                v_float32x4 alpha = (v_absdiff(val, rval) * sindex4) & v_not_nan(rval) & knan;
+                v_int32x4 idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan;
+                wsum[j] += v_reduce_sum(w);
+                sum[j] += v_reduce_sum((val & knan) * w);
+#else
+                float rval = sptr[j];
+
+                float val = ksptr0[j];
+                float alpha = std::abs(val - rval) * scale_index;
+                int idx = cvFloor(alpha);
+                alpha -= idx;
+                if (!cvIsNaN(val))
+                {
+                    float w = space_weight[k] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha * (expLUT[idx + 1] - expLUT[idx])));
+                    wsum[j] += w;
+                    sum[j] += val * w;
+                }
+
+                val = ksptr1[j];
+                alpha = std::abs(val - rval) * scale_index;
+                idx = cvFloor(alpha);
+                alpha -= idx;
+                if (!cvIsNaN(val))
+                {
+                    float w = space_weight[k + 1] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha * (expLUT[idx + 1] - expLUT[idx])));
+                    wsum[j] += w;
+                    sum[j] += val * w;
+                }
+
+                val = ksptr2[j];
+                alpha = std::abs(val - rval) * scale_index;
+                idx = cvFloor(alpha);
+                alpha -= idx;
+                if (!cvIsNaN(val))
+                {
+                    float w = space_weight[k + 2] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha * (expLUT[idx + 1] - expLUT[idx])));
+                    wsum[j] += w;
+                    sum[j] += val * w;
+                }
+
+                val = ksptr3[j];
+                alpha = std::abs(val - rval) * scale_index;
+                idx = cvFloor(alpha);
+                alpha -= idx;
+                if (!cvIsNaN(val))
+                {
+                    float w = space_weight[k + 3] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha * (expLUT[idx + 1] - expLUT[idx])));
+                    wsum[j] += w;
+                    sum[j] += val * w;
+                }
+#endif
+            }
+        }
+        for (; k < maxk; k++)
         {
             const float* ksptr = sptr + space_ofs[k];
             j = 0;
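The float variants cannot index a weight table by raw intensity difference, so both branches of the hunk above map the difference into LUT coordinates and linearly interpolate between adjacent expLUT entries. Reading off the scalar branch:

    $$ \alpha = |v - r| \cdot \texttt{scale\_index}, \quad i = \lfloor \alpha \rfloor, \quad \alpha \leftarrow \alpha - i, \qquad w_k = s_k \big( (1 - \alpha)\, \texttt{expLUT}[i] + \alpha\, \texttt{expLUT}[i + 1] \big). $$

The vector code expresses the same blend as v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha)): two gathers, at idx and idx + 1, mixed by the fractional part.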
@@ -483,7 +908,162 @@ public:
         v_float32 v_one = vx_setall_f32(1.f);
         v_float32 sindex = vx_setall_f32(scale_index);
 #endif
-        for( k = 0; k < maxk; k++ )
+        k = 0;
+        for (; k <= maxk - 4; k += 4)
+        {
+            const float* ksptr0 = sptr + space_ofs[k];
+            const float* ksptr1 = sptr + space_ofs[k + 1];
+            const float* ksptr2 = sptr + space_ofs[k + 2];
+            const float* ksptr3 = sptr + space_ofs[k + 3];
+            const float* rsptr = sptr;
+            j = 0;
+#if CV_SIMD
+            v_float32 kweight0 = vx_setall_f32(space_weight[k]);
+            v_float32 kweight1 = vx_setall_f32(space_weight[k + 1]);
+            v_float32 kweight2 = vx_setall_f32(space_weight[k + 2]);
+            v_float32 kweight3 = vx_setall_f32(space_weight[k + 3]);
+            for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, rsptr += 3 * v_float32::nlanes,
+                    ksptr0 += 3 * v_float32::nlanes, ksptr1 += 3 * v_float32::nlanes, ksptr2 += 3 * v_float32::nlanes, ksptr3 += 3 * v_float32::nlanes)
+            {
+                v_float32 kb, kg, kr, rb, rg, rr;
+                v_load_deinterleave(rsptr, rb, rg, rr);
+
+                v_load_deinterleave(ksptr0, kb, kg, kr);
+                v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
+                v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                v_int32 idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
+                v_float32 v_sum_b = v_muladd(kb & knan, w, vx_load_aligned(sum_b + j));
+                v_float32 v_sum_g = v_muladd(kg & knan, w, vx_load_aligned(sum_g + j));
+                v_float32 v_sum_r = v_muladd(kr & knan, w, vx_load_aligned(sum_r + j));
+
+                v_load_deinterleave(ksptr1, kb, kg, kr);
+                knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
+                alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                v_wsum += w;
+                v_sum_b = v_muladd(kb & knan, w, v_sum_b);
+                v_sum_g = v_muladd(kg & knan, w, v_sum_g);
+                v_sum_r = v_muladd(kr & knan, w, v_sum_r);
+
+                v_load_deinterleave(ksptr2, kb, kg, kr);
+                knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
+                alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                v_wsum += w;
+                v_sum_b = v_muladd(kb & knan, w, v_sum_b);
+                v_sum_g = v_muladd(kg & knan, w, v_sum_g);
+                v_sum_r = v_muladd(kr & knan, w, v_sum_r);
+
+                v_load_deinterleave(ksptr3, kb, kg, kr);
+                knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
+                alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                v_wsum += w;
+                v_sum_b = v_muladd(kb & knan, w, v_sum_b);
+                v_sum_g = v_muladd(kg & knan, w, v_sum_g);
+                v_sum_r = v_muladd(kr & knan, w, v_sum_r);
+
+                v_store_aligned(wsum + j, v_wsum);
+                v_store_aligned(sum_b + j, v_sum_b);
+                v_store_aligned(sum_g + j, v_sum_g);
+                v_store_aligned(sum_r + j, v_sum_r);
+            }
+#endif
+#if CV_SIMD128
+            v_float32x4 v_one4 = v_setall_f32(1.f);
+            v_float32x4 sindex4 = v_setall_f32(scale_index);
+            v_float32x4 kweight4 = v_load(space_weight + k);
+#endif
+            for (; j < size.width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3)
+            {
+#if CV_SIMD128
+                v_float32x4 rb = v_setall_f32(rsptr[0]);
+                v_float32x4 rg = v_setall_f32(rsptr[1]);
+                v_float32x4 rr = v_setall_f32(rsptr[2]);
+                v_float32x4 kb(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]);
+                v_float32x4 kg(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]);
+                v_float32x4 kr(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]);
+                v_float32x4 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
+                v_float32x4 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex4) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                v_int32x4 idx = v_trunc(alpha);
+                alpha -= v_cvt_f32(idx);
+                v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan;
+                wsum[j] += v_reduce_sum(w);
+                sum_b[j] += v_reduce_sum((kb & knan) * w);
+                sum_g[j] += v_reduce_sum((kg & knan) * w);
+                sum_r[j] += v_reduce_sum((kr & knan) * w);
+#else
+                float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
+                bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr);
+
+                float b = ksptr0[0], g = ksptr0[1], r = ksptr0[2];
+                bool v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
+                float alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
+                int idx = cvFloor(alpha);
+                alpha -= idx;
+                if (!v_NAN)
+                {
+                    float w = space_weight[k] * (r_NAN ? 1.f : (expLUT[idx] + alpha * (expLUT[idx + 1] - expLUT[idx])));
+                    wsum[j] += w;
+                    sum_b[j] += b * w;
+                    sum_g[j] += g * w;
+                    sum_r[j] += r * w;
+                }
+
+                b = ksptr1[0];
+                g = ksptr1[1];
+                r = ksptr1[2];
+                v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
+                alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
+                idx = cvFloor(alpha);
+                alpha -= idx;
+                if (!v_NAN)
+                {
+                    float w = space_weight[k + 1] * (r_NAN ? 1.f : (expLUT[idx] + alpha * (expLUT[idx + 1] - expLUT[idx])));
+                    wsum[j] += w;
+                    sum_b[j] += b * w;
+                    sum_g[j] += g * w;
+                    sum_r[j] += r * w;
+                }
+
+                b = ksptr2[0];
+                g = ksptr2[1];
+                r = ksptr2[2];
+                v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
+                alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
+                idx = cvFloor(alpha);
+                alpha -= idx;
+                if (!v_NAN)
+                {
+                    float w = space_weight[k + 2] * (r_NAN ? 1.f : (expLUT[idx] + alpha * (expLUT[idx + 1] - expLUT[idx])));
+                    wsum[j] += w;
+                    sum_b[j] += b * w;
+                    sum_g[j] += g * w;
+                    sum_r[j] += r * w;
+                }
+
+                b = ksptr3[0];
+                g = ksptr3[1];
+                r = ksptr3[2];
+                v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
+                alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
+                idx = cvFloor(alpha);
+                alpha -= idx;
+                if (!v_NAN)
+                {
+                    float w = space_weight[k + 3] * (r_NAN ? 1.f : (expLUT[idx] + alpha * (expLUT[idx + 1] - expLUT[idx])));
+                    wsum[j] += w;
+                    sum_b[j] += b * w;
+                    sum_g[j] += g * w;
+                    sum_r[j] += r * w;
+                }
+#endif
+            }
+        }
+        for (; k < maxk; k++)
         {
             const float* ksptr = sptr + space_ofs[k];
             const float* rsptr = sptr;
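Finally, how the vector code in the hunk above reproduces the scalar NaN handling; this is a reading of the masks, not new behavior:

    // knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr) is all-ones exactly in
    // lanes whose neighbor pixel is finite; "w & knan" zeroes the weight elsewhere,
    // so NaN neighbors contribute nothing (the scalar "if (!v_NAN)" guard).
    //
    // "alpha & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr)" forces alpha to zero
    // in lanes whose reference pixel is NaN; v_trunc then gives idx == 0 and the
    // interpolation collapses to expLUT[0], which the setup code (outside this
    // diff) initializes to 1.f (the scalar "r_NAN ? 1.f : ..." term).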