opencv / Commits / 333a767b

Commit 333a767b authored Feb 28, 2020 by Alexander Alekhin

    Merge remote-tracking branch 'upstream/3.4' into merge-3.4

Parents: db5f1c35, 5012fc5d

Showing 9 changed files with 1026 additions and 105 deletions:
  cmake/OpenCVCompilerOptions.cmake               +4    -0
  modules/core/include/opencv2/core/mat.inl.hpp   +20   -0
  modules/dnn/src/ie_ngraph.cpp                   +6    -5
  modules/highgui/src/window_QT.cpp               +27   -8
  modules/highgui/src/window_QT.h                 +3    -0
  modules/highgui/src/window_QT.qrc               +1    -0
  modules/imgcodecs/src/exif.hpp                  +2    -1
  modules/imgproc/src/sumpixels.simd.hpp          +950  -91
  modules/videoio/src/cap_v4l.cpp                 +13   -0
cmake/OpenCVCompilerOptions.cmake

@@ -153,6 +153,10 @@ if(CV_GCC OR CV_CLANG)
   if(CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
     add_extra_compiler_option(-Wno-missing-field-initializers)  # GCC 4.x emits warnings about {}, fixed in GCC 5+
   endif()
+  if(CV_CLANG AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10.0)
+    add_extra_compiler_option(-Wno-deprecated-enum-enum-conversion)
+    add_extra_compiler_option(-Wno-deprecated-anon-enum-enum-conversion)
+  endif()
 endif()

 add_extra_compiler_option(-fdiagnostics-show-option)
modules/core/include/opencv2/core/mat.inl.hpp

@@ -54,6 +54,21 @@
 #pragma warning( disable: 4127 )
 #endif

+#if defined(CV_SKIP_DISABLE_CLANG_ENUM_WARNINGS)
+// nothing
+#elif defined(CV_FORCE_DISABLE_CLANG_ENUM_WARNINGS)
+#define CV_DISABLE_CLANG_ENUM_WARNINGS
+#elif defined(__clang__) && defined(__has_warning)
+#if __has_warning("-Wdeprecated-enum-enum-conversion") && __has_warning("-Wdeprecated-anon-enum-enum-conversion")
+#define CV_DISABLE_CLANG_ENUM_WARNINGS
+#endif
+#endif
+#ifdef CV_DISABLE_CLANG_ENUM_WARNINGS
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-enum-enum-conversion"
+#pragma clang diagnostic ignored "-Wdeprecated-anon-enum-enum-conversion"
+#endif
+
 namespace cv
 {
 CV__DEBUG_NS_BEGIN

@@ -3980,4 +3995,9 @@ inline void UMatData::markDeviceCopyObsolete(bool flag)
 #pragma warning( pop )
 #endif

+#ifdef CV_DISABLE_CLANG_ENUM_WARNINGS
+#undef CV_DISABLE_CLANG_ENUM_WARNINGS
+#pragma clang diagnostic pop
+#endif
+
 #endif
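The block added to mat.inl.hpp follows a common feature-detection pattern: probe the compiler with __has_warning before suppressing a diagnostic, so older clang versions that do not know the flag are left untouched, and pop the suppression again at the end of the header. A minimal self-contained sketch of the same pattern (the warning name comes from the diff above; the DEMO_* macro and the enums are made-up illustration, not OpenCV code):

    // Sketch only: the __has_warning guard used in mat.inl.hpp, reduced to a toy example.
    #if defined(__clang__) && defined(__has_warning)
    #  if __has_warning("-Wdeprecated-enum-enum-conversion")
    #    define DEMO_DISABLE_ENUM_WARNINGS
    #  endif
    #endif

    #ifdef DEMO_DISABLE_ENUM_WARNINGS
    #  pragma clang diagnostic push
    #  pragma clang diagnostic ignored "-Wdeprecated-enum-enum-conversion"
    #endif

    enum Pixel { PIXEL_A = 1 };   // hypothetical enums for illustration
    enum Flag  { FLAG_B  = 2 };

    // In C++20 mode clang 10+ warns about arithmetic between unrelated enums;
    // the pragma above silences it only when the compiler actually knows the flag.
    int combined = PIXEL_A | FLAG_B;

    #ifdef DEMO_DISABLE_ENUM_WARNINGS
    #  pragma clang diagnostic pop
    #  undef DEMO_DISABLE_ENUM_WARNINGS
    #endif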
modules/dnn/src/ie_ngraph.cpp

@@ -442,13 +442,14 @@ void InfEngineNgraphNet::initPlugin(InferenceEngine::CNNNetwork& net)
             config.emplace("VPU_DETECT_NETWORK_BATCH", CONFIG_VALUE(NO));
         }

-        bool isHetero = false;
-        if (device_name != "CPU")
+        bool isHetero = device_name == "FPGA";
+        // It is actual only for non-CPU targets and networks built in runtime using nGraph.
+        // We do not check IR models because they can be with version less than IRv10
+        if (!isHetero && device_name != "CPU" && !hasNetOwner)
         {
-            isHetero = device_name == "FPGA";
-            for (auto& layer : net)
+            for (auto& node : net.getFunction()->get_ops())
             {
-                if (layer->type == kOpenCVLayersType)
+                if (node->description() == kOpenCVLayersType)
                 {
                     isHetero = true;
                     break;
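In words: FPGA always goes through the HETERO plugin; other non-CPU targets are scanned for custom OpenCV layers, but only when the network was built at runtime from nGraph ops (no external IR owner), because pre-IRv10 IR models cannot be introspected this way. A small hedged sketch of that decision, using illustrative stand-ins rather than the real InferenceEngine/nGraph types:

    #include <string>
    #include <vector>

    // Sketch of the HETERO-fallback decision made in initPlugin(). OpNode, the
    // kOpenCVLayersType value and this function signature are hypothetical
    // stand-ins, not the actual OpenCV/IE API.
    struct OpNode { std::string description; };

    static const std::string kOpenCVLayersType = "CustomOCVLayer";  // assumed marker value

    bool needsHeteroPlugin(const std::string& device_name,
                           bool hasNetOwner,                        // true for networks loaded from IR files
                           const std::vector<OpNode>& runtimeOps)   // ops of a runtime-built nGraph function
    {
        bool isHetero = (device_name == "FPGA");  // FPGA is always dispatched through HETERO
        if (!isHetero && device_name != "CPU" && !hasNetOwner)
        {
            // Only runtime-built nGraph networks are scanned: IR models may be
            // older than IRv10 and cannot be walked op-by-op like this.
            for (const auto& node : runtimeOps)
            {
                if (node.description == kOpenCVLayersType) { isHetero = true; break; }
            }
        }
        return isHetero;
    }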
modules/highgui/src/window_QT.cpp

@@ -1856,7 +1856,7 @@ void CvWindow::displayStatusBar(QString text, int delayms)
 void CvWindow::enablePropertiesButton()
 {
     if (!vect_QActions.empty())
-        vect_QActions[9]->setDisabled(false);
+        vect_QActions[10]->setDisabled(false);
 }

@@ -1991,7 +1991,7 @@ void CvWindow::createView()
 void CvWindow::createActions()
 {
-    vect_QActions.resize(10);
+    vect_QActions.resize(11);

     QWidget* view = myView->getWidget();

@@ -2032,18 +2032,22 @@ void CvWindow::createActions()
     vect_QActions[8]->setIconVisibleInMenu(true);
     QObject::connect(vect_QActions[8], SIGNAL(triggered()), view, SLOT(saveView()));

-    vect_QActions[9] = new QAction(QIcon(":/properties-icon"), "Display properties window (CTRL+P)", this);
+    vect_QActions[9] = new QAction(QIcon(":/copy_clipbrd-icon"), "Copy image to clipboard (CTRL+C)", this);
     vect_QActions[9]->setIconVisibleInMenu(true);
-    QObject::connect(vect_QActions[9], SIGNAL(triggered()), this, SLOT(displayPropertiesWin()));
+    QObject::connect(vect_QActions[9], SIGNAL(triggered()), view, SLOT(copy2Clipbrd()));
+
+    vect_QActions[10] = new QAction(QIcon(":/properties-icon"), "Display properties window (CTRL+P)", this);
+    vect_QActions[10]->setIconVisibleInMenu(true);
+    QObject::connect(vect_QActions[10], SIGNAL(triggered()), this, SLOT(displayPropertiesWin()));

     if (global_control_panel->myLayout->count() == 0)
-        vect_QActions[9]->setDisabled(true);
+        vect_QActions[10]->setDisabled(true);
 }


 void CvWindow::createShortcuts()
 {
-    vect_QShortcuts.resize(10);
+    vect_QShortcuts.resize(11);

     QWidget* view = myView->getWidget();

@@ -2074,8 +2078,11 @@ void CvWindow::createShortcuts()
     vect_QShortcuts[8] = new QShortcut(shortcut_save_img, this);
     QObject::connect(vect_QShortcuts[8], SIGNAL(activated()), view, SLOT(saveView()));

-    vect_QShortcuts[9] = new QShortcut(shortcut_properties_win, this);
-    QObject::connect(vect_QShortcuts[9], SIGNAL(activated()), this, SLOT(displayPropertiesWin()));
+    vect_QShortcuts[9] = new QShortcut(shortcut_copy_clipbrd, this);
+    QObject::connect(vect_QShortcuts[9], SIGNAL(activated()), view, SLOT(copy2Clipbrd()));
+
+    vect_QShortcuts[10] = new QShortcut(shortcut_properties_win, this);
+    QObject::connect(vect_QShortcuts[10], SIGNAL(activated()), this, SLOT(displayPropertiesWin()));
 }

@@ -2697,6 +2704,18 @@ void DefaultViewPort::saveView()
 }


+//copy image to clipboard
+void DefaultViewPort::copy2Clipbrd()
+{
+    // Create a new pixmap to render the viewport into
+    QPixmap viewportPixmap(viewport()->size());
+    viewport()->render(&viewportPixmap);
+
+    QClipboard *pClipboard = QApplication::clipboard();
+    pClipboard->setPixmap(viewportPixmap);
+}
+
+
 void DefaultViewPort::contextMenuEvent(QContextMenuEvent* evnt)
 {
     if (centralWidget->vect_QActions.size() > 0)
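The new copy2Clipbrd() slot renders the current viewport into a QPixmap and hands it to the application clipboard; both the new CTRL+C shortcut and the new context-menu entry trigger it. A minimal standalone Qt sketch of the same render-to-clipboard technique (the QLabel stand-in is hypothetical; only the QPixmap/QClipboard calls mirror the code above):

    // Minimal sketch, not OpenCV code: copy a widget's current contents to the clipboard,
    // the same way DefaultViewPort::copy2Clipbrd() does for the image viewport.
    #include <QApplication>
    #include <QClipboard>
    #include <QLabel>
    #include <QPixmap>

    int main(int argc, char** argv)
    {
        QApplication app(argc, argv);

        QLabel widget("Pretend this label is the image viewport");  // hypothetical stand-in
        widget.resize(320, 240);
        widget.show();

        // Render the widget into an off-screen pixmap ...
        QPixmap snapshot(widget.size());
        widget.render(&snapshot);

        // ... and publish it on the system clipboard (what CTRL+C now does in highgui/Qt).
        QApplication::clipboard()->setPixmap(snapshot);

        return app.exec();
    }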
modules/highgui/src/window_QT.h

@@ -76,6 +76,7 @@
 #include <QDate>
 #include <QFileDialog>
 #include <QToolBar>
+#include <QClipboard>
 #include <QAction>
 #include <QCheckBox>

@@ -91,6 +92,7 @@ enum { CV_MODE_NORMAL = 0, CV_MODE_OPENGL = 1 };
 enum { shortcut_zoom_normal     = Qt::CTRL + Qt::Key_Z,
        shortcut_zoom_imgRegion  = Qt::CTRL + Qt::Key_X,
        shortcut_save_img        = Qt::CTRL + Qt::Key_S,
+       shortcut_copy_clipbrd    = Qt::CTRL + Qt::Key_C,
        shortcut_properties_win  = Qt::CTRL + Qt::Key_P,
        shortcut_zoom_in         = Qt::CTRL + Qt::Key_Plus,  //QKeySequence(QKeySequence::ZoomIn),
        shortcut_zoom_out        = Qt::CTRL + Qt::Key_Minus, //QKeySequence(QKeySequence::ZoomOut),

@@ -518,6 +520,7 @@ public slots:
     void ZoomOut();

     void saveView();
+    void copy2Clipbrd();

 protected:
     void contextMenuEvent(QContextMenuEvent* event) CV_OVERRIDE;
modules/highgui/src/window_QT.qrc

@@ -9,6 +9,7 @@
         <file alias="zoom_in-icon">files_Qt/Milky/48/106.png</file>
         <file alias="zoom_out-icon">files_Qt/Milky/48/107.png</file>
         <file alias="save-icon">files_Qt/Milky/48/7.png</file>
+        <file alias="copy_clipbrd-icon">files_Qt/Milky/48/43.png</file>
         <file alias="properties-icon">files_Qt/Milky/48/38.png</file>
         <file alias="stylesheet-trackbar">files_Qt/stylesheet_trackbar.qss</file>
     </qresource>
modules/imgcodecs/src/exif.hpp

@@ -154,7 +154,8 @@ enum ImageOrientation
  * Usage example for getting the orientation of the image:
  *
  * @code
- *      ExifReader reader(fileName);
+ *      std::ifstream stream(filename,std::ios_base::in | std::ios_base::binary);
+ *      ExifReader reader(stream);
  *      if( reader.parse() )
  *      {
  *          int orientation = reader.getTag(Orientation).field_u16;
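The doc comment now matches the stream-based ExifReader constructor: the caller opens the file in binary mode and hands the stream to the reader. A slightly fuller hedged sketch of the documented usage (the function, file path and fallback value are illustrative; only ExifReader(stream), parse() and getTag(Orientation).field_u16 come from the comment above, and the exact header/namespace of this internal class may differ):

    #include <fstream>
    #include <iostream>
    #include "exif.hpp"   // internal imgcodecs header declaring ExifReader (assumed include)

    int readJpegOrientation(const char* filename /* hypothetical input path */)
    {
        std::ifstream stream(filename, std::ios_base::in | std::ios_base::binary);

        ExifReader reader(stream);   // reader consumes the already-opened binary stream
        int orientation = 1;         // EXIF default "top-left" used as an assumed fallback
        if (reader.parse())
            orientation = reader.getTag(Orientation).field_u16;

        std::cout << "orientation tag = " << orientation << std::endl;
        return orientation;
    }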
modules/imgproc/src/sumpixels.simd.hpp
View file @
333a767b
...
@@ -75,32 +75,6 @@ struct Integral_SIMD
...
@@ -75,32 +75,6 @@ struct Integral_SIMD
}
}
};
};
#if CV_AVX512_SKX
template
<>
struct
Integral_SIMD
<
uchar
,
double
,
double
>
{
Integral_SIMD
()
{};
bool
operator
()(
const
uchar
*
src
,
size_t
_srcstep
,
double
*
sum
,
size_t
_sumstep
,
double
*
sqsum
,
size_t
_sqsumstep
,
double
*
tilted
,
size_t
_tiltedstep
,
int
width
,
int
height
,
int
cn
)
const
{
CV_UNUSED
(
_tiltedstep
);
// TODO: Add support for 1 channel input (WIP)
if
(
!
tilted
&&
(
cn
<=
4
))
{
calculate_integral_avx512
(
src
,
_srcstep
,
sum
,
_sumstep
,
sqsum
,
_sqsumstep
,
width
,
height
,
cn
);
return
true
;
}
return
false
;
}
};
#endif
#if CV_SIMD && CV_SIMD_WIDTH <= 64
#if CV_SIMD && CV_SIMD_WIDTH <= 64
template
<>
template
<>
...
@@ -114,57 +88,304 @@ struct Integral_SIMD<uchar, int, double>
...
@@ -114,57 +88,304 @@ struct Integral_SIMD<uchar, int, double>
int
*
tilted
,
size_t
,
int
*
tilted
,
size_t
,
int
width
,
int
height
,
int
cn
)
const
int
width
,
int
height
,
int
cn
)
const
{
{
if
(
sqsum
||
tilted
||
cn
!=
1
)
if
(
sqsum
||
tilted
||
cn
>
4
)
return
false
;
#if !CV_SSE4_1 && CV_SSE2
// 3 channel code is slower for SSE2 & SSE3
if
(
cn
==
3
)
return
false
;
return
false
;
#endif
width
*=
cn
;
// the first iteration
// the first iteration
memset
(
sum
,
0
,
(
width
+
1
)
*
sizeof
(
int
));
memset
(
sum
,
0
,
(
width
+
cn
)
*
sizeof
(
int
));
// the others
if
(
cn
==
1
)
for
(
int
i
=
0
;
i
<
height
;
++
i
)
{
{
const
uchar
*
src_row
=
src
+
_srcstep
*
i
;
// the others
int
*
prev_sum_row
=
(
int
*
)((
uchar
*
)
sum
+
_sumstep
*
i
)
+
1
;
for
(
int
i
=
0
;
i
<
height
;
++
i
)
int
*
sum_row
=
(
int
*
)((
uchar
*
)
sum
+
_sumstep
*
(
i
+
1
))
+
1
;
{
const
uchar
*
src_row
=
src
+
_srcstep
*
i
;
int
*
prev_sum_row
=
(
int
*
)((
uchar
*
)
sum
+
_sumstep
*
i
)
+
1
;
int
*
sum_row
=
(
int
*
)((
uchar
*
)
sum
+
_sumstep
*
(
i
+
1
))
+
1
;
sum_row
[
-
1
]
=
0
;
sum_row
[
-
1
]
=
0
;
v_int32
prev
=
vx_setzero_s32
();
int
j
=
0
;
for
(
;
j
+
v_uint16
::
nlanes
<=
width
;
j
+=
v_uint16
::
nlanes
)
{
v_int16
el8
=
v_reinterpret_as_s16
(
vx_load_expand
(
src_row
+
j
));
v_int32
el4l
,
el4h
;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i
vsum
=
_mm256_add_epi16
(
el8
.
val
,
_mm256_slli_si256
(
el8
.
val
,
2
));
vsum
=
_mm256_add_epi16
(
vsum
,
_mm256_slli_si256
(
vsum
,
4
));
vsum
=
_mm256_add_epi16
(
vsum
,
_mm256_slli_si256
(
vsum
,
8
));
__m256i
shmask
=
_mm256_set1_epi32
(
7
);
el4l
.
val
=
_mm256_add_epi32
(
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum
)),
prev
.
val
);
el4h
.
val
=
_mm256_add_epi32
(
_mm256_cvtepi16_epi32
(
_v256_extract_high
(
vsum
)),
_mm256_permutevar8x32_epi32
(
el4l
.
val
,
shmask
));
prev
.
val
=
_mm256_permutevar8x32_epi32
(
el4h
.
val
,
shmask
);
#else
el8
+=
v_rotate_left
<
1
>
(
el8
);
el8
+=
v_rotate_left
<
2
>
(
el8
);
#if CV_SIMD_WIDTH >= 32
el8
+=
v_rotate_left
<
4
>
(
el8
);
#if CV_SIMD_WIDTH == 64
el8
+=
v_rotate_left
<
8
>
(
el8
);
#endif
#endif
v_expand
(
el8
,
el4l
,
el4h
);
el4l
+=
prev
;
el4h
+=
el4l
;
prev
=
v_broadcast_element
<
v_int32
::
nlanes
-
1
>
(
el4h
);
#endif
v_store
(
sum_row
+
j
,
el4l
+
vx_load
(
prev_sum_row
+
j
));
v_store
(
sum_row
+
j
+
v_int32
::
nlanes
,
el4h
+
vx_load
(
prev_sum_row
+
j
+
v_int32
::
nlanes
));
}
v_int32
prev
=
vx_setzero_s32
();
for
(
int
v
=
sum_row
[
j
-
1
]
-
prev_sum_row
[
j
-
1
];
j
<
width
;
++
j
)
int
j
=
0
;
sum_row
[
j
]
=
(
v
+=
src_row
[
j
])
+
prev_sum_row
[
j
];
for
(
;
j
+
v_uint16
::
nlanes
<=
width
;
j
+=
v_uint16
::
nlanes
)
}
}
else
if
(
cn
==
2
)
{
// the others
v_int16
mask
=
vx_setall_s16
((
short
)
0xff
);
for
(
int
i
=
0
;
i
<
height
;
++
i
)
{
{
v_int16
el8
=
v_reinterpret_as_s16
(
vx_load_expand
(
src_row
+
j
));
const
uchar
*
src_row
=
src
+
_srcstep
*
i
;
v_int32
el4l
,
el4h
;
int
*
prev_sum_row
=
(
int
*
)((
uchar
*
)
sum
+
_sumstep
*
i
)
+
cn
;
int
*
sum_row
=
(
int
*
)((
uchar
*
)
sum
+
_sumstep
*
(
i
+
1
))
+
cn
;
sum_row
[
-
1
]
=
sum_row
[
-
2
]
=
0
;
v_int32
prev_1
=
vx_setzero_s32
(),
prev_2
=
vx_setzero_s32
();
int
j
=
0
;
for
(
;
j
+
v_uint16
::
nlanes
*
cn
<=
width
;
j
+=
v_uint16
::
nlanes
*
cn
)
{
v_int16
v_src_row
=
v_reinterpret_as_s16
(
vx_load
(
src_row
+
j
));
v_int16
el8_1
=
v_src_row
&
mask
;
v_int16
el8_2
=
v_reinterpret_as_s16
(
v_reinterpret_as_u16
(
v_src_row
)
>>
8
);
v_int32
el4l_1
,
el4h_1
,
el4l_2
,
el4h_2
;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i
vsum
=
_mm256_add_epi16
(
el8
.
val
,
_mm256_slli_si256
(
el8
.
val
,
2
));
__m256i
vsum_1
=
_mm256_add_epi16
(
el8_1
.
val
,
_mm256_slli_si256
(
el8_1
.
val
,
2
));
vsum
=
_mm256_add_epi16
(
vsum
,
_mm256_slli_si256
(
vsum
,
4
));
__m256i
vsum_2
=
_mm256_add_epi16
(
el8_2
.
val
,
_mm256_slli_si256
(
el8_2
.
val
,
2
));
vsum
=
_mm256_add_epi16
(
vsum
,
_mm256_slli_si256
(
vsum
,
8
));
vsum_1
=
_mm256_add_epi16
(
vsum_1
,
_mm256_slli_si256
(
vsum_1
,
4
));
__m256i
shmask
=
_mm256_set1_epi32
(
7
);
vsum_2
=
_mm256_add_epi16
(
vsum_2
,
_mm256_slli_si256
(
vsum_2
,
4
));
el4l
.
val
=
_mm256_add_epi32
(
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum
)),
prev
.
val
);
vsum_1
=
_mm256_add_epi16
(
vsum_1
,
_mm256_slli_si256
(
vsum_1
,
8
));
el4h
.
val
=
_mm256_add_epi32
(
_mm256_cvtepi16_epi32
(
_v256_extract_high
(
vsum
)),
_mm256_permutevar8x32_epi32
(
el4l
.
val
,
shmask
));
vsum_2
=
_mm256_add_epi16
(
vsum_2
,
_mm256_slli_si256
(
vsum_2
,
8
));
prev
.
val
=
_mm256_permutevar8x32_epi32
(
el4h
.
val
,
shmask
);
__m256i
shmask
=
_mm256_set1_epi32
(
7
);
el4l_1
.
val
=
_mm256_add_epi32
(
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum_1
)),
prev_1
.
val
);
el4l_2
.
val
=
_mm256_add_epi32
(
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum_2
)),
prev_2
.
val
);
el4h_1
.
val
=
_mm256_add_epi32
(
_mm256_cvtepi16_epi32
(
_v256_extract_high
(
vsum_1
)),
_mm256_permutevar8x32_epi32
(
el4l_1
.
val
,
shmask
));
el4h_2
.
val
=
_mm256_add_epi32
(
_mm256_cvtepi16_epi32
(
_v256_extract_high
(
vsum_2
)),
_mm256_permutevar8x32_epi32
(
el4l_2
.
val
,
shmask
));
prev_1
.
val
=
_mm256_permutevar8x32_epi32
(
el4h_1
.
val
,
shmask
);
prev_2
.
val
=
_mm256_permutevar8x32_epi32
(
el4h_2
.
val
,
shmask
);
#else
#else
el8
+=
v_rotate_left
<
1
>
(
el8
);
el8_1
+=
v_rotate_left
<
1
>
(
el8_1
);
el8
+=
v_rotate_left
<
2
>
(
el8
);
el8_2
+=
v_rotate_left
<
1
>
(
el8_2
);
el8_1
+=
v_rotate_left
<
2
>
(
el8_1
);
el8_2
+=
v_rotate_left
<
2
>
(
el8_2
);
#if CV_SIMD_WIDTH >= 32
#if CV_SIMD_WIDTH >= 32
el8
+=
v_rotate_left
<
4
>
(
el8
);
el8_1
+=
v_rotate_left
<
4
>
(
el8_1
);
el8_2
+=
v_rotate_left
<
4
>
(
el8_2
);
#if CV_SIMD_WIDTH == 64
#if CV_SIMD_WIDTH == 64
el8
+=
v_rotate_left
<
8
>
(
el8
);
el8_1
+=
v_rotate_left
<
8
>
(
el8_1
);
el8_2
+=
v_rotate_left
<
8
>
(
el8_2
);
#endif
#endif
#endif
#endif
v_expand
(
el8
,
el4l
,
el4h
);
v_expand
(
el8_1
,
el4l_1
,
el4h_1
);
el4l
+=
prev
;
v_expand
(
el8_2
,
el4l_2
,
el4h_2
);
el4h
+=
el4l
;
el4l_1
+=
prev_1
;
el4l_2
+=
prev_2
;
el4h_1
+=
el4l_1
;
el4h_2
+=
el4l_2
;
prev_1
=
v_broadcast_element
<
v_int32
::
nlanes
-
1
>
(
el4h_1
);
prev_2
=
v_broadcast_element
<
v_int32
::
nlanes
-
1
>
(
el4h_2
);
#endif
v_int32
el4_1
,
el4_2
,
el4_3
,
el4_4
;
v_zip
(
el4l_1
,
el4l_2
,
el4_1
,
el4_2
);
v_zip
(
el4h_1
,
el4h_2
,
el4_3
,
el4_4
);
v_store
(
sum_row
+
j
,
el4_1
+
vx_load
(
prev_sum_row
+
j
));
v_store
(
sum_row
+
j
+
v_int32
::
nlanes
,
el4_2
+
vx_load
(
prev_sum_row
+
j
+
v_int32
::
nlanes
));
v_store
(
sum_row
+
j
+
v_int32
::
nlanes
*
2
,
el4_3
+
vx_load
(
prev_sum_row
+
j
+
v_int32
::
nlanes
*
2
));
v_store
(
sum_row
+
j
+
v_int32
::
nlanes
*
3
,
el4_4
+
vx_load
(
prev_sum_row
+
j
+
v_int32
::
nlanes
*
3
));
}
for
(
int
v2
=
sum_row
[
j
-
1
]
-
prev_sum_row
[
j
-
1
],
v1
=
sum_row
[
j
-
2
]
-
prev_sum_row
[
j
-
2
];
j
<
width
;
j
+=
2
)
{
sum_row
[
j
]
=
(
v1
+=
src_row
[
j
])
+
prev_sum_row
[
j
];
sum_row
[
j
+
1
]
=
(
v2
+=
src_row
[
j
+
1
])
+
prev_sum_row
[
j
+
1
];
}
}
}
#if CV_SSE4_1 || !CV_SSE2
else
if
(
cn
==
3
)
{
// the others
for
(
int
i
=
0
;
i
<
height
;
++
i
)
{
const
uchar
*
src_row
=
src
+
_srcstep
*
i
;
int
*
prev_sum_row
=
(
int
*
)((
uchar
*
)
sum
+
_sumstep
*
i
)
+
cn
;
int
*
sum_row
=
(
int
*
)((
uchar
*
)
sum
+
_sumstep
*
(
i
+
1
))
+
cn
;
int
row_cache
[
v_int32
::
nlanes
*
6
];
prev
=
v_broadcast_element
<
v_int32
::
nlanes
-
1
>
(
el4h
);
sum_row
[
-
1
]
=
sum_row
[
-
2
]
=
sum_row
[
-
3
]
=
0
;
v_int32
prev_1
=
vx_setzero_s32
(),
prev_2
=
vx_setzero_s32
(),
prev_3
=
vx_setzero_s32
();
int
j
=
0
;
for
(
;
j
+
v_uint16
::
nlanes
*
cn
<=
width
;
j
+=
v_uint16
::
nlanes
*
cn
)
{
v_uint8
v_src_row_1
,
v_src_row_2
,
v_src_row_3
;
v_load_deinterleave
(
src_row
+
j
,
v_src_row_1
,
v_src_row_2
,
v_src_row_3
);
v_int16
el8_1
=
v_reinterpret_as_s16
(
v_expand_low
(
v_src_row_1
));
v_int16
el8_2
=
v_reinterpret_as_s16
(
v_expand_low
(
v_src_row_2
));
v_int16
el8_3
=
v_reinterpret_as_s16
(
v_expand_low
(
v_src_row_3
));
v_int32
el4l_1
,
el4h_1
,
el4l_2
,
el4h_2
,
el4l_3
,
el4h_3
;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i
vsum_1
=
_mm256_add_epi16
(
el8_1
.
val
,
_mm256_slli_si256
(
el8_1
.
val
,
2
));
__m256i
vsum_2
=
_mm256_add_epi16
(
el8_2
.
val
,
_mm256_slli_si256
(
el8_2
.
val
,
2
));
__m256i
vsum_3
=
_mm256_add_epi16
(
el8_3
.
val
,
_mm256_slli_si256
(
el8_3
.
val
,
2
));
vsum_1
=
_mm256_add_epi16
(
vsum_1
,
_mm256_slli_si256
(
vsum_1
,
4
));
vsum_2
=
_mm256_add_epi16
(
vsum_2
,
_mm256_slli_si256
(
vsum_2
,
4
));
vsum_3
=
_mm256_add_epi16
(
vsum_3
,
_mm256_slli_si256
(
vsum_3
,
4
));
vsum_1
=
_mm256_add_epi16
(
vsum_1
,
_mm256_slli_si256
(
vsum_1
,
8
));
vsum_2
=
_mm256_add_epi16
(
vsum_2
,
_mm256_slli_si256
(
vsum_2
,
8
));
vsum_3
=
_mm256_add_epi16
(
vsum_3
,
_mm256_slli_si256
(
vsum_3
,
8
));
__m256i
shmask
=
_mm256_set1_epi32
(
7
);
el4l_1
.
val
=
_mm256_add_epi32
(
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum_1
)),
prev_1
.
val
);
el4l_2
.
val
=
_mm256_add_epi32
(
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum_2
)),
prev_2
.
val
);
el4l_3
.
val
=
_mm256_add_epi32
(
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum_3
)),
prev_3
.
val
);
el4h_1
.
val
=
_mm256_add_epi32
(
_mm256_cvtepi16_epi32
(
_v256_extract_high
(
vsum_1
)),
_mm256_permutevar8x32_epi32
(
el4l_1
.
val
,
shmask
));
el4h_2
.
val
=
_mm256_add_epi32
(
_mm256_cvtepi16_epi32
(
_v256_extract_high
(
vsum_2
)),
_mm256_permutevar8x32_epi32
(
el4l_2
.
val
,
shmask
));
el4h_3
.
val
=
_mm256_add_epi32
(
_mm256_cvtepi16_epi32
(
_v256_extract_high
(
vsum_3
)),
_mm256_permutevar8x32_epi32
(
el4l_3
.
val
,
shmask
));
prev_1
.
val
=
_mm256_permutevar8x32_epi32
(
el4h_1
.
val
,
shmask
);
prev_2
.
val
=
_mm256_permutevar8x32_epi32
(
el4h_2
.
val
,
shmask
);
prev_3
.
val
=
_mm256_permutevar8x32_epi32
(
el4h_3
.
val
,
shmask
);
#else
el8_1
+=
v_rotate_left
<
1
>
(
el8_1
);
el8_2
+=
v_rotate_left
<
1
>
(
el8_2
);
el8_3
+=
v_rotate_left
<
1
>
(
el8_3
);
el8_1
+=
v_rotate_left
<
2
>
(
el8_1
);
el8_2
+=
v_rotate_left
<
2
>
(
el8_2
);
el8_3
+=
v_rotate_left
<
2
>
(
el8_3
);
#if CV_SIMD_WIDTH >= 32
el8_1
+=
v_rotate_left
<
4
>
(
el8_1
);
el8_2
+=
v_rotate_left
<
4
>
(
el8_2
);
el8_3
+=
v_rotate_left
<
4
>
(
el8_3
);
#if CV_SIMD_WIDTH == 64
el8_1
+=
v_rotate_left
<
8
>
(
el8_1
);
el8_2
+=
v_rotate_left
<
8
>
(
el8_2
);
el8_3
+=
v_rotate_left
<
8
>
(
el8_3
);
#endif
#endif
#endif
v_store
(
sum_row
+
j
,
el4l
+
vx_load
(
prev_sum_row
+
j
));
v_expand
(
el8_1
,
el4l_1
,
el4h_1
);
v_store
(
sum_row
+
j
+
v_int32
::
nlanes
,
el4h
+
vx_load
(
prev_sum_row
+
j
+
v_int32
::
nlanes
));
v_expand
(
el8_2
,
el4l_2
,
el4h_2
);
v_expand
(
el8_3
,
el4l_3
,
el4h_3
);
el4l_1
+=
prev_1
;
el4l_2
+=
prev_2
;
el4l_3
+=
prev_3
;
el4h_1
+=
el4l_1
;
el4h_2
+=
el4l_2
;
el4h_3
+=
el4l_3
;
prev_1
=
v_broadcast_element
<
v_int32
::
nlanes
-
1
>
(
el4h_1
);
prev_2
=
v_broadcast_element
<
v_int32
::
nlanes
-
1
>
(
el4h_2
);
prev_3
=
v_broadcast_element
<
v_int32
::
nlanes
-
1
>
(
el4h_3
);
#endif
v_store_interleave
(
row_cache
,
el4l_1
,
el4l_2
,
el4l_3
);
v_store_interleave
(
row_cache
+
v_int32
::
nlanes
*
3
,
el4h_1
,
el4h_2
,
el4h_3
);
el4l_1
=
vx_load
(
row_cache
);
el4l_2
=
vx_load
(
row_cache
+
v_int32
::
nlanes
);
el4l_3
=
vx_load
(
row_cache
+
v_int32
::
nlanes
*
2
);
el4h_1
=
vx_load
(
row_cache
+
v_int32
::
nlanes
*
3
);
el4h_2
=
vx_load
(
row_cache
+
v_int32
::
nlanes
*
4
);
el4h_3
=
vx_load
(
row_cache
+
v_int32
::
nlanes
*
5
);
v_store
(
sum_row
+
j
,
el4l_1
+
vx_load
(
prev_sum_row
+
j
));
v_store
(
sum_row
+
j
+
v_int32
::
nlanes
,
el4l_2
+
vx_load
(
prev_sum_row
+
j
+
v_int32
::
nlanes
));
v_store
(
sum_row
+
j
+
v_int32
::
nlanes
*
2
,
el4l_3
+
vx_load
(
prev_sum_row
+
j
+
v_int32
::
nlanes
*
2
));
v_store
(
sum_row
+
j
+
v_int32
::
nlanes
*
3
,
el4h_1
+
vx_load
(
prev_sum_row
+
j
+
v_int32
::
nlanes
*
3
));
v_store
(
sum_row
+
j
+
v_int32
::
nlanes
*
4
,
el4h_2
+
vx_load
(
prev_sum_row
+
j
+
v_int32
::
nlanes
*
4
));
v_store
(
sum_row
+
j
+
v_int32
::
nlanes
*
5
,
el4h_3
+
vx_load
(
prev_sum_row
+
j
+
v_int32
::
nlanes
*
5
));
}
for
(
int
v3
=
sum_row
[
j
-
1
]
-
prev_sum_row
[
j
-
1
],
v2
=
sum_row
[
j
-
2
]
-
prev_sum_row
[
j
-
2
],
v1
=
sum_row
[
j
-
3
]
-
prev_sum_row
[
j
-
3
];
j
<
width
;
j
+=
3
)
{
sum_row
[
j
]
=
(
v1
+=
src_row
[
j
])
+
prev_sum_row
[
j
];
sum_row
[
j
+
1
]
=
(
v2
+=
src_row
[
j
+
1
])
+
prev_sum_row
[
j
+
1
];
sum_row
[
j
+
2
]
=
(
v3
+=
src_row
[
j
+
2
])
+
prev_sum_row
[
j
+
2
];
}
}
}
}
#endif
else
if
(
cn
==
4
)
{
// the others
for
(
int
i
=
0
;
i
<
height
;
++
i
)
{
const
uchar
*
src_row
=
src
+
_srcstep
*
i
;
int
*
prev_sum_row
=
(
int
*
)((
uchar
*
)
sum
+
_sumstep
*
i
)
+
cn
;
int
*
sum_row
=
(
int
*
)((
uchar
*
)
sum
+
_sumstep
*
(
i
+
1
))
+
cn
;
for
(
int
v
=
sum_row
[
j
-
1
]
-
prev_sum_row
[
j
-
1
];
j
<
width
;
++
j
)
sum_row
[
-
1
]
=
sum_row
[
-
2
]
=
sum_row
[
-
3
]
=
sum_row
[
-
4
]
=
0
;
sum_row
[
j
]
=
(
v
+=
src_row
[
j
])
+
prev_sum_row
[
j
];
v_int32
prev
=
vx_setzero_s32
();
int
j
=
0
;
for
(
;
j
+
v_uint16
::
nlanes
<=
width
;
j
+=
v_uint16
::
nlanes
)
{
v_int16
el8
=
v_reinterpret_as_s16
(
vx_load_expand
(
src_row
+
j
));
v_int32
el4l
,
el4h
;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i
vsum
=
_mm256_add_epi16
(
el8
.
val
,
_mm256_slli_si256
(
el8
.
val
,
8
));
el4l
.
val
=
_mm256_add_epi32
(
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum
)),
prev
.
val
);
el4h
.
val
=
_mm256_add_epi32
(
_mm256_cvtepi16_epi32
(
_v256_extract_high
(
vsum
)),
_mm256_permute2x128_si256
(
el4l
.
val
,
el4l
.
val
,
0x31
));
prev
.
val
=
_mm256_permute2x128_si256
(
el4h
.
val
,
el4h
.
val
,
0x31
);
#else
#if CV_SIMD_WIDTH >= 32
el8
+=
v_rotate_left
<
4
>
(
el8
);
#if CV_SIMD_WIDTH == 64
el8
+=
v_rotate_left
<
8
>
(
el8
);
#endif
#endif
v_expand
(
el8
,
el4l
,
el4h
);
el4l
+=
prev
;
el4h
+=
el4l
;
#if CV_SIMD_WIDTH == 16
prev
=
el4h
;
#elif CV_SIMD_WIDTH == 32
prev
=
v_combine_high
(
el4h
,
el4h
);
#else
v_int32
t
=
v_rotate_right
<
12
>
(
el4h
);
t
|=
v_rotate_left
<
4
>
(
t
);
prev
=
v_combine_low
(
t
,
t
);
#endif
#endif
v_store
(
sum_row
+
j
,
el4l
+
vx_load
(
prev_sum_row
+
j
));
v_store
(
sum_row
+
j
+
v_int32
::
nlanes
,
el4h
+
vx_load
(
prev_sum_row
+
j
+
v_int32
::
nlanes
));
}
for
(
int
v4
=
sum_row
[
j
-
1
]
-
prev_sum_row
[
j
-
1
],
v3
=
sum_row
[
j
-
2
]
-
prev_sum_row
[
j
-
2
],
v2
=
sum_row
[
j
-
3
]
-
prev_sum_row
[
j
-
3
],
v1
=
sum_row
[
j
-
4
]
-
prev_sum_row
[
j
-
4
];
j
<
width
;
j
+=
4
)
{
sum_row
[
j
]
=
(
v1
+=
src_row
[
j
])
+
prev_sum_row
[
j
];
sum_row
[
j
+
1
]
=
(
v2
+=
src_row
[
j
+
1
])
+
prev_sum_row
[
j
+
1
];
sum_row
[
j
+
2
]
=
(
v3
+=
src_row
[
j
+
2
])
+
prev_sum_row
[
j
+
2
];
sum_row
[
j
+
3
]
=
(
v4
+=
src_row
[
j
+
3
])
+
prev_sum_row
[
j
+
3
];
}
}
}
}
else
{
return
false
;
}
vx_cleanup
();
return
true
;
return
true
;
}
}
};
};
...
@@ -180,62 +401,700 @@ struct Integral_SIMD<uchar, float, double>
...
@@ -180,62 +401,700 @@ struct Integral_SIMD<uchar, float, double>
float
*
tilted
,
size_t
,
float
*
tilted
,
size_t
,
int
width
,
int
height
,
int
cn
)
const
int
width
,
int
height
,
int
cn
)
const
{
{
if
(
sqsum
||
tilted
||
cn
!=
1
)
if
(
sqsum
||
tilted
||
cn
>
4
)
return
false
;
return
false
;
width
*=
cn
;
// the first iteration
// the first iteration
memset
(
sum
,
0
,
(
width
+
1
)
*
sizeof
(
in
t
));
memset
(
sum
,
0
,
(
width
+
cn
)
*
sizeof
(
floa
t
));
// the others
if
(
cn
==
1
)
for
(
int
i
=
0
;
i
<
height
;
++
i
)
{
{
const
uchar
*
src_row
=
src
+
_srcstep
*
i
;
// the others
float
*
prev_sum_row
=
(
float
*
)((
uchar
*
)
sum
+
_sumstep
*
i
)
+
1
;
for
(
int
i
=
0
;
i
<
height
;
++
i
)
float
*
sum_row
=
(
float
*
)((
uchar
*
)
sum
+
_sumstep
*
(
i
+
1
))
+
1
;
{
const
uchar
*
src_row
=
src
+
_srcstep
*
i
;
float
*
prev_sum_row
=
(
float
*
)((
uchar
*
)
sum
+
_sumstep
*
i
)
+
1
;
float
*
sum_row
=
(
float
*
)((
uchar
*
)
sum
+
_sumstep
*
(
i
+
1
))
+
1
;
sum_row
[
-
1
]
=
0
;
sum_row
[
-
1
]
=
0
;
v_float32
prev
=
vx_setzero_f32
();
int
j
=
0
;
for
(;
j
+
v_uint16
::
nlanes
<=
width
;
j
+=
v_uint16
::
nlanes
)
{
v_int16
el8
=
v_reinterpret_as_s16
(
vx_load_expand
(
src_row
+
j
));
v_float32
el4l
,
el4h
;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i
vsum
=
_mm256_add_epi16
(
el8
.
val
,
_mm256_slli_si256
(
el8
.
val
,
2
));
vsum
=
_mm256_add_epi16
(
vsum
,
_mm256_slli_si256
(
vsum
,
4
));
vsum
=
_mm256_add_epi16
(
vsum
,
_mm256_slli_si256
(
vsum
,
8
));
__m256i
shmask
=
_mm256_set1_epi32
(
7
);
el4l
.
val
=
_mm256_add_ps
(
_mm256_cvtepi32_ps
(
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum
))),
prev
.
val
);
el4h
.
val
=
_mm256_add_ps
(
_mm256_cvtepi32_ps
(
_mm256_cvtepi16_epi32
(
_v256_extract_high
(
vsum
))),
_mm256_permutevar8x32_ps
(
el4l
.
val
,
shmask
));
prev
.
val
=
_mm256_permutevar8x32_ps
(
el4h
.
val
,
shmask
);
#else
el8
+=
v_rotate_left
<
1
>
(
el8
);
el8
+=
v_rotate_left
<
2
>
(
el8
);
#if CV_SIMD_WIDTH >= 32
el8
+=
v_rotate_left
<
4
>
(
el8
);
#if CV_SIMD_WIDTH == 64
el8
+=
v_rotate_left
<
8
>
(
el8
);
#endif
#endif
v_int32
el4li
,
el4hi
;
v_expand
(
el8
,
el4li
,
el4hi
);
el4l
=
v_cvt_f32
(
el4li
)
+
prev
;
el4h
=
v_cvt_f32
(
el4hi
)
+
el4l
;
prev
=
v_broadcast_element
<
v_float32
::
nlanes
-
1
>
(
el4h
);
#endif
v_store
(
sum_row
+
j
,
el4l
+
vx_load
(
prev_sum_row
+
j
));
v_store
(
sum_row
+
j
+
v_float32
::
nlanes
,
el4h
+
vx_load
(
prev_sum_row
+
j
+
v_float32
::
nlanes
));
}
v_float32
prev
=
vx_setzero_f32
();
for
(
float
v
=
sum_row
[
j
-
1
]
-
prev_sum_row
[
j
-
1
];
j
<
width
;
++
j
)
int
j
=
0
;
sum_row
[
j
]
=
(
v
+=
src_row
[
j
])
+
prev_sum_row
[
j
];
for
(;
j
+
v_uint16
::
nlanes
<=
width
;
j
+=
v_uint16
::
nlanes
)
}
}
else
if
(
cn
==
2
)
{
// the others
v_int16
mask
=
vx_setall_s16
((
short
)
0xff
);
for
(
int
i
=
0
;
i
<
height
;
++
i
)
{
{
v_int16
el8
=
v_reinterpret_as_s16
(
vx_load_expand
(
src_row
+
j
));
const
uchar
*
src_row
=
src
+
_srcstep
*
i
;
v_float32
el4l
,
el4h
;
float
*
prev_sum_row
=
(
float
*
)((
uchar
*
)
sum
+
_sumstep
*
i
)
+
cn
;
float
*
sum_row
=
(
float
*
)((
uchar
*
)
sum
+
_sumstep
*
(
i
+
1
))
+
cn
;
sum_row
[
-
1
]
=
sum_row
[
-
2
]
=
0
;
v_float32
prev_1
=
vx_setzero_f32
(),
prev_2
=
vx_setzero_f32
();
int
j
=
0
;
for
(;
j
+
v_uint16
::
nlanes
*
cn
<=
width
;
j
+=
v_uint16
::
nlanes
*
cn
)
{
v_int16
v_src_row
=
v_reinterpret_as_s16
(
vx_load
(
src_row
+
j
));
v_int16
el8_1
=
v_src_row
&
mask
;
v_int16
el8_2
=
v_reinterpret_as_s16
(
v_reinterpret_as_u16
(
v_src_row
)
>>
8
);
v_float32
el4l_1
,
el4h_1
,
el4l_2
,
el4h_2
;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i
vsum
=
_mm256_add_epi16
(
el8
.
val
,
_mm256_slli_si256
(
el8
.
val
,
2
));
__m256i
vsum_1
=
_mm256_add_epi16
(
el8_1
.
val
,
_mm256_slli_si256
(
el8_1
.
val
,
2
));
vsum
=
_mm256_add_epi16
(
vsum
,
_mm256_slli_si256
(
vsum
,
4
));
__m256i
vsum_2
=
_mm256_add_epi16
(
el8_2
.
val
,
_mm256_slli_si256
(
el8_2
.
val
,
2
));
vsum
=
_mm256_add_epi16
(
vsum
,
_mm256_slli_si256
(
vsum
,
8
));
vsum_1
=
_mm256_add_epi16
(
vsum_1
,
_mm256_slli_si256
(
vsum_1
,
4
));
__m256i
shmask
=
_mm256_set1_epi32
(
7
);
vsum_2
=
_mm256_add_epi16
(
vsum_2
,
_mm256_slli_si256
(
vsum_2
,
4
));
el4l
.
val
=
_mm256_add_ps
(
_mm256_cvtepi32_ps
(
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum
))),
prev
.
val
);
vsum_1
=
_mm256_add_epi16
(
vsum_1
,
_mm256_slli_si256
(
vsum_1
,
8
));
el4h
.
val
=
_mm256_add_ps
(
_mm256_cvtepi32_ps
(
_mm256_cvtepi16_epi32
(
_v256_extract_high
(
vsum
))),
_mm256_permutevar8x32_ps
(
el4l
.
val
,
shmask
));
vsum_2
=
_mm256_add_epi16
(
vsum_2
,
_mm256_slli_si256
(
vsum_2
,
8
));
prev
.
val
=
_mm256_permutevar8x32_ps
(
el4h
.
val
,
shmask
);
__m256i
shmask
=
_mm256_set1_epi32
(
7
);
el4l_1
.
val
=
_mm256_add_ps
(
_mm256_cvtepi32_ps
(
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum_1
))),
prev_1
.
val
);
el4l_2
.
val
=
_mm256_add_ps
(
_mm256_cvtepi32_ps
(
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum_2
))),
prev_2
.
val
);
el4h_1
.
val
=
_mm256_add_ps
(
_mm256_cvtepi32_ps
(
_mm256_cvtepi16_epi32
(
_v256_extract_high
(
vsum_1
))),
_mm256_permutevar8x32_ps
(
el4l_1
.
val
,
shmask
));
el4h_2
.
val
=
_mm256_add_ps
(
_mm256_cvtepi32_ps
(
_mm256_cvtepi16_epi32
(
_v256_extract_high
(
vsum_2
))),
_mm256_permutevar8x32_ps
(
el4l_2
.
val
,
shmask
));
prev_1
.
val
=
_mm256_permutevar8x32_ps
(
el4h_1
.
val
,
shmask
);
prev_2
.
val
=
_mm256_permutevar8x32_ps
(
el4h_2
.
val
,
shmask
);
#else
#else
el8
+=
v_rotate_left
<
1
>
(
el8
);
el8_1
+=
v_rotate_left
<
1
>
(
el8_1
);
el8
+=
v_rotate_left
<
2
>
(
el8
);
el8_2
+=
v_rotate_left
<
1
>
(
el8_2
);
el8_1
+=
v_rotate_left
<
2
>
(
el8_1
);
el8_2
+=
v_rotate_left
<
2
>
(
el8_2
);
#if CV_SIMD_WIDTH >= 32
#if CV_SIMD_WIDTH >= 32
el8
+=
v_rotate_left
<
4
>
(
el8
);
el8_1
+=
v_rotate_left
<
4
>
(
el8_1
);
el8_2
+=
v_rotate_left
<
4
>
(
el8_2
);
#if CV_SIMD_WIDTH == 64
#if CV_SIMD_WIDTH == 64
el8
+=
v_rotate_left
<
8
>
(
el8
);
el8_1
+=
v_rotate_left
<
8
>
(
el8_1
);
el8_2
+=
v_rotate_left
<
8
>
(
el8_2
);
#endif
#endif
#endif
v_int32
el4li_1
,
el4hi_1
,
el4li_2
,
el4hi_2
;
v_expand
(
el8_1
,
el4li_1
,
el4hi_1
);
v_expand
(
el8_2
,
el4li_2
,
el4hi_2
);
el4l_1
=
v_cvt_f32
(
el4li_1
)
+
prev_1
;
el4l_2
=
v_cvt_f32
(
el4li_2
)
+
prev_2
;
el4h_1
=
v_cvt_f32
(
el4hi_1
)
+
el4l_1
;
el4h_2
=
v_cvt_f32
(
el4hi_2
)
+
el4l_2
;
prev_1
=
v_broadcast_element
<
v_float32
::
nlanes
-
1
>
(
el4h_1
);
prev_2
=
v_broadcast_element
<
v_float32
::
nlanes
-
1
>
(
el4h_2
);
#endif
#endif
v_int32
el4li
,
el4hi
;
v_float32
el4_1
,
el4_2
,
el4_3
,
el4_4
;
v_expand
(
el8
,
el4li
,
el4hi
);
v_zip
(
el4l_1
,
el4l_2
,
el4_1
,
el4_2
);
el4l
=
v_cvt_f32
(
el4li
)
+
prev
;
v_zip
(
el4h_1
,
el4h_2
,
el4_3
,
el4_4
);
el4h
=
v_cvt_f32
(
el4hi
)
+
el4l
;
v_store
(
sum_row
+
j
,
el4_1
+
vx_load
(
prev_sum_row
+
j
));
v_store
(
sum_row
+
j
+
v_float32
::
nlanes
,
el4_2
+
vx_load
(
prev_sum_row
+
j
+
v_float32
::
nlanes
));
v_store
(
sum_row
+
j
+
v_float32
::
nlanes
*
2
,
el4_3
+
vx_load
(
prev_sum_row
+
j
+
v_float32
::
nlanes
*
2
));
v_store
(
sum_row
+
j
+
v_float32
::
nlanes
*
3
,
el4_4
+
vx_load
(
prev_sum_row
+
j
+
v_float32
::
nlanes
*
3
));
}
for
(
float
v2
=
sum_row
[
j
-
1
]
-
prev_sum_row
[
j
-
1
],
v1
=
sum_row
[
j
-
2
]
-
prev_sum_row
[
j
-
2
];
j
<
width
;
j
+=
2
)
{
sum_row
[
j
]
=
(
v1
+=
src_row
[
j
])
+
prev_sum_row
[
j
];
sum_row
[
j
+
1
]
=
(
v2
+=
src_row
[
j
+
1
])
+
prev_sum_row
[
j
+
1
];
}
}
}
else
if
(
cn
==
3
)
{
// the others
for
(
int
i
=
0
;
i
<
height
;
++
i
)
{
const
uchar
*
src_row
=
src
+
_srcstep
*
i
;
float
*
prev_sum_row
=
(
float
*
)((
uchar
*
)
sum
+
_sumstep
*
i
)
+
cn
;
float
*
sum_row
=
(
float
*
)((
uchar
*
)
sum
+
_sumstep
*
(
i
+
1
))
+
cn
;
float
row_cache
[
v_float32
::
nlanes
*
6
];
sum_row
[
-
1
]
=
sum_row
[
-
2
]
=
sum_row
[
-
3
]
=
0
;
prev
=
v_broadcast_element
<
v_float32
::
nlanes
-
1
>
(
el4h
);
v_float32
prev_1
=
vx_setzero_f32
(),
prev_2
=
vx_setzero_f32
(),
prev_3
=
vx_setzero_f32
();
int
j
=
0
;
for
(;
j
+
v_uint16
::
nlanes
*
cn
<=
width
;
j
+=
v_uint16
::
nlanes
*
cn
)
{
v_uint8
v_src_row_1
,
v_src_row_2
,
v_src_row_3
;
v_load_deinterleave
(
src_row
+
j
,
v_src_row_1
,
v_src_row_2
,
v_src_row_3
);
v_int16
el8_1
=
v_reinterpret_as_s16
(
v_expand_low
(
v_src_row_1
));
v_int16
el8_2
=
v_reinterpret_as_s16
(
v_expand_low
(
v_src_row_2
));
v_int16
el8_3
=
v_reinterpret_as_s16
(
v_expand_low
(
v_src_row_3
));
v_float32
el4l_1
,
el4h_1
,
el4l_2
,
el4h_2
,
el4l_3
,
el4h_3
;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i
vsum_1
=
_mm256_add_epi16
(
el8_1
.
val
,
_mm256_slli_si256
(
el8_1
.
val
,
2
));
__m256i
vsum_2
=
_mm256_add_epi16
(
el8_2
.
val
,
_mm256_slli_si256
(
el8_2
.
val
,
2
));
__m256i
vsum_3
=
_mm256_add_epi16
(
el8_3
.
val
,
_mm256_slli_si256
(
el8_3
.
val
,
2
));
vsum_1
=
_mm256_add_epi16
(
vsum_1
,
_mm256_slli_si256
(
vsum_1
,
4
));
vsum_2
=
_mm256_add_epi16
(
vsum_2
,
_mm256_slli_si256
(
vsum_2
,
4
));
vsum_3
=
_mm256_add_epi16
(
vsum_3
,
_mm256_slli_si256
(
vsum_3
,
4
));
vsum_1
=
_mm256_add_epi16
(
vsum_1
,
_mm256_slli_si256
(
vsum_1
,
8
));
vsum_2
=
_mm256_add_epi16
(
vsum_2
,
_mm256_slli_si256
(
vsum_2
,
8
));
vsum_3
=
_mm256_add_epi16
(
vsum_3
,
_mm256_slli_si256
(
vsum_3
,
8
));
__m256i
shmask
=
_mm256_set1_epi32
(
7
);
el4l_1
.
val
=
_mm256_add_ps
(
_mm256_cvtepi32_ps
(
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum_1
))),
prev_1
.
val
);
el4l_2
.
val
=
_mm256_add_ps
(
_mm256_cvtepi32_ps
(
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum_2
))),
prev_2
.
val
);
el4l_3
.
val
=
_mm256_add_ps
(
_mm256_cvtepi32_ps
(
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum_3
))),
prev_3
.
val
);
el4h_1
.
val
=
_mm256_add_ps
(
_mm256_cvtepi32_ps
(
_mm256_cvtepi16_epi32
(
_v256_extract_high
(
vsum_1
))),
_mm256_permutevar8x32_ps
(
el4l_1
.
val
,
shmask
));
el4h_2
.
val
=
_mm256_add_ps
(
_mm256_cvtepi32_ps
(
_mm256_cvtepi16_epi32
(
_v256_extract_high
(
vsum_2
))),
_mm256_permutevar8x32_ps
(
el4l_2
.
val
,
shmask
));
el4h_3
.
val
=
_mm256_add_ps
(
_mm256_cvtepi32_ps
(
_mm256_cvtepi16_epi32
(
_v256_extract_high
(
vsum_3
))),
_mm256_permutevar8x32_ps
(
el4l_3
.
val
,
shmask
));
prev_1
.
val
=
_mm256_permutevar8x32_ps
(
el4h_1
.
val
,
shmask
);
prev_2
.
val
=
_mm256_permutevar8x32_ps
(
el4h_2
.
val
,
shmask
);
prev_3
.
val
=
_mm256_permutevar8x32_ps
(
el4h_3
.
val
,
shmask
);
#else
el8_1
+=
v_rotate_left
<
1
>
(
el8_1
);
el8_2
+=
v_rotate_left
<
1
>
(
el8_2
);
el8_3
+=
v_rotate_left
<
1
>
(
el8_3
);
el8_1
+=
v_rotate_left
<
2
>
(
el8_1
);
el8_2
+=
v_rotate_left
<
2
>
(
el8_2
);
el8_3
+=
v_rotate_left
<
2
>
(
el8_3
);
#if CV_SIMD_WIDTH >= 32
el8_1
+=
v_rotate_left
<
4
>
(
el8_1
);
el8_2
+=
v_rotate_left
<
4
>
(
el8_2
);
el8_3
+=
v_rotate_left
<
4
>
(
el8_3
);
#if CV_SIMD_WIDTH == 64
el8_1
+=
v_rotate_left
<
8
>
(
el8_1
);
el8_2
+=
v_rotate_left
<
8
>
(
el8_2
);
el8_3
+=
v_rotate_left
<
8
>
(
el8_3
);
#endif
#endif
v_int32
el4li_1
,
el4hi_1
,
el4li_2
,
el4hi_2
,
el4li_3
,
el4hi_3
;
v_expand
(
el8_1
,
el4li_1
,
el4hi_1
);
v_expand
(
el8_2
,
el4li_2
,
el4hi_2
);
v_expand
(
el8_3
,
el4li_3
,
el4hi_3
);
el4l_1
=
v_cvt_f32
(
el4li_1
)
+
prev_1
;
el4l_2
=
v_cvt_f32
(
el4li_2
)
+
prev_2
;
el4l_3
=
v_cvt_f32
(
el4li_3
)
+
prev_3
;
el4h_1
=
v_cvt_f32
(
el4hi_1
)
+
el4l_1
;
el4h_2
=
v_cvt_f32
(
el4hi_2
)
+
el4l_2
;
el4h_3
=
v_cvt_f32
(
el4hi_3
)
+
el4l_3
;
prev_1
=
v_broadcast_element
<
v_float32
::
nlanes
-
1
>
(
el4h_1
);
prev_2
=
v_broadcast_element
<
v_float32
::
nlanes
-
1
>
(
el4h_2
);
prev_3
=
v_broadcast_element
<
v_float32
::
nlanes
-
1
>
(
el4h_3
);
#endif
#endif
v_store
(
sum_row
+
j
,
el4l
+
vx_load
(
prev_sum_row
+
j
));
v_store_interleave
(
row_cache
,
el4l_1
,
el4l_2
,
el4l_3
);
v_store
(
sum_row
+
j
+
v_float32
::
nlanes
,
el4h
+
vx_load
(
prev_sum_row
+
j
+
v_float32
::
nlanes
));
v_store_interleave
(
row_cache
+
v_float32
::
nlanes
*
3
,
el4h_1
,
el4h_2
,
el4h_3
);
el4l_1
=
vx_load
(
row_cache
);
el4l_2
=
vx_load
(
row_cache
+
v_float32
::
nlanes
);
el4l_3
=
vx_load
(
row_cache
+
v_float32
::
nlanes
*
2
);
el4h_1
=
vx_load
(
row_cache
+
v_float32
::
nlanes
*
3
);
el4h_2
=
vx_load
(
row_cache
+
v_float32
::
nlanes
*
4
);
el4h_3
=
vx_load
(
row_cache
+
v_float32
::
nlanes
*
5
);
v_store
(
sum_row
+
j
,
el4l_1
+
vx_load
(
prev_sum_row
+
j
));
v_store
(
sum_row
+
j
+
v_float32
::
nlanes
,
el4l_2
+
vx_load
(
prev_sum_row
+
j
+
v_float32
::
nlanes
));
v_store
(
sum_row
+
j
+
v_float32
::
nlanes
*
2
,
el4l_3
+
vx_load
(
prev_sum_row
+
j
+
v_float32
::
nlanes
*
2
));
v_store
(
sum_row
+
j
+
v_float32
::
nlanes
*
3
,
el4h_1
+
vx_load
(
prev_sum_row
+
j
+
v_float32
::
nlanes
*
3
));
v_store
(
sum_row
+
j
+
v_float32
::
nlanes
*
4
,
el4h_2
+
vx_load
(
prev_sum_row
+
j
+
v_float32
::
nlanes
*
4
));
v_store
(
sum_row
+
j
+
v_float32
::
nlanes
*
5
,
el4h_3
+
vx_load
(
prev_sum_row
+
j
+
v_float32
::
nlanes
*
5
));
}
for
(
float
v3
=
sum_row
[
j
-
1
]
-
prev_sum_row
[
j
-
1
],
v2
=
sum_row
[
j
-
2
]
-
prev_sum_row
[
j
-
2
],
v1
=
sum_row
[
j
-
3
]
-
prev_sum_row
[
j
-
3
];
j
<
width
;
j
+=
3
)
{
sum_row
[
j
]
=
(
v1
+=
src_row
[
j
])
+
prev_sum_row
[
j
];
sum_row
[
j
+
1
]
=
(
v2
+=
src_row
[
j
+
1
])
+
prev_sum_row
[
j
+
1
];
sum_row
[
j
+
2
]
=
(
v3
+=
src_row
[
j
+
2
])
+
prev_sum_row
[
j
+
2
];
}
}
}
}
else
if
(
cn
==
4
)
{
// the others
for
(
int
i
=
0
;
i
<
height
;
++
i
)
{
const
uchar
*
src_row
=
src
+
_srcstep
*
i
;
float
*
prev_sum_row
=
(
float
*
)((
uchar
*
)
sum
+
_sumstep
*
i
)
+
cn
;
float
*
sum_row
=
(
float
*
)((
uchar
*
)
sum
+
_sumstep
*
(
i
+
1
))
+
cn
;
sum_row
[
-
1
]
=
sum_row
[
-
2
]
=
sum_row
[
-
3
]
=
sum_row
[
-
4
]
=
0
;
for
(
float
v
=
sum_row
[
j
-
1
]
-
prev_sum_row
[
j
-
1
];
j
<
width
;
++
j
)
v_float32
prev
=
vx_setzero_f32
();
sum_row
[
j
]
=
(
v
+=
src_row
[
j
])
+
prev_sum_row
[
j
];
int
j
=
0
;
for
(
;
j
+
v_uint16
::
nlanes
<=
width
;
j
+=
v_uint16
::
nlanes
)
{
v_int16
el8
=
v_reinterpret_as_s16
(
vx_load_expand
(
src_row
+
j
));
v_float32
el4l
,
el4h
;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i
vsum
=
_mm256_add_epi16
(
el8
.
val
,
_mm256_slli_si256
(
el8
.
val
,
8
));
el4l
.
val
=
_mm256_add_ps
(
_mm256_cvtepi32_ps
(
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum
))),
prev
.
val
);
el4h
.
val
=
_mm256_add_ps
(
_mm256_cvtepi32_ps
(
_mm256_cvtepi16_epi32
(
_v256_extract_high
(
vsum
))),
_mm256_permute2f128_ps
(
el4l
.
val
,
el4l
.
val
,
0x31
));
prev
.
val
=
_mm256_permute2f128_ps
(
el4h
.
val
,
el4h
.
val
,
0x31
);
#else
#if CV_SIMD_WIDTH >= 32
el8
+=
v_rotate_left
<
4
>
(
el8
);
#if CV_SIMD_WIDTH == 64
el8
+=
v_rotate_left
<
8
>
(
el8
);
#endif
#endif
v_int32
el4li
,
el4hi
;
v_expand
(
el8
,
el4li
,
el4hi
);
el4l
=
v_cvt_f32
(
el4li
)
+
prev
;
el4h
=
v_cvt_f32
(
el4hi
)
+
el4l
;
#if CV_SIMD_WIDTH == 16
prev
=
el4h
;
#elif CV_SIMD_WIDTH == 32
prev
=
v_combine_high
(
el4h
,
el4h
);
#else
v_float32
t
=
v_rotate_right
<
12
>
(
el4h
);
t
|=
v_rotate_left
<
4
>
(
t
);
prev
=
v_combine_low
(
t
,
t
);
#endif
#endif
v_store
(
sum_row
+
j
,
el4l
+
vx_load
(
prev_sum_row
+
j
));
v_store
(
sum_row
+
j
+
v_float32
::
nlanes
,
el4h
+
vx_load
(
prev_sum_row
+
j
+
v_float32
::
nlanes
));
}
for
(
float
v4
=
sum_row
[
j
-
1
]
-
prev_sum_row
[
j
-
1
],
v3
=
sum_row
[
j
-
2
]
-
prev_sum_row
[
j
-
2
],
v2
=
sum_row
[
j
-
3
]
-
prev_sum_row
[
j
-
3
],
v1
=
sum_row
[
j
-
4
]
-
prev_sum_row
[
j
-
4
];
j
<
width
;
j
+=
4
)
{
sum_row
[
j
]
=
(
v1
+=
src_row
[
j
])
+
prev_sum_row
[
j
];
sum_row
[
j
+
1
]
=
(
v2
+=
src_row
[
j
+
1
])
+
prev_sum_row
[
j
+
1
];
sum_row
[
j
+
2
]
=
(
v3
+=
src_row
[
j
+
2
])
+
prev_sum_row
[
j
+
2
];
sum_row
[
j
+
3
]
=
(
v4
+=
src_row
[
j
+
3
])
+
prev_sum_row
[
j
+
3
];
}
}
}
else
{
return
false
;
}
}
vx_cleanup
();
return
true
;
return
true
;
}
}
};
};
#if CV_SIMD128_64F
template
<>
struct
Integral_SIMD
<
uchar
,
double
,
double
>
{
Integral_SIMD
()
{}
bool
operator
()(
const
uchar
*
src
,
size_t
_srcstep
,
double
*
sum
,
size_t
_sumstep
,
double
*
sqsum
,
size_t
_sqsumstep
,
double
*
tilted
,
size_t
,
int
width
,
int
height
,
int
cn
)
const
{
#if CV_AVX512_SKX
if
(
!
tilted
&&
cn
<=
4
&&
(
cn
>
1
||
sqsum
))
{
calculate_integral_avx512
(
src
,
_srcstep
,
sum
,
_sumstep
,
sqsum
,
_sqsumstep
,
width
,
height
,
cn
);
return
true
;
}
#else
CV_UNUSED
(
_sqsumstep
);
#endif
if
(
sqsum
||
tilted
||
cn
>
4
)
return
false
;
width
*=
cn
;
// the first iteration
memset
(
sum
,
0
,
(
width
+
cn
)
*
sizeof
(
double
));
if
(
cn
==
1
)
{
// the others
for
(
int
i
=
0
;
i
<
height
;
++
i
)
{
const
uchar
*
src_row
=
src
+
_srcstep
*
i
;
double
*
prev_sum_row
=
(
double
*
)((
uchar
*
)
sum
+
_sumstep
*
i
)
+
1
;
double
*
sum_row
=
(
double
*
)((
uchar
*
)
sum
+
_sumstep
*
(
i
+
1
))
+
1
;
sum_row
[
-
1
]
=
0
;
v_float64
prev
=
vx_setzero_f64
();
int
j
=
0
;
for
(;
j
+
v_uint16
::
nlanes
<=
width
;
j
+=
v_uint16
::
nlanes
)
{
v_int16
el8
=
v_reinterpret_as_s16
(
vx_load_expand
(
src_row
+
j
));
v_float64
el4ll
,
el4lh
,
el4hl
,
el4hh
;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i
vsum
=
_mm256_add_epi16
(
el8
.
val
,
_mm256_slli_si256
(
el8
.
val
,
2
));
vsum
=
_mm256_add_epi16
(
vsum
,
_mm256_slli_si256
(
vsum
,
4
));
vsum
=
_mm256_add_epi16
(
vsum
,
_mm256_slli_si256
(
vsum
,
8
));
__m256i
el4l_32
=
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum
));
__m256i
el4h_32
=
_mm256_cvtepi16_epi32
(
_v256_extract_high
(
vsum
));
el4ll
.
val
=
_mm256_add_pd
(
_mm256_cvtepi32_pd
(
_v256_extract_low
(
el4l_32
)),
prev
.
val
);
el4lh
.
val
=
_mm256_add_pd
(
_mm256_cvtepi32_pd
(
_v256_extract_high
(
el4l_32
)),
prev
.
val
);
__m256d
el4d
=
_mm256_permute4x64_pd
(
el4lh
.
val
,
0xff
);
el4hl
.
val
=
_mm256_add_pd
(
_mm256_cvtepi32_pd
(
_v256_extract_low
(
el4h_32
)),
el4d
);
el4hh
.
val
=
_mm256_add_pd
(
_mm256_cvtepi32_pd
(
_v256_extract_high
(
el4h_32
)),
el4d
);
prev
.
val
=
_mm256_permute4x64_pd
(
el4hh
.
val
,
0xff
);
#else
el8
+=
v_rotate_left
<
1
>
(
el8
);
el8
+=
v_rotate_left
<
2
>
(
el8
);
#if CV_SIMD_WIDTH >= 32
el8
+=
v_rotate_left
<
4
>
(
el8
);
#if CV_SIMD_WIDTH == 64
el8
+=
v_rotate_left
<
8
>
(
el8
);
#endif
#endif
v_int32
el4li
,
el4hi
;
v_expand
(
el8
,
el4li
,
el4hi
);
el4ll
=
v_cvt_f64
(
el4li
)
+
prev
;
el4lh
=
v_cvt_f64_high
(
el4li
)
+
prev
;
el4hl
=
v_cvt_f64
(
el4hi
)
+
el4ll
;
el4hh
=
v_cvt_f64_high
(
el4hi
)
+
el4lh
;
prev
=
vx_setall_f64
(
v_extract_n
<
v_float64
::
nlanes
-
1
>
(
el4hh
));
// prev = v_broadcast_element<v_float64::nlanes - 1>(el4hh);
#endif
v_store
(
sum_row
+
j
,
el4ll
+
vx_load
(
prev_sum_row
+
j
));
v_store
(
sum_row
+
j
+
v_float64
::
nlanes
,
el4lh
+
vx_load
(
prev_sum_row
+
j
+
v_float64
::
nlanes
));
v_store
(
sum_row
+
j
+
v_float64
::
nlanes
*
2
,
el4hl
+
vx_load
(
prev_sum_row
+
j
+
v_float64
::
nlanes
*
2
));
v_store
(
sum_row
+
j
+
v_float64
::
nlanes
*
3
,
el4hh
+
vx_load
(
prev_sum_row
+
j
+
v_float64
::
nlanes
*
3
));
}
for
(
double
v
=
sum_row
[
j
-
1
]
-
prev_sum_row
[
j
-
1
];
j
<
width
;
++
j
)
sum_row
[
j
]
=
(
v
+=
src_row
[
j
])
+
prev_sum_row
[
j
];
}
}
else
if
(
cn
==
2
)
{
// the others
v_int16
mask
=
vx_setall_s16
((
short
)
0xff
);
for
(
int
i
=
0
;
i
<
height
;
++
i
)
{
const
uchar
*
src_row
=
src
+
_srcstep
*
i
;
double
*
prev_sum_row
=
(
double
*
)((
uchar
*
)
sum
+
_sumstep
*
i
)
+
cn
;
double
*
sum_row
=
(
double
*
)((
uchar
*
)
sum
+
_sumstep
*
(
i
+
1
))
+
cn
;
sum_row
[
-
1
]
=
sum_row
[
-
2
]
=
0
;
v_float64
prev_1
=
vx_setzero_f64
(),
prev_2
=
vx_setzero_f64
();
int
j
=
0
;
for
(;
j
+
v_uint16
::
nlanes
*
cn
<=
width
;
j
+=
v_uint16
::
nlanes
*
cn
)
{
v_int16
v_src_row
=
v_reinterpret_as_s16
(
vx_load
(
src_row
+
j
));
v_int16
el8_1
=
v_src_row
&
mask
;
v_int16
el8_2
=
v_reinterpret_as_s16
(
v_reinterpret_as_u16
(
v_src_row
)
>>
8
);
v_float64
el4ll_1
,
el4lh_1
,
el4hl_1
,
el4hh_1
,
el4ll_2
,
el4lh_2
,
el4hl_2
,
el4hh_2
;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i
vsum_1
=
_mm256_add_epi16
(
el8_1
.
val
,
_mm256_slli_si256
(
el8_1
.
val
,
2
));
__m256i
vsum_2
=
_mm256_add_epi16
(
el8_2
.
val
,
_mm256_slli_si256
(
el8_2
.
val
,
2
));
vsum_1
=
_mm256_add_epi16
(
vsum_1
,
_mm256_slli_si256
(
vsum_1
,
4
));
vsum_2
=
_mm256_add_epi16
(
vsum_2
,
_mm256_slli_si256
(
vsum_2
,
4
));
vsum_1
=
_mm256_add_epi16
(
vsum_1
,
_mm256_slli_si256
(
vsum_1
,
8
));
vsum_2
=
_mm256_add_epi16
(
vsum_2
,
_mm256_slli_si256
(
vsum_2
,
8
));
__m256i
el4l1_32
=
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum_1
));
__m256i
el4l2_32
=
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum_2
));
__m256i
el4h1_32
=
_mm256_cvtepi16_epi32
(
_v256_extract_high
(
vsum_1
));
__m256i
el4h2_32
=
_mm256_cvtepi16_epi32
(
_v256_extract_high
(
vsum_2
));
el4ll_1
.
val
=
_mm256_add_pd
(
_mm256_cvtepi32_pd
(
_v256_extract_low
(
el4l1_32
)),
prev_1
.
val
);
el4ll_2
.
val
=
_mm256_add_pd
(
_mm256_cvtepi32_pd
(
_v256_extract_low
(
el4l2_32
)),
prev_2
.
val
);
el4lh_1
.
val
=
_mm256_add_pd
(
_mm256_cvtepi32_pd
(
_v256_extract_high
(
el4l1_32
)),
prev_1
.
val
);
el4lh_2
.
val
=
_mm256_add_pd
(
_mm256_cvtepi32_pd
(
_v256_extract_high
(
el4l2_32
)),
prev_2
.
val
);
__m256d
el4d_1
=
_mm256_permute4x64_pd
(
el4lh_1
.
val
,
0xff
);
__m256d
el4d_2
=
_mm256_permute4x64_pd
(
el4lh_2
.
val
,
0xff
);
el4hl_1
.
val
=
_mm256_add_pd
(
_mm256_cvtepi32_pd
(
_v256_extract_low
(
el4h1_32
)),
el4d_1
);
el4hl_2
.
val
=
_mm256_add_pd
(
_mm256_cvtepi32_pd
(
_v256_extract_low
(
el4h2_32
)),
el4d_2
);
el4hh_1
.
val
=
_mm256_add_pd
(
_mm256_cvtepi32_pd
(
_v256_extract_high
(
el4h1_32
)),
el4d_1
);
el4hh_2
.
val
=
_mm256_add_pd
(
_mm256_cvtepi32_pd
(
_v256_extract_high
(
el4h2_32
)),
el4d_2
);
prev_1
.
val
=
_mm256_permute4x64_pd
(
el4hh_1
.
val
,
0xff
);
prev_2
.
val
=
_mm256_permute4x64_pd
(
el4hh_2
.
val
,
0xff
);
#else
el8_1
+=
v_rotate_left
<
1
>
(
el8_1
);
el8_2
+=
v_rotate_left
<
1
>
(
el8_2
);
el8_1
+=
v_rotate_left
<
2
>
(
el8_1
);
el8_2
+=
v_rotate_left
<
2
>
(
el8_2
);
#if CV_SIMD_WIDTH >= 32
el8_1
+=
v_rotate_left
<
4
>
(
el8_1
);
el8_2
+=
v_rotate_left
<
4
>
(
el8_2
);
#if CV_SIMD_WIDTH == 64
el8_1
+=
v_rotate_left
<
8
>
(
el8_1
);
el8_2
+=
v_rotate_left
<
8
>
(
el8_2
);
#endif
#endif
v_int32
el4li_1
,
el4hi_1
,
el4li_2
,
el4hi_2
;
v_expand
(
el8_1
,
el4li_1
,
el4hi_1
);
v_expand
(
el8_2
,
el4li_2
,
el4hi_2
);
el4ll_1
=
v_cvt_f64
(
el4li_1
)
+
prev_1
;
el4ll_2
=
v_cvt_f64
(
el4li_2
)
+
prev_2
;
el4lh_1
=
v_cvt_f64_high
(
el4li_1
)
+
prev_1
;
el4lh_2
=
v_cvt_f64_high
(
el4li_2
)
+
prev_2
;
el4hl_1
=
v_cvt_f64
(
el4hi_1
)
+
el4ll_1
;
el4hl_2
=
v_cvt_f64
(
el4hi_2
)
+
el4ll_2
;
el4hh_1
=
v_cvt_f64_high
(
el4hi_1
)
+
el4lh_1
;
el4hh_2
=
v_cvt_f64_high
(
el4hi_2
)
+
el4lh_2
;
prev_1
=
vx_setall_f64
(
v_extract_n
<
v_float64
::
nlanes
-
1
>
(
el4hh_1
));
prev_2
=
vx_setall_f64
(
v_extract_n
<
v_float64
::
nlanes
-
1
>
(
el4hh_2
));
// prev_1 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_1);
// prev_2 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_2);
#endif
v_float64
el4_1
,
el4_2
,
el4_3
,
el4_4
,
el4_5
,
el4_6
,
el4_7
,
el4_8
;
v_zip
(
el4ll_1
,
el4ll_2
,
el4_1
,
el4_2
);
v_zip
(
el4lh_1
,
el4lh_2
,
el4_3
,
el4_4
);
v_zip
(
el4hl_1
,
el4hl_2
,
el4_5
,
el4_6
);
v_zip
(
el4hh_1
,
el4hh_2
,
el4_7
,
el4_8
);
v_store
(
sum_row
+
j
,
el4_1
+
vx_load
(
prev_sum_row
+
j
));
v_store
(
sum_row
+
j
+
v_float64
::
nlanes
,
el4_2
+
vx_load
(
prev_sum_row
+
j
+
v_float64
::
nlanes
));
v_store
(
sum_row
+
j
+
v_float64
::
nlanes
*
2
,
el4_3
+
vx_load
(
prev_sum_row
+
j
+
v_float64
::
nlanes
*
2
));
v_store
(
sum_row
+
j
+
v_float64
::
nlanes
*
3
,
el4_4
+
vx_load
(
prev_sum_row
+
j
+
v_float64
::
nlanes
*
3
));
v_store
(
sum_row
+
j
+
v_float64
::
nlanes
*
4
,
el4_5
+
vx_load
(
prev_sum_row
+
j
+
v_float64
::
nlanes
*
4
));
v_store
(
sum_row
+
j
+
v_float64
::
nlanes
*
5
,
el4_6
+
vx_load
(
prev_sum_row
+
j
+
v_float64
::
nlanes
*
5
));
v_store
(
sum_row
+
j
+
v_float64
::
nlanes
*
6
,
el4_7
+
vx_load
(
prev_sum_row
+
j
+
v_float64
::
nlanes
*
6
));
v_store
(
sum_row
+
j
+
v_float64
::
nlanes
*
7
,
el4_8
+
vx_load
(
prev_sum_row
+
j
+
v_float64
::
nlanes
*
7
));
}
for
(
double
v2
=
sum_row
[
j
-
1
]
-
prev_sum_row
[
j
-
1
],
v1
=
sum_row
[
j
-
2
]
-
prev_sum_row
[
j
-
2
];
j
<
width
;
j
+=
2
)
{
sum_row
[
j
]
=
(
v1
+=
src_row
[
j
])
+
prev_sum_row
[
j
];
sum_row
[
j
+
1
]
=
(
v2
+=
src_row
[
j
+
1
])
+
prev_sum_row
[
j
+
1
];
}
}
}
else
if
(
cn
==
3
)
{
// the others
for
(
int
i
=
0
;
i
<
height
;
++
i
)
{
const
uchar
*
src_row
=
src
+
_srcstep
*
i
;
double
*
prev_sum_row
=
(
double
*
)((
uchar
*
)
sum
+
_sumstep
*
i
)
+
cn
;
double
*
sum_row
=
(
double
*
)((
uchar
*
)
sum
+
_sumstep
*
(
i
+
1
))
+
cn
;
double
row_cache
[
v_float64
::
nlanes
*
12
];
sum_row
[
-
1
]
=
sum_row
[
-
2
]
=
sum_row
[
-
3
]
=
0
;
v_float64
prev_1
=
vx_setzero_f64
(),
prev_2
=
vx_setzero_f64
(),
prev_3
=
vx_setzero_f64
();
int
j
=
0
;
for
(;
j
+
v_uint16
::
nlanes
*
cn
<=
width
;
j
+=
v_uint16
::
nlanes
*
cn
)
{
v_uint8
v_src_row_1
,
v_src_row_2
,
v_src_row_3
;
v_load_deinterleave
(
src_row
+
j
,
v_src_row_1
,
v_src_row_2
,
v_src_row_3
);
v_int16
el8_1
=
v_reinterpret_as_s16
(
v_expand_low
(
v_src_row_1
));
v_int16
el8_2
=
v_reinterpret_as_s16
(
v_expand_low
(
v_src_row_2
));
v_int16
el8_3
=
v_reinterpret_as_s16
(
v_expand_low
(
v_src_row_3
));
v_float64
el4ll_1
,
el4lh_1
,
el4hl_1
,
el4hh_1
,
el4ll_2
,
el4lh_2
,
el4hl_2
,
el4hh_2
,
el4ll_3
,
el4lh_3
,
el4hl_3
,
el4hh_3
;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i
vsum_1
=
_mm256_add_epi16
(
el8_1
.
val
,
_mm256_slli_si256
(
el8_1
.
val
,
2
));
__m256i
vsum_2
=
_mm256_add_epi16
(
el8_2
.
val
,
_mm256_slli_si256
(
el8_2
.
val
,
2
));
__m256i
vsum_3
=
_mm256_add_epi16
(
el8_3
.
val
,
_mm256_slli_si256
(
el8_3
.
val
,
2
));
vsum_1
=
_mm256_add_epi16
(
vsum_1
,
_mm256_slli_si256
(
vsum_1
,
4
));
vsum_2
=
_mm256_add_epi16
(
vsum_2
,
_mm256_slli_si256
(
vsum_2
,
4
));
vsum_3
=
_mm256_add_epi16
(
vsum_3
,
_mm256_slli_si256
(
vsum_3
,
4
));
vsum_1
=
_mm256_add_epi16
(
vsum_1
,
_mm256_slli_si256
(
vsum_1
,
8
));
vsum_2
=
_mm256_add_epi16
(
vsum_2
,
_mm256_slli_si256
(
vsum_2
,
8
));
vsum_3
=
_mm256_add_epi16
(
vsum_3
,
_mm256_slli_si256
(
vsum_3
,
8
));
__m256i
el4l1_32
=
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum_1
));
__m256i
el4l2_32
=
_mm256_cvtepi16_epi32
(
_v256_extract_low
(
vsum_2
));
__m256i
el4l3_32
=
_mm256_cvtepi16_epi32
                    (_v256_extract_low(vsum_3));
                    __m256i el4h1_32 = _mm256_cvtepi16_epi32(_v256_extract_high(vsum_1));
                    __m256i el4h2_32 = _mm256_cvtepi16_epi32(_v256_extract_high(vsum_2));
                    __m256i el4h3_32 = _mm256_cvtepi16_epi32(_v256_extract_high(vsum_3));
                    el4ll_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4l1_32)), prev_1.val);
                    el4ll_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4l2_32)), prev_2.val);
                    el4ll_3.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4l3_32)), prev_3.val);
                    el4lh_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4l1_32)), prev_1.val);
                    el4lh_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4l2_32)), prev_2.val);
                    el4lh_3.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4l3_32)), prev_3.val);
                    // broadcast the last accumulated double of each low half as the carry for the high half
                    __m256d el4d_1 = _mm256_permute4x64_pd(el4lh_1.val, 0xff);
                    __m256d el4d_2 = _mm256_permute4x64_pd(el4lh_2.val, 0xff);
                    __m256d el4d_3 = _mm256_permute4x64_pd(el4lh_3.val, 0xff);
                    el4hl_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4h1_32)), el4d_1);
                    el4hl_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4h2_32)), el4d_2);
                    el4hl_3.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4h3_32)), el4d_3);
                    el4hh_1.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h1_32)), el4d_1);
                    el4hh_2.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h2_32)), el4d_2);
                    el4hh_3.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h3_32)), el4d_3);
                    // carry the running per-channel sums into the next iteration
                    prev_1.val = _mm256_permute4x64_pd(el4hh_1.val, 0xff);
                    prev_2.val = _mm256_permute4x64_pd(el4hh_2.val, 0xff);
                    prev_3.val = _mm256_permute4x64_pd(el4hh_3.val, 0xff);
#else
                    // generic universal-intrinsics path: in-register prefix sum of the 16-bit partial sums
                    // by adding progressively shifted copies of each per-channel vector
                    el8_1 += v_rotate_left<1>(el8_1);
                    el8_2 += v_rotate_left<1>(el8_2);
                    el8_3 += v_rotate_left<1>(el8_3);
                    el8_1 += v_rotate_left<2>(el8_1);
                    el8_2 += v_rotate_left<2>(el8_2);
                    el8_3 += v_rotate_left<2>(el8_3);
#if CV_SIMD_WIDTH >= 32
                    el8_1 += v_rotate_left<4>(el8_1);
                    el8_2 += v_rotate_left<4>(el8_2);
                    el8_3 += v_rotate_left<4>(el8_3);
#if CV_SIMD_WIDTH == 64
                    el8_1 += v_rotate_left<8>(el8_1);
                    el8_2 += v_rotate_left<8>(el8_2);
                    el8_3 += v_rotate_left<8>(el8_3);
#endif
#endif
                    v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2, el4li_3, el4hi_3;
                    v_expand(el8_1, el4li_1, el4hi_1);
                    v_expand(el8_2, el4li_2, el4hi_2);
                    v_expand(el8_3, el4li_3, el4hi_3);
                    el4ll_1 = v_cvt_f64(el4li_1) + prev_1;
                    el4ll_2 = v_cvt_f64(el4li_2) + prev_2;
                    el4ll_3 = v_cvt_f64(el4li_3) + prev_3;
                    el4lh_1 = v_cvt_f64_high(el4li_1) + prev_1;
                    el4lh_2 = v_cvt_f64_high(el4li_2) + prev_2;
                    el4lh_3 = v_cvt_f64_high(el4li_3) + prev_3;
                    el4hl_1 = v_cvt_f64(el4hi_1) + el4ll_1;
                    el4hl_2 = v_cvt_f64(el4hi_2) + el4ll_2;
                    el4hl_3 = v_cvt_f64(el4hi_3) + el4ll_3;
                    el4hh_1 = v_cvt_f64_high(el4hi_1) + el4lh_1;
                    el4hh_2 = v_cvt_f64_high(el4hi_2) + el4lh_2;
                    el4hh_3 = v_cvt_f64_high(el4hi_3) + el4lh_3;
                    prev_1 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_1));
                    prev_2 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_2));
                    prev_3 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_3));
//                  prev_1 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_1);
//                  prev_2 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_2);
//                  prev_3 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_3);
#endif
                    // re-interleave the per-channel prefix sums back into packed channel order via the row cache
                    v_store_interleave(row_cache, el4ll_1, el4ll_2, el4ll_3);
                    v_store_interleave(row_cache + v_float64::nlanes * 3, el4lh_1, el4lh_2, el4lh_3);
                    v_store_interleave(row_cache + v_float64::nlanes * 6, el4hl_1, el4hl_2, el4hl_3);
                    v_store_interleave(row_cache + v_float64::nlanes * 9, el4hh_1, el4hh_2, el4hh_3);
                    el4ll_1 = vx_load(row_cache);
                    el4ll_2 = vx_load(row_cache + v_float64::nlanes);
                    el4ll_3 = vx_load(row_cache + v_float64::nlanes * 2);
                    el4lh_1 = vx_load(row_cache + v_float64::nlanes * 3);
                    el4lh_2 = vx_load(row_cache + v_float64::nlanes * 4);
                    el4lh_3 = vx_load(row_cache + v_float64::nlanes * 5);
                    el4hl_1 = vx_load(row_cache + v_float64::nlanes * 6);
                    el4hl_2 = vx_load(row_cache + v_float64::nlanes * 7);
                    el4hl_3 = vx_load(row_cache + v_float64::nlanes * 8);
                    el4hh_1 = vx_load(row_cache + v_float64::nlanes * 9);
                    el4hh_2 = vx_load(row_cache + v_float64::nlanes * 10);
                    el4hh_3 = vx_load(row_cache + v_float64::nlanes * 11);
                    // add the integral values of the previous row and store the result
                    v_store(sum_row + j, el4ll_1 + vx_load(prev_sum_row + j));
                    v_store(sum_row + j + v_float64::nlanes, el4ll_2 + vx_load(prev_sum_row + j + v_float64::nlanes));
                    v_store(sum_row + j + v_float64::nlanes * 2, el4ll_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 2));
                    v_store(sum_row + j + v_float64::nlanes * 3, el4lh_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 3));
                    v_store(sum_row + j + v_float64::nlanes * 4, el4lh_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 4));
                    v_store(sum_row + j + v_float64::nlanes * 5, el4lh_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 5));
                    v_store(sum_row + j + v_float64::nlanes * 6, el4hl_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 6));
                    v_store(sum_row + j + v_float64::nlanes * 7, el4hl_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 7));
                    v_store(sum_row + j + v_float64::nlanes * 8, el4hl_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 8));
                    v_store(sum_row + j + v_float64::nlanes * 9, el4hh_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 9));
                    v_store(sum_row + j + v_float64::nlanes * 10, el4hh_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 10));
                    v_store(sum_row + j + v_float64::nlanes * 11, el4hh_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 11));
                }
                // scalar tail: finish the remaining columns with per-channel running sums
                for (double v3 = sum_row[j - 1] - prev_sum_row[j - 1],
                            v2 = sum_row[j - 2] - prev_sum_row[j - 2],
                            v1 = sum_row[j - 3] - prev_sum_row[j - 3]; j < width; j += 3)
                {
                    sum_row[j]     = (v1 += src_row[j])     + prev_sum_row[j];
                    sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1];
                    sum_row[j + 2] = (v3 += src_row[j + 2]) + prev_sum_row[j + 2];
                }
            }
        }
        else if (cn == 4)
        {
            // the others
            for (int i = 0; i < height; ++i)
            {
                const uchar * src_row = src + _srcstep * i;
                double * prev_sum_row = (double *)((uchar *)sum + _sumstep * i) + cn;
                double * sum_row = (double *)((uchar *)sum + _sumstep * (i + 1)) + cn;
                sum_row[-1] = sum_row[-2] = sum_row[-3] = sum_row[-4] = 0;
                v_float64 prev_1 = vx_setzero_f64(), prev_2 = vx_setzero_f64();
                int j = 0;
                for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
                {
                    v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
                    v_float64 el4ll, el4lh, el4hl, el4hh;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
                    // shifting by 8 bytes adds the preceding 4-channel pixel's 16-bit lanes within each 128-bit half
                    __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 8));
                    __m256i el4l_32 = _mm256_cvtepi16_epi32(_v256_extract_low(vsum));
                    __m256i el4h_32 = _mm256_cvtepi16_epi32(_v256_extract_high(vsum));
                    el4ll.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4l_32)), prev_1.val);
                    el4lh.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4l_32)), prev_2.val);
                    el4hl.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_low(el4h_32)), el4lh.val);
                    el4hh.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h_32)), el4lh.val);
                    prev_1.val = prev_2.val = el4hh.val;
#else
#if CV_SIMD_WIDTH >= 32
                    el8 += v_rotate_left<4>(el8);
#if CV_SIMD_WIDTH == 64
                    el8 += v_rotate_left<8>(el8);
#endif
#endif
                    v_int32 el4li, el4hi;
                    v_expand(el8, el4li, el4hi);
                    el4ll = v_cvt_f64(el4li) + prev_1;
                    el4lh = v_cvt_f64_high(el4li) + prev_2;
                    el4hl = v_cvt_f64(el4hi) + el4ll;
                    el4hh = v_cvt_f64_high(el4hi) + el4lh;
#if CV_SIMD_WIDTH == 16
                    prev_1 = el4hl;
                    prev_2 = el4hh;
#elif CV_SIMD_WIDTH == 32
                    prev_1 = prev_2 = el4hh;
#else
                    prev_1 = prev_2 = v_combine_high(el4hh, el4hh);
#endif
#endif
                    v_store(sum_row + j, el4ll + vx_load(prev_sum_row + j));
                    v_store(sum_row + j + v_float64::nlanes, el4lh + vx_load(prev_sum_row + j + v_float64::nlanes));
                    v_store(sum_row + j + v_float64::nlanes * 2, el4hl + vx_load(prev_sum_row + j + v_float64::nlanes * 2));
                    v_store(sum_row + j + v_float64::nlanes * 3, el4hh + vx_load(prev_sum_row + j + v_float64::nlanes * 3));
                }
                // scalar tail: finish the remaining columns with four per-channel running sums
                for (double v4 = sum_row[j - 1] - prev_sum_row[j - 1],
                            v3 = sum_row[j - 2] - prev_sum_row[j - 2],
                            v2 = sum_row[j - 3] - prev_sum_row[j - 3],
                            v1 = sum_row[j - 4] - prev_sum_row[j - 4]; j < width; j += 4)
                {
                    sum_row[j]     = (v1 += src_row[j])     + prev_sum_row[j];
                    sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1];
                    sum_row[j + 2] = (v3 += src_row[j + 2]) + prev_sum_row[j + 2];
                    sum_row[j + 3] = (v4 += src_row[j + 3]) + prev_sum_row[j + 3];
                }
            }
        }
        else
        {
            return false;
        }
        vx_cleanup();
        return true;
    }
};
#endif
#endif
#endif
} // namespace anon
...
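Note: the sumpixels.simd.hpp additions above vectorize the per-row update of a double-precision integral image for 3- and 4-channel 8-bit input. Each output element is the running per-channel row sum plus the element directly above it in the previous integral row; the SIMD path computes the same recurrence sixteen pixels at a time, forming an in-register prefix sum via shifted additions (v_rotate_left, or _mm256_slli_si256 on AVX2) and broadcasting the last lane as the carry into the next block. A minimal scalar sketch of the recurrence for the 3-channel case, for orientation only (not the OpenCV implementation; src_row, prev_sum_row, sum_row and width stand in for the per-row state used above):

// Scalar reference for one 3-channel row of a CV_64F integral image:
// sum(i+1, j+1, c) = sum(i, j+1, c) + running row sum of channel c up to column j.
void integralRow3(const unsigned char* src_row,  // current source row, 3 interleaved channels
                  const double* prev_sum_row,    // previous integral row (offset past the zero border)
                  double* sum_row,               // integral row being written
                  int width)                     // row length in elements (columns * 3)
{
    double v1 = 0, v2 = 0, v3 = 0;               // running sums for the three channels
    for (int j = 0; j < width; j += 3)
    {
        sum_row[j]     = (v1 += src_row[j])     + prev_sum_row[j];
        sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1];
        sum_row[j + 2] = (v3 += src_row[j + 2]) + prev_sum_row[j + 2];
    }
}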
modules/videoio/src/cap_v4l.cpp
View file @ 333a767b
...
@@ -268,6 +268,10 @@ typedef uint32_t __u32;
#define V4L2_PIX_FMT_Y10 v4l2_fourcc('Y', '1', '0', ' ')
#endif
#ifndef V4L2_PIX_FMT_Y12
#define V4L2_PIX_FMT_Y12 v4l2_fourcc('Y', '1', '2', ' ')
#endif
/* Defaults - If your board can do better, set it here. Set for the most common type inputs. */
#define DEFAULT_V4L_WIDTH 640
#define DEFAULT_V4L_HEIGHT 480
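Note: v4l2_fourcc packs four ASCII characters into a 32-bit code with the first character in the low byte. A small illustrative sketch of what the new V4L2_PIX_FMT_Y12 value expands to (FOURCC here is a stand-in mirroring the macro from linux/videodev2.h):

#include <cstdint>
#include <cstdio>

// Pack four characters into a 32-bit fourcc, first character in the low byte.
#define FOURCC(a, b, c, d) \
    ((uint32_t)(a) | ((uint32_t)(b) << 8) | ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24))

int main()
{
    uint32_t y12 = FOURCC('Y', '1', '2', ' ');   // same packing as v4l2_fourcc('Y', '1', '2', ' ')
    std::printf("Y12 fourcc = 0x%08X\n", y12);   // prints 0x20323159
    return 0;
}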
...
@@ -570,6 +574,7 @@ bool CvCaptureCAM_V4L::autosetup_capture_mode_v4l2()
        V4L2_PIX_FMT_JPEG,
#endif
        V4L2_PIX_FMT_Y16,
        V4L2_PIX_FMT_Y12,
        V4L2_PIX_FMT_Y10,
        V4L2_PIX_FMT_GREY,
    };
...
@@ -663,6 +668,7 @@ void CvCaptureCAM_V4L::v4l2_create_frame()
        size.height = size.height * 3 / 2; // "1.5" channels
        break;
    case V4L2_PIX_FMT_Y16:
    case V4L2_PIX_FMT_Y12:
    case V4L2_PIX_FMT_Y10:
        depth = IPL_DEPTH_16U;
        /* fallthru */
...
@@ -1593,6 +1599,13 @@ void CvCaptureCAM_V4L::convertToRgb(const Buffer &currentBuffer)
        cv::cvtColor(temp, destination, COLOR_GRAY2BGR);
        return;
    }
    case V4L2_PIX_FMT_Y12:
    {
        cv::Mat temp(imageSize, CV_8UC1, buffers[MAX_V4L_BUFFERS].start);
        cv::Mat(imageSize, CV_16UC1, currentBuffer.start).convertTo(temp, CV_8U, 1.0 / 16);
        cv::cvtColor(temp, destination, COLOR_GRAY2BGR);
        return;
    }
    case V4L2_PIX_FMT_Y10:
    {
        cv::Mat temp(imageSize, CV_8UC1, buffers[MAX_V4L_BUFFERS].start);
...
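Note: the new Y12 branch mirrors the existing Y16 and Y10 handling. 12-bit samples arrive in 16-bit containers, so scaling by 1/16 maps the 0-4095 range onto 0-255 before the grayscale-to-BGR conversion. A standalone sketch of that scaling, for illustration only (the raw pointer and imageSize are placeholders for the capture state held by cap_v4l.cpp):

#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

// Convert a raw 12-bit-in-16-bit grayscale buffer to an 8-bit BGR image.
cv::Mat y12ToBgr(const uint16_t* raw, cv::Size imageSize)
{
    cv::Mat gray16(imageSize, CV_16UC1, const_cast<uint16_t*>(raw));
    cv::Mat gray8;
    gray16.convertTo(gray8, CV_8U, 1.0 / 16);        // 4095 / 16 ~= 255
    cv::Mat bgr;
    cv::cvtColor(gray8, bgr, cv::COLOR_GRAY2BGR);
    return bgr;
}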