Commit 76c21b73, authored Jan 24, 2020 by Alexander Alekhin
Merge pull request #16374 from alalek:imgproc_dispatch_sumpixels
Parents: 4d2da2de, 09b3383a
Showing 5 changed files with 440 additions and 352 deletions (+440 −352)
modules/imgproc/CMakeLists.txt (+1 −0)
modules/imgproc/src/sumpixels.avx512_skx.hpp (+8 −11)
modules/imgproc/src/sumpixels.dispatch.cpp (+143 −316)
modules/imgproc/src/sumpixels.hpp (+0 −25)
modules/imgproc/src/sumpixels.simd.hpp (+288 −0)
modules/imgproc/CMakeLists.txt

@@ -9,5 +9,6 @@ ocv_add_dispatched_file(color_yuv SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(median_blur SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(morph SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(smooth SSE2 SSE4_1 AVX2)
+ocv_add_dispatched_file(sumpixels SSE2 AVX2 AVX512_SKX)
 ocv_add_dispatched_file(undistort SSE2 AVX2)
 ocv_define_module(imgproc opencv_core WRAP java python js)
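For readers unfamiliar with OpenCV's CPU-dispatch machinery: the new ocv_add_dispatched_file(sumpixels ...) entry causes sumpixels.simd.hpp to be compiled once per listed ISA, each time with that ISA's compiler flags and inside its own namespace, plus once for the build baseline. The runtime dispatcher then picks the best implementation the current CPU supports. The sketch below shows the resulting pattern; the namespace names and empty parameter lists are illustrative, not the generated code:

// Illustrative sketch of OpenCV's per-ISA compilation + runtime dispatch pattern.
namespace cv { namespace hal {

namespace cpu_baseline   { bool integral_SIMD(/* ... */); }  // always built
namespace opt_AVX2       { bool integral_SIMD(/* ... */); }  // built with AVX2 flags
namespace opt_AVX512_SKX { bool integral_SIMD(/* ... */); }  // built with AVX-512 flags

// CV_CPU_DISPATCH(integral_SIMD, (args...), CV_CPU_DISPATCH_MODES_ALL)
// expands to roughly:
//   if (CV_CPU_HAS_SUPPORT_AVX512_SKX) return opt_AVX512_SKX::integral_SIMD(args...);
//   if (CV_CPU_HAS_SUPPORT_AVX2)       return opt_AVX2::integral_SIMD(args...);
//   return cpu_baseline::integral_SIMD(args...);

}} // namespace cv::hal

This is why the commit can delete the hand-written AVX-512 plumbing (sumpixels.hpp, the opt_AVX512_SKX namespace) in the files that follow: dispatch becomes a property of the build system rather than of the source.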
modules/imgproc/src/sumpixels.avx512_skx.cpp → modules/imgproc/src/sumpixels.avx512_skx.hpp

...
@@ -2,14 +2,13 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2019, Intel Corporation, all rights reserved.
+// Copyright (C) 2019-2020, Intel Corporation, all rights reserved.

-#include "precomp.hpp"
-#include "sumpixels.hpp"
 #include "opencv2/core/hal/intrin.hpp"

-namespace cv {
+namespace cv { namespace hal {
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

 namespace {  // Anonymous namespace to avoid exposing the implementation classes
 //
...
@@ -432,16 +431,14 @@ __m512d IntegralCalculator<4>::calculate_integral(const __m512i src_longs, c
 }  // end of anonymous namespace

-namespace opt_AVX512_SKX {
-
-void calculate_integral_avx512(const uchar *src, size_t _srcstep,
+// This is the implementation for the external callers interface entry point.
+// It should be the only function called into this file from outside
+// Any new implementations should be directed from here
+static
+void calculate_integral_avx512(const uchar *src, size_t _srcstep,
                                double *sum, size_t _sumstep,
                                double *sqsum, size_t _sqsumstep,
                                int width, int height, int cn)
 {
+    CV_INSTRUMENT_REGION();
+
     switch(cn){
     case 1: {
         IntegralCalculator<1> calculator;
...
@@ -466,5 +463,5 @@ void calculate_integral_avx512(const uchar *src, size_t _srcstep,
     }
 }

-} // end namespace opt_AVX512_SXK
-} // end namespace cv
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+}} // end namespace cv::hal
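What calculate_integral_avx512 produces is the standard integral image pair: sum(y+1, x+1) holds the sum of all pixels src(i, j) with i <= y and j <= x, and sqsum holds the same for squared pixel values. As a reading aid, the scalar sketch below (a hypothetical reference helper, not OpenCV's code; single channel only) spells out the recurrence the AVX-512 kernel vectorizes — each output row equals the row above plus a running prefix sum of the current source row:

#include <cstddef>

// Scalar reference for the computation (illustrative sketch, single channel).
static void integral_scalar_ref(const unsigned char* src, size_t srcstep,
                                double* sum, size_t sumstep,
                                double* sqsum, size_t sqsumstep,
                                int width, int height)
{
    // The first output row corresponds to an empty prefix: all zeros.
    for (int x = 0; x <= width; ++x)
        sum[x] = sqsum[x] = 0.0;

    for (int y = 0; y < height; ++y)
    {
        const unsigned char* srow = src + srcstep * y;
        double* prev  = (double*)((unsigned char*)sum   + sumstep   * y);
        double* cur   = (double*)((unsigned char*)sum   + sumstep   * (y + 1));
        double* qprev = (double*)((unsigned char*)sqsum + sqsumstep * y);
        double* qcur  = (double*)((unsigned char*)sqsum + sqsumstep * (y + 1));
        cur[0] = qcur[0] = 0.0;      // empty column prefix
        double s = 0.0, sq = 0.0;    // running row sums
        for (int x = 0; x < width; ++x)
        {
            double v = srow[x];
            s  += v;
            sq += v * v;
            cur[x + 1]  = prev[x + 1]  + s;   // above + left prefix
            qcur[x + 1] = qprev[x + 1] + sq;
        }
    }
}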
modules/imgproc/src/sumpixels.cpp → modules/imgproc/src/sumpixels.dispatch.cpp

...
@@ -10,7 +10,7 @@
 //                          License Agreement
 //                 For Open Source Computer Vision Library
 //
-// Copyright (C) 2000-2008,2019 Intel Corporation, all rights reserved.
+// Copyright (C) 2000-2020 Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
...
@@ -44,210 +44,157 @@
 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
 #include "opencv2/core/hal/intrin.hpp"
-#include "sumpixels.hpp"
+
+#include "sumpixels.simd.hpp"
+#include "sumpixels.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content

 namespace cv {

-template <typename T, typename ST, typename QT>
-struct Integral_SIMD
-{
-    bool operator()(const T *, size_t,
-                    ST *, size_t,
-                    QT *, size_t,
-                    ST *, size_t,
-                    int, int, int) const
-    {
-        return false;
-    }
-};
-
-template <>
-struct Integral_SIMD<uchar, double, double> {
-    Integral_SIMD() {};
-
-    bool operator()(const uchar *src, size_t _srcstep,
-                    double *sum, size_t _sumstep,
-                    double *sqsum, size_t _sqsumstep,
-                    double *tilted, size_t _tiltedstep,
-                    int width, int height, int cn) const
-    {
-#if CV_TRY_AVX512_SKX
-        CV_UNUSED(_tiltedstep);
-        // TODO: Add support for 1 channel input (WIP)
-        if (CV_CPU_HAS_SUPPORT_AVX512_SKX && !tilted && (cn <= 4)){
-            opt_AVX512_SKX::calculate_integral_avx512(src, _srcstep, sum, _sumstep,
-                                                      sqsum, _sqsumstep, width, height, cn);
-            return true;
-        }
-#else
-        // Avoid warnings in some builds
-        CV_UNUSED(src); CV_UNUSED(_srcstep); CV_UNUSED(sum); CV_UNUSED(_sumstep);
-        CV_UNUSED(sqsum); CV_UNUSED(_sqsumstep); CV_UNUSED(tilted); CV_UNUSED(_tiltedstep);
-        CV_UNUSED(width); CV_UNUSED(height); CV_UNUSED(cn);
-#endif
-        return false;
-    }
-};
-
-#if CV_SIMD && CV_SIMD_WIDTH <= 64
-
-template <>
-struct Integral_SIMD<uchar, int, double>
-{
-    Integral_SIMD() {}
-
-    bool operator()(const uchar * src, size_t _srcstep,
-                    int * sum, size_t _sumstep,
-                    double * sqsum, size_t,
-                    int * tilted, size_t,
-                    int width, int height, int cn) const
-    {
-        ... (vectorized 8u -> 32s row prefix-sum; moved to sumpixels.simd.hpp, shown below) ...
-    }
-};
-
-template <>
-struct Integral_SIMD<uchar, float, double>
-{
-    Integral_SIMD() {}
-
-    bool operator()(const uchar * src, size_t _srcstep,
-                    float * sum, size_t _sumstep,
-                    double * sqsum, size_t,
-                    float * tilted, size_t,
-                    int width, int height, int cn) const
-    {
-        ... (vectorized 8u -> 32f row prefix-sum; moved to sumpixels.simd.hpp, shown below) ...
-    }
-};
-
-#endif
+#ifdef HAVE_OPENCL
+
+static bool ocl_integral( InputArray _src, OutputArray _sum, int sdepth )
+{
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+
+    if ( (_src.type() != CV_8UC1) ||
+        !(sdepth == CV_32S || sdepth == CV_32F || (doubleSupport && sdepth == CV_64F)))
+        return false;
+
+    static const int tileSize = 16;
+
+    String build_opt = format("-D sumT=%s -D LOCAL_SUM_SIZE=%d%s",
+                              ocl::typeToStr(sdepth), tileSize,
+                              doubleSupport ? " -D DOUBLE_SUPPORT" : "");
+
+    ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt);
+    if (kcols.empty())
+        return false;
+
+    UMat src = _src.getUMat();
+    Size src_size = src.size();
+    Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize,
+                 ((src_size.width + tileSize - 1) / tileSize) * tileSize);
+    UMat buf(bufsize, sdepth);
+    kcols.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf));
+    size_t gt = src.cols, lt = tileSize;
+    if (!kcols.run(1, &gt, &lt, false))
+        return false;
+
+    ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt);
+    if (krows.empty())
+        return false;
+
+    Size sumsize(src_size.width + 1, src_size.height + 1);
+    _sum.create(sumsize, sdepth);
+    UMat sum = _sum.getUMat();
+
+    krows.args(ocl::KernelArg::ReadOnlyNoSize(buf), ocl::KernelArg::WriteOnly(sum));
+    gt = src.rows;
+    return krows.run(1, &gt, &lt, false);
+}
+
+static bool ocl_integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, int sdepth, int sqdepth )
+{
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+
+    if ( _src.type() != CV_8UC1 || (!doubleSupport && (sdepth == CV_64F || sqdepth == CV_64F)) )
+        return false;
+
+    static const int tileSize = 16;
+
+    String build_opt = format("-D SUM_SQUARE -D sumT=%s -D sumSQT=%s -D LOCAL_SUM_SIZE=%d%s",
+                              ocl::typeToStr(sdepth), ocl::typeToStr(sqdepth),
+                              tileSize,
+                              doubleSupport ? " -D DOUBLE_SUPPORT" : "");
+
+    ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt);
+    if (kcols.empty())
+        return false;
+
+    UMat src = _src.getUMat();
+    Size src_size = src.size();
+    Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize,
+                 ((src_size.width + tileSize - 1) / tileSize) * tileSize);
+    UMat buf(bufsize, sdepth);
+    UMat buf_sq(bufsize, sqdepth);
+    kcols.args(ocl::KernelArg::ReadOnly(src),
+               ocl::KernelArg::WriteOnlyNoSize(buf),
+               ocl::KernelArg::WriteOnlyNoSize(buf_sq));
+    size_t gt = src.cols, lt = tileSize;
+    if (!kcols.run(1, &gt, &lt, false))
+        return false;
+
+    ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt);
+    if (krows.empty())
+        return false;
+
+    Size sumsize(src_size.width + 1, src_size.height + 1);
+    _sum.create(sumsize, sdepth);
+    UMat sum = _sum.getUMat();
+    _sqsum.create(sumsize, sqdepth);
+    UMat sum_sq = _sqsum.getUMat();
+
+    krows.args(ocl::KernelArg::ReadOnlyNoSize(buf),
+               ocl::KernelArg::ReadOnlyNoSize(buf_sq),
+               ocl::KernelArg::WriteOnly(sum),
+               ocl::KernelArg::WriteOnlyNoSize(sum_sq));
+    gt = src.rows;
+    return krows.run(1, &gt, &lt, false);
+}
+
+#endif  // HAVE_OPENCL
+
+#ifdef HAVE_IPP
+
+static bool ipp_integral(
+        int depth, int sdepth, int sqdepth,
+        const uchar* src, size_t srcstep,
+        uchar* sum, size_t sumstep,
+        uchar* sqsum, size_t sqsumstep,
+        uchar* tilted, size_t tstep,
+        int width, int height, int cn)
+{
+    CV_INSTRUMENT_REGION_IPP();
+
+    IppiSize size = { width, height };
+
+    if (cn > 1)
+        return false;
+    if (tilted)
+    {
+        CV_UNUSED(tstep);
+        return false;
+    }
+
+    if (!sqsum)
+    {
+        if (depth == CV_8U && sdepth == CV_32S)
+            return CV_INSTRUMENT_FUN_IPP(ippiIntegral_8u32s_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32s*)sum, (int)sumstep, size, 0) >= 0;
+        else if (depth == CV_8UC1 && sdepth == CV_32F)
+            return CV_INSTRUMENT_FUN_IPP(ippiIntegral_8u32f_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32f*)sum, (int)sumstep, size, 0) >= 0;
+        else if (depth == CV_32FC1 && sdepth == CV_32F)
+            return CV_INSTRUMENT_FUN_IPP(ippiIntegral_32f_C1R, (const Ipp32f*)src, (int)srcstep, (Ipp32f*)sum, (int)sumstep, size) >= 0;
+        else
+            return false;
+    }
+    else
+    {
+        if (depth == CV_8U && sdepth == CV_32S && sqdepth == CV_32S)
+            return CV_INSTRUMENT_FUN_IPP(ippiSqrIntegral_8u32s_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32s*)sum, (int)sumstep, (Ipp32s*)sqsum, (int)sqsumstep, size, 0, 0) >= 0;
+        else if (depth == CV_8U && sdepth == CV_32S && sqdepth == CV_64F)
+            return CV_INSTRUMENT_FUN_IPP(ippiSqrIntegral_8u32s64f_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32s*)sum, (int)sumstep, (Ipp64f*)sqsum, (int)sqsumstep, size, 0, 0) >= 0;
+        else if (depth == CV_8U && sdepth == CV_32F && sqdepth == CV_64F)
+            return CV_INSTRUMENT_FUN_IPP(ippiSqrIntegral_8u32f64f_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32f*)sum, (int)sumstep, (Ipp64f*)sqsum, (int)sqsumstep, size, 0, 0) >= 0;
+        else
+            return false;
+    }
+}
+
+#endif // HAVE_IPP
+
+namespace hal {

 template <typename T, typename ST, typename QT>
-void integral_( const T* src, size_t _srcstep, ST* sum, size_t _sumstep,
+static void integral_( const T* src, size_t _srcstep, ST* sum, size_t _sumstep,
                 QT* sqsum, size_t _sqsumstep, ST* tilted, size_t _tiltedstep,
                 int width, int height, int cn )
 {
     int x, y, k;

-    if (Integral_SIMD<T, ST, QT>()(src, _srcstep,
-                                   sum, _sumstep,
-                                   sqsum, _sqsumstep,
-                                   tilted, _tiltedstep,
-                                   width, height, cn))
-        return;
-
     int srcstep = (int)(_srcstep/sizeof(T));
     int sumstep = (int)(_sumstep/sizeof(ST));
     int tiltedstep = (int)(_tiltedstep/sizeof(ST));
...
@@ -401,157 +348,36 @@ void integral_( const T* src, size_t _srcstep, ST* sum, size_t _sumstep,
     }
 }

-#ifdef HAVE_OPENCL
-
-static bool ocl_integral( InputArray _src, OutputArray _sum, int sdepth )
-{
-    ... (body moved to the top of the file) ...
-}
-
-static bool ocl_integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, int sdepth, int sqdepth )
-{
-    ... (body moved to the top of the file) ...
-}
-
-#endif
-
-#if defined(HAVE_IPP)
-namespace cv
-{
-static bool ipp_integral(
-    int depth, int sdepth, int sqdepth,
-    const uchar* src, size_t srcstep,
-    uchar* sum, size_t sumstep,
-    uchar* sqsum, size_t sqsumstep,
-    uchar* tilted, size_t tstep,
-    int width, int height, int cn)
-{
-    ... (body moved to the top of the file) ...
-}
-}
-#endif
-
-namespace cv { namespace hal {
+static bool integral_SIMD(
+        int depth, int sdepth, int sqdepth,
+        const uchar* src, size_t srcstep,
+        uchar* sum, size_t sumstep,
+        uchar* sqsum, size_t sqsumstep,
+        uchar* tilted, size_t tstep,
+        int width, int height, int cn)
+{
+    CV_INSTRUMENT_REGION();
+
+    CV_CPU_DISPATCH(integral_SIMD, (depth, sdepth, sqdepth, src, srcstep, sum, sumstep, sqsum, sqsumstep, tilted, tstep, width, height, cn),
+        CV_CPU_DISPATCH_MODES_ALL);
+}

 void integral(int depth, int sdepth, int sqdepth,
               const uchar* src, size_t srcstep,
               uchar* sum, size_t sumstep,
               uchar* sqsum, size_t sqsumstep,
               uchar* tilted, size_t tstep,
               int width, int height, int cn)
 {
+    CV_INSTRUMENT_REGION();
+
     CALL_HAL(integral, cv_hal_integral, depth, sdepth, sqdepth, src, srcstep, sum, sumstep, sqsum, sqsumstep, tilted, tstep, width, height, cn);
     CV_IPP_RUN_FAST(ipp_integral(depth, sdepth, sqdepth, src, srcstep, sum, sumstep, sqsum, sqsumstep, tilted, tstep, width, height, cn));

+    if (integral_SIMD(depth, sdepth, sqdepth, src, srcstep, sum, sumstep, sqsum, sqsumstep, tilted, tstep, width, height, cn))
+        return;
+
 #define ONE_CALL(A, B, C) integral_<A, B, C>((const A*)src, srcstep, (B*)sum, sumstep, (C*)sqsum, sqsumstep, (B*)tilted, tstep, width, height, cn)

     if( depth == CV_8U && sdepth == CV_32S && sqdepth == CV_64F )
...
@@ -579,14 +405,14 @@ void integral(int depth, int sdepth, int sqdepth,
     else if( depth == CV_64F && sdepth == CV_64F && sqdepth == CV_64F )
         ONE_CALL(double, double, double);
     else
-        CV_Error( CV_StsUnsupportedFormat, "" );
+        CV_Error( Error::StsUnsupportedFormat, "" );

 #undef ONE_CALL
 }

-}} // cv::hal::
+} // namespace hal

-void cv::integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, OutputArray _tilted, int sdepth, int sqdepth )
+void integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, OutputArray _tilted, int sdepth, int sqdepth )
 {
     CV_INSTRUMENT_REGION();
...
@@ -624,20 +450,21 @@ void cv::integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, Output
               src.cols, src.rows, cn);
 }

-void cv::integral( InputArray src, OutputArray sum, int sdepth )
+void integral( InputArray src, OutputArray sum, int sdepth )
 {
     CV_INSTRUMENT_REGION();

     integral( src, sum, noArray(), noArray(), sdepth );
 }

-void cv::integral( InputArray src, OutputArray sum, OutputArray sqsum, int sdepth, int sqdepth )
+void integral( InputArray src, OutputArray sum, OutputArray sqsum, int sdepth, int sqdepth )
 {
     CV_INSTRUMENT_REGION();

     integral( src, sum, sqsum, noArray(), sdepth, sqdepth );
 }

+} // namespace

 CV_IMPL void
 cvIntegral( const CvArr* image, CvArr* sumImage,
...
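The public API is untouched by this restructuring; only the route a call takes changes. With this commit, hal::integral tries a HAL plugin first (CALL_HAL), then IPP (CV_IPP_RUN_FAST), then the CPU-dispatched SIMD kernels (integral_SIMD), and finally the scalar integral_<T, ST, QT> template. A minimal caller exercising that path through the standard cv::integral API:

#include <opencv2/imgproc.hpp>

int main()
{
    cv::Mat img(480, 640, CV_8UC1, cv::Scalar(1));
    cv::Mat sum, sqsum;
    // For 8-bit input: sum is (H+1)x(W+1) CV_32S, sqsum is (H+1)x(W+1) CV_64F.
    cv::integral(img, sum, sqsum, CV_32S, CV_64F);
    // Sum over any rectangle [x0,x1) x [y0,y1) in O(1) via four corner lookups:
    int x0 = 10, y0 = 20, x1 = 110, y1 = 70;
    int rectSum = sum.at<int>(y1, x1) - sum.at<int>(y0, x1)
                - sum.at<int>(y1, x0) + sum.at<int>(y0, x0);
    return rectSum == (x1 - x0) * (y1 - y0) ? 0 : 1;  // all-ones image
}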
modules/imgproc/src/sumpixels.hpp
deleted (100644 → 0)

-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-//
-// Copyright (C) 2019, Intel Corporation, all rights reserved.
-
-#ifndef OPENCV_IMGPROC_SUM_PIXELS_HPP
-#define OPENCV_IMGPROC_SUM_PIXELS_HPP
-
-namespace cv {
-namespace opt_AVX512_SKX {
-
-#if CV_TRY_AVX512_SKX
-void calculate_integral_avx512(const uchar *src, size_t _srcstep,
-                               double *sum, size_t _sumstep,
-                               double *sqsum, size_t _sqsumstep,
-                               int width, int height, int cn);
-#endif
-
-} // end namespace opt_AVX512_SKX
-} // end namespace cv
-
-#endif
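This hand-written declaration header is no longer needed: the forward declaration now lives in sumpixels.simd.hpp itself, and sumpixels.simd_declarations.hpp is generated at build time from the ocv_add_dispatched_file() entry. The generated file follows roughly this shape (an illustration of the generator's output pattern, not the verbatim file; the mode list depends on the configured CPU baseline):

// modules/imgproc/sumpixels.simd_declarations.hpp (build-generated; illustrative)
#define CV_CPU_SIMD_FILENAME "sumpixels.simd.hpp"

#define CV_CPU_DISPATCH_MODE AVX512_SKX
#include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"

#define CV_CPU_DISPATCH_MODE AVX2
#include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"

// SSE2 is typically folded into BASELINE on x86-64 builds.
#define CV_CPU_DISPATCH_MODES_ALL AVX512_SKX, AVX2, BASELINE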
modules/imgproc/src/sumpixels.simd.hpp
new file (0 → 100644)
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2020 Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/core/hal/intrin.hpp"
#if CV_AVX512_SKX
#include "sumpixels.avx512_skx.hpp"
#endif
namespace cv {
namespace hal {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

// forward declarations
bool integral_SIMD(
        int depth, int sdepth, int sqdepth,
        const uchar* src, size_t srcstep,
        uchar* sum, size_t sumstep,
        uchar* sqsum, size_t sqsumstep,
        uchar* tilted, size_t tstep,
        int width, int height, int cn);

#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

namespace {

template <typename T, typename ST, typename QT>
struct Integral_SIMD
{
    bool operator()(const T *, size_t,
                    ST *, size_t,
                    QT *, size_t,
                    ST *, size_t,
                    int, int, int) const
    {
        return false;
    }
};

#if CV_AVX512_SKX
template <>
struct Integral_SIMD<uchar, double, double> {
    Integral_SIMD() {};

    bool operator()(const uchar *src, size_t _srcstep,
                    double *sum, size_t _sumstep,
                    double *sqsum, size_t _sqsumstep,
                    double *tilted, size_t _tiltedstep,
                    int width, int height, int cn) const
    {
        CV_UNUSED(_tiltedstep);
        // TODO: Add support for 1 channel input (WIP)
        if (!tilted && (cn <= 4))
        {
            calculate_integral_avx512(src, _srcstep, sum, _sumstep,
                                      sqsum, _sqsumstep, width, height, cn);
            return true;
        }
        return false;
    }
};
#endif

#if CV_SIMD && CV_SIMD_WIDTH <= 64

template <>
struct Integral_SIMD<uchar, int, double>
{
    Integral_SIMD() {}

    bool operator()(const uchar * src, size_t _srcstep,
                    int * sum, size_t _sumstep,
                    double * sqsum, size_t,
                    int * tilted, size_t,
                    int width, int height, int cn) const
    {
        if (sqsum || tilted || cn != 1)
            return false;

        // the first iteration
        memset(sum, 0, (width + 1) * sizeof(int));

        // the others
        for (int i = 0; i < height; ++i)
        {
            const uchar * src_row = src + _srcstep * i;
            int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + 1;
            int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + 1;

            sum_row[-1] = 0;

            v_int32 prev = vx_setzero_s32();
            int j = 0;
            for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
            {
                v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
                v_int32 el4l, el4h;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
                __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2));
                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4));
                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8));
                __m256i shmask = _mm256_set1_epi32(7);
                el4l.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_low(vsum)), prev.val);
                el4h.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum)), _mm256_permutevar8x32_epi32(el4l.val, shmask));
                prev.val = _mm256_permutevar8x32_epi32(el4h.val, shmask);
#else
                el8 += v_rotate_left<1>(el8);
                el8 += v_rotate_left<2>(el8);
#if CV_SIMD_WIDTH >= 32
                el8 += v_rotate_left<4>(el8);
#if CV_SIMD_WIDTH == 64
                el8 += v_rotate_left<8>(el8);
#endif
#endif
                v_expand(el8, el4l, el4h);
                el4l += prev;
                el4h += el4l;
                prev = v_broadcast_element<v_int32::nlanes - 1>(el4h);
#endif
                v_store(sum_row + j                  , el4l + vx_load(prev_sum_row + j                  ));
                v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes));
            }

            for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
                sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
        }
        return true;
    }
};

template <>
struct Integral_SIMD<uchar, float, double>
{
    Integral_SIMD() {}

    bool operator()(const uchar * src, size_t _srcstep,
                    float * sum, size_t _sumstep,
                    double * sqsum, size_t,
                    float * tilted, size_t,
                    int width, int height, int cn) const
    {
        if (sqsum || tilted || cn != 1)
            return false;

        // the first iteration
        memset(sum, 0, (width + 1) * sizeof(int));

        // the others
        for (int i = 0; i < height; ++i)
        {
            const uchar * src_row = src + _srcstep * i;
            float * prev_sum_row = (float *)((uchar *)sum + _sumstep * i) + 1;
            float * sum_row = (float *)((uchar *)sum + _sumstep * (i + 1)) + 1;

            sum_row[-1] = 0;

            v_float32 prev = vx_setzero_f32();
            int j = 0;
            for (; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
            {
                v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
                v_float32 el4l, el4h;
#if CV_AVX2 && CV_SIMD_WIDTH == 32
                __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2));
                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4));
                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8));
                __m256i shmask = _mm256_set1_epi32(7);
                el4l.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_low(vsum))), prev.val);
                el4h.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum))), _mm256_permutevar8x32_ps(el4l.val, shmask));
                prev.val = _mm256_permutevar8x32_ps(el4h.val, shmask);
#else
                el8 += v_rotate_left<1>(el8);
                el8 += v_rotate_left<2>(el8);
#if CV_SIMD_WIDTH >= 32
                el8 += v_rotate_left<4>(el8);
#if CV_SIMD_WIDTH == 64
                el8 += v_rotate_left<8>(el8);
#endif
#endif
                v_int32 el4li, el4hi;
                v_expand(el8, el4li, el4hi);
                el4l = v_cvt_f32(el4li) + prev;
                el4h = v_cvt_f32(el4hi) + el4l;
                prev = v_broadcast_element<v_float32::nlanes - 1>(el4h);
#endif
                v_store(sum_row + j                    , el4l + vx_load(prev_sum_row + j                    ));
                v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes));
            }

            for (float v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
                sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
        }
        return true;
    }
};

#endif

} // namespace anon

bool integral_SIMD(
        int depth, int sdepth, int sqdepth,
        const uchar* src, size_t srcstep,
        uchar* sum, size_t sumstep,
        uchar* sqsum, size_t sqsumstep,
        uchar* tilted, size_t tstep,
        int width, int height, int cn)
{
    CV_INSTRUMENT_REGION();

#define ONE_CALL(T, ST, QT) \
    return Integral_SIMD<T, ST, QT>()((const T*)src, srcstep, (ST*)sum, sumstep, (QT*)sqsum, sqsumstep, (ST*)tilted, tstep, width, height, cn)

    if( depth == CV_8U && sdepth == CV_32S && sqdepth == CV_64F )
        ONE_CALL(uchar, int, double);
    else if( depth == CV_8U && sdepth == CV_32S && sqdepth == CV_32F )
        ONE_CALL(uchar, int, float);
    else if( depth == CV_8U && sdepth == CV_32S && sqdepth == CV_32S )
        ONE_CALL(uchar, int, int);
    else if( depth == CV_8U && sdepth == CV_32F && sqdepth == CV_64F )
        ONE_CALL(uchar, float, double);
    else if( depth == CV_8U && sdepth == CV_32F && sqdepth == CV_32F )
        ONE_CALL(uchar, float, float);
    else if( depth == CV_8U && sdepth == CV_64F && sqdepth == CV_64F )
        ONE_CALL(uchar, double, double);
    else if( depth == CV_16U && sdepth == CV_64F && sqdepth == CV_64F )
        ONE_CALL(ushort, double, double);
    else if( depth == CV_16S && sdepth == CV_64F && sqdepth == CV_64F )
        ONE_CALL(short, double, double);
    else if( depth == CV_32F && sdepth == CV_32F && sqdepth == CV_64F )
        ONE_CALL(float, float, double);
    else if( depth == CV_32F && sdepth == CV_32F && sqdepth == CV_32F )
        ONE_CALL(float, float, float);
    else if( depth == CV_32F && sdepth == CV_64F && sqdepth == CV_64F )
        ONE_CALL(float, double, double);
    else if( depth == CV_64F && sdepth == CV_64F && sqdepth == CV_64F )
        ONE_CALL(double, double, double);
    else
        return false;

#undef ONE_CALL
}

#endif
CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // cv::hal::
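A note on the inner loop: the shift-and-add sequence v_rotate_left<1>, <2>, <4>, <8> is a log-step inclusive prefix sum across vector lanes (the classic Hillis-Steele scan) — after log2(N) rounds, lane i holds the sum of lanes 0..i, which is exactly the per-row running sum the integral image needs. A scalar model of the trick (a sketch with a fixed lane count, not OpenCV code):

#include <cstdio>

int main()
{
    const int N = 8;                              // stands in for the vector lane count
    int lanes[N] = {1, 2, 3, 4, 5, 6, 7, 8};
    for (int shift = 1; shift < N; shift *= 2)    // mirrors v_rotate_left<1>, <2>, <4>, ...
    {
        int shifted[N];
        for (int i = 0; i < N; ++i)               // rotate lanes up by 'shift', zero-fill
            shifted[i] = (i >= shift) ? lanes[i - shift] : 0;
        for (int i = 0; i < N; ++i)
            lanes[i] += shifted[i];
    }
    for (int i = 0; i < N; ++i)
        printf("%d ", lanes[i]);                  // prints: 1 3 6 10 15 21 28 36
    printf("\n");
    return 0;
}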