opencv: commit 11a09ef5, authored Jun 06, 2014 by Richard Yoo
Changes to support Intel AVX/AVX2 in cvResize().
Parent: 9a5e9d34
Showing 7 changed files, with 716 additions and 31 deletions.
CMakeLists.txt                                    +1    -0
cmake/OpenCVCompilerOptions.cmake                 +11   -3
modules/core/include/opencv2/core/core_c.h        +1    -0
modules/core/include/opencv2/core/internal.hpp    +7    -0
modules/core/src/system.cpp                       +35   -0
modules/imgproc/src/imgwarp.cpp                   +658  -28
modules/ts/src/ts_func.cpp                        +3    -0
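Build note (an editorial aside, not part of the diff itself): the new code paths are opt-in at configure time. With this patch applied, one would presumably enable them alongside the existing AVX option, for example by configuring with cmake -DENABLE_AVX=ON -DENABLE_AVX2=ON, so that the compiler flags added below (-mavx2 for GCC, /arch:AVX2 for MSVC 2013 and later) actually take effect. At run time the library still falls back to the SSE kernels when the CPU does not report AVX/AVX2.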
CMakeLists.txt
@@ -217,6 +217,7 @@ OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions"
 OCV_OPTION(ENABLE_SSE41          "Enable SSE4.1 instructions"                   OFF  IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_SSE42          "Enable SSE4.2 instructions"                   OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_AVX            "Enable AVX instructions"                      OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_AVX2           "Enable AVX2 instructions"                     OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_NEON           "Enable NEON instructions"                     OFF  IF CMAKE_COMPILER_IS_GNUCXX AND ARM)
 OCV_OPTION(ENABLE_VFPV3          "Enable VFPv3-D32 instructions"                OFF  IF CMAKE_COMPILER_IS_GNUCXX AND ARM)
 OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF )
cmake/OpenCVCompilerOptions.cmake
@@ -143,8 +143,12 @@ if(CMAKE_COMPILER_IS_GNUCXX)
         add_extra_compiler_option(-mavx)
       endif()
+      if(ENABLE_AVX2)
+        add_extra_compiler_option(-mavx2)
+      endif()
 
       # GCC depresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalence for all SSEx instructions when needed.
-      if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx")
+      if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-m(avx|avx2)")
         if(ENABLE_SSE3)
           add_extra_compiler_option(-msse3)
         endif()
@@ -165,7 +169,7 @@ if(CMAKE_COMPILER_IS_GNUCXX)
   if(X86 OR X86_64)
     if(NOT APPLE AND CMAKE_SIZEOF_VOID_P EQUAL 4)
-      if(OPENCV_EXTRA_CXX_FLAGS MATCHES "-m(sse2|avx)")
+      if(OPENCV_EXTRA_CXX_FLAGS MATCHES "-m(sse2|avx|avx2)")
         add_extra_compiler_option(-mfpmath=sse) # !! important - be on the same wave with x64 compilers
       else()
         add_extra_compiler_option(-mfpmath=387)
@@ -220,6 +224,10 @@ if(MSVC)
     set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX")
   endif()
 
+  if(ENABLE_AVX2 AND NOT MSVC_VERSION LESS 1800)
+    set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX2")
+  endif()
+
   if(ENABLE_SSE4_1 AND CV_ICC AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:")
     set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:SSE4.1")
   endif()
@@ -238,7 +246,7 @@ if(MSVC)
     endif()
   endif()
 
-  if(ENABLE_SSE OR ENABLE_SSE2 OR ENABLE_SSE3 OR ENABLE_SSE4_1 OR ENABLE_AVX)
+  if(ENABLE_SSE OR ENABLE_SSE2 OR ENABLE_SSE3 OR ENABLE_SSE4_1 OR ENABLE_AVX OR ENABLE_AVX2)
     set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /Oi")
   endif()
modules/core/include/opencv2/core/core_c.h
@@ -1706,6 +1706,7 @@ CVAPI(double) cvGetTickFrequency( void );
 #define CV_CPU_SSE4_2  7
 #define CV_CPU_POPCNT  8
 #define CV_CPU_AVX    10
+#define CV_CPU_AVX2   11
 #define CV_HARDWARE_MAX_FEATURE 255
 
 CVAPI(int) cvCheckHardwareSupport(int feature);
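With the new constant in place, the existing C API can be asked about AVX2 at run time exactly like any other feature bit. A minimal usage sketch (the program itself is mine and not part of the commit; the header path matches the 2.x layout shown above):

    #include <stdio.h>
    #include <opencv2/core/core_c.h>

    int main(void)
    {
        /* cvCheckHardwareSupport() returns non-zero when the feature was detected
           at library startup; CV_CPU_AVX2 is the constant added in this hunk. */
        printf("AVX2 support: %d\n", cvCheckHardwareSupport(CV_CPU_AVX2));
        return 0;
    }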
modules/core/include/opencv2/core/internal.hpp
@@ -141,6 +141,10 @@ CV_INLINE IppiSize ippiSize(const cv::Size & _size)
 #      define __xgetbv() 0
 #    endif
 #  endif
+#  if defined __AVX2__
+#    include <immintrin.h>
+#    define CV_AVX2 1
+#  endif
 #endif
@@ -176,6 +180,9 @@ CV_INLINE IppiSize ippiSize(const cv::Size & _size)
 #ifndef CV_AVX
 #  define CV_AVX 0
 #endif
+#ifndef CV_AVX2
+#  define CV_AVX2 0
+#endif
 #ifndef CV_NEON
 #  define CV_NEON 0
 #endif
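After this header change, CV_AVX2 is 1 only in translation units actually built with AVX2 code generation (__AVX2__ is defined when compiling with -mavx2 or /arch:AVX2) and 0 everywhere else, so the optimized kernels can be guarded at compile time before they are guarded again at run time. A standalone illustration of that compile-time half, deliberately not using any OpenCV names (MY_AVX2 is a placeholder playing the role of CV_AVX2):

    #if defined __AVX2__
    #  include <immintrin.h>
    #  define MY_AVX2 1
    #else
    #  define MY_AVX2 0
    #endif

    #include <stdio.h>

    int main(void)
    {
        /* With -mavx2 (or /arch:AVX2) this prints 1, otherwise 0, mirroring how
           CV_AVX2 decides whether the AVX2 kernels in imgwarp.cpp are compiled at all. */
        printf("compiled with AVX2 code generation: %d\n", MY_AVX2);
        return 0;
    }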
modules/core/src/system.cpp
@@ -253,6 +253,41 @@ struct HWFeatures
             f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
         }
 
+    #if CV_AVX2
+    #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+        __cpuidex(cpuid_data, 7, 0);
+    #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    #ifdef __x86_64__
+        asm __volatile__
+        (
+         "movl $7, %%eax\n\t"
+         "movl $0, %%ecx\n\t"
+         "cpuid\n\t"
+         :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
+         :
+         : "cc"
+        );
+    #else
+        asm volatile
+        (
+         "pushl %%ebx\n\t"
+         "movl $7,%%eax\n\t"
+         "movl $0,%%ecx\n\t"
+         "cpuid\n\t"
+         "popl %%ebx\n\t"
+         : "=a"(cpuid_data[0]), "=b"(cpuid_data[1]), "=c"(cpuid_data[2]), "=d"(cpuid_data[3])
+         :
+         : "cc"
+        );
+    #endif
+    #endif
+
+        if( f.x86_family >= 6 )
+        {
+            f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0;
+        }
+    #endif
+
         return f;
     }
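The added detection issues CPUID leaf 7, sub-leaf 0, and tests bit 5 of EBX, which is where the processor reports AVX2. Purely for comparison (not part of the commit, and only available on reasonably recent GCC or Clang), the same question can be asked through a compiler builtin:

    #include <stdio.h>

    int main(void)
    {
        /* GCC >= 4.8 / recent Clang only; this folds the manual leaf-7 EBX bit-5
           test performed in the hunk above into a single builtin. */
        printf("avx2: %d\n", __builtin_cpu_supports("avx2") ? 1 : 0);
        return 0;
    }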
modules/imgproc/src/imgwarp.cpp
@@ -54,6 +54,10 @@
 static IppStatus sts = ippInit();
 #endif
 
+#ifdef _MSC_VER
+# pragma warning(disable:4752) // Disable warning for mixing SSE and AVX
+#endif
+
 namespace cv
 {
@@ -451,13 +455,8 @@ struct HResizeNoVec
 #if CV_SSE2
 
-struct VResizeLinearVec_32s8u
+static int VResizeLinearVec_32s8u_sse2( const uchar** _src, uchar* dst, const uchar* _beta, int width )
 {
-    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
-    {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
     const int** src = (const int**)_src;
     const short* beta = (const short*)_beta;
     const int *S0 = src[0], *S1 = src[1];
@@ -530,17 +529,111 @@ struct VResizeLinearVec_32s8u
         }
 
     return x;
 }
 
+#if CV_AVX2
+int VResizeLinearVec_32s8u_avx2( const uchar** _src, uchar* dst, const uchar* _beta, int width )
+{
+    const int** src = (const int**)_src;
+    const short* beta = (const short*)_beta;
+    const int *S0 = src[0], *S1 = src[1];
+    int x = 0;
+    __m256i b0 = _mm256_set1_epi16(beta[0]), b1 = _mm256_set1_epi16(beta[1]);
+    __m256i delta = _mm256_set1_epi16(2);
+    const int index[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
+    __m256i shuffle = _mm256_load_si256((const __m256i*)index);
+
+    if( (((size_t)S0|(size_t)S1)&31) == 0 )
+        for( ; x <= width - 32; x += 32 )
+        {
+            __m256i x0, x1, x2, y0, y1, y2;
+            x0 = _mm256_load_si256((const __m256i*)(S0 + x));
+            x1 = _mm256_load_si256((const __m256i*)(S0 + x + 8));
+            y0 = _mm256_load_si256((const __m256i*)(S1 + x));
+            y1 = _mm256_load_si256((const __m256i*)(S1 + x + 8));
+            x0 = _mm256_packs_epi32(_mm256_srai_epi32(x0, 4), _mm256_srai_epi32(x1, 4));
+            y0 = _mm256_packs_epi32(_mm256_srai_epi32(y0, 4), _mm256_srai_epi32(y1, 4));
+            x1 = _mm256_load_si256((const __m256i*)(S0 + x + 16));
+            x2 = _mm256_load_si256((const __m256i*)(S0 + x + 24));
+            y1 = _mm256_load_si256((const __m256i*)(S1 + x + 16));
+            y2 = _mm256_load_si256((const __m256i*)(S1 + x + 24));
+            x1 = _mm256_packs_epi32(_mm256_srai_epi32(x1, 4), _mm256_srai_epi32(x2, 4));
+            y1 = _mm256_packs_epi32(_mm256_srai_epi32(y1, 4), _mm256_srai_epi32(y2, 4));
+            x0 = _mm256_adds_epi16(_mm256_mulhi_epi16(x0, b0), _mm256_mulhi_epi16(y0, b1));
+            x1 = _mm256_adds_epi16(_mm256_mulhi_epi16(x1, b0), _mm256_mulhi_epi16(y1, b1));
+            x0 = _mm256_srai_epi16(_mm256_adds_epi16(x0, delta), 2);
+            x1 = _mm256_srai_epi16(_mm256_adds_epi16(x1, delta), 2);
+            x0 = _mm256_packus_epi16(x0, x1);
+            x0 = _mm256_permutevar8x32_epi32(x0, shuffle);
+            _mm256_storeu_si256( (__m256i*)(dst + x), x0 );
+        }
+    else
+        for( ; x <= width - 32; x += 32 )
+        {
+            __m256i x0, x1, x2, y0, y1, y2;
+            x0 = _mm256_loadu_si256((const __m256i*)(S0 + x));
+            x1 = _mm256_loadu_si256((const __m256i*)(S0 + x + 8));
+            y0 = _mm256_loadu_si256((const __m256i*)(S1 + x));
+            y1 = _mm256_loadu_si256((const __m256i*)(S1 + x + 8));
+            x0 = _mm256_packs_epi32(_mm256_srai_epi32(x0, 4), _mm256_srai_epi32(x1, 4));
+            y0 = _mm256_packs_epi32(_mm256_srai_epi32(y0, 4), _mm256_srai_epi32(y1, 4));
+            x1 = _mm256_loadu_si256((const __m256i*)(S0 + x + 16));
+            x2 = _mm256_loadu_si256((const __m256i*)(S0 + x + 24));
+            y1 = _mm256_loadu_si256((const __m256i*)(S1 + x + 16));
+            y2 = _mm256_loadu_si256((const __m256i*)(S1 + x + 24));
+            x1 = _mm256_packs_epi32(_mm256_srai_epi32(x1, 4), _mm256_srai_epi32(x2, 4));
+            y1 = _mm256_packs_epi32(_mm256_srai_epi32(y1, 4), _mm256_srai_epi32(y2, 4));
+            x0 = _mm256_adds_epi16(_mm256_mulhi_epi16(x0, b0), _mm256_mulhi_epi16(y0, b1));
+            x1 = _mm256_adds_epi16(_mm256_mulhi_epi16(x1, b0), _mm256_mulhi_epi16(y1, b1));
+            x0 = _mm256_srai_epi16(_mm256_adds_epi16(x0, delta), 2);
+            x1 = _mm256_srai_epi16(_mm256_adds_epi16(x1, delta), 2);
+            x0 = _mm256_packus_epi16(x0, x1);
+            x0 = _mm256_permutevar8x32_epi32(x0, shuffle);
+            _mm256_storeu_si256( (__m256i*)(dst + x), x0 );
+        }
+
+    for( ; x < width - 8; x += 8 )
+    {
+        __m256i x0, y0;
+        x0 = _mm256_srai_epi32(_mm256_loadu_si256((const __m256i*)(S0 + x)), 4);
+        y0 = _mm256_srai_epi32(_mm256_loadu_si256((const __m256i*)(S1 + x)), 4);
+        x0 = _mm256_packs_epi32(x0, x0);
+        y0 = _mm256_packs_epi32(y0, y0);
+        x0 = _mm256_adds_epi16(_mm256_mulhi_epi16(x0, b0), _mm256_mulhi_epi16(y0, b1));
+        x0 = _mm256_srai_epi16(_mm256_adds_epi16(x0, delta), 2);
+        x0 = _mm256_packus_epi16(x0, x0);
+        *(int*)(dst + x) = _mm_cvtsi128_si32(_mm256_extracti128_si256(x0, 0));
+        *(int*)(dst + x + 4) = _mm_cvtsi128_si32(_mm256_extracti128_si256(x0, 1));
+    }
+
+    return x;
+}
+#endif
+
-template<int shiftval> struct VResizeLinearVec_32f16
+struct VResizeLinearVec_32s8u
 {
-    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
+    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
+    #if CV_AVX2
+        if( checkHardwareSupport(CV_CPU_AVX2) )
+            return VResizeLinearVec_32s8u_avx2(_src, dst, _beta, width);
+    #endif
+        if( checkHardwareSupport(CV_CPU_SSE2) )
+            return VResizeLinearVec_32s8u_sse2(_src, dst, _beta, width);
+        return 0;
     }
 };
 
+template<int shiftval>
+int VResizeLinearVec_32f16_sse2( const uchar** _src, uchar* _dst, const uchar* _beta, int width )
+{
     const float** src = (const float**)_src;
     const float* beta = (const float*)_beta;
     const float *S0 = src[0], *S1 = src[1];
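One detail worth calling out in the new 8-bit path above: the AVX2 pack instructions operate independently within each 128-bit lane, so after _mm256_packs_epi32 and _mm256_packus_epi16 the bytes come out lane-interleaved, and the cross-lane _mm256_permutevar8x32_epi32 with the index table { 0, 4, 1, 5, 2, 6, 3, 7 } restores linear order before the store. A standalone sketch (mine, not from the commit; compile with -mavx2 and run on an AVX2 machine) showing that round trip on the values 0..31:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void)
    {
        /* 32 dwords holding 0..31, split into four 8-dword registers, just like
           S0[x..x+31] in the aligned loop (minus the >>4 scaling and blending). */
        int src[32], i;
        unsigned char out[32];
        for( i = 0; i < 32; i++ ) src[i] = i;

        __m256i a = _mm256_loadu_si256((const __m256i*)(src));
        __m256i b = _mm256_loadu_si256((const __m256i*)(src + 8));
        __m256i c = _mm256_loadu_si256((const __m256i*)(src + 16));
        __m256i d = _mm256_loadu_si256((const __m256i*)(src + 24));

        /* Both pack steps work per 128-bit lane, so the byte order is scrambled... */
        __m256i lo = _mm256_packs_epi32(a, b);
        __m256i hi = _mm256_packs_epi32(c, d);
        __m256i packed = _mm256_packus_epi16(lo, hi);

        /* ...and the cross-lane dword permute {0,4,1,5,2,6,3,7} puts it back. */
        const int idx[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
        __m256i shuffle = _mm256_loadu_si256((const __m256i*)idx);
        packed = _mm256_permutevar8x32_epi32(packed, shuffle);

        _mm256_storeu_si256((__m256i*)out, packed);
        for( i = 0; i < 32; i++ ) printf("%d ", out[i]);   /* prints 0 1 2 ... 31 */
        printf("\n");
        return 0;
    }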
@@ -626,19 +719,121 @@ template<int shiftval> struct VResizeLinearVec_32f16
         }
 
     return x;
 }
 
+#if CV_AVX2
+template<int shiftval>
+int VResizeLinearVec_32f16_avx2( const uchar** _src, uchar* _dst, const uchar* _beta, int width )
+{
+    const float** src = (const float**)_src;
+    const float* beta = (const float*)_beta;
+    const float *S0 = src[0], *S1 = src[1];
+    ushort* dst = (ushort*)_dst;
+    int x = 0;
+    __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]);
+    __m256i preshift = _mm256_set1_epi32(shiftval);
+    __m256i postshift = _mm256_set1_epi16((short)shiftval);
+
+    if( (((size_t)S0|(size_t)S1)&31) == 0 )
+        for( ; x <= width - 32; x += 32 )
+        {
+            __m256 x0, x1, y0, y1;
+            __m256i t0, t1, t2;
+            x0 = _mm256_load_ps(S0 + x);
+            x1 = _mm256_load_ps(S0 + x + 8);
+            y0 = _mm256_load_ps(S1 + x);
+            y1 = _mm256_load_ps(S1 + x + 8);
+            x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1));
+            x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1));
+            t0 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift);
+            t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift);
+            t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t2), postshift);
+            x0 = _mm256_load_ps(S0 + x + 16);
+            x1 = _mm256_load_ps(S0 + x + 24);
+            y0 = _mm256_load_ps(S1 + x + 16);
+            y1 = _mm256_load_ps(S1 + x + 24);
+            x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1));
+            x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1));
+            t1 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift);
+            t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift);
+            t1 = _mm256_add_epi16(_mm256_packs_epi32(t1, t2), postshift);
+            _mm256_storeu_si256( (__m256i*)(dst + x), t0 );
+            _mm256_storeu_si256( (__m256i*)(dst + x + 16), t1 );
+        }
+    else
+        for( ; x <= width - 32; x += 32 )
+        {
+            __m256 x0, x1, y0, y1;
+            __m256i t0, t1, t2;
+            x0 = _mm256_loadu_ps(S0 + x);
+            x1 = _mm256_loadu_ps(S0 + x + 8);
+            y0 = _mm256_loadu_ps(S1 + x);
+            y1 = _mm256_loadu_ps(S1 + x + 8);
+            x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1));
+            x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1));
+            t0 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift);
+            t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift);
+            t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t2), postshift);
+            x0 = _mm256_loadu_ps(S0 + x + 16);
+            x1 = _mm256_loadu_ps(S0 + x + 24);
+            y0 = _mm256_loadu_ps(S1 + x + 16);
+            y1 = _mm256_loadu_ps(S1 + x + 24);
+            x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1));
+            x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1));
+            t1 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift);
+            t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift);
+            t1 = _mm256_add_epi16(_mm256_packs_epi32(t1, t2), postshift);
+            _mm256_storeu_si256( (__m256i*)(dst + x), t0 );
+            _mm256_storeu_si256( (__m256i*)(dst + x + 16), t1 );
+        }
+
+    for( ; x < width - 8; x += 8 )
+    {
+        __m256 x0, y0;
+        __m256i t0;
+        x0 = _mm256_loadu_ps(S0 + x);
+        y0 = _mm256_loadu_ps(S1 + x);
+        x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1));
+        t0 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift);
+        t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t0), postshift);
+        _mm_storel_epi64( (__m128i*)(dst + x), _mm256_extracti128_si256(t0, 0));
+        _mm_storel_epi64( (__m128i*)(dst + x + 4), _mm256_extracti128_si256(t0, 1));
+    }
+
+    return x;
+}
+#endif
+
 template<int shiftval> struct VResizeLinearVec_32f16
 {
     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE) )
-            return 0;
-
+    #if CV_AVX2
+        if( checkHardwareSupport(CV_CPU_AVX2) )
+            return VResizeLinearVec_32f16_avx2<shiftval>(_src, _dst, _beta, width);
+    #endif
+        if( checkHardwareSupport(CV_CPU_SSE2) )
+            return VResizeLinearVec_32f16_sse2<shiftval>(_src, _dst, _beta, width);
+        return 0;
     }
 };
 
 typedef VResizeLinearVec_32f16<SHRT_MIN> VResizeLinearVec_32f16u;
 typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s;
 
-struct VResizeLinearVec_32f
+static int VResizeLinearVec_32f_sse( const uchar** _src, uchar* _dst, const uchar* _beta, int width )
 {
     const float** src = (const float**)_src;
     const float* beta = (const float*)_beta;
     const float *S0 = src[0], *S1 = src[1];
@@ -679,17 +874,72 @@ struct VResizeLinearVec_32f
         }
 
     return x;
 }
 
+#if CV_AVX
+int VResizeLinearVec_32f_avx( const uchar** _src, uchar* _dst, const uchar* _beta, int width )
+{
+    const float** src = (const float**)_src;
+    const float* beta = (const float*)_beta;
+    const float *S0 = src[0], *S1 = src[1];
+    float* dst = (float*)_dst;
+    int x = 0;
+    __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]);
+
+    if( (((size_t)S0|(size_t)S1)&31) == 0 )
+        for( ; x <= width - 16; x += 16 )
+        {
+            __m256 x0, x1, y0, y1;
+            x0 = _mm256_load_ps(S0 + x);
+            x1 = _mm256_load_ps(S0 + x + 8);
+            y0 = _mm256_load_ps(S1 + x);
+            y1 = _mm256_load_ps(S1 + x + 8);
+            x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1));
+            x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1));
+            _mm256_storeu_ps(dst + x, x0);
+            _mm256_storeu_ps(dst + x + 8, x1);
+        }
+    else
+        for( ; x <= width - 16; x += 16 )
+        {
+            __m256 x0, x1, y0, y1;
+            x0 = _mm256_loadu_ps(S0 + x);
+            x1 = _mm256_loadu_ps(S0 + x + 8);
+            y0 = _mm256_loadu_ps(S1 + x);
+            y1 = _mm256_loadu_ps(S1 + x + 8);
+            x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1));
+            x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1));
+            _mm256_storeu_ps(dst + x, x0);
+            _mm256_storeu_ps(dst + x + 8, x1);
+        }
+
+    return x;
+}
+#endif
+
 struct VResizeLinearVec_32f
 {
-    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
+    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
+    #if CV_AVX
+        if( checkHardwareSupport(CV_CPU_AVX) )
+            return VResizeLinearVec_32f_avx(_src, _dst, _beta, width);
+    #endif
+        if( checkHardwareSupport(CV_CPU_SSE) )
+            return VResizeLinearVec_32f_sse(_src, _dst, _beta, width);
+        return 0;
     }
 };
 
-struct VResizeCubicVec_32s8u
+static int VResizeCubicVec_32s8u_sse2( const uchar** _src, uchar* dst, const uchar* _beta, int width )
 {
     const int** src = (const int**)_src;
     const short* beta = (const short*)_beta;
     const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
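Each of the new kernels tests source-pointer alignment once, outside the loop: OR-ing the addresses together and masking with 31 is zero only when every source pointer is 32-byte aligned, and only then are the aligned load intrinsics used (the stores remain unaligned, since dst alignment is not checked). A small self-contained illustration of that guard, written outside the OpenCV code and using a hypothetical helper name:

    #include <stddef.h>
    #include <stdio.h>

    /* Returns 1 when both pointers are 32-byte aligned, mirroring the
       (((size_t)S0 | (size_t)S1) & 31) == 0 test in the AVX kernels. */
    static int both_aligned32(const void* a, const void* b)
    {
        return ((((size_t)a) | ((size_t)b)) & 31) == 0;
    }

    int main(void)
    {
        float buf[32];
        printf("%d\n", both_aligned32(buf, buf + 8));  /* depends on where buf happens to land */
        printf("%d\n", both_aligned32(buf, buf + 1));  /* always 0: the pointers differ by 4 bytes */
        return 0;
    }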
@@ -774,17 +1024,124 @@ struct VResizeCubicVec_32s8u
         }
 
     return x;
 }
 
+#if CV_AVX2
+int VResizeCubicVec_32s8u_avx2( const uchar** _src, uchar* dst, const uchar* _beta, int width )
+{
+    const int** src = (const int**)_src;
+    const short* beta = (const short*)_beta;
+    const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
+    int x = 0;
+    float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE);
+    __m256 b0 = _mm256_set1_ps(beta[0]*scale), b1 = _mm256_set1_ps(beta[1]*scale),
+           b2 = _mm256_set1_ps(beta[2]*scale), b3 = _mm256_set1_ps(beta[3]*scale);
+    const int shuffle = 0xd8; // 11 | 01 | 10 | 00
+
+    if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&31) == 0 )
+        for( ; x <= width - 16; x += 16 )
+        {
+            __m256i x0, x1, y0, y1;
+            __m256 s0, s1, f0, f1;
+            x0 = _mm256_load_si256((const __m256i*)(S0 + x));
+            x1 = _mm256_load_si256((const __m256i*)(S0 + x + 8));
+            y0 = _mm256_load_si256((const __m256i*)(S1 + x));
+            y1 = _mm256_load_si256((const __m256i*)(S1 + x + 8));
+            s0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b0);
+            s1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b0);
+            f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b1);
+            f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(y1), b1);
+            s0 = _mm256_add_ps(s0, f0);
+            s1 = _mm256_add_ps(s1, f1);
+            x0 = _mm256_load_si256((const __m256i*)(S2 + x));
+            x1 = _mm256_load_si256((const __m256i*)(S2 + x + 8));
+            y0 = _mm256_load_si256((const __m256i*)(S3 + x));
+            y1 = _mm256_load_si256((const __m256i*)(S3 + x + 8));
+            f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b2);
+            f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b2);
+            s0 = _mm256_add_ps(s0, f0);
+            s1 = _mm256_add_ps(s1, f1);
+            f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b3);
+            f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(y1), b3);
+            s0 = _mm256_add_ps(s0, f0);
+            s1 = _mm256_add_ps(s1, f1);
+            x0 = _mm256_cvtps_epi32(s0);
+            x1 = _mm256_cvtps_epi32(s1);
+            x0 = _mm256_packs_epi32(x0, x1);
+            x0 = _mm256_permute4x64_epi64(x0, shuffle);
+            x0 = _mm256_packus_epi16(x0, x0);
+            _mm_storel_epi64( (__m128i*)(dst + x), _mm256_extracti128_si256(x0, 0));
+            _mm_storel_epi64( (__m128i*)(dst + x + 8), _mm256_extracti128_si256(x0, 1));
+        }
+    else
+        for( ; x <= width - 16; x += 16 )
+        {
+            __m256i x0, x1, y0, y1;
+            __m256 s0, s1, f0, f1;
+            x0 = _mm256_loadu_si256((const __m256i*)(S0 + x));
+            x1 = _mm256_loadu_si256((const __m256i*)(S0 + x + 8));
+            y0 = _mm256_loadu_si256((const __m256i*)(S1 + x));
+            y1 = _mm256_loadu_si256((const __m256i*)(S1 + x + 8));
+            s0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b0);
+            s1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b0);
+            f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b1);
+            f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(y1), b1);
+            s0 = _mm256_add_ps(s0, f0);
+            s1 = _mm256_add_ps(s1, f1);
+            x0 = _mm256_loadu_si256((const __m256i*)(S2 + x));
+            x1 = _mm256_loadu_si256((const __m256i*)(S2 + x + 8));
+            y0 = _mm256_loadu_si256((const __m256i*)(S3 + x));
+            y1 = _mm256_loadu_si256((const __m256i*)(S3 + x + 8));
+            f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b2);
+            f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b2);
+            s0 = _mm256_add_ps(s0, f0);
+            s1 = _mm256_add_ps(s1, f1);
+            f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b3);
+            f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(y1), b3);
+            s0 = _mm256_add_ps(s0, f0);
+            s1 = _mm256_add_ps(s1, f1);
+            x0 = _mm256_cvtps_epi32(s0);
+            x1 = _mm256_cvtps_epi32(s1);
+            x0 = _mm256_packs_epi32(x0, x1);
+            x0 = _mm256_permute4x64_epi64(x0, shuffle);
+            x0 = _mm256_packus_epi16(x0, x0);
+            _mm_storel_epi64( (__m128i*)(dst + x), _mm256_extracti128_si256(x0, 0));
+            _mm_storel_epi64( (__m128i*)(dst + x + 8), _mm256_extracti128_si256(x0, 1));
+        }
+
+    return x;
+}
+#endif
+
-template<int shiftval> struct VResizeCubicVec_32f16
+struct VResizeCubicVec_32s8u
 {
-    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
+    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
+    #if CV_AVX2
+        if( checkHardwareSupport(CV_CPU_AVX2) )
+            return VResizeCubicVec_32s8u_avx2(_src, dst, _beta, width);
+    #endif
+        if( checkHardwareSupport(CV_CPU_SSE2) )
+            return VResizeCubicVec_32s8u_sse2(_src, dst, _beta, width);
+        return 0;
     }
 };
 
+template<int shiftval>
+int VResizeCubicVec_32f16_sse2( const uchar** _src, uchar* _dst, const uchar* _beta, int width )
+{
     const float** src = (const float**)_src;
     const float* beta = (const float*)_beta;
     const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
@@ -795,6 +1152,44 @@ template<int shiftval> struct VResizeCubicVec_32f16
     __m128i preshift = _mm_set1_epi32(shiftval);
     __m128i postshift = _mm_set1_epi16((short)shiftval);
 
+    if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 )
+        for( ; x <= width - 8; x += 8 )
+        {
+            __m128 x0, x1, y0, y1, s0, s1;
+            __m128i t0, t1;
+            x0 = _mm_load_ps(S0 + x);
+            x1 = _mm_load_ps(S0 + x + 4);
+            y0 = _mm_load_ps(S1 + x);
+            y1 = _mm_load_ps(S1 + x + 4);
+            s0 = _mm_mul_ps(x0, b0);
+            s1 = _mm_mul_ps(x1, b0);
+            y0 = _mm_mul_ps(y0, b1);
+            y1 = _mm_mul_ps(y1, b1);
+            s0 = _mm_add_ps(s0, y0);
+            s1 = _mm_add_ps(s1, y1);
+            x0 = _mm_load_ps(S2 + x);
+            x1 = _mm_load_ps(S2 + x + 4);
+            y0 = _mm_load_ps(S3 + x);
+            y1 = _mm_load_ps(S3 + x + 4);
+            x0 = _mm_mul_ps(x0, b2);
+            x1 = _mm_mul_ps(x1, b2);
+            y0 = _mm_mul_ps(y0, b3);
+            y1 = _mm_mul_ps(y1, b3);
+            s0 = _mm_add_ps(s0, x0);
+            s1 = _mm_add_ps(s1, x1);
+            s0 = _mm_add_ps(s0, y0);
+            s1 = _mm_add_ps(s1, y1);
+            t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift);
+            t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift);
+            t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift);
+            _mm_storeu_si128( (__m128i*)(dst + x), t0 );
+        }
+    else
     for( ; x <= width - 8; x += 8 )
     {
         __m128 x0, x1, y0, y1, s0, s1;
@@ -833,19 +1228,124 @@ template<int shiftval> struct VResizeCubicVec_32f16
         }
 
     return x;
 }
 
+#if CV_AVX2
+template<int shiftval>
+int VResizeCubicVec_32f16_avx2( const uchar** _src, uchar* _dst, const uchar* _beta, int width )
+{
+    const float** src = (const float**)_src;
+    const float* beta = (const float*)_beta;
+    const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
+    ushort* dst = (ushort*)_dst;
+    int x = 0;
+    __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]),
+           b2 = _mm256_set1_ps(beta[2]), b3 = _mm256_set1_ps(beta[3]);
+    __m256i preshift = _mm256_set1_epi32(shiftval);
+    __m256i postshift = _mm256_set1_epi16((short)shiftval);
+    const int shuffle = 0xd8; // 11 | 01 | 10 | 00
+
+    if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&31) == 0 )
+        for( ; x <= width - 16; x += 16 )
+        {
+            __m256 x0, x1, y0, y1, s0, s1;
+            __m256i t0, t1;
+            x0 = _mm256_load_ps(S0 + x);
+            x1 = _mm256_load_ps(S0 + x + 8);
+            y0 = _mm256_load_ps(S1 + x);
+            y1 = _mm256_load_ps(S1 + x + 8);
+            s0 = _mm256_mul_ps(x0, b0);
+            s1 = _mm256_mul_ps(x1, b0);
+            y0 = _mm256_mul_ps(y0, b1);
+            y1 = _mm256_mul_ps(y1, b1);
+            s0 = _mm256_add_ps(s0, y0);
+            s1 = _mm256_add_ps(s1, y1);
+            x0 = _mm256_load_ps(S2 + x);
+            x1 = _mm256_load_ps(S2 + x + 8);
+            y0 = _mm256_load_ps(S3 + x);
+            y1 = _mm256_load_ps(S3 + x + 8);
+            x0 = _mm256_mul_ps(x0, b2);
+            x1 = _mm256_mul_ps(x1, b2);
+            y0 = _mm256_mul_ps(y0, b3);
+            y1 = _mm256_mul_ps(y1, b3);
+            s0 = _mm256_add_ps(s0, x0);
+            s1 = _mm256_add_ps(s1, x1);
+            s0 = _mm256_add_ps(s0, y0);
+            s1 = _mm256_add_ps(s1, y1);
+            t0 = _mm256_add_epi32(_mm256_cvtps_epi32(s0), preshift);
+            t1 = _mm256_add_epi32(_mm256_cvtps_epi32(s1), preshift);
+            t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t1), postshift);
+            t0 = _mm256_permute4x64_epi64(t0, shuffle);
+            _mm256_storeu_si256( (__m256i*)(dst + x), t0 );
+        }
+    else
+        for( ; x <= width - 16; x += 16 )
+        {
+            __m256 x0, x1, y0, y1, s0, s1;
+            __m256i t0, t1;
+            x0 = _mm256_loadu_ps(S0 + x);
+            x1 = _mm256_loadu_ps(S0 + x + 8);
+            y0 = _mm256_loadu_ps(S1 + x);
+            y1 = _mm256_loadu_ps(S1 + x + 8);
+            s0 = _mm256_mul_ps(x0, b0);
+            s1 = _mm256_mul_ps(x1, b0);
+            y0 = _mm256_mul_ps(y0, b1);
+            y1 = _mm256_mul_ps(y1, b1);
+            s0 = _mm256_add_ps(s0, y0);
+            s1 = _mm256_add_ps(s1, y1);
+            x0 = _mm256_loadu_ps(S2 + x);
+            x1 = _mm256_loadu_ps(S2 + x + 8);
+            y0 = _mm256_loadu_ps(S3 + x);
+            y1 = _mm256_loadu_ps(S3 + x + 8);
+            x0 = _mm256_mul_ps(x0, b2);
+            x1 = _mm256_mul_ps(x1, b2);
+            y0 = _mm256_mul_ps(y0, b3);
+            y1 = _mm256_mul_ps(y1, b3);
+            s0 = _mm256_add_ps(s0, x0);
+            s1 = _mm256_add_ps(s1, x1);
+            s0 = _mm256_add_ps(s0, y0);
+            s1 = _mm256_add_ps(s1, y1);
+            t0 = _mm256_add_epi32(_mm256_cvtps_epi32(s0), preshift);
+            t1 = _mm256_add_epi32(_mm256_cvtps_epi32(s1), preshift);
+            t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t1), postshift);
+            t0 = _mm256_permute4x64_epi64(t0, shuffle);
+            _mm256_storeu_si256( (__m256i*)(dst + x), t0 );
+        }
+
+    return x;
+}
+#endif
+
-struct VResizeCubicVec_32f
+template<int shiftval> struct VResizeCubicVec_32f16
 {
     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE) )
-            return 0;
-
+    #if CV_AVX2
+        if( checkHardwareSupport(CV_CPU_AVX2) )
+            return VResizeCubicVec_32f16_avx2<shiftval>(_src, _dst, _beta, width);
+    #endif
+        if( checkHardwareSupport(CV_CPU_SSE2) )
+            return VResizeCubicVec_32f16_sse2<shiftval>(_src, _dst, _beta, width);
+        return 0;
     }
 };
 
 typedef VResizeCubicVec_32f16<SHRT_MIN> VResizeCubicVec_32f16u;
 typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s;
 
+static int VResizeCubicVec_32f_sse( const uchar** _src, uchar* _dst, const uchar* _beta, int width )
+{
     const float** src = (const float**)_src;
     const float* beta = (const float*)_beta;
     const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
@@ -854,6 +1354,40 @@ struct VResizeCubicVec_32f
     __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
 
+    if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 )
+        for( ; x <= width - 8; x += 8 )
+        {
+            __m128 x0, x1, y0, y1, s0, s1;
+            x0 = _mm_load_ps(S0 + x);
+            x1 = _mm_load_ps(S0 + x + 4);
+            y0 = _mm_load_ps(S1 + x);
+            y1 = _mm_load_ps(S1 + x + 4);
+            s0 = _mm_mul_ps(x0, b0);
+            s1 = _mm_mul_ps(x1, b0);
+            y0 = _mm_mul_ps(y0, b1);
+            y1 = _mm_mul_ps(y1, b1);
+            s0 = _mm_add_ps(s0, y0);
+            s1 = _mm_add_ps(s1, y1);
+            x0 = _mm_load_ps(S2 + x);
+            x1 = _mm_load_ps(S2 + x + 4);
+            y0 = _mm_load_ps(S3 + x);
+            y1 = _mm_load_ps(S3 + x + 4);
+            x0 = _mm_mul_ps(x0, b2);
+            x1 = _mm_mul_ps(x1, b2);
+            y0 = _mm_mul_ps(y0, b3);
+            y1 = _mm_mul_ps(y1, b3);
+            s0 = _mm_add_ps(s0, x0);
+            s1 = _mm_add_ps(s1, x1);
+            s0 = _mm_add_ps(s0, y0);
+            s1 = _mm_add_ps(s1, y1);
+            _mm_storeu_ps(dst + x, s0);
+            _mm_storeu_ps(dst + x + 4, s1);
+        }
+    else
     for( ; x <= width - 8; x += 8 )
     {
         __m128 x0, x1, y0, y1, s0, s1;
@@ -888,6 +1422,102 @@ struct VResizeCubicVec_32f
         }
 
     return x;
 }
 
+#if CV_AVX
+int VResizeCubicVec_32f_avx( const uchar** _src, uchar* _dst, const uchar* _beta, int width )
+{
+    const float** src = (const float**)_src;
+    const float* beta = (const float*)_beta;
+    const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
+    float* dst = (float*)_dst;
+    int x = 0;
+    __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]),
+           b2 = _mm256_set1_ps(beta[2]), b3 = _mm256_set1_ps(beta[3]);
+
+    if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&31) == 0 )
+        for( ; x <= width - 16; x += 16 )
+        {
+            __m256 x0, x1, y0, y1, s0, s1;
+            x0 = _mm256_load_ps(S0 + x);
+            x1 = _mm256_load_ps(S0 + x + 8);
+            y0 = _mm256_load_ps(S1 + x);
+            y1 = _mm256_load_ps(S1 + x + 8);
+            s0 = _mm256_mul_ps(x0, b0);
+            s1 = _mm256_mul_ps(x1, b0);
+            y0 = _mm256_mul_ps(y0, b1);
+            y1 = _mm256_mul_ps(y1, b1);
+            s0 = _mm256_add_ps(s0, y0);
+            s1 = _mm256_add_ps(s1, y1);
+            x0 = _mm256_load_ps(S2 + x);
+            x1 = _mm256_load_ps(S2 + x + 8);
+            y0 = _mm256_load_ps(S3 + x);
+            y1 = _mm256_load_ps(S3 + x + 8);
+            x0 = _mm256_mul_ps(x0, b2);
+            x1 = _mm256_mul_ps(x1, b2);
+            y0 = _mm256_mul_ps(y0, b3);
+            y1 = _mm256_mul_ps(y1, b3);
+            s0 = _mm256_add_ps(s0, x0);
+            s1 = _mm256_add_ps(s1, x1);
+            s0 = _mm256_add_ps(s0, y0);
+            s1 = _mm256_add_ps(s1, y1);
+            _mm256_storeu_ps(dst + x, s0);
+            _mm256_storeu_ps(dst + x + 8, s1);
+        }
+    else
+        for( ; x <= width - 16; x += 16 )
+        {
+            __m256 x0, x1, y0, y1, s0, s1;
+            x0 = _mm256_loadu_ps(S0 + x);
+            x1 = _mm256_loadu_ps(S0 + x + 8);
+            y0 = _mm256_loadu_ps(S1 + x);
+            y1 = _mm256_loadu_ps(S1 + x + 8);
+            s0 = _mm256_mul_ps(x0, b0);
+            s1 = _mm256_mul_ps(x1, b0);
+            y0 = _mm256_mul_ps(y0, b1);
+            y1 = _mm256_mul_ps(y1, b1);
+            s0 = _mm256_add_ps(s0, y0);
+            s1 = _mm256_add_ps(s1, y1);
+            x0 = _mm256_loadu_ps(S2 + x);
+            x1 = _mm256_loadu_ps(S2 + x + 8);
+            y0 = _mm256_loadu_ps(S3 + x);
+            y1 = _mm256_loadu_ps(S3 + x + 8);
+            x0 = _mm256_mul_ps(x0, b2);
+            x1 = _mm256_mul_ps(x1, b2);
+            y0 = _mm256_mul_ps(y0, b3);
+            y1 = _mm256_mul_ps(y1, b3);
+            s0 = _mm256_add_ps(s0, x0);
+            s1 = _mm256_add_ps(s1, x1);
+            s0 = _mm256_add_ps(s0, y0);
+            s1 = _mm256_add_ps(s1, y1);
+            _mm256_storeu_ps(dst + x, s0);
+            _mm256_storeu_ps(dst + x + 8, s1);
+        }
+
+    return x;
+}
+#endif
+
+struct VResizeCubicVec_32f
+{
+    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
+    {
+    #if CV_AVX
+        if( checkHardwareSupport(CV_CPU_AVX) )
+            return VResizeCubicVec_32f_avx(_src, _dst, _beta, width);
+    #endif
+        if( checkHardwareSupport(CV_CPU_SSE) )
+            return VResizeCubicVec_32f_sse(_src, _dst, _beta, width);
+        return 0;
+    }
+};
modules/ts/src/ts_func.cpp
@@ -3005,6 +3005,9 @@ void printVersionInfo(bool useStdOut)
 #if CV_AVX
     if (checkHardwareSupport(CV_CPU_AVX)) cpu_features += " avx";
 #endif
+#if CV_AVX2
+    if (checkHardwareSupport(CV_CPU_AVX2)) cpu_features += " avx2";
+#endif
 #if CV_NEON
     cpu_features += " neon"; // NEON is currently not checked at runtime
 #endif