Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
8d48632e
Commit
8d48632e
authored
Jan 12, 2015
by
Ilya Lavrenov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
avx2
parent
28833421
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
105 additions
and
39 deletions
+105
-39
CMakeLists.txt
CMakeLists.txt
+1
-0
OpenCVCompilerOptions.cmake
cmake/OpenCVCompilerOptions.cmake
+3
-0
cvdef.h
modules/core/include/opencv2/core/cvdef.h
+9
-2
convert.cpp
modules/core/src/convert.cpp
+36
-36
precomp.hpp
modules/core/src/precomp.hpp
+1
-0
system.cpp
modules/core/src/system.cpp
+52
-1
ts_func.cpp
modules/ts/src/ts_func.cpp
+3
-0
No files found.
CMakeLists.txt
View file @
8d48632e
...
...
@@ -221,6 +221,7 @@ OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions"
# Build switches for optional CPU instruction sets (SSE4.1, SSE4.2, AVX, AVX2,
# NEON, VFPv3). Every switch is declared with a help string, a default of OFF
# and an IF clause naming the compilers / target architectures it is tied to.
# NOTE(review): presumably OCV_OPTION only offers the switch when the IF clause
# evaluates true -- confirm against the OCV_OPTION macro definition.
OCV_OPTION
(
ENABLE_SSE41
"Enable SSE4.1 instructions"
OFF
IF
((
CV_ICC OR CMAKE_COMPILER_IS_GNUCXX
)
AND
(
X86 OR X86_64
))
)
OCV_OPTION
(
ENABLE_SSE42
"Enable SSE4.2 instructions"
OFF
IF
(
CMAKE_COMPILER_IS_GNUCXX
AND
(
X86 OR X86_64
))
)
OCV_OPTION
(
ENABLE_AVX
"Enable AVX instructions"
OFF
IF
((
MSVC OR CMAKE_COMPILER_IS_GNUCXX
)
AND
(
X86 OR X86_64
))
)
# ENABLE_AVX2 uses exactly the same compiler / architecture clause as ENABLE_AVX.
OCV_OPTION
(
ENABLE_AVX2
"Enable AVX2 instructions"
OFF
IF
((
MSVC OR CMAKE_COMPILER_IS_GNUCXX
)
AND
(
X86 OR X86_64
))
)
# ARM-side switches: tied to GNU C++ together with ARM or IOS.
OCV_OPTION
(
ENABLE_NEON
"Enable NEON instructions"
OFF IF CMAKE_COMPILER_IS_GNUCXX
AND
(
ARM OR IOS
)
)
OCV_OPTION
(
ENABLE_VFPV3
"Enable VFPv3-D32 instructions"
OFF IF CMAKE_COMPILER_IS_GNUCXX
AND
(
ARM OR IOS
)
)
# Not an instruction-set switch: declared without an IF clause, default OFF.
OCV_OPTION
(
ENABLE_NOISY_WARNINGS
"Show all warnings even if they are too noisy"
OFF
)
...
...
cmake/OpenCVCompilerOptions.cmake
View file @
8d48632e
...
...
@@ -140,6 +140,9 @@ if(CMAKE_COMPILER_IS_GNUCXX)
# When the ENABLE_AVX switch is on, request the -mavx flag through
# add_extra_compiler_option.
if
(
ENABLE_AVX
)
add_extra_compiler_option
(
-mavx
)
endif
()
# Same pattern for ENABLE_AVX2, requesting -mavx2.
if
(
ENABLE_AVX2
)
add_extra_compiler_option
(
-mavx2
)
endif
()
# GCC suppresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalents for all SSEx instructions when needed.
if
(
NOT OPENCV_EXTRA_CXX_FLAGS MATCHES
"-mavx"
)
...
...
modules/core/include/opencv2/core/cvdef.h
View file @
8d48632e
...
...
@@ -114,7 +114,8 @@
#define CV_CPU_SSE4_2 7
#define CV_CPU_POPCNT 8
#define CV_CPU_AVX 10
#define CV_CPU_NEON 11
#define CV_CPU_AVX2 11
#define CV_CPU_NEON 12
// when adding to this list remember to update the enum in core/utility.cpp
#define CV_HARDWARE_MAX_FEATURE 255
...
...
@@ -141,7 +142,7 @@
# include <nmmintrin.h>
# define CV_SSE4_2 1
# endif
# if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
# if defined __AVX__ ||
defined __AVX2__ ||
(defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
// MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
// See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
# include <immintrin.h>
...
...
@@ -150,6 +151,9 @@
# define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
# else
# define __xgetbv() 0
# ifdef __AVX2__
# define CV_AVX2 1
# endif
# endif
# endif
#endif
...
...
@@ -187,6 +191,9 @@
// Fallback definitions: any of these feature macros that has not been defined
// by this point is defined as 0, so `#if CV_AVX`, `#if CV_AVX2` and
// `#if CV_NEON` always test a defined value.
#ifndef CV_AVX
# define CV_AVX 0
#endif
#ifndef CV_AVX2
# define CV_AVX2 0
#endif
#ifndef CV_NEON
# define CV_NEON 0
#endif
...
...
modules/core/src/convert.cpp
View file @
8d48632e
...
...
@@ -2294,26 +2294,44 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
{
int
x
=
0
;
#if CV_SSE2
if
(
USE_SSE2
)
//~5X
#if CV_AVX2
if
(
USE_AVX2
)
{
__m256
scale256
=
_mm256_set1_ps
(
scale
);
__m256
shift256
=
_mm256_set1_ps
(
shift
);
__m256i
zero
=
_mm256_setzero_si256
();
for
(
;
x
<=
size
.
width
-
16
;
x
+=
16
)
{
__m128
scale128
=
_mm_set1_ps
(
scale
);
__m128
shift128
=
_mm_set1_ps
(
shift
);
for
(;
x
<=
size
.
width
-
8
;
x
+=
8
)
{
__m128i
r0
=
_mm_loadl_epi64
((
const
__m128i
*
)(
src
+
x
));
__m128i
r1
=
_mm_loadl_epi64
((
const
__m128i
*
)(
src
+
x
+
4
));
__m128
rf0
=
_mm_cvtepi32_ps
(
_mm_srai_epi32
(
_mm_unpacklo_epi16
(
r0
,
r0
),
16
));
__m128
rf1
=
_mm_cvtepi32_ps
(
_mm_srai_epi32
(
_mm_unpacklo_epi16
(
r1
,
r1
),
16
));
rf0
=
_mm_add_ps
(
_mm_mul_ps
(
rf0
,
scale128
),
shift128
);
rf1
=
_mm_add_ps
(
_mm_mul_ps
(
rf1
,
scale128
),
shift128
);
r0
=
_mm_cvtps_epi32
(
rf0
);
r1
=
_mm_cvtps_epi32
(
rf1
);
_mm_storeu_si128
((
__m128i
*
)(
dst
+
x
),
r0
);
_mm_storeu_si128
((
__m128i
*
)(
dst
+
x
+
4
),
r1
);
}
__m256i
v_src
=
_mm256_loadu_si256
((
__m256i
const
*
)(
src
+
x
));
__m256i
v_src_lo
=
_mm256_unpacklo_epi16
(
v_src
,
zero
);
__m256i
v_src_hi
=
_mm256_unpackhi_epi16
(
v_src
,
zero
);
__m256
v_dst0
=
_mm256_add_ps
(
_mm256_mul_ps
(
_mm256_cvtepi32_ps
(
v_src_lo
),
scale256
),
shift256
);
__m256
v_dst1
=
_mm256_add_ps
(
_mm256_mul_ps
(
_mm256_cvtepi32_ps
(
v_src_hi
),
scale256
),
shift256
);
_mm256_storeu_si256
((
__m256i
*
)(
dst
+
x
),
_mm256_cvtps_epi32
(
v_dst0
));
_mm256_storeu_si256
((
__m256i
*
)(
dst
+
x
+
8
),
_mm256_cvtps_epi32
(
v_dst1
));
}
}
#endif
#if CV_SSE2
if
(
USE_SSE2
)
//~5X
{
__m128
scale128
=
_mm_set1_ps
(
scale
);
__m128
shift128
=
_mm_set1_ps
(
shift
);
for
(;
x
<=
size
.
width
-
8
;
x
+=
8
)
{
__m128i
r0
=
_mm_loadl_epi64
((
const
__m128i
*
)(
src
+
x
));
__m128i
r1
=
_mm_loadl_epi64
((
const
__m128i
*
)(
src
+
x
+
4
));
__m128
rf0
=
_mm_cvtepi32_ps
(
_mm_srai_epi32
(
_mm_unpacklo_epi16
(
r0
,
r0
),
16
));
__m128
rf1
=
_mm_cvtepi32_ps
(
_mm_srai_epi32
(
_mm_unpacklo_epi16
(
r1
,
r1
),
16
));
rf0
=
_mm_add_ps
(
_mm_mul_ps
(
rf0
,
scale128
),
shift128
);
rf1
=
_mm_add_ps
(
_mm_mul_ps
(
rf1
,
scale128
),
shift128
);
r0
=
_mm_cvtps_epi32
(
rf0
);
r1
=
_mm_cvtps_epi32
(
rf1
);
_mm_storeu_si128
((
__m128i
*
)(
dst
+
x
),
r0
);
_mm_storeu_si128
((
__m128i
*
)(
dst
+
x
+
4
),
r1
);
}
}
#elif CV_NEON
float32x4_t
v_shift
=
vdupq_n_f32
(
shift
);
for
(;
x
<=
size
.
width
-
8
;
x
+=
8
)
...
...
@@ -2330,24 +2348,6 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
}
#endif
// We will wait for Haswell
/*
#if CV_AVX
if(USE_AVX)//2X - bad variant
{
////TODO:AVX implementation (optimization?) required
__m256 scale256 = _mm256_set1_ps (scale);
__m256 shift256 = _mm256_set1_ps (shift);
for(; x <= size.width - 8; x += 8 )
{
__m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x)));
__m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256);
__m256i res = _mm256_cvtps_epi32(r0);
_mm256_storeu_si256 ((__m256i*)(dst+x), res);
}
}
#endif*/
for
(;
x
<
size
.
width
;
x
++
)
dst
[
x
]
=
saturate_cast
<
int
>
(
src
[
x
]
*
scale
+
shift
);
}
...
...
modules/core/src/precomp.hpp
View file @
8d48632e
...
...
@@ -192,6 +192,7 @@ struct NoVec
// Declarations of the per-instruction-set flags. They are only declared here
// (extern); each one is a volatile bool.
// NOTE(review): presumably these reflect runtime CPU feature detection and are
// defined in system.cpp -- confirm against the definitions.
extern
volatile
bool
USE_SSE2
;
extern
volatile
bool
USE_SSE4_2
;
extern
volatile
bool
USE_AVX
;
extern
volatile
bool
USE_AVX2
;
// Unnamed enum used to introduce the integer constant BLOCK_SIZE (1024).
// What the block size applies to is not visible in this excerpt.
enum
{
BLOCK_SIZE
=
1024
};
...
...
modules/core/src/system.cpp
View file @
8d48632e
...
...
@@ -82,6 +82,22 @@
pop
ebx
}
}
// Inline-assembly stand-in named after the __cpuidex intrinsic.
//
// Executes CPUID with EAX = 7 and ECX = 0 and writes the resulting EAX, EBX,
// ECX and EDX values to byte offsets 0, 4, 8 and 12 from cpuid_data, so
// cpuid_data must point to storage for at least four 32-bit values.
//
// NOTE(review): the two trailing int parameters are unnamed and never read;
// the leaf (7) and sub-leaf (0) are hard-coded below, so a call with any other
// arguments would still query leaf 7 / sub-leaf 0.
static
void
__cpuidex
(
int
*
cpuid_data
,
int
,
int
)
{
__asm
{
// EDI is used as the output pointer; save it here and restore it at the end.
push
edi
mov
edi
,
cpuid_data
// Select CPUID leaf 7, sub-leaf 0.
mov
eax
,
7
mov
ecx
,
0
cpuid
// Store the four result registers in EAX, EBX, ECX, EDX order.
mov
[
edi
],
eax
mov
[
edi
+
4
],
ebx
mov
[
edi
+
8
],
ecx
mov
[
edi
+
12
],
edx
pop
edi
}
}
#endif
#endif
...
...
@@ -203,7 +219,7 @@ struct HWFeatures
enum
{
MAX_FEATURE
=
CV_HARDWARE_MAX_FEATURE
};
HWFeatures
(
void
)
{
{
memset
(
have
,
0
,
sizeof
(
have
)
);
x86_family
=
0
;
}
...
...
@@ -251,6 +267,40 @@ struct HWFeatures
f
.
have
[
CV_CPU_SSE4_2
]
=
(
cpuid_data
[
2
]
&
(
1
<<
20
))
!=
0
;
f
.
have
[
CV_CPU_POPCNT
]
=
(
cpuid_data
[
2
]
&
(
1
<<
23
))
!=
0
;
f
.
have
[
CV_CPU_AVX
]
=
(((
cpuid_data
[
2
]
&
(
1
<<
28
))
!=
0
)
&&
((
cpuid_data
[
2
]
&
(
1
<<
27
))
!=
0
));
// the OS uses XSAVE/XRSTOR (OSXSAVE, bit 27) and the CPU supports AVX (bit 28)
// make the second call to the cpuid command in order to get
// information about extended features like AVX2
#if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
__cpuidex
(
cpuid_data
,
7
,
0
);
#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
#ifdef __x86_64__
asm
__volatile__
(
"movl $7, %%eax
\n\t
"
"movl $0, %%ecx
\n\t
"
"cpuid
\n\t
"
:
[
eax
]
"=a"
(
cpuid_data
[
0
]),[
ebx
]
"=b"
(
cpuid_data
[
1
]),[
ecx
]
"=c"
(
cpuid_data
[
2
]),[
edx
]
"=d"
(
cpuid_data
[
3
])
:
:
"cc"
);
#else
asm
volatile
(
"pushl %%eax
\n\t
"
"pushl %%edx
\n\t
"
"movl $7,%%eax
\n\t
"
"movl $0,%%ecx
\n\t
"
"cpuid
\n\t
"
"popl %%edx
\n\t
"
"popl %%eax
\n\t
"
:
"=b"
(
cpuid_data
[
1
]),
"=c"
(
cpuid_data
[
2
])
:
:
"cc"
);
#endif
#endif
f
.
have
[
CV_CPU_AVX2
]
=
(
cpuid_data
[
1
]
&
(
1
<<
5
))
!=
0
;
}
return
f
;
...
...
@@ -290,6 +340,7 @@ IPPInitializer ippInitializer;
// Definitions of the per-instruction-set flags. Each flag is initialized from
// the entry of featuresEnabled.have indexed by the matching CV_CPU_* constant.
// NOTE(review): featuresEnabled is defined outside this excerpt; these
// initializers depend on it being initialized first -- confirm its definition
// precedes these lines in the same translation unit.
volatile
bool
USE_SSE2
=
featuresEnabled
.
have
[
CV_CPU_SSE2
];
volatile
bool
USE_SSE4_2
=
featuresEnabled
.
have
[
CV_CPU_SSE4_2
];
volatile
bool
USE_AVX
=
featuresEnabled
.
have
[
CV_CPU_AVX
];
// Added alongside the AVX flag; follows the same pattern with CV_CPU_AVX2.
volatile
bool
USE_AVX2
=
featuresEnabled
.
have
[
CV_CPU_AVX2
];
void
setUseOptimized
(
bool
flag
)
{
...
...
modules/ts/src/ts_func.cpp
View file @
8d48632e
...
...
@@ -3019,6 +3019,9 @@ void printVersionInfo(bool useStdOut)
#if CV_AVX
if
(
checkHardwareSupport
(
CV_CPU_AVX
))
cpu_features
+=
" avx"
;
#endif
#if CV_AVX2
if
(
checkHardwareSupport
(
CV_CPU_AVX2
))
cpu_features
+=
" avx2"
;
#endif
#if CV_NEON
cpu_features
+=
" neon"
;
// NEON is currently not checked at runtime
#endif
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment