submodule / opencv / Commits / 8546ac3c
Commit 8546ac3c, authored 6 years ago by Alexander Alekhin
imgproc: get rid of filter.avx2.cpp

parent 9a8dbfd5
Showing 3 changed files with 99 additions and 219 deletions:

    modules/imgproc/src/filter.avx2.cpp    +0    -197
    modules/imgproc/src/filter.hpp         +0    -6
    modules/imgproc/src/filter.simd.hpp    +99   -16
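All three kernels in this commit share a single accumulate idiom: broadcast one kernel tap, multiply it against a lane of pixels, and add the product to a running sum, using a fused multiply-add when the build enables FMA3. A minimal standalone sketch of that idiom (the mul_add helper is hypothetical, and the compiler's __FMA__ macro stands in for OpenCV's CV_FMA3 build flag):

#include <immintrin.h>

// acc + x*coeff over eight packed floats; fused into a single instruction
// (one rounding step) when FMA is available, separate mul+add otherwise.
static inline __m256 mul_add(__m256 x, __m256 coeff, __m256 acc)
{
#if defined(__FMA__)    // stand-in for OpenCV's CV_FMA3
    return _mm256_fmadd_ps(x, coeff, acc);
#else
    return _mm256_add_ps(acc, _mm256_mul_ps(x, coeff));
#endif
}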
modules/imgproc/src/filter.avx2.cpp (deleted, 100644 → 0)
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include "filter.hpp"
namespace cv
{

int RowVec_32f_AVX(const float* src0, const float* _kx, float* dst,
                   int width, int cn, int _ksize)
{
    int i = 0, k;
    // Horizontal filter: 8 outputs per iteration, kernel taps spaced cn floats apart.
    for (; i <= width - 8; i += 8)
    {
        const float* src = src0 + i;
        __m256 f, x0;
        __m256 s0 = _mm256_set1_ps(0.0f);
        for (k = 0; k < _ksize; k++, src += cn)
        {
            f = _mm256_set1_ps(_kx[k]);
            x0 = _mm256_loadu_ps(src);
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
        }
        _mm256_storeu_ps(dst + i, s0);
    }
    _mm256_zeroupper();
    return i;   // number of fully vectorized outputs; caller handles the tail
}
int SymmColumnVec_32f_Symm_AVX(const float** src, const float* ky, float* dst,
                               float delta, int width, int ksize2)
{
    int i = 0, k;
    const float *S, *S2;
    const __m128 d4 = _mm_set1_ps(delta);
    const __m256 d8 = _mm256_set1_ps(delta);
    // Symmetric kernel: dst[i] = delta + ky[0]*src[0][i] + sum_k ky[k]*(src[k][i] + src[-k][i]).
    // Main loop produces 16 outputs per iteration in two 8-wide accumulators.
    for (; i <= width - 16; i += 16)
    {
        __m256 f = _mm256_set1_ps(ky[0]);
        __m256 s0, s1;
        __m256 x0;
        S = src[0] + i;
        s0 = _mm256_loadu_ps(S);
#if CV_FMA3
        s0 = _mm256_fmadd_ps(s0, f, d8);
#else
        s0 = _mm256_add_ps(_mm256_mul_ps(s0, f), d8);
#endif
        s1 = _mm256_loadu_ps(S + 8);
#if CV_FMA3
        s1 = _mm256_fmadd_ps(s1, f, d8);
#else
        s1 = _mm256_add_ps(_mm256_mul_ps(s1, f), d8);
#endif
        for (k = 1; k <= ksize2; k++)
        {
            S = src[k] + i;
            S2 = src[-k] + i;
            f = _mm256_set1_ps(ky[k]);
            x0 = _mm256_add_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
            x0 = _mm256_add_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8));
#if CV_FMA3
            s1 = _mm256_fmadd_ps(x0, f, s1);
#else
            s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
#endif
        }
        _mm256_storeu_ps(dst + i, s0);
        _mm256_storeu_ps(dst + i + 8, s1);
    }
    // SSE tail: 4 outputs per iteration.
    for (; i <= width - 4; i += 4)
    {
        __m128 f = _mm_set1_ps(ky[0]);
        __m128 x0, s0 = _mm_load_ps(src[0] + i);
        s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
        for (k = 1; k <= ksize2; k++)
        {
            f = _mm_set1_ps(ky[k]);
            x0 = _mm_add_ps(_mm_load_ps(src[k] + i), _mm_load_ps(src[-k] + i));
            s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
        }
        _mm_storeu_ps(dst + i, s0);
    }
    _mm256_zeroupper();
    return i;
}
int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst,
                                 float delta, int width, int ksize2)
{
    int i = 0, k;
    const float *S2;
    const __m128 d4 = _mm_set1_ps(delta);
    const __m256 d8 = _mm256_set1_ps(delta);
    // Antisymmetric kernel (center tap ky[0] is zero, so it is skipped):
    // dst[i] = delta + sum_k ky[k]*(src[k][i] - src[-k][i]).
    for (; i <= width - 16; i += 16)
    {
        __m256 f, s0 = d8, s1 = d8;
        __m256 x0;
        for (k = 1; k <= ksize2; k++)
        {
            const float* S = src[k] + i;
            S2 = src[-k] + i;
            f = _mm256_set1_ps(ky[k]);
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8));
#if CV_FMA3
            s1 = _mm256_fmadd_ps(x0, f, s1);
#else
            s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
#endif
        }
        _mm256_storeu_ps(dst + i, s0);
        _mm256_storeu_ps(dst + i + 8, s1);
    }
    // SSE tail: 4 outputs per iteration.
    for (; i <= width - 4; i += 4)
    {
        __m128 f, x0, s0 = d4;
        for (k = 1; k <= ksize2; k++)
        {
            f = _mm_set1_ps(ky[k]);
            x0 = _mm_sub_ps(_mm_load_ps(src[k] + i), _mm_load_ps(src[-k] + i));
            s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
        }
        _mm_storeu_ps(dst + i, s0);
    }
    _mm256_zeroupper();
    return i;
}

}
/* End of file. */
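For orientation, the RowVec_32f_AVX kernel above is an 8-wide unrolling of a plain 1-D row correlation over channel-interleaved data; a scalar reference sketch (illustrative only, not part of the commit):

// Computes dst[i] = sum_k src0[i + k*cn] * kx[k]: successive kernel taps
// are cn floats apart, so each output stays within its own channel.
static void row_filter_ref(const float* src0, const float* kx, float* dst,
                           int width, int cn, int ksize)
{
    for (int i = 0; i < width; i++)
    {
        float s = 0.f;
        for (int k = 0; k < ksize; k++)
            s += src0[i + k*cn] * kx[k];
        dst[i] = s;
    }
}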
modules/imgproc/src/filter.hpp
...
@@ -45,12 +45,6 @@
 namespace cv
 {
-#if CV_TRY_AVX2
-int RowVec_32f_AVX(const float* src0, const float* _kx, float* dst, int width, int cn, int _ksize);
-int SymmColumnVec_32f_Symm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2);
-int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2);
-#endif
 #ifdef HAVE_OPENCL
 bool ocl_sepFilter2D(InputArray _src, OutputArray _dst, int ddepth,
                      InputArray _kernelX, InputArray _kernelY, Point anchor,
...
modules/imgproc/src/filter.simd.hpp
...
@@ -1507,7 +1507,6 @@ struct RowVec_32f
 {
     RowVec_32f()
     {
-        haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
 #if defined USE_IPP_SEP_FILTERS
         bufsz = -1;
 #endif
...
@@ -1516,7 +1515,6 @@ struct RowVec_32f
     RowVec_32f(const Mat& _kernel)
     {
         kernel = _kernel;
-        haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
 #if defined USE_IPP_SEP_FILTERS
         bufsz = -1;
 #endif
...
@@ -1543,9 +1541,24 @@ struct RowVec_32f
         int i = 0, k;
         width *= cn;
-#if CV_TRY_AVX2
-        if (haveAVX2)
-            return RowVec_32f_AVX(src0, _kx, dst, width, cn, _ksize);
+#if CV_AVX
+        for (; i <= width - 8; i += 8)
+        {
+            const float* src = src0 + i;
+            __m256 f, x0;
+            __m256 s0 = _mm256_set1_ps(0.0f);
+            for (k = 0; k < _ksize; k++, src += cn)
+            {
+                f = _mm256_set1_ps(_kx[k]);
+                x0 = _mm256_loadu_ps(src);
+#if CV_FMA3
+                s0 = _mm256_fmadd_ps(x0, f, s0);
+#else
+                s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
+#endif
+            }
+            _mm256_storeu_ps(dst + i, s0);
+        }
 #endif
         v_float32 k0 = vx_setall_f32(_kx[0]);
         for (; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes)
...
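After the removed dispatch, the function falls through to OpenCV's universal intrinsics (v_float32, vx_setall_f32), which compile to the SIMD width of whichever ISA the dispatched build targets. The portable counterpart of the _mm256_fmadd_ps / _mm256_mul_ps + _mm256_add_ps pair is v_muladd; a short sketch, assuming a build with CV_SIMD enabled:

#include <opencv2/core/hal/intrin.hpp>

// s + x*f at the build's native SIMD width; v_muladd lowers to a fused
// multiply-add on targets that provide one.
static inline cv::v_float32 accum(cv::v_float32 x, cv::v_float32 f, cv::v_float32 s)
{
    return cv::v_muladd(x, f, s);
}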
@@ -1599,7 +1612,6 @@ struct RowVec_32f
     }
     Mat kernel;
-    bool haveAVX2;
 #if defined USE_IPP_SEP_FILTERS
 private:
     mutable int bufsz;
...
@@ -1754,7 +1766,6 @@ struct SymmColumnVec_32f
 {
     SymmColumnVec_32f()
     {
         symmetryType = 0;
-        haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
         delta = 0;
     }
     SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
...
@@ -1762,7 +1773,6 @@ struct SymmColumnVec_32f
         symmetryType = _symmetryType;
         kernel = _kernel;
         delta = (float)_delta;
-        haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
         CV_Assert((symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0);
     }
...
@@ -1780,9 +1790,53 @@ struct SymmColumnVec_32f
         if (symmetrical)
         {
-#if CV_TRY_AVX2
-            if (haveAVX2)
-                return SymmColumnVec_32f_Symm_AVX(src, ky, dst, delta, width, ksize2);
+#if CV_AVX
+            {
+                const float *S, *S2;
+                const __m256 d8 = _mm256_set1_ps(delta);
+                for (; i <= width - 16; i += 16)
+                {
+                    __m256 f = _mm256_set1_ps(ky[0]);
+                    __m256 s0, s1;
+                    __m256 x0;
+                    S = src[0] + i;
+                    s0 = _mm256_loadu_ps(S);
+#if CV_FMA3
+                    s0 = _mm256_fmadd_ps(s0, f, d8);
+#else
+                    s0 = _mm256_add_ps(_mm256_mul_ps(s0, f), d8);
+#endif
+                    s1 = _mm256_loadu_ps(S + 8);
+#if CV_FMA3
+                    s1 = _mm256_fmadd_ps(s1, f, d8);
+#else
+                    s1 = _mm256_add_ps(_mm256_mul_ps(s1, f), d8);
+#endif
+                    for (k = 1; k <= ksize2; k++)
+                    {
+                        S = src[k] + i;
+                        S2 = src[-k] + i;
+                        f = _mm256_set1_ps(ky[k]);
+                        x0 = _mm256_add_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
+#if CV_FMA3
+                        s0 = _mm256_fmadd_ps(x0, f, s0);
+#else
+                        s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
+#endif
+                        x0 = _mm256_add_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8));
+#if CV_FMA3
+                        s1 = _mm256_fmadd_ps(x0, f, s1);
+#else
+                        s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
+#endif
+                    }
+                    _mm256_storeu_ps(dst + i, s0);
+                    _mm256_storeu_ps(dst + i + 8, s1);
+                }
+            }
 #endif
             const v_float32 d4 = vx_setall_f32(delta);
             const v_float32 k0 = vx_setall_f32(ky[0]);
...
@@ -1830,11 +1884,41 @@ struct SymmColumnVec_32f
         }
         else
         {
-#if CV_TRY_AVX2
-            if (haveAVX2)
-                return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2);
-#endif
             CV_DbgAssert(ksize2 > 0);
+#if CV_AVX
+            {
+                const float *S2;
+                const __m256 d8 = _mm256_set1_ps(delta);
+                for (; i <= width - 16; i += 16)
+                {
+                    __m256 f, s0 = d8, s1 = d8;
+                    __m256 x0;
+                    for (k = 1; k <= ksize2; k++)
+                    {
+                        const float* S = src[k] + i;
+                        S2 = src[-k] + i;
+                        f = _mm256_set1_ps(ky[k]);
+                        x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
+#if CV_FMA3
+                        s0 = _mm256_fmadd_ps(x0, f, s0);
+#else
+                        s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
+#endif
+                        x0 = _mm256_sub_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8));
+#if CV_FMA3
+                        s1 = _mm256_fmadd_ps(x0, f, s1);
+#else
+                        s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
+#endif
+                    }
+                    _mm256_storeu_ps(dst + i, s0);
+                    _mm256_storeu_ps(dst + i + 8, s1);
+                }
+            }
+#endif
             const v_float32 d4 = vx_setall_f32(delta);
             const v_float32 k1 = vx_setall_f32(ky[1]);
             for (; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes)
...
@@ -1885,7 +1969,6 @@ struct SymmColumnVec_32f
     int symmetryType;
     float delta;
     Mat kernel;
-    bool haveAVX2;
 };
...
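Both the deleted AVX2 helpers and the new inline CV_AVX blocks compute the same two column passes. In scalar form, with src[k] pointing at the row k steps from the kernel center (a reference sketch, not part of the commit):

static void symm_col_ref(const float** src, const float* ky, float* dst,
                         float delta, int width, int ksize2, bool symmetrical)
{
    for (int i = 0; i < width; i++)
    {
        // Symmetric kernels fold mirrored rows with '+'; antisymmetric
        // kernels fold with '-' and have ky[0] == 0, so the center tap
        // only contributes in the symmetric case.
        float s = delta + (symmetrical ? ky[0] * src[0][i] : 0.f);
        for (int k = 1; k <= ksize2; k++)
            s += ky[k] * (symmetrical ? src[k][i] + src[-k][i]
                                      : src[k][i] - src[-k][i]);
        dst[i] = s;
    }
}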