Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv_contrib
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv_contrib
Commits
f4da1502
Commit
f4da1502
authored
Mar 15, 2017
by
Alexander Alekhin
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #1033 from woodychow:sift_avx2
parents
c16b5c26
c5e55dfd
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
231 additions
and
6 deletions
+231
-6
precomp.hpp
modules/xfeatures2d/src/precomp.hpp
+2
-0
sift.cpp
modules/xfeatures2d/src/sift.cpp
+229
-6
No files found.
modules/xfeatures2d/src/precomp.hpp
View file @
f4da1502
...
...
@@ -61,4 +61,6 @@
#include "opencv2/core/private.hpp"
#define USE_AVX2 (cv::checkHardwareSupport(CV_CPU_AVX2))
#endif
modules/xfeatures2d/src/sift.cpp
View file @
f4da1502
...
...
@@ -344,7 +344,40 @@ static float calcOrientationHist( const Mat& img, Point pt, int radius,
cv
::
hal
::
fastAtan2
(
Y
,
X
,
Ori
,
len
,
true
);
cv
::
hal
::
magnitude32f
(
X
,
Y
,
Mag
,
len
);
for
(
k
=
0
;
k
<
len
;
k
++
)
k
=
0
;
#if CV_AVX2
if
(
USE_AVX2
)
{
__m256
__nd360
=
_mm256_set1_ps
(
n
/
360.
f
);
__m256i
__n
=
_mm256_set1_epi32
(
n
);
int
CV_DECL_ALIGNED
(
32
)
bin_buf
[
8
];
float
CV_DECL_ALIGNED
(
32
)
w_mul_mag_buf
[
8
];
for
(
;
k
<=
len
-
8
;
k
+=
8
)
{
__m256i
__bin
=
_mm256_cvtps_epi32
(
_mm256_round_ps
(
_mm256_mul_ps
(
__nd360
,
_mm256_loadu_ps
(
&
Ori
[
k
])),
_MM_FROUND_TO_NEAREST_INT
|
_MM_FROUND_NO_EXC
));
__bin
=
_mm256_sub_epi32
(
__bin
,
_mm256_and_si256
(
__n
,
_mm256_or_si256
(
_mm256_cmpeq_epi32
(
__bin
,
__n
),
_mm256_cmpgt_epi32
(
__bin
,
__n
))));
__bin
=
_mm256_add_epi32
(
__bin
,
_mm256_and_si256
(
__n
,
_mm256_cmpgt_epi32
(
_mm256_setzero_si256
(),
__bin
)));
__m256
__w_mul_mag
=
_mm256_mul_ps
(
_mm256_loadu_ps
(
&
W
[
k
]),
_mm256_loadu_ps
(
&
Mag
[
k
]));
_mm256_store_si256
((
__m256i
*
)
bin_buf
,
__bin
);
_mm256_store_ps
(
w_mul_mag_buf
,
__w_mul_mag
);
temphist
[
bin_buf
[
0
]]
+=
w_mul_mag_buf
[
0
];
temphist
[
bin_buf
[
1
]]
+=
w_mul_mag_buf
[
1
];
temphist
[
bin_buf
[
2
]]
+=
w_mul_mag_buf
[
2
];
temphist
[
bin_buf
[
3
]]
+=
w_mul_mag_buf
[
3
];
temphist
[
bin_buf
[
4
]]
+=
w_mul_mag_buf
[
4
];
temphist
[
bin_buf
[
5
]]
+=
w_mul_mag_buf
[
5
];
temphist
[
bin_buf
[
6
]]
+=
w_mul_mag_buf
[
6
];
temphist
[
bin_buf
[
7
]]
+=
w_mul_mag_buf
[
7
];
}
}
#endif
for
(
;
k
<
len
;
k
++
)
{
int
bin
=
cvRound
((
n
/
360.
f
)
*
Ori
[
k
]);
if
(
bin
>=
n
)
...
...
@@ -359,7 +392,40 @@ static float calcOrientationHist( const Mat& img, Point pt, int radius,
temphist
[
-
2
]
=
temphist
[
n
-
2
];
temphist
[
n
]
=
temphist
[
0
];
temphist
[
n
+
1
]
=
temphist
[
1
];
for
(
i
=
0
;
i
<
n
;
i
++
)
i
=
0
;
#if CV_AVX2
if
(
USE_AVX2
)
{
__m256
__d_1_16
=
_mm256_set1_ps
(
1.
f
/
16.
f
);
__m256
__d_4_16
=
_mm256_set1_ps
(
4.
f
/
16.
f
);
__m256
__d_6_16
=
_mm256_set1_ps
(
6.
f
/
16.
f
);
for
(
;
i
<=
n
-
8
;
i
+=
8
)
{
#if CV_FMA3
__m256
__hist
=
_mm256_fmadd_ps
(
_mm256_add_ps
(
_mm256_loadu_ps
(
&
temphist
[
i
-
2
]),
_mm256_loadu_ps
(
&
temphist
[
i
+
2
])),
__d_1_16
,
_mm256_fmadd_ps
(
_mm256_add_ps
(
_mm256_loadu_ps
(
&
temphist
[
i
-
1
]),
_mm256_loadu_ps
(
&
temphist
[
i
+
1
])),
__d_4_16
,
_mm256_mul_ps
(
_mm256_loadu_ps
(
&
temphist
[
i
]),
__d_6_16
)));
#else
__m256
__hist
=
_mm256_add_ps
(
_mm256_mul_ps
(
_mm256_add_ps
(
_mm256_loadu_ps
(
&
temphist
[
i
-
2
]),
_mm256_loadu_ps
(
&
temphist
[
i
+
2
])),
__d_1_16
),
_mm256_add_ps
(
_mm256_mul_ps
(
_mm256_add_ps
(
_mm256_loadu_ps
(
&
temphist
[
i
-
1
]),
_mm256_loadu_ps
(
&
temphist
[
i
+
1
])),
__d_4_16
),
_mm256_mul_ps
(
_mm256_loadu_ps
(
&
temphist
[
i
]),
__d_6_16
)));
#endif
_mm256_storeu_ps
(
&
hist
[
i
],
__hist
);
}
}
#endif
for
(
;
i
<
n
;
i
++
)
{
hist
[
i
]
=
(
temphist
[
i
-
2
]
+
temphist
[
i
+
2
])
*
(
1.
f
/
16.
f
)
+
(
temphist
[
i
-
1
]
+
temphist
[
i
+
1
])
*
(
4.
f
/
16.
f
)
+
...
...
@@ -627,7 +693,99 @@ static void calcSIFTDescriptor( const Mat& img, Point2f ptf, float ori, float sc
cv
::
hal
::
magnitude32f
(
X
,
Y
,
Mag
,
len
);
cv
::
hal
::
exp32f
(
W
,
W
,
len
);
for
(
k
=
0
;
k
<
len
;
k
++
)
k
=
0
;
#if CV_AVX2
if
(
USE_AVX2
)
{
int
CV_DECL_ALIGNED
(
32
)
idx_buf
[
8
];
float
CV_DECL_ALIGNED
(
32
)
rco_buf
[
64
];
__m256
__ori
=
_mm256_set1_ps
(
ori
);
__m256
__bins_per_rad
=
_mm256_set1_ps
(
bins_per_rad
);
__m256i
__n
=
_mm256_set1_epi32
(
n
);
for
(
;
k
<=
len
-
8
;
k
+=
8
)
{
__m256
__rbin
=
_mm256_loadu_ps
(
&
RBin
[
k
]);
__m256
__cbin
=
_mm256_loadu_ps
(
&
CBin
[
k
]);
__m256
__obin
=
_mm256_mul_ps
(
_mm256_sub_ps
(
_mm256_loadu_ps
(
&
Ori
[
k
]),
__ori
),
__bins_per_rad
);
__m256
__mag
=
_mm256_mul_ps
(
_mm256_loadu_ps
(
&
Mag
[
k
]),
_mm256_loadu_ps
(
&
W
[
k
]));
__m256
__r0
=
_mm256_floor_ps
(
__rbin
);
__rbin
=
_mm256_sub_ps
(
__rbin
,
__r0
);
__m256
__c0
=
_mm256_floor_ps
(
__cbin
);
__cbin
=
_mm256_sub_ps
(
__cbin
,
__c0
);
__m256
__o0
=
_mm256_floor_ps
(
__obin
);
__obin
=
_mm256_sub_ps
(
__obin
,
__o0
);
__m256i
__o0i
=
_mm256_cvtps_epi32
(
__o0
);
// _o0 += (o0 < 0) * n
__o0i
=
_mm256_add_epi32
(
__o0i
,
_mm256_and_si256
(
__n
,
_mm256_cmpgt_epi32
(
_mm256_setzero_si256
(),
__o0i
)));
__o0i
=
_mm256_sub_epi32
(
__o0i
,
_mm256_and_si256
(
__n
,
_mm256_or_si256
(
_mm256_cmpeq_epi32
(
__o0i
,
__n
),
_mm256_cmpgt_epi32
(
__o0i
,
__n
))));
__m256
__v_r1
=
_mm256_mul_ps
(
__mag
,
__rbin
);
__m256
__v_r0
=
_mm256_sub_ps
(
__mag
,
__v_r1
);
__m256
__v_rc11
=
_mm256_mul_ps
(
__v_r1
,
__cbin
);
__m256
__v_rc10
=
_mm256_sub_ps
(
__v_r1
,
__v_rc11
);
__m256
__v_rc01
=
_mm256_mul_ps
(
__v_r0
,
__cbin
);
__m256
__v_rc00
=
_mm256_sub_ps
(
__v_r0
,
__v_rc01
);
__m256
__v_rco111
=
_mm256_mul_ps
(
__v_rc11
,
__obin
);
__m256
__v_rco110
=
_mm256_sub_ps
(
__v_rc11
,
__v_rco111
);
__m256
__v_rco101
=
_mm256_mul_ps
(
__v_rc10
,
__obin
);
__m256
__v_rco100
=
_mm256_sub_ps
(
__v_rc10
,
__v_rco101
);
__m256
__v_rco011
=
_mm256_mul_ps
(
__v_rc01
,
__obin
);
__m256
__v_rco010
=
_mm256_sub_ps
(
__v_rc01
,
__v_rco011
);
__m256
__v_rco001
=
_mm256_mul_ps
(
__v_rc00
,
__obin
);
__m256
__v_rco000
=
_mm256_sub_ps
(
__v_rc00
,
__v_rco001
);
__m256i
__one
=
_mm256_set1_epi32
(
1
);
__m256i
__idx
=
_mm256_add_epi32
(
_mm256_mullo_epi32
(
_mm256_add_epi32
(
_mm256_mullo_epi32
(
_mm256_add_epi32
(
_mm256_cvtps_epi32
(
__r0
),
__one
),
_mm256_set1_epi32
(
d
+
2
)),
_mm256_add_epi32
(
_mm256_cvtps_epi32
(
__c0
),
__one
)),
_mm256_set1_epi32
(
n
+
2
)),
__o0i
);
_mm256_store_si256
((
__m256i
*
)
idx_buf
,
__idx
);
_mm256_store_ps
(
&
(
rco_buf
[
0
]),
__v_rco000
);
_mm256_store_ps
(
&
(
rco_buf
[
8
]),
__v_rco001
);
_mm256_store_ps
(
&
(
rco_buf
[
16
]),
__v_rco010
);
_mm256_store_ps
(
&
(
rco_buf
[
24
]),
__v_rco011
);
_mm256_store_ps
(
&
(
rco_buf
[
32
]),
__v_rco100
);
_mm256_store_ps
(
&
(
rco_buf
[
40
]),
__v_rco101
);
_mm256_store_ps
(
&
(
rco_buf
[
48
]),
__v_rco110
);
_mm256_store_ps
(
&
(
rco_buf
[
56
]),
__v_rco111
);
#define HIST_SUM_HELPER(id) \
hist[idx_buf[(id)]] += rco_buf[(id)]; \
hist[idx_buf[(id)]+1] += rco_buf[8 + (id)]; \
hist[idx_buf[(id)]+(n+2)] += rco_buf[16 + (id)]; \
hist[idx_buf[(id)]+(n+3)] += rco_buf[24 + (id)]; \
hist[idx_buf[(id)]+(d+2)*(n+2)] += rco_buf[32 + (id)]; \
hist[idx_buf[(id)]+(d+2)*(n+2)+1] += rco_buf[40 + (id)]; \
hist[idx_buf[(id)]+(d+3)*(n+2)] += rco_buf[48 + (id)]; \
hist[idx_buf[(id)]+(d+3)*(n+2)+1] += rco_buf[56 + (id)];
HIST_SUM_HELPER
(
0
);
HIST_SUM_HELPER
(
1
);
HIST_SUM_HELPER
(
2
);
HIST_SUM_HELPER
(
3
);
HIST_SUM_HELPER
(
4
);
HIST_SUM_HELPER
(
5
);
HIST_SUM_HELPER
(
6
);
HIST_SUM_HELPER
(
7
);
#undef HIST_SUM_HELPER
}
}
#endif
for
(
;
k
<
len
;
k
++
)
{
float
rbin
=
RBin
[
k
],
cbin
=
CBin
[
k
];
float
obin
=
(
Ori
[
k
]
-
ori
)
*
bins_per_rad
;
...
...
@@ -681,10 +839,59 @@ static void calcSIFTDescriptor( const Mat& img, Point2f ptf, float ori, float sc
// to byte array
float
nrm2
=
0
;
len
=
d
*
d
*
n
;
for
(
k
=
0
;
k
<
len
;
k
++
)
k
=
0
;
#if CV_AVX2
if
(
USE_AVX2
)
{
float
CV_DECL_ALIGNED
(
32
)
nrm2_buf
[
8
];
__m256
__nrm2
=
_mm256_setzero_ps
();
__m256
__dst
;
for
(
;
k
<=
len
-
8
;
k
+=
8
)
{
__dst
=
_mm256_loadu_ps
(
&
dst
[
k
]);
#if CV_FMA3
__nrm2
=
_mm256_fmadd_ps
(
__dst
,
__dst
,
__nrm2
);
#else
__nrm2
=
_mm256_add_ps
(
__nrm2
,
_mm256_mul_ps
(
__dst
,
__dst
));
#endif
}
_mm256_store_ps
(
nrm2_buf
,
__nrm2
);
nrm2
=
nrm2_buf
[
0
]
+
nrm2_buf
[
1
]
+
nrm2_buf
[
2
]
+
nrm2_buf
[
3
]
+
nrm2_buf
[
4
]
+
nrm2_buf
[
5
]
+
nrm2_buf
[
6
]
+
nrm2_buf
[
7
];
}
#endif
for
(
;
k
<
len
;
k
++
)
nrm2
+=
dst
[
k
]
*
dst
[
k
];
float
thr
=
std
::
sqrt
(
nrm2
)
*
SIFT_DESCR_MAG_THR
;
for
(
i
=
0
,
nrm2
=
0
;
i
<
k
;
i
++
)
i
=
0
,
nrm2
=
0
;
#if 0 //CV_AVX2
// This code cannot be enabled because it sums nrm2 in a different order,
// thus producing slightly different results
if( USE_AVX2 )
{
float CV_DECL_ALIGNED(32) nrm2_buf[8];
__m256 __dst;
__m256 __nrm2 = _mm256_setzero_ps();
__m256 __thr = _mm256_set1_ps(thr);
for( ; i <= len - 8; i += 8 )
{
__dst = _mm256_loadu_ps(&dst[i]);
__dst = _mm256_min_ps(__dst, __thr);
_mm256_storeu_ps(&dst[i], __dst);
#if CV_FMA3
__nrm2 = _mm256_fmadd_ps(__dst, __dst, __nrm2);
#else
__nrm2 = _mm256_add_ps(__nrm2, _mm256_mul_ps(__dst, __dst));
#endif
}
_mm256_store_ps
(
nrm2_buf
,
__nrm2
);
nrm2
=
nrm2_buf
[
0
]
+
nrm2_buf
[
1
]
+
nrm2_buf
[
2
]
+
nrm2_buf
[
3
]
+
nrm2_buf
[
4
]
+
nrm2_buf
[
5
]
+
nrm2_buf
[
6
]
+
nrm2_buf
[
7
];
}
#endif
for
(
;
i
<
len
;
i
++
)
{
float
val
=
std
::
min
(
dst
[
i
],
thr
);
dst
[
i
]
=
val
;
...
...
@@ -693,7 +900,23 @@ static void calcSIFTDescriptor( const Mat& img, Point2f ptf, float ori, float sc
nrm2
=
SIFT_INT_DESCR_FCTR
/
std
::
max
(
std
::
sqrt
(
nrm2
),
FLT_EPSILON
);
#if 1
for
(
k
=
0
;
k
<
len
;
k
++
)
k
=
0
;
#if CV_AVX2
if
(
USE_AVX2
)
{
__m256
__dst
;
__m256
__min
=
_mm256_setzero_ps
();
__m256
__max
=
_mm256_set1_ps
(
255.0
f
);
// max of uchar
__m256
__nrm2
=
_mm256_set1_ps
(
nrm2
);
for
(
k
=
0
;
k
<=
len
-
8
;
k
+=
8
)
{
__dst
=
_mm256_loadu_ps
(
&
dst
[
k
]);
__dst
=
_mm256_min_ps
(
_mm256_max_ps
(
_mm256_round_ps
(
_mm256_mul_ps
(
__dst
,
__nrm2
),
_MM_FROUND_TO_NEAREST_INT
|
_MM_FROUND_NO_EXC
),
__min
),
__max
);
_mm256_storeu_ps
(
&
dst
[
k
],
__dst
);
}
}
#endif
for
(
;
k
<
len
;
k
++
)
{
dst
[
k
]
=
saturate_cast
<
uchar
>
(
dst
[
k
]
*
nrm2
);
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment