Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv_contrib
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv_contrib
Commits
c5e55dfd
Commit
c5e55dfd
authored
Mar 08, 2017
by
Woody Chow
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Optimize SIFT with AVX2
parent
6ab7a0df
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
231 additions
and
6 deletions
+231
-6
precomp.hpp
modules/xfeatures2d/src/precomp.hpp
+2
-0
sift.cpp
modules/xfeatures2d/src/sift.cpp
+229
-6
No files found.
modules/xfeatures2d/src/precomp.hpp
View file @
c5e55dfd
...
...
@@ -61,4 +61,6 @@
#include "opencv2/core/private.hpp"
#define USE_AVX2 (cv::checkHardwareSupport(CV_CPU_AVX2))
#endif
modules/xfeatures2d/src/sift.cpp
View file @
c5e55dfd
...
...
@@ -344,7 +344,40 @@ static float calcOrientationHist( const Mat& img, Point pt, int radius,
cv
::
hal
::
fastAtan2
(
Y
,
X
,
Ori
,
len
,
true
);
cv
::
hal
::
magnitude32f
(
X
,
Y
,
Mag
,
len
);
for
(
k
=
0
;
k
<
len
;
k
++
)
k
=
0
;
#if CV_AVX2
if
(
USE_AVX2
)
{
__m256
__nd360
=
_mm256_set1_ps
(
n
/
360.
f
);
__m256i
__n
=
_mm256_set1_epi32
(
n
);
int
CV_DECL_ALIGNED
(
32
)
bin_buf
[
8
];
float
CV_DECL_ALIGNED
(
32
)
w_mul_mag_buf
[
8
];
for
(
;
k
<=
len
-
8
;
k
+=
8
)
{
__m256i
__bin
=
_mm256_cvtps_epi32
(
_mm256_round_ps
(
_mm256_mul_ps
(
__nd360
,
_mm256_loadu_ps
(
&
Ori
[
k
])),
_MM_FROUND_TO_NEAREST_INT
|
_MM_FROUND_NO_EXC
));
__bin
=
_mm256_sub_epi32
(
__bin
,
_mm256_and_si256
(
__n
,
_mm256_or_si256
(
_mm256_cmpeq_epi32
(
__bin
,
__n
),
_mm256_cmpgt_epi32
(
__bin
,
__n
))));
__bin
=
_mm256_add_epi32
(
__bin
,
_mm256_and_si256
(
__n
,
_mm256_cmpgt_epi32
(
_mm256_setzero_si256
(),
__bin
)));
__m256
__w_mul_mag
=
_mm256_mul_ps
(
_mm256_loadu_ps
(
&
W
[
k
]),
_mm256_loadu_ps
(
&
Mag
[
k
]));
_mm256_store_si256
((
__m256i
*
)
bin_buf
,
__bin
);
_mm256_store_ps
(
w_mul_mag_buf
,
__w_mul_mag
);
temphist
[
bin_buf
[
0
]]
+=
w_mul_mag_buf
[
0
];
temphist
[
bin_buf
[
1
]]
+=
w_mul_mag_buf
[
1
];
temphist
[
bin_buf
[
2
]]
+=
w_mul_mag_buf
[
2
];
temphist
[
bin_buf
[
3
]]
+=
w_mul_mag_buf
[
3
];
temphist
[
bin_buf
[
4
]]
+=
w_mul_mag_buf
[
4
];
temphist
[
bin_buf
[
5
]]
+=
w_mul_mag_buf
[
5
];
temphist
[
bin_buf
[
6
]]
+=
w_mul_mag_buf
[
6
];
temphist
[
bin_buf
[
7
]]
+=
w_mul_mag_buf
[
7
];
}
}
#endif
for
(
;
k
<
len
;
k
++
)
{
int
bin
=
cvRound
((
n
/
360.
f
)
*
Ori
[
k
]);
if
(
bin
>=
n
)
...
...
@@ -359,7 +392,40 @@ static float calcOrientationHist( const Mat& img, Point pt, int radius,
temphist
[
-
2
]
=
temphist
[
n
-
2
];
temphist
[
n
]
=
temphist
[
0
];
temphist
[
n
+
1
]
=
temphist
[
1
];
for
(
i
=
0
;
i
<
n
;
i
++
)
i
=
0
;
#if CV_AVX2
if
(
USE_AVX2
)
{
__m256
__d_1_16
=
_mm256_set1_ps
(
1.
f
/
16.
f
);
__m256
__d_4_16
=
_mm256_set1_ps
(
4.
f
/
16.
f
);
__m256
__d_6_16
=
_mm256_set1_ps
(
6.
f
/
16.
f
);
for
(
;
i
<=
n
-
8
;
i
+=
8
)
{
#if CV_FMA3
__m256
__hist
=
_mm256_fmadd_ps
(
_mm256_add_ps
(
_mm256_loadu_ps
(
&
temphist
[
i
-
2
]),
_mm256_loadu_ps
(
&
temphist
[
i
+
2
])),
__d_1_16
,
_mm256_fmadd_ps
(
_mm256_add_ps
(
_mm256_loadu_ps
(
&
temphist
[
i
-
1
]),
_mm256_loadu_ps
(
&
temphist
[
i
+
1
])),
__d_4_16
,
_mm256_mul_ps
(
_mm256_loadu_ps
(
&
temphist
[
i
]),
__d_6_16
)));
#else
__m256
__hist
=
_mm256_add_ps
(
_mm256_mul_ps
(
_mm256_add_ps
(
_mm256_loadu_ps
(
&
temphist
[
i
-
2
]),
_mm256_loadu_ps
(
&
temphist
[
i
+
2
])),
__d_1_16
),
_mm256_add_ps
(
_mm256_mul_ps
(
_mm256_add_ps
(
_mm256_loadu_ps
(
&
temphist
[
i
-
1
]),
_mm256_loadu_ps
(
&
temphist
[
i
+
1
])),
__d_4_16
),
_mm256_mul_ps
(
_mm256_loadu_ps
(
&
temphist
[
i
]),
__d_6_16
)));
#endif
_mm256_storeu_ps
(
&
hist
[
i
],
__hist
);
}
}
#endif
for
(
;
i
<
n
;
i
++
)
{
hist
[
i
]
=
(
temphist
[
i
-
2
]
+
temphist
[
i
+
2
])
*
(
1.
f
/
16.
f
)
+
(
temphist
[
i
-
1
]
+
temphist
[
i
+
1
])
*
(
4.
f
/
16.
f
)
+
...
...
@@ -627,7 +693,99 @@ static void calcSIFTDescriptor( const Mat& img, Point2f ptf, float ori, float sc
cv
::
hal
::
magnitude32f
(
X
,
Y
,
Mag
,
len
);
cv
::
hal
::
exp32f
(
W
,
W
,
len
);
for
(
k
=
0
;
k
<
len
;
k
++
)
k
=
0
;
#if CV_AVX2
if
(
USE_AVX2
)
{
int
CV_DECL_ALIGNED
(
32
)
idx_buf
[
8
];
float
CV_DECL_ALIGNED
(
32
)
rco_buf
[
64
];
__m256
__ori
=
_mm256_set1_ps
(
ori
);
__m256
__bins_per_rad
=
_mm256_set1_ps
(
bins_per_rad
);
__m256i
__n
=
_mm256_set1_epi32
(
n
);
for
(
;
k
<=
len
-
8
;
k
+=
8
)
{
__m256
__rbin
=
_mm256_loadu_ps
(
&
RBin
[
k
]);
__m256
__cbin
=
_mm256_loadu_ps
(
&
CBin
[
k
]);
__m256
__obin
=
_mm256_mul_ps
(
_mm256_sub_ps
(
_mm256_loadu_ps
(
&
Ori
[
k
]),
__ori
),
__bins_per_rad
);
__m256
__mag
=
_mm256_mul_ps
(
_mm256_loadu_ps
(
&
Mag
[
k
]),
_mm256_loadu_ps
(
&
W
[
k
]));
__m256
__r0
=
_mm256_floor_ps
(
__rbin
);
__rbin
=
_mm256_sub_ps
(
__rbin
,
__r0
);
__m256
__c0
=
_mm256_floor_ps
(
__cbin
);
__cbin
=
_mm256_sub_ps
(
__cbin
,
__c0
);
__m256
__o0
=
_mm256_floor_ps
(
__obin
);
__obin
=
_mm256_sub_ps
(
__obin
,
__o0
);
__m256i
__o0i
=
_mm256_cvtps_epi32
(
__o0
);
// _o0 += (o0 < 0) * n
__o0i
=
_mm256_add_epi32
(
__o0i
,
_mm256_and_si256
(
__n
,
_mm256_cmpgt_epi32
(
_mm256_setzero_si256
(),
__o0i
)));
__o0i
=
_mm256_sub_epi32
(
__o0i
,
_mm256_and_si256
(
__n
,
_mm256_or_si256
(
_mm256_cmpeq_epi32
(
__o0i
,
__n
),
_mm256_cmpgt_epi32
(
__o0i
,
__n
))));
__m256
__v_r1
=
_mm256_mul_ps
(
__mag
,
__rbin
);
__m256
__v_r0
=
_mm256_sub_ps
(
__mag
,
__v_r1
);
__m256
__v_rc11
=
_mm256_mul_ps
(
__v_r1
,
__cbin
);
__m256
__v_rc10
=
_mm256_sub_ps
(
__v_r1
,
__v_rc11
);
__m256
__v_rc01
=
_mm256_mul_ps
(
__v_r0
,
__cbin
);
__m256
__v_rc00
=
_mm256_sub_ps
(
__v_r0
,
__v_rc01
);
__m256
__v_rco111
=
_mm256_mul_ps
(
__v_rc11
,
__obin
);
__m256
__v_rco110
=
_mm256_sub_ps
(
__v_rc11
,
__v_rco111
);
__m256
__v_rco101
=
_mm256_mul_ps
(
__v_rc10
,
__obin
);
__m256
__v_rco100
=
_mm256_sub_ps
(
__v_rc10
,
__v_rco101
);
__m256
__v_rco011
=
_mm256_mul_ps
(
__v_rc01
,
__obin
);
__m256
__v_rco010
=
_mm256_sub_ps
(
__v_rc01
,
__v_rco011
);
__m256
__v_rco001
=
_mm256_mul_ps
(
__v_rc00
,
__obin
);
__m256
__v_rco000
=
_mm256_sub_ps
(
__v_rc00
,
__v_rco001
);
__m256i
__one
=
_mm256_set1_epi32
(
1
);
__m256i
__idx
=
_mm256_add_epi32
(
_mm256_mullo_epi32
(
_mm256_add_epi32
(
_mm256_mullo_epi32
(
_mm256_add_epi32
(
_mm256_cvtps_epi32
(
__r0
),
__one
),
_mm256_set1_epi32
(
d
+
2
)),
_mm256_add_epi32
(
_mm256_cvtps_epi32
(
__c0
),
__one
)),
_mm256_set1_epi32
(
n
+
2
)),
__o0i
);
_mm256_store_si256
((
__m256i
*
)
idx_buf
,
__idx
);
_mm256_store_ps
(
&
(
rco_buf
[
0
]),
__v_rco000
);
_mm256_store_ps
(
&
(
rco_buf
[
8
]),
__v_rco001
);
_mm256_store_ps
(
&
(
rco_buf
[
16
]),
__v_rco010
);
_mm256_store_ps
(
&
(
rco_buf
[
24
]),
__v_rco011
);
_mm256_store_ps
(
&
(
rco_buf
[
32
]),
__v_rco100
);
_mm256_store_ps
(
&
(
rco_buf
[
40
]),
__v_rco101
);
_mm256_store_ps
(
&
(
rco_buf
[
48
]),
__v_rco110
);
_mm256_store_ps
(
&
(
rco_buf
[
56
]),
__v_rco111
);
#define HIST_SUM_HELPER(id) \
hist[idx_buf[(id)]] += rco_buf[(id)]; \
hist[idx_buf[(id)]+1] += rco_buf[8 + (id)]; \
hist[idx_buf[(id)]+(n+2)] += rco_buf[16 + (id)]; \
hist[idx_buf[(id)]+(n+3)] += rco_buf[24 + (id)]; \
hist[idx_buf[(id)]+(d+2)*(n+2)] += rco_buf[32 + (id)]; \
hist[idx_buf[(id)]+(d+2)*(n+2)+1] += rco_buf[40 + (id)]; \
hist[idx_buf[(id)]+(d+3)*(n+2)] += rco_buf[48 + (id)]; \
hist[idx_buf[(id)]+(d+3)*(n+2)+1] += rco_buf[56 + (id)];
HIST_SUM_HELPER
(
0
);
HIST_SUM_HELPER
(
1
);
HIST_SUM_HELPER
(
2
);
HIST_SUM_HELPER
(
3
);
HIST_SUM_HELPER
(
4
);
HIST_SUM_HELPER
(
5
);
HIST_SUM_HELPER
(
6
);
HIST_SUM_HELPER
(
7
);
#undef HIST_SUM_HELPER
}
}
#endif
for
(
;
k
<
len
;
k
++
)
{
float
rbin
=
RBin
[
k
],
cbin
=
CBin
[
k
];
float
obin
=
(
Ori
[
k
]
-
ori
)
*
bins_per_rad
;
...
...
@@ -681,10 +839,59 @@ static void calcSIFTDescriptor( const Mat& img, Point2f ptf, float ori, float sc
// to byte array
float
nrm2
=
0
;
len
=
d
*
d
*
n
;
for
(
k
=
0
;
k
<
len
;
k
++
)
k
=
0
;
#if CV_AVX2
if
(
USE_AVX2
)
{
float
CV_DECL_ALIGNED
(
32
)
nrm2_buf
[
8
];
__m256
__nrm2
=
_mm256_setzero_ps
();
__m256
__dst
;
for
(
;
k
<=
len
-
8
;
k
+=
8
)
{
__dst
=
_mm256_loadu_ps
(
&
dst
[
k
]);
#if CV_FMA3
__nrm2
=
_mm256_fmadd_ps
(
__dst
,
__dst
,
__nrm2
);
#else
__nrm2
=
_mm256_add_ps
(
__nrm2
,
_mm256_mul_ps
(
__dst
,
__dst
));
#endif
}
_mm256_store_ps
(
nrm2_buf
,
__nrm2
);
nrm2
=
nrm2_buf
[
0
]
+
nrm2_buf
[
1
]
+
nrm2_buf
[
2
]
+
nrm2_buf
[
3
]
+
nrm2_buf
[
4
]
+
nrm2_buf
[
5
]
+
nrm2_buf
[
6
]
+
nrm2_buf
[
7
];
}
#endif
for
(
;
k
<
len
;
k
++
)
nrm2
+=
dst
[
k
]
*
dst
[
k
];
float
thr
=
std
::
sqrt
(
nrm2
)
*
SIFT_DESCR_MAG_THR
;
for
(
i
=
0
,
nrm2
=
0
;
i
<
k
;
i
++
)
i
=
0
,
nrm2
=
0
;
#if 0 //CV_AVX2
// This code cannot be enabled because it sums nrm2 in a different order,
// thus producing slightly different results
if( USE_AVX2 )
{
float CV_DECL_ALIGNED(32) nrm2_buf[8];
__m256 __dst;
__m256 __nrm2 = _mm256_setzero_ps();
__m256 __thr = _mm256_set1_ps(thr);
for( ; i <= len - 8; i += 8 )
{
__dst = _mm256_loadu_ps(&dst[i]);
__dst = _mm256_min_ps(__dst, __thr);
_mm256_storeu_ps(&dst[i], __dst);
#if CV_FMA3
__nrm2 = _mm256_fmadd_ps(__dst, __dst, __nrm2);
#else
__nrm2 = _mm256_add_ps(__nrm2, _mm256_mul_ps(__dst, __dst));
#endif
}
_mm256_store_ps
(
nrm2_buf
,
__nrm2
);
nrm2
=
nrm2_buf
[
0
]
+
nrm2_buf
[
1
]
+
nrm2_buf
[
2
]
+
nrm2_buf
[
3
]
+
nrm2_buf
[
4
]
+
nrm2_buf
[
5
]
+
nrm2_buf
[
6
]
+
nrm2_buf
[
7
];
}
#endif
for
(
;
i
<
len
;
i
++
)
{
float
val
=
std
::
min
(
dst
[
i
],
thr
);
dst
[
i
]
=
val
;
...
...
@@ -693,7 +900,23 @@ static void calcSIFTDescriptor( const Mat& img, Point2f ptf, float ori, float sc
nrm2
=
SIFT_INT_DESCR_FCTR
/
std
::
max
(
std
::
sqrt
(
nrm2
),
FLT_EPSILON
);
#if 1
for
(
k
=
0
;
k
<
len
;
k
++
)
k
=
0
;
#if CV_AVX2
if
(
USE_AVX2
)
{
__m256
__dst
;
__m256
__min
=
_mm256_setzero_ps
();
__m256
__max
=
_mm256_set1_ps
(
255.0
f
);
// max of uchar
__m256
__nrm2
=
_mm256_set1_ps
(
nrm2
);
for
(
k
=
0
;
k
<=
len
-
8
;
k
+=
8
)
{
__dst
=
_mm256_loadu_ps
(
&
dst
[
k
]);
__dst
=
_mm256_min_ps
(
_mm256_max_ps
(
_mm256_round_ps
(
_mm256_mul_ps
(
__dst
,
__nrm2
),
_MM_FROUND_TO_NEAREST_INT
|
_MM_FROUND_NO_EXC
),
__min
),
__max
);
_mm256_storeu_ps
(
&
dst
[
k
],
__dst
);
}
}
#endif
for
(
;
k
<
len
;
k
++
)
{
dst
[
k
]
=
saturate_cast
<
uchar
>
(
dst
[
k
]
*
nrm2
);
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment