Commit bf718b08 authored by Tomoaki Teshima

use universal intrinsic in FAST

parent 6a5298a5
...@@ -42,7 +42,7 @@ The references are: ...@@ -42,7 +42,7 @@ The references are:
*/ */
#include "fast_score.hpp" #include "fast_score.hpp"
#include "opencv2/core/hal/intrin.hpp"
#define VERIFY_CORNERS 0 #define VERIFY_CORNERS 0
namespace cv { namespace cv {
...@@ -125,45 +125,48 @@ int cornerScore<16>(const uchar* ptr, const int pixel[], int threshold) ...@@ -125,45 +125,48 @@ int cornerScore<16>(const uchar* ptr, const int pixel[], int threshold)
for( k = 0; k < N; k++ ) for( k = 0; k < N; k++ )
d[k] = (short)(v - ptr[pixel[k]]); d[k] = (short)(v - ptr[pixel[k]]);
#if CV_SSE2 #if CV_SIMD128
__m128i q0 = _mm_set1_epi16(-1000), q1 = _mm_set1_epi16(1000); if (hasSIMD128())
for( k = 0; k < 16; k += 8 ) {
v_int16x8 q0 = v_setall_s16(-1000), q1 = v_setall_s16(1000);
for (k = 0; k < 16; k += 8)
{ {
__m128i v0 = _mm_loadu_si128((__m128i*)(d+k+1)); v_int16x8 v0 = v_load(d + k + 1);
__m128i v1 = _mm_loadu_si128((__m128i*)(d+k+2)); v_int16x8 v1 = v_load(d + k + 2);
__m128i a = _mm_min_epi16(v0, v1); v_int16x8 a = v_min(v0, v1);
__m128i b = _mm_max_epi16(v0, v1); v_int16x8 b = v_max(v0, v1);
v0 = _mm_loadu_si128((__m128i*)(d+k+3)); v0 = v_load(d + k + 3);
a = _mm_min_epi16(a, v0); a = v_min(a, v0);
b = _mm_max_epi16(b, v0); b = v_max(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k+4)); v0 = v_load(d + k + 4);
a = _mm_min_epi16(a, v0); a = v_min(a, v0);
b = _mm_max_epi16(b, v0); b = v_max(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k+5)); v0 = v_load(d + k + 5);
a = _mm_min_epi16(a, v0); a = v_min(a, v0);
b = _mm_max_epi16(b, v0); b = v_max(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k+6)); v0 = v_load(d + k + 6);
a = _mm_min_epi16(a, v0); a = v_min(a, v0);
b = _mm_max_epi16(b, v0); b = v_max(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k+7)); v0 = v_load(d + k + 7);
a = _mm_min_epi16(a, v0); a = v_min(a, v0);
b = _mm_max_epi16(b, v0); b = v_max(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k+8)); v0 = v_load(d + k + 8);
a = _mm_min_epi16(a, v0); a = v_min(a, v0);
b = _mm_max_epi16(b, v0); b = v_max(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k)); v0 = v_load(d + k);
q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0)); q0 = v_max(q0, v_min(a, v0));
q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0)); q1 = v_min(q1, v_max(b, v0));
v0 = _mm_loadu_si128((__m128i*)(d+k+9)); v0 = v_load(d + k + 9);
q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0)); q0 = v_max(q0, v_min(a, v0));
q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0)); q1 = v_min(q1, v_max(b, v0));
} }
q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1)); q0 = v_max(q0, v_setzero_s16() - q1);
q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0)); threshold = v_reduce_max(q0) - 1;
q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4)); }
q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2)); else
threshold = (short)_mm_cvtsi128_si32(q0) - 1; #endif
#else {
int a0 = threshold; int a0 = threshold;
for( k = 0; k < 16; k += 2 ) for( k = 0; k < 16; k += 2 )
{ {
...@@ -197,8 +200,8 @@ int cornerScore<16>(const uchar* ptr, const int pixel[], int threshold) ...@@ -197,8 +200,8 @@ int cornerScore<16>(const uchar* ptr, const int pixel[], int threshold)
b0 = std::min(b0, std::max(b, (int)d[k+9])); b0 = std::min(b0, std::max(b, (int)d[k+9]));
} }
threshold = -b0-1; threshold = -b0 - 1;
#endif }
#if VERIFY_CORNERS #if VERIFY_CORNERS
testCorner(ptr, pixel, K, N, threshold); testCorner(ptr, pixel, K, N, threshold);
...@@ -214,44 +217,46 @@ int cornerScore<12>(const uchar* ptr, const int pixel[], int threshold) ...@@ -214,44 +217,46 @@ int cornerScore<12>(const uchar* ptr, const int pixel[], int threshold)
short d[N + 4]; short d[N + 4];
for( k = 0; k < N; k++ ) for( k = 0; k < N; k++ )
d[k] = (short)(v - ptr[pixel[k]]); d[k] = (short)(v - ptr[pixel[k]]);
#if CV_SSE2 #if CV_SIMD128
for( k = 0; k < 4; k++ ) for( k = 0; k < 4; k++ )
d[N+k] = d[k]; d[N+k] = d[k];
#endif #endif
#if CV_SSE2 #if CV_SIMD128
__m128i q0 = _mm_set1_epi16(-1000), q1 = _mm_set1_epi16(1000); if (hasSIMD128())
for( k = 0; k < 16; k += 8 )
{ {
__m128i v0 = _mm_loadu_si128((__m128i*)(d+k+1)); v_int16x8 q0 = v_setall_s16(-1000), q1 = v_setall_s16(1000);
__m128i v1 = _mm_loadu_si128((__m128i*)(d+k+2)); for (k = 0; k < 16; k += 8)
__m128i a = _mm_min_epi16(v0, v1); {
__m128i b = _mm_max_epi16(v0, v1); v_int16x8 v0 = v_load(d + k + 1);
v0 = _mm_loadu_si128((__m128i*)(d+k+3)); v_int16x8 v1 = v_load(d + k + 2);
a = _mm_min_epi16(a, v0); v_int16x8 a = v_min(v0, v1);
b = _mm_max_epi16(b, v0); v_int16x8 b = v_max(v0, v1);
v0 = _mm_loadu_si128((__m128i*)(d+k+4)); v0 = v_load(d + k + 3);
a = _mm_min_epi16(a, v0); a = v_min(a, v0);
b = _mm_max_epi16(b, v0); b = v_max(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k+5)); v0 = v_load(d + k + 4);
a = _mm_min_epi16(a, v0); a = v_min(a, v0);
b = _mm_max_epi16(b, v0); b = v_max(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k+6)); v0 = v_load(d + k + 5);
a = _mm_min_epi16(a, v0); a = v_min(a, v0);
b = _mm_max_epi16(b, v0); b = v_max(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k)); v0 = v_load(d + k + 6);
q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0)); a = v_min(a, v0);
q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0)); b = v_max(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k+7)); v0 = v_load(d + k);
q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0)); q0 = v_max(q0, v_min(a, v0));
q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0)); q1 = v_min(q1, v_max(b, v0));
v0 = v_load(d + k + 7);
q0 = v_max(q0, v_min(a, v0));
q1 = v_min(q1, v_max(b, v0));
}
q0 = v_max(q0, v_setzero_s16() - q1);
threshold = v_reduce_max(q0) - 1;
} }
q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1)); else
q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0)); #endif
q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4)); {
q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2));
threshold = (short)_mm_cvtsi128_si32(q0) - 1;
#else
int a0 = threshold; int a0 = threshold;
for( k = 0; k < 12; k += 2 ) for( k = 0; k < 12; k += 2 )
{ {
...@@ -282,8 +287,7 @@ int cornerScore<12>(const uchar* ptr, const int pixel[], int threshold) ...@@ -282,8 +287,7 @@ int cornerScore<12>(const uchar* ptr, const int pixel[], int threshold)
} }
threshold = -b0-1; threshold = -b0-1;
#endif }
#if VERIFY_CORNERS #if VERIFY_CORNERS
testCorner(ptr, pixel, K, N, threshold); testCorner(ptr, pixel, K, N, threshold);
#endif #endif
...@@ -293,35 +297,37 @@ int cornerScore<12>(const uchar* ptr, const int pixel[], int threshold) ...@@ -293,35 +297,37 @@ int cornerScore<12>(const uchar* ptr, const int pixel[], int threshold)
template<> template<>
int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold) int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold)
{ {
const int K = 4, N = K*3 + 1; const int K = 4, N = K * 3 + 1;
int k, v = ptr[0]; int k, v = ptr[0];
short d[N]; short d[N];
for( k = 0; k < N; k++ ) for (k = 0; k < N; k++)
d[k] = (short)(v - ptr[pixel[k]]); d[k] = (short)(v - ptr[pixel[k]]);
#if CV_SSE2 #if CV_SIMD128
__m128i v0 = _mm_loadu_si128((__m128i*)(d+1)); if (hasSIMD128())
__m128i v1 = _mm_loadu_si128((__m128i*)(d+2)); {
__m128i a = _mm_min_epi16(v0, v1); v_int16x8 v0 = v_load(d + 1);
__m128i b = _mm_max_epi16(v0, v1); v_int16x8 v1 = v_load(d + 2);
v0 = _mm_loadu_si128((__m128i*)(d+3)); v_int16x8 a = v_min(v0, v1);
a = _mm_min_epi16(a, v0); v_int16x8 b = v_max(v0, v1);
b = _mm_max_epi16(b, v0); v0 = v_load(d + 3);
v0 = _mm_loadu_si128((__m128i*)(d+4)); a = v_min(a, v0);
a = _mm_min_epi16(a, v0); b = v_max(b, v0);
b = _mm_max_epi16(b, v0); v0 = v_load(d + 4);
v0 = _mm_loadu_si128((__m128i*)(d)); a = v_min(a, v0);
__m128i q0 = _mm_min_epi16(a, v0); b = v_max(b, v0);
__m128i q1 = _mm_max_epi16(b, v0); v0 = v_load(d);
v0 = _mm_loadu_si128((__m128i*)(d+5)); v_int16x8 q0 = v_min(a, v0);
q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0)); v_int16x8 q1 = v_max(b, v0);
q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0)); v0 = v_load(d + 5);
q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1)); q0 = v_max(q0, v_min(a, v0));
q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0)); q1 = v_min(q1, v_max(b, v0));
q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4)); q0 = v_max(q0, v_setzero_s16() - q1);
q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2)); threshold = v_reduce_max(q0) - 1;
threshold = (short)_mm_cvtsi128_si32(q0) - 1; }
#else else
#endif
{
int a0 = threshold; int a0 = threshold;
for( k = 0; k < 8; k += 2 ) for( k = 0; k < 8; k += 2 )
{ {
...@@ -348,7 +354,7 @@ int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold) ...@@ -348,7 +354,7 @@ int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold)
} }
threshold = -b0-1; threshold = -b0-1;
#endif }
#if VERIFY_CORNERS #if VERIFY_CORNERS
testCorner(ptr, pixel, K, N, threshold); testCorner(ptr, pixel, K, N, threshold);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment