Commit d7ff9243 authored by Andrey Kamaev's avatar Andrey Kamaev

Better NEON Hamming distance

parent 509730c1
...@@ -119,7 +119,7 @@ CV_INLINE IppiSize ippiSize(int width, int height) ...@@ -119,7 +119,7 @@ CV_INLINE IppiSize ippiSize(int width, int height)
#define CV_SSE3 0 #define CV_SSE3 0
#endif #endif
#if defined ANDROID && defined __ARM_NEON__ #if defined ANDROID && defined __ARM_NEON__ && defined __GNUC__
#include "arm_neon.h" #include "arm_neon.h"
#define CV_NEON 1 #define CV_NEON 1
......
...@@ -963,26 +963,22 @@ static const uchar popCountTable4[] = ...@@ -963,26 +963,22 @@ static const uchar popCountTable4[] =
int normHamming(const uchar* a, const uchar* b, int n) int normHamming(const uchar* a, const uchar* b, int n)
{ {
int i = 0, result = 0; int i = 0, result = 0;
#if defined __GNUC__ && CV_NEON #if CV_NEON
if (CPU_HAS_NEON_FEATURE) if (CPU_HAS_NEON_FEATURE)
{ {
result = 0; uint32x4_t bits = vmovq_n_u32(0);
for( ; i <= n - 16; i += 16 ) for (; i <= n - 16; i += 16) {
{
uint8x16_t A_vec = vld1q_u8 (a + i); uint8x16_t A_vec = vld1q_u8 (a + i);
uint8x16_t B_vec = vld1q_u8 (b + i); uint8x16_t B_vec = vld1q_u8 (b + i);
//uint8x16_t veorq_u8 (uint8x16_t, uint8x16_t)
uint8x16_t AxorB = veorq_u8 (A_vec, B_vec); uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
uint8x16_t bitsSet = vcntq_u8 (AxorB); uint8x16_t bitsSet = vcntq_u8 (AxorB);
//uint16x8_t vpadalq_u8 (uint16x8_t, uint8x16_t)
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet); uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8); uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
bits = vaddq_u32(bits, bitSet4);
uint64x2_t bitSet2 = vpaddlq_u32 (bitSet4);
result += vgetq_lane_u64 (bitSet2,0);
result += vgetq_lane_u64 (bitSet2,1);
} }
uint64x2_t bitSet2 = vpaddlq_u32 (bits);
result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
} }
else else
#endif #endif
......
...@@ -421,43 +421,42 @@ struct Hamming ...@@ -421,43 +421,42 @@ struct Hamming
{ {
ResultType result = 0; ResultType result = 0;
#if __GNUC__ #if __GNUC__
#if ANDROID && HAVE_NEON #if CV_NEON
static uint64_t features = android_getCpuFeatures(); if (CPU_HAS_NEON_FEATURE) {
if ((features& ANDROID_CPU_ARM_FEATURE_NEON)) { uint32x4_t bits = vmovq_n_u32(0);
for (size_t i = 0; i < size; i += 16) { for (size_t i = 0; i < size; i += 16) {
uint8x16_t A_vec = vld1q_u8 (a + i); uint8x16_t A_vec = vld1q_u8 (a + i);
uint8x16_t B_vec = vld1q_u8 (b + i); uint8x16_t B_vec = vld1q_u8 (b + i);
//uint8x16_t veorq_u8 (uint8x16_t, uint8x16_t)
uint8x16_t AxorB = veorq_u8 (A_vec, B_vec); uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
uint8x16_t bitsSet = vcntq_u8 (AxorB);
uint8x16_t bitsSet += vcntq_u8 (AxorB);
//uint16x8_t vpadalq_u8 (uint16x8_t, uint8x16_t)
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet); uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8); uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
bits = vaddq_u32(bits, bitSet4);
uint64x2_t bitSet2 = vpaddlq_u32 (bitSet4);
result += vgetq_lane_u64 (bitSet2,0);
result += vgetq_lane_u64 (bitSet2,1);
} }
uint64x2_t bitSet2 = vpaddlq_u32 (bits);
result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
} }
else else
#endif #endif
//for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll) {
typedef unsigned long long pop_t; //for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll)
const size_t modulo = size % sizeof(pop_t); typedef unsigned long long pop_t;
const pop_t* a2 = reinterpret_cast<const pop_t*> (a); const size_t modulo = size % sizeof(pop_t);
const pop_t* b2 = reinterpret_cast<const pop_t*> (b); const pop_t* a2 = reinterpret_cast<const pop_t*> (a);
const pop_t* a2_end = a2 + (size / sizeof(pop_t)); const pop_t* b2 = reinterpret_cast<const pop_t*> (b);
const pop_t* a2_end = a2 + (size / sizeof(pop_t));
for (; a2 != a2_end; ++a2, ++b2) result += __builtin_popcountll((*a2) ^ (*b2));
for (; a2 != a2_end; ++a2, ++b2) result += __builtin_popcountll((*a2) ^ (*b2));
if (modulo) {
//in the case where size is not dividable by sizeof(size_t) if (modulo) {
//need to mask off the bits at the end //in the case where size is not dividable by sizeof(size_t)
pop_t a_final = 0, b_final = 0; //need to mask off the bits at the end
memcpy(&a_final, a2, modulo); pop_t a_final = 0, b_final = 0;
memcpy(&b_final, b2, modulo); memcpy(&a_final, a2, modulo);
result += __builtin_popcountll(a_final ^ b_final); memcpy(&b_final, b2, modulo);
result += __builtin_popcountll(a_final ^ b_final);
}
} }
#else #else
HammingLUT lut; HammingLUT lut;
......
...@@ -312,7 +312,11 @@ buildIndex(void*& index, const Mat& data, const IndexParams& params, const Dista ...@@ -312,7 +312,11 @@ buildIndex(void*& index, const Mat& data, const IndexParams& params, const Dista
buildIndex_<Distance, ::cvflann::Index<Distance> >(index, data, params, dist); buildIndex_<Distance, ::cvflann::Index<Distance> >(index, data, params, dist);
} }
#if CV_NEON
typedef ::cvflann::Hamming<uchar> HammingDistance;
#else
typedef ::cvflann::HammingLUT HammingDistance; typedef ::cvflann::HammingLUT HammingDistance;
#endif
typedef ::cvflann::LshIndex<HammingDistance> LshIndex; typedef ::cvflann::LshIndex<HammingDistance> LshIndex;
Index::Index() Index::Index()
......
...@@ -5,6 +5,12 @@ ...@@ -5,6 +5,12 @@
#include <cstdarg> #include <cstdarg>
#include <sstream> #include <sstream>
#ifdef HAVE_CVCONFIG_H
# include "cvconfig.h"
#endif
#include "opencv2/core/core.hpp"
#include "opencv2/core/internal.hpp"
#include "opencv2/flann/miniflann.hpp" #include "opencv2/flann/miniflann.hpp"
#include "opencv2/flann/dist.h" #include "opencv2/flann/dist.h"
#include "opencv2/flann/index_testing.h" #include "opencv2/flann/index_testing.h"
...@@ -15,7 +21,6 @@ ...@@ -15,7 +21,6 @@
// index types // index types
#include "opencv2/flann/all_indices.h" #include "opencv2/flann/all_indices.h"
#include "opencv2/flann/flann_base.hpp" #include "opencv2/flann/flann_base.hpp"
#endif #endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment