Better NEON Hamming distance

d7ff9243 · Andrey Kamaev · 509730c1 · d7ff9243 · d7ff9243 · d7ff9243
Commit d7ff9243 authored Oct 26, 2011 by Andrey Kamaev
5 changed files
--- a/modules/core/include/opencv2/core/internal.hpp
+++ b/modules/core/include/opencv2/core/internal.hpp
@@ -119,7 +119,7 @@ CV_INLINE IppiSize ippiSize(int width, int height)
 #define CV_SSE3 0
 #endif
-#if defined ANDROID && defined __ARM_NEON__
+#if defined ANDROID && defined __ARM_NEON__ && defined __GNUC__
 #include "arm_neon.h"
 #define CV_NEON 1

--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -963,26 +963,22 @@ static const uchar popCountTable4[] =
 int normHamming(const uchar* a, const uchar* b, int n)
 {
    int i = 0, result = 0;
-#if defined __GNUC__ && CV_NEON
+#if CV_NEON
    if (CPU_HAS_NEON_FEATURE)
    {
-        result = 0;  
+        uint32x4_t bits = vmovq_n_u32(0);
-        for( ; i <= n - 16; i += 16 )
+        for (; i <= n - 16; i += 16) {
-        {
            uint8x16_t A_vec = vld1q_u8 (a + i);
            uint8x16_t B_vec = vld1q_u8 (b + i);
-            //uint8x16_t veorq_u8 (uint8x16_t, uint8x16_t)
            uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
            uint8x16_t bitsSet = vcntq_u8 (AxorB);
-            //uint16x8_t vpadalq_u8 (uint16x8_t, uint8x16_t)
            uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
            uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
+            bits = vaddq_u32(bits, bitSet4);
-            uint64x2_t bitSet2 = vpaddlq_u32 (bitSet4);
-            result += vgetq_lane_u64 (bitSet2,0);
-            result += vgetq_lane_u64 (bitSet2,1);
        }
+        uint64x2_t bitSet2 = vpaddlq_u32 (bits);
+        result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
+        result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
    }
    else
 #endif

--- a/modules/flann/include/opencv2/flann/dist.h
+++ b/modules/flann/include/opencv2/flann/dist.h
@@ -421,43 +421,42 @@ struct Hamming
    {
        ResultType result = 0;
 #if __GNUC__
-#if ANDROID && HAVE_NEON
+#if CV_NEON
-        static uint64_t features = android_getCpuFeatures();
+        if (CPU_HAS_NEON_FEATURE) {
-        if ((features& ANDROID_CPU_ARM_FEATURE_NEON)) {
+            uint32x4_t bits = vmovq_n_u32(0);
            for (size_t i = 0; i < size; i += 16) {
                uint8x16_t A_vec = vld1q_u8 (a + i);
                uint8x16_t B_vec = vld1q_u8 (b + i);
-                //uint8x16_t veorq_u8 (uint8x16_t, uint8x16_t)
                uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
+                uint8x16_t bitsSet = vcntq_u8 (AxorB);
-                uint8x16_t bitsSet += vcntq_u8 (AxorB);
-                //uint16x8_t vpadalq_u8 (uint16x8_t, uint8x16_t)
                uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
                uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
+                bits = vaddq_u32(bits, bitSet4);
-                uint64x2_t bitSet2 = vpaddlq_u32 (bitSet4);
-                result += vgetq_lane_u64 (bitSet2,0);
-                result += vgetq_lane_u64 (bitSet2,1);
            }
+            uint64x2_t bitSet2 = vpaddlq_u32 (bits);
+            result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
+            result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
        }
        else
 #endif
-        //for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll)
+        {
-        typedef unsigned long long pop_t;
+            //for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll)
-        const size_t modulo = size % sizeof(pop_t);
+            typedef unsigned long long pop_t;
-        const pop_t* a2 = reinterpret_cast<const pop_t*> (a);
+            const size_t modulo = size % sizeof(pop_t);
-        const pop_t* b2 = reinterpret_cast<const pop_t*> (b);
+            const pop_t* a2 = reinterpret_cast<const pop_t*> (a);
-        const pop_t* a2_end = a2 + (size / sizeof(pop_t));
+            const pop_t* b2 = reinterpret_cast<const pop_t*> (b);
+            const pop_t* a2_end = a2 + (size / sizeof(pop_t));
-        for (; a2 != a2_end; ++a2, ++b2) result += __builtin_popcountll((*a2) ^ (*b2));
+            for (; a2 != a2_end; ++a2, ++b2) result += __builtin_popcountll((*a2) ^ (*b2));
-        if (modulo) {
-            //in the case where size is not dividable by sizeof(size_t)
+            if (modulo) {
-            //need to mask off the bits at the end
+                //in the case where size is not dividable by sizeof(size_t)
-            pop_t a_final = 0, b_final = 0;
+                //need to mask off the bits at the end
-            memcpy(&a_final, a2, modulo);
+                pop_t a_final = 0, b_final = 0;
-            memcpy(&b_final, b2, modulo);
+                memcpy(&a_final, a2, modulo);
-            result += __builtin_popcountll(a_final ^ b_final);
+                memcpy(&b_final, b2, modulo);
+                result += __builtin_popcountll(a_final ^ b_final);
+            }
        }
 #else
        HammingLUT lut;

--- a/modules/flann/src/miniflann.cpp
+++ b/modules/flann/src/miniflann.cpp
@@ -312,7 +312,11 @@ buildIndex(void*& index, const Mat& data, const IndexParams& params, const Dista
    buildIndex_<Distance, ::cvflann::Index<Distance> >(index, data, params, dist);
 }
+#if CV_NEON
+typedef ::cvflann::Hamming<uchar> HammingDistance;
+#else
 typedef ::cvflann::HammingLUT HammingDistance;
+#endif
 typedef ::cvflann::LshIndex<HammingDistance> LshIndex;
 Index::Index()

--- a/modules/flann/src/precomp.hpp
+++ b/modules/flann/src/precomp.hpp
@@ -5,6 +5,12 @@
 #include <cstdarg>
 #include <sstream>
+#ifdef HAVE_CVCONFIG_H
+# include "cvconfig.h"
+#endif
+#include "opencv2/core/core.hpp"
+#include "opencv2/core/internal.hpp"
 #include "opencv2/flann/miniflann.hpp"
 #include "opencv2/flann/dist.h"
 #include "opencv2/flann/index_testing.h"
@@ -15,7 +21,6 @@
 // index types
 #include "opencv2/flann/all_indices.h"
 #include "opencv2/flann/flann_base.hpp"
 #endif