Merge pull request #2480 from vpisarev:ocl_orb

6b434bef · Andrey Pavlenko · OpenCV Buildbot · 4a4151ec · 4fafab37 · 6b434bef
Commit 6b434bef authored Mar 14, 2014 by Andrey Pavlenko Committed by OpenCV Buildbot Mar 14, 2014
5 changed files
--- a/modules/features2d/perf/perf_orb.cpp
+++ b/modules/features2d/perf/perf_orb.cpp
@@ -28,7 +28,7 @@ PERF_TEST_P(orb, detect, testing::Values(ORB_IMAGES))
    TEST_CYCLE() detector(frame, mask, points);
    sort(points.begin(), points.end(), comparators::KeypointGreater());
-    SANITY_CHECK_KEYPOINTS(points);
+    SANITY_CHECK_KEYPOINTS(points, 1e-5);
 }
 PERF_TEST_P(orb, extract, testing::Values(ORB_IMAGES))
@@ -72,6 +72,6 @@ PERF_TEST_P(orb, full, testing::Values(ORB_IMAGES))
    TEST_CYCLE() detector(frame, mask, points, descriptors, false);
    perf::sort(points, descriptors);
-    SANITY_CHECK_KEYPOINTS(points);
+    SANITY_CHECK_KEYPOINTS(points, 1e-5);
    SANITY_CHECK(descriptors);
 }
--- a/modules/features2d/src/fast.cpp
+++ b/modules/features2d/src/fast.cpp
@@ -43,6 +43,7 @@ The references are:
 #include "precomp.hpp"
 #include "fast_score.hpp"
+#include "opencl_kernels.hpp"
 #if defined _MSC_VER
 # pragma warning( disable : 4127)
@@ -249,8 +250,90 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
    }
 }
+template<typename pt>
+struct cmp_pt
+{
+    bool operator ()(const pt& a, const pt& b) const { return a.y < b.y || (a.y == b.y && a.x < b.x); }
+};
+static bool ocl_FAST( InputArray _img, std::vector<KeyPoint>& keypoints,
+                     int threshold, bool nonmax_suppression, int maxKeypoints )
+{
+    UMat img = _img.getUMat();
+    if( img.cols < 7 || img.rows < 7 )
+        return false;
+    size_t globalsize[] = { img.cols-6, img.rows-6 };
+    ocl::Kernel fastKptKernel("FAST_findKeypoints", ocl::features2d::fast_oclsrc);
+    if (fastKptKernel.empty())
+        return false;
+    UMat kp1(1, maxKeypoints*2+1, CV_32S);
+    UMat ucounter1(kp1, Rect(0,0,1,1));
+    ucounter1.setTo(Scalar::all(0));
+    if( !fastKptKernel.args(ocl::KernelArg::ReadOnly(img),
+                            ocl::KernelArg::PtrReadWrite(kp1),
+                            maxKeypoints, threshold).run(2, globalsize, 0, true))
+        return false;
+    Mat mcounter;
+    ucounter1.copyTo(mcounter);
+    int i, counter = mcounter.at<int>(0);
+    counter = std::min(counter, maxKeypoints);
+    keypoints.clear();
+    if( counter == 0 )
+        return true;
+    if( !nonmax_suppression )
+    {
+        Mat m;
+        kp1(Rect(0, 0, counter*2+1, 1)).copyTo(m);
+        const Point* pt = (const Point*)(m.ptr<int>() + 1);
+        for( i = 0; i < counter; i++ )
+            keypoints.push_back(KeyPoint((float)pt[i].x, (float)pt[i].y, 7.f, -1, 1.f));
+    }
+    else
+    {
+        UMat kp2(1, maxKeypoints*3+1, CV_32S);
+        UMat ucounter2 = kp2(Rect(0,0,1,1));
+        ucounter2.setTo(Scalar::all(0));
+        ocl::Kernel fastNMSKernel("FAST_nonmaxSupression", ocl::features2d::fast_oclsrc);
+        if (fastNMSKernel.empty())
+            return false;
+        size_t globalsize_nms[] = { counter };
+        if( !fastNMSKernel.args(ocl::KernelArg::PtrReadOnly(kp1),
+                                ocl::KernelArg::PtrReadWrite(kp2),
+                                ocl::KernelArg::ReadOnly(img),
+                                counter, counter).run(1, globalsize_nms, 0, true))
+            return false;
+        Mat m2;
+        kp2(Rect(0, 0, counter*3+1, 1)).copyTo(m2);
+        Point3i* pt2 = (Point3i*)(m2.ptr<int>() + 1);
+        int newcounter = std::min(m2.at<int>(0), counter);
+        std::sort(pt2, pt2 + newcounter, cmp_pt<Point3i>());
+        for( i = 0; i < newcounter; i++ )
+            keypoints.push_back(KeyPoint((float)pt2[i].x, (float)pt2[i].y, 7.f, -1, (float)pt2[i].z));
+    }
+    return true;
+}
 void FAST(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool nonmax_suppression, int type)
 {
+  if( ocl::useOpenCL() && _img.isUMat() && type == FastFeatureDetector::TYPE_9_16 &&
+      ocl_FAST(_img, keypoints, threshold, nonmax_suppression, 10000))
+      return;
  switch(type) {
    case FastFeatureDetector::TYPE_5_8:
      FAST_t<8>(_img, keypoints, threshold, nonmax_suppression);
@@ -268,6 +351,7 @@ void FAST(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool
  }
 }
 void FAST(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool nonmax_suppression)
 {
    FAST(_img, keypoints, threshold, nonmax_suppression, FastFeatureDetector::TYPE_9_16);
@@ -285,10 +369,16 @@ FastFeatureDetector::FastFeatureDetector( int _threshold, bool _nonmaxSuppressio
 void FastFeatureDetector::detectImpl( InputArray _image, std::vector<KeyPoint>& keypoints, InputArray _mask ) const
 {
-    Mat image = _image.getMat(), mask = _mask.getMat(), grayImage = image;
+    Mat mask = _mask.getMat(), grayImage;
-    if( image.type() != CV_8U )
+    UMat ugrayImage;
-        cvtColor( image, grayImage, COLOR_BGR2GRAY );
+    _InputArray gray = _image;
-    FAST( grayImage, keypoints, threshold, nonmaxSuppression, type );
+    if( _image.type() != CV_8U )
+    {
+        _OutputArray ogray = _image.isUMat() ? _OutputArray(ugrayImage) : _OutputArray(grayImage);
+        cvtColor( _image, ogray, COLOR_BGR2GRAY );
+        gray = ogray;
+    }
+    FAST( gray, keypoints, threshold, nonmaxSuppression, type );
    KeyPointsFilter::runByPixelsMask( keypoints, mask );
 }

--- a/modules/features2d/src/opencl/fast.cl
+++ b/modules/features2d/src/opencl/fast.cl
+// OpenCL port of the FAST corner detector.
+// Copyright (C) 2014, Itseez Inc. See the license at http://opencv.org
+inline int cornerScore(__global const uchar* img, int step)
+{
+    int k, tofs, v = img[0], a0 = 0, b0;
+    int d[16];
+    #define LOAD2(idx, ofs) \
+        tofs = ofs; d[idx] = (short)(v - img[tofs]); d[idx+8] = (short)(v - img[-tofs])
+    LOAD2(0, 3);
+    LOAD2(1, -step+3);
+    LOAD2(2, -step*2+2);
+    LOAD2(3, -step*3+1);
+    LOAD2(4, -step*3);
+    LOAD2(5, -step*3-1);
+    LOAD2(6, -step*2-2);
+    LOAD2(7, -step-3);
+    #pragma unroll
+    for( k = 0; k < 16; k += 2 )
+    {
+        int a = min((int)d[(k+1)&15], (int)d[(k+2)&15]);
+        a = min(a, (int)d[(k+3)&15]);
+        a = min(a, (int)d[(k+4)&15]);
+        a = min(a, (int)d[(k+5)&15]);
+        a = min(a, (int)d[(k+6)&15]);
+        a = min(a, (int)d[(k+7)&15]);
+        a = min(a, (int)d[(k+8)&15]);
+        a0 = max(a0, min(a, (int)d[k&15]));
+        a0 = max(a0, min(a, (int)d[(k+9)&15]));
+    }
+    b0 = -a0;
+    #pragma unroll
+    for( k = 0; k < 16; k += 2 )
+    {
+        int b = max((int)d[(k+1)&15], (int)d[(k+2)&15]);
+        b = max(b, (int)d[(k+3)&15]);
+        b = max(b, (int)d[(k+4)&15]);
+        b = max(b, (int)d[(k+5)&15]);
+        b = max(b, (int)d[(k+6)&15]);
+        b = max(b, (int)d[(k+7)&15]);
+        b = max(b, (int)d[(k+8)&15]);
+        b0 = min(b0, max(b, (int)d[k]));
+        b0 = min(b0, max(b, (int)d[(k+9)&15]));
+    }
+    return -b0-1;
+}
+__kernel
+void FAST_findKeypoints(
+    __global const uchar * _img, int step, int img_offset,
+    int img_rows, int img_cols,
+    volatile __global int* kp_loc,
+    int max_keypoints, int threshold )
+{
+    int j = get_global_id(0) + 3;
+    int i = get_global_id(1) + 3;
+    if (i < img_rows - 3 && j < img_cols - 3)
+    {
+        __global const uchar* img = _img + mad24(i, step, j + img_offset);
+        int v = img[0], t0 = v - threshold, t1 = v + threshold;
+        int k, tofs, v0, v1;
+        int m0 = 0, m1 = 0;
+        #define UPDATE_MASK(idx, ofs) \
+            tofs = ofs; v0 = img[tofs]; v1 = img[-tofs]; \
+            m0 |= ((v0 < t0) << idx) | ((v1 < t0) << (8 + idx)); \
+            m1 |= ((v0 > t1) << idx) | ((v1 > t1) << (8 + idx))
+        UPDATE_MASK(0, 3);
+        if( (m0 | m1) == 0 )
+            return;
+        UPDATE_MASK(2, -step*2+2);
+        UPDATE_MASK(4, -step*3);
+        UPDATE_MASK(6, -step*2-2);
+        #define EVEN_MASK (1+4+16+64)
+        if( ((m0 | (m0 >> 8)) & EVEN_MASK) != EVEN_MASK &&
+            ((m1 | (m1 >> 8)) & EVEN_MASK) != EVEN_MASK )
+            return;
+        UPDATE_MASK(1, -step+3);
+        UPDATE_MASK(3, -step*3+1);
+        UPDATE_MASK(5, -step*3-1);
+        UPDATE_MASK(7, -step-3);
+        if( ((m0 | (m0 >> 8)) & 255) != 255 &&
+            ((m1 | (m1 >> 8)) & 255) != 255 )
+            return;
+        m0 |= m0 << 16;
+        m1 |= m1 << 16;
+        #define CHECK0(i) ((m0 & (511 << i)) == (511 << i))
+        #define CHECK1(i) ((m1 & (511 << i)) == (511 << i))
+        if( CHECK0(0) + CHECK0(1) + CHECK0(2) + CHECK0(3) +
+            CHECK0(4) + CHECK0(5) + CHECK0(6) + CHECK0(7) +
+            CHECK0(8) + CHECK0(9) + CHECK0(10) + CHECK0(11) +
+            CHECK0(12) + CHECK0(13) + CHECK0(14) + CHECK0(15) +
+            CHECK1(0) + CHECK1(1) + CHECK1(2) + CHECK1(3) +
+            CHECK1(4) + CHECK1(5) + CHECK1(6) + CHECK1(7) +
+            CHECK1(8) + CHECK1(9) + CHECK1(10) + CHECK1(11) +
+            CHECK1(12) + CHECK1(13) + CHECK1(14) + CHECK1(15) == 0 )
+            return;
+        {
+            int idx = atomic_inc(kp_loc);
+            if( idx < max_keypoints )
+            {
+                kp_loc[1 + 2*idx] = j;
+                kp_loc[2 + 2*idx] = i;
+            }
+        }
+    }
+}
+///////////////////////////////////////////////////////////////////////////
+// nonmaxSupression
+__kernel
+void FAST_nonmaxSupression(
+    __global const int* kp_in, volatile __global int* kp_out,
+    __global const uchar * _img, int step, int img_offset,
+    int rows, int cols, int counter, int max_keypoints)
+{
+    const int idx = get_global_id(0);
+    if (idx < counter)
+    {
+        int x = kp_in[1 + 2*idx];
+        int y = kp_in[2 + 2*idx];
+        __global const uchar* img = _img + mad24(y, step, x + img_offset);
+        int s = cornerScore(img, step);
+        if( (x < 4 || s > cornerScore(img-1, step)) +
+            (y < 4 || s > cornerScore(img-step, step)) != 2 )
+            return;
+        if( (x >= cols - 4 || s > cornerScore(img+1, step)) +
+            (y >= rows - 4 || s > cornerScore(img+step, step)) +
+            (x < 4 || y < 4 || s > cornerScore(img-step-1, step)) +
+            (x >= cols - 4 || y < 4 || s > cornerScore(img-step+1, step)) +
+            (x < 4 || y >= rows - 4 || s > cornerScore(img+step-1, step)) +
+            (x >= cols - 4 || y >= rows - 4 || s > cornerScore(img+step+1, step)) == 6)
+        {
+            int new_idx = atomic_inc(kp_out);
+            if( new_idx < max_keypoints )
+            {
+                kp_out[1 + 3*new_idx] = x;
+                kp_out[2 + 3*new_idx] = y;
+                kp_out[3 + 3*new_idx] = s;
+            }
+        }
+    }
+}
--- a/modules/features2d/src/opencl/orb.cl
+++ b/modules/features2d/src/opencl/orb.cl
+// OpenCL port of the ORB feature detector and descriptor extractor
+// Copyright (C) 2014, Itseez Inc. See the license at http://opencv.org
+//
+// The original code has been contributed by Peter Andreas Entschev, peter@entschev.com
+#define LAYERINFO_SIZE 1
+#define LAYERINFO_OFS 0
+#define KEYPOINT_SIZE 3
+#define ORIENTED_KEYPOINT_SIZE 4
+#define KEYPOINT_X 0
+#define KEYPOINT_Y 1
+#define KEYPOINT_Z 2
+#define KEYPOINT_ANGLE 3
+/////////////////////////////////////////////////////////////
+#ifdef ORB_RESPONSES
+__kernel void
+ORB_HarrisResponses(__global const uchar* imgbuf, int imgstep, int imgoffset0,
+                    __global const int* layerinfo, __global const int* keypoints,
+                    __global float* responses, int nkeypoints )
+{
+    int idx = get_global_id(0);
+    if( idx < nkeypoints )
+    {
+        __global const int* kpt = keypoints + idx*KEYPOINT_SIZE;
+        __global const int* layer = layerinfo + kpt[KEYPOINT_Z]*LAYERINFO_SIZE;
+        __global const uchar* img = imgbuf + imgoffset0 + layer[LAYERINFO_OFS] +
+            (kpt[KEYPOINT_Y] - blockSize/2)*imgstep + (kpt[KEYPOINT_X] - blockSize/2);
+        int i, j;
+        int a = 0, b = 0, c = 0;
+        for( i = 0; i < blockSize; i++, img += imgstep-blockSize )
+        {
+            for( j = 0; j < blockSize; j++, img++ )
+            {
+                int Ix = (img[1] - img[-1])*2 + img[-imgstep+1] - img[-imgstep-1] + img[imgstep+1] - img[imgstep-1];
+                int Iy = (img[imgstep] - img[-imgstep])*2 + img[imgstep-1] - img[-imgstep-1] + img[imgstep+1] - img[-imgstep+1];
+                a += Ix*Ix;
+                b += Iy*Iy;
+                c += Ix*Iy;
+            }
+        }
+        responses[idx] = ((float)a * b - (float)c * c - HARRIS_K * (float)(a + b) * (a + b))*scale_sq_sq;
+    }
+}
+#endif
+/////////////////////////////////////////////////////////////
+#ifdef ORB_ANGLES
+#define _DBL_EPSILON 2.2204460492503131e-16f
+#define atan2_p1 (0.9997878412794807f*57.29577951308232f)
+#define atan2_p3 (-0.3258083974640975f*57.29577951308232f)
+#define atan2_p5 (0.1555786518463281f*57.29577951308232f)
+#define atan2_p7 (-0.04432655554792128f*57.29577951308232f)
+inline float fastAtan2( float y, float x )
+{
+    float ax = fabs(x), ay = fabs(y);
+    float a, c, c2;
+    if( ax >= ay )
+    {
+        c = ay/(ax + _DBL_EPSILON);
+        c2 = c*c;
+        a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
+    }
+    else
+    {
+        c = ax/(ay + _DBL_EPSILON);
+        c2 = c*c;
+        a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
+    }
+    if( x < 0 )
+        a = 180.f - a;
+    if( y < 0 )
+        a = 360.f - a;
+    return a;
+}
+__kernel void
+ORB_ICAngle(__global const uchar* imgbuf, int imgstep, int imgoffset0,
+            __global const int* layerinfo, __global const int* keypoints,
+            __global float* responses, const __global int* u_max,
+            int nkeypoints, int half_k )
+{
+    int idx = get_global_id(0);
+    if( idx < nkeypoints )
+    {
+        __global const int* kpt = keypoints + idx*KEYPOINT_SIZE;
+        __global const int* layer = layerinfo + kpt[KEYPOINT_Z]*LAYERINFO_SIZE;
+        __global const uchar* center = imgbuf + imgoffset0 + layer[LAYERINFO_OFS] +
+            kpt[KEYPOINT_Y]*imgstep + kpt[KEYPOINT_X];
+        int u, v, m_01 = 0, m_10 = 0;
+        // Treat the center line differently, v=0
+        for( u = -half_k; u <= half_k; u++ )
+            m_10 += u * center[u];
+        // Go line by line in the circular patch
+        for( v = 1; v <= half_k; v++ )
+        {
+            // Proceed over the two lines
+            int v_sum = 0;
+            int d = u_max[v];
+            for( u = -d; u <= d; u++ )
+            {
+                int val_plus = center[u + v*imgstep], val_minus = center[u - v*imgstep];
+                v_sum += (val_plus - val_minus);
+                m_10 += u * (val_plus + val_minus);
+            }
+            m_01 += v * v_sum;
+        }
+        // we do not use OpenCL's atan2 intrinsic,
+        // because we want to get _exactly_ the same results as the CPU version
+        responses[idx] = fastAtan2((float)m_01, (float)m_10);
+    }
+}
+#endif
+/////////////////////////////////////////////////////////////
+#ifdef ORB_DESCRIPTORS
+__kernel void
+ORB_computeDescriptor(__global const uchar* imgbuf, int imgstep, int imgoffset0,
+                      __global const int* layerinfo, __global const int* keypoints,
+                      __global uchar* _desc, const __global int* pattern,
+                      int nkeypoints, int dsize )
+{
+    int idx = get_global_id(0);
+    if( idx < nkeypoints )
+    {
+        int i;
+        __global const int* kpt = keypoints + idx*ORIENTED_KEYPOINT_SIZE;
+        __global const int* layer = layerinfo + kpt[KEYPOINT_Z]*LAYERINFO_SIZE;
+        __global const uchar* center = imgbuf + imgoffset0 + layer[LAYERINFO_OFS] +
+                                kpt[KEYPOINT_Y]*imgstep + kpt[KEYPOINT_X];
+        float angle = as_float(kpt[KEYPOINT_ANGLE]);
+        angle *= 0.01745329251994329547f;
+        float sina = sin(angle);
+        float cosa = cos(angle);
+        __global uchar* desc = _desc + idx*dsize;
+        #define GET_VALUE(idx) \
+            center[mad24(convert_int_rte(pattern[(idx)*2] * sina + pattern[(idx)*2+1] * cosa), imgstep, \
+                        convert_int_rte(pattern[(idx)*2] * cosa - pattern[(idx)*2+1] * sina))]
+        for( i = 0; i < dsize; i++ )
+        {
+            int val;
+        #if WTA_K == 2
+            int t0, t1;
+            t0 = GET_VALUE(0); t1 = GET_VALUE(1);
+            val = t0 < t1;
+            t0 = GET_VALUE(2); t1 = GET_VALUE(3);
+            val |= (t0 < t1) << 1;
+            t0 = GET_VALUE(4); t1 = GET_VALUE(5);
+            val |= (t0 < t1) << 2;
+            t0 = GET_VALUE(6); t1 = GET_VALUE(7);
+            val |= (t0 < t1) << 3;
+            t0 = GET_VALUE(8); t1 = GET_VALUE(9);
+            val |= (t0 < t1) << 4;
+            t0 = GET_VALUE(10); t1 = GET_VALUE(11);
+            val |= (t0 < t1) << 5;
+            t0 = GET_VALUE(12); t1 = GET_VALUE(13);
+            val |= (t0 < t1) << 6;
+            t0 = GET_VALUE(14); t1 = GET_VALUE(15);
+            val |= (t0 < t1) << 7;
+            pattern += 16*2;
+        #elif WTA_K == 3
+            int t0, t1, t2;
+            t0 = GET_VALUE(0); t1 = GET_VALUE(1); t2 = GET_VALUE(2);
+            val = t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0);
+            t0 = GET_VALUE(3); t1 = GET_VALUE(4); t2 = GET_VALUE(5);
+            val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 2;
+            t0 = GET_VALUE(6); t1 = GET_VALUE(7); t2 = GET_VALUE(8);
+            val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 4;
+            t0 = GET_VALUE(9); t1 = GET_VALUE(10); t2 = GET_VALUE(11);
+            val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 6;
+            pattern += 12*2;
+        #elif WTA_K == 4
+            int t0, t1, t2, t3, k, val;
+            int a, b;
+            t0 = GET_VALUE(0); t1 = GET_VALUE(1);
+            t2 = GET_VALUE(2); t3 = GET_VALUE(3);
+            a = 0, b = 2;
+            if( t1 > t0 ) t0 = t1, a = 1;
+            if( t3 > t2 ) t2 = t3, b = 3;
+            k = t0 > t2 ? a : b;
+            val = k;
+            t0 = GET_VALUE(4); t1 = GET_VALUE(5);
+            t2 = GET_VALUE(6); t3 = GET_VALUE(7);
+            a = 0, b = 2;
+            if( t1 > t0 ) t0 = t1, a = 1;
+            if( t3 > t2 ) t2 = t3, b = 3;
+            k = t0 > t2 ? a : b;
+            val |= k << 2;
+            t0 = GET_VALUE(8); t1 = GET_VALUE(9);
+            t2 = GET_VALUE(10); t3 = GET_VALUE(11);
+            a = 0, b = 2;
+            if( t1 > t0 ) t0 = t1, a = 1;
+            if( t3 > t2 ) t2 = t3, b = 3;
+            k = t0 > t2 ? a : b;
+            val |= k << 4;
+            t0 = GET_VALUE(12); t1 = GET_VALUE(13);
+            t2 = GET_VALUE(14); t3 = GET_VALUE(15);
+            a = 0, b = 2;
+            if( t1 > t0 ) t0 = t1, a = 1;
+            if( t3 > t2 ) t2 = t3, b = 3;
+            k = t0 > t2 ? a : b;
+            val |= k << 6;
+            pattern += 16*2;
+        #else
+            #error "unknown/undefined WTA_K value; should be 2, 3 or 4"
+        #endif
+            desc[i] = (uchar)val;
+        }
+    }
+}
+#endif
--- a/modules/features2d/src/orb.cpp
+++ b/modules/features2d/src/orb.cpp