optimize SURF by

Inlining and customizing sampling functions to reduce memory traffic and compute Improve calcOrientation implementation. Using more efficient rounding routines. Removing unnecessary use of local memory

optimize SURF by
Inlining and customizing sampling functions to reduce memory traffic and compute Improve calcOrientation implementation. Using more efficient rounding routines. Removing unnecessary use of local memory
fffac2f0 · krodyush · 5d5527d0 · fffac2f0 · fffac2f0
Commit fffac2f0 authored Dec 17, 2013 by krodyush
Hide whitespace changes
Inline Side-by-side

Showing with 235 additions and 195 deletions

surf.cl modules/nonfree/src/opencl/surf.cl +220 -188

surf.ocl.cpp modules/nonfree/src/surf.ocl.cpp +15 -7

No files found.
--- a/modules/nonfree/src/opencl/surf.cl
+++ b/modules/nonfree/src/opencl/surf.cl
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2013, Intel Corporation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
@@ -66,8 +67,8 @@ uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols,
 uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow)
 {
 #ifdef DISABLE_IMAGE2D
-    int x = clamp(convert_int_rte(coord.x), 0, cols - 1);
+    int x = clamp(round(coord.x), 0, cols - 1);
-    int y = clamp(convert_int_rte(coord.y), 0, rows - 1);
+    int y = clamp(round(coord.y), 0, rows - 1);
    return img[elemPerRow * y + x];
 #else
    return (uchar)read_imageui(img, sam, coord).x;
@@ -98,6 +99,7 @@ __constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM
 #define CV_PI_F 3.14159265f
 #endif
 // Use integral image to calculate haar wavelets.
 // N = 2
 // for simple haar paatern
@@ -114,89 +116,10 @@ float icvCalcHaarPatternSum_2(
    F d = 0;
-    int2 dx1 = convert_int2_rte(ratio * src[0]);
+    int2 dx1 = convert_int2(round(ratio * src[0]));
-    int2 dy1 = convert_int2_rte(ratio * src[1]);
+    int2 dy1 = convert_int2(round(ratio * src[1]));
-    int2 dx2 = convert_int2_rte(ratio * src[2]);
+    int2 dx2 = convert_int2(round(ratio * src[2]));
-    int2 dy2 = convert_int2_rte(ratio * src[3]);
+    int2 dy2 = convert_int2(round(ratio * src[3]));
-    F t = 0;
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
-    d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
-    t = 0;
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
-    d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
-    return (float)d;
-}
-// N = 3
-float icvCalcHaarPatternSum_3(
-    IMAGE_INT32 sumTex,
-    __constant float4 *src,
-    int oldSize,
-    int newSize,
-    int y, int x,
-    int rows, int cols, int elemPerRow)
-{
-    float ratio = (float)newSize / oldSize;
-    F d = 0;
-    int4 dx1 = convert_int4_rte(ratio * src[0]);
-    int4 dy1 = convert_int4_rte(ratio * src[1]);
-    int4 dx2 = convert_int4_rte(ratio * src[2]);
-    int4 dy2 = convert_int4_rte(ratio * src[3]);
-    F t = 0;
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
-    d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
-    t = 0;
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
-    d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
-    t = 0;
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow );
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow );
-    d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z));
-    return (float)d;
-}
-// N = 4
-float icvCalcHaarPatternSum_4(
-    IMAGE_INT32 sumTex,
-    __constant float4 *src,
-    int oldSize,
-    int newSize,
-    int y, int x,
-    int rows, int cols, int elemPerRow)
-{
-    float ratio = (float)newSize / oldSize;
-    F d = 0;
-    int4 dx1 = convert_int4_rte(ratio * src[0]);
-    int4 dy1 = convert_int4_rte(ratio * src[1]);
-    int4 dx2 = convert_int4_rte(ratio * src[2]);
-    int4 dy2 = convert_int4_rte(ratio * src[3]);
    F t = 0;
    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
@@ -212,30 +135,12 @@ float icvCalcHaarPatternSum_4(
    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
    d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
-    t = 0;
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow );
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow );
-    d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z));
-    t = 0;
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy1.w), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy2.w), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy1.w), rows, cols, elemPerRow );
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy2.w), rows, cols, elemPerRow );
-    d += t * src[4].w / ((dx2.w - dx1.w) * (dy2.w - dy1.w));
    return (float)d;
 }
 ////////////////////////////////////////////////////////////////////////
 // Hessian
-__constant float4 c_DX[5] = { (float4)(0, 3, 6, 0), (float4)(2, 2, 2, 0), (float4)(3, 6, 9, 0), (float4)(7, 7, 7, 0), (float4)(1, -2, 1, 0) };
-__constant float4 c_DY[5] = { (float4)(2, 2, 2, 0), (float4)(0, 3, 6, 0), (float4)(7, 7, 7, 0), (float4)(3, 6, 9, 0), (float4)(1, -2, 1, 0) };
-__constant float4 c_DXY[5] = { (float4)(1, 5, 1, 5), (float4)(1, 1, 5, 5), (float4)(4, 8, 4, 8), (float4)(4, 4, 8, 8), (float4)(1, -1, -1, 1) };// Use integral image to calculate haar wavelets.
 __inline int calcSize(int octave, int layer)
 {
    /* Wavelet size at first layer of first octave. */
@@ -250,6 +155,24 @@ __inline int calcSize(int octave, int layer)
    return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
 }
+// Calculate a derivative in an axis-aligned direction (x or y).  The "plus1"
+// boxes contribute 1 * (area), and the "minus2" box contributes -2 * (area).
+// So the final computation is plus1a + plus1b - 2 * minus2.  The corners are
+// labeled A, B, C, and D, with A being the top left, B being top right, C
+// being bottom left, and D being bottom right.
+F calcAxisAlignedDerivative(
+        int plus1a_A, int plus1a_B, int plus1a_C, int plus1a_D, F plus1a_scale,
+        int plus1b_A, int plus1b_B, int plus1b_C, int plus1b_D, F plus1b_scale,
+        int minus2_A, int minus2_B, int minus2_C, int minus2_D, F minus2_scale)
+{
+    F plus1a = plus1a_A - plus1a_B - plus1a_C + plus1a_D;
+    F plus1b = plus1b_A - plus1b_B - plus1b_C + plus1b_D;
+    F minus2 = minus2_A - minus2_B - minus2_C + minus2_D;
+    return (plus1a / plus1a_scale -
+            2.0f * minus2 / minus2_scale +
+            plus1b / plus1b_scale);
+}
 //calculate targeted layer per-pixel determinant and trace with an integral image
 __kernel void icvCalcLayerDetAndTrace(
@@ -264,7 +187,7 @@ __kernel void icvCalcLayerDetAndTrace(
    int c_octave,
    int c_layer_rows,
    int sumTex_step
-)
+    )
 {
    det_step   /= sizeof(*det);
    trace_step /= sizeof(*trace);
@@ -288,16 +211,103 @@ __kernel void icvCalcLayerDetAndTrace(
    if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
    {
-        const float dx  = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
+        int x = j << c_octave;
-        const float dy  = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
+        int y = i << c_octave;
-        const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
+        float ratio = (float)size / 9;
+        // Precompute some commonly used values, which are used to offset
+        // texture coordinates in the integral image.
+        int r1 = round(ratio);
+        int r2 = round(ratio * 2.0f);
+        int r3 = round(ratio * 3.0f);
+        int r4 = round(ratio * 4.0f);
+        int r5 = round(ratio * 5.0f);
+        int r6 = round(ratio * 6.0f);
+        int r7 = round(ratio * 7.0f);
+        int r8 = round(ratio * 8.0f);
+        int r9 = round(ratio * 9.0f);
+        // Calculate the approximated derivative in the x-direction
+        F d = 0;
+        {
+            // Some of the pixels needed to compute the derivative are
+            // repeated, so we only don't duplicate the fetch here.
+            int t02 = read_sumTex( sumTex, sampler, (int2)(x, y + r2), c_img_rows, c_img_cols, sumTex_step );
+            int t07 = read_sumTex( sumTex, sampler, (int2)(x, y + r7), c_img_rows, c_img_cols, sumTex_step );
+            int t32 = read_sumTex( sumTex, sampler, (int2)(x + r3, y + r2), c_img_rows, c_img_cols, sumTex_step );
+            int t37 = read_sumTex( sumTex, sampler, (int2)(x + r3, y + r7), c_img_rows, c_img_cols, sumTex_step );
+            int t62 = read_sumTex( sumTex, sampler, (int2)(x + r6, y + r2), c_img_rows, c_img_cols, sumTex_step );
+            int t67 = read_sumTex( sumTex, sampler, (int2)(x + r6, y + r7), c_img_rows, c_img_cols, sumTex_step );
+            int t92 = read_sumTex( sumTex, sampler, (int2)(x + r9, y + r2), c_img_rows, c_img_cols, sumTex_step );
+            int t97 = read_sumTex( sumTex, sampler, (int2)(x + r9, y + r7), c_img_rows, c_img_cols, sumTex_step );
+            d = calcAxisAlignedDerivative(t02, t07, t32, t37, (r3) * (r7 - r2),
+                                          t62, t67, t92, t97, (r9 - r6) * (r7 - r2),
+                                          t32, t37, t62, t67, (r6 - r3) * (r7 - r2));
+        }
+        const float dx  = (float)d;
+        // Calculate the approximated derivative in the y-direction
+        d = 0;
+        {
+            // Some of the pixels needed to compute the derivative are
+            // repeated, so we only don't duplicate the fetch here.
+            int t20 = read_sumTex( sumTex, sampler, (int2)(x + r2, y), c_img_rows, c_img_cols, sumTex_step );
+            int t23 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r3), c_img_rows, c_img_cols, sumTex_step );
+            int t70 = read_sumTex( sumTex, sampler, (int2)(x + r7, y), c_img_rows, c_img_cols, sumTex_step );
+            int t73 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r3), c_img_rows, c_img_cols, sumTex_step );
+            int t26 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r6), c_img_rows, c_img_cols, sumTex_step );
+            int t76 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r6), c_img_rows, c_img_cols, sumTex_step );
+            int t29 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r9), c_img_rows, c_img_cols, sumTex_step );
+            int t79 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r9), c_img_rows, c_img_cols, sumTex_step );
+            d = calcAxisAlignedDerivative(t20, t23, t70, t73, (r7 - r2) * (r3),
+                                          t26, t29, t76, t79, (r7 - r2) * (r9 - r6),
+                                          t23, t26, t73, t76, (r7 - r2) * (r6 - r3));
+        }
+        const float dy  = (float)d;
+        // Calculate the approximated derivative in the xy-direction
+        d = 0;
+        {
+            // There's no saving us here, we just have to get all of the pixels in
+            // separate fetches
+            F t = 0;
+            t += read_sumTex( sumTex, sampler, (int2)(x + r1, y + r1), c_img_rows, c_img_cols, sumTex_step );
+            t -= read_sumTex( sumTex, sampler, (int2)(x + r1, y + r4), c_img_rows, c_img_cols, sumTex_step );
+            t -= read_sumTex( sumTex, sampler, (int2)(x + r4, y + r1), c_img_rows, c_img_cols, sumTex_step );
+            t += read_sumTex( sumTex, sampler, (int2)(x + r4, y + r4), c_img_rows, c_img_cols, sumTex_step );
+            d += t / ((r4 - r1) * (r4 - r1));
+            t = 0;
+            t += read_sumTex( sumTex, sampler, (int2)(x + r5, y + r1), c_img_rows, c_img_cols, sumTex_step );
+            t -= read_sumTex( sumTex, sampler, (int2)(x + r5, y + r4), c_img_rows, c_img_cols, sumTex_step );
+            t -= read_sumTex( sumTex, sampler, (int2)(x + r8, y + r1), c_img_rows, c_img_cols, sumTex_step );
+            t += read_sumTex( sumTex, sampler, (int2)(x + r8, y + r4), c_img_rows, c_img_cols, sumTex_step );
+            d -= t / ((r8 - r5) * (r4 - r1));
+            t = 0;
+            t += read_sumTex( sumTex, sampler, (int2)(x + r1, y + r5), c_img_rows, c_img_cols, sumTex_step );
+            t -= read_sumTex( sumTex, sampler, (int2)(x + r1, y + r8), c_img_rows, c_img_cols, sumTex_step );
+            t -= read_sumTex( sumTex, sampler, (int2)(x + r4, y + r5), c_img_rows, c_img_cols, sumTex_step );
+            t += read_sumTex( sumTex, sampler, (int2)(x + r4, y + r8), c_img_rows, c_img_cols, sumTex_step );
+            d -= t / ((r4 - r1) * (r8 - r5));
+            t = 0;
+            t += read_sumTex( sumTex, sampler, (int2)(x + r5, y + r5), c_img_rows, c_img_cols, sumTex_step );
+            t -= read_sumTex( sumTex, sampler, (int2)(x + r5, y + r8), c_img_rows, c_img_cols, sumTex_step );
+            t -= read_sumTex( sumTex, sampler, (int2)(x + r8, y + r5), c_img_rows, c_img_cols, sumTex_step );
+            t += read_sumTex( sumTex, sampler, (int2)(x + r8, y + r8), c_img_rows, c_img_cols, sumTex_step );
+            d += t / ((r8 - r5) * (r8 - r5));
+        }
+        const float dxy = (float)d;
        det  [j + margin + det_step   * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
        trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
    }
 }
 ////////////////////////////////////////////////////////////////////////
 // NONMAX
@@ -309,10 +319,10 @@ bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int ro
    float d = 0;
-    int dx1 = convert_int_rte(ratio * c_DM[0]);
+    int dx1 = round(ratio * c_DM[0]);
-    int dy1 = convert_int_rte(ratio * c_DM[1]);
+    int dy1 = round(ratio * c_DM[1]);
-    int dx2 = convert_int_rte(ratio * c_DM[2]);
+    int dx2 = round(ratio * c_DM[2]);
-    int dy2 = convert_int_rte(ratio * c_DM[3]);
+    int dy2 = round(ratio * c_DM[3]);
    float t = 0;
@@ -572,7 +582,7 @@ void icvFindMaximaInLayer(
 }
 // solve 3x3 linear system Ax=b for floating point input
-inline bool solve3x3_float(volatile __local  const float4 *A, volatile __local  const float *b, volatile __local  float *x)
+inline bool solve3x3_float(const float4 *A, const float *b, float *x)
 {
    float det = A[0].x * (A[1].y * A[2].z - A[1].z * A[2].y)
                - A[0].y * (A[1].x * A[2].z - A[1].z * A[2].x)
@@ -651,7 +661,7 @@ void icvInterpolateKeypoint(
    if (get_local_id(0) == 0 && get_local_id(1) == 0 && get_local_id(2) == 0)
    {
-        volatile __local  float dD[3];
+        float dD[3];
        //dx
        dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);
@@ -660,7 +670,7 @@ void icvInterpolateKeypoint(
        //ds
        dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);
-        volatile __local  float4 H[3];
+        float4 H[3];
        //dxx
        H[0].x = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
@@ -681,7 +691,7 @@ void icvInterpolateKeypoint(
        //dss
        H[2].z = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
-        volatile __local  float x[3];
+        float x[3];
        if (solve3x3_float(H, dD, x))
        {
@@ -711,7 +721,7 @@ void icvInterpolateKeypoint(
                sampled in a circle of radius 6s using wavelets of size 4s.
                We ensure the gradient wavelet size is even to ensure the
                wavelet pattern is balanced and symmetric around its center */
-                const int grad_wav_size = 2 * convert_int_rte(2.0f * s);
+                const int grad_wav_size = 2 * round(2.0f * s);
                // check when grad_wav_size is too big
                if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
@@ -737,9 +747,12 @@ void icvInterpolateKeypoint(
 ////////////////////////////////////////////////////////////////////////
 // Orientation
-#define ORI_SEARCH_INC 5
+#define ORI_WIN			 60
-#define ORI_WIN        60
+#define ORI_SAMPLES		 113
-#define ORI_SAMPLES    113
+// The distance between samples in the beginning of the the reduction
+#define ORI_RESPONSE_REDUCTION_WIDTH		 48
+#define ORI_RESPONSE_ARRAY_SIZE			     (ORI_RESPONSE_REDUCTION_WIDTH * 2)
 __constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
 __constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
@@ -833,12 +846,15 @@ void icvCalcOrientation(
    __global float* featureDir  = keypoints + ANGLE_ROW * keypoints_step;
-    volatile __local  float s_X[128];
+    __local  float s_X[ORI_SAMPLES];
-    volatile __local  float s_Y[128];
+    __local  float s_Y[ORI_SAMPLES];
-    volatile __local  float s_angle[128];
+    __local  float s_angle[ORI_SAMPLES];
-    volatile __local  float s_sumx[32 * 4];
+    // Need to allocate enough to make the reduction work without accessing
-    volatile __local  float s_sumy[32 * 4];
+    // past the end of the array.
+    __local  float s_sumx[ORI_RESPONSE_ARRAY_SIZE];
+    __local  float s_sumy[ORI_RESPONSE_ARRAY_SIZE];
+    __local  float s_mod[ORI_RESPONSE_ARRAY_SIZE];
    /* The sampling intervals and wavelet sized for selecting an orientation
    and building the keypoint descriptor are defined relative to 's' */
@@ -849,28 +865,60 @@ void icvCalcOrientation(
    sampled in a circle of radius 6s using wavelets of size 4s.
    We ensure the gradient wavelet size is even to ensure the
    wavelet pattern is balanced and symmetric around its center */
-    const int grad_wav_size = 2 * convert_int_rte(2.0f * s);
+    const int grad_wav_size = 2 * round(2.0f * s);
    // check when grad_wav_size is too big
    if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size)
        return;
    // Calc X, Y, angle and store it to shared memory
-    const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+    const int tid = get_local_id(0);
+    // Initialize values that are only used as part of the reduction later.
+    if (tid < ORI_RESPONSE_ARRAY_SIZE - ORI_LOCAL_SIZE) {
+        s_mod[tid + ORI_LOCAL_SIZE] = 0.0f;
+    }
-    float X = 0.0f, Y = 0.0f, angle = 0.0f;
+    float ratio = (float)grad_wav_size / 4;
-    if (tid < ORI_SAMPLES)
+    int r2 = round(ratio * 2.0);
+    int r4 = round(ratio * 4.0);
+    for (int i = tid; i < ORI_SAMPLES; i += ORI_LOCAL_SIZE )
    {
+        float X = 0.0f, Y = 0.0f, angle = 0.0f;
        const float margin = (float)(grad_wav_size - 1) / 2.0f;
-        const int x = convert_int_rte(featureX[get_group_id(0)] + c_aptX[tid] * s - margin);
+        const int x = round(featureX[get_group_id(0)] + c_aptX[i] * s - margin);
-        const int y = convert_int_rte(featureY[get_group_id(0)] + c_aptY[tid] * s - margin);
+        const int y = round(featureY[get_group_id(0)] + c_aptY[i] * s - margin);
        if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
-                x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
+            x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
        {
-            X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
-            Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
+            float apt = c_aptW[i];
+            // Compute the haar sum without fetching duplicate pixels.
+            float t00 = read_sumTex( sumTex, sampler, (int2)(x, y), c_img_rows, c_img_cols, sum_step);
+            float t02 = read_sumTex( sumTex, sampler, (int2)(x, y + r2), c_img_rows, c_img_cols, sum_step);
+            float t04 = read_sumTex( sumTex, sampler, (int2)(x, y + r4), c_img_rows, c_img_cols, sum_step);
+            float t20 = read_sumTex( sumTex, sampler, (int2)(x + r2, y), c_img_rows, c_img_cols, sum_step);
+            float t24 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r4), c_img_rows, c_img_cols, sum_step);
+            float t40 = read_sumTex( sumTex, sampler, (int2)(x + r4, y), c_img_rows, c_img_cols, sum_step);
+            float t42 = read_sumTex( sumTex, sampler, (int2)(x + r4, y + r2), c_img_rows, c_img_cols, sum_step);
+            float t44 = read_sumTex( sumTex, sampler, (int2)(x + r4, y + r4), c_img_rows, c_img_cols, sum_step);
+            F t = t00 - t04 - t20 + t24;
+            X -= t / ((r2) * (r4));
+            t = t20 - t24 - t40 + t44;
+            X += t / ((r4 - r2) * (r4));
+            t = t00 - t02 - t40 + t42;
+            Y += t / ((r2) * (r4));
+            t = t02 - t04 - t42 + t44;
+            Y -= t  / ((r4) * (r4 - r2));
+            X = apt*X;
+            Y = apt*Y;
            angle = atan2(Y, X);
@@ -879,76 +927,61 @@ void icvCalcOrientation(
            angle *= 180.0f / CV_PI_F;
        }
+        s_X[i] = X;
+        s_Y[i] = Y;
+        s_angle[i] = angle;
    }
-    s_X[tid] = X;
-    s_Y[tid] = Y;
-    s_angle[tid] = angle;
    barrier(CLK_LOCAL_MEM_FENCE);
    float bestx = 0, besty = 0, best_mod = 0;
+    float sumx = 0.0f, sumy = 0.0f;
+    const int dir = tid * ORI_SEARCH_INC;
+    #pragma unroll
+    for (int i = 0; i < ORI_SAMPLES; ++i) {
+        int angle = round(s_angle[i]);
-#pragma unroll
+        int d = abs(angle - dir);
-    for (int i = 0; i < 18; ++i)
-    {
-        const int dir = (i * 4 + get_local_id(1)) * ORI_SEARCH_INC;
-        volatile float sumx = 0.0f, sumy = 0.0f;
-        int d = abs(convert_int_rte(s_angle[get_local_id(0)]) - dir);
        if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
        {
-            sumx = s_X[get_local_id(0)];
+            sumx += s_X[i];
-            sumy = s_Y[get_local_id(0)];
+            sumy += s_Y[i];
        }
-        d = abs(convert_int_rte(s_angle[get_local_id(0) + 32]) - dir);
+    }
-        if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+    s_sumx[tid] = sumx;
-        {
+    s_sumy[tid] = sumy;
-            sumx += s_X[get_local_id(0) + 32];
+    s_mod[tid] = sumx*sumx + sumy*sumy;
-            sumy += s_Y[get_local_id(0) + 32];
+    barrier(CLK_LOCAL_MEM_FENCE);
-        }
-        d = abs(convert_int_rte(s_angle[get_local_id(0) + 64]) - dir);
-        if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
-        {
-            sumx += s_X[get_local_id(0) + 64];
-            sumy += s_Y[get_local_id(0) + 64];
-        }
-        d = abs(convert_int_rte(s_angle[get_local_id(0) + 96]) - dir);
-        if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
-        {
-            sumx += s_X[get_local_id(0) + 96];
-            sumy += s_Y[get_local_id(0) + 96];
-        }
-        reduce_32_sum(s_sumx + get_local_id(1) * 32, &sumx, get_local_id(0));
-        reduce_32_sum(s_sumy + get_local_id(1) * 32, &sumy, get_local_id(0));
-        const float temp_mod = sumx * sumx + sumy * sumy;
+    // This reduction searches for the longest wavelet response vector.  The first
-        if (temp_mod > best_mod)
+    // step uses all of the work items in the workgroup to narrow the search
-        {
+    // down to the three candidates.  It requires s_mod to have a few more
-            best_mod = temp_mod;
+    // elements alocated past the work-group size, which are pre-initialized to
-            bestx = sumx;
+    // 0.0f above.
-            besty = sumy;
+    for(int t = ORI_RESPONSE_REDUCTION_WIDTH; t >= 3; t /= 2) {
+        if (tid < t) {
+            if (s_mod[tid] < s_mod[tid + t]) {
+                s_mod[tid] = s_mod[tid + t];
+                s_sumx[tid] = s_sumx[tid + t];
+                s_sumy[tid] = s_sumy[tid + t];
+            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
-    if (get_local_id(0) == 0)
-    {
-        s_X[get_local_id(1)] = bestx;
-        s_Y[get_local_id(1)] = besty;
-        s_angle[get_local_id(1)] = best_mod;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_local_id(1) == 0 && get_local_id(0) == 0)
+    // Do the final reduction and write out the result.
+    if (tid == 0)
    {
        int bestIdx = 0;
-        if (s_angle[1] > s_angle[bestIdx])
+        // The loop above narrowed the search of the longest vector to three
+        // possibilities.  Pick the best here.
+        if (s_mod[1] > s_mod[bestIdx])
            bestIdx = 1;
-        if (s_angle[2] > s_angle[bestIdx])
+        if (s_mod[2] > s_mod[bestIdx])
            bestIdx = 2;
-        if (s_angle[3] > s_angle[bestIdx])
-            bestIdx = 3;
-        float kp_dir = atan2(s_Y[bestIdx], s_X[bestIdx]);
+        float kp_dir = atan2(s_sumy[bestIdx], s_sumx[bestIdx]);
        if (kp_dir < 0)
            kp_dir += 2.0f * CV_PI_F;
        kp_dir *= 180.0f / CV_PI_F;
@@ -961,7 +994,6 @@ void icvCalcOrientation(
    }
 }
 __kernel
 void icvSetUpright(
    __global float * keypoints,
@@ -1035,8 +1067,8 @@ inline float linearFilter(
    float out = 0.0f;
-    const int x1 = convert_int_rtn(x);
+    const int x1 = round(x);
-    const int y1 = convert_int_rtn(y);
+    const int y1 = round(y);
    const int x2 = x1 + 1;
    const int y2 = y1 + 1;

--- a/modules/nonfree/src/surf.ocl.cpp
+++ b/modules/nonfree/src/surf.ocl.cpp
@@ -46,6 +46,7 @@
 #ifdef HAVE_OPENCV_OCL
 #include <cstdio>
+#include <sstream>
 #include "opencl_kernels.hpp"
 using namespace cv;
@@ -55,18 +56,25 @@ namespace cv
 {
    namespace ocl
    {
+        // The number of degrees between orientation samples in calcOrientation
+        const static int ORI_SEARCH_INC = 5;
+        // The local size of the calcOrientation kernel
+        const static int ORI_LOCAL_SIZE = (360 / ORI_SEARCH_INC);
        static void openCLExecuteKernelSURF(Context *clCxt, const cv::ocl::ProgramEntry* source, string kernelName, size_t globalThreads[3],
            size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth)
        {
-            char optBuf [100] = {0};
+            std::stringstream optsStr;
-            char * optBufPtr = optBuf;
+            optsStr << "-D ORI_LOCAL_SIZE=" << ORI_LOCAL_SIZE << " ";
+            optsStr << "-D ORI_SEARCH_INC=" << ORI_SEARCH_INC << " ";
            cl_kernel kernel;
-            kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optBufPtr);
+            kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optsStr.str().c_str());
            size_t wave_size = queryWaveFrontSize(kernel);
            CV_Assert(clReleaseKernel(kernel) == CL_SUCCESS);
-            sprintf(optBufPtr, "-D WAVE_SIZE=%d", static_cast<int>(wave_size));
+            optsStr << "-D WAVE_SIZE=" << wave_size;
-            openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optBufPtr);
+            openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optsStr.str().c_str());
        }
    }
 }
@@ -594,8 +602,8 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
-    size_t localThreads[3]  = {32, 4, 1};
+    size_t localThreads[3]  = {ORI_LOCAL_SIZE, 1, 1};
-    size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1};
+    size_t globalThreads[3] = {nFeatures * localThreads[0], 1, 1};
    openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
 }