add SURF and HOG to ocl module

64e9cf5d · yao · a2df4909 · 64e9cf5d · 64e9cf5d · 64e9cf5d
Commit 64e9cf5d authored Aug 08, 2012 by yao
6 changed files
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -924,6 +925,154 @@ namespace cv
 		const oclMat& src3, double beta, oclMat& dst, int flags = 0);
 #endif

+        //////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
+        // HOG descriptor extractor and SVM-based object detector working on oclMat images.
+        // NOTE(review): the implementation lives in modules/ocl/src/hog.cpp, which is not
+        // visible from this header; the comments below only describe what the
+        // declarations themselves show.
+        struct CV_EXPORTS HOGDescriptor
+        {
+            // Default value of the win_sigma constructor argument.
+            // NOTE(review): presumably a sentinel that getWinSigma() resolves to a real value - confirm.
+            enum { DEFAULT_WIN_SIGMA = -1 };
+            // Default value of the nlevels constructor argument.
+            enum { DEFAULT_NLEVELS = 64 };
+            // Values accepted by the descr_format argument of getDescriptors().
+            enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
+
+            // Defaults: 64x128 window, 16x16 blocks, 8x8 block stride, 8x8 cells, 9 bins.
+            HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16),
+                          Size block_stride=Size(8, 8), Size cell_size=Size(8, 8),
+                          int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA,
+                          double threshold_L2hys=0.2, bool gamma_correction=true,
+                          int nlevels=DEFAULT_NLEVELS);
+
+            size_t getDescriptorSize() const;
+            size_t getBlockHistogramSize() const;
+
+            // Installs the detector coefficients (see free_coef / detector below).
+            void setSVMDetector(const vector<float>& detector);
+
+            // Built-in people detectors, returned as coefficient vectors
+            // suitable for setSVMDetector().
+            static vector<float> getDefaultPeopleDetector();
+            static vector<float> getPeopleDetector48x96();
+            static vector<float> getPeopleDetector64x128();
+
+            // Single-scale detection; results are written to found_locations as points.
+            void detect(const oclMat& img, vector<Point>& found_locations,
+                        double hit_threshold=0, Size win_stride=Size(),
+                        Size padding=Size());
+
+            // Multi-scale detection; results are written to found_locations as rectangles.
+            void detectMultiScale(const oclMat& img, vector<Rect>& found_locations,
+                                  double hit_threshold=0, Size win_stride=Size(),
+                                  Size padding=Size(), double scale0=1.05,
+                                  int group_threshold=2);
+
+            // Computes descriptors for img into an oclMat; descr_format is one of
+            // DESCR_FORMAT_ROW_BY_ROW / DESCR_FORMAT_COL_BY_COL.
+            void getDescriptors(const oclMat& img, Size win_stride,
+                                oclMat& descriptors,
+                                int descr_format=DESCR_FORMAT_COL_BY_COL);
+
+            // Public parameters, mirroring the constructor arguments of the same names
+            Size win_size;
+            Size block_size;
+            Size block_stride;
+            Size cell_size;
+            int nbins;
+            double win_sigma;
+            double threshold_L2hys;
+            bool gamma_correction;
+            int nlevels;
+
+        protected:
+            void computeBlockHistograms(const oclMat& img);
+            void computeGradient(const oclMat& img, oclMat& grad, oclMat& qangle);
+
+            double getWinSigma() const;
+            bool checkDetectorSize() const;
+
+            // Scalar and per-dimension (Size) overloads of the same helper
+            static int numPartsWithin(int size, int part_size, int stride);
+            static Size numPartsWithin(Size size, Size part_size, Size stride);
+
+            // Coefficients of the separating plane
+            float free_coef;
+            oclMat detector;
+
+            // Results of the last classification step
+            oclMat labels;
+            Mat labels_host;
+
+            // Results of the last histogram evaluation step
+            oclMat block_hists;
+
+            // Gradients computation results
+            oclMat grad, qangle;
+
+            // NOTE(review): presumably per-scale device buffers used by detectMultiScale() - confirm in hog.cpp
+            std::vector<oclMat> image_scales;
+        };
+
+        //! Speeded up robust features, port from GPU module.
+        ////////////////////////////////// SURF //////////////////////////////////////////
+        // SURF keypoint detector and descriptor extractor working on oclMat images.
+        class CV_EXPORTS SURF_OCL
+        {
+        public:
+            //! row indices of the device-side keypoint matrix (one feature per column)
+            enum KeypointLayout
+            {
+                X_ROW = 0,
+                Y_ROW,
+                LAPLACIAN_ROW,
+                OCTAVE_ROW,
+                SIZE_ROW,
+                ANGLE_ROW,
+                HESSIAN_ROW,
+                ROWS_COUNT
+            };
+
+            //! the default constructor
+            SURF_OCL();
+            //! the full constructor taking all the necessary parameters
+            explicit SURF_OCL(double _hessianThreshold, int _nOctaves=4,
+                int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false);
+
+            //! returns the descriptor size in float's (64 or 128)
+            int descriptorSize() const;
+            
+            //! upload host keypoints to device memory
+            void uploadKeypoints(const vector<cv::KeyPoint>& keypoints, oclMat& keypointsocl);
+            //! download keypoints from device to host memory
+            void downloadKeypoints(const oclMat& keypointsocl, vector<KeyPoint>& keypoints);
+
+            //! download descriptors from device to host memory
+            void downloadDescriptors(const oclMat& descriptorsocl, vector<float>& descriptors);
+
+            //! finds the keypoints using fast hessian detector used in SURF
+            //! supports CV_8UC1 images
+            //! keypoints will have nFeature cols and ROWS_COUNT (7) rows
+            //! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature
+            //! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature
+            //! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature
+            //! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature
+            //! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature
+            //! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature
+            //! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature
+            void operator()(const oclMat& img, const oclMat& mask, oclMat& keypoints);
+            //! finds the keypoints and computes their descriptors.
+            //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
+            void operator()(const oclMat& img, const oclMat& mask, oclMat& keypoints, oclMat& descriptors,
+                bool useProvidedKeypoints = false);
+
+            //! same as the two overloads above, with keypoints exchanged as a host-side vector<KeyPoint>
+            void operator()(const oclMat& img, const oclMat& mask, std::vector<KeyPoint>& keypoints);
+            void operator()(const oclMat& img, const oclMat& mask, std::vector<KeyPoint>& keypoints, oclMat& descriptors,
+                bool useProvidedKeypoints = false);
+
+            //! keypoints and descriptors both exchanged through host-side vectors
+            void operator()(const oclMat& img, const oclMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors,
+                bool useProvidedKeypoints = false);
+
+            //! NOTE(review): presumably releases the oclMat work buffers declared below - confirm in the implementation
+            void releaseMemory();
+
+            // SURF parameters
+            float hessianThreshold;
+            int nOctaves;
+            int nOctaveLayers;
+            bool extended;
+            bool upright;
+
+            //! max keypoints = min(keypointsRatio * img.size().area(), 65535)
+            float keypointsRatio;
+
+            // Device-side work buffers
+            oclMat sum, mask1, maskSum, intBuffer;
+
+            oclMat det, trace;
+
+            oclMat maxPosBuffer;
+
+        };
    }
 }
 #include "opencv2/ocl/matrix_operations.hpp"

--- a/modules/ocl/src/hog.cpp
+++ b/modules/ocl/src/hog.cpp
--- a/modules/ocl/src/kernels/nonfree_surf.cl
+++ b/modules/ocl/src/kernels/nonfree_surf.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+// printf support (AMD vendor extension); the printImage debug kernel relies on it.
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+// 32-bit atomics on __global memory; the maxima kernels call atomic_inc on a global counter.
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+// 32-bit atomics on __local memory.
+// NOTE(review): no __local atomic appears in the part of the file reviewed here - confirm it is needed.
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+
+// Dynamically select the precision used for intermediate accumulations.
+// F is double when __ATI__ or __NVIDIA__ is defined (neither is defined in this
+// file; presumably they come from the host-side build options) and float otherwise.
+// double is an optional feature in OpenCL 1.x, so the matching fp64 extension is
+// enabled, when the compiler advertises one, before the type is used.
+
+#if defined (__ATI__) || defined (__NVIDIA__)
+#if defined (cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64 : enable
+#endif
+#define F double
+#else
+#define F float
+#endif
+
+// Image read mode shared by the image reads in this file: integer (unnormalized)
+// pixel coordinates, out-of-range coordinates clamped to the image edge, and
+// nearest-neighbour filtering (no interpolation).
+__constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
+
+// Single-precision pi
+#define CV_PI_F 3.14159265f
+
+// Debug helper: print a greyscale image to show the image layout.
+// Every work-item prints its global (x, y) id followed by the first channel of
+// the pixel stored at that position.
+__kernel void printImage(image2d_t img)
+{
+    // get_global_id() returns size_t, which does not match the %d conversion on
+    // 64-bit devices; narrow explicitly before handing the values to printf.
+    const int x = (int)get_global_id(0);
+    const int y = (int)get_global_id(1);
+
+    // Read through the file-level sampler: the sampler-less read_imageui
+    // overload is only available from OpenCL 1.2 onwards.
+    printf("(%d, %d) - %3d \n",
+        x,
+        y,
+        (int)read_imageui(img, sampler, (int2)(x, y)).x
+        );
+}
+
+// Haar wavelet response computed from an integral image - two-box pattern.
+// Each row of src describes one box as {x1, y1, x2, y2, weight}, given for a
+// pattern of oldSize pixels. The corners are rescaled to newSize, rounded, and
+// applied as offsets from (x, y) in sumTex. The result is the sum over all boxes
+// of weight * boxSum / boxArea.
+float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int oldSize, int newSize, int y, int x)
+{
+    const float scale = (float)newSize / oldSize;
+
+    F response = 0;
+
+#pragma unroll
+    for (int box = 0; box < 2; ++box)
+    {
+        // Rescaled box corners, rounded to the nearest integer offset
+        const int left   = convert_int_rte(scale * src[box][0]);
+        const int top    = convert_int_rte(scale * src[box][1]);
+        const int right  = convert_int_rte(scale * src[box][2]);
+        const int bottom = convert_int_rte(scale * src[box][3]);
+
+        const int area = (right - left) * (bottom - top);
+
+        // Four-corner lookup in the integral image, accumulated in precision F
+        F boxSum = 0;
+        boxSum += read_imageui(sumTex, sampler, (int2)(x + left,  y + top   )).x;
+        boxSum -= read_imageui(sumTex, sampler, (int2)(x + left,  y + bottom)).x;
+        boxSum -= read_imageui(sumTex, sampler, (int2)(x + right, y + top   )).x;
+        boxSum += read_imageui(sumTex, sampler, (int2)(x + right, y + bottom)).x;
+
+        response += boxSum * src[box][4] / area;
+    }
+
+    return (float)response;
+}
+
+// Haar wavelet response computed from an integral image - three-box pattern.
+// Each row of src describes one box as {x1, y1, x2, y2, weight}, given for a
+// pattern of oldSize pixels. The corners are rescaled to newSize, rounded, and
+// applied as offsets from (x, y) in sumTex. The result is the sum over all boxes
+// of weight * boxSum / boxArea.
+float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int oldSize, int newSize, int y, int x)
+{
+    const float scale = (float)newSize / oldSize;
+
+    F response = 0;
+
+#pragma unroll
+    for (int box = 0; box < 3; ++box)
+    {
+        // Rescaled box corners, rounded to the nearest integer offset
+        const int left   = convert_int_rte(scale * src[box][0]);
+        const int top    = convert_int_rte(scale * src[box][1]);
+        const int right  = convert_int_rte(scale * src[box][2]);
+        const int bottom = convert_int_rte(scale * src[box][3]);
+
+        const int area = (right - left) * (bottom - top);
+
+        // Four-corner lookup in the integral image, accumulated in precision F
+        F boxSum = 0;
+        boxSum += read_imageui(sumTex, sampler, (int2)(x + left,  y + top   )).x;
+        boxSum -= read_imageui(sumTex, sampler, (int2)(x + left,  y + bottom)).x;
+        boxSum -= read_imageui(sumTex, sampler, (int2)(x + right, y + top   )).x;
+        boxSum += read_imageui(sumTex, sampler, (int2)(x + right, y + bottom)).x;
+
+        response += boxSum * src[box][4] / area;
+    }
+
+    return (float)response;
+}
+
+// Haar wavelet response computed from an integral image - four-box pattern.
+// Each row of src describes one box as {x1, y1, x2, y2, weight}, given for a
+// pattern of oldSize pixels. The corners are rescaled to newSize, rounded, and
+// applied as offsets from (x, y) in sumTex. The result is the sum over all boxes
+// of weight * boxSum / boxArea.
+float icvCalcHaarPatternSum_4(image2d_t sumTex, __constant float src[4][5], int oldSize, int newSize, int y, int x)
+{
+    const float scale = (float)newSize / oldSize;
+
+    F response = 0;
+
+#pragma unroll
+    for (int box = 0; box < 4; ++box)
+    {
+        // Rescaled box corners, rounded to the nearest integer offset
+        const int left   = convert_int_rte(scale * src[box][0]);
+        const int top    = convert_int_rte(scale * src[box][1]);
+        const int right  = convert_int_rte(scale * src[box][2]);
+        const int bottom = convert_int_rte(scale * src[box][3]);
+
+        const int area = (right - left) * (bottom - top);
+
+        // Four-corner lookup in the integral image, accumulated in precision F
+        F boxSum = 0;
+        boxSum += read_imageui(sumTex, sampler, (int2)(x + left,  y + top   )).x;
+        boxSum -= read_imageui(sumTex, sampler, (int2)(x + left,  y + bottom)).x;
+        boxSum -= read_imageui(sumTex, sampler, (int2)(x + right, y + top   )).x;
+        boxSum += read_imageui(sumTex, sampler, (int2)(x + right, y + bottom)).x;
+
+        response += boxSum * src[box][4] / area;
+    }
+
+    return (float)response;
+}
+
+////////////////////////////////////////////////////////////////////////
+// Hessian
+
+// Box patterns consumed by icvCalcHaarPatternSum_3 / _4. Every row is
+// {x1, y1, x2, y2, weight}, with corners given for a 9-pixel pattern (the
+// oldSize that icvCalcLayerDetAndTrace passes along with these tables).
+// c_DX and c_DY hold three boxes weighted 1, -2, 1; c_DXY holds four boxes
+// weighted 1, -1, -1, 1.
+__constant float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };
+__constant float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };
+__constant float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };
+
+// Wavelet side length, in pixels, for the given layer of the given octave:
+// (9 + 6 * layer) doubled once per octave.
+__inline int calcSize(int octave, int layer)
+{
+    // Side length at the first layer of the first octave.
+    const int firstLayerSize = 9;
+
+    // Growth in side length from one layer to the next. It is kept even so
+    // that all sizes within an octave share the same parity, which keeps the
+    // layers above and below a sample aligned when its neighbours are examined.
+    const int sizeStepPerLayer = 6;
+
+    const int sizeInFirstOctave = firstLayerSize + sizeStepPerLayer * layer;
+
+    return sizeInFirstOctave << octave;
+}
+
+
+// For one octave, evaluates the box-filter responses dx, dy and dxy from the
+// integral image at every sample of every layer, and stores
+// dx * dy - 0.81 * dxy^2 in det and dx + dy in trace.
+// The y dimension of the grid is shared by (c_nOctaveLayers + 2) layers.
+__kernel void icvCalcLayerDetAndTrace(
+    image2d_t sumTex, // input integral image
+    __global float * det,      // output Determinant
+    __global float * trace,    // output trace
+    int det_step,     // the step of det in bytes
+    int trace_step,   // the step of trace in bytes
+    int c_img_rows,
+    int c_img_cols,
+    int c_nOctaveLayers,
+    int c_octave,
+    int c_layer_rows
+    )
+{
+    // Byte strides -> element strides
+    det_step   /= sizeof(*det);
+    trace_step /= sizeof(*trace);
+
+    // Split the group index along y into (row of groups inside a layer, layer)
+    const int groupsPerLayer = get_num_groups(1) / (c_nOctaveLayers + 2);
+    const int groupRow       = get_group_id(1) % groupsPerLayer;
+    const int layer          = get_group_id(1) / groupsPerLayer;
+
+    const int col = get_local_id(0) + get_group_id(0) * get_local_size(0);
+    const int row = get_local_id(1) + groupRow * get_local_size(1);
+
+    const int size = calcSize(c_octave, layer);
+
+    // Number of sample positions at which the whole filter fits in the image
+    const int sampleRows = 1 + ((c_img_rows - size) >> c_octave);
+    const int sampleCols = 1 + ((c_img_cols - size) >> c_octave);
+
+    // Nothing to do when the filter is larger than the image or this
+    // work-item lies outside the sampled area
+    if (size > c_img_rows || size > c_img_cols || row >= sampleRows || col >= sampleCols)
+        return;
+
+    // Position of this sample in the integral image
+    const int sum_y = row << c_octave;
+    const int sum_x = col << c_octave;
+
+    const float dx  = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, sum_y, sum_x);
+    const float dy  = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, sum_y, sum_x);
+    const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, sum_y, sum_x);
+
+    // Results are shifted by half the filter size (in samples) inside the layer
+    const int margin = (size >> 1) >> c_octave;
+    const int outCol = col + margin;
+    const int outRow = layer * c_layer_rows + row + margin;
+
+    det  [outCol + det_step   * outRow] = dx * dy - 0.81f * dxy * dxy;
+    trace[outCol + trace_step * outRow] = dx + dy;
+}
+
+
+////////////////////////////////////////////////////////////////////////
+// NONMAX
+
+// Single box {x1, y1, x2, y2, weight} spanning the whole 9-pixel pattern.
+__constant float c_DM[5] = {0, 0, 9, 9, 1};
+
+// Mask test for a candidate: sums maskSumTex over the size x size window whose
+// corner is at (sum_j, sum_i) using a four-corner lookup, divides by the window
+// area and returns true when the result is at least 0.5.
+// NOTE(review): maskSumTex is presumably the integral image of the mask - confirm on the host side.
+bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size)
+{
+    const float scale = (float)size / 9.0f;
+
+    // Rescaled window corners, rounded to the nearest integer offset
+    const int left   = convert_int_rte(scale * c_DM[0]);
+    const int top    = convert_int_rte(scale * c_DM[1]);
+    const int right  = convert_int_rte(scale * c_DM[2]);
+    const int bottom = convert_int_rte(scale * c_DM[3]);
+
+    float windowSum = 0;
+
+    windowSum += read_imageui(maskSumTex, sampler, (int2)(sum_j + left,  sum_i + top   )).x;
+    windowSum -= read_imageui(maskSumTex, sampler, (int2)(sum_j + left,  sum_i + bottom)).x;
+    windowSum -= read_imageui(maskSumTex, sampler, (int2)(sum_j + right, sum_i + top   )).x;
+    windowSum += read_imageui(maskSumTex, sampler, (int2)(sum_j + right, sum_i + bottom)).x;
+
+    const float windowMean = windowSum * c_DM[4] / ((right - left) * (bottom - top));
+
+    return (windowMean >= 0.5f);
+}
+
+// Non-maximal suppression to further filter the candidates from the previous step.
+// Each work-item loads the determinant at its (i, j) position from three
+// consecutive layers into local memory. Interior work-items whose value exceeds
+// c_hessianThreshold, passes the mask test and is strictly greater than all 26
+// neighbours append (j, i, layer, laplacian sign) to maxPosBuffer.
+// Work-groups overlap by two work-items per dimension: the outermost ring of a
+// group only loads data and never reports a maximum.
+__kernel
+    void icvFindMaximaInLayer_withmask(
+    __global const float * det, 
+    __global const float * trace, 
+    __global int4 * maxPosBuffer, 
+    volatile __global unsigned int* maxCounter,
+    int counter_offset,
+    int det_step,     // the step of det in bytes
+    int trace_step,   // the step of trace in bytes
+    int c_img_rows,
+    int c_img_cols,
+    int c_nOctaveLayers,
+    int c_octave,
+    int c_layer_rows,
+    int c_layer_cols,
+    int c_max_candidates,
+    float c_hessianThreshold,
+    image2d_t maskSumTex
+    )
+{
+    // Three slices of one value per work-item; 768 floats therefore limits the
+    // work-group to at most 256 work-items.
+    volatile __local  float N9[768]; // threads.x * threads.y * 3
+
+    // Byte strides -> element strides; select this call's counter
+    det_step   /= sizeof(*det);
+    trace_step /= sizeof(*trace);
+    maxCounter += counter_offset;
+
+    // Determine the indices
+    // (the y dimension of the grid is shared by c_nOctaveLayers layers)
+    const int gridDim_y  = get_num_groups(1) / c_nOctaveLayers;
+    const int blockIdx_y = get_group_id(1)   % gridDim_y;
+    const int blockIdx_z = get_group_id(1)   / gridDim_y;
+
+    // Layer 0 is skipped: every processed layer needs a layer below it
+    const int layer = blockIdx_z + 1;
+
+    const int size = calcSize(c_octave, layer);
+
+    // Ignore pixels without a 3x3x3 neighbourhood in the layer above
+    const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
+
+    // Groups advance by (local size - 2) so that neighbouring groups overlap
+    const int j = get_local_id(0) + get_group_id(0) * (get_local_size(0) - 2) + margin - 1;
+    const int i = get_local_id(1) + blockIdx_y * (get_local_size(1) - 2) + margin - 1;
+
+    // Local-memory layout: slice [0, zoff) holds layer - 1, [zoff, 2 * zoff)
+    // holds layer, [2 * zoff, 3 * zoff) holds layer + 1. localLin addresses this
+    // work-item inside the middle slice. Global coordinates are clamped before
+    // the loads.
+    const int zoff = get_local_size(0) * get_local_size(1);
+    const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
+    N9[localLin - zoff] = 
+        det[det_step * 
+        (c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y
+        + min(max(j, 0), c_img_cols - 1)];                            // x
+    N9[localLin       ] = 
+        det[det_step * 
+        (c_layer_rows * (layer    ) + min(max(i, 0), c_img_rows - 1)) // y
+        + min(max(j, 0), c_img_cols - 1)];                            // x
+    N9[localLin + zoff] = 
+        det[det_step * 
+        (c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y
+        + min(max(j, 0), c_img_cols - 1)];                            // x
+
+    // All loads must be visible before any work-item reads its neighbours
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (i < c_layer_rows - margin 
+        && j < c_layer_cols - margin
+        && get_local_id(0) > 0 
+        && get_local_id(0) < get_local_size(0) - 1
+        && get_local_id(1) > 0 
+        && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
+        )
+    {
+        float val0 = N9[localLin];
+
+        if (val0 > c_hessianThreshold)
+        {
+            // Coordinates for the start of the wavelet in the sum image. There
+            // is some integer division involved, so don't try to simplify this
+            // (cancel out sampleStep) without checking the result is the same
+            const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
+            const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
+
+            if (within_check(maskSumTex, sum_i, sum_j, size))
+            {
+                // Check to see if we have a max (in its 26 neighbours)
+                // First group: layer - 1, second: same layer, third: layer + 1
+                const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff]
+                &&                   val0 > N9[localLin     - get_local_size(0) - zoff]
+                &&                   val0 > N9[localLin + 1 - get_local_size(0) - zoff]
+                &&                   val0 > N9[localLin - 1                     - zoff]
+                &&                   val0 > N9[localLin                         - zoff]
+                &&                   val0 > N9[localLin + 1                     - zoff]
+                &&                   val0 > N9[localLin - 1 + get_local_size(0) - zoff]
+                &&                   val0 > N9[localLin     + get_local_size(0) - zoff]
+                &&                   val0 > N9[localLin + 1 + get_local_size(0) - zoff]
+
+                &&                   val0 > N9[localLin - 1 - get_local_size(0)]
+                &&                   val0 > N9[localLin     - get_local_size(0)]
+                &&                   val0 > N9[localLin + 1 - get_local_size(0)]
+                &&                   val0 > N9[localLin - 1                    ]
+                &&                   val0 > N9[localLin + 1                    ]
+                &&                   val0 > N9[localLin - 1 + get_local_size(0)]
+                &&                   val0 > N9[localLin     + get_local_size(0)]
+                &&                   val0 > N9[localLin + 1 + get_local_size(0)]
+
+                &&                   val0 > N9[localLin - 1 - get_local_size(0) + zoff]
+                &&                   val0 > N9[localLin     - get_local_size(0) + zoff]
+                &&                   val0 > N9[localLin + 1 - get_local_size(0) + zoff]
+                &&                   val0 > N9[localLin - 1                     + zoff]
+                &&                   val0 > N9[localLin                         + zoff]
+                &&                   val0 > N9[localLin + 1                     + zoff]
+                &&                   val0 > N9[localLin - 1 + get_local_size(0) + zoff]
+                &&                   val0 > N9[localLin     + get_local_size(0) + zoff]
+                &&                   val0 > N9[localLin + 1 + get_local_size(0) + zoff]
+                ;
+
+                if(condmax)
+                {
+                    // Reserve an output slot. The counter keeps growing past
+                    // c_max_candidates; such candidates are simply not stored.
+                    unsigned int ind = atomic_inc(maxCounter);
+
+                    if (ind < c_max_candidates)
+                    {
+                        // Sign of the trace at the candidate position
+                        const int laplacian = (int) copysign(1.0f, trace[trace_step* (layer * c_layer_rows + i) + j]);
+
+                        maxPosBuffer[ind] = (int4)(j, i, layer, laplacian);
+                    }
+                }
+            }
+        }
+    }
+}
+
+// Non-maximal suppression without a mask.
+// Each work-item loads the determinant at its (i, j) position from three
+// consecutive layers into local memory. Interior work-items whose value exceeds
+// c_hessianThreshold and is strictly greater than all 26 neighbours append
+// (j, i, layer, laplacian sign) to maxPosBuffer.
+// Work-groups overlap by two work-items per dimension: the outermost ring of a
+// group only loads data and never reports a maximum.
+__kernel
+    void icvFindMaximaInLayer(
+    __global const float * det,
+    __global const float * trace,
+    __global int4 * maxPosBuffer,
+    volatile __global unsigned int* maxCounter,
+    int counter_offset,
+    int det_step,     // the step of det in bytes
+    int trace_step,   // the step of trace in bytes
+    int c_img_rows,
+    int c_img_cols,
+    int c_nOctaveLayers,
+    int c_octave,
+    int c_layer_rows,
+    int c_layer_cols,
+    int c_max_candidates,
+    float c_hessianThreshold
+    )
+{
+    // Three slices of one value per work-item; 768 floats therefore limits the
+    // work-group to at most 256 work-items.
+    volatile __local  float N9[768]; // threads.x * threads.y * 3
+
+    // Byte strides -> element strides; select this call's counter
+    det_step   /= sizeof(*det);
+    trace_step /= sizeof(*trace);
+    maxCounter += counter_offset;
+
+    // Determine the indices
+    // (the y dimension of the grid is shared by c_nOctaveLayers layers)
+    const int gridDim_y  = get_num_groups(1) / c_nOctaveLayers;
+    const int blockIdx_y = get_group_id(1)   % gridDim_y;
+    const int blockIdx_z = get_group_id(1)   / gridDim_y;
+
+    // Layer 0 is skipped: every processed layer needs a layer below it
+    const int layer = blockIdx_z + 1;
+
+    // Ignore pixels without a 3x3x3 neighbourhood in the layer above
+    const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
+
+    // Groups advance by (local size - 2) so that neighbouring groups overlap
+    const int j = get_local_id(0) + get_group_id(0) * (get_local_size(0) - 2) + margin - 1;
+    const int i = get_local_id(1) + blockIdx_y      * (get_local_size(1) - 2) + margin - 1;
+
+    // Local-memory layout: slice [0, zoff) holds layer - 1, [zoff, 2 * zoff)
+    // holds layer, [2 * zoff, 3 * zoff) holds layer + 1. localLin addresses this
+    // work-item inside the middle slice.
+    const int zoff     = get_local_size(0) * get_local_size(1);
+    const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
+
+    // Clamped global coordinates of this work-item inside the current layer
+    const int l_x = min(max(j, 0), c_img_cols - 1);
+    const int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1);
+
+    N9[localLin - zoff] =
+        det[det_step * (l_y - c_layer_rows) + l_x];
+    N9[localLin       ] =
+        det[det_step * (l_y               ) + l_x];
+    N9[localLin + zoff] =
+        det[det_step * (l_y + c_layer_rows) + l_x];
+
+    // All loads must be visible before any work-item reads its neighbours
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (i < c_layer_rows - margin
+        && j < c_layer_cols - margin
+        && get_local_id(0) > 0
+        && get_local_id(0) < get_local_size(0) - 1
+        && get_local_id(1) > 0
+        && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
+        )
+    {
+        float val0 = N9[localLin];
+        if (val0 > c_hessianThreshold)
+        {
+            // Check to see if we have a max (in its 26 neighbours)
+            // First group: layer - 1, second: same layer, third: layer + 1
+            const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff]
+            &&                   val0 > N9[localLin     - get_local_size(0) - zoff]
+            &&                   val0 > N9[localLin + 1 - get_local_size(0) - zoff]
+            &&                   val0 > N9[localLin - 1                     - zoff]
+            &&                   val0 > N9[localLin                         - zoff]
+            &&                   val0 > N9[localLin + 1                     - zoff]
+            &&                   val0 > N9[localLin - 1 + get_local_size(0) - zoff]
+            &&                   val0 > N9[localLin     + get_local_size(0) - zoff]
+            &&                   val0 > N9[localLin + 1 + get_local_size(0) - zoff]
+
+            &&                   val0 > N9[localLin - 1 - get_local_size(0)]
+            &&                   val0 > N9[localLin     - get_local_size(0)]
+            &&                   val0 > N9[localLin + 1 - get_local_size(0)]
+            &&                   val0 > N9[localLin - 1                    ]
+            &&                   val0 > N9[localLin + 1                    ]
+            &&                   val0 > N9[localLin - 1 + get_local_size(0)]
+            &&                   val0 > N9[localLin     + get_local_size(0)]
+            &&                   val0 > N9[localLin + 1 + get_local_size(0)]
+
+            &&                   val0 > N9[localLin - 1 - get_local_size(0) + zoff]
+            &&                   val0 > N9[localLin     - get_local_size(0) + zoff]
+            &&                   val0 > N9[localLin + 1 - get_local_size(0) + zoff]
+            &&                   val0 > N9[localLin - 1                     + zoff]
+            &&                   val0 > N9[localLin                         + zoff]
+            &&                   val0 > N9[localLin + 1                     + zoff]
+            &&                   val0 > N9[localLin - 1 + get_local_size(0) + zoff]
+            &&                   val0 > N9[localLin     + get_local_size(0) + zoff]
+            &&                   val0 > N9[localLin + 1 + get_local_size(0) + zoff]
+            ;
+
+            if(condmax)
+            {
+                // Reserve an output slot. The counter keeps growing past
+                // c_max_candidates; such candidates are simply not stored.
+                unsigned int ind = atomic_inc(maxCounter);
+
+                if (ind < c_max_candidates)
+                {
+                    // Sign of the trace at the candidate position
+                    const int laplacian = (int) copysign(1.0f, trace[trace_step* (layer * c_layer_rows + i) + j]);
+
+                    maxPosBuffer[ind] = (int4)(j, i, layer, laplacian);
+                }
+            }
+        }
+    }
+}
+
+// Solves the 3x3 linear system A * x = b for floating point input using
+// Cramer's rule. Returns false, leaving x untouched, when the determinant of A
+// is exactly zero; otherwise writes the solution to x and returns true.
+inline bool solve3x3_float(volatile __local  const float A[3][3], volatile __local  const float b[3], volatile __local  float x[3])
+{
+    const float detA = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1])
+        - A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0])
+        + A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]);
+
+    // Singular system: no solution is produced
+    if (detA == 0)
+        return false;
+
+    const F invDetA = 1.0 / detA;
+
+    // Each component is the determinant of A with one column replaced by b,
+    // scaled by 1 / det(A)
+
+    // column 0 replaced
+    x[0] = invDetA *
+        (b[0]    * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
+        A[0][1] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) +
+        A[0][2] * (b[1]    * A[2][1] - A[1][1] * b[2]   ));
+
+    // column 1 replaced
+    x[1] = invDetA *
+        (A[0][0] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) -
+        b[0]    * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
+        A[0][2] * (A[1][0] * b[2]    - b[1]    * A[2][0]));
+
+    // column 2 replaced
+    x[2] = invDetA *
+        (A[0][0] * (A[1][1] * b[2]    - b[1]    * A[2][1]) -
+        A[0][1] * (A[1][0] * b[2]    - b[1]    * A[2][0]) +
+        b[0]    * (A[1][0] * A[2][1] - A[1][1] * A[2][0]));
+
+    return true;
+}
+
+// Row indices of the keypoint matrix (one feature per column). The values
+// mirror the host-side enum SURF_OCL::KeypointLayout in ocl.hpp and must be
+// kept in sync with it.
+#define X_ROW          0
+#define Y_ROW          1
+#define LAPLACIAN_ROW  2
+#define OCTAVE_ROW     3
+#define SIZE_ROW       4
+#define ANGLE_ROW      5
+#define HESSIAN_ROW    6
+#define ROWS_COUNT     7
+
+////////////////////////////////////////////////////////////////////////
+// INTERPOLATION
+__kernel 
+    void icvInterpolateKeypoint(
+    __global const float * det, 
+    __global const int4 * maxPosBuffer,
+    __global float * keypoints,
+    volatile __global unsigned int * featureCounter,
+    int det_step,
+    int keypoints_step,
+    int c_img_rows,
+    int c_img_cols,
+    int c_octave,
+    int c_layer_rows,
+    int c_max_features
+    )
+{
+    det_step /= sizeof(*det);
+    keypoints_step /= sizeof(*keypoints);
+    __global float * featureX       = keypoints + X_ROW * keypoints_step;
+    __global float * featureY       = keypoints + Y_ROW * keypoints_step;
+    __global int * featureLaplacian = (__global int *)keypoints + LAPLACIAN_ROW * keypoints_step;
+    __global int * featureOctave    = (__global int *)keypoints + OCTAVE_ROW * keypoints_step;
+    __global float * featureSize    = keypoints + SIZE_ROW * keypoints_step;
+    __global float * featureHessian = keypoints + HESSIAN_ROW * keypoints_step;
+
+    const int4 maxPos = maxPosBuffer[get_group_id(0)];
+
+    const int j = maxPos.x - 1 + get_local_id(0);
+    const int i = maxPos.y - 1 + get_local_id(1);
+    const int layer = maxPos.z - 1 + get_local_id(2);
+
+    volatile __local  float N9[3][3][3];
+
+    N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] = 
+        det[det_step * (c_layer_rows * layer + i) + j];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (get_local_id(0) == 0 && get_local_id(1) == 0 && get_local_id(2) == 0)
+    {
+        volatile __local  float dD[3];
+
+        //dx
+        dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);
+        //dy
+        dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);
+        //ds
+        dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);
+
+        volatile __local  float H[3][3];
+
+        //dxx
+        H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
+        //dxy
+        H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);
+        //dxs
+        H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);
+        //dyx = dxy
+        H[1][0] = H[0][1];
+        //dyy
+        H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];
+        //dys
+        H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);
+        //dsx = dxs
+        H[2][0] = H[0][2];
+        //dsy = dys
+        H[2][1] = H[1][2];
+        //dss
+        H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
+
+        volatile __local  float x[3];
+
+        if (solve3x3_float(H, dD, x))
+        {
+            if (fabs(x[0]) <= 1.f && fabs(x[1]) <= 1.f && fabs(x[2]) <= 1.f)
+            {
+                // if the step is within the interpolation region, perform it
+
+                const int size = calcSize(c_octave, maxPos.z);
+
+                const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;
+                const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;
+
+                const float center_i = sum_i + (float)(size - 1) / 2;
+                const float center_j = sum_j + (float)(size - 1) / 2;
+
+                const float px = center_j + x[0] * (1 << c_octave);
+                const float py = center_i + x[1] * (1 << c_octave);
+
+                const int ds = size - calcSize(c_octave, maxPos.z - 1);
+                const float psize = round(size + x[2] * ds);
+
+                /* The sampling intervals and wavelet sized for selecting an orientation
+                and building the keypoint descriptor are defined relative to 's' */
+                const float s = psize * 1.2f / 9.0f;
+
+                /* To find the dominant orientation, the gradients in x and y are
+                sampled in a circle of radius 6s using wavelets of size 4s.
+                We ensure the gradient wavelet size is even to ensure the
+                wavelet pattern is balanced and symmetric around its center */
+                const int grad_wav_size = 2 * convert_int_rte(2.0f * s);
+
+                // check when grad_wav_size is too big
+                if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
+                {
+                    // Get a new feature index.
+                    unsigned int ind = atomic_inc(featureCounter);
+
+                    if (ind < c_max_features)
+                    {
+                        featureX[ind] = px;
+                        featureY[ind] = py;
+                        featureLaplacian[ind] = maxPos.w;
+                        featureOctave[ind] = c_octave;
+                        featureSize[ind] = psize;
+                        featureHessian[ind] = N9[1][1][1];
+                    }
+                } // grad_wav_size check
+            } // If the subpixel interpolation worked
+        }
+    } // If this is thread 0.
+}
+
+////////////////////////////////////////////////////////////////////////
+// Orientation
+
+// Spacing, in degrees, between the candidate directions tested by icvCalcOrientation.
+#define ORI_SEARCH_INC 5
+// Angular width, in degrees, of the window centred on each candidate direction;
+// a sample contributes when its gradient angle is less than ORI_WIN / 2 away.
+#define ORI_WIN        60
+// Number of sample offsets, i.e. the length of c_aptX, c_aptY and c_aptW.
+#define ORI_SAMPLES    113
+
+// Sample offsets around the keypoint used by icvCalcOrientation, in units of the scale 's':
+// entry k is the point (c_aptX[k], c_aptY[k]). The table lists every integer point with
+// x*x + y*y <= 36, ordered by increasing x and then increasing y.
+__constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
+__constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
+// Weight applied to the Haar responses of each sample in icvCalcOrientation; entry k belongs
+// to the offset (c_aptX[k], c_aptY[k]).
+// NOTE(review): the values look like a 2-D Gaussian centred on the keypoint - confirm the
+// sigma against the reference SURF implementation before regenerating this table.
+__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 
+    0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 
+    0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 
+    0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 
+    0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 
+    0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 
+    0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 
+    0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 
+    0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 
+    0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 
+    0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 
+    0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 
+    0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 
+    0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 
+    0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f,
+    0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 
+    0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 
+    0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 
+    0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f,
+    0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 
+    0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 
+    0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f,
+    0.001707611023448408f, 0.001455130288377404f};
+
+// Haar wavelet patterns passed to icvCalcHaarPatternSum_2 for the x (c_NX) and y (c_NY)
+// gradient responses; the calls pass 4 as the size the patterns are expressed in.
+// NOTE(review): each row is presumably {x0, y0, x1, y1, weight} for one box of the
+// wavelet - confirm against icvCalcHaarPatternSum_2.
+__constant float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
+__constant float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
+
+// Sums 32 per-work-item values through local memory.
+//
+//   data              - 32-element local scratch slice owned by the calling group of 32 work-items
+//   partial_reduction - this work-item's contribution
+//   tid               - index of the work-item inside its group of 32 (0..31)
+//
+// On return data[0] holds the sum of all 32 contributions. partial_reduction is
+// passed by value, so callers must read the total from data[0].
+//
+// Every step of the tree is separated by a barrier instead of relying on the
+// work-items executing in lockstep, which OpenCL does not guarantee. Because of
+// those barriers the function must be called by every work-item of the work-group.
+void reduce_32_sum(volatile __local  float * data, float partial_reduction, int tid)
+{
+    data[tid] = partial_reduction;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // pairwise tree: 32 -> 16 -> 8 -> 4 -> 2 -> 1
+    for (int stride = 16; stride > 0; stride >>= 1)
+    {
+        if (tid < stride)
+            data[tid] += data[tid + stride];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+}
+
+// Computes the dominant gradient orientation, in degrees, of every keypoint.
+//
+// One work-group handles the keypoint stored in column get_group_id(0) of the
+// keypoints matrix. The local-memory indexing assumes a 32x4 work-group
+// (get_local_size(0) == 32, get_local_size(1) == 4).
+//
+//   sumTex         - image sampled through icvCalcHaarPatternSum_2 (presumably the integral image)
+//   keypoints      - keypoint matrix; rows X_ROW, Y_ROW and SIZE_ROW are read, ANGLE_ROW is written
+//   keypoints_step - pitch of one keypoints row, in bytes
+//   c_img_rows     - image height used for the bounds checks
+//   c_img_cols     - image width used for the bounds checks
+__kernel
+    void icvCalcOrientation(
+    image2d_t sumTex,
+    __global float * keypoints,
+    int keypoints_step,
+    int c_img_rows,
+    int c_img_cols
+    )
+{
+    keypoints_step /= sizeof(*keypoints);
+    __global float* featureX    = keypoints + X_ROW * keypoints_step;
+    __global float* featureY    = keypoints + Y_ROW * keypoints_step;
+    __global float* featureSize = keypoints + SIZE_ROW * keypoints_step;
+    __global float* featureDir  = keypoints + ANGLE_ROW * keypoints_step;
+
+    // per-sample weighted responses and their angle (degrees)
+    volatile __local  float s_X[128];
+    volatile __local  float s_Y[128];
+    volatile __local  float s_angle[128];
+
+    // reduction scratch: one 32-float slice per work-group row
+    volatile __local  float s_sumx[32 * 4];
+    volatile __local  float s_sumy[32 * 4];
+
+    const int lane = get_local_id(0);
+    const int row  = get_local_id(1);
+    const int kp   = get_group_id(0);
+
+    /* The sampling intervals and wavelet sizes for selecting an orientation
+    and building the keypoint descriptor are defined relative to 's' */
+    const float s = featureSize[kp] * 1.2f / 9.0f;
+
+    /* To find the dominant orientation, the gradients in x and y are
+    sampled in a circle of radius 6s using wavelets of size 4s.
+    The wavelet size is forced to be even so that the wavelet pattern
+    is balanced and symmetric around its center */
+    const int grad_wav_size = 2 * convert_int_rte(2.0f * s);
+
+    // The wavelet does not fit into the image at all. The condition depends only
+    // on the keypoint, so the whole work-group leaves together.
+    if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size)
+        return;
+
+    // Each of the first ORI_SAMPLES work-items evaluates one sample point.
+    const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+
+    float X = 0.0f, Y = 0.0f, angle = 0.0f;
+
+    if (tid < ORI_SAMPLES)
+    {
+        const float margin = (float)(grad_wav_size - 1) / 2.0f;
+        const int x = convert_int_rte(featureX[kp] + c_aptX[tid] * s - margin);
+        const int y = convert_int_rte(featureY[kp] + c_aptY[tid] * s - margin);
+
+        // samples whose wavelet would leave the image keep a zero response
+        if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
+            x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
+        {
+            X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x);
+            Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x);
+
+            angle = atan2(Y, X);
+            if (angle < 0)
+                angle += 2.0f * CV_PI_F;
+            angle *= 180.0f / CV_PI_F;
+        }
+    }
+    s_X[tid] = X;
+    s_Y[tid] = Y;
+    s_angle[tid] = angle;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    float bestx = 0, besty = 0, best_mod = 0;
+
+    // 18 iterations x 4 rows = 72 candidate directions, ORI_SEARCH_INC degrees apart.
+#pragma unroll
+    for (int i = 0; i < 18; ++i)
+    {
+        const int dir = (i * 4 + row) * ORI_SEARCH_INC;
+
+        // Each work-item gathers the (up to) four samples lane, lane + 32,
+        // lane + 64 and lane + 96 that fall inside the window around 'dir'.
+        float sumx = 0.0f, sumy = 0.0f;
+        for (int k = 0; k < 4; ++k)
+        {
+            const int idx = lane + 32 * k;
+            const int d = abs(convert_int_rte(s_angle[idx]) - dir);
+            if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+            {
+                sumx += s_X[idx];
+                sumy += s_Y[idx];
+            }
+        }
+
+        reduce_32_sum(s_sumx + row * 32, sumx, lane);
+        reduce_32_sum(s_sumy + row * 32, sumy, lane);
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // reduce_32_sum receives the partial sum by value, so the window total
+        // has to be fetched from element 0 of this row's scratch slice.
+        sumx = s_sumx[row * 32];
+        sumy = s_sumy[row * 32];
+
+        const float temp_mod = sumx * sumx + sumy * sumy;
+        if (temp_mod > best_mod)
+        {
+            best_mod = temp_mod;
+            bestx = sumx;
+            besty = sumy;
+        }
+
+        // the scratch slices are overwritten by the next iteration
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // publish the best window of each row (s_X / s_Y / s_angle are reused as scratch)
+    if (lane == 0)
+    {
+        s_X[row] = bestx;
+        s_Y[row] = besty;
+        s_angle[row] = best_mod;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // a single work-item picks the strongest of the four rows and stores the angle
+    if (row == 0 && lane == 0)
+    {
+        int bestIdx = 0;
+
+        if (s_angle[1] > s_angle[bestIdx])
+            bestIdx = 1;
+        if (s_angle[2] > s_angle[bestIdx])
+            bestIdx = 2;
+        if (s_angle[3] > s_angle[bestIdx])
+            bestIdx = 3;
+
+        float kp_dir = atan2(s_Y[bestIdx], s_X[bestIdx]);
+        if (kp_dir < 0)
+            kp_dir += 2.0f * CV_PI_F;
+        kp_dir *= 180.0f / CV_PI_F;
+
+        featureDir[kp] = kp_dir;
+    }
+}
+
+#undef ORI_SEARCH_INC
+#undef ORI_WIN
+#undef ORI_SAMPLES
+
+////////////////////////////////////////////////////////////////////////
+// Descriptors
+
+// Side length, in samples, of the square patch the descriptor is built from.
+#define PATCH_SZ 20
+
+// PATCH_SZ x PATCH_SZ table of per-sample weights stored row by row: calc_dx_dy scales the
+// gradient of sample (xIndex, yIndex) by c_DW[yIndex * PATCH_SZ + xIndex].
+// NOTE(review): the values are symmetric and peak at the patch centre, so this is
+// presumably a 2-D Gaussian window - confirm the sigma before regenerating it.
+__constant float c_DW[PATCH_SZ * PATCH_SZ] =
+{
+    3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f,
+    8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
+    1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
+    3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
+    5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
+    9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
+    0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
+    0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
+    0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
+    0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
+    0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
+    0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
+    0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
+    0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
+    9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
+    5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
+    3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
+    1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
+    8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
+    3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
+};
+
+// utility for linear filter
+// Fetches one source pixel for window coordinates (i = row, j = column).
+// The window coordinates are shifted by win_offset, rotated by the angle given
+// through cos_dir / sin_dir and translated to (centerX, centerY) before the image
+// is read through the file-level sampler.
+inline uchar readerGet(
+    image2d_t src, 
+    const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, 
+    int i, int j
+    )
+{
+    // window-relative column / row of the requested sample
+    const float col = win_offset + j;
+    const float row = win_offset + i;
+
+    const float2 pos = (float2)(
+        centerX + col * cos_dir + row * sin_dir,
+        centerY - col * sin_dir + row * cos_dir);
+
+    return (uchar)read_imageui(src, sampler, pos).x;
+}
+
+// Bilinearly interpolates the rotated window at the fractional window position (x, y).
+// Both coordinates are first moved back by half a sample; the four surrounding
+// integer positions are then read with readerGet and blended by their distance
+// to the requested point.
+inline float linearFilter(
+    image2d_t src, 
+    const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,  
+    float y, float x
+    )
+{
+    const float fx = x - 0.5f;
+    const float fy = y - 0.5f;
+
+    // integer cell enclosing (fx, fy); convert_int_rtn rounds towards -infinity
+    const int col0 = convert_int_rtn(fx);
+    const int row0 = convert_int_rtn(fy);
+    const int col1 = col0 + 1;
+    const int row1 = row0 + 1;
+
+    // interpolation weights of the two columns and the two rows
+    const float wcol0 = col1 - fx;
+    const float wcol1 = fx - col0;
+    const float wrow0 = row1 - fy;
+    const float wrow1 = fy - row0;
+
+    float acc = 0.0f;
+    acc = acc + readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, row0, col0) * (wcol0 * wrow0);
+    acc = acc + readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, row0, col1) * (wcol1 * wrow0);
+    acc = acc + readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, row1, col0) * (wcol0 * wrow1);
+    acc = acc + readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, row1, col1) * (wcol1 * wrow1);
+
+    return acc;
+}
+
+// Computes the weighted gradient samples of one 5x5 descriptor sub-region.
+//
+// get_group_id(0) selects the keypoint and get_group_id(1) the sub-region
+// (column = id & 3, row = id >> 2). Every work-item loads one interpolated
+// sample into s_PATCH[local y][local x]; work-items with both local ids below 5
+// then write one dx / dy value to s_dx_bin / s_dy_bin at index y * 5 + x.
+// The function contains a barrier, so the whole work-group has to call it.
+// NOTE(review): the 6x6 s_PATCH suggests a 6x6 work-group - confirm at the launch site.
+void calc_dx_dy(
+    image2d_t imgTex,
+    volatile __local  float s_dx_bin[25],
+    volatile __local  float s_dy_bin[25],
+    volatile __local  float s_PATCH[6][6],
+    __global const float* featureX, 
+    __global const float* featureY, 
+    __global const float* featureSize, 
+    __global const float* featureDir
+    )
+{
+    const int kp = get_group_id(0);
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+
+    const float centerX = featureX[kp];
+    const float centerY = featureY[kp];
+    // keypoint orientation, converted from degrees to radians
+    const float descriptor_dir = featureDir[kp] * (float)(CV_PI_F / 180.0f);
+
+    // 's' is the unit in which the sampling window is expressed
+    const float s = featureSize[kp] * 1.2f / 9.0f;
+
+    // side length, in pixels, of the window extracted around the keypoint
+    const int win_size = (int)((PATCH_SZ + 1) * s);
+
+    float cos_dir;
+    const float sin_dir = sincos(descriptor_dir, &cos_dir);
+
+    // moves the window origin so that the window is centred on the keypoint
+    const float win_offset = -(float)(win_size - 1) / 2;
+
+    // position of this work-item's sample inside the whole patch
+    const int region = get_group_id(1);
+    const int xIndex = (region & 3) * 5 + lx;
+    const int yIndex = (region >> 2) * 5 + ly;
+
+    // the same position expressed in window pixels
+    const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
+    const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
+
+    s_PATCH[ly][lx] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // only the inner 5x5 work-items own a gradient sample
+    if (lx >= 5 || ly >= 5)
+        return;
+
+    const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
+
+    // 2x2 neighbourhood whose top-left corner is this work-item's sample
+    const float p00 = s_PATCH[ly    ][lx    ];
+    const float p01 = s_PATCH[ly    ][lx + 1];
+    const float p10 = s_PATCH[ly + 1][lx    ];
+    const float p11 = s_PATCH[ly + 1][lx + 1];
+
+    const int tid = ly * 5 + lx;
+    s_dx_bin[tid] = (p01 - p00 + p11 - p10) * dw;
+    s_dy_bin[tid] = (p10 - p00 + p11 - p01) * dw;
+}
+// Sums four 25-element local arrays at the same time; on return element 0 of
+// each array holds that array's total. The other elements are left holding
+// intermediate partial sums.
+//
+//   sdata1..sdata4 - local arrays with at least 25 valid elements each
+//   tid            - linear index of the calling work-item
+//
+// Each step of the reduction is separated by a barrier instead of relying on the
+// work-items running in lockstep, which OpenCL does not guarantee. Because of
+// those barriers the function has to be called by every work-item of the
+// work-group; work-items with tid >= 9 only take part in the synchronisation.
+void reduce_sum25(
+    volatile __local  float* sdata1, 
+    volatile __local  float* sdata2, 
+    volatile __local  float* sdata3, 
+    volatile __local  float* sdata4, 
+    int tid
+    )
+{
+    // first step: fold elements 16..24 onto 0..8, leaving 16 partial sums
+    if (tid < 9)
+    {
+        sdata1[tid] += sdata1[tid + 16];
+        sdata2[tid] += sdata2[tid + 16];
+        sdata3[tid] += sdata3[tid + 16];
+        sdata4[tid] += sdata4[tid + 16];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // pairwise tree: 16 -> 8 -> 4 -> 2 -> 1
+    for (int stride = 8; stride > 0; stride >>= 1)
+    {
+        if (tid < stride)
+        {
+            sdata1[tid] += sdata1[tid + stride];
+            sdata2[tid] += sdata2[tid + stride];
+            sdata3[tid] += sdata3[tid + stride];
+            sdata4[tid] += sdata4[tid + stride];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+}
+
+// Computes the 4 values (sum dx, sum dy, sum |dx|, sum |dy|) of one descriptor
+// sub-region in the 64-element descriptor layout.
+//
+// get_group_id(0) selects the keypoint / descriptor row, get_group_id(1) the
+// sub-region; the four results are stored at offset get_group_id(1) * 4 of the row.
+//
+//   imgTex           - source image sampled by calc_dx_dy
+//   descriptors      - output descriptor matrix
+//   keypoints        - keypoint matrix (rows X_ROW, Y_ROW, SIZE_ROW and ANGLE_ROW are read)
+//   descriptors_step - pitch of one descriptors row, in bytes
+//   keypoints_step   - pitch of one keypoints row, in bytes
+//
+// All barriers are kept in uniform control flow: OpenCL requires a barrier to be
+// reached by every work-item of the work-group, so none of them may sit inside
+// the "tid < 25" branches.
+__kernel 
+    void compute_descriptors64(
+    image2d_t imgTex,
+    volatile __global float * descriptors, 
+    __global const float * keypoints,
+    int descriptors_step,
+    int keypoints_step
+    )
+{
+    descriptors_step /= sizeof(float);
+    keypoints_step   /= sizeof(float);
+
+    __global const float * featureX    = keypoints + X_ROW * keypoints_step;
+    __global const float * featureY    = keypoints + Y_ROW * keypoints_step;
+    __global const float * featureSize = keypoints + SIZE_ROW * keypoints_step;
+    __global const float * featureDir  = keypoints + ANGLE_ROW * keypoints_step;
+
+    // one dx / dy pair per sample point of the 5x5 sub-region, plus their magnitudes
+    volatile __local  float sdx[25];
+    volatile __local  float sdy[25];
+    volatile __local  float sdxabs[25];
+    volatile __local  float sdyabs[25];
+    volatile __local  float s_PATCH[6][6];
+
+    calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+
+    if (tid < 25)
+    {
+        sdxabs[tid] = fabs(sdx[tid]); // |dx| array
+        sdyabs[tid] = fabs(sdy[tid]); // |dy| array
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Called by every work-item; reduce_sum25 itself only lets the work-items
+    // with tid < 9 touch the arrays.
+    reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // write dx, dy, |dx|, |dy|
+    if (tid == 0)
+    {
+        volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
+
+        descriptors_block[0] = sdx[0];
+        descriptors_block[1] = sdy[0];
+        descriptors_block[2] = sdxabs[0];
+        descriptors_block[3] = sdyabs[0];
+    }
+}
+// Computes the 8 values of one descriptor sub-region in the 128-element
+// descriptor layout: the dx sums split by the sign of dy, followed by the dy
+// sums split by the sign of dx.
+//
+// get_group_id(0) selects the keypoint / descriptor row, get_group_id(1) the
+// sub-region; the eight results are stored at offset get_group_id(1) * 8 of the row.
+//
+//   imgTex           - source image sampled by calc_dx_dy
+//   descriptors      - output descriptor matrix
+//   keypoints        - keypoint matrix (rows X_ROW, Y_ROW, SIZE_ROW and ANGLE_ROW are read)
+//   descriptors_step - pitch of one descriptors row, in bytes
+//   keypoints_step   - pitch of one keypoints row, in bytes
+//
+// All barriers are kept in uniform control flow: OpenCL requires a barrier to be
+// reached by every work-item of the work-group, so none of them may sit inside
+// the "tid < 25" branches.
+__kernel 
+    void compute_descriptors128(
+    image2d_t imgTex,
+    __global volatile float * descriptors, 
+    __global float * keypoints,
+    int descriptors_step,
+    int keypoints_step
+    )
+{
+    descriptors_step /= sizeof(*descriptors);
+    keypoints_step   /= sizeof(*keypoints);
+
+    __global float * featureX   = keypoints + X_ROW * keypoints_step;
+    __global float * featureY   = keypoints + Y_ROW * keypoints_step;
+    __global float* featureSize = keypoints + SIZE_ROW * keypoints_step;
+    __global float* featureDir  = keypoints + ANGLE_ROW * keypoints_step;
+
+    // one dx / dy pair per sample point of the 5x5 sub-region
+    volatile __local  float sdx[25];
+    volatile __local  float sdy[25];
+
+    // scratch arrays that are reduced to the sub-region sums
+    volatile __local  float sd1[25];
+    volatile __local  float sd2[25];
+    volatile __local  float sdabs1[25];
+    volatile __local  float sdabs2[25];
+    volatile __local  float s_PATCH[6][6];
+
+    calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+
+    volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
+
+    // ---- first half: dx, routed by the sign of dy ----
+    if (tid < 25)
+    {
+        if (sdy[tid] >= 0)
+        {
+            sd1[tid] = sdx[tid];
+            sdabs1[tid] = fabs(sdx[tid]);
+            sd2[tid] = 0;
+            sdabs2[tid] = 0;
+        }
+        else
+        {
+            sd1[tid] = 0;
+            sdabs1[tid] = 0;
+            sd2[tid] = sdx[tid];
+            sdabs2[tid] = fabs(sdx[tid]);
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Called by every work-item; reduce_sum25 itself only lets the work-items
+    // with tid < 9 touch the arrays.
+    reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
+    if (tid == 0)
+    {
+        descriptors_block[0] = sd1[0];
+        descriptors_block[1] = sdabs1[0];
+        descriptors_block[2] = sd2[0];
+        descriptors_block[3] = sdabs2[0];
+    }
+    // the scratch arrays are refilled next; wait until the totals have been consumed
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // ---- second half: dy, routed by the sign of dx ----
+    if (tid < 25)
+    {
+        if (sdx[tid] >= 0)
+        {
+            sd1[tid] = sdy[tid];
+            sdabs1[tid] = fabs(sdy[tid]);
+            sd2[tid] = 0;
+            sdabs2[tid] = 0;
+        }
+        else
+        {
+            sd1[tid] = 0;
+            sdabs1[tid] = 0;
+            sd2[tid] = sdy[tid];
+            sdabs2[tid] = fabs(sdy[tid]);
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
+    if (tid == 0)
+    {
+        descriptors_block[4] = sd1[0];
+        descriptors_block[5] = sdabs1[0];
+        descriptors_block[6] = sd2[0];
+        descriptors_block[7] = sdabs2[0];
+    }
+}
+
+// Scales each 128-element descriptor to unit Euclidean length, in place.
+//
+// One work-group handles the descriptor in row get_group_id(0); work-item
+// get_local_id(0) owns element get_local_id(0), so the local-memory indexing
+// assumes a work-group of 128 work-items.
+//
+//   descriptors      - descriptor matrix, normalised in place
+//   descriptors_step - pitch of one descriptors row, in bytes
+//
+// NOTE(review): a descriptor whose elements are all zero has length 0 and is
+// written back as 0 / 0; callers are assumed never to produce one.
+__kernel 
+    void normalize_descriptors128(__global float * descriptors, int descriptors_step)
+{
+    descriptors_step /= sizeof(*descriptors);
+    __global float* descriptor_base = descriptors + descriptors_step * get_group_id(0);
+
+    const int lid = get_local_id(0);
+
+    // squared descriptor values, reduced below to the squared length
+    volatile __local  float sqDesc[128];
+    const float lookup = descriptor_base[lid];
+    sqDesc[lid] = lookup * lookup;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Pairwise tree 128 -> 64 -> ... -> 1. Every step is followed by a barrier
+    // because OpenCL gives no guarantee that work-items execute in lockstep.
+    for (int stride = 64; stride > 0; stride >>= 1)
+    {
+        if (lid < stride)
+            sqDesc[lid] += sqDesc[lid + stride];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // descriptor length (square root of the summed squares)
+    const float len = sqrt(sqDesc[0]);
+
+    // normalize and store in output
+    descriptor_base[lid] = lookup / len;
+}
+// Scales one 64-element descriptor row to unit L2 length, in place.
+// The row is selected by get_group_id(0) and each work-item owns the element
+// at get_local_id(0), so the kernel expects 64 work-items per work-group.
+// descriptors_step is divided by sizeof(float) below, i.e. it is presumably
+// the row pitch in bytes -- TODO confirm against the host-side caller.
+// NOTE(review): an all-zero row yields len == 0 and a division by zero.
+__kernel
+    void normalize_descriptors64(__global float * descriptors, int descriptors_step)
+{
+    descriptors_step /= sizeof(*descriptors);
+    const int lid = get_local_id(0);
+    __global float* descriptor_base = descriptors + descriptors_step * get_group_id(0);
+
+    // read in the unnormalized descriptor values (squared)
+    volatile __local  float sqDesc[64];
+    const float lookup = descriptor_base[lid];
+    sqDesc[lid] = lookup * lookup;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Tree reduction to get the total. A barrier follows every step and is
+    // reached by all work-items, so the result does not depend on the device
+    // running 32 work-items in lockstep. The pairing of partial sums feeding
+    // sqDesc[0] is the same as in a 32/16/8/4/2/1 unrolled reduction.
+    for (int stride = 32; stride > 0; stride >>= 1)
+    {
+        if (lid < stride)
+            sqDesc[lid] += sqDesc[lid + stride];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // compute length (square root)
+    volatile __local  float len;
+    if (lid == 0)
+    {
+        len = sqrt(sqDesc[0]);
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // normalize and store in output
+    descriptor_base[lid] = lookup / len;
+}
--- a/modules/ocl/src/kernels/objdetect_hog.cl
+++ b/modules/ocl/src/kernels/objdetect_hog.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Wenju He, wenju@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#define CELL_WIDTH 8
+#define CELL_HEIGHT 8
+#define CELLS_PER_BLOCK_X 2
+#define CELLS_PER_BLOCK_Y 2
+#define NTHREADS 256
+#define CV_PI_F 3.1415926535897932384626433832795f
+
+//----------------------------------------------------------------------------
+// Histogram computation
+
+// Builds the orientation histogram of one HOG block from the per-pixel
+// gradient votes (grad) and quantized orientation bins (qangle), and writes
+// it to block_hists at block index (gidY * img_block_width + gidX).
+//
+// Work decomposition, as read from the indexing below:
+//   - local id 0 is split into cell_x (lidX / 16) and cell_thread_x (lidX & 0xF);
+//   - local id 1 is used directly as cell_y.
+// Local memory layout: the first cnbins * 48 floats of smem hold the partial
+// histograms (hists); final_hist starts right after them.
+// Each pixel contributes two (vote, bin) pairs, read from consecutive
+// elements of grad / qangle (hence the "<< 1" on offset_x).
+// NOTE(review): the parameter `width` is not used in this kernel.
+__kernel void compute_hists_kernel(const int width, const int cblock_stride_x, const int cblock_stride_y, 
+                                   const int cnbins, const int cblock_hist_size, const int img_block_width, 
+                                   const int grad_quadstep, const int qangle_step, 
+                                   __global const float* grad, __global const uchar* qangle, 
+                                   const float scale, __global float* block_hists, __local float* smem)
+{
+    const int lidX = get_local_id(0);
+    const int lidY = get_local_id(1);
+    const int gidX = get_group_id(0);
+    const int gidY = get_group_id(1);
+
+    const int cell_x = lidX / 16;
+    const int cell_y = lidY;
+    const int cell_thread_x = lidX & 0xF;
+
+    __local float* hists = smem;
+    __local float* final_hist = smem + cnbins * 48;
+
+    // First pixel column / row this work-item reads.
+    const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
+    const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
+
+    __global const float* grad_ptr = grad + offset_y * grad_quadstep + (offset_x << 1);
+    __global const uchar* qangle_ptr = qangle + offset_y * qangle_step + (offset_x << 1);
+
+    // Only 12 of the 16 work-items per cell do work: 12 pixels in a row
+    // contribute to one cell of the block.
+    if (cell_thread_x < 12)
+    {
+        // Each work-item owns one column of partial bins; consecutive bins of
+        // that column are 48 floats apart.
+        __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + cell_thread_x;
+        for (int bin_id = 0; bin_id < cnbins; ++bin_id)
+            hist[bin_id * 48] = 0.f;
+
+        const int dist_x = -4 + cell_thread_x - 4 * cell_x;
+
+        // Walk 12 rows, accumulating both votes of each pixel into this
+        // work-item's private column.
+        const int dist_y_begin = -4 - 4 * lidY;
+        for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
+        {
+            float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
+            uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
+
+            grad_ptr += grad_quadstep;
+            qangle_ptr += qangle_step;
+
+            int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
+            int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
+
+            // Gaussian weight from the squared distance to the centre term
+            // above, times a bilinear weight from the distance terms.
+            float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x * dist_center_x) * scale);
+            float interp_weight = (8.f - fabs(dist_y + 0.5f)) * (8.f - fabs(dist_x + 0.5f)) / 64.f;
+
+            hist[bin.x * 48] += gaussian * interp_weight * vote.x;
+            hist[bin.y * 48] += gaussian * interp_weight * vote.y;
+        }
+
+        // Fold the 12 partial columns of this cell into one value per bin.
+        // NOTE(review): there is no barrier between the fold steps, so this
+        // relies on the participating work-items executing in lockstep --
+        // confirm for the target devices.
+        volatile __local float* hist_ = hist;
+        for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
+        {
+            if (cell_thread_x < 6) hist_[0] += hist_[6];
+            if (cell_thread_x < 3) hist_[0] += hist_[3];
+            if (cell_thread_x == 0)
+                final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = hist_[0] + hist_[1] + hist_[2];
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    __global float* block_hist = block_hists + (gidY * img_block_width + gidX) * cblock_hist_size;
+
+    // Copy the finished block histogram to global memory, one element per
+    // work-item whose flattened id falls inside the histogram.
+    int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x;
+    if (tid < cblock_hist_size)
+        block_hist[tid] = final_hist[tid];
+}
+
+//-------------------------------------------------------------
+//  Normalization of histograms via L2Hys_norm
+//
+// Work-group sum over smem, returned to every work-item.
+//
+// smem : local buffer; every work-item reads smem[get_local_id(0)], and the
+//        reduction reads up to index size - 1.
+// size : number of elements to fold. It must have the same value in every
+//        work-item of the group, because the barriers below sit inside
+//        "if (size >= ...)" branches. Only the halving steps from 512 down
+//        are implemented.
+//
+// Every folding step is followed by a barrier that all work-items reach, and
+// each step is limited to the work-items that own a live partial sum. The
+// function therefore does not depend on 32 work-items running in lockstep,
+// which OpenCL does not guarantee. The pairing of partial sums that feeds
+// smem[0] is the usual halving tree.
+float reduce_smem(volatile __local float* smem, int size)
+{
+    unsigned int tid = get_local_id(0);
+    float sum = smem[tid];
+
+    if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32]; barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16]; barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 16) { if (tid < 8) smem[tid] = sum = sum + smem[tid + 8]; barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 8) { if (tid < 4) smem[tid] = sum = sum + smem[tid + 4]; barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 4) { if (tid < 2) smem[tid] = sum = sum + smem[tid + 2]; barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1]; barrier(CLK_LOCAL_MEM_FENCE); }
+
+    // Kept so that the read of smem[0] is also ordered when size < 2 and no
+    // folding step (and hence no barrier) ran above.
+    barrier(CLK_LOCAL_MEM_FENCE);
+    sum = smem[0];
+
+    return sum;
+}
+
+// Normalizes one block histogram in place in two passes:
+//   1. scale by 1 / (sqrt(sum of squares) + 0.1 * block_hist_size) and clip
+//      each element at `threshold`;
+//   2. scale the clipped values by 1 / (sqrt(sum of squares) + 1e-3).
+// The block is selected by (get_group_id(1) * img_block_width + get_group_id(0)).
+// `squares` is local scratch with one float per work-item; `nthreads` is the
+// element count handed to reduce_smem.
+__kernel void normalize_hists_kernel(const int nthreads, const int block_hist_size, const int img_block_width,
+                                     __global float* block_hists, const float threshold, __local float *squares)
+{
+    const int lid = get_local_id(0);
+    const int block_col = get_group_id(0);
+    const int block_row = get_group_id(1);
+
+    // This work-item's element of the block histogram.
+    __global float* slot = block_hists + (block_row * img_block_width + block_col) * block_hist_size + lid;
+
+    // Work-items beyond the histogram length contribute zero to the sums.
+    float value = 0.f;
+    if (lid < block_hist_size)
+        value = slot[0];
+
+    // Pass 1: L2 norm, then scale and clip.
+    squares[lid] = value * value;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    float total = reduce_smem(squares, nthreads);
+
+    float factor = 1.0f / (sqrt(total) + 0.1f * block_hist_size);
+    value = min(value * factor, threshold);
+
+    // Pass 2: renormalize the clipped values. The barrier before the store
+    // keeps the scratch buffer from being overwritten while pass 1 is still
+    // being read.
+    barrier(CLK_LOCAL_MEM_FENCE);
+    squares[lid] = value * value;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    total = reduce_smem(squares, nthreads);
+    factor = 1.0f / (sqrt(total) + 1e-3f);
+
+    if (lid < block_hist_size)
+        slot[0] = value * factor;
+}
+
+//---------------------------------------------------------------------
+//  Linear SVM based classification
+//
+// Linear SVM decision for one detection window.
+//
+// Each work-group handles the window at (get_group_id(0), get_group_id(1)):
+// it computes the dot product of the window descriptor (gathered from
+// block_hists) with coefs, adds free_coef, and writes 1 or 0 to
+// labels[gidY * img_win_width + gidX] depending on whether the score reaches
+// `threshold`.
+// The strided loop and the reduction are both written for NTHREADS
+// work-items in dimension 0.
+__kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr_size, const int cdescr_width,
+                                    const int img_win_width, const int img_block_width,
+                                    const int win_block_stride_x, const int win_block_stride_y,
+                                    __global const float * block_hists, __global const float* coefs,
+                                    float free_coef, float threshold, __global uchar* labels)
+{
+    const int tid = get_local_id(0);
+    const int gidX = get_group_id(0);
+    const int gidY = get_group_id(1);
+
+    // First block histogram of this window.
+    __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+
+    // Per-work-item partial dot product. Descriptor element i sits in window
+    // row (i / cdescr_width) at column (i % cdescr_width); window rows are
+    // img_block_width * cblock_hist_size floats apart in block_hists.
+    float product = 0.f;
+    for (int i = tid; i < cdescr_size; i += NTHREADS)
+    {
+        int offset_y = i / cdescr_width;
+        int offset_x = i - offset_y * cdescr_width;
+        product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
+    }
+
+    __local float products[NTHREADS];
+
+    products[tid] = product;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Tree reduction of the partial products. A barrier follows every step
+    // and is reached by all work-items, so the result does not depend on the
+    // device running 32 work-items in lockstep, which OpenCL does not
+    // guarantee. The pairing of partial sums that feeds work-item 0 is the
+    // usual 128/64/32/16/8/4/2/1 halving tree.
+    for (int stride = NTHREADS / 2; stride > 0; stride >>= 1)
+    {
+        if (tid < stride)
+            products[tid] = product = product + products[tid + stride];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // Work-item 0 holds the full dot product.
+    if (tid == 0)
+        labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
+}
+
+//----------------------------------------------------------------------------
+// Extract descriptors
+
+// Copies the descriptor of one detection window out of block_hists into its
+// own row of `descriptors`, keeping the source element order.
+// One work-group per window; work-items stride over the cdescr_size elements
+// in steps of NTHREADS. Destination rows are descriptors_quadstep floats apart.
+__kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width,
+                                            const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
+                                            __global const float* block_hists, __global float* descriptors)
+{
+    const int win_x = get_group_id(0);
+    const int win_y = get_group_id(1);
+
+    // Top-left block histogram of this window in the source.
+    __global const float* src = block_hists + (win_y * win_block_stride_y * img_block_width + win_x * win_block_stride_x) * cblock_hist_size;
+
+    // Start of this window's descriptor in the destination.
+    __global float* dst = descriptors + (win_y * get_num_groups(0) + win_x) * descriptors_quadstep;
+
+    // Element idx of the descriptor lives in window row (idx / cdescr_width)
+    // at column (idx % cdescr_width) of the source.
+    int idx = get_local_id(0);
+    while (idx < cdescr_size)
+    {
+        const int row = idx / cdescr_width;
+        const int col = idx - row * cdescr_width;
+        dst[idx] = src[row * img_block_width * cblock_hist_size + col];
+        idx += NTHREADS;
+    }
+}
+
+// Copies the descriptor of one detection window out of block_hists into its
+// own row of `descriptors`, reordering the blocks from row-major (source) to
+// column-major (destination).
+// One work-group per window; work-items stride over the cdescr_size elements
+// in steps of NTHREADS. Destination rows are descriptors_quadstep floats apart.
+__kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, 
+                                            const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x,
+                                            const int win_block_stride_y, __global const float* block_hists, __global float* descriptors)
+{
+    const int win_x = get_group_id(0);
+    const int win_y = get_group_id(1);
+
+    // Top-left block histogram of this window in the source.
+    __global const float* src = block_hists + (win_y * win_block_stride_y * img_block_width + win_x * win_block_stride_x) * cblock_hist_size;
+
+    // Start of this window's descriptor in the destination.
+    __global float* dst = descriptors + (win_y * get_num_groups(0) + win_x) * descriptors_quadstep;
+
+    int idx = get_local_id(0);
+    while (idx < cdescr_size)
+    {
+        // Split the flat index into (block, element within block) ...
+        const int block = idx / cblock_hist_size;
+        const int elem = idx - block * cblock_hist_size;
+
+        // ... and the block into its (column, row) position inside the window.
+        const int block_y = block / cnblocks_win_x;
+        const int block_x = block - block_y * cnblocks_win_x;
+
+        dst[(block_x * cnblocks_win_y + block_y) * cblock_hist_size + elem] = src[(block_y * img_block_width  + block_x) * cblock_hist_size + elem];
+        idx += NTHREADS;
+    }
+}
+
+//----------------------------------------------------------------------------
+// Gradients computation
+
+// Per-pixel gradient magnitude and quantized orientation for a 4-channel
+// 8-bit image (only the x, y, z channels are read).
+//
+// Indexing, as used below: get_group_id(1) is the image row and
+// get_global_id(0) the column. img_step is applied to a uchar4 pointer, so it
+// is a pitch in uchar4 elements.
+// For each pixel the channel with the largest squared gradient is kept. The
+// result is written as two neighbouring orientation bins (qangle) and the
+// magnitude split linearly between them (grad), at element offsets
+// ((row * step + x) << 1) and that plus one.
+// When correct_gamma == 1 the differences are taken between square roots of
+// the pixel values.
+// sh_row holds NTHREADS + 2 floats per channel, so the local size in
+// dimension 0 must not exceed NTHREADS.
+// NOTE(review): gidX is computed but not used.
+__kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, 
+                                            const __global uchar4 * img, __global float * grad, __global uchar * qangle, 
+                                            const float angle_scale, const char correct_gamma, const int cnbins)
+{
+    const int x = get_global_id(0);
+    const int tid = get_local_id(0);
+    const int gSizeX = get_local_size(0);
+    const int gidX = get_group_id(0);
+    const int gidY = get_group_id(1);
+
+    __global const uchar4* row = img + gidY * img_step;
+
+    // Three channel planes of (NTHREADS + 2) floats each; slot 0 and slot
+    // gSizeX + 1 of every plane are the left / right halo pixels.
+    __local float sh_row[(NTHREADS + 2) * 3];
+
+    // Columns past the image edge load pixel width - 2 instead.
+    uchar4 val;
+    if (x < width)
+        val = row[x];
+    else
+        val = row[width - 2];
+
+    sh_row[tid + 1] = val.x;
+    sh_row[tid + 1 + (NTHREADS + 2)] = val.y;
+    sh_row[tid + 1 + 2 * (NTHREADS + 2)] = val.z;
+
+    // Left halo: the pixel before this group's first column, never below
+    // column 1.
+    if (tid == 0)
+    {
+        val = row[max(x - 1, 1)];
+        sh_row[0] = val.x;
+        sh_row[(NTHREADS + 2)] = val.y;
+        sh_row[2 * (NTHREADS + 2)] = val.z;
+    }
+
+    // Right halo: the pixel after this group's last column, never above
+    // column width - 2.
+    if (tid == gSizeX - 1)
+    {
+        val = row[min(x + 1, width - 2)];
+        sh_row[gSizeX + 1] = val.x;
+        sh_row[gSizeX + 1 + (NTHREADS + 2)] = val.y;
+        sh_row[gSizeX + 1 + 2 * (NTHREADS + 2)] = val.z;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (x < width)
+    {
+        // Left (a) and right (b) neighbours of this pixel, per channel.
+        float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], sh_row[tid + 2 * (NTHREADS + 2)]);
+        float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
+
+        float3 dx;
+        if (correct_gamma == 1)
+            dx = sqrt(b) - sqrt(a);
+        else
+            dx = b - a;
+
+        // The vertical difference stays zero on the first and last image row.
+        float3 dy = (float3) 0.f;
+
+        if (gidY > 0 && gidY < height - 1)
+        {
+            a = convert_float3(img[(gidY - 1) * img_step + x].xyz);
+            b = convert_float3(img[(gidY + 1) * img_step + x].xyz);
+
+            if (correct_gamma == 1)
+                dy = sqrt(b) - sqrt(a);
+            else
+                dy = b - a;
+        }
+
+        // Keep the channel with the largest squared gradient magnitude.
+        float best_dx = dx.x;
+        float best_dy = dy.x;
+
+        float mag0 = dx.x * dx.x + dy.x * dy.x;
+        float mag1 = dx.y * dx.y + dy.y * dy.y;
+        if (mag0 < mag1)
+        {
+            best_dx = dx.y;
+            best_dy = dy.y;
+            mag0 = mag1;
+        }
+
+        mag1 = dx.z * dx.z + dy.z * dy.z;
+        if (mag0 < mag1)
+        {
+            best_dx = dx.z;
+            best_dy = dy.z;
+            mag0 = mag1;
+        }
+
+        mag0 = sqrt(mag0);
+
+        // Map the angle to a fractional bin position. hidx is the lower bin
+        // (wrapped into [0, cnbins)) and ang the fraction toward the next bin.
+        float ang = (atan2(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
+        int hidx = (int)floor(ang);
+        ang -= hidx;
+        hidx = (hidx + cnbins) % cnbins;
+
+        // Two bins and two linearly interpolated votes per pixel.
+        qangle[(gidY * qangle_step + x) << 1] = hidx;
+        qangle[((gidY * qangle_step + x) << 1) + 1] = (hidx + 1) % cnbins;
+        grad[(gidY * grad_quadstep + x) << 1] = mag0 * (1.f - ang);
+        grad[((gidY * grad_quadstep + x) << 1) + 1] = mag0 * ang;
+    }
+}
+
+// Per-pixel gradient magnitude and quantized orientation for a single-channel
+// 8-bit image.
+//
+// Indexing, as used below: get_group_id(1) is the image row and
+// get_global_id(0) the column. img_step is applied to a uchar pointer.
+// The result is written as two neighbouring orientation bins (qangle) and the
+// magnitude split linearly between them (grad), at element offsets
+// ((row * step + x) << 1) and that plus one.
+// When correct_gamma == 1 the differences are taken between square roots of
+// the pixel values.
+// sh_row holds NTHREADS + 2 floats, so the local size in dimension 0 must not
+// exceed NTHREADS.
+// NOTE(review): gidX is computed but not used.
+__kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
+                                            __global const uchar * img, __global float * grad, __global uchar * qangle, 
+                                            const float angle_scale, const char correct_gamma, const int cnbins)
+{
+    const int x = get_global_id(0);
+    const int tid = get_local_id(0);
+    const int gSizeX = get_local_size(0);
+    const int gidX = get_group_id(0);
+    const int gidY = get_group_id(1);
+
+    __global const uchar* row = img + gidY * img_step;
+
+    // Row segment with one halo pixel on each side (slots 0 and gSizeX + 1).
+    __local float sh_row[NTHREADS + 2];
+
+    // Columns past the image edge load pixel width - 2 instead.
+    if (x < width)
+        sh_row[tid + 1] = row[x];
+    else
+        sh_row[tid + 1] = row[width - 2];
+
+    // Left halo: never below column 1.
+    if (tid == 0)
+        sh_row[0] = row[max(x - 1, 1)];
+
+    // Right halo: never above column width - 2.
+    if (tid == gSizeX - 1)
+        sh_row[gSizeX + 1] = row[min(x + 1, width - 2)];
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (x < width)
+    {
+        // Horizontal difference: right neighbour minus left neighbour.
+        float dx;
+
+        if (correct_gamma == 1)
+            dx = sqrt(sh_row[tid + 2]) - sqrt(sh_row[tid]);
+        else
+            dx = sh_row[tid + 2] - sh_row[tid];
+
+        // Vertical difference (row below minus row above); stays zero on the
+        // first and last image row.
+        float dy = 0.f;
+        if (gidY > 0 && gidY < height - 1)
+        {
+            float a = (float) img[ (gidY + 1) * img_step + x ];
+            float b = (float) img[ (gidY - 1) * img_step + x ];
+            if (correct_gamma == 1)
+                dy = sqrt(a) - sqrt(b);
+            else
+                dy = a - b;
+        }
+        float mag = sqrt(dx * dx + dy * dy);
+
+        // Map the angle to a fractional bin position. hidx is the lower bin
+        // (wrapped into [0, cnbins)) and ang the fraction toward the next bin.
+        float ang = (atan2(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
+        int hidx = (int)floor(ang);
+        ang -= hidx;
+        hidx = (hidx + cnbins) % cnbins;
+
+        // Two bins and two linearly interpolated votes per pixel.
+        qangle[ (gidY * qangle_step + x) << 1 ]     = hidx;
+        qangle[ ((gidY * qangle_step + x) << 1) + 1 ] = (hidx + 1) % cnbins;
+        grad[ (gidY * grad_quadstep + x) << 1 ]       = mag * (1.f - ang);
+        grad[ ((gidY * grad_quadstep + x) << 1) + 1 ]   = mag * ang;
+    }
+}
--- a/modules/ocl/src/surf.cpp
+++ b/modules/ocl/src/surf.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include <iomanip>
+#include "precomp.hpp"
+
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace std;
+
+#if !defined (HAVE_OPENCL)
+
+// Build without OpenCL: every SURF_OCL entry point only calls throw_nogpu().
+cv::ocl::SURF_OCL::SURF_OCL()
+{
+    throw_nogpu();
+}
+
+cv::ocl::SURF_OCL::SURF_OCL(double, int, int, bool, float, bool)
+{
+    throw_nogpu();
+}
+
+int cv::ocl::SURF_OCL::descriptorSize() const
+{
+    throw_nogpu();
+    return 0;
+}
+
+void cv::ocl::SURF_OCL::uploadKeypoints(const vector<KeyPoint>&, oclMat&)
+{
+    throw_nogpu();
+}
+
+void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat&, vector<KeyPoint>&)
+{
+    throw_nogpu();
+}
+
+void cv::ocl::SURF_OCL::downloadDescriptors(const oclMat&, vector<float>&)
+{
+    throw_nogpu();
+}
+
+void cv::ocl::SURF_OCL::operator()(const oclMat&, const oclMat&, oclMat&)
+{
+    throw_nogpu();
+}
+
+void cv::ocl::SURF_OCL::operator()(const oclMat&, const oclMat&, oclMat&, oclMat&, bool)
+{
+    throw_nogpu();
+}
+
+void cv::ocl::SURF_OCL::operator()(const oclMat&, const oclMat&, vector<KeyPoint>&)
+{
+    throw_nogpu();
+}
+
+void cv::ocl::SURF_OCL::operator()(const oclMat&, const oclMat&, vector<KeyPoint>&, oclMat&, bool)
+{
+    throw_nogpu();
+}
+
+void cv::ocl::SURF_OCL::operator()(const oclMat&, const oclMat&, vector<KeyPoint>&, vector<float>&, bool)
+{
+    throw_nogpu();
+}
+
+void cv::ocl::SURF_OCL::releaseMemory()
+{
+    throw_nogpu();
+}
+
+#else /* !defined (HAVE_OPENCL) */
+namespace cv { namespace ocl 
+{
+    ///////////////////////////OpenCL kernel strings///////////////////////////
+    // Declared here only; the definition lives in another translation unit.
+    // NOTE(review): presumably the source text of the SURF OpenCL program --
+    // confirm where it is generated.
+    extern const char * nonfree_surf;
+}}
+
+namespace 
+{
+    // Integer division rounded up: (total + grain - 1) / grain.
+    static inline int divUp(int total, int grain)
+    {
+        const int padded = total + grain - 1;
+        return padded / grain;
+    }
+    // Wavelet size for the given octave and layer:
+    // (9 + 6 * layer) << octave.
+    static inline int calcSize(int octave, int layer)
+    {
+        enum
+        {
+            // Wavelet size at the first layer of the first octave.
+            kBaseSize = 9,
+            // Size increment between layers. It is even so that all sizes in
+            // an octave share the same parity, which keeps the layers above
+            // and below a sample aligned when looking for its neighbours.
+            kLayerStep = 6
+        };
+
+        const int unscaled = kBaseSize + kLayerStep * layer;
+        return unscaled << octave;
+    }
+
+    // Runs one SURF pass for a SURF_OCL object: the constructor validates the
+    // input, sizes the candidate / feature buffers and binds the image and
+    // integral-image buffers; detectKeypoints() and computeDescriptors() then
+    // launch the kernels through the icv*_gpu helpers declared below (their
+    // definitions are not part of this class body).
+    //
+    // The object holds raw cl_mem handles that the destructor frees.
+    // NOTE(review): the class is copy-constructible by default, and a copy
+    // would free the same handles twice -- do not copy instances.
+    class SURF_OCL_Invoker
+    {
+    public:
+        // facilities
+        void bindImgTex(const oclMat& img);
+        void bindSumTex(const oclMat& sum);
+        void bindMaskSumTex(const oclMat& maskSum);
+
+        //void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
+        //void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
+
+        // kernel caller declarations
+        void icvCalcLayerDetAndTrace_gpu(oclMat& det, oclMat& trace, int octave, int nOctaveLayers, int layer_rows);
+
+        void icvFindMaximaInLayer_gpu(const oclMat& det, const oclMat& trace, oclMat& maxPosBuffer, oclMat& maxCounter, int counterOffset,
+            int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols);
+
+        void icvInterpolateKeypoint_gpu(const oclMat& det, const oclMat& maxPosBuffer, unsigned int maxCounter,
+            oclMat& keypoints, oclMat& counters, int octave, int layer_rows, int maxFeatures);
+
+        void icvCalcOrientation_gpu(const oclMat& keypoints, int nFeatures);
+
+        void compute_descriptors_gpu(const oclMat& descriptors, const oclMat& keypoints, int nFeatures);
+        // end of kernel caller declarations
+
+
+        // Validates img / mask against the SURF parameters and prepares the
+        // device buffers.
+        //   img  : non-empty CV_8UC1 image.
+        //   mask : empty, or CV_8UC1 of the same size as img. A non-empty mask
+        //          is currently rejected with std::exception (not implemented).
+        // CV_Assert fires when the image is too small for the requested
+        // number of octaves or when the feature budget computes to zero.
+        SURF_OCL_Invoker(SURF_OCL& surf, const oclMat& img, const oclMat& mask) :
+        surf_(surf),
+            img_cols(img.cols), img_rows(img.rows),
+            use_mask(!mask.empty()),
+            // Start from null handles so the destructor's checks are well
+            // defined for anything that was never bound.
+            imgTex(0), sumTex(0), maskSumTex(0)
+        {
+            CV_Assert(!img.empty() && img.type() == CV_8UC1);
+            CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
+            CV_Assert(surf_.nOctaves > 0 && surf_.nOctaveLayers > 0);
+
+            // The image must be at least as large as the first wavelet of the
+            // last octave ...
+            const int min_size = calcSize(surf_.nOctaves - 1, 0);
+            CV_Assert(img_rows - min_size >= 0);
+            CV_Assert(img_cols - min_size >= 0);
+
+            // ... and the last octave's layer must keep a non-empty interior
+            // once the margin is removed on both sides.
+            const int layer_rows = img_rows >> (surf_.nOctaves - 1);
+            const int layer_cols = img_cols >> (surf_.nOctaves - 1);
+            const int min_margin = ((calcSize((surf_.nOctaves - 1), 2) >> 1) >> (surf_.nOctaves - 1)) + 1;
+            CV_Assert(layer_rows - 2 * min_margin > 0);
+            CV_Assert(layer_cols - 2 * min_margin > 0);
+
+            // Both budgets are capped at 65535.
+            maxFeatures   = std::min(static_cast<int>(img.size().area() * surf.keypointsRatio), 65535);
+            maxCandidates = std::min(static_cast<int>(1.5 * maxFeatures), 65535);
+
+            CV_Assert(maxFeatures > 0);
+
+            // Slot 0 is read back as the feature count, slot 1 + octave as the
+            // per-octave candidate count (see detectKeypoints).
+            counters.create(1, surf_.nOctaves + 1, CV_32SC1);
+            counters.setTo(Scalar::all(0));
+
+            // Masks are not supported yet. Reject them before any buffer is
+            // bound: the destructor, which is what frees imgTex / sumTex, does
+            // not run when a constructor throws.
+            if (use_mask)
+                throw std::exception();
+
+            //loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
+
+            bindImgTex(img);
+            oclMat integral_sqsum;
+            integral(img, surf_.sum, integral_sqsum); // the two argumented integral version is incorrect
+
+            bindSumTex(surf_.sum);
+
+            //!FIXME mask support. Intended steps once cv::ocl::min() has the
+            // needed overload (temp fix for missing min overload):
+            //   oclMat temp(mask.size(), mask.type());
+            //   temp.setTo(Scalar::all(1.0));
+            //   cv::ocl::min(mask, temp, surf_.mask1);
+            //   integral(surf_.mask1, surf_.maskSum);
+            //   bindMaskSumTex(surf_.maskSum);
+        }
+
+        // Detects keypoints over all octaves and stores them in `keypoints`
+        // (SURF_OCL::ROWS_COUNT rows, one column per feature).
+        void detectKeypoints(oclMat& keypoints)
+        {
+            // create image pyramid buffers
+            // different layers have same sized buffers, but they are sampled from gaussian kernel.
+            surf_.det.create(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1);  
+            surf_.trace.create(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1);
+
+            surf_.maxPosBuffer.create(1, maxCandidates, CV_32SC4);
+            keypoints.create(SURF_OCL::ROWS_COUNT, maxFeatures, CV_32FC1);
+            keypoints.setTo(Scalar::all(0));
+
+            for (int octave = 0; octave < surf_.nOctaves; ++octave)
+            {
+                const int layer_rows = img_rows >> octave;
+                const int layer_cols = img_cols >> octave;
+
+                //loadOctaveConstants(octave, layer_rows, layer_cols);
+
+                icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, octave, surf_.nOctaveLayers, layer_rows);
+
+                icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer, counters, 1 + octave,
+                    octave, use_mask, surf_.nOctaveLayers, layer_rows, layer_cols);
+
+                // Read back this octave's candidate count and clamp it to the
+                // buffer capacity.
+                unsigned int maxCounter = Mat(counters).at<unsigned int>(1 + octave);
+                maxCounter = std::min(maxCounter, static_cast<unsigned int>(maxCandidates));
+
+                if (maxCounter > 0)
+                {
+                    icvInterpolateKeypoint_gpu(surf_.det, surf_.maxPosBuffer, maxCounter,
+                        keypoints, counters, octave, layer_rows, maxFeatures);
+                }
+            }
+            unsigned int featureCounter = Mat(counters).at<unsigned int>(0);
+            featureCounter = std::min(featureCounter, static_cast<unsigned int>(maxFeatures));
+
+            // Shrink the visible width to the number of features found; the
+            // buffer itself is not reallocated.
+            keypoints.cols = featureCounter;
+
+            if (surf_.upright)
+                keypoints.row(SURF_OCL::ANGLE_ROW).setTo(Scalar::all(90.0));
+            else
+                findOrientation(keypoints);
+        }
+
+        // Computes the orientation of every keypoint; no-op when there are none.
+        void findOrientation(oclMat& keypoints)
+        {
+            const int nFeatures = keypoints.cols;
+            if (nFeatures > 0)
+            {
+                icvCalcOrientation_gpu(keypoints, nFeatures);
+            }
+        }
+
+        // Allocates `descriptors` as nFeatures x descriptorSize (CV_32F) and
+        // fills it. With no keypoints, `descriptors` is left untouched.
+        void computeDescriptors(const oclMat& keypoints, oclMat& descriptors, int descriptorSize)
+        {
+            const int nFeatures = keypoints.cols;
+            if (nFeatures > 0)
+            {
+                descriptors.create(nFeatures, descriptorSize, CV_32F);
+                compute_descriptors_gpu(descriptors, keypoints, nFeatures);
+            }
+        }
+
+        // Frees whichever device buffers were bound.
+        ~SURF_OCL_Invoker()
+        {
+            if(imgTex)
+                openCLFree(imgTex);
+            if(sumTex)
+                openCLFree(sumTex);
+            if(maskSumTex)
+                openCLFree(maskSumTex);
+            additioalParamBuffer.release();
+        }
+
+    private:
+        SURF_OCL& surf_;
+
+        int img_cols, img_rows;
+
+        bool use_mask;
+
+        int maxCandidates;
+        int maxFeatures;
+
+        oclMat counters;
+
+        // texture buffers
+        cl_mem imgTex;
+        cl_mem sumTex;
+        cl_mem maskSumTex;
+
+        oclMat additioalParamBuffer;
+    };
+}
+
+// Default configuration: Hessian threshold 100, 4 octaves with 2 layers each,
+// extended (128-element) descriptors, orientation enabled, and a keypoint
+// ratio of 0.01.
+cv::ocl::SURF_OCL::SURF_OCL()
+{
+    // detector settings
+    hessianThreshold = 100.0f;
+    nOctaves = 4;
+    nOctaveLayers = 2;
+    keypointsRatio = 0.01f;
+
+    // descriptor settings
+    extended = true;
+    upright = false;
+}
+
+// Stores the caller's configuration; no validation happens here.
+cv::ocl::SURF_OCL::SURF_OCL(double _threshold, int _nOctaves, int _nOctaveLayers, bool _extended, float _keypointsRatio, bool _upright)
+{
+    // detector settings
+    hessianThreshold = _threshold;
+    nOctaves = _nOctaves;
+    nOctaveLayers = _nOctaveLayers;
+    keypointsRatio = _keypointsRatio;
+
+    // descriptor settings
+    extended = _extended;
+    upright = _upright;
+}
+
+// Number of floats per descriptor: 128 in extended mode, 64 otherwise.
+int cv::ocl::SURF_OCL::descriptorSize() const
+{
+    if (extended)
+        return 128;
+
+    return 64;
+}
+
+// Packs host keypoints into a ROWS_COUNT x N CV_32FC1 matrix (one row per
+// attribute, one column per keypoint) and uploads it to keypointsGPU. An empty
+// input releases keypointsGPU instead. The laplacian row is always written as 1;
+// KeyPoint::class_id is not uploaded.
+void cv::ocl::SURF_OCL::uploadKeypoints(const vector<KeyPoint>& keypoints, oclMat& keypointsGPU)
+{
+    if (keypoints.empty())
+    {
+        keypointsGPU.release();
+        return;
+    }
+
+    const size_t count = keypoints.size();
+    Mat packed(SURF_OCL::ROWS_COUNT, static_cast<int>(count), CV_32FC1);
+
+    // Row pointers; the laplacian and octave rows store ints inside the float matrix.
+    float* xRow = packed.ptr<float>(SURF_OCL::X_ROW);
+    float* yRow = packed.ptr<float>(SURF_OCL::Y_ROW);
+    int* laplacianRow = packed.ptr<int>(SURF_OCL::LAPLACIAN_ROW);
+    int* octaveRow = packed.ptr<int>(SURF_OCL::OCTAVE_ROW);
+    float* sizeRow = packed.ptr<float>(SURF_OCL::SIZE_ROW);
+    float* angleRow = packed.ptr<float>(SURF_OCL::ANGLE_ROW);
+    float* hessianRow = packed.ptr<float>(SURF_OCL::HESSIAN_ROW);
+
+    for (size_t k = 0; k < count; ++k)
+    {
+        const KeyPoint& src = keypoints[k];
+        xRow[k] = src.pt.x;
+        yRow[k] = src.pt.y;
+        octaveRow[k] = src.octave;
+        sizeRow[k] = src.size;
+        angleRow[k] = src.angle;
+        hessianRow[k] = src.response;
+        laplacianRow[k] = 1;
+    }
+
+    keypointsGPU.upload(packed);
+}
+
+// Converts a ROWS_COUNT x N CV_32FC1 device keypoint matrix back into a vector
+// of KeyPoint (one per column). The laplacian row is stored in class_id. A
+// matrix with zero columns clears the output; the layout assertion is only
+// evaluated for non-empty input.
+void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat& keypointsGPU, vector<KeyPoint>& keypoints)
+{
+    const int count = keypointsGPU.cols;
+
+    if (count == 0)
+    {
+        keypoints.clear();
+        return;
+    }
+
+    CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == ROWS_COUNT);
+
+    // Host copy of the device matrix
+    Mat host(keypointsGPU);
+
+    keypoints.resize(count);
+
+    // Row pointers; the laplacian and octave rows hold ints inside the float matrix.
+    float* xRow = host.ptr<float>(SURF_OCL::X_ROW);
+    float* yRow = host.ptr<float>(SURF_OCL::Y_ROW);
+    int* laplacianRow = host.ptr<int>(SURF_OCL::LAPLACIAN_ROW);
+    int* octaveRow = host.ptr<int>(SURF_OCL::OCTAVE_ROW);
+    float* sizeRow = host.ptr<float>(SURF_OCL::SIZE_ROW);
+    float* angleRow = host.ptr<float>(SURF_OCL::ANGLE_ROW);
+    float* hessianRow = host.ptr<float>(SURF_OCL::HESSIAN_ROW);
+
+    for (int k = 0; k < count; ++k)
+    {
+        KeyPoint& dst = keypoints[k];
+        dst.pt.x = xRow[k];
+        dst.pt.y = yRow[k];
+        dst.class_id = laplacianRow[k];
+        dst.octave = octaveRow[k];
+        dst.size = sizeRow[k];
+        dst.angle = angleRow[k];
+        dst.response = hessianRow[k];
+    }
+}
+
+// Downloads a CV_32F descriptor matrix into a flat vector of rows * cols floats.
+// An empty device matrix clears the output vector.
+void cv::ocl::SURF_OCL::downloadDescriptors(const oclMat& descriptorsGPU, vector<float>& descriptors)
+{
+    if (descriptorsGPU.empty())
+    {
+        descriptors.clear();
+        return;
+    }
+
+    CV_Assert(descriptorsGPU.type() == CV_32F);
+
+    // Size the vector first so the Mat header below can wrap its storage and the
+    // download writes straight into it.
+    descriptors.resize(descriptorsGPU.rows * descriptorsGPU.cols);
+    Mat wrapper(descriptorsGPU.size(), CV_32F, &descriptors[0]);
+    descriptorsGPU.download(wrapper);
+}
+
+// Detects keypoints into a device matrix. An empty image is a no-op that leaves
+// `keypoints` untouched.
+void cv::ocl::SURF_OCL::operator()(const oclMat& img, const oclMat& mask, oclMat& keypoints)
+{
+    if (img.empty())
+        return;
+
+    SURF_OCL_Invoker invoker(*this, img, mask);
+    invoker.detectKeypoints(keypoints);
+}
+
+// Detects keypoints (unless they are provided) and computes their descriptors,
+// all on device matrices. With provided keypoints, orientation is recomputed
+// only when `upright` is false. An empty image is a no-op.
+void cv::ocl::SURF_OCL::operator()(const oclMat& img, const oclMat& mask, oclMat& keypoints, oclMat& descriptors,
+    bool useProvidedKeypoints)
+{
+    if (img.empty())
+        return;
+
+    SURF_OCL_Invoker invoker(*this, img, mask);
+
+    if (useProvidedKeypoints)
+    {
+        if (!upright)
+            invoker.findOrientation(keypoints);
+    }
+    else
+    {
+        invoker.detectKeypoints(keypoints);
+    }
+
+    invoker.computeDescriptors(keypoints, descriptors, descriptorSize());
+}
+
+// Host-vector convenience overload: detects on the device, then downloads the
+// keypoints into `keypoints`.
+void cv::ocl::SURF_OCL::operator()(const oclMat& img, const oclMat& mask, vector<KeyPoint>& keypoints)
+{
+    oclMat deviceKeypoints;
+    this->operator()(img, mask, deviceKeypoints);
+    downloadKeypoints(deviceKeypoints, keypoints);
+}
+
+// Host-keypoint / device-descriptor overload. Provided keypoints are uploaded
+// first; after processing, the device keypoints are always downloaded back into
+// `keypoints`.
+void cv::ocl::SURF_OCL::operator()(const oclMat& img, const oclMat& mask, vector<KeyPoint>& keypoints,
+    oclMat& descriptors, bool useProvidedKeypoints)
+{
+    oclMat deviceKeypoints;
+    if (useProvidedKeypoints)
+    {
+        uploadKeypoints(keypoints, deviceKeypoints);
+    }
+
+    this->operator()(img, mask, deviceKeypoints, descriptors, useProvidedKeypoints);
+    downloadKeypoints(deviceKeypoints, keypoints);
+}
+
+// Fully host-side overload: runs the device-descriptor overload and then
+// downloads the descriptors into a flat float vector.
+void cv::ocl::SURF_OCL::operator()(const oclMat& img, const oclMat& mask, vector<KeyPoint>& keypoints,
+    vector<float>& descriptors, bool useProvidedKeypoints)
+{
+    oclMat deviceDescriptors;
+    this->operator()(img, mask, keypoints, deviceDescriptors, useProvidedKeypoints);
+    downloadDescriptors(deviceDescriptors, descriptors);
+}
+
+// Releases every intermediate buffer held by this object.
+void cv::ocl::SURF_OCL::releaseMemory()
+{
+    // Integral-image and mask buffers
+    sum.release();
+    maskSum.release();
+    mask1.release();
+    intBuffer.release();
+
+    // Hessian response buffers and candidate list
+    det.release();
+    trace.release();
+    maxPosBuffer.release();
+}
+
+// Facilities
+
+//// load SURF constants into device memory
+//void SURF_OCL_Invoker::loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold)
+//{
+//	Mat tmp(1, 9, CV_32FC1);
+//	float * tmp_data = tmp.ptr<float>();
+//	*tmp_data        = maxCandidates;
+//	*(++tmp_data)    = maxFeatures;
+//	*(++tmp_data)    = img_rows;
+//	*(++tmp_data)    = img_cols;
+//	*(++tmp_data)    = nOctaveLayers;
+//	*(++tmp_data)    = hessianThreshold;
+//	additioalParamBuffer = tmp;
+//}
+//void SURF_OCL_Invoker::loadOctaveConstants(int octave, int layer_rows, int layer_cols)
+//{
+//	Mat tmp = additioalParamBuffer;
+//	float * tmp_data = tmp.ptr<float>();
+//	tmp_data += 6;
+//	*tmp_data        = octave;
+//	*(++tmp_data)    = layer_rows;
+//	*(++tmp_data)    = layer_cols;
+//	additioalParamBuffer = tmp;
+//}
+
+// Create an OpenCL 2D image object (CL_R / CL_UNSIGNED_INT8) from the source
+// image and store it in imgTex. The pixels are first copied to a host Mat and
+// then handed to OpenCL with CL_MEM_COPY_HOST_PTR. Any error code from the
+// creation call is passed to openCLSafeCall.
+void SURF_OCL_Invoker::bindImgTex(const oclMat& img)
+{
+    Mat cpu_img(img); // time consuming
+    cl_image_format format;
+    int err;
+
+    format.image_channel_data_type = CL_UNSIGNED_INT8;
+    format.image_channel_order     = CL_R;
+
+#if CL_VERSION_1_2
+    cl_image_desc desc;
+    desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
+    desc.image_width      = cpu_img.cols;
+    desc.image_height     = cpu_img.rows;
+    desc.image_depth      = 0; // size_t field (not a pointer); only meaningful for 3D images
+    desc.image_array_size = 1;
+    desc.image_row_pitch  = cpu_img.step;
+    desc.image_slice_pitch= 0;
+    desc.buffer           = NULL;
+    desc.num_mip_levels   = 0;
+    desc.num_samples      = 0;
+    imgTex = clCreateImage(img.clCxt->impl->clContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &format, &desc, cpu_img.data, &err);
+#else
+    // Pre-1.2 headers: use the dedicated 2D entry point.
+    imgTex = clCreateImage2D(
+        img.clCxt->impl->clContext,
+        CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+        &format,
+        cpu_img.cols,
+        cpu_img.rows,
+        cpu_img.step,
+        cpu_img.data,
+        &err);
+#endif
+    openCLSafeCall(err);
+}
+
+// Create an OpenCL 2D image object (CL_R / CL_UNSIGNED_INT32) from `sum` and
+// store it in sumTex. The data is first copied to a host Mat and then handed to
+// OpenCL with CL_MEM_COPY_HOST_PTR. Any error code from the creation call is
+// passed to openCLSafeCall.
+void SURF_OCL_Invoker::bindSumTex(const oclMat& sum)
+{
+    Mat cpu_img(sum); // time consuming
+    cl_image_format format;
+    int err;
+    format.image_channel_data_type = CL_UNSIGNED_INT32;
+    format.image_channel_order     = CL_R;
+#if CL_VERSION_1_2
+    cl_image_desc desc;
+    desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
+    desc.image_width      = cpu_img.cols;
+    desc.image_height     = cpu_img.rows;
+    desc.image_depth      = 0; // size_t field (not a pointer); only meaningful for 3D images
+    desc.image_array_size = 1;
+    desc.image_row_pitch  = cpu_img.step;
+    desc.image_slice_pitch= 0;
+    desc.buffer           = NULL;
+    desc.num_mip_levels   = 0;
+    desc.num_samples      = 0;
+    sumTex = clCreateImage(sum.clCxt->impl->clContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &format, &desc, cpu_img.data, &err);
+#else
+    // Pre-1.2 headers: use the dedicated 2D entry point.
+    sumTex = clCreateImage2D(
+        sum.clCxt->impl->clContext,
+        CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+        &format,
+        cpu_img.cols,
+        cpu_img.rows,
+        cpu_img.step,
+        cpu_img.data,
+        &err);
+#endif
+    openCLSafeCall(err);
+}
+// Create an OpenCL 2D image object (CL_R / CL_UNSIGNED_INT32) from `maskSum`
+// and store it in maskSumTex. The data is first copied to a host Mat and then
+// handed to OpenCL with CL_MEM_COPY_HOST_PTR. Any error code from the creation
+// call is passed to openCLSafeCall.
+void SURF_OCL_Invoker::bindMaskSumTex(const oclMat& maskSum)
+{
+    Mat cpu_img(maskSum); // time consuming
+    cl_image_format format;
+    int err;
+    format.image_channel_data_type = CL_UNSIGNED_INT32;
+    format.image_channel_order     = CL_R;
+#if CL_VERSION_1_2
+    cl_image_desc desc;
+    desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
+    desc.image_width      = cpu_img.cols;
+    desc.image_height     = cpu_img.rows;
+    desc.image_depth      = 0; // size_t field (not a pointer); only meaningful for 3D images
+    desc.image_array_size = 1;
+    desc.image_row_pitch  = cpu_img.step;
+    desc.image_slice_pitch= 0;
+    desc.buffer           = NULL;
+    desc.num_mip_levels   = 0;
+    desc.num_samples      = 0;
+    maskSumTex = clCreateImage(maskSum.clCxt->impl->clContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &format, &desc, cpu_img.data, &err);
+#else
+    // Pre-1.2 headers: use the dedicated 2D entry point.
+    maskSumTex = clCreateImage2D(
+        maskSum.clCxt->impl->clContext,
+        CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+        &format,
+        cpu_img.cols,
+        cpu_img.rows,
+        cpu_img.step,
+        cpu_img.data,
+        &err);
+#endif
+    openCLSafeCall(err);
+}
+
+////////////////////////////
+// kernel caller definitions
+// Launches the "icvCalcLayerDetAndTrace" kernel with sumTex, det and trace as
+// its buffers. The grid is sized from the sample counts derived from
+// calcSize(octave, 0), repeated (nOctaveLayers + 2) times along y.
+void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat& det, oclMat& trace, int octave, int nOctaveLayers, int c_layer_rows)
+{
+    const int baseSize = calcSize(octave, 0);
+    const int sampleRows = 1 + ((img_rows - baseSize) >> octave);
+    const int sampleCols = 1 + ((img_cols - baseSize) >> octave);
+
+    Context *clCxt = det.clCxt;
+    string kernelName = "icvCalcLayerDetAndTrace";
+
+    // Argument order must match the kernel signature.
+    vector< pair<size_t, const void *> > args;
+    // buffers
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&trace.data));
+    // strides
+    args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&trace.step));
+    // scalar parameters
+    args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&nOctaveLayers));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&c_layer_rows));
+
+    size_t localThreads[3]  = {16, 16, 1};
+    size_t globalThreads[3] = {
+        divUp(sampleCols, localThreads[0]) * localThreads[0],
+        divUp(sampleRows, localThreads[1]) * localThreads[1] * (nOctaveLayers + 2),
+        1};
+
+    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+
+// Launches "icvFindMaximaInLayer" (or the "_withmask" variant when use_mask is
+// set, which additionally receives maskSumTex as its last argument).
+void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat& det, const oclMat& trace, oclMat& maxPosBuffer, oclMat& maxCounter, int counterOffset,
+    int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols)
+{
+    // Border excluded from the grid on each side of the layer
+    const int margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
+
+    Context *clCxt = det.clCxt;
+    string kernelName = use_mask ? "icvFindMaximaInLayer_withmask" : "icvFindMaximaInLayer";
+
+    // Argument order must match the kernel signature.
+    vector< pair<size_t, const void *> > args;
+    // buffers
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&trace.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&maxCounter.data));
+    // scalar parameters
+    args.push_back( make_pair( sizeof(cl_int), (void *)&counterOffset));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&trace.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&nLayers));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&layer_rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&layer_cols));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&maxCandidates));
+    args.push_back( make_pair( sizeof(cl_float), (void *)&surf_.hessianThreshold));
+
+    if (use_mask)
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&maskSumTex));
+
+    // The divisor is the work-group edge minus 2, so each 16x16 group advances
+    // by 14 positions per axis.
+    size_t localThreads[3]  = {16, 16, 1};
+    size_t globalThreads[3] = {
+        divUp(layer_cols - 2 * margin, localThreads[0] - 2) * localThreads[0],
+        divUp(layer_rows - 2 * margin, localThreads[1] - 2) * nLayers * localThreads[1],
+        1};
+
+    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+
+// Launches the "icvInterpolateKeypoint" kernel with one 3x3x3 work-group per
+// candidate (maxCounter groups along x).
+void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat& det, const oclMat& maxPosBuffer, unsigned int maxCounter,
+    oclMat& keypoints, oclMat& counters, int octave, int layer_rows, int maxFeatures)
+{
+    Context *clCxt = det.clCxt;
+    string kernelName = "icvInterpolateKeypoint";
+
+    // Argument order must match the kernel signature.
+    vector< pair<size_t, const void *> > args;
+    // buffers
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&counters.data));
+    // scalar parameters
+    args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&layer_rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&maxFeatures));
+
+    size_t localThreads[3]  = {3, 3, 3};
+    size_t globalThreads[3] = {maxCounter * localThreads[0], 1, 1};
+
+    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+
+// Launches the "icvCalcOrientation" kernel with one 32x4 work-group per
+// keypoint. The OpenCL context is taken from the `counters` member.
+void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat& keypoints, int nFeatures)
+{
+    Context * clCxt = counters.clCxt;
+    string kernelName = "icvCalcOrientation";
+
+    // Argument order must match the kernel signature.
+    vector< pair<size_t, const void *> > args;
+    // buffers
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
+    // scalar parameters
+    args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
+
+    size_t localThreads[3]  = {32, 4, 1};
+    size_t globalThreads[3] = {nFeatures * localThreads[0], localThreads[1], 1};
+
+    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+
+// Computes descriptors in two kernel passes: "compute_descriptors{64,128}"
+// writes unnormalized values, then "normalize_descriptors{64,128}" normalizes
+// them. The 64 variants are used when `descriptors` has exactly 64 columns; any
+// other width uses the 128 variants.
+void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat& descriptors, const oclMat& keypoints, int nFeatures)
+{
+    const bool shortDescriptor = (descriptors.cols == 64);
+
+    Context *clCxt = descriptors.clCxt;
+    vector< pair<size_t, const void *> > args;
+
+    // Pass 1: compute unnormalized descriptors - odd indexing since grid must be 2D.
+    // One column of 6x6 work-groups per keypoint, 16 groups along y.
+    string kernelName = shortDescriptor ? "compute_descriptors64" : "compute_descriptors128";
+    size_t localThreads[3]  = {6, 6, 1};
+    size_t globalThreads[3] = {nFeatures * localThreads[0], 16 * localThreads[1], 1};
+
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
+    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+
+    // Pass 2: normalize - one work-group per keypoint, one work-item per element.
+    kernelName = shortDescriptor ? "normalize_descriptors64" : "normalize_descriptors128";
+    localThreads[0] = shortDescriptor ? 64 : 128;
+    localThreads[1] = 1;
+    globalThreads[0] = nFeatures * localThreads[0];
+    globalThreads[1] = localThreads[1];
+
+    args.clear();
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
+    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+
+#endif // /* !defined (HAVE_OPENCL) */
+
--- a/modules/ocl/test/test_hog.cpp
+++ b/modules/ocl/test/test_hog.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//		Wenju He, wenju@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include "opencv2/core/core.hpp"
+using namespace std;
+#ifdef HAVE_OPENCL
+
+
+// Test fixture parameterized by (window size, input matrix type).
+PARAM_TEST_CASE(HOG,cv::Size,int)
+{
+	cv::Size winSize;
+	int type;
+	vector<cv::ocl::Info> info;
+
+	// Queries the OpenCL devices and caches both test parameters.
+	virtual void SetUp()
+	{
+		cv::ocl::getDevice(info);
+		winSize = GET_PARAM(0);
+		type = GET_PARAM(1);
+	}
+};
+
+// Compares OpenCL HOG descriptors (column-by-column format) with the CPU
+// implementation on the same image.
+TEST_P(HOG, GetDescriptors)
+{
+    // Source image
+    cv::Mat img_rgb = readImage("../../../samples/gpu/road.png");
+    ASSERT_FALSE(img_rgb.empty());
+
+    // The OpenCL path gets a grayscale image for CV_8UC1 and a BGRA image for
+    // every other type.
+    const bool grayscale = (type == CV_8UC1);
+    cv::Mat img;
+    if (grayscale)
+        cv::cvtColor(img_rgb, img, CV_BGR2GRAY);
+    else
+        cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
+    cv::ocl::oclMat d_img(img);
+
+    // Both descriptors with gamma correction enabled
+    cv::ocl::HOGDescriptor ocl_hog;
+    ocl_hog.gamma_correction = true;
+    cv::HOGDescriptor hog;
+    hog.gammaCorrection = true;
+
+    // OpenCL descriptors, downloaded and reshaped to rows * cols rows
+    cv::ocl::oclMat d_descriptors;
+    ocl_hog.getDescriptors(d_img, ocl_hog.win_size, d_descriptors, ocl_hog.DESCR_FORMAT_COL_BY_COL);
+    cv::Mat down_descriptors;
+    d_descriptors.download(down_descriptors);
+    down_descriptors = down_descriptors.reshape(0, down_descriptors.cols * down_descriptors.rows);
+
+    // CPU reference; the non-grayscale case is computed from the original
+    // image rather than the BGRA conversion.
+    hog.setSVMDetector(hog.getDefaultPeopleDetector());
+    std::vector<float> descriptors;
+    if (grayscale)
+        hog.compute(img, descriptors, ocl_hog.win_size);
+    else
+        hog.compute(img_rgb, descriptors, ocl_hog.win_size);
+    cv::Mat cpu_descriptors(descriptors);
+
+    EXPECT_MAT_SIMILAR(down_descriptors, cpu_descriptors, 1e-2);
+}
+
+
+// Compares the locations found by the OpenCL HOG detector with those of the CPU
+// detector; the two location lists must match exactly.
+TEST_P(HOG, Detect)
+{
+    // Load image
+    cv::Mat img_rgb = readImage("../../../samples/gpu/road.png");
+    ASSERT_FALSE(img_rgb.empty());
+
+    // Convert image: grayscale for CV_8UC1, BGRA for every other type
+    const bool grayscale = (type == CV_8UC1);
+    cv::Mat img;
+    if (grayscale)
+        cv::cvtColor(img_rgb, img, CV_BGR2GRAY);
+    else
+        cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
+    cv::ocl::oclMat d_img(img);
+
+    // HOGs: any window size other than 48x96 or 64x128 falls back to 64x128,
+    // so from here on winSize is always one of those two.
+    if ((winSize != cv::Size(48, 96)) && (winSize != cv::Size(64, 128)))
+        winSize = cv::Size(64, 128);
+    cv::ocl::HOGDescriptor ocl_hog(winSize);
+    ocl_hog.gamma_correction = true;
+
+    cv::HOGDescriptor hog;
+    hog.winSize = winSize;
+    hog.gammaCorrection = true;
+
+    if (winSize.width == 48 && winSize.height == 96)
+    {
+        // daimler's base
+        ocl_hog.setSVMDetector(ocl_hog.getPeopleDetector48x96());
+        hog.setSVMDetector(hog.getDaimlerPeopleDetector());
+    }
+    else
+    {
+        // 64x128 - the only other size possible after the fallback above
+        ocl_hog.setSVMDetector(ocl_hog.getPeopleDetector64x128());
+        hog.setSVMDetector(hog.getDefaultPeopleDetector());
+    }
+
+    // OpenCL detection
+    std::vector<cv::Point> d_v_locations;
+    ocl_hog.detect(d_img, d_v_locations, 0);
+    cv::Mat d_locations(d_v_locations);
+
+    // CPU detection; the non-grayscale case runs on the original image rather
+    // than the BGRA conversion.
+    std::vector<cv::Point> v_locations;
+    if (grayscale)
+        hog.detect(img, v_locations, 0);
+    else
+        hog.detect(img_rgb, v_locations, 0);
+    cv::Mat locations(v_locations);
+
+    char s[100]={0};
+    EXPECT_MAT_NEAR(d_locations, locations, 0, s);
+}
+
+
+INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, HOG, testing::Combine( // every combination of the two window sizes and two matrix types below
+                        testing::Values(cv::Size(64, 128), cv::Size(48, 96)),
+                        testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
+
+
+#endif //HAVE_OPENCL