Merge pull request #1123 from bitwangyaoyao:2.4_fix

bc78e87a · Andrey Pavlenko · OpenCV Buildbot · 26a3cabb · 270b2c79 · bc78e87a
Commit bc78e87a, authored Jul 30, 2013 by Andrey Pavlenko; committed by OpenCV Buildbot on Jul 30, 2013
4 changed files
--- a/modules/ocl/perf/perf_opticalflow.cpp
+++ b/modules/ocl/perf/perf_opticalflow.cpp
@@ -48,8 +48,8 @@
 ///////////// PyrLKOpticalFlow ////////////////////////
 PERFTEST(PyrLKOpticalFlow)
 {
-    std::string images1[] = {"rubberwhale1.png", "basketball1.png"};
+    std::string images1[] = {"rubberwhale1.png", "aloeL.jpg"};
-    std::string images2[] = {"rubberwhale2.png", "basketball2.png"};
+    std::string images2[] = {"rubberwhale2.png", "aloeR.jpg"};
    for (size_t i = 0; i < sizeof(images1) / sizeof(std::string); i++)
    {

--- a/modules/ocl/src/hog.cpp
+++ b/modules/ocl/src/hog.cpp
@@ -56,98 +56,6 @@ using namespace std;
 static oclMat gauss_w_lut;
 static bool hog_device_cpu;
-/* pre-compute gaussian and interp_weight lookup tables if sigma is 4.0f */
-static const float gaussian_interp_lut[] = 
-{
-    /* gaussian lut */
-    0.01831564f, 0.02926831f, 0.04393693f, 0.06196101f, 0.08208500f, 0.10215643f, 
-    0.11943297f, 0.13117145f, 0.13533528f, 0.13117145f, 0.11943297f, 0.10215643f, 
-    0.08208500f, 0.06196101f, 0.04393693f, 0.02926831f, 0.02926831f, 0.04677062f, 
-    0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, 0.19085334f, 0.20961139f, 
-    0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, 0.13117145f, 0.09901341f, 
-    0.07021102f, 0.04677062f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f, 
-    0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f, 
-    0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f, 
-    0.06196101f, 0.09901341f, 0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, 
-    0.40403652f, 0.44374731f, 0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, 
-    0.27768996f, 0.20961139f, 0.14863673f, 0.09901341f, 0.08208500f, 0.13117145f, 
-    0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, 0.53526145f, 0.58786964f, 
-    0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, 0.36787945f, 0.27768996f, 
-    0.19691168f, 0.13117145f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f, 
-    0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f, 
-    0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f, 
-    0.11943297f, 0.19085334f, 0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, 
-    0.77880079f, 0.85534531f, 0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, 
-    0.53526145f, 0.40403652f, 0.28650481f, 0.19085334f, 0.13117145f, 0.20961139f, 
-    0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, 0.85534531f, 0.93941307f, 
-    0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, 0.58786964f, 0.44374731f, 
-    0.31466395f, 0.20961139f, 0.13533528f, 0.21626517f, 0.32465246f, 0.45783335f, 
-    0.60653067f, 0.75483960f, 0.88249689f, 0.96923321f, 1.00000000f, 0.96923321f, 
-    0.88249689f, 0.75483960f, 0.60653067f, 0.45783335f, 0.32465246f, 0.21626517f, 
-    0.13117145f, 0.20961139f, 0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, 
-    0.85534531f, 0.93941307f, 0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, 
-    0.58786964f, 0.44374731f, 0.31466395f, 0.20961139f, 0.11943297f, 0.19085334f, 
-    0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, 0.77880079f, 0.85534531f, 
-    0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, 0.53526145f, 0.40403652f, 
-    0.28650481f, 0.19085334f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f, 
-    0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f, 
-    0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f, 
-    0.08208500f, 0.13117145f, 0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, 
-    0.53526145f, 0.58786964f, 0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, 
-    0.36787945f, 0.27768996f, 0.19691168f, 0.13117145f, 0.06196101f, 0.09901341f, 
-    0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, 0.40403652f, 0.44374731f, 
-    0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, 0.27768996f, 0.20961139f, 
-    0.14863673f, 0.09901341f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f, 
-    0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f, 
-    0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f, 
-    0.02926831f, 0.04677062f, 0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, 
-    0.19085334f, 0.20961139f, 0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, 
-    0.13117145f, 0.09901341f, 0.07021102f, 0.04677062f, 
-    /* interp_weight lut */
-    0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f, 
-    0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f, 
-    0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f, 0.01171875f, 0.03515625f, 
-    0.05859375f, 0.08203125f, 0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, 
-    0.17578125f, 0.15234375f, 0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, 
-    0.03515625f, 0.01171875f, 0.01953125f, 0.05859375f, 0.09765625f, 0.13671875f, 
-    0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, 0.29296875f, 0.25390625f, 
-    0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, 0.05859375f, 0.01953125f, 
-    0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f, 
-    0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f, 
-    0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.03515625f, 0.10546875f, 
-    0.17578125f, 0.24609375f, 0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, 
-    0.52734375f, 0.45703125f, 0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, 
-    0.10546875f, 0.03515625f, 0.04296875f, 0.12890625f, 0.21484375f, 0.30078125f, 
-    0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, 0.64453125f, 0.55859375f, 
-    0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, 0.12890625f, 0.04296875f, 
-    0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f, 
-    0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f, 
-    0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.05859375f, 0.17578125f, 
-    0.29296875f, 0.41015625f, 0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, 
-    0.87890625f, 0.76171875f, 0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, 
-    0.17578125f, 0.05859375f, 0.05859375f, 0.17578125f, 0.29296875f, 0.41015625f, 
-    0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, 0.87890625f, 0.76171875f, 
-    0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, 0.17578125f, 0.05859375f, 
-    0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f, 
-    0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f, 
-    0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.04296875f, 0.12890625f, 
-    0.21484375f, 0.30078125f, 0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, 
-    0.64453125f, 0.55859375f, 0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, 
-    0.12890625f, 0.04296875f, 0.03515625f, 0.10546875f, 0.17578125f, 0.24609375f, 
-    0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, 0.52734375f, 0.45703125f, 
-    0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, 0.10546875f, 0.03515625f, 
-    0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f, 
-    0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f, 
-    0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.01953125f, 0.05859375f, 
-    0.09765625f, 0.13671875f, 0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, 
-    0.29296875f, 0.25390625f, 0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, 
-    0.05859375f, 0.01953125f, 0.01171875f, 0.03515625f, 0.05859375f, 0.08203125f, 
-    0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, 0.17578125f, 0.15234375f, 
-    0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, 0.03515625f, 0.01171875f, 
-    0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f, 
-    0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f, 
-    0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f
-};
 namespace cv
 {
@@ -180,7 +88,7 @@ namespace cv
                                      int nblocks_win_x, int nblocks_win_y);
                void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
-                                   int height, int width, float sigma, const cv::ocl::oclMat &grad,
+                                   int height, int width, const cv::ocl::oclMat &grad,
                                   const cv::ocl::oclMat &qangle, 
                                   const cv::ocl::oclMat &gauss_w_lut, cv::ocl::oclMat &block_hists);
@@ -328,10 +236,18 @@ void cv::ocl::HOGDescriptor::init_buffer(const oclMat &img, Size win_stride)
    Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
    labels.create(1, wins_per_img.area(), CV_8U);
-    vector<float> v_lut = vector<float>(gaussian_interp_lut, gaussian_interp_lut + 
+    float sigma = getWinSigma();
-        sizeof(gaussian_interp_lut) / sizeof(gaussian_interp_lut[0]));
+    float scale = 1.f / (2.f * sigma * sigma);
-    Mat m_lut(v_lut);
+    Mat gaussian_lut(1, 512, CV_32FC1);
-    gauss_w_lut.upload(m_lut.reshape(1,1));
+    int idx = 0;
+    for(int i=-8; i<8; i++)
+        for(int j=-8; j<8; j++)
+            gaussian_lut.at<float>(idx++) = std::exp(-(j * j + i * i) * scale);
+    for(int i=-8; i<8; i++)
+        for(int j=-8; j<8; j++)
+            gaussian_lut.at<float>(idx++) = (8.f - fabs(j + 0.5f)) * (8.f - fabs(i + 0.5f)) / 64.f;
+    gauss_w_lut.upload(gaussian_lut);
 }
 void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle)
@@ -358,7 +274,7 @@ void cv::ocl::HOGDescriptor::computeBlockHistograms(const oclMat &img)
    computeGradient(img, this->grad, this->qangle);
    hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height, 
-        effect_size.width, (float)getWinSigma(), grad, qangle, gauss_w_lut, block_hists);
+        effect_size.width, grad, qangle, gauss_w_lut, block_hists);
    hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height, 
        effect_size.width, block_hists, (float)threshold_L2hys);
@@ -1708,7 +1624,7 @@ void cv::ocl::device::hog::set_up_constants(int nbins,
 void cv::ocl::device::hog::compute_hists(int nbins, 
                                         int block_stride_x, int block_stride_y,
-                                         int height, int width, float sigma, 
+                                         int height, int width, 
                                         const cv::ocl::oclMat &grad, 
                                         const cv::ocl::oclMat &qangle, 
                                         const cv::ocl::oclMat &gauss_w_lut, 
@@ -1716,8 +1632,7 @@ void cv::ocl::device::hog::compute_hists(int nbins,
 {
    Context *clCxt = Context::getContext();
    vector< pair<size_t, const void *> > args;
-    string kernelName = (sigma == 4.0f) ? "compute_hists_lut_kernel" : 
+    string kernelName = "compute_hists_lut_kernel";
-        "compute_hists_kernel";
    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) 
        / block_stride_x;
@@ -1728,9 +1643,6 @@ void cv::ocl::device::hog::compute_hists(int nbins,
    int grad_quadstep = grad.step >> 2;
    int qangle_step = qangle.step;
-    // Precompute gaussian spatial window parameter
-    float scale = 1.f / (2.f * sigma * sigma);
    int blocks_in_group = 4;
    size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
    size_t globalThreads[3] = { 
@@ -1751,15 +1663,23 @@ void cv::ocl::device::hog::compute_hists(int nbins,
    args.push_back( make_pair( sizeof(cl_int), (void *)&qangle_step));
    args.push_back( make_pair( sizeof(cl_mem), (void *)&grad.data));
    args.push_back( make_pair( sizeof(cl_mem), (void *)&qangle.data));
-    if (kernelName.compare("compute_hists_lut_kernel") == 0)
    args.push_back( make_pair( sizeof(cl_mem), (void *)&gauss_w_lut.data));
-    else
-        args.push_back( make_pair( sizeof(cl_float), (void *)&scale));
    args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
    args.push_back( make_pair( smem, (void *)NULL));
+    if(hog_device_cpu)
+    {
        openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
-        localThreads, args, -1, -1);
+            localThreads, args, -1, -1, "-D CPU");
+    }else
+    {
+        cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName);
+        int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
+        char opt[32] = {0};
+        sprintf(opt, "-D WAVE_SIZE=%d", wave_size);
+        openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+            localThreads, args, -1, -1, opt);
+    }
 }
 void cv::ocl::device::hog::normalize_hists(int nbins, 

--- a/modules/ocl/src/opencl/objdetect_hog.cl
+++ b/modules/ocl/src/opencl/objdetect_hog.cl
@@ -53,7 +53,7 @@
 //----------------------------------------------------------------------------
 // Histogram computation
 // 12 threads for a cell, 12x4 threads per block
-// Use pre-computed gaussian and interp_weight lookup tables if sigma is 4.0f
+// Use pre-computed gaussian and interp_weight lookup tables
 __kernel void compute_hists_lut_kernel(
    const int cblock_stride_x, const int cblock_stride_y,
    const int cnbins, const int cblock_hist_size, const int img_block_width, 
@@ -146,99 +146,6 @@ __kernel void compute_hists_lut_kernel(
    }
 }
-//----------------------------------------------------------------------------
-// Histogram computation
-// 12 threads for a cell, 12x4 threads per block
-__kernel void compute_hists_kernel(
-    const int cblock_stride_x, const int cblock_stride_y,
-    const int cnbins, const int cblock_hist_size, const int img_block_width, 
-    const int blocks_in_group, const int blocks_total,
-    const int grad_quadstep, const int qangle_step,
-    __global const float* grad, __global const uchar* qangle,
-    const float scale, __global float* block_hists, __local float* smem)
-{
-    const int lx = get_local_id(0);
-    const int lp = lx / 24; /* local group id */
-    const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
-    const int gidY = gid / img_block_width;
-    const int gidX = gid - gidY * img_block_width;
-    const int lidX = lx - lp * 24;
-    const int lidY = get_local_id(1);
-    const int cell_x = lidX / 12;
-    const int cell_y = lidY;
-    const int cell_thread_x = lidX - cell_x * 12;
-    __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X * 
-        CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
-    __local float* final_hist = hists + cnbins * 
-        (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
-    const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
-    const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
-    __global const float* grad_ptr = (gid < blocks_total) ? 
-        grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
-    __global const uchar* qangle_ptr = (gid < blocks_total) ?
-        qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
-    __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + 
-        cell_thread_x;
-    for (int bin_id = 0; bin_id < cnbins; ++bin_id)
-        hist[bin_id * 48] = 0.f;
-    const int dist_x = -4 + cell_thread_x - 4 * cell_x;
-    const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
-    const int dist_y_begin = -4 - 4 * lidY;
-    for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
-    {
-        float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
-        uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
-        grad_ptr += grad_quadstep;
-        qangle_ptr += qangle_step;
-        int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
-        float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x * 
-            dist_center_x) * scale);
-        float interp_weight = (8.f - fabs(dist_y + 0.5f)) * 
-            (8.f - fabs(dist_x + 0.5f)) / 64.f;
-        hist[bin.x * 48] += gaussian * interp_weight * vote.x;
-        hist[bin.y * 48] += gaussian * interp_weight * vote.y;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    volatile __local float* hist_ = hist;
-    for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
-    {
-        if (cell_thread_x < 6)
-            hist_[0] += hist_[6];
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if (cell_thread_x < 3)
-            hist_[0] += hist_[3];
-#ifdef CPU
-        barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-        if (cell_thread_x == 0)
-            final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = 
-                hist_[0] + hist_[1] + hist_[2];
-    }
-#ifdef CPU
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-    int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
-    if ((tid < cblock_hist_size) && (gid < blocks_total))
-    {
-        __global float* block_hist = block_hists + 
-            (gidY * img_block_width + gidX) * cblock_hist_size;
-        block_hist[tid] = final_hist[tid];
-    }
-}
 //-------------------------------------------------------------
 //  Normalization of histograms via L2Hys_norm
 //  optimized for the case of 9 bins

--- a/modules/ocl/src/opencl/pyrlk.cl
+++ b/modules/ocl/src/opencl/pyrlk.cl
@@ -17,6 +17,7 @@
 // @Authors
 //    Dachuan Zhao, dachuan@multicorewareinc.com
 //    Yao Wang, bitwangyaoyao@gmail.com
+//    Xiaopeng Fu, fuxiaopeng2222@163.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -47,6 +48,7 @@
 //#pragma OPENCL EXTENSION cl_amd_printf : enable
 #define	BUFFER	64
+#define	BUFFER2	(BUFFER>>1)	// half of BUFFER; parenthesized so the shift cannot re-associate inside a larger expression
 #ifndef WAVE_SIZE
 #define WAVE_SIZE 1
 #endif
@@ -58,53 +60,16 @@ void reduce3(float val1, float val2, float val3,  __local float* smem1,  __local
    smem3[tid] = val3;
    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 32)
+    for(int i = BUFFER2; i > 0; i >>= 1)
-    {
-        smem1[tid] += smem1[tid + 32];
-        smem2[tid] += smem2[tid + 32];
-        smem3[tid] += smem3[tid + 32];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 16)
-    {
-        smem1[tid] += smem1[tid + 16];
-        smem2[tid] += smem2[tid + 16];
-        smem3[tid] += smem3[tid + 16];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 8)
-    {
-        smem1[tid] += smem1[tid + 8];
-        smem2[tid] += smem2[tid + 8];
-        smem3[tid] += smem3[tid + 8];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 4)
    {
-        smem1[tid] += smem1[tid + 4];
+        if(tid < i)
-        smem2[tid] += smem2[tid + 4];
-        smem3[tid] += smem3[tid + 4];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 2)
        {
-        smem1[tid] += smem1[tid + 2];
+            smem1[tid] += smem1[tid + i];
-        smem2[tid] += smem2[tid + 2];
+            smem2[tid] += smem2[tid + i];
-        smem3[tid] += smem3[tid + 2];
+            smem3[tid] += smem3[tid + i];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 1)
-    {
-        smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
-        smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
-        smem3[BUFFER] = smem3[tid] + smem3[tid + 1];
    }
-    barrier(CLK_LOCAL_MEM_FENCE);
 }
 void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid)
@@ -113,47 +78,15 @@ void reduce2(float val1, float val2, volatile __local float* smem1, volatile __l
    smem2[tid] = val2;
    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 32)
+    for(int i = BUFFER2; i > 0; i >>= 1)
-    {
-        smem1[tid] += smem1[tid + 32];
-        smem2[tid] += smem2[tid + 32];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 16)
-    {
-        smem1[tid] += smem1[tid + 16];
-        smem2[tid] += smem2[tid + 16];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 8)
-    {
-        smem1[tid] += smem1[tid + 8];
-        smem2[tid] += smem2[tid + 8];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 4)
    {
-        smem1[tid] += smem1[tid + 4];
+        if(tid < i)
-        smem2[tid] += smem2[tid + 4];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 2)
        {
-        smem1[tid] += smem1[tid + 2];
+            smem1[tid] += smem1[tid + i];
-        smem2[tid] += smem2[tid + 2];
+            smem2[tid] += smem2[tid + i];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 1)
-    {
-        smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
-        smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
    }
-    barrier(CLK_LOCAL_MEM_FENCE);
 }
 void reduce1(float val1, volatile __local float* smem1, int tid)
@@ -161,45 +94,18 @@ void reduce1(float val1, volatile __local float* smem1, int tid)
    smem1[tid] = val1;
    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 32)
+    for(int i = BUFFER2; i > 0; i >>= 1)
-    {
-        smem1[tid] += smem1[tid + 32];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 16)
-    {
-        smem1[tid] += smem1[tid + 16];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 8)
    {
-        smem1[tid] += smem1[tid + 8];
+        if(tid < i)
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 4)
        {
-        smem1[tid] += smem1[tid + 4];
+            smem1[tid] += smem1[tid + i];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 2)
-    {
-        smem1[tid] += smem1[tid + 2];
    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 1)
-    {
-        smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
 }
 #else
 void reduce3(float val1, float val2, float val3,
-__local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)
+             __local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)
 {
    smem1[tid] = val1;
    smem2[tid] = val2;
@@ -212,15 +118,19 @@ __local volatile float* smem1, __local volatile float* smem2, __local volatile f
        smem2[tid] += smem2[tid + 32];
        smem3[tid] += smem3[tid + 32];
 #if WAVE_SIZE < 32
-	} barrier(CLK_LOCAL_MEM_FENCE);
+    }
-	if (tid < 16) {
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16)
+    {
 #endif
        smem1[tid] += smem1[tid + 16];
        smem2[tid] += smem2[tid + 16];
        smem3[tid] += smem3[tid + 16];
 #if WAVE_SIZE <16
-	} barrier(CLK_LOCAL_MEM_FENCE);
+    }
-	if (tid < 8) {
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 8)
+    {
 #endif
        smem1[tid] += smem1[tid + 8];
        smem2[tid] += smem2[tid + 8];
@@ -238,6 +148,7 @@ __local volatile float* smem1, __local volatile float* smem2, __local volatile f
        smem2[tid] += smem2[tid + 1];
        smem3[tid] += smem3[tid + 1];
    }
+    barrier(CLK_LOCAL_MEM_FENCE);
 }
 void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)
@@ -251,14 +162,18 @@ void reduce2(float val1, float val2, __local volatile float* smem1, __local vola
        smem1[tid] += smem1[tid + 32];
        smem2[tid] += smem2[tid + 32];
 #if WAVE_SIZE < 32
-	} barrier(CLK_LOCAL_MEM_FENCE);
+    }
-	if (tid < 16) {
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16)
+    {
 #endif
        smem1[tid] += smem1[tid + 16];
        smem2[tid] += smem2[tid + 16];
 #if WAVE_SIZE <16
-	} barrier(CLK_LOCAL_MEM_FENCE);
+    }
-	if (tid < 8) {
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 8)
+    {
 #endif
        smem1[tid] += smem1[tid + 8];
        smem2[tid] += smem2[tid + 8];
@@ -272,6 +187,7 @@ void reduce2(float val1, float val2, __local volatile float* smem1, __local vola
        smem1[tid] += smem1[tid + 1];
        smem2[tid] += smem2[tid + 1];
    }
+    barrier(CLK_LOCAL_MEM_FENCE);
 }
 void reduce1(float val1, __local volatile float* smem1, int tid)
@@ -283,19 +199,24 @@ void reduce1(float val1, __local volatile float* smem1, int tid)
    {
        smem1[tid] += smem1[tid + 32];
 #if WAVE_SIZE < 32
-	} barrier(CLK_LOCAL_MEM_FENCE);
+    }
-	if (tid < 16) {
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16)
+    {
 #endif
        smem1[tid] += smem1[tid + 16];
 #if WAVE_SIZE <16
-	} barrier(CLK_LOCAL_MEM_FENCE);
+    }
-	if (tid < 8) {
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 8)
+    {
 #endif
        smem1[tid] += smem1[tid + 8];
        smem1[tid] += smem1[tid + 4];
        smem1[tid] += smem1[tid + 2];
        smem1[tid] += smem1[tid + 1];
    }
+    barrier(CLK_LOCAL_MEM_FENCE);
 }
 #endif
@@ -388,15 +309,9 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
                             __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
                             const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
 {
-#ifdef CPU
-    __local float smem1[BUFFER+1];
-    __local float smem2[BUFFER+1];
-    __local float smem3[BUFFER+1];
-#else
    __local float smem1[BUFFER];
    __local float smem2[BUFFER];
    __local float smem3[BUFFER];
-#endif
    unsigned int xid=get_local_id(0);
    unsigned int yid=get_local_id(1);
@@ -492,17 +407,11 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
    }
    reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
-    barrier(CLK_LOCAL_MEM_FENCE);
-#ifdef CPU
-    A11 = smem1[BUFFER];
-    A12 = smem2[BUFFER];
-    A22 = smem3[BUFFER];
-#else
    A11 = smem1[0];
    A12 = smem2[0];
    A22 = smem3[0];
-#endif
+    barrier(CLK_LOCAL_MEM_FENCE);
    float D = A11 * A22 - A12 * A12;
@@ -592,15 +501,10 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
        }
        reduce2(b1, b2, smem1, smem2, tid);
-        barrier(CLK_LOCAL_MEM_FENCE);
-#ifdef CPU
-        b1 = smem1[BUFFER];
-        b2 = smem2[BUFFER];
-#else
        b1 = smem1[0];
        b2 = smem2[0];
-#endif
+        barrier(CLK_LOCAL_MEM_FENCE);
        float2 delta;
        delta.x = A12 * b2 - A22 * b1;
@@ -675,11 +579,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
        nextPts[gid] = prevPt;
        if (calcErr)
-#ifdef CPU
-            err[gid] = smem1[BUFFER] / (float)(c_winSize_x * c_winSize_y);
-#else
            err[gid] = smem1[0] / (float)(c_winSize_x * c_winSize_y);
-#endif
    }
 }
@@ -688,15 +588,9 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
                             __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
                             const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
 {
-#ifdef CPU
-     __local float smem1[BUFFER+1];
-     __local float smem2[BUFFER+1];
-     __local float smem3[BUFFER+1];
-#else
    __local float smem1[BUFFER];
    __local float smem2[BUFFER];
    __local float smem3[BUFFER];
-#endif
    unsigned int xid=get_local_id(0);
    unsigned int yid=get_local_id(1);
@@ -795,17 +689,11 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
    }
    reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
-    barrier(CLK_LOCAL_MEM_FENCE);
-#ifdef CPU
-    A11 = smem1[BUFFER];
-    A12 = smem2[BUFFER];
-    A22 = smem3[BUFFER];
-#else
    A11 = smem1[0];
    A12 = smem2[0];
    A22 = smem3[0];
-#endif
+    barrier(CLK_LOCAL_MEM_FENCE);
    float D = A11 * A22 - A12 * A12;
@@ -895,15 +783,10 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
        }
        reduce2(b1, b2, smem1, smem2, tid);
-        barrier(CLK_LOCAL_MEM_FENCE);
-#ifdef CPU
-        b1 = smem1[BUFFER];
-        b2 = smem2[BUFFER];
-#else
        b1 = smem1[0];
        b2 = smem2[0];
-#endif
+        barrier(CLK_LOCAL_MEM_FENCE);
        float2 delta;
        delta.x = A12 * b2 - A22 * b1;
@@ -977,11 +860,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
        nextPts[gid] = nextPt;
        if (calcErr)
-#ifdef CPU
-            err[gid] = smem1[BUFFER] / (float)(3 * c_winSize_x * c_winSize_y);
-#else
            err[gid] = smem1[0] / (float)(3 * c_winSize_x * c_winSize_y);
-#endif
    }
 }