generating the lut table instead of hard coding one

270b2c79 · yao · f1060ac5 · 270b2c79 · 270b2c79
Commit 270b2c79 authored Jul 26, 2013 by yao
Expand all Hide whitespace changes
Inline Side-by-side

Showing with 1 addition and 94 deletions

hog.cpp modules/ocl/src/hog.cpp +0 -0

objdetect_hog.cl modules/ocl/src/opencl/objdetect_hog.cl +1 -94

No files found.
--- a/modules/ocl/src/hog.cpp
+++ b/modules/ocl/src/hog.cpp
--- a/modules/ocl/src/opencl/objdetect_hog.cl
+++ b/modules/ocl/src/opencl/objdetect_hog.cl
@@ -53,7 +53,7 @@
 //----------------------------------------------------------------------------
 // Histogram computation
 // 12 threads for a cell, 12x4 threads per block
-// Use pre-computed gaussian and interp_weight lookup tables if sigma is 4.0f
+// Use pre-computed gaussian and interp_weight lookup tables
 __kernel void compute_hists_lut_kernel(
    const int cblock_stride_x, const int cblock_stride_y,
    const int cnbins, const int cblock_hist_size, const int img_block_width, 
@@ -146,99 +146,6 @@ __kernel void compute_hists_lut_kernel(
    }
 }

-//----------------------------------------------------------------------------
-// Histogram computation
-// 12 threads for a cell, 12x4 threads per block
-__kernel void compute_hists_kernel(
-    const int cblock_stride_x, const int cblock_stride_y,
-    const int cnbins, const int cblock_hist_size, const int img_block_width, 
-    const int blocks_in_group, const int blocks_total,
-    const int grad_quadstep, const int qangle_step,
-    __global const float* grad, __global const uchar* qangle,
-    const float scale, __global float* block_hists, __local float* smem)
-{
-    const int lx = get_local_id(0);
-    const int lp = lx / 24; /* local group id */
-    const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
-    const int gidY = gid / img_block_width;
-    const int gidX = gid - gidY * img_block_width;
-
-    const int lidX = lx - lp * 24;
-    const int lidY = get_local_id(1);
-
-    const int cell_x = lidX / 12;
-    const int cell_y = lidY;
-    const int cell_thread_x = lidX - cell_x * 12;
-
-    __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X * 
-        CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
-    __local float* final_hist = hists + cnbins * 
-        (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
-
-    const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
-    const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
-
-    __global const float* grad_ptr = (gid < blocks_total) ? 
-        grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
-    __global const uchar* qangle_ptr = (gid < blocks_total) ?
-        qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
-
-    __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + 
-        cell_thread_x;
-    for (int bin_id = 0; bin_id < cnbins; ++bin_id)
-        hist[bin_id * 48] = 0.f;
-
-    const int dist_x = -4 + cell_thread_x - 4 * cell_x;
-    const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
-
-    const int dist_y_begin = -4 - 4 * lidY;
-    for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
-    {
-        float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
-        uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
-
-        grad_ptr += grad_quadstep;
-        qangle_ptr += qangle_step;
-
-        int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
-
-        float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x * 
-            dist_center_x) * scale);
-        float interp_weight = (8.f - fabs(dist_y + 0.5f)) * 
-            (8.f - fabs(dist_x + 0.5f)) / 64.f;
-
-        hist[bin.x * 48] += gaussian * interp_weight * vote.x;
-        hist[bin.y * 48] += gaussian * interp_weight * vote.y;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    volatile __local float* hist_ = hist;
-    for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
-    {
-        if (cell_thread_x < 6)
-            hist_[0] += hist_[6];
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if (cell_thread_x < 3)
-            hist_[0] += hist_[3];
-#ifdef CPU
-        barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-        if (cell_thread_x == 0)
-            final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = 
-                hist_[0] + hist_[1] + hist_[2];
-    }
-#ifdef CPU
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-    int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
-    if ((tid < cblock_hist_size) && (gid < blocks_total))
-    {
-        __global float* block_hist = block_hists + 
-            (gidY * img_block_width + gidX) * cblock_hist_size;
-        block_hist[tid] = final_hist[tid];
-    }
-}
-
 //-------------------------------------------------------------
 //  Normalization of histograms via L2Hys_norm
 //  optimized for the case of 9 bins