Commit 504bc763 authored by vbystricky

Remove pre_invalid parameter

parent 45f21e4f
...@@ -63,7 +63,7 @@ ...@@ -63,7 +63,7 @@
#if sdepth == 4 #if sdepth == 4
kernel void integral_sum_cols(__global uchar4 *src, __global int *sum, kernel void integral_sum_cols(__global uchar4 *src, __global int *sum,
int src_offset, int pre_invalid, int rows, int cols, int src_step, int dst_step) int src_offset, int rows, int cols, int src_step, int dst_step)
{ {
int lid = get_local_id(0); int lid = get_local_id(0);
int gid = get_group_id(0); int gid = get_group_id(0);
...@@ -122,19 +122,19 @@ kernel void integral_sum_cols(__global uchar4 *src, __global int *sum, ...@@ -122,19 +122,19 @@ kernel void integral_sum_cols(__global uchar4 *src, __global int *sum,
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if(lid > 0 && (i+lid) <= rows) if(lid > 0 && (i+lid) <= rows)
{ {
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; int loc_s0 = gid * dst_step + i + lid - 1, loc_s1 = loc_s0 + dst_step ;
lm_sum[0][bf_loc] += sum_t[0]; lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1]; lm_sum[1][bf_loc] += sum_t[1];
sum_p = (__local int*)(&(lm_sum[0][bf_loc])); sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
for(int k = 0; k < 4; k++) for(int k = 0; k < 4; k++)
{ {
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue; if(gid * 4 + k >= cols) continue;
sum[loc_s0 + k * dst_step / 4] = sum_p[k]; sum[loc_s0 + k * dst_step / 4] = sum_p[k];
} }
sum_p = (__local int*)(&(lm_sum[1][bf_loc])); sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
for(int k = 0; k < 4; k++) for(int k = 0; k < 4; k++)
{ {
if(gid * 4 + k + 4 >= cols + pre_invalid) break; if(gid * 4 + k + 4 >= cols) break;
sum[loc_s1 + k * dst_step / 4] = sum_p[k]; sum[loc_s1 + k * dst_step / 4] = sum_p[k];
} }
} }
...@@ -238,7 +238,7 @@ kernel void integral_sum_rows(__global int4 *srcsum, __global int *sum, ...@@ -238,7 +238,7 @@ kernel void integral_sum_rows(__global int4 *srcsum, __global int *sum,
#elif sdepth == 5 #elif sdepth == 5
kernel void integral_sum_cols(__global uchar4 *src, __global float *sum, kernel void integral_sum_cols(__global uchar4 *src, __global float *sum,
int src_offset, int pre_invalid, int rows, int cols, int src_step, int dst_step) int src_offset, int rows, int cols, int src_step, int dst_step)
{ {
int lid = get_local_id(0); int lid = get_local_id(0);
int gid = get_group_id(0); int gid = get_group_id(0);
...@@ -297,19 +297,19 @@ kernel void integral_sum_cols(__global uchar4 *src, __global float *sum, ...@@ -297,19 +297,19 @@ kernel void integral_sum_cols(__global uchar4 *src, __global float *sum,
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if(lid > 0 && (i+lid) <= rows) if(lid > 0 && (i+lid) <= rows)
{ {
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; int loc_s0 = gid * dst_step + i + lid - 1, loc_s1 = loc_s0 + dst_step ;
lm_sum[0][bf_loc] += sum_t[0]; lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1]; lm_sum[1][bf_loc] += sum_t[1];
sum_p = (__local float*)(&(lm_sum[0][bf_loc])); sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
for(int k = 0; k < 4; k++) for(int k = 0; k < 4; k++)
{ {
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue; if(gid * 4 + k >= cols) continue;
sum[loc_s0 + k * dst_step / 4] = sum_p[k]; sum[loc_s0 + k * dst_step / 4] = sum_p[k];
} }
sum_p = (__local float*)(&(lm_sum[1][bf_loc])); sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
for(int k = 0; k < 4; k++) for(int k = 0; k < 4; k++)
{ {
if(gid * 4 + k + 4 >= cols + pre_invalid) break; if(gid * 4 + k + 4 >= cols) break;
sum[loc_s1 + k * dst_step / 4] = sum_p[k]; sum[loc_s1 + k * dst_step / 4] = sum_p[k];
} }
} }
......
...@@ -254,12 +254,12 @@ static bool ocl_integral( InputArray _src, OutputArray _sum, int sdepth ) ...@@ -254,12 +254,12 @@ static bool ocl_integral( InputArray _src, OutputArray _sum, int sdepth )
UMat src = _src.getUMat(), t_sum(t_size, sdepth), sum = _sum.getUMat(); UMat src = _src.getUMat(), t_sum(t_size, sdepth), sum = _sum.getUMat();
t_sum = t_sum(Range::all(), Range(0, size.height)); t_sum = t_sum(Range::all(), Range(0, size.height));
int offset = (int)src.offset / vlen, pre_invalid = (int)src.offset % vlen; int offset = (int)src.offset / vlen;
int vcols = (pre_invalid + src.cols + vlen - 1) / vlen; int vcols = (src.cols + vlen - 1) / vlen;
int sum_offset = (int)sum.offset / vlen; int sum_offset = (int)sum.offset / vlen;
k1.args(ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(t_sum), k1.args(ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(t_sum),
offset, pre_invalid, src.rows, src.cols, (int)src.step, (int)t_sum.step); offset, src.rows, src.cols, (int)src.step, (int)t_sum.step);
size_t gt = ((vcols + 1) / 2) * 256, lt = 256; size_t gt = ((vcols + 1) / 2) * 256, lt = 256;
if (!k1.run(1, &gt, &lt, false)) if (!k1.run(1, &gt, &lt, false))
return false; return false;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment