Commit 36028bd8 authored by Andrey Kamaev, committed by OpenCV Buildbot

Merge pull request #808 from bitwangyaoyao:2.4_mac

parents d2de68c1 719e8674
...@@ -77,7 +77,7 @@ namespace cv ...@@ -77,7 +77,7 @@ namespace cv
size_t wave_size = 0; size_t wave_size = 0;
queryDeviceInfo(WAVEFRONT_SIZE, &wave_size); queryDeviceInfo(WAVEFRONT_SIZE, &wave_size);
std::sprintf(pSURF_OPTIONS, " -D WAVE_SIZE=%d", static_cast<int>(wave_size)); std::sprintf(pSURF_OPTIONS, "-D WAVE_SIZE=%d", static_cast<int>(wave_size));
OPTION_INIT = true; OPTION_INIT = true;
} }
openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, SURF_OPTIONS); openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, SURF_OPTIONS);
......
...@@ -277,8 +277,7 @@ static void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, ...@@ -277,8 +277,7 @@ static void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
char compile_option[128]; char compile_option[128];
sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D ERODE %s %s", sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D ERODE %s %s",
anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1], anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1],
rectKernel?"-D RECTKERNEL":"", s, rectKernel?"-D RECTKERNEL":"");
s);
vector< pair<size_t, const void *> > args; vector< pair<size_t, const void *> > args;
args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data)); args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data)); args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
......
...@@ -330,16 +330,14 @@ __kernel void arithm_flip_cols_C1_D0 (__global uchar *src, int src_step, int src ...@@ -330,16 +330,14 @@ __kernel void arithm_flip_cols_C1_D0 (__global uchar *src, int src_step, int src
if (x < thread_cols && y < rows) if (x < thread_cols && y < rows)
{ {
int src_index_0 = mad24(y, src_step, (x) + src_offset); int src_index_0 = mad24(y, src_step, (x) + src_offset);
int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset); int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset);
uchar data0 = *(src + src_index_0); uchar data0 = *(src + src_index_0);
uchar data1 = *(src + src_index_1); *(dst + dst_index_1) = data0;
int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
uchar data1 = *(src + src_index_1);
*(dst + dst_index_0) = data1; *(dst + dst_index_0) = data1;
*(dst + dst_index_1) = data0;
} }
} }
__kernel void arithm_flip_cols_C1_D1 (__global char *src, int src_step, int src_offset, __kernel void arithm_flip_cols_C1_D1 (__global char *src, int src_step, int src_offset,
......
This diff is collapsed.
...@@ -114,7 +114,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x ...@@ -114,7 +114,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
int groupX_size = get_local_size(0); int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0); int groupX_id = get_group_id(0);
#define dst_align (dst_offset_x & 3) #define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX; int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY; int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
...@@ -125,7 +125,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x ...@@ -125,7 +125,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
{ {
if((rows_start_index - src_offset_y) + i < rows + ANY) if((rows_start_index - src_offset_y) + i < rows + ANY)
{ {
#ifdef BORDER_CONSTANT #ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i; int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX; int selected_cols = cols_start_index_group + lX;
...@@ -143,7 +143,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x ...@@ -143,7 +143,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
data = con ? data : 0; data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
} }
#else #else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
...@@ -162,7 +162,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x ...@@ -162,7 +162,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
data = *(src + selected_row * src_step + selected_cols); data = *(src + selected_row * src_step + selected_cols);
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
} }
#endif #endif
} }
} }
} }
...@@ -185,7 +185,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x ...@@ -185,7 +185,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
for(int i = 0; i < ANCHOR; i++) for(int i = 0; i < ANCHOR; i++)
{ {
#pragma unroll 3 #pragma unroll 3
for(int j = 0; j < ANCHOR; j++) for(int j = 0; j < ANCHOR; j++)
{ {
if(dst_rows_index < dst_rows_end) if(dst_rows_index < dst_rows_end)
...@@ -225,7 +225,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x ...@@ -225,7 +225,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
int groupX_size = get_local_size(0); int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0); int groupX_id = get_group_id(0);
#define dst_align (dst_offset_x & 3) #define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX; int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY; int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
...@@ -236,7 +236,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x ...@@ -236,7 +236,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
{ {
if((rows_start_index - src_offset_y) + i < rows + ANY) if((rows_start_index - src_offset_y) + i < rows + ANY)
{ {
#ifdef BORDER_CONSTANT #ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i; int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX; int selected_cols = cols_start_index_group + lX;
...@@ -254,7 +254,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x ...@@ -254,7 +254,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
data = con ? data : 0; data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
} }
#else #else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
...@@ -272,7 +272,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x ...@@ -272,7 +272,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2))); data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
} }
#endif #endif
} }
} }
} }
...@@ -295,7 +295,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x ...@@ -295,7 +295,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
for(int i = 0; i < ANCHOR; i++) for(int i = 0; i < ANCHOR; i++)
{ {
#pragma unroll 3 #pragma unroll 3
for(int j = 0; j < ANCHOR; j++) for(int j = 0; j < ANCHOR; j++)
{ {
if(dst_rows_index < dst_rows_end) if(dst_rows_index < dst_rows_end)
...@@ -304,7 +304,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x ...@@ -304,7 +304,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols); data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
sum = sum + (mat_kernel[i * ANCHOR + j] * data); sum = sum + ((float)(mat_kernel[i * ANCHOR + j]) * data);
} }
} }
} }
...@@ -337,7 +337,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ ...@@ -337,7 +337,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int groupX_size = get_local_size(0); int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0); int groupX_id = get_group_id(0);
#define dst_align (dst_offset_x & 3) #define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX; int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY; int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
...@@ -349,7 +349,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ ...@@ -349,7 +349,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
{ {
if((rows_start_index - src_offset_y) + i < rows + ANY) if((rows_start_index - src_offset_y) + i < rows + ANY)
{ {
#ifdef BORDER_CONSTANT #ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i; int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX; int selected_cols = cols_start_index_group + lX;
...@@ -367,7 +367,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ ...@@ -367,7 +367,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
data = con ? data : 0; data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
} }
#else #else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
...@@ -386,7 +386,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ ...@@ -386,7 +386,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2))); data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
} }
#endif #endif
} }
} }
} }
...@@ -410,7 +410,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ ...@@ -410,7 +410,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
for(int i = 0; i < ANCHOR; i++) for(int i = 0; i < ANCHOR; i++)
{ {
#pragma unroll 3 #pragma unroll 3
for(int j = 0; j < ANCHOR; j++) for(int j = 0; j < ANCHOR; j++)
{ {
if(dst_rows_index < dst_rows_end) if(dst_rows_index < dst_rows_end)
...@@ -468,7 +468,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_ ...@@ -468,7 +468,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
{ {
if((rows_start_index - src_offset_y) + i < rows + ANY) if((rows_start_index - src_offset_y) + i < rows + ANY)
{ {
#ifdef BORDER_CONSTANT #ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i; int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX; int selected_cols = cols_start_index_group + lX;
...@@ -486,7 +486,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_ ...@@ -486,7 +486,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
data = con ? data : 0; data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
} }
#else #else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
...@@ -504,7 +504,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_ ...@@ -504,7 +504,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4))); data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] =data; local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] =data;
} }
#endif #endif
} }
} }
} }
...@@ -522,7 +522,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_ ...@@ -522,7 +522,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
for(int j = 0; j < ANCHOR; j++) for(int j = 0; j < ANCHOR; j++)
{ {
int local_cols = lX + j; int local_cols = lX + j;
sum = sum + mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols]; sum = sum + ((float)mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols]);
} }
} }
......
...@@ -44,7 +44,11 @@ ...@@ -44,7 +44,11 @@
//M*/ //M*/
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif #endif
#define LSIZE 256 #define LSIZE 256
#define LSIZE_1 255 #define LSIZE_1 255
...@@ -71,13 +75,13 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float ...@@ -71,13 +75,13 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
gid = gid << 1; gid = gid << 1;
for(int i = 0; i < rows; i =i + LSIZE_1) for(int i = 0; i < rows; i =i + LSIZE_1)
{ {
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0); src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : 0);
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0); src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : 0);
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
int bf_loc = lid + GET_CONFLICT_OFFSET(lid); int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
...@@ -127,7 +131,8 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float ...@@ -127,7 +131,8 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
if(lid > 0 && (i+lid) <= rows){ if(lid > 0 && (i+lid) <= rows)
{
lm_sum[0][bf_loc] += sum_t[0]; lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1]; lm_sum[1][bf_loc] += sum_t[1];
lm_sqsum[0][bf_loc] += sqsum_t[0]; lm_sqsum[0][bf_loc] += sqsum_t[0];
...@@ -169,15 +174,15 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo ...@@ -169,15 +174,15 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
src_step = src_step >> 4; src_step = src_step >> 4;
for(int i = 0; i < rows; i =i + LSIZE_1) for(int i = 0; i < rows; i =i + LSIZE_1)
{ {
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0; src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (int4)0;
sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : 0; sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0;
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0; src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (int4)0;
sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : 0; sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
int bf_loc = lid + GET_CONFLICT_OFFSET(lid); int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
...@@ -235,7 +240,7 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo ...@@ -235,7 +240,7 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
{ {
int loc0 = gid * 2 * sum_step; int loc0 = gid * 2 * sum_step;
int loc1 = gid * 2 * sqsum_step; int loc1 = gid * 2 * sqsum_step;
for(int k = 1;k <= 8;k++) for(int k = 1; k <= 8; k++)
{ {
if(gid * 8 + k > cols) break; if(gid * 8 + k > cols) break;
sum[sum_offset + loc0 + k * sum_step / 4] = 0; sum[sum_offset + loc0 + k * sum_step / 4] = 0;
...@@ -244,7 +249,8 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo ...@@ -244,7 +249,8 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
} }
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ; int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ;
if(lid > 0 && (i+lid) <= rows){ if(lid > 0 && (i+lid) <= rows)
{
lm_sum[0][bf_loc] += sum_t[0]; lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1]; lm_sum[1][bf_loc] += sum_t[1];
lm_sqsum[0][bf_loc] += sqsum_t[0]; lm_sqsum[0][bf_loc] += sqsum_t[0];
......
...@@ -47,8 +47,12 @@ ...@@ -47,8 +47,12 @@
//warpAffine kernel //warpAffine kernel
//support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic. //support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic.
#if defined DOUBLE_SUPPORT #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
typedef double F; typedef double F;
typedef double4 F4; typedef double4 F4;
#define convert_F4 convert_double4 #define convert_F4 convert_double4
...@@ -58,7 +62,6 @@ typedef float4 F4; ...@@ -58,7 +62,6 @@ typedef float4 F4;
#define convert_F4 convert_float4 #define convert_F4 convert_float4
#endif #endif
#define INTER_BITS 5 #define INTER_BITS 5
#define INTER_TAB_SIZE (1 << INTER_BITS) #define INTER_TAB_SIZE (1 << INTER_BITS)
#define INTER_SCALE 1.f/INTER_TAB_SIZE #define INTER_SCALE 1.f/INTER_TAB_SIZE
...@@ -123,7 +126,7 @@ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global u ...@@ -123,7 +126,7 @@ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global u
sval.s1 = scon.s1 ? src[spos.s1] : 0; sval.s1 = scon.s1 ? src[spos.s1] : 0;
sval.s2 = scon.s2 ? src[spos.s2] : 0; sval.s2 = scon.s2 ? src[spos.s2] : 0;
sval.s3 = scon.s3 ? src[spos.s3] : 0; sval.s3 = scon.s3 ? src[spos.s3] : 0;
dval = convert_uchar4(dcon != 0) ? sval : dval; dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval;
*d = dval; *d = dval;
} }
} }
...@@ -206,10 +209,10 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob ...@@ -206,10 +209,10 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob
taby = INTER_SCALE * convert_float4(ay); taby = INTER_SCALE * convert_float4(ay);
tabx = INTER_SCALE * convert_float4(ax); tabx = INTER_SCALE * convert_float4(ax);
itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE ));
itab1 = convert_short4_sat(( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE )); itab1 = convert_short4_sat(( (1.0f-taby)*tabx * (float4)INTER_REMAP_COEF_SCALE ));
itab2 = convert_short4_sat(( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); itab2 = convert_short4_sat(( taby*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE ));
itab3 = convert_short4_sat(( taby*tabx * INTER_REMAP_COEF_SCALE )); itab3 = convert_short4_sat(( taby*tabx * (float4)INTER_REMAP_COEF_SCALE ));
int4 val; int4 val;
...@@ -636,7 +639,7 @@ __kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, i ...@@ -636,7 +639,7 @@ __kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, i
short sy0 = (short)(Y0 >> AB_BITS); short sy0 = (short)(Y0 >> AB_BITS);
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>4)+sy0*(srcStep>>2)+sx0] : 0; dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>4)+sy0*(srcStep>>2)+sx0] : (float4)0;
} }
} }
...@@ -670,10 +673,10 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds ...@@ -670,10 +673,10 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds
float4 v0, v1, v2, v3; float4 v0, v1, v2, v3;
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0; v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0;
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0; v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0;
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0; v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0;
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0; v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0;
float tab[4]; float tab[4];
float taby[2], tabx[2]; float taby[2], tabx[2];
...@@ -726,7 +729,7 @@ __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst ...@@ -726,7 +729,7 @@ __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst
int i; int i;
for(i=0; i<16; i++) for(i=0; i<16; i++)
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0;
float tab[16]; float tab[16];
float tab1y[4], tab1x[4]; float tab1y[4], tab1x[4];
......
...@@ -47,8 +47,12 @@ ...@@ -47,8 +47,12 @@
//wrapPerspective kernel //wrapPerspective kernel
//support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic. //support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic.
#if defined DOUBLE_SUPPORT #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
typedef double F; typedef double F;
typedef double4 F4; typedef double4 F4;
#define convert_F4 convert_double4 #define convert_F4 convert_double4
...@@ -112,7 +116,7 @@ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __glo ...@@ -112,7 +116,7 @@ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __glo
sval.s1 = scon.s1 ? src[spos.s1] : 0; sval.s1 = scon.s1 ? src[spos.s1] : 0;
sval.s2 = scon.s2 ? src[spos.s2] : 0; sval.s2 = scon.s2 ? src[spos.s2] : 0;
sval.s3 = scon.s3 ? src[spos.s3] : 0; sval.s3 = scon.s3 ? src[spos.s3] : 0;
dval = convert_uchar4(dcon != 0) ? sval : dval; dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval;
*d = dval; *d = dval;
} }
} }
...@@ -142,7 +146,7 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _ ...@@ -142,7 +146,7 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _
int i; int i;
#pragma unroll 4 #pragma unroll 4
for(i=0; i<4; i++) for(i=0; i<4; i++)
v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : 0; v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : (uchar)0;
short itab[4]; short itab[4];
float tab1y[2], tab1x[2]; float tab1y[2], tab1x[2];
...@@ -197,7 +201,7 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * ...@@ -197,7 +201,7 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar *
for(i=0; i<4; i++) for(i=0; i<4; i++)
for(j=0; j<4; j++) for(j=0; j<4; j++)
{ {
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0; v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : (uchar)0;
} }
short itab[16]; short itab[16];
...@@ -299,10 +303,10 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, ...@@ -299,10 +303,10 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src,
int4 v0, v1, v2, v3; int4 v0, v1, v2, v3;
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : 0; v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : (int4)0;
v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : 0; v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : (int4)0;
v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : 0; v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : (int4)0;
v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : 0; v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : (int4)0;
int itab0, itab1, itab2, itab3; int itab0, itab1, itab2, itab3;
float taby, tabx; float taby, tabx;
...@@ -458,10 +462,10 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * ...@@ -458,10 +462,10 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float *
float v0, v1, v2, v3; float v0, v1, v2, v3;
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : 0; v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : (float)0;
v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : 0; v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : (float)0;
v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : 0; v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : (float)0;
v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : 0; v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : (float)0;
float tab[4]; float tab[4];
float taby[2], tabx[2]; float taby[2], tabx[2];
...@@ -510,7 +514,7 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * ...@@ -510,7 +514,7 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float *
int i; int i;
for(i=0; i<16; i++) for(i=0; i<16; i++)
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float)0;
float tab[16]; float tab[16];
float tab1y[4], tab1x[4]; float tab1y[4], tab1x[4];
...@@ -564,7 +568,7 @@ __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * d ...@@ -564,7 +568,7 @@ __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * d
short sy = (short)Y; short sy = (short)Y;
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : 0; dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : (float)0;
} }
} }
...@@ -597,10 +601,10 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 ...@@ -597,10 +601,10 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4
float4 v0, v1, v2, v3; float4 v0, v1, v2, v3;
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0; v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0;
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0; v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0;
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0; v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0;
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0; v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0;
float tab[4]; float tab[4];
float taby[2], tabx[2]; float taby[2], tabx[2];
...@@ -652,7 +656,7 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 ...@@ -652,7 +656,7 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4
int i; int i;
for(i=0; i<16; i++) for(i=0; i<16; i++)
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0;
float tab[16]; float tab[16];
float tab1y[4], tab1x[4]; float tab1y[4], tab1x[4];
...@@ -682,3 +686,4 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 ...@@ -682,3 +686,4 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4
} }
} }
} }
...@@ -447,10 +447,10 @@ void matchTemplate_Naive_CCORR_C1_D0 ...@@ -447,10 +447,10 @@ void matchTemplate_Naive_CCORR_C1_D0
__global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); __global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
for(j = 0; j < tpl_cols; j ++) for(j = 0; j < tpl_cols; j ++)
{ {
sum = mad24(img_ptr[j], tpl_ptr[j], sum); sum = mad24(convert_int(img_ptr[j]), convert_int(tpl_ptr[j]), sum);
} }
} }
res[res_idx] = sum; res[res_idx] = (float)sum;
} }
} }
...@@ -548,7 +548,7 @@ void matchTemplate_Naive_CCORR_C4_D0 ...@@ -548,7 +548,7 @@ void matchTemplate_Naive_CCORR_C4_D0
sum = mad24(convert_int4(img_ptr[j]), convert_int4(tpl_ptr[j]), sum); sum = mad24(convert_int4(img_ptr[j]), convert_int4(tpl_ptr[j]), sum);
} }
} }
res[res_idx] = sum.x + sum.y + sum.z + sum.w; res[res_idx] = (float)(sum.x + sum.y + sum.z + sum.w);
} }
} }
...@@ -633,9 +633,8 @@ void matchTemplate_Prepared_CCOFF_C1_D0 ...@@ -633,9 +633,8 @@ void matchTemplate_Prepared_CCOFF_C1_D0
if(gidx < res_cols && gidy < res_rows) if(gidx < res_cols && gidy < res_rows)
{ {
float sum = (float)( float sum = (float)((img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
(img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) -(img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
- (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
res[res_idx] -= sum * tpl_sum; res[res_idx] -= sum * tpl_sum;
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment