Commit 17ffb288 authored by Alexander Alekhin

Merge pull request #7602 from mshabunin:fix-opencl-warnings

parents f1d93cb2 3e28d517
...@@ -82,10 +82,10 @@ inline float3 sobel(int idx, __local const floatN *smem) ...@@ -82,10 +82,10 @@ inline float3 sobel(int idx, __local const floatN *smem)
// result: x, y, mag // result: x, y, mag
float3 res; float3 res;
floatN dx = fma(2, smem[idx + GRP_SIZEX + 6] - smem[idx + GRP_SIZEX + 4], floatN dx = fma((floatN)2, smem[idx + GRP_SIZEX + 6] - smem[idx + GRP_SIZEX + 4],
smem[idx + 2] - smem[idx] + smem[idx + 2 * GRP_SIZEX + 10] - smem[idx + 2 * GRP_SIZEX + 8]); smem[idx + 2] - smem[idx] + smem[idx + 2 * GRP_SIZEX + 10] - smem[idx + 2 * GRP_SIZEX + 8]);
floatN dy = fma(2, smem[idx + 1] - smem[idx + 2 * GRP_SIZEX + 9], floatN dy = fma((floatN)2, smem[idx + 1] - smem[idx + 2 * GRP_SIZEX + 9],
smem[idx + 2] - smem[idx + 2 * GRP_SIZEX + 10] + smem[idx] - smem[idx + 2 * GRP_SIZEX + 8]); smem[idx + 2] - smem[idx + 2 * GRP_SIZEX + 10] + smem[idx] - smem[idx + 2 * GRP_SIZEX + 8]);
#ifdef L2GRAD #ifdef L2GRAD
......
...@@ -49,21 +49,21 @@ ...@@ -49,21 +49,21 @@
#if depth == 0 #if depth == 0
#define DATA_TYPE uchar #define DATA_TYPE uchar
#define MAX_NUM 255 #define MAX_NUM 255
#define HALF_MAX 128 #define HALF_MAX_NUM 128
#define COEFF_TYPE int #define COEFF_TYPE int
#define SAT_CAST(num) convert_uchar_sat(num) #define SAT_CAST(num) convert_uchar_sat(num)
#define DEPTH_0 #define DEPTH_0
#elif depth == 2 #elif depth == 2
#define DATA_TYPE ushort #define DATA_TYPE ushort
#define MAX_NUM 65535 #define MAX_NUM 65535
#define HALF_MAX 32768 #define HALF_MAX_NUM 32768
#define COEFF_TYPE int #define COEFF_TYPE int
#define SAT_CAST(num) convert_ushort_sat(num) #define SAT_CAST(num) convert_ushort_sat(num)
#define DEPTH_2 #define DEPTH_2
#elif depth == 5 #elif depth == 5
#define DATA_TYPE float #define DATA_TYPE float
#define MAX_NUM 1.0f #define MAX_NUM 1.0f
#define HALF_MAX 0.5f #define HALF_MAX_NUM 0.5f
#define COEFF_TYPE float #define COEFF_TYPE float
#define SAT_CAST(num) (num) #define SAT_CAST(num) (num)
#define DEPTH_5 #define DEPTH_5
...@@ -229,11 +229,11 @@ __kernel void RGB2YUV(__global const uchar* srcptr, int src_step, int src_offset ...@@ -229,11 +229,11 @@ __kernel void RGB2YUV(__global const uchar* srcptr, int src_step, int src_offset
#ifdef DEPTH_5 #ifdef DEPTH_5
__constant float * coeffs = c_RGB2YUVCoeffs_f; __constant float * coeffs = c_RGB2YUVCoeffs_f;
const DATA_TYPE Y = fma(b, coeffs[0], fma(g, coeffs[1], r * coeffs[2])); const DATA_TYPE Y = fma(b, coeffs[0], fma(g, coeffs[1], r * coeffs[2]));
const DATA_TYPE U = fma(b - Y, coeffs[3], HALF_MAX); const DATA_TYPE U = fma(b - Y, coeffs[3], HALF_MAX_NUM);
const DATA_TYPE V = fma(r - Y, coeffs[4], HALF_MAX); const DATA_TYPE V = fma(r - Y, coeffs[4], HALF_MAX_NUM);
#else #else
__constant int * coeffs = c_RGB2YUVCoeffs_i; __constant int * coeffs = c_RGB2YUVCoeffs_i;
const int delta = HALF_MAX * (1 << yuv_shift); const int delta = HALF_MAX_NUM * (1 << yuv_shift);
const int Y = CV_DESCALE(mad24(b, coeffs[0], mad24(g, coeffs[1], mul24(r, coeffs[2]))), yuv_shift); const int Y = CV_DESCALE(mad24(b, coeffs[0], mad24(g, coeffs[1], mul24(r, coeffs[2]))), yuv_shift);
const int U = CV_DESCALE(mad24(b - Y, coeffs[3], delta), yuv_shift); const int U = CV_DESCALE(mad24(b - Y, coeffs[3], delta), yuv_shift);
const int V = CV_DESCALE(mad24(r - Y, coeffs[4], delta), yuv_shift); const int V = CV_DESCALE(mad24(r - Y, coeffs[4], delta), yuv_shift);
...@@ -278,14 +278,14 @@ __kernel void YUV2RGB(__global const uchar* srcptr, int src_step, int src_offset ...@@ -278,14 +278,14 @@ __kernel void YUV2RGB(__global const uchar* srcptr, int src_step, int src_offset
#ifdef DEPTH_5 #ifdef DEPTH_5
__constant float * coeffs = c_YUV2RGBCoeffs_f; __constant float * coeffs = c_YUV2RGBCoeffs_f;
float r = fma(V - HALF_MAX, coeffs[3], Y); float r = fma(V - HALF_MAX_NUM, coeffs[3], Y);
float g = fma(V - HALF_MAX, coeffs[2], fma(U - HALF_MAX, coeffs[1], Y)); float g = fma(V - HALF_MAX_NUM, coeffs[2], fma(U - HALF_MAX_NUM, coeffs[1], Y));
float b = fma(U - HALF_MAX, coeffs[0], Y); float b = fma(U - HALF_MAX_NUM, coeffs[0], Y);
#else #else
__constant int * coeffs = c_YUV2RGBCoeffs_i; __constant int * coeffs = c_YUV2RGBCoeffs_i;
const int r = Y + CV_DESCALE(mul24(V - HALF_MAX, coeffs[3]), yuv_shift); const int r = Y + CV_DESCALE(mul24(V - HALF_MAX_NUM, coeffs[3]), yuv_shift);
const int g = Y + CV_DESCALE(mad24(V - HALF_MAX, coeffs[2], mul24(U - HALF_MAX, coeffs[1])), yuv_shift); const int g = Y + CV_DESCALE(mad24(V - HALF_MAX_NUM, coeffs[2], mul24(U - HALF_MAX_NUM, coeffs[1])), yuv_shift);
const int b = Y + CV_DESCALE(mul24(U - HALF_MAX, coeffs[0]), yuv_shift); const int b = Y + CV_DESCALE(mul24(U - HALF_MAX_NUM, coeffs[0]), yuv_shift);
#endif #endif
dst[bidx] = SAT_CAST( b ); dst[bidx] = SAT_CAST( b );
...@@ -328,8 +328,8 @@ __kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_of ...@@ -328,8 +328,8 @@ __kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_of
float Y3 = ysrc[src_step]; float Y3 = ysrc[src_step];
float Y4 = ysrc[src_step + 1]; float Y4 = ysrc[src_step + 1];
float U = ((float)usrc[uidx]) - HALF_MAX; float U = ((float)usrc[uidx]) - HALF_MAX_NUM;
float V = ((float)usrc[1-uidx]) - HALF_MAX; float V = ((float)usrc[1-uidx]) - HALF_MAX_NUM;
__constant float* coeffs = c_YUV2RGBCoeffs_420; __constant float* coeffs = c_YUV2RGBCoeffs_420;
float ruv = fma(coeffs[4], V, 0.5f); float ruv = fma(coeffs[4], V, 0.5f);
...@@ -373,6 +373,8 @@ __kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_of ...@@ -373,6 +373,8 @@ __kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_of
} }
} }
#if uidx < 2
__kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset, __kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset,
__global uchar* dstptr, int dst_step, int dt_offset, __global uchar* dstptr, int dst_step, int dt_offset,
int rows, int cols) int rows, int cols)
...@@ -399,12 +401,12 @@ __kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int ...@@ -399,12 +401,12 @@ __kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int
#ifdef SRC_CONT #ifdef SRC_CONT
__global const uchar* uvsrc = srcptr + mad24(rows, src_step, src_offset); __global const uchar* uvsrc = srcptr + mad24(rows, src_step, src_offset);
int u_ind = mad24(y, cols >> 1, x); int u_ind = mad24(y, cols >> 1, x);
float uv[2] = { ((float)uvsrc[u_ind]) - HALF_MAX, ((float)uvsrc[u_ind + ((rows * cols) >> 2)]) - HALF_MAX }; float uv[2] = { ((float)uvsrc[u_ind]) - HALF_MAX_NUM, ((float)uvsrc[u_ind + ((rows * cols) >> 2)]) - HALF_MAX_NUM };
#else #else
int vsteps[2] = { cols >> 1, src_step - (cols >> 1)}; int vsteps[2] = { cols >> 1, src_step - (cols >> 1)};
__global const uchar* usrc = srcptr + mad24(rows + (y>>1), src_step, src_offset + (y%2)*(cols >> 1) + x); __global const uchar* usrc = srcptr + mad24(rows + (y>>1), src_step, src_offset + (y%2)*(cols >> 1) + x);
__global const uchar* vsrc = usrc + mad24(rows >> 2, src_step, rows % 4 ? vsteps[y%2] : 0); __global const uchar* vsrc = usrc + mad24(rows >> 2, src_step, rows % 4 ? vsteps[y%2] : 0);
float uv[2] = { ((float)usrc[0]) - HALF_MAX, ((float)vsrc[0]) - HALF_MAX }; float uv[2] = { ((float)usrc[0]) - HALF_MAX_NUM, ((float)vsrc[0]) - HALF_MAX_NUM };
#endif #endif
float U = uv[uidx]; float U = uv[uidx];
float V = uv[1-uidx]; float V = uv[1-uidx];
...@@ -451,6 +453,10 @@ __kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int ...@@ -451,6 +453,10 @@ __kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int
} }
} }
#endif
#if uidx < 2
__constant float c_RGB2YUVCoeffs_420[8] = { 0.256999969f, 0.50399971f, 0.09799957f, -0.1479988098f, -0.2909994125f, __constant float c_RGB2YUVCoeffs_420[8] = { 0.256999969f, 0.50399971f, 0.09799957f, -0.1479988098f, -0.2909994125f,
0.438999176f, -0.3679990768f, -0.0709991455f }; 0.438999176f, -0.3679990768f, -0.0709991455f };
...@@ -556,6 +562,8 @@ __kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int ...@@ -556,6 +562,8 @@ __kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int
} }
} }
#endif
__kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_offset, __kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_offset,
__global uchar* dstptr, int dst_step, int dst_offset, __global uchar* dstptr, int dst_step, int dst_offset,
int rows, int cols) int rows, int cols)
...@@ -576,15 +584,15 @@ __kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_of ...@@ -576,15 +584,15 @@ __kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_of
__constant float* coeffs = c_YUV2RGBCoeffs_420; __constant float* coeffs = c_YUV2RGBCoeffs_420;
#ifndef USE_OPTIMIZED_LOAD #ifndef USE_OPTIMIZED_LOAD
float U = ((float) src[uidx]) - HALF_MAX; float U = ((float) src[uidx]) - HALF_MAX_NUM;
float V = ((float) src[(2 + uidx) % 4]) - HALF_MAX; float V = ((float) src[(2 + uidx) % 4]) - HALF_MAX_NUM;
float y00 = max(0.f, ((float) src[yidx]) - 16.f) * coeffs[0]; float y00 = max(0.f, ((float) src[yidx]) - 16.f) * coeffs[0];
float y01 = max(0.f, ((float) src[yidx + 2]) - 16.f) * coeffs[0]; float y01 = max(0.f, ((float) src[yidx + 2]) - 16.f) * coeffs[0];
#else #else
int load_src = *((__global int*) src); int load_src = *((__global int*) src);
float vec_src[4] = { load_src & 0xff, (load_src >> 8) & 0xff, (load_src >> 16) & 0xff, (load_src >> 24) & 0xff}; float vec_src[4] = { load_src & 0xff, (load_src >> 8) & 0xff, (load_src >> 16) & 0xff, (load_src >> 24) & 0xff};
float U = vec_src[uidx] - HALF_MAX; float U = vec_src[uidx] - HALF_MAX_NUM;
float V = vec_src[(2 + uidx) % 4] - HALF_MAX; float V = vec_src[(2 + uidx) % 4] - HALF_MAX_NUM;
float y00 = max(0.f, vec_src[yidx] - 16.f) * coeffs[0]; float y00 = max(0.f, vec_src[yidx] - 16.f) * coeffs[0];
float y01 = max(0.f, vec_src[yidx + 2] - 16.f) * coeffs[0]; float y01 = max(0.f, vec_src[yidx + 2] - 16.f) * coeffs[0];
#endif #endif
...@@ -644,11 +652,11 @@ __kernel void RGB2YCrCb(__global const uchar* srcptr, int src_step, int src_offs ...@@ -644,11 +652,11 @@ __kernel void RGB2YCrCb(__global const uchar* srcptr, int src_step, int src_offs
#ifdef DEPTH_5 #ifdef DEPTH_5
__constant float * coeffs = c_RGB2YCrCbCoeffs_f; __constant float * coeffs = c_RGB2YCrCbCoeffs_f;
DATA_TYPE Y = fma(b, coeffs[2], fma(g, coeffs[1], r * coeffs[0])); DATA_TYPE Y = fma(b, coeffs[2], fma(g, coeffs[1], r * coeffs[0]));
DATA_TYPE Cr = fma(r - Y, coeffs[3], HALF_MAX); DATA_TYPE Cr = fma(r - Y, coeffs[3], HALF_MAX_NUM);
DATA_TYPE Cb = fma(b - Y, coeffs[4], HALF_MAX); DATA_TYPE Cb = fma(b - Y, coeffs[4], HALF_MAX_NUM);
#else #else
__constant int * coeffs = c_RGB2YCrCbCoeffs_i; __constant int * coeffs = c_RGB2YCrCbCoeffs_i;
int delta = HALF_MAX * (1 << yuv_shift); int delta = HALF_MAX_NUM * (1 << yuv_shift);
int Y = CV_DESCALE(mad24(b, coeffs[2], mad24(g, coeffs[1], mul24(r, coeffs[0]))), yuv_shift); int Y = CV_DESCALE(mad24(b, coeffs[2], mad24(g, coeffs[1], mul24(r, coeffs[0]))), yuv_shift);
int Cr = CV_DESCALE(mad24(r - Y, coeffs[3], delta), yuv_shift); int Cr = CV_DESCALE(mad24(r - Y, coeffs[3], delta), yuv_shift);
int Cb = CV_DESCALE(mad24(b - Y, coeffs[4], delta), yuv_shift); int Cb = CV_DESCALE(mad24(b - Y, coeffs[4], delta), yuv_shift);
...@@ -694,14 +702,14 @@ __kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset, ...@@ -694,14 +702,14 @@ __kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset,
#ifdef DEPTH_5 #ifdef DEPTH_5
__constant float * coeff = c_YCrCb2RGBCoeffs_f; __constant float * coeff = c_YCrCb2RGBCoeffs_f;
float r = fma(coeff[0], cr - HALF_MAX, yp); float r = fma(coeff[0], cr - HALF_MAX_NUM, yp);
float g = fma(coeff[1], cr - HALF_MAX, fma(coeff[2], cb - HALF_MAX, yp)); float g = fma(coeff[1], cr - HALF_MAX_NUM, fma(coeff[2], cb - HALF_MAX_NUM, yp));
float b = fma(coeff[3], cb - HALF_MAX, yp); float b = fma(coeff[3], cb - HALF_MAX_NUM, yp);
#else #else
__constant int * coeff = c_YCrCb2RGBCoeffs_i; __constant int * coeff = c_YCrCb2RGBCoeffs_i;
int r = yp + CV_DESCALE(coeff[0] * (cr - HALF_MAX), yuv_shift); int r = yp + CV_DESCALE(coeff[0] * (cr - HALF_MAX_NUM), yuv_shift);
int g = yp + CV_DESCALE(mad24(coeff[1], cr - HALF_MAX, coeff[2] * (cb - HALF_MAX)), yuv_shift); int g = yp + CV_DESCALE(mad24(coeff[1], cr - HALF_MAX_NUM, coeff[2] * (cb - HALF_MAX_NUM)), yuv_shift);
int b = yp + CV_DESCALE(coeff[3] * (cb - HALF_MAX), yuv_shift); int b = yp + CV_DESCALE(coeff[3] * (cb - HALF_MAX_NUM), yuv_shift);
#endif #endif
dstptr[(bidx^2)] = SAT_CAST(r); dstptr[(bidx^2)] = SAT_CAST(r);
...@@ -1564,9 +1572,9 @@ __kernel void RGBA2mRGBA(__global const uchar* src, int src_step, int src_offset ...@@ -1564,9 +1572,9 @@ __kernel void RGBA2mRGBA(__global const uchar* src, int src_step, int src_offset
uchar4 src_pix = *(__global const uchar4 *)(src + src_index); uchar4 src_pix = *(__global const uchar4 *)(src + src_index);
*(__global uchar4 *)(dst + dst_index) = *(__global uchar4 *)(dst + dst_index) =
(uchar4)(mad24(src_pix.x, src_pix.w, HALF_MAX) / MAX_NUM, (uchar4)(mad24(src_pix.x, src_pix.w, HALF_MAX_NUM) / MAX_NUM,
mad24(src_pix.y, src_pix.w, HALF_MAX) / MAX_NUM, mad24(src_pix.y, src_pix.w, HALF_MAX_NUM) / MAX_NUM,
mad24(src_pix.z, src_pix.w, HALF_MAX) / MAX_NUM, src_pix.w); mad24(src_pix.z, src_pix.w, HALF_MAX_NUM) / MAX_NUM, src_pix.w);
++y; ++y;
dst_index += dst_step; dst_index += dst_step;
......
...@@ -80,7 +80,7 @@ __kernel void fill_accum_global(__global const uchar * list_ptr, int list_step, ...@@ -80,7 +80,7 @@ __kernel void fill_accum_global(__global const uchar * list_ptr, int list_step,
const int x = (val & 0xFFFF); const int x = (val & 0xFFFF);
const int y = (val >> 16) & 0xFFFF; const int y = (val >> 16) & 0xFFFF;
int r = convert_int_rte(mad(x, cosVal, y * sinVal)) + shift; int r = convert_int_rte(mad((float)x, cosVal, y * sinVal)) + shift;
atomic_inc(accum + r + 1); atomic_inc(accum + r + 1);
} }
} }
...@@ -117,7 +117,7 @@ __kernel void fill_accum_local(__global const uchar * list_ptr, int list_step, i ...@@ -117,7 +117,7 @@ __kernel void fill_accum_local(__global const uchar * list_ptr, int list_step, i
const int x = (point & 0xFFFF); const int x = (point & 0xFFFF);
const int y = point >> 16; const int y = point >> 16;
int r = convert_int_rte(mad(x, cosVal, y * sinVal)) + shift; int r = convert_int_rte(mad((float)x, cosVal, y * sinVal)) + shift;
atomic_inc(l_accum + r + 1); atomic_inc(l_accum + r + 1);
} }
...@@ -186,7 +186,7 @@ __kernel void get_lines(__global const uchar * accum_ptr, int accum_step, int ac ...@@ -186,7 +186,7 @@ __kernel void get_lines(__global const uchar * accum_ptr, int accum_step, int ac
if (y < accum_rows-2) if (y < accum_rows-2)
{ {
__global uchar* accum = accum_ptr + mad24(y+1, accum_step, mad24(x+1, (int) sizeof(int), accum_offset)); __global const uchar* accum = accum_ptr + mad24(y+1, accum_step, mad24(x+1, (int) sizeof(int), accum_offset));
__global int4* lines = (__global int4*)(lines_ptr + lines_offset); __global int4* lines = (__global int4*)(lines_ptr + lines_offset);
__global int* lines_index = lines_index_ptr + 1; __global int* lines_index = lines_index_ptr + 1;
......
...@@ -125,7 +125,7 @@ kernel void integral_sum_rows(__global const uchar *buf_ptr, int buf_step, int b ...@@ -125,7 +125,7 @@ kernel void integral_sum_rows(__global const uchar *buf_ptr, int buf_step, int b
sumT accum = 0; sumT accum = 0;
#ifdef SUM_SQUARE #ifdef SUM_SQUARE
__global sumSQT *dst_sq = (__global sumT *)(dst_sq_ptr + dst_sq_offset); __global sumSQT *dst_sq = (__global sumSQT *)(dst_sq_ptr + dst_sq_offset);
for (int xin = x; xin < cols; xin += gs) for (int xin = x; xin < cols; xin += gs)
{ {
dst_sq[xin] = 0; dst_sq[xin] = 0;
......
...@@ -465,10 +465,10 @@ __kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int s ...@@ -465,10 +465,10 @@ __kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int s
T value_sum = sum[mad24(t_rows, step, t_cols)] - sum[mad24(t_rows, step, 0)] - sum[t_cols] + sum[0]; T value_sum = sum[mad24(t_rows, step, t_cols)] - sum[mad24(t_rows, step, 0)] - sum[t_cols] + sum[0];
T value_sqsum = sqsum[mad24(t_rows, step, t_cols)] - sqsum[mad24(t_rows, step, 0)] - sqsum[t_cols] + sqsum[0]; T value_sqsum = sqsum[mad24(t_rows, step, t_cols)] - sqsum[mad24(t_rows, step, 0)] - sqsum[t_cols] + sqsum[0];
float num = convertToDT(mad(value_sum, template_sum, 0)); float num = convertToDT(mad(value_sum, template_sum, (float)0));
value_sqsum -= weight * value_sum * value_sum; value_sqsum -= weight * value_sum * value_sum;
float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), 0)); float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), (float)0));
int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));
__global float * dstult = (__global float *)(dst+dst_idx); __global float * dstult = (__global float *)(dst+dst_idx);
...@@ -509,7 +509,7 @@ __kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int s ...@@ -509,7 +509,7 @@ __kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int s
float num = convertToDT(mad(value_sum, temp_sum, 0)); float num = convertToDT(mad(value_sum, temp_sum, 0));
value_sqsum -= weight * value_sum * value_sum; value_sqsum -= weight * value_sum * value_sum;
float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), 0)); float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), (float)0));
int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));
__global float * dstult = (__global float *)(dst+dst_idx); __global float * dstult = (__global float *)(dst+dst_idx);
...@@ -549,7 +549,7 @@ __kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int s ...@@ -549,7 +549,7 @@ __kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int s
float num = convertToDT(mad(value_sum, temp_sum, 0)); float num = convertToDT(mad(value_sum, temp_sum, 0));
value_sqsum -= weight * value_sum * value_sum; value_sqsum -= weight * value_sum * value_sum;
float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), 0)); float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), (float)0));
int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));
__global float * dstult = (__global float *)(dst+dst_idx); __global float * dstult = (__global float *)(dst+dst_idx);
......
...@@ -148,6 +148,7 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, ...@@ -148,6 +148,7 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
if (src_y >= 2 && src_y < src_rows - 4) if (src_y >= 2 && src_y < src_rows - 4)
{ {
#undef EXTRAPOLATE_
#define EXTRAPOLATE_(val, maxVal) val #define EXTRAPOLATE_(val, maxVal) val
#if kercn == 1 #if kercn == 1
col = EXTRAPOLATE(x, src_cols); col = EXTRAPOLATE(x, src_cols);
...@@ -180,6 +181,7 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, ...@@ -180,6 +181,7 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
} }
else // need extrapolate y else // need extrapolate y
{ {
#undef EXTRAPOLATE_
#define EXTRAPOLATE_(val, maxVal) EXTRAPOLATE(val, maxVal) #define EXTRAPOLATE_(val, maxVal) EXTRAPOLATE(val, maxVal)
#if kercn == 1 #if kercn == 1
col = EXTRAPOLATE(x, src_cols); col = EXTRAPOLATE(x, src_cols);
......
...@@ -414,8 +414,8 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src ...@@ -414,8 +414,8 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
#if defined BORDER_CONSTANT #if defined BORDER_CONSTANT
float xf = map1[0], yf = map2[0]; float xf = map1[0], yf = map2[0];
int sx = convert_int_sat_rtz(mad(xf, INTER_TAB_SIZE, 0.5f)) >> INTER_BITS; int sx = convert_int_sat_rtz(mad(xf, (float)INTER_TAB_SIZE, 0.5f)) >> INTER_BITS;
int sy = convert_int_sat_rtz(mad(yf, INTER_TAB_SIZE, 0.5f)) >> INTER_BITS; int sy = convert_int_sat_rtz(mad(yf, (float)INTER_TAB_SIZE, 0.5f)) >> INTER_BITS;
__constant float * coeffs_x = coeffs + ((convert_int_rte(xf * INTER_TAB_SIZE) & (INTER_TAB_SIZE - 1)) << 1); __constant float * coeffs_x = coeffs + ((convert_int_rte(xf * INTER_TAB_SIZE) & (INTER_TAB_SIZE - 1)) << 1);
__constant float * coeffs_y = coeffs + ((convert_int_rte(yf * INTER_TAB_SIZE) & (INTER_TAB_SIZE - 1)) << 1); __constant float * coeffs_y = coeffs + ((convert_int_rte(yf * INTER_TAB_SIZE) & (INTER_TAB_SIZE - 1)) << 1);
......
...@@ -104,8 +104,8 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of ...@@ -104,8 +104,8 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy, dst_index += dst_step) for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy, dst_index += dst_step)
{ {
int X0 = X0_ + rint(fma(M[1], dy, M[2]) * AB_SCALE) + round_delta; int X0 = X0_ + rint(fma(M[1], (CT)dy, M[2]) * AB_SCALE) + round_delta;
int Y0 = Y0_ + rint(fma(M[4], dy, M[5]) * AB_SCALE) + round_delta; int Y0 = Y0_ + rint(fma(M[4], (CT)dy, M[5]) * AB_SCALE) + round_delta;
short sx = convert_short_sat(X0 >> AB_BITS); short sx = convert_short_sat(X0 >> AB_BITS);
short sy = convert_short_sat(Y0 >> AB_BITS); short sy = convert_short_sat(Y0 >> AB_BITS);
...@@ -146,8 +146,8 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of ...@@ -146,8 +146,8 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy) for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy)
{ {
int X0 = X0_ + rint(fma(M[1], dy, M[2]) * AB_SCALE) + ROUND_DELTA; int X0 = X0_ + rint(fma(M[1], (CT)dy, M[2]) * AB_SCALE) + ROUND_DELTA;
int Y0 = Y0_ + rint(fma(M[4], dy, M[5]) * AB_SCALE) + ROUND_DELTA; int Y0 = Y0_ + rint(fma(M[4], (CT)dy, M[5]) * AB_SCALE) + ROUND_DELTA;
X0 = X0 >> (AB_BITS - INTER_BITS); X0 = X0 >> (AB_BITS - INTER_BITS);
Y0 = Y0 >> (AB_BITS - INTER_BITS); Y0 = Y0 >> (AB_BITS - INTER_BITS);
...@@ -274,8 +274,8 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of ...@@ -274,8 +274,8 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
if (dx < dst_cols && dy < dst_rows) if (dx < dst_cols && dy < dst_rows)
{ {
int tmp = (dx << AB_BITS); int tmp = (dx << AB_BITS);
int X0 = rint(M[0] * tmp) + rint(fma(M[1], dy, M[2]) * AB_SCALE) + ROUND_DELTA; int X0 = rint(M[0] * tmp) + rint(fma(M[1], (CT)dy, M[2]) * AB_SCALE) + ROUND_DELTA;
int Y0 = rint(M[3] * tmp) + rint(fma(M[4], dy, M[5]) * AB_SCALE) + ROUND_DELTA; int Y0 = rint(M[3] * tmp) + rint(fma(M[4], (CT)dy, M[5]) * AB_SCALE) + ROUND_DELTA;
X0 = X0 >> (AB_BITS - INTER_BITS); X0 = X0 >> (AB_BITS - INTER_BITS);
Y0 = Y0 >> (AB_BITS - INTER_BITS); Y0 = Y0 >> (AB_BITS - INTER_BITS);
......
...@@ -180,11 +180,11 @@ void runHaarClassifier( ...@@ -180,11 +180,11 @@ void runHaarClassifier(
int4 ofs = f->ofs[0]; int4 ofs = f->ofs[0];
sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x; sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
ofs = f->ofs[1]; ofs = f->ofs[1];
sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval); sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
if( weight.z > 0 ) if( weight.z > 0 )
{ {
ofs = f->ofs[2]; ofs = f->ofs[2];
sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval); sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
} }
s += (sval < st.y*nf) ? st.z : st.w; s += (sval < st.y*nf) ? st.z : st.w;
...@@ -204,11 +204,11 @@ void runHaarClassifier( ...@@ -204,11 +204,11 @@ void runHaarClassifier(
sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x; sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
ofs = f->ofs[1]; ofs = f->ofs[1];
sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval); sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
if( weight.z > 0 ) if( weight.z > 0 )
{ {
ofs = f->ofs[2]; ofs = f->ofs[2];
sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval); sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
} }
idx = (sval < as_float(n.y)*nf) ? n.z : n.w; idx = (sval < as_float(n.y)*nf) ? n.z : n.w;
...@@ -281,12 +281,12 @@ void runHaarClassifier( ...@@ -281,12 +281,12 @@ void runHaarClassifier(
int4 ofs = f->ofs[0]; int4 ofs = f->ofs[0];
float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x; float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
ofs = f->ofs[1]; ofs = f->ofs[1];
sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval); sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
//if( weight.z > 0 ) //if( weight.z > 0 )
if( fabs(weight.z) > 0 ) if( fabs(weight.z) > 0 )
{ {
ofs = f->ofs[2]; ofs = f->ofs[2];
sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval); sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
} }
partsum += (sval < st.y*nf) ? st.z : st.w; partsum += (sval < st.y*nf) ? st.z : st.w;
...@@ -304,11 +304,11 @@ void runHaarClassifier( ...@@ -304,11 +304,11 @@ void runHaarClassifier(
float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x; float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
ofs = f->ofs[1]; ofs = f->ofs[1];
sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval); sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
if( weight.z > 0 ) if( weight.z > 0 )
{ {
ofs = f->ofs[2]; ofs = f->ofs[2];
sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval); sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
} }
idx = (sval < as_float(n.y)*nf) ? n.z : n.w; idx = (sval < as_float(n.y)*nf) ? n.z : n.w;
......
...@@ -148,7 +148,7 @@ __kernel void warpBackwardKernel(__global const float* I0, int I0_step, int I0_c ...@@ -148,7 +148,7 @@ __kernel void warpBackwardKernel(__global const float* I0, int I0_step, int I0_c
} }
} }
inline float readImage(__global float *image, int x, int y, int rows, int cols, int elemCntPerRow) inline float readImage(__global const float *image, int x, int y, int rows, int cols, int elemCntPerRow)
{ {
int i0 = clamp(x, 0, cols - 1); int i0 = clamp(x, 0, cols - 1);
int j0 = clamp(y, 0, rows - 1); int j0 = clamp(y, 0, rows - 1);
......
...@@ -266,7 +266,7 @@ inline void GetError(image2d_t J, const float x, const float y, const float* Pch ...@@ -266,7 +266,7 @@ inline void GetError(image2d_t J, const float x, const float y, const float* Pch
//macro to read pixel value into local memory. //macro to read pixel value into local memory.
#define READI(_y,_x) IPatchLocal[mad24(mad24((_y), LSy, yid), LM_W, mad24((_x), LSx, xid))] = read_imagef(I, sampler, (float2)(mad((_x), LSx, Point.x + xid - 0.5f), mad((_y), LSy, Point.y + yid - 0.5f))).x; #define READI(_y,_x) IPatchLocal[mad24(mad24((_y), LSy, yid), LM_W, mad24((_x), LSx, xid))] = read_imagef(I, sampler, (float2)(mad((float)(_x), (float)LSx, Point.x + xid - 0.5f), mad((float)(_y), (float)LSy, Point.y + yid - 0.5f))).x;
void ReadPatchIToLocalMem(image2d_t I, float2 Point, local float* IPatchLocal) void ReadPatchIToLocalMem(image2d_t I, float2 Point, local float* IPatchLocal)
{ {
int xid=get_local_id(0); int xid=get_local_id(0);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment