Merge pull request #1519 from ilya-lavrenov:ocl_minMax

154fe4f6 · Andrey Pavlenko · OpenCV Buildbot · 8c15d276 · 4322c47b · 154fe4f6
Commit 154fe4f6 authored Oct 01, 2013 by Andrey Pavlenko Committed by OpenCV Buildbot Oct 01, 2013
10 changed files
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@@ -584,7 +584,8 @@ namespace cv

        CV_EXPORTS void cvtColor(const oclMat &src, oclMat &dst, int code , int dcn = 0);

-        CV_EXPORTS void setIdentity(oclMat& src, double val);
+        //! initializes a scaled identity matrix
+        CV_EXPORTS void setIdentity(oclMat& src, const Scalar & val = Scalar(1));

        //////////////////////////////// Filter Engine ////////////////////////////////


--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
--- a/modules/ocl/test/test_norm.cpp
+++ b/modules/ocl/test/test_norm.cpp
@@ -7,12 +7,17 @@
 //  copy or use the software.
 //
 //
-//                        Intel License Agreement
+//                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//
+//
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -21,12 +26,12 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
+//     and/or other materials provided with the distribution.
 //
-//   * The name of Intel Corporation may not be used to endorse or promote products
+//   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -39,25 +44,50 @@
 //
 //M*/

-#include "test_precomp.hpp"
-
-typedef ::testing::TestWithParam<cv::Size> normFixture;
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif

-TEST_P(normFixture, DISABLED_accuracy)
+__kernel void arithm_absdiff_nonsaturate_binary(__global srcT *src1, int src1_step, int src1_offset,
+                         __global srcT *src2, int src2_step, int src2_offset,
+                         __global dstT *dst, int dst_step, int dst_offset,
+                         int cols, int rows) // binary form: dst = |src1 - src2| per element, with the subtraction done in dstT
 {
-    const cv::Size srcSize = GetParam();
+    int x = get_global_id(0); // column handled by this work-item
+    int y = get_global_id(1); // row handled by this work-item

-    cv::Mat src1(srcSize, CV_8UC1), src2(srcSize, CV_8UC1);
-    cv::randu(src1, 0, 2);
-    cv::randu(src2, 0, 2);
+    if (x < cols && y < rows) // ignore work-items that fall outside the cols x rows area
+    {
+        int src1_index = mad24(y, src1_step, x + src1_offset); // y * step + x + offset, used directly as an element index
+        int src2_index = mad24(y, src2_step, x + src2_offset);
+        int dst_index  = mad24(y, dst_step, x + dst_offset);

-    cv::ocl::oclMat oclSrc1(src1), oclSrc2(src2);
+        dstT t0 = convertToDstT(src1[src1_index]); // convertToDstT is not defined in this file; presumably supplied via build options - TODO confirm
+        dstT t1 = convertToDstT(src2[src2_index]);
+        dstT t2 = t0 - t1; // difference is formed after conversion, i.e. in the destination type

-    double value = cv::norm(src1, src2, cv::NORM_INF);
-    double oclValue = cv::ocl::norm(oclSrc1, oclSrc2, cv::NORM_INF);
-
-    ASSERT_EQ(value, oclValue);
+        dst[dst_index] = t2 >= 0 ? t2 : -t2; // absolute value of the difference
+    }
 }

-INSTANTIATE_TEST_CASE_P(oclNormTest, normFixture,
-                        ::testing::Values(cv::Size(500, 500), cv::Size(1000, 1000)));
+__kernel void arithm_absdiff_nonsaturate(__global srcT *src1, int src1_step, int src1_offset,
+                         __global dstT *dst, int dst_step, int dst_offset,
+                         int cols, int rows) // unary form: dst = |src1| per element, evaluated in dstT
+{
+    int x = get_global_id(0); // column handled by this work-item
+    int y = get_global_id(1); // row handled by this work-item
+
+    if (x < cols && y < rows) // ignore work-items that fall outside the cols x rows area
+    {
+        int src1_index = mad24(y, src1_step, x + src1_offset); // y * step + x + offset, used directly as an element index
+        int dst_index  = mad24(y, dst_step, x + dst_offset);
+
+        dstT t0 = convertToDstT(src1[src1_index]); // convertToDstT is not defined in this file; presumably supplied via build options - TODO confirm
+
+        dst[dst_index] = t0 >= 0 ? t0 : -t0; // absolute value of the converted element
+    }
+}
--- a/modules/ocl/src/opencl/arithm_minMax.cl
+++ b/modules/ocl/src/opencl/arithm_minMax.cl
@@ -53,169 +53,117 @@
 #endif
 #endif

-#if defined (DEPTH_0)
-#define VEC_TYPE uchar8
-#define CONVERT_TYPE convert_uchar8
-#define MIN_VAL 0
-#define MAX_VAL 255
-#endif
-#if defined (DEPTH_1)
-#define VEC_TYPE char8
-#define CONVERT_TYPE convert_char8
-#define MIN_VAL -128
-#define MAX_VAL 127
-#endif
-#if defined (DEPTH_2)
-#define VEC_TYPE ushort8
-#define CONVERT_TYPE convert_ushort8
-#define MIN_VAL 0
-#define MAX_VAL 65535
-#endif
-#if defined (DEPTH_3)
-#define VEC_TYPE short8
-#define CONVERT_TYPE convert_short8
-#define MIN_VAL -32768
-#define MAX_VAL 32767
-#endif
-#if defined (DEPTH_4)
-#define VEC_TYPE int8
-#define CONVERT_TYPE convert_int8
-#define MIN_VAL INT_MIN
-#define MAX_VAL INT_MAX
-#endif
-#if defined (DEPTH_5)
-#define VEC_TYPE float8
-#define CONVERT_TYPE convert_float8
-#define MIN_VAL (-FLT_MAX)
-#define MAX_VAL FLT_MAX
-#endif
-#if defined (DEPTH_6)
-#define VEC_TYPE double8
-#define CONVERT_TYPE convert_double8
-#define MIN_VAL (-DBL_MAX)
-#define MAX_VAL DBL_MAX
-#endif
-
-#if defined (REPEAT_S0)
-#define repeat_s(a) a = a;
-#endif
-#if defined (REPEAT_S1)
-#define repeat_s(a) a.s0 = a.s1;
-#endif
-#if defined (REPEAT_S2)
-#define repeat_s(a) a.s0 = a.s2;a.s1 = a.s2;
-#endif
-#if defined (REPEAT_S3)
-#define repeat_s(a) a.s0 = a.s3;a.s1 = a.s3;a.s2 = a.s3;
-#endif
-#if defined (REPEAT_S4)
-#define repeat_s(a) a.s0 = a.s4;a.s1 = a.s4;a.s2 = a.s4;a.s3 = a.s4;
-#endif
-#if defined (REPEAT_S5)
-#define repeat_s(a) a.s0 = a.s5;a.s1 = a.s5;a.s2 = a.s5;a.s3 = a.s5;a.s4 = a.s5;
-#endif
-#if defined (REPEAT_S6)
-#define repeat_s(a) a.s0 = a.s6;a.s1 = a.s6;a.s2 = a.s6;a.s3 = a.s6;a.s4 = a.s6;a.s5 = a.s6;
-#endif
-#if defined (REPEAT_S7)
-#define repeat_s(a) a.s0 = a.s7;a.s1 = a.s7;a.s2 = a.s7;a.s3 = a.s7;a.s4 = a.s7;a.s5 = a.s7;a.s6 = a.s7;
-#endif
-
-#if defined (REPEAT_E0)
-#define repeat_e(a) a = a;
-#endif
-#if defined (REPEAT_E1)
-#define repeat_e(a) a.s7 = a.s6;
-#endif
-#if defined (REPEAT_E2)
-#define repeat_e(a) a.s7 = a.s5;a.s6 = a.s5;
-#endif
-#if defined (REPEAT_E3)
-#define repeat_e(a) a.s7 = a.s4;a.s6 = a.s4;a.s5 = a.s4;
-#endif
-#if defined (REPEAT_E4)
-#define repeat_e(a) a.s7 = a.s3;a.s6 = a.s3;a.s5 = a.s3;a.s4 = a.s3;
-#endif
-#if defined (REPEAT_E5)
-#define repeat_e(a) a.s7 = a.s2;a.s6 = a.s2;a.s5 = a.s2;a.s4 = a.s2;a.s3 = a.s2;
-#endif
-#if defined (REPEAT_E6)
-#define repeat_e(a) a.s7 = a.s1;a.s6 = a.s1;a.s5 = a.s1;a.s4 = a.s1;a.s3 = a.s1;a.s2 = a.s1;
-#endif
-#if defined (REPEAT_E7)
-#define repeat_e(a) a.s7 = a.s0;a.s6 = a.s0;a.s5 = a.s0;a.s4 = a.s0;a.s3 = a.s0;a.s2 = a.s0;a.s1 = a.s0;
-#endif
-
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
 #pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable

 /**************************************Array minMax**************************************/
-__kernel void arithm_op_minMax (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
-                                  __global VEC_TYPE *src, __global VEC_TYPE *dst)
+
+__kernel void arithm_op_minMax(__global const T * src, __global T * dst,
+    int cols, int invalid_cols, int offset, int elemnum, int groupnum) // per-group reduction: dst[gid] gets the group minimum, dst[gid + groupnum] the group maximum
 {
   unsigned int lid = get_local_id(0);
   unsigned int gid = get_group_id(0);
-   unsigned int  id = get_global_id(0);
+   unsigned int id = get_global_id(0);
+
   unsigned int idx = offset + id + (id / cols) * invalid_cols;
-   __local VEC_TYPE localmem_max[128],localmem_min[128];
-   VEC_TYPE minval,maxval,temp;
-   if(id < elemnum)
+
+   __local T localmem_max[128], localmem_min[128];
+   T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp; // start at the opposite extremes; T, MIN_VAL and MAX_VAL are not defined in this file - presumably build options, TODO confirm
+
+   for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) // each work-item walks the elements in steps of groupnum * 256
   {
+       idx = offset + id + (id / cols) * invalid_cols; // skips invalid_cols extra elements after every cols elements
       temp = src[idx];
-       if(id % cols == 0 )
-       {
-           repeat_s(temp);
-       }
-       if(id % cols == cols - 1)
+       minval = min(minval, temp);
+       maxval = max(maxval, temp);
+   }
+
+   if(lid > 127) // local ids 128 and above park their partial results in the 128 local slots
+   {
+       localmem_min[lid - 128] = minval;
+       localmem_max[lid - 128] = maxval;
+   }
+   barrier(CLK_LOCAL_MEM_FENCE);
+
+   if(lid < 128) // local ids below 128 merge their own result with the parked one; NOTE(review): this reads slots only written when the group has 256 work-items - confirm the host launch size
+   {
+       localmem_min[lid] = min(minval, localmem_min[lid]);
+       localmem_max[lid] = max(maxval, localmem_max[lid]);
+   }
+   barrier(CLK_LOCAL_MEM_FENCE);
+
+   for (int lsize = 64; lsize > 0; lsize >>= 1) // fold the 128 slots pairwise, halving the active range each pass
+   {
+       if (lid < lsize)
       {
-           repeat_e(temp);
+           int lid2 = lsize + lid;
+           localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
+           localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
       }
-       minval = temp;
-       maxval = temp;
+       barrier(CLK_LOCAL_MEM_FENCE); // executed by every work-item, outside the lid test
   }
-   else
+
+   if (lid == 0) // slot 0 now holds the group result; one work-item publishes it
   {
-       minval = MAX_VAL;
-       maxval = MIN_VAL;
+       dst[gid] = localmem_min[0];
+       dst[gid + groupnum] = localmem_max[0];
   }
-   for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
+}
+
+__kernel void arithm_op_minMax_mask(__global const T * src, __global T * dst,
+    int cols, int invalid_cols, int offset,
+    int elemnum, int groupnum,
+    const __global uchar * mask, int minvalid_cols, int moffset)
+{
+   unsigned int lid = get_local_id(0);
+   unsigned int gid = get_group_id(0);
+   unsigned int id = get_global_id(0);
+
+   unsigned int idx = offset + id + (id / cols) * invalid_cols;
+   unsigned int midx = moffset + id + (id / cols) * minvalid_cols;
+
+   __local T localmem_max[128], localmem_min[128];
+   T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp;
+
+   for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
   {
       idx = offset + id + (id / cols) * invalid_cols;
-       temp = src[idx];
-       if(id % cols == 0 )
-       {
-               repeat_s(temp);
-       }
-       if(id % cols == cols - 1)
+       midx = moffset + id + (id / cols) * minvalid_cols;
+
+       if (mask[midx])
       {
-               repeat_e(temp);
+           temp = src[idx];
+           minval = min(minval, temp);
+           maxval = max(maxval, temp);
       }
-       minval = min(minval,temp);
-       maxval = max(maxval,temp);
   }
+
   if(lid > 127)
   {
       localmem_min[lid - 128] = minval;
       localmem_max[lid - 128] = maxval;
   }
   barrier(CLK_LOCAL_MEM_FENCE);
+
   if(lid < 128)
   {
-       localmem_min[lid] = min(minval,localmem_min[lid]);
-       localmem_max[lid] = max(maxval,localmem_max[lid]);
+       localmem_min[lid] = min(minval, localmem_min[lid]);
+       localmem_max[lid] = max(maxval, localmem_max[lid]);
   }
   barrier(CLK_LOCAL_MEM_FENCE);
-   for(int lsize = 64; lsize > 0; lsize >>= 1)
+
+   for (int lsize = 64; lsize > 0; lsize >>= 1)
   {
-       if(lid < lsize)
+       if (lid < lsize)
       {
           int lid2 = lsize + lid;
-           localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
-           localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
+           localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
+           localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
       }
       barrier(CLK_LOCAL_MEM_FENCE);
   }
-   if( lid == 0)
+
+   if (lid == 0)
   {
       dst[gid] = localmem_min[0];
       dst[gid + groupnum] = localmem_max[0];

--- a/modules/ocl/src/opencl/arithm_nonzero.cl
+++ b/modules/ocl/src/opencl/arithm_nonzero.cl
@@ -41,151 +41,53 @@
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
-///

-/**************************************PUBLICFUNC*************************************/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #endif
-
-#if defined (DEPTH_0)
-#define VEC_TYPE uchar8
-#endif
-#if defined (DEPTH_1)
-#define VEC_TYPE char8
-#endif
-#if defined (DEPTH_2)
-#define VEC_TYPE ushort8
-#endif
-#if defined (DEPTH_3)
-#define VEC_TYPE short8
-#endif
-#if defined (DEPTH_4)
-#define VEC_TYPE int8
-#endif
-#if defined (DEPTH_5)
-#define VEC_TYPE float8
-#endif
-#if defined (DEPTH_6)
-#define VEC_TYPE double8
 #endif

-#if defined (REPEAT_S0)
-#define repeat_s(a) a = a;
-#endif
-#if defined (REPEAT_S1)
-#define repeat_s(a) a.s0 = 0;
-#endif
-#if defined (REPEAT_S2)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;
-#endif
-#if defined (REPEAT_S3)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;
-#endif
-#if defined (REPEAT_S4)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;
-#endif
-#if defined (REPEAT_S5)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;
-#endif
-#if defined (REPEAT_S6)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;a.s5 = 0;
-#endif
-#if defined (REPEAT_S7)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;a.s5 = 0;a.s6 = 0;
-#endif
+/**************************************Count NonZero**************************************/

-#if defined (REPEAT_E0)
-#define repeat_e(a) a = a;
-#endif
-#if defined (REPEAT_E1)
-#define repeat_e(a) a.s7 = 0;
-#endif
-#if defined (REPEAT_E2)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;
-#endif
-#if defined (REPEAT_E3)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;
-#endif
-#if defined (REPEAT_E4)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;
-#endif
-#if defined (REPEAT_E5)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;
-#endif
-#if defined (REPEAT_E6)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;
-#endif
-#if defined (REPEAT_E7)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;a.s1 = 0;
-#endif
+__kernel void arithm_op_nonzero(int cols, int invalid_cols, int offset, int elemnum, int groupnum,
+                                  __global srcT *src, __global dstT *dst) // counts non-zero elements; dst[gid] receives the partial count of work-group gid
+{
+    unsigned int lid = get_local_id(0);
+    unsigned int gid = get_group_id(0);
+    unsigned int  id = get_global_id(0);

-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
-#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
+    unsigned int idx = offset + id + (id / cols) * invalid_cols;
+    __local dstT localmem_nonzero[128];
+    dstT nonzero = (dstT)(0); // running count owned by this work-item
+    srcT zero = (srcT)(0), one = (srcT)(1); // srcT / dstT are not defined in this file - presumably build options, TODO confirm

-/**************************************Count NonZero**************************************/
-__kernel void arithm_op_nonzero (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
-                                  __global VEC_TYPE *src, __global int8 *dst)
-{
-   unsigned int lid = get_local_id(0);
-   unsigned int gid = get_group_id(0);
-   unsigned int  id = get_global_id(0);
-   unsigned int idx = offset + id + (id / cols) * invalid_cols;
-   __local int8 localmem_nonzero[128];
-   int8 nonzero;
-   VEC_TYPE zero=0,one=1,temp;
-   if(id < elemnum)
-   {
-       temp = src[idx];
-       if(id % cols == 0 )
-       {
-           repeat_s(temp);
-       }
-       if(id % cols == cols - 1)
-       {
-           repeat_e(temp);
-       }
-       nonzero = convert_int8(temp == zero ? zero:one);
-   }
-   else
-   {
-       nonzero = 0;
-   }
-   for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
-   {
-       idx = offset + id + (id / cols) * invalid_cols;
-       temp = src[idx];
-       if(id % cols == 0 )
-       {
-               repeat_s(temp);
-       }
-       if(id % cols == cols - 1)
-       {
-               repeat_e(temp);
-       }
-       nonzero = nonzero + convert_int8(temp == zero ? zero:one);
-   }
-   if(lid > 127)
-   {
-       localmem_nonzero[lid - 128] = nonzero;
-   }
-   barrier(CLK_LOCAL_MEM_FENCE);
-   if(lid < 128)
-   {
-       localmem_nonzero[lid] = nonzero + localmem_nonzero[lid];
-   }
-   barrier(CLK_LOCAL_MEM_FENCE);
-   for(int lsize = 64; lsize > 0; lsize >>= 1)
-   {
-       if(lid < lsize)
-       {
+    for (int grain = groupnum << 8; id < elemnum; id += grain) // each work-item walks the elements in steps of groupnum * 256
+    {
+        idx = offset + id + (id / cols) * invalid_cols; // skips invalid_cols extra elements after every cols elements
+        nonzero += src[idx] == zero ? zero : one; // adds one for a non-zero element, zero otherwise
+    }
+
+    if (lid > 127) // local ids 128 and above park their counts in the 128 local slots
+        localmem_nonzero[lid - 128] = nonzero;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (lid < 128) // local ids below 128 add their own count to the parked one; NOTE(review): relies on 256 work-items per group - confirm the host launch size
+        localmem_nonzero[lid] = nonzero + localmem_nonzero[lid];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (int lsize = 64; lsize > 0; lsize >>= 1) // fold the 128 slots pairwise, halving the active range each pass
+    {
+        if (lid < lsize)
+        {
           int lid2 = lsize + lid;
           localmem_nonzero[lid] = localmem_nonzero[lid] + localmem_nonzero[lid2];
-       }
-       barrier(CLK_LOCAL_MEM_FENCE);
-   }
-   if( lid == 0)
-   {
-       dst[gid] = localmem_nonzero[0];
-   }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE); // executed by every work-item, outside the lid test
+    }
+
+    if (lid == 0) // slot 0 now holds the group total; one work-item publishes it
+        dst[gid] = localmem_nonzero[0];
 }
--- a/modules/ocl/src/opencl/arithm_phase.cl
+++ b/modules/ocl/src/opencl/arithm_phase.cl
@@ -45,110 +45,125 @@
 //

 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
 #endif
+#endif
+
 #define CV_PI 3.1415926535898
+#define CV_2PI (2*3.1415926535898) // parenthesized so the product stays one operand wherever the macro is expanded
+
 /**************************************phase inradians**************************************/
-__kernel void arithm_phase_inradians_D5 (__global float *src1, int src1_step, int src1_offset,
-                                         __global float *src2, int src2_step, int src2_offset,
-                                         __global float *dst,  int dst_step,  int dst_offset,
-                                         int rows, int cols, int dst_step1)
-{

+__kernel void arithm_phase_inradians_D5(__global float *src1, int src1_step1, int src1_offset1,
+                                         __global float *src2, int src2_step1, int src2_offset1,
+                                         __global float *dst,  int dst_step1,  int dst_offset1,
+                                         int cols, int rows) // float phase in radians: dst = atan2(src2, src1), negative results shifted up by 2*pi
+{
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if(x < cols && y < rows)
+    if (x < cols && y < rows) // ignore work-items that fall outside the cols x rows area
    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
+        int src1_index = mad24(y, src1_step1, x + src1_offset1); // index counts float elements, not bytes
+        int src2_index = mad24(y, src2_step1, x + src2_offset1);
+        int dst_index  = mad24(y, dst_step1, x + dst_offset1);

-        float data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float data2 = *((__global float *)((__global char *)src2 + src2_index));
-        float tmp = atan2(data2,data1);
+        float data1 = src1[src1_index];
+        float data2 = src2[src2_index];
+        float tmp = atan2(data2, data1); // src2 is the y argument, src1 the x argument

-        *((__global float *)((__global char *)dst + dst_index)) = tmp;
-    }
+        if (tmp < 0) // atan2 may return a negative angle; move it into the non-negative range
+            tmp += CV_2PI;

+        dst[dst_index] = tmp;
+    }
 }


 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_phase_inradians_D6 (__global double *src1, int src1_step, int src1_offset,
-                                         __global double *src2, int src2_step, int src2_offset,
-                                         __global double *dst,  int dst_step,  int dst_offset,
-                                         int rows, int cols, int dst_step1)
+__kernel void arithm_phase_inradians_D6(__global double *src1, int src1_step1, int src1_offset1,
+                                         __global double *src2, int src2_step1, int src2_offset1,
+                                         __global double *dst,  int dst_step1,  int dst_offset1,
+                                         int cols, int rows) // double phase in radians: dst = atan2(src2, src1), negative results shifted up by 2*pi
 {
-
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if(x < cols && y < rows)
+    if (x < cols && y < rows) // ignore work-items that fall outside the cols x rows area
    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
+        int src1_index = mad24(y, src1_step1, x + src1_offset1); // index counts double elements, not bytes
+        int src2_index = mad24(y, src2_step1, x + src2_offset1);
+        int dst_index  = mad24(y, dst_step1, x + dst_offset1);

-        double data1 = *((__global double *)((__global char *)src1 + src1_index));
-        double data2 = *((__global double *)((__global char *)src2 + src2_index));
+        double data1 = src1[src1_index];
+        double data2 = src2[src2_index];
+        double tmp = atan2(data2, data1); // src2 is the y argument, src1 the x argument

-        *((__global double *)((__global char *)dst + dst_index)) = atan2(data2,data1);
-    }
+        if (tmp < 0) // atan2 may return a negative angle; move it into the non-negative range
+            tmp += CV_2PI;

+        dst[dst_index] = tmp;
+    }
 }
+
 #endif

 /**************************************phase indegrees**************************************/
-__kernel void arithm_phase_indegrees_D5 (__global float *src1, int src1_step, int src1_offset,
-                                         __global float *src2, int src2_step, int src2_offset,
-                                         __global float *dst,  int dst_step,  int dst_offset,
-                                         int rows, int cols, int dst_step1)
-{

+__kernel void arithm_phase_indegrees_D5(__global float *src1, int src1_step1, int src1_offset1,
+                                         __global float *src2, int src2_step1, int src2_offset1,
+                                         __global float *dst,  int dst_step1,  int dst_offset1,
+                                         int cols, int rows) // float phase in degrees: dst = atan2(src2, src1) * 180 / pi, negative results shifted up by 360
+{
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if(x < cols && y < rows)
+    if (x < cols && y < rows) // ignore work-items that fall outside the cols x rows area
    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
+        int src1_index = mad24(y, src1_step1, x + src1_offset1); // index counts float elements, not bytes
+        int src2_index = mad24(y, src2_step1, x + src2_offset1);
+        int dst_index  = mad24(y, dst_step1, x + dst_offset1);

-        float data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float data2 = *((__global float *)((__global char *)src2 + src2_index));
-        float tmp = atan2(data2,data1);
-        float tmp_data = 180*tmp/CV_PI;
+        float data1 = src1[src1_index];
+        float data2 = src2[src2_index];
+        float tmp = atan2(data2, data1); // src2 is the y argument, src1 the x argument
+        tmp = 180 * tmp / CV_PI; // radians to degrees

-        *((__global float *)((__global char *)dst + dst_index)) = tmp_data;
-    }
+        if (tmp < 0) // move negative angles into the non-negative range
+            tmp += 360;

+        dst[dst_index] = tmp;
+    }
 }


 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_phase_indegrees_D6 (__global double *src1, int src1_step, int src1_offset,
-                                         __global double *src2, int src2_step, int src2_offset,
-                                         __global double *dst,  int dst_step,  int dst_offset,
-                                         int rows, int cols, int dst_step1)
+__kernel void arithm_phase_indegrees_D6 (__global double *src1, int src1_step1, int src1_offset1,
+                                         __global double *src2, int src2_step1, int src2_offset1,
+                                         __global double *dst,  int dst_step1,  int dst_offset1,
+                                         int cols, int rows) // double phase in degrees: dst = atan2(src2, src1) * 180 / pi, negative results shifted up by 360
 {
-
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if(x < cols && y < rows)
+    if (x < cols && y < rows) // ignore work-items that fall outside the cols x rows area
    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
+        int src1_index = mad24(y, src1_step1, x + src1_offset1); // index counts double elements, not bytes
+        int src2_index = mad24(y, src2_step1, x + src2_offset1);
+        int dst_index  = mad24(y, dst_step1, x + dst_offset1);

-        double data1 = *((__global double *)((__global char *)src1 + src1_index));
-        double data2 = *((__global double *)((__global char *)src2 + src2_index));
-        double tmp = atan2(data2,data1);
-        double tmp_data = 180*tmp/CV_PI;
+        double data1 = src1[src1_index];
+        double data2 = src2[src2_index];
+        double tmp = atan2(data2, data1); // reuse the values already loaded instead of reading global memory a second time

-        *((__global double *)((__global char *)dst + dst_index)) = tmp_data;
-    }
+        tmp = 180 * tmp / CV_PI; // radians to degrees
+        if (tmp < 0) // move negative angles into the non-negative range
+            tmp += 360;

+        dst[dst_index] = tmp;
+    }
 }
 #endif
--- a/modules/ocl/src/opencl/arithm_setidentity.cl
+++ b/modules/ocl/src/opencl/arithm_setidentity.cl
@@ -42,6 +42,7 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
+
 #if defined (DOUBLE_SUPPORT)
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
@@ -50,51 +51,19 @@
 #endif
 #endif

-
-#if defined (DOUBLE_SUPPORT)
-#define DATA_TYPE double
-#else
-#define DATA_TYPE float
-#endif
-
-__kernel void setIdentityKernel_F1(__global float* src, int src_row, int src_col, int src_step, DATA_TYPE scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if(x < src_col && y < src_row)
-    {
-        if(x == y)
-            src[y * src_step + x] = scalar;
-        else
-            src[y * src_step + x] = 0 * scalar;
-    }
-}
-
-__kernel void setIdentityKernel_D1(__global DATA_TYPE* src, int src_row, int src_col, int src_step, DATA_TYPE scalar)
+__kernel void setIdentity(__global T * src, int src_step, int src_offset,
+    int cols, int rows, __global const T * scalar) // writes *scalar on the diagonal (x == y) and 0 everywhere else within cols x rows
 {
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if(x < src_col && y < src_row)
+    if (x < cols && y < rows) // ignore work-items that fall outside the cols x rows area
    {
-        if(x == y)
-            src[y * src_step + x] = scalar;
-        else
-            src[y * src_step + x] = 0 * scalar;
-    }
-}
+        int src_index = mad24(y, src_step, src_offset + x); // y * step + offset + x, used directly as an index into T elements

-__kernel void setIdentityKernel_I1(__global int* src, int src_row, int src_col, int src_step, int scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if(x < src_col && y < src_row)
-    {
-        if(x == y)
-            src[y * src_step + x] = scalar;
+        if (x == y)
+            src[src_index] = *scalar; // the diagonal value is read from a one-element global buffer
        else
-            src[y * src_step + x] = 0 * scalar;
+            src[src_index] = 0;
    }
 }
--- a/modules/ocl/src/opencl/arithm_sum.cl
+++ b/modules/ocl/src/opencl/arithm_sum.cl
@@ -43,163 +43,62 @@
 //
 //M*/

-/**************************************PUBLICFUNC*************************************/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64 // use the Khronos double-precision extension when its macro is defined
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
-#define RES_TYPE double8
-#define CONVERT_RES_TYPE convert_double8
-#else
-#define RES_TYPE float8
-#define CONVERT_RES_TYPE convert_float8
+#elif defined (cl_amd_fp64) // otherwise fall back to the AMD double-precision extension
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
 #endif
-
-#if defined (DEPTH_0)
-#define VEC_TYPE uchar8
-#endif
-#if defined (DEPTH_1)
-#define VEC_TYPE char8
-#endif
-#if defined (DEPTH_2)
-#define VEC_TYPE ushort8
-#endif
-#if defined (DEPTH_3)
-#define VEC_TYPE short8
-#endif
-#if defined (DEPTH_4)
-#define VEC_TYPE int8
-#endif
-#if defined (DEPTH_5)
-#define VEC_TYPE float8
-#endif
-#if defined (DEPTH_6)
-#define VEC_TYPE double8
-#endif
-
-#if defined (FUNC_TYPE_0)
-#define FUNC(a,b) b += a;
-#endif
-#if defined (FUNC_TYPE_1)
-#define FUNC(a,b) b = b + (a >= 0 ? a : -a);
-#endif
-#if defined (FUNC_TYPE_2)
-#define FUNC(a,b) b = b + a * a;
-#endif
-
-#if defined (REPEAT_S0)
-#define repeat_s(a) a = a;
-#endif
-#if defined (REPEAT_S1)
-#define repeat_s(a) a.s0 = 0;
-#endif
-#if defined (REPEAT_S2)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;
-#endif
-#if defined (REPEAT_S3)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;
-#endif
-#if defined (REPEAT_S4)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;
-#endif
-#if defined (REPEAT_S5)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;
-#endif
-#if defined (REPEAT_S6)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;a.s5 = 0;
-#endif
-#if defined (REPEAT_S7)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;a.s5 = 0;a.s6 = 0;
 #endif

-#if defined (REPEAT_E0)
-#define repeat_e(a) a = a;
+#if defined (FUNC_SUM)
+#define FUNC(a, b) b += a; // plain sum: accumulate a into b. NOTE(review): arguments are unparenthesized and the body ends in ';' - safe only for simple identifier arguments
 #endif
-#if defined (REPEAT_E1)
-#define repeat_e(a) a.s7 = 0;
+#if defined (FUNC_ABS_SUM)
+#define FUNC(a, b) b += a >= 0 ? a : -a; // sum of absolute values: adds a when a >= 0, otherwise -a
 #endif
-#if defined (REPEAT_E2)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;
+#if defined (FUNC_SQR_SUM)
+#define FUNC(a, b) b += a * a; // sum of squares: adds a * a
 #endif
-#if defined (REPEAT_E3)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;
-#endif
-#if defined (REPEAT_E4)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;
-#endif
-#if defined (REPEAT_E5)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;
-#endif
-#if defined (REPEAT_E6)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;
-#endif
-#if defined (REPEAT_E7)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;a.s1 = 0;
-#endif
-
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
-#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable

 /**************************************Array buffer SUM**************************************/
-__kernel void arithm_op_sum (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
-                                __global VEC_TYPE *src, __global RES_TYPE *dst)
+
+__kernel void arithm_op_sum(int cols,int invalid_cols,int offset,int elemnum,int groupnum, // per-group reduction: each work-group folds its elements with FUNC and writes one partial result to dst[gid]
+                                __global srcT *src, __global dstT *dst) // NOTE(review): srcT, dstT and convertToDstT are not defined in the visible text, presumably build-time defines - confirm in host code
 {
    unsigned int lid = get_local_id(0);
    unsigned int gid = get_group_id(0);
-   unsigned int  id = get_global_id(0);
+   unsigned int id = get_global_id(0); // logical element index handled first by this work-item
    unsigned int idx = offset + id + (id / cols) * invalid_cols;
-   __local RES_TYPE localmem_sum[128];
-   RES_TYPE sum = 0,temp;
-   if(id < elemnum)
-   {
-       temp = CONVERT_RES_TYPE(src[idx]);
-       if(id % cols == 0 )
-       {
-           repeat_s(temp);
-       }
-       if(id % cols == cols - 1)
-       {
-           repeat_e(temp);
-       }
-       FUNC(temp,sum);
-   }
-   else
-   {
-       sum = 0;
-   }
-   for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
+
+   __local dstT localmem_sum[128]; // scratch for the in-group reduction; only 128 slots, see the two-step fold below
+   dstT sum = (dstT)(0), temp; // sum is this work-item's private accumulator
+
+   for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) // stride is groupnum * 256; presumably the total work-item count (256 per group) - confirm against host launch
    {
        idx = offset + id + (id / cols) * invalid_cols;
-       temp = CONVERT_RES_TYPE(src[idx]);
-       if(id % cols == 0 )
-       {
-               repeat_s(temp);
-       }
-       if(id % cols == cols - 1)
-       {
-               repeat_e(temp);
-       }
-       FUNC(temp,sum);
+       temp = convertToDstT(src[idx]); // convert the source element to the accumulator type
+       FUNC(temp, sum); // fold temp into sum using whichever FUNC_* variant was selected
    }
-   if(lid > 127)
-   {
+
+   if (lid > 127) // NOTE(review): lid - 128 indexes a 128-entry array, so this assumes at most 256 work-items per group - confirm
        localmem_sum[lid - 128] = sum;
-   }
    barrier(CLK_LOCAL_MEM_FENCE);
-   if(lid < 128)
-   {
+
+   if (lid < 128) // work-items 0..127 add their own sum to the value stored by work-item lid + 128
        localmem_sum[lid] = sum + localmem_sum[lid];
-   }
    barrier(CLK_LOCAL_MEM_FENCE);
-   for(int lsize = 64; lsize > 0; lsize >>= 1)
+
+   for (int lsize = 64; lsize > 0; lsize >>= 1) // halve the active range each pass: 64, 32, ..., 1
    {
-       if(lid < lsize)
+       if (lid < lsize) // only the lower lsize work-items combine a pair in this pass
        {
            int lid2 = lsize + lid;
            localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
-   if( lid == 0)
-   {
+
+   if (lid == 0) // a single work-item writes the group's result
        dst[gid] = localmem_sum[0];
-   }
 }
--- a/modules/ocl/src/opencl/arithm_sum_3.cl
+++ b/modules/ocl/src/opencl/arithm_sum_3.cl
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Shengen Yan,yanshengen@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-/**************************************PUBLICFUNC*************************************/
-#if defined (DOUBLE_SUPPORT)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#define RES_TYPE double4
-#define CONVERT_RES_TYPE convert_double4
-#else
-#define RES_TYPE float4
-#define CONVERT_RES_TYPE convert_float4
-#endif
-
-#if defined (DEPTH_0)
-#define VEC_TYPE uchar4
-#endif
-#if defined (DEPTH_1)
-#define VEC_TYPE char4
-#endif
-#if defined (DEPTH_2)
-#define VEC_TYPE ushort4
-#endif
-#if defined (DEPTH_3)
-#define VEC_TYPE short4
-#endif
-#if defined (DEPTH_4)
-#define VEC_TYPE int4
-#endif
-#if defined (DEPTH_5)
-#define VEC_TYPE float4
-#endif
-#if defined (DEPTH_6)
-#define VEC_TYPE double4
-#endif
-
-#if defined (FUNC_TYPE_0)
-#define FUNC(a,b) b += a;
-#endif
-#if defined (FUNC_TYPE_1)
-#define FUNC(a,b) b = b + (a >= 0 ? a : -a);
-#endif
-#if defined (FUNC_TYPE_2)
-#define FUNC(a,b) b = b + a * a;
-#endif
-
-#if defined (REPEAT_S0)
-#define repeat_s(a,b,c) a=a; b =b; c=c;
-#endif
-#if defined (REPEAT_S1)
-#define repeat_s(a,b,c) a.s0=0; b=b; c=c;
-#endif
-#if defined (REPEAT_S2)
-#define repeat_s(a,b,c) a.s0=0; a.s1=0; b=b; c=c;
-#endif
-#if defined (REPEAT_S3)
-#define repeat_s(a,b,c) a.s0=0; a.s1=0; a.s2=0; b=b; c=c;
-#endif
-#if defined (REPEAT_S4)
-#define repeat_s(a,b,c) a=0;b=b; c=c;
-#endif
-#if defined (REPEAT_S5)
-#define repeat_s(a,b,c) a=0; b.s0=0;c=c;
-#endif
-#if defined (REPEAT_S6)
-#define repeat_s(a,b,c) a=0; b.s0=0; b.s1=0; c=c;
-#endif
-#if defined (REPEAT_S7)
-#define repeat_s(a,b,c) a=0; b.s0=0; b.s1=0; b.s2=0; c=c;
-#endif
-#if defined (REPEAT_S8)
-#define repeat_s(a,b,c) a=0; b=0; c=c;
-#endif
-#if defined (REPEAT_S9)
-#define repeat_s(a,b,c) a=0; b=0; c.s0=0;
-#endif
-#if defined (REPEAT_S10)
-#define repeat_s(a,b,c) a=0; b=0; c.s0=0; c.s1=0;
-#endif
-#if defined (REPEAT_S11)
-#define repeat_s(a,b,c) a=0; b=0; c.s0=0; c.s1=0; c.s2=0;
-#endif
-
-#if defined (REPEAT_E0)
-#define repeat_e(a,b,c) a=a; b =b; c=c;
-#endif
-#if defined (REPEAT_E1)
-#define repeat_e(a,b,c) a=a; b=b; c.s3=0;
-#endif
-#if defined (REPEAT_E2)
-#define repeat_e(a,b,c) a=a; b=b; c.s3=0; c.s2=0;
-#endif
-#if defined (REPEAT_E3)
-#define repeat_e(a,b,c) a=a; b=b; c.s3=0; c.s2=0; c.s1=0;
-#endif
-#if defined (REPEAT_E4)
-#define repeat_e(a,b,c) a=a; b=b; c=0;
-#endif
-#if defined (REPEAT_E5)
-#define repeat_e(a,b,c) a=a; b.s3=0; c=0;
-#endif
-#if defined (REPEAT_E6)
-#define repeat_e(a,b,c) a=a; b.s3=0; b.s2=0; c=0;
-#endif
-#if defined (REPEAT_E7)
-#define repeat_e(a,b,c) a=a; b.s3=0; b.s2=0; b.s1=0; c=0;
-#endif
-#if defined (REPEAT_E8)
-#define repeat_e(a,b,c) a=a; b=0; c=0;
-#endif
-#if defined (REPEAT_E9)
-#define repeat_e(a,b,c) a.s3=0; b=0; c=0;
-#endif
-#if defined (REPEAT_E10)
-#define repeat_e(a,b,c) a.s3=0; a.s2=0; b=0; c=0;
-#endif
-#if defined (REPEAT_E11)
-#define repeat_e(a,b,c) a.s3=0; a.s2=0; a.s1=0; b=0; c=0;
-#endif
-
-__kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
-                                __global VEC_TYPE *src, __global RES_TYPE *dst)
-{
-   unsigned int lid = get_local_id(0);
-   unsigned int gid = get_group_id(0);
-   unsigned int id = get_global_id(0);
-   unsigned int idx = offset + id + (id  / cols) * invalid_cols;
-   idx = idx * 3;
-   __local RES_TYPE localmem_sum1[128];
-   __local RES_TYPE localmem_sum2[128];
-   __local RES_TYPE localmem_sum3[128];
-   RES_TYPE sum1 = 0,sum2 = 0,sum3 = 0,temp1,temp2,temp3;
-   if(id < elemnum)
-   {
-       temp1 = CONVERT_RES_TYPE(src[idx]);
-       temp2 = CONVERT_RES_TYPE(src[idx+1]);
-       temp3 = CONVERT_RES_TYPE(src[idx+2]);
-       if(id % cols == 0 )
-       {
-           repeat_s(temp1,temp2,temp3);
-       }
-       if(id % cols == cols - 1)
-       {
-           repeat_e(temp1,temp2,temp3);
-       }
-       FUNC(temp1,sum1);
-       FUNC(temp2,sum2);
-       FUNC(temp3,sum3);
-   }
-   else
-   {
-       sum1 = 0;
-       sum2 = 0;
-       sum3 = 0;
-   }
-   for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
-   {
-       idx = offset + id + (id / cols) * invalid_cols;
-       idx = idx * 3;
-       temp1 = CONVERT_RES_TYPE(src[idx]);
-       temp2 = CONVERT_RES_TYPE(src[idx+1]);
-       temp3 = CONVERT_RES_TYPE(src[idx+2]);
-       if(id % cols == 0 )
-       {
-               repeat_s(temp1,temp2,temp3);
-       }
-       if(id % cols == cols - 1)
-       {
-               repeat_e(temp1,temp2,temp3);
-       }
-       FUNC(temp1,sum1);
-       FUNC(temp2,sum2);
-       FUNC(temp3,sum3);
-   }
-   if(lid > 127)
-   {
-       localmem_sum1[lid - 128] = sum1;
-       localmem_sum2[lid - 128] = sum2;
-       localmem_sum3[lid - 128] = sum3;
-   }
-   barrier(CLK_LOCAL_MEM_FENCE);
-   if(lid < 128)
-   {
-       localmem_sum1[lid] = sum1 + localmem_sum1[lid];
-       localmem_sum2[lid] = sum2 + localmem_sum2[lid];
-       localmem_sum3[lid] = sum3 + localmem_sum3[lid];
-   }
-   barrier(CLK_LOCAL_MEM_FENCE);
-   for(int lsize = 64; lsize > 0; lsize >>= 1)
-   {
-       if(lid < lsize)
-       {
-           int lid2 = lsize + lid;
-           localmem_sum1[lid] = localmem_sum1[lid] + localmem_sum1[lid2];
-           localmem_sum2[lid] = localmem_sum2[lid] + localmem_sum2[lid2];
-           localmem_sum3[lid] = localmem_sum3[lid] + localmem_sum3[lid2];
-       }
-       barrier(CLK_LOCAL_MEM_FENCE);
-   }
-   if( lid == 0)
-   {
-       dst[gid*3]   = localmem_sum1[0];
-       dst[gid*3+1] = localmem_sum2[0];
-       dst[gid*3+2] = localmem_sum3[0];
-   }
-}
--- a/modules/ocl/test/test_arithm.cpp
+++ b/modules/ocl/test/test_arithm.cpp