Merge pull request #775 from bitwangyaoyao:2.4_fixerr

a2d27429 · Andrey Kamaev · OpenCV Buildbot · 977562b6 · bcc086ba · a2d27429
Commit a2d27429, authored Apr 05, 2013 by Andrey Kamaev; committed by OpenCV Buildbot on Apr 05, 2013
23 changed files
--- a/modules/ocl/src/opencl/arithm_absdiff.cl
+++ b/modules/ocl/src/opencl/arithm_absdiff.cl
@@ -44,7 +44,11 @@
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -63,6 +67,9 @@ __kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_o
    {
        x = x << 2;
+#ifdef dst_align
+#undef dst_align
+#endif
        #define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -111,7 +118,10 @@ __kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -145,7 +155,10 @@ __kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_o
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -249,7 +262,10 @@ __kernel void arithm_s_absdiff_C1_D0 (__global   uchar *src1, int src1_step, int
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -288,7 +304,10 @@ __kernel void arithm_s_absdiff_C1_D2 (__global   ushort *src1, int src1_step, in
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -319,7 +338,10 @@ __kernel void arithm_s_absdiff_C1_D3 (__global   short *src1, int src1_step, int
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -422,7 +444,10 @@ __kernel void arithm_s_absdiff_C2_D0 (__global   uchar *src1, int src1_step, int
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -564,7 +589,10 @@ __kernel void arithm_s_absdiff_C3_D0 (__global   uchar *src1, int src1_step, int
    {
        x = x << 2;
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -618,7 +646,10 @@ __kernel void arithm_s_absdiff_C3_D2 (__global   ushort *src1, int src1_step, in
    {
        x = x << 1;
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -668,7 +699,10 @@ __kernel void arithm_s_absdiff_C3_D3 (__global   short *src1, int src1_step, int
    {
        x = x << 1;
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int dst_start  = mad24(y, dst_step, dst_offset);

--- a/modules/ocl/src/opencl/arithm_add.cl
+++ b/modules/ocl/src/opencl/arithm_add.cl
@@ -45,7 +45,11 @@
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -64,7 +68,10 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -112,7 +119,10 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -147,7 +157,10 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -252,7 +265,10 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -311,7 +327,10 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -348,7 +367,10 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -477,7 +499,10 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -664,7 +689,10 @@ __kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
    {
        x = x << 2;
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -724,7 +752,10 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
    {
        x = x << 1;
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -780,7 +811,10 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
    {
        x = x << 1;
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

--- a/modules/ocl/src/opencl/arithm_addWeighted.cl
+++ b/modules/ocl/src/opencl/arithm_addWeighted.cl
@@ -42,8 +42,12 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-#if defined DOUBLE_SUPPORT
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 typedef double F;
 #else
 typedef float F;
@@ -65,7 +69,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -122,7 +129,10 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
@@ -182,7 +192,10 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
@@ -241,9 +254,12 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
        x = x << 2;
-        #define bitOfInt  (sizeof(int)== 4 ? 2: 3)
+#define bitOfInt  (sizeof(int)== 4 ? 2: 3)
-        #define dst_align ((dst_offset >> bitOfInt) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> bitOfInt) & 3)
        int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
        int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
@@ -304,7 +320,10 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
        x = x << 2;
-        #define dst_align ((dst_offset >> 2) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@@ -366,7 +385,10 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
        x = x << 2;
-        #define dst_align ((dst_offset >> 3) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 3) & 3)
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));

--- a/modules/ocl/src/opencl/arithm_add_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_add_scalar.cl
@@ -44,9 +44,13 @@
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
 #endif
+#endif
 /**************************************add with scalar without mask**************************************/
 __kernel void arithm_s_add_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
                                  __global   uchar *dst,  int dst_step,  int dst_offset,
@@ -59,7 +63,10 @@ __kernel void arithm_s_add_C1_D0 (__global   uchar *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -99,7 +106,10 @@ __kernel void arithm_s_add_C1_D2 (__global   ushort *src1, int src1_step, int sr
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -131,7 +141,10 @@ __kernel void arithm_s_add_C1_D3 (__global   short *src1, int src1_step, int src
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -233,7 +246,10 @@ __kernel void arithm_s_add_C2_D0 (__global   uchar *src1, int src1_step, int src
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -378,7 +394,10 @@ __kernel void arithm_s_add_C3_D0 (__global   uchar *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -432,7 +451,10 @@ __kernel void arithm_s_add_C3_D2 (__global   ushort *src1, int src1_step, int sr
    {
        x = x << 1;
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -482,7 +504,10 @@ __kernel void arithm_s_add_C3_D3 (__global   short *src1, int src1_step, int src
    {
        x = x << 1;
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int dst_start  = mad24(y, dst_step, dst_offset);

--- a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
@@ -44,7 +44,11 @@
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 /**************************************add with scalar with mask**************************************/
@@ -61,7 +65,10 @@ __kernel void arithm_s_add_with_mask_C1_D0 (__global   uchar *src1, int src1_ste
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -111,7 +118,10 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global   ushort *src1, int src1_st
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -146,7 +156,10 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global   short *src1, int src1_ste
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -267,7 +280,10 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global   uchar *src1, int src1_ste
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -443,7 +459,10 @@ __kernel void arithm_s_add_with_mask_C3_D0 (__global   uchar *src1, int src1_ste
    {
        x = x << 2;
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -501,7 +520,10 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global   ushort *src1, int src1_st
    {
        x = x << 1;
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -555,7 +577,10 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global   short *src1, int src1_ste
    {
        x = x << 1;
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

--- a/modules/ocl/src/opencl/arithm_bitwise_and.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_and.cl
@@ -43,7 +43,11 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -62,7 +66,10 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -112,7 +119,10 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -163,7 +173,10 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -215,7 +228,10 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

--- a/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl
--- a/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl
--- a/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl
--- a/modules/ocl/src/opencl/arithm_bitwise_not.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_not.cl
@@ -43,9 +43,12 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_NOT////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -61,7 +64,10 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -101,7 +107,10 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -135,7 +144,10 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -170,7 +182,10 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);

--- a/modules/ocl/src/opencl/arithm_bitwise_or.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or.cl
@@ -43,7 +43,11 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -62,7 +66,10 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -110,7 +117,10 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -147,7 +157,10 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -185,7 +198,10 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

--- a/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl
--- a/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl
--- a/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl
--- a/modules/ocl/src/opencl/arithm_bitwise_xor.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_xor.cl
@@ -43,9 +43,12 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -62,7 +65,10 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -112,7 +118,10 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -163,7 +172,10 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -215,7 +227,10 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -301,7 +316,6 @@ __kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src
        *((__global char4 *)((__global char *)dst + dst_index)) = tmp;
    }
 }
 #if defined (DOUBLE_SUPPORT)
 __kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src1_offset,
                                     __global char *src2, int src2_step, int src2_offset,

--- a/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl
--- a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl
--- a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl
--- a/modules/ocl/src/opencl/arithm_compare_eq.cl
+++ b/modules/ocl/src/opencl/arithm_compare_eq.cl
@@ -43,7 +43,11 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -62,7 +66,10 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -114,7 +121,10 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1)& 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -165,7 +175,10 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -217,7 +230,10 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 2) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@@ -265,7 +281,10 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 2) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@@ -275,7 +294,8 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src
        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));       if(src2_index < 0)
+        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
+        if(src2_index < 0)
        {
            float4 tmp;
            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
@@ -307,7 +327,10 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 3) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 3) & 3)
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
@@ -358,7 +381,10 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -409,7 +435,10 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -462,7 +491,10 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -511,7 +543,10 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 2) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@@ -560,7 +595,10 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 2) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@@ -609,7 +647,10 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 3) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 3) & 3)
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
@@ -660,7 +701,10 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -714,7 +758,10 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -769,7 +816,10 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1)& 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -820,7 +870,10 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 2)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2)& 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@@ -869,7 +922,10 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 2)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2)& 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@@ -920,7 +976,10 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 3)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 3)& 3)
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
@@ -942,7 +1001,8 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
            double4 tmp;
            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }               uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
+        }
+        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@@ -954,3 +1014,4 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
    }
 }
 #endif
--- a/modules/ocl/src/opencl/arithm_compare_ne.cl
+++ b/modules/ocl/src/opencl/arithm_compare_ne.cl
@@ -43,7 +43,11 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 /***********************************Compare NE*******************************/
 __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src1_offset,
@@ -58,7 +62,10 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -110,7 +117,10 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1)& 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -162,7 +172,10 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1)& 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -210,7 +223,10 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 2)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2)& 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@@ -259,7 +275,10 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 2) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@@ -269,7 +288,8 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));       if(src1_index < 0)
+        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
+        if(src1_index < 0)
        {
            float4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
@@ -306,7 +326,10 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 3) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 3) & 3)
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
@@ -358,7 +381,10 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -410,7 +436,10 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -463,7 +492,10 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -512,7 +544,10 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 2) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@@ -564,7 +599,10 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 2) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@@ -613,7 +651,10 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 3) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 3) & 3)
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
@@ -664,7 +705,10 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -717,7 +761,10 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -770,7 +817,10 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -819,7 +869,10 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 2)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2)& 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@@ -867,7 +920,10 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 2)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2)& 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@@ -915,7 +971,10 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 3)& 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 3)& 3)
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
@@ -952,3 +1011,5 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr
    }
 }
 #endif
--- a/modules/ocl/src/opencl/arithm_div.cl
+++ b/modules/ocl/src/opencl/arithm_div.cl
@@ -44,7 +44,11 @@
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 typedef double F ;
 typedef double4 F4;
 #define convert_F4 convert_double4
@@ -56,34 +60,24 @@ typedef float4 F4;
 #define convert_F  float
 #endif
-uchar round2_uchar(F v){
+inline uchar round2_uchar(F v)
+{
-    uchar v1 = convert_uchar_sat(round(v));
+    return convert_uchar_sat(round(v));
-    //uchar v2 = convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5));
-    return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
 }
-ushort round2_ushort(F v){
+inline ushort round2_ushort(F v)
+{
-    ushort v1 = convert_ushort_sat(round(v));
+    return convert_ushort_sat(round(v));
-    //ushort v2 = convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5));
-    return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
 }
-short round2_short(F v){
-    short v1 = convert_short_sat(round(v));
+inline short round2_short(F v)
-    //short v2 = convert_short_sat(v+(v>=0 ? 0.5 : -0.5));
+{
+    return convert_short_sat(round(v));
-    return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
 }
-int round2_int(F v){
-    int v1 = convert_int_sat(round(v));
+inline int round2_int(F v)
-    //int v2 = convert_int_sat(v+(v>=0 ? 0.5 : -0.5));
+{
+    return convert_int_sat(round(v));
-    return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
 }
 ///////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////divide///////////////////////////////////////////////////
@@ -94,39 +88,41 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse
                             __global uchar *dst,  int dst_step,  int dst_offset,
                             int rows, int cols, int dst_step1, F scalar)
 {
-    int x = get_global_id(0);
+    int2 coor = (int2)(get_global_id(0), get_global_id(1));
-    int y = get_global_id(1);
-    if (x < cols && y < rows)
+    if (coor.x < cols && coor.y < rows)
    {
-        x = x << 2;
+        coor.x = coor.x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+#undef dst_align
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+#endif
+#define dst_align (dst_offset & 3)
+        int2 src_index = (int2)(mad24(coor.y, src1_step, coor.x + src1_offset - dst_align),
+                                mad24(coor.y, src2_step, coor.x + src2_offset - dst_align));
-        int dst_start  = mad24(y, dst_step, dst_offset);
+        int4 dst_args  = (int4)(mad24(coor.y, dst_step, dst_offset),
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
+                                mad24(coor.y, dst_step, dst_offset + dst_step1),
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+                                mad24(coor.y, dst_step, dst_offset + coor.x & (int)0xfffffffc),
+                                0);
-        uchar4 src1_data = vload4(0, src1 + src1_index);
+        uchar4 src1_data = vload4(0, src1 + src_index.x);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
+        uchar4 src2_data = vload4(0, src2 + src_index.y);
-        uchar4 dst_data  = *((__global uchar4 *)(dst + dst_index));
+        uchar4 dst_data  = *((__global uchar4 *)(dst + dst_args.z));
        F4 tmp      = convert_F4(src1_data) * scalar;
        uchar4 tmp_data;
-        tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / (F)src2_data.x);
+        tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / src2_data.x);
-        tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / (F)src2_data.y);
+        tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / src2_data.y);
-        tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / (F)src2_data.z);
+        tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / src2_data.z);
-        tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / (F)src2_data.w);
+        tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / src2_data.w);
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
+        dst_data.x = ((dst_args.z + 0 >= dst_args.x) && (dst_args.z + 0 < dst_args.y)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
+        dst_data.y = ((dst_args.z + 1 >= dst_args.x) && (dst_args.z + 1 < dst_args.y)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
+        dst_data.z = ((dst_args.z + 2 >= dst_args.x) && (dst_args.z + 2 < dst_args.y)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
+        dst_data.w = ((dst_args.z + 3 >= dst_args.x) && (dst_args.z + 3 < dst_args.y)) ? tmp_data.w : dst_data.w;
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
+        *((__global uchar4 *)(dst + dst_args.z)) = dst_data;
    }
 }
@@ -142,7 +138,10 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -182,7 +181,10 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -297,7 +299,10 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src_index = mad24(y, src_step, x + src_offset - dst_align);
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -333,7 +338,10 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -368,7 +376,10 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -455,3 +466,5 @@ __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offse
    }
 }
 #endif
--- a/modules/ocl/src/opencl/arithm_flip.cl
+++ b/modules/ocl/src/opencl/arithm_flip.cl
@@ -44,7 +44,11 @@
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -61,7 +65,10 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src_index_0 = mad24(y,            src_step, x + src_offset - dst_align);
        int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
@@ -116,7 +123,10 @@ __kernel void arithm_flip_rows_D1 (__global char *src, int src_step, int src_off
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src_index_0 = mad24(y,            src_step, x + src_offset - dst_align);
        int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
@@ -158,7 +168,10 @@ __kernel void arithm_flip_rows_D2 (__global ushort *src, int src_step, int src_o
    {
        x = x << 2;
-        #define dst_align (((dst_offset >> 1) & 3) << 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset >> 1) & 3) << 1)
        int src_index_0 = mad24(y,            src_step, (x << 1) + src_offset - dst_align);
        int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
@@ -200,7 +213,10 @@ __kernel void arithm_flip_rows_D3 (__global short *src, int src_step, int src_of
    {
        x = x << 2;
-        #define dst_align (((dst_offset >> 1) & 3) << 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset >> 1) & 3) << 1)
        int src_index_0 = mad24(y,            src_step, (x << 1) + src_offset - dst_align);
        int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);

--- a/modules/ocl/src/opencl/arithm_mul.cl
+++ b/modules/ocl/src/opencl/arithm_mul.cl
@@ -16,7 +16,6 @@
 //
 // @Authors
 //    Jia Haipeng, jiahaipeng95@gmail.com
-//    Dachuan Zhao, dachuan@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -44,11 +43,16 @@
 //
 //M*/
-#if defined DOUBLE_SUPPORT
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
-int4 round_int4(float4 v){
+int4 round_int4(float4 v)
+{
    v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
    v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
    v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
@@ -56,7 +60,8 @@ int4 round_int4(float4 v){
    return convert_int4_sat(v);
 }
-uint4 round_uint4(float4 v){
+uint4 round_uint4(float4 v)
+{
    v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
    v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
    v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
@@ -64,7 +69,8 @@ uint4 round_uint4(float4 v){
    return convert_uint4_sat(v);
 }
-long round_int(float v){
+long round_int(float v)
+{
    v = v + (v > 0 ? 0.5 : -0.5);
    return convert_int_sat(v);
@@ -85,7 +91,10 @@ __kernel void arithm_mul_D0 (__global uchar *src1, int src1_step, int src1_offse
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -130,7 +139,10 @@ __kernel void arithm_mul_D2 (__global ushort *src1, int src1_step, int src1_offs
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -166,7 +178,10 @@ __kernel void arithm_mul_D3 (__global short *src1, int src1_step, int src1_offse
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));