Commit 3e91350a authored by Andrey Pavlenko, committed by OpenCV Buildbot

Merge pull request #1503 from ilya-lavrenov:ocl_arithm

parents 26a1a932 f5af3ab8
@@ -842,54 +842,6 @@ PERF_TEST_P(PowFixture, pow, OCL_TYPICAL_MAT_SIZES)
OCL_PERF_ELSE
}
///////////// MagnitudeSqr////////////////////////
typedef TestBaseWithParam<Size> MagnitudeSqrFixture;
PERF_TEST_P(MagnitudeSqrFixture, MagnitudeSqr, OCL_TYPICAL_MAT_SIZES)
{
const Size srcSize = GetParam();
Mat src1(srcSize, CV_32FC1), src2(srcSize, CV_32FC1),
dst(srcSize, CV_32FC1);
declare.in(src1, src2, WARMUP_RNG).out(dst);
if (RUN_OCL_IMPL)
{
ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, src1.type());
OCL_TEST_CYCLE() cv::ocl::magnitudeSqr(oclSrc1, oclSrc2, oclDst);
oclDst.download(dst);
SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
}
else if (RUN_PLAIN_IMPL)
{
ASSERT_EQ(1, src1.channels());
TEST_CYCLE()
{
for (int y = 0; y < srcSize.height; ++y)
{
const float * const src1Data = reinterpret_cast<float *>(src1.data + src1.step * y);
const float * const src2Data = reinterpret_cast<float *>(src2.data + src2.step * y);
float * const dstData = reinterpret_cast<float *>(dst.data + dst.step * y);
for (int x = 0; x < srcSize.width; ++x)
{
float t0 = src1Data[x] * src1Data[x];
float t1 = src2Data[x] * src2Data[x];
dstData[x] = t0 + t1;
}
}
}
SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
}
else
OCL_PERF_ELSE
}
///////////// AddWeighted////////////////////////
typedef Size_MatType AddWeightedFixture;
......
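For context, the plain-CPU branch of the removed MagnitudeSqr perf test is just a scalar reference for cv::ocl::magnitudeSqr: it accumulates the per-element squared magnitude a*a + b*b row by row. A minimal host-side equivalent using cv::Mat expressions (a sketch for illustration, not code from this commit):

#include <opencv2/core/core.hpp>

// Per-element squared magnitude of two CV_32FC1 matrices, the same quantity
// the removed reference loop computes one row at a time.
cv::Mat magnitudeSqrRef(const cv::Mat &a, const cv::Mat &b)
{
    CV_Assert(a.type() == CV_32FC1 && b.type() == CV_32FC1 && a.size() == b.size());
    return a.mul(a) + b.mul(b);
}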
@@ -130,7 +130,7 @@ public:
{
Size src_size = src.size();
// Delete those two clause below which exist before, However, the result is alos correct
// Delete those two clause below which exist before, However, the result is also correct
// dst.create(src_size, src.type());
// dst = Scalar(0.0);
@@ -394,23 +394,8 @@ public:
{
Filter2DEngine_GPU::apply(src, dst);
//if (iters > 1)
//{
// Size wholesize;
// Point ofs;
// dst.locateROI(wholesize,ofs);
// int rows = dst.rows, cols = dst.cols;
// dst.adjustROI(ofs.y,-ofs.y-rows+dst.wholerows,ofs.x,-ofs.x-cols+dst.wholecols);
// dst.copyTo(morfBuf);
// dst.adjustROI(-ofs.y,ofs.y+rows-dst.wholerows,-ofs.x,ofs.x+cols-dst.wholecols);
// morfBuf.adjustROI(-ofs.y,ofs.y+rows-dst.wholerows,-ofs.x,ofs.x+cols-dst.wholecols);
// //morfBuf.create(src.size(),src.type());
// //Filter2DEngine_GPU::apply(dst, morfBuf);
// //morfBuf.copyTo(dst);
//}
for (int i = 1; i < iters; ++i)
{
//dst.swap(morfBuf);
Size wholesize;
Point ofs;
dst.locateROI(wholesize, ofs);
@@ -720,24 +705,16 @@ public:
virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1))
{
Size src_size = src.size();
//int src_type = src.type();
int cn = src.oclchannels();
//dst.create(src_size, src_type);
//dst = Scalar(0.0);
//dstBuf.create(src_size, src_type);
dstBuf.create(src_size.height + ksize.height - 1, src_size.width, CV_MAKETYPE(CV_32F, cn));
//dstBuf = Scalar(0.0);
normalizeROI(roi, ksize, anchor, src_size);
srcROI = src(roi);
dstROI = dst(roi);
//dstBufROI = dstBuf(roi);
(*rowFilter)(srcROI, dstBuf);
//Mat rm(dstBufROI);
//std::cout << "rm " << rm << endl;
(*columnFilter)(dstBuf, dstROI);
}
@@ -1324,11 +1301,8 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker
CV_Assert(src.oclchannels() == dst.oclchannels());
CV_Assert(ksize == (anchor << 1) + 1);
int src_pix_per_row, dst_pix_per_row;
//int src_offset_x, src_offset_y;
int dst_offset_in_pixel;
src_pix_per_row = src.step / src.elemSize();
//src_offset_x = (src.offset % src.step) / src.elemSize();
//src_offset_y = src.offset / src.step;
dst_pix_per_row = dst.step / dst.elemSize();
dst_offset_in_pixel = dst.offset / dst.elemSize();
@@ -1340,8 +1314,6 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker
args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols));
args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows));
args.push_back(make_pair(sizeof(cl_int), (void *)&src_pix_per_row));
//args.push_back(make_pair(sizeof(cl_int),(void*)&src_offset_x));
//args.push_back(make_pair(sizeof(cl_int),(void*)&src_offset_y));
args.push_back(make_pair(sizeof(cl_int), (void *)&dst_pix_per_row));
args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_in_pixel));
args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
@@ -1360,23 +1332,11 @@ Ptr<BaseColumnFilter_GPU> cv::ocl::getLinearColumnFilter_GPU(int /*bufType*/, in
linearColumnFilter_gpu<int>,
linearColumnFilter_gpu<float>
};
/*
CV_Assert(dstType == CV_8UC4 || dstType == CV_8SC4 || dstType == CV_16UC2 ||
dstType == CV_16SC2 || dstType == CV_32SC1 || dstType == CV_32FC1);
CV_Assert(bufType == CV_8UC4 || bufType == CV_8SC4 || bufType == CV_16UC2 ||
bufType == CV_16SC2 || bufType == CV_32SC1 || bufType == CV_32FC1);
Mat temp(columnKernel.size(), CV_32SC1);
columnKernel.convertTo(temp, CV_32SC1);
Mat cont_krnl = temp.reshape(1, 1);
*/
Mat temp = columnKernel.reshape(1, 1);
oclMat mat_kernel(temp);
int ksize = temp.cols;
//CV_Assert(ksize < 16);
normalizeAnchor(anchor, ksize);
return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, mat_kernel,
@@ -1414,11 +1374,8 @@ void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat
}
if (ddepth < 0)
{
ddepth = src.depth();
}
//CV_Assert(ddepth == src.depth());
dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype);
@@ -1445,18 +1402,10 @@ void cv::ocl::Sobel(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy,
// usually the smoothing part is the slowest to compute,
// so try to scale it instead of the faster differenciating part
if (dx == 0)
{
kx *= scale;
}
else
{
ky *= scale;
}
}
// Mat kx_, ky_;
//ky.convertTo(ky_,CV_32S,1<<8);
//kx.convertTo(kx_,CV_32S,1<<8);
sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, borderType);
}
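A note on the scale handling kept above: the Sobel kernel is separable, and the 2-D kernel is the outer product of the two 1-D factors, so folding the scale into either factor produces the same result; per the comment, the code folds it into the smoothing factor. A small sketch of that equivalence (illustrative only, not part of the patch):

#include <opencv2/imgproc/imgproc.hpp>

void scaleFoldDemo()
{
    cv::Mat kx, ky;
    cv::getDerivKernels(kx, ky, 1, 0, 3, false, CV_32F); // dx = 1, dy = 0
    double scale = 0.5;
    cv::Mat full1 = cv::Mat(ky * kx.t()) * scale;  // scale the assembled 2-D kernel
    cv::Mat full2 = ky * cv::Mat(kx.t() * scale);  // fold the scale into the row factor
    // full1 and full2 are identical up to floating-point rounding.
}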
@@ -1471,18 +1420,10 @@ void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy,
// usually the smoothing part is the slowest to compute,
// so try to scale it instead of the faster differenciating part
if (dx == 0)
{
kx *= scale;
}
else
{
ky *= scale;
}
}
// Mat kx_, ky_;
//ky.convertTo(ky_,CV_32S,1<<8);
//kx.convertTo(kx_,CV_32S,1<<8);
sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, bordertype);
}
@@ -1505,9 +1446,7 @@ void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, d
Mat kernel(3, 3, CV_32S, (void *)K[ksize == 3]);
if (scale != 1)
{
kernel *= scale;
}
filter2D(src, dst, ddepth, kernel, Point(-1, -1));
}
@@ -1526,14 +1465,10 @@ Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do
// automatic detection of kernel size from sigma
if (ksize.width <= 0 && sigma1 > 0)
{
ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1;
}
if (ksize.height <= 0 && sigma2 > 0)
{
ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1;
}
CV_Assert(ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1);
@@ -1544,17 +1479,10 @@ Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do
Mat ky;
if (ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON)
{
ky = kx;
}
else
{
ky = getGaussianKernel(ksize.height, sigma2, std::max(depth, CV_32F));
}
//Mat kx_, ky_;
//kx.convertTo(kx_,CV_32S,1<<8);
//ky.convertTo(ky_,CV_32S,1<<8);
return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype);
}
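As a quick illustration of the automatic kernel-size rule above (the sigma values below are assumed, not from the patch): a non-positive ksize is derived from sigma, rounded, and forced odd by the final bitwise OR with 1.

#include <opencv2/core/core.hpp>

// cvRound(sigma * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1, as in createGaussianFilter_GPU.
int autoGaussianKsize(double sigma, int depth)
{
    return cvRound(sigma * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1;
}
// autoGaussianKsize(1.5, CV_8U)  == 11   (cvRound(10.0) = 10, 10 | 1 = 11)
// autoGaussianKsize(1.5, CV_32F) == 13   (cvRound(13.0) = 13, 13 | 1 = 13)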
@@ -1585,15 +1513,11 @@ void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double si
if (bordertype != BORDER_CONSTANT)
{
if (src.rows == 1)
{
ksize.height = 1;
}
if (src.cols == 1)
{
ksize.width = 1;
}
}
Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype);
f->apply(src, dst);
@@ -1618,6 +1542,7 @@ void cv::ocl::adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize
{
lut.at<float>(idx++) = sigma2 / (sigma2 + x * x + y * y);
}
oclMat dlut(lut);
int depth = src.depth();
int cn = src.oclchannels();
......
@@ -244,9 +244,6 @@ namespace cv
kernelName = "remapNNF1Constant";
}
//int channels = dst.oclchannels();
//int depth = dst.depth();
//int type = src.type();
size_t blkSizeX = 16, blkSizeY = 16;
size_t glbSizeX;
int cols = dst.cols;
@@ -499,21 +496,13 @@ namespace cv
openCLExecuteKernel(clCxt, &imgproc_median, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
}
else
{
CV_Error(CV_StsUnsupportedFormat, "Non-supported filter length");
//string kernelName = "medianFilter";
//args.push_back( make_pair( sizeof(cl_int),(void*)&m));
//openCLExecuteKernel(clCxt,&imgproc_median,kernelName,globalThreads,localThreads,args,src.oclchannels(),-1);
}
}
////////////////////////////////////////////////////////////////////////
// copyMakeBorder
void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int bordertype, const Scalar &scalar)
{
//CV_Assert(src.oclchannels() != 2);
CV_Assert(top >= 0 && bottom >= 0 && left >= 0 && right >= 0);
if((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi
{
@@ -529,10 +518,12 @@ namespace cv
{
CV_Assert((src.cols >= left) && (src.cols >= right) && (src.rows >= top) && (src.rows >= bottom));
}
if(bordertype == cv::BORDER_REFLECT_101)
{
CV_Assert((src.cols > left) && (src.cols > right) && (src.rows > top) && (src.rows > bottom));
}
dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
int srcStep = src.step1() / src.oclchannels();
int dstStep = dst.step1() / dst.oclchannels();
@@ -732,19 +723,6 @@ namespace cv
}
openCLExecuteKernel(src.clCxt, &imgproc_copymakeboder, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
//uchar* cputemp=new uchar[32*dst.wholerows];
////int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
//openCLSafeCall(clEnqueueReadBuffer(src.clCxt->impl->clCmdQueue, (cl_mem)dst.data, CL_TRUE,
// 0, 32*dst.wholerows, cputemp, 0, NULL, NULL));
//for(int i=0;i<dst.wholerows;i++)
//{
// for(int j=0;j<dst.wholecols;j++)
// {
// cout<< (int)cputemp[i*32+j]<<" ";
// }
// cout<<endl;
//}
//delete []cputemp;
}
////////////////////////////////////////////////////////////////////////
@@ -1286,11 +1264,6 @@ namespace cv
if( src.depth() != CV_8U || src.oclchannels() != 4 )
CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
// if(!src.clCxt->supportsFeature(Context::CL_DOUBLE))
// {
// CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n");
// }
dst.create( src.size(), CV_8UC4 );
if( !(criteria.type & TermCriteria::MAX_ITER) )
......
@@ -1013,7 +1013,7 @@ namespace cv
programCache->releaseProgram();
}
bool Context::supportsFeature(int ftype)
bool Context::supportsFeature(int ftype) const
{
switch(ftype)
{
@@ -1028,7 +1028,7 @@ namespace cv
}
}
size_t Context::computeUnits()
size_t Context::computeUnits() const
{
return impl->maxComputeUnits;
}
......
@@ -347,19 +347,14 @@ static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask
localThreads, args, -1, -1, compile_option);
}
void cv::ocl::oclMat::copyTo( oclMat &m ) const
{
CV_DbgAssert(!this->empty());
m.create(size(), type());
openCLCopyBuffer2D(clCxt, m.data, m.step, m.offset,
data, step, cols * elemSize(), rows, offset);
}
void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
{
if (mask.empty())
{
copyTo(mat);
CV_DbgAssert(!this->empty());
mat.create(size(), type());
openCLCopyBuffer2D(clCxt, mat.data, mat.step, mat.offset,
data, step, cols * elemSize(), rows, offset);
}
else
{
......
@@ -38,125 +38,66 @@
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
__kernel
void LUT_C1_D0( __global uchar *dst,
__global const uchar *src,
__constant uchar *table,
int rows,
int cols,
int channels,
int whole_rows,
int whole_cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
__kernel void LUT_C1( __global const srcT * src, __global const dstT *lut,
__global dstT *dst,
int cols1, int rows,
int src_offset1,
int lut_offset1,
int dst_offset1,
int src_step1, int dst_step1)
{
int gidx = get_global_id(0)<<2;
int gidy = get_global_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int x1 = get_global_id(0);
int y = get_global_id(1);
__local uchar l[256];
l[(lidy<<4)+lidx] = table[(lidy<<4)+lidx+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
//clamp(gidx,mask,cols-1);
gidx = gidx >= cols-4?cols-4:gidx;
gidy = gidy >= rows?rows-1:gidy;
int src_index = src_offset + mad24(gidy,src_step,gidx);
int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
uchar4 p,q;
barrier(CLK_LOCAL_MEM_FENCE);
p.x = src[src_index];
p.y = src[src_index+1];
p.z = src[src_index+2];
p.w = src[src_index+3];
if (x1 < cols1 && y < rows)
{
int src_index = mad24(y, src_step1, src_offset1 + x1);
int dst_index = mad24(y, dst_step1, dst_offset1 + x1);
q.x = l[p.x];
q.y = l[p.y];
q.z = l[p.z];
q.w = l[p.w];
*(__global uchar4*)(dst + dst_index) = q;
dst[dst_index] = lut[lut_offset1 + src[src_index]];
}
}
__kernel
void LUT2_C1_D0( __global uchar *dst,
__global const uchar *src,
__constant uchar *table,
int rows,
int precols,
int channels,
int whole_rows,
int cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
__kernel void LUT_C2( __global const srcT * src, __global const dstT *lut,
__global dstT *dst,
int cols1, int rows,
int src_offset1,
int lut_offset1,
int dst_offset1,
int src_step1, int dst_step1)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
//int lidx = get_local_id(0);
int lidy = get_local_id(1);
__local uchar l[256];
l[lidy] = table[lidy+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
int x1 = get_global_id(0) << 1;
int y = get_global_id(1);
if (x1 < cols1 && y < rows)
{
int src_index = mad24(y, src_step1, src_offset1 + x1);
int dst_index = mad24(y, dst_step1, dst_offset1 + x1);
//clamp(gidx,mask,cols-1);
gidx = gidx >= precols ? cols+gidx : gidx;
gidy = gidy >= rows?rows-1:gidy;
int src_index = src_offset + mad24(gidy,src_step,gidx);
int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
//uchar4 p,q;
barrier(CLK_LOCAL_MEM_FENCE);
uchar p = src[src_index];
uchar q = l[p];
dst[dst_index] = q;
dst[dst_index ] = lut[lut_offset1 + (src[src_index ] << 1) ];
dst[dst_index + 1] = x1 + 1 < cols1 ? lut[lut_offset1 + (src[src_index + 1] << 1) + 1] : dst[dst_index + 1];
}
}
__kernel
void LUT_C4_D0( __global uchar4 *dst,
__global uchar4 *src,
__constant uchar *table,
int rows,
int cols,
int channels,
int whole_rows,
int whole_cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
__kernel void LUT_C4( __global const srcT * src, __global const dstT *lut,
__global dstT *dst,
int cols1, int rows,
int src_offset1,
int lut_offset1,
int dst_offset1,
int src_step1, int dst_step1)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int x1 = get_global_id(0) << 2;
int y = get_global_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int src_index = mad24(gidy,src_step,gidx+src_offset);
int dst_index = mad24(gidy,dst_step,gidx+dst_offset);
__local uchar l[256];
l[lidy*16+lidx] = table[lidy*16+lidx+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if(gidx<cols && gidy<rows)
if (x1 < cols1 && y < rows)
{
uchar4 p = src[src_index];
uchar4 q;
q.x = l[p.x];
q.y = l[p.y];
q.z = l[p.z];
q.w = l[p.w];
dst[dst_index] = q;
int src_index = mad24(y, src_step1, src_offset1 + x1);
int dst_index = mad24(y, dst_step1, dst_offset1 + x1);
dst[dst_index ] = lut[lut_offset1 + (src[src_index ] << 2) ];
dst[dst_index + 1] = x1 + 1 < cols1 ? lut[lut_offset1 + (src[src_index + 1] << 2) + 1] : dst[dst_index + 1];
dst[dst_index + 2] = x1 + 2 < cols1 ? lut[lut_offset1 + (src[src_index + 2] << 2) + 2] : dst[dst_index + 2];
dst[dst_index + 3] = x1 + 3 < cols1 ? lut[lut_offset1 + (src[src_index + 3] << 2) + 3] : dst[dst_index + 3];
}
}
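For reference, the rewritten LUT_C2/LUT_C4 kernels index an interleaved multi-channel table: channel c of input value v is read from lut[v * cn + c], and the tail guards (x1 + k < cols1) keep the unrolled writes inside the row. A scalar sketch of the same indexing (the flat uchar layout here is an assumption for illustration, not code from the commit):

#include <opencv2/core/core.hpp>

// src, dst: interleaved cn-channel 8-bit images with n pixels;
// lut: interleaved table where lut[v * cn + c] maps value v in channel c.
void lutRef(const uchar *src, const uchar *lut, uchar *dst, int n, int cn)
{
    for (int i = 0; i < n * cn; ++i)
        dst[i] = lut[src[i] * cn + (i % cn)];
}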
@@ -42,52 +42,70 @@
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////EXP//////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_exp_D5(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global float *src, __global float *dst)
__kernel void arithm_exp_C1(__global srcT *src, __global srcT *dst,
int cols1, int rows,
int srcOffset1, int dstOffset1,
int srcStep1, int dstStep1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < cols && y < rows)
if(x < cols1 && y < rows)
{
x = x << 2;
int srcIdx = mad24( y, srcStep, x + srcOffset);
int dstIdx = mad24( y, dstStep, x + dstOffset);
int srcIdx = mad24(y, srcStep1, x + srcOffset1);
int dstIdx = mad24(y, dstStep1, x + dstOffset1);
float src_data = *((__global float *)((__global char *)src + srcIdx));
float dst_data = exp(src_data);
dst[dstIdx] = exp(src[srcIdx]);
}
}
__kernel void arithm_exp_C2(__global srcT *src, __global srcT *dst,
int cols1, int rows,
int srcOffset1, int dstOffset1,
int srcStep1, int dstStep1)
{
int x1 = get_global_id(0) << 1;
int y = get_global_id(1);
*((__global float *)((__global char *)dst + dstIdx)) = dst_data;
if(x1 < cols1 && y < rows)
{
int srcIdx = mad24(y, srcStep1, x1 + srcOffset1);
int dstIdx = mad24(y, dstStep1, x1 + dstOffset1);
dst[dstIdx] = exp(src[srcIdx]);
dst[dstIdx + 1] = x1 + 1 < cols1 ? exp(src[srcIdx + 1]) : dst[dstIdx + 1];
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_exp_D6(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global double *src, __global double *dst)
__kernel void arithm_exp_C4(__global srcT *src, __global srcT *dst,
int cols1, int rows,
int srcOffset1, int dstOffset1,
int srcStep1, int dstStep1)
{
int x = get_global_id(0);
int x1 = get_global_id(0) << 2;
int y = get_global_id(1);
if(x < cols && y < rows )
{
x = x << 3;
int srcIdx = mad24( y, srcStep, x + srcOffset);
int dstIdx = mad24( y, dstStep, x + dstOffset);
double src_data = *((__global double *)((__global char *)src + srcIdx));
double dst_data = exp(src_data);
if(x1 < cols1 && y < rows)
{
int srcIdx = mad24(y, srcStep1, x1 + srcOffset1);
int dstIdx = mad24(y, dstStep1, x1 + dstOffset1);
*((__global double *)((__global char *)dst + dstIdx )) = dst_data;
// dst[dstIdx] = exp(src[srcIdx]);
dst[dstIdx] = exp(src[srcIdx]);
dst[dstIdx + 1] = x1 + 1 < cols1 ? exp(src[srcIdx + 1]) : dst[dstIdx + 1];
dst[dstIdx + 2] = x1 + 2 < cols1 ? exp(src[srcIdx + 2]) : dst[dstIdx + 2];
dst[dstIdx + 3] = x1 + 3 < cols1 ? exp(src[srcIdx + 3]) : dst[dstIdx + 3];
}
}
#endif
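The new exp kernels follow the same unrolling scheme: the C2 and C4 variants process two or four consecutive elements per work-item and guard the ragged tail with x1 + k < cols1 so the last partial group never writes past the row. A scalar sketch of that pattern (names and the float element type are assumed for illustration):

#include <cmath>

// Unroll-by-4 with a tail guard, mirroring the bounds checks in arithm_exp_C4.
void expRowRef(const float *src, float *dst, int cols)
{
    for (int x = 0; x < cols; x += 4)
        for (int k = 0; k < 4 && x + k < cols; ++k)
            dst[x + k] = std::exp(src[x + k]);
}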
@@ -44,9 +44,14 @@
//M*/
/**************************************PUBLICFUNC*************************************/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif
#if defined (DEPTH_0)
#define VEC_TYPE uchar8
......
@@ -48,7 +48,7 @@
#define MHEIGHT 256
#define MIN_VALUE 171
#define MAX_VALUE 351
#define MAX_VALUE 357
//#define RANDOMROI
int randomInt(int minVal, int maxVal);
......