diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp
index c25dddd4dd96c2c09f5e199c551db7a0874357a7..3ce7ba62acceb3cb9c7e0a2cfd81180891d2dc4a 100644
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@@ -1033,67 +1033,117 @@ namespace cv
             else
                 scale = 1. / scale;
 
-            if (ksize > 0)
+            const int sobel_lsz = 16;
+            if((src.type() == CV_8UC1 || src.type() == CV_32FC1) &&
+                (ksize==3 || ksize==5 || ksize==7 || ksize==-1) &&
+                src.wholerows > sobel_lsz + (ksize>>1) &&
+                src.wholecols > sobel_lsz + (ksize>>1))
             {
-                Context* clCxt = Context::getContext();
-                if(clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) && src.type() == CV_8UC1 &&
-                    src.cols % 8 == 0 && src.rows % 8 == 0 &&
-                    ksize==3 &&
-                    (borderType ==cv::BORDER_REFLECT ||
-                     borderType == cv::BORDER_REPLICATE ||
-                     borderType ==cv::BORDER_REFLECT101 ||
-                     borderType ==cv::BORDER_WRAP))
+                Dx.create(src.size(), CV_32FC1);
+                Dy.create(src.size(), CV_32FC1);
+
+                CV_Assert(Dx.rows == Dy.rows && Dx.cols == Dy.cols);
+
+                size_t lt2[3] = {sobel_lsz, sobel_lsz, 1};
+                size_t gt2[3] = {lt2[0]*(1 + (src.cols-1) / lt2[0]), lt2[1]*(1 + (src.rows-1) / lt2[1]), 1};
+
+                unsigned int src_pitch = src.step;
+                unsigned int Dx_pitch = Dx.step;
+                unsigned int Dy_pitch = Dy.step;
+
+                int src_offset_x = (src.offset % src.step) / src.elemSize();
+                int src_offset_y = src.offset / src.step;
+
+                float _scale = scale;
+
+                std::vector<std::pair<size_t , const void *> > args;
+                args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&src.data ));
+                args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch ));
+
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_x ));
+                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_y ));
+
+                args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&Dx.data ));
+                args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&Dx.offset ));
+                args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&Dx_pitch ));
+                args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&Dy.data ));
+                args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&Dy.offset ));
+                args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&Dy_pitch ));
+
+                args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&src.wholecols ));
+                args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&src.wholerows ));
+
+                args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&Dx.cols ));
+                args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&Dx.rows ));
+
+                args.push_back( std::make_pair( sizeof(cl_float), (void *)&_scale ));
+
+                string option = cv::format("-D BLK_X=%d -D BLK_Y=%d",(int)lt2[0],(int)lt2[1]);
+                switch(src.type())
                 {
-                    Dx.create(src.size(), CV_32FC1);
-                    Dy.create(src.size(), CV_32FC1);
-
-                    const unsigned int block_x = 8;
-                    const unsigned int block_y = 8;
-
-                    unsigned int src_pitch = src.step;
-                    unsigned int dst_pitch = Dx.cols;
-
-                    float _scale = scale;
-
-                    std::vector<std::pair<size_t , const void *> > args;
-                    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
-                    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data ));
-                    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data ));
-                    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
-                    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
-                    args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch ));
-                    args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch ));
-                    args.push_back( std::make_pair( sizeof(cl_float) , (void *)&_scale ));
-                    size_t gt2[3] = {src.cols, src.rows, 1}, lt2[3] = {block_x, block_y, 1};
-
-                    string option = "-D BLK_X=8 -D BLK_Y=8";
-                    switch(borderType)
-                    {
-                    case cv::BORDER_REPLICATE:
-                        option += " -D BORDER_REPLICATE";
-                        break;
-                    case cv::BORDER_REFLECT:
-                        option += " -D BORDER_REFLECT";
-                        break;
-                    case cv::BORDER_REFLECT101:
-                        option += " -D BORDER_REFLECT101";
-                        break;
-                    case cv::BORDER_WRAP:
-                        option += " -D BORDER_WRAP";
-                        break;
-                    }
-                    openCLExecuteKernel(src.clCxt, &imgproc_sobel3, "sobel3", gt2, lt2, args, -1, -1, option.c_str() );
+                case CV_8UC1:
+                    option += " -D SRCTYPE=uchar";
+                    break;
+                case CV_32FC1:
+                    option += " -D SRCTYPE=float";
+                    break;
                 }
-                else
+                switch(borderType)
                 {
-                    Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType);
-                    Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType);
+                case cv::BORDER_CONSTANT:
+                    option += " -D BORDER_CONSTANT";
+                    break;
+                case cv::BORDER_REPLICATE:
+                    option += " -D BORDER_REPLICATE";
+                    break;
+                case cv::BORDER_REFLECT:
+                    option += " -D BORDER_REFLECT";
+                    break;
+                case cv::BORDER_REFLECT101:
+                    option += " -D BORDER_REFLECT_101";
+                    break;
+                case cv::BORDER_WRAP:
+                    option += " -D BORDER_WRAP";
+                    break;
+                default:
+                    CV_Error(CV_StsBadFlag, "BORDER type is not supported!");
+                    break;
                 }
+
+                string kernel_name;
+                switch(ksize)
+                {
+                case -1:
+                    option += " -D SCHARR";
+                    kernel_name = "sobel3";
+                    break;
+                case 3:
+                    kernel_name = "sobel3";
+                    break;
+                case 5:
+                    kernel_name = "sobel5";
+                    break;
+                case 7:
+                    kernel_name = "sobel7";
+                    break;
+                default:
+                    CV_Error(CV_StsBadFlag, "Kernel size is not supported!");
+                    break;
+                }
+                openCLExecuteKernel(src.clCxt, &imgproc_sobel3, kernel_name, gt2, lt2, args, -1, -1, option.c_str() );
             }
             else
             {
-                Scharr(src, Dx, CV_32F, 1, 0, scale, 0, borderType);
-                Scharr(src, Dy, CV_32F, 0, 1, scale, 0, borderType);
+                if (ksize > 0)
+                {
+                    Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType);
+                    Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType);
+                }
+                else
+                {
+                    Scharr(src, Dx, CV_32F, 1, 0, scale, 0, borderType);
+                    Scharr(src, Dy, CV_32F, 0, 1, scale, 0, borderType);
+                }
             }
             CV_Assert(Dx.offset == 0 && Dy.offset == 0);
         }
diff --git a/modules/ocl/src/opencl/imgproc_sobel3.cl b/modules/ocl/src/opencl/imgproc_sobel3.cl
index d6a995f5527bda3f30573079225419fbe5d43a3c..8356fce01818078c2d04c10176a67e271ce11bdd 100644
--- a/modules/ocl/src/opencl/imgproc_sobel3.cl
+++ b/modules/ocl/src/opencl/imgproc_sobel3.cl
@@ -1,45 +1,97 @@
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////Macro for border type////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef BORDER_REPLICATE
-//BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (t_edge)   :(i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 :(addr))
-#endif
 
-#ifdef BORDER_REFLECT
-//BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)-1               : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)-1 : (i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
+#ifdef BORDER_CONSTANT
+//CCCCCC|abcdefgh|CCCCCCC
+#define EXTRAPOLATE(x, maxV)
+#elif defined BORDER_REPLICATE
+//aaaaaa|abcdefgh|hhhhhhh
+#define EXTRAPOLATE(x, maxV) \
+    { \
+        (x) = max(min((x), (maxV) - 1), 0); \
+    }
+#elif defined BORDER_WRAP
+//cdefgh|abcdefgh|abcdefg
+#define EXTRAPOLATE(x, maxV) \
+    { \
+        (x) = ( (x) + (maxV) ) % (maxV); \
+    }
+#elif defined BORDER_REFLECT
+//fedcba|abcdefgh|hgfedcb
+#define EXTRAPOLATE(x, maxV) \
+    { \
+        (x) = min( mad24((maxV)-1,2,-(x))+1 , max((x),-(x)-1) ); \
+    }
+#elif defined BORDER_REFLECT_101
+//gfedcb|abcdefgh|gfedcba
+#define EXTRAPOLATE(x, maxV) \
+    { \
+        (x) = min( mad24((maxV)-1,2,-(x)), max((x),-(x)) ); \
+    }
+#else
+#error No extrapolation method
 #endif
 
-#ifdef BORDER_REFLECT101
-//BORDER_REFLECT101:   gfedcb|abcdefgh|gfedcba
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)                 : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)                 : (i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
-#endif
+#define SRC(_x,_y) convert_float(((global SRCTYPE*)(Src+(_y)*SrcPitch))[_x])
 
-#ifdef BORDER_WRAP
-//BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (i)+(b_edge) : (i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
+#ifdef BORDER_CONSTANT
+//CCCCCC|abcdefgh|CCCCCCC
+#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y))
+#else
+#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))
 #endif
 
+#define DSTX(_x,_y) (((global float*)(DstX+DstXOffset+(_y)*DstXPitch))[_x])
+#define DSTY(_x,_y) (((global float*)(DstY+DstYOffset+(_y)*DstYPitch))[_x])
+
+#define INIT_AND_READ_LOCAL_SOURCE(width, height, fill_const, kernel_border) \
+    int srcX = x + srcOffsetX - (kernel_border); \
+    int srcY = y + srcOffsetY - (kernel_border); \
+    int xb = srcX; \
+    int yb = srcY; \
+    \
+    EXTRAPOLATE(xb, (width)); \
+    EXTRAPOLATE(yb, (height)); \
+    lsmem[liy][lix] = ELEM(xb, yb, (width), (height), (fill_const) ); \
+    \
+    if(lix < ((kernel_border)*2)) \
+    { \
+        int xb = srcX+BLK_X; \
+        EXTRAPOLATE(xb,(width)); \
+        lsmem[liy][lix+BLK_X] = ELEM(xb, yb, (width), (height), (fill_const) ); \
+    } \
+    if(liy< ((kernel_border)*2)) \
+    { \
+        int yb = srcY+BLK_Y; \
+        EXTRAPOLATE(yb, (height)); \
+        lsmem[liy+BLK_Y][lix] = ELEM(xb, yb, (width), (height), (fill_const) ); \
+    } \
+    if(lix<((kernel_border)*2) && liy<((kernel_border)*2)) \
+    { \
+        int xb = srcX+BLK_X; \
+        int yb = srcY+BLK_Y; \
+        EXTRAPOLATE(xb,(width)); \
+        EXTRAPOLATE(yb,(height)); \
+        lsmem[liy+BLK_Y][lix+BLK_X] = ELEM(xb, yb, (width), (height), (fill_const) ); \
+    }
+
 __kernel void sobel3(
         __global uchar* Src,
-        __global float* DstX,
-        __global float* DstY,
-        int width, int height,
-        uint srcStride, uint dstStride,
-        float scale
+        const uint      SrcPitch,
+        const int       srcOffsetX,
+        const int       srcOffsetY,
+        __global uchar* DstX,
+        const int       DstXOffset,
+        const uint      DstXPitch,
+        __global uchar* DstY,
+        const int       DstYOffset,
+        const uint      DstYPitch,
+        int             width,
+        int             height,
+        int             dstWidth,
+        int             dstHeight,
+        float           scale
         )
 {
     __local float lsmem[BLK_Y+2][BLK_X+2];
@@ -47,62 +99,249 @@ __kernel void sobel3(
     int lix = get_local_id(0);
     int liy = get_local_id(1);
 
-    int gix = get_group_id(0);
-    int giy = get_group_id(1);
-
-    int id_x = get_global_id(0);
-    int id_y = get_global_id(1);
-
-    lsmem[liy+1][lix+1] = convert_float(Src[ id_y * srcStride + id_x ]);
-
-    int id_y_h = ADDR_H(id_y-1, 0,height);
-    int id_y_b = ADDR_B(id_y+1, height,id_y+1);
-
-    int id_x_l = ADDR_L(id_x-1, 0,width);
-    int id_x_r = ADDR_R(id_x+1, width,id_x+1);
-
-    if(liy==0)
-    {
-        lsmem[0][lix+1]=convert_float(Src[ id_y_h * srcStride + id_x ]);
-
-        if(lix==0)
-            lsmem[0][0]=convert_float(Src[ id_y_h * srcStride + id_x_l ]);
-        else if(lix==BLK_X-1)
-            lsmem[0][BLK_X+1]=convert_float(Src[ id_y_h * srcStride + id_x_r ]);
-    }
-    else if(liy==BLK_Y-1)
-    {
-        lsmem[BLK_Y+1][lix+1]=convert_float(Src[ id_y_b * srcStride + id_x ]);
-
-        if(lix==0)
-            lsmem[BLK_Y+1][0]=convert_float(Src[ id_y_b * srcStride + id_x_l ]);
-        else if(lix==BLK_X-1)
-            lsmem[BLK_Y+1][BLK_X+1]=convert_float(Src[ id_y_b * srcStride + id_x_r ]);
-    }
-
-    if(lix==0)
-        lsmem[liy+1][0]    = convert_float(Src[ id_y * srcStride + id_x_l ]);
-    else if(lix==BLK_X-1)
-        lsmem[liy+1][BLK_X+1] = convert_float(Src[ id_y * srcStride + id_x_r ]);
+    int x = (int)get_global_id(0);
+    int y = (int)get_global_id(1);
 
+    INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 1)
     barrier(CLK_LOCAL_MEM_FENCE);
 
+    if( x >= dstWidth || y >=dstHeight )  return;
+
     float u1 = lsmem[liy][lix];
     float u2 = lsmem[liy][lix+1];
     float u3 = lsmem[liy][lix+2];
 
     float m1 = lsmem[liy+1][lix];
-    float m2 = lsmem[liy+1][lix+1];
     float m3 = lsmem[liy+1][lix+2];
 
     float b1 = lsmem[liy+2][lix];
     float b2 = lsmem[liy+2][lix+1];
     float b3 = lsmem[liy+2][lix+2];
 
-    //m2 * scale;//
-    float dx = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1 );
-    DstX[ id_y * dstStride + id_x ] = dx * scale;
+    //calc and store dx and dy;//
+#ifdef SCHARR
+    DSTX(x,y) = mad(10.0f, m3 - m1, 3.0f * (u3 - u1 + b3 - b1)) * scale;
+    DSTY(x,y) = mad(10.0f, b2 - u2, 3.0f * (b1 - u1 + b3 - u3)) * scale;
+#else
+    DSTX(x,y) = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1) * scale;
+    DSTY(x,y) = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3) * scale;
+#endif
+}
+
+__kernel void sobel5(
+        __global uchar* Src,
+        const uint      SrcPitch,
+        const int       srcOffsetX,
+        const int       srcOffsetY,
+        __global uchar* DstX,
+        const int       DstXOffset,
+        const uint      DstXPitch,
+        __global uchar* DstY,
+        const int       DstYOffset,
+        const uint      DstYPitch,
+        int             width,
+        int             height,
+        int             dstWidth,
+        int             dstHeight,
+        float           scale
+        )
+{
+    __local float lsmem[BLK_Y+4][BLK_X+4];
+
+    int lix = get_local_id(0);
+    int liy = get_local_id(1);
+
+    int x = (int)get_global_id(0);
+    int y = (int)get_global_id(1);
+
+    INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 2)
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if( x >= dstWidth || y >=dstHeight )  return;
+
+    float t1 = lsmem[liy][lix];
+    float t2 = lsmem[liy][lix+1];
+    float t3 = lsmem[liy][lix+2];
+    float t4 = lsmem[liy][lix+3];
+    float t5 = lsmem[liy][lix+4];
+
+    float u1 = lsmem[liy+1][lix];
+    float u2 = lsmem[liy+1][lix+1];
+    float u3 = lsmem[liy+1][lix+2];
+    float u4 = lsmem[liy+1][lix+3];
+    float u5 = lsmem[liy+1][lix+4];
+
+    float m1 = lsmem[liy+2][lix];
+    float m2 = lsmem[liy+2][lix+1];
+    float m4 = lsmem[liy+2][lix+3];
+    float m5 = lsmem[liy+2][lix+4];
+
+    float l1 = lsmem[liy+3][lix];
+    float l2 = lsmem[liy+3][lix+1];
+    float l3 = lsmem[liy+3][lix+2];
+    float l4 = lsmem[liy+3][lix+3];
+    float l5 = lsmem[liy+3][lix+4];
+
+    float b1 = lsmem[liy+4][lix];
+    float b2 = lsmem[liy+4][lix+1];
+    float b3 = lsmem[liy+4][lix+2];
+    float b4 = lsmem[liy+4][lix+3];
+    float b5 = lsmem[liy+4][lix+4];
+
+    //calc and store dx and dy;//
+    DSTX(x,y) = scale *
+        mad(12.0f, m4 - m2,
+            mad(6.0f, m5 - m1,
+                mad(8.0f, u4 - u2 + l4 - l2,
+                    mad(4.0f, u5 - u1 + l5 - l1,
+                        mad(2.0f, t4 - t2 + b4 - b2, t5 - t1 + b5 - b1 )
+                        )
+                    )
+                )
+            );
+
+    DSTY(x,y) = scale *
+        mad(12.0f, l3 - u3,
+            mad(6.0f, b3 - t3,
+                mad(8.0f, l2 - u2 + l4 - u4,
+                    mad(4.0f, b2 - t2 + b4 - t4,
+                        mad(2.0f, l1 - u1 + l5 - u5, b1 - t1 + b5 - t5 )
+                        )
+                    )
+                )
+            );
+}
+
+__kernel void sobel7(
+        __global uchar* Src,
+        const uint      SrcPitch,
+        const int       srcOffsetX,
+        const int       srcOffsetY,
+        __global uchar* DstX,
+        const int       DstXOffset,
+        const uint      DstXPitch,
+        __global uchar* DstY,
+        const int       DstYOffset,
+        const uint      DstYPitch,
+        int             width,
+        int             height,
+        int             dstWidth,
+        int             dstHeight,
+        float           scale
+        )
+{
+    __local float lsmem[BLK_Y+6][BLK_X+6];
+
+    int lix = get_local_id(0);
+    int liy = get_local_id(1);
+
+    int x = (int)get_global_id(0);
+    int y = (int)get_global_id(1);
+
+    INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 3)
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if( x >= dstWidth || y >=dstHeight )  return;
+
+    float tt1 = lsmem[liy][lix];
+    float tt2 = lsmem[liy][lix+1];
+    float tt3 = lsmem[liy][lix+2];
+    float tt4 = lsmem[liy][lix+3];
+    float tt5 = lsmem[liy][lix+4];
+    float tt6 = lsmem[liy][lix+5];
+    float tt7 = lsmem[liy][lix+6];
+
+    float t1 = lsmem[liy+1][lix];
+    float t2 = lsmem[liy+1][lix+1];
+    float t3 = lsmem[liy+1][lix+2];
+    float t4 = lsmem[liy+1][lix+3];
+    float t5 = lsmem[liy+1][lix+4];
+    float t6 = lsmem[liy+1][lix+5];
+    float t7 = lsmem[liy+1][lix+6];
+
+    float u1 = lsmem[liy+2][lix];
+    float u2 = lsmem[liy+2][lix+1];
+    float u3 = lsmem[liy+2][lix+2];
+    float u4 = lsmem[liy+2][lix+3];
+    float u5 = lsmem[liy+2][lix+4];
+    float u6 = lsmem[liy+2][lix+5];
+    float u7 = lsmem[liy+2][lix+6];
+
+    float m1 = lsmem[liy+3][lix];
+    float m2 = lsmem[liy+3][lix+1];
+    float m3 = lsmem[liy+3][lix+2];
+    float m5 = lsmem[liy+3][lix+4];
+    float m6 = lsmem[liy+3][lix+5];
+    float m7 = lsmem[liy+3][lix+6];
+
+    float l1 = lsmem[liy+4][lix];
+    float l2 = lsmem[liy+4][lix+1];
+    float l3 = lsmem[liy+4][lix+2];
+    float l4 = lsmem[liy+4][lix+3];
+    float l5 = lsmem[liy+4][lix+4];
+    float l6 = lsmem[liy+4][lix+5];
+    float l7 = lsmem[liy+4][lix+6];
+
+    float b1 = lsmem[liy+5][lix];
+    float b2 = lsmem[liy+5][lix+1];
+    float b3 = lsmem[liy+5][lix+2];
+    float b4 = lsmem[liy+5][lix+3];
+    float b5 = lsmem[liy+5][lix+4];
+    float b6 = lsmem[liy+5][lix+5];
+    float b7 = lsmem[liy+5][lix+6];
+
+    float bb1 = lsmem[liy+6][lix];
+    float bb2 = lsmem[liy+6][lix+1];
+    float bb3 = lsmem[liy+6][lix+2];
+    float bb4 = lsmem[liy+6][lix+3];
+    float bb5 = lsmem[liy+6][lix+4];
+    float bb6 = lsmem[liy+6][lix+5];
+    float bb7 = lsmem[liy+6][lix+6];
+
+    //calc and store dx and dy
+    DSTX(x,y) = scale *
+        mad(100.0f, m5 - m3,
+            mad(80.0f, m6 - m2,
+                mad(20.0f, m7 - m1,
+                    mad(75.0f, u5 - u3 + l5 - l3,
+                        mad(60.0f, u6 - u2 + l6 - l2,
+                            mad(15.0f, u7 - u1 + l7 - l1,
+                                mad(30.0f, t5 - t3 + b5 - b3,
+                                    mad(24.0f, t6 - t2 + b6 - b2,
+                                        mad(6.0f, t7 - t1 + b7 - b1,
+                                            mad(5.0f, tt5 - tt3 + bb5 - bb3,
+                                                mad(4.0f, tt6 - tt2 + bb6 - bb2, tt7 - tt1 + bb7 - bb1 )
+                                                )
+                                            )
+                                        )
+                                    )
+                                )
+                            )
+                        )
+                    )
+                )
+            );
 
-    float dy = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3);
-    DstY[ id_y * dstStride + id_x ] = dy * scale;
-}
\ No newline at end of file
+    DSTY(x,y) = scale *
+        mad(100.0f, l4 - u4,
+            mad(80.0f, b4 - t4,
+                mad(20.0f, bb4 - tt4,
+                    mad(75.0f, l5 - u5 + l3 - u3,
+                        mad(60.0f, b5 - t5 + b3 - t3,
+                            mad(15.0f, bb5 - tt5 + bb3 - tt3,
+                                mad(30.0f, l6 - u6 + l2 - u2,
+                                    mad(24.0f, b6 - t6 + b2 - t2,
+                                        mad(6.0f, bb6 - tt6 + bb2 - tt2,
+                                            mad(5.0f, l7 - u7 + l1 - u1,
+                                                mad(4.0f, b7 - t7 + b1 - t1, bb7 - tt7 + bb1 - tt1 )
+                                                )
+                                            )
+                                        )
+                                    )
+                                )
+                            )
+                        )
+                    )
+                )
+            );
+}