Merge pull request #2491 from ilya-lavrenov:tapi_sep_filter

d8c01828 · Andrey Pavlenko · OpenCV Buildbot · 157f35ef · 2875ce60 · d8c01828
Commit d8c01828 authored Mar 24, 2014 by Andrey Pavlenko Committed by OpenCV Buildbot Mar 24, 2014
12 changed files
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -118,6 +118,8 @@ public:
    virtual int kind() const;
    virtual int dims(int i=-1) const;
+    virtual int cols(int i=-1) const; // width component of size(i)
+    virtual int rows(int i=-1) const; // height component of size(i)
    virtual Size size(int i=-1) const;
    virtual int sizend(int* sz, int i=-1) const;
    virtual bool sameSize(const _InputArray& arr) const;

--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@@ -592,7 +592,7 @@ protected:
 CV_EXPORTS const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf);
 CV_EXPORTS const char* typeToStr(int t);
 CV_EXPORTS const char* memopTypeToStr(int t);
-CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1);
+CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1, const char * name = NULL); // name: identifier emitted in the -D build option; NULL falls back to COEFF
 CV_EXPORTS void getPlatfomsInfo(std::vector<PlatformInfo>& platform_info);
 CV_EXPORTS int predictOptimalVectorWidth(InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(),
                                         InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(),

--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -1416,6 +1416,16 @@ int _InputArray::kind() const
    return flags & KIND_MASK;
 }
+int _InputArray::rows(int i) const // number of rows, taken as the height of size(i)
+{
+    return size(i).height;
+}
+int _InputArray::cols(int i) const // number of columns, taken as the width of size(i)
+{
+    return size(i).width;
+}
 Size _InputArray::size(int i) const
 {
    int k = kind();
@@ -2078,45 +2088,45 @@ void _OutputArray::create(Size _sz, int mtype, int i, bool allowTransposed, int
    create(2, sizes, mtype, i, allowTransposed, fixedDepthMask);
 }
-void _OutputArray::create(int rows, int cols, int mtype, int i, bool allowTransposed, int fixedDepthMask) const
+void _OutputArray::create(int _rows, int _cols, int mtype, int i, bool allowTransposed, int fixedDepthMask) const // 2-D overload; NOTE(review): parameters were renamed with a leading underscore, presumably to stop them hiding the new rows()/cols() accessors - confirm
 {
    int k = kind();
    if( k == MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
    {
-        CV_Assert(!fixedSize() || ((Mat*)obj)->size.operator()() == Size(cols, rows));
+        CV_Assert(!fixedSize() || ((Mat*)obj)->size.operator()() == Size(_cols, _rows)); // a fixed-size destination must already have the requested size
        CV_Assert(!fixedType() || ((Mat*)obj)->type() == mtype);
-        ((Mat*)obj)->create(rows, cols, mtype);
+        ((Mat*)obj)->create(_rows, _cols, mtype); // fast path: forward straight to the wrapped object's create()
        return;
    }
    if( k == UMAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
    {
-        CV_Assert(!fixedSize() || ((UMat*)obj)->size.operator()() == Size(cols, rows));
+        CV_Assert(!fixedSize() || ((UMat*)obj)->size.operator()() == Size(_cols, _rows));
        CV_Assert(!fixedType() || ((UMat*)obj)->type() == mtype);
-        ((UMat*)obj)->create(rows, cols, mtype);
+        ((UMat*)obj)->create(_rows, _cols, mtype);
        return;
    }
    if( k == GPU_MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
    {
-        CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == Size(cols, rows));
+        CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == Size(_cols, _rows));
        CV_Assert(!fixedType() || ((cuda::GpuMat*)obj)->type() == mtype);
-        ((cuda::GpuMat*)obj)->create(rows, cols, mtype);
+        ((cuda::GpuMat*)obj)->create(_rows, _cols, mtype);
        return;
    }
    if( k == OPENGL_BUFFER && i < 0 && !allowTransposed && fixedDepthMask == 0 )
    {
-        CV_Assert(!fixedSize() || ((ogl::Buffer*)obj)->size() == Size(cols, rows));
+        CV_Assert(!fixedSize() || ((ogl::Buffer*)obj)->size() == Size(_cols, _rows));
        CV_Assert(!fixedType() || ((ogl::Buffer*)obj)->type() == mtype);
-        ((ogl::Buffer*)obj)->create(rows, cols, mtype);
+        ((ogl::Buffer*)obj)->create(_rows, _cols, mtype);
        return;
    }
    if( k == CUDA_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
    {
-        CV_Assert(!fixedSize() || ((cuda::CudaMem*)obj)->size() == Size(cols, rows));
+        CV_Assert(!fixedSize() || ((cuda::CudaMem*)obj)->size() == Size(_cols, _rows));
        CV_Assert(!fixedType() || ((cuda::CudaMem*)obj)->type() == mtype);
-        ((cuda::CudaMem*)obj)->create(rows, cols, mtype);
+        ((cuda::CudaMem*)obj)->create(_rows, _cols, mtype);
        return;
    }
-    int sizes[] = {rows, cols};
+    int sizes[] = {_rows, _cols}; // generic path: delegate to the n-dimensional create() overload
    create(2, sizes, mtype, i, allowTransposed, fixedDepthMask);
 }

--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -4307,7 +4307,7 @@ static std::string kerToStr(const Mat & k)
    return stream.str();
 }
-String kernelToStr(InputArray _kernel, int ddepth)
+String kernelToStr(InputArray _kernel, int ddepth, const char * name)
 {
    Mat kernel = _kernel.getMat().reshape(1, 1);
@@ -4318,13 +4318,13 @@ String kernelToStr(InputArray _kernel, int ddepth)
    if (ddepth != depth)
        kernel.convertTo(kernel, ddepth);
-    typedef std::string (*func_t)(const Mat &);
+    typedef std::string (* func_t)(const Mat &);
-    static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>,kerToStr<short>,
+    static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
                                    kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
    const func_t func = funcs[depth];
    CV_Assert(func != 0);
-    return cv::format(" -D COEFF=%s", func(kernel).c_str());
+    return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
 }
 #define PROCESS_SRC(src) \

--- a/modules/imgproc/perf/opencl/perf_filters.cpp
+++ b/modules/imgproc/perf/opencl/perf_filters.cpp
@@ -211,7 +211,7 @@ OCL_PERF_TEST_P(SobelFixture, Sobel,
    OCL_TEST_CYCLE() cv::Sobel(src, dst, -1, dx, dy);
-    SANITY_CHECK(dst);
+    SANITY_CHECK(dst, 1e-6);
 }
 ///////////// Scharr ////////////////////////

--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
--- a/modules/imgproc/src/opencl/filterSepCol.cl
+++ b/modules/imgproc/src/opencl/filterSepCol.cl
@@ -34,47 +34,36 @@
 //
 //
+#ifdef DOUBLE_SUPPORT
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+#endif
 #define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1)
 #define RADIUS 1
-#if CN ==1
-#define ALIGN (((RADIUS)+3)>>2<<2)
-#elif CN==2
-#define ALIGN (((RADIUS)+1)>>1<<1)
-#elif CN==3
-#define ALIGN (((RADIUS)+3)>>2<<2)
-#elif CN==4
-#define ALIGN (RADIUS)
-#define READ_TIMES_ROW ((2*(RADIUS+LSIZE0)-1)/LSIZE0)
-#endif
 #define noconvert
-/**********************************************************************************
+#if CN != 3
-These kernels are written for separable filters such as Sobel, Scharr, GaussianBlur.
+#define loadpix(addr) *(__global const srcT *)(addr)
-Now(6/29/2011) the kernels only support 8U data type and the anchor of the convovle
+#define storepix(val, addr)  *(__global dstT *)(addr) = val
-kernel must be in the center. ROI is not supported either.
+#define SRCSIZE (int)sizeof(srcT)
-Each kernels read 4 elements(not 4 pixels), save them to LDS and read the data needed
+#define DSTSIZE (int)sizeof(dstT)
-from LDS to calculate the result.
+#else
-The length of the convovle kernel supported is only related to the MAX size of LDS,
+#define loadpix(addr)  vload3(0, (__global const srcT1 *)(addr))
-which is HW related.
+#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
-Niko
+#define SRCSIZE (int)sizeof(srcT1)*3
-6/29/2011
+#define DSTSIZE (int)sizeof(dstT1)*3
-The info above maybe obsolete.
+#endif
-***********************************************************************************/
 #define DIG(a) a,
 __constant float mat_kernel[] = { COEFF };
-__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
+__kernel void col_filter(__global const uchar * src, int src_step, int src_offset, int src_whole_rows, int src_whole_cols,
-                        (__global const GENTYPE_SRC * restrict src,
+                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
-                         const int src_step_in_pixel,
-                         const int src_whole_cols,
-                         const int src_whole_rows,
-                         __global GENTYPE_DST * dst,
-                         const int dst_offset_in_pixel,
-                         const int dst_step_in_pixel,
-                         const int dst_cols,
-                         const int dst_rows)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -82,38 +71,38 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
    int l_x = get_local_id(0);
    int l_y = get_local_id(1);
-    int start_addr = mad24(y, src_step_in_pixel, x);
+    int start_addr = mad24(y, src_step, x * SRCSIZE);
-    int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols);
+    int end_addr = mad24(src_whole_rows - 1, src_step, src_whole_cols * SRCSIZE);
-    int i;
+    srcT sum, temp[READ_TIMES_COL];
-    GENTYPE_SRC sum, temp[READ_TIMES_COL];
+    __local srcT LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1];
-    __local GENTYPE_SRC LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1];
-    //read pixels from src
+    // read pixels from src
-    for(i = 0;i<READ_TIMES_COL;i++)
+    for (int i = 0; i < READ_TIMES_COL; ++i)
    {
-        int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
+        int current_addr = mad24(i, LSIZE1 * src_step, start_addr);
        current_addr = current_addr < end_addr ? current_addr : 0;
-        temp[i] = src[current_addr];
+        temp[i] = loadpix(src + current_addr);
-    }
-    //save pixels to lds
-    for(i = 0;i<READ_TIMES_COL;i++)
-    {
-        LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
    }
+    // save pixels to lds
+    for (int i = 0; i < READ_TIMES_COL; ++i)
+        LDS_DAT[mad24(i, LSIZE1, l_y)][l_x] = temp[i];
    barrier(CLK_LOCAL_MEM_FENCE);
-    //read pixels from lds and calculate the result
-    sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
+    // read pixels from lds and calculate the result
-    for(i=1;i<=RADIUSY;i++)
+    sum = LDS_DAT[l_y + RADIUSY][l_x] * mat_kernel[RADIUSY];
+    for (int i = 1; i <= RADIUSY; ++i)
    {
-        temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
+        temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x];
-        temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
+        temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x];
-        sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
+        sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
    }
-    //write the result to dst
-    if((x<dst_cols) & (y<dst_rows))
+    // write the result to dst
+    if (x < dst_cols && y < dst_rows)
    {
-        start_addr = mad24(y, dst_step_in_pixel, x + dst_offset_in_pixel);
+        start_addr = mad24(y, dst_step, mad24(DSTSIZE, x, dst_offset));
-        dst[start_addr] = convert_to_DST(sum);
+        storepix(convertToDstT(sum), dst + start_addr);
    }
 }
--- a/modules/imgproc/src/opencl/filterSepRow.cl
+++ b/modules/imgproc/src/opencl/filterSepRow.cl
--- a/modules/imgproc/src/opencl/filterSep_singlePass.cl
+++ b/modules/imgproc/src/opencl/filterSep_singlePass.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2014, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//////// Macros for border type: EXTRAPOLATE(x, maxV) remaps coordinate x in place for an axis of length maxV ////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#ifdef BORDER_CONSTANT
+// CCCCCC|abcdefgh|CCCCCCC  (expands to nothing: the coordinate is left unchanged)
+#define EXTRAPOLATE(x, maxV)
+#elif defined BORDER_REPLICATE
+// aaaaaa|abcdefgh|hhhhhhh  (clamp x to [0, maxV-1])
+#define EXTRAPOLATE(x, maxV) \
+    { \
+        (x) = max(min((x), (maxV) - 1), 0); \
+    }
+#elif defined BORDER_WRAP
+// cdefgh|abcdefgh|abcdefg  (wrap around; only one +maxV shift is applied, so x >= -maxV is required)
+#define EXTRAPOLATE(x, maxV) \
+    { \
+        (x) = ( (x) + (maxV) ) % (maxV); \
+    }
+#elif defined BORDER_REFLECT
+// fedcba|abcdefgh|hgfedcb  (mirror, repeating the edge pixel)
+#define EXTRAPOLATE(x, maxV) \
+    { \
+        (x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \
+    }
+#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101
+// gfedcb|abcdefgh|gfedcba  (mirror, without repeating the edge pixel)
+#define EXTRAPOLATE(x, maxV) \
+    { \
+        (x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \
+    }
+#else
+#error No extrapolation method
+#endif
+// Pixel access helpers. For every channel count except 3 a pixel is read or
+// written as a single srcT/dstT value; 3-channel pixels go through
+// vload3/vstore3 on the scalar element types srcT1/dstT1.
+// SRCSIZE/DSTSIZE are the per-pixel sizes in bytes used for address arithmetic;
+// they are fully parenthesized so they expand safely inside any expression.
+#if CN != 3
+#define loadpix(addr) (*(__global const srcT *)(addr))
+#define storepix(val, addr)  *(__global dstT *)(addr) = val
+#define SRCSIZE ((int)sizeof(srcT))
+#define DSTSIZE ((int)sizeof(dstT))
+#else
+#define loadpix(addr)  vload3(0, (__global const srcT1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
+#define SRCSIZE ((int)sizeof(srcT1)*3)
+#define DSTSIZE ((int)sizeof(dstT1)*3)
+#endif
+// loads pixel (_x, _y) from Src (src_step is a byte stride) and converts it to WT
+#define SRC(_x,_y) convertToWT(loadpix(Src + mad24((_y), src_step, SRCSIZE * (_x))))
+#ifdef BORDER_CONSTANT
+// CCCCCC|abcdefgh|CCCCCCC
+// coordinates outside [0, r_edge) x [0, t_edge) yield const_v instead of a memory read
+#define ELEM(_x,_y,r_edge,t_edge,const_v) ((_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y)))
+#else
+#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))
+#endif
+#define noconvert
+// horizontal and vertical filter kernels
+// should be defined on host during compile time to avoid overhead
+#define DIG(a) a,
+__constant float mat_kernelX[] = { KERNEL_MATRIX_X };
+__constant float mat_kernelY[] = { KERNEL_MATRIX_Y };
+// Single-pass separable filter. Each work-group stages a
+// (BLK_Y + 2*RADIUSY) x (BLK_X + 2*RADIUSX) tile of source pixels in local
+// memory, applies the vertical kernel (mat_kernelY) to it, then the horizontal
+// kernel (mat_kernelX), and every in-range work-item stores one destination pixel.
+//   Src, src_step             - source data pointer and row stride in bytes
+//   srcOffsetX, srcOffsetY    - pixel position of output (0,0) inside the source
+//   height, width             - source extent used for border extrapolation
+//   Dst, dst_step, dst_offset - destination pointer, row stride and start offset in bytes
+//   dst_rows, dst_cols        - destination size; work-items outside it store nothing
+__kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width,
+                         __global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
+{
+    // RADIUSX, RADIUSY are filter radii (each 1-D kernel has 2 * RADIUS + 1 taps)
+    // BLK_X, BLK_Y are local workgroup sizes
+    // all these should be defined on host during compile time
+    // first lsmem array for source pixels used in first pass,
+    // second lsmemDy for storing first pass results
+    __local WT lsmem[BLK_Y + 2 * RADIUSY][BLK_X + 2 * RADIUSX];
+    __local WT lsmemDy[BLK_Y][BLK_X + 2 * RADIUSX];
+    // get local and global ids - used as image and local memory array indexes
+    int lix = get_local_id(0);
+    int liy = get_local_id(1);
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    // calculate pixel position in source image taking image offset into account
+    int srcX = x + srcOffsetX - RADIUSX;
+    int srcY = y + srcOffsetY - RADIUSY;
+    // extrapolate coordinates, if needed
+    // and read my own source pixel into local memory
+    // with account for extra border pixels, which will be read by starting workitems
+    int clocY = liy;
+    int cSrcY = srcY;
+    do
+    {
+        int yb = cSrcY;
+        EXTRAPOLATE(yb, (height));
+        int clocX = lix;
+        int cSrcX = srcX;
+        do
+        {
+            int xb = cSrcX;
+            EXTRAPOLATE(xb,(width));
+            lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 );
+            clocX += BLK_X;
+            cSrcX += BLK_X;
+        }
+        while(clocX < BLK_X+(RADIUSX*2));
+        clocY += BLK_Y;
+        cSrcY += BLK_Y;
+    }
+    while (clocY < BLK_Y+(RADIUSY*2));
+    // every work-item must finish staging before any of them reads the tile
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // do vertical filter pass
+    // and store intermediate results to second local memory array
+    int i, clocX = lix;
+    WT sum = 0.0f;
+    do
+    {
+        sum = 0.0f;
+        for (i=0; i<=2*RADIUSY; i++)
+            sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum);
+        lsmemDy[liy][clocX] = sum;
+        clocX += BLK_X;
+    }
+    while(clocX < BLK_X+(RADIUSX*2));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // if this pixel happened to be out of image borders because of global size rounding,
+    // then just return (safe here: no barrier follows this point)
+    if( x >= dst_cols || y >=dst_rows )
+        return;
+    // do second horizontal filter pass
+    // and calculate final result
+    sum = 0.0f;
+    for (i=0; i<=2*RADIUSX; i++)
+        sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
+    //store result into destination image
+    storepix(convertToDstT(sum), Dst + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset)));
+}
--- a/modules/imgproc/test/ocl/test_filters.cpp
+++ b/modules/imgproc/test/ocl/test_filters.cpp
@@ -306,7 +306,7 @@ OCL_TEST_P(MorphologyEx, Mat)
            (int)BORDER_REFLECT|BORDER_ISOLATED, (int)BORDER_WRAP|BORDER_ISOLATED, \
            (int)BORDER_REFLECT_101|BORDER_ISOLATED*/) // WRAP and ISOLATED are not supported by cv:: version
-#define FILTER_TYPES Values(CV_8UC1, CV_8UC2, CV_8UC4, CV_32FC1, CV_32FC4, CV_64FC1, CV_64FC4)
+#define FILTER_TYPES Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4)
 OCL_INSTANTIATE_TEST_CASE_P(Filter, Bilateral, Combine(
                            Values((MatType)CV_8UC1),

--- a/modules/imgproc/test/ocl/test_sepfilter2D.cpp
+++ b/modules/imgproc/test/ocl/test_sepfilter2D.cpp
@@ -75,33 +75,24 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool)
    void random_roi()
    {
        Size ksize = randomSize(kernelMinSize, kernelMaxSize);
-        if (1 != (ksize.width % 2))
+        if (1 != ksize.width % 2)
            ksize.width++;
-        if (1 != (ksize.height % 2))
+        if (1 != ksize.height % 2)
            ksize.height++;
        Mat temp = randomMat(Size(ksize.width, 1), CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE);
        cv::normalize(temp, kernelX, 1.0, 0.0, NORM_L1);
        temp = randomMat(Size(1, ksize.height),  CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE);
        cv::normalize(temp, kernelY, 1.0, 0.0, NORM_L1);
        Size roiSize = randomSize(ksize.width, MAX_VALUE, ksize.height, MAX_VALUE);
-        int rest = roiSize.width % 4;
-        if (0 != rest)
-            roiSize.width += (4 - rest);
        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        rest = srcBorder.lef % 4;
-        if (0 != rest)
-            srcBorder.lef += (4 - rest);
-        rest = srcBorder.rig % 4;
-        if (0 != rest)
-            srcBorder.rig += (4 - rest);
        randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
        randomSubMat(dst, dst_roi, roiSize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
-        anchor.x = -1;
+        anchor.x = anchor.y = -1;
-        anchor.y = -1;
        UMAT_UPLOAD_INPUT_PARAMETER(src);
        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
@@ -115,7 +106,7 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool)
 OCL_TEST_P(SepFilter2D, Mat)
 {
-    for (int j = 0; j < test_loop_times; j++)
+    for (int j = 0; j < test_loop_times + 3; j++)
    {
        random_roi();
@@ -126,11 +117,10 @@ OCL_TEST_P(SepFilter2D, Mat)
    }
 }
 OCL_INSTANTIATE_TEST_CASE_P(ImageProc, SepFilter2D,
                            Combine(
                                Values(CV_8U, CV_32F),
-                                Values(1, 4),
+                                OCL_ALL_CHANNELS,
                                Values(
                                        (BorderType)BORDER_CONSTANT,
                                        (BorderType)BORDER_REPLICATE,

--- a/modules/superres/src/btv_l1.cpp
+++ b/modules/superres/src/btv_l1.cpp
@@ -1014,10 +1014,8 @@ namespace
            return;
 #ifdef HAVE_OPENCL
-        if (isUmat_ && curFrame_.channels() == 1)
+        if (isUmat_)
            curFrame_.copyTo(ucurFrame_);
-        else
-            isUmat_ = false;
 #endif
        ++storePos_;