Merge pull request #2382 from ilya-lavrenov:tapi_nlmeans

6b6cfa89 · Andrey Pavlenko · OpenCV Buildbot · 553673ee · 9b31e6cd · 6b6cfa89
Commit 6b6cfa89 authored Mar 13, 2014 by Andrey Pavlenko Committed by OpenCV Buildbot Mar 13, 2014
10 changed files
--- a/modules/photo/perf/opencl/perf_denoising.cpp
+++ b/modules/photo/perf/opencl/perf_denoising.cpp
@@ -26,7 +26,7 @@ OCL_PERF_TEST(Photo, DenoisingGrayscale)
    OCL_TEST_CYCLE()
            cv::fastNlMeansDenoising(original, result, 10);

-    SANITY_CHECK(result);
+    SANITY_CHECK(result, 1);
 }

 OCL_PERF_TEST(Photo, DenoisingColored)
@@ -42,10 +42,10 @@ OCL_PERF_TEST(Photo, DenoisingColored)
    OCL_TEST_CYCLE()
            cv::fastNlMeansDenoisingColored(original, result, 10, 10);

-    SANITY_CHECK(result);
+    SANITY_CHECK(result, 2);
 }

-OCL_PERF_TEST(Photo, DenoisingGrayscaleMulti)
+OCL_PERF_TEST(Photo, DISABLED_DenoisingGrayscaleMulti)
 {
    const int imgs_count = 3;

@@ -68,7 +68,7 @@ OCL_PERF_TEST(Photo, DenoisingGrayscaleMulti)
    SANITY_CHECK(result);
 }

-OCL_PERF_TEST(Photo, DenoisingColoredMulti)
+OCL_PERF_TEST(Photo, DISABLED_DenoisingColoredMulti)
 {
    const int imgs_count = 3;


--- a/modules/photo/src/arrays.hpp
+++ b/modules/photo/src/arrays.hpp
@@ -39,10 +39,14 @@
 //
 //M*/

+#include "opencv2/core/base.hpp"
+
 #ifndef __OPENCV_DENOISING_ARRAYS_HPP__
 #define __OPENCV_DENOISING_ARRAYS_HPP__

-template <class T> struct Array2d {
+template <class T>
+struct Array2d
+{
    T* a;
    int n1,n2;
    bool needToDeallocArray;
@@ -50,14 +54,16 @@ template <class T> struct Array2d {
    Array2d(const Array2d& array2d):
        a(array2d.a), n1(array2d.n1), n2(array2d.n2), needToDeallocArray(false)
    {
-        if (array2d.needToDeallocArray) {
-            // copy constructor for self allocating arrays not supported
-            throw new std::exception();
+        if (array2d.needToDeallocArray)
+        {
+            CV_Error(Error::BadDataPtr, "Copy constructor for self allocating arrays not supported");
        }
    }

    Array2d(T* _a, int _n1, int _n2):
-        a(_a), n1(_n1), n2(_n2), needToDeallocArray(false) {}
+        a(_a), n1(_n1), n2(_n2), needToDeallocArray(false)
+    {
+    }

    Array2d(int _n1, int _n2):
        n1(_n1), n2(_n2), needToDeallocArray(true)
@@ -65,28 +71,34 @@ template <class T> struct Array2d {
        a = new T[n1*n2];
    }

-    ~Array2d() {
-        if (needToDeallocArray) {
+    ~Array2d()
+    {
+        if (needToDeallocArray)
            delete[] a;
-        }
    }

-    T* operator [] (int i) {
+    T* operator [] (int i)
+    {
        return a + i*n2;
    }

-    inline T* row_ptr(int i) {
+    inline T* row_ptr(int i)
+    {
        return (*this)[i];
    }
 };

-template <class T> struct Array3d {
+template <class T>
+struct Array3d
+{
    T* a;
    int n1,n2,n3;
    bool needToDeallocArray;

    Array3d(T* _a, int _n1, int _n2, int _n3):
-        a(_a), n1(_n1), n2(_n2), n3(_n3), needToDeallocArray(false) {}
+        a(_a), n1(_n1), n2(_n2), n3(_n3), needToDeallocArray(false)
+    {
+    }

    Array3d(int _n1, int _n2, int _n3):
        n1(_n1), n2(_n2), n3(_n3), needToDeallocArray(true)
@@ -94,64 +106,72 @@ template <class T> struct Array3d {
        a = new T[n1*n2*n3];
    }

-    ~Array3d() {
-        if (needToDeallocArray) {
+    ~Array3d()
+    {
+        if (needToDeallocArray)
            delete[] a;
-        }
    }

-    Array2d<T> operator [] (int i) {
+    Array2d<T> operator [] (int i)
+    {
        Array2d<T> array2d(a + i*n2*n3, n2, n3);
        return array2d;
    }

-    inline T* row_ptr(int i1, int i2) {
+    inline T* row_ptr(int i1, int i2)
+    {
        return a + i1*n2*n3 + i2*n3;
    }
 };

-template <class T> struct Array4d {
+template <class T>
+struct Array4d
+{
    T* a;
    int n1,n2,n3,n4;
    bool needToDeallocArray;
    int steps[4];

-    void init_steps() {
+    void init_steps()
+    {
        steps[0] = n2*n3*n4;
        steps[1] = n3*n4;
        steps[2] = n4;
        steps[3] = 1;
    }

-    Array4d(T* _a, int _n1, int _n2, int _n3, int _n4):
+    Array4d(T* _a, int _n1, int _n2, int _n3, int _n4) :
        a(_a), n1(_n1), n2(_n2), n3(_n3), n4(_n4), needToDeallocArray(false)
-     {
+    {
        init_steps();
-     }
+    }

-    Array4d(int _n1, int _n2, int _n3, int _n4):
+    Array4d(int _n1, int _n2, int _n3, int _n4) :
        n1(_n1), n2(_n2), n3(_n3), n4(_n4), needToDeallocArray(true)
    {
        a = new T[n1*n2*n3*n4];
        init_steps();
-   }
+    }

-    ~Array4d() {
-        if (needToDeallocArray) {
+    ~Array4d()
+    {
+        if (needToDeallocArray)
            delete[] a;
-        }
    }

-    Array3d<T> operator [] (int i) {
+    Array3d<T> operator [] (int i)
+    {
        Array3d<T> array3d(a + i*n2*n3*n4, n2, n3, n4);
        return array3d;
    }

-    inline T* row_ptr(int i1, int i2, int i3) {
+    inline T* row_ptr(int i1, int i2, int i3)
+    {
        return a + i1*n2*n3*n4 + i2*n3*n4 + i3*n4;
    }

-    inline int step_size(int dimension) {
+    inline int step_size(int dimension)
+    {
        return steps[dimension];
    }
 };

--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -40,14 +40,17 @@
 //M*/

 #include "precomp.hpp"
-#include "opencv2/photo.hpp"
-#include "opencv2/imgproc.hpp"
+
 #include "fast_nlmeans_denoising_invoker.hpp"
 #include "fast_nlmeans_multi_denoising_invoker.hpp"
+#include "fast_nlmeans_denoising_opencl.hpp"

 void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
                               int templateWindowSize, int searchWindowSize)
 {
+    CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()),
+               ocl_fastNlMeansDenoising(_src, _dst, h, templateWindowSize, searchWindowSize))
+
    Mat src = _src.getMat();
    _dst.create(src.size(), src.type());
    Mat dst = _dst.getMat();
@@ -83,15 +86,20 @@ void cv::fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,
                                      float h, float hForColorComponents,
                                      int templateWindowSize, int searchWindowSize)
 {
-    Mat src = _src.getMat();
-    _dst.create(src.size(), src.type());
-    Mat dst = _dst.getMat();
-
-    if (src.type() != CV_8UC3) {
+    if (_src.type() != CV_8UC3)
+    {
        CV_Error(Error::StsBadArg, "Type of input image should be CV_8UC3!");
        return;
    }

+    CV_OCL_RUN(_src.dims() <= 2 && (_dst.isUMat() || _src.isUMat()),
+                ocl_fastNlMeansDenoisingColored(_src, _dst, h, hForColorComponents,
+                                                templateWindowSize, searchWindowSize))
+
+    Mat src = _src.getMat();
+    _dst.create(src.size(), src.type());
+    Mat dst = _dst.getMat();
+
    Mat src_lab;
    cvtColor(src, src_lab, COLOR_LBGR2Lab);

@@ -117,7 +125,8 @@ static void fastNlMeansDenoisingMultiCheckPreconditions(
                               int templateWindowSize, int searchWindowSize)
 {
    int src_imgs_size = static_cast<int>(srcImgs.size());
-    if (src_imgs_size == 0) {
+    if (src_imgs_size == 0)
+    {
        CV_Error(Error::StsBadArg, "Input images vector should not be empty!");
    }

@@ -136,11 +145,11 @@ static void fastNlMeansDenoisingMultiCheckPreconditions(
            "should be chosen corresponding srcImgs size!");
    }

-    for (int i = 1; i < src_imgs_size; i++) {
-        if (srcImgs[0].size() != srcImgs[i].size() || srcImgs[0].type() != srcImgs[i].type()) {
+    for (int i = 1; i < src_imgs_size; i++)
+        if (srcImgs[0].size() != srcImgs[i].size() || srcImgs[0].type() != srcImgs[i].type())
+        {
            CV_Error(Error::StsBadArg, "Input images should have the same size and type!");
        }
-    }
 }

 void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _dst,
@@ -152,12 +161,13 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds

    fastNlMeansDenoisingMultiCheckPreconditions(
        srcImgs, imgToDenoiseIndex,
-        temporalWindowSize, templateWindowSize, searchWindowSize
-    );
+        temporalWindowSize, templateWindowSize, searchWindowSize);
+
    _dst.create(srcImgs[0].size(), srcImgs[0].type());
    Mat dst = _dst.getMat();

-    switch (srcImgs[0].type()) {
+    switch (srcImgs[0].type())
+    {
        case CV_8U:
            parallel_for_(cv::Range(0, srcImgs[0].rows),
                FastNlMeansMultiDenoisingInvoker<uchar>(
@@ -192,15 +202,15 @@ void cv::fastNlMeansDenoisingColoredMulti( InputArrayOfArrays _srcImgs, OutputAr

    fastNlMeansDenoisingMultiCheckPreconditions(
        srcImgs, imgToDenoiseIndex,
-        temporalWindowSize, templateWindowSize, searchWindowSize
-    );
+        temporalWindowSize, templateWindowSize, searchWindowSize);

    _dst.create(srcImgs[0].size(), srcImgs[0].type());
    Mat dst = _dst.getMat();

    int src_imgs_size = static_cast<int>(srcImgs.size());

-    if (srcImgs[0].type() != CV_8UC3) {
+    if (srcImgs[0].type() != CV_8UC3)
+    {
        CV_Error(Error::StsBadArg, "Type of input images should be CV_8UC3!");
        return;
    }
@@ -211,7 +221,8 @@ void cv::fastNlMeansDenoisingColoredMulti( InputArrayOfArrays _srcImgs, OutputAr
    std::vector<Mat> src_lab(src_imgs_size);
    std::vector<Mat> l(src_imgs_size);
    std::vector<Mat> ab(src_imgs_size);
-    for (int i = 0; i < src_imgs_size; i++) {
+    for (int i = 0; i < src_imgs_size; i++)
+    {
        src_lab[i] = Mat::zeros(srcImgs[0].size(), CV_8UC3);
        l[i] = Mat::zeros(srcImgs[0].size(), CV_8UC1);
        ab[i] = Mat::zeros(srcImgs[0].size(), CV_8UC2);

--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
--- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
@@ -46,29 +46,35 @@ using namespace cv;

 template <typename T> static inline int calcDist(const T a, const T b);

-template <> inline int calcDist(const uchar a, const uchar b) {
+template <> inline int calcDist(const uchar a, const uchar b)
+{
    return (a-b) * (a-b);
 }

-template <> inline int calcDist(const Vec2b a, const Vec2b b) {
+template <> inline int calcDist(const Vec2b a, const Vec2b b)
+{
    return (a[0]-b[0])*(a[0]-b[0]) + (a[1]-b[1])*(a[1]-b[1]);
 }

-template <> inline int calcDist(const Vec3b a, const Vec3b b) {
+template <> inline int calcDist(const Vec3b a, const Vec3b b)
+{
    return (a[0]-b[0])*(a[0]-b[0]) + (a[1]-b[1])*(a[1]-b[1]) + (a[2]-b[2])*(a[2]-b[2]);
 }

-template <typename T> static inline int calcDist(const Mat& m, int i1, int j1, int i2, int j2) {
+template <typename T> static inline int calcDist(const Mat& m, int i1, int j1, int i2, int j2)
+{
    const T a = m.at<T>(i1, j1);
    const T b = m.at<T>(i2, j2);
    return calcDist<T>(a,b);
 }

-template <typename T> static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down) {
-    return calcDist(a_down,b_down) - calcDist(a_up, b_up);
+template <typename T> static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
+{
+    return calcDist(a_down, b_down) - calcDist(a_up, b_up);
 }

-template <> inline int calcUpDownDist(uchar a_up, uchar a_down, uchar  b_up, uchar b_down) {
+template <> inline int calcUpDownDist(uchar a_up, uchar a_down, uchar  b_up, uchar b_down)
+{
    int A = a_down - b_down;
    int B = a_up - b_up;
    return (A-B)*(A+B);
@@ -76,16 +82,37 @@ template <> inline int calcUpDownDist(uchar a_up, uchar a_down, uchar  b_up, uch

 template <typename T> static inline void incWithWeight(int* estimation, int weight, T p);

-template <> inline void incWithWeight(int* estimation, int weight, uchar p) {
+template <> inline void incWithWeight(int* estimation, int weight, uchar p)
+{
    estimation[0] += weight * p;
 }

-template <> inline void incWithWeight(int* estimation, int weight, Vec2b p) {
+template <> inline void incWithWeight(int* estimation, int weight, Vec2b p)
+{
    estimation[0] += weight * p[0];
    estimation[1] += weight * p[1];
 }

-template <> inline void incWithWeight(int* estimation, int weight, Vec3b p) {
+template <> inline void incWithWeight(int* estimation, int weight, Vec3b p)
+{
+    estimation[0] += weight * p[0];
+    estimation[1] += weight * p[1];
+    estimation[2] += weight * p[2];
+}
+
+template <> inline void incWithWeight(int* estimation, int weight, int p)
+{
+    estimation[0] += weight * p;
+}
+
+template <> inline void incWithWeight(int* estimation, int weight, Vec2i p)
+{
+    estimation[0] += weight * p[0];
+    estimation[1] += weight * p[1];
+}
+
+template <> inline void incWithWeight(int* estimation, int weight, Vec3i p)
+{
    estimation[0] += weight * p[0];
    estimation[1] += weight * p[1];
    estimation[2] += weight * p[2];
@@ -93,18 +120,21 @@ template <> inline void incWithWeight(int* estimation, int weight, Vec3b p) {

 template <typename T> static inline T saturateCastFromArray(int* estimation);

-template <> inline uchar saturateCastFromArray(int* estimation) {
+template <> inline uchar saturateCastFromArray(int* estimation)
+{
    return saturate_cast<uchar>(estimation[0]);
 }

-template <> inline Vec2b saturateCastFromArray(int* estimation) {
+template <> inline Vec2b saturateCastFromArray(int* estimation)
+{
    Vec2b res;
    res[0] = saturate_cast<uchar>(estimation[0]);
    res[1] = saturate_cast<uchar>(estimation[1]);
    return res;
 }

-template <> inline Vec3b saturateCastFromArray(int* estimation) {
+template <> inline Vec3b saturateCastFromArray(int* estimation)
+{
    Vec3b res;
    res[0] = saturate_cast<uchar>(estimation[0]);
    res[1] = saturate_cast<uchar>(estimation[1]);
@@ -112,4 +142,20 @@ template <> inline Vec3b saturateCastFromArray(int* estimation) {
    return res;
 }

+template <> inline int saturateCastFromArray(int* estimation)
+{
+    return estimation[0];
+}
+
+template <> inline Vec2i saturateCastFromArray(int* estimation)
+{
+    estimation[1] = 0;
+    return Vec2i(estimation);
+}
+
+template <> inline Vec3i saturateCastFromArray(int* estimation)
+{
+    return Vec3i(estimation);
+}
+
 #endif
--- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#ifndef __OPENCV_FAST_NLMEANS_DENOISING_OPENCL_HPP__
+#define __OPENCV_FAST_NLMEANS_DENOISING_OPENCL_HPP__
+
+#include "precomp.hpp"
+#include "opencl_kernels.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cv {
+
+enum
+{
+    BLOCK_ROWS = 32,
+    BLOCK_COLS = 32,
+    CTA_SIZE = 256
+};
+
+static int divUp(int a, int b)
+{
+    return (a + b - 1) / b;
+}
+
+template <typename FT>
+static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT h, int cn,
+                                      int & almostTemplateWindowSizeSqBinShift)
+{
+    const int maxEstimateSumValue = searchWindowSize * searchWindowSize * 255;
+    int fixedPointMult = std::numeric_limits<int>::max() / maxEstimateSumValue;
+    int depth = DataType<FT>::depth;
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+
+    if (depth == CV_64F && !doubleSupport)
+        return false;
+
+    // precalc weight for every possible l2 dist between blocks
+    // additional optimization of precalced weights to replace division(averaging) by binary shift
+    CV_Assert(templateWindowSize <= 46340); // sqrt(INT_MAX)
+    int templateWindowSizeSq = templateWindowSize * templateWindowSize;
+    almostTemplateWindowSizeSqBinShift = getNearestPowerOf2(templateWindowSizeSq);
+    FT almostDist2ActualDistMultiplier = (FT)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq;
+
+    const FT WEIGHT_THRESHOLD = 1e-3f;
+    int maxDist = 255 * 255 * cn;
+    int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1);
+    FT den = 1.0f / (h * h * cn);
+
+    almostDist2Weight.create(1, almostMaxDist, CV_32SC1);
+
+    ocl::Kernel k("calcAlmostDist2Weight", ocl::photo::nlmeans_oclsrc,
+                  format("-D OP_CALC_WEIGHTS -D FT=%s%s", ocl::typeToStr(depth),
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+    if (k.empty())
+        return false;
+
+    k.args(ocl::KernelArg::PtrWriteOnly(almostDist2Weight), almostMaxDist,
+           almostDist2ActualDistMultiplier, fixedPointMult, den, WEIGHT_THRESHOLD);
+
+    size_t globalsize[1] = { almostMaxDist };
+    return k.run(1, globalsize, NULL, false);
+}
+
+static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
+                                     int templateWindowSize, int searchWindowSize)
+{
+    int type = _src.type(), cn = CV_MAT_CN(type);
+    Size size = _src.size();
+
+    if ( type != CV_8UC1 || type != CV_8UC2 || type != CV_8UC4 )
+        return false;
+
+    int templateWindowHalfWize = templateWindowSize / 2;
+    int searchWindowHalfSize = searchWindowSize / 2;
+    templateWindowSize  = templateWindowHalfWize * 2 + 1;
+    searchWindowSize = searchWindowHalfSize * 2 + 1;
+    int nblocksx = divUp(size.width, BLOCK_COLS), nblocksy = divUp(size.height, BLOCK_ROWS);
+    int almostTemplateWindowSizeSqBinShift = -1;
+
+    char cvt[2][40];
+    String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
+                         " -D uchar_t=%s -D int_t=%s -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
+                         " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
+                         " -D convert_int_t=%s -D cn=%d -D CTA_SIZE2=%d -D convert_uchar_t=%s",
+                         templateWindowSize, searchWindowSize, ocl::typeToStr(type),
+                         ocl::typeToStr(CV_32SC(cn)), BLOCK_COLS, BLOCK_ROWS, CTA_SIZE,
+                         templateWindowHalfWize, searchWindowHalfSize,
+                         ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), cn,
+                         CTA_SIZE >> 1, ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1]));
+
+    ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
+    if (k.empty())
+        return false;
+
+    UMat almostDist2Weight;
+    if (!ocl_calcAlmostDist2Weight<float>(almostDist2Weight, searchWindowSize, templateWindowSize, h, cn,
+                                   almostTemplateWindowSizeSqBinShift))
+        return false;
+    CV_Assert(almostTemplateWindowSizeSqBinShift >= 0);
+
+    UMat srcex;
+    int borderSize = searchWindowHalfSize + templateWindowHalfWize;
+    copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
+
+    _dst.create(size, type);
+    UMat dst = _dst.getUMat();
+
+    int searchWindowSizeSq = searchWindowSize * searchWindowSize;
+    Size upColSumSize(size.width, searchWindowSizeSq * nblocksy);
+    Size colSumSize(nblocksx * templateWindowSize, searchWindowSizeSq * nblocksy);
+    UMat buffer(upColSumSize + colSumSize, CV_32SC(cn));
+
+    srcex = srcex(Rect(Point(borderSize, borderSize), size));
+    k.args(ocl::KernelArg::ReadOnlyNoSize(srcex), ocl::KernelArg::WriteOnly(dst),
+           ocl::KernelArg::PtrReadOnly(almostDist2Weight),
+           ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift);
+
+    size_t globalsize[2] = { nblocksx * CTA_SIZE, nblocksy }, localsize[2] = { CTA_SIZE, 1 };
+    return k.run(2, globalsize, localsize, false);
+}
+
+static bool ocl_fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,
+                                      float h, float hForColorComponents,
+                                      int templateWindowSize, int searchWindowSize)
+{
+    UMat src = _src.getUMat();
+    _dst.create(src.size(), src.type());
+    UMat dst = _dst.getUMat();
+
+    UMat src_lab;
+    cvtColor(src, src_lab, COLOR_LBGR2Lab);
+
+    UMat l(src.size(), CV_8U);
+    UMat ab(src.size(), CV_8UC2);
+    std::vector<UMat> l_ab(2), l_ab_denoised(2);
+    l_ab[0] = l;
+    l_ab[1] = ab;
+    l_ab_denoised[0].create(src.size(), CV_8U);
+    l_ab_denoised[1].create(src.size(), CV_8UC2);
+
+    int from_to[] = { 0,0, 1,1, 2,2 };
+    mixChannels(std::vector<UMat>(1, src_lab), l_ab, from_to, 3);
+
+    fastNlMeansDenoising(l_ab[0], l_ab_denoised[0], h, templateWindowSize, searchWindowSize);
+    fastNlMeansDenoising(l_ab[1], l_ab_denoised[1], hForColorComponents, templateWindowSize, searchWindowSize);
+
+    UMat dst_lab(src.size(), src.type());
+    mixChannels(l_ab_denoised, std::vector<UMat>(1, dst_lab), from_to, 3);
+
+    cvtColor(dst_lab, dst, COLOR_Lab2LBGR);
+    return true;
+}
+
+}
+
+#endif
+#endif
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
--- a/modules/photo/src/opencl/nlmeans.cl
+++ b/modules/photo/src/opencl/nlmeans.cl
--- a/modules/photo/src/precomp.hpp
+++ b/modules/photo/src/precomp.hpp
@@ -46,6 +46,8 @@
 #include "opencv2/core/private.hpp"
 #include "opencv2/core/utility.hpp"
 #include "opencv2/photo.hpp"
+#include "opencv2/core/ocl.hpp"
+#include "opencv2/imgproc.hpp"

 #ifdef HAVE_TEGRA_OPTIMIZATION
 #include "opencv2/photo/photo_tegra.hpp"

--- a/modules/photo/test/ocl/test_denoising.cpp
+++ b/modules/photo/test/ocl/test_denoising.cpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool)
+{
+    int cn, templateWindowSize, searchWindowSize;
+    float h;
+    bool use_roi;
+
+    TEST_DECLARE_INPUT_PARAMETER(src)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+
+    virtual void SetUp()
+    {
+        cn = GET_PARAM(0);
+        use_roi = GET_PARAM(1);
+
+        templateWindowSize = 7;
+        searchWindowSize = 21;
+        h = 3.0f;
+    }
+
+    virtual void generateTestData()
+    {
+        Mat image;
+        if (cn == 1)
+        {
+            image = readImage("denoising/lena_noised_gaussian_sigma=10.png", IMREAD_GRAYSCALE);
+            ASSERT_FALSE(image.empty());
+        }
+
+        const int type = CV_8UC(cn);
+
+        Size roiSize = cn == 1 ? image.size() : randomSize(1, MAX_VALUE);
+        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(src, src_roi, roiSize, srcBorder, type, 0, 255);
+        if (cn == 1)
+            image.copyTo(src_roi);
+
+        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 0, 255);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+    }
+};
+
+typedef FastNlMeansDenoisingTestBase FastNlMeansDenoising;
+
+OCL_TEST_P(FastNlMeansDenoising, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, h, templateWindowSize, searchWindowSize));
+        OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, h, templateWindowSize, searchWindowSize));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1)
+    }
+}
+
+typedef FastNlMeansDenoisingTestBase fastNlMeansDenoisingColored;
+
+OCL_TEST_P(fastNlMeansDenoisingColored, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::fastNlMeansDenoisingColored(src_roi, dst_roi, h, h, templateWindowSize, searchWindowSize));
+        OCL_ON(cv::fastNlMeansDenoisingColored(usrc_roi, udst_roi, h, h, templateWindowSize, searchWindowSize));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1)
+    }
+}
+
+OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising, Combine(Values(1, 2), Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Photo, fastNlMeansDenoisingColored, Combine(Values(Channels(3)), Bool()));
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL