Merge pull request #3814 from erikrk:denoising-16bit-master

5501cfd8 · Vadim Pisarevsky · 7ea02397 · 01d3df0d · 5501cfd8 · 5501cfd8
Commit 5501cfd8 authored Mar 24, 2015 by Vadim Pisarevsky
9 changed files
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -442,6 +442,10 @@ template<typename _Tp> static inline _Tp saturate_cast(int v)      { return _Tp(
 template<typename _Tp> static inline _Tp saturate_cast(float v)    { return _Tp(v); }
 /** @overload */
 template<typename _Tp> static inline _Tp saturate_cast(double v)   { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(int64 v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(uint64 v)   { return _Tp(v); }

 //! @cond IGNORED

@@ -452,6 +456,8 @@ template<> inline uchar saturate_cast<uchar>(short v)        { return saturate_c
 template<> inline uchar saturate_cast<uchar>(unsigned v)     { return (uchar)std::min(v, (unsigned)UCHAR_MAX); }
 template<> inline uchar saturate_cast<uchar>(float v)        { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
 template<> inline uchar saturate_cast<uchar>(double v)       { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
+template<> inline uchar saturate_cast<uchar>(int64 v)        { return (uchar)((uint64)v <= (uint64)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
+template<> inline uchar saturate_cast<uchar>(uint64 v)       { return (uchar)std::min(v, (uint64)UCHAR_MAX); }

 template<> inline schar saturate_cast<schar>(uchar v)        { return (schar)std::min((int)v, SCHAR_MAX); }
 template<> inline schar saturate_cast<schar>(ushort v)       { return (schar)std::min((unsigned)v, (unsigned)SCHAR_MAX); }
@@ -460,6 +466,8 @@ template<> inline schar saturate_cast<schar>(short v)        { return saturate_c
 template<> inline schar saturate_cast<schar>(unsigned v)     { return (schar)std::min(v, (unsigned)SCHAR_MAX); }
 template<> inline schar saturate_cast<schar>(float v)        { int iv = cvRound(v); return saturate_cast<schar>(iv); }
 template<> inline schar saturate_cast<schar>(double v)       { int iv = cvRound(v); return saturate_cast<schar>(iv); }
+template<> inline schar saturate_cast<schar>(int64 v)        { return (schar)((uint64)((int64)v-SCHAR_MIN) <= (uint64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
+template<> inline schar saturate_cast<schar>(uint64 v)       { return (schar)std::min(v, (uint64)SCHAR_MAX); }

 template<> inline ushort saturate_cast<ushort>(schar v)      { return (ushort)std::max((int)v, 0); }
 template<> inline ushort saturate_cast<ushort>(short v)      { return (ushort)std::max((int)v, 0); }
@@ -467,12 +475,16 @@ template<> inline ushort saturate_cast<ushort>(int v)        { return (ushort)((
 template<> inline ushort saturate_cast<ushort>(unsigned v)   { return (ushort)std::min(v, (unsigned)USHRT_MAX); }
 template<> inline ushort saturate_cast<ushort>(float v)      { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
 template<> inline ushort saturate_cast<ushort>(double v)     { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
+template<> inline ushort saturate_cast<ushort>(int64 v)      { return (ushort)((uint64)v <= (uint64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
+template<> inline ushort saturate_cast<ushort>(uint64 v)     { return (ushort)std::min(v, (uint64)USHRT_MAX); }

 template<> inline short saturate_cast<short>(ushort v)       { return (short)std::min((int)v, SHRT_MAX); }
 template<> inline short saturate_cast<short>(int v)          { return (short)((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
 template<> inline short saturate_cast<short>(unsigned v)     { return (short)std::min(v, (unsigned)SHRT_MAX); }
 template<> inline short saturate_cast<short>(float v)        { int iv = cvRound(v); return saturate_cast<short>(iv); }
 template<> inline short saturate_cast<short>(double v)       { int iv = cvRound(v); return saturate_cast<short>(iv); }
+template<> inline short saturate_cast<short>(int64 v)        { return (short)((uint64)((int64)v - SHRT_MIN) <= (uint64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
+template<> inline short saturate_cast<short>(uint64 v)       { return (short)std::min(v, (uint64)SHRT_MAX); }

 template<> inline int saturate_cast<int>(float v)            { return cvRound(v); }
 template<> inline int saturate_cast<int>(double v)           { return cvRound(v); }

--- a/modules/photo/include/opencv2/photo.hpp
+++ b/modules/photo/include/opencv2/photo.hpp
@@ -119,7 +119,7 @@ CV_EXPORTS_W void inpaint( InputArray src, InputArray inpaintMask,
 <http://www.ipol.im/pub/algo/bcm_non_local_means_denoising/> with several computational
 optimizations. Noise expected to be a gaussian white noise

-@param src Input 8-bit 1-channel, 2-channel or 3-channel image.
+@param src Input 8-bit 1-channel, 2-channel, 3-channel or 4-channel image.
 @param dst Output image with the same size and type as src .
 @param templateWindowSize Size in pixels of the template patch that is used to compute weights.
 Should be odd. Recommended value 7 pixels
@@ -138,6 +138,35 @@ parameter.
 CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst, float h = 3,
        int templateWindowSize = 7, int searchWindowSize = 21);

+/** @brief Perform image denoising using Non-local Means Denoising algorithm
+<http://www.ipol.im/pub/algo/bcm_non_local_means_denoising/> with several computational
+optimizations. Noise expected to be a gaussian white noise
+
+@param src Input 8-bit or 16-bit (only with NORM_L1) 1-channel,
+2-channel, 3-channel or 4-channel image.
+@param dst Output image with the same size and type as src .
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
+denoising time. Recommended value 21 pixels
+@param h Array of parameters regulating filter strength, either one
+parameter applied to all channels or one per channel in dst. Big h value
+perfectly removes noise but also removes image details, smaller h
+value preserves details but also preserves some noise
+@param normType Type of norm used for weight calculation. Can be either NORM_L2 or NORM_L1
+
+This function expected to be applied to grayscale images. For colored images look at
+fastNlMeansDenoisingColored. Advanced usage of this functions can be manual denoising of colored
+image in different colorspaces. Such approach is used in fastNlMeansDenoisingColored by converting
+image to CIELAB colorspace and then separately denoise L and AB components with different h
+parameter.
+ */
+CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst,
+                                        const std::vector<float>& h,
+                                        int templateWindowSize = 7, int searchWindowSize = 21,
+                                        int normType = NORM_L2);
+
 /** @brief Modification of fastNlMeansDenoising function for colored images

 @param src Input 8-bit 3-channel image.
@@ -165,8 +194,9 @@ captured in small period of time. For example video. This version of the functio
 images or for manual manipulation with colorspaces. For more details see
 <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.131.6394>

-@param srcImgs Input 8-bit 1-channel, 2-channel or 3-channel images sequence. All images should
-have the same type and size.
+@param srcImgs Input 8-bit 1-channel, 2-channel, 3-channel or
+4-channel images sequence. All images should have the same type and
+size.
 @param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
 @param temporalWindowSize Number of surrounding images to use for target image denoising. Should
 be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to
@@ -178,14 +208,45 @@ Should be odd. Recommended value 7 pixels
 @param searchWindowSize Size in pixels of the window that is used to compute weighted average for
 given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
 denoising time. Recommended value 21 pixels
-@param h Parameter regulating filter strength for luminance component. Bigger h value perfectly
-removes noise but also removes image details, smaller h value preserves details but also preserves
-some noise
+@param h Parameter regulating filter strength. Bigger h value
+perfectly removes noise but also removes image details, smaller h
+value preserves details but also preserves some noise
 */
 CV_EXPORTS_W void fastNlMeansDenoisingMulti( InputArrayOfArrays srcImgs, OutputArray dst,
        int imgToDenoiseIndex, int temporalWindowSize,
        float h = 3, int templateWindowSize = 7, int searchWindowSize = 21);

+/** @brief Modification of fastNlMeansDenoising function for images sequence where consequtive images have been
+captured in small period of time. For example video. This version of the function is for grayscale
+images or for manual manipulation with colorspaces. For more details see
+<http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.131.6394>
+
+@param srcImgs Input 8-bit or 16-bit (only with NORM_L1) 1-channel,
+2-channel, 3-channel or 4-channel images sequence. All images should
+have the same type and size.
+@param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
+@param temporalWindowSize Number of surrounding images to use for target image denoising. Should
+be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to
+imgToDenoiseIndex - temporalWindowSize / 2 from srcImgs will be used to denoise
+srcImgs[imgToDenoiseIndex] image.
+@param dst Output image with the same size and type as srcImgs images.
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
+denoising time. Recommended value 21 pixels
+@param h Array of parameters regulating filter strength, either one
+parameter applied to all channels or one per channel in dst. Big h value
+perfectly removes noise but also removes image details, smaller h
+value preserves details but also preserves some noise
+@param normType Type of norm used for weight calculation. Can be either NORM_L2 or NORM_L1
+ */
+CV_EXPORTS_W void fastNlMeansDenoisingMulti( InputArrayOfArrays srcImgs, OutputArray dst,
+                                             int imgToDenoiseIndex, int temporalWindowSize,
+                                             const std::vector<float>& h,
+                                             int templateWindowSize = 7, int searchWindowSize = 21,
+                                             int normType = NORM_L2);
+
 /** @brief Modification of fastNlMeansDenoisingMulti function for colored images sequences

 @param srcImgs Input 8-bit 3-channel images sequence. All images should have the same type and

--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -50,13 +50,13 @@

 using namespace cv;

-template <typename T>
+template <typename T, typename IT, typename UIT, typename D, typename WT>
 struct FastNlMeansDenoisingInvoker :
        public ParallelLoopBody
 {
 public:
    FastNlMeansDenoisingInvoker(const Mat& src, Mat& dst,
-        int template_window_size, int search_window_size, const float h);
+        int template_window_size, int search_window_size, const float *h);

    void operator() (const Range& range) const;

@@ -75,9 +75,9 @@ private:
    int template_window_half_size_;
    int search_window_half_size_;

-    int fixed_point_mult_;
+    typename pixelInfo<WT>::sampleType fixed_point_mult_;
    int almost_template_window_size_sq_bin_shift_;
-    std::vector<int> almost_dist2weight_;
+    std::vector<WT> almost_dist2weight_;

    void calcDistSumsForFirstElementInRow(
        int i, Array2d<int>& dist_sums,
@@ -99,15 +99,15 @@ inline int getNearestPowerOf2(int value)
    return p;
 }

-template <class T>
-FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::FastNlMeansDenoisingInvoker(
    const Mat& src, Mat& dst,
    int template_window_size,
    int search_window_size,
-    const float h) :
+    const float *h) :
    src_(src), dst_(dst)
 {
-    CV_Assert(src.channels() == sizeof(T)); //T is Vec1b or Vec2b or Vec3b
+    CV_Assert(src.channels() == pixelInfo<T>::channels);

    template_window_half_size_ = template_window_size / 2;
    search_window_half_size_   = search_window_size   / 2;
@@ -117,8 +117,10 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
    border_size_ = search_window_half_size_ + template_window_half_size_;
    copyMakeBorder(src_, extended_src_, border_size_, border_size_, border_size_, border_size_, BORDER_DEFAULT);

-    const int max_estimate_sum_value = search_window_size_ * search_window_size_ * 255;
-    fixed_point_mult_ = std::numeric_limits<int>::max() / max_estimate_sum_value;
+    const IT max_estimate_sum_value =
+        (IT)search_window_size_ * (IT)search_window_size_ * (IT)pixelInfo<T>::sampleMax();
+    fixed_point_mult_ = (int)std::min<IT>(std::numeric_limits<IT>::max() / max_estimate_sum_value,
+                                          pixelInfo<WT>::sampleMax());

    // precalc weight for every possible l2 dist between blocks
    // additional optimization of precalced weights to replace division(averaging) by binary shift
@@ -127,30 +129,24 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
    almost_template_window_size_sq_bin_shift_ = getNearestPowerOf2(template_window_size_sq);
    double almost_dist2actual_dist_multiplier = ((double)(1 << almost_template_window_size_sq_bin_shift_)) / template_window_size_sq;

-    int max_dist = 255 * 255 * sizeof(T);
+    int max_dist = D::template maxDist<T>();
    int almost_max_dist = (int)(max_dist / almost_dist2actual_dist_multiplier + 1);
    almost_dist2weight_.resize(almost_max_dist);

-    const double WEIGHT_THRESHOLD = 0.001;
    for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
    {
        double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
-
-        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
-            weight = 0;
-
-        almost_dist2weight_[almost_dist] = weight;
+        almost_dist2weight_[almost_dist] =
+            D::template calcWeight<T, WT>(dist, h, fixed_point_mult_);
    }
-    CV_Assert(almost_dist2weight_[0] == fixed_point_mult_);

    // additional optimization init end
    if (dst_.empty())
        dst_ = Mat::zeros(src_.size(), src_.type());
 }

-template <class T>
-void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+void FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::operator() (const Range& range) const
 {
    int row_from = range.start;
    int row_to = range.end - 1;
@@ -215,7 +211,7 @@ void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const
                            dist_sums_row[x] -= col_dist_sums_row[x];

                            int bx = start_bx + x;
-                            col_dist_sums_row[x] = up_col_dist_sums_row[x] + calcUpDownDist(a_up, a_down, b_up_ptr[bx], b_down_ptr[bx]);
+                            col_dist_sums_row[x] = up_col_dist_sums_row[x] + D::template calcUpDownDist<T>(a_up, a_down, b_up_ptr[bx], b_down_ptr[bx]);

                            dist_sums_row[x] += col_dist_sums_row[x];
                            up_col_dist_sums_row[x] = col_dist_sums_row[x];
@@ -227,9 +223,11 @@ void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const
            }

            // calc weights
-            int estimation[3], weights_sum = 0;
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
+            IT estimation[pixelInfo<T>::channels], weights_sum[pixelInfo<WT>::channels];
+            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
                estimation[channel_num] = 0;
+            for (size_t channel_num = 0; channel_num < pixelInfo<WT>::channels; channel_num++)
+                weights_sum[channel_num] = 0;

            for (int y = 0; y < search_window_size_; y++)
            {
@@ -238,24 +236,21 @@ void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const
                for (int x = 0; x < search_window_size_; x++)
                {
                    int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_;
-                    int weight = almost_dist2weight_[almostAvgDist];
-                    weights_sum += weight;
-
+                    WT weight = almost_dist2weight_[almostAvgDist];
                    T p = cur_row_ptr[border_size_ + search_window_x + x];
-                    incWithWeight(estimation, weight, p);
+                    incWithWeight<T, IT, WT>(estimation, weights_sum, weight, p);
                }
            }

-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
-                estimation[channel_num] = ((unsigned)estimation[channel_num] + weights_sum/2) / weights_sum;
-
-            dst_.at<T>(i,j) = saturateCastFromArray<T>(estimation);
+            divByWeightsSum<IT, UIT, pixelInfo<T>::channels, pixelInfo<WT>::channels>(estimation,
+                                                                                      weights_sum);
+            dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);
        }
    }
 }

-template <class T>
-inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+inline void FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::calcDistSumsForFirstElementInRow(
    int i,
    Array2d<int>& dist_sums,
    Array3d<int>& col_dist_sums,
@@ -276,7 +271,7 @@ inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
            for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                for (int tx = -template_window_half_size_; tx <= template_window_half_size_; tx++)
                {
-                    int dist = calcDist<T>(extended_src_,
+                    int dist = D::template calcDist<T>(extended_src_,
                        border_size_ + i + ty, border_size_ + j + tx,
                        border_size_ + start_y + ty, border_size_ + start_x + tx);

@@ -288,8 +283,8 @@ inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
        }
 }

-template <class T>
-inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForElementInFirstRow(
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+inline void FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::calcDistSumsForElementInFirstRow(
    int i, int j, int first_col_num,
    Array2d<int>& dist_sums,
    Array3d<int>& col_dist_sums,
@@ -312,7 +307,7 @@ inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForElementInFirstRow(
            int by = start_by + y;
            int bx = start_bx + x;
            for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
-                col_dist_sums[new_last_col_num][y][x] += calcDist<T>(extended_src_, ay + ty, ax, by + ty, bx);
+                col_dist_sums[new_last_col_num][y][x] += D::template calcDist<T>(extended_src_, ay + ty, ax, by + ty, bx);

            dist_sums[y][x] += col_dist_sums[new_last_col_num][y][x];
            up_col_dist_sums[j][y][x] = col_dist_sums[new_last_col_num][y][x];

--- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
--- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
@@ -28,12 +28,16 @@ static int divUp(int a, int b)
    return (a + b - 1) / b;
 }

-template <typename FT>
-static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT h, int cn,
+template <typename FT, typename ST, typename WT>
+static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight,
+                                      int searchWindowSize, int templateWindowSize,
+                                      const FT *h, int hn, int cn, int normType,
                                      int & almostTemplateWindowSizeSqBinShift)
 {
-    const int maxEstimateSumValue = searchWindowSize * searchWindowSize * 255;
-    int fixedPointMult = std::numeric_limits<int>::max() / maxEstimateSumValue;
+    const WT maxEstimateSumValue = searchWindowSize * searchWindowSize *
+        std::numeric_limits<ST>::max();
+    int fixedPointMult = (int)std::min<WT>(std::numeric_limits<WT>::max() / maxEstimateSumValue,
+                                           std::numeric_limits<int>::max());
    int depth = DataType<FT>::depth;
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;

@@ -48,33 +52,44 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow
    FT almostDist2ActualDistMultiplier = (FT)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq;

    const FT WEIGHT_THRESHOLD = 1e-3f;
-    int maxDist = 255 * 255 * cn;
+    int maxDist = normType == NORM_L1 ? std::numeric_limits<ST>::max() * cn :
+        std::numeric_limits<ST>::max() * std::numeric_limits<ST>::max() * cn;
    int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1);
-    FT den = 1.0f / (h * h * cn);
+    FT den[4];
+    CV_Assert(hn > 0 && hn <= 4);
+    for (int i=0; i<hn; i++)
+        den[i] = 1.0f / (h[i] * h[i] * cn);

-    almostDist2Weight.create(1, almostMaxDist, CV_32SC1);
+    almostDist2Weight.create(1, almostMaxDist, CV_32SC(hn == 3 ? 4 : hn));

+    char buf[40];
    ocl::Kernel k("calcAlmostDist2Weight", ocl::photo::nlmeans_oclsrc,
-                  format("-D OP_CALC_WEIGHTS -D FT=%s%s", ocl::typeToStr(depth),
-                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+                  format("-D OP_CALC_WEIGHTS -D FT=%s -D w_t=%s"
+                         " -D wlut_t=%s -D convert_wlut_t=%s%s%s",
+                         ocl::typeToStr(depth), ocl::typeToStr(CV_MAKE_TYPE(depth, hn)),
+                         ocl::typeToStr(CV_32SC(hn)), ocl::convertTypeStr(depth, CV_32S, hn, buf),
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+                         normType == NORM_L1 ? " -D ABS" : ""));
    if (k.empty())
        return false;

    k.args(ocl::KernelArg::PtrWriteOnly(almostDist2Weight), almostMaxDist,
-           almostDist2ActualDistMultiplier, fixedPointMult, den, WEIGHT_THRESHOLD);
+           almostDist2ActualDistMultiplier, fixedPointMult,
+           ocl::KernelArg::Constant(den, (hn == 3 ? 4 : hn)*sizeof(FT)), WEIGHT_THRESHOLD);

    size_t globalsize[1] = { almostMaxDist };
    return k.run(1, globalsize, NULL, false);
 }

-static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
-                                     int templateWindowSize, int searchWindowSize)
+static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, const float *h, int hn,
+                                     int templateWindowSize, int searchWindowSize, int normType)
 {
-    int type = _src.type(), cn = CV_MAT_CN(type);
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT;
    Size size = _src.size();

-    if ( type != CV_8UC1 && type != CV_8UC2 && type != CV_8UC4 )
+    if (cn < 1 || cn > 4 || ((normType != NORM_L2 || depth != CV_8U) &&
+                             (normType != NORM_L1 || (depth != CV_8U && depth != CV_16U))))
        return false;

    int templateWindowHalfWize = templateWindowSize / 2;
@@ -84,33 +99,68 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
    int nblocksx = divUp(size.width, BLOCK_COLS), nblocksy = divUp(size.height, BLOCK_ROWS);
    int almostTemplateWindowSizeSqBinShift = -1;

-    char cvt[2][40];
+    char buf[4][40];
    String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
-                         " -D uchar_t=%s -D int_t=%s -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
+                         " -D pixel_t=%s -D int_t=%s -D wlut_t=%s"
+                         " -D weight_t=%s -D convert_weight_t=%s -D sum_t=%s -D convert_sum_t=%s"
+                         " -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
                         " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
-                         " -D convert_int_t=%s -D cn=%d -D convert_uchar_t=%s",
-                         templateWindowSize, searchWindowSize, ocl::typeToStr(type),
-                         ocl::typeToStr(CV_32SC(cn)), BLOCK_COLS, BLOCK_ROWS, ctaSize,
-                         templateWindowHalfWize, searchWindowHalfSize,
-                         ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), cn,
-                         ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1]));
+                         " -D convert_int_t=%s -D cn=%d -D psz=%d -D convert_pixel_t=%s%s",
+                         templateWindowSize, searchWindowSize,
+                         ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
+                         ocl::typeToStr(CV_32SC(hn)),
+                         depth == CV_8U ? ocl::typeToStr(CV_32SC(hn)) :
+                         format("long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
+                         depth == CV_8U ? ocl::convertTypeStr(CV_32S, CV_32S, hn, buf[0]) :
+                         format("convert_long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
+                         depth == CV_8U ? ocl::typeToStr(CV_32SC(cn)) :
+                         format("long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
+                         depth == CV_8U ? ocl::convertTypeStr(depth, CV_32S, cn, buf[1]) :
+                         format("convert_long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
+                         BLOCK_COLS, BLOCK_ROWS,
+                         ctaSize, templateWindowHalfWize, searchWindowHalfSize,
+                         ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn,
+                         (depth == CV_8U ? sizeof(uchar) : sizeof(ushort)) * (cn == 3 ? 4 : cn),
+                         ocl::convertTypeStr(CV_32S, depth, cn, buf[3]),
+                         normType == NORM_L1 ? " -D ABS" : "");

    ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
    if (k.empty())
        return false;

    UMat almostDist2Weight;
-    if (!ocl_calcAlmostDist2Weight<float>(almostDist2Weight, searchWindowSize, templateWindowSize, h, cn,
-                                   almostTemplateWindowSizeSqBinShift))
+    if ((depth == CV_8U &&
+         !ocl_calcAlmostDist2Weight<float, uchar, int>(almostDist2Weight,
+                                                       searchWindowSize, templateWindowSize,
+                                                       h, hn, cn, normType,
+                                                       almostTemplateWindowSizeSqBinShift)) ||
+        (depth == CV_16U &&
+         !ocl_calcAlmostDist2Weight<float, ushort, int64>(almostDist2Weight,
+                                                          searchWindowSize, templateWindowSize,
+                                                          h, hn, cn, normType,
+                                                          almostTemplateWindowSizeSqBinShift)))
        return false;
    CV_Assert(almostTemplateWindowSizeSqBinShift >= 0);

    UMat srcex;
    int borderSize = searchWindowHalfSize + templateWindowHalfWize;
-    copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
+    if (cn == 3) {
+        srcex.create(size.height + 2*borderSize, size.width + 2*borderSize, CV_MAKE_TYPE(depth, 4));
+        UMat src(srcex, Rect(borderSize, borderSize, size.width, size.height));
+        int from_to[] = { 0,0, 1,1, 2,2 };
+        mixChannels(std::vector<UMat>(1, _src.getUMat()), std::vector<UMat>(1, src), from_to, 3);
+        copyMakeBorder(src, srcex, borderSize, borderSize, borderSize, borderSize,
+                       BORDER_DEFAULT|BORDER_ISOLATED); // create borders in place
+    }
+    else
+        copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);

    _dst.create(size, type);
-    UMat dst = _dst.getUMat();
+    UMat dst;
+    if (cn == 3)
+        dst.create(size, CV_MAKE_TYPE(depth, 4));
+    else
+        dst = _dst.getUMat();

    int searchWindowSizeSq = searchWindowSize * searchWindowSize;
    Size upColSumSize(size.width, searchWindowSizeSq * nblocksy);
@@ -123,7 +173,14 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
           ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift);

    size_t globalsize[2] = { nblocksx * ctaSize, nblocksy }, localsize[2] = { ctaSize, 1 };
-    return k.run(2, globalsize, localsize, false);
+    if (!k.run(2, globalsize, localsize, false)) return false;
+
+    if (cn == 3) {
+        int from_to[] = { 0,0, 1,1, 2,2 };
+        mixChannels(std::vector<UMat>(1, dst), std::vector<UMat>(1, _dst.getUMat()), from_to, 3);
+    }
+
+    return true;
 }

 static bool ocl_fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,

--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -50,14 +50,14 @@

 using namespace cv;

-template <typename T>
+template <typename T, typename IT, typename UIT, typename D, typename WT>
 struct FastNlMeansMultiDenoisingInvoker :
        ParallelLoopBody
 {
 public:
    FastNlMeansMultiDenoisingInvoker(const std::vector<Mat>& srcImgs, int imgToDenoiseIndex,
                                     int temporalWindowSize, Mat& dst, int template_window_size,
-                                     int search_window_size, const float h);
+                                     int search_window_size, const float *h);

    void operator() (const Range& range) const;

@@ -81,9 +81,9 @@ private:
    int search_window_half_size_;
    int temporal_window_half_size_;

-    int fixed_point_mult_;
+    typename pixelInfo<WT>::sampleType fixed_point_mult_;
    int almost_template_window_size_sq_bin_shift;
-    std::vector<int> almost_dist2weight;
+    std::vector<WT> almost_dist2weight;

    void calcDistSumsForFirstElementInRow(int i, Array3d<int>& dist_sums,
                                          Array4d<int>& col_dist_sums,
@@ -94,19 +94,19 @@ private:
                                          Array4d<int>& up_col_dist_sums) const;
 };

-template <class T>
-FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::FastNlMeansMultiDenoisingInvoker(
    const std::vector<Mat>& srcImgs,
    int imgToDenoiseIndex,
    int temporalWindowSize,
    cv::Mat& dst,
    int template_window_size,
    int search_window_size,
-    const float h) :
+    const float *h) :
        dst_(dst), extended_srcs_(srcImgs.size())
 {
    CV_Assert(srcImgs.size() > 0);
-    CV_Assert(srcImgs[0].channels() == sizeof(T));
+    CV_Assert(srcImgs[0].channels() == pixelInfo<T>::channels);

    rows_ = srcImgs[0].rows;
    cols_ = srcImgs[0].cols;
@@ -125,8 +125,10 @@ FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
            border_size_, border_size_, border_size_, border_size_, cv::BORDER_DEFAULT);

    main_extended_src_ = extended_srcs_[temporal_window_half_size_];
-    const int max_estimate_sum_value = temporal_window_size_ * search_window_size_ * search_window_size_ * 255;
-    fixed_point_mult_ = std::numeric_limits<int>::max() / max_estimate_sum_value;
+    const IT max_estimate_sum_value =
+        (IT)temporal_window_size_ * (IT)search_window_size_ * (IT)search_window_size_ * (IT)pixelInfo<T>::sampleMax();
+    fixed_point_mult_ = (int)std::min<IT>(std::numeric_limits<IT>::max() / max_estimate_sum_value,
+                                          pixelInfo<WT>::sampleMax());

    // precalc weight for every possible l2 dist between blocks
    // additional optimization of precalced weights to replace division(averaging) by binary shift
@@ -138,30 +140,24 @@ FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
    int almost_template_window_size_sq = 1 << almost_template_window_size_sq_bin_shift;
    double almost_dist2actual_dist_multiplier = (double) almost_template_window_size_sq / template_window_size_sq;

-    int max_dist = 255 * 255 * sizeof(T);
-    int almost_max_dist = (int) (max_dist / almost_dist2actual_dist_multiplier + 1);
+    int max_dist = D::template maxDist<T>();
+    int almost_max_dist = (int)(max_dist / almost_dist2actual_dist_multiplier + 1);
    almost_dist2weight.resize(almost_max_dist);

-    const double WEIGHT_THRESHOLD = 0.001;
    for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
    {
        double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
-
-        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
-            weight = 0;
-
-        almost_dist2weight[almost_dist] = weight;
+        almost_dist2weight[almost_dist] =
+            D::template calcWeight<T, WT>(dist, h, fixed_point_mult_);
    }
-    CV_Assert(almost_dist2weight[0] == fixed_point_mult_);

    // additional optimization init end
    if (dst_.empty())
        dst_ = Mat::zeros(srcImgs[0].size(), srcImgs[0].type());
 }

-template <class T>
-void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::operator() (const Range& range) const
 {
    int row_from = range.start;
    int row_to = range.end - 1;
@@ -234,7 +230,7 @@ void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
                                dist_sums_row[x] -= col_dist_sums_row[x];

                                col_dist_sums_row[x] = up_col_dist_sums_row[x] +
-                                    calcUpDownDist(a_up, a_down, b_up_ptr[start_bx + x], b_down_ptr[start_bx + x]);
+                                    D::template calcUpDownDist<T>(a_up, a_down, b_up_ptr[start_bx + x], b_down_ptr[start_bx + x]);

                                dist_sums_row[x] += col_dist_sums_row[x];
                                up_col_dist_sums_row[x] = col_dist_sums_row[x];
@@ -247,11 +243,11 @@ void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
            }

            // calc weights
-            int weights_sum = 0;
-
-            int estimation[3];
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
+            IT estimation[pixelInfo<T>::channels], weights_sum[pixelInfo<WT>::channels];
+            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
                estimation[channel_num] = 0;
+            for (size_t channel_num = 0; channel_num < pixelInfo<WT>::channels; channel_num++)
+                weights_sum[channel_num] = 0;

            for (int d = 0; d < temporal_window_size_; d++)
            {
@@ -266,26 +262,22 @@ void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
                    {
                        int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift;

-                        int weight = almost_dist2weight[almostAvgDist];
-                        weights_sum += weight;
-
+                        WT weight =  almost_dist2weight[almostAvgDist];
                        T p = cur_row_ptr[border_size_ + search_window_x + x];
-                        incWithWeight(estimation, weight, p);
+                        incWithWeight<T, IT, WT>(estimation, weights_sum, weight, p);
                    }
                }
            }

-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
-                estimation[channel_num] = ((unsigned)estimation[channel_num] + weights_sum / 2) / weights_sum;
-
-            dst_.at<T>(i,j) = saturateCastFromArray<T>(estimation);
-
+            divByWeightsSum<IT, UIT, pixelInfo<T>::channels, pixelInfo<WT>::channels>(estimation,
+                                                                                      weights_sum);
+            dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);
        }
    }
 }

-template <class T>
-inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::calcDistSumsForFirstElementInRow(
        int i, Array3d<int>& dist_sums, Array4d<int>& col_dist_sums, Array4d<int>& up_col_dist_sums) const
 {
    int j = 0;
@@ -310,7 +302,7 @@ inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForFirstElementInRo
                {
                    for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                    {
-                        int dist = calcDist<T>(
+                        int dist = D::template calcDist<T>(
                                    main_extended_src_.at<T>(border_size_ + i + ty, border_size_ + j + tx),
                                    cur_extended_src.at<T>(border_size_ + start_y + ty, border_size_ + start_x + tx));

@@ -325,8 +317,8 @@ inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForFirstElementInRo
    }
 }

-template <class T>
-inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForElementInFirstRow(
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::calcDistSumsForElementInFirstRow(
    int i, int j, int first_col_num, Array3d<int>& dist_sums,
    Array4d<int>& col_dist_sums, Array4d<int>& up_col_dist_sums) const
 {
@@ -353,7 +345,7 @@ inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForElementInFirstRo
                int* col_dist_sums_ptr = &col_dist_sums[new_last_col_num][d][y][x];
                for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                {
-                    *col_dist_sums_ptr += calcDist<T>(
+                    *col_dist_sums_ptr += D::template calcDist<T>(
                                main_extended_src_.at<T>(ay + ty, ax),
                                cur_extended_src.at<T>(by + ty, bx));
                }

--- a/modules/photo/src/opencl/nlmeans.cl
+++ b/modules/photo/src/opencl/nlmeans.cl
--- a/modules/photo/test/ocl/test_denoising.cpp
+++ b/modules/photo/test/ocl/test_denoising.cpp
@@ -13,11 +13,11 @@
 namespace cvtest {
 namespace ocl {

-PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool)
+PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, int, bool, bool)
 {
-    int cn, templateWindowSize, searchWindowSize;
-    float h;
-    bool use_roi;
+    int cn, normType, templateWindowSize, searchWindowSize;
+    std::vector<float> h;
+    bool use_roi, use_image;

    TEST_DECLARE_INPUT_PARAMETER(src);
    TEST_DECLARE_OUTPUT_PARAMETER(dst);
@@ -25,29 +25,46 @@ PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool)
    virtual void SetUp()
    {
        cn = GET_PARAM(0);
-        use_roi = GET_PARAM(1);
+        normType = GET_PARAM(1);
+        use_roi = GET_PARAM(2);
+        use_image = GET_PARAM(3);

        templateWindowSize = 7;
        searchWindowSize = 21;
-        h = 3.0f;
+
+        h.resize(cn);
+        for (int i=0; i<cn; i++)
+            h[i] = 3.0f + 0.5f*i;
    }

    virtual void generateTestData()
    {
+        const int type = CV_8UC(cn);
        Mat image;
-        if (cn == 1)
-        {
-            image = readImage("denoising/lena_noised_gaussian_sigma=10.png", IMREAD_GRAYSCALE);
+
+        if (use_image) {
+            image = readImage("denoising/lena_noised_gaussian_sigma=10.png",
+                                  cn == 1 ? IMREAD_GRAYSCALE : IMREAD_COLOR);
            ASSERT_FALSE(image.empty());
        }

-        const int type = CV_8UC(cn);
-
-        Size roiSize = cn == 1 ? image.size() : randomSize(1, MAX_VALUE);
+        Size roiSize = use_image ? image.size() : randomSize(1, MAX_VALUE);
        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
        randomSubMat(src, src_roi, roiSize, srcBorder, type, 0, 255);
-        if (cn == 1)
-            image.copyTo(src_roi);
+        if (use_image) {
+            ASSERT_TRUE(cn > 0 && cn <= 4);
+            if (cn == 2) {
+                int from_to[] = { 0,0, 1,1 };
+                src_roi.create(roiSize, type);
+                mixChannels(&image, 1, &src_roi, 1, from_to, 2);
+            }
+            else if (cn == 4) {
+                int from_to[] = { 0,0, 1,1, 2,2, 1,3};
+                src_roi.create(roiSize, type);
+                mixChannels(&image, 1, &src_roi, 1, from_to, 4);
+            }
+            else image.copyTo(src_roi);
+        }

        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
        randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 0, 255);
@@ -65,8 +82,23 @@ OCL_TEST_P(FastNlMeansDenoising, Mat)
    {
        generateTestData();

-        OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, h, templateWindowSize, searchWindowSize));
-        OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, h, templateWindowSize, searchWindowSize));
+        OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, std::vector<float>(1, h[0]), templateWindowSize, searchWindowSize, normType));
+        OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, std::vector<float>(1, h[0]), templateWindowSize, searchWindowSize, normType));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1);
+    }
+}
+
+typedef FastNlMeansDenoisingTestBase FastNlMeansDenoising_hsep;
+
+OCL_TEST_P(FastNlMeansDenoising_hsep, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, h, templateWindowSize, searchWindowSize, normType));
+        OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, h, templateWindowSize, searchWindowSize, normType));

        OCL_EXPECT_MATS_NEAR(dst, 1);
    }
@@ -80,15 +112,21 @@ OCL_TEST_P(FastNlMeansDenoisingColored, Mat)
    {
        generateTestData();

-        OCL_OFF(cv::fastNlMeansDenoisingColored(src_roi, dst_roi, h, h, templateWindowSize, searchWindowSize));
-        OCL_ON(cv::fastNlMeansDenoisingColored(usrc_roi, udst_roi, h, h, templateWindowSize, searchWindowSize));
+        OCL_OFF(cv::fastNlMeansDenoisingColored(src_roi, dst_roi, h[0], h[0], templateWindowSize, searchWindowSize));
+        OCL_ON(cv::fastNlMeansDenoisingColored(usrc_roi, udst_roi, h[0], h[0], templateWindowSize, searchWindowSize));

        OCL_EXPECT_MATS_NEAR(dst, 1);
    }
 }

-OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising, Combine(Values(1, 2), Bool()));
-OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoisingColored, Combine(Values(3, 4), Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising,
+                            Combine(Values(1, 2, 3, 4), Values((int)NORM_L2, (int)NORM_L1),
+                                    Bool(), Values(true)));
+OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising_hsep,
+                            Combine(Values(1, 2, 3, 4), Values((int)NORM_L2, (int)NORM_L1),
+                                    Bool(), Values(true)));
+OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoisingColored,
+                            Combine(Values(3, 4), Values((int)NORM_L2), Bool(), Values(false)));

 } } // namespace cvtest::ocl