Refactoring in preparation for 16-bit implementation of fastNlMeansDenoising

e178294b · Erik Karlsson · 5466e321 · e178294b · e178294b · e178294b
Commit e178294b authored Feb 12, 2015 by Erik Karlsson
4 changed files
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -65,17 +65,17 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
    switch (src.type()) {
        case CV_8U:
            parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<uchar>(
+                FastNlMeansDenoisingInvoker<uchar, int, unsigned int>(
                    src, dst, templateWindowSize, searchWindowSize, h));
            break;
        case CV_8UC2:
            parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec2b>(
+                FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned int>(
                    src, dst, templateWindowSize, searchWindowSize, h));
            break;
        case CV_8UC3:
            parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec3b>(
+                FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned int>(
                    src, dst, templateWindowSize, searchWindowSize, h));
            break;
        default:
@@ -175,19 +175,19 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
    {
        case CV_8U:
            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<uchar>(
+                FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned int>(
                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
                    dst, templateWindowSize, searchWindowSize, h));
            break;
        case CV_8UC2:
            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec2b>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned int>(
                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
                    dst, templateWindowSize, searchWindowSize, h));
            break;
        case CV_8UC3:
            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec3b>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned int>(
                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
                    dst, templateWindowSize, searchWindowSize, h));
            break;

--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -50,7 +50,7 @@

 using namespace cv;

-template <typename T>
+template <typename T, typename IT, typename UIT>
 struct FastNlMeansDenoisingInvoker :
        public ParallelLoopBody
 {
@@ -75,20 +75,20 @@ private:
    int template_window_half_size_;
    int search_window_half_size_;

-    int fixed_point_mult_;
+    IT fixed_point_mult_;
    int almost_template_window_size_sq_bin_shift_;
-    std::vector<int> almost_dist2weight_;
+    std::vector<IT> almost_dist2weight_;

    void calcDistSumsForFirstElementInRow(
-        int i, Array2d<int>& dist_sums,
-        Array3d<int>& col_dist_sums,
-        Array3d<int>& up_col_dist_sums) const;
+        int i, Array2d<IT>& dist_sums,
+        Array3d<IT>& col_dist_sums,
+        Array3d<IT>& up_col_dist_sums) const;

    void calcDistSumsForElementInFirstRow(
        int i, int j, int first_col_num,
-        Array2d<int>& dist_sums,
-        Array3d<int>& col_dist_sums,
-        Array3d<int>& up_col_dist_sums) const;
+        Array2d<IT>& dist_sums,
+        Array3d<IT>& col_dist_sums,
+        Array3d<IT>& up_col_dist_sums) const;
 };

 inline int getNearestPowerOf2(int value)
@@ -99,8 +99,8 @@ inline int getNearestPowerOf2(int value)
    return p;
 }

-template <class T>
-FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
+template <class T, typename IT, typename UIT>
+FastNlMeansDenoisingInvoker<T, IT, UIT>::FastNlMeansDenoisingInvoker(
    const Mat& src, Mat& dst,
    int template_window_size,
    int search_window_size,
@@ -117,8 +117,8 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
    border_size_ = search_window_half_size_ + template_window_half_size_;
    copyMakeBorder(src_, extended_src_, border_size_, border_size_, border_size_, border_size_, BORDER_DEFAULT);

-    const int max_estimate_sum_value = search_window_size_ * search_window_size_ * 255;
-    fixed_point_mult_ = std::numeric_limits<int>::max() / max_estimate_sum_value;
+    const IT max_estimate_sum_value = (IT)search_window_size_ * (IT)search_window_size_ * 255;
+    fixed_point_mult_ = std::numeric_limits<IT>::max() / max_estimate_sum_value;

    // precalc weight for every possible l2 dist between blocks
    // additional optimization of precalced weights to replace division(averaging) by binary shift
@@ -127,7 +127,7 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
    almost_template_window_size_sq_bin_shift_ = getNearestPowerOf2(template_window_size_sq);
    double almost_dist2actual_dist_multiplier = ((double)(1 << almost_template_window_size_sq_bin_shift_)) / template_window_size_sq;

-    int max_dist = 255 * 255 * sizeof(T);
+    IT max_dist = 255 * 255 * sizeof(T);
    int almost_max_dist = (int)(max_dist / almost_dist2actual_dist_multiplier + 1);
    almost_dist2weight_.resize(almost_max_dist);

@@ -135,7 +135,7 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
    for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
    {
        double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
+        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));

        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
            weight = 0;
@@ -149,21 +149,21 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
        dst_ = Mat::zeros(src_.size(), src_.type());
 }

-template <class T>
-void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const
+template <class T, typename IT, typename UIT>
+void FastNlMeansDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) const
 {
    int row_from = range.start;
    int row_to = range.end - 1;

    // sums of cols anf rows for current pixel p
-    Array2d<int> dist_sums(search_window_size_, search_window_size_);
+    Array2d<IT> dist_sums(search_window_size_, search_window_size_);

    // for lazy calc optimization (sum of cols for current pixel)
-    Array3d<int> col_dist_sums(template_window_size_, search_window_size_, search_window_size_);
+    Array3d<IT> col_dist_sums(template_window_size_, search_window_size_, search_window_size_);

    int first_col_num = -1;
    // last elements of column sum (for each element in row)
-    Array3d<int> up_col_dist_sums(src_.cols, search_window_size_, search_window_size_);
+    Array3d<IT> up_col_dist_sums(src_.cols, search_window_size_, search_window_size_);

    for (int i = row_from; i <= row_to; i++)
    {
@@ -202,9 +202,9 @@ void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const

                    for (int y = 0; y < search_window_size; y++)
                    {
-                        int * dist_sums_row = dist_sums.row_ptr(y);
-                        int * col_dist_sums_row = col_dist_sums.row_ptr(first_col_num, y);
-                        int * up_col_dist_sums_row = up_col_dist_sums.row_ptr(j, y);
+                        IT * dist_sums_row = dist_sums.row_ptr(y);
+                        IT * col_dist_sums_row = col_dist_sums.row_ptr(first_col_num, y);
+                        IT * up_col_dist_sums_row = up_col_dist_sums.row_ptr(j, y);

                        const T * b_up_ptr = extended_src_.ptr<T>(start_by - template_window_half_size_ - 1 + y);
                        const T * b_down_ptr = extended_src_.ptr<T>(start_by + template_window_half_size_ + y);
@@ -215,7 +215,7 @@ void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const
                            dist_sums_row[x] -= col_dist_sums_row[x];

                            int bx = start_bx + x;
-                            col_dist_sums_row[x] = up_col_dist_sums_row[x] + calcUpDownDist(a_up, a_down, b_up_ptr[bx], b_down_ptr[bx]);
+                            col_dist_sums_row[x] = up_col_dist_sums_row[x] + calcUpDownDist<T, IT>(a_up, a_down, b_up_ptr[bx], b_down_ptr[bx]);

                            dist_sums_row[x] += col_dist_sums_row[x];
                            up_col_dist_sums_row[x] = col_dist_sums_row[x];
@@ -227,39 +227,39 @@ void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const
            }

            // calc weights
-            int estimation[3], weights_sum = 0;
+            IT estimation[3], weights_sum = 0;
            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
                estimation[channel_num] = 0;

            for (int y = 0; y < search_window_size_; y++)
            {
                const T* cur_row_ptr = extended_src_.ptr<T>(border_size_ + search_window_y + y);
-                int* dist_sums_row = dist_sums.row_ptr(y);
+                IT* dist_sums_row = dist_sums.row_ptr(y);
                for (int x = 0; x < search_window_size_; x++)
                {
-                    int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_;
-                    int weight = almost_dist2weight_[almostAvgDist];
+                    int almostAvgDist = (int)(dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_);
+                    IT weight = almost_dist2weight_[almostAvgDist];
                    weights_sum += weight;

                    T p = cur_row_ptr[border_size_ + search_window_x + x];
-                    incWithWeight(estimation, weight, p);
+                    incWithWeight<T, IT>(estimation, weight, p);
                }
            }

            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
-                estimation[channel_num] = ((unsigned)estimation[channel_num] + weights_sum/2) / weights_sum;
+                estimation[channel_num] = (static_cast<UIT>(estimation[channel_num]) + weights_sum/2) / weights_sum;

-            dst_.at<T>(i,j) = saturateCastFromArray<T>(estimation);
+            dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);
        }
    }
 }

-template <class T>
-inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
+template <class T, typename IT, typename UIT>
+inline void FastNlMeansDenoisingInvoker<T, IT, UIT>::calcDistSumsForFirstElementInRow(
    int i,
-    Array2d<int>& dist_sums,
-    Array3d<int>& col_dist_sums,
-    Array3d<int>& up_col_dist_sums) const
+    Array2d<IT>& dist_sums,
+    Array3d<IT>& col_dist_sums,
+    Array3d<IT>& up_col_dist_sums) const
 {
    int j = 0;

@@ -276,7 +276,7 @@ inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
            for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                for (int tx = -template_window_half_size_; tx <= template_window_half_size_; tx++)
                {
-                    int dist = calcDist<T>(extended_src_,
+                    IT dist = calcDist<T, IT>(extended_src_,  // calcDist<T, IT> returns IT; do not narrow to int
                        border_size_ + i + ty, border_size_ + j + tx,
                        border_size_ + start_y + ty, border_size_ + start_x + tx);

@@ -288,12 +288,12 @@ inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
        }
 }

-template <class T>
-inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForElementInFirstRow(
+template <class T, typename IT, typename UIT>
+inline void FastNlMeansDenoisingInvoker<T, IT, UIT>::calcDistSumsForElementInFirstRow(
    int i, int j, int first_col_num,
-    Array2d<int>& dist_sums,
-    Array3d<int>& col_dist_sums,
-    Array3d<int>& up_col_dist_sums) const
+    Array2d<IT>& dist_sums,
+    Array3d<IT>& col_dist_sums,
+    Array3d<IT>& up_col_dist_sums) const
 {
    int ay = border_size_ + i;
    int ax = border_size_ + j + template_window_half_size_;
@@ -312,7 +312,7 @@ inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForElementInFirstRow(
            int by = start_by + y;
            int bx = start_bx + x;
            for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
-                col_dist_sums[new_last_col_num][y][x] += calcDist<T>(extended_src_, ay + ty, ax, by + ty, bx);
+                col_dist_sums[new_last_col_num][y][x] += calcDist<T,IT>(extended_src_, ay + ty, ax, by + ty, bx);

            dist_sums[y][x] += col_dist_sums[new_last_col_num][y][x];
            up_col_dist_sums[j][y][x] = col_dist_sums[new_last_col_num][y][x];

--- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
@@ -44,118 +44,152 @@

 using namespace cv;

-template <typename T> static inline int calcDist(const T a, const T b);
+template <typename T, typename IT> struct calcDist_
+{
+    static inline IT f(const T a, const T b);
+};

-template <> inline int calcDist(const uchar a, const uchar b)
+template <typename IT> struct calcDist_<uchar, IT>
 {
-    return (a-b) * (a-b);
-}
+    static inline IT f(uchar a, uchar b)
+    {
+        return (IT)(a-b) * (IT)(a-b);
+    }
+};

-template <> inline int calcDist(const Vec2b a, const Vec2b b)
+template <typename IT> struct calcDist_<Vec2b, IT>
 {
-    return (a[0]-b[0])*(a[0]-b[0]) + (a[1]-b[1])*(a[1]-b[1]);
-}
+    static inline IT f(const Vec2b a, const Vec2b b)
+    {
+        return (IT)(a[0]-b[0])*(IT)(a[0]-b[0]) + (IT)(a[1]-b[1])*(IT)(a[1]-b[1]);
+    }
+};
+
+template <typename IT> struct calcDist_<Vec3b, IT>
+{
+    static inline IT f(const Vec3b a, const Vec3b b)
+    {
+        return
+            (IT)(a[0]-b[0])*(IT)(a[0]-b[0]) +
+            (IT)(a[1]-b[1])*(IT)(a[1]-b[1]) +
+            (IT)(a[2]-b[2])*(IT)(a[2]-b[2]);
+    }
+};

-template <> inline int calcDist(const Vec3b a, const Vec3b b)
+template <typename T, typename IT> static inline IT calcDist(const T a, const T b)
 {
-    return (a[0]-b[0])*(a[0]-b[0]) + (a[1]-b[1])*(a[1]-b[1]) + (a[2]-b[2])*(a[2]-b[2]);
+    return calcDist_<T, IT>::f(a, b);
 }

-template <typename T> static inline int calcDist(const Mat& m, int i1, int j1, int i2, int j2)
+template <typename T, typename IT>
+static inline IT calcDist(const Mat& m, int i1, int j1, int i2, int j2)
 {
    const T a = m.at<T>(i1, j1);
    const T b = m.at<T>(i2, j2);
-    return calcDist<T>(a,b);
+    return calcDist<T, IT>(a,b);
 }

-template <typename T> static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
+template <typename T, typename IT> struct calcUpDownDist_
 {
-    return calcDist(a_down, b_down) - calcDist(a_up, b_up);
-}
+    static inline IT f(T a_up, T a_down, T b_up, T b_down)
+    {
+        return calcDist<T, IT>(a_down, b_down) - calcDist<T, IT>(a_up, b_up);
+    }
+};

-template <> inline int calcUpDownDist(uchar a_up, uchar a_down, uchar  b_up, uchar b_down)
+template <typename IT> struct calcUpDownDist_<uchar, IT>
 {
-    int A = a_down - b_down;
-    int B = a_up - b_up;
+    static inline IT f(uchar a_up, uchar a_down, uchar b_up, uchar b_down)
+    {
+        IT A = a_down - b_down;
+        IT B = a_up - b_up;
        return (A-B)*(A+B);
-}
+    }
+};

-template <typename T> static inline void incWithWeight(int* estimation, int weight, T p);
-
-template <> inline void incWithWeight(int* estimation, int weight, uchar p)
+template <typename T, typename IT>
+static inline IT calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
 {
-    estimation[0] += weight * p;
-}
+    return calcUpDownDist_<T, IT>::f(a_up, a_down, b_up, b_down);  // forward to the calcUpDownDist_ functor (specialized for uchar)
+}

-template <> inline void incWithWeight(int* estimation, int weight, Vec2b p)
+template <typename T, typename IT> struct incWithWeight_
 {
-    estimation[0] += weight * p[0];
-    estimation[1] += weight * p[1];
-}
+    static inline void f(IT* estimation, IT weight, T p);
+};

-template <> inline void incWithWeight(int* estimation, int weight, Vec3b p)
-{
-    estimation[0] += weight * p[0];
-    estimation[1] += weight * p[1];
-    estimation[2] += weight * p[2];
-}
-
-template <> inline void incWithWeight(int* estimation, int weight, int p)
+template <typename IT> struct incWithWeight_<uchar, IT>
 {
+    static inline void f(IT* estimation, IT weight, uchar p)
+    {
        estimation[0] += weight * p;
-}
+    }
+};

-template <> inline void incWithWeight(int* estimation, int weight, Vec2i p)
+template <typename IT> struct incWithWeight_<Vec2b, IT>
 {
+    static inline void f(IT* estimation, IT weight, Vec2b p)
+    {
        estimation[0] += weight * p[0];
        estimation[1] += weight * p[1];
-}
+    }
+};

-template <> inline void incWithWeight(int* estimation, int weight, Vec3i p)
+template <typename IT> struct incWithWeight_<Vec3b, IT>
 {
+    static inline void f(IT* estimation, IT weight, Vec3b p)
+    {
        estimation[0] += weight * p[0];
        estimation[1] += weight * p[1];
        estimation[2] += weight * p[2];
+    }
+};
+
+template <typename T, typename IT>
+static inline void incWithWeight(IT* estimation, IT weight, T p)
+{
+    return incWithWeight_<T, IT>::f(estimation, weight, p);
 }

-template <typename T> static inline T saturateCastFromArray(int* estimation);
+template <typename T, typename IT> struct saturateCastFromArray_
+{
+    static inline T f(IT* estimation);
+};

-template <> inline uchar saturateCastFromArray(int* estimation)
+template <typename IT> struct saturateCastFromArray_<uchar, IT>
 {
+    static inline uchar f(IT* estimation)
+    {
        return saturate_cast<uchar>(estimation[0]);
-}
+    }
+};

-template <> inline Vec2b saturateCastFromArray(int* estimation)
+template <typename IT> struct saturateCastFromArray_<Vec2b, IT>
 {
+    static inline Vec2b f(IT* estimation)
+    {
        Vec2b res;
        res[0] = saturate_cast<uchar>(estimation[0]);
        res[1] = saturate_cast<uchar>(estimation[1]);
        return res;
-}
+    }
+};

-template <> inline Vec3b saturateCastFromArray(int* estimation)
+template <typename IT> struct saturateCastFromArray_<Vec3b, IT>
 {
+    static inline Vec3b f(IT* estimation)
+    {
        Vec3b res;
        res[0] = saturate_cast<uchar>(estimation[0]);
        res[1] = saturate_cast<uchar>(estimation[1]);
        res[2] = saturate_cast<uchar>(estimation[2]);
        return res;
-}
-
-template <> inline int saturateCastFromArray(int* estimation)
-{
-    return estimation[0];
-}
-
-template <> inline Vec2i saturateCastFromArray(int* estimation)
-{
-    estimation[1] = 0;
-    return Vec2i(estimation);
-}
+    }
+};

-template <> inline Vec3i saturateCastFromArray(int* estimation)
+template <typename T, typename IT> static inline T saturateCastFromArray(IT* estimation)
 {
-    return Vec3i(estimation);
+    return saturateCastFromArray_<T, IT>::f(estimation);
 }

 #endif
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -50,7 +50,7 @@

 using namespace cv;

-template <typename T>
+template <typename T, typename IT, typename UIT>
 struct FastNlMeansMultiDenoisingInvoker :
        ParallelLoopBody
 {
@@ -81,21 +81,21 @@ private:
    int search_window_half_size_;
    int temporal_window_half_size_;

-    int fixed_point_mult_;
+    IT fixed_point_mult_;
    int almost_template_window_size_sq_bin_shift;
-    std::vector<int> almost_dist2weight;
+    std::vector<IT> almost_dist2weight;

-    void calcDistSumsForFirstElementInRow(int i, Array3d<int>& dist_sums,
-                                          Array4d<int>& col_dist_sums,
-                                          Array4d<int>& up_col_dist_sums) const;
+    void calcDistSumsForFirstElementInRow(int i, Array3d<IT>& dist_sums,
+                                          Array4d<IT>& col_dist_sums,
+                                          Array4d<IT>& up_col_dist_sums) const;

    void calcDistSumsForElementInFirstRow(int i, int j, int first_col_num,
-                                          Array3d<int>& dist_sums, Array4d<int>& col_dist_sums,
-                                          Array4d<int>& up_col_dist_sums) const;
+                                          Array3d<IT>& dist_sums, Array4d<IT>& col_dist_sums,
+                                          Array4d<IT>& up_col_dist_sums) const;
 };

-template <class T>
-FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
+template <class T, typename IT, typename UIT>
+FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::FastNlMeansMultiDenoisingInvoker(
    const std::vector<Mat>& srcImgs,
    int imgToDenoiseIndex,
    int temporalWindowSize,
@@ -125,8 +125,9 @@ FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
            border_size_, border_size_, border_size_, border_size_, cv::BORDER_DEFAULT);

    main_extended_src_ = extended_srcs_[temporal_window_half_size_];
-    const int max_estimate_sum_value = temporal_window_size_ * search_window_size_ * search_window_size_ * 255;
-    fixed_point_mult_ = std::numeric_limits<int>::max() / max_estimate_sum_value;
+    const IT max_estimate_sum_value =
+        (IT)temporal_window_size_ * (IT)search_window_size_ * (IT)search_window_size_ * 255;
+    fixed_point_mult_ = std::numeric_limits<IT>::max() / max_estimate_sum_value;

    // precalc weight for every possible l2 dist between blocks
    // additional optimization of precalced weights to replace division(averaging) by binary shift
@@ -138,7 +139,7 @@ FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
    int almost_template_window_size_sq = 1 << almost_template_window_size_sq_bin_shift;
    double almost_dist2actual_dist_multiplier = (double) almost_template_window_size_sq / template_window_size_sq;

-    int max_dist = 255 * 255 * sizeof(T);
+    IT max_dist = 255 * 255 * sizeof(T);
    int almost_max_dist = (int) (max_dist / almost_dist2actual_dist_multiplier + 1);
    almost_dist2weight.resize(almost_max_dist);

@@ -146,7 +147,7 @@ FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
    for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
    {
        double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
+        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));

        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
            weight = 0;
@@ -160,19 +161,19 @@ FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
        dst_ = Mat::zeros(srcImgs[0].size(), srcImgs[0].type());
 }

-template <class T>
-void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
+template <class T, typename IT, typename UIT>
+void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) const
 {
    int row_from = range.start;
    int row_to = range.end - 1;

-    Array3d<int> dist_sums(temporal_window_size_, search_window_size_, search_window_size_);
+    Array3d<IT> dist_sums(temporal_window_size_, search_window_size_, search_window_size_);

    // for lazy calc optimization
-    Array4d<int> col_dist_sums(template_window_size_, temporal_window_size_, search_window_size_, search_window_size_);
+    Array4d<IT> col_dist_sums(template_window_size_, temporal_window_size_, search_window_size_, search_window_size_);

    int first_col_num = -1;
-    Array4d<int> up_col_dist_sums(cols_, temporal_window_size_, search_window_size_, search_window_size_);
+    Array4d<IT> up_col_dist_sums(cols_, temporal_window_size_, search_window_size_, search_window_size_);

    for (int i = row_from; i <= row_to; i++)
    {
@@ -216,15 +217,15 @@ void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
                    for (int d = 0; d < temporal_window_size_; d++)
                    {
                        Mat cur_extended_src = extended_srcs_[d];
-                        Array2d<int> cur_dist_sums = dist_sums[d];
-                        Array2d<int> cur_col_dist_sums = col_dist_sums[first_col_num][d];
-                        Array2d<int> cur_up_col_dist_sums = up_col_dist_sums[j][d];
+                        Array2d<IT> cur_dist_sums = dist_sums[d];
+                        Array2d<IT> cur_col_dist_sums = col_dist_sums[first_col_num][d];
+                        Array2d<IT> cur_up_col_dist_sums = up_col_dist_sums[j][d];
                        for (int y = 0; y < search_window_size; y++)
                        {
-                            int* dist_sums_row = cur_dist_sums.row_ptr(y);
+                            IT* dist_sums_row = cur_dist_sums.row_ptr(y);

-                            int* col_dist_sums_row = cur_col_dist_sums.row_ptr(y);
-                            int* up_col_dist_sums_row = cur_up_col_dist_sums.row_ptr(y);
+                            IT* col_dist_sums_row = cur_col_dist_sums.row_ptr(y);
+                            IT* up_col_dist_sums_row = cur_up_col_dist_sums.row_ptr(y);

                            const T* b_up_ptr = cur_extended_src.ptr<T>(start_by - template_window_half_size_ - 1 + y);
                            const T* b_down_ptr = cur_extended_src.ptr<T>(start_by + template_window_half_size_ + y);
@@ -234,7 +235,7 @@ void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
                                dist_sums_row[x] -= col_dist_sums_row[x];

                                col_dist_sums_row[x] = up_col_dist_sums_row[x] +
-                                    calcUpDownDist(a_up, a_down, b_up_ptr[start_bx + x], b_down_ptr[start_bx + x]);
+                                    calcUpDownDist<T, IT>(a_up, a_down, b_up_ptr[start_bx + x], b_down_ptr[start_bx + x]);

                                dist_sums_row[x] += col_dist_sums_row[x];
                                up_col_dist_sums_row[x] = col_dist_sums_row[x];
@@ -247,9 +248,9 @@ void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
            }

            // calc weights
-            int weights_sum = 0;
+            IT weights_sum = 0;

-            int estimation[3];
+            IT estimation[3];
            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
                estimation[channel_num] = 0;

@@ -260,33 +261,33 @@ void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
                {
                    const T* cur_row_ptr = esrc_d.ptr<T>(border_size_ + search_window_y + y);

-                    int* dist_sums_row = dist_sums.row_ptr(d, y);
+                    IT* dist_sums_row = dist_sums.row_ptr(d, y);

                    for (int x = 0; x < search_window_size_; x++)
                    {
-                        int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift;
+                        int almostAvgDist = (int)(dist_sums_row[x] >> almost_template_window_size_sq_bin_shift);

-                        int weight = almost_dist2weight[almostAvgDist];
+                        IT weight = almost_dist2weight[almostAvgDist];
                        weights_sum += weight;

                        T p = cur_row_ptr[border_size_ + search_window_x + x];
-                        incWithWeight(estimation, weight, p);
+                        incWithWeight<T, IT>(estimation, weight, p);
                    }
                }
            }

            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
-                estimation[channel_num] = ((unsigned)estimation[channel_num] + weights_sum / 2) / weights_sum;
+                estimation[channel_num] = (static_cast<UIT>(estimation[channel_num]) + weights_sum / 2) / weights_sum; // NOTE(review): with IT=int, UIT=unsigned int this matches the old (unsigned) cast; confirm the signed/unsigned mix for other IT/UIT

-            dst_.at<T>(i,j) = saturateCastFromArray<T>(estimation);
+            dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);

        }
    }
 }

-template <class T>
-inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
-        int i, Array3d<int>& dist_sums, Array4d<int>& col_dist_sums, Array4d<int>& up_col_dist_sums) const
+template <class T, typename IT, typename UIT>
+inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::calcDistSumsForFirstElementInRow(
+        int i, Array3d<IT>& dist_sums, Array4d<IT>& col_dist_sums, Array4d<IT>& up_col_dist_sums) const
 {
    int j = 0;

@@ -303,14 +304,14 @@ inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForFirstElementInRo
                int start_y = i + y - search_window_half_size_;
                int start_x = j + x - search_window_half_size_;

-                int* dist_sums_ptr = &dist_sums[d][y][x];
-                int* col_dist_sums_ptr = &col_dist_sums[0][d][y][x];
+                IT* dist_sums_ptr = &dist_sums[d][y][x];
+                IT* col_dist_sums_ptr = &col_dist_sums[0][d][y][x];
                int col_dist_sums_step = col_dist_sums.step_size(0);
                for (int tx = -template_window_half_size_; tx <= template_window_half_size_; tx++)
                {
                    for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                    {
-                        int dist = calcDist<T>(
+                        IT dist = calcDist<T, IT>(
                                    main_extended_src_.at<T>(border_size_ + i + ty, border_size_ + j + tx),
                                    cur_extended_src.at<T>(border_size_ + start_y + ty, border_size_ + start_x + tx));

@@ -325,10 +326,10 @@ inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForFirstElementInRo
    }
 }

-template <class T>
-inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForElementInFirstRow(
-    int i, int j, int first_col_num, Array3d<int>& dist_sums,
-    Array4d<int>& col_dist_sums, Array4d<int>& up_col_dist_sums) const
+template <class T, typename IT, typename UIT>
+inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::calcDistSumsForElementInFirstRow(
+    int i, int j, int first_col_num, Array3d<IT>& dist_sums,
+    Array4d<IT>& col_dist_sums, Array4d<IT>& up_col_dist_sums) const
 {
    int ay = border_size_ + i;
    int ax = border_size_ + j + template_window_half_size_;
@@ -350,10 +351,10 @@ inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForElementInFirstRo
                int by = start_by + y;
                int bx = start_bx + x;

-                int* col_dist_sums_ptr = &col_dist_sums[new_last_col_num][d][y][x];
+                IT* col_dist_sums_ptr = &col_dist_sums[new_last_col_num][d][y][x];
                for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                {
-                    *col_dist_sums_ptr += calcDist<T>(
+                    *col_dist_sums_ptr += calcDist<T, IT>(
                                main_extended_src_.at<T>(ay + ty, ax),
                                cur_extended_src.at<T>(by + ty, bx));
                }