Basic 16-bit implementation of fastNlMeansDenoising. Table-based exponentiation…

Basic 16-bit implementation of fastNlMeansDenoising. Table-based exponentiation leads to high memory footprint and loss of precision in 16-bit mode.

Basic 16-bit implementation of fastNlMeansDenoising. Table-based exponentiation…
Basic 16-bit implementation of fastNlMeansDenoising. Table-based exponentiation leads to high memory footprint and loss of precision in 16-bit mode.
42db9e71 · Erik Karlsson · 49e93747 · 42db9e71 · 42db9e71 · 42db9e71
Commit 42db9e71 authored Feb 12, 2015 by Erik Karlsson
3 changed files
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -65,17 +65,32 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
    switch (src.type()) {
        case CV_8U:
            parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<uchar, int, unsigned int>(
+                FastNlMeansDenoisingInvoker<uchar, int, unsigned>(
                    src, dst, templateWindowSize, searchWindowSize, h));
            break;
        case CV_8UC2:
            parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned int>(
+                FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned>(
                    src, dst, templateWindowSize, searchWindowSize, h));
            break;
        case CV_8UC3:
            parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned int>(
+                FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16U:
+            parallel_for_(cv::Range(0, src.rows),
+                FastNlMeansDenoisingInvoker<unsigned short, int64, uint64>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16UC2:
+            parallel_for_(cv::Range(0, src.rows),
+                FastNlMeansDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16UC3:
+            parallel_for_(cv::Range(0, src.rows),
+                FastNlMeansDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64>(
                    src, dst, templateWindowSize, searchWindowSize, h));
            break;
        default:
@@ -181,13 +196,31 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
            break;
        case CV_8UC2:
            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned int>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned>(
                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
                    dst, templateWindowSize, searchWindowSize, h));
            break;
        case CV_8UC3:
            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned int>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16U:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<ushort, int64, uint64>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16UC2:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16UC3:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64>(
                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
                    dst, templateWindowSize, searchWindowSize, h));
            break;

--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -123,11 +123,13 @@ FastNlMeansDenoisingInvoker<T, IT, UIT>::FastNlMeansDenoisingInvoker(
    // precalc weight for every possible l2 dist between blocks
    // additional optimization of precalced weights to replace division(averaging) by binary shift
-    // squared distances are truncated to 16 bits to get a reasonable table size
+    // squared distances are truncated to 24 bits to avoid unreasonable table sizes
+    // TODO: uses lots of memory and loses precision with 16-bit images ????
+    const size_t TABLE_MAX_BITS = 24;
    CV_Assert(template_window_size_ <= 46340); // sqrt(INT_MAX)
    int template_window_size_sq = template_window_size_ * template_window_size_;
-    almost_template_window_size_sq_bin_shift_ =
+    almost_template_window_size_sq_bin_shift_ = getNearestPowerOf2(template_window_size_sq) +
-        getNearestPowerOf2(template_window_size_sq) + 2*pixelInfo<T>::sampleBits() - 16;
+        std::max(2*pixelInfo<T>::sampleBits(), TABLE_MAX_BITS) - TABLE_MAX_BITS;
    double almost_dist2actual_dist_multiplier = ((double)(1 << almost_template_window_size_sq_bin_shift_)) / template_window_size_sq;
    IT max_dist =
@@ -139,7 +141,7 @@ FastNlMeansDenoisingInvoker<T, IT, UIT>::FastNlMeansDenoisingInvoker(
    for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
    {
        double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
+        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * pixelInfo<T>::channels)));
        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
            weight = 0;
@@ -232,7 +234,7 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) co
            // calc weights
            IT estimation[3], weights_sum = 0;
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
+            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
                estimation[channel_num] = 0;
            for (int y = 0; y < search_window_size_; y++)
@@ -250,7 +252,7 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) co
                }
            }
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
+            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
                estimation[channel_num] = (static_cast<UIT>(estimation[channel_num]) + weights_sum/2) / weights_sum;
            dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);

--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -131,12 +131,15 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::FastNlMeansMultiDenoisingInvoker(
    // precalc weight for every possible l2 dist between blocks
    // additional optimization of precalced weights to replace division(averaging) by binary shift
-    // squared distances are truncated to 16 bits to get a reasonable table size
+    // squared distances are truncated to 24 bits to avoid unreasonable table sizes
+    // TODO: uses lots of memory and loses precision with 16-bit images ????
+    const size_t TABLE_MAX_BITS = 24;
    int template_window_size_sq = template_window_size_ * template_window_size_;
    almost_template_window_size_sq_bin_shift = 0;
    while (1 << almost_template_window_size_sq_bin_shift < template_window_size_sq)
        almost_template_window_size_sq_bin_shift++;
-    almost_template_window_size_sq_bin_shift += 2*pixelInfo<T>::sampleBits() - 16;
+    almost_template_window_size_sq_bin_shift +=
+        std::max(2*pixelInfo<T>::sampleBits(), TABLE_MAX_BITS) - TABLE_MAX_BITS;
    int almost_template_window_size_sq = 1 << almost_template_window_size_sq_bin_shift;
    double almost_dist2actual_dist_multiplier = (double) almost_template_window_size_sq / template_window_size_sq;
@@ -150,7 +153,7 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::FastNlMeansMultiDenoisingInvoker(
    for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
    {
        double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
+        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * pixelInfo<T>::channels)));
        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
            weight = 0;
@@ -254,7 +257,7 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::operator() (const Range& rang
            IT weights_sum = 0;
            IT estimation[3];
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
+            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
                estimation[channel_num] = 0;
            for (int d = 0; d < temporal_window_size_; d++)
@@ -279,8 +282,8 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::operator() (const Range& rang
                }
            }
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
+            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
-                estimation[channel_num] = (static_cast<UIT>(estimation[channel_num]) + weights_sum / 2) / weights_sum; // ????
+                estimation[channel_num] = (static_cast<UIT>(estimation[channel_num]) + weights_sum / 2) / weights_sum;
            dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);