significantly improved parallel non-local means by using granularity parameter…

Significantly improved parallel non-local means by using the granularity parameter in the parallel_for_ loop. Because the algorithm deals with sliding sums, it is essential that each thread has enough work to do; otherwise the algorithm's theoretical complexity increases, and there is, at best, no speedup compared to single-threaded code.

significantly improved parallel non-local means by using granularity parameter…
Significantly improved parallel non-local means by using the granularity parameter in the parallel_for_ loop. Because the algorithm deals with sliding sums, it is essential that each thread has enough work to do; otherwise the algorithm's theoretical complexity increases, and there is, at best, no speedup compared to single-threaded code.
b37aaa83 · Vadim Pisarevsky · feb5b6aa · b37aaa83 · b37aaa83
Commit b37aaa83 authored May 14, 2015 by Vadim Pisarevsky
Hide whitespace changes
Inline Side-by-side

Showing with 41 additions and 14 deletions

denoising.cpp modules/photo/src/denoising.cpp +30 -14

test_denoising.cpp modules/photo/test/test_denoising.cpp +11 -0

No files found.
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -50,42 +50,50 @@ static void fastNlMeansDenoising_( const Mat& src, Mat& dst, const std::vector<f
                                   int templateWindowSize, int searchWindowSize)
 {
    int hn = (int)h.size();
+    double granularity = (double)std::max(1., (double)dst.total()/(1 << 17));

    switch (CV_MAT_CN(src.type())) {
        case 1:
            parallel_for_(cv::Range(0, src.rows),
                          FastNlMeansDenoisingInvoker<ST, IT, UIT, D, int>(
-                              src, dst, templateWindowSize, searchWindowSize, &h[0]));
+                              src, dst, templateWindowSize, searchWindowSize, &h[0]),
+                          granularity);
            break;
        case 2:
            if (hn == 1)
                parallel_for_(cv::Range(0, src.rows),
                              FastNlMeansDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, int>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]),
+                              granularity);
            else
                parallel_for_(cv::Range(0, src.rows),
                              FastNlMeansDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, Vec2i>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]),
+                              granularity);
            break;
        case 3:
            if (hn == 1)
                parallel_for_(cv::Range(0, src.rows),
                              FastNlMeansDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, int>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]),
+                              granularity);
            else
                parallel_for_(cv::Range(0, src.rows),
                              FastNlMeansDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, Vec3i>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]),
+                              granularity);
            break;
        case 4:
            if (hn == 1)
                parallel_for_(cv::Range(0, src.rows),
                              FastNlMeansDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, int>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]),
+                              granularity);
            else
                parallel_for_(cv::Range(0, src.rows),
                              FastNlMeansDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, Vec4i>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]),
+                              granularity);
            break;
        default:
            CV_Error(Error::StsBadArg,
@@ -237,6 +245,7 @@ static void fastNlMeansDenoisingMulti_( const std::vector<Mat>& srcImgs, Mat& ds
                                        int templateWindowSize, int searchWindowSize)
 {
    int hn = (int)h.size();
+    double granularity = (double)std::max(1., (double)dst.total()/(1 << 16));

    switch (srcImgs[0].type())
    {
@@ -244,43 +253,50 @@ static void fastNlMeansDenoisingMulti_( const std::vector<Mat>& srcImgs, Mat& ds
            parallel_for_(cv::Range(0, srcImgs[0].rows),
                          FastNlMeansMultiDenoisingInvoker<uchar, IT, UIT, D, int>(
                              srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                              dst, templateWindowSize, searchWindowSize, &h[0]));
+                              dst, templateWindowSize, searchWindowSize, &h[0]),
+                          granularity);
            break;
        case CV_8UC2:
            if (hn == 1)
                parallel_for_(cv::Range(0, srcImgs[0].rows),
                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, int>(
                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+                                  dst, templateWindowSize, searchWindowSize, &h[0]),
+                              granularity);
            else
                parallel_for_(cv::Range(0, srcImgs[0].rows),
                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, Vec2i>(
                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+                                  dst, templateWindowSize, searchWindowSize, &h[0]),
+                              granularity);
            break;
        case CV_8UC3:
            if (hn == 1)
                parallel_for_(cv::Range(0, srcImgs[0].rows),
                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, int>(
                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+                                  dst, templateWindowSize, searchWindowSize, &h[0]),
+                              granularity);
            else
                parallel_for_(cv::Range(0, srcImgs[0].rows),
                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, Vec3i>(
                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+                                  dst, templateWindowSize, searchWindowSize, &h[0]),
+                              granularity);
            break;
        case CV_8UC4:
            if (hn == 1)
                parallel_for_(cv::Range(0, srcImgs[0].rows),
                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, int>(
                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+                                  dst, templateWindowSize, searchWindowSize, &h[0]),
+                              granularity);
            else
                parallel_for_(cv::Range(0, srcImgs[0].rows),
                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, Vec4i>(
                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+                                  dst, templateWindowSize, searchWindowSize, &h[0]),
+                              granularity);
            break;
        default:
            CV_Error(Error::StsBadArg,

--- a/modules/photo/test/test_denoising.cpp
+++ b/modules/photo/test/test_denoising.cpp
@@ -156,3 +156,14 @@ TEST(Photo_White, issue_2646)

    ASSERT_EQ(0, nonWhitePixelsCount);
 }
+
+TEST(Photo_Denoising, speed)
+{
+    string imgname = string(cvtest::TS::ptr()->get_data_path()) + "shared/5MP.png";
+    Mat src = imread(imgname, 0), dst;
+
+    double t = (double)getTickCount();
+    fastNlMeansDenoising(src, dst, 5, 7, 21);
+    t = (double)getTickCount() - t;
+    printf("execution time: %gms\n", t*1000./getTickFrequency());
+}