[moved from opencv] Add CV_16UC1 support for cuda::CLAHE

Due to the size limit of shared memory, the histogram is built in global memory for the CV_16UC1 case. The amount of memory needed to build the histogram is 65536 * 4 bytes = 256 KB, while the shared memory limit is typically 48 KB. Added test cases for CV_16UC1 and various clip limits. Added perf tests for CV_16UC1 for both the CPU and CUDA code. There was also a bug in the CV_8UC1 case when redistributing "residual" clipped pixels; adding the test case where the clip limit is 5.0 exposes this bug. Original commit: https://github.com/opencv/opencv/commit/fb8e652c3f20d377e9f935faee370ed28fb60122

[moved from opencv] Add CV_16UC1 support for cuda::CLAHE
Due to the size limit of shared memory, the histogram is built in global memory for the CV_16UC1 case. The amount of memory needed to build the histogram is 65536 * 4 bytes = 256 KB, while the shared memory limit is typically 48 KB. Added test cases for CV_16UC1 and various clip limits. Added perf tests for CV_16UC1 for both the CPU and CUDA code. There was also a bug in the CV_8UC1 case when redistributing "residual" clipped pixels; adding the test case where the clip limit is 5.0 exposes this bug. Original commit: https://github.com/opencv/opencv/commit/fb8e652c3f20d377e9f935faee370ed28fb60122
fa3603a5 · Namgoo Lee · d6895a1b · fa3603a5 · fa3603a5 · fa3603a5
Commit fa3603a5 authored Feb 05, 2019 by Namgoo Lee
4 changed files
--- a/modules/cudaimgproc/perf/perf_histogram.cpp
+++ b/modules/cudaimgproc/perf/perf_histogram.cpp
@@ -183,16 +183,18 @@ PERF_TEST_P(Sz, EqualizeHist,
 //////////////////////////////////////////////////////////////////////
 // CLAHE
-DEF_PARAM_TEST(Sz_ClipLimit, cv::Size, double);
+DEF_PARAM_TEST(Sz_ClipLimit, cv::Size, double, MatType);
 PERF_TEST_P(Sz_ClipLimit, CLAHE,
            Combine(CUDA_TYPICAL_MAT_SIZES,
-                    Values(0.0, 40.0)))
+                    Values(0.0, 40.0),
+                    Values(MatType(CV_8UC1), MatType(CV_16UC1))))
 {
    const cv::Size size = GET_PARAM(0);
    const double clipLimit = GET_PARAM(1);
+    const int type = GET_PARAM(2);
-    cv::Mat src(size, CV_8UC1);
+    cv::Mat src(size, type);
    declare.in(src, WARMUP_RNG);
    if (PERF_RUN_CUDA())

--- a/modules/cudaimgproc/src/cuda/clahe.cu
+++ b/modules/cudaimgproc/src/cuda/clahe.cu
--- a/modules/cudaimgproc/src/histogram.cpp
+++ b/modules/cudaimgproc/src/histogram.cpp
@@ -141,8 +141,9 @@ void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, Stream& _stream)
 namespace clahe
 {
-    void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream);
+    void calcLut_8U(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream);
-    void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream);
+    void calcLut_16U(PtrStepSzus src, PtrStepus lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, PtrStepSzi hist, cudaStream_t stream);
+    template <typename T> void transform(PtrStepSz<T> src, PtrStepSz<T> dst, PtrStep<T> lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream);
 }
 namespace
@@ -170,6 +171,7 @@ namespace
        GpuMat srcExt_;
        GpuMat lut_;
+        GpuMat hist_; // histogram on global memory for CV_16UC1 case
    };
    CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
@@ -186,14 +188,16 @@ namespace
    {
        GpuMat src = _src.getGpuMat();
-        CV_Assert( src.type() == CV_8UC1 );
+        const int type = src.type();
-        _dst.create( src.size(), src.type() );
+        CV_Assert( type == CV_8UC1 || type == CV_16UC1 );
+        _dst.create( src.size(), type );
        GpuMat dst = _dst.getGpuMat();
-        const int histSize = 256;
+        const int histSize = type == CV_8UC1 ? 256 : 65536;
-        ensureSizeIsEnough(tilesX_ * tilesY_, histSize, CV_8UC1, lut_);
+        ensureSizeIsEnough(tilesX_ * tilesY_, histSize, type, lut_);
        cudaStream_t stream = StreamAccessor::getStream(s);
@@ -227,9 +231,18 @@ namespace
            clipLimit = std::max(clipLimit, 1);
        }
-        clahe::calcLut(srcForLut, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), clipLimit, lutScale, stream);
+        if (type == CV_8UC1)
+            clahe::calcLut_8U(srcForLut, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), clipLimit, lutScale, stream);
+        else // type == CV_16UC1
+        {
+            ensureSizeIsEnough(tilesX_ * tilesY_, histSize, CV_32SC1, hist_);
+            clahe::calcLut_16U(srcForLut, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), clipLimit, lutScale, hist_, stream);
+        }
-        clahe::transform(src, dst, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), stream);
+        if (type == CV_8UC1)
+            clahe::transform<uchar>(src, dst, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), stream);
+        else // type == CV_16UC1
+            clahe::transform<ushort>(src, dst, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), stream);
    }
    void CLAHE_Impl::setClipLimit(double clipLimit)

--- a/modules/cudaimgproc/test/test_histogram.cpp
+++ b/modules/cudaimgproc/test/test_histogram.cpp
@@ -236,17 +236,19 @@ namespace
    IMPLEMENT_PARAM_CLASS(ClipLimit, double)
 }
-PARAM_TEST_CASE(CLAHE, cv::cuda::DeviceInfo, cv::Size, ClipLimit)
+PARAM_TEST_CASE(CLAHE, cv::cuda::DeviceInfo, cv::Size, ClipLimit, MatType)
 {
    cv::cuda::DeviceInfo devInfo;
    cv::Size size;
    double clipLimit;
+    int type;
    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
        size = GET_PARAM(1);
        clipLimit = GET_PARAM(2);
+        type = GET_PARAM(3);
        cv::cuda::setDevice(devInfo.deviceID());
    }
@@ -254,7 +256,11 @@ PARAM_TEST_CASE(CLAHE, cv::cuda::DeviceInfo, cv::Size, ClipLimit)
 CUDA_TEST_P(CLAHE, Accuracy)
 {
-    cv::Mat src = randomMat(size, CV_8UC1);
+    cv::Mat src;
+    if (type == CV_8UC1)
+        src = randomMat(size, type);
+    else if (type == CV_16UC1)
+        src = randomMat(size, type, 0, 65535);
    cv::Ptr<cv::cuda::CLAHE> clahe = cv::cuda::createCLAHE(clipLimit);
    cv::cuda::GpuMat dst;
@@ -270,7 +276,8 @@ CUDA_TEST_P(CLAHE, Accuracy)
 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CLAHE, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
-    testing::Values(0.0, 40.0)));
+    testing::Values(0.0, 5.0, 10.0, 20.0, 40.0),
+    testing::Values(MatType(CV_8UC1), MatType(CV_16UC1))));
 }} // namespace