Merge pull request #10150 from alalek:dnn_ocl_refactor_pooling

bd6a6690 · Vadim Pisarevsky · f071a48e · 13f37466 · bd6a6690 · bd6a6690
Commit bd6a6690 authored Nov 24, 2017 by Vadim Pisarevsky
Showing with 107 additions and 116 deletions

ocl4dnn.hpp modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp +0 -3

ocl4dnn_pool.cpp modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp +63 -63

ocl4dnn_pooling.cl modules/dnn/src/opencl/ocl4dnn_pooling.cl +44 -50

No files found.
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@@ -351,8 +351,6 @@ class OCL4DNNPool
                     UMat& top_data,
                     UMat& top_mask);
    private:
-        UMat mask_idx_;
        // Pooling parameters
        std::vector<int32_t> pad_;
        std::vector<int32_t> stride_;
@@ -362,7 +360,6 @@ class OCL4DNNPool
        ocl4dnnPoolingMethod_t pool_method_;
        int32_t count_;
-        int32_t batch_size_;
        int32_t channels_;
        int32_t kernel_h_;
        int32_t kernel_w_;

--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
@@ -54,7 +54,6 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
    int dims = config.in_shape.size();
    int spatial_dims = 2;
-    batch_size_ = config.in_shape[0];
    channels_ = config.channels;
    pool_method_ = config.pool_method;
@@ -88,7 +87,7 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
 template<typename Dtype>
 OCL4DNNPool<Dtype>::~OCL4DNNPool()
 {
-    mask_idx_.release();
+    // nothing
 }
 template<typename Dtype>
@@ -99,99 +98,100 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
    bool ret = true;
    size_t global[] = { 128 * 128 };
    size_t local[] = { 128 };
-    cl_uint argIdx = 0;
    // support 2D case
    switch (pool_method_)
    {
    case LIBDNN_POOLING_METHOD_MAX:
        {
-            if (top_mask.empty() && mask_idx_.empty())
+            bool haveMask = !top_mask.empty();
-            {
+            ocl::Kernel oclk_max_pool_forward(
-                mask_idx_.create(1, count_, CV_32FC1);
+                haveMask ? CL_KERNEL_SELECT("max_pool_forward_mask") : CL_KERNEL_SELECT("max_pool_forward"),
-            }
+                ocl::dnn::ocl4dnn_pooling_oclsrc,
-            ocl::Kernel oclk_max_pool_forward(CL_KERNEL_SELECT("max_pool_forward"),
+                format("-D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
-                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+                       " -D STRIDE_W=%d -D STRIDE_H=%d"
+                       " -D PAD_W=%d -D PAD_H=%d%s",
+                       kernel_w_, kernel_h_,
+                       stride_w_, stride_h_,
+                       pad_w_, pad_h_,
+                       haveMask ? " -D HAVE_MASK=1" : ""
+                ));
            if (oclk_max_pool_forward.empty())
                return false;
-            argIdx = 0;
+            oclk_max_pool_forward.args(
-            oclk_max_pool_forward.set(argIdx++, count_);
+                count_,
-            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+                ocl::KernelArg::PtrReadOnly(bottom),
-            oclk_max_pool_forward.set(argIdx++, batch_size_);
+                channels_,
-            oclk_max_pool_forward.set(argIdx++, channels_);
+                height_,
-            oclk_max_pool_forward.set(argIdx++, height_);
+                width_,
-            oclk_max_pool_forward.set(argIdx++, width_);
+                pooled_height_,
-            oclk_max_pool_forward.set(argIdx++, pooled_height_);
+                pooled_width_,
-            oclk_max_pool_forward.set(argIdx++, pooled_width_);
+                ocl::KernelArg::PtrWriteOnly(top),
-            oclk_max_pool_forward.set(argIdx++, kernel_h_);
+                ocl::KernelArg::PtrWriteOnly(top_mask)
-            oclk_max_pool_forward.set(argIdx++, kernel_w_);
+            );
-            oclk_max_pool_forward.set(argIdx++, stride_h_);
-            oclk_max_pool_forward.set(argIdx++, stride_w_);
-            oclk_max_pool_forward.set(argIdx++, pad_h_);
-            oclk_max_pool_forward.set(argIdx++, pad_w_);
-            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
-            oclk_max_pool_forward.set(argIdx++, mask_idx_.empty() ? 0 : 1);
-            if (mask_idx_.empty())
-                oclk_max_pool_forward.set(argIdx++, (void *)NULL);
-            else
-                oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(mask_idx_));
-            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top_mask));
            ret = oclk_max_pool_forward.run(1, global, local, false);
        }
        break;
    case LIBDNN_POOLING_METHOD_AVE:
        {
+            CV_Assert(top_mask.empty());
            ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"),
-                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+                ocl::dnn::ocl4dnn_pooling_oclsrc,
+                format("-D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
+                       " -D STRIDE_W=%d -D STRIDE_H=%d"
+                       " -D PAD_W=%d -D PAD_H=%d",
+                       kernel_w_, kernel_h_,
+                       stride_w_, stride_h_,
+                       pad_w_, pad_h_
+                ));
            if (oclk_ave_pool_forward.empty())
                return false;
-            argIdx = 0;
+            oclk_ave_pool_forward.args(
-            oclk_ave_pool_forward.set(argIdx++, count_);
+                count_,
-            oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+                ocl::KernelArg::PtrReadOnly(bottom),
-            oclk_ave_pool_forward.set(argIdx++, batch_size_);
+                channels_,
-            oclk_ave_pool_forward.set(argIdx++, channels_);
+                height_,
-            oclk_ave_pool_forward.set(argIdx++, height_);
+                width_,
-            oclk_ave_pool_forward.set(argIdx++, width_);
+                pooled_height_,
-            oclk_ave_pool_forward.set(argIdx++, pooled_height_);
+                pooled_width_,
-            oclk_ave_pool_forward.set(argIdx++, pooled_width_);
+                ocl::KernelArg::PtrWriteOnly(top)
-            oclk_ave_pool_forward.set(argIdx++, kernel_h_);
+            );
-            oclk_ave_pool_forward.set(argIdx++, kernel_w_);
-            oclk_ave_pool_forward.set(argIdx++, stride_h_);
-            oclk_ave_pool_forward.set(argIdx++, stride_w_);
-            oclk_ave_pool_forward.set(argIdx++, pad_h_);
-            oclk_ave_pool_forward.set(argIdx++, pad_w_);
-            oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
            ret = oclk_ave_pool_forward.run(1, global, local, false);
        }
        break;
    case LIBDNN_POOLING_METHOD_STO:
        {
+            CV_Assert(top_mask.empty());
            ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"),
-                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+                ocl::dnn::ocl4dnn_pooling_oclsrc,
+                format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
+                       " -D STRIDE_W=%d -D STRIDE_H=%d",
+                       kernel_w_, kernel_h_,
+                       stride_w_, stride_h_
+                ));
            if (oclk_sto_pool_forward.empty())
                return false;
-            argIdx = 0;
+            oclk_sto_pool_forward.args(
-            oclk_sto_pool_forward.set(argIdx++, count_);
+                count_,
-            oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+                ocl::KernelArg::PtrReadOnly(bottom),
-            oclk_sto_pool_forward.set(argIdx++, batch_size_);
+                channels_,
-            oclk_sto_pool_forward.set(argIdx++, channels_);
+                height_,
-            oclk_sto_pool_forward.set(argIdx++, height_);
+                width_,
-            oclk_sto_pool_forward.set(argIdx++, width_);
+                pooled_height_,
-            oclk_sto_pool_forward.set(argIdx++, pooled_height_);
+                pooled_width_,
-            oclk_sto_pool_forward.set(argIdx++, pooled_width_);
+                ocl::KernelArg::PtrWriteOnly(top)
-            oclk_sto_pool_forward.set(argIdx++, kernel_h_);
+            );
-            oclk_sto_pool_forward.set(argIdx++, kernel_w_);
-            oclk_sto_pool_forward.set(argIdx++, stride_h_);
-            oclk_sto_pool_forward.set(argIdx++, stride_w_);
-            oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
            ret = oclk_sto_pool_forward.run(1, global, local, false);
        }

--- a/modules/dnn/src/opencl/ocl4dnn_pooling.cl
+++ b/modules/dnn/src/opencl/ocl4dnn_pooling.cl
@@ -44,14 +44,23 @@
 #define TEMPLATE(name,type) CONCAT(name,type)
 #define Dtype float
-void TEMPLATE(max_pool_forward_impl, Dtype)(
+#if defined KERNEL_MAX_POOL
-    const int nthreads, __global const Dtype* bottom_data, const int num,
+__kernel void
+#ifdef HAVE_MASK
+    TEMPLATE(max_pool_forward_mask, Dtype)
+#else
+    TEMPLATE(max_pool_forward, Dtype)
+#endif
+(
+    const int nthreads, __global const Dtype* bottom_data,
    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
+    const int pooled_height, const int pooled_width,
-    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    __global Dtype* top_data
-    const int pad_w,
+#ifdef HAVE_MASK
-    __global Dtype* top_data,
+    , __global Dtype* mask
-    const int use_mask, __global int* mask, __global Dtype* top_mask, bool no_mask)
+#endif
+)
 {
  for (int index = get_global_id(0); index < nthreads;
      index += get_global_size(0))
@@ -60,10 +69,10 @@ void TEMPLATE(max_pool_forward_impl, Dtype)(
    const int ph = (index / pooled_width) % pooled_height;
    const int c = (index / pooled_width / pooled_height) % channels;
    const int n = index / pooled_width / pooled_height / channels;
-    int hstart = ph * stride_h - pad_h;
+    int hstart = ph * STRIDE_H - PAD_H;
-    int wstart = pw * stride_w - pad_w;
+    int wstart = pw * STRIDE_W - PAD_W;
-    const int hend = min(hstart + kernel_h, height);
+    const int hend = min(hstart + KERNEL_H, height);
-    const int wend = min(wstart + kernel_w, width);
+    const int wend = min(wstart + KERNEL_W, width);
    hstart = max(hstart, (int)0);
    wstart = max(wstart, (int)0);
    Dtype maxval = -FLT_MAX;
@@ -79,38 +88,19 @@ void TEMPLATE(max_pool_forward_impl, Dtype)(
      }
    }
    top_data[index] = maxval;
-    if (!no_mask) {
+#ifdef HAVE_MASK
-      if (use_mask == 1) {
+    mask[index] = maxidx;
-        mask[index] = maxidx;
+#endif
-      } else {
-        top_mask[index] = maxidx;
-      }
-    }
  }
 }
-__kernel void TEMPLATE(max_pool_forward, Dtype)(
+#elif defined KERNEL_AVE_POOL
-    const int nthreads, __global const Dtype* bottom_data, const int num,
-    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
-    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-    const int pad_w,
-    __global Dtype* top_data,
-    const int use_mask, __global int* mask, __global Dtype* top_mask)
-{
-    TEMPLATE(max_pool_forward_impl, Dtype)(
-      nthreads, bottom_data, num, channels, height, width,
-      pooled_height, pooled_width, kernel_h,
-      kernel_w, stride_h, stride_w, pad_h, pad_w, top_data, use_mask, mask, top_mask, false
-    );
-}
 __kernel void TEMPLATE(ave_pool_forward, Dtype)(
-    const int nthreads, __global const Dtype* const bottom_data, const int num,
+    const int nthreads, __global const Dtype* const bottom_data,
    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
+    const int pooled_height, const int pooled_width,
-    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    __global Dtype* top_data)
-    const int pad_w, __global Dtype* top_data)
 {
  for (int index = get_global_id(0); index < nthreads;
      index += get_global_size(0))
@@ -120,10 +110,10 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
      const int ph = (index / pooled_width) % pooled_height;
      const int c = (index / pooled_width / pooled_height) % channels;
      const int n = index / pooled_width / pooled_height / channels;
-      int hstart = ph * stride_h - pad_h;
+      int hstart = ph * STRIDE_H - PAD_H;
-      int wstart = pw * stride_w - pad_w;
+      int wstart = pw * STRIDE_W - PAD_W;
-      int hend = min(hstart + kernel_h, height + pad_h);
+      int hend = min(hstart + KERNEL_H, height + PAD_H);
-      int wend = min(wstart + kernel_w, width + pad_w);
+      int wend = min(wstart + KERNEL_W, width + PAD_W);
      const int pool_size = (hend - hstart) * (wend - wstart);
      hstart = max(hstart, (int)0);
      wstart = max(wstart, (int)0);
@@ -142,11 +132,12 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
  }
 }
+#elif defined KERNEL_STO_POOL
 __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
-    const int nthreads, __global const Dtype* const bottom_data, const int num,
+    const int nthreads, __global const Dtype* const bottom_data,
    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
+    const int pooled_height, const int pooled_width,
-    const int kernel_w, const int stride_h, const int stride_w,
    __global Dtype* top_data)
 {
  for (int index = get_global_id(0); index < nthreads;
@@ -156,10 +147,10 @@ __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
    const int ph = (index / pooled_width) % pooled_height;
    const int c = (index / pooled_width / pooled_height) % channels;
    const int n = index / pooled_width / pooled_height / channels;
-    const int hstart = ph * stride_h;
+    const int hstart = ph * STRIDE_H;
-    const int hend = min(hstart + kernel_h, height);
+    const int hend = min(hstart + KERNEL_H, height);
-    const int wstart = pw * stride_w;
+    const int wstart = pw * STRIDE_W;
-    const int wend = min(wstart + kernel_w, width);
+    const int wend = min(wstart + KERNEL_W, width);
    // We set cumsum to be 0 to avoid divide-by-zero problems
    Dtype cumsum = FLT_MIN;
    Dtype cumvalues = 0.;
@@ -168,10 +159,13 @@ __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
    // First pass: get sum
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_slice[h * width + w];
+        Dtype v = bottom_slice[h * width + w];
-        cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];
+        cumsum += v;
+        cumvalues += v * v;
      }
    }
    top_data[index] = cumvalues / cumsum;
  }
 }
+#endif // KERNEL_*