Merge pull request #10228 from pengli:dnn_new

6074f92d · Alexander Alekhin · 0b688cd2 · 59cbaca4 · 6074f92d · 6074f92d
Commit 6074f92d authored Dec 06, 2017 by Alexander Alekhin
10 changed files
--- a/modules/dnn/src/layers/concat_layer.cpp
+++ b/modules/dnn/src/layers/concat_layer.cpp
@@ -185,12 +185,13 @@ public:
        outs.getUMatVector(outputs);
        int cAxis = clamp(axis, inputs[0].dims);
-        if (!(cAxis == 1 && outputs[0].dims == 4 && !padding))
+        if (padding)
            return false;
        int bottom_concat_axis;
-        int concat_size = inputs[0].size[2] * inputs[0].size[3];
+        int concat_size = total(shape(inputs[0]), cAxis + 1);
-        int top_concat_axis = outputs[0].size[1];
+        int top_concat_axis = outputs[0].size[cAxis];
+        int num_concats = total(shape(inputs[0]), 0, cAxis);
        int offset_concat_axis = 0;
        UMat& outMat = outputs[0];
        String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0].type()) + String(" ");
@@ -202,12 +203,12 @@ public:
                return false;
            UMat& inpMat = inputs[i];
-            bottom_concat_axis = inputs[i].size[1];
+            bottom_concat_axis = inputs[i].size[cAxis];
            size_t nthreads = inputs[i].total();
            kernel.set(0, (int)nthreads);
            kernel.set(1, ocl::KernelArg::PtrReadOnly(inpMat));
-            kernel.set(2, (int)inputs[i].size[0]);
+            kernel.set(2, (int)num_concats);
            kernel.set(3, (int)concat_size);
            kernel.set(4, (int)top_concat_axis);
            kernel.set(5, (int)bottom_concat_axis);

--- a/modules/dnn/src/layers/detection_output_layer.cpp
+++ b/modules/dnn/src/layers/detection_output_layer.cpp
--- a/modules/dnn/src/layers/permute_layer.cpp
+++ b/modules/dnn/src/layers/permute_layer.cpp
@@ -44,6 +44,7 @@
 #include "layers_common.hpp"
 #include <float.h>
 #include <algorithm>
+#include "opencl_kernels_dnn.hpp"
 namespace cv
 {
@@ -173,6 +174,24 @@ public:
        CV_Assert((int)_numAxes == inp0.dims);
        computeStrides(shape(*inputs[0]), shape(outputs[0]));
+#ifdef HAVE_OPENCL
+        if (uorder.empty())
+        {
+            std::vector<int> orderVec(_order.begin(), _order.end());;
+            Mat morder(1, orderVec.size(), CV_32SC1, &orderVec[0]);
+            std::vector<int> oldStrideVec(_oldStride.begin(), _oldStride.end());
+            Mat mold_stride(1, _oldStride.size(), CV_32SC1, &oldStrideVec[0]);
+            std::vector<int> newStrideVec(_newStride.begin(), _newStride.end());
+            Mat mnew_stride(1, newStrideVec.size(), CV_32SC1, &newStrideVec[0]);
+            morder.copyTo(uorder);
+            mold_stride.copyTo(uold_stride);
+            mnew_stride.copyTo(unew_stride);
+        }
+#endif
    }
    class PermuteInvoker : public ParallelLoopBody
@@ -247,11 +266,47 @@ public:
        }
    };
+#ifdef HAVE_OPENCL
+    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
+    {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+        if (!_needsPermute)
+            return false;
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc);
+            kernel.set(0, (int)_count);
+            kernel.set(1, ocl::KernelArg::PtrReadOnly(inputs[i]));
+            kernel.set(2, ocl::KernelArg::PtrReadOnly(uorder));
+            kernel.set(3, ocl::KernelArg::PtrReadOnly(uold_stride));
+            kernel.set(4, ocl::KernelArg::PtrReadOnly(unew_stride));
+            kernel.set(5, (int)_numAxes);
+            kernel.set(6, ocl::KernelArg::PtrWriteOnly(outputs[i]));
+            if (!kernel.run(1, &_count, NULL, false))
+                return false;
+        }
+        return true;
+    }
+#endif
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
    }
@@ -325,6 +380,10 @@ public:
    std::vector<size_t> _newStride;
    bool _needsPermute;
+#ifdef HAVE_OPENCL
+    UMat uorder, uold_stride, unew_stride;
+#endif
    size_t _numAxes;
 };

--- a/modules/dnn/src/layers/region_layer.cpp
+++ b/modules/dnn/src/layers/region_layer.cpp
@@ -44,6 +44,7 @@
 #include <opencv2/dnn/shape_utils.hpp>
 #include <opencv2/dnn/all_layers.hpp>
 #include <iostream>
+#include "opencl_kernels_dnn.hpp"
 namespace cv
 {
@@ -114,11 +115,83 @@ public:
        }
    }
+#ifdef HAVE_OPENCL
+    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
+    {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+        if (useSoftmaxTree) {   // Yolo 9000
+            CV_Error(cv::Error::StsNotImplemented, "Yolo9000 is not implemented");
+            return false;
+        }
+        CV_Assert(inputs.size() >= 1);
+        int const cell_size = classes + coords + 1;
+        UMat blob_umat = blobs[0].getUMat(ACCESS_READ);
+        for (size_t ii = 0; ii < outputs.size(); ii++)
+        {
+            UMat& inpBlob = inputs[ii];
+            UMat& outBlob = outputs[ii];
+            int rows = inpBlob.size[1];
+            int cols = inpBlob.size[2];
+            ocl::Kernel logistic_kernel("logistic_activ", ocl::dnn::region_oclsrc);
+            size_t global = rows*cols*anchors;
+            logistic_kernel.set(0, (int)global);
+            logistic_kernel.set(1, ocl::KernelArg::PtrReadOnly(inpBlob));
+            logistic_kernel.set(2, (int)cell_size);
+            logistic_kernel.set(3, ocl::KernelArg::PtrWriteOnly(outBlob));
+            logistic_kernel.run(1, &global, NULL, false);
+            if (useSoftmax)
+            {
+                // Yolo v2
+                // softmax activation for Probability, for each grid cell (X x Y x Anchor-index)
+                ocl::Kernel softmax_kernel("softmax_activ", ocl::dnn::region_oclsrc);
+                size_t nthreads = rows*cols*anchors;
+                softmax_kernel.set(0, (int)nthreads);
+                softmax_kernel.set(1, ocl::KernelArg::PtrReadOnly(inpBlob));
+                softmax_kernel.set(2, ocl::KernelArg::PtrReadOnly(blob_umat));
+                softmax_kernel.set(3, (int)cell_size);
+                softmax_kernel.set(4, (int)classes);
+                softmax_kernel.set(5, (int)classfix);
+                softmax_kernel.set(6, (int)rows);
+                softmax_kernel.set(7, (int)cols);
+                softmax_kernel.set(8, (int)anchors);
+                softmax_kernel.set(9, (float)thresh);
+                softmax_kernel.set(10, ocl::KernelArg::PtrWriteOnly(outBlob));
+                if (!softmax_kernel.run(1, &nthreads, NULL, false))
+                    return false;
+            }
+            if (nmsThreshold > 0) {
+                Mat mat = outBlob.getMat(ACCESS_WRITE);
+                float *dstData = mat.ptr<float>();
+                do_nms_sort(dstData, rows*cols*anchors, nmsThreshold);
+                //do_nms(dstData, rows*cols*anchors, nmsThreshold);
+            }
+        }
+        return true;
+    }
+#endif
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
    }

--- a/modules/dnn/src/layers/reorg_layer.cpp
+++ b/modules/dnn/src/layers/reorg_layer.cpp
@@ -44,6 +44,7 @@
 #include <opencv2/dnn/shape_utils.hpp>
 #include <opencv2/dnn/all_layers.hpp>
 #include <iostream>
+#include "opencl_kernels_dnn.hpp"
 namespace cv
 {
@@ -86,11 +87,54 @@ public:
        return backendId == DNN_BACKEND_DEFAULT;
    }
+#ifdef HAVE_OPENCL
+    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
+    {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+        String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0].type()) + String(" ");
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            ocl::Kernel kernel("reorg", ocl::dnn::reorg_oclsrc, buildopt);
+            if (kernel.empty())
+                return false;
+            UMat& srcBlob = inputs[i];
+            UMat& dstBlob = outputs[0];
+            int channels = srcBlob.size[1];
+            int height = srcBlob.size[2];
+            int width = srcBlob.size[3];
+            size_t nthreads = channels * height * width;
+            kernel.set(0, (int)nthreads);
+            kernel.set(1, ocl::KernelArg::PtrReadOnly(srcBlob));
+            kernel.set(2, (int)channels);
+            kernel.set(3, (int)height);
+            kernel.set(4, (int)width);
+            kernel.set(5, (int)reorgStride);
+            kernel.set(6, ocl::KernelArg::PtrWriteOnly(dstBlob));
+            if (!kernel.run(1, &nthreads, NULL, false))
+                return false;
+        }
+        return true;
+    }
+#endif
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
    }

--- a/modules/dnn/src/layers/reshape_layer.cpp
+++ b/modules/dnn/src/layers/reshape_layer.cpp
@@ -182,11 +182,40 @@ public:
        return true;
    }
+    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
+    {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            UMat srcBlob = inputs[i];
+            void *src_handle = inputs[i].handle(ACCESS_READ);
+            void *dst_handle = outputs[i].handle(ACCESS_WRITE);
+            if (src_handle != dst_handle)
+            {
+                MatShape outShape = shape(outputs[i]);
+                UMat umat = srcBlob.reshape(1, (int)outShape.size(), &outShape[0]);
+                umat.copyTo(outputs[i]);
+            }
+        }
+        outs.assign(outputs);
+        return true;
+    }
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
    }

--- a/modules/dnn/src/opencl/detection_output.cl
+++ b/modules/dnn/src/opencl/detection_output.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#define Dtype float
+#define Dtype4 float4
+__kernel void DecodeBBoxesCORNER(const int nthreads,
+                                 __global const Dtype* loc_data,
+                                 __global const Dtype* prior_data,
+                                 const int variance_encoded_in_target,
+                                 const int num_priors,
+                                 const int share_location,
+                                 const int num_loc_classes,
+                                 const int background_label_id,
+                                 const int clip_bbox,
+                                 __global Dtype* bbox_data)
+{
+    for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
+    {
+        Dtype bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax;
+        const int i = index % 4;
+        const int p = ((index / 4 / num_loc_classes) % num_priors) * 4;
+        const int c = (index / 4) % num_loc_classes;
+        int label = share_location ? -1 : c;
+        if (label == background_label_id)
+            return; // Ignore background class.
+        Dtype4 loc_vec = vload4(0, loc_data + index - i);
+        Dtype4 bbox_vec, prior_variance;
+        if (variance_encoded_in_target)
+        {
+            bbox_vec = loc_vec;
+        } else {
+            const int start_index = num_priors * 4 + p;
+            prior_variance = vload4(0, prior_data + start_index);
+            bbox_vec = loc_vec * prior_variance;
+        }
+        bbox_xmin = bbox_vec.x;
+        bbox_ymin = bbox_vec.y;
+        bbox_xmax = bbox_vec.z;
+        bbox_ymax = bbox_vec.w;
+        Dtype4 prior_vec = vload4(0, prior_data + p);
+        Dtype val;
+        switch (i)
+        {
+            case 0:
+                val = prior_vec.x + bbox_xmin;
+                break;
+            case 1:
+                val = prior_vec.y + bbox_ymin;
+                break;
+            case 2:
+                val = prior_vec.z + bbox_xmax;
+                break;
+            case 3:
+                val = prior_vec.w + bbox_ymax;
+                break;
+        }
+        if (clip_bbox)
+            val = max(min(val, (Dtype)1.), (Dtype)0.);
+        bbox_data[index] = val;
+    }
+}
+__kernel void DecodeBBoxesCENTER_SIZE(const int nthreads,
+                                      __global const Dtype* loc_data,
+                                      __global const Dtype* prior_data,
+                                      const int variance_encoded_in_target,
+                                      const int num_priors,
+                                      const int share_location,
+                                      const int num_loc_classes,
+                                      const int background_label_id,
+                                      const int clip_bbox,
+                                      __global Dtype* bbox_data)
+{
+    for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
+    {
+        Dtype bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax;
+        const int i = index % 4;
+        const int p = ((index / 4 / num_loc_classes) % num_priors) * 4;
+        const int c = (index / 4) % num_loc_classes;
+        int label = share_location ? -1 : c;
+        if (label == background_label_id)
+            return; // Ignore background class.
+        Dtype4 loc_vec = vload4(0, loc_data + index - i);
+        Dtype4 bbox_vec, prior_variance;
+        if (variance_encoded_in_target)
+        {
+            bbox_vec = loc_vec;
+        } else {
+            const int start_index = num_priors * 4 + p;
+            prior_variance = vload4(0, prior_data + start_index);
+            bbox_vec = loc_vec * prior_variance;
+        }
+        bbox_xmin = bbox_vec.x;
+        bbox_ymin = bbox_vec.y;
+        bbox_xmax = bbox_vec.z;
+        bbox_ymax = bbox_vec.w;
+        Dtype4 prior_vec = vload4(0, prior_data + p);
+        Dtype prior_width = prior_vec.z - prior_vec.x;
+        Dtype prior_height = prior_vec.w - prior_vec.y;
+        Dtype prior_center_x = (prior_vec.x + prior_vec.z) * .5;
+        Dtype prior_center_y = (prior_vec.y + prior_vec.w) * .5;
+        Dtype decode_bbox_center_x, decode_bbox_center_y;
+        Dtype decode_bbox_width, decode_bbox_height;
+        decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;
+        decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;
+        decode_bbox_width = exp(bbox_xmax) * prior_width;
+        decode_bbox_height = exp(bbox_ymax) * prior_height;
+        Dtype val;
+        switch (i)
+        {
+            case 0:
+                val = decode_bbox_center_x - decode_bbox_width * .5;
+                break;
+            case 1:
+                val = decode_bbox_center_y - decode_bbox_height * .5;
+                break;
+            case 2:
+                val = decode_bbox_center_x + decode_bbox_width * .5;
+                break;
+            case 3:
+                val = decode_bbox_center_y + decode_bbox_height * .5;
+                break;
+        }
+        if (clip_bbox)
+            val = max(min(val, (Dtype)1.), (Dtype)0.);
+        bbox_data[index] = val;
+    }
+}
--- a/modules/dnn/src/opencl/permute.cl
+++ b/modules/dnn/src/opencl/permute.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#define Dtype float
+__kernel void permute(const int nthreads,
+                      __global Dtype* bottom_data,
+                      global int* permute_order,
+                      global int* oldStride,
+                      global int* newStride,
+                      const int num_axes,
+                      __global Dtype* top_data)
+{
+    for (int i = get_global_id(0); i < nthreads; i += get_global_size(0))
+    {
+        int oldPosition = 0;
+        int newPosition = i;
+        for (int j = 0; j < num_axes; ++j)
+        {
+            int order = permute_order[j];
+            oldPosition += (newPosition / newStride[j]) * oldStride[order];
+            newPosition %= newStride[j];
+        }
+        top_data[i] = bottom_data[oldPosition];
+    }
+}
--- a/modules/dnn/src/opencl/region.cl
+++ b/modules/dnn/src/opencl/region.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#define Dtype float
+__kernel void logistic_activ(const int count,
+                             __global const Dtype* src,
+                             const int cell_size,
+                             __global Dtype* dst)
+{
+    for (int i = get_global_id(0); i < count; i += get_global_size(0))
+    {
+        int index = cell_size * i;
+        Dtype x = src[index + 4];
+        dst[index + 4] = 1.f / (1.f + exp(-x));
+    }
+}
+__kernel void softmax_activ(const int count,
+                            __global const Dtype* src,
+                            __global const Dtype* biasData,
+                            const int cell_size,
+                            const int classes,
+                            const int classfix,
+                            const int rows,
+                            const int cols,
+                            const int anchors,
+                            const float thresh,
+                            __global Dtype* dst)
+{
+    for (int index = get_global_id(0); index < count; index += get_global_size(0))
+    {
+        int box_index = index * cell_size;
+        float largest = -FLT_MAX;
+        __global const Dtype *input = src + box_index + 5;
+        __global Dtype *output = dst + box_index + 5;
+        for (int i = 0; i < classes; ++i)
+            largest = fmax(largest, input[i]);
+        float sum = 0;
+        for (int i = 0; i < classes; ++i)
+        {
+            float e = exp((input[i] - largest));
+            sum += e;
+            output[i] = e;
+        }
+        int y = index / anchors / cols;
+        int x = index / anchors % cols;
+        int a = index - anchors * (x + y * cols);
+        float scale = dst[box_index + 4];
+        if (classfix == -1 && scale < .5) scale = 0;
+        float v1 = src[box_index + 0];
+        float v2 = src[box_index + 1];
+        float l1 = 1.f / (1.f + exp(-v1));
+        float l2 = 1.f / (1.f + exp(-v2));
+        dst[box_index + 0] = (x + l1) / cols;
+        dst[box_index + 1] = (y + l2) / rows;
+        dst[box_index + 2] = exp(src[box_index + 2]) * biasData[2 * a] / cols;
+        dst[box_index + 3] = exp(src[box_index + 3]) * biasData[2 * a + 1] / rows;
+        for (int i = 0; i < classes; ++i)
+        {
+            float prob = scale * output[i] / sum;
+            output[i] = (prob > thresh) ? prob : 0;
+        }
+    }
+}
--- a/modules/dnn/src/opencl/reorg.cl
+++ b/modules/dnn/src/opencl/reorg.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+__kernel void reorg(const int count,
+                    __global const Dtype* src,
+                    const int channels,
+                    const int height,
+                    const int width,
+                    const int reorgStride,
+                    __global Dtype* dst)
+{
+    for (int index = get_global_id(0); index < count; index += get_global_size(0))
+    {
+        int k = index / (height * width);
+        int j = (index - (k * height * width)) / width;
+        int i = (index - (k * height * width)) % width;
+        int out_c = channels / (reorgStride*reorgStride);
+        int c2 = k % out_c;
+        int offset = k / out_c;
+        int w2 = i*reorgStride + offset % reorgStride;
+        int h2 = j*reorgStride + offset / reorgStride;
+        int in_index = w2 + width*reorgStride*(h2 + height*reorgStride*c2);
+        dst[index] = src[in_index];
+    }
+}