Commit 601afeed authored by Vitaliy Lyudvichenko

Adding OCL implementations and public interfaces for Convolution and LRN

parent 50c9e1c9
......@@ -205,6 +205,46 @@ namespace dnn
void forward(std::vector<Blob*> &input, std::vector<Blob> &output);
};
class CV_EXPORTS_W BaseConvolutionLayer : public Layer
{
public:
Size kernel, pad, stride;
};
class CV_EXPORTS_W ConvolutionLayer : public BaseConvolutionLayer
{
public:
static Ptr<BaseConvolutionLayer> create();
static Ptr<BaseConvolutionLayer> create(Size kernel = Size(3, 3), Size pad = Size(0, 0), Size stride = Size(1, 1));
};
class CV_EXPORTS_W DeconvolutionLayer : public BaseConvolutionLayer
{
public:
static Ptr<BaseConvolutionLayer> create();
static Ptr<BaseConvolutionLayer> create(Size kernel = Size(3, 3), Size pad = Size(0, 0), Size stride = Size(1, 1));
};
class CV_EXPORTS_W LRNLayer : public Layer
{
public:
enum
{
CHANNEL_NRM,
SPATIAL_NRM
};
int type;
int size;
double alpha, beta;
};
//! @}
//! @}
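
The new public interface exposes convolution geometry through Size fields and a static factory. A minimal usage sketch, assuming the factory simply stores its arguments in the BaseConvolutionLayer fields (this usage is not part of the diff):

    // Hypothetical call: a 3x3 convolution with 1-pixel padding and unit stride.
    Ptr<BaseConvolutionLayer> conv = ConvolutionLayer::create(Size(3, 3), Size(1, 1), Size(1, 1));
    // conv->kernel, conv->pad and conv->stride are expected to mirror the arguments.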
......
......@@ -120,7 +120,8 @@ namespace dnn //! This namespace is used for dnn module functionality.
String type; //!< Type name which was used for creating layer by layer factory.
Layer();
explicit Layer(const LayerParams &params); //!< Initializes only #name, #type and #blobs fields.
void setParamsFrom(const LayerParams &params); //!< Initializes only #name, #type and #blobs fields.
virtual ~Layer();
};
......
......@@ -48,7 +48,10 @@
namespace cv {
namespace dnn {
std::ostream &operator<< (std::ostream &s, cv::Range &r)
//Useful shortcut
typedef BlobShape Shape;
inline std::ostream &operator<< (std::ostream &s, cv::Range &r)
{
return s << "[" << r.start << ", " << r.end << ")";
}
......@@ -96,8 +99,6 @@ Mat slice(const Mat &m, const _Range &r0, const _Range &r1)
ranges[i] = Range::all();
ranges[0] = r0;
ranges[1] = r1;
// for (int i = 0; i < m.dims; i++)
// std::cout << ranges[i] << "\n";
return m(&ranges[0]);
}
......@@ -128,8 +129,32 @@ Mat slice(const Mat &m, const _Range &r0, const _Range &r1, const _Range &r2, co
return m(&ranges[0]);
}
}
//Traits for switching in polymorphic implementations
template<typename XMat>
struct MatTraits
{
};
}
template<>
struct MatTraits<cv::Mat>
{
enum
{
IS_MAT = 1,
IS_UMAT = 0,
};
};
template<>
struct MatTraits<cv::UMat>
{
enum
{
IS_MAT = 0,
IS_UMAT = 1,
};
};
}
}
#endif
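
The MatTraits specializations give a compile-time flag telling a template whether it was instantiated with cv::Mat (CPU path) or cv::UMat (OpenCL path). A hedged sketch of how a templated forward_ body might branch on it; the dispatch used by the actual implementation files is not shown in this hunk:

    template<typename XMat>
    void forwardExample_()
    {
        if (MatTraits<XMat>::IS_UMAT)
        {
            // OpenCL branch: work on cv::UMat, e.g. call the UMat im2col overload
        }
        else
        {
            // CPU branch: work on cv::Mat
        }
    }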
......@@ -543,6 +543,13 @@ Layer::Layer(const LayerParams &params)
}
void Layer::setParamsFrom(const LayerParams &params)
{
blobs = params.blobs;
name = params.name;
type = params.type;
}
int Layer::inputNameToIndex(String)
{
return -1;
......
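
setParamsFrom() lets a typed factory construct the layer first and then copy over the generic fields, which is the pattern behind the new createConvolutionLayerFromCaffe / createLRNLayerFromCaffe functions registered below. A hypothetical importer using it (MyLayerImpl is an illustrative name, not from this commit):

    Ptr<Layer> createMyLayerFromCaffe(LayerParams &params)
    {
        Ptr<Layer> layer(new MyLayerImpl());
        layer->setParamsFrom(params);   // copies only name, type and blobs
        return layer;
    }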
......@@ -81,9 +81,9 @@ void initModule()
REG_RUNTIME_LAYER_CLASS(Split, SplitLayer)
REG_RUNTIME_LAYER_CLASS(Reshape, ReshapeLayer)
REG_STATIC_LAYER_FUNC(Flatten, createFlattenLayer)
REG_RUNTIME_LAYER_CLASS(Pooling, PoolingLayer)
REG_RUNTIME_LAYER_CLASS(Pooling, PoolingLayerImpl)
REG_RUNTIME_LAYER_CLASS(MVN, MVNLayer)
REG_RUNTIME_LAYER_CLASS(LRN, LRNLayer)
REG_RUNTIME_LAYER_FUNC(LRN, createLRNLayerFromCaffe)
REG_RUNTIME_LAYER_CLASS(InnerProduct, FullyConnectedLayer)
REG_RUNTIME_LAYER_CLASS(ReLU, ElementWiseLayer<ReLUFunctor>)
......@@ -94,8 +94,8 @@ void initModule()
REG_RUNTIME_LAYER_CLASS(Sigmoid, ElementWiseLayer<SigmoidFunctor>)
REG_RUNTIME_LAYER_CLASS(Dropout, BlankLayer)
REG_RUNTIME_LAYER_CLASS(Convolution, ConvolutionLayer)
REG_RUNTIME_LAYER_CLASS(Deconvolution, DeConvolutionLayer)
REG_RUNTIME_LAYER_FUNC(Convolution, createConvolutionLayerFromCaffe)
REG_RUNTIME_LAYER_FUNC(Deconvolution, createDeconvolutionLayerFromCaffe)
REG_RUNTIME_LAYER_CLASS(Concat, ConcatLayer)
init.status = true;
......
......@@ -42,61 +42,65 @@
#ifndef __OPENCV_DNN_LAYERS_CONVOLUTION_LAYER_HPP__
#define __OPENCV_DNN_LAYERS_CONVOLUTION_LAYER_HPP__
#include "../precomp.hpp"
#include <opencv2/dnn/all_layers.hpp>
namespace cv
{
namespace dnn
{
//TODO: simultaneously convolution and bias addition for cache optimization
class ConvolutionLayer : public Layer
{
protected:
bool bias;
int numOutput, group;
int padH, padW;
int kerH, kerW;
int strideH, strideW;
int inpH, inpW, inpCn;
int outH, outW, outCn;
int topH, topW, topCn; //switched between inp/out on deconv/conv
int inpGroupCn, outGroupCn;
int ksize;
//TODO: simultaneously convolution and bias addition for cache optimization
class ConvolutionLayerImpl : public ConvolutionLayer
{
public:
ConvolutionLayerImpl();
virtual void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
virtual void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
virtual void init();
protected:
int numOutput, group;
int inpH, inpW, inpCn;
int outH, outW, outCn;
int topH, topW, topCn; //switched between inp/out on deconv/conv
int inpGroupCn, outGroupCn;
int ksize;
bool tryUseOpenCL, useOpenCL;
bool bias;
bool tryUseOpenCL, useOpenCL;
Blob colBlob, biasOnesBlob;
inline bool is1x1() const;
virtual void computeInpOutShape(const Blob &inpBlob);
bool is1x1() const;
virtual void computeInpOutShape(const Blob &inpBlob);
void im2col(const Mat &srcImg, Mat &dstCol);
void im2col(const UMat &srcImg, UMat &dstCol);
template<typename XMat>
void forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
void im2col(const Mat &srcImg, Mat &dstCol);
void im2col(const UMat &srcImg, UMat &dstCol);
};
class DeConvolutionLayerImpl : public ConvolutionLayerImpl
{
public:
DeConvolutionLayerImpl();
virtual void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
public:
ConvolutionLayer() {}
ConvolutionLayer(LayerParams &params);
void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
protected:
template<typename XMat>
void forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
};
virtual void computeInpOutShape(const Blob &inpBlob);
class DeConvolutionLayer : public ConvolutionLayer
{
protected:
void computeInpOutShape(const Blob &inpBlob);
void col2im(const Mat &colMat, Mat &dstImg);
void col2im(const UMat &colMat, UMat &dstImg);
template<typename XMat>
void forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
void col2im(const Mat &colMat, Mat &dstImg);
void col2im(const UMat &colMat, UMat &dstImg);
};
public:
DeConvolutionLayer(LayerParams &params);
void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
//Importers
Ptr<Layer> createConvolutionLayerFromCaffe(LayerParams &params);
Ptr<Layer> createDeconvolutionLayerFromCaffe(LayerParams &params);
template<typename XMat>
void forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
};
}
}
#endif
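
For context, forward_ presumably follows the usual im2col + GEMM scheme: input patches are unrolled into colBlob, multiplied by the weight matrix, and the bias is broadcast via biasOnesBlob. A simplified single-channel sketch of the im2col step (illustrative only, not the code added by this commit; expects a CV_32F image):

    #include <opencv2/core.hpp>

    // Each row of `col` holds one kernel offset, each column one output location;
    // taps that fall outside the image are zero-padded.
    static void im2colSimple(const cv::Mat &src, cv::Size kernel, cv::Size pad,
                             cv::Size stride, cv::Mat &col)
    {
        int outH = (src.rows + 2 * pad.height - kernel.height) / stride.height + 1;
        int outW = (src.cols + 2 * pad.width - kernel.width) / stride.width + 1;
        col.create(kernel.height * kernel.width, outH * outW, CV_32F);

        for (int ky = 0; ky < kernel.height; ++ky)
            for (int kx = 0; kx < kernel.width; ++kx)
            {
                float *dst = col.ptr<float>(ky * kernel.width + kx);
                for (int oy = 0; oy < outH; ++oy)
                    for (int ox = 0; ox < outW; ++ox)
                    {
                        int y = oy * stride.height - pad.height + ky;
                        int x = ox * stride.width - pad.width + kx;
                        dst[oy * outW + ox] = (0 <= y && y < src.rows && 0 <= x && x < src.cols)
                                              ? src.at<float>(y, x) : 0.f;
                    }
            }
    }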
......@@ -46,43 +46,44 @@ namespace cv
namespace dnn
{
void getKernelParams(LayerParams &params, int &kernelH, int &kernelW, int &padH, int &padW, int &strideH, int &strideW)
void getCaffeConvParams(LayerParams &params, Size &kernel, Size &pad, Size &stride)
{
if (params.has("kernel_h") && params.has("kernel_w"))
{
kernelH = params.get<int>("kernel_h");
kernelW = params.get<int>("kernel_w");
kernel.height = params.get<int>("kernel_h");
kernel.width = params.get<int>("kernel_w");
}
else if (params.has("kernel_size"))
{
kernelH = kernelW = params.get<int>("kernel_size");
kernel.height = kernel.width = params.get<int>("kernel_size");
}
else
{
CV_Error(cv::Error::StsBadArg, "kernel_size (or kernel_h and kernel_w) not specified");
CV_Error(Error::StsBadArg, "kernel_size (or kernel_h and kernel_w) not specified");
}
CV_Assert(kernel.height > 0 && kernel.width > 0);
if (params.has("pad_h") && params.has("pad_w"))
{
padH = params.get<int>("pad_h");
padW = params.get<int>("pad_w");
pad.height = params.get<int>("pad_h");
pad.width = params.get<int>("pad_w");
}
else
{
padH = padW = params.get<int>("pad", 0);
pad.height = pad.width = params.get<int>("pad", 0);
}
CV_Assert(pad.height >= 0 && pad.width >= 0);
if (params.has("stride_h") && params.has("stride_w"))
{
strideH = params.get<int>("stride_h");
strideW = params.get<int>("stride_w");
stride.height = params.get<int>("stride_h");
stride.width = params.get<int>("stride_w");
}
else
{
strideH = strideW = params.get<int>("stride", 1);
stride.height = stride.width = params.get<int>("stride", 1);
}
CV_Assert(kernelH > 0 && kernelW > 0 && padH >= 0 && padW >= 0 && strideH > 0 && strideW > 0);
CV_Assert(stride.height > 0 && stride.width > 0);
}
}
......
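
A hedged usage sketch of the renamed helper; the parameter names follow Caffe's convolution_param, and filling LayerParams by hand here is purely illustrative:

    LayerParams params;              // normally produced by the Caffe importer
    params.set("kernel_size", 3);
    params.set("stride", 2);         // "pad" is absent, so it falls back to 0
    Size kernel, pad, stride;
    getCaffeConvParams(params, kernel, pad, stride);
    // kernel == Size(3, 3), pad == Size(0, 0), stride == Size(2, 2)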
......@@ -48,7 +48,7 @@ namespace cv
namespace dnn
{
void getKernelParams(LayerParams &params, int &kernelH, int &kernelW, int &padH, int &padW, int &strideH, int &strideW);
void getCaffeConvParams(LayerParams &params, Size &kernel, Size &pad, Size &stride);
}
}
......
......@@ -42,34 +42,36 @@
#ifndef __OPENCV_DNN_LAYERS_LRN_LAYER_HPP__
#define __OPENCV_DNN_LAYERS_LRN_LAYER_HPP__
#include "../precomp.hpp"
#include <opencv2/dnn/all_layers.hpp>
namespace cv
{
namespace dnn
{
class LRNLayer : public Layer
class LRNLayerImpl : public LRNLayer
{
enum
{
CHANNEL_NRM,
SPATIAL_NRM,
SPATIAL_CONTRAST_NRM //cuda-convnet feature
} type;
int size;
double alpha, beta;
Blob bufBlob;
bool useOpenCL;
Blob buf;
void channelNoramlization(Blob &src, Blob &dst);
template<typename XMat>
void channelNoramlization_(Blob &src, Blob &dst);
bool channelNoramlization_ocl(const UMat &src, UMat &dst);
void spatialNormalization(Blob &src, Blob &dst);
template<typename XMat>
void spatialNormalization_(Blob &src, Blob &dst);
public:
LRNLayer(LayerParams &params);
LRNLayerImpl();
void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
};
Ptr<Layer> createLRNLayerFromCaffe(LayerParams &params);
}
}
#endif
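
createLRNLayerFromCaffe, declared above and registered in init.cpp, presumably maps Caffe's LRN parameters onto the new public fields. A hypothetical sketch under that assumption (parameter names and defaults follow Caffe's lrn_param; the real importer is not shown here):

    Ptr<Layer> createLRNLayerFromCaffeSketch(LayerParams &params)
    {
        LRNLayerImpl *l = new LRNLayerImpl();
        l->setParamsFrom(params);
        String nrm = params.get<String>("norm_region", "ACROSS_CHANNELS");
        l->type  = (nrm == "WITHIN_CHANNEL") ? LRNLayer::SPATIAL_NRM : LRNLayer::CHANNEL_NRM;
        l->size  = params.get<int>("local_size", 5);
        l->alpha = params.get<double>("alpha", 1);
        l->beta  = params.get<double>("beta", 0.75);
        return Ptr<Layer>(l);
    }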
......@@ -72,22 +72,21 @@ namespace dnn
type = MAX;
}
getKernelParams(params, kernelH, kernelW, padH, padW, strideH, strideW);
getCaffeConvParams(params, kernel, pad, stride);
}
void PoolingLayer::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
CV_Assert(inputs.size() > 0);
inpW = inputs[0]->cols();
inpH = inputs[0]->rows();
computeOutputShape(inpH, inpW);
inp = inputs[0]->size2();
computeOutputShape(inp);
outputs.resize(inputs.size());
for (size_t i = 0; i < inputs.size(); i++)
{
CV_Assert(inputs[i]->rows() == inpH && inputs[i]->cols() == inpW);
outputs[i].create(BlobShape(inputs[i]->num(), inputs[i]->channels(), outH, outW));
CV_Assert(inputs[i]->rows() == inp.height && inputs[i]->cols() == inp.width);
outputs[i].create(BlobShape(inputs[i]->num(), inputs[i]->channels(), out.height, out.width));
}
}
......@@ -104,7 +103,7 @@ namespace dnn
avePooling(*inputs[ii], outputs[ii]);
break;
default:
CV_Error(cv::Error::StsNotImplemented, "Not implemented");
CV_Error(Error::StsNotImplemented, "Not implemented");
break;
}
}
......@@ -112,7 +111,7 @@ namespace dnn
void PoolingLayer::maxPooling(Blob &input, Blob &output)
{
CV_DbgAssert(output.rows() == outH && output.cols() == outW);
CV_DbgAssert(output.rows() == out.height && output.cols() == out.width);
for (int n = 0; n < input.num(); ++n)
{
......@@ -121,23 +120,23 @@ namespace dnn
float *srcData = input.ptrf(n, c);
float *dstData = output.ptrf(n, c);
for (int ph = 0; ph < outH; ++ph)
for (int ph = 0; ph < out.height; ++ph)
{
for (int pw = 0; pw < outW; ++pw)
for (int pw = 0; pw < out.width; ++pw)
{
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int hend = min(hstart + kernelH, inpH);
int wend = min(wstart + kernelW, inpW);
int hstart = ph * stride.height - pad.height;
int wstart = pw * stride.width - pad.width;
int hend = min(hstart + kernel.height, inp.height);
int wend = min(wstart + kernel.width, inp.width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int poolIndex = ph * outW + pw;
const int poolIndex = ph * out.width + pw;
float max_val = -FLT_MAX;
for (int h = hstart; h < hend; ++h)
for (int w = wstart; w < wend; ++w)
{
const int index = h * inpW + w;
const int index = h * inp.width + w;
if (srcData[index] > max_val)
max_val = srcData[index];
}
......@@ -158,49 +157,49 @@ namespace dnn
float *srcData = input.ptrf(n, c);
float *dstData = output.ptrf(n, c);
for (int ph = 0; ph < outH; ++ph)
for (int ph = 0; ph < out.height; ++ph)
{
for (int pw = 0; pw < outW; ++pw)
for (int pw = 0; pw < out.width; ++pw)
{
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int hend = min(hstart + kernelH, inpH + padH);
int wend = min(wstart + kernelW, inpW + padW);
int hstart = ph * stride.height - pad.height;
int wstart = pw * stride.width - pad.width;
int hend = min(hstart + kernel.height, inp.height + pad.height);
int wend = min(wstart + kernel.width, inp.width + pad.width);
int poolSize = (hend - hstart) * (wend - wstart);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, inpH);
wend = min(wend, inpW);
hend = min(hend, inp.height);
wend = min(wend, inp.width);
dstData[ph * outW + pw] = 0.f;
dstData[ph * out.width + pw] = 0.f;
for (int h = hstart; h < hend; ++h)
for (int w = wstart; w < wend; ++w)
dstData[ph * outW + pw] += srcData[h * inpW + w];
dstData[ph * out.width + pw] += srcData[h * inp.width + w];
dstData[ph * outW + pw] /= poolSize;
dstData[ph * out.width + pw] /= poolSize;
}
}
}
}
}
void PoolingLayer::computeOutputShape(int inH, int inW)
void PoolingLayer::computeOutputShape(Size inpSz)
{
//Yeah, something strange Caffe scheme-)
outH = static_cast<int>(ceil(static_cast<float>(inH + 2 * padH - kernelH) / strideH)) + 1;
outW = static_cast<int>(ceil(static_cast<float>(inW + 2 * padW - kernelW) / strideW)) + 1;
out.height = static_cast<int>(ceil(static_cast<float>(inpSz.height + 2 * pad.height - kernel.height) / stride.height)) + 1;
out.width = static_cast<int>(ceil(static_cast<float>(inpSz.width + 2 * pad.width - kernel.width) / stride.width)) + 1;
if (padH || padW)
if (pad.height || pad.width)
{
// If we have padding, ensure that the last pooling starts strictly
// inside the image (instead of at the padding); otherwise clip the last.
if ((outH - 1) * strideH >= inH + padH)
--outH;
if ((outW - 1) * strideW >= inW + padW)
--outW;
CV_Assert((outH - 1) * strideH < inH + padH);
CV_Assert((outW - 1) * strideW < inW + padW);
if ((out.height - 1) * stride.height >= inpSz.height + pad.height)
--out.height;
if ((out.width - 1) * stride.width >= inpSz.width + pad.width)
--out.width;
CV_Assert((out.height - 1) * stride.height < inpSz.height + pad.height);
CV_Assert((out.width - 1) * stride.width < inpSz.width + pad.width);
}
}
}
......
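
To make the Caffe-style rounding concrete, a quick worked example of computeOutputShape: for a 7x7 input, 3x3 kernel, stride 2 and 1-pixel padding,

    out.height = ceil((7 + 2*1 - 3) / 2.0) + 1 = ceil(3.0) + 1 = 4

and since (4 - 1) * 2 = 6 < 7 + 1 = 8, the last pooling window already starts inside the image, so no clipping is applied and the output is 4x4.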
......@@ -57,14 +57,10 @@ namespace dnn
};
int type;
int padH, padW;
int strideH, strideW;
int kernelH, kernelW;
Size kernel, pad, stride;
Size inp, out;
int inpH, inpW;
int outH, outW;
void computeOutputShape(int inpH, int inpW);
void computeOutputShape(Size inpSz);
void maxPooling(Blob &input, Blob &output);
void avePooling(Blob &input, Blob &output);
......
/*************************************************************************************
* Copyright (c) 2015, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
__kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) {
int index = get_global_id(0);
int tmp = get_global_size(0);
for(index; index < nthreads; index += tmp)
out[index] = in[index] * pow(scale[index], negative_beta);
}
__kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k, __global T* scale) {
int index = get_global_id(0);
int tmp = get_global_size(0);
for(index; index < nthreads; index += tmp) {
// find out the local offset
const int w = index % width;
const int h = (index / width) % height;
const int n = index / width / height;
const int offset = (n * channels * height + h) * width + w;
const int step = height * width;
in = in + offset;
scale = scale + offset;
int head = 0;
const int pre_pad = (size - 1) / 2;
const int post_pad = size - pre_pad - 1;
T accum_scale = 0;
// fill the scale at [n, :, h, w]
// accumulate values
while (head < post_pad && head < channels) {
accum_scale += in[head * step] * in[head * step];
++head;
}
// both add and subtract
while (head < channels) {
accum_scale += in[head * step] * in[head * step];
if (head - size >= 0) {
accum_scale -= in[(head - size) * step]
* in[(head - size) * step];
}
scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
++head;
}
// subtract only
while (head < channels + post_pad) {
if (head - size >= 0) {
accum_scale -= in[(head - size) * step]
* in[(head - size) * step];
}
scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
++head;
}
}
}
\ No newline at end of file
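
In terms of what the two kernels compute: LRNFillScale accumulates squared activations over a window of `size` channels around each channel (pre_pad = (size - 1) / 2, post_pad = size - pre_pad - 1), and LRNComputeOutput then applies the normalization, i.e. roughly

    scale[n, c, h, w] = k + alpha_over_size * sum of in[n, c', h, w]^2 for c' in [c - pre_pad, c + post_pad]
    out[n, c, h, w]   = in[n, c, h, w] * pow(scale[n, c, h, w], negative_beta)

where the host side is expected to pass alpha / size as alpha_over_size and -beta as negative_beta.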
......@@ -97,12 +97,22 @@ OCL_TEST(Layer_Test_Softmax, Accuracy)
TEST(Layer_Test_LRN_spatial, Accuracy)
{
testLayerUsingCaffeModels("layer_lrn_spatial");
OCL_OFF(testLayerUsingCaffeModels("layer_lrn_spatial"));
}
OCL_TEST(Layer_Test_LRN_spatial, Accuracy)
{
OCL_ON(testLayerUsingCaffeModels("layer_lrn_spatial"));
OCL_OFF();
}
TEST(Layer_Test_LRN_channels, Accuracy)
{
testLayerUsingCaffeModels("layer_lrn_channels");
OCL_OFF(testLayerUsingCaffeModels("layer_lrn_channels"));
}
OCL_TEST(Layer_Test_LRN_channels, Accuracy)
{
OCL_ON(testLayerUsingCaffeModels("layer_lrn_channels"));
OCL_OFF();
}
TEST(Layer_Test_Convolution, Accuracy)
......