Added Halide OpenCL target for deep learning networks (#1246)

62ba5d75 · Dmitry Kurtaev · Vadim Pisarevsky · a4a8b84e · 62ba5d75 · 62ba5d75
Commit 62ba5d75 authored Jun 22, 2017 by Dmitry Kurtaev Committed by Vadim Pisarevsky Jun 22, 2017
18 changed files
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -69,7 +69,8 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
     */
    enum Target
    {
-        DNN_TARGET_CPU
+        DNN_TARGET_CPU,
+        DNN_TARGET_OPENCL
    };
    /** @brief Initialize dnn module and built-in layers.
@@ -138,6 +139,11 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
        virtual ~BackendWrapper(); //!< Virtual destructor to make polymorphism.
+        /**
+         * @brief Transfer data to CPU host memory.
+         */
+        virtual void copyToHost() = 0;
        int backendId;  //!< Backend identifier.
        int targetId;   //!< Target identifier.
    };
@@ -220,14 +226,16 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
        * @param[in] node Backend node with Halide functions.
        * @param[in] inputs Blobs that will be used in forward invocations.
        * @param[in] outputs Blobs that will be used in forward invocations.
-        * @see BackendNode
+        * @param[in] targetId Target identifier
+        * @see BackendNode, Target
        *
        * Layer don't use own Halide::Func members because we can have applied
        * layers fusing. In this way the fused function should be scheduled.
        */
        virtual void applyHalideScheduler(Ptr<BackendNode>& node,
                                          const std::vector<Mat*> &inputs,
-                                          const std::vector<Mat> &outputs) const;
+                                          const std::vector<Mat> &outputs,
+                                          int targetId) const;
        /**
         * @brief Implement layers fusing.
@@ -394,6 +402,13 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
         */
        void setPreferableBackend(int backendId);
+        /**
+         * @brief Ask network to make computations on a specific target device.
+         * @param[in] targetId target identifier.
+         * @see Target
+         */
+        void setPreferableTarget(int targetId);
        /** @brief Sets the new value for the layer output blob
         *  @param name descriptor of the updating layer output blob.
         *  @param blob new blob.

--- a/modules/dnn/perf/perf_halide_net.cpp
+++ b/modules/dnn/perf/perf_halide_net.cpp
@@ -41,114 +41,131 @@ static void loadNet(std::string weights, std::string proto, std::string schedule
    net->setInput(blobFromImage(input, 1.0, false));
    net->setPreferableBackend(DNN_BACKEND_HALIDE);
+    net->setPreferableTarget(targetId);
    net->setHalideScheduler(scheduler);
    net->forward(outputLayer);
 }
+////////////////////////////////////////////////////////////////////////////////
+// CPU target
+////////////////////////////////////////////////////////////////////////////////
 PERF_TEST(GoogLeNet, HalidePerfTest)
 {
-    try {
+    Net net;
-        Net net;
+    // Halide backend on the CPU target; no scheduler file is passed ("").
+    loadNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt",
-        loadNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt",
+            "", 227, 227, "prob", "caffe", DNN_TARGET_CPU, &net);
-                "", 227, 227, "prob", "caffe", DNN_TARGET_CPU, &net);
+    TEST_CYCLE() net.forward();
+    SANITY_CHECK_NOTHING();
-        TEST_CYCLE_N(10)
-        {
-            net.forward();
-        }
-        SANITY_CHECK_NOTHING();
-    } catch (SkipTestException& e) {
-        throw PerfSkipTestException();
-    }
 }
 PERF_TEST(AlexNet, HalidePerfTest)
 {
-    try {
+    Net net;
-        Net net;
+    loadNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt",
-        loadNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt",
+            "dnn/halide_scheduler_alexnet.yml", 227, 227, "prob", "caffe",
-                "dnn/halide_scheduler_alexnet.yml", 227, 227, "prob", "caffe",
+            DNN_TARGET_CPU, &net);
-                DNN_TARGET_CPU, &net);
+    TEST_CYCLE() net.forward();
+    SANITY_CHECK_NOTHING();
-        TEST_CYCLE_N(10)
-        {
-            net.forward();
-        }
-        SANITY_CHECK_NOTHING();
-    } catch (SkipTestException& e) {
-        throw PerfSkipTestException();
-    }
 }
 PERF_TEST(ResNet50, HalidePerfTest)
 {
-    try {
+    Net net;
-        Net net;
+    loadNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt",
-        loadNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt",
+            "dnn/halide_scheduler_resnet_50.yml", 224, 224, "prob", "caffe",
-                "dnn/halide_scheduler_resnet_50.yml", 224, 224, "prob", "caffe",
+            DNN_TARGET_CPU, &net);
-                DNN_TARGET_CPU, &net);
+    TEST_CYCLE() net.forward();
+    SANITY_CHECK_NOTHING();
-        TEST_CYCLE_N(10)
-        {
-            net.forward();
-        }
-        SANITY_CHECK_NOTHING();
-    } catch (SkipTestException& e) {
-        throw PerfSkipTestException();
-    }
 }
 PERF_TEST(SqueezeNet_v1_1, HalidePerfTest)
 {
-    try {
+    Net net;
-        Net net;
+    loadNet("dnn/squeezenet_v1_1.caffemodel", "dnn/squeezenet_v1_1.prototxt",
-        loadNet("dnn/squeezenet_v1_1.caffemodel", "dnn/squeezenet_v1_1.prototxt",
+            "dnn/halide_scheduler_squeezenet_v1_1.yml", 227, 227, "prob",
-                "dnn/halide_scheduler_squeezenet_v1_1.yml", 227, 227, "prob",
+            "caffe", DNN_TARGET_CPU, &net);
-                "caffe", DNN_TARGET_CPU, &net);
+    TEST_CYCLE() net.forward();
+    SANITY_CHECK_NOTHING();
-        TEST_CYCLE_N(10)
-        {
-            net.forward();
-        }
-        SANITY_CHECK_NOTHING();
-    } catch (SkipTestException& e) {
-        throw PerfSkipTestException();
-    }
 }
 PERF_TEST(Inception_5h, HalidePerfTest)
 {
-    try {
+    Net net;
-        Net net;
+    loadNet("dnn/tensorflow_inception_graph.pb", "",
-        loadNet("dnn/tensorflow_inception_graph.pb", "",
+            "dnn/halide_scheduler_inception_5h.yml",
-                "dnn/halide_scheduler_inception_5h.yml",
+            224, 224, "softmax2", "tensorflow", DNN_TARGET_CPU, &net);
-                224, 224, "softmax2", "tensorflow", DNN_TARGET_CPU, &net);
+    TEST_CYCLE() net.forward("softmax2");
+    SANITY_CHECK_NOTHING();
-        TEST_CYCLE_N(10)
-        {
-            net.forward("softmax2");
-        }
-        SANITY_CHECK_NOTHING();
-    } catch (SkipTestException& e) {
-        throw PerfSkipTestException();
-    }
 }
 PERF_TEST(ENet, HalidePerfTest)
 {
-    try {
+    Net net;
-        Net net;
+    loadNet("dnn/Enet-model-best.net", "", "dnn/halide_scheduler_enet.yml",
-        loadNet("dnn/Enet-model-best.net", "", "dnn/halide_scheduler_enet.yml",
+            512, 256, "l367_Deconvolution", "torch", DNN_TARGET_CPU, &net);
-                512, 256, "l367_Deconvolution", "torch", DNN_TARGET_CPU, &net);
+    TEST_CYCLE() net.forward();
+    SANITY_CHECK_NOTHING();
-        TEST_CYCLE_N(10)
+}
-        {
+////////////////////////////////////////////////////////////////////////////////
-            net.forward("l367_Deconvolution");
+// OpenCL target
-        }
+////////////////////////////////////////////////////////////////////////////////
-        SANITY_CHECK_NOTHING();
+PERF_TEST(GoogLeNet_opencl, HalidePerfTest)
-    } catch (SkipTestException& e) {
+{
-        throw PerfSkipTestException();
+    Net net;
-    }
+    loadNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt",
+            "", 227, 227, "prob", "caffe", DNN_TARGET_OPENCL, &net);
+    TEST_CYCLE() net.forward();
+    SANITY_CHECK_NOTHING();
+}
+PERF_TEST(AlexNet_opencl, HalidePerfTest)
+{
+    Net net;
+    loadNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt",
+            "dnn/halide_scheduler_opencl_alexnet.yml", 227, 227, "prob", "caffe",
+            DNN_TARGET_OPENCL, &net);
+    TEST_CYCLE() net.forward();
+    SANITY_CHECK_NOTHING();
+}
+PERF_TEST(ResNet50_opencl, HalidePerfTest)
+{
+    Net net;
+    loadNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt",
+            "dnn/halide_scheduler_opencl_resnet_50.yml", 224, 224, "prob", "caffe",
+            DNN_TARGET_OPENCL, &net);
+    TEST_CYCLE() net.forward();
+    SANITY_CHECK_NOTHING();
+}
+PERF_TEST(SqueezeNet_v1_1_opencl, HalidePerfTest)
+{
+    Net net;
+    loadNet("dnn/squeezenet_v1_1.caffemodel", "dnn/squeezenet_v1_1.prototxt",
+            "dnn/halide_scheduler_opencl_squeezenet_v1_1.yml", 227, 227, "prob",
+            "caffe", DNN_TARGET_OPENCL, &net);
+    TEST_CYCLE() net.forward();
+    SANITY_CHECK_NOTHING();
+}
+PERF_TEST(Inception_5h_opencl, HalidePerfTest)
+{
+    Net net;
+    loadNet("dnn/tensorflow_inception_graph.pb", "",
+            "dnn/halide_scheduler_opencl_inception_5h.yml",
+            224, 224, "softmax2", "tensorflow", DNN_TARGET_OPENCL, &net);
+    TEST_CYCLE() net.forward("softmax2");
+    SANITY_CHECK_NOTHING();
+}
+PERF_TEST(ENet_opencl, HalidePerfTest)
+{
+    Net net;
+    loadNet("dnn/Enet-model-best.net", "", "dnn/halide_scheduler_opencl_enet.yml",
+            512, 256, "l367_Deconvolution", "torch", DNN_TARGET_OPENCL, &net);
+    TEST_CYCLE() net.forward();
+    SANITY_CHECK_NOTHING();
 }
 #endif  // HAVE_HALIDE

--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -205,7 +205,7 @@ struct LayerPin
 class BackendWrapManager
 {
 public:
-    Ptr<BackendWrapper> wrap(const Mat& m, int backendId, int targetId = DNN_TARGET_CPU)
+    Ptr<BackendWrapper> wrap(const Mat& m, int backendId, int targetId)
    {
        CV_Assert(backendId != DNN_BACKEND_DEFAULT);
@@ -236,7 +236,7 @@ public:
    }
    std::vector<Ptr<BackendWrapper> > wrap(const std::vector<Mat*>& mats,
-                                           int backendId, int targetId = DNN_TARGET_CPU)
+                                           int backendId, int targetId)
    {
        const int num = mats.size();
        std::vector<Ptr<BackendWrapper> > dst(num);
@@ -248,7 +248,7 @@ public:
    }
    std::vector<Ptr<BackendWrapper> > wrap(const std::vector<Mat>& mats,
-                                           int backendId, int targetId = DNN_TARGET_CPU)
+                                           int backendId, int targetId)
    {
        const int num = mats.size();
        std::vector<Ptr<BackendWrapper> > dst(num);
@@ -617,6 +617,7 @@ struct Net::Impl
        lastLayerId = 1;
        netWasAllocated = false;
        preferableBackend = DNN_BACKEND_DEFAULT;
+        preferableTarget = DNN_TARGET_CPU;
    }
    Ptr<DataLayer> netInputLayer;
@@ -626,6 +627,7 @@ struct Net::Impl
    std::map<String, int> layerNameToId;
    BlobManager blobManager;
    int preferableBackend;
+    int preferableTarget;
    String halideConfigFile;
    // Backend-specific wrapping manager.
    BackendWrapManager backendWrapper;
@@ -652,10 +654,11 @@ struct Net::Impl
                {
                    // Use automatic scheduling provided by layer.
                    layer->applyHalideScheduler(ld.backendNodes[DNN_BACKEND_HALIDE],
-                                                ld.inputBlobs, ld.outputBlobs);
+                                                ld.inputBlobs, ld.outputBlobs,
+                                                preferableTarget);
                }
                dnn::compileHalide(ld.outputBlobs, ld.backendNodes[DNN_BACKEND_HALIDE],
-                                   DNN_TARGET_CPU);
+                                   preferableTarget);
            }
        }
    }
@@ -859,7 +862,10 @@ struct Net::Impl
    {
        backendWrapper.reset();
        if (preferableBackend == DNN_BACKEND_DEFAULT)
+        {
+            CV_Assert(preferableTarget == DNN_TARGET_CPU);
            return;
+        }
        // Iterator to current layer.
        MapIdToLayerData::iterator it = layers.begin();
@@ -905,7 +911,8 @@ struct Net::Impl
            // No layers fusion.
            ldTop.skipFlags[preferableBackend] = false;
            std::vector<Ptr<BackendWrapper> > inputs =
-                backendWrapper.wrap(ldTop.inputBlobs, preferableBackend);
+                backendWrapper.wrap(ldTop.inputBlobs, preferableBackend,
+                                    preferableTarget);
            if (preferableBackend == DNN_BACKEND_HALIDE)
            {
                ldTop.backendNodes[DNN_BACKEND_HALIDE] = layerTop->initHalide(inputs);
@@ -1040,7 +1047,7 @@ struct Net::Impl
        else if (!ld.skipFlags[preferableBackend])
        {
            std::vector<Ptr<BackendWrapper> > outputs =
-                backendWrapper.wrap(ld.outputBlobs, preferableBackend);
+                backendWrapper.wrap(ld.outputBlobs, preferableBackend, preferableTarget);
            Ptr<BackendNode> node = ld.backendNodes[preferableBackend];
            if (preferableBackend == DNN_BACKEND_HALIDE)
            {
@@ -1154,6 +1161,16 @@ struct Net::Impl
            CV_Error(Error::StsOutOfRange, "Layer \"" + ld.name + "\" produce only " + toString(ld.outputBlobs.size()) +
                                           " outputs, the #" + toString(pin.oid) + " was requsted");
        }
+        if (preferableBackend != DNN_BACKEND_DEFAULT)
+        {
+            // Transfer data to CPU if it's required.
+            backendWrapper.wrap(ld.outputBlobs[pin.oid], preferableBackend,
+                                preferableTarget)->copyToHost();
+        }
+        else
+        {
+            CV_Assert(preferableTarget == DNN_TARGET_CPU);
+        }
        return ld.outputBlobs[pin.oid];
    }
@@ -1314,6 +1331,13 @@ void Net::setPreferableBackend(int backendId)
    impl->preferableBackend = backendId;
 }
+// Stores the preferable target; the netWasAllocated flag is cleared when the
+// requested target differs from the one currently stored.
+void Net::setPreferableTarget(int targetId)
+{
+    if (impl->preferableTarget != targetId)
+        impl->netWasAllocated = false;
+    impl->preferableTarget = targetId;
+}
 void Net::setInputsNames(const std::vector<String> &inputBlobNames)
 {
    impl->netInputLayer->setNames(inputBlobNames);
@@ -1702,10 +1726,70 @@ Ptr<BackendNode> Layer::initHalide(const std::vector<Ptr<BackendWrapper> > &)
 }
 void Layer::applyHalideScheduler(Ptr<BackendNode>& node, const std::vector<Mat*> &inputs,
-                                 const std::vector<Mat> &outputs) const
+                                 const std::vector<Mat> &outputs, int targetId) const  // default schedule, picked from targetId and the output shape
 {
-    CV_Error(Error::StsNotImplemented, "Scheduling of " + type +
+#ifdef  HAVE_HALIDE
-                                       " layers is not implemented.");
+    Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"),
+                xo("xo"), xi("xi"), yo("yo"), yi("yi"), tile("tile");
+    Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();  // only the last function of the node is scheduled here
+    int outW, outH, outC, outN;
+    getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);  // sizes are taken from the first output only
+    if (targetId == DNN_TARGET_CPU)  // CPU: fuse into tiles, run tiles in parallel, vectorize
+    {
+        if (outW == 1 && outH == 1)
+        {
+            if (outC + outN == 1)  // NOTE(review): looks unsatisfiable if outC and outN are both >= 1 -- was outC * outN == 1 intended? confirm
+                return;
+            if (outC > 8)
+              top.split(c, co, ci, 8)
+                 .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
+                 .parallel(tile)
+                 .vectorize(ci, 8);
+            else
+              top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile)
+                 .parallel(tile);
+        }
+        else
+        {
+            if (outH > 2)  // outputs with outH <= 2 get no scheduling directives in this branch
+            {
+                top.reorder(x, c, y)
+                   .split(y, yo, yi, 2)
+                   .fuse(yo, n, tile)
+                   .parallel(tile)
+                   .unroll(yi)
+                   .vectorize(x, outW >= 16 ? 16 : outW);
+            }
+        }
+    }
+    else if (targetId == DNN_TARGET_OPENCL)  // OpenCL: map the split loops onto gpu_blocks / gpu_threads
+    {
+        int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : outC;  // 8 if outC > 16, 4 if outC in (8, 16], otherwise outC itself
+        if (outW == 1 && outH == 1)
+        {
+            top.split(c, co, ci, c_split)
+               .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
+               .gpu_blocks(tile)
+               .gpu_threads(ci);
+        }
+        else
+        {
+            int x_split = outW > 8 ? (outW >= 32 ? 16 : 8) : outW;  // 16 if outW >= 32, 8 if outW in (8, 32), otherwise outW itself
+            int y_split = outH > 8 ? (outH >= 32 ? 16 : 8) : outH;  // same rule as x_split, applied to the height
+            top.split(x, xo, xi, x_split).split(y, yo, yi, y_split)
+               .split(c, co, ci, c_split)
+               .gpu_blocks(xo, yo, co)
+               .gpu_threads(xi, yi)
+               .reorder(xi, yi, ci, xo, yo, co)
+               .vectorize(ci);
+        }
+    }
+    else
+        CV_Error(Error::StsNotImplemented, "Unknown target identifier");
+#endif  // HAVE_HALIDE
 }
 Ptr<BackendNode> Layer::tryAttach(const Ptr<BackendNode>& node)

--- a/modules/dnn/src/halide_scheduler.cpp
+++ b/modules/dnn/src/halide_scheduler.cpp
@@ -143,6 +143,26 @@ static void applyComputeRoot(const FileNode& directive, Halide::Func& func)
        func.compute_root();
 }
+// Calls func.gpu_blocks() once for every variable name listed in the directive.
+static void applyGpuBlocks(const FileNode& directive, Halide::Func& func)
+{
+    std::string name;
+    const int count = directive.size();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        directive[idx] >> name;
+        const Halide::Var var(name);
+        func.gpu_blocks(var);
+    }
+}
+// Calls func.gpu_threads() once for every variable name listed in the directive.
+static void applyGpuThreads(const FileNode& directive, Halide::Func& func)
+{
+    std::string name;
+    const int count = directive.size();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        directive[idx] >> name;
+        const Halide::Var var(name);
+        func.gpu_threads(var);
+    }
+}
 static void apply(const FileNode& directives, Halide::Func& func,
                  std::map<std::string, Halide::Func>& funcsMap,
                  const FileNode& params)
@@ -167,6 +187,10 @@ static void apply(const FileNode& directives, Halide::Func& func,
            applyComputeAt(directive, func, funcsMap);
        else if (directive.name() == "compute_root")
            applyComputeRoot(directive, func);
+        else if (directive.name() == "gpu_blocks")
+            applyGpuBlocks(directive, func);
+        else if (directive.name() == "gpu_threads")
+            applyGpuThreads(directive, func);
        else
            CV_Error(Error::StsNotImplemented, "Scheduling directive " +
                     directive.name() + " is not implemented.");

--- a/modules/dnn/src/layers/batch_norm_layer.cpp
+++ b/modules/dnn/src/layers/batch_norm_layer.cpp
@@ -157,6 +157,8 @@ public:
            bias(i) = (hasBias ? biasData[i] : 0.0f) -
                      weights(i) * meanData[i] * varMeanScale;
        }
+        weights.set_host_dirty();
+        bias.set_host_dirty();
        top(x, y, c, n) = input * weights(c) + bias(c);
        return top;
    }

--- a/modules/dnn/src/layers/concat_layer.cpp
+++ b/modules/dnn/src/layers/concat_layer.cpp
@@ -130,29 +130,6 @@ public:
 #endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }
-    virtual void applyHalideScheduler(Ptr<BackendNode>& node,
-                                      const std::vector<Mat*> &inputs,
-                                      const std::vector<Mat> &outputs) const
-    {
-#ifdef  HAVE_HALIDE
-        Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"), yi("yi"), yo("yo");
-        Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
-        int outW, outH, outC, outN;
-        getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
-        if (outW == 1 || outH <= 2)
-            return;
-        top.reorder(x, c, y)
-           .split(y, yo, yi, 2)
-           .fuse(yo, n, tile)
-           .parallel(tile)
-           .unroll(yi)
-           .vectorize(x, outW >= 16 ? 16 : outW);
-#endif  // HAVE_HALIDE
-    }
 };
 Ptr<ConcatLayer> ConcatLayer::create(const LayerParams& params)

--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -99,9 +99,15 @@ public:
    virtual void applyHalideScheduler(Ptr<BackendNode>& node,
                                      const std::vector<Mat*> &inputs,
-                                      const std::vector<Mat> &outputs) const
+                                      const std::vector<Mat> &outputs,
+                                      int targetId) const
    {
 #ifdef HAVE_HALIDE
+        if (targetId != DNN_TARGET_CPU)
+        {
+            Layer::applyHalideScheduler(node, inputs, outputs, targetId);
+            return;
+        }
        Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"), yi("yi"), yo("yo"), co("co"), ci("ci");
        Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs[1];
        Halide::Func& padded_input = node.dynamicCast<HalideBackendNode>()->funcs[0];

--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@@ -422,7 +422,7 @@ struct ChannelsPReLUFunctor
    {
        Halide::Var x("x"), y("y"), c("c"), n("n");
        auto weights = wrapToHalideBuffer(scale, {(int)scale.total()});
-        top(x, y, c, n) = select(input > 0.0f, input, weights(c) * input);
+        top(x, y, c, n) = select(input >= 0.0f, input, weights(c) * input);
    }
 #endif  // HAVE_HALIDE

--- a/modules/dnn/src/layers/eltwise_layer.cpp
+++ b/modules/dnn/src/layers/eltwise_layer.cpp
@@ -198,29 +198,6 @@ public:
        return Ptr<BackendNode>();
    }
-    virtual void applyHalideScheduler(Ptr<BackendNode>& node,
-                                      const std::vector<Mat*> &inputs,
-                                      const std::vector<Mat> &outputs) const
-    {
-#ifdef HAVE_HALIDE
-        Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"), yi("yi"), yo("yo");
-        Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
-        int outW, outH, outC, outN;
-        getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
-        if (outW == 1 || outH <= 2)
-            return;
-        top.reorder(x, c, y)
-           .split(y, yo, yi, 2)
-           .fuse(yo, n, tile)
-           .parallel(tile)
-           .unroll(yi)
-           .vectorize(x, outW >= 16 ? 16 : outW);
-#endif  // HAVE_HALIDE
-    }
    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const
    {

--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@@ -252,31 +252,6 @@ public:
        return Ptr<BackendNode>();
    }
-    virtual void applyHalideScheduler(Ptr<BackendNode>& node,
-                                      const std::vector<Mat*> &inputs,
-                                      const std::vector<Mat> &outputs) const
-    {
-#ifdef HAVE_HALIDE
-        int outW, outH, outC, outN;
-        getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
-        Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"), tile("tile");
-        Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
-        if (outC + outN == 1)
-            return;
-        if (outC > 8)
-          top.split(c, co, ci, 8)
-             .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
-             .parallel(tile)
-             .vectorize(ci, 8);
-        else
-          top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile)
-             .parallel(tile);
-#endif  // HAVE_HALIDE
-    }
    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const
    {

--- a/modules/dnn/src/layers/lrn_layer.cpp
+++ b/modules/dnn/src/layers/lrn_layer.cpp
@@ -272,9 +272,15 @@ public:
    virtual void applyHalideScheduler(Ptr<BackendNode>& node,
                                      const std::vector<Mat*> &inputs,
-                                      const std::vector<Mat> &outputs) const
+                                      const std::vector<Mat> &outputs,
+                                      int targetId) const
    {
 #ifdef  HAVE_HALIDE
+        if (targetId != DNN_TARGET_CPU)
+        {
+            Layer::applyHalideScheduler(node, inputs, outputs, targetId);
+            return;
+        }
        int outW, outH, outC, outN;
        getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);

--- a/modules/dnn/src/layers/max_unpooling_layer.cpp
+++ b/modules/dnn/src/layers/max_unpooling_layer.cpp
@@ -117,26 +117,6 @@ public:
 #endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }
-    virtual void applyHalideScheduler(Ptr<BackendNode>& node,
-                                      const std::vector<Mat*> &inputs,
-                                      const std::vector<Mat> &outputs) const
-    {
-#ifdef HAVE_HALIDE
-        Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"), yi("yi"), yo("yo");
-        Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
-        int outW, outH, outC, outN;
-        getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
-        top.reorder(x, c, y)
-           .split(y, yo, yi, 2)
-           .fuse(yo, n, tile)
-           .parallel(tile)
-           .unroll(yi)
-           .vectorize(x, outW >= 16 ? 16 : outW);
-#endif  // HAVE_HALIDE
-    }
 };
 Ptr<MaxUnpoolLayer> MaxUnpoolLayer::create(const LayerParams& params)

--- a/modules/dnn/src/layers/padding_layer.cpp
+++ b/modules/dnn/src/layers/padding_layer.cpp
@@ -10,6 +10,7 @@ Implementation of padding layer, which adds paddings to input blob.
 */
 #include "../precomp.hpp"
+#include "op_halide.hpp"
 #include <vector>
 namespace cv
@@ -52,6 +53,12 @@ public:
        return false;
    }
+    // Reports whether this layer can run on the given backend: the default
+    // backend always, the Halide backend only when haveHalide() returns true.
+    virtual bool supportBackend(int backendId)
+    {
+        // && already binds tighter than ||, so the result is unchanged; the
+        // explicit parentheses make that visible and avoid -Wparentheses.
+        return backendId == DNN_BACKEND_DEFAULT ||
+               (backendId == DNN_BACKEND_HALIDE && haveHalide());
+    }
    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        for(int i = 0; i < inputs.size(); i++)
@@ -94,6 +101,23 @@ public:
        return inputDims > 0 && (int)shape.size() > inputDims ? paddingDim + 1 : paddingDim;
    }
+    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
+    {
+#ifdef HAVE_HALIDE
+        int inW, inH, inC, inN;
+        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
+        getCanonicalSize(inputBuffer, &inW, &inH, &inC, &inN);
+        Halide::Var x("x"), y("y"), c("c"), n("n");
+        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
+        Halide::Func padded =
+            Halide::BoundaryConditions::constant_exterior(inputBuffer, paddingValue);
+        top(x, y, c, n) = padded(x, y, c, n);
+        return Ptr<BackendNode>(new HalideBackendNode(top));
+#endif  // HAVE_HALIDE
+        return Ptr<BackendNode>();
+    }
    int paddingDim, padding, inputDims, index;
    float paddingValue;
 };

--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@@ -388,9 +388,15 @@ public:
    virtual void applyHalideScheduler(Ptr<BackendNode>& node,
                                      const std::vector<Mat*> &inputs,
-                                      const std::vector<Mat> &outputs) const
+                                      const std::vector<Mat> &outputs,
+                                      int targetId) const
    {
 #ifdef  HAVE_HALIDE
+        if (targetId != DNN_TARGET_CPU)
+        {
+            Layer::applyHalideScheduler(node, inputs, outputs, targetId);
+            return;
+        }
        Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"),
                    xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co");
        Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();

--- a/modules/dnn/src/layers/softmax_layer.cpp
+++ b/modules/dnn/src/layers/softmax_layer.cpp
@@ -187,33 +187,6 @@ public:
        return Ptr<BackendNode>();
    }
-    virtual void applyHalideScheduler(Ptr<BackendNode>& node,
-                                      const std::vector<Mat*> &inputs,
-                                      const std::vector<Mat> &outputs) const
-    {
-#ifdef HAVE_HALIDE
-        int outW, outH, outC, outN;
-        getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
-        // Most common case when SoftMax is a layer after fully-connected.
-        // So we just schedule it in the same way.
-        Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"), tile("tile");
-        Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
-        if (outC + outN == 1)
-            return;
-        if (outC > 8)
-          top.split(c, co, ci, 8)
-             .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
-             .parallel(tile)
-             .vectorize(ci, 8);
-        else
-          top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile)
-             .parallel(tile);
-#endif  // HAVE_HALIDE
-    }
    int64 getFLOPS(const std::vector<MatShape> &inputs,
                  const std::vector<MatShape> &outputs) const
    {

--- a/modules/dnn/src/op_halide.cpp
+++ b/modules/dnn/src/op_halide.cpp
@@ -7,6 +7,10 @@
 #include "op_halide.hpp"
+#ifdef HAVE_HALIDE
+#include <HalideRuntimeOpenCL.h>
+#endif  // HAVE_HALIDE
 namespace cv
 {
 namespace dnn
@@ -72,7 +76,15 @@ HalideBackendWrapper::HalideBackendWrapper(int targetId, const cv::Mat& m)
    : BackendWrapper(DNN_BACKEND_HALIDE, targetId)
 {
    buffer = wrapToHalideBuffer(m);
-    if (targetId != DNN_TARGET_CPU)
+    if (targetId == DNN_TARGET_CPU)
+    {
+        return;
+    }
+    else if (targetId == DNN_TARGET_OPENCL)
+    {
+        buffer.copy_to_device(halide_opencl_device_interface());
+    }
+    else
        CV_Error(Error::StsNotImplemented, "Unknown target identifier");
 }
@@ -80,15 +92,32 @@ HalideBackendWrapper::HalideBackendWrapper(const Ptr<BackendWrapper>& base,
                                           const MatShape& shape)
    : BackendWrapper(DNN_BACKEND_HALIDE, base->targetId)
 {
-    if (base->targetId != DNN_TARGET_CPU)
-        CV_Error(Error::StsNotImplemented, "Unknown target identifier");
    int w, h, c, n;
    getCanonicalSize(shape, &w, &h, &c, &n);
    Halide::Buffer<float> baseBuffer = halideBuffer(base);
    buffer = Halide::Buffer<float>((float*)baseBuffer.raw_buffer()->host,
                                   {w, h, c, n});
-    buffer.set_host_dirty();  // Indicate that data is on CPU.
+    if (baseBuffer.has_device_allocation())
+    {
+        buffer.raw_buffer()->device = baseBuffer.raw_buffer()->device;
+        buffer.raw_buffer()->device_interface = baseBuffer.raw_buffer()->device_interface;
+        buffer.set_device_dirty();
+    }
+    else
+    {
+        buffer.set_host_dirty();  // Indicate that data is on CPU.
+        CV_Assert(targetId == DNN_TARGET_CPU);
+    }
+}
+// Brings the buffer contents back to host memory when the device copy is
+// marked dirty. Asserts that a non-CPU target has a dirty device copy.
+void HalideBackendWrapper::copyToHost()
+{
+    const bool deviceDirty = buffer.device_dirty();
+    CV_Assert(targetId == DNN_TARGET_CPU || deviceDirty);
+    if (!deviceDirty)
+        return;  // nothing to transfer
+    buffer.device_sync();
+    buffer.copy_to_host();
+}
 #endif  // HAVE_HALIDE
@@ -144,6 +173,11 @@ void compileHalide(std::vector<Mat> &outputs, Ptr<BackendNode>& node, int target
    Halide::Target target = Halide::get_host_target();
    target.set_feature(Halide::Target::NoAsserts);
+    if (targetId == DNN_TARGET_OPENCL)
+    {
+        target.set_feature(Halide::Target::OpenCL);
+    }
+    CV_Assert(target.supported());
    top.compile_jit(target);
 #endif  // HAVE_HALIDE
 }

--- a/modules/dnn/src/op_halide.hpp
+++ b/modules/dnn/src/op_halide.hpp
@@ -57,6 +57,8 @@ namespace dnn
        HalideBackendWrapper(const Ptr<BackendWrapper>& base, const MatShape& shape);
+        virtual void copyToHost();
        Halide::Buffer<float> buffer;
    };
 #endif  // HAVE_HALIDE

--- a/modules/dnn/test/test_halide_nets.cpp
+++ b/modules/dnn/test/test_halide_nets.cpp
@@ -48,6 +48,7 @@ static void test(const std::string& weights, const std::string& proto,
    netHalide.setInput(blobFromImage(input.clone(), 1.0f, false));
    netHalide.setPreferableBackend(DNN_BACKEND_HALIDE);
+    netHalide.setPreferableTarget(targetId);
    netHalide.setHalideScheduler(scheduler);
    outputHalide = netHalide.forward(outputLayer).clone();
@@ -62,15 +63,20 @@ static void test(const std::string& weights, const std::string& proto,
    // Swap backends.
    netHalide.setPreferableBackend(DNN_BACKEND_DEFAULT);
+    netHalide.setPreferableTarget(DNN_TARGET_CPU);
    outputDefault = netHalide.forward(outputLayer).clone();
    netDefault.setPreferableBackend(DNN_BACKEND_HALIDE);
+    netDefault.setPreferableTarget(targetId);
    netDefault.setHalideScheduler(scheduler);
    outputHalide = netDefault.forward(outputLayer).clone();
    normAssert(outputDefault, outputHalide);
 }
+////////////////////////////////////////////////////////////////////////////////
+// CPU target
+////////////////////////////////////////////////////////////////////////////////
 TEST(Reproducibility_GoogLeNet_Halide, Accuracy)
 {
    test(findDataFile("dnn/bvlc_googlenet.caffemodel", false),
@@ -115,6 +121,53 @@ TEST(Reproducibility_ENet_Halide, Accuracy)
         findDataFile("dnn/halide_scheduler_enet.yml", false),
         512, 512, "l367_Deconvolution", "torch", DNN_TARGET_CPU);
 };
+////////////////////////////////////////////////////////////////////////////////
+// OpenCL target
+////////////////////////////////////////////////////////////////////////////////
+TEST(Reproducibility_GoogLeNet_Halide_opencl, Accuracy)
+{
+    test(findDataFile("dnn/bvlc_googlenet.caffemodel", false),
+         findDataFile("dnn/bvlc_googlenet.prototxt", false),
+         "", 227, 227, "prob", "caffe", DNN_TARGET_OPENCL);
+};
+TEST(Reproducibility_AlexNet_Halide_opencl, Accuracy)
+{
+    test(findDataFile("dnn/bvlc_alexnet.caffemodel", false),
+         findDataFile("dnn/bvlc_alexnet.prototxt", false),
+         findDataFile("dnn/halide_scheduler_opencl_alexnet.yml", false),
+         227, 227, "prob", "caffe", DNN_TARGET_OPENCL);
+};
+TEST(Reproducibility_ResNet_50_Halide_opencl, Accuracy)
+{
+    test(findDataFile("dnn/ResNet-50-model.caffemodel", false),
+         findDataFile("dnn/ResNet-50-deploy.prototxt", false),
+         findDataFile("dnn/halide_scheduler_opencl_resnet_50.yml", false),
+         224, 224, "prob", "caffe", DNN_TARGET_OPENCL);
+};
+TEST(Reproducibility_SqueezeNet_v1_1_Halide_opencl, Accuracy)
+{
+    test(findDataFile("dnn/squeezenet_v1_1.caffemodel", false),
+         findDataFile("dnn/squeezenet_v1_1.prototxt", false),
+         findDataFile("dnn/halide_scheduler_opencl_squeezenet_v1_1.yml", false),
+         227, 227, "prob", "caffe", DNN_TARGET_OPENCL);
+};
+TEST(Reproducibility_Inception_5h_Halide_opencl, Accuracy)
+{
+    test(findDataFile("dnn/tensorflow_inception_graph.pb", false), "",
+         findDataFile("dnn/halide_scheduler_opencl_inception_5h.yml", false),
+         224, 224, "softmax2", "tensorflow", DNN_TARGET_OPENCL);
+};
+TEST(Reproducibility_ENet_Halide_opencl, Accuracy)
+{
+    test(findDataFile("dnn/Enet-model-best.net", false), "",
+         findDataFile("dnn/halide_scheduler_opencl_enet.yml", false),
+         512, 512, "l367_Deconvolution", "torch", DNN_TARGET_OPENCL);
+};
 #endif  // HAVE_HALIDE
 }  // namespace cvtest