Commit 77fa59c3 authored by Lubov Batanina, committed by Alexander Alekhin

Merge pull request #14301 from l-bat:conv3d

Support Convolution3D layer on IE backend (#14301)

* Add Convolution3D layer

* Disable CXX11

* Fixed tests

* Add Pooling3D layer

* Merge Conv2d with Conv3d and Pool2d with Pool3d layers

* Split pads

* Add Deconvolution layer

* Refactoring

* Deduplication

* Refactoring

* Add utils for Convolution and Pooling layers
parent 3bcbd2a0
@@ -210,7 +210,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
 class CV_EXPORTS BaseConvolutionLayer : public Layer
 {
 public:
-    Size kernel, stride, pad, dilation, adjustPad;
+    CV_DEPRECATED_EXTERNAL Size kernel, stride, pad, dilation, adjustPad;
+    std::vector<size_t> adjust_pads;
+    std::vector<size_t> kernel_size, strides, dilations;
+    std::vector<size_t> pads_begin, pads_end;
     String padMode;
     int numOutput;
 };
@@ -243,9 +246,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
 {
 public:
     int type;
-    Size kernel, stride;
-    int pad_l, pad_t, pad_r, pad_b;
-    CV_DEPRECATED_EXTERNAL Size pad;
+    std::vector<size_t> kernel_size, strides;
+    std::vector<size_t> pads_begin, pads_end;
+    CV_DEPRECATED_EXTERNAL Size kernel, stride, pad;
+    CV_DEPRECATED_EXTERNAL int pad_l, pad_t, pad_r, pad_b;
     bool globalPooling;
     bool computeMaxIdx;
     String padMode;
...
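Note on the API change above: the scalar Size/int geometry fields become deprecated external views, and the layers now describe their geometry with per-dimension vectors ordered outermost-first (D, H, W for 3D). A minimal sketch of how the two views line up for a 2D layer, with invented values; the Size view is width-first, exactly as the patch fills it in pooling_layer.cpp:

    // Illustration only, assuming a 2D layer; a 3D layer would use {kD, kH, kW}.
    std::vector<size_t> kernel_size = {3, 3};  // {kH, kW}
    std::vector<size_t> strides     = {2, 2};  // {sH, sW}
    std::vector<size_t> pads_begin  = {1, 1};  // {pad_t, pad_l}
    std::vector<size_t> pads_end    = {1, 1};  // {pad_b, pad_r}
    cv::Size kernel(kernel_size[1], kernel_size[0]);  // deprecated view: (width, height)
    cv::Size stride(strides[1], strides[0]);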
@@ -2263,6 +2263,7 @@ struct Net::Impl
         if (isAsync)
             CV_Error(Error::StsNotImplemented, "Default implementation fallbacks in asynchronous mode");

+        CV_Assert(layer->supportBackend(DNN_BACKEND_OPENCV));
         if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
         {
             std::vector<UMat> umat_inputBlobs = OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers);
...
@@ -59,22 +59,20 @@ namespace cv
 {
 namespace dnn
 {
-void getConvolutionKernelParams(const LayerParams &params, int &kernelH, int &kernelW, int &padT, int &padL, int &padB, int &padR,
-                                int &strideH, int &strideW, int &dilationH, int &dilationW, cv::String& padMode);
+void getConvolutionKernelParams(const LayerParams &params, std::vector<size_t>& kernel, std::vector<size_t>& pads_begin,
+                                std::vector<size_t>& pads_end, std::vector<size_t>& strides, std::vector<size_t>& dilations, cv::String &padMode);

-void getPoolingKernelParams(const LayerParams &params, int &kernelH, int &kernelW, bool &globalPooling,
-                            int &padT, int &padL, int &padB, int &padR, int &strideH, int &strideW, cv::String& padMode);
+void getPoolingKernelParams(const LayerParams &params, std::vector<size_t>& kernel, bool &globalPooling,
+                            std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end, std::vector<size_t>& strides, cv::String &padMode);

-void getConvPoolOutParams(const Size& inp, const Size &kernel,
-                          const Size &stride, const String &padMode,
-                          const Size &dilation, Size& out);
+void getConvPoolOutParams(const std::vector<int>& inp, const std::vector<size_t>& kernel,
+                          const std::vector<size_t>& stride, const String &padMode,
+                          const std::vector<size_t>& dilation, std::vector<int>& out);

-void getConvPoolPaddings(const Size& inp, const Size& out,
-                         const Size &kernel, const Size &stride,
-                         const String &padMode, const Size &dilation, int &padT, int &padL, int &padB, int &padR);
+void getConvPoolPaddings(const std::vector<int>& inp, const std::vector<int>& out,
+                         const std::vector<size_t>& kernel, const std::vector<size_t>& strides,
+                         const String &padMode, const std::vector<size_t>& dilation,
+                         std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end);
}
}
...
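To make the new vector-based signatures concrete, here is a hypothetical call computing the output spatial shape of a 3D pooling with SAME padding. These helpers live in a private header, so this is illustrative rather than user-facing code, and the input sizes are made up:

    std::vector<int>    inp    = {8, 32, 32};       // input D, H, W
    std::vector<size_t> kernel = {2, 2, 2};
    std::vector<size_t> stride = {2, 2, 2};
    std::vector<size_t> dilation(kernel.size(), 1); // pooling uses unit dilation
    std::vector<int> out;
    cv::dnn::getConvPoolOutParams(inp, kernel, stride, "SAME", dilation, out);
    // For SAME padding each output dim is ceil(inp[i] / stride[i]),
    // so out should be {4, 16, 16}.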
@@ -72,6 +72,7 @@ public:
         computeMaxIdx = true;
         globalPooling = false;
         stride = Size(1, 1);
+        pad_t = pad_l = pad_b = pad_r = 0;

         if (params.has("pool") || params.has("kernel_size") ||
             params.has("kernel_w") || params.has("kernel_h"))
@@ -86,11 +87,17 @@ public:
             else
                 CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");

-            getPoolingKernelParams(params, kernel.height, kernel.width, globalPooling,
-                                   pad_t, pad_l, pad_b, pad_r, stride.height, stride.width, padMode);
-            pad.width = pad_l;
-            pad.height = pad_t;
+            getPoolingKernelParams(params, kernel_size, globalPooling, pads_begin, pads_end, strides, padMode);
+            if (kernel_size.size() == 2) {
+                kernel = Size(kernel_size[1], kernel_size[0]);
+                stride = Size(strides[1], strides[0]);
+                pad = Size(pads_begin[1], pads_begin[0]);
+
+                pad_t = pads_begin[0];
+                pad_l = pads_begin[1];
+                pad_b = pads_end[0];
+                pad_r = pads_end[1];
+            }
         }
         else if (params.has("pooled_w") || params.has("pooled_h"))
         {
@@ -125,17 +132,24 @@ public:
         CV_Assert(!inputs.empty());

-        cv::Size inp(inputs[0].size[3], inputs[0].size[2]),
-                 out(outputs[0].size[3], outputs[0].size[2]);
-
-        if(globalPooling)
-        {
-            kernel = inp;
+        std::vector<int> inp;
+        std::vector<int> out;
+        for (int i = 2; i < inputs[0].dims; i++) {
+            inp.push_back(inputs[0].size[i]);
+            out.push_back(outputs[0].size[i]);
+        }
+        if (globalPooling) {
+            kernel = Size(inp[1], inp[0]);
+            kernel_size = std::vector<size_t>(inp.begin(), inp.end());
         }

-        getConvPoolPaddings(inp, out, kernel, stride, padMode, Size(1, 1), pad_t, pad_l, pad_b, pad_r);
-        pad.width = pad_l;
-        pad.height = pad_t;
+        getConvPoolPaddings(inp, out, kernel_size, strides, padMode, std::vector<size_t>(kernel_size.size(), 1), pads_begin, pads_end);
+        if (pads_begin.size() == 2) {
+            pad_t = pads_begin[0];
+            pad_l = pads_begin[1];
+            pad_b = pads_end[0];
+            pad_r = pads_end[1];
+        }

 #ifdef HAVE_OPENCL
         poolOp.release();
@@ -148,6 +162,8 @@ public:
         if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
         {
 #ifdef HAVE_INF_ENGINE
+            if (kernel_size.size() == 3)
+                return preferableTarget == DNN_TARGET_CPU;
             if (preferableTarget == DNN_TARGET_MYRIAD) {
                 if (type == MAX && (pad_l == 1 && pad_t == 1) && stride == Size(2, 2) ) {
                     return !isMyriadX();
@@ -161,9 +177,9 @@ public:
 #endif
         }
         else
-            return backendId == DNN_BACKEND_OPENCV ||
-                   (backendId == DNN_BACKEND_HALIDE && haveHalide() &&
-                    (type == MAX || (type == AVE && !pad_t && !pad_l && !pad_b && !pad_r)));
+            return (kernel_size.empty() || kernel_size.size() == 2) && (backendId == DNN_BACKEND_OPENCV ||
+                   (backendId == DNN_BACKEND_HALIDE && haveHalide() &&
+                    (type == MAX || (type == AVE && !pad_t && !pad_l && !pad_b && !pad_r))));
     }

 #ifdef HAVE_OPENCL
@@ -269,10 +285,12 @@ public:
         if (type == MAX || type == AVE)
         {
             InferenceEngine::Builder::PoolingLayer ieLayer(name);
-            ieLayer.setKernel({(size_t)kernel.height, (size_t)kernel.width});
-            ieLayer.setStrides({(size_t)stride.height, (size_t)stride.width});
-            ieLayer.setPaddingsBegin({(size_t)pad_t, (size_t)pad_l});
-            ieLayer.setPaddingsEnd({(size_t)pad_b, (size_t)pad_r});
+
+            ieLayer.setKernel(kernel_size);
+            ieLayer.setStrides(strides);
+            ieLayer.setPaddingsBegin(pads_begin);
+            ieLayer.setPaddingsEnd(pads_end);
             ieLayer.setPoolingType(type == MAX ?
                        InferenceEngine::Builder::PoolingLayer::PoolingType::MAX :
                        InferenceEngine::Builder::PoolingLayer::PoolingType::AVG);
@@ -916,59 +934,56 @@ public:
                          std::vector<MatShape> &internals) const CV_OVERRIDE
     {
         CV_Assert(inputs.size() != 0);
-        Size in(inputs[0][3], inputs[0][2]), out;
+
+        std::vector<int> inpShape(inputs[0].begin() + 2, inputs[0].end());
+        std::vector<int> outShape(inputs[0].begin(), inputs[0].begin() + 2);

         if (globalPooling)
         {
-            out.height = 1;
-            out.width = 1;
+            outShape.push_back(1);
+            outShape.push_back(1);
         }
         else if (type == ROI || type == PSROI)
         {
-            out.height = pooledSize.height;
-            out.width = pooledSize.width;
+            outShape.push_back(pooledSize.height);
+            outShape.push_back(pooledSize.width);
         }
         else if (padMode.empty())
        {
-            float height = (float)(in.height + pad_t + pad_b - kernel.height) / stride.height;
-            float width = (float)(in.width + pad_l + pad_r - kernel.width) / stride.width;
-            out.height = 1 + (ceilMode ? ceil(height) : floor(height));
-            out.width = 1 + (ceilMode ? ceil(width) : floor(width));
+            for (int i = 0; i < kernel_size.size(); i++) {
+                float dst = (float)(inpShape[i] + pads_begin[i] + pads_end[i] - kernel_size[i]) / strides[i];
+                outShape.push_back(1 + (ceilMode ? ceil(dst) : floor(dst)));
+            }

-            if (pad_r || pad_b)
-            {
-                // If we have padding, ensure that the last pooling starts strictly
-                // inside the image (instead of at the padding); otherwise clip the last.
-                if ((out.height - 1) * stride.height >= in.height + pad_b)
-                    --out.height;
-                if ((out.width - 1) * stride.width >= in.width + pad_r)
-                    --out.width;
-                CV_Assert((out.height - 1) * stride.height < in.height + pad_b);
-                CV_Assert((out.width - 1) * stride.width < in.width + pad_r);
+            // If we have padding, ensure that the last pooling starts strictly
+            // inside the image (instead of at the padding); otherwise clip the last.
+            for (int i = 0; i < pads_end.size(); i++) {
+                if (pads_end[i] && (outShape[2 + i] - 1) * strides[i] >= inpShape[i] + pads_end[i]) {
+                    --outShape[2 + i];
+                    CV_Assert((outShape[2 + i] - 1) * strides[i] < inpShape[i] + pads_end[i]);
+                }
             }
         }
         else
         {
-            getConvPoolOutParams(in, kernel, stride, padMode, Size(1, 1), out);
+            getConvPoolOutParams(inpShape, kernel_size, strides, padMode, std::vector<size_t>(kernel_size.size(), 1), outShape);
         }
-
-        int dims[] = {inputs[0][0], inputs[0][1], out.height, out.width};
         if (type == ROI)
         {
             CV_Assert(inputs.size() == 2);
-            dims[0] = inputs[1][0];  // Number of proposals;
+            outShape[0] = inputs[1][0];  // Number of proposals;
         }
         else if (type == PSROI)
         {
             CV_Assert(inputs.size() == 2);
             CV_Assert(psRoiOutChannels * pooledSize.width * pooledSize.height == inputs[0][1]);
-            dims[0] = inputs[1][0];  // Number of proposals;
-            dims[1] = psRoiOutChannels;
+            outShape[0] = inputs[1][0];  // Number of proposals;
+            outShape[1] = psRoiOutChannels;
         }
         int numOutputs = requiredOutputs ? requiredOutputs : (type == MAX ? 2 : 1);
         CV_Assert(numOutputs == 1 || (numOutputs == 2 && type == MAX));
-        outputs.assign(numOutputs, shape(dims, 4));
+
+        outputs.assign(numOutputs, outShape);
         return false;
     }
...
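The per-dimension loop above generalizes the familiar pooling output-size formula, out = 1 + (in + pad_begin + pad_end - kernel) / stride with ceil or floor rounding, followed by the Caffe-style clip that keeps the last window from starting entirely in the padding. A self-contained check of that arithmetic with invented numbers:

    #include <cmath>
    #include <cstdio>

    int main() {
        // One spatial dim: in = 7, kernel = 3, pads = 1/1, stride = 2, floor mode.
        int inp = 7, kernel = 3, pad_begin = 1, pad_end = 1, stride = 2;
        bool ceilMode = false;
        float dst = (float)(inp + pad_begin + pad_end - kernel) / stride;  // 3.0
        int out = 1 + (int)(ceilMode ? std::ceil(dst) : std::floor(dst));  // 4
        // Clip: the last window must start inside the padded input.
        if (pad_end && (out - 1) * stride >= inp + pad_end)
            --out;  // not triggered here: (4-1)*2 = 6 < 7+1 = 8
        std::printf("out = %d\n", out);  // out = 4
        return 0;
    }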
@@ -184,6 +184,12 @@ std::map<std::string, Mat> ONNXImporter::getGraphTensors(
     return layers_weights;
 }

+static DictValue parse(const ::google::protobuf::RepeatedField< ::google::protobuf::int64>& src) {
+    std::vector<int32_t> dst(src.size());
+    convertInt64ToInt32(src, dst, src.size());
+    return DictValue::arrayInt(&dst[0], src.size());
+}
+
 LayerParams ONNXImporter::getLayerParams(const opencv_onnx::NodeProto& node_proto)
 {
     LayerParams lp;
@@ -194,15 +200,13 @@ LayerParams ONNXImporter::getLayerParams(const opencv_onnx::NodeProto& node_proto)
         if(attribute_name == "kernel_shape")
         {
-            CV_Assert(attribute_proto.ints_size() == 2);
-            lp.set("kernel_h", saturate_cast<int32_t>(attribute_proto.ints(0)));
-            lp.set("kernel_w", saturate_cast<int32_t>(attribute_proto.ints(1)));
+            CV_Assert(attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 3);
+            lp.set("kernel_size", parse(attribute_proto.ints()));
         }
         else if(attribute_name == "strides")
         {
-            CV_Assert(attribute_proto.ints_size() == 2);
-            lp.set("stride_h", saturate_cast<int32_t>(attribute_proto.ints(0)));
-            lp.set("stride_w", saturate_cast<int32_t>(attribute_proto.ints(1)));
+            CV_Assert(attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 3);
+            lp.set("stride", parse(attribute_proto.ints()));
         }
         else if(attribute_name == "pads")
         {
@@ -225,11 +229,8 @@ LayerParams ONNXImporter::getLayerParams(const opencv_onnx::NodeProto& node_proto)
             else
             {
                 // Convolution or pooling.
-                CV_Assert(attribute_proto.ints_size() == 4);
-                lp.set("pad_t", saturate_cast<int32_t>(attribute_proto.ints(0)));
-                lp.set("pad_l", saturate_cast<int32_t>(attribute_proto.ints(1)));
-                lp.set("pad_b", saturate_cast<int32_t>(attribute_proto.ints(2)));
-                lp.set("pad_r", saturate_cast<int32_t>(attribute_proto.ints(3)));
+                CV_Assert(attribute_proto.ints_size() == 4 || attribute_proto.ints_size() == 6);
+                lp.set("pad", parse(attribute_proto.ints()));
             }
         }
         else if(attribute_name == "auto_pad")
@@ -243,9 +244,8 @@ LayerParams ONNXImporter::getLayerParams(const opencv_onnx::NodeProto& node_proto)
         }
         else if(attribute_name == "dilations")
         {
-            CV_Assert(attribute_proto.ints_size() == 2);
-            lp.set("dilation_h", saturate_cast<int32_t>(attribute_proto.ints(0)));
-            lp.set("dilation_w", saturate_cast<int32_t>(attribute_proto.ints(1)));
+            CV_Assert(attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 3);
+            lp.set("dilation", parse(attribute_proto.ints()));
         }
         else if (attribute_proto.has_i())
         {
@@ -270,10 +270,7 @@ LayerParams ONNXImporter::getLayerParams(const opencv_onnx::NodeProto& node_proto)
         }
         else if (attribute_proto.ints_size() > 0)
         {
-            const ::google::protobuf::RepeatedField< ::google::protobuf::int64> src = attribute_proto.ints();
-            std::vector<int32_t> dst(attribute_proto.ints_size());
-            convertInt64ToInt32(src, dst, attribute_proto.ints_size());
-            lp.set(attribute_proto.name(), DictValue::arrayInt(&dst[0], attribute_proto.ints_size()));
+            lp.set(attribute_proto.name(), parse(attribute_proto.ints()));
         }
         else if (attribute_proto.has_t())
         {
@@ -305,19 +302,6 @@ Mat ONNXImporter::getBlob(const opencv_onnx::NodeProto& node_proto,
     return constBlob->second;
 }

-bool ONNXImporter::isCeilMode(const LayerParams& layerParams) {
-    if (!layerParams.has("pad_mode")) {
-        if (layerParams.has("pad_h")) {
-            return layerParams.get<int>("pad_h") != layerParams.get<int>("pad_b") ||
-                   layerParams.get<int>("pad_w") != layerParams.get<int>("pad_r");
-        }
-        else
-            return false;  // all pads == 0
-    }
-    return true;
-}
-
 void ONNXImporter::populateNet(Net dstNet)
 {
     CV_Assert(model_proto.has_graph());
@@ -384,13 +368,13 @@ void ONNXImporter::populateNet(Net dstNet)
         {
             layerParams.type = "Pooling";
             layerParams.set("pool", "MAX");
-            layerParams.set("ceil_mode", isCeilMode(layerParams));
+            layerParams.set("ceil_mode", layerParams.has("pad_mode"));
         }
         else if (layer_type == "AveragePool")
         {
             layerParams.type = "Pooling";
             layerParams.set("pool", "AVE");
-            layerParams.set("ceil_mode", isCeilMode(layerParams));
+            layerParams.set("ceil_mode", layerParams.has("pad_mode"));
             layerParams.set("ave_pool_padded_area", framework_name == "pytorch");
         }
         else if (layer_type == "GlobalAveragePool" || layer_type == "GlobalMaxPool")
@@ -600,8 +584,9 @@ void ONNXImporter::populateNet(Net dstNet)
                 if (outShape.size() != 4)
                     CV_Error(Error::StsNotImplemented, "Output shape must have 4 elements.");

-                const int strideY = layerParams.get<int>("stride_h", 1);
-                const int strideX = layerParams.get<int>("stride_w", 1);
+                DictValue stride = layerParams.get("stride");
+                const int strideY = stride.getIntValue(0);
+                const int strideX = stride.getIntValue(1);
                 const int outH = outShape.getIntValue(2);
                 const int outW = outShape.getIntValue(3);
@@ -612,15 +597,13 @@ void ONNXImporter::populateNet(Net dstNet)
                 }
                 else if (layerParams.get<String>("pad_mode") == "VALID")
                 {
-                    if (!layerParams.has("kernel_h") || !layerParams.has("kernel_w"))
+                    if (!layerParams.has("kernel_size"))
                         CV_Error(Error::StsNotImplemented,
-                                 "Required attributes 'kernel_h' and 'kernel_w' are not present.");
-
-                    int kernelH = layerParams.get<int>("kernel_h");
-                    int kernelW = layerParams.get<int>("kernel_w");
-
-                    layerParams.set("adj_w", (outW - kernelW) % strideX);
-                    layerParams.set("adj_h", (outH - kernelH) % strideY);
+                                 "Required attribute 'kernel_size' is not present.");
+
+                    DictValue kernel = layerParams.get("kernel_size");
+                    layerParams.set("adj_h", (outH - kernel.getIntValue(0)) % strideY);
+                    layerParams.set("adj_w", (outW - kernel.getIntValue(1)) % strideX);
                 }
             }
             else if (layerParams.has("output_padding"))
...
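The static parse() helper introduced at the top of this file deduplicates the int64-to-int32 conversion that the generic attribute branch used to inline: every repeated-int64 ONNX attribute now becomes a single DictValue array. A hedged sketch of the resulting flow for a 3D node, with invented attribute values:

    // An ONNX Conv node with kernel_shape = [3, 3, 3] now yields:
    //   lp.set("kernel_size", parse(attribute_proto.ints()));
    // and a layer reads it back element-wise:
    //   DictValue kernel = lp.get("kernel_size");
    //   kernel.size()         == 3
    //   kernel.getIntValue(0) == 3   // depth comes first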
@@ -51,6 +51,7 @@ enum DataLayout
 {
     DATA_LAYOUT_NHWC,
     DATA_LAYOUT_NCHW,
+    DATA_LAYOUT_NDHWC,
     DATA_LAYOUT_UNKNOWN,
     DATA_LAYOUT_PLANAR  // 2-dimensional outputs (matmul, flatten, reshape to 2d)
 };
@@ -258,6 +259,8 @@ static int getDataLayout(const tensorflow::NodeDef& layer)
         return DATA_LAYOUT_NHWC;
     else if (format == "NCHW" || format == "channels_first")
         return DATA_LAYOUT_NCHW;
+    else if (format == "NDHWC")
+        return DATA_LAYOUT_NDHWC;
     else
         CV_Error(Error::StsParseError, "Unknown data_format value: " + format);
 }
@@ -281,21 +284,34 @@ void setStrides(LayerParams &layerParams, const tensorflow::NodeDef &layer)
     if (hasLayerAttr(layer, "strides"))
     {
         const tensorflow::AttrValue& val = getLayerAttr(layer, "strides");
-        int dimX, dimY, dimC;
+        int dimX, dimY, dimC, dimD;
         int layout = getDataLayout(layer);
         if (layout == DATA_LAYOUT_NCHW)
         {
             dimC = 1; dimY = 2; dimX = 3;
         }
+        else if (layout == DATA_LAYOUT_NDHWC)
+        {
+            dimD = 1; dimY = 2; dimX = 3; dimC = 4;
+        }
         else
         {
             dimY = 1; dimX = 2; dimC = 3;
         }
-        if (val.list().i_size() != 4 ||
+        if (!(val.list().i_size() == 4 || val.list().i_size() == 5) ||
             val.list().i(0) != 1 || val.list().i(dimC) != 1)
             CV_Error(Error::StsError, "Unsupported strides");
-        layerParams.set("stride_h", static_cast<int>(val.list().i(dimY)));
-        layerParams.set("stride_w", static_cast<int>(val.list().i(dimX)));
+        if (layout == DATA_LAYOUT_NDHWC) {
+            int strides[] = {static_cast<int>(val.list().i(dimD)),
+                             static_cast<int>(val.list().i(dimY)),
+                             static_cast<int>(val.list().i(dimX))};
+            layerParams.set("stride", DictValue::arrayInt(strides, 3));
+        }
+        else
+        {
+            layerParams.set("stride_h", static_cast<int>(val.list().i(dimY)));
+            layerParams.set("stride_w", static_cast<int>(val.list().i(dimX)));
+        }
     }
 }
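For NDHWC nodes, the strides attribute above (and the ksize attribute in the mirrored setKSize hunk that follows) carries five entries: batch first, channels last, both required to be 1, with the three middle entries becoming the layer's 3D parameters. For example, with invented values:

    // A TF MaxPool3D node with data_format == "NDHWC" and ksize = {1, 2, 2, 2, 1}:
    //   val.list().i(0)    == 1   (batch, required)
    //   val.list().i(dimC) == 1   (channels, dimC == 4, required)
    // setKSize() reads indices dimD = 1, dimY = 2, dimX = 3 and stores
    //   kernel_size = {2, 2, 2} as a DictValue array.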
@@ -318,21 +334,35 @@ void setKSize(LayerParams &layerParams, const tensorflow::NodeDef &layer)
     if (hasLayerAttr(layer, "ksize"))
     {
         const tensorflow::AttrValue& val = getLayerAttr(layer, "ksize");
-        int dimX, dimY, dimC;
+        int dimX, dimY, dimC, dimD;
         int layout = getDataLayout(layer);
         if (layout == DATA_LAYOUT_NCHW)
         {
             dimC = 1; dimY = 2; dimX = 3;
         }
+        else if (layout == DATA_LAYOUT_NDHWC)
+        {
+            dimD = 1; dimY = 2; dimX = 3; dimC = 4;
+        }
         else
         {
             dimY = 1; dimX = 2; dimC = 3;
         }
-        if (val.list().i_size() != 4 ||
+        if (!(val.list().i_size() == 4 || val.list().i_size() == 5) ||
             val.list().i(0) != 1 || val.list().i(dimC) != 1)
             CV_Error(Error::StsError, "Unsupported ksize");
-        layerParams.set("kernel_h", static_cast<int>(val.list().i(dimY)));
-        layerParams.set("kernel_w", static_cast<int>(val.list().i(dimX)));
+
+        if (layout == DATA_LAYOUT_NDHWC) {
+            int kernel[] = {static_cast<int>(val.list().i(dimD)),
+                            static_cast<int>(val.list().i(dimY)),
+                            static_cast<int>(val.list().i(dimX))};
+            layerParams.set("kernel_size", DictValue::arrayInt(kernel, 3));
+        }
+        else
+        {
+            layerParams.set("kernel_h", static_cast<int>(val.list().i(dimY)));
+            layerParams.set("kernel_w", static_cast<int>(val.list().i(dimX)));
+        }
     }
     else
     {
@@ -456,12 +486,26 @@ void TFImporter::kernelFromTensor(const tensorflow::TensorProto &tensor, Mat &dstBlob)
     // TODO: other blob types
     CV_Assert(tensor.dtype() == tensorflow::DT_FLOAT ||
               tensor.dtype() == tensorflow::DT_HALF);
-    CV_Assert(dims == 4);
-
-    // REORDER kernel HWIO to OIHW
-    swap(shape[0], shape[2]); // IWHO
-    swap(shape[1], shape[3]); // IOHW
-    swap(shape[0], shape[1]); // OIHW
+    CV_Assert(dims == 4 || dims == 5);
+
+    int out_c, input_c, depth, height, width;
+    if (dims == 4)
+    {
+        // REORDER kernel HWIO to OIHW
+        swap(shape[0], shape[2]); // IWHO
+        swap(shape[1], shape[3]); // IOHW
+        swap(shape[0], shape[1]); // OIHW
+        depth = 1; height = shape[2]; width = shape[3];
+    }
+    else
+    {
+        // REORDER kernel DHWIO to OIDHW
+        swap(shape[0], shape[4]); // OHWID
+        swap(shape[1], shape[3]); // OIWHD
+        swap(shape[2], shape[4]); // OIDHW
+        depth = shape[2]; height = shape[3]; width = shape[4];
+    }
+    out_c = shape[0]; input_c = shape[1];

     dstBlob.create(shape, CV_32F);
@@ -472,17 +516,20 @@ void TFImporter::kernelFromTensor(const tensorflow::TensorProto &tensor, Mat &dstBlob)
     float *dstData = dstBlob.ptr<float>();
     const float *data = reinterpret_cast<const float*>(tensorContent.data);

-    int out_c = shape[0], input_c = shape[1], height = shape[2], width = shape[3];
-    int total = out_c*input_c*height*width;
-    for(int i_oc = 0; i_oc < out_c; i_oc++) {
-        for(int i_ic = 0; i_ic < input_c; i_ic++) {
-            for(int i_h = 0; i_h < height; i_h++) {
-                for(int i_w = 0; i_w < width; i_w++) {
-                    int dst_i = input_c*height*width*i_oc + height*width*i_ic + width*i_h + i_w;
-                    int src_i = out_c*input_c*width*i_h + out_c*input_c*i_w + out_c*i_ic + i_oc;
-                    CV_Assert(dst_i < total);
-                    CV_Assert(src_i < total);
-                    dstData[dst_i] = data[src_i];
+    int total = out_c * input_c * depth * height * width;
+    for (int i_oc = 0; i_oc < out_c; i_oc++) {
+        for (int i_ic = 0; i_ic < input_c; i_ic++) {
+            for (int i_d = 0; i_d < depth; i_d++) {
+                for (int i_h = 0; i_h < height; i_h++) {
+                    for (int i_w = 0; i_w < width; i_w++) {
+                        int dst_i = input_c * depth * height * width * i_oc +
+                                    depth * height * width * i_ic + height * width * i_d + width * i_h + i_w;
+                        int src_i = out_c * input_c * width * height * i_d +
+                                    out_c * input_c * width * i_h + out_c * input_c * i_w + out_c * i_ic + i_oc;
+                        CV_Assert(dst_i < total);
+                        CV_Assert(src_i < total);
+                        dstData[dst_i] = data[src_i];
+                    }
                 }
             }
         }
@@ -745,7 +792,7 @@ void TFImporter::populateNet(Net dstNet)
         int predictedLayout = predictOutputDataLayout(net, layer, data_layouts);
         data_layouts[name] = predictedLayout;

-        if (type == "Conv2D" || type == "SpaceToBatchND" || type == "DepthwiseConv2dNative" || type == "Pad")
+        if (type == "Conv2D" || type == "SpaceToBatchND" || type == "DepthwiseConv2dNative" || type == "Pad" || type == "Conv3D")
         {
             // The first node of dilated convolution subgraph.
             // Extract input node, dilation rate and paddings.
@@ -917,9 +964,9 @@ void TFImporter::populateNet(Net dstNet)
             {
                 layerParams.blobs[0] = sharedWeightsIt->second;
             }
+            Mat weights = layerParams.blobs[0];
+            layerParams.set("kernel_size", DictValue::arrayInt(&weights.size[2], weights.dims - 2));

-            layerParams.set("kernel_h", layerParams.blobs[0].size[2]);
-            layerParams.set("kernel_w", layerParams.blobs[0].size[3]);
             layerParams.set("num_output", layerParams.blobs[0].size[0]);

             setStrides(layerParams, layer);
@@ -1290,7 +1337,7 @@ void TFImporter::populateNet(Net dstNet)
             connect(layer_id, dstNet, inp, id, ii - from);
         }
     }
-    else if (type == "MaxPool")
+    else if (type == "MaxPool" || type == "MaxPool3D")
     {
         layerParams.set("pool", "max");
@@ -1303,11 +1350,10 @@ void TFImporter::populateNet(Net dstNet)
         connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, layer.input_size());
     }
-    else if (type == "AvgPool")
+    else if (type == "AvgPool" || type == "AvgPool3D")
     {
         layerParams.set("pool", "ave");
         layerParams.set("ave_pool_padded_area", false);
-
         setKSize(layerParams, layer);
         setStrides(layerParams, layer);
         setPadding(layerParams, layer);
...
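The three swaps in kernelFromTensor implement the DHWIO-to-OIDHW axis permutation on the shape array. A quick standalone check of that permutation, illustrative rather than part of the patch:

    #include <utility>
    #include <cassert>

    int main() {
        // Start from TensorFlow's 3D kernel layout D, H, W, I, O.
        char shape[5] = {'D', 'H', 'W', 'I', 'O'};
        std::swap(shape[0], shape[4]); // O H W I D
        std::swap(shape[1], shape[3]); // O I W H D
        std::swap(shape[2], shape[4]); // O I D H W
        assert(shape[0] == 'O' && shape[1] == 'I' && shape[2] == 'D' &&
               shape[3] == 'H' && shape[4] == 'W');
        return 0;
    }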
@@ -81,6 +81,13 @@ TEST_P(Test_ONNX_layers, Convolution)
     testONNXModels("convolution");
 }

+TEST_P(Test_ONNX_layers, Convolution3D)
+{
+    if (backend != DNN_BACKEND_INFERENCE_ENGINE || target != DNN_TARGET_CPU)
+        throw SkipTestException("Only DLIE backend on CPU is supported");
+    testONNXModels("conv3d");
+    testONNXModels("conv3d_bias");
+}
+
 TEST_P(Test_ONNX_layers, Two_convolution)
 {
@@ -138,6 +145,20 @@ TEST_P(Test_ONNX_layers, AveragePooling)
     testONNXModels("average_pooling");
 }

+TEST_P(Test_ONNX_layers, MaxPooling3D)
+{
+    if (backend != DNN_BACKEND_INFERENCE_ENGINE || target != DNN_TARGET_CPU)
+        throw SkipTestException("Only DLIE backend on CPU is supported");
+    testONNXModels("max_pool3d");
+}
+
+TEST_P(Test_ONNX_layers, AvePooling3D)
+{
+    if (backend != DNN_BACKEND_INFERENCE_ENGINE || target != DNN_TARGET_CPU)
+        throw SkipTestException("Only DLIE backend on CPU is supported");
+    testONNXModels("ave_pool3d");
+}
+
 TEST_P(Test_ONNX_layers, BatchNormalization)
 {
     testONNXModels("batch_norm");
...
@@ -131,6 +131,13 @@ TEST_P(Test_TensorFlow_layers, conv)
     runTensorFlowNet("conv_pool_nchw");
 }

+TEST_P(Test_TensorFlow_layers, Convolution3D)
+{
+    if (backend != DNN_BACKEND_INFERENCE_ENGINE || target != DNN_TARGET_CPU)
+        throw SkipTestException("Only DLIE backend on CPU is supported");
+    runTensorFlowNet("conv3d");
+}
+
 TEST_P(Test_TensorFlow_layers, padding)
 {
     runTensorFlowNet("padding_valid");
@@ -212,6 +219,20 @@ TEST_P(Test_TensorFlow_layers, ave_pool_same)
     runTensorFlowNet("ave_pool_same");
 }

+TEST_P(Test_TensorFlow_layers, MaxPooling3D)
+{
+    if (backend != DNN_BACKEND_INFERENCE_ENGINE || target != DNN_TARGET_CPU)
+        throw SkipTestException("Only DLIE backend on CPU is supported");
+    runTensorFlowNet("max_pool3d");
+}
+
+TEST_P(Test_TensorFlow_layers, AvePooling3D)
+{
+    if (backend != DNN_BACKEND_INFERENCE_ENGINE || target != DNN_TARGET_CPU)
+        throw SkipTestException("Only DLIE backend on CPU is supported");
+    runTensorFlowNet("ave_pool3d");
+}
+
 TEST_P(Test_TensorFlow_layers, deconvolution)
 {
     runTensorFlowNet("deconvolution");
...