Commit 6f5e3ac7 authored by Louis Feng, committed by Scott Cyphers

ngraph-1676 batch dot fusion (#1071)

* batch dot pattern wip.

* batch dot pattern wip.

* added batch dot op.

* batch dot compute testing.

* correct gemm parameters.

* renaming matrix fusions passes and update tests.

* clean up.

* clang format.

* more clean ups.

* clang format.

* added CPUBatchDotFusion to default cpu passes.

* added missing header.

* added element type check.
parent 16d16df7
......@@ -34,6 +34,7 @@ set(SRC
mkldnn_emitter.cpp
mkldnn_invoke.cpp
mkldnn_utils.cpp
op/batch_dot.cpp
op/batch_norm_relu.cpp
op/conv_bias.cpp
op/conv_relu.cpp
......@@ -50,7 +51,7 @@ set(SRC
pass/cpu_layout.cpp
pass/cpu_post_layout_optimizations.cpp
pass/cpu_rnn_fusion.cpp
pass/cpu_rnn_mat_fusion.cpp
pass/cpu_mat_fusion.cpp
pass/cpu_shuffle_folding.cpp
pass/cpu_workspace_insertion.cpp
)
......
......@@ -93,6 +93,7 @@
#include "ngraph/runtime/cpu/cpu_kernel_emitters.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/batch_dot.hpp"
#include "ngraph/runtime/cpu/op/batch_norm_relu.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
......@@ -370,6 +371,95 @@ namespace ngraph
writer.block_end();
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::BatchDot)
{
const ngraph::op::BatchDot* batch_dot =
static_cast<const ngraph::op::BatchDot*>(node);
auto mat_a = args[0];
auto mat_b = args[1];
auto mat_c = out[0];
const Shape& shape_a = mat_a.get_shape();
const Shape& shape_b = mat_b.get_shape();
static const char* cblas_transpose = "cblas::Transpose::Transpose";
static const char* cblas_no_transpose = "cblas::Transpose::None";
size_t m = shape_a[1];
size_t k = shape_a[2];
size_t n = shape_b[2];
size_t lda = std::max(1UL, k);
size_t ldb = std::max(1UL, n);
const char* transpose_a = cblas_no_transpose;
const char* transpose_b = cblas_no_transpose;
if (batch_dot->get_is_a_transposed())
{
transpose_a = cblas_transpose;
m = shape_a[2];
k = shape_a[1];
lda = std::max(1UL, m);
}
if (batch_dot->get_is_b_transposed())
{
transpose_b = cblas_transpose;
n = shape_b[1];
ldb = std::max(1UL, k);
}
size_t ldc = std::max(1UL, n);
const size_t offset_a = m * k;
const size_t offset_b = k * n;
const size_t offset_c = m * n;
writer.block_begin();
const size_t group_count = 1;
const size_t group_size = shape_a[0];
auto populate_array =
[&writer](const std::string& var, size_t size, size_t offset) {
for (size_t i = 0; i < size; ++i)
{
if (i < size - 1)
{
writer << var << "+" << i * offset << ", ";
}
else
{
writer << var << "+" << i * offset;
}
}
};
writer << "cblas::Transpose transa_array[] = {" << transpose_a << "};\n";
writer << "cblas::Transpose transb_array[] = {" << transpose_b << "};\n";
writer << "int64_t m_array[] = {" << m << "};\n";
writer << "int64_t n_array[] = {" << n << "};\n";
writer << "int64_t k_array[] = {" << k << "};\n";
writer << "float alpha_array[] = {1.0f};\n";
writer << "std::vector<const float*> a{";
populate_array(mat_a.get_name(), group_size, offset_a);
writer << "};\n";
writer << "const float** a_array = &a[0];\n";
writer << "int64_t lda_array[] = {" << lda << "};\n";
writer << "std::vector<const float*> b{";
populate_array(mat_b.get_name(), group_size, offset_b);
writer << "};\n";
writer << "const float** b_array = &b[0];\n";
writer << "int64_t ldb_array[] = {" << ldb << "};\n";
writer << "float beta_array[] = {0.0f};\n";
writer << "std::vector<float*> c{";
populate_array(mat_c.get_name(), group_size, offset_c);
writer << "};\n";
writer << "float** c_array = &c[0];\n";
writer << "int64_t ldc_array[] = {" << ldc << "};\n";
writer << "int64_t group_size[] = {" << group_size << "};\n";
writer << "cblas_sgemm_batch(cblas::Layout::RowMajor, ";
writer << "transa_array, transb_array, m_array, n_array, k_array, \n";
writer << "alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, \n";
writer << "c_array, ldc_array, " << group_count << ", group_size);\n";
writer.block_end();
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::Lstm)
......
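Note: as a rough, illustrative sketch (not the literal emitter output), the BatchDot emitter above generates code along the following lines for a {3, 2, 2} x {3, 2, 2} case with no transposes (group_size = 3). The buffer names arg0, arg1 and out0 are hypothetical stand-ins for the kernel's input and output tensors, and the call goes through the cblas_sgemm_batch wrapper declared later in this change (backed by MKL's batched GEMM).

// Illustrative only; mirrors the writer << statements above with values plugged in.
cblas::Transpose transa_array[] = {cblas::Transpose::None};
cblas::Transpose transb_array[] = {cblas::Transpose::None};
int64_t m_array[] = {2};
int64_t n_array[] = {2};
int64_t k_array[] = {2};
float alpha_array[] = {1.0f};
// One pointer per batch entry, offset by m*k, k*n, and m*n floats respectively.
std::vector<const float*> a{arg0 + 0, arg0 + 4, arg0 + 8};
const float** a_array = &a[0];
int64_t lda_array[] = {2};
std::vector<const float*> b{arg1 + 0, arg1 + 4, arg1 + 8};
const float** b_array = &b[0];
int64_t ldb_array[] = {2};
float beta_array[] = {0.0f};
std::vector<float*> c{out0 + 0, out0 + 4, out0 + 8};
float** c_array = &c[0];
int64_t ldc_array[] = {2};
int64_t group_size[] = {3};
cblas::cblas_sgemm_batch(cblas::Layout::RowMajor,
                         transa_array, transb_array, m_array, n_array, k_array,
                         alpha_array, a_array, lda_array, b_array, ldb_array,
                         beta_array, c_array, ldc_array, 1, group_size);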
......@@ -116,6 +116,7 @@
#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
#include "ngraph/runtime/cpu/cpu_tracing.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/batch_dot.hpp"
#include "ngraph/runtime/cpu/op/batch_norm_relu.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
......@@ -130,6 +131,7 @@
#include "ngraph/runtime/cpu/pass/cpu_concat_inputs.hpp"
#include "ngraph/runtime/cpu/pass/cpu_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_layout.hpp"
#include "ngraph/runtime/cpu/pass/cpu_mat_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_post_layout_optimizations.hpp"
#include "ngraph/runtime/cpu/pass/cpu_rnn_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_shuffle_folding.hpp"
......@@ -211,6 +213,7 @@ static const runtime::cpu::OpMap dispatcher{
{TI(ngraph::op::Multiply), &runtime::cpu::CPU_Emitter::emit<op::Multiply>},
{TI(ngraph::op::Parameter), &runtime::cpu::CPU_Emitter::nop},
{TI(ngraph::op::Abs), &runtime::cpu::CPU_Emitter::emit<op::Abs>},
{TI(ngraph::op::BatchDot), &runtime::cpu::CPU_Emitter::emit<op::BatchDot>},
{TI(ngraph::op::Concat), &runtime::cpu::CPU_Emitter::emit<op::Concat>},
{TI(ngraph::op::Divide), &runtime::cpu::CPU_Emitter::emit<op::Divide>},
{TI(ngraph::op::Equal), &runtime::cpu::CPU_Emitter::emit<op::Equal>},
......@@ -335,6 +338,7 @@ void runtime::cpu::CPU_ExternalFunction::compile()
pass_manager.register_pass<ngraph::pass::NopElimination>();
pass_manager.register_pass<runtime::cpu::pass::LSTMFusion>();
pass_manager.register_pass<runtime::cpu::pass::RNNFusion>();
pass_manager.register_pass<runtime::cpu::pass::CPUBatchDotFusion>();
pass_manager.register_pass<runtime::cpu::pass::ConcatInputs>();
pass_manager.register_pass<ngraph::pass::AlgebraicSimplification>();
pass_manager.register_pass<ngraph::pass::CommonSubexpressionElimination>();
......
......@@ -87,6 +87,23 @@ namespace cblas
const float beta,
float* C,
const int64_t ldc);
void cblas_sgemm_batch(const Layout Layout,
const Transpose* transa_array,
const Transpose* transb_array,
const int64_t* m_array,
const int64_t* n_array,
const int64_t* k_array,
const float* alpha_array,
const float** a_array,
const int64_t* lda_array,
const float** b_array,
const int64_t* ldb_array,
const float* beta_array,
float** c_array,
const int64_t* ldc_array,
const int64_t group_count,
const int64_t* group_size);
}
}
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "batch_dot.hpp"
#include "ngraph/log.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
shared_ptr<Node> op::BatchDot::copy_with_new_args(const NodeVector& new_args) const
{
if (new_args.size() != 2)
{
throw ngraph_error("Incorrect number of new arguments");
}
return make_shared<BatchDot>(new_args.at(0), new_args.at(1), m_transpose_a, m_transpose_b);
}
op::BatchDot::BatchDot(shared_ptr<Node> a, shared_ptr<Node> b, bool transpose_a, bool transpose_b)
: RequiresTensorViewArgs("BatchDot", vector<shared_ptr<Node>>{a, b})
, m_transpose_a(transpose_a)
, m_transpose_b(transpose_b)
{
const auto& shape_a = a->get_shape();
const auto& shape_b = b->get_shape();
if (shape_a.size() != 3 || shape_b.size() != 3)
{
NGRAPH_DEBUG << "shape_a = " << vector_to_string(shape_a);
NGRAPH_DEBUG << "shape_b = " << vector_to_string(shape_b);
throw ngraph_error("shape rank != 3 while creating BatchDot");
}
if (a->get_element_type() != b->get_element_type())
{
throw ngraph_error("input element types did not match while creating BatchDot");
}
size_t dot_dimension_a = (transpose_a) ? 1 : 2;
size_t dot_dimension_b = (transpose_b) ? 2 : 1;
NGRAPH_DEBUG << "dot_dimension_a = " << dot_dimension_a
<< " , dot_dimension_b = " << dot_dimension_b;
NGRAPH_DEBUG << "a shape = " << vector_to_string(shape_a)
<< " , b shape = " << vector_to_string(shape_b);
if (shape_a.at(dot_dimension_a) != shape_b.at(dot_dimension_b))
{
throw ngraph_error("product dimensions are not equal while creating BatchDot");
}
Shape dot_shape{
shape_a.at(0), shape_a.at(3 - dot_dimension_a), shape_b.at(3 - dot_dimension_b)};
NGRAPH_DEBUG << "dot_shape shape = " << vector_to_string(dot_shape);
add_output(a->get_element_type(), dot_shape);
}
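For reference, a minimal self-contained sketch (not part of this change) of the computation a BatchDot node describes for row-major tensors, consistent with the shape rules checked above: C[p] = op(A[p]) * op(B[p]) for each batch index p, where op() optionally transposes the 2-D slice.

#include <cstddef>
#include <vector>

// Reference-only sketch of BatchDot semantics. a is {batch, m, k} row-major
// ({batch, k, m} when transpose_a), b is {batch, k, n} ({batch, n, k} when
// transpose_b); the result is {batch, m, n}.
std::vector<float> batch_dot_reference(const std::vector<float>& a,
                                       const std::vector<float>& b,
                                       size_t batch, size_t m, size_t k, size_t n,
                                       bool transpose_a, bool transpose_b)
{
    std::vector<float> c(batch * m * n, 0.0f);
    for (size_t p = 0; p < batch; ++p)
        for (size_t i = 0; i < m; ++i)
            for (size_t j = 0; j < n; ++j)
            {
                float sum = 0.0f;
                for (size_t x = 0; x < k; ++x)
                {
                    size_t a_idx = transpose_a ? p * m * k + x * m + i : p * m * k + i * k + x;
                    size_t b_idx = transpose_b ? p * k * n + j * k + x : p * k * n + x * n + j;
                    sum += a[a_idx] * b[b_idx];
                }
                c[p * m * n + i * n + j] = sum;
            }
    return c;
}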
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/op/util/requires_tensor_view_args.hpp"
namespace ngraph
{
namespace op
{
class BatchDot : public util::RequiresTensorViewArgs
{
public:
BatchDot(std::shared_ptr<Node> a,
std::shared_ptr<Node> b,
bool transpose_a,
bool transpose_b);
bool get_is_a_transposed() const { return m_transpose_a; }
bool get_is_b_transposed() const { return m_transpose_b; }
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
private:
bool m_transpose_a;
bool m_transpose_b;
};
}
}
......@@ -22,12 +22,14 @@
#include <typeindex>
#include <unordered_map>
#include "cpu_rnn_mat_fusion.hpp"
#include "cpu_mat_fusion.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/broadcast.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/op/dot.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/slice.hpp"
#include "ngraph/runtime/cpu/op/batch_dot.hpp"
#include "ngraph/pattern/matcher.hpp"
#include "ngraph/pattern/op/label.hpp"
......@@ -199,3 +201,100 @@ bool runtime::cpu::pass::CPURnnMatFusion::run_on_function(std::shared_ptr<Functi
}
return modified;
}
#define TI(x) std::type_index(typeid(x))
std::shared_ptr<Node> fuse_batch_dot(const std::shared_ptr<Node>& n)
{
const int num_op_branches = 2;
std::shared_ptr<pattern::op::Label> input[num_op_branches];
std::shared_ptr<op::Reshape> reshape[num_op_branches];
for (int i = 0; i < num_op_branches; ++i)
{
input[i] = std::make_shared<pattern::op::Label>(element::f32, Shape{3, 2, 2});
auto slice =
std::make_shared<op::Slice>(input[i], Coordinate{0, 0, 0}, Coordinate{1, 2, 2});
auto skip = std::make_shared<pattern::op::Skip>(slice, pattern::has_class<op::Reshape>());
reshape[i] = std::make_shared<op::Reshape>(skip, AxisVector{0, 1, 2}, Shape{2, 2});
}
auto dot = std::make_shared<op::Dot>(reshape[0], reshape[1]);
auto final_reshape = std::make_shared<op::Reshape>(dot, AxisVector{0, 1}, Shape{1, 2, 2});
auto matcher = std::make_shared<pattern::Matcher>(final_reshape);
std::shared_ptr<Node> fuse_input[num_op_branches];
bool transpose[num_op_branches] = {false, false};
const int num_expected_reshape_with_trans = 3;
// check each input arg matches the pattern
for (auto arg : n->get_arguments())
{
if (matcher->match(arg))
{
auto pattern_map = matcher->get_pattern_map();
int reshape_count[num_op_branches] = {0, 0};
// we found a match, determine whether we have to transpose for each input by
// counting the number of reshapes in each branch, if transpose is applied, there
// should be 3 reshapes.
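// For example, in the batch_dot_3.json test graph below, the transposed branch
// visits Reshape_15, Reshape_13 and Reshape_12 (3 reshapes -> transpose), while
// the other branch visits only Reshape_15 and Reshape_10 (2 reshapes -> no transpose).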
for (int i = 0; i < num_op_branches; ++i)
{
auto iter = matcher->get_match_root();
auto& input_node = pattern_map[input[i]];
do
{
if (std::dynamic_pointer_cast<op::Reshape>(iter) != nullptr)
{
++reshape_count[i];
if (reshape_count[i] == num_expected_reshape_with_trans)
{
transpose[i] = true;
break;
}
}
// branch to either input 0 or 1 depending on which one we are traversing
iter =
iter->get_input_size() > 1 ? iter->get_argument(i) : iter->get_argument(0);
} while (iter != input_node);
}
// keep track of the input data, make sure they all match
for (int i = 0; i < num_op_branches; ++i)
{
auto& input_node = pattern_map[input[i]];
if (fuse_input[i] == nullptr)
{
fuse_input[i] = input_node;
}
// found different input nodes between different args, can't fuse.
else if (fuse_input[i] != input_node)
{
return {nullptr};
}
}
}
}
if (fuse_input[0] && fuse_input[1])
{
return std::make_shared<op::BatchDot>(
fuse_input[0], fuse_input[1], transpose[0], transpose[1]);
}
return {nullptr};
}
bool runtime::cpu::pass::CPUBatchDotFusion::run_on_function(std::shared_ptr<Function> func)
{
bool modified = false;
for (auto n : func->get_ordered_ops())
{
const Node& node = *n;
if (TI(node) == TI(op::Concat))
{
auto fused_node = fuse_batch_dot(n);
if (fused_node)
{
func->replace_node(n, fused_node);
modified = true;
}
}
}
return modified;
}
......@@ -31,6 +31,11 @@ namespace ngraph
public:
bool run_on_function(std::shared_ptr<ngraph::Function> function) override;
};
class CPUBatchDotFusion : public ngraph::pass::FunctionPass
{
public:
bool run_on_function(std::shared_ptr<ngraph::Function> function) override;
};
}
}
}
......
......@@ -42,6 +42,7 @@
#include "ngraph/pattern/op/label.hpp"
#include "ngraph/pattern/op/skip.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/runtime/cpu/op/batch_dot.hpp"
#include "ngraph/runtime/cpu/op/batch_norm_relu.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
......@@ -53,9 +54,9 @@
#include "ngraph/runtime/cpu/op/sigmoid_mul.hpp"
#include "ngraph/runtime/cpu/pass/cpu_concat_inputs.hpp"
#include "ngraph/runtime/cpu/pass/cpu_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_mat_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_post_layout_optimizations.hpp"
#include "ngraph/runtime/cpu/pass/cpu_rnn_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_rnn_mat_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_workspace_insertion.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
......@@ -2035,3 +2036,42 @@ TEST(cpu_fusion, sigmoid_multiply_fusion_backward)
expected_1);
}
}
TEST(cpu_fusion, fuse_batch_dot)
{
pass::Manager pass_manager;
pass_manager.register_pass<runtime::cpu::pass::CPUBatchDotFusion>();
const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/batch_dot_3.json");
const string json_string = file_util::read_file_to_string(json_path);
stringstream ss(json_string);
shared_ptr<Function> func = ngraph::deserialize(ss);
pass_manager.run_passes(func);
size_t ccg = count_ops_of_type<op::BatchDot>(func);
ASSERT_EQ(ccg, 1);
}
TEST(cpu_fusion, fuse_batch_dot_forward)
{
pass::Manager pass_manager;
pass_manager.register_pass<runtime::cpu::pass::CPUBatchDotFusion>();
const std::string file_name("mxnet/batch_dot_3.json");
auto cpu_f = make_function(file_name);
auto int_f = make_function(file_name);
pass_manager.run_passes(cpu_f);
test::Uniform<float> rng(0.0f, 1.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : int_f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto int_results = execute(int_f, args, "INTERPRETER");
auto cpu_results = execute(cpu_f, args, "CPU");
for (size_t i = 0; i < int_results.size(); i++)
{
EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f));
}
}
[{
"name" : "Function_0",
"ops" : [
{
"element_type" : "float",
"inputs" : [],
"name" : "Parameter_1",
"op" : "Parameter",
"outputs" : ["Parameter_1_0"],
"shape" : [ 3, 3, 2 ]
},
{
"element_type" : "float",
"inputs" : [],
"name" : "Parameter_0",
"op" : "Parameter",
"outputs" : ["Parameter_0_0"],
"shape" : [ 3, 4, 2 ]
},
{
"inputs" : ["Parameter_1"],
"lower_bounds" : [ 1, 0, 0 ],
"name" : "Slice_11",
"op" : "Slice",
"outputs" : ["Slice_11_0"],
"strides" : [ 1, 1, 1 ],
"upper_bounds" : [ 2, 3, 2 ]
},
{
"inputs" : ["Parameter_1"],
"lower_bounds" : [ 2, 0, 0 ],
"name" : "Slice_18",
"op" : "Slice",
"outputs" : ["Slice_18_0"],
"strides" : [ 1, 1, 1 ],
"upper_bounds" : [ 3, 3, 2 ]
},
{
"inputs" : ["Parameter_1"],
"lower_bounds" : [ 0, 0, 0 ],
"name" : "Slice_4",
"op" : "Slice",
"outputs" : ["Slice_4_0"],
"strides" : [ 1, 1, 1 ],
"upper_bounds" : [ 1, 3, 2 ]
},
{
"inputs" : ["Parameter_0"],
"lower_bounds" : [ 1, 0, 0 ],
"name" : "Slice_9",
"op" : "Slice",
"outputs" : ["Slice_9_0"],
"strides" : [ 1, 1, 1 ],
"upper_bounds" : [ 2, 4, 2 ]
},
{
"inputs" : ["Parameter_0"],
"lower_bounds" : [ 2, 0, 0 ],
"name" : "Slice_16",
"op" : "Slice",
"outputs" : ["Slice_16_0"],
"strides" : [ 1, 1, 1 ],
"upper_bounds" : [ 3, 4, 2 ]
},
{
"inputs" : ["Parameter_0"],
"lower_bounds" : [ 0, 0, 0 ],
"name" : "Slice_2",
"op" : "Slice",
"outputs" : ["Slice_2_0"],
"strides" : [ 1, 1, 1 ],
"upper_bounds" : [ 1, 4, 2 ]
},
{
"input_order" : [ 0, 1, 2 ],
"inputs" : ["Slice_11"],
"name" : "Reshape_12",
"op" : "Reshape",
"output_shape" : [ 3, 2 ],
"outputs" : ["Reshape_12_0"]
},
{
"input_order" : [ 0, 1, 2 ],
"inputs" : ["Slice_18"],
"name" : "Reshape_19",
"op" : "Reshape",
"output_shape" : [ 3, 2 ],
"outputs" : ["Reshape_19_0"]
},
{
"input_order" : [ 0, 1, 2 ],
"inputs" : ["Slice_4"],
"name" : "Reshape_5",
"op" : "Reshape",
"output_shape" : [ 3, 2 ],
"outputs" : ["Reshape_5_0"]
},
{
"input_order" : [ 0, 1, 2 ],
"inputs" : ["Slice_9"],
"name" : "Reshape_10",
"op" : "Reshape",
"output_shape" : [ 4, 2 ],
"outputs" : ["Reshape_10_0"]
},
{
"input_order" : [ 0, 1, 2 ],
"inputs" : ["Slice_16"],
"name" : "Reshape_17",
"op" : "Reshape",
"output_shape" : [ 4, 2 ],
"outputs" : ["Reshape_17_0"]
},
{
"input_order" : [ 0, 1, 2 ],
"inputs" : ["Slice_2"],
"name" : "Reshape_3",
"op" : "Reshape",
"output_shape" : [ 4, 2 ],
"outputs" : ["Reshape_3_0"]
},
{
"input_order" : [ 1, 0 ],
"inputs" : ["Reshape_12"],
"name" : "Reshape_13",
"op" : "Reshape",
"output_shape" : [ 2, 3 ],
"outputs" : ["Reshape_13_0"]
},
{
"input_order" : [ 1, 0 ],
"inputs" : ["Reshape_19"],
"name" : "Reshape_20",
"op" : "Reshape",
"output_shape" : [ 2, 3 ],
"outputs" : ["Reshape_20_0"]
},
{
"input_order" : [ 1, 0 ],
"inputs" : ["Reshape_5"],
"name" : "Reshape_6",
"op" : "Reshape",
"output_shape" : [ 2, 3 ],
"outputs" : ["Reshape_6_0"]
},
{
"inputs" : [ "Reshape_10", "Reshape_13" ],
"name" : "Dot_14",
"op" : "Dot",
"outputs" : ["Dot_14_0"],
"reduction_axes_count" : 1
},
{
"inputs" : [ "Reshape_17", "Reshape_20" ],
"name" : "Dot_21",
"op" : "Dot",
"outputs" : ["Dot_21_0"],
"reduction_axes_count" : 1
},
{
"inputs" : [ "Reshape_3", "Reshape_6" ],
"name" : "Dot_7",
"op" : "Dot",
"outputs" : ["Dot_7_0"],
"reduction_axes_count" : 1
},
{
"input_order" : [ 0, 1 ],
"inputs" : ["Dot_14"],
"name" : "Reshape_15",
"op" : "Reshape",
"output_shape" : [ 1, 4, 3 ],
"outputs" : ["Reshape_15_0"]
},
{
"input_order" : [ 0, 1 ],
"inputs" : ["Dot_21"],
"name" : "Reshape_22",
"op" : "Reshape",
"output_shape" : [ 1, 4, 3 ],
"outputs" : ["Reshape_22_0"]
},
{
"input_order" : [ 0, 1 ],
"inputs" : ["Dot_7"],
"name" : "Reshape_8",
"op" : "Reshape",
"output_shape" : [ 1, 4, 3 ],
"outputs" : ["Reshape_8_0"]
},
{
"axis" : 0,
"inputs" : [ "Reshape_8", "Reshape_15", "Reshape_22" ],
"name" : "Concat_23",
"op" : "Concat",
"outputs" : ["Concat_23_0"]
},
{
"inputs" : ["Concat_23"],
"name" : "Result_24",
"op" : "Result",
"outputs" : ["Result_24_0"]
}
],
"parameters" : [ "Parameter_0", "Parameter_1" ],
"result" : ["Result_24"]
}]