Optimize Broadcast in MatMulBias (#604)

* remove broadcast from matmulbias
* fix comments
* working gemm-based broadcast
* fix clang warning

Optimize Broadcast in MatMulBias (#604)
* remove broadcast from matmulbias
* fix comments
* working gemm-based broadcast
* fix clang warning
9cca4073 · Nick Korovaiko · GitHub · 529362b5 · 9cca4073 · 9cca4073
Unverified commit 9cca4073, authored Mar 08, 2018 by Nick Korovaiko; committed by GitHub on Mar 08, 2018
5 changed files
--- a/src/ngraph/runtime/cpu/cpu_emitter.cpp
+++ b/src/ngraph/runtime/cpu/cpu_emitter.cpp
@@ -272,13 +272,6 @@ namespace ngraph

                const char* cbeta = "0.0f";

-                if (args.size() > 2)
-                {
-                    writer << "memcpy(" << out[0].get_name() << ", " << args[2].get_name() << ", "
-                           << out[0].get_size() * out[0].get_element_type().size() << ");\n";
-                    cbeta = "1.0f";
-                }
-
                writer << "cblas::cblas_sgemm("
                       << "cblas::Layout::RowMajor, " << tranpose_a << tranpose_b << m << ", " << n
                       << ", " << k << ",\n"
@@ -287,6 +280,101 @@ namespace ngraph
                       << "        " << out[0].get_name() << ", " << max(1UL, arg2_shape[1])
                       << ");\n";

+                if (args.size() > 2)
+                {
+                    auto axes = cg->get_broadcast_axes();
+                    if (axes.size() == 1)
+                    {
+                        if (*(axes.begin()) == 0)
+                        {
+                            writer << "static " << out[0].get_element_type().c_type_string()
+                                   << " ones_row[" << arg2_shape[0] << "]"
+                                   << " = { 1.0f";
+                            for (size_t i = 1; i < arg2_shape[0]; ++i)
+                            {
+                                writer << ", 1.0f";
+                            }
+                            writer << "};\n";
+
+                            writer << "cblas::cblas_sgemm("
+                                   << "cblas::Layout::RowMajor, " << cnotranspose << cnotranspose
+                                   << arg2_shape[0] << ", " << arg2_shape[1] << ", 1"
+                                   << ",\n"
+                                   << "        1.0f, ones_row, "
+                                   << "1"
+                                   << ", " << args[2].get_name() << ", " << max(1UL, arg2_shape[1])
+                                   << ", "
+                                   << "1.0f"
+                                   << ",\n"
+                                   << "        " << out[0].get_name() << ", "
+                                   << max(1UL, arg2_shape[1]) << ");\n";
+                        }
+                        else
+                        {
+                            writer << "static " << out[0].get_element_type().c_type_string()
+                                   << " ones_col[" << arg2_shape[1] << "]"
+                                   << " = { 1.0f";
+                            for (size_t i = 1; i < arg2_shape[1]; ++i)
+                            {
+                                writer << ", 1.0f";
+                            }
+                            writer << "};\n";
+
+                            writer << "cblas::cblas_sgemm("
+                                   << "cblas::Layout::RowMajor, " << cnotranspose << ctranspose
+                                   << arg2_shape[0] << ", " << arg2_shape[1] << ", 1"
+                                   << ",\n"
+                                   << "        1.0f, ones_col," << max(1UL, arg2_shape[1]) << ", "
+                                   << args[2].get_name() << ", "
+                                   << "1"
+                                   << ", "
+                                   << "1.0f"
+                                   << ",\n"
+                                   << "        " << out[0].get_name() << ", "
+                                   << max(1UL, arg2_shape[1]) << ");\n";
+                        }
+                    }
+                    else
+                    {
+                        if (axes.size() != 2)
+                        {
+                            throw ngraph_error("unexpected broadcast rank");
+                        }
+
+                        writer << out[0].get_element_type().c_type_string() << " bias["
+                               << arg2_shape[1] << "]"
+                               << " = { " << args[2].get_name() << "[0]";
+                        for (size_t i = 1; i < arg2_shape[1]; ++i)
+                        {
+                            writer << "," << args[2].get_name() << "[0]";
+                        }
+                        writer << "};\n";
+
+                        writer << "static " << out[0].get_element_type().c_type_string()
+                               << " ones_scalar[" << arg2_shape[0] << "]"
+                               << " = { 1.0f";
+                        for (size_t i = 1; i < arg2_shape[0]; ++i)
+                        {
+                            writer << ", 1.0f";
+                        }
+                        writer << "};\n";
+
+                        writer << "cblas::cblas_sgemm("
+                               << "cblas::Layout::RowMajor, " << cnotranspose << cnotranspose
+                               << arg2_shape[0] << ", " << arg2_shape[1] << ", 1"
+                               << ",\n"
+                               << "        1.0f, ones_scalar, "
+                               << "1"
+                               << ", "
+                               << "bias"
+                               << ", " << max(1UL, arg2_shape[1]) << ", "
+                               << "1.0f"
+                               << ",\n"
+                               << "        " << out[0].get_name() << ", " << max(1UL, arg2_shape[1])
+                               << ");\n";
+                    }
+                }
+
                writer.indent--;
                writer << "}\n";
            }

--- a/src/ngraph/runtime/cpu/ops/matmul_bias.cpp
+++ b/src/ngraph/runtime/cpu/ops/matmul_bias.cpp
@@ -32,7 +32,8 @@ std::shared_ptr<ngraph::Node>
                                        m_shape_w,
                                        m_shape_x,
                                        m_transpose_w,
-                                        m_transpose_x);
+                                        m_transpose_x,
+                                        m_broadcast_axes);
 }

 ngraph::op::MatmulBias::MatmulBias(std::shared_ptr<ngraph::Node> W,
@@ -41,7 +42,8 @@ ngraph::op::MatmulBias::MatmulBias(std::shared_ptr<ngraph::Node> W,
                                   Shape shape_w,
                                   Shape shape_x,
                                   bool transpose_w,
-                                   bool transpose_x)
+                                   bool transpose_x,
+                                   AxisSet axes)
    : RequiresTensorViewArgs("MatMulBias",
                             b == nullptr ? std::vector<std::shared_ptr<Node>>{W, x}
                                          : std::vector<std::shared_ptr<Node>>{W, x, b})
@@ -49,8 +51,24 @@ ngraph::op::MatmulBias::MatmulBias(std::shared_ptr<ngraph::Node> W,
    , m_shape_x(shape_x)
    , m_transpose_w(transpose_w)
    , m_transpose_x(transpose_x)
+    , m_broadcast_axes(axes)

 {
+    if (axes.size() == 0 && b != nullptr)
+    {
+        throw ngraph_error("Bias but no broadcast axes");
+    }
+
+    if (b == nullptr && axes.size() != 0)
+    {
+        throw ngraph_error("Broadcast axes but no bias");
+    }
+
+    if (axes.size() > 2)
+    {
+        throw ngraph_error("Broadcasting to > 2D tensor");
+    }
+
    if (shape_w.size() != 2)
    {
        NGRAPH_DEBUG << "W shape = " << vector_to_string(shape_w);

--- a/src/ngraph/runtime/cpu/ops/matmul_bias.hpp
+++ b/src/ngraph/runtime/cpu/ops/matmul_bias.hpp
@@ -16,6 +16,7 @@

 #pragma once

+#include "ngraph/axis_set.hpp"
 #include "ngraph/ops/util/requires_tensor_view_args.hpp"

 namespace ngraph
@@ -31,12 +32,14 @@ namespace ngraph
                       Shape shape_w,
                       Shape shape_x,
                       bool transpose_w,
-                       bool transpose_x);
+                       bool transpose_x,
+                       AxisSet axes = AxisSet{});

            bool get_is_arg0_transposed() const { return m_transpose_w; }
            bool get_is_arg1_transposed() const { return m_transpose_x; }
            Shape get_arg0_shape() const { return m_shape_w; }
            Shape get_arg1_shape() const { return m_shape_x; }
+            const AxisSet& get_broadcast_axes() const { return m_broadcast_axes; }
            virtual std::shared_ptr<Node>
                copy_with_new_args(const NodeVector& new_args) const override;

@@ -45,6 +48,7 @@ namespace ngraph
            Shape m_shape_x;
            bool m_transpose_w;
            bool m_transpose_x;
+            AxisSet m_broadcast_axes;
        };
    }
 }
--- a/src/ngraph/runtime/cpu/pass/cpu_fusion.cpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_fusion.cpp
@@ -134,12 +134,21 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_matmulbias_pattern()
                     << m.match_root()->get_name();

        auto mpattern = m.match_root(); //add
-        auto m_matmul = mpattern->get_input_op(0);
-        auto m_broadcast = mpattern->get_input_op(1);
+        auto m_matmul = std::dynamic_pointer_cast<op::MatmulBias>(mpattern->get_input_op(0));
+        auto m_broadcast = std::dynamic_pointer_cast<op::Broadcast>(mpattern->get_input_op(1));
+        auto m_bias = m_broadcast->get_input_op(0);
        auto pattern_map = m.get_pattern_map();

-        return m_matmul->copy_with_new_args(
-            NodeVector{pattern_map[W], pattern_map[x], m_broadcast});
+        auto mmb = std::make_shared<op::MatmulBias>(pattern_map[W],
+                                                    pattern_map[x],
+                                                    m_bias,
+                                                    m_matmul->get_arg0_shape(),
+                                                    m_matmul->get_arg1_shape(),
+                                                    m_matmul->get_is_arg0_transposed(),
+                                                    m_matmul->get_is_arg1_transposed(),
+                                                    m_broadcast->get_broadcast_axes());
+
+        return mmb;
    };

    auto m = std::make_shared<ngraph::pattern::Matcher>(padd, callback);

--- a/test/cpu_fusion.cpp
+++ b/test/cpu_fusion.cpp
@@ -91,11 +91,89 @@ TEST(cpu_fusion, gemm_pattern)
    ASSERT_EQ(n.get_pattern_map()[x], B);
    ASSERT_EQ(n.get_pattern_map()[b], C);

-    auto cg =
-        make_shared<op::MatmulBias>(W, x, broadcast, W->get_shape(), x->get_shape(), false, false);
+    auto cg = make_shared<op::MatmulBias>(
+        W, x, C, W->get_shape(), x->get_shape(), false, false, AxisSet{0});
+}
+
+TEST(cpu_fusion, gemm_cpu_broadcast_row)
+{
+    Shape shapeA{3, 2};
+    Shape shapeB{2, 3};
+    Shape shapeC{2, 2};
+    auto A = make_shared<op::Parameter>(element::f32, shapeA);
+    auto B = make_shared<op::Parameter>(element::f32, shapeB);
+
+    auto reshape_w = make_shared<op::Reshape>(A, AxisVector{1, 0}, Shape{2, 3});
+    auto reshape_x = make_shared<op::Reshape>(B, AxisVector{1, 0}, Shape{3, 2});
+
+    auto one = op::Constant::create<float>(element::f32, Shape{2}, std::vector<float>{1.0f, 1.0f});
+
+    auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{0});
+    auto cg = make_shared<op::MatmulBias>(
+        A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{0});
+
+    auto f = make_shared<Function>(cg, op::ParameterVector{A, B});
+
+    auto manager = runtime::Manager::get("CPU");
+    auto external = manager->compile(f);
+    auto backend = manager->allocate_backend();
+    auto cf = backend->make_call_frame(external);
+
+    shared_ptr<runtime::TensorView> a = backend->make_primary_tensor_view(element::f32, shapeA);
+    shared_ptr<runtime::TensorView> b = backend->make_primary_tensor_view(element::f32, shapeB);
+    shared_ptr<runtime::TensorView> result =
+        backend->make_primary_tensor_view(element::f32, shapeC);
+
+    vector<float> dataA{1.0f, 4.0f, 1.0f, 4.0f, 1.0f, 4.0f};
+    vector<float> dataB{3.0f, 3.0f, 3.0f, 9.0f, 9.0f, 9.0f};
+    copy_data(a, dataA);
+    copy_data(b, dataB);
+
+    cf->call({a, b}, {result});
+    vector<float> expected{10, 28, 37, 109};
+    ASSERT_TRUE(read_vector<float>(result) == expected);
 }

-TEST(cpu_fusion, gemm_cpu)
+TEST(cpu_fusion, gemm_cpu_broadcast_column)
+{
+    Shape shapeA{3, 2};
+    Shape shapeB{2, 3};
+    Shape shapeC{2, 2};
+    auto A = make_shared<op::Parameter>(element::f32, shapeA);
+    auto B = make_shared<op::Parameter>(element::f32, shapeB);
+
+    auto reshape_w = make_shared<op::Reshape>(A, AxisVector{1, 0}, Shape{2, 3});
+    auto reshape_x = make_shared<op::Reshape>(B, AxisVector{1, 0}, Shape{3, 2});
+
+    auto one = op::Constant::create<float>(element::f32, Shape{2}, std::vector<float>{1.0f, 1.0f});
+
+    auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{1});
+    auto cg = make_shared<op::MatmulBias>(
+        A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{1});
+
+    auto f = make_shared<Function>(cg, op::ParameterVector{A, B});
+
+    auto manager = runtime::Manager::get("CPU");
+    auto external = manager->compile(f);
+    auto backend = manager->allocate_backend();
+    auto cf = backend->make_call_frame(external);
+
+    shared_ptr<runtime::TensorView> a = backend->make_primary_tensor_view(element::f32, shapeA);
+    shared_ptr<runtime::TensorView> b = backend->make_primary_tensor_view(element::f32, shapeB);
+    shared_ptr<runtime::TensorView> result =
+        backend->make_primary_tensor_view(element::f32, shapeC);
+
+    vector<float> dataA{1.0f, 4.0f, 1.0f, 4.0f, 1.0f, 4.0f};
+    vector<float> dataB{3.0f, 3.0f, 3.0f, 9.0f, 9.0f, 9.0f};
+    copy_data(a, dataA);
+    copy_data(b, dataB);
+
+    cf->call({a, b}, {result});
+    vector<float> expected{10, 28, 37, 109};
+    ASSERT_TRUE(read_vector<float>(result) == expected);
+}
+
+TEST(cpu_fusion, gemm_cpu_broadcast_matrix)
 {
    Shape shapeA{3, 2};
    Shape shapeB{2, 3};
@@ -109,8 +187,8 @@ TEST(cpu_fusion, gemm_cpu)
    auto one = op::Constant::create<float>(element::f32, Shape{}, std::vector<float>{1.0f});

    auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{0, 1});
-    auto cg =
-        make_shared<op::MatmulBias>(A, B, broadcast, A->get_shape(), B->get_shape(), true, true);
+    auto cg = make_shared<op::MatmulBias>(
+        A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{0, 1});

    auto f = make_shared<Function>(cg, op::ParameterVector{A, B});

@@ -212,7 +290,7 @@ TEST(cpu_fusion, cpu_fusion_pass_matmul_bias)
    pass_manager.run_passes(func);
    auto gmm = graph->get_input_op(0);
    ASSERT_TRUE(std::dynamic_pointer_cast<op::MatmulBias>(gmm));
-    ASSERT_EQ(gmm->get_input_op(2), broadcast);
+    ASSERT_EQ(gmm->get_input_op(2), b);
 }

 TEST(cpu_fusion, cpu_fusion_pass_matmul_no_bias)