Unverified Commit 9cca4073 authored by Nick Korovaiko's avatar Nick Korovaiko Committed by GitHub

Optimize Broadcast in MatMulBias (#604)

* remove broadcast from matmulbias

* fix comments

* working gemm-based broadcast

* fix clang warning
parent 529362b5
...@@ -272,13 +272,6 @@ namespace ngraph ...@@ -272,13 +272,6 @@ namespace ngraph
const char* cbeta = "0.0f"; const char* cbeta = "0.0f";
if (args.size() > 2)
{
writer << "memcpy(" << out[0].get_name() << ", " << args[2].get_name() << ", "
<< out[0].get_size() * out[0].get_element_type().size() << ");\n";
cbeta = "1.0f";
}
writer << "cblas::cblas_sgemm(" writer << "cblas::cblas_sgemm("
<< "cblas::Layout::RowMajor, " << tranpose_a << tranpose_b << m << ", " << n << "cblas::Layout::RowMajor, " << tranpose_a << tranpose_b << m << ", " << n
<< ", " << k << ",\n" << ", " << k << ",\n"
...@@ -287,6 +280,101 @@ namespace ngraph ...@@ -287,6 +280,101 @@ namespace ngraph
<< " " << out[0].get_name() << ", " << max(1UL, arg2_shape[1]) << " " << out[0].get_name() << ", " << max(1UL, arg2_shape[1])
<< ");\n"; << ");\n";
// Add the optional bias operand (args[2]) to the product that the sgemm
// emitted above has already written into out[0].
//
// Rather than materialising a broadcast tensor, the bias is expanded with a
// rank-1 GEMM update (K == 1, beta == 1.0f) so it accumulates onto the result:
//
//     out (M x N) += lhs (M x 1) * rhs (1 x N)
//
// where M = arg2_shape[0] and N = arg2_shape[1]. Which operand is the bias and
// which is a vector of ones depends on the broadcast axes recorded on the op:
//
//   {0}    : bias supplies one value per column -> ones (M x 1) * bias (1 x N)
//   {1}    : bias supplies one value per row    -> bias (M x 1) * ones (1 x N)
//   {0, 1} : scalar bias, replicated into a 1 x N row first
//
// Throws ngraph_error when the axis set holds neither one nor two axes.
if (args.size() > 2)
{
    const auto& axes = cg->get_broadcast_axes();
    const auto& element_type = out[0].get_element_type().c_type_string();
    const auto& bias_name = args[2].get_name();
    const auto rows = arg2_shape[0];
    const auto cols = arg2_shape[1];

    // Emits `static T <name>[count] = { 1.0f, ... };` into the generated code.
    auto emit_ones = [&](const char* name, size_t count) {
        writer << "static " << element_type << " " << name << "[" << count << "]"
               << " = { 1.0f";
        for (size_t i = 1; i < count; ++i)
        {
            writer << ", 1.0f";
        }
        writer << "};\n";
    };

    // Emits out += lhs * rhs with lhs read as a rows x 1 column (lda == 1) and
    // rhs as a 1 x cols row (ldb == cols); neither operand is transposed.
    auto emit_rank1_update = [&](const char* lhs, const char* rhs) {
        writer << "cblas::cblas_sgemm("
               << "cblas::Layout::RowMajor, " << cnotranspose << cnotranspose << rows << ", "
               << cols << ", 1"
               << ",\n"
               << " 1.0f, " << lhs << ", "
               << "1"
               << ", " << rhs << ", " << max(1UL, cols) << ", "
               << "1.0f"
               << ",\n"
               << " " << out[0].get_name() << ", " << max(1UL, cols) << ");\n";
    };

    if (axes.size() == 1)
    {
        if (*(axes.begin()) == 0)
        {
            // Axis 0 is broadcast: every output row receives the same bias row.
            emit_ones("ones_row", rows);
            emit_rank1_update("ones_row", bias_name.c_str());
        }
        else
        {
            // Axis 1 is broadcast: every output column receives the same bias
            // column, so the bias is the left (M x 1) operand. The earlier form
            // passed ones_col as the left operand with lda == cols, which read
            // ones_col[i * cols] beyond the array for i >= 1 and indexed the
            // bias by column instead of by row.
            // NOTE(review): relies on the bias holding arg2_shape[0] values for
            // this axis - confirm against op::Broadcast semantics.
            emit_ones("ones_col", cols);
            emit_rank1_update(bias_name.c_str(), "ones_col");
        }
    }
    else if (axes.size() == 2)
    {
        // Scalar bias: replicate element 0 into a 1 x cols row in the generated
        // code, then expand it across all rows exactly like the axis-0 case.
        writer << element_type << " bias[" << cols << "]"
               << " = { " << bias_name << "[0]";
        for (size_t i = 1; i < cols; ++i)
        {
            writer << "," << bias_name << "[0]";
        }
        writer << "};\n";
        emit_ones("ones_scalar", rows);
        emit_rank1_update("ones_scalar", "bias");
    }
    else
    {
        throw ngraph_error("unexpected broadcast rank");
    }
}
writer.indent--; writer.indent--;
writer << "}\n"; writer << "}\n";
} }
......
...@@ -32,7 +32,8 @@ std::shared_ptr<ngraph::Node> ...@@ -32,7 +32,8 @@ std::shared_ptr<ngraph::Node>
m_shape_w, m_shape_w,
m_shape_x, m_shape_x,
m_transpose_w, m_transpose_w,
m_transpose_x); m_transpose_x,
m_broadcast_axes);
} }
ngraph::op::MatmulBias::MatmulBias(std::shared_ptr<ngraph::Node> W, ngraph::op::MatmulBias::MatmulBias(std::shared_ptr<ngraph::Node> W,
...@@ -41,7 +42,8 @@ ngraph::op::MatmulBias::MatmulBias(std::shared_ptr<ngraph::Node> W, ...@@ -41,7 +42,8 @@ ngraph::op::MatmulBias::MatmulBias(std::shared_ptr<ngraph::Node> W,
Shape shape_w, Shape shape_w,
Shape shape_x, Shape shape_x,
bool transpose_w, bool transpose_w,
bool transpose_x) bool transpose_x,
AxisSet axes)
: RequiresTensorViewArgs("MatMulBias", : RequiresTensorViewArgs("MatMulBias",
b == nullptr ? std::vector<std::shared_ptr<Node>>{W, x} b == nullptr ? std::vector<std::shared_ptr<Node>>{W, x}
: std::vector<std::shared_ptr<Node>>{W, x, b}) : std::vector<std::shared_ptr<Node>>{W, x, b})
...@@ -49,8 +51,24 @@ ngraph::op::MatmulBias::MatmulBias(std::shared_ptr<ngraph::Node> W, ...@@ -49,8 +51,24 @@ ngraph::op::MatmulBias::MatmulBias(std::shared_ptr<ngraph::Node> W,
, m_shape_x(shape_x) , m_shape_x(shape_x)
, m_transpose_w(transpose_w) , m_transpose_w(transpose_w)
, m_transpose_x(transpose_x) , m_transpose_x(transpose_x)
, m_broadcast_axes(axes)
{ {
if (axes.size() == 0 && b != nullptr)
{
throw ngraph_error("Bias but no broadcast axes");
}
if (b == nullptr && axes.size() != 0)
{
throw ngraph_error("Broadcast axes but no bias");
}
if (axes.size() > 2)
{
throw ngraph_error("Broadcasting to > 2D tensor");
}
if (shape_w.size() != 2) if (shape_w.size() != 2)
{ {
NGRAPH_DEBUG << "W shape = " << vector_to_string(shape_w); NGRAPH_DEBUG << "W shape = " << vector_to_string(shape_w);
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#pragma once #pragma once
#include "ngraph/axis_set.hpp"
#include "ngraph/ops/util/requires_tensor_view_args.hpp" #include "ngraph/ops/util/requires_tensor_view_args.hpp"
namespace ngraph namespace ngraph
...@@ -31,12 +32,14 @@ namespace ngraph ...@@ -31,12 +32,14 @@ namespace ngraph
Shape shape_w, Shape shape_w,
Shape shape_x, Shape shape_x,
bool transpose_w, bool transpose_w,
bool transpose_x); bool transpose_x,
AxisSet axes = AxisSet{});
bool get_is_arg0_transposed() const { return m_transpose_w; } bool get_is_arg0_transposed() const { return m_transpose_w; }
bool get_is_arg1_transposed() const { return m_transpose_x; } bool get_is_arg1_transposed() const { return m_transpose_x; }
Shape get_arg0_shape() const { return m_shape_w; } Shape get_arg0_shape() const { return m_shape_w; }
Shape get_arg1_shape() const { return m_shape_x; } Shape get_arg1_shape() const { return m_shape_x; }
const AxisSet& get_broadcast_axes() const { return m_broadcast_axes; }
virtual std::shared_ptr<Node> virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override; copy_with_new_args(const NodeVector& new_args) const override;
...@@ -45,6 +48,7 @@ namespace ngraph ...@@ -45,6 +48,7 @@ namespace ngraph
Shape m_shape_x; Shape m_shape_x;
bool m_transpose_w; bool m_transpose_w;
bool m_transpose_x; bool m_transpose_x;
AxisSet m_broadcast_axes;
}; };
} }
} }
...@@ -134,12 +134,21 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_matmulbias_pattern() ...@@ -134,12 +134,21 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_matmulbias_pattern()
<< m.match_root()->get_name(); << m.match_root()->get_name();
auto mpattern = m.match_root(); //add auto mpattern = m.match_root(); //add
auto m_matmul = mpattern->get_input_op(0); auto m_matmul = std::dynamic_pointer_cast<op::MatmulBias>(mpattern->get_input_op(0));
auto m_broadcast = mpattern->get_input_op(1); auto m_broadcast = std::dynamic_pointer_cast<op::Broadcast>(mpattern->get_input_op(1));
auto m_bias = m_broadcast->get_input_op(0);
auto pattern_map = m.get_pattern_map(); auto pattern_map = m.get_pattern_map();
return m_matmul->copy_with_new_args( auto mmb = std::make_shared<op::MatmulBias>(pattern_map[W],
NodeVector{pattern_map[W], pattern_map[x], m_broadcast}); pattern_map[x],
m_bias,
m_matmul->get_arg0_shape(),
m_matmul->get_arg1_shape(),
m_matmul->get_is_arg0_transposed(),
m_matmul->get_is_arg1_transposed(),
m_broadcast->get_broadcast_axes());
return mmb;
}; };
auto m = std::make_shared<ngraph::pattern::Matcher>(padd, callback); auto m = std::make_shared<ngraph::pattern::Matcher>(padd, callback);
......
...@@ -91,11 +91,89 @@ TEST(cpu_fusion, gemm_pattern) ...@@ -91,11 +91,89 @@ TEST(cpu_fusion, gemm_pattern)
ASSERT_EQ(n.get_pattern_map()[x], B); ASSERT_EQ(n.get_pattern_map()[x], B);
ASSERT_EQ(n.get_pattern_map()[b], C); ASSERT_EQ(n.get_pattern_map()[b], C);
auto cg = auto cg = make_shared<op::MatmulBias>(
make_shared<op::MatmulBias>(W, x, broadcast, W->get_shape(), x->get_shape(), false, false); W, x, C, W->get_shape(), x->get_shape(), false, false, AxisSet{0});
}
// Builds a MatmulBias node directly (no fusion pass involved) with both
// operands flagged as transposed and a two-element bias tagged with broadcast
// axis {0}, compiles it for the "CPU" backend, runs it, and compares the
// output against fixed values.
// NOTE(review): reshape_w, reshape_x and broadcast are constructed below but
// never feed into the function under test.
TEST(cpu_fusion, gemm_cpu_broadcast_row)
{
Shape shapeA{3, 2};
Shape shapeB{2, 3};
Shape shapeC{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shapeA);
auto B = make_shared<op::Parameter>(element::f32, shapeB);
auto reshape_w = make_shared<op::Reshape>(A, AxisVector{1, 0}, Shape{2, 3});
auto reshape_x = make_shared<op::Reshape>(B, AxisVector{1, 0}, Shape{3, 2});
// Bias vector; every element is 1.0f.
auto one = op::Constant::create<float>(element::f32, Shape{2}, std::vector<float>{1.0f, 1.0f});
auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{0});
// The raw bias (not the Broadcast node) is passed together with its axes.
auto cg = make_shared<op::MatmulBias>(
A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{0});
auto f = make_shared<Function>(cg, op::ParameterVector{A, B});
auto manager = runtime::Manager::get("CPU");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
shared_ptr<runtime::TensorView> a = backend->make_primary_tensor_view(element::f32, shapeA);
shared_ptr<runtime::TensorView> b = backend->make_primary_tensor_view(element::f32, shapeB);
shared_ptr<runtime::TensorView> result =
backend->make_primary_tensor_view(element::f32, shapeC);
vector<float> dataA{1.0f, 4.0f, 1.0f, 4.0f, 1.0f, 4.0f};
vector<float> dataB{3.0f, 3.0f, 3.0f, 9.0f, 9.0f, 9.0f};
copy_data(a, dataA);
copy_data(b, dataB);
cf->call({a, b}, {result});
// transpose(A) * transpose(B) is {9, 27, 36, 108}; each entry is then
// increased by the bias value 1.
vector<float> expected{10, 28, 37, 109};
ASSERT_TRUE(read_vector<float>(result) == expected);
} }
// Same scenario as the row-broadcast test, but the bias is tagged with
// broadcast axis {1}.
// NOTE(review): the bias is uniform ({1, 1}) and the output is square (2 x 2),
// so the expected values are the same whichever axis the bias is expanded
// along; this test cannot detect a row/column mix-up in the emitted code.
// A non-uniform bias and a non-square output would make it discriminating.
// NOTE(review): reshape_w, reshape_x and broadcast are constructed below but
// never feed into the function under test.
TEST(cpu_fusion, gemm_cpu) TEST(cpu_fusion, gemm_cpu_broadcast_column)
{
Shape shapeA{3, 2};
Shape shapeB{2, 3};
Shape shapeC{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shapeA);
auto B = make_shared<op::Parameter>(element::f32, shapeB);
auto reshape_w = make_shared<op::Reshape>(A, AxisVector{1, 0}, Shape{2, 3});
auto reshape_x = make_shared<op::Reshape>(B, AxisVector{1, 0}, Shape{3, 2});
// Bias vector; every element is 1.0f.
auto one = op::Constant::create<float>(element::f32, Shape{2}, std::vector<float>{1.0f, 1.0f});
auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{1});
// The raw bias (not the Broadcast node) is passed together with its axes.
auto cg = make_shared<op::MatmulBias>(
A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{1});
auto f = make_shared<Function>(cg, op::ParameterVector{A, B});
auto manager = runtime::Manager::get("CPU");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
shared_ptr<runtime::TensorView> a = backend->make_primary_tensor_view(element::f32, shapeA);
shared_ptr<runtime::TensorView> b = backend->make_primary_tensor_view(element::f32, shapeB);
shared_ptr<runtime::TensorView> result =
backend->make_primary_tensor_view(element::f32, shapeC);
vector<float> dataA{1.0f, 4.0f, 1.0f, 4.0f, 1.0f, 4.0f};
vector<float> dataB{3.0f, 3.0f, 3.0f, 9.0f, 9.0f, 9.0f};
copy_data(a, dataA);
copy_data(b, dataB);
cf->call({a, b}, {result});
// transpose(A) * transpose(B) is {9, 27, 36, 108}; each entry is then
// increased by the bias value 1.
vector<float> expected{10, 28, 37, 109};
ASSERT_TRUE(read_vector<float>(result) == expected);
}
TEST(cpu_fusion, gemm_cpu_broadcast_matrix)
{ {
Shape shapeA{3, 2}; Shape shapeA{3, 2};
Shape shapeB{2, 3}; Shape shapeB{2, 3};
...@@ -109,8 +187,8 @@ TEST(cpu_fusion, gemm_cpu) ...@@ -109,8 +187,8 @@ TEST(cpu_fusion, gemm_cpu)
auto one = op::Constant::create<float>(element::f32, Shape{}, std::vector<float>{1.0f}); auto one = op::Constant::create<float>(element::f32, Shape{}, std::vector<float>{1.0f});
auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{0, 1}); auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{0, 1});
auto cg = auto cg = make_shared<op::MatmulBias>(
make_shared<op::MatmulBias>(A, B, broadcast, A->get_shape(), B->get_shape(), true, true); A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{0, 1});
auto f = make_shared<Function>(cg, op::ParameterVector{A, B}); auto f = make_shared<Function>(cg, op::ParameterVector{A, B});
...@@ -212,7 +290,7 @@ TEST(cpu_fusion, cpu_fusion_pass_matmul_bias) ...@@ -212,7 +290,7 @@ TEST(cpu_fusion, cpu_fusion_pass_matmul_bias)
pass_manager.run_passes(func); pass_manager.run_passes(func);
auto gmm = graph->get_input_op(0); auto gmm = graph->get_input_op(0);
ASSERT_TRUE(std::dynamic_pointer_cast<op::MatmulBias>(gmm)); ASSERT_TRUE(std::dynamic_pointer_cast<op::MatmulBias>(gmm));
ASSERT_EQ(gmm->get_input_op(2), broadcast); ASSERT_EQ(gmm->get_input_op(2), b);
} }
TEST(cpu_fusion, cpu_fusion_pass_matmul_no_bias) TEST(cpu_fusion, cpu_fusion_pass_matmul_no_bias)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment