Commit 5f914429 authored by nmostafa

Move CompiledKernel op under mlir_backend lib. Add compiler to op. Track compilation status

parent 607445a4
......@@ -23,6 +23,8 @@ set(SRC
memory_manager.cpp
pass/mlir_subgraph_extraction.cpp
pass/mlir_subgraph_extraction.hpp
compiled_kernel.cpp
compiled_kernel.hpp
)
if (NGRAPH_MLIR_ENABLE)
......
......@@ -14,7 +14,7 @@
// limitations under the License.
//*****************************************************************************
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "compiled_kernel.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp"
......@@ -67,6 +67,8 @@ ngraph::op::CompiledKernel::CompiledKernel(const NodeVector& node_list,
: Op("CompiledKernel", check_single_output_args({args}))
, m_node_list(node_list)
, m_output_nodes(outputs)
, m_mlir_compiler(this)
, m_is_compiled(false)
{
constructor_validate_and_infer_types();
set_output_size(m_output_nodes.size());
......
......@@ -18,6 +18,7 @@
#include "ngraph/op/op.hpp"
#include "ngraph/util.hpp"
#include "contrib/mlir/compiler.hpp"
namespace ngraph
{
......@@ -38,11 +39,34 @@ namespace ngraph
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
const NodeVector& get_node_list() const { return m_node_list; }
const NodeVector& get_kernel_outputs() const { return m_output_nodes; }
const NodeVector& get_node_list() const { return m_node_list; }
const NodeVector& get_kernel_outputs() const { return m_output_nodes; }
/// Compiles the sub-graph associated with this CompiledKernel.
/// Idempotent: once compilation has succeeded, later calls are no-ops.
void compile()
{
    if (!m_is_compiled)
    {
        m_mlir_compiler.compile();
        m_is_compiled = true;
    }
}
/// Executes the pre-compiled sub-graph.
/// \param ptr_args raw pointers backing the kernel's inputs and outputs,
///        in argument order followed by output order.
void run(std::vector<void*>& ptr_args)
{
    NGRAPH_CHECK(m_is_compiled, "CompiledKernel node not compiled yet");
    auto& compiler = m_mlir_compiler;
    compiler.set_args(&ptr_args);
    compiler.run();
}
/// \return true once compile() has completed successfully.
bool is_compiled() const { return m_is_compiled; }
private:
NodeVector m_node_list;
NodeVector m_output_nodes;
ngraph::runtime::ngmlir::MLIRCompiler m_mlir_compiler;
bool m_is_compiled;
};
}
}
......@@ -29,7 +29,7 @@
#include "ngraph/op/concat.hpp"
#include "ngraph/op/divide.hpp"
#include "ngraph/op/dot.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "compiled_kernel.hpp"
#include "ngraph/op/greater.hpp"
#include "ngraph/op/less.hpp"
#include "ngraph/op/maximum.hpp"
......@@ -69,16 +69,6 @@ using namespace ngraph::runtime::ngmlir;
#define COMPILE_OP_DECL(op_name) \
create_op<op_name>(MLIRCompiler & compiler, const ngraph::Node* ng_node)
// (Removed by this commit.) Old constructor that bound both the sub-graph and
// the external tensor pointers at construction time; replaced by the
// one-argument constructor plus a separate set_args() call.
MLIRCompiler::MLIRCompiler(const ngraph::op::CompiledKernel* compiled_kernel,
const std::vector<void*>& external_tensors)
: m_compiled_kernel(compiled_kernel)
, m_external_tensors(external_tensors)
{
// One external tensor is expected per kernel argument plus one per output.
NGRAPH_CHECK((m_compiled_kernel->get_arguments().size() +
m_compiled_kernel->get_kernel_outputs().size()) == external_tensors.size(),
"Number of arguments and outputs doesn't match number of tensors");
}
void MLIRCompiler::init_mlir()
{
// Mutex to safely initialize MLIR.
......@@ -96,11 +86,24 @@ void MLIRCompiler::init_mlir()
}
}
void MLIRCompiler::compile_and_run()
// Sets the runtime tensor arguments for the sub-graph.
// \param external_tensors pointers to externally allocated input/output
//        tensors, one per kernel argument plus one per kernel output;
//        must remain valid until the subsequent run() completes.
void MLIRCompiler::set_args(std::vector<void*>* external_tensors)
{
    NGRAPH_CHECK(m_compiled_kernel, "No compiled kernel set for compiler");
    // Guard against a null vector before dereferencing it below.
    NGRAPH_CHECK(external_tensors, "No external tensors set for compiler");
    NGRAPH_CHECK((m_compiled_kernel->get_arguments().size() +
                  m_compiled_kernel->get_kernel_outputs().size()) == external_tensors->size(),
                 "Number of arguments and outputs doesn't match number of tensors");
    m_external_tensors = external_tensors;
}
// Compiles the sub-graph. The three phases must run in this order:
// build the nGraph-dialect module, lower it out of the nGraph dialect,
// then optimize the lowered module.
void MLIRCompiler::compile()
{
build_ng_dialect_module();
lower_ng_dialect();
optimize();
}
void MLIRCompiler::run()
{
bind_arguments();
execute();
cleanup();
......@@ -471,13 +474,13 @@ void MLIRCompiler::bind_arguments()
NGRAPH_CHECK(expected_arguments.size(), "Arguments can't be created");
m_invoke_args = std::move(expected_arguments);
NGRAPH_CHECK(m_invoke_args.size() == m_external_tensors.size(),
NGRAPH_CHECK(m_invoke_args.size() == m_external_tensors->size(),
"Number of external tensors doesn't match number of function arguments");
// Assign external tensor pointers to invocation arguments.
for (size_t i = 0, num_args = m_invoke_args.size(); i < num_args; ++i)
{
((mlir::StaticFloatMemRef*)m_invoke_args[i])->data = (float*)m_external_tensors[i];
((mlir::StaticFloatMemRef*)m_invoke_args[i])->data = (float*)(*m_external_tensors)[i];
}
// Add pointer to memory manager
......
......@@ -60,11 +60,15 @@ namespace ngraph
using TensorList = std::vector<descriptor::Tensor*>;
using TypeList = llvm::SmallVector<mlir::Type, 4>;
MLIRCompiler(const ngraph::op::CompiledKernel* compiled_kernel,
const std::vector<void*>& external_tensors);
/// Constructs a compiler for the given CompiledKernel sub-graph.
/// Tensor arguments are supplied later via set_args(), before run().
MLIRCompiler(const ngraph::op::CompiledKernel* compiled_kernel)
: m_compiled_kernel(compiled_kernel) {}
/// Compiles and runs a subgraph in MLIR.
void compile_and_run();
/// Set runtime tensor arguments for the sub-graph
void set_args(std::vector<void*>* external_tensors);
/// Compiles a subgraph with MLIR
void compile();
/// Executes a pre-compiled subgraph
void run();
/// Returns the memory manager used by this sub-graph compiler.
MLIRMemMgr& get_mem_mgr() { return m_mem_mgr; }
......@@ -134,7 +138,7 @@ namespace ngraph
const ngraph::op::CompiledKernel* m_compiled_kernel;
// Pointers to externally allocated memory for sub-graph's input and output tensors.
const std::vector<void*>& m_external_tensors;
std::vector<void*>* m_external_tensors;
// Arguments for the MLIR function generated for the nGraph sub-graph.
llvm::SmallVector<void*, 8> m_invoke_args;
......
......@@ -24,7 +24,7 @@
#include "ngraph/op/concat.hpp"
#include "ngraph/op/divide.hpp"
#include "ngraph/op/dot.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "contrib/mlir/compiled_kernel.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/greater.hpp"
#include "ngraph/op/less.hpp"
......
......@@ -174,8 +174,6 @@ set (SRC
op/experimental/quantized_dot.hpp
op/experimental/quantized_dot_bias.cpp
op/experimental/quantized_dot_bias.hpp
op/experimental/compiled_kernel.cpp
op/experimental/compiled_kernel.hpp
op/experimental/transpose.cpp
op/experimental/transpose.hpp
op/experimental/layers/ctc_greedy_decoder.cpp
......
......@@ -17,7 +17,7 @@
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "contrib/mlir/compiler.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "contrib/mlir/compiled_kernel.hpp"
#include "ngraph/runtime/cpu/cpu_runtime_context.hpp"
using namespace ngraph;
......@@ -65,14 +65,10 @@ namespace ngraph
{
ptr_args.push_back(ctx->buffer_data[buffer_index]);
}
// Compile nodes within the CompiledKernel op.
auto* compiled_kernel = static_cast<const CompiledKernel*>(node);
MLIRCompiler mlir_compiler(compiled_kernel, ptr_args);
// TODO: Decouple 'compile' and 'run' APIs. We want to be able to run the same
// jitted code on different arguments.
mlir_compiler.compile_and_run();
CompiledKernel* compiled_kernel = static_cast<CompiledKernel*>(const_cast<Node*>(node));
compiled_kernel->compile();
compiled_kernel->run(ptr_args);
};
functors.emplace_back(functor);
......
......@@ -38,7 +38,7 @@
#include "ngraph/op/divide.hpp"
#include "ngraph/op/equal.hpp"
#include "ngraph/op/exp.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "contrib/mlir/compiled_kernel.hpp"
#include "ngraph/op/floor.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/greater.hpp"
......
......@@ -73,7 +73,7 @@
#include "ngraph/op/erf.hpp"
#include "ngraph/op/exp.hpp"
#include "ngraph/op/experimental/batch_mat_mul.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "contrib/mlir/compiled_kernel.hpp"
#include "ngraph/op/experimental/generate_mask.hpp"
#include "ngraph/op/experimental/quantized_avg_pool.hpp"
#include "ngraph/op/experimental/quantized_concat.hpp"
......
......@@ -52,7 +52,6 @@
#include "ngraph/op/erf.hpp"
#include "ngraph/op/exp.hpp"
#include "ngraph/op/experimental/batch_mat_mul.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/experimental/dyn_broadcast.hpp"
#include "ngraph/op/experimental/dyn_pad.hpp"
#include "ngraph/op/experimental/dyn_replace_slice.hpp"
......
......@@ -30,7 +30,6 @@
#include "ngraph/op/batch_norm.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/op/dequantize.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/experimental/generate_mask.hpp"
#include "ngraph/op/experimental/quantized_concat.hpp"
#include "ngraph/op/experimental/quantized_conv_bias.hpp"
......@@ -1543,241 +1542,6 @@ TEST(cpu_fusion, backwards_maxpool_with_indices_n4_c1_hw4_2x2_max)
EXPECT_TRUE(test::all_close_f(read_vector<float>(output), expected, MIN_FLOAT_TOLERANCE_BITS));
}
#if defined(NGRAPH_HALIDE)
// Verifies a CompiledKernel wrapping a two-node chain (Relu -> Relu) with a
// single input and single output produces relu(relu(A)).
TEST(cpu_fusion, compiled_kernel_one_input_one_output_halide)
{
Shape shapeA{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shapeA);
auto relu_a = make_shared<op::Relu>(A);
auto relu_relu_a = make_shared<op::Relu>(relu_a);
auto ck = make_shared<op::CompiledKernel>(
NodeVector{relu_a, relu_relu_a}, NodeVector{relu_relu_a}, NodeVector{A});
auto f = make_shared<Function>(NodeVector{ck}, ParameterVector{A});
auto backend = runtime::Backend::create("CPU");
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shapeA);
shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shapeA);
vector<float> dataA{-1, 4, -1, 4};
copy_data(a, dataA);
// relu is idempotent: negatives clamp to 0, positives pass through.
vector<float> expected{0, 4, 0, 4};
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a});
EXPECT_TRUE(test::all_close(read_vector<float>(result), expected));
}
// Verifies a CompiledKernel with two inputs (A, B) and two outputs
// (relu(A), relu(A)+B) accessed through GetOutputElement.
TEST(cpu_fusion, compiled_kernel_two_input_two_output_halide)
{
Shape shapeA{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shapeA);
auto B = make_shared<op::Parameter>(element::f32, shapeA);
auto relu_a = make_shared<op::Relu>(A);
auto add_ab = make_shared<op::Add>(relu_a, B);
auto ck = make_shared<op::CompiledKernel>(
NodeVector{relu_a, add_ab}, NodeVector{relu_a, add_ab}, NodeVector{A, B});
auto goe1 = make_shared<op::GetOutputElement>(ck, 0);
auto goe2 = make_shared<op::GetOutputElement>(ck, 1);
auto f = make_shared<Function>(NodeVector{goe1, goe2}, ParameterVector{A, B});
auto backend = runtime::Backend::create("CPU");
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shapeA);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shapeA);
shared_ptr<runtime::Tensor> result_relu = backend->create_tensor(element::f32, shapeA);
shared_ptr<runtime::Tensor> result_add = backend->create_tensor(element::f32, shapeA);
vector<float> dataA{-1, 4, -1, 4};
vector<float> dataB{0, 4, 0, 4};
copy_data(a, dataA);
copy_data(b, dataB);
vector<float> expected_relu{0, 4, 0, 4};
vector<float> expected_add{4, 4, 4, 4};
auto handle = backend->compile(f);
handle->call_with_validate({result_relu, result_add}, {a, b});
EXPECT_TRUE(test::all_close(read_vector<float>(result_relu), expected_relu));
// Fix: the second kernel output was computed but never checked.
EXPECT_TRUE(test::all_close(read_vector<float>(result_add), expected_add));
}
// Verifies a CompiledKernel whose inputs are themselves non-Parameter nodes
// (the two Negatives) embedded in a larger graph: result is neg(A) + neg(B).
TEST(cpu_fusion, compiled_kernel_embedded_graph_halide)
{
Shape shapeA{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shapeA);
auto B = make_shared<op::Parameter>(element::f32, shapeA);
auto neg_a = make_shared<op::Negative>(A);
auto neg_b = make_shared<op::Negative>(B);
auto add = neg_a + neg_b;
auto ck =
make_shared<op::CompiledKernel>(NodeVector{add}, NodeVector{add}, NodeVector{neg_a, neg_b});
auto f = make_shared<Function>(NodeVector{ck}, ParameterVector{A, B});
auto backend = runtime::Backend::create("CPU");
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shapeA);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shapeA);
shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shapeA);
vector<float> dataA{1, 4, 1, 4};
copy_data(a, dataA);
vector<float> dataB{1, 2, 3, 4};
copy_data(b, dataB);
// expected = -(A + B)
vector<float> expected{-2, -6, -4, -8};
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
EXPECT_TRUE(test::all_close_f(read_vector<float>(result), expected, MIN_FLOAT_TOLERANCE_BITS));
}
// Verifies the simplest multi-input case: a CompiledKernel wrapping a single
// Add node with two Parameter inputs and one output (A + B).
TEST(cpu_fusion, compiled_kernel_two_inputs_one_output_halide)
{
Shape shapeA{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shapeA);
auto B = make_shared<op::Parameter>(element::f32, shapeA);
auto add = A + B;
auto ck = make_shared<op::CompiledKernel>(NodeVector{add}, NodeVector{add}, NodeVector{A, B});
auto f = make_shared<Function>(NodeVector{ck}, ParameterVector{A, B});
auto backend = runtime::Backend::create("CPU");
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shapeA);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shapeA);
shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shapeA);
vector<float> dataA{1, 4, 1, 4};
copy_data(a, dataA);
vector<float> dataB{1, 2, 3, 4};
copy_data(b, dataB);
vector<float> expected{2, 6, 4, 8};
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
EXPECT_TRUE(test::all_close_f(read_vector<float>(result), expected, MIN_FLOAT_TOLERANCE_BITS));
}
// Verifies a CompiledKernel with four inputs and three outputs, where one
// output (neg_b) is an interior node also consumed inside the kernel.
// Outputs: |-(A+B)|+A, |C+B|+D, and -B, read back via GetOutputElement.
TEST(cpu_fusion, compiled_kernel_multiple_outputs_halide)
{
Shape shapeA{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shapeA);
auto B = make_shared<op::Parameter>(element::f32, shapeA);
auto C = make_shared<op::Parameter>(element::f32, shapeA);
auto D = make_shared<op::Parameter>(element::f32, shapeA);
auto neg_a = make_shared<op::Negative>(A);
auto neg_b = make_shared<op::Negative>(B);
auto add_ab = neg_a + neg_b;
auto add_cd = C + B;
auto add_cd_abs = make_shared<op::Abs>(add_cd);
auto add_ab_abs = make_shared<op::Abs>(add_ab);
auto add_aab = add_ab_abs + A;
auto add_cdd = add_cd_abs + D;
auto ck = make_shared<op::CompiledKernel>(
NodeVector{neg_a, neg_b, add_ab, add_cd, add_cd_abs, add_ab_abs, add_aab, add_cdd},
NodeVector{add_aab, add_cdd, neg_b},
NodeVector{A, B, C, D});
auto add_aab_goe = std::make_shared<op::GetOutputElement>(ck, 0);
auto add_cdd_goe = std::make_shared<op::GetOutputElement>(ck, 1);
auto neg_b_goe = std::make_shared<op::GetOutputElement>(ck, 2);
auto f = make_shared<Function>(NodeVector{add_aab_goe, add_cdd_goe, neg_b_goe},
ParameterVector{A, B, C, D});
auto backend = runtime::Backend::create("CPU");
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shapeA);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shapeA);
shared_ptr<runtime::Tensor> c = backend->create_tensor(element::f32, shapeA);
shared_ptr<runtime::Tensor> d = backend->create_tensor(element::f32, shapeA);
shared_ptr<runtime::Tensor> r1 = backend->create_tensor(element::f32, shapeA);
shared_ptr<runtime::Tensor> r2 = backend->create_tensor(element::f32, shapeA);
shared_ptr<runtime::Tensor> r3 = backend->create_tensor(element::f32, shapeA);
vector<float> dataA{1, 4, 1, 4};
vector<float> dataB{3, 3, 3, 9};
vector<float> dataC{1, 2, 3, 4};
vector<float> dataD{-2, 2, -1, 1};
copy_data(a, dataA);
copy_data(b, dataB);
copy_data(c, dataC);
copy_data(d, dataD);
auto handle = backend->compile(f);
handle->call_with_validate({r1, r2, r3}, {a, b, c, d});
vector<float> expected1{5, 11, 5, 17};
vector<float> expected2{2, 7, 5, 14};
vector<float> expected3{-3, -3, -3, -9};
EXPECT_TRUE(test::all_close_f(read_vector<float>(r1), expected1, MIN_FLOAT_TOLERANCE_BITS));
EXPECT_TRUE(test::all_close_f(read_vector<float>(r2), expected2, MIN_FLOAT_TOLERANCE_BITS));
EXPECT_TRUE(test::all_close_f(read_vector<float>(r3), expected3, MIN_FLOAT_TOLERANCE_BITS));
}
// Verifies CompiledKernel::copy_with_new_args by cloning a function that
// contains a multi-output CompiledKernel and checking that the clone computes
// identical results to the original on the same inputs.
TEST(cpu_fusion, compiled_kernel_copy_with_new_args)
{
Shape shapeA{2, 2};
auto A = make_shared<op::Parameter>(element::i32, shapeA);
auto B = make_shared<op::Parameter>(element::i32, shapeA);
auto C = make_shared<op::Parameter>(element::i32, shapeA);
auto D = make_shared<op::Parameter>(element::i32, shapeA);
auto neg_a = make_shared<op::Negative>(A);
auto neg_b = make_shared<op::Negative>(B);
auto add_ab = neg_a + neg_b;
auto add_cd = C + B;
auto add_cd_abs = make_shared<op::Abs>(add_cd);
auto add_ab_abs = make_shared<op::Abs>(add_ab);
auto add_aab = add_ab_abs + A;
auto add_cdd = add_cd_abs + D;
auto ck = make_shared<op::CompiledKernel>(
NodeVector{neg_a, neg_b, add_ab, add_cd, add_cd_abs, add_ab_abs, add_aab, add_cdd},
NodeVector{add_aab, add_cdd, neg_b},
NodeVector{A, B, C, D});
auto add_aab_goe = std::make_shared<op::GetOutputElement>(ck, 0);
auto add_cdd_goe = std::make_shared<op::GetOutputElement>(ck, 1);
auto neg_b_goe = std::make_shared<op::GetOutputElement>(ck, 2);
auto f = make_shared<Function>(NodeVector{add_aab_goe, add_cdd_goe, neg_b_goe},
ParameterVector{A, B, C, D});
// clone_function exercises copy_with_new_args on every node, incl. the CK.
auto copy_f = clone_function(*f);
auto backend = runtime::Backend::create("CPU");
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::Tensor> c = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::Tensor> d = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::Tensor> r1 = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::Tensor> r2 = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::Tensor> r3 = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::Tensor> copy_r1 = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::Tensor> copy_r2 = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::Tensor> copy_r3 = backend->create_tensor(element::i32, shapeA);
vector<int> dataA{1, 4, 1, 4};
vector<int> dataB{3, 3, 3, 9};
vector<int> dataC{1, 2, 3, 4};
vector<int> dataD{-2, 2, -1, 1};
copy_data(a, dataA);
copy_data(b, dataB);
copy_data(c, dataC);
copy_data(d, dataD);
auto handle = backend->compile(f);
handle->call_with_validate({r1, r2, r3}, {a, b, c, d});
auto h1 = backend->compile(copy_f);
h1->call_with_validate({copy_r1, copy_r2, copy_r3}, {a, b, c, d});
EXPECT_EQ(read_vector<int>(r1), read_vector<int>(copy_r1));
EXPECT_EQ(read_vector<int>(r2), read_vector<int>(copy_r2));
EXPECT_EQ(read_vector<int>(r3), read_vector<int>(copy_r3));
}
#endif
static std::shared_ptr<ngraph::Function> make_forward_function()
{
Shape shape_a{10, 3, 28, 28};
......@@ -2296,202 +2060,6 @@ TEST(cpu_fusion, rnn_fprop_1_lstm_cell)
EXPECT_TRUE(test::all_close(expected_ct, read_vector<float>(result_ct)));
}
#if 0
// (Disabled via #if 0.) Runs the CPUCompiledKernelFusion pass with min group
// size 3, asserts at least one CompiledKernel was formed, then compares CPU
// results against the INTERPRETER reference on random inputs.
TEST(cpu_fusion, compiled_kernel_fusion_multiple_groups_pruned)
{
auto make_function = []() -> std::shared_ptr<Function> {
Shape shape{};
auto a = make_shared<op::Parameter>(element::f32, shape);
auto b = make_shared<op::Parameter>(element::f32, shape);
auto c = make_shared<op::Parameter>(element::f32, shape);
auto add_ab = a + b;
auto add_abs = std::make_shared<op::Abs>(add_ab);
auto abs_neg = std::make_shared<op::Negative>(add_abs);
auto sub_c_neg = c - abs_neg;
auto d = make_shared<op::Parameter>(element::f32, shape);
auto d_abs = std::make_shared<op::Abs>(d);
auto add_d = d_abs + add_ab;
auto neg_d = std::make_shared<op::Negative>(add_d);
auto mul_cd = neg_d * sub_c_neg;
auto f =
std::make_shared<Function>(ngraph::NodeVector{mul_cd}, ParameterVector{a, b, c, d});
return f;
};
pass::Manager pass_manager;
pass_manager.register_pass<runtime::cpu::pass::CPUCompiledKernelFusion>(3);
auto cpu_f = make_function();
auto int_f = make_function();
pass_manager.run_passes(cpu_f);
test::Uniform<float> rng(-100.0f, 100.0f);
vector<vector<float>> args;
size_t ckn = count_ops_of_type<op::CompiledKernel>(cpu_f);
ASSERT_GT(ckn, 0);
for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto int_results = execute(int_f, args, "INTERPRETER");
auto cpu_results = execute(cpu_f, args, "CPU");
for (size_t i = 0; i < cpu_results.size(); i++)
{
EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f));
}
}
// (Disabled via #if 0.) Fuses a bounded-relu pattern (min(relu(a), 6) plus
// abs/negate) into a CompiledKernel and checks CPU vs INTERPRETER agreement.
TEST(cpu_fusion, compiled_kernel_fusion_bounded_relu)
{
auto make_function = []() -> std::shared_ptr<Function> {
Shape shape{};
auto a = make_shared<op::Parameter>(element::f32, shape);
auto relu = make_shared<op::Relu>(a);
auto upper_bound =
op::Constant::create<float>(element::f32, shape, std::vector<float>{6.0f});
auto minn = make_shared<op::Minimum>(relu, upper_bound);
auto absn = make_shared<op::Abs>(minn);
auto negn = std::make_shared<op::Negative>(absn);
auto f = std::make_shared<Function>(ngraph::NodeVector{negn}, ParameterVector{a});
return f;
};
pass::Manager pass_manager;
// Visualization passes bracket the fusion pass for debugging its effect.
pass_manager.register_pass<pass::VisualizeTree>("before_relu_fusion.png");
pass_manager.register_pass<runtime::cpu::pass::CPUCompiledKernelFusion>(3);
pass_manager.register_pass<pass::VisualizeTree>("after_relu_fusion.png");
auto cpu_f = make_function();
auto int_f = make_function();
pass_manager.run_passes(cpu_f);
test::Uniform<float> rng(-100.0f, 100.0f);
vector<vector<float>> args;
size_t ckn = count_ops_of_type<op::CompiledKernel>(cpu_f);
ASSERT_GT(ckn, 0);
for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto int_results = execute(int_f, args, "INTERPRETER");
auto cpu_results = execute(cpu_f, args, "CPU");
for (size_t i = 0; i < cpu_results.size(); i++)
{
EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f));
}
}
// (Disabled via #if 0.) Same graph as the "pruned" variant but with min group
// size 2, so more/larger fusion groups are expected; checks CPU vs
// INTERPRETER agreement on random inputs.
TEST(cpu_fusion, compiled_kernel_fusion_multiple_groups)
{
auto make_function = []() -> std::shared_ptr<Function> {
Shape shape{};
auto a = make_shared<op::Parameter>(element::f32, shape);
auto b = make_shared<op::Parameter>(element::f32, shape);
auto c = make_shared<op::Parameter>(element::f32, shape);
auto add_ab = a + b;
auto add_abs = std::make_shared<op::Abs>(add_ab);
auto abs_neg = std::make_shared<op::Negative>(add_abs);
auto sub_c_neg = c - abs_neg;
auto d = make_shared<op::Parameter>(element::f32, shape);
auto d_abs = std::make_shared<op::Abs>(d);
auto add_d = d_abs + add_ab;
auto neg_d = std::make_shared<op::Negative>(add_d);
auto mul_cd = neg_d * sub_c_neg;
auto f =
std::make_shared<Function>(ngraph::NodeVector{mul_cd}, ParameterVector{a, b, c, d});
return f;
};
pass::Manager pass_manager;
pass_manager.register_pass<runtime::cpu::pass::CPUCompiledKernelFusion>(2);
auto cpu_f = make_function();
auto int_f = make_function();
pass_manager.run_passes(cpu_f);
test::Uniform<float> rng(-100.0f, 100.0f);
vector<vector<float>> args;
size_t ckn = count_ops_of_type<op::CompiledKernel>(cpu_f);
ASSERT_GT(ckn, 0);
for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto int_results = execute(int_f, args, "INTERPRETER");
auto cpu_results = execute(cpu_f, args, "CPU");
for (size_t i = 0; i < cpu_results.size(); i++)
{
EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f));
}
}
// (Disabled via #if 0.) A single long elementwise chain over five parameters
// that should fuse into one CompiledKernel group (min size 2); checks CPU vs
// INTERPRETER agreement on random inputs.
TEST(cpu_fusion, compiled_kernel_fusion_one_group)
{
auto make_function = []() -> std::shared_ptr<Function> {
Shape shape{};
auto a = make_shared<op::Parameter>(element::f32, shape);
auto b = make_shared<op::Parameter>(element::f32, shape);
auto c = make_shared<op::Parameter>(element::f32, shape);
auto add_ab = a + b;
auto add_abs = std::make_shared<op::Abs>(add_ab);
auto abs_neg = std::make_shared<op::Negative>(add_abs);
auto sub_c_neg = c - abs_neg;
auto d = make_shared<op::Parameter>(element::f32, shape);
auto add_d = sub_c_neg + d;
auto abs_add_d = std::make_shared<op::Abs>(add_d);
auto e = make_shared<op::Parameter>(element::f32, shape);
auto add_e = e + abs_add_d;
auto neg_e = std::make_shared<op::Negative>(add_e);
auto f = std::make_shared<Function>(ngraph::NodeVector{neg_e},
ParameterVector{a, b, c, d, e});
return f;
};
pass::Manager pass_manager;
pass_manager.register_pass<runtime::cpu::pass::CPUCompiledKernelFusion>(2);
auto cpu_f = make_function();
auto int_f = make_function();
pass_manager.run_passes(cpu_f);
test::Uniform<float> rng(-100.0f, 100.0f);
vector<vector<float>> args;
size_t ckn = count_ops_of_type<op::CompiledKernel>(cpu_f);
ASSERT_GT(ckn, 0);
for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto int_results = execute(int_f, args, "INTERPRETER");
auto cpu_results = execute(cpu_f, args, "CPU");
for (size_t i = 0; i < cpu_results.size(); i++)
{
EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f));
}
}
#endif
void sigmoid_multiply_fusion_forward_compute(runtime::Backend* backend,
const ParameterVector& input_params,
const vector<vector<float>>& input_data,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment