Unverified commit fc9a7dea, authored by Robert Kimball, committed by GitHub

Merge pull request #3298 from NervanaSystems/nmostafa/recompile

[MLIR] Re-compile sub-graph once on first invocation
parents 5d3456e4 956e8b3a
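In short, this change splits MLIRCompiler::compile_and_run() into compile(), which builds, lowers, and JITs the module once, and run(), which binds freshly supplied tensor pointers and executes the cached engine; the CPU runtime then keeps one compiler per CompiledKernel so compilation happens only on the first invocation. A minimal self-contained sketch of that compile-once/run-many split (toy types, not the nGraph API):

#include <cassert>
#include <vector>

// Toy stand-in for the reworked MLIRCompiler: compile() does the one-time
// build/lower/JIT work; run() may be called repeatedly with different buffers.
class ToyCompiler
{
public:
    void compile()
    {
        // build dialect module, lower to LLVM, create the execution engine
        m_engine_ready = true;
    }

    void run(std::vector<void*>& external_tensors)
    {
        assert(m_engine_ready && "run() requires a prior compile()");
        m_external_tensors = &external_tensors; // rebind pointers, then execute
    }

private:
    bool m_engine_ready = false;
    std::vector<void*>* m_external_tensors = nullptr; // non-owning, rebound per run
};

int main()
{
    ToyCompiler compiler;
    compiler.compile(); // the first invocation pays the JIT cost once

    std::vector<void*> args_a;
    std::vector<void*> args_b;
    compiler.run(args_a); // later invocations reuse the cached module
    compiler.run(args_b);
}

The pointer member mirrors the PR itself: m_external_tensors changes from a const reference fixed at construction to a pointer rebound on every run().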
@@ -68,22 +68,11 @@ using llvm::SmallVector;
 using llvm::StringRef;
 using llvm::make_unique;
 using llvm::ArrayRef;

 using namespace ngraph::runtime::ngmlir;

 #define COMPILE_OP_DECL(op_name) \
     create_op<op_name>(MLIRCompiler & compiler, const ngraph::Node* ng_node)

-MLIRCompiler::MLIRCompiler(const ngraph::op::CompiledKernel* compiled_kernel,
-                           const std::vector<void*>& external_tensors)
-    : m_compiled_kernel(compiled_kernel)
-    , m_external_tensors(external_tensors)
-{
-    NGRAPH_CHECK((m_compiled_kernel->get_arguments().size() +
-                  m_compiled_kernel->get_kernel_outputs().size()) == external_tensors.size(),
-                 "Number of arguments and outputs doesn't match number of tensors");
-}
-
 void MLIRCompiler::init_mlir()
 {
     // Mutex to safely initialize MLIR.
@@ -101,12 +90,15 @@ void MLIRCompiler::init_mlir()
     }
 }

-void MLIRCompiler::compile_and_run()
+void MLIRCompiler::compile()
 {
     build_ng_dialect_module();
     lower_ng_dialect();
-    optimize();
-    bind_arguments();
+}
+
+void MLIRCompiler::run(std::vector<void*>& external_tensors)
+{
+    bind_arguments(external_tensors);
     execute();
     cleanup();
 }
@@ -241,9 +233,10 @@ MLIRCompiler::TensorInfo MLIRCompiler::get_tensor_value(descriptor::Tensor* tens
     return it->second;
 }

-// Lowers nGraph dialect to affine dialect.
+// Lowers nGraph dialect all the way to LLVM module.
 void MLIRCompiler::lower_ng_dialect()
 {
+    // Lower NG dialect to Affine
     mlir::PassManager pm;
     pm.addPass(mlir::createDialectLoweringPass(this));
     pm.addPass(mlir::createCanonicalizerPass());
@@ -256,13 +249,48 @@ void MLIRCompiler::lower_ng_dialect()
     }

     dump_mlir_module("Affine Dialect Dump:");
+
+    optimize();
+
+    NGRAPH_CHECK(m_module, "MLIR module is not ready.");
+
+    // Lower Standard dialect to LLVM dialect.
+    // TODO: Do this via PassManager
+    mlir::LLVMTypeConverter llvm_converter(&m_context);
+    OwningRewritePatternList patterns;
+    mlir::populateStdToLLVMConversionPatterns(llvm_converter, patterns);
+
+    mlir::ConversionTarget target(m_context);
+    target.addLegalDialect<mlir::LLVM::LLVMDialect>();
+    auto result = applyConversionPatterns(*m_module, target, llvm_converter, std::move(patterns));
+    NGRAPH_CHECK(succeeded(result), "Standard to LLVM dialect conversion failed");
+
+    dump_mlir_module("LLVM-IR Dialect Dump:");
+
+    // Lower to LLVM BC and optimize
+    // Initialize LLVM targets.
+    llvm::InitializeNativeTarget();
+    llvm::InitializeNativeTargetAsmPrinter();
+
+    unsigned opt_level = 3;
+    if (char* opt_level_str = std::getenv("NGRAPH_MLIR_OPT_LEVEL"))
+    {
+        opt_level = std::stoi(opt_level_str);
+        NGRAPH_CHECK(opt_level >= 0 && opt_level <= 3, "Invalid optimization level");
+    }
+
+    // Create an MLIR execution engine. We use a null MLIR pass manager for now to make sure we
+    // don't run MLIR passes that were already run. We also pass a default transformer to run
+    // LLVM optimizations at level 3.
+    auto llvm_transformer =
+        mlir::makeOptimizingTransformer(opt_level /*optLevel*/, 0 /*sizeLevel*/);
+    auto maybeEngine = mlir::ExecutionEngine::create(m_module.get(), llvm_transformer);
+    NGRAPH_CHECK(maybeEngine, "failed to construct an execution engine");
+    m_engine = std::move(maybeEngine.get());
 }

-// Receives affine dialect as input and applies affine and standard dialect based optimizations.
-// Lowering from affine dialect to standard dialect happens along the way. Output consists of
-// standard dialect only ops.
 void MLIRCompiler::optimize()
 {
+    // Lower Affine to Std Dialect
     mlir::PassManager pm;
     // Lower affine ops
     pm.addPass(mlir::createLowerAffinePass());
@@ -458,33 +486,39 @@ mlir::Operation* MLIRCompiler::create_index_reduction(const ngraph::Node* ng_nod
     op->setAttr("axes", red_axes_attr);
     return op;
 }

 // Binds MLIR function arguments to the proper values. This includes externally allocated tensors
 // helpers to be used inside the function.
-void MLIRCompiler::bind_arguments()
+void MLIRCompiler::bind_arguments(std::vector<void*>& external_tensors)
 {
     NGRAPH_CHECK(m_module, "MLIR module is not ready.");

     mlir::Function* func = m_module->getNamedFunction("main");
     NGRAPH_CHECK(func && !func->getBlocks().empty(), "Function not found");

+    // Set external arguments
+    NGRAPH_CHECK(m_compiled_kernel, "No compiled kernel set for compiler");
+    NGRAPH_CHECK((m_compiled_kernel->get_arguments().size() +
+                  m_compiled_kernel->get_kernel_outputs().size()) == external_tensors.size(),
+                 "Number of arguments and outputs doesn't match number of tensors");
+    m_external_tensors = &external_tensors;
+
     // Create list with a type-erased double pointer for each invocation arguments.
     // We currently use 'allocateMemRefArguments', which creates a
     // SmallVector<StaticFloatMemref*>. StaticFloatMemref is just a struct with the
     // actual pointer to the data.

     // create MemRef args
-    auto expected_arguments = allocate_memref_args(func);
+    auto expected_arguments = allocate_memref_args();
     NGRAPH_CHECK(expected_arguments.size(), "Arguments can't be created");
     m_invoke_args = std::move(expected_arguments);

-    NGRAPH_CHECK(m_invoke_args.size() == m_external_tensors.size(),
+    NGRAPH_CHECK(m_invoke_args.size() == m_external_tensors->size(),
                  "Number of external tensors doesn't match number of function arguments");

     // Assign external tensor pointers to invocation arguments.
     for (size_t i = 0, num_args = m_invoke_args.size(); i < num_args; ++i)
     {
-        ((mlir::StaticFloatMemRef*)m_invoke_args[i])->data = (float*)m_external_tensors[i];
+        ((mlir::StaticFloatMemRef*)m_invoke_args[i])->data = (float*)(*m_external_tensors)[i];
     }

     // Add pointer to memory manager
@@ -501,39 +535,6 @@ void MLIRCompiler::bind_arguments()
 // Lowers standard dialect to LLVM dialect and uses the MLIR execution engine to execute the code.
 void MLIRCompiler::execute()
 {
-    NGRAPH_CHECK(m_module, "MLIR module is not ready.");
-
-    // Lower Standard dialect to LLVM dialect.
-    mlir::LLVMTypeConverter llvm_converter(&m_context);
-    OwningRewritePatternList patterns;
-    mlir::populateStdToLLVMConversionPatterns(llvm_converter, patterns);
-
-    mlir::ConversionTarget target(m_context);
-    target.addLegalDialect<mlir::LLVM::LLVMDialect>();
-    auto result = applyConversionPatterns(*m_module, target, llvm_converter, std::move(patterns));
-    NGRAPH_CHECK(succeeded(result), "Standard to LLVM dialect conversion failed");
-
-    dump_mlir_module("LLVM-IR Dialect Dump:");
-
-    // Initialize LLVM targets.
-    llvm::InitializeNativeTarget();
-    llvm::InitializeNativeTargetAsmPrinter();
-
-    unsigned opt_level = 3;
-    if (char* opt_level_str = std::getenv("NGRAPH_MLIR_OPT_LEVEL"))
-    {
-        opt_level = std::stoi(opt_level_str);
-        NGRAPH_CHECK(opt_level >= 0 && opt_level <= 3, "Invalid optimization level");
-    }
-
-    // Create an MLIR execution engine. We use a null MLIR pass manager for now to make sure we
-    // don't run MLIR passes that were already run. We also pass a default transformer to run
-    // LLVM optimizations at level 3.
-    auto llvm_transformer =
-        mlir::makeOptimizingTransformer(opt_level /*optLevel*/, 0 /*sizeLevel*/);
-    auto maybeEngine = mlir::ExecutionEngine::create(m_module.get(), llvm_transformer);
-    NGRAPH_CHECK(maybeEngine, "failed to construct an execution engine");
-    m_engine = std::move(maybeEngine.get());
-
     // Invoke the JIT-compiled function with the arguments. Note that, for API
     // uniformity reasons, it takes a list of type-erased pointers to arguments.
     // Please, note that 'invoke' method is overloaded with a parameter pack version.
@@ -560,32 +561,19 @@ void MLIRCompiler::cleanup()
     m_mem_mgr.freeAll();
 }

-SmallVector<void*, 8> MLIRCompiler::allocate_memref_args(mlir::Function* func)
+SmallVector<void*, 8> MLIRCompiler::allocate_memref_args()
 {
     SmallVector<void*, 8> args;
-    args.reserve(func->getNumArguments());
-    for (const auto& arg : func->getArguments())
+    for (auto i = 0; i < m_external_tensors->size(); i++)
     {
-        auto descriptor = allocate_memref_descriptor(arg->getType());
-        if (!descriptor)
-        {
-            continue;
-        }
+        auto descriptor = allocate_memref_descriptor();
         args.push_back(descriptor);
     }
     return args;
 }

-mlir::StaticFloatMemRef* MLIRCompiler::allocate_memref_descriptor(mlir::Type type)
+mlir::StaticFloatMemRef* MLIRCompiler::allocate_memref_descriptor()
 {
-    auto memRefType = type.dyn_cast<mlir::MemRefType>();
-    if (!memRefType)
-    {
-        return nullptr;
-    }
-    NGRAPH_CHECK(memRefType.getNumDynamicDims() == 0, "No support for dynamic shapes");
     // We only use StaticFloatMemRef because that's what MLIR currently offers.
     // We should expand this with different types and dynamic MemRefs
     auto* descriptor =
......
@@ -63,11 +63,16 @@ namespace ngraph
             using TensorList = std::vector<descriptor::Tensor*>;
             using TypeList = llvm::SmallVector<mlir::Type, 4>;

-            MLIRCompiler(const ngraph::op::CompiledKernel* compiled_kernel,
-                         const std::vector<void*>& external_tensors);
+            MLIRCompiler(const ngraph::op::CompiledKernel* compiled_kernel)
+                : m_compiled_kernel(compiled_kernel)
+            {
+            }

-            /// Compiles and runs a subgraph in MLIR.
-            void compile_and_run();
+            /// Compiles a subgraph with MLIR
+            void compile();
+
+            /// Executes a pre-compiled subgraph
+            void run(std::vector<void*>& external_tensors);

             /// Returns the memory manager used by this sub-graph compiler.
             MLIRMemMgr& get_mem_mgr() { return m_mem_mgr; }
@@ -88,7 +93,7 @@ namespace ngraph
             void build_ng_dialect_module();
             void lower_ng_dialect();
             void optimize();
-            void bind_arguments();
+            void bind_arguments(std::vector<void*>& external_tensors);
             void execute();
             void cleanup();
@@ -120,10 +125,10 @@ namespace ngraph
             void create_return();

             /// Helper to create memref arguments for MLIR function signature
-            llvm::SmallVector<void*, 8> allocate_memref_args(mlir::Function* func);
+            llvm::SmallVector<void*, 8> allocate_memref_args();

             /// Helper to allocate a mem ref object. Handles static shapes only for now.
-            mlir::StaticFloatMemRef* allocate_memref_descriptor(mlir::Type type);
+            mlir::StaticFloatMemRef* allocate_memref_descriptor();

             /// Helper to dump MLIR module into llvm::dbgs prepended by the message \p msg.
             void dump_mlir_module(const std::string msg);
@@ -133,7 +138,7 @@ namespace ngraph
             const ngraph::op::CompiledKernel* m_compiled_kernel;

             // Pointers to externally allocated memory for sub-graph's input and output tensors.
-            const std::vector<void*>& m_external_tensors;
+            std::vector<void*>* m_external_tensors;

             // Arguments for the MLIR function generated for the nGraph sub-graph.
             llvm::SmallVector<void*, 8> m_invoke_args;
......
@@ -65,14 +65,25 @@ namespace ngraph
                 {
                     ptr_args.push_back(ctx->buffer_data[buffer_index]);
                 }

                 // Compile nodes within the CompiledKernel op.
-                auto* compiled_kernel = static_cast<const CompiledKernel*>(node);
+                CompiledKernel* compiled_kernel =
+                    static_cast<CompiledKernel*>(const_cast<Node*>(node));
+
+                bool is_module_ready = true;
+                auto it = ctx->mlir_compilers.find(compiled_kernel);
+                if (it == ctx->mlir_compilers.end())
+                {
+                    // create a new compiler for the CK
+                    ctx->mlir_compilers.emplace(compiled_kernel, compiled_kernel);
+                    is_module_ready = false;
+                }

-                MLIRCompiler mlir_compiler(compiled_kernel, ptr_args);
-                // TODO: Decouple 'compile' and 'run' APIs. We want to be able to run the same
-                // jitted code on different arguments.
-                mlir_compiler.compile_and_run();
+                MLIRCompiler& mlir_compiler = ctx->mlir_compilers.find(compiled_kernel)->second;
+                if (!is_module_ready)
+                {
+                    mlir_compiler.compile();
+                }
+                mlir_compiler.run(ptr_args);
             };
             functors.emplace_back(functor);
......
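An aside on the lookup-or-create code above: std::unordered_map::emplace returns an {iterator, inserted} pair, so the second find() and the separate is_module_ready flag could be folded into the emplace call. A self-contained sketch of that simplification (hypothetical Kernel/Compiler types standing in for CompiledKernel/MLIRCompiler):

#include <unordered_map>

struct Kernel
{
};

struct Compiler
{
    explicit Compiler(const Kernel* k) : m_kernel(k) {}
    void compile() { m_compiled = true; } // one-time JIT work
    void run() {}                         // execute the cached engine
    const Kernel* m_kernel;
    bool m_compiled = false;
};

void invoke(std::unordered_map<const Kernel*, Compiler>& cache, const Kernel* k)
{
    // emplace inserts a new Compiler(k) only if k is not already present, and
    // reports whether an insertion took place, replacing the find/emplace/find dance.
    auto res = cache.emplace(k, k);
    Compiler& compiler = res.first->second;
    if (res.second) // true only on the first invocation for this kernel
    {
        compiler.compile();
    }
    compiler.run();
}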
@@ -25,6 +25,11 @@
 #include <tbb/flow_graph.h>
 #include <tbb/global_control.h>
 #include <tbb/task_scheduler_init.h>
+#include "ngraph/op/experimental/compiled_kernel.hpp"
+
+#ifdef NGRAPH_MLIR_ENABLE
+#include "contrib/mlir/compiler.hpp"
+#endif

 namespace mkldnn
 {
@@ -66,6 +71,14 @@ namespace ngraph
                 State* const* states;
                 std::set<size_t> breakpoints;
                 size_t pc;
+
+#ifdef NGRAPH_MLIR_ENABLE
+                /// Maps CompiledKernel nodes to their MLIR compiler
+                /// The MLIR compiler caches the compiled code on the first invocation,
+                /// and may in the future support re-compilation
+                std::unordered_map<ngraph::op::CompiledKernel*,
+                                   ngraph::runtime::ngmlir::MLIRCompiler>
+                    mlir_compilers;
+#endif
             };
         }
......
@@ -248,3 +248,36 @@ NGRAPH_TEST(${BACKEND_NAME}, mlir_subgraphs_cycle)
     EXPECT_TRUE(
         test::all_close_f(read_vector<float>(result), vector<float>{70, 80, 90, 136, 164, 192}));
 }
+
+NGRAPH_TEST(${BACKEND_NAME}, mlir_multi_call)
+{
+    Shape shape_in1{2, 3};
+    Shape shape_in2{3, 3};
+    Shape shape_out{2, 3};
+    auto A = make_shared<op::Parameter>(element::f32, shape_in1);
+    auto B = make_shared<op::Parameter>(element::f32, shape_in2);
+    auto dot = make_shared<op::Dot>(A, B);
+    auto C = make_shared<op::Parameter>(element::f32, shape_in1);
+    auto add = make_shared<op::Add>(dot, C);
+    auto f = make_shared<Function>(add, ParameterVector{A, B, C});
+
+    auto backend = runtime::Backend::create("${BACKEND_NAME}");
+
+    // Create some tensors for input/output
+    shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shape_in1);
+    shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shape_in2);
+    shared_ptr<runtime::Tensor> c = backend->create_tensor(element::f32, shape_in1);
+    shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shape_out);
+
+    copy_data(a, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
+    copy_data(b, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f});
+    copy_data(c, vector<float>{5.f, 4.f, 3.f, 2.f, 1.f, 0.f});
+
+    auto handle = backend->compile(f);
+    handle->call_with_validate({result}, {a, b, c});
+    handle->call_with_validate({result}, {a, b, c});
+    handle->call_with_validate({result}, {a, b, c});
+    handle->call_with_validate({result}, {a, b, c});
+
+    EXPECT_TRUE(test::all_close_f(read_vector<float>(result),
+                                  vector<float>{35.f, 40.f, 45.f, 68.f, 82.f, 96.f}));
+}
\ No newline at end of file